Code Search for Developers
 
 
  

NGramImport.java from Texai at Krugle


Show NGramImport.java syntax highlighted

/*
 * NGramImport.java
 *
 * Created on February 14, 2007, 10:26 AM
 *
 * Description:
 *
 * Copyright (C) 2007 Stephen L. Reed.
 *
 * This program is free software; you can redistribute it and/or modify it under the terms
 * of the GNU General Public License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
 * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 * See the GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along with this program;
 * if not, write to the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

package edu.cmu.sphinx.linguist.language.ngram.large;

import edu.cmu.sphinx.util.LogMath;
import edu.cmu.sphinx.util.props.PropertyException;
import java.io.File;
import java.io.IOException;
import java.util.Dictionary;
import java.util.HashMap;
import java.util.Map;
import javax.persistence.EntityManager;
import javax.persistence.EntityManagerFactory;
import org.apache.log4j.Logger;
import org.texai.kb.ejb.entity.AtomicTerm;
import org.texai.kb.ejb.session.DomainEntityLoaderBean;
import org.texai.kb.ejb.session.DomainEntityPersisterBean;
import org.texai.kb.ejb.session.shared.TermFinderFacadeBean;
import org.texai.util.TexaiException;

/**
 * Imports an N-gram language model from a Sphinx binary (DMP) file.
 *
 * <p>The visible implementation reads the model with {@code MyBinaryLoader}, keeps references to
 * its unigram, bigram and trigram tables, and logs the N-gram counts.  The persistence-related
 * fields (entity manager, domain entity loader/persister, term finder) are declared but are not
 * yet used by this class.
 *
 * @author reed
 */
public final class NGramImport {
  
  /** the limit during development testing (a small value such as 15 was used for quick runs) */
  private static final int TEST_LIMIT = 10000000;
  
  /** the limit that when exceeded causes the entity manager to be cleared of stale managed objects */
  private static final int CLEAR_ENTITY_MANAGER_LIMIT = 20;
  
  /** the N-gram file path */
  private static final String NGRAM_FILE_NAME =
          "/home/reed/svn/SpeechRecognition/src/org/texai/speech/recognition/language_model.arpaformat.DMP";
  
  /** the Sphinx property to get the Log base */
  public static final String PROP_LOG_BASE = "logBase";
  
  /** the Sphinx property that controls whether we use the old, slow (but correct)
   * method of performing the LogMath.add by doing the actual computation.
   */
  public static final String PROP_USE_ADD_TABLE = "useAddTable";
  
  /** the entity manager factory */
  private EntityManagerFactory entityManagerFactory;
  
  /** the entity manager */
  private EntityManager entityManager;
  
  /** the domain entity loader */
  private DomainEntityLoaderBean domainEntityLoader;
  
  /** the domain entity persister */
  private DomainEntityPersisterBean domainEntityPersister;
  
  /** the term finder */
  private TermFinderFacadeBean termFinderFacade;
  
  /** the log4j logger */
  private static final Logger LOGGER = Logger.getLogger(NGramImport.class.getName());
  
  /** the creator */
  private AtomicTerm creator;
  
  /** the creation purpose */
  private AtomicTerm creationPurpose;
  
  /** the unigram ID map, created empty by {@link #importNgrams()} (key/value types are not shown here) */
  private Map unigramIDMap;
  
  /** the loaded trigram buffer, created empty by {@link #importNgrams()} (key/value types are not shown here) */
  private Map loadedTrigramBuffer;
  
  /** the binary N-gram file loader */
  private MyBinaryLoader loader;
  
  /** the log math, configured with log base 1.0001 and the add table enabled */
  private LogMath logMath;
  
  /** the unigrams obtained from the loader */
  private UnigramProbability[] unigrams;
  
  /** the bigram probability table obtained from the loader */
  private float[] bigramProbTable;
  
  /** the trigram probability table obtained from the loader */
  private float[] trigramProbTable;
  
  /** the trigram backoff weight table obtained from the loader */
  private float[] trigramBackoffTable;
  
  /** the trigram segment table obtained from the loader */
  private int[] trigramSegmentTable;
  
  /** the dictionary (declared but not yet used by this class) */
  private Dictionary dictionary;
  
  /** Creates a new instance of NGramImport. */
  public NGramImport() {
    super();
  }
  
  /** Initializes this application; currently there is nothing to initialize. */
  public void initialize() {
    
  }
  
  /** Reads the N-gram file, retains its probability tables and logs the number of unigrams,
   * bigrams and trigrams it contains.
   *
   * @throws TexaiException if the log math cannot be configured or the N-gram file cannot be read
   */
  public void importNgrams() {
    unigramIDMap = new HashMap();
    loadedTrigramBuffer = new HashMap();
    logMath = new LogMath();
    final MyPropertySheet propertySheet = new MyPropertySheet();
    try {
      propertySheet.setFloat(PROP_LOG_BASE, 1.0001f);
      propertySheet.setBoolean(PROP_USE_ADD_TABLE, true);
      logMath.newProperties(propertySheet);
    } catch (final PropertyException ex) {
      // Fail fast, as for the I/O failure below: loading the model with a log math that was
      // never configured would continue silently in an undefined state.
      throw new TexaiException(ex);
    }
    LOGGER.info("reading " + NGRAM_FILE_NAME);
    try {
      loader = new MyBinaryLoader(
              new File(NGRAM_FILE_NAME),
              logMath);
    } catch (final IOException ex) {
      throw new TexaiException(ex);
    }
    unigrams = loader.getUnigrams();
    bigramProbTable = loader.getBigramProbabilities();
    trigramProbTable = loader.getTrigramProbabilities();
    trigramBackoffTable = loader.getTrigramBackoffWeights();
    trigramSegmentTable = loader.getTrigramSegments();
    
    LOGGER.info("Unigrams: " + loader.getNumberUnigrams());
    LOGGER.info("Bigrams: " + loader.getNumberBigrams());
    LOGGER.info("Trigrams: " + loader.getNumberTrigrams());
  }
  
  /** Finalizes this application; currently there is nothing to release. */
  public void finalization() {
    
  }
  
  /** Executes this application.
   *
   * @param args the command line arguments (unused)
   */
  public static void main(final String[] args) {
    final NGramImport nGramImport = new NGramImport();
    nGramImport.initialize();
    try {
      nGramImport.importNgrams();
    } catch (final TexaiException ex) {
      // the two-argument form makes log4j record the stack trace, not just ex.toString()
      LOGGER.error("N-gram import failed", ex);
      ex.printStackTrace(System.err);
    } catch (final NullPointerException ex) {
      LOGGER.error("N-gram import failed", ex);
      ex.printStackTrace(System.err);
    } finally {
      // always runs, even when an exception other than the two above escapes
      nGramImport.finalization();
    }
  }
}




See more files for this project here

Texai

Texai is a chatbot that intelligently seeks to acquire knowledge and friendly behaviors.

Project homepage: http://sourceforge.net/projects/texai
Programming language(s): Java,Shell Script,XML
License: other

  MyBinaryLoader.java
  MyPropertySheet.java
  NGramImport.java
  file-format-description.txt