// NGramImport.java from Texai at Krugle
// Show NGramImport.java syntax highlighted
/*
* NGramImport.java
*
* Created on February 14, 2007, 10:26 AM
*
* Description: Reads an N-gram language model file and logs its unigram, bigram and trigram counts.
*
* Copyright (C) 2007 Stephen L. Reed.
*
* This program is free software; you can redistribute it and/or modify it under the terms
* of the GNU General Public License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
* without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
* See the GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along with this program;
* if not, write to the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
package edu.cmu.sphinx.linguist.language.ngram.large;
import edu.cmu.sphinx.util.LogMath;
import edu.cmu.sphinx.util.props.PropertyException;
import java.io.File;
import java.io.IOException;
import java.util.Dictionary;
import java.util.HashMap;
import java.util.Map;
import javax.persistence.EntityManager;
import javax.persistence.EntityManagerFactory;
import org.apache.log4j.Logger;
import org.texai.kb.ejb.entity.AtomicTerm;
import org.texai.kb.ejb.session.DomainEntityLoaderBean;
import org.texai.kb.ejb.session.DomainEntityPersisterBean;
import org.texai.kb.ejb.session.shared.TermFinderFacadeBean;
import org.texai.util.TexaiException;
/**
 * Reads the N-gram language model file named by {@link #NGRAM_FILE_NAME} through
 * {@code MyBinaryLoader}, keeps references to the loaded probability tables and logs the
 * number of unigrams, bigrams and trigrams.
 *
 * <p>The persistence related fields (entity manager, loader, persister, term finder, creator
 * and creation purpose) are declared but not yet used by the code in this class.
 *
 * @author reed
 */
public final class NGramImport {

  /** the limit during development testing; a small value such as 15 was used for quick runs */
  private static final int TEST_LIMIT = 10000000;
  /** the limit that when exceeded causes the entity manager to be cleared of stale managed objects */
  private static final int CLEAR_ENTITY_MANAGER_LIMIT = 20;
  /** the N-gram file path */
  private static final String NGRAM_FILE_NAME =
      "/home/reed/svn/SpeechRecognition/src/org/texai/speech/recognition/language_model.arpaformat.DMP";
  /** the Sphinx property to get the Log base */
  public final static String PROP_LOG_BASE = "logBase";
  /** the Sphinx property that controls whether we use the old, slow (but correct)
   * method of performing the LogMath.add by doing the actual computation.
   */
  public final static String PROP_USE_ADD_TABLE = "useAddTable";
  /** the entity manager factory */
  private EntityManagerFactory entityManagerFactory;
  /** the entity manager */
  private EntityManager entityManager;
  /** the domain entity loader */
  private DomainEntityLoaderBean domainEntityLoader;
  /** the domain entity persister */
  private DomainEntityPersisterBean domainEntityPersister;
  /** the term finder */
  private TermFinderFacadeBean termFinderFacade;
  /** the log4j logger */
  private static final Logger LOGGER = Logger.getLogger(NGramImport.class.getName());
  /** the creator */
  private AtomicTerm creator;
  /** the creation purpose */
  private AtomicTerm creationPurpose;
  /** the unigram ID map, created empty by {@link #importNgrams()} */
  private Map unigramIDMap;
  /** the loaded trigram buffer, created empty by {@link #importNgrams()} */
  private Map loadedTrigramBuffer;
  /** the loader that reads the N-gram file */
  private MyBinaryLoader loader;
  /** the log math object that is configured and then handed to the loader */
  private LogMath logMath;
  /** the unigrams obtained from the loader */
  private UnigramProbability[] unigrams;
  /** the bigram probabilities obtained from the loader */
  private float[] bigramProbTable;
  /** the trigram probabilities obtained from the loader */
  private float[] trigramProbTable;
  /** the trigram backoff weights obtained from the loader */
  private float[] trigramBackoffTable;
  /** the trigram segments obtained from the loader */
  private int[] trigramSegmentTable;
  /**
   * the dictionary, not yet used by this class.
   * NOTE(review): this resolves to java.util.Dictionary because of the file's imports;
   * presumably a speech recognition dictionary type was intended - confirm.
   */
  private Dictionary dictionary;

  /** Creates a new instance of NGramImport. */
  public NGramImport() {
  }

  /** Initializes this application. Currently there is nothing to initialize. */
  public void initialize() {
  }

  /**
   * Loads the N-gram file and logs the number of unigrams, bigrams and trigrams.
   *
   * <p>The log math object is configured with log base 1.0001 and with the add table enabled
   * before it is passed to the loader.
   *
   * @throws TexaiException when the log math properties cannot be applied or when the N-gram
   *     file cannot be read
   */
  public void importNgrams() {
    unigramIDMap = new HashMap();
    loadedTrigramBuffer = new HashMap();
    logMath = new LogMath();
    final MyPropertySheet propertySheet = new MyPropertySheet();
    try {
      propertySheet.setFloat(PROP_LOG_BASE, 1.0001f);
      propertySheet.setBoolean(PROP_USE_ADD_TABLE, true);
      logMath.newProperties(propertySheet);
    } catch (final PropertyException ex) {
      // Fail fast instead of continuing with a log math object whose configuration was
      // rejected; this mirrors the handling of the loader's IOException below.
      throw new TexaiException(ex);
    }

    LOGGER.info("reading " + NGRAM_FILE_NAME);
    try {
      loader = new MyBinaryLoader(new File(NGRAM_FILE_NAME), logMath);
    } catch (final IOException ex) {
      throw new TexaiException(ex);
    }

    unigrams = loader.getUnigrams();
    bigramProbTable = loader.getBigramProbabilities();
    trigramProbTable = loader.getTrigramProbabilities();
    trigramBackoffTable = loader.getTrigramBackoffWeights();
    trigramSegmentTable = loader.getTrigramSegments();

    LOGGER.info("Unigrams: " + loader.getNumberUnigrams());
    LOGGER.info("Bigrams: " + loader.getNumberBigrams());
    LOGGER.info("Trigrams: " + loader.getNumberTrigrams());
  }

  /** Finalizes this application. Currently there is nothing to release. */
  public void finalization() {
  }

  /**
   * Logs the given failure together with its stack trace and also prints the stack trace to
   * the standard error stream, so that it stays visible when no log appender is configured.
   *
   * @param throwable the failure to report
   */
  private static void reportFailure(final Throwable throwable) {
    LOGGER.error("N-gram import failed", throwable);
    throwable.printStackTrace(System.err);
  }

  /** Executes this application.
   *
   * @param args the command line arguments (unused)
   */
  public static void main(final String[] args) {
    final NGramImport nGramImport = new NGramImport();
    nGramImport.initialize();
    try {
      nGramImport.importNgrams();
    } catch (final TexaiException ex) {
      reportFailure(ex);
    } catch (final NullPointerException ex) {
      reportFailure(ex);
    } finally {
      // always finalize, even when an unexpected throwable escapes the import
      nGramImport.finalization();
    }
  }
}
// See more files for this project here