// CMUDictionaryInitializer.java from Texai at Krugle
// (web listing header: "Show CMUDictionaryInitializer.java syntax highlighted")
package org.texai.cmudict;
/*
* CMUDictionaryInitializer.java
*
* Created on February 9, 2007, 1:36 PM
*
* Description: Creates Carnegie Mellon University Pronouncing Dictionary word domain entities and their associated phoneme domain entities.
*
* Copyright (C) 2007 Stephen L. Reed.
*
* This program is free software; you can redistribute it and/or modify it under the terms
* of the GNU General Public License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
* without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
* See the GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along with this program;
* if not, write to the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
import java.io.BufferedReader;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import javax.persistence.EntityManager;
import javax.persistence.EntityManagerFactory;
import javax.persistence.EntityTransaction;
import javax.persistence.Persistence;
import net.sf.ehcache.CacheManager;
import org.apache.log4j.Logger;
import org.texai.cmudict.domainEntity.ARPABETPhoneme;
import org.texai.cmudict.domainEntity.ARPABETPhonemeAdapter;
import org.texai.cmudict.domainEntity.ARPABETPronunciation;
import org.texai.cmudict.domainEntity.CMUDictionaryEnglishWordForm;
import org.texai.kb.CacheInitializer;
import org.texai.kb.Constants;
import org.texai.kb.ejb.entity.AbstractReifiedTerm;
import org.texai.kb.ejb.entity.AtomicTerm;
import org.texai.kb.ejb.session.DomainEntityLoaderBean;
import org.texai.kb.ejb.session.DomainEntityPersisterBean;
import org.texai.kb.ejb.session.shared.AssociationFinderBean;
import org.texai.kb.ejb.session.shared.TermDefinitionAccessorBean;
import org.texai.kb.ejb.session.shared.TermDeleterFacadeBean;
import org.texai.kb.ejb.session.shared.TermFinderFacadeBean;
import org.texai.util.TexaiException;
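/**
* Command-line application that populates the knowledge base with the Carnegie Mellon University
* Pronouncing Dictionary: it first finds or creates the ARPABET phoneme domain entities and then
* imports the dictionary word forms together with their phoneme sequences.
*
* @author reed
*/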
public final class CMUDictionaryInitializer {
/** the entity manager factory */
private EntityManagerFactory entityManagerFactory;
/** the entity manager */
private EntityManager entityManager;
/** the domain entity loader */
private DomainEntityLoaderBean domainEntityLoader;
/** the domain entity persister */
private DomainEntityPersisterBean domainEntityPersister;
/** the term finder */
private TermFinderFacadeBean termFinderFacade;
/** the log4j logger */
private static final Logger LOGGER = Logger.getLogger(CMUDictionaryInitializer.class.getName());
/** the creator */
private AtomicTerm creator;
/** the creation purpose */
private AtomicTerm creationPurpose;
/** the dictionary of phoneme id --> phoneme instance */
private Map<String, ARPABETPhoneme> phonemeDictionary = new HashMap<String, ARPABETPhoneme>();
/**
* Creates a new instance of CMUDictionaryInitializer.
*/
public CMUDictionaryInitializer() {
super();
}
/**
 * Prepares this application for execution outside of a J2EE container: creates the entity manager
 * from the test persistence unit, initializes the caches, and then instantiates the session beans,
 * manually injecting the dependencies that a container would otherwise supply.
 */
private void initialize() {
  // persistence infrastructure
  this.entityManagerFactory = Persistence.createEntityManagerFactory(Constants.TEST_PERSISTENCE_UNIT_NAME);
  this.entityManager = this.entityManagerFactory.createEntityManager();
  CacheInitializer.initializeCaches();

  // the domain entity persister and the beans on which it depends
  this.domainEntityPersister = new DomainEntityPersisterBean();
  this.domainEntityPersister.setEntityManager(this.entityManager);

  this.termFinderFacade = new TermFinderFacadeBean();
  this.termFinderFacade.setEntityManager(this.entityManager);
  this.domainEntityPersister.setTermFinderFacade(this.termFinderFacade);

  final TermDeleterFacadeBean termDeleter = new TermDeleterFacadeBean();
  termDeleter.setEntityManager(this.entityManager);
  this.domainEntityPersister.setTermDeleterFacade(termDeleter);

  final AssociationFinderBean sharedAssociationFinder = new AssociationFinderBean();
  sharedAssociationFinder.setEntityManager(this.entityManager);
  this.domainEntityPersister.setAssociationFinder(sharedAssociationFinder);

  // note: the term definition accessor is not given the entity manager here
  final TermDefinitionAccessorBean sharedDefinitionAccessor = new TermDefinitionAccessorBean();
  this.domainEntityPersister.setTermDefinitionAccessor(sharedDefinitionAccessor);
  this.domainEntityPersister.injectSharedBeanDependencies();

  // the domain entity loader shares the finder, association finder and definition accessor
  this.domainEntityLoader = new DomainEntityLoaderBean();
  this.domainEntityLoader.setEntityManager(this.entityManager);
  this.domainEntityLoader.setTermFinderFacade(this.termFinderFacade);
  this.domainEntityLoader.setAssociationFinder(sharedAssociationFinder);
  this.domainEntityLoader.setTermDefinitionAccessor(sharedDefinitionAccessor);
  this.domainEntityLoader.injectSharedBeanDependencies();
}
/**
 * The names of the phonemes that are found or created after the bootstrap phoneme
 * {@code ARPABETPhoneme.PHONEME_AA}, listed in the order in which they are processed.
 */
private static final String[] SUBSEQUENT_PHONEME_NAMES = {
  ARPABETPhoneme.PHONEME_AA0,
  ARPABETPhoneme.PHONEME_AA1,
  ARPABETPhoneme.PHONEME_AA2,
  ARPABETPhoneme.PHONEME_AE,
  ARPABETPhoneme.PHONEME_AE0,
  ARPABETPhoneme.PHONEME_AE1,
  ARPABETPhoneme.PHONEME_AE2,
  ARPABETPhoneme.PHONEME_AH,
  ARPABETPhoneme.PHONEME_AH0,
  ARPABETPhoneme.PHONEME_AH1,
  ARPABETPhoneme.PHONEME_AH2,
  ARPABETPhoneme.PHONEME_AO,
  ARPABETPhoneme.PHONEME_AO0,
  ARPABETPhoneme.PHONEME_AO1,
  ARPABETPhoneme.PHONEME_AO2,
  ARPABETPhoneme.PHONEME_AW,
  ARPABETPhoneme.PHONEME_AW0,
  ARPABETPhoneme.PHONEME_AW1,
  ARPABETPhoneme.PHONEME_AW2,
  ARPABETPhoneme.PHONEME_AY,
  ARPABETPhoneme.PHONEME_AY0,
  ARPABETPhoneme.PHONEME_AY1,
  ARPABETPhoneme.PHONEME_AY2,
  ARPABETPhoneme.PHONEME_B,
  ARPABETPhoneme.PHONEME_CH,
  ARPABETPhoneme.PHONEME_D,
  ARPABETPhoneme.PHONEME_DH,
  ARPABETPhoneme.PHONEME_EH,
  ARPABETPhoneme.PHONEME_EH0,
  ARPABETPhoneme.PHONEME_EH1,
  ARPABETPhoneme.PHONEME_EH2,
  ARPABETPhoneme.PHONEME_ER,
  ARPABETPhoneme.PHONEME_ER0,
  ARPABETPhoneme.PHONEME_ER1,
  ARPABETPhoneme.PHONEME_ER2,
  ARPABETPhoneme.PHONEME_EY,
  ARPABETPhoneme.PHONEME_EY0,
  ARPABETPhoneme.PHONEME_EY1,
  ARPABETPhoneme.PHONEME_EY2,
  ARPABETPhoneme.PHONEME_F,
  ARPABETPhoneme.PHONEME_G,
  ARPABETPhoneme.PHONEME_HH,
  ARPABETPhoneme.PHONEME_IH,
  ARPABETPhoneme.PHONEME_IH0,
  ARPABETPhoneme.PHONEME_IH1,
  ARPABETPhoneme.PHONEME_IH2,
  ARPABETPhoneme.PHONEME_IY,
  ARPABETPhoneme.PHONEME_IY0,
  ARPABETPhoneme.PHONEME_IY1,
  ARPABETPhoneme.PHONEME_IY2,
  ARPABETPhoneme.PHONEME_JH,
  ARPABETPhoneme.PHONEME_K,
  ARPABETPhoneme.PHONEME_L,
  ARPABETPhoneme.PHONEME_M,
  ARPABETPhoneme.PHONEME_N,
  ARPABETPhoneme.PHONEME_NG,
  ARPABETPhoneme.PHONEME_OW,
  ARPABETPhoneme.PHONEME_OW0,
  ARPABETPhoneme.PHONEME_OW1,
  ARPABETPhoneme.PHONEME_OW2,
  ARPABETPhoneme.PHONEME_OY,
  ARPABETPhoneme.PHONEME_OY0,
  ARPABETPhoneme.PHONEME_OY1,
  ARPABETPhoneme.PHONEME_OY2,
  ARPABETPhoneme.PHONEME_P,
  ARPABETPhoneme.PHONEME_R,
  ARPABETPhoneme.PHONEME_S,
  ARPABETPhoneme.PHONEME_SH,
  ARPABETPhoneme.PHONEME_T,
  ARPABETPhoneme.PHONEME_TH,
  ARPABETPhoneme.PHONEME_UH,
  ARPABETPhoneme.PHONEME_UH0,
  ARPABETPhoneme.PHONEME_UH1,
  ARPABETPhoneme.PHONEME_UH2,
  ARPABETPhoneme.PHONEME_UW,
  ARPABETPhoneme.PHONEME_UW0,
  ARPABETPhoneme.PHONEME_UW1,
  ARPABETPhoneme.PHONEME_UW2,
  ARPABETPhoneme.PHONEME_V,
  ARPABETPhoneme.PHONEME_W,
  ARPABETPhoneme.PHONEME_Y,
  ARPABETPhoneme.PHONEME_Z,
  ARPABETPhoneme.PHONEME_ZH
};

/**
 * Initializes the phoneme instances.
 *
 * <p>Within a transaction this method finds or creates the creator term, the creation purpose
 * term and the CMU Pronouncing Dictionary context term, and then finds or creates every ARPABET
 * phoneme, recording each one in the phoneme dictionary keyed by its name. If any step fails,
 * the active transaction is rolled back before the exception propagates.
 */
private void initializePhonemes() {
  final EntityTransaction entityTransaction = entityManager.getTransaction();
  entityTransaction.begin();
  try {
    domainEntityPersister.setValidateWellFormedFormula(true);

    // the creator term: the initialization process
    final List<AbstractReifiedTerm> isaTerms = new ArrayList<AbstractReifiedTerm>();
    isaTerms.add(termFinderFacade.findAtomicTermByTermName(Constants.TERM_NAME_INDIVIDUAL));
    isaTerms.add(termFinderFacade.findAtomicTermByTermName(Constants.TERM_NAME_CYCLIST));
    creator = termFinderFacade.findOrCreateDefinedTerm(
        Constants.TERM_NAME_CMU_PRONOUNCING_DICTIONARY_INITIALIZATION_PROCESS,
        "the CMU Pronouncing Dictionary initialization process",
        "This is the process that initializes the CMU Pronouncing Dictionary.",
        isaTerms,
        termFinderFacade.findAtomicTermByTermName(Constants.TERM_NAME_SOME_CYCLIST),
        termFinderFacade.findAtomicTermByTermName(Constants.TERM_NAME_OPEN_CYC_PROJECT));

    // the creation purpose term: the initialization project
    isaTerms.clear();
    isaTerms.add(termFinderFacade.findAtomicTermByTermName(Constants.TERM_NAME_INDIVIDUAL));
    isaTerms.add(termFinderFacade.findAtomicTermByTermName(Constants.TERM_NAME_CYC_BASED_PROJECT));
    LOGGER.info("creator: " + creator);
    creationPurpose = termFinderFacade.findOrCreateDefinedTerm(
        Constants.TERM_NAME_CMU_PRONOUNCING_DICTIONARY_INITIALIZATION_PROJECT,
        "the CMU Pronouncing Dictionary initialization project",
        "This is the project that initializes the CMU Pronouncing Dictionary.",
        isaTerms,
        creator,
        termFinderFacade.findAtomicTermByTermName(Constants.TERM_NAME_OPEN_CYC_PROJECT));
    LOGGER.info("creationPurpose: " + creationPurpose);

    // the context term, created with empty isa and genlMt term lists
    final List<AbstractReifiedTerm> genlMtTerms = new ArrayList<AbstractReifiedTerm>();
    termFinderFacade.findOrCreateContextTerm(
        Constants.TERM_NAME_CMU_PRONOUNCING_DICTIONARY_CONTEXT,
        Constants.TERM_NAME_CMU_PRONOUNCING_DICTIONARY_CONTEXT,
        Constants.TERM_NAME_CMU_PRONOUNCING_DICTIONARY_CONTEXT
        + " is the context that contains the CMU Pronouncing Dictionary domain entities and their associations.",
        new ArrayList<AbstractReifiedTerm>(),
        genlMtTerms,
        creator,
        creationPurpose);

    // Bootstrap with the first phoneme. The phoneme name property may not exist yet, in which
    // case the first phoneme is persisted and committed and the property is looked up again
    // (presumably persisting the first phoneme defines the property -- confirm against the persister).
    AtomicTerm property = termFinderFacade.findAtomicTermByTermName(Constants.TERM_NAME_CXG_PHONEME_NAME);
    ARPABETPhoneme bootstrapPhoneme = null;
    if (property != null) {
      bootstrapPhoneme = (ARPABETPhoneme) domainEntityLoader.loadDomainEntityByIndentifyingPropertyValue(
          property,
          ARPABETPhoneme.PHONEME_AA,
          ARPABETPhoneme.class);
    }
    if (bootstrapPhoneme == null) {
      bootstrapPhoneme = new ARPABETPhoneme(ARPABETPhoneme.PHONEME_AA);
      domainEntityPersister.persistDomainEntity(bootstrapPhoneme, creator, creationPurpose);
      entityTransaction.commit();
      entityTransaction.begin();
      property = termFinderFacade.findAtomicTermByTermName(Constants.TERM_NAME_CXG_PHONEME_NAME);
    }
    // register the bootstrap phoneme like all the others, so that it can be looked up by name
    phonemeDictionary.put(ARPABETPhoneme.PHONEME_AA, bootstrapPhoneme);

    for (final String phonemeName : SUBSEQUENT_PHONEME_NAMES) {
      findOrCreatePhoneme(phonemeName, property);
    }
    entityTransaction.commit();
  } finally {
    // the transaction is still active only when a failure prevented the final commit
    if (entityTransaction.isActive()) {
      try {
        entityTransaction.rollback();
      } catch (final RuntimeException ex) {
        // do not mask the original failure with a rollback failure
        LOGGER.error("unable to roll back the phoneme initialization transaction", ex);
      }
    }
  }
}
/**
 * Looks up the persisted phoneme having the given name, creating and persisting a new one when
 * none is found, and records the resulting phoneme in the phoneme dictionary under its name.
 *
 * @param name the name of the phoneme
 * @param property the phoneme name domain property, found in advance for efficiency
 */
private void findOrCreatePhoneme(final String name, final AtomicTerm property) {
  final ARPABETPhoneme persistedPhoneme =
      (ARPABETPhoneme) domainEntityLoader.loadDomainEntityByIndentifyingPropertyValue(
          property,
          name,
          ARPABETPhoneme.class);
  if (persistedPhoneme != null) {
    // already persisted, so only register it
    phonemeDictionary.put(name, persistedPhoneme);
    return;
  }
  final ARPABETPhoneme createdPhoneme = new ARPABETPhoneme(name);
  domainEntityPersister.persistDomainEntity(createdPhoneme, creator, creationPurpose);
  phonemeDictionary.put(name, createdPhoneme);
}
/** the default location of the CMU Pronouncing Dictionary data file */
private static final String DEFAULT_CMU_DICTIONARY_PATH = "/home/reed/svn/CMUPronouncingDictionary/data/cmudict.0.6";

/** the number of dictionary entries processed between transaction commits */
private static final int NBR_WORD_FORMS_PER_COMMIT = 20;

/**
 * Imports the CMU phoneme dictionary from its default location, creating wordForm constructions
 * and attaching the phoneme sequences for each wordForm.
 */
private void importCMUPhonemeDictionary() {
  importCMUPhonemeDictionary(DEFAULT_CMU_DICTIONARY_PATH);
}

/**
 * Imports the CMU phoneme dictionary from the given file, creating wordForm constructions and
 * attaching the phoneme sequences for each wordForm.
 *
 * <p>Each processed line has the form {@code WORD  PH1 PH2 ...}; an alternative pronunciation is
 * marked by a parenthesized suffix on the word, e.g. {@code WORD(2)}. Lines that are empty or do
 * not begin with a letter are ignored, and lines lacking a pronunciation are skipped with a
 * warning. Work is committed in batches, and the entity manager is cleared after each batch.
 * The reader is always closed, and an uncommitted transaction is rolled back on failure.
 *
 * @param dictionaryPath the path of the CMU Pronouncing Dictionary data file
 * @throws TexaiException when the dictionary file cannot be found or read
 */
private void importCMUPhonemeDictionary(final String dictionaryPath) {
  int nbrAlternativePronounciations = 0;
  int nbrDictionaryWordForms = 0;
  EntityTransaction entityTransaction = entityManager.getTransaction();
  entityTransaction.begin();
  AtomicTerm property = termFinderFacade.findAtomicTermByTermName(Constants.TERM_NAME_CMU_WORD_FORM);
  BufferedReader bufferedReader = null;
  try {
    bufferedReader = new BufferedReader(new FileReader(dictionaryPath));
    String text;
    while ((text = bufferedReader.readLine()) != null) {
      LOGGER.debug("text: " + text);
      if (text.length() == 0 || !Character.isLetter(text.charAt(0))) {
        // comment, punctuation entry or blank line
        continue;
      }

      // split the line into the headword and its phoneme sequence
      final int spaceIndex = text.indexOf(' ');
      final String phonemeText = spaceIndex < 0 ? "" : text.substring(spaceIndex + 1).trim();
      if (phonemeText.length() == 0) {
        LOGGER.warn("skipping malformed entry without a pronunciation: '" + text + "'");
        continue;
      }
      final String headword = text.substring(0, spaceIndex);
      final int parenthesisIndex = headword.indexOf('(');
      final boolean isPrimaryPronounciation = parenthesisIndex < 0;
      if (!isPrimaryPronounciation) {
        LOGGER.debug(" alternative pronounciation " + text);
        nbrAlternativePronounciations++;
      }
      nbrDictionaryWordForms++;
      // strip the whole "(n)" suffix regardless of its width, and lower-case independently of the default locale
      final String wordForm = (isPrimaryPronounciation ? headword : headword.substring(0, parenthesisIndex))
          .toLowerCase(java.util.Locale.ENGLISH)
          .trim();
      LOGGER.info(" word form: " + wordForm + " " + nbrDictionaryWordForms);

      final List<ARPABETPhonemeAdapter> arpabetPhonemeAdapters =
          persistPhonemeAdapters(phonemeText.split("\\s+"), text);

      if (property == null) {
        // the word form property may not have existed before the first word form was persisted
        property = termFinderFacade.findAtomicTermByTermName(Constants.TERM_NAME_CMU_WORD_FORM);
      }
      CMUDictionaryEnglishWordForm cmuDictionaryEnglishWordForm = null;
      if (property != null) {
        // words such as 'aigner' are duplicated without the alternate syntax, so always look for a pre-existing persisted word form
        cmuDictionaryEnglishWordForm =
            (CMUDictionaryEnglishWordForm) domainEntityLoader.loadDomainEntityByIndentifyingPropertyValue(
                property,
                wordForm,
                CMUDictionaryEnglishWordForm.class);
      }
      if (cmuDictionaryEnglishWordForm == null) {
        final Set<ARPABETPronunciation> arpabetPronounciations = new HashSet<ARPABETPronunciation>();
        arpabetPronounciations.add(new ARPABETPronunciation(arpabetPhonemeAdapters, isPrimaryPronounciation));
        cmuDictionaryEnglishWordForm = new CMUDictionaryEnglishWordForm(wordForm, arpabetPronounciations);
      } else {
        // a previously persisted word form gains this entry as a non-primary pronunciation
        // TODO (from the original author): handle not found
        cmuDictionaryEnglishWordForm.getARPABETPronunciations().add(new ARPABETPronunciation(arpabetPhonemeAdapters, false));
      }
      LOGGER.debug(" " + cmuDictionaryEnglishWordForm);
      domainEntityPersister.persistDomainEntity(cmuDictionaryEnglishWordForm, creator, creationPurpose);

      if ((nbrDictionaryWordForms % NBR_WORD_FORMS_PER_COMMIT) == 0) {
        LOGGER.info("*** committing ***");
        entityTransaction.commit();
        entityManager.clear();
        domainEntityPersister.setValidateWellFormedFormula(false);
        entityTransaction = entityManager.getTransaction();
        entityTransaction.begin();
        // the entity manager was cleared, so find the creator and creation purpose terms again
        // NOTE(review): property is not found again here and may be detached -- confirm that is acceptable
        creator = termFinderFacade.findAtomicTermByTermName(Constants.TERM_NAME_CMU_PRONOUNCING_DICTIONARY_INITIALIZATION_PROCESS);
        creationPurpose = termFinderFacade.findAtomicTermByTermName(Constants.TERM_NAME_CMU_PRONOUNCING_DICTIONARY_INITIALIZATION_PROJECT);
      }
    }
    entityTransaction.commit();
  } catch (IOException ex) {
    // also covers FileNotFoundException, which is an IOException
    throw new TexaiException(ex);
  } finally {
    if (bufferedReader != null) {
      try {
        bufferedReader.close();
      } catch (IOException ex) {
        LOGGER.warn("unable to close " + dictionaryPath, ex);
      }
    }
    // the transaction is still active only when a failure prevented the final commit
    final EntityTransaction pendingTransaction = entityManager.getTransaction();
    if (pendingTransaction.isActive()) {
      try {
        pendingTransaction.rollback();
      } catch (final RuntimeException ex) {
        // do not mask the original failure with a rollback failure
        LOGGER.error("unable to roll back the dictionary import transaction", ex);
      }
    }
  }
  LOGGER.info("\ntotal alternative pronounciations " + nbrAlternativePronounciations);
  LOGGER.info("total CMU Pronuncing Dictionary word forms " + nbrDictionaryWordForms);
}

/**
 * Creates and persists a phoneme adapter for each named phoneme that is present in the phoneme
 * dictionary. Unknown phoneme names are logged and skipped, but still occupy their one-based
 * position in the sequence.
 *
 * @param phonemeNames the phoneme names in pronunciation order
 * @param text the dictionary line being processed, used for logging only
 * @return the persisted phoneme adapters, in pronunciation order
 */
private List<ARPABETPhonemeAdapter> persistPhonemeAdapters(final String[] phonemeNames, final String text) {
  final List<ARPABETPhonemeAdapter> arpabetPhonemeAdapters = new ArrayList<ARPABETPhonemeAdapter>();
  int phonemePosition = 0;
  for (final String phonemeName : phonemeNames) {
    phonemePosition++;
    LOGGER.debug(" phonemeName: " + phonemeName);
    final ARPABETPhoneme registeredPhoneme = phonemeDictionary.get(phonemeName);
    if (registeredPhoneme == null) {
      LOGGER.debug(" missing phonemeName: " + phonemeName + " text: '" + text + "'");
      continue;
    }
    // load the phoneme again by its term id, because the entity manager is periodically cleared
    final ARPABETPhoneme phoneme = (ARPABETPhoneme) domainEntityLoader.loadDomainEntity(registeredPhoneme.getTermId());
    final ARPABETPhonemeAdapter arpabetPhonemeAdapter = new ARPABETPhonemeAdapter(phoneme, phonemePosition);
    LOGGER.debug(" " + arpabetPhonemeAdapter);
    domainEntityPersister.persistDomainEntity(arpabetPhonemeAdapter, creator, creationPurpose);
    arpabetPhonemeAdapters.add(arpabetPhonemeAdapter);
  }
  return arpabetPhonemeAdapters;
}
/**
 * Finalizes this application by shutting down the cache manager and closing the entity manager
 * and the entity manager factory. Each resource is released even when releasing an earlier one
 * fails; in that case the failure propagates and the completion message is not logged.
 */
private void finalization() {
  try {
    CacheManager.getInstance().shutdown();
  } finally {
    try {
      entityManager.close();
    } finally {
      entityManagerFactory.close();
    }
  }
  LOGGER.info("CMUDictionaryInitializer completed");
}
/**
 * Executes this application: initializes the persistence objects and session beans, creates the
 * phonemes, imports the dictionary, and finally releases the resources.
 *
 * <p>A {@code TexaiException} or {@code NullPointerException} raised during processing is logged
 * with its stack trace and also printed to the standard error stream. The finalization step runs
 * whether or not processing succeeded, including when some other exception propagates.
 *
 * @param args the command line arguments (unused)
 */
public static void main(final String[] args) {
  final CMUDictionaryInitializer phonemeInitialization = new CMUDictionaryInitializer();
  // initialization stays outside the try block: the resources that finalization releases
  // exist only after it has succeeded
  phonemeInitialization.initialize();
  try {
    phonemeInitialization.initializePhonemes();
    phonemeInitialization.importCMUPhonemeDictionary();
  } catch (final TexaiException ex) {
    LOGGER.error("CMU Pronouncing Dictionary initialization failed", ex);
    ex.printStackTrace(System.err);
  } catch (final NullPointerException ex) {
    LOGGER.error("CMU Pronouncing Dictionary initialization failed", ex);
    ex.printStackTrace(System.err);
  } finally {
    phonemeInitialization.finalization();
  }
}
}
// See more files for this project here