Code Search for Developers
 
 
  

CMUDictionaryInitializer.java from Texai at Krugle


Show CMUDictionaryInitializer.java syntax highlighted

package org.texai.cmudict;
/*
 * CMUDictionaryInitializer.java
 *
 * Created on February 9, 2007, 1:36 PM
 *
 * Description: Creates Carnegie Mellon University Pronouncing Dictionary word domain entities and their associated phoneme domain entities.
 *
 * Copyright (C) 2007 Stephen L. Reed.
 *
 * This program is free software; you can redistribute it and/or modify it under the terms
 * of the GNU General Public License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
 * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 * See the GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along with this program;
 * if not, write to the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

import java.io.BufferedReader;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import javax.persistence.EntityManager;
import javax.persistence.EntityManagerFactory;
import javax.persistence.EntityTransaction;
import javax.persistence.Persistence;
import net.sf.ehcache.CacheManager;
import org.apache.log4j.Logger;
import org.texai.cmudict.domainEntity.ARPABETPhoneme;
import org.texai.cmudict.domainEntity.ARPABETPhonemeAdapter;
import org.texai.cmudict.domainEntity.ARPABETPronunciation;
import org.texai.cmudict.domainEntity.CMUDictionaryEnglishWordForm;
import org.texai.kb.CacheInitializer;
import org.texai.kb.Constants;
import org.texai.kb.ejb.entity.AbstractReifiedTerm;
import org.texai.kb.ejb.entity.AtomicTerm;
import org.texai.kb.ejb.session.DomainEntityLoaderBean;
import org.texai.kb.ejb.session.DomainEntityPersisterBean;
import org.texai.kb.ejb.session.shared.AssociationFinderBean;
import org.texai.kb.ejb.session.shared.TermDefinitionAccessorBean;
import org.texai.kb.ejb.session.shared.TermDeleterFacadeBean;
import org.texai.kb.ejb.session.shared.TermFinderFacadeBean;
import org.texai.util.TexaiException;

/**
 * Command line application that creates the Carnegie Mellon University Pronouncing Dictionary
 * word form domain entities and their associated ARPABET phoneme domain entities.
 *
 * <p>The session beans are wired together by hand in {@link #initialize()} so that they can run
 * outside of a J2EE container.  Execution proceeds in three steps: the phoneme entities are found
 * or created, the dictionary file is imported line by line, and finally the caches and the
 * persistence resources are released.
 *
 * @author reed
 */
public final class CMUDictionaryInitializer {

  /** the log4j logger */
  private static final Logger LOGGER = Logger.getLogger(CMUDictionaryInitializer.class.getName());

  /** the dictionary file that is imported when no path is given on the command line */
  private static final String DEFAULT_DICTIONARY_PATH = "/home/reed/svn/CMUPronouncingDictionary/data/cmudict.0.6";

  /** the names of the phonemes that are found or created after {@code PHONEME_AA}, in creation order */
  private static final String[] REMAINING_PHONEME_NAMES = {
    ARPABETPhoneme.PHONEME_AA0,
    ARPABETPhoneme.PHONEME_AA1,
    ARPABETPhoneme.PHONEME_AA2,
    ARPABETPhoneme.PHONEME_AE,
    ARPABETPhoneme.PHONEME_AE0,
    ARPABETPhoneme.PHONEME_AE1,
    ARPABETPhoneme.PHONEME_AE2,
    ARPABETPhoneme.PHONEME_AH,
    ARPABETPhoneme.PHONEME_AH0,
    ARPABETPhoneme.PHONEME_AH1,
    ARPABETPhoneme.PHONEME_AH2,
    ARPABETPhoneme.PHONEME_AO,
    ARPABETPhoneme.PHONEME_AO0,
    ARPABETPhoneme.PHONEME_AO1,
    ARPABETPhoneme.PHONEME_AO2,
    ARPABETPhoneme.PHONEME_AW,
    ARPABETPhoneme.PHONEME_AW0,
    ARPABETPhoneme.PHONEME_AW1,
    ARPABETPhoneme.PHONEME_AW2,
    ARPABETPhoneme.PHONEME_AY,
    ARPABETPhoneme.PHONEME_AY0,
    ARPABETPhoneme.PHONEME_AY1,
    ARPABETPhoneme.PHONEME_AY2,
    ARPABETPhoneme.PHONEME_B,
    ARPABETPhoneme.PHONEME_CH,
    ARPABETPhoneme.PHONEME_D,
    ARPABETPhoneme.PHONEME_DH,
    ARPABETPhoneme.PHONEME_EH,
    ARPABETPhoneme.PHONEME_EH0,
    ARPABETPhoneme.PHONEME_EH1,
    ARPABETPhoneme.PHONEME_EH2,
    ARPABETPhoneme.PHONEME_ER,
    ARPABETPhoneme.PHONEME_ER0,
    ARPABETPhoneme.PHONEME_ER1,
    ARPABETPhoneme.PHONEME_ER2,
    ARPABETPhoneme.PHONEME_EY,
    ARPABETPhoneme.PHONEME_EY0,
    ARPABETPhoneme.PHONEME_EY1,
    ARPABETPhoneme.PHONEME_EY2,
    ARPABETPhoneme.PHONEME_F,
    ARPABETPhoneme.PHONEME_G,
    ARPABETPhoneme.PHONEME_HH,
    ARPABETPhoneme.PHONEME_IH,
    ARPABETPhoneme.PHONEME_IH0,
    ARPABETPhoneme.PHONEME_IH1,
    ARPABETPhoneme.PHONEME_IH2,
    ARPABETPhoneme.PHONEME_IY,
    ARPABETPhoneme.PHONEME_IY0,
    ARPABETPhoneme.PHONEME_IY1,
    ARPABETPhoneme.PHONEME_IY2,
    ARPABETPhoneme.PHONEME_JH,
    ARPABETPhoneme.PHONEME_K,
    ARPABETPhoneme.PHONEME_L,
    ARPABETPhoneme.PHONEME_M,
    ARPABETPhoneme.PHONEME_N,
    ARPABETPhoneme.PHONEME_NG,
    ARPABETPhoneme.PHONEME_OW,
    ARPABETPhoneme.PHONEME_OW0,
    ARPABETPhoneme.PHONEME_OW1,
    ARPABETPhoneme.PHONEME_OW2,
    ARPABETPhoneme.PHONEME_OY,
    ARPABETPhoneme.PHONEME_OY0,
    ARPABETPhoneme.PHONEME_OY1,
    ARPABETPhoneme.PHONEME_OY2,
    ARPABETPhoneme.PHONEME_P,
    ARPABETPhoneme.PHONEME_R,
    ARPABETPhoneme.PHONEME_S,
    ARPABETPhoneme.PHONEME_SH,
    ARPABETPhoneme.PHONEME_T,
    ARPABETPhoneme.PHONEME_TH,
    ARPABETPhoneme.PHONEME_UH,
    ARPABETPhoneme.PHONEME_UH0,
    ARPABETPhoneme.PHONEME_UH1,
    ARPABETPhoneme.PHONEME_UH2,
    ARPABETPhoneme.PHONEME_UW,
    ARPABETPhoneme.PHONEME_UW0,
    ARPABETPhoneme.PHONEME_UW1,
    ARPABETPhoneme.PHONEME_UW2,
    ARPABETPhoneme.PHONEME_V,
    ARPABETPhoneme.PHONEME_W,
    ARPABETPhoneme.PHONEME_Y,
    ARPABETPhoneme.PHONEME_Z,
    ARPABETPhoneme.PHONEME_ZH
  };

  /** the entity manager factory */
  private EntityManagerFactory entityManagerFactory;

  /** the entity manager */
  private EntityManager entityManager;

  /** the domain entity loader */
  private DomainEntityLoaderBean domainEntityLoader;

  /** the domain entity persister */
  private DomainEntityPersisterBean domainEntityPersister;

  /** the term finder */
  private TermFinderFacadeBean termFinderFacade;

  /** the creator */
  private AtomicTerm creator;

  /** the creation purpose */
  private AtomicTerm creationPurpose;

  /** the dictionary of phoneme id --> phoneme instance */
  private final Map<String, ARPABETPhoneme> phonemeDictionary = new HashMap<String, ARPABETPhoneme>();

  /**
   * Creates a new instance of CMUDictionaryInitializer.
   */
  public CMUDictionaryInitializer() {
    super();
  }

  /** Initializes the application and injects the dependencies for out-of-the-container execution of J2EE session beans. */
  private void initialize() {
    entityManagerFactory = Persistence.createEntityManagerFactory(Constants.TEST_PERSISTENCE_UNIT_NAME);
    entityManager = entityManagerFactory.createEntityManager();
    CacheInitializer.initializeCaches();

    // the shared beans, each backed by the one entity manager
    termFinderFacade = new TermFinderFacadeBean();
    termFinderFacade.setEntityManager(entityManager);
    final TermDeleterFacadeBean termDeleterFacade = new TermDeleterFacadeBean();
    termDeleterFacade.setEntityManager(entityManager);
    final AssociationFinderBean associationFinder = new AssociationFinderBean();
    associationFinder.setEntityManager(entityManager);
    final TermDefinitionAccessorBean termDefinitionAccessor = new TermDefinitionAccessorBean();

    // the persister
    domainEntityPersister = new DomainEntityPersisterBean();
    domainEntityPersister.setEntityManager(entityManager);
    domainEntityPersister.setTermFinderFacade(termFinderFacade);
    domainEntityPersister.setTermDeleterFacade(termDeleterFacade);
    domainEntityPersister.setAssociationFinder(associationFinder);
    domainEntityPersister.setTermDefinitionAccessor(termDefinitionAccessor);
    domainEntityPersister.injectSharedBeanDependencies();

    // the loader
    domainEntityLoader = new DomainEntityLoaderBean();
    domainEntityLoader.setEntityManager(entityManager);
    domainEntityLoader.setTermFinderFacade(termFinderFacade);
    domainEntityLoader.setAssociationFinder(associationFinder);
    domainEntityLoader.setTermDefinitionAccessor(termDefinitionAccessor);
    domainEntityLoader.injectSharedBeanDependencies();
  }

  /**
   * Initializes the phoneme instances.  Finds or creates the creator term, the creation purpose term
   * and the dictionary context term, and then finds or creates every ARPABET phoneme, recording each
   * one in the phoneme dictionary.
   */
  private void initializePhonemes() {
    final EntityTransaction entityTransaction = entityManager.getTransaction();
    entityTransaction.begin();
    domainEntityPersister.setValidateWellFormedFormula(true);
    final List<AbstractReifiedTerm> isaTerms = new ArrayList<AbstractReifiedTerm>();
    isaTerms.add(termFinderFacade.findAtomicTermByTermName(Constants.TERM_NAME_INDIVIDUAL));
    isaTerms.add(termFinderFacade.findAtomicTermByTermName(Constants.TERM_NAME_CYCLIST));
    creator = termFinderFacade.findOrCreateDefinedTerm(
            Constants.TERM_NAME_CMU_PRONOUNCING_DICTIONARY_INITIALIZATION_PROCESS,
            "the CMU Pronouncing Dictionary initialization process",
            "This is the process that initializes the CMU Pronouncing Dictionary.",
            isaTerms,
            termFinderFacade.findAtomicTermByTermName(Constants.TERM_NAME_SOME_CYCLIST),
            termFinderFacade.findAtomicTermByTermName(Constants.TERM_NAME_OPEN_CYC_PROJECT));
    isaTerms.clear();
    isaTerms.add(termFinderFacade.findAtomicTermByTermName(Constants.TERM_NAME_INDIVIDUAL));
    isaTerms.add(termFinderFacade.findAtomicTermByTermName(Constants.TERM_NAME_CYC_BASED_PROJECT));
    LOGGER.info("creator: " + creator);
    creationPurpose = termFinderFacade.findOrCreateDefinedTerm(
            Constants.TERM_NAME_CMU_PRONOUNCING_DICTIONARY_INITIALIZATION_PROJECT,
            "the CMU Pronouncing Dictionary initialization project",
            "This is the project that initializes the CMU Pronouncing Dictionary.",
            isaTerms,
            creator,
            termFinderFacade.findAtomicTermByTermName(Constants.TERM_NAME_OPEN_CYC_PROJECT));
    LOGGER.info("creationPurpose: " + creationPurpose);
    final List<AbstractReifiedTerm> genlMtTerms = new ArrayList<AbstractReifiedTerm>();
    termFinderFacade.findOrCreateContextTerm(
            Constants.TERM_NAME_CMU_PRONOUNCING_DICTIONARY_CONTEXT,
            Constants.TERM_NAME_CMU_PRONOUNCING_DICTIONARY_CONTEXT,
            Constants.TERM_NAME_CMU_PRONOUNCING_DICTIONARY_CONTEXT
            + " is the context that contains the CMU Pronouncing Dictionary domain entities and their associations.",
            new ArrayList<AbstractReifiedTerm>(),
            genlMtTerms,
            creator,
            creationPurpose);

    // The first phoneme is handled separately because the phoneme name property term may not exist
    // until a phoneme has been persisted and committed; the property is then looked up again.
    AtomicTerm property = termFinderFacade.findAtomicTermByTermName(Constants.TERM_NAME_CXG_PHONEME_NAME);
    ARPABETPhoneme phoneme = null;
    if (property != null) {
      phoneme = (ARPABETPhoneme) domainEntityLoader.loadDomainEntityByIndentifyingPropertyValue(
              property,
              ARPABETPhoneme.PHONEME_AA,
              ARPABETPhoneme.class);
    }
    if (phoneme == null) {
      phoneme = new ARPABETPhoneme(ARPABETPhoneme.PHONEME_AA);
      domainEntityPersister.persistDomainEntity(phoneme, creator, creationPurpose);
      entityTransaction.commit();
      entityTransaction.begin();
      property = termFinderFacade.findAtomicTermByTermName(Constants.TERM_NAME_CXG_PHONEME_NAME);
    }
    // record the first phoneme too, so that it can be looked up like all the others during the import
    phonemeDictionary.put(ARPABETPhoneme.PHONEME_AA, phoneme);

    for (final String phonemeName : REMAINING_PHONEME_NAMES) {
      findOrCreatePhoneme(phonemeName, property);
    }
    entityTransaction.commit();
  }

  /** Finds or creates the phoneme with the given name, and records it in the phoneme dictionary.
   *
   * @param name the name of the phoneme
   * @param property the phoneme name domain property, found in advance for efficiency
   */
  private void findOrCreatePhoneme(final String name, final AtomicTerm property) {
    ARPABETPhoneme phoneme = (ARPABETPhoneme) domainEntityLoader.loadDomainEntityByIndentifyingPropertyValue(
            property,
            name,
            ARPABETPhoneme.class);
    if (phoneme == null) {
      phoneme = new ARPABETPhoneme(name);
      domainEntityPersister.persistDomainEntity(phoneme, creator, creationPurpose);
    }
    phonemeDictionary.put(name, phoneme);
  }

  /**
   * Imports the CMU phoneme dictionary, creating wordForm constructions and attaching the phoneme sequences for each wordForm.
   *
   * <p>Lines that are empty or that do not begin with a letter are ignored.  Every other line is expected
   * to hold a word form, optionally followed by a parenthesized alternative number, then one or two spaces
   * and the space-separated phoneme names.  Lines without a pronunciation are skipped with a warning.
   * The work is committed after every 20 dictionary entries.
   *
   * @param dictionaryPath the path of the CMU Pronouncing Dictionary file
   * @throws TexaiException when the dictionary file cannot be found or read
   */
  private void importCMUPhonemeDictionary(final String dictionaryPath) {
    int nbrAlternativePronounciations = 0;
    int nbrDictionaryWordForms = 0;
    EntityTransaction entityTransaction = entityManager.getTransaction();
    entityTransaction.begin();
    AtomicTerm property = termFinderFacade.findAtomicTermByTermName(Constants.TERM_NAME_CMU_WORD_FORM);
    try {
      final BufferedReader bufferedReader = new BufferedReader(new FileReader(dictionaryPath));
      try {
        String text;
        while ((text = bufferedReader.readLine()) != null) {
          LOGGER.debug("text: " + text);
          if (text.length() == 0 || !Character.isLetter(text.charAt(0))) {
            continue;
          }

          // locate the start of the pronunciation, which follows the first space and an optional second one
          final int spaceIndex = text.indexOf(' ');
          int pronunciationIndex = spaceIndex + 1;
          if (pronunciationIndex < text.length() && text.charAt(pronunciationIndex) == ' ') {
            pronunciationIndex++;
          }
          if (spaceIndex < 0 || pronunciationIndex >= text.length()) {
            LOGGER.warn("skipping dictionary entry without a pronunciation: '" + text + "'");
            continue;
          }

          // an alternative pronunciation is marked by a parenthesized number appended to the word form
          final int parenthesisIndex = text.indexOf('(');
          final boolean isPrimaryPronounciation = parenthesisIndex < 0 || parenthesisIndex > spaceIndex;
          if (!isPrimaryPronounciation) {
            LOGGER.debug("  alternative pronounciation " + text);
            nbrAlternativePronounciations++;
          }
          nbrDictionaryWordForms++;
          final int wordFormEndIndex = isPrimaryPronounciation ? spaceIndex : parenthesisIndex;
          // a fixed locale keeps the lower-cased word forms independent of the platform default locale
          final String wordForm = text.substring(0, wordFormEndIndex).toLowerCase(Locale.ENGLISH).trim();
          LOGGER.info("  word form: " + wordForm + "   " + nbrDictionaryWordForms);
          if (text.charAt(pronunciationIndex) == ' ') {
            LOGGER.debug(" text: '" + text + "'");
          }

          final List<ARPABETPhonemeAdapter> arpabetPhonemeAdapters = persistPhonemeAdapters(text, pronunciationIndex);
          if (property == null) {
            property = termFinderFacade.findAtomicTermByTermName(Constants.TERM_NAME_CMU_WORD_FORM);
          }
          CMUDictionaryEnglishWordForm cmuDictionaryEnglishWordForm = null;
          if (property != null) {
            // words such as 'aigner' are duplicated without the alternate syntax, so always look for a pre-existing persisted word form
            cmuDictionaryEnglishWordForm =
                    (CMUDictionaryEnglishWordForm) domainEntityLoader.loadDomainEntityByIndentifyingPropertyValue(
                    property,
                    wordForm,
                    CMUDictionaryEnglishWordForm.class);
          }
          if (cmuDictionaryEnglishWordForm == null) {
            final Set<ARPABETPronunciation> arpabetPronounciations = new HashSet<ARPABETPronunciation>();
            arpabetPronounciations.add(new ARPABETPronunciation(arpabetPhonemeAdapters, isPrimaryPronounciation));
            cmuDictionaryEnglishWordForm = new CMUDictionaryEnglishWordForm(wordForm, arpabetPronounciations);
          } else {
            // the word form already exists, so this pronunciation is added to it as a non-primary one
            cmuDictionaryEnglishWordForm.getARPABETPronunciations().add(new ARPABETPronunciation(arpabetPhonemeAdapters, false));
          }
          LOGGER.debug("    " + cmuDictionaryEnglishWordForm);
          domainEntityPersister.persistDomainEntity(cmuDictionaryEnglishWordForm, creator, creationPurpose);

          if ((nbrDictionaryWordForms % 20) == 0) {
            LOGGER.info("*** committing ***");
            entityTransaction.commit();
            entityManager.clear();
            domainEntityPersister.setValidateWellFormedFormula(false);
            entityTransaction = entityManager.getTransaction();
            entityTransaction.begin();
            // the entity manager was cleared, so the creator and creation purpose terms are looked up again
            creator = termFinderFacade.findAtomicTermByTermName(Constants.TERM_NAME_CMU_PRONOUNCING_DICTIONARY_INITIALIZATION_PROCESS);
            creationPurpose = termFinderFacade.findAtomicTermByTermName(Constants.TERM_NAME_CMU_PRONOUNCING_DICTIONARY_INITIALIZATION_PROJECT);
          }
        }
      } finally {
        // close the reader whether or not the import succeeded
        bufferedReader.close();
      }
    } catch (FileNotFoundException ex) {
      throw new TexaiException(ex);
    } catch (IOException ex) {
      throw new TexaiException(ex);
    }
    entityTransaction.commit();
    LOGGER.info("\ntotal alternative pronounciations " + nbrAlternativePronounciations);
    LOGGER.info("total CMU Pronuncing Dictionary word forms " + nbrDictionaryWordForms);
  }

  /** Persists a phoneme adapter for each known phoneme named in the pronunciation part of the given dictionary line.
   *
   * @param text the dictionary line
   * @param pronunciationIndex the index within the line at which the space-separated phoneme names begin
   * @return the persisted phoneme adapters, in the order of their phonemes within the line
   */
  private List<ARPABETPhonemeAdapter> persistPhonemeAdapters(final String text, final int pronunciationIndex) {
    final List<ARPABETPhonemeAdapter> arpabetPhonemeAdapters = new ArrayList<ARPABETPhonemeAdapter>();
    int phonemePosition = 0;
    for (final String phonemeName : text.substring(pronunciationIndex).split(" ")) {
      // the position counts every name in the line, including the names that are not known phonemes
      phonemePosition++;
      LOGGER.debug("    phonemeName: " + phonemeName);
      final ARPABETPhoneme knownPhoneme = phonemeDictionary.get(phonemeName);
      if (knownPhoneme == null) {
        LOGGER.debug("    missing phonemeName: " + phonemeName + " text: '" + text + "'");
        continue;
      }
      // NOTE(review): the phoneme is reloaded by its term id, presumably because the entity manager is
      // cleared after each commit -- confirm
      final ARPABETPhoneme phoneme = (ARPABETPhoneme) domainEntityLoader.loadDomainEntity(knownPhoneme.getTermId());
      final ARPABETPhonemeAdapter arpabetPhonemeAdapter = new ARPABETPhonemeAdapter(phoneme, phonemePosition);
      LOGGER.debug("    " + arpabetPhonemeAdapter);
      domainEntityPersister.persistDomainEntity(arpabetPhonemeAdapter, creator, creationPurpose);
      arpabetPhonemeAdapters.add(arpabetPhonemeAdapter);
    }
    return arpabetPhonemeAdapters;
  }

  /**
   * Finalizes this application.  Rolls back a transaction that a failure left active, and then shuts
   * down the cache manager and closes the entity manager and its factory.
   */
  private void finalization() {
    try {
      final EntityTransaction entityTransaction = entityManager.getTransaction();
      if (entityTransaction.isActive()) {
        // a failure interrupted the work, so discard the uncommitted changes explicitly
        entityTransaction.rollback();
      }
    } finally {
      CacheManager.getInstance().shutdown();
      entityManager.close();
      entityManagerFactory.close();
    }
    LOGGER.info("CMUDictionaryInitializer completed");
  }

  /** Executes this application.
   *
   * @param args the command line arguments: the optional first argument is the path of the CMU Pronouncing
   * Dictionary file, which defaults to the developer's original location when omitted
   */
  public static void main(final String[] args) {
    final String dictionaryPath = (args != null && args.length > 0) ? args[0] : DEFAULT_DICTIONARY_PATH;
    final CMUDictionaryInitializer phonemeInitialization = new CMUDictionaryInitializer();
    phonemeInitialization.initialize();
    try {
      phonemeInitialization.initializePhonemes();
      phonemeInitialization.importCMUPhonemeDictionary(dictionaryPath);
    } catch (final TexaiException ex) {
      LOGGER.error(ex);
      ex.printStackTrace(System.err);
    } catch (final NullPointerException ex) {
      LOGGER.error(ex);
      ex.printStackTrace(System.err);
    } finally {
      // release the caches and persistence resources even when an unexpected exception escapes
      phonemeInitialization.finalization();
    }
  }
}




See more files for this project here

Texai

Texai is a chatbot that intelligently seeks to acquire knowledge and friendly behaviors.

Project homepage: http://sourceforge.net/projects/texai
Programming language(s): Java,Shell Script,XML
License: other

  domainEntity/
    ARPABETPhoneme.java
    ARPABETPhonemeAdapter.java
    ARPABETPronunciation.java
    CMUDictionaryEnglishWordForm.java
    package-info.java
  CMUDictionaryInitializer.java