Code Search for Developers
 
 
  

LexiconInitializer.java from Texai at Krugle


Show LexiconInitializer.java syntax highlighted

/*
 * LexiconInitializer.java
 *
 * Created on February 22, 2007, 10:46 AM
 *
 * Description: Provides a lexicon initializer and maintainer that merges the following imported machine-readable dictionaries:
 * WordNet 2.1, Wiktionary, OpenCyc, and the CMU Pronouncing Dictionary.
 * <P>
 * WordNet has fine-grained word senses and its word senses take precedence over Wiktionary, unless WordNet has a missing speech part
 * definition that Wiktionary provides.  The CMU Pronouncing Dictionary and Wiktionary provide pronunciations for word forms.
 *
 * Copyright (C) 2007 Stephen L. Reed.
 *
 * This program is free software; you can redistribute it and/or modify it under the terms
 * of the GNU General Public License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
 * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 * See the GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along with this program;
 * if not, write to the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

package org.texai.lexicon;

import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.Set;
import javax.persistence.EntityManager;
import javax.persistence.EntityManagerFactory;
import javax.persistence.EntityTransaction;
import javax.persistence.Persistence;
import net.sf.ehcache.CacheManager;
import org.apache.log4j.Logger;
import org.texai.cmudict.domainEntity.ARPABETPronunciation;
import org.texai.cmudict.domainEntity.CMUDictionaryEnglishWordForm;
import org.texai.kb.CacheInitializer;
import org.texai.kb.Constants;
import org.texai.kb.ejb.entity.AbstractReifiedTerm;
import org.texai.kb.ejb.entity.AtomicTerm;
import org.texai.kb.ejb.entity.BinaryGAF;
import org.texai.kb.ejb.entity.PString;
import org.texai.kb.ejb.session.DomainEntityManagerBean;
import org.texai.lexicon.domainEntity.TexaiEnglishWord;
import org.texai.lexicon.domainEntity.TexaiEnglishWordForm;
import org.texai.lexicon.domainEntity.TexaiEnglishWordSense;
import org.texai.lexicon.domainEntity.TexaiSamplePhrase;
import org.texai.util.TexaiException;
import org.texai.wiktionary.domainEntity.WiktionaryEnglishWord;
import org.texai.wiktionary.domainEntity.WiktionaryEnglishWordForm;
import org.texai.wiktionary.domainEntity.WiktionaryEnglishWordSense;
import org.texai.wiktionary.domainEntity.WiktionarySamplePhrase;
import org.texai.wordnet.domain.entity.WordNetCasedEnglishWord;
import org.texai.wordnet.domain.entity.WordNetEnglishWord;
import org.texai.wordnet.domain.entity.WordNetSamplePhraseItem;
import org.texai.wordnet.domain.entity.WordNetSynset;
import org.texai.wordnet.domain.entity.WordNetWordSense;

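/**
 * Initializes and maintains the Texai English lexicon by merging the imported WordNet, Wiktionary
 * and CMU Pronouncing Dictionary domain entities held in the knowledge base.
 *
 * @author reed
 */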
public final class LexiconInitializer {
  
  /** the log4j logger */
  private static final Logger LOGGER = Logger.getLogger(LexiconInitializer.class.getName());
  
  /** the JPA entity manager factory, created by initialize() */
  private EntityManagerFactory entityManagerFactory;
  
  /** the JPA entity manager, created by initialize() and used to obtain the entity transactions */
  private EntityManager entityManager;
  
  /** the domain entity manager, through which KB terms are found and domain entities are iterated and persisted */
  private DomainEntityManagerBean domainEntityManager;
  
  /** the current entity transaction, re-obtained from the entity manager at the start of each acquisition step */
  private EntityTransaction entityTransaction;
  
  /** the creator term for the lexicon initialization process, assigned by initializeKBTerms() */
  private AtomicTerm creator;
  
  /** the creation purpose term for the lexicon initialization project, assigned by initializeKBTerms() */
  private AtomicTerm creationPurpose;
  
  /** the number of English words acquired, i.e. the number of Texai English words created */
  private int nbrEnglishWordsAcquired = 0;
  
  /**
   * the number of items processed so far; the acquisition steps increment it once per word or word sense handled
   * and call commit(...) each time it reaches a multiple of 20
   */
  private int nbrEnglishWordsProcessed = 0;
  
  /**
   * the set of speech parts that have a corresponding word form created for the word currently being processed;
   * it is cleared before each word's WordNet senses are acquired
   */
  private Set<AtomicTerm> speechPartsHavingWordForm = new HashSet<AtomicTerm>();
  
  /** Creates a new instance of LexiconInitializer. */
  public LexiconInitializer() {
    super();
  }
  
  /**
   * Prepares this object for execution outside of a J2EE container: creates the entity manager factory and
   * entity manager for the test persistence unit, initializes the caches, and then builds the domain entity
   * manager bean, handing it the entity manager and letting it inject its shared bean dependencies.
   */
  private void initialize() {
    this.entityManagerFactory = Persistence.createEntityManagerFactory(Constants.TEST_PERSISTENCE_UNIT_NAME);
    this.entityManager = this.entityManagerFactory.createEntityManager();
    CacheInitializer.initializeCaches();

    // the bean is published in the field before it is configured, exactly one instance is created
    final DomainEntityManagerBean bean = new DomainEntityManagerBean();
    this.domainEntityManager = bean;
    bean.setEntityManager(this.entityManager);
    bean.injectSharedBeanDependencies();
  }
  
  /**
   * Initializes the knowledge base terms used by the lexicon initialization, all within one transaction:
   * <ul>
   * <li>finds or creates the creator term (the lexicon initialization process) and makes it the current creator,</li>
   * <li>finds or creates the creation purpose term (the lexicon initialization project) and makes it the current
   * creation purpose,</li>
   * <li>finds or creates the Texai English lexicon context term, and</li>
   * <li>edits the arg2Isa assertion on texaiNounType from Noun to SpeechPart, logging when that assertion is not found.</li>
   * </ul>
   */
  private void initializeKBTerms() {
    entityTransaction = entityManager.getTransaction();
    entityTransaction.begin();
    domainEntityManager.setValidateWellFormedFormula(true);

    // until the lexicon's own creator and purpose terms exist, use these pre-existing terms
    final AtomicTerm someCyclist = domainEntityManager.findAtomicTermByTermName(Constants.TERM_NAME_SOME_CYCLIST);
    domainEntityManager.setCreator(someCyclist);
    final AtomicTerm openCycProject = domainEntityManager.findAtomicTermByTermName(Constants.TERM_NAME_OPEN_CYC_PROJECT);
    domainEntityManager.setCreationPurpose(openCycProject);

    // the creator term
    final List<AbstractReifiedTerm> isaTerms = new ArrayList<AbstractReifiedTerm>();
    isaTerms.add(domainEntityManager.findAtomicTermByTermName(Constants.TERM_NAME_INDIVIDUAL));
    isaTerms.add(domainEntityManager.findAtomicTermByTermName(Constants.TERM_NAME_CYCLIST));
    creator = domainEntityManager.findOrCreateDefinedTerm(
            Constants.TERM_NAME_LEXICON_INITIALIZATION_PROCESS,
            "the lexicon initialization process",
            "This is the process that initializes the texai lexicon.",
            isaTerms);
    domainEntityManager.setCreator(creator);
    LOGGER.info("creator: " + creator);

    // the creation purpose term, reusing the same isa list
    isaTerms.clear();
    isaTerms.add(domainEntityManager.findAtomicTermByTermName(Constants.TERM_NAME_INDIVIDUAL));
    isaTerms.add(domainEntityManager.findAtomicTermByTermName(Constants.TERM_NAME_CYC_BASED_PROJECT));
    creationPurpose = domainEntityManager.findOrCreateDefinedTerm(
            Constants.TERM_NAME_LEXICON_INITIALIZATION_PROJECT,
            "the lexicon initialization project",
            "This is the project that initializes the lexicon.",
            isaTerms);
    domainEntityManager.setCreationPurpose(creationPurpose);
    LOGGER.info("creationPurpose: " + creationPurpose);

    // the lexicon context term, whose only genlMt is the CMU pronouncing dictionary context
    final List<AbstractReifiedTerm> genlMtTerms = new ArrayList<AbstractReifiedTerm>();
    genlMtTerms.add(domainEntityManager.findAtomicTermByTermName(Constants.TERM_NAME_CMU_PRONOUNCING_DICTIONARY_CONTEXT));
    final String contextComment = Constants.TERM_NAME_TEXAI_ENGLISH_LEXICON_CONTEXT
            + " is the context that contains the English lexicon domain entities and their associations.";
    final List<AbstractReifiedTerm> noIsaTerms = new ArrayList<AbstractReifiedTerm>();
    domainEntityManager.findOrCreateContextTerm(
            Constants.TERM_NAME_TEXAI_ENGLISH_LEXICON_CONTEXT,
            Constants.TERM_NAME_TEXAI_ENGLISH_LEXICON_CONTEXT,
            contextComment,
            noIsaTerms,
            genlMtTerms);

    // replace arg2Isa(texaiNounType, Noun)
    // with    arg2Isa(texaiNounType, SpeechPart)
    try {
      // arg1 is the same term before and after the edit; only arg2 changes
      final AtomicTerm texaiNounType = domainEntityManager.findAtomicTermByTermName("texaiNounType");
      final AtomicTerm arg2Isa = domainEntityManager.findAtomicTermByTermName(Constants.TERM_NAME_ARG2_ISA);
      final AtomicTerm oldArg2 = domainEntityManager.findAtomicTermByTermName(Constants.TERM_NAME_NOUN);
      final AtomicTerm newArg2 = domainEntityManager.findAtomicTermByTermName(Constants.TERM_NAME_SPEECH_PART);
      final AtomicTerm context = domainEntityManager.findAtomicTermByTermName(Constants.TERM_NAME_UNIVERSAL_VOCABULARY_MT);
      domainEntityManager.editBinaryGAF(arg2Isa, texaiNounType, texaiNounType, oldArg2, newArg2, context);
    } catch (final TexaiException ex) {
      LOGGER.info("binary gaf not found for editing");
    }
    entityTransaction.commit();
  }
  
  /**
   * Acquires lexicon from WordNet domain entities in the KB, in two passes: first the words,
   * then the word senses for the words.
   */
  private void acquireLexiconFromWordNet() {
    acquireWordsFromWordNet();
    acquireWordSensesFromWordNet();
  }
  
  /**
   * Acquires words from WordNet domain entities in the KB.
   * <p>
   * Begins a transaction, then creates and persists one Texai English word per WordNet English word, using the
   * WordNet lemma and initially empty word form, word sense and noun type collections.  Each word receives its
   * own collection instances so that no mutable collection is shared between words.  After every 20 words
   * acquired, commit(...) is called with the running count.
   * <p>
   * NOTE(review): words persisted after the last multiple of 20 are not committed by this method; presumably
   * the caller or a later step commits the remainder -- confirm.
   */
  private void acquireWordsFromWordNet() {
    entityTransaction = entityManager.getTransaction();
    entityTransaction.begin();
    final Iterator<Object> wordNetEnglishWord_iter = domainEntityManager.domainEntityIterator(WordNetEnglishWord.class);
    while (wordNetEnglishWord_iter.hasNext()) {
      final WordNetEnglishWord wordNetEnglishWord = (WordNetEnglishWord) wordNetEnglishWord_iter.next();
      // fresh, empty collections per word; the Wiktionary word is not known at this point
      final TexaiEnglishWord texaiEnglishWord = new TexaiEnglishWord(
              wordNetEnglishWord.getWNLemma(),
              new HashSet<TexaiEnglishWordForm>(),
              new ArrayList<TexaiEnglishWordSense>(),
              new HashSet<AtomicTerm>(),
              wordNetEnglishWord,
              null,
              false);
      domainEntityManager.persistDomainEntity(texaiEnglishWord);
      if (++nbrEnglishWordsAcquired % 20 == 0) {
        commit(nbrEnglishWordsAcquired);
      }
    }
  }
  
  /**
   * Acquires word senses from WordNet domain entities in the KB.
   * <p>
   * For every Texai English word, the senses of its WordNet word are handed to
   * acquireWordSenseFromWordNet(...) together with an overall position within the word and a sense number
   * within the speech part.  A word having exactly one sense gets position 1 and sense number 1.  Otherwise
   * the senses are grouped by speech part, each group is sorted with a WordNetWordSenseComparator, and the
   * groups are processed in the order noun, verb, adverb, adjective.  After every 20 words processed,
   * commit(...) is called with the running count.
   * <p>
   * NOTE(review): adverbs are numbered before adjectives here, whereas acquireWordSensesFromWiktionary orders
   * adjectives before adverbs -- confirm which order is intended.
   */
  @SuppressWarnings("unchecked")
  private void acquireWordSensesFromWordNet() {
    entityTransaction = entityManager.getTransaction();
    entityTransaction.begin();
    final Iterator<Object> wordIterator = domainEntityManager.domainEntityIterator(TexaiEnglishWord.class);
    while (wordIterator.hasNext()) {
      final TexaiEnglishWord word = (TexaiEnglishWord) wordIterator.next();
      LOGGER.info("processing word " + word);
      // word forms are tracked per word, so forget the speech parts seen for the previous word
      speechPartsHavingWordForm.clear();
      final WordNetEnglishWord wordNetWord = word.getWordNetEnglishWord();
      final Set<WordNetWordSense> senses = wordNetWord.getWNWordSenses();
      if (senses.size() == 1) {
        // with a singleton word sense both the positon and wordSenseNbr are 1
        acquireWordSenseFromWordNet(1, word, 1, senses.iterator().next());
      } else {
        // WordNet word senses are not numbered, so sort within speech part, then assign overall position
        final List<WordNetWordSense> nouns = new ArrayList<WordNetWordSense>();
        final List<WordNetWordSense> verbs = new ArrayList<WordNetWordSense>();
        final List<WordNetWordSense> adjectives = new ArrayList<WordNetWordSense>();
        final List<WordNetWordSense> adverbs = new ArrayList<WordNetWordSense>();
        for (final WordNetWordSense sense : wordNetWord.getWNWordSenses()) {
          final AtomicTerm speechPart = sense.getWordNetSynset().getWNSynsetSpeechPart();
          final String speechPartName = speechPart.toString();
          if (Constants.TERM_NAME_NOUN.equals(speechPartName)) {
            nouns.add(sense);
          } else if (Constants.TERM_NAME_VERB.equals(speechPartName)) {
            verbs.add(sense);
          } else if (Constants.TERM_NAME_ADJECTIVE.equals(speechPartName)) {
            adjectives.add(sense);
          } else if (Constants.TERM_NAME_ADVERB.equals(speechPartName)) {
            adverbs.add(sense);
          } else {
            assert false : "invalid speech part for WordNet synset " + sense.getWordNetSynset();
          }
        }
        final WordNetWordSenseComparator senseComparator = new WordNetWordSenseComparator();
        Collections.sort(nouns, senseComparator);
        Collections.sort(verbs, senseComparator);
        Collections.sort(adjectives, senseComparator);
        Collections.sort(adverbs, senseComparator);

        // the overall position runs across all groups, the sense number restarts for each group
        final List<List<WordNetWordSense>> groupsInProcessingOrder = new ArrayList<List<WordNetWordSense>>();
        groupsInProcessingOrder.add(nouns);
        groupsInProcessingOrder.add(verbs);
        groupsInProcessingOrder.add(adverbs);
        groupsInProcessingOrder.add(adjectives);
        int position = 0;
        for (final List<WordNetWordSense> group : groupsInProcessingOrder) {
          int wordSenseNbr = 0;
          for (final WordNetWordSense sense : group) {
            acquireWordSenseFromWordNet(++position, word, ++wordSenseNbr, sense);
          }
        }
      }
      nbrEnglishWordsProcessed++;
      if (nbrEnglishWordsProcessed % 20 == 0) {
        commit(nbrEnglishWordsProcessed);
      }
    }
  }
  
  /**
   * Acquires a word sense from WordNet domain entities.
   * <p>
   * Creates and persists a Texai English word sense from the synset of the given WordNet word sense, using the
   * synset's speech part as both the speech part and the basic speech part.  Each of the synset's sample
   * phrases is lower-cased, and persisted as a Texai sample phrase when it contains the word's lemma
   * adjacent to a space.  Finally, a Texai English word form is created and persisted for the speech part
   * unless one has already been created for this speech part of the current word.
   *
   * @param position the word sense position in the list of word senses for the word
   * @param texaiEnglishWord the texai English word
   * @param wordSenseNbr the sense number
   * @param wordNetWordSense the WordNet word sense
   */
  private void acquireWordSenseFromWordNet(
          final int position,
          final TexaiEnglishWord texaiEnglishWord,
          final int wordSenseNbr,
          final WordNetWordSense wordNetWordSense) {
    wordNetWordSense.getWordNetSynset().getTermId();  // load lazy object
    final WordNetSynset wordNetSynset = wordNetWordSense.getWordNetSynset();
    final AtomicTerm speechPart = wordNetSynset.getWNSynsetSpeechPart();
    final Set<String> glosses = new HashSet<String>();
    glosses.add(wordNetSynset.getWNSynsetGloss());
    final Set<TexaiSamplePhrase> texaiSamplePhrases = new HashSet<TexaiSamplePhrase>();
    final Set<String> categoryNames = new HashSet<String>();
    final TexaiEnglishWordSense texaiEnglishWordSense = new TexaiEnglishWordSense(
            position,
            speechPart,
            speechPart,
            wordSenseNbr,
            glosses,
            texaiSamplePhrases,
            texaiEnglishWord,
            wordNetSynset.getWNMappedTerm(),
            categoryNames,
            wordNetSynset.getWNCategory(),
            wordNetSynset,
            null,
            false);
    LOGGER.info("  created word sense " + texaiEnglishWordSense);
    domainEntityManager.persistDomainEntity(texaiEnglishWordSense);

    final String lemma = texaiEnglishWord.getLemma();
    for (final WordNetSamplePhraseItem wordNetSamplePhraseItem : wordNetSynset.getWNSynsetSamplePhraseItems()) {
      // lower-case with an explicit English locale so the result does not depend on the JVM default locale
      final String samplePhrase = wordNetSamplePhraseItem.getWnSamplePhrase().toLowerCase(Locale.ENGLISH);
      // keep the phrase when the lemma occurs with a space before it or after it
      if (samplePhrase.contains(" " + lemma) || samplePhrase.contains(lemma + " ")) {
        final TexaiSamplePhrase texaiSamplePhrase = new TexaiSamplePhrase(samplePhrase, texaiEnglishWordSense);
        LOGGER.info("  created sample phrase " + texaiSamplePhrase);
        domainEntityManager.persistDomainEntity(texaiSamplePhrase);
      }
    }

    // only one word form is created per speech part of a word
    if (!speechPartsHavingWordForm.contains(speechPart)) {
      final Set<ARPABETPronunciation> arpabetPronounciations = new HashSet<ARPABETPronunciation>();
      final TexaiEnglishWordForm texaiEnglishWordForm = new TexaiEnglishWordForm(
              lemma,
              speechPart,
              arpabetPronounciations,
              texaiEnglishWord,
              null,
              null,
              wordNetWordSense.getWordNetCasedEnglishWord(),
              false);
      speechPartsHavingWordForm.add(speechPart);
      LOGGER.info("  created word form " + texaiEnglishWordForm);
      domainEntityManager.persistDomainEntity(texaiEnglishWordForm);
    }
  }
  
  /**
   * Populates the basic speech part of every Texai English word sense with that word sense's own speech part,
   * persisting each updated word sense.  After every 20 word senses processed, commit(...) is called with the
   * running count.
   */
  private void populateBasicSpeechPart() {
    entityTransaction = entityManager.getTransaction();
    entityTransaction.begin();
    for (final Iterator<Object> senseIterator = domainEntityManager.domainEntityIterator(TexaiEnglishWordSense.class);
            senseIterator.hasNext();) {
      final TexaiEnglishWordSense sense = (TexaiEnglishWordSense) senseIterator.next();
      LOGGER.info(sense);
      final AtomicTerm speechPart = sense.getSpeechPart();
      assert speechPart != null : "speechPart must not be null " + sense;
      sense.setBasicSpeechPart(speechPart);
      domainEntityManager.persistDomainEntity(sense);
      nbrEnglishWordsProcessed++;
      if (nbrEnglishWordsProcessed % 20 == 0) {
        commit(nbrEnglishWordsProcessed);
      }
    }
  }
  
  
  /**
   * Acquires lexicon from Wiktionary domain entities in the KB.
   * <p>
   * Only the word form step is executed; the word and word sense steps are commented out below.
   * NOTE(review): presumably those two steps were disabled because they had already been run against the
   * KB -- confirm before re-enabling them.
   */
  private void acquireLexiconFromWiktionary() {
//    acquireWordsFromWiktionary();
//    acquireWordSensesFromWiktionary();
    acquireWordFormsFromWiktionary();
  }
  
  /**
   * Acquires lexicon from Wiktionary word domain entities.
   * <p>
   * For each Wiktionary English word, a trailing period is removed from its lemma (and the corrected Wiktionary
   * word is persisted).  The Texai English word having that lemma is then loaded; when there is none, a new
   * Texai English word is created with its own empty word form, word sense and noun type collections, otherwise
   * the existing word is linked to the Wiktionary word and takes over its noun types.  The Texai word is
   * persisted in either case, and after every 20 Wiktionary words processed, commit(...) is called with the
   * running count.
   * <p>
   * NOTE(review): a newly created word starts with an empty noun type set, whereas an existing word receives
   * the Wiktionary noun types -- confirm whether new words should receive them as well.
   */
  private void acquireWordsFromWiktionary() {
    entityTransaction = entityManager.getTransaction();
    entityTransaction.begin();
    final Iterator<Object> wiktionaryEnglishWord_iter = domainEntityManager.domainEntityIterator(WiktionaryEnglishWord.class);
    final AtomicTerm property = domainEntityManager.findAtomicTermByTermName(Constants.TERM_NAME_TEXAI_LEMMA);
    while (wiktionaryEnglishWord_iter.hasNext()) {
      final WiktionaryEnglishWord wiktionaryEnglishWord = (WiktionaryEnglishWord) wiktionaryEnglishWord_iter.next();
      LOGGER.info("processing " + wiktionaryEnglishWord);
      String lemma = wiktionaryEnglishWord.getLemma();
      if (lemma.endsWith(".")) {
        // normalize the lemma by dropping the trailing period, and store the correction
        lemma = lemma.substring(0, lemma.length() - 1);
        wiktionaryEnglishWord.setLemma(lemma);
        domainEntityManager.persistDomainEntity(wiktionaryEnglishWord);
      }
      TexaiEnglishWord texaiEnglishWord = (TexaiEnglishWord) domainEntityManager.loadDomainEntityByIndentifyingPropertyValue(
              property,
              lemma,
              TexaiEnglishWord.class);
      if (texaiEnglishWord == null) {
        // fresh, empty collections per word so that no mutable collection is shared between words
        texaiEnglishWord = new TexaiEnglishWord(
                lemma,
                new HashSet<TexaiEnglishWordForm>(),
                new ArrayList<TexaiEnglishWordSense>(),
                new HashSet<AtomicTerm>(),
                null,
                wiktionaryEnglishWord,
                false);
        nbrEnglishWordsAcquired++;
        LOGGER.info("  created " + texaiEnglishWord);
      } else {
        texaiEnglishWord.setWiktionaryEnglishWord(wiktionaryEnglishWord);
        wiktionaryEnglishWord.getNounTypes().size();  // load the lazy set
        texaiEnglishWord.setNounTypes(wiktionaryEnglishWord.getNounTypes());
      }
      domainEntityManager.persistDomainEntity(texaiEnglishWord);
      if (++nbrEnglishWordsProcessed % 20 == 0) {
        commit(nbrEnglishWordsProcessed);
      }
    }
  }
  
  /**
   * Acquires lexicon from Wiktionary word sense domain entities.
   * <p>
   * For each Wiktionary English word:
   * <ol>
   * <li>its Wiktionary word senses are categorized by speech part into noun, verb, adjective, adverb and other,</li>
   * <li>the word senses of the Texai English word having the same lemma are categorized likewise by their basic
   * speech part, a missing basic speech part being categorized as other,</li>
   * <li>for each of noun, verb, adjective and adverb independently, the Wiktionary word senses are merged when the
   * Texai word has no word senses of that speech part but Wiktionary does; Wiktionary word senses of any other
   * speech part are merged whenever present, so a single word may receive merged senses for several
   * speech parts,</li>
   * <li>when anything was merged, the word sense positions are recalculated via orderTexaiWordSenses(...) in the
   * order noun, verb, adjective, adverb, other, and after every 20 such words commit(...) is called with the
   * running count.</li>
   * </ol>
   */
  private void acquireWordSensesFromWiktionary() {
    entityTransaction = entityManager.getTransaction();
    entityTransaction.begin();
    final Iterator<Object> wiktionaryEnglishWord_iter = domainEntityManager.domainEntityIterator(WiktionaryEnglishWord.class);
    final AtomicTerm property = domainEntityManager.findAtomicTermByTermName(Constants.TERM_NAME_TEXAI_LEMMA);
    while (wiktionaryEnglishWord_iter.hasNext()) {
      final WiktionaryEnglishWord wiktionaryEnglishWord = (WiktionaryEnglishWord) wiktionaryEnglishWord_iter.next();
      LOGGER.info("processing " + wiktionaryEnglishWord);
      final String lemma = wiktionaryEnglishWord.getLemma();

      // categorize the Wiktionary word senses
      final List<WiktionaryEnglishWordSense> nounWiktionaryWordSenses = new ArrayList<WiktionaryEnglishWordSense>();
      final List<WiktionaryEnglishWordSense> verbWiktionaryWordSenses = new ArrayList<WiktionaryEnglishWordSense>();
      final List<WiktionaryEnglishWordSense> adjectiveWiktionaryWordSenses = new ArrayList<WiktionaryEnglishWordSense>();
      final List<WiktionaryEnglishWordSense> adverbWiktionaryWordSenses = new ArrayList<WiktionaryEnglishWordSense>();
      final List<WiktionaryEnglishWordSense> otherWiktionaryWordSenses = new ArrayList<WiktionaryEnglishWordSense>();
      for (final WiktionaryEnglishWordSense wiktionaryEnglishWordSense : wiktionaryEnglishWord.getWiktionaryEnglishWordSenses()) {
        final String speechPartString = wiktionaryEnglishWordSense.getSpeechPart().toString();
        if (Constants.TERM_NAME_NOUN.equals(speechPartString)
                || Constants.TERM_NAME_ABBREVIATION.equals(speechPartString)
                || Constants.TERM_NAME_ACRONYM.equals(speechPartString)
                || Constants.TERM_NAME_COLLECTIVE_NOUN.equals(speechPartString)
                || Constants.TERM_NAME_GERUNDIVE_NOUN.equals(speechPartString)
                || Constants.TERM_NAME_INITIALISM.equals(speechPartString)
                || Constants.TERM_NAME_MASS_NOUN.equals(speechPartString)
                || Constants.TERM_NAME_NOUN_PHRASE.equals(speechPartString)
                || Constants.TERM_NAME_PLURAL_NOUN_WORD_FORM.equals(speechPartString)
                || Constants.TERM_NAME_PROPER_NOUN.equals(speechPartString)) {
          // nouns
          nounWiktionaryWordSenses.add(wiktionaryEnglishWordSense);
        } else if (Constants.TERM_NAME_VERB.equals(speechPartString)
                || Constants.TERM_NAME_AUX_VERB.equals(speechPartString)
                || Constants.TERM_NAME_INTRANSITIVE_VERB.equals(speechPartString)
                || Constants.TERM_NAME_REFLEXIVE_VERB.equals(speechPartString)
                || Constants.TERM_NAME_TRANSITIVE_VERB.equals(speechPartString)
                || Constants.TERM_NAME_VERB_PHRASE.equals(speechPartString)) {
          // verbs
          verbWiktionaryWordSenses.add(wiktionaryEnglishWordSense);
        } else if (Constants.TERM_NAME_ADJECTIVE.equals(speechPartString)
                || Constants.TERM_NAME_ADJECTIVE_PHRASE.equals(speechPartString)
                || Constants.TERM_NAME_PROPER_ADJECTIVE.equals(speechPartString)) {
          // adjectives
          adjectiveWiktionaryWordSenses.add(wiktionaryEnglishWordSense);
        } else if (Constants.TERM_NAME_ADVERB.equals(speechPartString)
                || Constants.TERM_NAME_ADVERB_PHRASE.equals(speechPartString)) {
          // adverbs
          adverbWiktionaryWordSenses.add(wiktionaryEnglishWordSense);
        } else {
          // others
          otherWiktionaryWordSenses.add(wiktionaryEnglishWordSense);
        }
      }

      final TexaiEnglishWord texaiEnglishWord = (TexaiEnglishWord) domainEntityManager.loadDomainEntityByIndentifyingPropertyValue(
              property,
              lemma,
              TexaiEnglishWord.class);
      assert texaiEnglishWord != null : "cannot find " + lemma;

      // categorize the texai word senses
      final List<TexaiEnglishWordSense> nounTexaiWordSenses = new ArrayList<TexaiEnglishWordSense>();
      final List<TexaiEnglishWordSense> verbTexaiWordSenses = new ArrayList<TexaiEnglishWordSense>();
      final List<TexaiEnglishWordSense> adjectiveTexaiWordSenses = new ArrayList<TexaiEnglishWordSense>();
      final List<TexaiEnglishWordSense> adverbTexaiWordSenses = new ArrayList<TexaiEnglishWordSense>();
      final List<TexaiEnglishWordSense> otherTexaiWordSenses = new ArrayList<TexaiEnglishWordSense>();
      for (final TexaiEnglishWordSense texaiEnglishWordSense : texaiEnglishWord.getTexaiEnglishWordSenses()) {
        // a word sense without a basic speech part is reported and treated as "other"
        final AtomicTerm basicSpeechPart = texaiEnglishWordSense.getBasicSpeechPart();
        final String speechPartString;
        if (basicSpeechPart == null) {
          LOGGER.info("null pointer exception detected when referencing texaiEnglishWordSense.getBasicSpeechPart()");
          speechPartString = null;
        } else {
          speechPartString = basicSpeechPart.toString();
        }
        if (speechPartString == null) {
          otherTexaiWordSenses.add(texaiEnglishWordSense);
        } else if (Constants.TERM_NAME_NOUN.equals(speechPartString)) {
          nounTexaiWordSenses.add(texaiEnglishWordSense);
        } else if (Constants.TERM_NAME_VERB.equals(speechPartString)) {
          verbTexaiWordSenses.add(texaiEnglishWordSense);
        } else if (Constants.TERM_NAME_ADJECTIVE.equals(speechPartString)) {
          adjectiveTexaiWordSenses.add(texaiEnglishWordSense);
        } else if (Constants.TERM_NAME_ADVERB.equals(speechPartString)) {
          adverbTexaiWordSenses.add(texaiEnglishWordSense);
        } else {
          LOGGER.info("uncategorized WordNet-dervived word sense");
          otherTexaiWordSenses.add(texaiEnglishWordSense);
        }
      }

      // merge wiktionary word senses for unpopulated texai categories among Noun, Verb, Adjective and Adverb;
      // every speech part is tested on its own so that merging one does not prevent merging another
      final boolean hasMergedNounWordSenses = nounTexaiWordSenses.isEmpty() && !nounWiktionaryWordSenses.isEmpty();
      if (hasMergedNounWordSenses) {
        mergeWiktionaryWordSenses(
                nounWiktionaryWordSenses,
                nounTexaiWordSenses,
                domainEntityManager.findAtomicTermByTermName(Constants.TERM_NAME_NOUN),
                texaiEnglishWord);
      }
      final boolean hasMergedVerbWordSenses = verbTexaiWordSenses.isEmpty() && !verbWiktionaryWordSenses.isEmpty();
      if (hasMergedVerbWordSenses) {
        mergeWiktionaryWordSenses(
                verbWiktionaryWordSenses,
                verbTexaiWordSenses,
                domainEntityManager.findAtomicTermByTermName(Constants.TERM_NAME_VERB),
                texaiEnglishWord);
      }
      final boolean hasMergedAdjectiveWordSenses = adjectiveTexaiWordSenses.isEmpty() && !adjectiveWiktionaryWordSenses.isEmpty();
      if (hasMergedAdjectiveWordSenses) {
        mergeWiktionaryWordSenses(
                adjectiveWiktionaryWordSenses,
                adjectiveTexaiWordSenses,
                domainEntityManager.findAtomicTermByTermName(Constants.TERM_NAME_ADJECTIVE),
                texaiEnglishWord);
      }
      final boolean hasMergedAdverbWordSenses = adverbTexaiWordSenses.isEmpty() && !adverbWiktionaryWordSenses.isEmpty();
      if (hasMergedAdverbWordSenses) {
        mergeWiktionaryWordSenses(
                adverbWiktionaryWordSenses,
                adverbTexaiWordSenses,
                domainEntityManager.findAtomicTermByTermName(Constants.TERM_NAME_ADVERB),
                texaiEnglishWord);
      }
      // the remaining speech parts have no basic speech part and are merged whenever Wiktionary has any
      final boolean hasMergedOtherWordSenses = !otherWiktionaryWordSenses.isEmpty();
      if (hasMergedOtherWordSenses) {
        mergeWiktionaryWordSenses(
                otherWiktionaryWordSenses,
                otherTexaiWordSenses,
                null,
                texaiEnglishWord);
      }

      // recalculate the texai word sense positions if any merging occurred
      if (hasMergedNounWordSenses
              || hasMergedVerbWordSenses
              || hasMergedAdjectiveWordSenses
              || hasMergedAdverbWordSenses
              || hasMergedOtherWordSenses) {
        int position = 0;
        position = orderTexaiWordSenses(nounTexaiWordSenses, position, hasMergedNounWordSenses);
        position = orderTexaiWordSenses(verbTexaiWordSenses, position, hasMergedVerbWordSenses);
        position = orderTexaiWordSenses(adjectiveTexaiWordSenses, position, hasMergedAdjectiveWordSenses);
        position = orderTexaiWordSenses(adverbTexaiWordSenses, position, hasMergedAdverbWordSenses);
        orderTexaiWordSenses(otherTexaiWordSenses, position, hasMergedOtherWordSenses);
        if (++nbrEnglishWordsProcessed % 20 == 0) {
          commit(nbrEnglishWordsProcessed);
        }
      }
    }
  }
  
  /**
   * Merges a given list of Wiktionary word senses into the list of Texai word senses for the same speech part.
   * <p>
   * One Texai word sense is created per Wiktionary word sense, with a placeholder position of 1, and appended
   * to the given Texai list together with Texai copies of the Wiktionary sample phrases.  Nothing created
   * here is persisted by this method.  A Wiktionary word sense whose position is less than 1 has its position
   * set to its sense number and is persisted.
   *
   * @param wiktionaryEnglishWordSenses the list of Wiktionary word senses
   * @param texaiEnglishWordSenses the list of Texai word senses into which the Wiktionary word senses are merged
   * @param basicSpeechPart the basic speech part, e.g. Noun, Verb, or null when there is none
   * @param texaiEnglishWord the texai English word
   */
  private void mergeWiktionaryWordSenses(
          final List<WiktionaryEnglishWordSense> wiktionaryEnglishWordSenses,
          final List<TexaiEnglishWordSense> texaiEnglishWordSenses,
          final AtomicTerm basicSpeechPart,
          final TexaiEnglishWord texaiEnglishWord) {
    //Preconditions
    assert wiktionaryEnglishWordSenses != null : "wiktionaryEnglishWordSenses must not be null";
    assert texaiEnglishWordSenses != null : "texaiEnglishWordSenses must not be null";
    assert texaiEnglishWord != null : "texaiEnglishWord must not be null";

    for (final WiktionaryEnglishWordSense source : wiktionaryEnglishWordSenses) {
      final Set<String> glosses = new HashSet<String>();
      glosses.add(source.getGloss());
      // filled in below, after the merged word sense that owns the phrases has been created
      final Set<TexaiSamplePhrase> mergedSamplePhrases = new HashSet<TexaiSamplePhrase>();
      source.getCategoryNames().size();  // load lazy set
      final TexaiEnglishWordSense merged = new TexaiEnglishWordSense(
              1,  // placeholder position pending recalculation
              source.getSpeechPart(),
              basicSpeechPart,
              source.getSenseNbr(),
              glosses,
              mergedSamplePhrases,
              texaiEnglishWord,
              null,
              source.getCategoryNames(),
              null,
              null,
              source,
              false);

      // repair a Wiktionary word sense that lacks a valid position
      final boolean isPositionMissing = source.getPosition() < 1;
      if (isPositionMissing) {
        source.setPosition(source.getSenseNbr());
        LOGGER.debug("  fixed WiktionaryWordSense " + source);
        domainEntityManager.persistDomainEntity(source);
      }

      texaiEnglishWordSenses.add(merged);
      LOGGER.info("  merged " + merged);
      for (final WiktionarySamplePhrase sourcePhrase : source.getSamplePhrases()) {
        final String phraseText = sourcePhrase.getWiktionarySamplePhrase();
        final TexaiSamplePhrase mergedPhrase = new TexaiSamplePhrase(phraseText, merged);
        mergedSamplePhrases.add(mergedPhrase);
        LOGGER.info("  merged " + mergedPhrase);
      }
    }
  }
  
  /** Numbers the given Texai word senses consecutively, continuing after the given starting position, and persists
   * each of them.  When the word senses have been merged from Wiktionary, their sample phrases are persisted too,
   * before the word sense that owns them.
   *
   * @param texaiWordSenses the list of texai word senses
   * @param startingPosition the position of the previous word sense within the word, i.e. the first word sense in the
   * list receives startingPosition + 1
   * @param hasMergedWordSenses the indicator whether this list of word senses, associated with a speech part, has been
   * merged from Wiktionary
   * @return the position assigned to the last word sense, or the starting position when the list is empty
   */
  private int orderTexaiWordSenses(
          final List<TexaiEnglishWordSense> texaiWordSenses,
          final int startingPosition,
          final boolean hasMergedWordSenses) {
    //Preconditions
    assert texaiWordSenses != null : "texaiWordSenses must not be null";
    assert startingPosition >= 0 : "startingPosition must not be negative";
    
    int lastAssignedPosition = startingPosition;
    final Iterator<TexaiEnglishWordSense> wordSense_iter = texaiWordSenses.iterator();
    while (wordSense_iter.hasNext()) {
      final TexaiEnglishWordSense wordSense = wordSense_iter.next();
      wordSense.setPosition(++lastAssignedPosition);
      if (hasMergedWordSenses) {
        // merged sample phrases have not been persisted yet
        for (final TexaiSamplePhrase samplePhrase : wordSense.getTexaiSamplePhrases()) {
          domainEntityManager.persistDomainEntity(samplePhrase);
        }
      }
      domainEntityManager.persistDomainEntity(wordSense);
    }
    return lastAssignedPosition;
  }
  
  /** Acquires word forms from the Wiktionary lexicon.  A Texai word form is created and persisted for each Wiktionary
   * word form for which no loaded Texai word form has an equal inflection, provided that the Texai word for its lemma
   * can be found.  The transaction is committed after every 20 acquired word forms.
   */
  private void acquireWordFormsFromWiktionary() {
    entityTransaction = entityManager.getTransaction();
    entityTransaction.begin();
    final Iterator<Object> wiktionaryEnglishWordForm_iter = domainEntityManager.domainEntityIterator(WiktionaryEnglishWordForm.class);
    final AtomicTerm texaiWordFormTerm = domainEntityManager.findAtomicTermByTermName(Constants.TERM_NAME_TEXAI_WORD_FORM);
    final AtomicTerm texaiLemmaTerm = domainEntityManager.findAtomicTermByTermName(Constants.TERM_NAME_TEXAI_LEMMA);
    while (wiktionaryEnglishWordForm_iter.hasNext()) {
      final WiktionaryEnglishWordForm sourceWordForm = (WiktionaryEnglishWordForm) wiktionaryEnglishWordForm_iter.next();
      LOGGER.info("processing " + sourceWordForm);
      nbrEnglishWordsProcessed++;
      final Set<Object> candidateWordForms = domainEntityManager.loadDomainEntitiesByPropertyValue(
              texaiWordFormTerm,
              sourceWordForm.getWordForm(),
              TexaiEnglishWordForm.class);
      // look for an existing Texai word form having the same inflection
      boolean isAlreadyAcquired = false;
      for (final Object candidateWordFormObj : candidateWordForms) {
        LOGGER.debug("  comparing " + candidateWordFormObj);
        final TexaiEnglishWordForm candidateWordForm = (TexaiEnglishWordForm) candidateWordFormObj;
        if (candidateWordForm.getWordFormInflection().equals(sourceWordForm.getWordFormInflection())) {
          isAlreadyAcquired = true;
          break;
        }
      }
      if (isAlreadyAcquired) {
        continue;
      }
      final String lemma = sourceWordForm.getWiktionaryEnglishWord().getLemma();
      final TexaiEnglishWord owningWord = (TexaiEnglishWord) domainEntityManager.loadDomainEntityByIndentifyingPropertyValue(
              texaiLemmaTerm,
              lemma,
              TexaiEnglishWord.class);
      if (owningWord == null) {
        // without its word, the word form cannot be created
        LOGGER.info("expected word not found for " + lemma);
        continue;
      }
      sourceWordForm.getARPABETPronunciations().size();  // load the lazy set
      final TexaiEnglishWordForm acquiredWordForm = new TexaiEnglishWordForm(
              sourceWordForm.getWordForm(),
              sourceWordForm.getWordFormInflection(),
              sourceWordForm.getARPABETPronunciations(),
              owningWord,
              sourceWordForm,
              null,
              null,
              false);
      LOGGER.info("  adding " + acquiredWordForm);
      domainEntityManager.persistDomainEntity(acquiredWordForm);
      nbrEnglishWordsAcquired++;
      if (nbrEnglishWordsAcquired % 20 == 0) {
        commit(nbrEnglishWordsAcquired);
      }
    }
  }
  
  /** Acquires lexicon from the CMU Pronouncing Dictionary domain entities in the KB.  For each CMU dictionary word
   * form, every Texai word form loaded for the same word form string receives the CMU ARPABET pronunciations and a
   * reference to the CMU dictionary word form, and is then persisted.  The transaction is committed after every 20
   * updated Texai word forms.
   */
  private void acquireLexiconFromCMUPronouncingDictionary() {
    entityTransaction = entityManager.getTransaction();
    entityTransaction.begin();
    final Iterator<Object> cmuDictionaryEnglishWordForm_iter = domainEntityManager.domainEntityIterator(CMUDictionaryEnglishWordForm.class);
    final AtomicTerm texaiWordFormTerm = domainEntityManager.findAtomicTermByTermName(Constants.TERM_NAME_TEXAI_WORD_FORM);
    while (cmuDictionaryEnglishWordForm_iter.hasNext()) {
      final CMUDictionaryEnglishWordForm cmuDictionaryEnglishWordForm = (CMUDictionaryEnglishWordForm) cmuDictionaryEnglishWordForm_iter.next();
      LOGGER.info("processing " + cmuDictionaryEnglishWordForm);
      nbrEnglishWordsProcessed++;
      final Set<Object> texaiEnglishWordForms = domainEntityManager.loadDomainEntitiesByPropertyValue(
              texaiWordFormTerm,
              cmuDictionaryEnglishWordForm.getWordForm(),
              TexaiEnglishWordForm.class);
      for (final Object texaiEnglishWordFormObj : texaiEnglishWordForms) {
        final TexaiEnglishWordForm texaiEnglishWordForm = (TexaiEnglishWordForm) texaiEnglishWordFormObj;
        // add the CMU pronunciations to those that the word form already has
        texaiEnglishWordForm.getARPABETPronunciations().addAll(cmuDictionaryEnglishWordForm.getARPABETPronunciations());
        texaiEnglishWordForm.setCMUDictionaryEnglishWordForm(cmuDictionaryEnglishWordForm);
        LOGGER.info("  updating " + texaiEnglishWordForm);
        domainEntityManager.persistDomainEntity(texaiEnglishWordForm);
        if (++nbrEnglishWordsAcquired % 20 == 0) {
          commit(nbrEnglishWordsAcquired);
        }
      }
    }
  }
  
  /** Populates word sense mapped terms from WordNetSynset mapped terms.  Each Texai word sense whose WordNet synset
   * has a mapped term receives that mapped term and is persisted; word senses without a synset, or whose synset has
   * no mapped term, are left unchanged.  The transaction is committed after every 20 updated word senses.
   */
  private void fixMappedTerms() {
    entityTransaction = entityManager.getTransaction();
    entityTransaction.begin();
    final Iterator<Object> texaiEnglishWordSense_iter = domainEntityManager.domainEntityIterator(TexaiEnglishWordSense.class);
    while (texaiEnglishWordSense_iter.hasNext()) {
      final TexaiEnglishWordSense wordSense = (TexaiEnglishWordSense) texaiEnglishWordSense_iter.next();
      LOGGER.debug("processing " + wordSense);
      nbrEnglishWordsProcessed++;
      final WordNetSynset synset = wordSense.getWordNetSynset();
      if (synset == null) {
        continue;
      }
      final AbstractReifiedTerm mappedTerm = synset.getWNMappedTerm();
      if (mappedTerm == null) {
        continue;
      }
      wordSense.setTexaiMappedTerm(mappedTerm);
      LOGGER.info("  " + wordSense + " --> " + mappedTerm);
      domainEntityManager.persistDomainEntity(wordSense);
      nbrEnglishWordsAcquired++;
      if (nbrEnglishWordsAcquired % 20 == 0) {
        commit(nbrEnglishWordsAcquired);
      }
    }
  }
  
  /** Acquires proper nouns from WordNet cased words, by promoting Nouns to ProperNouns.
   * <P>
   * For each WordNet cased word, the Texai word forms loaded for its cased lemma are scanned.  The first word form
   * whose inflection is Noun or ProperNoun is kept, a Noun inflection being changed to ProperNoun, and any further
   * such word forms are treated as duplicates and deleted.  The Noun word senses of the word that owns the last scanned
   * word form are changed to ProperNoun.  The transaction is committed after every 20 promoted word senses.
   */
  private void acquireProperNounsFromWordNet() {
    entityTransaction = entityManager.getTransaction();
    entityTransaction.begin();
    LOGGER.info("finding WordNet cased words");
    final Iterator<Object> wordNetCasedEnglishWord_iter = domainEntityManager.domainEntityIterator(WordNetCasedEnglishWord.class);
    final AtomicTerm texaiWordForm = domainEntityManager.findAtomicTermByTermName(Constants.TERM_NAME_TEXAI_WORD_FORM);
    // not final because it is looked up again after each commit
    AtomicTerm properNoun = domainEntityManager.findAtomicTermByTermName(Constants.TERM_NAME_PROPER_NOUN);
    final Set<TexaiEnglishWordForm> duplicateWordForms = new HashSet<TexaiEnglishWordForm>();
    while (wordNetCasedEnglishWord_iter.hasNext()) {
      final WordNetCasedEnglishWord wordNetCasedEnglishWord = (WordNetCasedEnglishWord) wordNetCasedEnglishWord_iter.next();
      LOGGER.info("processing " + wordNetCasedEnglishWord);
      nbrEnglishWordsProcessed++;
      final String wnLemma = wordNetCasedEnglishWord.getWNLemma();
      final Set<Object> texaiEnglishWordForms = domainEntityManager.loadDomainEntitiesByPropertyValue(
              texaiWordForm,
              wnLemma,
              TexaiEnglishWordForm.class);
      LOGGER.info("  word forms " + texaiEnglishWordForms);
      TexaiEnglishWordForm texaiEnglishWordForm = null;
      boolean haveProperNoun = false;
      duplicateWordForms.clear();
      // keep the first Noun or ProperNoun word form, promoting a Noun, and gather the remaining ones as duplicates
      final Iterator<Object> texaiEnglishWordForms_iter = texaiEnglishWordForms.iterator();
      while (texaiEnglishWordForms_iter.hasNext()) {
        texaiEnglishWordForm = (TexaiEnglishWordForm) texaiEnglishWordForms_iter.next();
        final String wordFormInflectionString = texaiEnglishWordForm.getWordFormInflection().toString();
        if (Constants.TERM_NAME_NOUN.equals(wordFormInflectionString)
        || Constants.TERM_NAME_PROPER_NOUN.equals(wordFormInflectionString)) {
          if (haveProperNoun) {
            duplicateWordForms.add(texaiEnglishWordForm);
          } else {
            haveProperNoun = true;
            if (Constants.TERM_NAME_NOUN.equals(wordFormInflectionString)) {
              texaiEnglishWordForm.setWordFormInflection(properNoun);
              LOGGER.info("  set word form " + texaiEnglishWordForm);
              domainEntityManager.persistDomainEntity(texaiEnglishWordForm);
            }
          }
        }
      }
      // texaiEnglishWordForm is now the last word form scanned above, whatever its inflection, or null if there were none
      // NOTE(review): the word senses are therefore promoted even when no Noun word form was found - confirm this is intended
      if (texaiEnglishWordForm != null) {
        TexaiEnglishWord texaiEnglishWord = texaiEnglishWordForm.getTexaiEnglishWord();
        final List<TexaiEnglishWordSense> texaiEnglishWordSenses = texaiEnglishWord.getTexaiEnglishWordSenses();
        // replace proxy with the loaded domain entity, for subsequent persistence
        texaiEnglishWord = texaiEnglishWordForm.getTexaiEnglishWord();
        for (final TexaiEnglishWordSense texaiEnglishWordSense : texaiEnglishWordSenses) {
          if (Constants.TERM_NAME_NOUN.equals(texaiEnglishWordSense.getSpeechPart().toString())) {
            texaiEnglishWordSense.setSpeechPart(properNoun);
            LOGGER.info("  set word sense " + texaiEnglishWordSense);
            domainEntityManager.persistDomainEntity(texaiEnglishWordSense);
            if (++nbrEnglishWordsAcquired % 20 == 0) {
              // NOTE(review): this commit can happen in the middle of a word, before its duplicate word forms are
              // deleted, whereas acquireProperAdjectivesFromWordNet commits between words - confirm which is intended
              commit(nbrEnglishWordsAcquired);
              properNoun = domainEntityManager.findAtomicTermByTermName(Constants.TERM_NAME_PROPER_NOUN);
            }
          }
        }
        for (final TexaiEnglishWordForm duplicateWordForm : duplicateWordForms) {
          LOGGER.info("  deleting duplicate " + duplicateWordForm);
          // log the word's word forms before and after removing the duplicate
          LOGGER.info("before");
          for (final TexaiEnglishWordForm wf : texaiEnglishWord.getTexaiEnglishWordForms()) {
            LOGGER.info("    wf " + wf.getTermId());
          }
          texaiEnglishWord.getTexaiEnglishWordForms().remove(duplicateWordForm);
          LOGGER.info("after");
          for (final TexaiEnglishWordForm wf : texaiEnglishWord.getTexaiEnglishWordForms()) {
            LOGGER.info("    wf " + wf.getTermId());
          }
          domainEntityManager.deleteDomainEntity(duplicateWordForm);
        }
        if (!duplicateWordForms.isEmpty()) {
          // note that the below persisted object is not the cglib proxy
          domainEntityManager.persistDomainEntity(texaiEnglishWord);
        }
      }
    }
  }
  
  /** Acquires proper adjectives from WordNet cased words, by promoting Adjectives to ProperAdjectives.
   * <P>
   * For each WordNet cased word, the Texai word forms loaded for its cased lemma are scanned.  The first word form
   * whose inflection is Adjective or ProperAdjective is kept, an Adjective inflection being changed to ProperAdjective,
   * and any further such word forms are treated as duplicates and deleted.  The Adjective word senses of the word that
   * owns the last scanned word form are changed to ProperAdjective.  The transaction is committed after every 20
   * processed cased words.
   */
  private void acquireProperAdjectivesFromWordNet() {
    entityTransaction = entityManager.getTransaction();
    entityTransaction.begin();
    LOGGER.info("finding WordNet cased words");
    final Iterator<Object> wordNetCasedEnglishWord_iter = domainEntityManager.domainEntityIterator(WordNetCasedEnglishWord.class);
    final AtomicTerm texaiWordForm = domainEntityManager.findAtomicTermByTermName(Constants.TERM_NAME_TEXAI_WORD_FORM);
    // not final because it is looked up again after each commit
    AtomicTerm properAdjective = domainEntityManager.findAtomicTermByTermName(Constants.TERM_NAME_PROPER_ADJECTIVE);
    final Set<TexaiEnglishWordForm> duplicateWordForms = new HashSet<TexaiEnglishWordForm>();
    while (wordNetCasedEnglishWord_iter.hasNext()) {
      final WordNetCasedEnglishWord wordNetCasedEnglishWord = (WordNetCasedEnglishWord) wordNetCasedEnglishWord_iter.next();
      LOGGER.info("processing " + wordNetCasedEnglishWord);
      nbrEnglishWordsProcessed++;
      final String wnLemma = wordNetCasedEnglishWord.getWNLemma();
      final Set<Object> texaiEnglishWordForms = domainEntityManager.loadDomainEntitiesByPropertyValue(
              texaiWordForm,
              wnLemma,
              TexaiEnglishWordForm.class);
      LOGGER.info("  word forms " + texaiEnglishWordForms);
      TexaiEnglishWordForm texaiEnglishWordForm = null;
      boolean haveProperAdjective = false;
      duplicateWordForms.clear();
      // keep the first Adjective or ProperAdjective word form, promoting an Adjective, and gather the remaining ones
      // as duplicates
      final Iterator<Object> texaiEnglishWordForms_iter = texaiEnglishWordForms.iterator();
      while (texaiEnglishWordForms_iter.hasNext()) {
        texaiEnglishWordForm = (TexaiEnglishWordForm) texaiEnglishWordForms_iter.next();
        final String wordFormInflectionString = texaiEnglishWordForm.getWordFormInflection().toString();
        if (Constants.TERM_NAME_ADJECTIVE.equals(wordFormInflectionString)
        || Constants.TERM_NAME_PROPER_ADJECTIVE.equals(wordFormInflectionString)) {
          if (haveProperAdjective) {
            duplicateWordForms.add(texaiEnglishWordForm);
          } else {
            haveProperAdjective = true;
            if (Constants.TERM_NAME_ADJECTIVE.equals(wordFormInflectionString)) {
              texaiEnglishWordForm.setWordFormInflection(properAdjective);
              LOGGER.info("  set word form " + texaiEnglishWordForm);
              domainEntityManager.persistDomainEntity(texaiEnglishWordForm);
            }
          }
        }
      }
      // texaiEnglishWordForm is now the last word form scanned above, whatever its inflection, or null if there were none
      // NOTE(review): the word senses are therefore promoted even when no Adjective word form was found - confirm this
      // is intended
      if (texaiEnglishWordForm != null) {
        TexaiEnglishWord texaiEnglishWord = texaiEnglishWordForm.getTexaiEnglishWord();
        final List<TexaiEnglishWordSense> texaiEnglishWordSenses = texaiEnglishWord.getTexaiEnglishWordSenses();
        // replace proxy with the loaded domain entity, for subsequent persistence
        texaiEnglishWord = texaiEnglishWordForm.getTexaiEnglishWord();
        for (final TexaiEnglishWordSense texaiEnglishWordSense : texaiEnglishWordSenses) {
          if (Constants.TERM_NAME_ADJECTIVE.equals(texaiEnglishWordSense.getSpeechPart().toString())) {
            texaiEnglishWordSense.setSpeechPart(properAdjective);
            LOGGER.info("  set word sense " + texaiEnglishWordSense);
            domainEntityManager.persistDomainEntity(texaiEnglishWordSense);
          }
        }
        for (final TexaiEnglishWordForm duplicateWordForm : duplicateWordForms) {
          LOGGER.info("  deleting duplicate " + duplicateWordForm);
          texaiEnglishWord.getTexaiEnglishWordForms().remove(duplicateWordForm);
          domainEntityManager.deleteDomainEntity(duplicateWordForm);
        }
        if (!duplicateWordForms.isEmpty()) {
          // note that the below persisted object is not the cglib proxy
          domainEntityManager.persistDomainEntity(texaiEnglishWord);
        }
      }
      // the counter advances for every cased word, whether or not anything was promoted
      if (++nbrEnglishWordsAcquired % 20 == 0) {
        commit(nbrEnglishWordsAcquired);
        properAdjective = domainEntityManager.findAtomicTermByTermName(Constants.TERM_NAME_PROPER_ADJECTIVE);
      }
    }
  }
  
  /** Acquires lexicon from the OpenCyc pretty strings associated with KB terms.  An OpenCyc term becomes the mapped
   * term of a Texai word sense when its pretty string loads exactly one Texai word form, the word of that word form
   * has exactly one word sense, and that word sense has no mapped term yet.  The transaction is committed after every
   * 20 mapped word senses.
   */
  private void acquireLexiconFromOpenCyc() {
    entityTransaction = entityManager.getTransaction();
    entityTransaction.begin();
    LOGGER.info("finding OpenCyc prettyString terms");
    final AtomicTerm prettyStringPredicate = domainEntityManager.findAtomicTermByTermName(Constants.TERM_NAME_PRETTY_STRING_CANONICAL);
    final AtomicTerm englishMt = domainEntityManager.findAtomicTermByTermName(Constants.TERM_NAME_ENGLISH_MT);
    final Iterator<BinaryGAF> binaryGAF_iter = domainEntityManager.binaryGAFByPredicateIterator(
            prettyStringPredicate,
            englishMt);
    final AtomicTerm texaiWordFormTerm = domainEntityManager.findAtomicTermByTermName(Constants.TERM_NAME_TEXAI_WORD_FORM);
    while (binaryGAF_iter.hasNext()) {
      final BinaryGAF prettyStringGAF = binaryGAF_iter.next();
      final AbstractReifiedTerm openCycTerm = (AbstractReifiedTerm) prettyStringGAF.getArg1();
      final String termName = openCycTerm.toString();
      // do not map OpenCyc predicates, functor terms, Movie, Band, TV show and Program terms, nor underspecified terms
      final boolean isExcludedTerm =
              Character.isLowerCase(termName.charAt(0))
              || termName.endsWith("Fn")
              || termName.endsWith("-TheMovie")
              || termName.endsWith("-TheBand")
              || termName.endsWith("-TheTVShow")
              || termName.endsWith("-TheProgram")
              || termName.startsWith("PredicateNamedFn")
              || termName.startsWith("InstanceNamedFn");
      if (isExcludedTerm) {
        continue;
      }
      final PString prettyString = (PString) prettyStringGAF.getArg2();
      final String openCycPrettyString = prettyString.getStringValue();
      LOGGER.debug("processing " + openCycPrettyString);
      nbrEnglishWordsProcessed++;
      final Set<Object> matchingWordForms = domainEntityManager.loadDomainEntitiesByPropertyValue(
              texaiWordFormTerm,
              openCycPrettyString,
              TexaiEnglishWordForm.class);
      if (matchingWordForms.size() != 1) {
        // only a single matching word form is mapped
        continue;
      }
      final TexaiEnglishWordForm matchingWordForm = (TexaiEnglishWordForm) matchingWordForms.iterator().next();
      final List<TexaiEnglishWordSense> wordSenses = matchingWordForm.getTexaiEnglishWord().getTexaiEnglishWordSenses();
      if (wordSenses.size() != 1) {
        // only a word having a single word sense is mapped
        continue;
      }
      final TexaiEnglishWordSense onlyWordSense = wordSenses.get(0);
      LOGGER.debug("  " + onlyWordSense);
      if (onlyWordSense.getTexaiMappedTerm() != null) {
        // keep the existing mapped term
        continue;
      }
      onlyWordSense.setIsOpenCycWordSense(true);
      onlyWordSense.setTexaiMappedTerm(openCycTerm);
      LOGGER.info("  " + onlyWordSense + " --> " + openCycTerm);
      domainEntityManager.persistDomainEntity(onlyWordSense);
      nbrEnglishWordsAcquired++;
      if (nbrEnglishWordsAcquired % 20 == 0) {
        commit(nbrEnglishWordsAcquired);
      }
    }
  }
  
  /** Acquires Wiktionary glosses and sample phrases.  For each Wiktionary English word, its word senses and the word
   * senses of the Texai word having the same lemma are grouped by Noun, Verb, Adjective and Adverb.  For the first of
   * these speech parts in which both the Texai word and the Wiktionary word have exactly one word sense, the
   * Wiktionary gloss and sample phrases are merged into the Texai word sense.  The transaction is committed after
   * every 100 processed words.
   */
  private void acquireWiktionaryGlossesAndSamplePhrases() {
    entityTransaction = entityManager.getTransaction();
    entityTransaction.begin();
    LOGGER.info("gathering Wiktionary English words");
    final Iterator<Object> wiktionaryEnglishWord_iter = domainEntityManager.domainEntityIterator(WiktionaryEnglishWord.class);
    final AtomicTerm property = domainEntityManager.findAtomicTermByTermName(Constants.TERM_NAME_TEXAI_LEMMA);
    while (wiktionaryEnglishWord_iter.hasNext()) {
      final WiktionaryEnglishWord wiktionaryEnglishWord = (WiktionaryEnglishWord) wiktionaryEnglishWord_iter.next();
      LOGGER.debug("processing " + wiktionaryEnglishWord);
      final String lemma = wiktionaryEnglishWord.getLemma();
      // categorize the Wiktionary word senses
      final List<WiktionaryEnglishWordSense> nounWiktionaryWordSenses = new ArrayList<WiktionaryEnglishWordSense>();
      final List<WiktionaryEnglishWordSense> verbWiktionaryWordSenses = new ArrayList<WiktionaryEnglishWordSense>();
      final List<WiktionaryEnglishWordSense> adjectiveWiktionaryWordSenses = new ArrayList<WiktionaryEnglishWordSense>();
      final List<WiktionaryEnglishWordSense> adverbWiktionaryWordSenses = new ArrayList<WiktionaryEnglishWordSense>();
      for (final WiktionaryEnglishWordSense wiktionaryEnglishWordSense : wiktionaryEnglishWord.getWiktionaryEnglishWordSenses()) {
        final String speechPartString = wiktionaryEnglishWordSense.getSpeechPart().toString();
        if (Constants.TERM_NAME_NOUN.equals(speechPartString)
        || Constants.TERM_NAME_ABBREVIATION.equals(speechPartString)
        || Constants.TERM_NAME_ACRONYM.equals(speechPartString)
        || Constants.TERM_NAME_COLLECTIVE_NOUN.equals(speechPartString)
        || Constants.TERM_NAME_GERUNDIVE_NOUN.equals(speechPartString)
        || Constants.TERM_NAME_INITIALISM.equals(speechPartString)
        || Constants.TERM_NAME_MASS_NOUN.equals(speechPartString)
        || Constants.TERM_NAME_NOUN_PHRASE.equals(speechPartString)
        || Constants.TERM_NAME_PLURAL_NOUN_WORD_FORM.equals(speechPartString)
        || Constants.TERM_NAME_PROPER_NOUN.equals(speechPartString)) {
          // nouns
          nounWiktionaryWordSenses.add(wiktionaryEnglishWordSense);
        } else if (Constants.TERM_NAME_VERB.equals(speechPartString)
        || Constants.TERM_NAME_AUX_VERB.equals(speechPartString)
        || Constants.TERM_NAME_INTRANSITIVE_VERB.equals(speechPartString)
        || Constants.TERM_NAME_REFLEXIVE_VERB.equals(speechPartString)
        || Constants.TERM_NAME_TRANSITIVE_VERB.equals(speechPartString)
        || Constants.TERM_NAME_VERB_PHRASE.equals(speechPartString)) {
          // verbs
          verbWiktionaryWordSenses.add(wiktionaryEnglishWordSense);
        } else if (Constants.TERM_NAME_ADJECTIVE.equals(speechPartString)
        || Constants.TERM_NAME_ADJECTIVE_PHRASE.equals(speechPartString)
        || Constants.TERM_NAME_PROPER_ADJECTIVE.equals(speechPartString)) {
          // adjectives
          adjectiveWiktionaryWordSenses.add(wiktionaryEnglishWordSense);
        } else if (Constants.TERM_NAME_ADVERB.equals(speechPartString)
        || Constants.TERM_NAME_ADVERB_PHRASE.equals(speechPartString)) {
          // adverbs
          adverbWiktionaryWordSenses.add(wiktionaryEnglishWordSense);
        }
        // word senses having any other speech part are not merged by this method
      }
      final TexaiEnglishWord texaiEnglishWord = (TexaiEnglishWord) domainEntityManager.loadDomainEntityByIndentifyingPropertyValue(
              property,
              lemma,
              TexaiEnglishWord.class);
      assert texaiEnglishWord != null : "cannot find " + lemma;
      // categorize the texai word senses
      final List<TexaiEnglishWordSense> nounTexaiWordSenses = new ArrayList<TexaiEnglishWordSense>();
      final List<TexaiEnglishWordSense> verbTexaiWordSenses = new ArrayList<TexaiEnglishWordSense>();
      final List<TexaiEnglishWordSense> adjectiveTexaiWordSenses = new ArrayList<TexaiEnglishWordSense>();
      final List<TexaiEnglishWordSense> adverbTexaiWordSenses = new ArrayList<TexaiEnglishWordSense>();
      for (final TexaiEnglishWordSense texaiEnglishWordSense : texaiEnglishWord.getTexaiEnglishWordSenses()) {
        String speechPartString = null;
        if (texaiEnglishWordSense.getBasicSpeechPart() != null) {
          speechPartString = texaiEnglishWordSense.getBasicSpeechPart().toString();
        }
        if (speechPartString == null) {
          // a word sense without a basic speech part is not merged
          continue;
        }
        if (Constants.TERM_NAME_NOUN.equals(speechPartString)) {
          nounTexaiWordSenses.add(texaiEnglishWordSense);
        } else if (Constants.TERM_NAME_VERB.equals(speechPartString)) {
          verbTexaiWordSenses.add(texaiEnglishWordSense);
        } else if (Constants.TERM_NAME_ADJECTIVE.equals(speechPartString)) {
          adjectiveTexaiWordSenses.add(texaiEnglishWordSense);
        } else if (Constants.TERM_NAME_ADVERB.equals(speechPartString)) {
          adverbTexaiWordSenses.add(texaiEnglishWordSense);
        } else {
          LOGGER.info("uncategorized WordNet-dervived word sense");
        }
      }
      // merge the Wiktionary gloss and sample phrases for the first speech part among Noun, Verb, Adjective and
      // Adverb in which the texai word and the Wiktionary word each have exactly one word sense
      // NOTE(review): because of the else-if chain at most one speech part is merged per word, e.g. a word having a
      // single Noun sense and a single Verb sense only has its Noun sense merged - confirm this is intended
      if (nounTexaiWordSenses.size() == 1 && nounWiktionaryWordSenses.size() == 1) {
        mergeWiktionaryGlossesAndSamplePhrases(
                nounWiktionaryWordSenses.get(0),
                nounTexaiWordSenses.get(0),
                domainEntityManager.findAtomicTermByTermName(Constants.TERM_NAME_NOUN),
                texaiEnglishWord);
      } else if (verbTexaiWordSenses.size() == 1 && verbWiktionaryWordSenses.size() == 1) {
        mergeWiktionaryGlossesAndSamplePhrases(
                verbWiktionaryWordSenses.get(0),
                verbTexaiWordSenses.get(0),
                domainEntityManager.findAtomicTermByTermName(Constants.TERM_NAME_VERB),
                texaiEnglishWord);
      } else if (adjectiveTexaiWordSenses.size() == 1 && adjectiveWiktionaryWordSenses.size() == 1) {
        mergeWiktionaryGlossesAndSamplePhrases(
                adjectiveWiktionaryWordSenses.get(0),
                adjectiveTexaiWordSenses.get(0),
                domainEntityManager.findAtomicTermByTermName(Constants.TERM_NAME_ADJECTIVE),
                texaiEnglishWord);
      } else if (adverbTexaiWordSenses.size() == 1 && adverbWiktionaryWordSenses.size() == 1) {
        mergeWiktionaryGlossesAndSamplePhrases(
                adverbWiktionaryWordSenses.get(0),
                adverbTexaiWordSenses.get(0),
                domainEntityManager.findAtomicTermByTermName(Constants.TERM_NAME_ADVERB),
                texaiEnglishWord);
      }
      if (++nbrEnglishWordsProcessed % 100 == 0) {
        commit(nbrEnglishWordsProcessed);
      }
    }
  }
  
  /** Merges the gloss and sample phrases from the given Wiktionary word sense into the given texai word sense.
   * Nothing is changed when the texai word sense already contains the Wiktionary gloss.  Otherwise the gloss is added,
   * a persisted Texai sample phrase is created for each Wiktionary sample phrase, and the texai word sense is linked
   * to the Wiktionary word sense and persisted.
   *
   * @param wiktionaryEnglishWordSense the Wiktionary word sense
   * @param texaiEnglishWordSense the Texai word sense into which the Wiktionary word sense is merged
   * @param basicSpeechPart the basic speech part, e.g. Noun, Verb (not used by this method)
   * @param texaiEnglishWord the texai English word (only checked for being non-null)
   */
  private void mergeWiktionaryGlossesAndSamplePhrases(
          final WiktionaryEnglishWordSense wiktionaryEnglishWordSense,
          final TexaiEnglishWordSense texaiEnglishWordSense,
          final AtomicTerm basicSpeechPart,
          final TexaiEnglishWord texaiEnglishWord) {
    //Preconditions
    assert wiktionaryEnglishWordSense != null : "wiktionaryEnglishWordSense must not be null";
    assert texaiEnglishWordSense != null : "texaiEnglishWordSense must not be null";
    assert texaiEnglishWord != null : "texaiEnglishWord must not be null";
    
    final String gloss = wiktionaryEnglishWordSense.getGloss();
    if (texaiEnglishWordSense.getGlosses().contains(gloss)) {
      // the gloss is already present, so there is nothing to merge
      return;
    }
    nbrEnglishWordsAcquired++;
    LOGGER.info("");
    LOGGER.info(texaiEnglishWordSense + "  adding gloss \"" + gloss + "\"");
    texaiEnglishWordSense.getGlosses().add(wiktionaryEnglishWordSense.getGloss());
    LOGGER.debug("  updated glosses" + texaiEnglishWordSense.getGlosses());
    for (final WiktionarySamplePhrase sourceSamplePhrase : wiktionaryEnglishWordSense.getSamplePhrases()) {
      final TexaiSamplePhrase mergedSamplePhrase = new TexaiSamplePhrase(
              sourceSamplePhrase.getWiktionarySamplePhrase(),
              texaiEnglishWordSense);
      LOGGER.info(texaiEnglishWordSense + "  adding sample phrase \"" + mergedSamplePhrase.getSamplePhrase() + "\"");
      // the sample phrase is persisted before it is attached to the word sense
      domainEntityManager.persistDomainEntity(mergedSamplePhrase);
      texaiEnglishWordSense.getTexaiSamplePhrases().add(mergedSamplePhrase);
      LOGGER.debug("  updated sample phrases" + texaiEnglishWordSense.getTexaiSamplePhrases());
    }
    // fix Wiktionary word sense position
    // NOTE(review): the Wiktionary word sense is not explicitly persisted here after its position changes - confirm
    // that persisting the texai word sense below is sufficient
    wiktionaryEnglishWordSense.setPosition(1);
    texaiEnglishWordSense.setWiktionaryEnglishWordSense(wiktionaryEnglishWordSense);
    domainEntityManager.persistDomainEntity(texaiEnglishWordSense);
  }
  
  /** Sets the WordNet synset reference on matching texai English word senses.  For each WordNet synset, the
   * texai word senses are looked up by the word sense gloss property and the synset's gloss; a word sense is
   * given the synset when the synset words string contains the lemma of its word and the speech parts are
   * equal.  A commit is performed whenever the acquired count reaches a multiple of 20.
   */
  private void fixWordNetSynsetReferences() {
    entityTransaction = entityManager.getTransaction();
    entityTransaction.begin();
    final AtomicTerm glossProperty = domainEntityManager.findAtomicTermByTermName(Constants.TERM_NAME_TEXAI_WORD_SENSE_GLOSS);
    LOGGER.info("gathering WordNet synsets");
    for (final Iterator<Object> synsets = domainEntityManager.domainEntityIterator(WordNetSynset.class); synsets.hasNext(); ) {
      final WordNetSynset synset = (WordNetSynset) synsets.next();
      LOGGER.info("");
      LOGGER.info("processing " + synset.description());
      nbrEnglishWordsProcessed++;
      final Set<Object> candidates = domainEntityManager.loadDomainEntitiesByPropertyValue(
              glossProperty,
              synset.getWNSynsetGloss(),
              TexaiEnglishWordSense.class);
      for (final Object candidate : candidates) {
        final TexaiEnglishWordSense wordSense = (TexaiEnglishWordSense) candidate;
        if (!synset.synsetWordsAsString().contains(wordSense.getTexaiEnglishWord().getLemma())) {
          // the lemma does not occur in the synset words, so this word sense is skipped silently
          continue;
        }
        if (!synset.getWNSynsetSpeechPart().equals(wordSense.getBasicSpeechPart())) {
          LOGGER.info("  ** " + synset.getWNSynsetSpeechPart() + " did not match " + wordSense.getBasicSpeechPart());
          continue;
        }
        wordSense.setWordNetSynset(synset);
        LOGGER.info("  set " + wordSense);
        domainEntityManager.persistDomainEntity(wordSense);
        nbrEnglishWordsAcquired++;
        if (nbrEnglishWordsAcquired % 20 == 0) {
          commit(nbrEnglishWordsAcquired);
        }
      }
    }
  }
  
  /** Converts the lemma of each texai English word to lower case.  Every word whose lemma changes is persisted,
   * and a commit is performed whenever the acquired count reaches a multiple of 20.
   */
  private void fixUppercaseWords() {
    entityTransaction = entityManager.getTransaction();
    entityTransaction.begin();
    LOGGER.info("gathering words");
    final Iterator<Object> texaiEnglishWord_iter = domainEntityManager.domainEntityIterator(TexaiEnglishWord.class);
    while (texaiEnglishWord_iter.hasNext()) {
      final TexaiEnglishWord texaiEnglishWord = (TexaiEnglishWord) texaiEnglishWord_iter.next();
      LOGGER.info("processing " + texaiEnglishWord);
      final String originalLemma = texaiEnglishWord.getLemma();
      // an explicit locale keeps the result independent of the JVM default locale; the no-argument
      // toLowerCase() would, for example, map "I" to a dotless i under a Turkish default locale
      final String lemma = originalLemma.toLowerCase(java.util.Locale.ENGLISH);
      if (!originalLemma.equals(lemma)) {
        texaiEnglishWord.setLemma(lemma);
        LOGGER.info("  fixed " + texaiEnglishWord);
        domainEntityManager.persistDomainEntity(texaiEnglishWord);
        if (++nbrEnglishWordsAcquired % 20 == 0) {
          commit(nbrEnglishWordsAcquired);
        }
      }
    }
  }
  
  /** Converts texai English word forms to lower case.  Word forms whose inflection is named Initialism,
   * Abbreviation, ProperNoun or ProperAdjective are left unchanged.  Every changed word form is persisted, and
   * a commit is performed whenever the acquired count reaches a multiple of 20.
   */
  private void fixUppercaseWordForms() {
    //TODO only ProperNoun and ProperAdjective word forms should have upper case first letter, initialisms and abbreviations are ok.
    entityTransaction = entityManager.getTransaction();
    entityTransaction.begin();
    LOGGER.info("gathering word forms");
    final Iterator<Object> texaiEnglishWordForm_iter = domainEntityManager.domainEntityIterator(TexaiEnglishWordForm.class);
    while (texaiEnglishWordForm_iter.hasNext()) {
      final TexaiEnglishWordForm texaiEnglishWordForm = (TexaiEnglishWordForm) texaiEnglishWordForm_iter.next();
      LOGGER.info("processing " + texaiEnglishWordForm);
      final String speechPartString = texaiEnglishWordForm.getWordFormInflection().toString();
      final String originalWordForm = texaiEnglishWordForm.getWordForm();
      // an explicit locale keeps the result independent of the JVM default locale; the no-argument
      // toLowerCase() would, for example, map "I" to a dotless i under a Turkish default locale
      final String wordForm = originalWordForm.toLowerCase(java.util.Locale.ENGLISH);
      if (originalWordForm.equals(wordForm)) {
        // already lower case
        continue;
      }
      if ("Initialism".equals(speechPartString)
              || "Abbreviation".equals(speechPartString)
              || "ProperNoun".equals(speechPartString)
              || "ProperAdjective".equals(speechPartString)) {
        // these inflections keep their original capitalization
        continue;
      }
      // EAT --> eat
      texaiEnglishWordForm.setWordForm(wordForm);
      LOGGER.info("  fixed " + texaiEnglishWordForm);
      domainEntityManager.persistDomainEntity(texaiEnglishWordForm);
      if (++nbrEnglishWordsAcquired % 20 == 0) {
        commit(nbrEnglishWordsAcquired);
      }
    }
  }
  
  /** Commits a group of persisted domain entities to the knowledge base, clears the entity manager, and then
   * begins a new transaction so that the caller can continue persisting domain entities.
   *
   * @param count the count to log
   */
  private void commit(final int count) {
    LOGGER.info("");
    LOGGER.info("*** committing ***     " + count);
    LOGGER.info("");
    entityTransaction.commit();
    // clear the entity manager once the pending work has been committed
    entityManager.clear();
    // NOTE(review): well-formed-formula validation is switched off once the count reaches 200 and is never
    // switched back on here; presumably the first 200 entities are treated as a sufficient sample - confirm
    if (count >= 200) {
      domainEntityManager.setValidateWellFormedFormula(false);
    }
    // begin the next transaction
    entityTransaction = entityManager.getTransaction();
    entityTransaction.begin();
    // look up the creator and creation purpose terms again and set them on the domain entity manager
    domainEntityManager.setCreator(domainEntityManager.findAtomicTermByTermName(Constants.TERM_NAME_LEXICON_INITIALIZATION_PROCESS));
    domainEntityManager.setCreationPurpose(domainEntityManager.findAtomicTermByTermName(Constants.TERM_NAME_LEXICON_INITIALIZATION_PROJECT));
  }
  
  /** Finalizes this application: commits the current transaction, shuts down the cache manager, closes the
   * entity manager and the entity manager factory, and then logs the acquired and processed counts.
   */
  private void finalization() {
    // NOTE(review): there is no try/finally here, so if this commit throws, the cache manager, entity manager
    // and entity manager factory below are not shut down or closed - confirm that this is acceptable
    entityTransaction.commit();
    CacheManager.getInstance().shutdown();
    entityManager.close();
    entityManagerFactory.close();
    LOGGER.info("Number of English words acquired " + nbrEnglishWordsAcquired);
    LOGGER.info("Number of English words processed for word senses " + nbrEnglishWordsProcessed);
  }
  
  /** Orders WordNet word senses by frequency-of-usage rank, larger rank first, breaking ties by term id. */
  @SuppressWarnings("unchecked")
  private class WordNetWordSenseComparator implements Comparator {
    
    /** Creates a new WordNetWordSenseComparator instance. */
    public WordNetWordSenseComparator() {
    }
    
    /** Compares two WordNet word senses for order.  The word sense having the larger frequency-of-usage rank
     * sorts first; when the ranks are equal, the result is that of comparing the term ids.
     *
     * @param o1 the first WordNet word sense
     * @param o2 the second WordNet word sense
     * @return a negative integer, zero, or a positive integer as the first word sense sorts before, the same
     * as, or after the second word sense
     */
    public int compare(final Object o1, final Object o2) {
      //Preconditions
      assert o1 != null : "o1 must not be null";
      assert o2 != null : "o2 must not be null";
      assert o1 instanceof WordNetWordSense : "o1 must be a WordNetWordSense " + o1;
      assert o2 instanceof WordNetWordSense : "o2 must be a WordNetWordSense " + o2;
      
      final WordNetWordSense left = (WordNetWordSense) o1;
      final WordNetWordSense right = (WordNetWordSense) o2;
      // primary key: frequency-of-usage rank, descending
      if (left.getWNWordSenseFrequencyOfUsageRank() < right.getWNWordSenseFrequencyOfUsageRank()) {
        return 1;
      }
      if (left.getWNWordSenseFrequencyOfUsageRank() > right.getWNWordSenseFrequencyOfUsageRank()) {
        return -1;
      }
      // secondary key: term id
      return left.getTermId().compareTo(right.getTermId());
    }
  }
  
  /** Runs the lexicon initializer: initializes it, performs the enabled processing steps, and then finalizes
   * it.  A TexaiException, NullPointerException or AssertionError raised by a processing step is logged and
   * its stack trace printed, after which finalization still takes place.
   *
   * @param args the command line arguments (unused)
   */
  public static void main(final String[] args) {
    final LexiconInitializer lexiconInitializer = new LexiconInitializer();
    lexiconInitializer.initialize();
    
    try {
      lexiconInitializer.initializeKBTerms();
      // processing steps are enabled or disabled by editing the commented-out calls below
//      lexiconInitializer.acquireLexiconFromWordNet();
//      lexiconInitializer.populateBasicSpeechPart();
//      lexiconInitializer.acquireLexiconFromWiktionary();
//      lexiconInitializer.acquireLexiconFromCMUPronouncingDictionary();
//      lexiconInitializer.fixMappedTerms();
//      lexiconInitializer.acquireProperNounsFromWordNet();
//      lexiconInitializer.acquireProperAdjectivesFromWordNet();
//      lexiconInitializer.acquireLexiconFromOpenCyc();
//      lexiconInitializer.acquireWiktionaryGlossesAndSamplePhrases();
//      lexiconInitializer.fixWordNetSynsetReferences();
      
//        lexiconInitializer.fixUppercaseWords();
      lexiconInitializer.fixUppercaseWordForms();
      
    } catch (final TexaiException ex) {
      reportFailure(ex);
    } catch (final NullPointerException ex) {
      reportFailure(ex);
    } catch (final AssertionError ex) {
      reportFailure(ex);
    }
    lexiconInitializer.finalization();
  }
  
  /** Logs the given failure as an error and prints its stack trace to the standard error stream.
   *
   * @param failure the exception or error that was caught
   */
  private static void reportFailure(final Throwable failure) {
    LOGGER.error(failure);
    failure.printStackTrace(System.err);
  }
}




See more files for this project here

Texai

Texai is a chatbot that intelligently seeks to acquire knowledge and friendly behaviors.

Project homepage: http://sourceforge.net/projects/texai
Programming language(s): Java,Shell Script,XML
License: other

  domainEntity/
    TexaiEnglishWord.java
    TexaiEnglishWordForm.java
    TexaiEnglishWordSense.java
    TexaiSamplePhrase.java
    package-info.java
  LexiconInitializer.java
  LexiconToXML.java