LexiconInitializer.java from Texai at Krugle
Show LexiconInitializer.java syntax highlighted
/*
* LexiconInitializer.java
*
* Created on February 22, 2007, 10:46 AM
*
* Description: Provides a lexicon initializer and maintainer that merges the following imported machine-readable dictionaries:
* WordNet 2.1, Wiktionary, OpenCyc, and the CMU Pronouncing Dictionary.
* <P>
* WordNet has fine-grained word senses and its word senses take precedence over Wiktionary, unless WordNet has a missing speech part
* definition that Wiktionary provides. The CMU Pronouncing Dictionary and Wiktionary provide pronunciations for word forms.
*
* Copyright (C) 2007 Stephen L. Reed.
*
* This program is free software; you can redistribute it and/or modify it under the terms
* of the GNU General Public License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
* without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
* See the GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along with this program;
* if not, write to the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
package org.texai.lexicon;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.Set;
import javax.persistence.EntityManager;
import javax.persistence.EntityManagerFactory;
import javax.persistence.EntityTransaction;
import javax.persistence.Persistence;
import net.sf.ehcache.CacheManager;
import org.apache.log4j.Logger;
import org.texai.cmudict.domainEntity.ARPABETPronunciation;
import org.texai.cmudict.domainEntity.CMUDictionaryEnglishWordForm;
import org.texai.kb.CacheInitializer;
import org.texai.kb.Constants;
import org.texai.kb.ejb.entity.AbstractReifiedTerm;
import org.texai.kb.ejb.entity.AtomicTerm;
import org.texai.kb.ejb.entity.BinaryGAF;
import org.texai.kb.ejb.entity.PString;
import org.texai.kb.ejb.session.DomainEntityManagerBean;
import org.texai.lexicon.domainEntity.TexaiEnglishWord;
import org.texai.lexicon.domainEntity.TexaiEnglishWordForm;
import org.texai.lexicon.domainEntity.TexaiEnglishWordSense;
import org.texai.lexicon.domainEntity.TexaiSamplePhrase;
import org.texai.util.TexaiException;
import org.texai.wiktionary.domainEntity.WiktionaryEnglishWord;
import org.texai.wiktionary.domainEntity.WiktionaryEnglishWordForm;
import org.texai.wiktionary.domainEntity.WiktionaryEnglishWordSense;
import org.texai.wiktionary.domainEntity.WiktionarySamplePhrase;
import org.texai.wordnet.domain.entity.WordNetCasedEnglishWord;
import org.texai.wordnet.domain.entity.WordNetEnglishWord;
import org.texai.wordnet.domain.entity.WordNetSamplePhraseItem;
import org.texai.wordnet.domain.entity.WordNetSynset;
import org.texai.wordnet.domain.entity.WordNetWordSense;
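/**
* Initializes and maintains the Texai English lexicon by merging lexicon domain entities imported from WordNet,
* Wiktionary and the CMU Pronouncing Dictionary.
*
* @author reed
*/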
public final class LexiconInitializer {
/** the log4j logger */
private static final Logger LOGGER = Logger.getLogger(LexiconInitializer.class.getName());
/** the entity manager factory, created by initialize() for the test persistence unit */
private EntityManagerFactory entityManagerFactory;
/** the entity manager, created from the entity manager factory by initialize() */
private EntityManager entityManager;
/** the domain entity manager, used to find KB terms and to iterate, load and persist domain entities */
private DomainEntityManagerBean domainEntityManager;
/** the entity transaction, re-obtained from the entity manager and begun at the start of each processing method */
private EntityTransaction entityTransaction;
/** the creator term, found or created by initializeKBTerms() */
private AtomicTerm creator;
/** the creation purpose term, found or created by initializeKBTerms() */
private AtomicTerm creationPurpose;
/**
 * the number of lexicon entities acquired; despite the name it is also incremented for word forms created from
 * Wiktionary and for word forms updated from the CMU dictionary, and every 20th increment triggers commit(int)
 */
private int nbrEnglishWordsAcquired = 0;
/**
 * the number of lexicon entities processed (words, word senses or word forms, depending on the method);
 * in several methods every 20th increment triggers commit(int)
 */
private int nbrEnglishWordsProcessed = 0;
/**
 * the set of speech parts that have a corresponding word form created; cleared for each word by
 * acquireWordSensesFromWordNet() and consulted by acquireWordSenseFromWordNet() to avoid duplicate word forms
 */
private Set<AtomicTerm> speechPartsHavingWordForm = new HashSet<AtomicTerm>();
/** Creates a new instance of LexiconInitializer. */
public LexiconInitializer() {
super();
}
/**
 * Initializes the application and injects the dependencies for out-of-the-container execution of J2EE session beans.
 * Creates the entity manager factory and entity manager for the test persistence unit, initializes the caches, and
 * wires a new domain entity manager bean to the entity manager.
 */
private void initialize() {
    // persistence layer
    final EntityManagerFactory factory = Persistence.createEntityManagerFactory(Constants.TEST_PERSISTENCE_UNIT_NAME);
    entityManagerFactory = factory;
    entityManager = factory.createEntityManager();

    // the caches are initialized before the session bean is created
    CacheInitializer.initializeCaches();

    // session bean wiring that a container would otherwise perform
    final DomainEntityManagerBean bean = new DomainEntityManagerBean();
    domainEntityManager = bean;
    bean.setEntityManager(entityManager);
    bean.injectSharedBeanDependencies();
}
/**
 * Initializes the knowledge base terms, within a single transaction that is committed at the end.
 * <P>
 * Finds or creates the creator and creation purpose terms for the lexicon initialization, finds or creates the
 * Texai English lexicon context, and edits the arg2Isa assertion on texaiNounType from Noun to SpeechPart.
 */
private void initializeKBTerms() {
    entityTransaction = entityManager.getTransaction();
    entityTransaction.begin();
    domainEntityManager.setValidateWellFormedFormula(true);

    // provisional creator and creation purpose, in effect while the lexicon's own terms are defined below
    final AtomicTerm provisionalCreator = domainEntityManager.findAtomicTermByTermName(Constants.TERM_NAME_SOME_CYCLIST);
    domainEntityManager.setCreator(provisionalCreator);
    final AtomicTerm provisionalCreationPurpose =
            domainEntityManager.findAtomicTermByTermName(Constants.TERM_NAME_OPEN_CYC_PROJECT);
    domainEntityManager.setCreationPurpose(provisionalCreationPurpose);

    // the creator: the lexicon initialization process
    final List<AbstractReifiedTerm> typeTerms = new ArrayList<AbstractReifiedTerm>();
    typeTerms.add(domainEntityManager.findAtomicTermByTermName(Constants.TERM_NAME_INDIVIDUAL));
    typeTerms.add(domainEntityManager.findAtomicTermByTermName(Constants.TERM_NAME_CYCLIST));
    creator = domainEntityManager.findOrCreateDefinedTerm(
            Constants.TERM_NAME_LEXICON_INITIALIZATION_PROCESS,
            "the lexicon initialization process",
            "This is the process that initializes the texai lexicon.",
            typeTerms);
    domainEntityManager.setCreator(creator);
    LOGGER.info("creator: " + creator);

    // the creation purpose: the lexicon initialization project (the same list instance is reused)
    typeTerms.clear();
    typeTerms.add(domainEntityManager.findAtomicTermByTermName(Constants.TERM_NAME_INDIVIDUAL));
    typeTerms.add(domainEntityManager.findAtomicTermByTermName(Constants.TERM_NAME_CYC_BASED_PROJECT));
    creationPurpose = domainEntityManager.findOrCreateDefinedTerm(
            Constants.TERM_NAME_LEXICON_INITIALIZATION_PROJECT,
            "the lexicon initialization project",
            "This is the project that initializes the lexicon.",
            typeTerms);
    domainEntityManager.setCreationPurpose(creationPurpose);
    LOGGER.info("creationPurpose: " + creationPurpose);

    // the lexicon context, which has the CMU pronouncing dictionary context as its genlMt
    final List<AbstractReifiedTerm> parentContexts = new ArrayList<AbstractReifiedTerm>();
    parentContexts.add(domainEntityManager.findAtomicTermByTermName(Constants.TERM_NAME_CMU_PRONOUNCING_DICTIONARY_CONTEXT));
    final String lexiconContextComment = Constants.TERM_NAME_TEXAI_ENGLISH_LEXICON_CONTEXT
            + " is the context that contains the English lexicon domain entities and their associations.";
    domainEntityManager.findOrCreateContextTerm(
            Constants.TERM_NAME_TEXAI_ENGLISH_LEXICON_CONTEXT,
            Constants.TERM_NAME_TEXAI_ENGLISH_LEXICON_CONTEXT,
            lexiconContextComment,
            new ArrayList<AbstractReifiedTerm>(),
            parentContexts);

    // replace arg2Isa(texaiNounType, Noun)
    // with arg2Isa(texaiNounType, SpeechPart)
    try {
        final AtomicTerm texaiNounType = domainEntityManager.findAtomicTermByTermName("texaiNounType");
        final AtomicTerm arg2Isa = domainEntityManager.findAtomicTermByTermName(Constants.TERM_NAME_ARG2_ISA);
        final AtomicTerm oldArg2 = domainEntityManager.findAtomicTermByTermName(Constants.TERM_NAME_NOUN);
        final AtomicTerm newArg2 = domainEntityManager.findAtomicTermByTermName(Constants.TERM_NAME_SPEECH_PART);
        final AtomicTerm universalVocabularyMt =
                domainEntityManager.findAtomicTermByTermName(Constants.TERM_NAME_UNIVERSAL_VOCABULARY_MT);
        // the first argument is unchanged by the edit, only the second argument is replaced
        domainEntityManager.editBinaryGAF(
                arg2Isa,
                texaiNounType,
                texaiNounType,
                oldArg2,
                newArg2,
                universalVocabularyMt);
    } catch (final TexaiException ex) {
        LOGGER.info("binary gaf not found for editing");
    }
    entityTransaction.commit();
}
/**
 * Acquires lexicon from WordNet domain entities in the KB.
 * Runs two passes in order: the first creates a Texai word for each WordNet word, the second acquires the word
 * senses of each Texai word from its associated WordNet word.
 */
private void acquireLexiconFromWordNet() {
acquireWordsFromWordNet();
acquireWordSensesFromWordNet();
}
/**
 * Acquires words from WordNet domain entities in the KB: a Texai English word having the WordNet lemma is created
 * and persisted for every WordNet English word, and commit(int) is called after every 20th acquired word.
 */
private void acquireWordsFromWordNet() {
    entityTransaction = entityManager.getTransaction();
    entityTransaction.begin();
    final Iterator<Object> wordNetWords = domainEntityManager.domainEntityIterator(WordNetEnglishWord.class);
    // initially empty collections; the same instances are handed to every created word
    final Set<TexaiEnglishWordForm> noWordForms = new HashSet<TexaiEnglishWordForm>();
    final List<TexaiEnglishWordSense> noWordSenses = new ArrayList<TexaiEnglishWordSense>();
    final Set<AtomicTerm> noNounTypes = new HashSet<AtomicTerm>();
    while (wordNetWords.hasNext()) {
        final WordNetEnglishWord wordNetWord = (WordNetEnglishWord) wordNetWords.next();
        final TexaiEnglishWord acquiredWord = new TexaiEnglishWord(
                wordNetWord.getWNLemma(),
                noWordForms,
                noWordSenses,
                noNounTypes,
                wordNetWord,
                null,
                false);
        domainEntityManager.persistDomainEntity(acquiredWord);
        nbrEnglishWordsAcquired++;
        if (nbrEnglishWordsAcquired % 20 == 0) {
            commit(nbrEnglishWordsAcquired);
        }
    }
    // NOTE(review): entities persisted after the last multiple of 20 are not committed in this method - confirm
    // that the caller performs the final commit
}
/**
 * Acquires word senses from WordNet domain entities in the KB.
 * <P>
 * For each Texai English word, the word senses of its associated WordNet word are grouped by speech part, sorted
 * within each speech part, and acquired with an overall position within the word and a sense number within the
 * speech part. commit(int) is called after every 20th processed word.
 */
@SuppressWarnings("unchecked")
private void acquireWordSensesFromWordNet() {
    entityTransaction = entityManager.getTransaction();
    entityTransaction.begin();
    final Iterator<Object> texaiEnglishWord_iter = domainEntityManager.domainEntityIterator(TexaiEnglishWord.class);
    while (texaiEnglishWord_iter.hasNext()) {
        final TexaiEnglishWord texaiEnglishWord = (TexaiEnglishWord) texaiEnglishWord_iter.next();
        LOGGER.info("processing word " + texaiEnglishWord);
        // word forms are created once per speech part per word, so forget the previous word's speech parts
        speechPartsHavingWordForm.clear();
        final WordNetEnglishWord wordNetEnglishWord = texaiEnglishWord.getWordNetEnglishWord();
        final Set<WordNetWordSense> wordNetWordSenses = wordNetEnglishWord.getWNWordSenses();
        if (wordNetWordSenses.size() == 1) {
            // with a singleton word sense both the position and wordSenseNbr are 1
            acquireWordSenseFromWordNet(1, texaiEnglishWord, 1, wordNetWordSenses.iterator().next());
        } else {
            // WordNet word senses are not numbered, so sort within speech part, then assign overall position
            final List<WordNetWordSense> nounWordNetWordSenses = new ArrayList<WordNetWordSense>();
            final List<WordNetWordSense> verbWordNetWordSenses = new ArrayList<WordNetWordSense>();
            final List<WordNetWordSense> adjectiveWordNetWordSenses = new ArrayList<WordNetWordSense>();
            final List<WordNetWordSense> adverbWordNetWordSenses = new ArrayList<WordNetWordSense>();
            for (final WordNetWordSense wordNetWordSense : wordNetWordSenses) {
                final String speechPartName = wordNetWordSense.getWordNetSynset().getWNSynsetSpeechPart().toString();
                if (Constants.TERM_NAME_NOUN.equals(speechPartName)) {
                    nounWordNetWordSenses.add(wordNetWordSense);
                } else if (Constants.TERM_NAME_VERB.equals(speechPartName)) {
                    verbWordNetWordSenses.add(wordNetWordSense);
                } else if (Constants.TERM_NAME_ADJECTIVE.equals(speechPartName)) {
                    adjectiveWordNetWordSenses.add(wordNetWordSense);
                } else if (Constants.TERM_NAME_ADVERB.equals(speechPartName)) {
                    adverbWordNetWordSenses.add(wordNetWordSense);
                } else {
                    assert false : "invalid speech part for WordNet synset " + wordNetWordSense.getWordNetSynset();
                }
            }
            final WordNetWordSenseComparator wordNetWordSenseComparator = new WordNetWordSenseComparator();
            Collections.sort(nounWordNetWordSenses, wordNetWordSenseComparator);
            Collections.sort(verbWordNetWordSenses, wordNetWordSenseComparator);
            Collections.sort(adjectiveWordNetWordSenses, wordNetWordSenseComparator);
            Collections.sort(adverbWordNetWordSenses, wordNetWordSenseComparator);
            // NOTE(review): adverbs are positioned before adjectives here, whereas acquireWordSensesFromWiktionary()
            // positions adjectives before adverbs - the existing order is preserved, confirm which one is intended
            int position = 0;
            position = acquireWordSensesForSpeechPart(nounWordNetWordSenses, texaiEnglishWord, position);
            position = acquireWordSensesForSpeechPart(verbWordNetWordSenses, texaiEnglishWord, position);
            position = acquireWordSensesForSpeechPart(adverbWordNetWordSenses, texaiEnglishWord, position);
            acquireWordSensesForSpeechPart(adjectiveWordNetWordSenses, texaiEnglishWord, position);
        }
        if (++nbrEnglishWordsProcessed % 20 == 0) {
            commit(nbrEnglishWordsProcessed);
        }
    }
}

/**
 * Acquires the given sorted WordNet word senses of a single speech part, numbering the senses from 1 within the
 * speech part and continuing the overall position within the word.
 *
 * @param sortedWordNetWordSenses the WordNet word senses of one speech part, in the order to be acquired
 * @param texaiEnglishWord the texai English word
 * @param startingPosition the position of the previously acquired word sense within the word, or 0 if none
 * @return the position of the last acquired word sense within the word
 */
private int acquireWordSensesForSpeechPart(
        final List<WordNetWordSense> sortedWordNetWordSenses,
        final TexaiEnglishWord texaiEnglishWord,
        final int startingPosition) {
    int position = startingPosition;
    int wordSenseNbr = 0;
    for (final WordNetWordSense wordNetWordSense : sortedWordNetWordSenses) {
        position++;
        wordSenseNbr++;
        acquireWordSenseFromWordNet(position, texaiEnglishWord, wordSenseNbr, wordNetWordSense);
    }
    return position;
}
/**
 * Acquires a word sense from WordNet domain entities.
 * <P>
 * Persists a Texai word sense for the given WordNet word sense, then a Texai sample phrase for each of the synset's
 * sample phrases that mentions the word's lemma, and finally a word form for the lemma unless one has already been
 * created for the same speech part of the current word.
 *
 * @param position the word sense position in the list of word senses for the word
 * @param texaiEnglishWord the texai English word
 * @param wordSenseNbr the sense number
 * @param wordNetWordSense the WordNet word sense
 */
private void acquireWordSenseFromWordNet(
        final int position,
        final TexaiEnglishWord texaiEnglishWord,
        final int wordSenseNbr,
        final WordNetWordSense wordNetWordSense) {
    wordNetWordSense.getWordNetSynset().getTermId(); // load lazy object
    final WordNetSynset wordNetSynset = wordNetWordSense.getWordNetSynset();
    final AtomicTerm speechPart = wordNetSynset.getWNSynsetSpeechPart();
    final Set<String> glosses = new HashSet<String>();
    glosses.add(wordNetSynset.getWNSynsetGloss());
    final Set<TexaiSamplePhrase> texaiSamplePhrases = new HashSet<TexaiSamplePhrase>();
    final Set<String> categoryNames = new HashSet<String>();
    // the WordNet speech part serves as both the speech part and the basic speech part
    final TexaiEnglishWordSense texaiEnglishWordSense = new TexaiEnglishWordSense(
            position,
            speechPart,
            speechPart,
            wordSenseNbr,
            glosses,
            texaiSamplePhrases,
            texaiEnglishWord,
            wordNetSynset.getWNMappedTerm(),
            categoryNames,
            wordNetSynset.getWNCategory(),
            wordNetSynset,
            null,
            false);
    LOGGER.info(" created word sense " + texaiEnglishWordSense);
    domainEntityManager.persistDomainEntity(texaiEnglishWordSense);

    // a synset's sample phrases are shared by all of its words, so keep only those that mention this lemma
    final String lemma = texaiEnglishWord.getLemma();
    for (final WordNetSamplePhraseItem wordNetSamplePhraseItem : wordNetSynset.getWNSynsetSamplePhraseItems()) {
        // lower-case with a fixed locale so that matching does not depend on the JVM default locale
        final String samplePhrase = wordNetSamplePhraseItem.getWnSamplePhrase().toLowerCase(Locale.ENGLISH);
        // the lemma must be preceded or followed by a space; " lemma " is already covered by " lemma"
        if (samplePhrase.contains(" " + lemma) || samplePhrase.contains(lemma + " ")) {
            final TexaiSamplePhrase texaiSamplePhrase = new TexaiSamplePhrase(samplePhrase, texaiEnglishWordSense);
            LOGGER.info(" created sample phrase " + texaiSamplePhrase);
            domainEntityManager.persistDomainEntity(texaiSamplePhrase);
        }
    }

    // create at most one word form per speech part of the word
    if (!speechPartsHavingWordForm.contains(speechPart)) {
        final Set<ARPABETPronunciation> arpabetPronounciations = new HashSet<ARPABETPronunciation>();
        final TexaiEnglishWordForm texaiEnglishWordForm = new TexaiEnglishWordForm(
                lemma,
                speechPart,
                arpabetPronounciations,
                texaiEnglishWord,
                null,
                null,
                wordNetWordSense.getWordNetCasedEnglishWord(),
                false);
        speechPartsHavingWordForm.add(speechPart);
        LOGGER.info(" created word form " + texaiEnglishWordForm);
        domainEntityManager.persistDomainEntity(texaiEnglishWordForm);
    }
}
/**
 * Populates the word sense basic speech part for WordNet derived word senses: every Texai word sense gets its own
 * speech part as its basic speech part and is persisted again. commit(int) is called after every 20th word sense.
 */
private void populateBasicSpeechPart() {
    entityTransaction = entityManager.getTransaction();
    entityTransaction.begin();
    for (final Iterator<Object> wordSenses = domainEntityManager.domainEntityIterator(TexaiEnglishWordSense.class);
            wordSenses.hasNext();) {
        final TexaiEnglishWordSense wordSense = (TexaiEnglishWordSense) wordSenses.next();
        LOGGER.info(wordSense);
        final AtomicTerm speechPart = wordSense.getSpeechPart();
        assert speechPart != null : "speechPart must not be null " + wordSense;
        wordSense.setBasicSpeechPart(speechPart);
        domainEntityManager.persistDomainEntity(wordSense);
        nbrEnglishWordsProcessed++;
        if (nbrEnglishWordsProcessed % 20 == 0) {
            commit(nbrEnglishWordsProcessed);
        }
    }
}
/**
 * Acquires lexicon from Wiktionary domain entities in the KB.
 * Only the word form acquisition step is currently active; the word and word sense steps are commented out.
 */
private void acquireLexiconFromWiktionary() {
// NOTE(review): the two steps below are disabled, presumably because they have already been run against the KB -
// confirm before re-enabling them
// acquireWordsFromWiktionary();
// acquireWordSensesFromWiktionary();
acquireWordFormsFromWiktionary();
}
/**
 * Acquires lexicon from Wiktionary word domain entities.
 * <P>
 * A trailing period is stripped from each Wiktionary lemma (and the corrected Wiktionary word is persisted). The
 * Texai word having that lemma is then either linked to the Wiktionary word and given its noun types, or, when no
 * such Texai word exists, created from the Wiktionary word. commit(int) is called after every 20th processed word.
 */
private void acquireWordsFromWiktionary() {
    entityTransaction = entityManager.getTransaction();
    entityTransaction.begin();
    final Iterator<Object> wiktionaryWords = domainEntityManager.domainEntityIterator(WiktionaryEnglishWord.class);
    final AtomicTerm lemmaProperty = domainEntityManager.findAtomicTermByTermName(Constants.TERM_NAME_TEXAI_LEMMA);
    // initially empty collections; the same instances are handed to every newly created word
    final Set<TexaiEnglishWordForm> noWordForms = new HashSet<TexaiEnglishWordForm>();
    final List<TexaiEnglishWordSense> noWordSenses = new ArrayList<TexaiEnglishWordSense>();
    final Set<AtomicTerm> noNounTypes = new HashSet<AtomicTerm>();
    while (wiktionaryWords.hasNext()) {
        final WiktionaryEnglishWord wiktionaryWord = (WiktionaryEnglishWord) wiktionaryWords.next();
        LOGGER.info("processing " + wiktionaryWord);

        // normalize a lemma that ends with a period
        String lemma = wiktionaryWord.getLemma();
        if (lemma.endsWith(".")) {
            lemma = lemma.substring(0, lemma.length() - 1);
            wiktionaryWord.setLemma(lemma);
            domainEntityManager.persistDomainEntity(wiktionaryWord);
        }

        TexaiEnglishWord texaiWord = (TexaiEnglishWord) domainEntityManager.loadDomainEntityByIndentifyingPropertyValue(
                lemmaProperty,
                lemma,
                TexaiEnglishWord.class);
        if (texaiWord != null) {
            // the word is already in the lexicon: link it and take over the Wiktionary noun types
            texaiWord.setWiktionaryEnglishWord(wiktionaryWord);
            wiktionaryWord.getNounTypes().size(); // load the lazy set
            texaiWord.setNounTypes(wiktionaryWord.getNounTypes());
        } else {
            // the word is only known to Wiktionary
            texaiWord = new TexaiEnglishWord(
                    lemma,
                    noWordForms,
                    noWordSenses,
                    noNounTypes,
                    null,
                    wiktionaryWord,
                    false);
            nbrEnglishWordsAcquired++;
            LOGGER.info(" created " + texaiWord);
        }
        domainEntityManager.persistDomainEntity(texaiWord);

        nbrEnglishWordsProcessed++;
        if (nbrEnglishWordsProcessed % 20 == 0) {
            commit(nbrEnglishWordsProcessed);
        }
    }
}
/**
 * Acquires lexicon from Wiktionary word sense domain entities.
 * <P>
 * For each Wiktionary word, its word senses and the word senses of the Texai word having the same lemma are
 * grouped into noun, verb, adjective, adverb and other categories. The Wiktionary word senses of every category
 * among noun, verb, adjective and adverb for which the Texai word has no word senses are merged into the Texai
 * word, as are all Wiktionary word senses of the other category. When anything was merged, the positions of all
 * the word's senses are recalculated and the word senses are persisted; commit(int) is called after every 20th
 * word for which merging occurred.
 */
private void acquireWordSensesFromWiktionary() {
    entityTransaction = entityManager.getTransaction();
    entityTransaction.begin();
    final Iterator<Object> wiktionaryEnglishWord_iter = domainEntityManager.domainEntityIterator(WiktionaryEnglishWord.class);
    final AtomicTerm property = domainEntityManager.findAtomicTermByTermName(Constants.TERM_NAME_TEXAI_LEMMA);
    while (wiktionaryEnglishWord_iter.hasNext()) {
        final WiktionaryEnglishWord wiktionaryEnglishWord = (WiktionaryEnglishWord) wiktionaryEnglishWord_iter.next();
        LOGGER.info("processing " + wiktionaryEnglishWord);
        final String lemma = wiktionaryEnglishWord.getLemma();

        // categorize the Wiktionary word senses
        final List<WiktionaryEnglishWordSense> nounWiktionaryWordSenses = new ArrayList<WiktionaryEnglishWordSense>();
        final List<WiktionaryEnglishWordSense> verbWiktionaryWordSenses = new ArrayList<WiktionaryEnglishWordSense>();
        final List<WiktionaryEnglishWordSense> adjectiveWiktionaryWordSenses = new ArrayList<WiktionaryEnglishWordSense>();
        final List<WiktionaryEnglishWordSense> adverbWiktionaryWordSenses = new ArrayList<WiktionaryEnglishWordSense>();
        final List<WiktionaryEnglishWordSense> otherWiktionaryWordSenses = new ArrayList<WiktionaryEnglishWordSense>();
        for (final WiktionaryEnglishWordSense wiktionaryEnglishWordSense : wiktionaryEnglishWord.getWiktionaryEnglishWordSenses()) {
            final String speechPartString = wiktionaryEnglishWordSense.getSpeechPart().toString();
            if (isWiktionaryNounSpeechPart(speechPartString)) {
                nounWiktionaryWordSenses.add(wiktionaryEnglishWordSense);
            } else if (isWiktionaryVerbSpeechPart(speechPartString)) {
                verbWiktionaryWordSenses.add(wiktionaryEnglishWordSense);
            } else if (isWiktionaryAdjectiveSpeechPart(speechPartString)) {
                adjectiveWiktionaryWordSenses.add(wiktionaryEnglishWordSense);
            } else if (isWiktionaryAdverbSpeechPart(speechPartString)) {
                adverbWiktionaryWordSenses.add(wiktionaryEnglishWordSense);
            } else {
                otherWiktionaryWordSenses.add(wiktionaryEnglishWordSense);
            }
        }

        final TexaiEnglishWord texaiEnglishWord = (TexaiEnglishWord) domainEntityManager.loadDomainEntityByIndentifyingPropertyValue(
                property,
                lemma,
                TexaiEnglishWord.class);
        assert texaiEnglishWord != null : "cannot find " + lemma;

        // categorize the texai word senses
        final List<TexaiEnglishWordSense> nounTexaiWordSenses = new ArrayList<TexaiEnglishWordSense>();
        final List<TexaiEnglishWordSense> verbTexaiWordSenses = new ArrayList<TexaiEnglishWordSense>();
        final List<TexaiEnglishWordSense> adjectiveTexaiWordSenses = new ArrayList<TexaiEnglishWordSense>();
        final List<TexaiEnglishWordSense> adverbTexaiWordSenses = new ArrayList<TexaiEnglishWordSense>();
        final List<TexaiEnglishWordSense> otherTexaiWordSenses = new ArrayList<TexaiEnglishWordSense>();
        for (final TexaiEnglishWordSense texaiEnglishWordSense : texaiEnglishWord.getTexaiEnglishWordSenses()) {
            // an explicit null check replaces catching the NullPointerException of a missing basic speech part
            final AtomicTerm texaiBasicSpeechPart = texaiEnglishWordSense.getBasicSpeechPart();
            if (texaiBasicSpeechPart == null) {
                LOGGER.info("texaiEnglishWordSense.getBasicSpeechPart() is null, treating the word sense as uncategorized");
                otherTexaiWordSenses.add(texaiEnglishWordSense);
                continue;
            }
            final String speechPartString = texaiBasicSpeechPart.toString();
            if (speechPartString == null) {
                otherTexaiWordSenses.add(texaiEnglishWordSense);
            } else if (Constants.TERM_NAME_NOUN.equals(speechPartString)) {
                nounTexaiWordSenses.add(texaiEnglishWordSense);
            } else if (Constants.TERM_NAME_VERB.equals(speechPartString)) {
                verbTexaiWordSenses.add(texaiEnglishWordSense);
            } else if (Constants.TERM_NAME_ADJECTIVE.equals(speechPartString)) {
                adjectiveTexaiWordSenses.add(texaiEnglishWordSense);
            } else if (Constants.TERM_NAME_ADVERB.equals(speechPartString)) {
                adverbTexaiWordSenses.add(texaiEnglishWordSense);
            } else {
                LOGGER.info("uncategorized WordNet-dervived word sense");
                otherTexaiWordSenses.add(texaiEnglishWordSense);
            }
        }

        // merge wiktionary word senses for unpopulated texai categories among Noun, Verb, Adjective and Adverb;
        // every category is considered independently, so a word lacking several categories gets all of them
        final boolean hasMergedNounWordSenses = nounTexaiWordSenses.isEmpty() && !nounWiktionaryWordSenses.isEmpty();
        if (hasMergedNounWordSenses) {
            mergeWiktionaryWordSenses(
                    nounWiktionaryWordSenses,
                    nounTexaiWordSenses,
                    domainEntityManager.findAtomicTermByTermName(Constants.TERM_NAME_NOUN),
                    texaiEnglishWord);
        }
        final boolean hasMergedVerbWordSenses = verbTexaiWordSenses.isEmpty() && !verbWiktionaryWordSenses.isEmpty();
        if (hasMergedVerbWordSenses) {
            mergeWiktionaryWordSenses(
                    verbWiktionaryWordSenses,
                    verbTexaiWordSenses,
                    domainEntityManager.findAtomicTermByTermName(Constants.TERM_NAME_VERB),
                    texaiEnglishWord);
        }
        final boolean hasMergedAdjectiveWordSenses =
                adjectiveTexaiWordSenses.isEmpty() && !adjectiveWiktionaryWordSenses.isEmpty();
        if (hasMergedAdjectiveWordSenses) {
            mergeWiktionaryWordSenses(
                    adjectiveWiktionaryWordSenses,
                    adjectiveTexaiWordSenses,
                    domainEntityManager.findAtomicTermByTermName(Constants.TERM_NAME_ADJECTIVE),
                    texaiEnglishWord);
        }
        final boolean hasMergedAdverbWordSenses = adverbTexaiWordSenses.isEmpty() && !adverbWiktionaryWordSenses.isEmpty();
        if (hasMergedAdverbWordSenses) {
            mergeWiktionaryWordSenses(
                    adverbWiktionaryWordSenses,
                    adverbTexaiWordSenses,
                    domainEntityManager.findAtomicTermByTermName(Constants.TERM_NAME_ADVERB),
                    texaiEnglishWord);
        }
        // the remaining Wiktionary speech parts have no basic speech part and are always merged
        final boolean hasMergedOtherWordSenses = !otherWiktionaryWordSenses.isEmpty();
        if (hasMergedOtherWordSenses) {
            mergeWiktionaryWordSenses(
                    otherWiktionaryWordSenses,
                    otherTexaiWordSenses,
                    null,
                    texaiEnglishWord);
        }

        // recalculate the texai word sense positions if any merging occurred
        if (hasMergedNounWordSenses
                || hasMergedVerbWordSenses
                || hasMergedAdjectiveWordSenses
                || hasMergedAdverbWordSenses
                || hasMergedOtherWordSenses) {
            int position = 0;
            position = orderTexaiWordSenses(nounTexaiWordSenses, position, hasMergedNounWordSenses);
            position = orderTexaiWordSenses(verbTexaiWordSenses, position, hasMergedVerbWordSenses);
            position = orderTexaiWordSenses(adjectiveTexaiWordSenses, position, hasMergedAdjectiveWordSenses);
            position = orderTexaiWordSenses(adverbTexaiWordSenses, position, hasMergedAdverbWordSenses);
            orderTexaiWordSenses(otherTexaiWordSenses, position, hasMergedOtherWordSenses);
            if (++nbrEnglishWordsProcessed % 20 == 0) {
                commit(nbrEnglishWordsProcessed);
            }
        }
    }
}

/** Returns whether the given Wiktionary speech part name is categorized as a noun.
 *
 * @param speechPartString the speech part term name
 * @return true if the speech part is one of the noun-like speech parts
 */
private static boolean isWiktionaryNounSpeechPart(final String speechPartString) {
    return Constants.TERM_NAME_NOUN.equals(speechPartString)
            || Constants.TERM_NAME_ABBREVIATION.equals(speechPartString)
            || Constants.TERM_NAME_ACRONYM.equals(speechPartString)
            || Constants.TERM_NAME_COLLECTIVE_NOUN.equals(speechPartString)
            || Constants.TERM_NAME_GERUNDIVE_NOUN.equals(speechPartString)
            || Constants.TERM_NAME_INITIALISM.equals(speechPartString)
            || Constants.TERM_NAME_MASS_NOUN.equals(speechPartString)
            || Constants.TERM_NAME_NOUN_PHRASE.equals(speechPartString)
            || Constants.TERM_NAME_PLURAL_NOUN_WORD_FORM.equals(speechPartString)
            || Constants.TERM_NAME_PROPER_NOUN.equals(speechPartString);
}

/** Returns whether the given Wiktionary speech part name is categorized as a verb.
 *
 * @param speechPartString the speech part term name
 * @return true if the speech part is one of the verb-like speech parts
 */
private static boolean isWiktionaryVerbSpeechPart(final String speechPartString) {
    return Constants.TERM_NAME_VERB.equals(speechPartString)
            || Constants.TERM_NAME_AUX_VERB.equals(speechPartString)
            || Constants.TERM_NAME_INTRANSITIVE_VERB.equals(speechPartString)
            || Constants.TERM_NAME_REFLEXIVE_VERB.equals(speechPartString)
            || Constants.TERM_NAME_TRANSITIVE_VERB.equals(speechPartString)
            || Constants.TERM_NAME_VERB_PHRASE.equals(speechPartString);
}

/** Returns whether the given Wiktionary speech part name is categorized as an adjective.
 *
 * @param speechPartString the speech part term name
 * @return true if the speech part is one of the adjective-like speech parts
 */
private static boolean isWiktionaryAdjectiveSpeechPart(final String speechPartString) {
    return Constants.TERM_NAME_ADJECTIVE.equals(speechPartString)
            || Constants.TERM_NAME_ADJECTIVE_PHRASE.equals(speechPartString)
            || Constants.TERM_NAME_PROPER_ADJECTIVE.equals(speechPartString);
}

/** Returns whether the given Wiktionary speech part name is categorized as an adverb.
 *
 * @param speechPartString the speech part term name
 * @return true if the speech part is one of the adverb-like speech parts
 */
private static boolean isWiktionaryAdverbSpeechPart(final String speechPartString) {
    return Constants.TERM_NAME_ADVERB.equals(speechPartString)
            || Constants.TERM_NAME_ADVERB_PHRASE.equals(speechPartString);
}
/**
 * Merges a given list of Wiktionary word senses into the list of Texai word senses for the same speech part.
 * <P>
 * A Texai word sense with a placeholder position of 1 is created for every Wiktionary word sense and appended to
 * the given list, together with Texai sample phrases created from the Wiktionary sample phrases. Nothing created
 * here is persisted by this method. A Wiktionary word sense whose position is less than 1 is repaired by using its
 * sense number as its position, and is persisted.
 *
 * @param wiktionaryEnglishWordSenses the list of Wiktionary word senses
 * @param texaiEnglishWordSenses the list of Texai word senses into which the Wiktionary word senses are merged
 * @param basicSpeechPart the basic speech part, e.g. Noun, Verb, or null when there is none
 * @param texaiEnglishWord the texai English word
 */
private void mergeWiktionaryWordSenses(
        final List<WiktionaryEnglishWordSense> wiktionaryEnglishWordSenses,
        final List<TexaiEnglishWordSense> texaiEnglishWordSenses,
        final AtomicTerm basicSpeechPart,
        final TexaiEnglishWord texaiEnglishWord) {
    //Preconditions
    assert wiktionaryEnglishWordSenses != null : "wiktionaryEnglishWordSenses must not be null";
    assert texaiEnglishWordSenses != null : "texaiEnglishWordSenses must not be null";
    assert texaiEnglishWord != null : "texaiEnglishWord must not be null";

    for (final WiktionaryEnglishWordSense sourceSense : wiktionaryEnglishWordSenses) {
        final Set<String> mergedGlosses = new HashSet<String>();
        mergedGlosses.add(sourceSense.getGloss());
        // filled in below, after the merged word sense that the sample phrases refer to has been created
        final Set<TexaiSamplePhrase> mergedSamplePhrases = new HashSet<TexaiSamplePhrase>();
        sourceSense.getCategoryNames().size(); // load lazy set
        final TexaiEnglishWordSense mergedSense = new TexaiEnglishWordSense(
                1, // placeholder position pending recalculation
                sourceSense.getSpeechPart(),
                basicSpeechPart,
                sourceSense.getSenseNbr(),
                mergedGlosses,
                mergedSamplePhrases,
                texaiEnglishWord,
                null,
                sourceSense.getCategoryNames(),
                null,
                null,
                sourceSense,
                false);

        // repair a Wiktionary word sense that has no valid position
        if (sourceSense.getPosition() < 1) {
            sourceSense.setPosition(sourceSense.getSenseNbr());
            LOGGER.debug(" fixed WiktionaryWordSense " + sourceSense);
            domainEntityManager.persistDomainEntity(sourceSense);
        }

        texaiEnglishWordSenses.add(mergedSense);
        LOGGER.info(" merged " + mergedSense);

        for (final WiktionarySamplePhrase sourcePhrase : sourceSense.getSamplePhrases()) {
            final TexaiSamplePhrase mergedPhrase =
                    new TexaiSamplePhrase(sourcePhrase.getWiktionarySamplePhrase(), mergedSense);
            mergedSamplePhrases.add(mergedPhrase);
            LOGGER.info(" merged " + mergedPhrase);
        }
    }
}
/**
 * Orders and persists the Texai word senses that may have been merged from the Wiktionary word senses.
 * Each word sense receives the next consecutive position after the starting position and is persisted; when the
 * list was merged from Wiktionary, the sample phrases of each word sense are persisted as well.
 *
 * @param texaiWordSenses the list of texai word senses
 * @param startingPosition the previous word sense position within the word
 * @param hasMergedWordSenses the indicator whether this list of word senses, associated with a speech part, has been
 * merged from Wiktionary
 * @return the current word sense position
 */
private int orderTexaiWordSenses(
        final List<TexaiEnglishWordSense> texaiWordSenses,
        final int startingPosition,
        final boolean hasMergedWordSenses) {
    //Preconditions
    assert texaiWordSenses != null : "texaiWordSenses must not be null";
    assert startingPosition >= 0 : "startingPosition must not be negative";

    int currentPosition = startingPosition;
    for (final TexaiEnglishWordSense wordSense : texaiWordSenses) {
        currentPosition += 1;
        wordSense.setPosition(currentPosition);
        if (hasMergedWordSenses) {
            // merged sample phrases have not been persisted yet
            for (final TexaiSamplePhrase samplePhrase : wordSense.getTexaiSamplePhrases()) {
                domainEntityManager.persistDomainEntity(samplePhrase);
            }
        }
        domainEntityManager.persistDomainEntity(wordSense);
    }
    return currentPosition;
}
/**
 * Acquires word forms from the Wiktionary lexicon.
 * <P>
 * A Wiktionary word form is skipped when a Texai word form with the same word form string and the same word form
 * inflection already exists, or when no Texai word exists for its lemma. Otherwise a Texai word form is created
 * from it and persisted, and commit(int) is called after every 20th acquired word form.
 */
private void acquireWordFormsFromWiktionary() {
    entityTransaction = entityManager.getTransaction();
    entityTransaction.begin();
    final Iterator<Object> wiktionaryWordForms = domainEntityManager.domainEntityIterator(WiktionaryEnglishWordForm.class);
    final AtomicTerm wordFormProperty = domainEntityManager.findAtomicTermByTermName(Constants.TERM_NAME_TEXAI_WORD_FORM);
    final AtomicTerm lemmaProperty = domainEntityManager.findAtomicTermByTermName(Constants.TERM_NAME_TEXAI_LEMMA);
    while (wiktionaryWordForms.hasNext()) {
        final WiktionaryEnglishWordForm wiktionaryWordForm = (WiktionaryEnglishWordForm) wiktionaryWordForms.next();
        LOGGER.info("processing " + wiktionaryWordForm);
        nbrEnglishWordsProcessed++;

        // look for an existing texai word form having the same string and inflection
        final Set<Object> candidates = domainEntityManager.loadDomainEntitiesByPropertyValue(
                wordFormProperty,
                wiktionaryWordForm.getWordForm(),
                TexaiEnglishWordForm.class);
        boolean isAlreadyAcquired = false;
        for (final Object candidateObj : candidates) {
            LOGGER.debug(" comparing " + candidateObj);
            final TexaiEnglishWordForm candidate = (TexaiEnglishWordForm) candidateObj;
            if (candidate.getWordFormInflection().equals(wiktionaryWordForm.getWordFormInflection())) {
                isAlreadyAcquired = true;
                break;
            }
        }
        if (isAlreadyAcquired) {
            continue;
        }

        // the new word form needs the texai word for its lemma
        final String lemma = wiktionaryWordForm.getWiktionaryEnglishWord().getLemma();
        final TexaiEnglishWord owningWord = (TexaiEnglishWord) domainEntityManager.loadDomainEntityByIndentifyingPropertyValue(
                lemmaProperty,
                lemma,
                TexaiEnglishWord.class);
        if (owningWord == null) {
            LOGGER.info("expected word not found for " + lemma);
            continue;
        }

        wiktionaryWordForm.getARPABETPronunciations().size(); // load the lazy set
        final TexaiEnglishWordForm acquiredWordForm = new TexaiEnglishWordForm(
                wiktionaryWordForm.getWordForm(),
                wiktionaryWordForm.getWordFormInflection(),
                wiktionaryWordForm.getARPABETPronunciations(),
                owningWord,
                wiktionaryWordForm,
                null,
                null,
                false);
        LOGGER.info(" adding " + acquiredWordForm);
        domainEntityManager.persistDomainEntity(acquiredWordForm);
        nbrEnglishWordsAcquired++;
        if (nbrEnglishWordsAcquired % 20 == 0) {
            commit(nbrEnglishWordsAcquired);
        }
    }
}
/**
 * Acquires lexicon from the CMU Pronouncing Dictionary domain entities in the KB.
 * <P>
 * Every Texai word form whose word form string equals that of a CMU dictionary word form receives the CMU
 * pronunciations in addition to its own, is linked to the CMU dictionary word form, and is persisted.
 * commit(int) is called after every 20th updated Texai word form.
 */
private void acquireLexiconFromCMUPronouncingDictionary() {
    entityTransaction = entityManager.getTransaction();
    entityTransaction.begin();
    final Iterator<Object> cmuDictionaryEnglishWordForm_iter = domainEntityManager.domainEntityIterator(CMUDictionaryEnglishWordForm.class);
    final AtomicTerm texaiWordFormTerm = domainEntityManager.findAtomicTermByTermName(Constants.TERM_NAME_TEXAI_WORD_FORM);
    while (cmuDictionaryEnglishWordForm_iter.hasNext()) {
        final CMUDictionaryEnglishWordForm cmuDictionaryEnglishWordForm = (CMUDictionaryEnglishWordForm) cmuDictionaryEnglishWordForm_iter.next();
        LOGGER.info("processing " + cmuDictionaryEnglishWordForm);
        nbrEnglishWordsProcessed++;
        // the same word form string can belong to several texai word forms, e.g. one per speech part
        final Set<Object> texaiEnglishWordForms = domainEntityManager.loadDomainEntitiesByPropertyValue(
                texaiWordFormTerm,
                cmuDictionaryEnglishWordForm.getWordForm(),
                TexaiEnglishWordForm.class);
        for (final Object texaiEnglishWordFormObj : texaiEnglishWordForms) {
            final TexaiEnglishWordForm texaiEnglishWordForm = (TexaiEnglishWordForm) texaiEnglishWordFormObj;
            texaiEnglishWordForm.getARPABETPronunciations().addAll(cmuDictionaryEnglishWordForm.getARPABETPronunciations());
            texaiEnglishWordForm.setCMUDictionaryEnglishWordForm(cmuDictionaryEnglishWordForm);
            LOGGER.info(" updating " + texaiEnglishWordForm);
            domainEntityManager.persistDomainEntity(texaiEnglishWordForm);
            if (++nbrEnglishWordsAcquired % 20 == 0) {
                commit(nbrEnglishWordsAcquired);
            }
        }
    }
}
/**
 * Populates the mapped term of each Texai word sense from the mapped term of its WordNet synset.
 * <p>
 * Word senses that have no WordNet synset, or whose synset has no mapped term, are left unchanged.
 * Work is committed after every 20 updated word senses.
 */
private void fixMappedTerms() {
  entityTransaction = entityManager.getTransaction();
  entityTransaction.begin();
  for (final Iterator<Object> wordSenses = domainEntityManager.domainEntityIterator(TexaiEnglishWordSense.class); wordSenses.hasNext();) {
    final TexaiEnglishWordSense wordSense = (TexaiEnglishWordSense) wordSenses.next();
    LOGGER.debug("processing " + wordSense);
    nbrEnglishWordsProcessed++;

    final WordNetSynset synset = wordSense.getWordNetSynset();
    if (synset == null) {
      // not derived from WordNet
      continue;
    }
    final AbstractReifiedTerm mappedTerm = synset.getWNMappedTerm();
    if (mappedTerm == null) {
      // the synset itself is unmapped
      continue;
    }

    wordSense.setTexaiMappedTerm(mappedTerm);
    LOGGER.info(" " + wordSense + " --> " + mappedTerm);
    domainEntityManager.persistDomainEntity(wordSense);
    nbrEnglishWordsAcquired++;
    if (nbrEnglishWordsAcquired % 20 == 0) {
      commit(nbrEnglishWordsAcquired);
    }
  }
}
/**
 * Acquires proper nouns from WordNet cased words, by promoting Nouns to ProperNouns.
 * <p>
 * For each WordNet cased word, the Texai word forms whose word form property equals the cased lemma
 * are examined.  The first Noun or ProperNoun word form encountered is kept (and promoted to
 * ProperNoun when it is a Noun); any further Noun or ProperNoun word forms are treated as duplicates
 * and deleted.  Every Noun word sense of the owning Texai word is promoted to ProperNoun, and work is
 * committed after every 20 promoted word senses.
 */
private void acquireProperNounsFromWordNet() {
  entityTransaction = entityManager.getTransaction();
  entityTransaction.begin();
  LOGGER.info("finding WordNet cased words");
  final Iterator<Object> wordNetCasedEnglishWord_iter = domainEntityManager.domainEntityIterator(WordNetCasedEnglishWord.class);
  final AtomicTerm texaiWordForm = domainEntityManager.findAtomicTermByTermName(Constants.TERM_NAME_TEXAI_WORD_FORM);
  // not final: looked up again after each commit, because commit() clears the entity manager
  AtomicTerm properNoun = domainEntityManager.findAtomicTermByTermName(Constants.TERM_NAME_PROPER_NOUN);
  final Set<TexaiEnglishWordForm> duplicateWordForms = new HashSet<TexaiEnglishWordForm>();
  while (wordNetCasedEnglishWord_iter.hasNext()) {
    final WordNetCasedEnglishWord wordNetCasedEnglishWord = (WordNetCasedEnglishWord) wordNetCasedEnglishWord_iter.next();
    LOGGER.info("processing " + wordNetCasedEnglishWord);
    nbrEnglishWordsProcessed++;
    final String wnLemma = wordNetCasedEnglishWord.getWNLemma();
    final Set<Object> texaiEnglishWordForms = domainEntityManager.loadDomainEntitiesByPropertyValue(
        texaiWordForm,
        wnLemma,
        TexaiEnglishWordForm.class);
    LOGGER.info(" word forms " + texaiEnglishWordForms);
    // after the loop this holds the LAST word form visited, whatever its inflection
    TexaiEnglishWordForm texaiEnglishWordForm = null;
    boolean haveProperNoun = false;
    duplicateWordForms.clear();
    for (final Object texaiEnglishWordFormObj : texaiEnglishWordForms) {
      texaiEnglishWordForm = (TexaiEnglishWordForm) texaiEnglishWordFormObj;
      final String wordFormInflectionString = texaiEnglishWordForm.getWordFormInflection().toString();
      if (Constants.TERM_NAME_NOUN.equals(wordFormInflectionString)
          || Constants.TERM_NAME_PROPER_NOUN.equals(wordFormInflectionString)) {
        if (haveProperNoun) {
          // a second noun-like word form for the same spelling
          duplicateWordForms.add(texaiEnglishWordForm);
        } else {
          haveProperNoun = true;
          if (Constants.TERM_NAME_NOUN.equals(wordFormInflectionString)) {
            texaiEnglishWordForm.setWordFormInflection(properNoun);
            LOGGER.info(" set word form " + texaiEnglishWordForm);
            domainEntityManager.persistDomainEntity(texaiEnglishWordForm);
          }
        }
      }
    }
    if (texaiEnglishWordForm != null) {
      TexaiEnglishWord texaiEnglishWord = texaiEnglishWordForm.getTexaiEnglishWord();
      final List<TexaiEnglishWordSense> texaiEnglishWordSenses = texaiEnglishWord.getTexaiEnglishWordSenses();
      // replace proxy with the loaded domain entity, for subsequent persistence
      texaiEnglishWord = texaiEnglishWordForm.getTexaiEnglishWord();
      for (final TexaiEnglishWordSense texaiEnglishWordSense : texaiEnglishWordSenses) {
        if ("Noun".equals(texaiEnglishWordSense.getSpeechPart().toString())) {
          texaiEnglishWordSense.setSpeechPart(properNoun);
          LOGGER.info(" set word sense " + texaiEnglishWordSense);
          domainEntityManager.persistDomainEntity(texaiEnglishWordSense);
          if (++nbrEnglishWordsAcquired % 20 == 0) {
            // NOTE(review): this commit happens while the word's senses are still being iterated and
            // before its duplicates are deleted; commit() clears the entity manager, so the remaining
            // objects may be detached here - confirm this is safe (the proper-adjective variant
            // commits only after the word is fully processed).
            commit(nbrEnglishWordsAcquired);
            properNoun = domainEntityManager.findAtomicTermByTermName(Constants.TERM_NAME_PROPER_NOUN);
          }
        }
      }
      for (final TexaiEnglishWordForm duplicateWordForm : duplicateWordForms) {
        LOGGER.info(" deleting duplicate " + duplicateWordForm);
        LOGGER.info("before");
        for (final TexaiEnglishWordForm wf : texaiEnglishWord.getTexaiEnglishWordForms()) {
          LOGGER.info(" wf " + wf.getTermId());
        }
        texaiEnglishWord.getTexaiEnglishWordForms().remove(duplicateWordForm);
        LOGGER.info("after");
        for (final TexaiEnglishWordForm wf : texaiEnglishWord.getTexaiEnglishWordForms()) {
          LOGGER.info(" wf " + wf.getTermId());
        }
        domainEntityManager.deleteDomainEntity(duplicateWordForm);
      }
      if (!duplicateWordForms.isEmpty()) {
        // note that the below persisted object is not the cglib proxy
        domainEntityManager.persistDomainEntity(texaiEnglishWord);
      }
    }
  }
}
/**
 * Acquires proper adjectives from WordNet cased words, by promoting Adjectives to ProperAdjectives.
 * <p>
 * For each WordNet cased word, the Texai word forms whose word form property equals the cased lemma
 * are examined.  The first Adjective or ProperAdjective word form encountered is kept (and promoted
 * to ProperAdjective when it is an Adjective); any further such word forms are treated as duplicates
 * and deleted.  Every Adjective word sense of the owning Texai word is promoted to ProperAdjective.
 * Work is committed after every 20 cased words visited.
 */
private void acquireProperAdjectivesFromWordNet() {
  entityTransaction = entityManager.getTransaction();
  entityTransaction.begin();
  LOGGER.info("finding WordNet cased words");
  final Iterator<Object> wordNetCasedEnglishWord_iter = domainEntityManager.domainEntityIterator(WordNetCasedEnglishWord.class);
  final AtomicTerm texaiWordForm = domainEntityManager.findAtomicTermByTermName(Constants.TERM_NAME_TEXAI_WORD_FORM);
  // not final: looked up again after each commit, because commit() clears the entity manager
  AtomicTerm properAdjective = domainEntityManager.findAtomicTermByTermName(Constants.TERM_NAME_PROPER_ADJECTIVE);
  final Set<TexaiEnglishWordForm> duplicateWordForms = new HashSet<TexaiEnglishWordForm>();
  while (wordNetCasedEnglishWord_iter.hasNext()) {
    final WordNetCasedEnglishWord wordNetCasedEnglishWord = (WordNetCasedEnglishWord) wordNetCasedEnglishWord_iter.next();
    LOGGER.info("processing " + wordNetCasedEnglishWord);
    nbrEnglishWordsProcessed++;
    final String wnLemma = wordNetCasedEnglishWord.getWNLemma();
    final Set<Object> texaiEnglishWordForms = domainEntityManager.loadDomainEntitiesByPropertyValue(
        texaiWordForm,
        wnLemma,
        TexaiEnglishWordForm.class);
    LOGGER.info(" word forms " + texaiEnglishWordForms);
    // after the loop this holds the LAST word form visited, whatever its inflection
    TexaiEnglishWordForm texaiEnglishWordForm = null;
    boolean haveProperAdjective = false;
    duplicateWordForms.clear();
    for (final Object texaiEnglishWordFormObj : texaiEnglishWordForms) {
      texaiEnglishWordForm = (TexaiEnglishWordForm) texaiEnglishWordFormObj;
      final String wordFormInflectionString = texaiEnglishWordForm.getWordFormInflection().toString();
      if (Constants.TERM_NAME_ADJECTIVE.equals(wordFormInflectionString)
          || Constants.TERM_NAME_PROPER_ADJECTIVE.equals(wordFormInflectionString)) {
        if (haveProperAdjective) {
          // a second adjective-like word form for the same spelling
          duplicateWordForms.add(texaiEnglishWordForm);
        } else {
          haveProperAdjective = true;
          if (Constants.TERM_NAME_ADJECTIVE.equals(wordFormInflectionString)) {
            texaiEnglishWordForm.setWordFormInflection(properAdjective);
            LOGGER.info(" set word form " + texaiEnglishWordForm);
            domainEntityManager.persistDomainEntity(texaiEnglishWordForm);
          }
        }
      }
    }
    if (texaiEnglishWordForm != null) {
      TexaiEnglishWord texaiEnglishWord = texaiEnglishWordForm.getTexaiEnglishWord();
      final List<TexaiEnglishWordSense> texaiEnglishWordSenses = texaiEnglishWord.getTexaiEnglishWordSenses();
      // replace proxy with the loaded domain entity, for subsequent persistence
      texaiEnglishWord = texaiEnglishWordForm.getTexaiEnglishWord();
      for (final TexaiEnglishWordSense texaiEnglishWordSense : texaiEnglishWordSenses) {
        if (Constants.TERM_NAME_ADJECTIVE.equals(texaiEnglishWordSense.getSpeechPart().toString())) {
          texaiEnglishWordSense.setSpeechPart(properAdjective);
          LOGGER.info(" set word sense " + texaiEnglishWordSense);
          domainEntityManager.persistDomainEntity(texaiEnglishWordSense);
        }
      }
      for (final TexaiEnglishWordForm duplicateWordForm : duplicateWordForms) {
        LOGGER.info(" deleting duplicate " + duplicateWordForm);
        texaiEnglishWord.getTexaiEnglishWordForms().remove(duplicateWordForm);
        domainEntityManager.deleteDomainEntity(duplicateWordForm);
      }
      if (!duplicateWordForms.isEmpty()) {
        // note that the below persisted object is not the cglib proxy
        domainEntityManager.persistDomainEntity(texaiEnglishWord);
      }
    }
    // commit only once the current word is fully processed; the counter advances for every cased word
    if (++nbrEnglishWordsAcquired % 20 == 0) {
      commit(nbrEnglishWordsAcquired);
      properAdjective = domainEntityManager.findAtomicTermByTermName(Constants.TERM_NAME_PROPER_ADJECTIVE);
    }
  }
}
/**
 * Acquires lexicon from OpenCyc print strings associated with KB terms.
 * <p>
 * Iterates the canonical pretty-string assertions and, for each OpenCyc term that is not excluded
 * from mapping, looks up the Texai word forms whose word form property equals the pretty string.
 * The term is mapped only in the unambiguous case: exactly one matching word form, whose word has
 * exactly one word sense, and that sense has no mapped term yet.  Work is committed after every 20
 * mapped word senses.
 */
private void acquireLexiconFromOpenCyc() {
  entityTransaction = entityManager.getTransaction();
  entityTransaction.begin();
  LOGGER.info("finding OpenCyc prettyString terms");
  final Iterator<BinaryGAF> binaryGAF_iter = domainEntityManager.binaryGAFByPredicateIterator(
      domainEntityManager.findAtomicTermByTermName(Constants.TERM_NAME_PRETTY_STRING_CANONICAL),
      domainEntityManager.findAtomicTermByTermName(Constants.TERM_NAME_ENGLISH_MT));
  final AtomicTerm texaiWordFormTerm = domainEntityManager.findAtomicTermByTermName(Constants.TERM_NAME_TEXAI_WORD_FORM);
  while (binaryGAF_iter.hasNext()) {
    final BinaryGAF binaryGAF = binaryGAF_iter.next();
    final AbstractReifiedTerm openCycTerm = (AbstractReifiedTerm) binaryGAF.getArg1();
    if (isOpenCycTermExcludedFromMapping(openCycTerm.toString())) {
      continue;
    }
    final PString pString = (PString) binaryGAF.getArg2();
    final String openCycPrettyString = pString.getStringValue();
    LOGGER.debug("processing " + openCycPrettyString);
    nbrEnglishWordsProcessed++;
    final Set<Object> texaiEnglishWordForms = domainEntityManager.loadDomainEntitiesByPropertyValue(
        texaiWordFormTerm,
        openCycPrettyString,
        TexaiEnglishWordForm.class);
    if (texaiEnglishWordForms.size() != 1) {
      // no match, or an ambiguous spelling
      continue;
    }
    final TexaiEnglishWordForm texaiEnglishWordForm = (TexaiEnglishWordForm) texaiEnglishWordForms.iterator().next();
    final List<TexaiEnglishWordSense> texaiEnglishWordSenses = texaiEnglishWordForm.getTexaiEnglishWord().getTexaiEnglishWordSenses();
    if (texaiEnglishWordSenses.size() != 1) {
      // no sense, or an ambiguous word
      continue;
    }
    final TexaiEnglishWordSense texaiEnglishWordSense = texaiEnglishWordSenses.get(0);
    LOGGER.debug(" " + texaiEnglishWordSense);
    if (texaiEnglishWordSense.getTexaiMappedTerm() == null) {
      texaiEnglishWordSense.setIsOpenCycWordSense(true);
      texaiEnglishWordSense.setTexaiMappedTerm(openCycTerm);
      LOGGER.info(" " + texaiEnglishWordSense + " --> " + openCycTerm);
      domainEntityManager.persistDomainEntity(texaiEnglishWordSense);
      if (++nbrEnglishWordsAcquired % 20 == 0) {
        commit(nbrEnglishWordsAcquired);
      }
    }
  }
}

/**
 * Decides from its name whether an OpenCyc term must not be mapped to a word sense.
 *
 * @param openCycTermString the string form of the OpenCyc term
 * @return true when the name is null or empty, starts with a lower case letter (a predicate), ends
 * with "Fn" (a functor term), carries one of the movie/band/TV show/program suffixes, or starts with
 * one of the underspecified-term prefixes
 */
private static boolean isOpenCycTermExcludedFromMapping(final String openCycTermString) {
  if (openCycTermString == null || openCycTermString.length() == 0) {
    // nothing to classify; also keeps charAt(0) below from throwing
    return true;
  }
  // do not map OpenCyc predicates
  if (Character.isLowerCase(openCycTermString.charAt(0))) {
    return true;
  }
  // do not map OpenCyc functor terms
  if (openCycTermString.endsWith("Fn")) {
    return true;
  }
  // do not map OpenCyc Movie, Band, TV show or Program terms
  if (openCycTermString.endsWith("-TheMovie")
      || openCycTermString.endsWith("-TheBand")
      || openCycTermString.endsWith("-TheTVShow")
      || openCycTermString.endsWith("-TheProgram")) {
    return true;
  }
  // do not map OpenCyc underspecified terms
  return openCycTermString.startsWith("PredicateNamedFn")
      || openCycTermString.startsWith("InstanceNamedFn");
}
/**
 * Acquires Wiktionary glosses and sample phrases.
 * <p>
 * For each Wiktionary English word, its word senses and the word senses of the Texai word having the
 * same lemma are grouped into noun, verb, adjective and adverb categories; senses that fit none of
 * these are ignored.  The first category, in that order, that holds exactly one Wiktionary sense and
 * exactly one Texai sense has the Wiktionary gloss and sample phrases merged into the Texai sense.
 * Work is committed after every 100 Wiktionary words processed.
 */
private void acquireWiktionaryGlossesAndSamplePhrases() {
  entityTransaction = entityManager.getTransaction();
  entityTransaction.begin();
  LOGGER.info("gathering Wiktionary English words");
  final Iterator<Object> wiktionaryEnglishWord_iter = domainEntityManager.domainEntityIterator(WiktionaryEnglishWord.class);
  final AtomicTerm property = domainEntityManager.findAtomicTermByTermName(Constants.TERM_NAME_TEXAI_LEMMA);
  while (wiktionaryEnglishWord_iter.hasNext()) {
    final WiktionaryEnglishWord wiktionaryEnglishWord = (WiktionaryEnglishWord) wiktionaryEnglishWord_iter.next();
    LOGGER.debug("processing " + wiktionaryEnglishWord);
    final String lemma = wiktionaryEnglishWord.getLemma();

    // categorize the Wiktionary word senses
    final List<WiktionaryEnglishWordSense> nounWiktionaryWordSenses = new ArrayList<WiktionaryEnglishWordSense>();
    final List<WiktionaryEnglishWordSense> verbWiktionaryWordSenses = new ArrayList<WiktionaryEnglishWordSense>();
    final List<WiktionaryEnglishWordSense> adjectiveWiktionaryWordSenses = new ArrayList<WiktionaryEnglishWordSense>();
    final List<WiktionaryEnglishWordSense> adverbWiktionaryWordSenses = new ArrayList<WiktionaryEnglishWordSense>();
    for (final WiktionaryEnglishWordSense wiktionaryEnglishWordSense : wiktionaryEnglishWord.getWiktionaryEnglishWordSenses()) {
      final String speechPartString = wiktionaryEnglishWordSense.getSpeechPart().toString();
      if (Constants.TERM_NAME_NOUN.equals(speechPartString)
          || Constants.TERM_NAME_ABBREVIATION.equals(speechPartString)
          || Constants.TERM_NAME_ACRONYM.equals(speechPartString)
          || Constants.TERM_NAME_COLLECTIVE_NOUN.equals(speechPartString)
          || Constants.TERM_NAME_GERUNDIVE_NOUN.equals(speechPartString)
          || Constants.TERM_NAME_INITIALISM.equals(speechPartString)
          || Constants.TERM_NAME_MASS_NOUN.equals(speechPartString)
          || Constants.TERM_NAME_NOUN_PHRASE.equals(speechPartString)
          || Constants.TERM_NAME_PLURAL_NOUN_WORD_FORM.equals(speechPartString)
          || Constants.TERM_NAME_PROPER_NOUN.equals(speechPartString)) {
        // nouns
        nounWiktionaryWordSenses.add(wiktionaryEnglishWordSense);
      } else if (Constants.TERM_NAME_VERB.equals(speechPartString)
          || Constants.TERM_NAME_AUX_VERB.equals(speechPartString)
          || Constants.TERM_NAME_INTRANSITIVE_VERB.equals(speechPartString)
          || Constants.TERM_NAME_REFLEXIVE_VERB.equals(speechPartString)
          || Constants.TERM_NAME_TRANSITIVE_VERB.equals(speechPartString)
          || Constants.TERM_NAME_VERB_PHRASE.equals(speechPartString)) {
        // verbs
        verbWiktionaryWordSenses.add(wiktionaryEnglishWordSense);
      } else if (Constants.TERM_NAME_ADJECTIVE.equals(speechPartString)
          || Constants.TERM_NAME_ADJECTIVE_PHRASE.equals(speechPartString)
          || Constants.TERM_NAME_PROPER_ADJECTIVE.equals(speechPartString)) {
        // adjectives
        adjectiveWiktionaryWordSenses.add(wiktionaryEnglishWordSense);
      } else if (Constants.TERM_NAME_ADVERB.equals(speechPartString)
          || Constants.TERM_NAME_ADVERB_PHRASE.equals(speechPartString)) {
        // adverbs
        adverbWiktionaryWordSenses.add(wiktionaryEnglishWordSense);
      }
      // any other speech part takes no part in the merge
    }

    final TexaiEnglishWord texaiEnglishWord = (TexaiEnglishWord) domainEntityManager.loadDomainEntityByIndentifyingPropertyValue(
        property,
        lemma,
        TexaiEnglishWord.class);
    assert texaiEnglishWord != null : "cannot find " + lemma;

    // categorize the texai word senses
    final List<TexaiEnglishWordSense> nounTexaiWordSenses = new ArrayList<TexaiEnglishWordSense>();
    final List<TexaiEnglishWordSense> verbTexaiWordSenses = new ArrayList<TexaiEnglishWordSense>();
    final List<TexaiEnglishWordSense> adjectiveTexaiWordSenses = new ArrayList<TexaiEnglishWordSense>();
    final List<TexaiEnglishWordSense> adverbTexaiWordSenses = new ArrayList<TexaiEnglishWordSense>();
    for (final TexaiEnglishWordSense texaiEnglishWordSense : texaiEnglishWord.getTexaiEnglishWordSenses()) {
      String speechPartString = null;
      if (texaiEnglishWordSense.getBasicSpeechPart() != null) {
        speechPartString = texaiEnglishWordSense.getBasicSpeechPart().toString();
      }
      if (speechPartString == null) {
        // no basic speech part: takes no part in the merge
        continue;
      }
      if (Constants.TERM_NAME_NOUN.equals(speechPartString)) {
        nounTexaiWordSenses.add(texaiEnglishWordSense);
      } else if (Constants.TERM_NAME_VERB.equals(speechPartString)) {
        verbTexaiWordSenses.add(texaiEnglishWordSense);
      } else if (Constants.TERM_NAME_ADJECTIVE.equals(speechPartString)) {
        adjectiveTexaiWordSenses.add(texaiEnglishWordSense);
      } else if (Constants.TERM_NAME_ADVERB.equals(speechPartString)) {
        adverbTexaiWordSenses.add(texaiEnglishWordSense);
      } else {
        LOGGER.info("uncategorized WordNet-dervived word sense");
      }
    }

    // merge wiktionary word senses for unpopulated texai categories among Noun, Verb, Adjective and Adverb
    // NOTE(review): this is an else-if chain, so at most ONE category is merged per word even when
    // several categories each hold exactly one sense on both sides - confirm that is intended.
    if (nounTexaiWordSenses.size() == 1 && nounWiktionaryWordSenses.size() == 1) {
      mergeWiktionaryGlossesAndSamplePhrases(
          nounWiktionaryWordSenses.get(0),
          nounTexaiWordSenses.get(0),
          domainEntityManager.findAtomicTermByTermName(Constants.TERM_NAME_NOUN),
          texaiEnglishWord);
    } else if (verbTexaiWordSenses.size() == 1 && verbWiktionaryWordSenses.size() == 1) {
      mergeWiktionaryGlossesAndSamplePhrases(
          verbWiktionaryWordSenses.get(0),
          verbTexaiWordSenses.get(0),
          domainEntityManager.findAtomicTermByTermName(Constants.TERM_NAME_VERB),
          texaiEnglishWord);
    } else if (adjectiveTexaiWordSenses.size() == 1 && adjectiveWiktionaryWordSenses.size() == 1) {
      mergeWiktionaryGlossesAndSamplePhrases(
          adjectiveWiktionaryWordSenses.get(0),
          adjectiveTexaiWordSenses.get(0),
          domainEntityManager.findAtomicTermByTermName(Constants.TERM_NAME_ADJECTIVE),
          texaiEnglishWord);
    } else if (adverbTexaiWordSenses.size() == 1 && adverbWiktionaryWordSenses.size() == 1) {
      mergeWiktionaryGlossesAndSamplePhrases(
          adverbWiktionaryWordSenses.get(0),
          adverbTexaiWordSenses.get(0),
          domainEntityManager.findAtomicTermByTermName(Constants.TERM_NAME_ADVERB),
          texaiEnglishWord);
    }
    if (++nbrEnglishWordsProcessed % 100 == 0) {
      commit(nbrEnglishWordsProcessed);
    }
  }
}
/**
 * Merges the gloss and sample phrases of the given Wiktionary word sense into the given Texai word sense.
 * <p>
 * Nothing happens when the Texai word sense already contains the Wiktionary gloss.  Otherwise the gloss
 * is added, a Texai sample phrase is created and persisted for each Wiktionary sample phrase, the
 * Wiktionary word sense position is set to 1, and the Texai word sense is linked to the Wiktionary
 * word sense and persisted.
 *
 * @param wiktionaryEnglishWordSense the Wiktionary word sense to merge from
 * @param texaiEnglishWordSense the Texai word sense to merge into
 * @param basicSpeechPart the basic speech part, e.g. Noun, Verb (not referenced by this method)
 * @param texaiEnglishWord the Texai English word (only checked for null)
 */
private void mergeWiktionaryGlossesAndSamplePhrases(
    final WiktionaryEnglishWordSense wiktionaryEnglishWordSense,
    final TexaiEnglishWordSense texaiEnglishWordSense,
    final AtomicTerm basicSpeechPart,
    final TexaiEnglishWord texaiEnglishWord) {
  //Preconditions
  assert wiktionaryEnglishWordSense != null : "wiktionaryEnglishWordSense must not be null";
  assert texaiEnglishWordSense != null : "texaiEnglishWordSense must not be null";
  assert texaiEnglishWord != null : "texaiEnglishWord must not be null";

  final String wiktionaryGloss = wiktionaryEnglishWordSense.getGloss();
  if (texaiEnglishWordSense.getGlosses().contains(wiktionaryGloss)) {
    // already merged
    return;
  }

  nbrEnglishWordsAcquired++;
  LOGGER.info("");
  LOGGER.info(texaiEnglishWordSense + " adding gloss \"" + wiktionaryGloss + "\"");
  texaiEnglishWordSense.getGlosses().add(wiktionaryEnglishWordSense.getGloss());
  LOGGER.debug(" updated glosses" + texaiEnglishWordSense.getGlosses());

  for (final WiktionarySamplePhrase sourcePhrase : wiktionaryEnglishWordSense.getSamplePhrases()) {
    final TexaiSamplePhrase copiedPhrase = new TexaiSamplePhrase(
        sourcePhrase.getWiktionarySamplePhrase(),
        texaiEnglishWordSense);
    LOGGER.info(texaiEnglishWordSense + " adding sample phrase \"" + copiedPhrase.getSamplePhrase() + "\"");
    domainEntityManager.persistDomainEntity(copiedPhrase);
    texaiEnglishWordSense.getTexaiSamplePhrases().add(copiedPhrase);
    LOGGER.debug(" updated sample phrases" + texaiEnglishWordSense.getTexaiSamplePhrases());
  }

  // fix Wiktionary word sense position
  wiktionaryEnglishWordSense.setPosition(1);
  texaiEnglishWordSense.setWiktionaryEnglishWordSense(wiktionaryEnglishWordSense);
  domainEntityManager.persistDomainEntity(texaiEnglishWordSense);
}
/**
 * Fixes the WordNet synset references in matching Texai word senses.
 * <p>
 * For each WordNet synset, the Texai word senses whose gloss property equals the synset gloss are
 * loaded.  A word sense is linked to the synset when the synset's words string contains the lemma of
 * the word sense's word and the synset speech part equals the word sense's basic speech part.  Work
 * is committed after every 20 linked word senses.
 */
private void fixWordNetSynsetReferences() {
  entityTransaction = entityManager.getTransaction();
  entityTransaction.begin();
  final AtomicTerm property = domainEntityManager.findAtomicTermByTermName(Constants.TERM_NAME_TEXAI_WORD_SENSE_GLOSS);
  LOGGER.info("gathering WordNet synsets");
  for (final Iterator<Object> synsets = domainEntityManager.domainEntityIterator(WordNetSynset.class); synsets.hasNext();) {
    final WordNetSynset synset = (WordNetSynset) synsets.next();
    LOGGER.info("");
    LOGGER.info("processing " + synset.description());
    nbrEnglishWordsProcessed++;
    final Set<Object> candidates = domainEntityManager.loadDomainEntitiesByPropertyValue(
        property,
        synset.getWNSynsetGloss(),
        TexaiEnglishWordSense.class);
    for (final Object candidate : candidates) {
      final TexaiEnglishWordSense wordSense = (TexaiEnglishWordSense) candidate;
      if (!synset.synsetWordsAsString().contains(wordSense.getTexaiEnglishWord().getLemma())) {
        // same gloss, but the word is not among the synset words
        continue;
      }
      if (!synset.getWNSynsetSpeechPart().equals(wordSense.getBasicSpeechPart())) {
        LOGGER.info(" ** " + synset.getWNSynsetSpeechPart() + " did not match " + wordSense.getBasicSpeechPart());
        continue;
      }
      wordSense.setWordNetSynset(synset);
      LOGGER.info(" set " + wordSense);
      domainEntityManager.persistDomainEntity(wordSense);
      nbrEnglishWordsAcquired++;
      if (nbrEnglishWordsAcquired % 20 == 0) {
        commit(nbrEnglishWordsAcquired);
      }
    }
  }
}
/**
 * Lower-cases the lemma of every Texai English word whose lemma is not already lower case.
 * <p>
 * Changed words are persisted, and work is committed after every 20 changed words.
 */
private void fixUppercaseWords() {
  entityTransaction = entityManager.getTransaction();
  entityTransaction.begin();
  LOGGER.info("gathering words");
  final Iterator<Object> texaiEnglishWord_iter = domainEntityManager.domainEntityIterator(TexaiEnglishWord.class);
  while (texaiEnglishWord_iter.hasNext()) {
    final TexaiEnglishWord texaiEnglishWord = (TexaiEnglishWord) texaiEnglishWord_iter.next();
    LOGGER.info("processing " + texaiEnglishWord);
    final String originalLemma = texaiEnglishWord.getLemma();
    // use a fixed locale: the lexicon is English, and the default-locale overload would produce
    // different lemmas on, for example, a Turkish-locale JVM (I --> dotless i)
    final String lemma = originalLemma.toLowerCase(java.util.Locale.ENGLISH);
    if (!originalLemma.equals(lemma)) {
      texaiEnglishWord.setLemma(lemma);
      LOGGER.info(" fixed " + texaiEnglishWord);
      domainEntityManager.persistDomainEntity(texaiEnglishWord);
      if (++nbrEnglishWordsAcquired % 20 == 0) {
        commit(nbrEnglishWordsAcquired);
      }
    }
  }
}
/**
 * Lower-cases Texai English word forms that are not already lower case.
 * <p>
 * Word forms whose inflection is Initialism, Abbreviation, ProperNoun or ProperAdjective keep their
 * capitalization.  Changed word forms are persisted, and work is committed after every 20 changed
 * word forms.
 */
private void fixUppercaseWordForms() {
  //TODO only ProperNoun and ProperAdjective word forms should have upper case first letter, initialisms and abbreviations are ok.
  entityTransaction = entityManager.getTransaction();
  entityTransaction.begin();
  LOGGER.info("gathering word forms");
  final Iterator<Object> texaiEnglishWordForm_iter = domainEntityManager.domainEntityIterator(TexaiEnglishWordForm.class);
  while (texaiEnglishWordForm_iter.hasNext()) {
    final TexaiEnglishWordForm texaiEnglishWordForm = (TexaiEnglishWordForm) texaiEnglishWordForm_iter.next();
    LOGGER.info("processing " + texaiEnglishWordForm);
    final String speechPartString = texaiEnglishWordForm.getWordFormInflection().toString();
    final String originalWordForm = texaiEnglishWordForm.getWordForm();
    // use a fixed locale: the lexicon is English, and the default-locale overload would produce
    // different word forms on, for example, a Turkish-locale JVM (I --> dotless i)
    final String wordForm = originalWordForm.toLowerCase(java.util.Locale.ENGLISH);
    if (originalWordForm.equals(wordForm)) {
      // already lower case
      continue;
    }
    if ("Initialism".equals(speechPartString)
        || "Abbreviation".equals(speechPartString)
        || "ProperNoun".equals(speechPartString)
        || "ProperAdjective".equals(speechPartString)) {
      // capitalization is legitimate for these inflections
      continue;
    }
    // EAT --> eat
    texaiEnglishWordForm.setWordForm(wordForm);
    LOGGER.info(" fixed " + texaiEnglishWordForm);
    domainEntityManager.persistDomainEntity(texaiEnglishWordForm);
    if (++nbrEnglishWordsAcquired % 20 == 0) {
      commit(nbrEnglishWordsAcquired);
    }
  }
}
/** Commits a group of persisted domain entities to the knowledge base.
 * <p>
 * The current transaction is committed, the entity manager is cleared, and a new transaction is
 * begun, so callers can continue persisting.  Objects loaded before this call should be looked up
 * again afterwards (some callers in this class re-fetch their terms right after committing).
 *
 * @param count the count to log
 */
private void commit(final int count) {
LOGGER.info("");
LOGGER.info("*** committing *** " + count);
LOGGER.info("");
entityTransaction.commit();
// drop everything the entity manager is holding
// NOTE(review): presumably done to bound memory during the bulk load - confirm
entityManager.clear();
// once the logged count reaches 200, well-formed-formula validation is switched off
// NOTE(review): presumably the first 200 entities serve as a sanity check - confirm
if (count >= 200) {
domainEntityManager.setValidateWellFormedFormula(false);
}
// start the next transaction
entityTransaction = entityManager.getTransaction();
entityTransaction.begin();
// look up the creator and creation purpose terms again, now that the entity manager was cleared
domainEntityManager.setCreator(domainEntityManager.findAtomicTermByTermName(Constants.TERM_NAME_LEXICON_INITIALIZATION_PROCESS));
domainEntityManager.setCreationPurpose(domainEntityManager.findAtomicTermByTermName(Constants.TERM_NAME_LEXICON_INITIALIZATION_PROJECT));
}
/**
 * Finalizes this application.
 * <p>
 * Commits the open transaction, shuts down the cache manager, closes the entity manager and its
 * factory, and logs the acquired/processed totals.  The resources are released even when the commit
 * (or an earlier release step) throws; in that case the exception propagates and the totals are not
 * logged.
 */
private void finalization() {
  try {
    entityTransaction.commit();
  } finally {
    try {
      CacheManager.getInstance().shutdown();
    } finally {
      try {
        entityManager.close();
      } finally {
        entityManagerFactory.close();
      }
    }
  }
  LOGGER.info("Number of English words acquired " + nbrEnglishWordsAcquired);
  LOGGER.info("Number of English words processed for word senses " + nbrEnglishWordsProcessed);
}
/**
 * Provides a comparator that orders WordNet word senses by descending frequency-of-usage rank,
 * breaking ties by ascending term id.
 */
@SuppressWarnings("unchecked")
private class WordNetWordSenseComparator implements Comparator {

  /** Constructs a new WordNetWordSenseComparator instance. */
  public WordNetWordSenseComparator() {
  }

  /**
   * Compares two WordNet word senses for order.
   *
   * @param o1 the first WordNet word sense
   * @param o2 the second WordNet word sense
   * @return a negative integer when the first word sense sorts before the second, a positive integer
   * when it sorts after, and zero when both have the same rank and term id
   */
  public int compare(final Object o1, final Object o2) {
    //Preconditions
    assert o1 != null : "o1 must not be null";
    assert o2 != null : "o2 must not be null";
    assert o1 instanceof WordNetWordSense : "o1 must be a WordNetWordSense " + o1;
    assert o2 instanceof WordNetWordSense : "o2 must be a WordNetWordSense " + o2;

    final WordNetWordSense lhs = (WordNetWordSense) o1;
    final WordNetWordSense rhs = (WordNetWordSense) o2;

    // the higher usage rank sorts first
    if (lhs.getWNWordSenseFrequencyOfUsageRank() < rhs.getWNWordSenseFrequencyOfUsageRank()) {
      return 1;
    }
    if (lhs.getWNWordSenseFrequencyOfUsageRank() > rhs.getWNWordSenseFrequencyOfUsageRank()) {
      return -1;
    }
    // equal ranks: fall back to the term id
    return lhs.getTermId().compareTo(rhs.getTermId());
  }
}
/**
 * Runs the lexicon initializer as a stand-alone application.
 * <p>
 * The KB terms are initialized, the enabled pipeline steps are run, and finalization is performed
 * afterwards, also when one of the caught error types was logged.
 *
 * @param args the command line arguments (unused)
 */
public static void main(final String[] args) {
  final LexiconInitializer initializer = new LexiconInitializer();
  initializer.initialize();
  try {
    initializer.initializeKBTerms();
    // pipeline steps, in order; only the uncommented ones run
    // initializer.acquireLexiconFromWordNet();
    // initializer.populateBasicSpeechPart();
    // initializer.acquireLexiconFromWiktionary();
    // initializer.acquireLexiconFromCMUPronouncingDictionary();
    // initializer.fixMappedTerms();
    // initializer.acquireProperNounsFromWordNet();
    // initializer.acquireProperAdjectivesFromWordNet();
    // initializer.acquireLexiconFromOpenCyc();
    // initializer.acquireWiktionaryGlossesAndSamplePhrases();
    // initializer.fixWordNetSynsetReferences();
    // initializer.fixUppercaseWords();
    initializer.fixUppercaseWordForms();
  } catch (final TexaiException texaiException) {
    LOGGER.error(texaiException);
    texaiException.printStackTrace(System.err);
  } catch (final NullPointerException nullPointerException) {
    LOGGER.error(nullPointerException);
    nullPointerException.printStackTrace(System.err);
  } catch (final AssertionError assertionError) {
    LOGGER.error(assertionError);
    assertionError.printStackTrace(System.err);
  }
  initializer.finalization();
}
}
// See more files for this project here