// LexiconToXML.java from Texai at Krugle
// Show LexiconToXML.java syntax highlighted
/*
* LexiconToXML.java
*
* Created on March 14, 2007, 11:30 AM
*
* Description: Serializes the lexicon to XML.
*
* Copyright (C) 2007 Stephen L. Reed.
*
* This program is free software; you can redistribute it and/or modify it under the terms
* of the GNU General Public License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
* without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
* See the GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along with this program;
* if not, write to the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
package org.texai.lexicon;
import java.io.BufferedWriter;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.util.Iterator;
import java.util.Map;
import java.util.TreeMap;
import javax.persistence.EntityManager;
import javax.persistence.EntityManagerFactory;
import javax.persistence.Persistence;
import net.sf.ehcache.CacheManager;
import org.apache.log4j.Logger;
import org.texai.cmudict.domainEntity.ARPABETPronunciation;
import org.texai.kb.CacheInitializer;
import org.texai.kb.Constants;
import org.texai.kb.ejb.session.DomainEntityManagerBean;
import org.texai.lexicon.domainEntity.TexaiEnglishWord;
import org.texai.lexicon.domainEntity.TexaiEnglishWordForm;
import org.texai.lexicon.domainEntity.TexaiEnglishWordSense;
import org.texai.lexicon.domainEntity.TexaiSamplePhrase;
import org.texai.util.TexaiException;
/**
 * Serializes the Texai English lexicon to an XML file.
 *
 * <p>The words are first gathered into a map sorted by lemma, then each word is loaded by its
 * term id and written as a {@code <word>} element containing its lemma, sources, word forms
 * and word senses. The output is written to {@link #OUTPUT_FILE_PATH} encoded as UTF-8, and
 * all element text is XML-escaped.
 *
 * @author reed
 */
public final class LexiconToXML {

  /** the log4j logger */
  private static final Logger LOGGER = Logger.getLogger(LexiconToXML.class.getName());
  /** the maximum number of words gathered; lower it (for example to 200) for testing */
  private static final int TEST_LIMIT = Integer.MAX_VALUE;
  /** the number of gathered words after which the entity manager is cleared */
  private static final int ENTITY_MANAGER_CLEAR_INTERVAL = 20;
  /** the XML open lemma tag */
  private static final String OPEN_LEMMA_TAG = "<lemma>";
  /** the XML close lemma tag */
  private static final String CLOSE_LEMMA_TAG = "</lemma>";
  /** the XML open lexicon tag */
  private static final String OPEN_LEXICON_TAG = "<lexicon>";
  /** the XML close lexicon tag */
  private static final String CLOSE_LEXICON_TAG = "</lexicon>";
  /** the XML open word tag */
  private static final String OPEN_WORD_TAG = "<word>";
  /** the XML close word tag */
  private static final String CLOSE_WORD_TAG = "</word>";
  /** the XML open word source tag */
  private static final String OPEN_WORD_SOURCE_TAG = "<word-source>";
  /** the XML close word source tag */
  private static final String CLOSE_WORD_SOURCE_TAG = "</word-source>";
  /** the XML open word form tag */
  private static final String OPEN_WORD_FORM_TAG = "<word-form>";
  /** the XML close word form tag */
  private static final String CLOSE_WORD_FORM_TAG = "</word-form>";
  /** the XML open word form pronunciation tag */
  private static final String OPEN_WORD_FORM_PRONUNCIATION_TAG = "<word-form-pronunciation>";
  /** the XML close word form pronunciation tag */
  private static final String CLOSE_WORD_FORM_PRONUNCIATION_TAG = "</word-form-pronunciation>";
  /** the XML open word form source tag */
  private static final String OPEN_WORD_FORM_SOURCE_TAG = "<word-form-source>";
  /** the XML close word form source tag */
  private static final String CLOSE_WORD_FORM_SOURCE_TAG = "</word-form-source>";
  /** the XML open word form speech part tag */
  private static final String OPEN_WORD_FORM_SPEECH_PART_TAG = "<word-form-speech-part>";
  /** the XML close word form speech part tag */
  private static final String CLOSE_WORD_FORM_SPEECH_PART_TAG = "</word-form-speech-part>";
  /** the XML open word form word tag */
  private static final String OPEN_WORD_FORM_WORD_TAG = "<word-form-word>";
  /** the XML close word form word tag */
  private static final String CLOSE_WORD_FORM_WORD_TAG = "</word-form-word>";
  /** the XML open word sense tag */
  private static final String OPEN_WORD_SENSE_TAG = "<word-sense>";
  /** the XML close word sense tag */
  private static final String CLOSE_WORD_SENSE_TAG = "</word-sense>";
  /** the XML open word sense category tag */
  private static final String OPEN_WORD_SENSE_CATEGORY_TAG = "<word-sense-category>";
  /** the XML close word sense category tag */
  private static final String CLOSE_WORD_SENSE_CATEGORY_TAG = "</word-sense-category>";
  /** the XML open word sense gloss tag */
  private static final String OPEN_WORD_SENSE_GLOSS_TAG = "<word-sense-gloss>";
  /** the XML close word sense gloss tag */
  private static final String CLOSE_WORD_SENSE_GLOSS_TAG = "</word-sense-gloss>";
  /** the XML open word sense mapped term tag */
  private static final String OPEN_WORD_SENSE_MAPPED_TERM_TAG = "<word-sense-mapped-term>";
  /** the XML close word sense mapped term tag */
  private static final String CLOSE_WORD_SENSE_MAPPED_TERM_TAG = "</word-sense-mapped-term>";
  /** the XML open word sense number tag */
  private static final String OPEN_WORD_SENSE_NUMBER_TAG = "<word-sense-number>";
  /** the XML close word sense number tag */
  private static final String CLOSE_WORD_SENSE_NUMBER_TAG = "</word-sense-number>";
  /** the XML open word sense sample phrase tag */
  private static final String OPEN_WORD_SENSE_SAMPLE_PHRASE_TAG = "<word-sense-sample-phrase>";
  /** the XML close word sense sample phrase tag */
  private static final String CLOSE_WORD_SENSE_SAMPLE_PHRASE_TAG = "</word-sense-sample-phrase>";
  /** the XML open word sense source tag */
  private static final String OPEN_WORD_SENSE_SOURCE_TAG = "<word-sense-source>";
  /** the XML close word sense source tag */
  private static final String CLOSE_WORD_SENSE_SOURCE_TAG = "</word-sense-source>";
  /** the XML open word sense speech part tag */
  private static final String OPEN_WORD_SENSE_SPEECH_PART_TAG = "<word-sense-speech-part>";
  /** the XML close word sense speech part tag */
  private static final String CLOSE_WORD_SENSE_SPEECH_PART_TAG = "</word-sense-speech-part>";
  /** the output XML path */
  private static final String OUTPUT_FILE_PATH = "/home/reed/svn/Lexicon/data/lexicon.xml";
  /** the output character encoding; UTF-8 is what XML parsers assume when no declaration is present */
  private static final String OUTPUT_ENCODING = "UTF-8";
  /** the indentation of a word element */
  private static final int WORD_INDENT = 2;
  /** the indentation of the direct children of a word element */
  private static final int WORD_CHILD_INDENT = WORD_INDENT + 2;
  /** the indentation of the children of a word form or word sense element */
  private static final int WORD_GRANDCHILD_INDENT = WORD_INDENT + 4;
  /** the XML output stream */
  private BufferedWriter bufferedWriter;
  /** the entity manager factory */
  private EntityManagerFactory entityManagerFactory;
  /** the entity manager */
  private EntityManager entityManager;
  /** the domain entity manager */
  private DomainEntityManagerBean domainEntityManager;
  /** the number of English words processed by the current pass (gathering or serializing) */
  private int nbrEnglishWordsProcessed = 0;
  /** the sorted word dictionary, word --> word term id */
  private final Map<String, Long> wordDictionary = new TreeMap<String, Long>();

  /** Creates a new instance of LexiconToXML. */
  public LexiconToXML() {
  }

  /**
   * Initializes the application and injects the dependencies for out-of-the-container execution of
   * J2EE session beans, then opens the XML output file.
   *
   * @throws TexaiException if the output file cannot be opened
   */
  private void initialize() {
    entityManagerFactory = Persistence.createEntityManagerFactory(Constants.TEST_PERSISTENCE_UNIT_NAME);
    entityManager = entityManagerFactory.createEntityManager();
    CacheInitializer.initializeCaches();
    domainEntityManager = new DomainEntityManagerBean();
    domainEntityManager.setEntityManager(entityManager);
    domainEntityManager.injectSharedBeanDependencies();
    //TODO create transaction and perform periodic commits to clear the entity manager
    try {
      // An explicit encoding is used because FileWriter would silently use the platform default.
      bufferedWriter = new BufferedWriter(
          new OutputStreamWriter(new FileOutputStream(OUTPUT_FILE_PATH), OUTPUT_ENCODING));
    } catch (final IOException ex) {
      throw new TexaiException(ex);
    }
  }

  /**
   * Serializes the lexicon to XML: gathers and sorts the words, then writes one word element per
   * gathered word inside the enclosing lexicon element.
   */
  private void serializeLexiconToXML() {
    sortWords();
    // The counter was used by the gathering pass; restart it for the serialization pass.
    nbrEnglishWordsProcessed = 0;
    writeXML(OPEN_LEXICON_TAG, 0);
    for (final Long termId : wordDictionary.values()) {
      final TexaiEnglishWord texaiEnglishWord = (TexaiEnglishWord) domainEntityManager.loadDomainEntity(termId);
      nbrEnglishWordsProcessed++;
      LOGGER.info("");
      LOGGER.info("----------------------------------------- " + nbrEnglishWordsProcessed);
      LOGGER.info("");
      writeWord(texaiEnglishWord);
    }
    writeXML(CLOSE_LEXICON_TAG, 0);
  }

  /**
   * Writes the word element for the given word: its lemma, its sources, its word forms and its
   * word senses.
   *
   * @param texaiEnglishWord the word to write
   */
  private void writeWord(final TexaiEnglishWord texaiEnglishWord) {
    LOGGER.info(texaiEnglishWord.getLemma());
    writeXML(OPEN_WORD_TAG, WORD_INDENT);
    writeElement(OPEN_LEMMA_TAG, texaiEnglishWord.getLemma(), CLOSE_LEMMA_TAG, WORD_CHILD_INDENT);
    if (texaiEnglishWord.getWiktionaryEnglishWord() != null) {
      writeSource(OPEN_WORD_SOURCE_TAG, "Wiktionary", CLOSE_WORD_SOURCE_TAG, WORD_CHILD_INDENT);
    }
    if (texaiEnglishWord.getWordNetEnglishWord() != null) {
      writeSource(OPEN_WORD_SOURCE_TAG, "WordNet", CLOSE_WORD_SOURCE_TAG, WORD_CHILD_INDENT);
    }
    if (texaiEnglishWord.getIsOpenCycWord()) {
      writeSource(OPEN_WORD_SOURCE_TAG, "OpenCyc", CLOSE_WORD_SOURCE_TAG, WORD_CHILD_INDENT);
    }
    for (final TexaiEnglishWordForm texaiEnglishWordForm : texaiEnglishWord.getTexaiEnglishWordForms()) {
      writeWordForm(texaiEnglishWordForm);
    }
    for (final TexaiEnglishWordSense texaiEnglishWordSense : texaiEnglishWord.getTexaiEnglishWordSenses()) {
      writeWordSense(texaiEnglishWordSense);
    }
    writeXML(CLOSE_WORD_TAG, WORD_INDENT);
  }

  /**
   * Writes the word form element for the given word form: the form itself, its inflection, its
   * pronunciations and its sources.
   *
   * @param texaiEnglishWordForm the word form to write
   */
  private void writeWordForm(final TexaiEnglishWordForm texaiEnglishWordForm) {
    LOGGER.info(" " + texaiEnglishWordForm.getWordFormInflection() + " " + texaiEnglishWordForm.getWordForm());
    writeXML(OPEN_WORD_FORM_TAG, WORD_CHILD_INDENT);
    writeElement(
        OPEN_WORD_FORM_WORD_TAG,
        texaiEnglishWordForm.getWordForm(),
        CLOSE_WORD_FORM_WORD_TAG,
        WORD_GRANDCHILD_INDENT);
    writeElement(
        OPEN_WORD_FORM_SPEECH_PART_TAG,
        texaiEnglishWordForm.getWordFormInflection(),
        CLOSE_WORD_FORM_SPEECH_PART_TAG,
        WORD_GRANDCHILD_INDENT);
    for (final ARPABETPronunciation arpabetPronunciation : texaiEnglishWordForm.getARPABETPronunciations()) {
      LOGGER.info(" [" + arpabetPronunciation.getPhonemeString() + "]");
      writeElement(
          OPEN_WORD_FORM_PRONUNCIATION_TAG,
          arpabetPronunciation.getPhonemeString(),
          CLOSE_WORD_FORM_PRONUNCIATION_TAG,
          WORD_GRANDCHILD_INDENT);
    }
    if (texaiEnglishWordForm.getWiktionaryEnglishWordForm() != null) {
      writeSource(OPEN_WORD_FORM_SOURCE_TAG, "Wiktionary", CLOSE_WORD_FORM_SOURCE_TAG, WORD_GRANDCHILD_INDENT);
    }
    if (texaiEnglishWordForm.getWordNetCasedEnglishWord() != null) {
      writeSource(OPEN_WORD_FORM_SOURCE_TAG, "WordNet", CLOSE_WORD_FORM_SOURCE_TAG, WORD_GRANDCHILD_INDENT);
    }
    if (texaiEnglishWordForm.getCMUDictionaryEnglishWordForm() != null) {
      writeSource(
          OPEN_WORD_FORM_SOURCE_TAG,
          "CMU Pronouncing Dictionary",
          CLOSE_WORD_FORM_SOURCE_TAG,
          WORD_GRANDCHILD_INDENT);
    }
    if (texaiEnglishWordForm.getIsOpenCycWordForm()) {
      writeSource(OPEN_WORD_FORM_SOURCE_TAG, "OpenCyc", CLOSE_WORD_FORM_SOURCE_TAG, WORD_GRANDCHILD_INDENT);
    }
    writeXML(CLOSE_WORD_FORM_TAG, WORD_CHILD_INDENT);
  }

  /**
   * Writes the word sense element for the given word sense: its number, speech part, optional
   * mapped term, glosses, sample phrases, categories and sources.
   *
   * @param texaiEnglishWordSense the word sense to write
   */
  private void writeWordSense(final TexaiEnglishWordSense texaiEnglishWordSense) {
    writeXML(OPEN_WORD_SENSE_TAG, WORD_CHILD_INDENT);
    writeElement(
        OPEN_WORD_SENSE_NUMBER_TAG,
        texaiEnglishWordSense.getWordSenseNbr(),
        CLOSE_WORD_SENSE_NUMBER_TAG,
        WORD_GRANDCHILD_INDENT);
    writeElement(
        OPEN_WORD_SENSE_SPEECH_PART_TAG,
        texaiEnglishWordSense.getSpeechPart(),
        CLOSE_WORD_SENSE_SPEECH_PART_TAG,
        WORD_GRANDCHILD_INDENT);
    if (texaiEnglishWordSense.getTexaiMappedTerm() == null) {
      LOGGER.info(" " + texaiEnglishWordSense.getWordSenseNbr() + ". " + texaiEnglishWordSense.getSpeechPart());
    } else {
      LOGGER.info(" " + texaiEnglishWordSense.getWordSenseNbr() + ". " + texaiEnglishWordSense.getSpeechPart()
          + " --> " + texaiEnglishWordSense.getTexaiMappedTerm());
      writeElement(
          OPEN_WORD_SENSE_MAPPED_TERM_TAG,
          texaiEnglishWordSense.getTexaiMappedTerm(),
          CLOSE_WORD_SENSE_MAPPED_TERM_TAG,
          WORD_GRANDCHILD_INDENT);
    }
    for (final String gloss : texaiEnglishWordSense.getGlosses()) {
      LOGGER.info(" " + gloss);
      writeElement(OPEN_WORD_SENSE_GLOSS_TAG, gloss, CLOSE_WORD_SENSE_GLOSS_TAG, WORD_GRANDCHILD_INDENT);
    }
    for (final TexaiSamplePhrase texaiSamplePhrase : texaiEnglishWordSense.getTexaiSamplePhrases()) {
      LOGGER.info(" \"" + texaiSamplePhrase.getSamplePhrase() + "\"");
      writeElement(
          OPEN_WORD_SENSE_SAMPLE_PHRASE_TAG,
          texaiSamplePhrase.getSamplePhrase(),
          CLOSE_WORD_SENSE_SAMPLE_PHRASE_TAG,
          WORD_GRANDCHILD_INDENT);
    }

    // The categories are written as individual elements and also collected into one bracketed,
    // comma-separated list that is used only for the log line.
    final StringBuilder categoryLog = new StringBuilder(Constants.STRING_BUILDER_SIZE_SMALL);
    categoryLog.append('[');
    if (texaiEnglishWordSense.getWordNetCategory() != null) {
      final String wordNetCategoryName = String.valueOf(texaiEnglishWordSense.getWordNetCategory().getName());
      categoryLog.append(wordNetCategoryName);
      writeElement(
          OPEN_WORD_SENSE_CATEGORY_TAG,
          wordNetCategoryName,
          CLOSE_WORD_SENSE_CATEGORY_TAG,
          WORD_GRANDCHILD_INDENT);
    }
    for (final String categoryName : texaiEnglishWordSense.getCategoryNames()) {
      // A length above one means something already follows the opening bracket.
      if (categoryLog.length() > 1) {
        categoryLog.append(", ");
      }
      categoryLog.append(categoryName);
      writeElement(
          OPEN_WORD_SENSE_CATEGORY_TAG,
          categoryName,
          CLOSE_WORD_SENSE_CATEGORY_TAG,
          WORD_GRANDCHILD_INDENT);
    }
    if (categoryLog.length() > 1) {
      categoryLog.append(']');
      LOGGER.info(" " + categoryLog.toString());
    }

    // The source names are spelled the same way as at the word and word form levels.
    if (texaiEnglishWordSense.getWordNetSynset() != null) {
      writeSource(OPEN_WORD_SENSE_SOURCE_TAG, "WordNet", CLOSE_WORD_SENSE_SOURCE_TAG, WORD_GRANDCHILD_INDENT);
    }
    if (texaiEnglishWordSense.getWiktionaryEnglishWordSense() != null) {
      writeSource(OPEN_WORD_SENSE_SOURCE_TAG, "Wiktionary", CLOSE_WORD_SENSE_SOURCE_TAG, WORD_GRANDCHILD_INDENT);
    }
    if (texaiEnglishWordSense.getIsOpenCycWordSense()) {
      writeSource(OPEN_WORD_SENSE_SOURCE_TAG, "OpenCyc", CLOSE_WORD_SENSE_SOURCE_TAG, WORD_GRANDCHILD_INDENT);
    }
    writeXML(CLOSE_WORD_SENSE_TAG, WORD_CHILD_INDENT);
  }

  /**
   * Logs the given source name and writes it as a source element.
   *
   * @param openTag the open tag of the source element
   * @param sourceName the name of the source
   * @param closeTag the close tag of the source element
   * @param localIndent the local indentation
   */
  private void writeSource(
      final String openTag,
      final String sourceName,
      final String closeTag,
      final int localIndent) {
    LOGGER.info(" source " + sourceName);
    writeElement(openTag, sourceName, closeTag, localIndent);
  }

  /**
   * Writes a one-line element whose text content is the XML-escaped string form of the given value.
   *
   * @param openTag the open tag
   * @param value the element content, a null value is written as the text "null"
   * @param closeTag the close tag
   * @param localIndent the local indentation
   */
  private void writeElement(
      final String openTag,
      final Object value,
      final String closeTag,
      final int localIndent) {
    writeXML(openTag + escapeXML(value) + closeTag, localIndent);
  }

  /**
   * Returns the string form of the given value with the characters that are markup in XML element
   * content replaced by their predefined entity references.
   *
   * @param value the value to escape, a null value yields the text "null"
   * @return the escaped text
   */
  private static String escapeXML(final Object value) {
    final String text = String.valueOf(value);
    final StringBuilder escaped = new StringBuilder(text.length() + 16);
    for (int i = 0; i < text.length(); i++) {
      final char ch = text.charAt(i);
      switch (ch) {
        case '&':
          escaped.append("&amp;");
          break;
        case '<':
          escaped.append("&lt;");
          break;
        case '>':
          escaped.append("&gt;");
          break;
        default:
          escaped.append(ch);
          break;
      }
    }
    return escaped.toString();
  }

  /**
   * Writes the given text line using the given indentation.
   *
   * @param text the given text, which is written as is and must already be well-formed XML
   * @param localIndent the local indentation
   * @throws TexaiException if the line cannot be written
   */
  private void writeXML(final String text, final int localIndent) {
    //Preconditions
    assert localIndent >= 0 : "localIndent must not be negative";
    assert text != null : "text must not be null";
    assert !text.isEmpty() : "text must not be an empty string";

    try {
      for (int i = 0; i < localIndent; i++) {
        bufferedWriter.write(' ');
      }
      bufferedWriter.write(text);
      bufferedWriter.newLine();
    } catch (final IOException ex) {
      throw new TexaiException(ex);
    }
  }

  /**
   * Sorts the lexicon words by gathering at most {@link #TEST_LIMIT} of them into the word
   * dictionary, which orders its entries by lemma.
   */
  private void sortWords() {
    LOGGER.info("gathering lexicon words");
    final Iterator<Object> texaiEnglishWord_iter = domainEntityManager.domainEntityIterator(TexaiEnglishWord.class);
    while (texaiEnglishWord_iter.hasNext()) {
      final TexaiEnglishWord texaiEnglishWord = (TexaiEnglishWord) texaiEnglishWord_iter.next();
      nbrEnglishWordsProcessed++;
      LOGGER.info("sorting " + texaiEnglishWord + " " + nbrEnglishWordsProcessed);
      wordDictionary.put(texaiEnglishWord.getLemma(), texaiEnglishWord.getTermId());
      if (nbrEnglishWordsProcessed % ENTITY_MANAGER_CLEAR_INTERVAL == 0) {
        // presumably keeps the persistence context from growing without bound -- TODO confirm
        entityManager.clear();
      }
      // Stop once exactly TEST_LIMIT words have been gathered.
      if (nbrEnglishWordsProcessed >= TEST_LIMIT) {
        break;
      }
    }
  }

  /**
   * Finalizes this application by closing the XML output, shutting down the caches and closing the
   * persistence resources. The output is closed first so that buffered XML is flushed even when a
   * later shutdown step fails, and every step is attempted even when an earlier one throws.
   *
   * @throws TexaiException if the XML output cannot be closed
   */
  private void finalization() {
    try {
      // The resources are null when initialization failed before creating them.
      if (bufferedWriter != null) {
        try {
          bufferedWriter.close();
        } catch (final IOException ex) {
          throw new TexaiException(ex);
        }
      }
    } finally {
      try {
        CacheManager.getInstance().shutdown();
      } finally {
        try {
          if (entityManager != null) {
            entityManager.close();
          }
        } finally {
          if (entityManagerFactory != null) {
            entityManagerFactory.close();
          }
        }
      }
    }
    LOGGER.info("Number of English words processed " + nbrEnglishWordsProcessed);
  }

  /**
   * Executes this application. Serialization failures of the handled kinds are logged, any
   * initialization failure propagates to the caller, and finalization runs in every case.
   *
   * @param args the command line arguments (unused)
   */
  public static void main(final String[] args) {
    final LexiconToXML lexiconToXML = new LexiconToXML();
    try {
      lexiconToXML.initialize();
      try {
        lexiconToXML.serializeLexiconToXML();
      } catch (final TexaiException ex) {
        LOGGER.error("lexicon serialization failed", ex);
      } catch (final NullPointerException ex) {
        LOGGER.error("lexicon serialization failed", ex);
      } catch (final AssertionError ex) {
        LOGGER.error("lexicon serialization failed", ex);
      }
    } finally {
      lexiconToXML.finalization();
    }
  }
}
// See more files for this project here