Code Search for Developers
 
 
  

LexiconToXML.java from Texai at Krugle


Show LexiconToXML.java syntax highlighted

/*
 * LexiconToXML.java
 *
 * Created on March 14, 2007, 11:30 AM
 *
 * Description: Serializes the lexicon to XML.
 *
 * Copyright (C) 2007 Stephen L. Reed.
 *
 * This program is free software; you can redistribute it and/or modify it under the terms
 * of the GNU General Public License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
 * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 * See the GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along with this program;
 * if not, write to the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

package org.texai.lexicon;

import java.io.BufferedWriter;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.util.Iterator;
import java.util.Map;
import java.util.TreeMap;
import javax.persistence.EntityManager;
import javax.persistence.EntityManagerFactory;
import javax.persistence.Persistence;
import net.sf.ehcache.CacheManager;
import org.apache.log4j.Logger;
import org.texai.cmudict.domainEntity.ARPABETPronunciation;
import org.texai.kb.CacheInitializer;
import org.texai.kb.Constants;
import org.texai.kb.ejb.session.DomainEntityManagerBean;
import org.texai.lexicon.domainEntity.TexaiEnglishWord;
import org.texai.lexicon.domainEntity.TexaiEnglishWordForm;
import org.texai.lexicon.domainEntity.TexaiEnglishWordSense;
import org.texai.lexicon.domainEntity.TexaiSamplePhrase;
import org.texai.util.TexaiException;

/**
 * Serializes the lexicon to XML.
 *
 * <p>The application gathers every {@link TexaiEnglishWord} into a dictionary sorted by lemma, then
 * reloads each word in that order and writes it, together with its word forms and word senses, as a
 * {@code <word>} element nested in a single {@code <lexicon>} root element of the output file.
 * All element text is XML-escaped and the file is encoded as UTF-8.
 *
 * @author reed
 */
public final class LexiconToXML {
  
  /** the log4j logger */
  private static final Logger LOGGER = Logger.getLogger(LexiconToXML.class.getName());
  
  /** the word processing limit, which can be lowered (e.g. to 200) for testing */
  private static final int TEST_LIMIT = Integer.MAX_VALUE;
  
  /** the XML open lemma tag */
  private static final String OPEN_LEMMA_TAG = "<lemma>";
  
  /** the XML close lemma tag */
  private static final String CLOSE_LEMMA_TAG = "</lemma>";
  
  /** the XML open lexicon tag */
  private static final String OPEN_LEXICON_TAG = "<lexicon>";
  
  /** the XML close lexicon tag */
  private static final String CLOSE_LEXICON_TAG = "</lexicon>";
  
  /** the XML open word tag */
  private static final String OPEN_WORD_TAG = "<word>";
  
  /** the XML close word tag */
  private static final String CLOSE_WORD_TAG = "</word>";
  
  /** the XML open word source tag */
  private static final String OPEN_WORD_SOURCE_TAG = "<word-source>";
  
  /** the XML close word source tag */
  private static final String CLOSE_WORD_SOURCE_TAG = "</word-source>";
  
  /** the XML open word form tag */
  private static final String OPEN_WORD_FORM_TAG = "<word-form>";
  
  /** the XML close word form tag */
  private static final String CLOSE_WORD_FORM_TAG = "</word-form>";
  
  /** the XML open word form pronunciation tag */
  private static final String OPEN_WORD_FORM_PRONUNCIATION_TAG = "<word-form-pronunciation>";
  
  /** the XML close word form pronunciation tag */
  private static final String CLOSE_WORD_FORM_PRONUNCIATION_TAG = "</word-form-pronunciation>";
  
  /** the XML open word form source tag */
  private static final String OPEN_WORD_FORM_SOURCE_TAG = "<word-form-source>";
  
  /** the XML close word form source tag */
  private static final String CLOSE_WORD_FORM_SOURCE_TAG = "</word-form-source>";
  
  /** the XML open word form speech part tag */
  private static final String OPEN_WORD_FORM_SPEECH_PART_TAG = "<word-form-speech-part>";
  
  /** the XML close word form speech part tag */
  private static final String CLOSE_WORD_FORM_SPEECH_PART_TAG = "</word-form-speech-part>";
  
  /** the XML open word form word tag */
  private static final String OPEN_WORD_FORM_WORD_TAG = "<word-form-word>";
  
  /** the XML close word form word tag */
  private static final String CLOSE_WORD_FORM_WORD_TAG = "</word-form-word>";
  
  /** the XML open word sense tag */
  private static final String OPEN_WORD_SENSE_TAG = "<word-sense>";
  
  /** the XML close word sense tag */
  private static final String CLOSE_WORD_SENSE_TAG = "</word-sense>";
  
  /** the XML open word sense category tag */
  private static final String OPEN_WORD_SENSE_CATEGORY_TAG = "<word-sense-category>";
  
  /** the XML close word sense category tag */
  private static final String CLOSE_WORD_SENSE_CATEGORY_TAG = "</word-sense-category>";
  
  /** the XML open word sense gloss tag */
  private static final String OPEN_WORD_SENSE_GLOSS_TAG = "<word-sense-gloss>";
  
  /** the XML close word sense gloss tag */
  private static final String CLOSE_WORD_SENSE_GLOSS_TAG = "</word-sense-gloss>";
  
  /** the XML open word sense mapped term tag */
  private static final String OPEN_WORD_SENSE_MAPPED_TERM_TAG = "<word-sense-mapped-term>";
  
  /** the XML close word sense mapped term tag */
  private static final String CLOSE_WORD_SENSE_MAPPED_TERM_TAG = "</word-sense-mapped-term>";
  
  /** the XML open word sense number tag */
  private static final String OPEN_WORD_SENSE_NUMBER_TAG = "<word-sense-number>";
  
  /** the XML close word sense number tag */
  private static final String CLOSE_WORD_SENSE_NUMBER_TAG = "</word-sense-number>";
  
  /** the XML open word sense sample phrase tag */
  private static final String OPEN_WORD_SENSE_SAMPLE_PHRASE_TAG = "<word-sense-sample-phrase>";
  
  /** the XML close word sense sample phrase tag */
  private static final String CLOSE_WORD_SENSE_SAMPLE_PHRASE_TAG = "</word-sense-sample-phrase>";
  
  /** the XML open word sense source tag */
  private static final String OPEN_WORD_SENSE_SOURCE_TAG = "<word-sense-source>";
  
  /** the XML close word sense source tag */
  private static final String CLOSE_WORD_SENSE_SOURCE_TAG = "</word-sense-source>";
  
  /** the XML open word sense speech part tag */
  private static final String OPEN_WORD_SENSE_SPEECH_PART_TAG = "<word-sense-speech-part>";
  
  /** the XML close word sense speech part tag */
  private static final String CLOSE_WORD_SENSE_SPEECH_PART_TAG = "</word-sense-speech-part>";
  
  /** the source name emitted when an entry has an associated Wiktionary entity */
  private static final String SOURCE_WIKTIONARY = "Wiktionary";
  
  /** the source name emitted when an entry has an associated WordNet entity */
  private static final String SOURCE_WORDNET = "WordNet";
  
  /** the source name emitted when an entry is flagged as an OpenCyc entry */
  private static final String SOURCE_OPENCYC = "OpenCyc";
  
  /** the source name emitted when a word form has an associated CMU dictionary word form */
  private static final String SOURCE_CMU_DICTIONARY = "CMU Pronouncing Dictionary";
  
  /** the output XML path */
  private static final String OUTPUT_FILE_PATH = "/home/reed/svn/Lexicon/data/lexicon.xml";
  
  /** the output character encoding, which is what an XML parser assumes when there is no XML declaration */
  private static final String OUTPUT_ENCODING = "UTF-8";
  
  /** the indentation of a word element */
  private static final int WORD_INDENT = 2;
  
  /** the indentation of the direct children of a word element */
  private static final int WORD_CHILD_INDENT = WORD_INDENT + 2;
  
  /** the indentation of the children of a word form or word sense element */
  private static final int NESTED_CHILD_INDENT = WORD_INDENT + 4;
  
  /** the XML output stream */
  private BufferedWriter bufferedWriter;
  
  /** the entity manager factory */
  private EntityManagerFactory entityManagerFactory;
  
  /** the entity manager */
  private EntityManager entityManager;
  
  /** the domain entity manager */
  private DomainEntityManagerBean domainEntityManager;
  
  /** the number of English words processed so far by the current pass (sorting or serializing) */
  private int nbrEnglishWordsProcessed = 0;
  
  /** the sorted word dictionary, lemma --> word term id */
  private Map<String, Long> wordDictionary = new TreeMap<String, Long>();
  
  /** Creates a new instance of LexiconToXML. */
  public LexiconToXML() {
    super();
  }
  
  /** Initializes the application and injects the dependencies for out-of-the-container execution of J2EE session beans.
   *
   * @throws TexaiException when the output file cannot be opened for writing
   */
  private void initialize() {
    entityManagerFactory = Persistence.createEntityManagerFactory(Constants.TEST_PERSISTENCE_UNIT_NAME);
    entityManager = entityManagerFactory.createEntityManager();
    CacheInitializer.initializeCaches();
    domainEntityManager = new DomainEntityManagerBean();
    domainEntityManager.setEntityManager(entityManager);
    domainEntityManager.injectSharedBeanDependencies();
    
    //TODO create transaction and perform periodic commits to clear the entity manager
    
    try {
      // encode explicitly rather than relying on the platform default charset, so that non-ASCII
      // lexicon text is readable by any XML parser
      bufferedWriter = new BufferedWriter(
              new OutputStreamWriter(new FileOutputStream(OUTPUT_FILE_PATH), OUTPUT_ENCODING));
    } catch (final IOException ex) {
      throw new TexaiException(ex);
    }
  }
  
  /** Serializes the lexicon to XML, writing the words in sorted lemma order. */
  private void serializeLexiconToXML() {
    sortWords();
    // the counter was used by the sorting pass, so restart it for the serializing pass
    nbrEnglishWordsProcessed = 0;
    writeXML(OPEN_LEXICON_TAG, 0);
    for (final Long termId : wordDictionary.values()) {
      final TexaiEnglishWord texaiEnglishWord = (TexaiEnglishWord) domainEntityManager.loadDomainEntity(termId);
      nbrEnglishWordsProcessed++;
      writeWord(texaiEnglishWord);
    }
    writeXML(CLOSE_LEXICON_TAG, 0);
  }
  
  /** Logs and writes the word element for the given word, including its sources, word forms and word senses.
   *
   * @param texaiEnglishWord the given word
   */
  private void writeWord(final TexaiEnglishWord texaiEnglishWord) {
    LOGGER.info("");
    LOGGER.info("-----------------------------------------  " + nbrEnglishWordsProcessed);
    LOGGER.info("");
    LOGGER.info(texaiEnglishWord.getLemma());
    writeXML(OPEN_WORD_TAG, WORD_INDENT);
    writeElement(OPEN_LEMMA_TAG, texaiEnglishWord.getLemma(), CLOSE_LEMMA_TAG, WORD_CHILD_INDENT);
    if (texaiEnglishWord.getWiktionaryEnglishWord() != null) {
      LOGGER.info("    source " + SOURCE_WIKTIONARY);
      writeElement(OPEN_WORD_SOURCE_TAG, SOURCE_WIKTIONARY, CLOSE_WORD_SOURCE_TAG, WORD_CHILD_INDENT);
    }
    if (texaiEnglishWord.getWordNetEnglishWord() != null) {
      LOGGER.info("    source " + SOURCE_WORDNET);
      writeElement(OPEN_WORD_SOURCE_TAG, SOURCE_WORDNET, CLOSE_WORD_SOURCE_TAG, WORD_CHILD_INDENT);
    }
    if (texaiEnglishWord.getIsOpenCycWord()) {
      LOGGER.info("    source " + SOURCE_OPENCYC);
      writeElement(OPEN_WORD_SOURCE_TAG, SOURCE_OPENCYC, CLOSE_WORD_SOURCE_TAG, WORD_CHILD_INDENT);
    }
    for (final TexaiEnglishWordForm texaiEnglishWordForm : texaiEnglishWord.getTexaiEnglishWordForms()) {
      writeWordForm(texaiEnglishWordForm);
    }
    for (final TexaiEnglishWordSense texaiEnglishWordSense : texaiEnglishWord.getTexaiEnglishWordSenses()) {
      writeWordSense(texaiEnglishWordSense);
    }
    writeXML(CLOSE_WORD_TAG, WORD_INDENT);
  }
  
  /** Logs and writes the word form element for the given word form, including its pronunciations and sources.
   *
   * @param texaiEnglishWordForm the given word form
   */
  private void writeWordForm(final TexaiEnglishWordForm texaiEnglishWordForm) {
    LOGGER.info("  " + texaiEnglishWordForm.getWordFormInflection() + " " + texaiEnglishWordForm.getWordForm());
    writeXML(OPEN_WORD_FORM_TAG, WORD_CHILD_INDENT);
    writeElement(OPEN_WORD_FORM_WORD_TAG, texaiEnglishWordForm.getWordForm(), CLOSE_WORD_FORM_WORD_TAG, NESTED_CHILD_INDENT);
    writeElement(
            OPEN_WORD_FORM_SPEECH_PART_TAG,
            texaiEnglishWordForm.getWordFormInflection(),
            CLOSE_WORD_FORM_SPEECH_PART_TAG,
            NESTED_CHILD_INDENT);
    for (final ARPABETPronunciation arpabetPronunciation : texaiEnglishWordForm.getARPABETPronunciations()) {
      LOGGER.info("    [" + arpabetPronunciation.getPhonemeString() + "]");
      writeElement(
              OPEN_WORD_FORM_PRONUNCIATION_TAG,
              arpabetPronunciation.getPhonemeString(),
              CLOSE_WORD_FORM_PRONUNCIATION_TAG,
              NESTED_CHILD_INDENT);
    }
    if (texaiEnglishWordForm.getWiktionaryEnglishWordForm() != null) {
      LOGGER.info("    source " + SOURCE_WIKTIONARY);
      writeElement(OPEN_WORD_FORM_SOURCE_TAG, SOURCE_WIKTIONARY, CLOSE_WORD_FORM_SOURCE_TAG, NESTED_CHILD_INDENT);
    }
    if (texaiEnglishWordForm.getWordNetCasedEnglishWord() != null) {
      LOGGER.info("    source " + SOURCE_WORDNET);
      writeElement(OPEN_WORD_FORM_SOURCE_TAG, SOURCE_WORDNET, CLOSE_WORD_FORM_SOURCE_TAG, NESTED_CHILD_INDENT);
    }
    if (texaiEnglishWordForm.getCMUDictionaryEnglishWordForm() != null) {
      LOGGER.info("    source " + SOURCE_CMU_DICTIONARY);
      writeElement(OPEN_WORD_FORM_SOURCE_TAG, SOURCE_CMU_DICTIONARY, CLOSE_WORD_FORM_SOURCE_TAG, NESTED_CHILD_INDENT);
    }
    if (texaiEnglishWordForm.getIsOpenCycWordForm()) {
      LOGGER.info("    source " + SOURCE_OPENCYC);
      writeElement(OPEN_WORD_FORM_SOURCE_TAG, SOURCE_OPENCYC, CLOSE_WORD_FORM_SOURCE_TAG, NESTED_CHILD_INDENT);
    }
    writeXML(CLOSE_WORD_FORM_TAG, WORD_CHILD_INDENT);
  }
  
  /** Logs and writes the word sense element for the given word sense, including its mapped term, glosses,
   * sample phrases, categories and sources.
   *
   * @param texaiEnglishWordSense the given word sense
   */
  private void writeWordSense(final TexaiEnglishWordSense texaiEnglishWordSense) {
    writeXML(OPEN_WORD_SENSE_TAG, WORD_CHILD_INDENT);
    writeElement(
            OPEN_WORD_SENSE_NUMBER_TAG,
            texaiEnglishWordSense.getWordSenseNbr(),
            CLOSE_WORD_SENSE_NUMBER_TAG,
            NESTED_CHILD_INDENT);
    writeElement(
            OPEN_WORD_SENSE_SPEECH_PART_TAG,
            texaiEnglishWordSense.getSpeechPart(),
            CLOSE_WORD_SENSE_SPEECH_PART_TAG,
            NESTED_CHILD_INDENT);
    final Object mappedTerm = texaiEnglishWordSense.getTexaiMappedTerm();
    if (mappedTerm == null) {
      LOGGER.info("    " + texaiEnglishWordSense.getWordSenseNbr() + ". " + texaiEnglishWordSense.getSpeechPart());
    } else {
      LOGGER.info("    " + texaiEnglishWordSense.getWordSenseNbr() + ". " + texaiEnglishWordSense.getSpeechPart()
              + " --> " + mappedTerm);
      writeElement(OPEN_WORD_SENSE_MAPPED_TERM_TAG, mappedTerm, CLOSE_WORD_SENSE_MAPPED_TERM_TAG, NESTED_CHILD_INDENT);
    }
    for (final String gloss : texaiEnglishWordSense.getGlosses()) {
      LOGGER.info("       " + gloss);
      writeElement(OPEN_WORD_SENSE_GLOSS_TAG, gloss, CLOSE_WORD_SENSE_GLOSS_TAG, NESTED_CHILD_INDENT);
    }
    for (final TexaiSamplePhrase texaiSamplePhrase : texaiEnglishWordSense.getTexaiSamplePhrases()) {
      LOGGER.info("         \"" + texaiSamplePhrase.getSamplePhrase() + "\"");
      writeElement(
              OPEN_WORD_SENSE_SAMPLE_PHRASE_TAG,
              texaiSamplePhrase.getSamplePhrase(),
              CLOSE_WORD_SENSE_SAMPLE_PHRASE_TAG,
              NESTED_CHILD_INDENT);
    }
    writeWordSenseCategories(texaiEnglishWordSense);
    if (texaiEnglishWordSense.getWordNetSynset() != null) {
      LOGGER.info("       source " + SOURCE_WORDNET);
      writeElement(OPEN_WORD_SENSE_SOURCE_TAG, SOURCE_WORDNET, CLOSE_WORD_SENSE_SOURCE_TAG, NESTED_CHILD_INDENT);
    }
    if (texaiEnglishWordSense.getWiktionaryEnglishWordSense() != null) {
      LOGGER.info("       source " + SOURCE_WIKTIONARY);
      writeElement(OPEN_WORD_SENSE_SOURCE_TAG, SOURCE_WIKTIONARY, CLOSE_WORD_SENSE_SOURCE_TAG, NESTED_CHILD_INDENT);
    }
    if (texaiEnglishWordSense.getIsOpenCycWordSense()) {
      LOGGER.info("       source " + SOURCE_OPENCYC);
      writeElement(OPEN_WORD_SENSE_SOURCE_TAG, SOURCE_OPENCYC, CLOSE_WORD_SENSE_SOURCE_TAG, NESTED_CHILD_INDENT);
    }
    writeXML(CLOSE_WORD_SENSE_TAG, WORD_CHILD_INDENT);
  }
  
  /** Writes one category element for the WordNet category (when present) and for each category name of the
   * given word sense, and logs them together as a single bracketed, comma-separated list.
   *
   * @param texaiEnglishWordSense the given word sense
   */
  private void writeWordSenseCategories(final TexaiEnglishWordSense texaiEnglishWordSense) {
    final StringBuilder stringBuilder = new StringBuilder(Constants.STRING_BUILDER_SIZE_SMALL);
    stringBuilder.append('[');
    if (texaiEnglishWordSense.getWordNetCategory() != null) {
      stringBuilder.append(texaiEnglishWordSense.getWordNetCategory().getName());
      writeElement(
              OPEN_WORD_SENSE_CATEGORY_TAG,
              texaiEnglishWordSense.getWordNetCategory().getName(),
              CLOSE_WORD_SENSE_CATEGORY_TAG,
              NESTED_CHILD_INDENT);
    }
    for (final String categoryName : texaiEnglishWordSense.getCategoryNames()) {
      // a length greater than one means that a category already follows the opening bracket
      if (stringBuilder.length() > 1) {
        stringBuilder.append(", ");
      }
      stringBuilder.append(categoryName);
      writeElement(OPEN_WORD_SENSE_CATEGORY_TAG, categoryName, CLOSE_WORD_SENSE_CATEGORY_TAG, NESTED_CHILD_INDENT);
    }
    if (stringBuilder.length() > 1) {
      stringBuilder.append(']');
      LOGGER.info("      " + stringBuilder.toString());
    }
  }
  
  /** Writes a single-line element whose text content is the XML-escaped string form of the given value.
   *
   * @param openTag the element's open tag
   * @param value the element's value, for which a null is written as the text "null"
   * @param closeTag the element's close tag
   * @param localIndent the local indentation
   */
  private void writeElement(final String openTag, final Object value, final String closeTag, final int localIndent) {
    writeXML(openTag + escapeXML(String.valueOf(value)) + closeTag, localIndent);
  }
  
  /** Escapes the characters that are not permitted to appear literally in XML element text.
   *
   * @param text the given text
   * @return the text in which each ampersand, less-than and greater-than character is replaced by its
   * predefined XML entity reference
   */
  private static String escapeXML(final String text) {
    //Preconditions
    assert text != null : "text must not be null";
    
    final StringBuilder escaped = new StringBuilder(text.length() + 16);
    for (int i = 0; i < text.length(); i++) {
      final char ch = text.charAt(i);
      switch (ch) {
        case '&':
          escaped.append("&amp;");
          break;
        case '<':
          escaped.append("&lt;");
          break;
        case '>':
          escaped.append("&gt;");
          break;
        default:
          escaped.append(ch);
          break;
      }
    }
    return escaped.toString();
  }
  
  /** Writes the given text line using the given indentation.  The text is written verbatim, so callers
   * must already have escaped any element content that it contains.
   *
   * @param text the given text
   * @param localIndent the local indentation
   * @throws TexaiException when the line cannot be written
   */
  private void writeXML(final String text, final int localIndent) {
    //Preconditions
    assert localIndent >= 0 : "localIndent must not be negative";
    assert text != null : "text must not be null";
    assert !text.isEmpty() : "text must not be an empty string";
    
    try {
      for (int i = 0; i < localIndent; i++) {
        bufferedWriter.write(' ');
      }
      bufferedWriter.write(text);
      bufferedWriter.newLine();
    } catch (final IOException ex) {
      throw new TexaiException(ex);
    }
  }
  
  
  /** Sorts the lexicon words by gathering each word's lemma and term id into the sorted word dictionary. */
  private void sortWords() {
    LOGGER.info("gathering lexicon words");
    final Iterator<Object> texaiEnglishWord_iter = domainEntityManager.domainEntityIterator(TexaiEnglishWord.class);
    while (texaiEnglishWord_iter.hasNext()) {
      final TexaiEnglishWord texaiEnglishWord = (TexaiEnglishWord) texaiEnglishWord_iter.next();
      nbrEnglishWordsProcessed++;
      LOGGER.info("sorting " + texaiEnglishWord + "  " + nbrEnglishWordsProcessed);
      wordDictionary.put(texaiEnglishWord.getLemma(), texaiEnglishWord.getTermId());
      if (nbrEnglishWordsProcessed % 20 == 0) {
        // periodically clear the entity manager while iterating over the words
        entityManager.clear();
      }
      // stop once exactly TEST_LIMIT words have been gathered
      if (nbrEnglishWordsProcessed >= TEST_LIMIT) {
        break;
      }
    }
  }
  
  /** Finalizes this application by closing the output file and then releasing the cache and persistence
   * resources.  The output file is closed first so that buffered XML is flushed even when releasing
   * another resource fails, and resources that were never initialized are skipped.
   *
   * @throws TexaiException when the output file cannot be closed
   */
  private void finalization() {
    try {
      if (bufferedWriter != null) {
        bufferedWriter.close();
      }
    } catch (final IOException ex) {
      throw new TexaiException(ex);
    } finally {
      releasePersistenceResources();
    }
    LOGGER.info("Number of English words processed " + nbrEnglishWordsProcessed);
  }
  
  /** Shuts down the cache manager and closes the entity manager and its factory, attempting each step even
   * when a previous one fails.
   */
  private void releasePersistenceResources() {
    try {
      CacheManager.getInstance().shutdown();
    } finally {
      try {
        if (entityManager != null) {
          entityManager.close();
        }
      } finally {
        if (entityManagerFactory != null) {
          entityManagerFactory.close();
        }
      }
    }
  }
  
  /** Logs the given failure together with its stack trace.
   *
   * @param throwable the failure that ended the serialization
   */
  private static void logFailure(final Throwable throwable) {
    LOGGER.error("lexicon serialization failed: " + throwable, throwable);
  }
  
  /** Executes this application.
   *
   * @param args the command line arguments (unused)
   */
  public static void main(final String[] args) {
    final LexiconToXML lexiconToXML = new LexiconToXML();
    try {
      lexiconToXML.initialize();
      lexiconToXML.serializeLexiconToXML();
    } catch (final TexaiException ex) {
      logFailure(ex);
    } catch (final NullPointerException ex) {
      logFailure(ex);
    } catch (final AssertionError ex) {
      logFailure(ex);
    } finally {
      // always release the resources, whatever the outcome, so that the output written so far is flushed
      lexiconToXML.finalization();
    }
  }
}




See more files for this project here

Texai

Texai is a chatbot that intelligently seeks to acquire knowledge and friendly behaviors.

Project homepage: http://sourceforge.net/projects/texai
Programming language(s): Java,Shell Script,XML
License: other

  domainEntity/
    TexaiEnglishWord.java
    TexaiEnglishWordForm.java
    TexaiEnglishWordSense.java
    TexaiSamplePhrase.java
    package-info.java
  LexiconInitializer.java
  LexiconToXML.java