Code Search for Developers
 
 
  

MyBinaryLoader.java from Texai at Krugle


Show MyBinaryLoader.java syntax highlighted

package edu.cmu.sphinx.linguist.language.ngram.large;

import edu.cmu.sphinx.linguist.dictionary.Dictionary;
import edu.cmu.sphinx.util.LogMath;
import edu.cmu.sphinx.util.Utilities;
import java.io.BufferedInputStream;
import java.io.DataInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.RandomAccessFile;

/*
 * MyBinaryLoader.java
 *
 * Created on February 14, 2007, 11:21 AM
 *
 * Description: Loads the NGram binary file.
 *
/*
 * Copyright 1999-2002 Carnegie Mellon University.
 * Portions Copyright 2002 Sun Microsystems, Inc.
 * Portions Copyright 2002 Mitsubishi Electric Research Laboratories.
 * All Rights Reserved.  Use is subject to license terms.
 *
 * Modified by Stephen Reed to simply read the NGram file.
 *
 * This program is free software; you can redistribute it and/or modify it under the terms
 * of the GNU General Public License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
 * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 * See the GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along with this program;
 * if not, write to the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

/**
 * Reads a binary language model file generated by the CMU-Cambridge
 * Statistical Language Modelling Toolkit.
 *
 * Note that all probabilites in the grammar are stored in LogMath log base
 * format. Language Probabilties in the language model file are stored in log
 * 10 base. They are converted to the LogMath logbase.
 */
public final class MyBinaryLoader {
  
  private static final String DARPA_LM_HEADER = "Darpa Trigram LM";
  private static final int LOG2_BIGRAM_SEGMENT_SIZE_DEFAULT = 9;
  private static final float MIN_PROBABILITY = -99.0f;
  private static final int MAX_PROB_TABLE_SIZE = 65536;
  
  private LogMath logMath;
  private int maxNGram;
  private boolean bigEndian = true;
  private boolean applyLanguageWeightAndWip;
  
  private int bytesRead = 0;
  
  private UnigramProbability[] unigrams;
  private String[] words;
  private int bigramOffset;
  private int trigramOffset;
  private int numberUnigrams;
  private int numberBigrams;
  private int numberTrigrams;
  private int logBigramSegmentSize;
  private int startWordID;
  private int endWordID;
  private int[] trigramSegmentTable;
  
  private float[] bigramProbTable;
  private float[] trigramBackoffTable;
  private float[] trigramProbTable;
  
  private RandomAccessFile file;
  
  /**
   * Initializes the binary loader
   *
   * @param location the location of the model
   * @param applyLanguageWeightAndWip if true apply lw and wip
   * @param logMath the logmath to sue
   * @throws IOException if an I/O error occurs
   */
  public MyBinaryLoader(final File location, final LogMath logMath)
          throws IOException {
    startWordID = -1;
    endWordID = -1;
    applyLanguageWeightAndWip = false;
    this.logMath = logMath;
    loadBinary(location);
  }
  
  /**
   * Returns the number of unigrams
   *
   * @return the nubmer of unigrams
   */
  public int getNumberUnigrams() {
    return numberUnigrams;
  }
  
  /**
   * Returns the number of bigrams
   *
   * @return the nubmer of bigrams
   */
  public int getNumberBigrams() {
    return numberBigrams;
  }
  
  /**
   * Returns the number of trigrams
   *
   * @return the nubmer of trigrams
   */
  public int getNumberTrigrams() {
    return numberTrigrams;
  }
  
  /**
   * Returns all the unigrams
   *
   * @return all the unigrams
   */
  public UnigramProbability[] getUnigrams() {
    return unigrams;
  }
  
  /**
   * Returns all the bigram probabilities.
   *
   * @return all the bigram probabilities
   */
  public float[] getBigramProbabilities() {
    return bigramProbTable;
  }
  
  /**
   * Returns all the trigram probabilities.
   *
   * @return all the trigram probabilities
   */
  public float[] getTrigramProbabilities() {
    return trigramProbTable;
  }
  
  /**
   * Returns all the trigram backoff weights
   *
   * @return all the trigram backoff weights
   */
  public float[] getTrigramBackoffWeights() {
    return trigramBackoffTable;
  }
  
  /**
   * Returns the trigram segment table.
   *
   * @return the trigram segment table
   */
  public int[] getTrigramSegments() {
    return trigramSegmentTable;
  }
  
  /**
   * Returns the log of the bigram segment size
   *
   * @return the log of the bigram segment size
   */
  public int getLogBigramSegmentSize() {
    return logBigramSegmentSize;
  }
  
  /**
   * Returns all the words.
   *
   * @return all the words
   */
  public String[] getWords() {
    return words;
  }
  
  /**
   * Initializes this LanguageModel
   *
   * @param context
   *                the context to associate this linguist with
   */
  
  
  /**
   * Returns the location (or offset) into the file where bigrams start.
   *
   * @return the location of the bigrams
   */
  public int getBigramOffset() {
    return bigramOffset;
  }
  
  /**
   * Returns the location (or offset) into the file where trigrams start.
   *
   * @return the location of the trigrams
   */
  public int getTrigramOffset() {
    return trigramOffset;
  }
  
  /**
   * Returns the maximum depth of the language model
   *
   * @return the maximum depth of the language mdoel
   */
  public int getMaxDepth() {
    return maxNGram;
  }
  
  /**
   * Returns true if the loaded file is in big-endian.
   *
   * @return true if the loaded file is big-endian
   */
  public boolean getBigEndian() {
    return bigEndian;
  }
  
  /**
   * Loads the contents of the memory-mapped file starting at the given
   * position and for the given size, into a byte buffer. This method is
   * implemented because MappedByteBuffer.load() does not work properly.
   *
   * @param position
   *                the starting position in the file
   * @param size
   *                the number of bytes to load
   *
   * @return the loaded ByteBuffer
   */
  public byte[] loadBuffer(long position, int size) throws IOException {
    // assert ((position + size) <= fileChannel.size());
    file.seek(position);
    byte[] bytes = new byte[size];
    if (file.read(bytes) != size) {
      throw new IOException("Incorrect number of bytes read.");
    }
    return bytes;
  }
  
  /**
   * Loads the language model from the given file.
   *
   * @param location
   *                the file containing the language model
   */
  private void loadBinary(File location) throws IOException {
    
    DataInputStream stream = new DataInputStream
            (new BufferedInputStream(new FileInputStream(location)));
    
    // read standard header string-size; set bigEndian flag
    readHeader(stream);
    
    // +1 is the sentinel unigram at the end
    unigrams = readUnigrams(stream, numberUnigrams + 1, bigEndian);
    
    skipBigramsTrigrams(stream);
    
    // read the bigram probabilities table
    if (numberBigrams > 0) {
      this.bigramProbTable = readFloatTable(stream, bigEndian);
    }
    
    // read the trigram backoff weight table and trigram prob table
    if (numberTrigrams > 0) {
      trigramBackoffTable = readFloatTable(stream, bigEndian);
      trigramProbTable = readFloatTable(stream, bigEndian);
      
      int bigramSegmentSize = 1 << logBigramSegmentSize;
      int trigramSegTableSize = ((numberBigrams + 1) / bigramSegmentSize) + 1;
      trigramSegmentTable = readIntTable(stream, bigEndian,
              trigramSegTableSize);
    }
    
    // read word string names
    int wordsStringLength = readInt(stream, bigEndian);
    if (wordsStringLength <= 0) {
      throw new Error("Bad word string size: " + wordsStringLength);
    }
    
    // read the string of all words
    this.words = readWords(stream, wordsStringLength, numberUnigrams);
    
    if (startWordID > -1) {
      UnigramProbability unigram = unigrams[startWordID];
      unigram.setLogProbability(MIN_PROBABILITY);
    }
    if (endWordID > -1) {
      UnigramProbability unigram = unigrams[endWordID];
      unigram.setLogBackoff(MIN_PROBABILITY);
    }
    
    stream.close();
    file = new RandomAccessFile(location, "r");
  }
  
  /**
   * Reads the LM file header
   *
   * @param stream
   *                the data stream of the LM file
   */
  private void readHeader(DataInputStream stream) throws IOException {
    int headerLength = readInt(stream, bigEndian);
    
    if (headerLength != (DARPA_LM_HEADER.length() + 1)) { // not big-endian
      headerLength = Utilities.swapInteger(headerLength);
      if (headerLength == (DARPA_LM_HEADER.length() + 1)) {
        bigEndian = false;
        System.out.println("Little-endian");
      } else {
        throw new Error("Bad binary LM file magic number: "
                + headerLength + ", not an LM dumpfile?");
      }
    } else {
      System.out.println("Big-endian");
    }
    
    // read and verify standard header string
    String header = readString(stream, headerLength - 1);
    readByte(stream); // read the '\0'
    
    if (!header.equals(DARPA_LM_HEADER)) {
      throw new Error("Bad binary LM file header: " + header);
    }
    
    // read LM filename string size and string
    int fileNameLength = readInt(stream, bigEndian);
    bytesRead += stream.skipBytes(fileNameLength);
    
    numberUnigrams = 0;
    logBigramSegmentSize = LOG2_BIGRAM_SEGMENT_SIZE_DEFAULT;
    
    // read version number, if present. it must be <= 0.
    
    int version = readInt(stream, bigEndian);
    System.out.println("Version: " + version);
    
    if (version <= 0) { // yes, its the version number
      readInt(stream, bigEndian); // read and skip timestamp
      
      // read and skip format description
      int formatLength;
      for (;;) {
        if ((formatLength = readInt(stream, bigEndian)) == 0) {
          break;
        }
        bytesRead += stream.skipBytes(formatLength);
      }
      
      // read log bigram segment size if present
      if (version <= -2) {
        logBigramSegmentSize = readInt(stream, bigEndian);
        if (logBigramSegmentSize < 1 || logBigramSegmentSize > 15) {
          throw new Error("log2(bg_seg_sz) outside range 1..15");
        }
      }
      
      numberUnigrams = readInt(stream, bigEndian);
    } else {
      numberUnigrams = version;
    }
    
    if (numberUnigrams <= 0) {
      throw new Error("Bad number of unigrams: " + numberUnigrams
              + ", must be > 0.");
    } else {
      maxNGram = 1;
    }
    
    if ((numberBigrams = readInt(stream, bigEndian)) < 0) {
      throw new Error("Bad number of bigrams: " + numberBigrams);
    } else {
      maxNGram = 2;
    }
    
    if ((numberTrigrams = readInt(stream, bigEndian)) < 0) {
      throw new Error("Bad number of trigrams: " + numberTrigrams);
    } else {
      maxNGram = 3;
    }
  }
  
  /**
   * Skips the bigrams and trigrams of the LM.
   *
   * @param stream
   *                the source of data
   */
  private void skipBigramsTrigrams(DataInputStream stream) throws IOException {
    // skip all the bigram entries, the +1 is the sentinel at the end
    if (numberBigrams > 0) {
      bigramOffset = bytesRead;
      int bytesToSkip = (numberBigrams + 1)
      * LargeTrigramModel.BYTES_PER_BIGRAM;
      stream.skipBytes(bytesToSkip);
      bytesRead += bytesToSkip;
    }
    
    // skip all the trigram entries
    if (numberTrigrams > 0) {
      trigramOffset = bytesRead;
      int bytesToSkip = numberTrigrams
              * LargeTrigramModel.BYTES_PER_TRIGRAM;
      stream.skipBytes(bytesToSkip);
      bytesRead += bytesToSkip;
    }
  }
    
  /**
   * Apply the language weight to the given array of probabilities.
   */
  private void applyLanguageWeight(float[] logProbabilities,
          float languageWeight) {
    for (int i = 0; i < logProbabilities.length; i++) {
      logProbabilities[i] = logProbabilities[i] * languageWeight;
    }
  }
  
  /**
   * Apply the WIP to the given array of probabilities.
   */
  private void applyWip(float[] logProbabilities, double wip) {
    float logWip = logMath.linearToLog(wip);
    for (int i = 0; i < logProbabilities.length; i++) {
      logProbabilities[i] = logProbabilities[i] + logWip;
    }
  }
  
  /**
   * Reads the probability table from the given DataInputStream.
   *
   * @param stream
   *                the DataInputStream from which to read the table
   * @param bigEndian
   *                true if the given stream is bigEndian, false otherwise
   */
  private float[] readFloatTable(DataInputStream stream, boolean bigEndian)
  throws IOException {
    
    int numProbs = readInt(stream, bigEndian);
    if (numProbs <= 0 || numProbs > MAX_PROB_TABLE_SIZE) {
      throw new Error("Bad probabilities table size: " + numProbs);
    }
    
    float[] probTable = new float[numProbs];
    
    for (int i = 0; i < numProbs; i++) {
      probTable[i] = logMath.log10ToLog(readFloat(stream, bigEndian));
    }
    
    return probTable;
  }
  
  /**
   * Reads a table of integers from the given DataInputStream.
   *
   * @param stream
   *                the DataInputStream from which to read the table
   * @param bigEndian
   *                true if the given stream is bigEndian, false otherwise
   * @param tableSize
   *                the size of the trigram segment table
   *
   * @return the trigram segment table, which is an array of integers
   */
  private int[] readIntTable(DataInputStream stream, boolean bigEndian,
          int tableSize) throws IOException {
    int numSegments = readInt(stream, bigEndian);
    if (numSegments != tableSize) {
      throw new Error("Bad trigram seg table size: " + numSegments);
    }
    int[] segmentTable = new int[numSegments];
    for (int i = 0; i < numSegments; i++) {
      segmentTable[i] = readInt(stream, bigEndian);
    }
    return segmentTable;
  }
  
  /**
   * Read in the unigrams in the given DataInputStream.
   *
   * @param stream
   *                the DataInputStream to read from
   * @param numberUnigrams
   *                the number of unigrams to read
   * @param bigEndian
   *                true if the DataInputStream is big-endian, false
   *                otherwise
   *
   * @return an array of UnigramProbability index by the unigram ID
   */
  private UnigramProbability[] readUnigrams(DataInputStream stream,
          int numberUnigrams, boolean bigEndian) throws IOException {
    
    UnigramProbability[] unigrams = new UnigramProbability[numberUnigrams];
    
    for (int i = 0; i < numberUnigrams; i++) {
      
      // read unigram ID, unigram probability, unigram backoff weight
      int unigramID = readInt(stream, bigEndian);
      
      // if we're not reading the sentinel unigram at the end,
      // make sure that the unigram IDs are consecutive
      if (i != (numberUnigrams - 1)) {
        assert(unigramID == i);
      }
      
      float unigramProbability = readFloat(stream, bigEndian);
      float unigramBackoff = readFloat(stream, bigEndian);
      int firstBigramEntry = readInt(stream, bigEndian);
      System.out.println("unigramID " + unigramID 
              + " unigramProbability " + unigramProbability 
              + " unigramBackoff " + unigramBackoff 
              + " firstBigramEntry " + firstBigramEntry);
      float logProbability = logMath.log10ToLog(unigramProbability);
      float logBackoff = logMath.log10ToLog(unigramBackoff);
      
      unigrams[i] = new UnigramProbability(unigramID, logProbability,
              logBackoff, firstBigramEntry);
    }
    
    return unigrams;
  }
  
  /**
   * Reads a byte from the given DataInputStream.
   *
   * @param stream
   *                the DataInputStream to read from
   *
   * @return the byte read
   */
  private final byte readByte(DataInputStream stream) throws IOException {
    bytesRead++;
    return stream.readByte();
  }
  
  /**
   * Reads an integer from the given DataInputStream.
   *
   * @param stream
   *                the DataInputStream to read from
   * @param bigEndian
   *                true if the DataInputStream is in bigEndian, false
   *                otherwise
   *
   * @return the integer read
   */
  private final int readInt(DataInputStream stream, boolean bigEndian)
  throws IOException {
    bytesRead += 4;
    if (bigEndian) {
      return stream.readInt();
    } else {
      return Utilities.readLittleEndianInt(stream);
    }
  }
  
  /**
   * Reads a float from the given DataInputStream.
   *
   * @param stream
   *                the DataInputStream to read from
   * @param bigEndian
   *                true if the DataInputStream is in bigEndian, false
   *                otherwise
   *
   * @return the float read
   */
  private final float readFloat(DataInputStream stream, boolean bigEndian)
  throws IOException {
    bytesRead += 4;
    if (bigEndian) {
      return stream.readFloat();
    } else {
      return Utilities.readLittleEndianFloat(stream);
    }
  }
  
  /**
   * Reads a string of the given length from the given DataInputStream. It is
   * assumed that the DataInputStream contains 8-bit chars.
   *
   * @param stream
   *                the DataInputStream to read from
   * @param length
   *                the number of characters in the returned string
   *
   * @return a string of the given length from the given DataInputStream
   */
  private final String readString(DataInputStream stream, int length)
  throws IOException {
    StringBuffer buffer = new StringBuffer();
    byte[] bytes = new byte[length];
    bytesRead += stream.read(bytes);
    
    for (int i = 0; i < length; i++) {
      buffer.append((char) bytes[i]);
    }
    return buffer.toString();
  }
  
  /**
   * Reads a series of consecutive Strings from the given stream.
   *
   * @param stream
   *                the DataInputStream to read from
   * @param length
   *                the total length in bytes of all the Strings
   * @param numberUnigrams
   *                the number of String to read
   *
   * @return an array of the Strings read
   */
  private final String[] readWords(DataInputStream stream, int length,
          int numberUnigrams) throws IOException {
    String[] words = new String[numberUnigrams];
    
    StringBuffer buffer = new StringBuffer();
    byte[] bytes = new byte[length];
    bytesRead += stream.read(bytes);
    
    int s = 0;
    for (int i = 0; i < length; i++) {
      char c = (char) bytes[i];
      bytesRead++;
      if (c == '\0') {
        // if its the end of a string, add it to the 'words' array
        words[s] = buffer.toString().toLowerCase();
        buffer = new StringBuffer();
        if (words[s].equals(Dictionary.SENTENCE_START_SPELLING)) {
          startWordID = s;
        } else if (words[s].equals(Dictionary.SENTENCE_END_SPELLING)) {
          endWordID = s;
        }
        s++;
      } else {
        buffer.append(c);
      }
    }
    assert(s == numberUnigrams);
    return words;
  }
}




See more files for this project here

Texai

Texai is an chatbot that intelligently seeks to acquire knowledge and friendly behaviors.

Project homepage: http://sourceforge.net/projects/texai
Programming language(s): Java,Shell Script,XML
License: other

  MyBinaryLoader.java
  MyPropertySheet.java
  NGramImport.java
  file-format-description.txt