Code Search for Developers
 
 
  

Tokenizer.java from Texai at Krugle


Show Tokenizer.java syntax highlighted

/*
 * Tokenizer.java
 *
 * Created on September 21, 2006, 10:27 AM
 *
 * Description: Tokenizer parses an English language text into Tokens.
 *
 * Copyright (C) 2006 Stephen L. Reed.
 *
 * This program is free software; you can redistribute it and/or modify it under the terms
 * of the GNU General Public License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
 * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 * See the GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along with this program;
 * if not, write to the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

package org.texai.grammar;

import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import org.texai.kb.Constants;

/**
 * Parses an English language text into a list of {@link Token}s: words, tags of the
 * form <code>&lt;...&gt;</code>, and period punctuation.
 *
 * <p>The parsing state is kept in instance fields that are reset on every call to
 * {@link #tokens(String)}, so a single instance must not be used by more than one thread
 * at a time.
 *
 * @author reed
 */
public class Tokenizer {

  /** the initial size of the tokens list */
  private static final int TOKENS_INITIAL_SIZE = 30;

  /** the text */
  private String text;

  /** the text character index */
  private int textCharacterIndex;

  /** the current text character */
  private char textCharacter;

  /** the list of tokens parsed from the text */
  private List<Token> tokens;

  /** the token buffer */
  private StringBuilder tokenBuffer;

  /** Creates a new instance of Tokenizer */
  public Tokenizer() {
  }

  /** Returns an iterator over the list of tokens parsed from the given text.
   *
   * Whitespace separates tokens and is discarded.  A letter starts a word, '<' starts a tag
   * and '.' is a punctuation token.  Any other character that appears where a token would
   * start is skipped, because no token type is defined for it here.
   *
   * @param text the text
   * @return an iterator over the list of tokens parsed from the given text
   */
  public Iterator<Token> tokens(final String text) {
    //Preconditions
    assert text != null : "text must not be null";

    this.text = text;
    tokenBuffer = new StringBuilder(Constants.STRING_BUILDER_SIZE_SMALL);
    tokens = new ArrayList<Token>(TOKENS_INITIAL_SIZE);
    textCharacterIndex = 0;
    while (textCharacterIndex < text.length()) {
      textCharacter = text.charAt(textCharacterIndex);
      if (Character.isWhitespace(textCharacter)) {
        textCharacterIndex++;
      } else if (Character.isLetter(textCharacter)) {
        parseWord();
      } else if (textCharacter == '<') {
        parseTag();
      } else if (textCharacter == '.') {
        parsePunctuation();
      } else {
        // Unrecognized token start (for example a digit or a comma): advance past it,
        // otherwise the index never moves and this loop would never terminate.
        textCharacterIndex++;
      }
    }
    return tokens.iterator();
  }

  /** Parses the next word from the text.
   *
   * On entry the current text character is the first character of the word.  The word
   * extends over letters, Java identifier part characters, '@', '-', and any period that is
   * immediately followed by a letter or Java identifier part character.  On exit the text
   * character index addresses the first character after the word, or equals the text
   * length when the word ends the text.
   */
  private void parseWord() {
    while (true) {
      tokenBuffer.append(textCharacter);
      if (++textCharacterIndex >= text.length()) {
        // end of text: the word gathered so far is complete
        break;
      }
      textCharacter = text.charAt(textCharacterIndex);
      if (isWordCharacter(textCharacter)
              || textCharacter == '@'
              || textCharacter == '-') {
        continue;
      }
      if (textCharacter == '.') {
        final int nextTextCharacterIndex = textCharacterIndex + 1;
        if (nextTextCharacterIndex < text.length()
                && isWordCharacter(text.charAt(nextTextCharacterIndex))) {
          // period embedded in a word
          continue;
        }
      }
      // the current character does not belong to the word
      break;
    }
    addToken(new Token(Token.TokenType.WORD, tokenBuffer.toString()));
  }

  /** Parses the next tag from the text.
   *
   * On entry the current text character is the opening '<'.  Everything up to and including
   * the next '>' becomes one TAG token.  When the text ends before a closing '>' is found,
   * the remaining text is emitted as the TAG token so that no input is silently dropped and
   * the token buffer is left empty.
   */
  private void parseTag() {
    while (true) {
      tokenBuffer.append(textCharacter);
      if (++textCharacterIndex >= text.length()) {
        // unterminated tag at the end of the text
        break;
      }
      textCharacter = text.charAt(textCharacterIndex);
      if (textCharacter == '>') {
        tokenBuffer.append(textCharacter);
        // step past the closing '>'
        ++textCharacterIndex;
        break;
      }
    }
    addToken(new Token(Token.TokenType.TAG, tokenBuffer.toString()));
  }

  /** Parses the next punctuation symbol from the text. */
  private void parsePunctuation() {
    tokenBuffer.append(textCharacter);
    addToken(new Token(Token.TokenType.PUNCTUATION, tokenBuffer.toString()));
    ++textCharacterIndex;
  }

  /** Returns whether the given character may continue a word.
   *
   * @param character the character to test
   * @return true if the character is a letter or a Java identifier part
   */
  private static boolean isWordCharacter(final char character) {
    return Character.isLetter(character) || Character.isJavaIdentifierPart(character);
  }

  /** Appends the given token to the tokens list and empties the token buffer.
   *
   * @param token the token built from the current token buffer contents
   */
  private void addToken(final Token token) {
    tokens.add(token);
    tokenBuffer.setLength(0);
  }
}




See more files for this project here

Texai

Texai is a chatbot that intelligently seeks to acquire knowledge and friendly behaviors.

Project homepage: http://sourceforge.net/projects/texai
Programming language(s): Java,Shell Script,XML
License: other

  domainEntity/
    AbstractComposedConstruction.java
    AbstractConstruction.java
    AlternativeConstruction.java
    CategoryConstruction.java
    ConstituentAdapter.java
    OptionalConstruction.java
    RegularExpression.java
    RegularExpression_GroupVariableInfo.java
    RepetitiveConstruction.java
    SequenceConstruction.java
    SimpleConstruction.java
    ValidateGrammarEntities.java
    WordFormConstruction.java
    WordSenseConstruction.java
    XMLTagWord.java
  integration/
    IntegrationGraph.java
    Link.java
    Node.java
  understanding/
    AbstractActiveComposedConstruction.java
    AbstractActiveConstruction.java
    ActiveAbstractComposedConstruction.java
    ActiveAlternativeConstruction.java
    ActiveCategoryConstruction.java
    ActiveComposedConstruction.java
    ActiveComposedConstructionSet.java
    ActiveConstituent.java
    ActiveOptionalConstruction.java
    ActiveRegularExpression.java
    ActiveRepetitiveConstruction.java
    ActiveSequenceConstruction.java
    ActiveSimpleConstruction.java
    ActiveWordFormConstruction.java
    ActiveXMLTagWord.java
  ComposedConstruction.java
  Constituent.java
  ConstructionInitializer.java
  GrammarConstants.java
  Token.java
  Tokenizer.java