// Tokenizer.java from Texai at Krugle
// Show Tokenizer.java syntax highlighted
/*
* Tokenizer.java
*
* Created on September 21, 2006, 10:27 AM
*
* Description: Tokenizer parses an English language text into Tokens.
*
* Copyright (C) 2006 Stephen L. Reed.
*
* This program is free software; you can redistribute it and/or modify it under the terms
* of the GNU General Public License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
* without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
* See the GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along with this program;
* if not, write to the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
package org.texai.grammar;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import org.texai.kb.Constants;
/**
 * Parses an English language text into a sequence of {@link Token}s.
 *
 * <p>Three kinds of token are recognized: words, tags delimited by {@code <} and {@code >},
 * and the period as punctuation. The parse state is kept in instance fields that every call
 * to {@link #tokens(String)} overwrites, so an instance is not safe for concurrent use.
 *
 * @author reed
 */
public class Tokenizer {

  /** the initial size of the tokens list */
  private static final int TOKENS_INITIAL_SIZE = 30;

  /** the text */
  private String text;

  /** the text character index */
  private int textCharacterIndex;

  /** the current text character */
  private char textCharacter;

  /** the list of tokens parsed from the text */
  private List<Token> tokens;

  /** the token buffer */
  private StringBuilder tokenBuffer;

  /** Creates a new instance of Tokenizer */
  public Tokenizer() {
  }

  /** Returns an iterator over the list of tokens parsed from the given text.
   *
   * <p>A letter starts a word, {@code <} starts a tag and {@code .} is a punctuation token.
   * Whitespace and every other character that cannot start a token are skipped.
   *
   * @param text the text
   * @return an iterator over the list of tokens parsed from the given text
   */
  public Iterator<Token> tokens(final String text) {
    //Preconditions
    assert text != null : "text must not be null";

    this.text = text;
    tokenBuffer = new StringBuilder(Constants.STRING_BUILDER_SIZE_SMALL);
    tokens = new ArrayList<Token>(TOKENS_INITIAL_SIZE);
    textCharacterIndex = 0;
    while (textCharacterIndex < text.length()) {
      textCharacter = text.charAt(textCharacterIndex);
      if (Character.isLetter(textCharacter)) {
        parseWord();
      } else if (textCharacter == '<') {
        parseTag();
      } else if (textCharacter == '.') {
        parsePunctuation();
      } else {
        // Whitespace, and any character that starts no token (digits, commas, ...), must
        // still advance the index; otherwise this loop would never terminate.
        textCharacterIndex++;
      }
    }
    return tokens.iterator();
  }

  /** Parses the next word from the text.
   *
   * <p>On entry the current character is the first character of the word. The word continues
   * over letters, Java identifier part characters, {@code @}, {@code -}, and over a period
   * that is immediately followed by a letter or Java identifier part character. On exit the
   * index is at the first character after the word, or at the end of the text.
   */
  private void parseWord() {
    while (true) {
      tokenBuffer.append(textCharacter);
      if (++textCharacterIndex >= text.length()) {
        // the word runs to the end of the text; it is still emitted below
        break;
      }
      textCharacter = text.charAt(textCharacterIndex);
      final boolean isWordContinued = isWordCharacter(textCharacter)
              || textCharacter == '@'
              || textCharacter == '-'
              || (textCharacter == '.' && isPeriodEmbeddedInWord());
      if (!isWordContinued) {
        break;
      }
    }
    addToken(new Token(Token.TokenType.WORD, tokenBuffer.toString()));
  }

  /** Parses the next tag from the text.
   *
   * <p>On entry the current character is the opening {@code <}. Everything up to and
   * including the first {@code >} becomes one TAG token. A tag that is not closed before
   * the end of the text is discarded without producing a token.
   */
  private void parseTag() {
    while (true) {
      tokenBuffer.append(textCharacter);
      if (++textCharacterIndex >= text.length()) {
        // unterminated tag: drop the partial text so it cannot leak into a later token
        tokenBuffer.setLength(0);
        return;
      }
      textCharacter = text.charAt(textCharacterIndex);
      if (textCharacter == '>') {
        tokenBuffer.append(textCharacter);
        addToken(new Token(Token.TokenType.TAG, tokenBuffer.toString()));
        ++textCharacterIndex;
        return;
      }
    }
  }

  /** Parses the next punctuation symbol from the text. */
  private void parsePunctuation() {
    tokenBuffer.append(textCharacter);
    addToken(new Token(Token.TokenType.PUNCTUATION, tokenBuffer.toString()));
    ++textCharacterIndex;
  }

  /** Appends the given token to the token list and empties the token buffer.
   *
   * @param token the token built from the current contents of the token buffer
   */
  private void addToken(final Token token) {
    tokens.add(token);
    tokenBuffer.setLength(0);
  }

  /** Returns whether the character after the current one continues a word, which makes a
   * period at the current position part of that word rather than punctuation.
   *
   * @return true if a following character exists and is a word character
   */
  private boolean isPeriodEmbeddedInWord() {
    final int nextTextCharacterIndex = textCharacterIndex + 1;
    return nextTextCharacterIndex < text.length()
            && isWordCharacter(text.charAt(nextTextCharacterIndex));
  }

  /** Returns whether the given character is a letter or a Java identifier part.
   *
   * @param character the character to test
   * @return true if the character is a letter or a Java identifier part
   */
  private static boolean isWordCharacter(final char character) {
    return Character.isLetter(character) || Character.isJavaIdentifierPart(character);
  }
}
// See more files for this project here