HtmlNekoParser.java from Kneobase at Krugle
Show HtmlNekoParser.java syntax highlighted
/*
* Created on 24/02/2005
*
*/
package com.kneobase.extractors.parser;
import java.io.IOException;
import java.io.InputStream;
import java.text.ParseException;
import org.apache.html.dom.HTMLDocumentImpl;
import org.cyberneko.html.parsers.DOMFragmentParser;
import org.w3c.dom.DocumentFragment;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
/**
 * Parser that extracts the title and the plain text of an HTML stream using
 * the NekoHTML {@link DOMFragmentParser}.
 *
 * <p>The text of the first <code>title</code> element found (depth-first,
 * name compared case-insensitively) is stored in the result under
 * <code>METADATA_TITLE</code>. The concatenation of every text node in the
 * parsed fragment, including the title text, becomes the body.</p>
 *
 * <p>A single {@link DOMFragmentParser} instance is reused for every call to
 * {@link #parse(InputStream)}. NOTE(review): presumably this makes an instance
 * unsuitable for concurrent parse calls; confirm against NekoHTML before
 * sharing one across threads.</p>
 *
 * @author Ernesto De Santis (ernesto.desantis@colaborativa.net)
 *
 * Colaborativa.net
 *
 */
public class HtmlNekoParser extends A_StringParser {

    /** Metadata keys handed to the superclass constructor. */
    private static final String[] metadata = {METADATA_TITLE};

    /** NekoHTML parser, reused for every call to {@link #parse(InputStream)}. */
    private final DOMFragmentParser parser = new DOMFragmentParser();

    /**
     * Character encoding to declare for the input stream, or <code>null</code>
     * (the default) to leave the encoding unspecified.
     */
    private String encoding;

    /**
     * Creates a parser that declares <code>METADATA_TITLE</code> as its only
     * metadata key.
     */
    public HtmlNekoParser() {
        super(metadata);
    }

    /**
     * Parses an HTML stream and returns its title and text content.
     *
     * @param is the HTML input; it is not closed by this method
     * @return a {@link ParsedBody} holding the title under
     *         <code>METADATA_TITLE</code> when a non-empty title was found,
     *         and the body when the document contains any text
     * @throws IOException if reading the stream fails
     * @throws ParseException if NekoHTML reports a {@link SAXException}; the
     *         original exception is available through <code>getCause()</code>
     */
    public ParsedBody parse(InputStream is) throws IOException, ParseException {
        DocumentFragment fragment = new HTMLDocumentImpl().createDocumentFragment();

        InputSource source = new InputSource(is);
        // Honour the encoding configured through setEncoding(); previously the
        // value was stored but never passed on to the parser.
        if (encoding != null && encoding.length() > 0) {
            source.setEncoding(encoding);
        }

        try {
            parser.parse(source, fragment);
        } catch (SAXException e) {
            // ParseException has no cause-taking constructor, so chain manually
            // to keep the original stack trace.
            ParseException wrapped = new ParseException(e.getMessage(), -1);
            wrapped.initCause(e);
            throw wrapped;
        }

        StringBuffer buffer = new StringBuffer();
        getText(buffer, fragment, "title");
        String title = buffer.toString();

        // Reuse the same buffer for the full-document text.
        buffer.setLength(0);
        getText(buffer, fragment);
        String body = buffer.toString();

        ParsedBody parsedBody = new ParsedBody();
        if (title.length() > 0) {
            parsedBody.put(METADATA_TITLE, title);
        }
        if (body.length() > 0) {
            parsedBody.setBody(body);
        }
        return parsedBody;
    }

    /**
     * Appends the value of every text node in the subtree rooted at
     * <code>node</code> to <code>sb</code>, in document order.
     *
     * @param sb   buffer receiving the text
     * @param node root of the subtree to walk
     */
    private void getText(StringBuffer sb, Node node) {
        if (node.getNodeType() == Node.TEXT_NODE) {
            sb.append(node.getNodeValue());
        }
        NodeList children = node.getChildNodes();
        if (children == null) {
            return;
        }
        int count = children.getLength();
        for (int i = 0; i < count; i++) {
            getText(sb, children.item(i));
        }
    }

    /**
     * Searches the subtree rooted at <code>node</code> depth-first for the
     * first element whose name equals <code>element</code> (ignoring case) and
     * appends all text beneath that element to <code>sb</code>.
     *
     * @param sb      buffer receiving the text
     * @param node    root of the subtree to search
     * @param element element name to look for
     * @return <code>true</code> if a matching element was found, in which case
     *         the search stops; <code>false</code> otherwise
     */
    private boolean getText(StringBuffer sb, Node node, String element) {
        if (node.getNodeType() == Node.ELEMENT_NODE
                && element.equalsIgnoreCase(node.getNodeName())) {
            getText(sb, node);
            return true;
        }
        NodeList children = node.getChildNodes();
        if (children == null) {
            return false;
        }
        int count = children.getLength();
        for (int i = 0; i < count; i++) {
            if (getText(sb, children.item(i), element)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Sets the character encoding declared for subsequently parsed streams.
     *
     * @param string the encoding name; <code>null</code> or an empty string
     *        leaves the encoding unspecified
     */
    public void setEncoding(String string) {
        encoding = string;
    }
}
See more files for this project here
Kneobase is an enterprise search engine based on the Lucene search engine and the Spring framework. It allows full-text search across many different content sources. It is highly adaptable out of the box and has a pluggable architecture.
Project homepage:
http://sourceforge.net/projects/kneobase
Programming language(s): Java,XML
License: other
HtmlNekoParser.java