Code Search for Developers
 
 
  

HtmlNekoParser.java from Kneobase at Krugle


Show HtmlNekoParser.java syntax highlighted

/*
 * Created on 24/02/2005
 * 
 */
package com.kneobase.extractors.parser;

import java.io.IOException;
import java.io.InputStream;
import java.text.ParseException;

import org.apache.html.dom.HTMLDocumentImpl;
import org.cyberneko.html.parsers.DOMFragmentParser;
import org.w3c.dom.DocumentFragment;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;

/**
 * HTML parser that uses the NekoHTML {@link DOMFragmentParser} to build a DOM
 * fragment from an input stream and then extracts plain text from it.
 *
 * <p>The text of the first <code>title</code> element found (depth-first,
 * document order) is stored under {@code METADATA_TITLE}; the concatenation of
 * every text node in the fragment (which includes the title text) is stored as
 * the body.</p>
 *
 * <p>A single {@link DOMFragmentParser} instance is held per object and reused
 * across calls to {@link #parse(InputStream)}.</p>
 *
 * @author Ernesto De Santis (ernesto.desantis@colaborativa.net)
 *
 * Colaborativa.net
 *
 */
public class HtmlNekoParser extends A_StringParser {

    /** Metadata keys this parser declares to its superclass. */
    private static final String[] metadata = {METADATA_TITLE};

    /**
     * Character encoding to declare on the input source, or <code>null</code>
     * to leave encoding selection to the underlying parser.
     */
    private String encoding;

    /** Underlying NekoHTML parser, reused for every call to {@link #parse(InputStream)}. */
    private DOMFragmentParser parser = new DOMFragmentParser();

    public HtmlNekoParser() {
        super(metadata);
    }

    /**
     * Parses the given HTML stream and extracts its title and text.
     *
     * @param is stream of HTML bytes to parse
     * @return a {@link ParsedBody} on which the title is put under
     *         {@code METADATA_TITLE} when it is non-empty, and on which the
     *         body is set when the extracted text is non-empty
     * @throws IOException if reading from the stream fails
     * @throws ParseException if the underlying parser reports a
     *         {@link SAXException}; the original exception is attached as the cause
     */
    public ParsedBody parse(InputStream is) throws IOException, ParseException {
        DocumentFragment node = new HTMLDocumentImpl().createDocumentFragment();

        InputSource source = new InputSource(is);
        // Honour the encoding configured through setEncoding(); previously the
        // value was stored but never handed to the parser.
        if (encoding != null && encoding.length() > 0) {
            source.setEncoding(encoding);
        }

        try {
            parser.parse(source, node);
        } catch (SAXException e) {
            // ParseException has no cause-taking constructor, so attach it explicitly.
            ParseException pe = new ParseException(e.getMessage(), -1);
            pe.initCause(e);
            throw pe;
        }

        StringBuffer sb = new StringBuffer();
        getText(sb, node, "title");
        String title = sb.toString();

        // Reuse the same buffer for the full-text pass.
        sb.setLength(0);
        getText(sb, node);
        String body = sb.toString();

        ParsedBody parsedBody = new ParsedBody();
        if (title.length() > 0) {
            parsedBody.put(METADATA_TITLE, title);
        }
        if (body.length() > 0) {
            parsedBody.setBody(body);
        }
        return parsedBody;
    }

    /**
     * Appends the value of every text node at or below <code>node</code> to
     * <code>sb</code>, in depth-first document order.
     *
     * @param sb buffer receiving the text
     * @param node root of the subtree to walk
     */
    private void getText(StringBuffer sb, Node node) {
        if (node.getNodeType() == Node.TEXT_NODE) {
            sb.append(node.getNodeValue());
        }
        NodeList children = node.getChildNodes();
        if (children != null) {
            int len = children.getLength();
            for (int i = 0; i < len; i++) {
                getText(sb, children.item(i));
            }
        }
    }

    /**
     * Searches depth-first for the first element whose name matches
     * <code>element</code> (case-insensitively) and appends all text beneath
     * it to <code>sb</code>.
     *
     * @param sb buffer receiving the text
     * @param node root of the subtree to search
     * @param element element name to look for
     * @return <code>true</code> if a matching element was found, in which case
     *         the search stops; <code>false</code> otherwise
     */
    private boolean getText(StringBuffer sb, Node node, String element) {
        if (node.getNodeType() == Node.ELEMENT_NODE
                && element.equalsIgnoreCase(node.getNodeName())) {
            getText(sb, node);
            return true;
        }
        NodeList children = node.getChildNodes();
        if (children != null) {
            int len = children.getLength();
            for (int i = 0; i < len; i++) {
                if (getText(sb, children.item(i), element)) {
                    return true;
                }
            }
        }
        return false;
    }

    /**
     * Sets the character encoding declared on the input source for subsequent
     * calls to {@link #parse(InputStream)}.
     *
     * @param string encoding name; <code>null</code> or an empty string leaves
     *        the input source's encoding unset
     */
    public void setEncoding(String string) {
        encoding = string;
    }

}




See more files for this project here

Kneobase

Kneobase is an enterprise search engine based on the Lucene search engine and the Spring framework. It performs full-text search across many different content sources. It is highly adaptable out of the box and has a pluggable architecture.

Project homepage: http://sourceforge.net/projects/kneobase
Programming language(s): Java,XML
License: other

  HtmlNekoParser.java