001: /*
002: * $Id: HTMLParser.java,v 1.5 2004/10/10 14:27:47 csaltos Exp $
003: *
004: * Copyright 1999 PUCE [http://www.puce.edu.ec]
005: *
006: * Licensed under the Apache License, Version 2.0 (the "License");
007: * you may not use this file except in compliance with the License.
008: * You may obtain a copy of the License at
009: *
010: * http://www.apache.org/licenses/LICENSE-2.0
011: *
012: * Unless required by applicable law or agreed to in writing, software
013: * distributed under the License is distributed on an "AS IS" BASIS,
014: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015: * See the License for the specific language governing permissions and
016: * limitations under the License.
017: */
018: package org.oxyus.crawler.parser;
019:
020: import java.io.IOException;
021: import java.io.InputStream;
022:
023: import org.apache.log4j.Logger;
024: import org.apache.lucene.document.Document;
025: import org.apache.lucene.document.Field;
026: import org.cyberneko.html.parsers.DOMParser;
027: import org.oxyus.crawler.CrawlingException;
028: import org.oxyus.crawler.Page;
029: import org.w3c.dom.Node;
030: import org.xml.sax.InputSource;
031: import org.xml.sax.SAXException;
032:
033: /**
034: * @author Carlos Saltos (csaltos[@]users.sourceforge.net)
035: */
036: public class HTMLParser {
037:
038: protected Document doc;
039:
040: protected Logger log;
041:
042: protected StringBuffer contents;
043:
044: protected StringBuffer summary;
045:
046: protected boolean contentCollect;
047:
048: protected Page page;
049:
050: String htmlDocumentBase;
051:
052: public HTMLParser(Page page) {
053: log = Logger.getLogger(HTMLParser.class);
054: this .page = page;
055: }
056:
057: public Document collect(InputStream in) throws CrawlingException {
058: doc = new Document();
059: // Assume contentCollect for documents without the BODY element
060: contentCollect = true;
061: contents = new StringBuffer(); // TODO: set the buffer sizes from configuration
062: summary = new StringBuffer();
063: try {
064: DOMParser htmlDOMParser = new DOMParser();
065: InputSource htmlIn = new InputSource(in);
066: htmlDOMParser.parse(htmlIn);
067: collect(htmlDOMParser.getDocument());
068: doc.add(Field.Text("contents", contents.toString()));
069: doc.add(Field.Text("summary", summary.toString()));
070: log.debug("summary:\n" + summary.toString());
071: log.debug("contents:\n" + contents.toString());
072: } catch (SAXException se) {
073: log.error("HTML parse exception");
074: throw new CrawlingException("HTML parse exception");
075: } catch (IOException ioe) {
076: log.error("IO exception");
077: throw new CrawlingException("IO exception");
078: }
079: return doc;
080: }
081:
082: protected void collect(Node node) {
083: int nodeType = node.getNodeType();
084: // Actions for element nodes
085: if (nodeType == Node.ELEMENT_NODE) {
086: String nodeName = node.getNodeName();
087: // Check if we are in the head element
088: if ("HEAD".equals(nodeName)) {
089: contentCollect = false;
090: }
091: // Check if we are in the body element
092: if ("BODY".equals(nodeName)) {
093: contentCollect = true;
094: }
095: // Get the document's title
096: if ("TITLE".equals(nodeName)) {
097: String title = getFirstChildText(node);
098: if (title != null) {
099: log.debug("Adding title '" + title + "'");
100: doc.add(Field.Text("title", title));
101: }
102: }
103: // Get any referenced link from this page
104: if ("A".equals(nodeName)) {
105: String link = getAttributeText(node, "href");
106: if (link != null) {
107: log.debug("Link found '" + link + "'");
108: page.recordLink(link);
109: }
110: }
111: if ("FRAME".equals(nodeName)) {
112: String link = getAttributeText(node, "src");
113: if (link != null) {
114: log.debug("Link found '" + link + "'");
115: page.recordLink(link);
116: }
117: }
118: }
119: // If a text is found and the contentCollect is active store it
120: // in the contents buffer
121: if (contentCollect && node.getNodeType() == Node.TEXT_NODE) {
122: // Ensure is not a script text
123: if (!"SCRIPT".equals(getParentNodeName(node))) {
124: String text = node.getNodeValue();
125: if (text != null) {
126: text = clean(text);
127: if (text.length() > 0) {
128: contents.append(text);
129: // Separe the text with a blank for not merge two
130: // continues texts
131: contents.append(" ");
132: // TODO: remove static summary
133: // Add the text to the summary
134: if (summary.length() < 230) {
135: summary.append(text.trim());
136: summary.append(" ");
137: }
138: }
139: }
140: }
141: }
142: Node child = node.getFirstChild();
143: while (child != null) {
144: collect(child);
145: child = child.getNextSibling();
146: }
147: }
148:
149: protected String clean(String text) {
150: // TODO: improve the text filtering
151: String cleanedText = text.replace('\r', ' ');
152: cleanedText = cleanedText.replace('\n', ' ');
153: cleanedText = cleanedText.replace('\t', ' ');
154: return cleanedText.trim();
155: }
156:
157: protected String getFirstChildText(Node node) {
158: String text = null;
159: Node child = node.getFirstChild();
160: if (child != null && child.getNodeType() == Node.TEXT_NODE) {
161: text = child.getNodeValue();
162: }
163: return text;
164: }
165:
166: protected String getAttributeText(Node node, String attributeName) {
167: String text = null;
168: if (node.hasAttributes()) {
169: Node attributeNode = node.getAttributes().getNamedItem(
170: attributeName);
171: if (attributeNode != null) {
172: text = attributeNode.getNodeValue();
173: }
174: }
175: return text;
176: }
177:
178: protected String getParentNodeName(Node node) {
179: String parentNodeName = null;
180: Node parentNode = node.getParentNode();
181: if (parentNode != null) {
182: parentNodeName = parentNode.getNodeName();
183: }
184: return parentNodeName;
185: }
186:
187: }
|