001: package org.openedit.sitesearch.parse;
002:
003: import java.io.StringWriter;
004: import java.io.Writer;
005: import java.net.MalformedURLException;
006: import java.net.URL;
007: import java.util.ArrayList;
008: import java.util.Iterator;
009: import java.util.List;
010:
011: import org.apache.commons.logging.Log;
012: import org.apache.commons.logging.LogFactory;
013: import org.openedit.links.Link;
014: import org.openedit.sitesearch.Content;
015: import org.openedit.sitesearch.Parse;
016: import org.openedit.sitesearch.Parser;
017:
018: import au.id.jericho.lib.html.CharacterReference;
019: import au.id.jericho.lib.html.Element;
020: import au.id.jericho.lib.html.HTMLElementName;
021: import au.id.jericho.lib.html.Segment;
022: import au.id.jericho.lib.html.Source;
023: import au.id.jericho.lib.html.StartTag;
024:
025: import com.openedit.OpenEditException;
026: import com.openedit.util.PathUtilities;
027:
028: public class HtmlParser implements Parser {
029: private static final Log log = LogFactory.getLog(HtmlParser.class);
030:
031: public Parse getParse(Content inUrl) throws OpenEditException {
032: log.info("parse " + inUrl.getUrl());
033: Parse results = new Parse();
034: Source source = null;
035: try {
036: source = new Source(new URL(inUrl.getUrl().getPath()));
037: } catch (Exception ex) {
038: throw new OpenEditException(ex);
039: }
040:
041: Writer l = new StringWriter();
042:
043: source.setLogWriter(l); // send log messages to stderr
044:
045: source.fullSequentialParse();
046:
047: //System.out.println("Document title:");
048: String title = getTitle(source);
049: results.setTitle(title);
050: //System.out.println(title==null ? "(none)" : title);
051:
052: //System.out.println("\nDocument keywords:");
053: String keywords = getMetaValue(source, "keywords");
054: //System.out.println(keywords==null ? "(none)" : keywords);
055: if (keywords != null) {
056: keywords = keywords.replace(",", " ");
057: }
058: //TODO: Each time someone links to this page maybe we should add to a list of links
059: if (inUrl.getUrl().getText() != null) {
060: if (keywords == null) {
061: keywords = inUrl.getUrl().getText();
062: } else {
063: keywords = inUrl.getUrl().getText() + " " + keywords;
064: }
065: }
066: results.put("keywords", keywords);
067:
068: //System.out.println("\nDocument description:");
069: String description = getMetaValue(source, "description");
070: //System.out.println(description==null ? "(none)" : description);
071: results.put("summary", description);
072:
073: //System.out.println("\nLinks to other documents:");
074: List linkElements = source.findAllElements(HTMLElementName.A);
075:
076: List links = new ArrayList();
077: String hostName = inUrl.getUrl().getPath();
078: hostName = hostName.substring(0, hostName.indexOf("/", 8));
079:
080: for (Iterator i = linkElements.iterator(); i.hasNext();) {
081: Element linkElement = (Element) i.next();
082: String href = linkElement.getAttributeValue("href");
083: if (href == null)
084: continue;
085:
086: String follow = linkElement.getAttributeValue("rel");
087: if (follow != null && "nofollow".equalsIgnoreCase(follow)) {
088: continue;
089: }
090: //System.out.println(href+" ("+label+")");
091: if (href.length() < 2 || href.startsWith("#")
092: || href.indexOf("?") > -1) {
093: continue;
094: }
095: int pound = href.indexOf("#");
096: if (pound > -1) {
097: href = href.substring(0, pound);
098: }
099: if (href.length() < 2) {
100: continue;
101: }
102: // A element can contain other tags so need to extract the text from it:
103: Link thelink = new Link();
104: String label = linkElement.getContent().extractText();
105: thelink.setText(label);
106: if (href.startsWith("http")) {
107: thelink.setPath(href);
108: } else if (href.startsWith("/")) {
109: thelink.setPath(hostName + href);
110: } else {
111: String cleanhref = PathUtilities.resolveRelativePath(
112: href, inUrl.toString());
113: thelink.setPath(hostName + cleanhref);
114: }
115: links.add(thelink);
116:
117: }
118: results.put("links", links);
119:
120: //System.out.println("\nAll text from BODY (exluding content inside SCRIPT and STYLE elements):");
121: Element bodyElement = source.findNextElement(0,
122: HTMLElementName.BODY);
123: Segment contentSegment = (bodyElement == null) ? source
124: : bodyElement.getContent();
125: results.setText(contentSegment.extractText(true));
126: return results;
127: }
128:
129: private static String getTitle(Source source) {
130: Element titleElement = source.findNextElement(0,
131: HTMLElementName.TITLE);
132: if (titleElement == null)
133: return null;
134: // TITLE element never contains other tags so just decode it collapsing whitespace:
135: return CharacterReference.decodeCollapseWhiteSpace(titleElement
136: .getContent());
137: }
138:
139: private static String getMetaValue(Source source, String key) {
140: for (int pos = 0; pos < source.length();) {
141: StartTag startTag = source.findNextStartTag(pos, "name",
142: key, false);
143: if (startTag == null)
144: return null;
145: if (startTag.getName() == HTMLElementName.META)
146: return startTag.getAttributeValue("content"); // Attribute values are automatically decoded
147: pos = startTag.getEnd();
148: }
149: return null;
150: }
151:
152: }
|