Source Code Cross Referenced for ExtractorHTML.java in  » Web-Crawler » heritrix » org » archive » crawler » extractor » Java Source Code / Java DocumentationJava Source Code and Java Documentation

Java Source Code / Java Documentation
1. 6.0 JDK Core
2. 6.0 JDK Modules
3. 6.0 JDK Modules com.sun
4. 6.0 JDK Modules com.sun.java
5. 6.0 JDK Modules sun
6. 6.0 JDK Platform
7. Ajax
8. Apache Harmony Java SE
9. Aspect oriented
10. Authentication Authorization
11. Blogger System
12. Build
13. Byte Code
14. Cache
15. Chart
16. Chat
17. Code Analyzer
18. Collaboration
19. Content Management System
20. Database Client
21. Database DBMS
22. Database JDBC Connection Pool
23. Database ORM
24. Development
25. EJB Server geronimo
26. EJB Server GlassFish
27. EJB Server JBoss 4.2.1
28. EJB Server resin 3.1.5
29. ERP CRM Financial
30. ESB
31. Forum
32. GIS
33. Graphic Library
34. Groupware
35. HTML Parser
36. IDE
37. IDE Eclipse
38. IDE Netbeans
39. Installer
40. Internationalization Localization
41. Inversion of Control
42. Issue Tracking
43. J2EE
44. JBoss
45. JMS
46. JMX
47. Library
48. Mail Clients
49. Net
50. Parser
51. PDF
52. Portal
53. Profiler
54. Project Management
55. Report
56. RSS RDF
57. Rule Engine
58. Science
59. Scripting
60. Search Engine
61. Security
62. Servlet Container
63. Source Control
64. Swing Library
65. Template Engine
66. Test Coverage
67. Testing
68. UML
69. Web Crawler
70. Web Framework
71. Web Mail
72. Web Server
73. Web Services
74. Web Services apache cxf 2.0.1
75. Web Services AXIS2
76. Wiki Engine
77. Workflow Engines
78. XML
79. XML UI
Java
Java Tutorial
Java Open Source
Jar File Download
Java Articles
Java Products
Java by API
Photoshop Tutorials
Maya Tutorials
Flash Tutorials
3ds-Max Tutorials
Illustrator Tutorials
GIMP Tutorials
C# / C Sharp
C# / CSharp Tutorial
C# / CSharp Open Source
ASP.Net
ASP.NET Tutorial
JavaScript DHTML
JavaScript Tutorial
JavaScript Reference
HTML / CSS
HTML CSS Reference
C / ANSI-C
C Tutorial
C++
C++ Tutorial
Ruby
PHP
Python
Python Tutorial
Python Open Source
SQL Server / T-SQL
SQL Server / T-SQL Tutorial
Oracle PL / SQL
Oracle PL/SQL Tutorial
PostgreSQL
SQL / MySQL
MySQL Tutorial
VB.Net
VB.Net Tutorial
Flash / Flex / ActionScript
VBA / Excel / Access / Word
XML
XML Tutorial
Microsoft Office PowerPoint 2007 Tutorial
Microsoft Office Excel 2007 Tutorial
Microsoft Office Word 2007 Tutorial
Java Source Code / Java Documentation » Web Crawler » heritrix » org.archive.crawler.extractor 
Source Cross Referenced  Class Diagram Java Document (Java Doc) 


001:        /* Copyright (C) 2003 Internet Archive.
002:         *
003:         * This file is part of the Heritrix web crawler (crawler.archive.org).
004:         *
005:         * Heritrix is free software; you can redistribute it and/or modify
006:         * it under the terms of the GNU Lesser Public License as published by
007:         * the Free Software Foundation; either version 2.1 of the License, or
008:         * any later version.
009:         *
010:         * Heritrix is distributed in the hope that it will be useful,
011:         * but WITHOUT ANY WARRANTY; without even the implied warranty of
012:         * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
013:         * GNU Lesser Public License for more details.
014:         *
015:         * You should have received a copy of the GNU Lesser Public License
016:         * along with Heritrix; if not, write to the Free Software
017:         * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
018:         *
019:         * SimpleHTMLExtractor.java
020:         * Created on Jun 5, 2003
021:         *
022:         * $Header$
023:         */
024:        package org.archive.crawler.extractor;
025:
026:        import java.io.IOException;
027:        import java.util.ArrayList;
028:        import java.util.Iterator;
029:        import java.util.logging.Level;
030:        import java.util.logging.Logger;
031:        import java.util.regex.Matcher;
032:
033:        import org.apache.commons.httpclient.URIException;
034:        import org.archive.crawler.datamodel.CoreAttributeConstants;
035:        import org.archive.crawler.datamodel.CrawlURI;
036:        import org.archive.crawler.datamodel.RobotsHonoringPolicy;
037:        import org.archive.crawler.settings.SimpleType;
038:        import org.archive.crawler.settings.Type;
039:        import org.archive.io.ReplayCharSequence;
040:        import org.archive.net.UURI;
041:        import org.archive.net.UURIFactory;
042:        import org.archive.util.DevUtils;
043:        import org.archive.util.HttpRecorder;
044:        import org.archive.util.TextUtils;
045:
046:        /**
047:         * Basic link-extraction, from an HTML content-body,
048:         * using regular expressions.
049:         *
050:         * @author gojomo
051:         *
052:         */
053:        public class ExtractorHTML extends Extractor implements 
054:                CoreAttributeConstants {
055:
056:            private static final long serialVersionUID = 5855731422080471017L;
057:
058:            private static Logger logger = Logger.getLogger(ExtractorHTML.class
059:                    .getName());
060:
061:            /**
062:             * Compiled relevant tag extractor.
063:             *
064:             * <p>
065:             * This pattern extracts either:
066:             * <li> (1) whole &lt;script&gt;...&lt;/script&gt; or
067:             * <li> (2) &lt;style&gt;...&lt;/style&gt; or
068:             * <li> (3) &lt;meta ...&gt; or
069:             * <li> (4) any other open-tag with at least one attribute
070:             * (eg matches "&lt;a href='boo'&gt;" but not "&lt;/a&gt;" or "&lt;br&gt;")
071:             * <p>
072:             * groups:
073:             * <li> 1: SCRIPT SRC=foo&gt;boo&lt;/SCRIPT
074:             * <li> 2: just script open tag
075:             * <li> 3: STYLE TYPE=moo&gt;zoo&lt;/STYLE
076:             * <li> 4: just style open tag
077:             * <li> 5: entire other tag, without '&lt;' '&gt;'
078:             * <li> 6: element
079:             * <li> 7: META
080:             * <li> 8: !-- comment --
081:             */
082:            // version w/ less unnecessary backtracking
083:            private static final int MAX_ELEMENT_LENGTH = Integer
084:                    .parseInt(System.getProperty(ExtractorHTML.class.getName()
085:                            + ".maxElementNameLength", "1024"));
086:
087:            static final String RELEVANT_TAG_EXTRACTOR = "(?is)<(?:((script[^>]*+)>.*?</script)"
088:                    + // 1, 2
089:                    "|((style[^>]*+)>.*?</style)"
090:                    + // 3, 4
091:                    "|(((meta)|(?:\\w{1,"
092:                    + MAX_ELEMENT_LENGTH
093:                    + "}))\\s+[^>]*+)" + // 5, 6, 7
094:                    "|(!--.*?--))>"; // 8 
095:
096:            //    version w/ problems with unclosed script tags 
097:            //    static final String RELEVANT_TAG_EXTRACTOR =
098:            //    "(?is)<(?:((script.*?)>.*?</script)|((style.*?)>.*?</style)|(((meta)|(?:\\w+))\\s+.*?)|(!--.*?--))>";
099:
100:            //    // this pattern extracts 'href' or 'src' attributes from
101:            //    // any open-tag innards matched by the above
102:            //    static Pattern RELEVANT_ATTRIBUTE_EXTRACTOR = Pattern.compile(
103:            //     "(?is)(\\w+)(?:\\s+|(?:\\s.*?\\s))(?:(href)|(src))\\s*=(?:(?:\\s*\"(.+?)\")|(?:\\s*'(.+?)')|(\\S+))");
104:            //
105:            //    // this pattern extracts 'robots' attributes
106:            //    static Pattern ROBOTS_ATTRIBUTE_EXTRACTOR = Pattern.compile(
107:            //     "(?is)(\\w+)\\s+.*?(?:(robots))\\s*=(?:(?:\\s*\"(.+)\")|(?:\\s*'(.+)')|(\\S+))");
108:
109:            private static final int MAX_ATTR_NAME_LENGTH = Integer
110:                    .parseInt(System.getProperty(ExtractorHTML.class.getName()
111:                            + ".maxAttributeNameLength", "1024")); // 1K; 
112:
113:            static final int MAX_ATTR_VAL_LENGTH = Integer.parseInt(System
114:                    .getProperty(ExtractorHTML.class.getName()
115:                            + ".maxAttributeValueLength", "16384")); // 16K; 
116:
117:            // TODO: perhaps cut to near MAX_URI_LENGTH
118:
119:            // this pattern extracts attributes from any open-tag innards
120:            // matched by the above. attributes known to be URIs of various
121:            // sorts are matched specially
122:            static final String EACH_ATTRIBUTE_EXTRACTOR = "(?is)\\s((href)|(action)|(on\\w*)" // 1, 2, 3, 4 
123:                    + "|((?:src)|(?:lowsrc)|(?:background)|(?:cite)|(?:longdesc)" // ...
124:                    + "|(?:usemap)|(?:profile)|(?:datasrc))" // 5
125:                    + "|(codebase)|((?:classid)|(?:data))|(archive)|(code)" // 6, 7, 8, 9
126:                    + "|(value)|(style)|([-\\w]{1,"
127:                    + MAX_ATTR_NAME_LENGTH
128:                    + "}))" // 10, 11, 12
129:                    + "\\s*=\\s*"
130:                    + "(?:(?:\"(.{0,"
131:                    + MAX_ATTR_VAL_LENGTH
132:                    + "}?)(?:\"|$))" // 13
133:                    + "|(?:'(.{0," + MAX_ATTR_VAL_LENGTH + "}?)(?:'|$))" // 14
134:                    + "|(\\S{1," + MAX_ATTR_VAL_LENGTH + "}))"; // 15
135:            // groups:
136:            // 1: attribute name
137:            // 2: HREF - single URI relative to doc base, or occasionally javascript:
138:            // 3: ACTION - single URI relative to doc base, or occasionally javascript:
139:            // 4: ON[WHATEVER] - script handler
140:            // 5: SRC,LOWSRC,BACKGROUND,CITE,LONGDESC,USEMAP,PROFILE, or DATASRC
141:            //    single URI relative to doc base
142:            // 6: CODEBASE - a single URI relative to doc base, affecting other
143:            //    attributes
144:            // 7: CLASSID, DATA - a single URI relative to CODEBASE (if supplied)
145:            // 8: ARCHIVE - one or more space-delimited URIs relative to CODEBASE
146:            //    (if supplied)
147:            // 9: CODE - a single URI relative to the CODEBASE (if specified).
148:            // 10: VALUE - often includes a uri path on forms
149:            // 11: STYLE - inline attribute style info
150:            // 12: any other attribute
151:            // 13: double-quote delimited attr value
152:            // 14: single-quote delimited attr value
153:            // 15: space-delimited attr value
154:
155:            // much like the javascript likely-URI extractor, but
156:            // without requiring quotes -- this can indicate whether
157:            // an HTML tag attribute that isn't definitionally a
158:            // URI might be one anyway, as in form-tag VALUE attributes
159:            static final String LIKELY_URI_PATH = "(\\.{0,2}[^\\.\\n\\r\\s\"']*(\\.[^\\.\\n\\r\\s\"']+)+)";
160:            static final String WHITESPACE = "\\s";
161:            static final String CLASSEXT = ".class";
162:            static final String APPLET = "applet";
163:            static final String BASE = "base";
164:            static final String LINK = "link";
165:            static final String FRAME = "frame";
166:            static final String IFRAME = "iframe";
167:
168:            public static final String ATTR_TREAT_FRAMES_AS_EMBED_LINKS = "treat-frames-as-embed-links";
169:
170:            public static final String ATTR_IGNORE_FORM_ACTION_URLS = "ignore-form-action-urls";
171:
172:            /** whether to try finding links in Javascript; default true */
173:            public static final String ATTR_EXTRACT_JAVASCRIPT = "extract-javascript";
174:
175:            public static final String ATTR_OVERLY_EAGER_LINK_DETECTION = "overly-eager-link-detection";
176:
177:            public static final String ATTR_IGNORE_UNEXPECTED_HTML = "ignore-unexpected-html";
178:
179:            protected long numberOfCURIsHandled = 0;
180:            protected long numberOfLinksExtracted = 0;
181:
/**
 * Creates an HTML extractor that uses the stock description text.
 *
 * @param name handed on unchanged to the two-argument constructor
 */
public ExtractorHTML(String name) {
    this(name, "HTML extractor. Extracts links from HTML documents");
}
185:
/**
 * Creates an HTML extractor and registers its five boolean settings,
 * flagging each one as an expert setting.
 *
 * @param name        handed on unchanged to the superclass constructor
 * @param description handed on unchanged to the superclass constructor
 */
public ExtractorHTML(String name, String description) {
    super(name, description);

    // Scan in-page Javascript for URI-like strings (default: true).
    addElementToDefinition(
            new SimpleType(
                    ATTR_EXTRACT_JAVASCRIPT,
                    "If true, in-page Javascript is scanned for strings that "
                            + "appear likely to be URIs. This typically finds both valid "
                            + "and invalid URIs, and attempts to fetch the invalid URIs "
                            + "sometimes generates webmaster concerns over odd crawler "
                            + "behavior. Default is true.",
                    Boolean.TRUE))
            .setExpertSetting(true);

    // FRAME/IFRAME SRC as embeds rather than navigational links (default: true).
    addElementToDefinition(
            new SimpleType(
                    ATTR_TREAT_FRAMES_AS_EMBED_LINKS,
                    "If true, FRAME/IFRAME SRC-links are treated as embedded "
                            + "resources (like IMG, 'E' hop-type), otherwise they are "
                            + "treated as navigational links. Default is true.",
                    Boolean.TRUE))
            .setExpertSetting(true);

    // Skip FORM ACTION URIs (default: false).
    addElementToDefinition(
            new SimpleType(
                    ATTR_IGNORE_FORM_ACTION_URLS,
                    "If true, URIs appearing as the ACTION attribute in "
                            + "HTML FORMs are ignored. Default is false.",
                    Boolean.FALSE))
            .setExpertSetting(true);

    // Extract URI-looking strings from unusual places (default: true).
    addElementToDefinition(
            new SimpleType(
                    ATTR_OVERLY_EAGER_LINK_DETECTION,
                    "If true, strings that look like URIs found in unusual "
                            + "places (such as form VALUE attributes) will be extracted. "
                            + "This typically finds both valid and invalid URIs, and "
                            + "attempts to fetch the invalid URIs sometimes generate "
                            + "webmaster concerns over odd crawler behavior. Default "
                            + "is true.",
                    Boolean.TRUE))
            .setExpertSetting(true);

    // Do not scan URIs whose extension suggests non-HTML content (default: true).
    addElementToDefinition(
            new SimpleType(
                    ATTR_IGNORE_UNEXPECTED_HTML,
                    "If true, URIs which end in typical non-HTML extensions "
                            + "(such as .gif) will not be scanned as if it were HTML. "
                            + "Default is true.",
                    Boolean.TRUE))
            .setExpertSetting(true);
}
225:
226:            protected void processGeneralTag(CrawlURI curi,
227:                    CharSequence element, CharSequence cs) {
228:
229:                Matcher attr = TextUtils.getMatcher(EACH_ATTRIBUTE_EXTRACTOR,
230:                        cs);
231:
232:                // Just in case it's an OBJECT or APPLET tag
233:                String codebase = null;
234:                ArrayList<String> resources = null;
235:
236:                final boolean framesAsEmbeds = ((Boolean) getUncheckedAttribute(
237:                        curi, ATTR_TREAT_FRAMES_AS_EMBED_LINKS)).booleanValue();
238:
239:                final boolean ignoreFormActions = ((Boolean) getUncheckedAttribute(
240:                        curi, ATTR_IGNORE_FORM_ACTION_URLS)).booleanValue();
241:
242:                final boolean overlyEagerLinkDetection = ((Boolean) getUncheckedAttribute(
243:                        curi, ATTR_OVERLY_EAGER_LINK_DETECTION)).booleanValue();
244:
245:                final String elementStr = element.toString();
246:
247:                while (attr.find()) {
248:                    int valueGroup = (attr.start(13) > -1) ? 13 : (attr
249:                            .start(14) > -1) ? 14 : 15;
250:                    int start = attr.start(valueGroup);
251:                    int end = attr.end(valueGroup);
252:                    assert start >= 0 : "Start is: " + start + ", " + curi;
253:                    assert end >= 0 : "End is :" + end + ", " + curi;
254:                    CharSequence value = cs.subSequence(start, end);
255:                    value = TextUtils.unescapeHtml(value);
256:                    if (attr.start(2) > -1) {
257:                        // HREF
258:                        CharSequence context = Link.elementContext(element,
259:                                attr.group(2));
260:                        if (elementStr.equalsIgnoreCase(LINK)) {
261:                            // <LINK> elements treated as embeds (css, ico, etc)
262:                            processEmbed(curi, value, context);
263:                        } else {
264:                            // other HREFs treated as links
265:                            processLink(curi, value, context);
266:                        }
267:                        if (elementStr.equalsIgnoreCase(BASE)) {
268:                            try {
269:                                curi.setBaseURI(value.toString());
270:                            } catch (URIException e) {
271:                                if (getController() != null) {
272:                                    // Controller can be null: e.g. when running
273:                                    // ExtractorTool.
274:                                    getController().logUriError(e,
275:                                            curi.getUURI(), value.toString());
276:                                } else {
277:                                    logger.info("Failed set base uri: " + curi
278:                                            + ", " + value.toString() + ": "
279:                                            + e.getMessage());
280:                                }
281:                            }
282:                        }
283:                    } else if (attr.start(3) > -1) {
284:                        // ACTION
285:                        if (!ignoreFormActions) {
286:                            CharSequence context = Link.elementContext(element,
287:                                    attr.group(3));
288:                            processLink(curi, value, context);
289:                        }
290:                    } else if (attr.start(4) > -1) {
291:                        // ON____
292:                        processScriptCode(curi, value); // TODO: context?
293:                    } else if (attr.start(5) > -1) {
294:                        // SRC etc.
295:                        CharSequence context = Link.elementContext(element,
296:                                attr.group(5));
297:
298:                        // true, if we expect another HTML page instead of an image etc.
299:                        final char hopType;
300:
301:                        if (!framesAsEmbeds
302:                                && (elementStr.equalsIgnoreCase(FRAME) || elementStr
303:                                        .equalsIgnoreCase(IFRAME))) {
304:                            hopType = Link.NAVLINK_HOP;
305:                        } else {
306:                            hopType = Link.EMBED_HOP;
307:                        }
308:                        processEmbed(curi, value, context, hopType);
309:                    } else if (attr.start(6) > -1) {
310:                        // CODEBASE
311:                        codebase = (value instanceof  String) ? (String) value
312:                                : value.toString();
313:                        CharSequence context = Link.elementContext(element,
314:                                attr.group(6));
315:                        processEmbed(curi, codebase, context);
316:                    } else if (attr.start(7) > -1) {
317:                        // CLASSID, DATA
318:                        if (resources == null) {
319:                            resources = new ArrayList<String>();
320:                        }
321:                        resources.add(value.toString());
322:                    } else if (attr.start(8) > -1) {
323:                        // ARCHIVE
324:                        if (resources == null) {
325:                            resources = new ArrayList<String>();
326:                        }
327:                        String[] multi = TextUtils.split(WHITESPACE, value);
328:                        for (int i = 0; i < multi.length; i++) {
329:                            resources.add(multi[i]);
330:                        }
331:                    } else if (attr.start(9) > -1) {
332:                        // CODE
333:                        if (resources == null) {
334:                            resources = new ArrayList<String>();
335:                        }
336:                        // If element is applet and code value does not end with
337:                        // '.class' then append '.class' to the code value.
338:                        if (elementStr.equalsIgnoreCase(APPLET)
339:                                && !value.toString().toLowerCase().endsWith(
340:                                        CLASSEXT)) {
341:                            resources.add(value.toString() + CLASSEXT);
342:                        } else {
343:                            resources.add(value.toString());
344:                        }
345:                    } else if (attr.start(10) > -1) {
346:                        // VALUE, with possibility of URI
347:                        if (overlyEagerLinkDetection
348:                                && TextUtils.matches(LIKELY_URI_PATH, value)) {
349:                            CharSequence context = Link.elementContext(element,
350:                                    attr.group(10));
351:                            processLink(curi, value, context);
352:                        }
353:
354:                    } else if (attr.start(11) > -1) {
355:                        // STYLE inline attribute
356:                        // then, parse for URIs
357:                        this .numberOfLinksExtracted += ExtractorCSS
358:                                .processStyleCode(curi, value, getController());
359:
360:                    } else if (attr.start(12) > -1) {
361:                        // any other attribute
362:                        // ignore for now
363:                        // could probe for path- or script-looking strings, but
364:                        // those should be vanishingly rare in other attributes,
365:                        // and/or symptomatic of page bugs
366:                    }
367:                }
368:                TextUtils.recycleMatcher(attr);
369:
370:                // handle codebase/resources
371:                if (resources == null) {
372:                    return;
373:                }
374:                Iterator iter = resources.iterator();
375:                UURI codebaseURI = null;
376:                String res = null;
377:                try {
378:                    if (codebase != null) {
379:                        // TODO: Pass in the charset.
380:                        codebaseURI = UURIFactory.getInstance(curi.getUURI(),
381:                                codebase);
382:                    }
383:                    while (iter.hasNext()) {
384:                        res = iter.next().toString();
385:                        res = (String) TextUtils.unescapeHtml(res);
386:                        if (codebaseURI != null) {
387:                            res = codebaseURI.resolve(res).toString();
388:                        }
389:                        processEmbed(curi, res, element); // TODO: include attribute too
390:                    }
391:                } catch (URIException e) {
392:                    curi.addLocalizedError(getName(), e, "BAD CODEBASE "
393:                            + codebase);
394:                } catch (IllegalArgumentException e) {
395:                    DevUtils.logger.log(Level.WARNING, "processGeneralTag()\n"
396:                            + "codebase=" + codebase + " res=" + res + "\n"
397:                            + DevUtils.extraInfo(), e);
398:                }
399:            }
400:
401:            /**
402:             * Extract the (java)script source in the given CharSequence. 
403:             * 
404:             * @param curi source CrawlURI
405:             * @param cs CharSequence of javascript code
406:             */
407:            protected void processScriptCode(CrawlURI curi, CharSequence cs) {
408:                if ((Boolean) getUncheckedAttribute(curi,
409:                        ATTR_EXTRACT_JAVASCRIPT)) {
410:                    this .numberOfLinksExtracted += ExtractorJS.considerStrings(
411:                            curi, cs, getController(), false);
412:                } // else do nothing
413:            }
414:
415:            static final String JAVASCRIPT = "(?i)^javascript:.*";
416:
417:            /**
418:             * Handle generic HREF cases.
419:             * 
420:             * @param curi
421:             * @param value
422:             * @param context
423:             */
424:            protected void processLink(CrawlURI curi, final CharSequence value,
425:                    CharSequence context) {
426:                if (TextUtils.matches(JAVASCRIPT, value)) {
427:                    processScriptCode(curi, value.subSequence(11, value
428:                            .length()));
429:                } else {
430:                    if (logger.isLoggable(Level.FINEST)) {
431:                        logger.finest("link: " + value.toString() + " from "
432:                                + curi);
433:                    }
434:                    addLinkFromString(curi,
435:                            (value instanceof  String) ? (String) value : value
436:                                    .toString(), context, Link.NAVLINK_HOP);
437:                    this .numberOfLinksExtracted++;
438:                }
439:            }
440:
441:            private void addLinkFromString(CrawlURI curi, String uri,
442:                    CharSequence context, char hopType) {
443:                try {
444:                    // We do a 'toString' on context because its a sequence from
445:                    // the underlying ReplayCharSequence and the link its about
446:                    // to become a part of is expected to outlive the current
447:                    // ReplayCharSequence.
448:                    curi.createAndAddLinkRelativeToBase(uri,
449:                            context.toString(), hopType);
450:                } catch (URIException e) {
451:                    if (getController() != null) {
452:                        getController().logUriError(e, curi.getUURI(), uri);
453:                    } else {
454:                        logger.info("Failed createAndAddLinkRelativeToBase "
455:                                + curi + ", " + uri + ", " + context + ", "
456:                                + hopType + ": " + e);
457:                    }
458:                }
459:            }
460:
461:            protected final void processEmbed(CrawlURI curi,
462:                    CharSequence value, CharSequence context) {
463:                processEmbed(curi, value, context, Link.EMBED_HOP);
464:            }
465:
466:            protected void processEmbed(CrawlURI curi,
467:                    final CharSequence value, CharSequence context, char hopType) {
468:                if (logger.isLoggable(Level.FINEST)) {
469:                    logger.finest("embed (" + hopType + "): "
470:                            + value.toString() + " from " + curi);
471:                }
472:                addLinkFromString(curi,
473:                        (value instanceof  String) ? (String) value : value
474:                                .toString(), context, hopType);
475:                this .numberOfLinksExtracted++;
476:            }
477:
478:            public void extract(CrawlURI curi) {
479:                if (!isHttpTransactionContentToProcess(curi)
480:                        || !(isExpectedMimeType(curi.getContentType(),
481:                                "text/html") || isExpectedMimeType(curi
482:                                .getContentType(), "application/xhtml"))) {
483:                    return;
484:                }
485:
486:                final boolean ignoreUnexpectedHTML = ((Boolean) getUncheckedAttribute(
487:                        curi, ATTR_IGNORE_UNEXPECTED_HTML)).booleanValue();
488:
489:                if (ignoreUnexpectedHTML) {
490:                    try {
491:                        if (!isHtmlExpectedHere(curi)) {
492:                            // HTML was not expected (eg a GIF was expected) so ignore
493:                            // (as if a soft 404)
494:                            return;
495:                        }
496:                    } catch (URIException e) {
497:                        logger.severe("Failed expectedHTML test: "
498:                                + e.getMessage());
499:                    }
500:                }
501:
502:                this .numberOfCURIsHandled++;
503:
504:                ReplayCharSequence cs = null;
505:
506:                try {
507:                    HttpRecorder hr = curi.getHttpRecorder();
508:                    if (hr == null) {
509:                        throw new IOException("Why is recorder null here?");
510:                    }
511:                    cs = hr.getReplayCharSequence();
512:                } catch (IOException e) {
513:                    curi.addLocalizedError(this .getName(), e,
514:                            "Failed get of replay char sequence "
515:                                    + curi.toString() + " " + e.getMessage());
516:                    logger.log(Level.SEVERE,
517:                            "Failed get of replay char sequence in "
518:                                    + Thread.currentThread().getName(), e);
519:                }
520:
521:                if (cs == null) {
522:                    return;
523:                }
524:
525:                // We have a ReplayCharSequence open.  Wrap all in finally so we
526:                // for sure close it before we leave.
527:                try {
528:                    // Extract all links from the charsequence
529:                    extract(curi, cs);
530:                    // Set flag to indicate that link extraction is completed.
531:                    curi.linkExtractorFinished();
532:                } finally {
533:                    if (cs != null) {
534:                        try {
535:                            cs.close();
536:                        } catch (IOException ioe) {
537:                            logger
538:                                    .warning(TextUtils
539:                                            .exceptionToString(
540:                                                    "Failed close of ReplayCharSequence.",
541:                                                    ioe));
542:                        }
543:                    }
544:                }
545:            }
546:
547:            /**
548:             * Run extractor.
549:             * This method is package visible to ease testing.
550:             * @param curi CrawlURI we're processing.
551:             * @param cs Sequence from underlying ReplayCharSequence. This
552:             * is TRANSIENT data. Make a copy if you want the data to live outside
553:             * of this extractors' lifetime.
554:             */
555:            void extract(CrawlURI curi, CharSequence cs) {
556:                Matcher tags = TextUtils.getMatcher(RELEVANT_TAG_EXTRACTOR, cs);
557:                while (tags.find()) {
558:                    if (Thread.interrupted()) {
559:                        break;
560:                    }
561:                    if (tags.start(8) > 0) {
562:                        // comment match
563:                        // for now do nothing
564:                    } else if (tags.start(7) > 0) {
565:                        // <meta> match
566:                        int start = tags.start(5);
567:                        int end = tags.end(5);
568:                        assert start >= 0 : "Start is: " + start + ", " + curi;
569:                        assert end >= 0 : "End is :" + end + ", " + curi;
570:                        if (processMeta(curi, cs.subSequence(start, end))) {
571:
572:                            // meta tag included NOFOLLOW; abort processing
573:                            break;
574:                        }
575:                    } else if (tags.start(5) > 0) {
576:                        // generic <whatever> match
577:                        int start5 = tags.start(5);
578:                        int end5 = tags.end(5);
579:                        assert start5 >= 0 : "Start is: " + start5 + ", "
580:                                + curi;
581:                        assert end5 >= 0 : "End is :" + end5 + ", " + curi;
582:                        int start6 = tags.start(6);
583:                        int end6 = tags.end(6);
584:                        assert start6 >= 0 : "Start is: " + start6 + ", "
585:                                + curi;
586:                        assert end6 >= 0 : "End is :" + end6 + ", " + curi;
587:                        processGeneralTag(curi, cs.subSequence(start6, end6),
588:                                cs.subSequence(start5, end5));
589:
590:                    } else if (tags.start(1) > 0) {
591:                        // <script> match
592:                        int start = tags.start(1);
593:                        int end = tags.end(1);
594:                        assert start >= 0 : "Start is: " + start + ", " + curi;
595:                        assert end >= 0 : "End is :" + end + ", " + curi;
596:                        assert tags.end(2) >= 0 : "Tags.end(2) illegal "
597:                                + tags.end(2) + ", " + curi;
598:                        processScript(curi, cs.subSequence(start, end), tags
599:                                .end(2)
600:                                - start);
601:
602:                    } else if (tags.start(3) > 0) {
603:                        // <style... match
604:                        int start = tags.start(3);
605:                        int end = tags.end(3);
606:                        assert start >= 0 : "Start is: " + start + ", " + curi;
607:                        assert end >= 0 : "End is :" + end + ", " + curi;
608:                        assert tags.end(4) >= 0 : "Tags.end(4) illegal "
609:                                + tags.end(4) + ", " + curi;
610:                        processStyle(curi, cs.subSequence(start, end), tags
611:                                .end(4)
612:                                - start);
613:                    }
614:                }
615:                TextUtils.recycleMatcher(tags);
616:            }
617:
618:            static final String NON_HTML_PATH_EXTENSION = "(?i)(gif)|(jp(e)?g)|(png)|(tif(f)?)|(bmp)|(avi)|(mov)|(mp(e)?g)"
619:                    + "|(mp3)|(mp4)|(swf)|(wav)|(au)|(aiff)|(mid)";
620:
621:            /**
622:             * Test whether this HTML is so unexpected (eg in place of a GIF URI)
623:             * that it shouldn't be scanned for links.
624:             *
625:             * @param curi CrawlURI to examine.
626:             * @return True if HTML is acceptable/expected here
627:             * @throws URIException
628:             */
629:            protected boolean isHtmlExpectedHere(CrawlURI curi)
630:                    throws URIException {
631:                String path = curi.getUURI().getPath();
632:                if (path == null) {
633:                    // no path extension, HTML is fine
634:                    return true;
635:                }
636:                int dot = path.lastIndexOf('.');
637:                if (dot < 0) {
638:                    // no path extension, HTML is fine
639:                    return true;
640:                }
641:                if (dot < (path.length() - 5)) {
642:                    // extension too long to recognize, HTML is fine
643:                    return true;
644:                }
645:                String ext = path.substring(dot + 1);
646:                return !TextUtils.matches(NON_HTML_PATH_EXTENSION, ext);
647:            }
648:
649:            protected void processScript(CrawlURI curi, CharSequence sequence,
650:                    int endOfOpenTag) {
651:                // first, get attributes of script-open tag
652:                // as per any other tag
653:                processGeneralTag(curi, sequence.subSequence(0, 6), sequence
654:                        .subSequence(0, endOfOpenTag));
655:
656:                // then, apply best-effort string-analysis heuristics
657:                // against any code present (false positives are OK)
658:                processScriptCode(curi, sequence.subSequence(endOfOpenTag,
659:                        sequence.length()));
660:            }
661:
662:            /**
663:             * Process metadata tags.
664:             * @param curi CrawlURI we're processing.
665:             * @param cs Sequence from underlying ReplayCharSequence. This
666:             * is TRANSIENT data. Make a copy if you want the data to live outside
667:             * of this extractors' lifetime.
668:             * @return True robots exclusion metatag.
669:             */
670:            protected boolean processMeta(CrawlURI curi, CharSequence cs) {
671:                Matcher attr = TextUtils.getMatcher(EACH_ATTRIBUTE_EXTRACTOR,
672:                        cs);
673:                String name = null;
674:                String httpEquiv = null;
675:                String content = null;
676:                while (attr.find()) {
677:                    int valueGroup = (attr.start(13) > -1) ? 13 : (attr
678:                            .start(14) > -1) ? 14 : 15;
679:                    CharSequence value = cs.subSequence(attr.start(valueGroup),
680:                            attr.end(valueGroup));
681:                    if (attr.group(1).equalsIgnoreCase("name")) {
682:                        name = value.toString();
683:                    } else if (attr.group(1).equalsIgnoreCase("http-equiv")) {
684:                        httpEquiv = value.toString();
685:                    } else if (attr.group(1).equalsIgnoreCase("content")) {
686:                        content = value.toString();
687:                    }
688:                    // TODO: handle other stuff
689:                }
690:                TextUtils.recycleMatcher(attr);
691:
692:                // Look for the 'robots' meta-tag
693:                if ("robots".equalsIgnoreCase(name) && content != null) {
694:                    curi.putString(A_META_ROBOTS, content);
695:                    RobotsHonoringPolicy policy = getSettingsHandler()
696:                            .getOrder().getRobotsHonoringPolicy();
697:                    String contentLower = content.toLowerCase();
698:                    if ((policy == null || (!policy.isType(curi,
699:                            RobotsHonoringPolicy.IGNORE) && !policy.isType(
700:                            curi, RobotsHonoringPolicy.CUSTOM)))
701:                            && (contentLower.indexOf("nofollow") >= 0 || contentLower
702:                                    .indexOf("none") >= 0)) {
703:                        // if 'nofollow' or 'none' is specified and the
704:                        // honoring policy is not IGNORE or CUSTOM, end html extraction
705:                        logger
706:                                .fine("HTML extraction skipped due to robots meta-tag for: "
707:                                        + curi.toString());
708:                        return true;
709:                    }
710:                } else if ("refresh".equalsIgnoreCase(httpEquiv)
711:                        && content != null) {
712:                    String refreshUri = content
713:                            .substring(content.indexOf("=") + 1);
714:                    try {
715:                        curi.createAndAddLinkRelativeToBase(refreshUri, "meta",
716:                                Link.REFER_HOP);
717:                    } catch (URIException e) {
718:                        if (getController() != null) {
719:                            getController().logUriError(e, curi.getUURI(),
720:                                    refreshUri);
721:                        } else {
722:                            logger
723:                                    .info("Failed createAndAddLinkRelativeToBase "
724:                                            + curi
725:                                            + ", "
726:                                            + cs
727:                                            + ", "
728:                                            + refreshUri + ": " + e);
729:                        }
730:                    }
731:                }
732:                return false;
733:            }
734:
735:            /**
736:             * Process style text.
737:             * @param curi CrawlURI we're processing.
738:             * @param sequence Sequence from underlying ReplayCharSequence. This
739:             * is TRANSIENT data. Make a copy if you want the data to live outside
740:             * of this extractors' lifetime.
741:             * @param endOfOpenTag
742:             */
743:            protected void processStyle(CrawlURI curi, CharSequence sequence,
744:                    int endOfOpenTag) {
745:                // First, get attributes of script-open tag as per any other tag.
746:                processGeneralTag(curi, sequence.subSequence(0, 6), sequence
747:                        .subSequence(0, endOfOpenTag));
748:
749:                // then, parse for URIs
750:                this .numberOfLinksExtracted += ExtractorCSS.processStyleCode(
751:                        curi, sequence.subSequence(endOfOpenTag, sequence
752:                                .length()), getController());
753:            }
754:
755:            /* (non-Javadoc)
756:             * @see org.archive.crawler.framework.Processor#report()
757:             */
758:            public String report() {
759:                StringBuffer ret = new StringBuffer();
760:                ret
761:                        .append("Processor: org.archive.crawler.extractor.ExtractorHTML\n");
762:                ret
763:                        .append("  Function:          Link extraction on HTML documents\n");
764:                ret.append("  CrawlURIs handled: " + this .numberOfCURIsHandled
765:                        + "\n");
766:                ret.append("  Links extracted:   "
767:                        + this .numberOfLinksExtracted + "\n\n");
768:                return ret.toString();
769:            }
770:        }
www.java2java.com | Contact Us
Copyright 2009 - 12 Demo Source and Support. All rights reserved.
All other trademarks are property of their respective owners.