Source Code Cross Referenced for AdaptiveRevisitFrontier.java in Web-Crawler » heritrix » org.archive.crawler.frontier

/* AdaptiveRevisitFrontier.java
 *
 * Created on Sep 13, 2004
 *
 * Copyright (C) 2004 Kristinn Sigurðsson.
 *
 * This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 * Heritrix is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or
 * any later version.
 *
 * Heritrix is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser Public License for more details.
 *
 * You should have received a copy of the GNU Lesser Public License
 * along with Heritrix; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */
package org.archive.crawler.frontier;

import java.io.File;
import java.io.IOException;
import java.io.PrintWriter;
import java.io.Serializable;
import java.io.StringWriter;
import java.io.Writer;
import java.util.ArrayList;
import java.util.Date;
import java.util.Iterator;
import java.util.List;
import java.util.logging.Level;
import java.util.logging.Logger;

import javax.management.AttributeNotFoundException;

import org.apache.commons.httpclient.HttpStatus;
import org.archive.crawler.datamodel.CandidateURI;
import org.archive.crawler.datamodel.CoreAttributeConstants;
import org.archive.crawler.datamodel.CrawlServer;
import org.archive.crawler.datamodel.CrawlURI;
import org.archive.crawler.datamodel.FetchStatusCodes;
import org.archive.crawler.datamodel.UriUniqFilter;
import org.archive.crawler.datamodel.UriUniqFilter.HasUriReceiver;
import org.archive.crawler.event.CrawlStatusListener;
import org.archive.crawler.framework.CrawlController;
import org.archive.crawler.framework.Frontier;
import org.archive.crawler.framework.FrontierMarker;
import org.archive.crawler.framework.exceptions.EndedException;
import org.archive.crawler.framework.exceptions.FatalConfigurationException;
import org.archive.crawler.framework.exceptions.InvalidFrontierMarkerException;
import org.archive.crawler.settings.ModuleType;
import org.archive.crawler.settings.RegularExpressionConstraint;
import org.archive.crawler.settings.SimpleType;
import org.archive.crawler.settings.Type;
import org.archive.crawler.url.Canonicalizer;
import org.archive.crawler.util.BdbUriUniqFilter;
import org.archive.net.UURI;
import org.archive.queue.MemQueue;
import org.archive.queue.Queue;
import org.archive.util.ArchiveUtils;

/**
 * A Frontier that will repeatedly visit all encountered URIs.
 * <p>
 * Wait time between visits is configurable and varies based on observed
 * changes of documents.
 * <p>
 * The Frontier borrows many things from HostQueuesFrontier, but implements
 * an entirely different strategy in issuing URIs and consequently in keeping a
 * record of discovered URIs.
 *
 * @author Kristinn Sigurdsson
 */
public class AdaptiveRevisitFrontier extends ModuleType implements
        Frontier, FetchStatusCodes, CoreAttributeConstants,
        AdaptiveRevisitAttributeConstants, CrawlStatusListener,
        HasUriReceiver {

    private static final long serialVersionUID = -8666872690438543671L;

    private static final Logger logger = Logger
            .getLogger(AdaptiveRevisitFrontier.class.getName());

    /** How many multiples of last fetch elapsed time to wait before recontacting
     * same server */
    public final static String ATTR_DELAY_FACTOR = "delay-factor";
    private final static Float DEFAULT_DELAY_FACTOR = new Float(5);

    /** Always wait this long after one completion before recontacting
     * same server, regardless of multiple */
    public final static String ATTR_MIN_DELAY = "min-delay-ms";

    // 2 seconds
    private final static Integer DEFAULT_MIN_DELAY = new Integer(2000);

    /** Never wait more than this long, regardless of multiple */
    public final static String ATTR_MAX_DELAY = "max-delay-ms";

    // 30 seconds
    private final static Integer DEFAULT_MAX_DELAY = new Integer(30000);

    /** Maximum times to emit a CrawlURI without final disposition */
    public final static String ATTR_MAX_RETRIES = "max-retries";
    private final static Integer DEFAULT_MAX_RETRIES = new Integer(30);

    /** For retryable problems, seconds to wait before a retry */
    public final static String ATTR_RETRY_DELAY = "retry-delay-seconds";

    // 15 minutes
    private final static Long DEFAULT_RETRY_DELAY = new Long(900);

    /** Maximum simultaneous requests in process to a host (queue) */
    public final static String ATTR_HOST_VALENCE = "host-valence";
    private final static Integer DEFAULT_HOST_VALENCE = new Integer(1);

    /** Number of hops of embeds (ERX) to bump to front of host queue */
    public final static String ATTR_PREFERENCE_EMBED_HOPS = "preference-embed-hops";
    private final static Integer DEFAULT_PREFERENCE_EMBED_HOPS = new Integer(0);

    /** Queue assignment to force on CrawlURIs. Intended to be used
     *  via overrides */
    public final static String ATTR_FORCE_QUEUE = "force-queue-assignment";
    protected final static String DEFAULT_FORCE_QUEUE = "";
    /** Acceptable characters in forced queue names.
     *  Word chars, dash, period, comma, colon */
    protected final static String ACCEPTABLE_FORCE_QUEUE = "[-\\w\\.,:]*";
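
    // Illustrative examples (editorial, not part of the original source):
    // the pattern above accepts forced queue names such as "blogspot.com",
    // "my_queue-1" or "10.0.0.1:8080", since word characters, dash, period,
    // comma and colon all match; names containing spaces or slashes are
    // rejected by the RegularExpressionConstraint registered in the
    // constructor below.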

    /** Should the queue assignment ignore www in hostnames, effectively
     *  stripping them away?
     */
    public final static String ATTR_QUEUE_IGNORE_WWW = "queue-ignore-www";
    protected final static Boolean DEFAULT_QUEUE_IGNORE_WWW = new Boolean(false);

    /** Should the Frontier use a separate 'already included' datastructure
     *  or rely on the queues'?
     */
    public final static String ATTR_USE_URI_UNIQ_FILTER = "use-uri-uniq-filter";
    protected final static Boolean DEFAULT_USE_URI_UNIQ_FILTER = new Boolean(false);

    private CrawlController controller;

    private AdaptiveRevisitQueueList hostQueues;

    private UriUniqFilter alreadyIncluded;

    private ThreadLocalQueue threadWaiting = new ThreadLocalQueue();

    /** Policy for assigning CrawlURIs to named queues */
    private QueueAssignmentPolicy queueAssignmentPolicy = null;

    // top-level stats
    private long succeededFetchCount = 0;
    private long failedFetchCount = 0;
    // URIs that are disregarded (for example, because of robots.txt rules)
    private long disregardedUriCount = 0;

    private long totalProcessedBytes = 0;

    // Flags indicating operator-specified crawl pause/end
    private boolean shouldPause = false;
    private boolean shouldTerminate = false;

    public AdaptiveRevisitFrontier(String name) {
        this(name,
                "AdaptiveRevisitFrontier. EXPERIMENTAL Frontier that "
                        + "will repeatedly visit all "
                        + "encountered URIs. Wait time between visits is configurable"
                        + " and is determined by separate Processor(s); see "
                        + "WaitEvaluators. "
                        + "See documentation for ARFrontier limitations.");
    }

    public AdaptiveRevisitFrontier(String name, String description) {
        super(Frontier.ATTR_NAME, description);
        addElementToDefinition(new SimpleType(ATTR_DELAY_FACTOR,
                "How many multiples of last fetch elapsed time to wait before "
                        + "recontacting same server",
                DEFAULT_DELAY_FACTOR));
        addElementToDefinition(new SimpleType(ATTR_MAX_DELAY,
                "Never wait more than this long, regardless of multiple",
                DEFAULT_MAX_DELAY));
        addElementToDefinition(new SimpleType(ATTR_MIN_DELAY,
                "Always wait this long after one completion before recontacting "
                        + "same server, regardless of multiple",
                DEFAULT_MIN_DELAY));
        addElementToDefinition(new SimpleType(ATTR_MAX_RETRIES,
                "How often to retry fetching a URI that failed to be retrieved.\n"
                        + "If zero, the crawler will get the robots.txt only.",
                DEFAULT_MAX_RETRIES));
        addElementToDefinition(new SimpleType(ATTR_RETRY_DELAY,
                "How long to wait by default until we retry fetching a"
                        + " URI that failed to be retrieved (seconds). ",
                DEFAULT_RETRY_DELAY));
        addElementToDefinition(new SimpleType(ATTR_PREFERENCE_EMBED_HOPS,
                "Number of embedded (or redirected) hops up to which "
                        + "a URI has higher priority scheduling. For example, if set "
                        + "to 1, items such as inline images (1-hop "
                        + "embedded resources) will be scheduled ahead of all regular "
                        + "links (or many-hop resources, like nested frames). If set to "
                        + "zero (the default here), no preferencing will occur, and "
                        + "embeds/redirects are scheduled the same as regular links.",
                DEFAULT_PREFERENCE_EMBED_HOPS));
        Type t;
        t = addElementToDefinition(new SimpleType(ATTR_HOST_VALENCE,
                "Maximum number of simultaneous requests to a single"
                        + " host.", DEFAULT_HOST_VALENCE));
        t.setExpertSetting(true);
        t = addElementToDefinition(new SimpleType(ATTR_QUEUE_IGNORE_WWW,
                "If true then documents from x.com, www.x.com and any "
                        + "www[0-9]+.x.com will be assigned to the same queue.",
                DEFAULT_QUEUE_IGNORE_WWW));
        t.setExpertSetting(true);
        t = addElementToDefinition(new SimpleType(ATTR_FORCE_QUEUE,
                "The queue name into which to force URIs. Should "
                        + "be left blank at global level.  Specify a "
                        + "per-domain/per-host override to force URIs into "
                        + "a particular named queue, regardless of the assignment "
                        + "policy in effect (domain or ip-based politeness). "
                        + "This could be used on domains known to all be from "
                        + "the same small set of IPs (eg blogspot, dailykos, etc.) "
                        + "to simulate IP-based politeness, or it could be used if "
                        + "you wanted to enforce politeness over a whole domain, even "
                        + "though the subdomains are split across many IPs.",
                DEFAULT_FORCE_QUEUE));
        t.setOverrideable(true);
        t.setExpertSetting(true);
        t.addConstraint(new RegularExpressionConstraint(
                ACCEPTABLE_FORCE_QUEUE,
                Level.WARNING,
                "This field must contain only alphanumeric "
                        + "characters plus period, dash, comma, colon, or underscore."));
        t = addElementToDefinition(new SimpleType(ATTR_USE_URI_UNIQ_FILTER,
                "If true then the Frontier will use a separate "
                        + "datastructure to detect and eliminate duplicates.\n"
                        + "This is required for Canonicalization rules to work.",
                DEFAULT_USE_URI_UNIQ_FILTER));
        t.setExpertSetting(true);
        t.setOverrideable(false);

        // Register persistent CrawlURI items
        CrawlURI.addAlistPersistentMember(A_CONTENT_STATE_KEY);
        CrawlURI.addAlistPersistentMember(A_TIME_OF_NEXT_PROCESSING);
    }

    public synchronized void initialize(CrawlController c)
            throws FatalConfigurationException, IOException {
        controller = c;
        controller.addCrawlStatusListener(this);

        queueAssignmentPolicy = new HostnameQueueAssignmentPolicy();

        hostQueues = new AdaptiveRevisitQueueList(c.getBdbEnvironment(),
                c.getBdbEnvironment().getClassCatalog());

        if (((Boolean) getUncheckedAttribute(null,
                ATTR_USE_URI_UNIQ_FILTER)).booleanValue()) {
            alreadyIncluded = createAlreadyIncluded();
        } else {
            alreadyIncluded = null;
        }

        loadSeeds();
    }

    /**
     * Create a UriUniqFilter that will serve as record
     * of already seen URIs.
     *
     * @return A UURISet that will serve as a record of already seen URIs
     * @throws IOException
     */
    protected UriUniqFilter createAlreadyIncluded() throws IOException {
        UriUniqFilter uuf = new BdbUriUniqFilter(
                this.controller.getBdbEnvironment());
        uuf.setDestination(this);
        return uuf;
    }
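
    // Note (editorial, not part of the original source): setDestination(this)
    // wires the filter back to this frontier, which implements
    // UriUniqFilter.HasUriReceiver. URIs that pass the duplicate check are
    // handed back through the receive() callback and only then scheduled,
    // so innerBatchFlush() below can simply add to the filter and let novel
    // URIs flow back in.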

    /**
     * Loads the seeds.
     * <p>
     * This method is called by initialize() and kickUpdate().
     */
    public void loadSeeds() {
        Writer ignoredWriter = new StringWriter();
        // Get the seeds to refresh.
        Iterator iter = this.controller.getScope()
                .seedsIterator(ignoredWriter);
        while (iter.hasNext()) {
            CandidateURI caUri = CandidateURI
                    .createSeedCandidateURI((UURI) iter.next());
            caUri.setSchedulingDirective(CandidateURI.MEDIUM);
            schedule(caUri);
        }
        batchFlush();
        // Save ignored items (if any) where they can be consulted later.
        AbstractFrontier.saveIgnoredItems(ignoredWriter.toString(),
                controller.getDisk());
    }

    public String getClassKey(CandidateURI cauri) {
        String queueKey = (String) getUncheckedAttribute(cauri,
                ATTR_FORCE_QUEUE);
        if ("".equals(queueKey)) {
            // Typical case, barring overrides
            queueKey = queueAssignmentPolicy.getClassKey(controller, cauri);
            // The queueAssignmentPolicy is always based on hostnames.
            // We may need to remove any www[0-9]{0,}\. prefixes from the
            // hostnames.
            if (((Boolean) getUncheckedAttribute(cauri,
                    ATTR_QUEUE_IGNORE_WWW)).booleanValue()) {
                queueKey = queueKey.replaceAll("^www[0-9]{0,}\\.", "");
            }
        }
        return queueKey;
    }
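
    // Illustrative example (editorial, not part of the original source):
    // with queue-ignore-www enabled, URIs from "example.com",
    // "www.example.com" and "www2.example.com" all yield the class key
    // "example.com", so they share one host queue and are held to the same
    // politeness constraints.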

    /**
     * Canonicalize passed uuri. It would be sweeter if this canonicalize
     * function was encapsulated by that which it canonicalizes, but because
     * settings change with context -- i.e. there may be overrides in operation
     * for a particular URI -- it's not so easy; each CandidateURI would need a
     * reference to the settings system. That's awkward to pass in.
     *
     * @param uuri Candidate URI to canonicalize.
     * @return Canonicalized version of passed <code>uuri</code>.
     */
    protected String canonicalize(UURI uuri) {
        return Canonicalizer.canonicalize(uuri, this.controller.getOrder());
    }

    /**
     * Canonicalize passed CandidateURI. This method differs from
     * {@link #canonicalize(UURI)} in that it takes a look at
     * the CandidateURI context, possibly overriding any canonicalization
     * effect if it could make us miss content. If canonicalization produces
     * a URL that was 'alreadyseen', but the entry in the 'alreadyseen'
     * database did nothing but redirect to the current URL, we won't get the
     * current URL; we'll think we've already seen it. Examples would be
     * archive.org redirecting to www.archive.org or the inverse,
     * www.netarkivet.net redirecting to netarkivet.net (assuming the
     * stripWWW rule is enabled).
     * <p>Note, this method under some circumstances sets the forceFetch flag.
     *
     * @param cauri CandidateURI to examine.
     * @return Canonicalized <code>cauri</code>.
     */
    protected String canonicalize(CandidateURI cauri) {
        String canon = canonicalize(cauri.getUURI());
        if (cauri.isLocation()) {
            // If the via is not the same as where we're being redirected
            // (i.e. we're not being redirected back to the same page), AND
            // the canonicalization of the via equals that of the current
            // cauri, THEN forcefetch (so there is no chance of not crawling
            // content because the alreadyseen check thinks it has seen the
            // URL before).
            // An example of a URL that redirects to itself is:
            // http://bridalelegance.com/images/buttons3/tuxedos-off.gif.
            // An example of a URL whose canonicalization equals its via's
            // canonicalization, and we want to fetch content at the
            // redirection (i.e. need to set forcefetch), is netarkivet.dk.
            if (!cauri.toString().equals(cauri.getVia().toString())
                    && canonicalize(cauri.getVia()).equals(canon)) {
                cauri.setForceFetch(true);
            }
        }
        return canon;
    }
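
    // Worked example (editorial, not part of the original source): suppose
    // "http://www.example.com/" redirects to "http://example.com/" and a
    // strip-www canonicalization rule is active. Both URIs canonicalize to
    // the same string, so the redirect target would normally be dropped as
    // 'alreadyseen'; setting forceFetch above ensures it is fetched anyway.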

    /**
     * Schedule the given URI into its host queue.
     *
     * @param caUri The URI to schedule.
     */
    protected void innerSchedule(CandidateURI caUri) {
        CrawlURI curi;
        if (caUri instanceof CrawlURI) {
            curi = (CrawlURI) caUri;
        } else {
            curi = CrawlURI.from(caUri, System.currentTimeMillis());
            curi.putLong(A_TIME_OF_NEXT_PROCESSING,
                    System.currentTimeMillis());
            // New CrawlURIs get 'current time' as the time of next processing.
        }

        if (curi.getClassKey() == null) {
            curi.setClassKey(getClassKey(curi));
        }

        if (curi.isSeed() && curi.getVia() != null
                && curi.flattenVia().length() > 0) {
            // The only way a seed can have a non-empty via is if it is the
            // result of a seed redirect.  Add it to the seeds list.
            //
            // This is a feature.  This is handling for the case where a seed
            // gets immediately redirected to another page.  What we're doing
            // is treating the immediate redirect target as a seed.
            this.controller.getScope().addSeed(curi);
            // And it needs rapid scheduling.
            curi.setSchedulingDirective(CandidateURI.MEDIUM);
        }

        // Optionally preferencing embeds up to MEDIUM
        int prefHops = ((Integer) getUncheckedAttribute(curi,
                ATTR_PREFERENCE_EMBED_HOPS)).intValue();
        boolean prefEmbed = false;
        if (prefHops > 0) {
            int embedHops = curi.getTransHops();
            if (embedHops > 0
                    && embedHops <= prefHops
                    && curi.getSchedulingDirective() == CandidateURI.NORMAL) {
                // Number of embed hops falls within the preferenced range,
                // and uri is not already MEDIUM -- so promote it.
                curi.setSchedulingDirective(CandidateURI.MEDIUM);
                prefEmbed = true;
            }
        }

        // Finally, allow curi to be fetched right now
        // (while not overriding overdue items)
        curi.putLong(A_TIME_OF_NEXT_PROCESSING,
                System.currentTimeMillis());

        try {
            logger.finest("scheduling " + curi.toString());
            AdaptiveRevisitHostQueue hq = getHQ(curi);
            hq.add(curi, prefEmbed);
        } catch (IOException e) {
            // TODO Handle IOExceptions
            e.printStackTrace();
        }
    }
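
    // Illustrative example (editorial, not part of the original source):
    // with preference-embed-hops set to 1, an inline image discovered as a
    // 1-hop embed of a fetched page (getTransHops() == 1) is promoted to
    // MEDIUM and jumps ahead of NORMAL links waiting in the same host queue,
    // while a frame nested two embed hops deep stays at NORMAL priority.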

    /**
     * Get the AdaptiveRevisitHostQueue for the given CrawlURI, creating
     * it if necessary.
     *
     * @param curi CrawlURI for which to get a queue
     * @return AdaptiveRevisitHostQueue for given CrawlURI
     * @throws IOException
     */
    protected AdaptiveRevisitHostQueue getHQ(CrawlURI curi)
            throws IOException {
        AdaptiveRevisitHostQueue hq = hostQueues.getHQ(curi.getClassKey());
        if (hq == null) {
            // Need to create it.
            int valence = DEFAULT_HOST_VALENCE.intValue();
            try {
                valence = ((Integer) getAttribute(curi,
                        ATTR_HOST_VALENCE)).intValue();
            } catch (AttributeNotFoundException e2) {
                logger.severe("Unable to load valence.");
            }
            hq = hostQueues.createHQ(curi.getClassKey(), valence);
        }
        return hq;
    }

    protected void batchSchedule(CandidateURI caUri) {
        threadWaiting.getQueue().enqueue(caUri);
    }

    protected void batchFlush() {
        innerBatchFlush();
    }

    private void innerBatchFlush() {
        Queue q = threadWaiting.getQueue();
        while (!q.isEmpty()) {
            CandidateURI caUri = (CandidateURI) q.dequeue();
            if (alreadyIncluded != null) {
                String canon = canonicalize(caUri);
                logger.finest("Canonicalization of " + caUri + " is "
                        + canon);
                if (caUri.forceFetch()) {
                    alreadyIncluded.addForce(canon, caUri);
                } else {
                    alreadyIncluded.add(canon, caUri);
                }
            } else {
                innerSchedule(caUri);
            }
        }
    }

    /**
     * @param curi
     * @return the CrawlServer to be associated with this CrawlURI
     */
    protected CrawlServer getServer(CrawlURI curi) {
        return this.controller.getServerCache().getServerFor(curi);
    }

    /* (non-Javadoc)
     * @see org.archive.crawler.framework.Frontier#next()
     */
    public synchronized CrawlURI next() throws InterruptedException,
            EndedException {
        controller.checkFinish();

        while (shouldPause) {
            controller.toePaused();
            wait();
        }

        if (shouldTerminate) {
            throw new EndedException("terminated");
        }

        AdaptiveRevisitHostQueue hq = hostQueues.getTopHQ();

        while (hq.getState() != AdaptiveRevisitHostQueue.HQSTATE_READY) {
            // Ok, so we don't have a ready queue; wait until the top one
            // becomes available.
            long waitTime = hq.getNextReadyTime()
                    - System.currentTimeMillis();
            if (waitTime > 0) {
                wait(waitTime);
            }
            // The top HQ may have changed, so get it again.
            hq = hostQueues.getTopHQ();
        }

        if (shouldTerminate) {
            // May have been terminated while thread was waiting for IO
            throw new EndedException("terminated");
        }

        try {
            CrawlURI curi = hq.next();
            // Populate CURI with 'transient' variables such as server.
            logger.fine("Issuing " + curi.toString());
            long temp = curi.getLong(A_TIME_OF_NEXT_PROCESSING);
            long currT = System.currentTimeMillis();
            long overdue = (currT - temp);
            if (logger.isLoggable(Level.FINER)) {
                String waitI = "not set";
                if (curi.containsKey(A_WAIT_INTERVAL)) {
                    waitI = ArchiveUtils.formatMillisecondsToConventional(
                            curi.getLong(A_WAIT_INTERVAL));
                }
                logger.finer("Wait interval: " + waitI
                        + ", Time of next proc: " + temp
                        + ", Current time: " + currT + ", Overdue by: "
                        + overdue + "ms");
            }
            if (overdue < 0) {
                // This should never happen.
                logger.severe("Time overdue for " + curi.toString()
                        + " is negative (" + overdue + ")!");
            }
            curi.putLong(A_FETCH_OVERDUE, overdue);
            return curi;
        } catch (IOException e) {
            // TODO: Need to handle this in an intelligent manner.
            //       Is probably fatal?
            e.printStackTrace();
        }

        return null;
    }

    /* (non-Javadoc)
     * @see org.archive.crawler.framework.Frontier#isEmpty()
     */
    public boolean isEmpty() {
        // Technically, the Frontier should never become empty since URIs are
        // only discarded under exceptional circumstances.
        return hostQueues.getSize() == 0;
    }

    /* (non-Javadoc)
     * @see org.archive.crawler.framework.Frontier#schedule(org.archive.crawler.datamodel.CandidateURI)
     */
    public void schedule(CandidateURI caURI) {
        batchSchedule(caURI);
    }

    /* (non-Javadoc)
     * @see org.archive.crawler.framework.Frontier#finished(org.archive.crawler.datamodel.CrawlURI)
     */
    public synchronized void finished(CrawlURI curi) {
        logger.fine(curi.toString() + " "
                + CrawlURI.fetchStatusCodesToString(curi.getFetchStatus()));
        curi.incrementFetchAttempts();
        logLocalizedErrors(curi);

        innerFinished(curi);
    }

    protected synchronized void innerFinished(CrawlURI curi) {
        try {
            innerBatchFlush();

            if (curi.isSuccess()) {
                successDisposition(curi);
            } else if (needsPromptRetry(curi)) {
                // Consider statuses which allow nearly-immediate retry
                // (like deferred to allow precondition to be fetched).
                reschedule(curi, false);
            } else if (needsRetrying(curi)) {
                // Consider errors which can be retried.
                reschedule(curi, true);
                controller.fireCrawledURINeedRetryEvent(curi);
            } else if (isDisregarded(curi)) {
                // Check for codes that mean that while the crawler did
                // manage to get it, it must be disregarded for some reason.
                disregardDisposition(curi);
            } else {
                // In that case FAILURE, note & log.
                failureDisposition(curi);
            }

            // New items might be available; let waiting threads know.
            // More than one queue might have become available due to
            // scheduling of items outside the parent URI's host, so we
            // wake all waiting threads.
            notifyAll();
        } catch (RuntimeException e) {
            curi.setFetchStatus(S_RUNTIME_EXCEPTION);
            // Store exception temporarily for logging.
            logger.warning("RTE in innerFinished() " + e.getMessage());
            e.printStackTrace();
            curi.putObject(A_RUNTIME_EXCEPTION, e);
            failureDisposition(curi);
        } catch (AttributeNotFoundException e) {
            logger.severe(e.getMessage());
        }
    }

    /**
     * Take note of any processor-local errors that have
     * been entered into the CrawlURI.
     * @param curi CrawlURI with errors.
     */
    private void logLocalizedErrors(CrawlURI curi) {
        if (curi.containsKey(A_LOCALIZED_ERRORS)) {
            List localErrors = (List) curi.getObject(A_LOCALIZED_ERRORS);
            Iterator iter = localErrors.iterator();
            while (iter.hasNext()) {
                Object array[] = { curi, iter.next() };
                controller.localErrors.log(Level.WARNING,
                        curi.getUURI().toString(), array);
            }
            // Once logged, discard.
            curi.remove(A_LOCALIZED_ERRORS);
        }
    }

    /**
     * The CrawlURI has been successfully crawled.
     *
     * @param curi The CrawlURI
     */
    protected void successDisposition(CrawlURI curi) {
        curi.aboutToLog();

        long waitInterval = 0;

        if (curi.containsKey(A_WAIT_INTERVAL)) {
            waitInterval = curi.getLong(A_WAIT_INTERVAL);
            curi.addAnnotation("wt:" + ArchiveUtils
                    .formatMillisecondsToConventional(waitInterval));
        } else {
            logger.severe("Missing wait interval for " + curi.toString()
                    + ". WaitEvaluator may be missing.");
        }
        if (curi.containsKey(A_NUMBER_OF_VISITS)) {
            curi.addAnnotation(curi.getInt(A_NUMBER_OF_VISITS) + "vis");
        }
        if (curi.containsKey(A_NUMBER_OF_VERSIONS)) {
            curi.addAnnotation(curi.getInt(A_NUMBER_OF_VERSIONS) + "ver");
        }
        if (curi.containsKey(A_FETCH_OVERDUE)) {
            curi.addAnnotation("ov:" + ArchiveUtils
                    .formatMillisecondsToConventional(
                            curi.getLong(A_FETCH_OVERDUE)));
        }

        Object array[] = { curi };
        controller.uriProcessing.log(Level.INFO,
                curi.getUURI().toString(), array);

        succeededFetchCount++;
        totalProcessedBytes += curi.getContentSize();

        // Let everyone know in case they want to do something before we strip
        // the curi.
        controller.fireCrawledURISuccessfulEvent(curi);

        curi.setSchedulingDirective(CandidateURI.NORMAL);

        // Set time of next processing.
        curi.putLong(A_TIME_OF_NEXT_PROCESSING,
                System.currentTimeMillis() + waitInterval);

        /* Update HQ */
        AdaptiveRevisitHostQueue hq = hostQueues.getHQ(curi.getClassKey());

        // Wake-up time is based on the time when the fetch was completed
        // plus the calculated snooze time for politeness. If the fetch
        // completion time is missing, we'll use the current time.
        long wakeupTime = (curi.containsKey(A_FETCH_COMPLETED_TIME)
                ? curi.getLong(A_FETCH_COMPLETED_TIME)
                : (new Date()).getTime())
                + calculateSnoozeTime(curi);

        // Ready the URI for reserialization.
        curi.processingCleanup();
        curi.resetDeferrals();
        curi.resetFetchAttempts();
        try {
            hq.update(curi, true, wakeupTime);
        } catch (IOException e) {
            logger.severe("An IOException occurred when updating "
                    + curi.toString() + "\n" + e.getMessage());
            e.printStackTrace();
        }
    }
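
    // Worked example (editorial, not part of the original source): if a
    // fetch completed at t=100,000ms and calculateSnoozeTime() returns
    // 2,500ms, the host queue wakes for politeness at t=102,500ms, while
    // A_TIME_OF_NEXT_PROCESSING (current time plus the WaitEvaluator's
    // interval) decides when this particular URI is due for its next
    // revisit.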

    /**
     * Put near top of relevant hostQueue (but behind anything recently
     * scheduled 'high').
     *
     * @param curi CrawlURI to reschedule. Its time of next processing is not
     *             modified.
     * @param errorWait signals if there should be a wait before retrying.
     * @throws AttributeNotFoundException
     */
    protected void reschedule(CrawlURI curi, boolean errorWait)
            throws AttributeNotFoundException {
        long delay = 0;
        if (errorWait) {
            if (curi.containsKey(A_RETRY_DELAY)) {
                delay = curi.getLong(A_RETRY_DELAY);
            } else {
                // Use ARFrontier default.
                delay = ((Long) getAttribute(ATTR_RETRY_DELAY, curi))
                        .longValue();
            }
        }

        long retryTime = (curi.containsKey(A_FETCH_COMPLETED_TIME)
                ? curi.getLong(A_FETCH_COMPLETED_TIME)
                : (new Date()).getTime())
                + delay;

        AdaptiveRevisitHostQueue hq = hostQueues.getHQ(curi.getClassKey());
        // Ready the URI for reserialization.
        curi.processingCleanup();
        if (errorWait) {
            curi.resetDeferrals(); // Deferrals only refer to immediate retries.
        }
        try {
            hq.update(curi, errorWait, retryTime);
        } catch (IOException e) {
            // TODO Handle IOException
            e.printStackTrace();
        }
    }

    /**
     * The CrawlURI has encountered a problem, and will not
     * be retried.
     *
     * @param curi The CrawlURI
     */
    protected void failureDisposition(CrawlURI curi) {
        // Let interested listeners know of failed disposition.
        this.controller.fireCrawledURIFailureEvent(curi);

        // Send to basic log.
        curi.aboutToLog();
        Object array[] = { curi };
        this.controller.uriProcessing.log(Level.INFO,
                curi.getUURI().toString(), array);

        // If exception, also send to crawlErrors.
        if (curi.getFetchStatus() == S_RUNTIME_EXCEPTION) {
            this.controller.runtimeErrors.log(Level.WARNING,
                    curi.getUURI().toString(), array);
        }
        failedFetchCount++;

        // Put the failed URI at the very back of the queue.
        curi.setSchedulingDirective(CandidateURI.NORMAL);
        // TODO: reconsider this
        curi.putLong(A_TIME_OF_NEXT_PROCESSING, Long.MAX_VALUE);

        AdaptiveRevisitHostQueue hq = hostQueues.getHQ(curi.getClassKey());
        // Ready the URI for reserialization.
        curi.processingCleanup();
        curi.resetDeferrals();
        curi.resetFetchAttempts();
        try {
            // No wait on failure. No contact was made with the server.
            boolean shouldForget = shouldBeForgotten(curi);
            if (shouldForget && alreadyIncluded != null) {
                alreadyIncluded.forget(canonicalize(curi.getUURI()), curi);
            }
            hq.update(curi, false, 0, shouldForget);
        } catch (IOException e) {
            // TODO Handle IOException
            e.printStackTrace();
        }
    }

    protected void disregardDisposition(CrawlURI curi) {
        // Let interested listeners know of disregard disposition.
        controller.fireCrawledURIDisregardEvent(curi);

        // Send to basic log.
        curi.aboutToLog();
        Object array[] = { curi };
        controller.uriProcessing.log(Level.INFO,
                curi.getUURI().toString(), array);

        disregardedUriCount++;

        // TODO: consider a timeout before retrying disregarded elements.
        //       Possibly add a setting to the WaitEvaluators?
        curi.putLong(A_TIME_OF_NEXT_PROCESSING, Long.MAX_VALUE);
        curi.setSchedulingDirective(CandidateURI.NORMAL);

        AdaptiveRevisitHostQueue hq = hostQueues.getHQ(curi.getClassKey());
        // Ready the URI for reserialization.
        curi.processingCleanup();
        curi.resetDeferrals();
        curi.resetFetchAttempts();
        try {
            // No politeness wait on disregard. No contact was made with the
            // server.
            hq.update(curi, false, 0, shouldBeForgotten(curi));
        } catch (IOException e) {
            // TODO Handle IOException
            e.printStackTrace();
        }
    }

    /**
     * Some URIs, if they recur, deserve another
     * chance at consideration: they might not be too
     * many hops away via another path, or the scope
     * may have been updated to allow them passage.
     *
     * @param curi
     * @return True if curi should be forgotten.
     */
    protected boolean shouldBeForgotten(CrawlURI curi) {
        switch (curi.getFetchStatus()) {
        case S_OUT_OF_SCOPE:
        case S_TOO_MANY_EMBED_HOPS:
        case S_TOO_MANY_LINK_HOPS:
            return true;
        default:
            return false;
        }
    }

    /**
     * Checks if a recently completed CrawlURI that did not finish successfully
     * needs to be retried immediately (processed again as soon as politeness
     * allows).
     *
     * @param curi The CrawlURI to check
     * @return True if we need to retry promptly.
     * @throws AttributeNotFoundException If problems occur trying to read the
     *            maximum number of retries from the settings framework.
     */
    protected boolean needsPromptRetry(CrawlURI curi)
            throws AttributeNotFoundException {
        if (curi.getFetchAttempts() >= ((Integer) getAttribute(
                ATTR_MAX_RETRIES, curi)).intValue()) {
            return false;
        }

        switch (curi.getFetchStatus()) {
        case S_DEFERRED:
            return true;

        case HttpStatus.SC_UNAUTHORIZED:
            // We can get here even though a positive status code usually
            // means success.  We get here if there is rfc2617 credential
            // data loaded and we're supposed to go around again.  See if any
            // rfc2617 credential is present and, if so, assume it got loaded
            // in FetchHTTP on the expectation that we're to go around again.
            // If no rfc2617 credential is loaded, we should not be here.
            boolean loaded = curi.hasRfc2617CredentialAvatar();
            if (!loaded) {
                logger.severe("Have 401 but no creds loaded " + curi);
            }
            return loaded;

        default:
            return false;
        }
    }

    /**
     * Checks if a recently completed CrawlURI that did not finish successfully
     * needs to be retried (processed again after some time elapses).
     *
     * @param curi The CrawlURI to check
     * @return True if we need to retry.
     * @throws AttributeNotFoundException If problems occur trying to read the
     *            maximum number of retries from the settings framework.
     */
    protected boolean needsRetrying(CrawlURI curi)
            throws AttributeNotFoundException {
        // Check to see if the maximum number of retries has been exceeded.
        if (curi.getFetchAttempts() >= ((Integer) getAttribute(
                ATTR_MAX_RETRIES, curi)).intValue()) {
            return false;
        } else {
            // Check if FetchStatus indicates that a delayed retry is needed.
            switch (curi.getFetchStatus()) {
            case S_CONNECT_FAILED:
            case S_CONNECT_LOST:
            case S_DOMAIN_UNRESOLVABLE:
                // These are all worth a retry.
                // TODO: consider if any others (S_TIMEOUT in some cases?)
                //       deserve retry
                return true;
            default:
                return false;
            }
        }
    }

    protected boolean isDisregarded(CrawlURI curi) {
        switch (curi.getFetchStatus()) {
        case S_ROBOTS_PRECLUDED: // they don't want us to have it
        case S_OUT_OF_SCOPE: // filtered out by scope
        case S_BLOCKED_BY_CUSTOM_PROCESSOR:
        case S_BLOCKED_BY_USER: // filtered out by user
        case S_TOO_MANY_EMBED_HOPS: // too far from last true link
        case S_TOO_MANY_LINK_HOPS: // too far from seeds
        case S_DELETED_BY_USER: // user deleted
            return true;
        default:
            return false;
        }
    }

    /**
     * Calculates how long a host queue needs to be snoozed following the
     * crawling of a URI.
     *
     * @param curi The CrawlURI
     * @return How long to snooze.
     */
    protected long calculateSnoozeTime(CrawlURI curi) {
        long durationToWait = 0;
        if (curi.containsKey(A_FETCH_BEGAN_TIME)
                && curi.containsKey(A_FETCH_COMPLETED_TIME)) {

            try {
                long completeTime = curi.getLong(A_FETCH_COMPLETED_TIME);
                long durationTaken =
                        (completeTime - curi.getLong(A_FETCH_BEGAN_TIME));

                durationToWait = (long) (((Float) getAttribute(
                        ATTR_DELAY_FACTOR, curi)).floatValue()
                        * durationTaken);

                long minDelay = ((Integer) getAttribute(ATTR_MIN_DELAY,
                        curi)).longValue();
                if (minDelay > durationToWait) {
                    // Wait at least the minimum.
                    durationToWait = minDelay;
                }

                long maxDelay = ((Integer) getAttribute(ATTR_MAX_DELAY,
                        curi)).longValue();
                if (durationToWait > maxDelay) {
                    // Wait no more than the maximum.
                    durationToWait = maxDelay;
                }
            } catch (AttributeNotFoundException e) {
                logger.severe("Unable to find attribute. "
                        + curi.toString());
                // Wait for the max interval.
                durationToWait = DEFAULT_MAX_DELAY.longValue();
            }
        }
        long ret = durationToWait > DEFAULT_MIN_DELAY.longValue()
                ? durationToWait : DEFAULT_MIN_DELAY.longValue();
        logger.finest("Snooze time for " + curi.toString() + " = " + ret);
        return ret;
    }
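
    // Worked example (editorial, not part of the original source): with the
    // defaults delay-factor=5, min-delay-ms=2000 and max-delay-ms=30000, a
    // fetch that took 500ms snoozes its queue for 5 * 500 = 2500ms; a 100ms
    // fetch would yield 500ms but is raised to the 2000ms minimum; a 10s
    // fetch would yield 50s but is capped at the 30s maximum.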
1027:
1028:            /* (non-Javadoc)
1029:             * @see org.archive.crawler.framework.Frontier#discoveredUriCount()
1030:             */
1031:            public synchronized long discoveredUriCount() {
1032:                return (this .alreadyIncluded != null) ? this .alreadyIncluded
1033:                        .count() : hostQueues.getSize();
1034:            }
1035:
1036:            /* (non-Javadoc)
1037:             * @see org.archive.crawler.framework.Frontier#queuedUriCount()
1038:             */
1039:            public synchronized long queuedUriCount() {
1040:                return hostQueues.getSize();
1041:            }
1042:
1043:            /* (non-Javadoc)
1044:             * @see org.archive.crawler.framework.Frontier#finishedUriCount()
1045:             */
1046:            public long finishedUriCount() {
1047:                return succeededFetchCount + failedFetchCount
1048:                        + disregardedUriCount;
1049:            }
1050:
1051:            /* (non-Javadoc)
1052:             * @see org.archive.crawler.framework.Frontier#succeededFetchCount()
1053:             */
1054:            public long succeededFetchCount() {
1055:                return succeededFetchCount;
1056:            }
1057:
1058:            /* (non-Javadoc)
1059:             * @see org.archive.crawler.framework.Frontier#failedFetchCount()
1060:             */
1061:            public long failedFetchCount() {
1062:                return failedFetchCount;
1063:            }
1064:
1065:            /* (non-Javadoc)
1066:             * @see org.archive.crawler.framework.Frontier#disregardedUriCount()
1067:             */
1068:            public long disregardedUriCount() {
1069:                return disregardedUriCount;
1070:            }
1071:
1072:            /* (non-Javadoc)
1073:             * @see org.archive.crawler.framework.Frontier#totalBytesWritten()
1074:             */
1075:            public long totalBytesWritten() {
1076:                return totalProcessedBytes;
1077:            }
1078:
1079:            /**
1080:             * This method is not supported by this Frontier implementation.
1081:             * @param pathToLog
1082:             * @throws IOException Always thrown; recover-log import is unsupported.
1083:             */
1084:            public void importRecoverLog(String pathToLog) throws IOException {
1085:                throw new IOException("Unsupported by this frontier.");
1086:            }
1087:
1088:            public synchronized FrontierMarker getInitialMarker(String regexpr,
1089:                    boolean inCacheOnly) {
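                // Frontier markers are not supported by this implementation;
                // callers must handle the null return.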
1090:                return null;
1091:            }
1092:
1093:            /* (non-Javadoc)
1094:             * @see org.archive.crawler.framework.Frontier#getURIsList(org.archive.crawler.framework.FrontierMarker, int, boolean)
1095:             */
1096:            public synchronized ArrayList getURIsList(FrontierMarker marker,
1097:                    int numberOfMatches, boolean verbose)
1098:                    throws InvalidFrontierMarkerException {
1099:                // Not implemented: URI listing is not supported by this frontier.
1100:                return null;
1101:            }
1102:
1103:            /* (non-Javadoc)
1104:             * @see org.archive.crawler.framework.Frontier#deleteURIs(java.lang.String)
1105:             */
1106:            public synchronized long deleteURIs(String match) {
1107:                // Not implemented: bulk URI deletion is not supported by this frontier.
1108:                return 0;
1109:            }
1110:
1111:            /* (non-Javadoc)
1112:             * @see org.archive.crawler.framework.Frontier#deleted(org.archive.crawler.datamodel.CrawlURI)
1113:             */
1114:            public synchronized void deleted(CrawlURI curi) {
1115:                // Not implemented: no action is taken for deleted URIs.
1116:            }
1117:
1118:            public void considerIncluded(UURI u) {
1119:                // Note: rather than merely marking the URI as already seen,
1120:                // this implementation schedules it outright, so it will be crawled.
1120:                CrawlURI curi = new CrawlURI(u);
1121:                innerSchedule(curi);
1122:
1123:            }
1124:
1125:            public void kickUpdate() {
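                // Treat an external 'kick' as a cue to re-read the seeds
                // list so newly added seeds are scheduled. (A reading of the
                // code; the Frontier interface defines a broader contract.)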
1126:                loadSeeds();
1127:            }
1128:
1129:            public void start() {
1130:                unpause();
1131:            }
1132:
1133:            public synchronized void pause() {
1134:                shouldPause = true;
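                // Wake any threads blocked on this monitor so they can
                // observe the pause request. (A reading of the code; the
                // corresponding wait() lives elsewhere in this class.)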
1135:                notifyAll();
1136:            }
1137:
1138:            public synchronized void unpause() {
1139:                shouldPause = false;
1140:                notifyAll();
1141:            }
1142:
1143:            public synchronized void terminate() {
1144:                shouldTerminate = true;
1145:            }
1146:
1147:            /* (non-Javadoc)
1148:             * @see org.archive.crawler.framework.Frontier#getFrontierJournal()
1149:             */
1150:            public FrontierJournal getFrontierJournal() {
1151:                return null;
1152:            }
1153:
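            /**
             * A per-thread queue of 'batched' CandidateURIs (see getQueue()
             * below), held until flushed into the frontier proper. This
             * description is inferred from usage; the original class is
             * undocumented.
             */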
1154:            private static class ThreadLocalQueue extends
1155:                    ThreadLocal<Queue<CandidateURI>> implements Serializable {
1156:
1157:                private static final long serialVersionUID = 8268977225156462059L;
1158:
1159:                protected Queue<CandidateURI> initialValue() {
1160:                    return new MemQueue<CandidateURI>();
1161:                }
1162:
1163:                /**
1164:                 * @return Queue of 'batched' items
1165:                 */
1166:                public Queue<CandidateURI> getQueue() {
1167:                    return get();
1168:                }
1169:            }
1170:
1171:            /**
1172:             * This method is not supported by this Frontier implementation.
1173:             * @param pathToLog
1174:             * @param retainFailures
1175:             * @throws IOException Always thrown; recover-log import is unsupported.
1176:             */
1177:            public void importRecoverLog(String pathToLog,
1178:                    boolean retainFailures) throws IOException {
1179:                throw new IOException("Unsupported");
1180:            }
1181:
1182:            //
1183:            // Reporter implementation
1184:            //
1185:
1186:            public String[] getReports() {
1187:                // none but default for now
1188:                return new String[] {};
1189:            }
1190:
1191:            /* (non-Javadoc)
1192:             * @see org.archive.util.Reporter#singleLineReport()
1193:             */
1194:            public String singleLineReport() {
1195:                return ArchiveUtils.singleLineReport(this);
1196:            }
1197:
1198:            /* (non-Javadoc)
1199:             * @see org.archive.util.Reporter#reportTo(java.io.Writer)
1200:             */
1201:            public void reportTo(PrintWriter writer) throws IOException {
1202:                reportTo(null, writer);
1203:            }
1204:
1205:            /* (non-Javadoc)
1206:             * @see org.archive.util.Reporter#singleLineReportTo(java.io.PrintWriter)
1207:             */
1208:            public synchronized void singleLineReportTo(PrintWriter w)
1209:                    throws IOException {
1210:                hostQueues.singleLineReportTo(w);
1211:            }
1212:
1213:            /* (non-Javadoc)
1214:             * @see org.archive.util.Reporter#singleLineLegend()
1215:             */
1216:            public String singleLineLegend() {
1217:                return hostQueues.singleLineLegend();
1218:            }
1219:
1220:            /* (non-Javadoc)
1221:             * @see org.archive.util.Reporter#reportTo(java.lang.String, java.io.PrintWriter)
1222:             */
1223:            public synchronized void reportTo(String name, PrintWriter writer) {
1224:                // ignore name; only one report for now
1225:                hostQueues.reportTo(name, writer);
1226:            }
1227:
1228:            /* (non-Javadoc)
1229:             * @see org.archive.crawler.event.CrawlStatusListener#crawlStarted(java.lang.String)
1230:             */
1231:            public void crawlStarted(String message) {
1232:                // Not interested
1233:            }
1234:
1235:            /* (non-Javadoc)
1236:             * @see org.archive.crawler.event.CrawlStatusListener#crawlEnding(java.lang.String)
1237:             */
1238:            public void crawlEnding(String sExitMessage) {
1239:                // Not interested
1240:            }
1241:
1242:            /* (non-Javadoc)
1243:             * @see org.archive.crawler.event.CrawlStatusListener#crawlEnded(java.lang.String)
1244:             */
1245:            public void crawlEnded(String sExitMessage) {
1246:                // Cleanup!
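                // Close the already-included filter and the host queues;
                // both may hold on-disk state that should be released.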
1247:                if (this.alreadyIncluded != null) {
1248:                    this.alreadyIncluded.close();
1249:                    this.alreadyIncluded = null;
1250:                }
1251:                hostQueues.close();
1252:            }
1253:
1254:            /* (non-Javadoc)
1255:             * @see org.archive.crawler.event.CrawlStatusListener#crawlPausing(java.lang.String)
1256:             */
1257:            public void crawlPausing(String statusMessage) {
1258:                // Not interested
1259:            }
1260:
1261:            /* (non-Javadoc)
1262:             * @see org.archive.crawler.event.CrawlStatusListener#crawlPaused(java.lang.String)
1263:             */
1264:            public void crawlPaused(String statusMessage) {
1265:                // Not interested
1266:            }
1267:
1268:            /* (non-Javadoc)
1269:             * @see org.archive.crawler.event.CrawlStatusListener#crawlResuming(java.lang.String)
1270:             */
1271:            public void crawlResuming(String statusMessage) {
1272:                // Not interested
1273:            }
1274:
1275:            /* (non-Javadoc)
1276:             * @see org.archive.crawler.event.CrawlStatusListener#crawlCheckpoint(java.io.File)
1277:             */
1278:            public void crawlCheckpoint(File checkpointDir) throws Exception {
1279:                // Not interested
1280:            }
1281:
1282:            /* (non-Javadoc)
1283:             * @see org.archive.crawler.datamodel.UriUniqFilter.HasUriReceiver#receive(org.archive.crawler.datamodel.CandidateURI)
1284:             */
1285:            public void receive(CandidateURI item) {
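                // Callback from the alreadyIncluded UriUniqFilter for URIs
                // it has not seen before: schedule them.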
1286:                logger.finest("Received " + item);
1287:                innerSchedule(item);
1288:            }
1289:
1290:            /* (non-Javadoc)
1291:             * @see org.archive.crawler.framework.Frontier#getGroup(org.archive.crawler.datamodel.CrawlURI)
1292:             */
1293:            public FrontierGroup getGroup(CrawlURI curi) {
1294:                try {
1295:                    return getHQ(curi);
1296:                } catch (IOException ioe) {
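                    // getGroup() declares no checked exceptions, so wrap the
                    // IOException in an unchecked RuntimeException.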
1297:                    throw new RuntimeException(ioe);
1298:                }
1299:            }
1300:
1301:            public long averageDepth() {
1302:                return hostQueues.getAverageDepth();
1303:            }
1304:
1305:            public float congestionRatio() {
1306:                return hostQueues.getCongestionRatio();
1307:            }
1308:
1309:            public long deepestUri() {
1310:                return hostQueues.getDeepestQueueSize();
1311:            }
1312:        }