Source Code Cross Referenced for CrawlURI.java in » Web-Crawler » heritrix » org » archive » crawler » datamodel



0001:        /* Copyright (C) 2003 Internet Archive.
0002:         *
0003:         * This file is part of the Heritrix web crawler (crawler.archive.org).
0004:         *
0005:         * Heritrix is free software; you can redistribute it and/or modify
0006:         * it under the terms of the GNU Lesser Public License as published by
0007:         * the Free Software Foundation; either version 2.1 of the License, or
0008:         * any later version.
0009:         *
0010:         * Heritrix is distributed in the hope that it will be useful,
0011:         * but WITHOUT ANY WARRANTY; without even the implied warranty of
0012:         * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
0013:         * GNU Lesser Public License for more details.
0014:         *
0015:         * You should have received a copy of the GNU Lesser Public License
0016:         * along with Heritrix; if not, write to the Free Software
0017:         * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
0018:         *
0019:         * CrawlURI.java
0020:         * Created on Apr 16, 2003
0021:         *
0022:         * $Header$
0023:         */
0024:        package org.archive.crawler.datamodel;
0025:
0026:        import java.io.IOException;
0027:        import java.io.ObjectInputStream;
0028:        import java.io.ObjectOutputStream;
0029:        import java.util.ArrayList;
0030:        import java.util.Collection;
0031:        import java.util.HashSet;
0032:        import java.util.Iterator;
0033:        import java.util.List;
0034:        import java.util.Set;
0035:        import java.util.concurrent.CopyOnWriteArrayList;
0036:
0037:        import org.apache.commons.httpclient.HttpStatus;
0038:        import org.apache.commons.httpclient.URIException;
0039:        import org.archive.crawler.datamodel.credential.CredentialAvatar;
0040:        import org.archive.crawler.datamodel.credential.Rfc2617Credential;
0041:        import org.archive.crawler.extractor.Link;
0042:        import org.archive.crawler.framework.Processor;
0043:        import org.archive.crawler.framework.ProcessorChain;
0044:        import org.archive.crawler.util.Transform;
0045:        import org.archive.net.UURI;
0046:        import org.archive.net.UURIFactory;
0047:        import org.archive.util.Base32;
0048:        import org.archive.util.HttpRecorder;
0049:
0050:        import st.ata.util.AList;
0051:        import st.ata.util.HashtableAList;
0052:
0053:        /**
0054:         * Represents a candidate URI and the associated state it
0055:         * collects as it is crawled.
0056:         *
0057:         * <p>Core state is in instance variables but a flexible
0058:         * attribute list is also available. Use this 'bucket' to carry
0059:         * custom processor-extracted data and state across CrawlURI
0060:         * processing.  See {@link #putString(String, String)},
0061:         * {@link #getString(String)}, etc.
0062:         *
0063:         * @author Gordon Mohr
0064:         */
0065:        public class CrawlURI extends CandidateURI implements FetchStatusCodes {
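            /*
             * A minimal sketch of how a custom processor might use the attribute
             * 'bucket' described in the class javadoc; the innerProcess hook and the
             * key "x-my-module.note" are illustrative assumptions, while
             * putString/getString/addAnnotation are members of this class or its
             * superclasses.
             *
             *   protected void innerProcess(CrawlURI curi) {
             *       curi.putString("x-my-module.note", "seen-on-pass-1"); // stash state in the AList
             *       String note = curi.getString("x-my-module.note");     // read it back later in the chain
             *       curi.addAnnotation("my-module");                      // flag the URI for crawl.log
             *   }
             */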
0066:
0067:            private static final long serialVersionUID = 7874096757350100472L;
0068:
0069:            public static final int UNCALCULATED = -1;
0070:
0071:            // INHERITED FROM CANDIDATEURI
0072:            // uuri: core identity: the "usable URI" to be crawled
0073:            // isSeed
0074:            // inScopeVersion
0075:            // pathFromSeed
0076:            // via
0077:
0078:            // Processing progress
0079:            transient private Processor nextProcessor;
0080:            transient private ProcessorChain nextProcessorChain;
0081:            private int fetchStatus = 0; // default to unattempted
0082:            private int deferrals = 0; // count of postponements for prerequisites
0083:            private int fetchAttempts = 0; // the number of fetch attempts that have been made
0084:            transient private int threadNumber;
0085:
0086:            // dynamic context
0087:            /** @deprecated */
0088:            private int linkHopCount = UNCALCULATED; // from seeds
0089:            /** @deprecated */
0090:            private int embedHopCount = UNCALCULATED; // from a sure link; reset upon any link traversal
0091:
0092:            // User agent to masquerade as when crawling this URI. If null, globals should be used
0093:            private String userAgent = null;
0094:
0095:            // Once a link extractor has finished processing this curi, this will be
0096:            // set to true.
0097:            transient private boolean linkExtractorFinished = false;
0098:
0099:            /**
0100:             * Protection against outlink overflow.
0101:             * Change value by setting alternate maximum in heritrix.properties.
0102:             */
0103:            public static final int MAX_OUTLINKS = Integer.parseInt(System
0104:                    .getProperty(CrawlURI.class.getName() + ".maxOutLinks",
0105:                            "6000"));
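            /*
             * For example, the default cap of 6000 outlinks could be raised via the
             * system property derived above (e.g. in heritrix.properties or as a JVM
             * -D argument); the value 10000 here is only illustrative:
             *
             *   org.archive.crawler.datamodel.CrawlURI.maxOutLinks=10000
             */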
0106:
0107:            transient private int discardedOutlinks = 0;
0108:
0109:            ////////////////////////////////////////////////////////////////////
0110:            private long contentSize = UNCALCULATED;
0111:            private long contentLength = UNCALCULATED;
0112:
0113:            /**
0114:             * Current http recorder.
0115:             *
0116:             * Gets set upon successful request.  Reset at start of processing chain.
0117:             */
0118:            private transient HttpRecorder httpRecorder = null;
0119:
0120:            /**
0121:             * Content type of a successfully fetched URI.
0122:             *
0123:             * May be null even on successfully fetched URI.
0124:             */
0125:            private String contentType = null;
0126:
0127:            /**
0128:             * True if this CrawlURI has been deemed a prerequisite by the
0129:             * {@link org.archive.crawler.prefetch.PreconditionEnforcer}.
0130:             *
0131:             * This flag is used at least inside the precondition enforcer so that
0132:             * subsequent prerequisite tests know to let this CrawlURI through because
0133:             * it is a prerequisite needed by an earlier prerequisite test (e.g. if
0134:             * this is a robots.txt, then the subsequent login-credentials prereq
0135:             * test must not throw it out because it is not a login curi).
0136:             */
0137:            private boolean prerequisite = false;
0138:
0139:            /**
0140:             * Set to true if this <code>curi</code> is to be POST'd rather than GET'd.
0141:             */
0142:            private boolean post = false;
0143:
0144:            /** 
0145:             * Monotonically increasing number within a crawl;
0146:             * useful for tending towards breadth-first ordering.
0147:             * Will sometimes be truncated to 48 bits, so behavior
0148:             * over 281 trillion instantiated CrawlURIs may be 
0149:             * buggy
0150:             */
0151:            protected long ordinal;
0152:
0153:            /**
0154:             * Cache of this candidate uuri as a string.
0155:             *
0156:             * Profiling shows us spending about 1-2% of total elapsed time in
0157:             * toString.
0158:             */
0159:            private String cachedCrawlURIString = null;
0160:
0161:            /**
0162:             * Array to hold keys of alist members that persist across URI processings.
0163:             * Any key mentioned in this list will not be cleared out at the end
0164:             * of a pass down the processing chain.
0165:             */
0166:            private static final List<Object> alistPersistentMember = new CopyOnWriteArrayList<Object>(
0167:                    new String[] { A_CREDENTIAL_AVATARS_KEY });
0168:
0169:            /**
0170:             * A digest (hash, usually SHA1) of retrieved content-body. 
0171:             * 
0172:             */
0173:            private byte[] contentDigest = null;
0174:            private String contentDigestScheme = null;
0175:
0176:            /**
0177:             * Create a new instance of CrawlURI from a {@link UURI}.
0178:             *
0179:             * @param uuri the UURI to base this CrawlURI on.
0180:             */
0181:            public CrawlURI(UURI uuri) {
0182:                super(uuri);
0183:            }
0184:
0185:            /**
0186:             * Create a new instance of CrawlURI from a {@link CandidateURI}
0187:             *
0188:             * @param caUri the CandidateURI to base this CrawlURI on.
0189:             * @param o Monotonically increasing number within a crawl.
0190:             */
0191:            @SuppressWarnings("deprecation")
0192:            public CrawlURI(CandidateURI caUri, long o) {
0193:                super(caUri.getUURI(), caUri.getPathFromSeed(), caUri.getVia(),
0194:                        caUri.getViaContext());
0195:                ordinal = o;
0196:                setIsSeed(caUri.isSeed());
0197:                setSchedulingDirective(caUri.getSchedulingDirective());
0198:                setAList(caUri.getAList());
0199:            }
0200:
0201:            /**
0202:             * Takes a status code and converts it into a human readable string.
0203:             *
0204:             * @param code the status code
0205:             * @return a human readable string declaring what the status code is.
0206:             */
0207:            public static String fetchStatusCodesToString(int code) {
0208:                switch (code) {
0209:                // DNS
0210:                case S_DNS_SUCCESS:
0211:                    return "DNS-1-OK";
0212:                    // HTTP Informational 1xx
0213:                case 100:
0214:                    return "HTTP-100-Info-Continue";
0215:                case 101:
0216:                    return "HTTP-101-Info-Switching Protocols";
0217:                    // HTTP Successful 2xx
0218:                case 200:
0219:                    return "HTTP-200-Success-OK";
0220:                case 201:
0221:                    return "HTTP-201-Success-Created";
0222:                case 202:
0223:                    return "HTTP-202-Success-Accepted";
0224:                case 203:
0225:                    return "HTTP-203-Success-Non-Authoritative";
0226:                case 204:
0227:                    return "HTTP-204-Success-No Content ";
0228:                case 205:
0229:                    return "HTTP-205-Success-Reset Content";
0230:                case 206:
0231:                    return "HTTP-206-Success-Partial Content";
0232:                    // HTTP Redirection 3xx
0233:                case 300:
0234:                    return "HTTP-300-Redirect-Multiple Choices";
0235:                case 301:
0236:                    return "HTTP-301-Redirect-Moved Permanently";
0237:                case 302:
0238:                    return "HTTP-302-Redirect-Found";
0239:                case 303:
0240:                    return "HTTP-303-Redirect-See Other";
0241:                case 304:
0242:                    return "HTTP-304-Redirect-Not Modified";
0243:                case 305:
0244:                    return "HTTP-305-Redirect-Use Proxy";
0245:                case 307:
0246:                    return "HTTP-307-Redirect-Temporary Redirect";
0247:                    // HTTP Client Error 4xx
0248:                case 400:
0249:                    return "HTTP-400-ClientErr-Bad Request";
0250:                case 401:
0251:                    return "HTTP-401-ClientErr-Unauthorized";
0252:                case 402:
0253:                    return "HTTP-402-ClientErr-Payment Required";
0254:                case 403:
0255:                    return "HTTP-403-ClientErr-Forbidden";
0256:                case 404:
0257:                    return "HTTP-404-ClientErr-Not Found";
0258:                case 405:
0259:                    return "HTTP-405-ClientErr-Method Not Allowed";
0260:                case 406:
0261:                    return "HTTP-406-ClientErr-Not Acceptable";
0262:                case 407:
0263:                    return "HTTP-407-ClientErr-Proxy Authentication Required";
0264:                case 408:
0265:                    return "HTTP-408-ClientErr-Request Timeout";
0266:                case 409:
0267:                    return "HTTP-409-ClientErr-Conflict";
0268:                case 410:
0269:                    return "HTTP-410-ClientErr-Gone";
0270:                case 411:
0271:                    return "HTTP-411-ClientErr-Length Required";
0272:                case 412:
0273:                    return "HTTP-412-ClientErr-Precondition Failed";
0274:                case 413:
0275:                    return "HTTP-413-ClientErr-Request Entity Too Large";
0276:                case 414:
0277:                    return "HTTP-414-ClientErr-Request-URI Too Long";
0278:                case 415:
0279:                    return "HTTP-415-ClientErr-Unsupported Media Type";
0280:                case 416:
0281:                    return "HTTP-416-ClientErr-Requested Range Not Satisfiable";
0282:                case 417:
0283:                    return "HTTP-417-ClientErr-Expectation Failed";
0284:                    // HTTP Server Error 5xx
0285:                case 500:
0286:                    return "HTTP-500-ServerErr-Internal Server Error";
0287:                case 501:
0288:                    return "HTTP-501-ServerErr-Not Implemented";
0289:                case 502:
0290:                    return "HTTP-502-ServerErr-Bad Gateway";
0291:                case 503:
0292:                    return "HTTP-503-ServerErr-Service Unavailable";
0293:                case 504:
0294:                    return "HTTP-504-ServerErr-Gateway Timeout";
0295:                case 505:
0296:                    return "HTTP-505-ServerErr-HTTP Version Not Supported";
0297:                    // Heritrix internal codes (all negative numbers)
0298:                case S_BLOCKED_BY_USER:
0299:                    return "Heritrix(" + S_BLOCKED_BY_USER
0300:                            + ")-Blocked by user";
0301:                case S_BLOCKED_BY_CUSTOM_PROCESSOR:
0302:                    return "Heritrix(" + S_BLOCKED_BY_CUSTOM_PROCESSOR
0303:                            + ")-Blocked by custom prefetch processor";
0304:                case S_DELETED_BY_USER:
0305:                    return "Heritrix(" + S_DELETED_BY_USER
0306:                            + ")-Deleted by user";
0307:                case S_CONNECT_FAILED:
0308:                    return "Heritrix(" + S_CONNECT_FAILED
0309:                            + ")-Connection failed";
0310:                case S_CONNECT_LOST:
0311:                    return "Heritrix(" + S_CONNECT_LOST + ")-Connection lost";
0312:                case S_DEEMED_CHAFF:
0313:                    return "Heritrix(" + S_DEEMED_CHAFF + ")-Deemed chaff";
0314:                case S_DEFERRED:
0315:                    return "Heritrix(" + S_DEFERRED + ")-Deferred";
0316:                case S_DOMAIN_UNRESOLVABLE:
0317:                    return "Heritrix(" + S_DOMAIN_UNRESOLVABLE
0318:                            + ")-Domain unresolvable";
0319:                case S_OUT_OF_SCOPE:
0320:                    return "Heritrix(" + S_OUT_OF_SCOPE + ")-Out of scope";
0321:                case S_DOMAIN_PREREQUISITE_FAILURE:
0322:                    return "Heritrix(" + S_DOMAIN_PREREQUISITE_FAILURE
0323:                            + ")-Domain prerequisite failure";
0324:                case S_ROBOTS_PREREQUISITE_FAILURE:
0325:                    return "Heritrix(" + S_ROBOTS_PREREQUISITE_FAILURE
0326:                            + ")-Robots prerequisite failure";
0327:                case S_OTHER_PREREQUISITE_FAILURE:
0328:                    return "Heritrix(" + S_OTHER_PREREQUISITE_FAILURE
0329:                            + ")-Other prerequisite failure";
0330:                case S_PREREQUISITE_UNSCHEDULABLE_FAILURE:
0331:                    return "Heritrix(" + S_PREREQUISITE_UNSCHEDULABLE_FAILURE
0332:                            + ")-Prerequisite unschedulable failure";
0333:                case S_ROBOTS_PRECLUDED:
0334:                    return "Heritrix(" + S_ROBOTS_PRECLUDED
0335:                            + ")-Robots precluded";
0336:                case S_RUNTIME_EXCEPTION:
0337:                    return "Heritrix(" + S_RUNTIME_EXCEPTION
0338:                            + ")-Runtime exception";
0339:                case S_SERIOUS_ERROR:
0340:                    return "Heritrix(" + S_SERIOUS_ERROR + ")-Serious error";
0341:                case S_TIMEOUT:
0342:                    return "Heritrix(" + S_TIMEOUT + ")-Timeout";
0343:                case S_TOO_MANY_EMBED_HOPS:
0344:                    return "Heritrix(" + S_TOO_MANY_EMBED_HOPS
0345:                            + ")-Too many embed hops";
0346:                case S_TOO_MANY_LINK_HOPS:
0347:                    return "Heritrix(" + S_TOO_MANY_LINK_HOPS
0348:                            + ")-Too many link hops";
0349:                case S_TOO_MANY_RETRIES:
0350:                    return "Heritrix(" + S_TOO_MANY_RETRIES
0351:                            + ")-Too many retries";
0352:                case S_UNATTEMPTED:
0353:                    return "Heritrix(" + S_UNATTEMPTED + ")-Unattempted";
0354:                case S_UNFETCHABLE_URI:
0355:                    return "Heritrix(" + S_UNFETCHABLE_URI
0356:                            + ")-Unfetchable URI";
0357:                case S_PROCESSING_THREAD_KILLED:
0358:                    return "Heritrix(" + S_PROCESSING_THREAD_KILLED + ")-"
0359:                            + "Processing thread killed";
0360:                    // Unknown return code
0361:                default:
0362:                    return Integer.toString(code);
0363:                }
0364:            }
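            /*
             * A small sketch of how this helper is typically consulted when logging;
             * curi is assumed to be a CrawlURI that has been through processing.
             *
             *   int code = curi.getFetchStatus();
             *   String readable = CrawlURI.fetchStatusCodesToString(code);
             *   // e.g. 200 -> "HTTP-200-Success-OK"; Heritrix-internal codes are negative.
             */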
0365:
0366:            /**
0367:             * Return the overall/fetch status of this CrawlURI for its
0368:             * current trip through the processing loop.
0369:             *
0370:             * @return a value from FetchStatusCodes
0371:             */
0372:            public int getFetchStatus() {
0373:                return fetchStatus;
0374:            }
0375:
0376:            /**
0377:             * Set the overall/fetch status of this CrawlURI for
0378:             * its current trip through the processing loop.
0379:             *
0380:             * @param newstatus a value from FetchStatusCodes
0381:             */
0382:            public void setFetchStatus(int newstatus) {
0383:                fetchStatus = newstatus;
0384:            }
0385:
0386:            /**
0387:             * Get the number of attempts at getting the document referenced by this
0388:             * URI.
0389:             *
0390:             * @return the number of attempts at getting the document referenced by this
0391:             *         URI.
0392:             */
0393:            public int getFetchAttempts() {
0394:                return fetchAttempts;
0395:            }
0396:
0397:            /**
0398:             * Increment the number of attempts at getting the document referenced by
0399:             * this URI.
0400:             *
0401:             * @return the number of attempts at getting the document referenced by this
0402:             *         URI.
0403:             */
0404:            public int incrementFetchAttempts() {
0405:                // TODO: rename, this is actually processing-loop-attempts
0406:                return fetchAttempts++;
0407:            }
0408:
0409:            /**
0410:             * Reset fetchAttempts counter.
0411:             */
0412:            public void resetFetchAttempts() {
0413:                this.fetchAttempts = 0;
0414:            }
0415:
0416:            /**
0417:             * Reset deferrals counter.
0418:             */
0419:            public void resetDeferrals() {
0420:                this.deferrals = 0;
0421:            }
0422:
0423:            /**
0424:             * Get the next processor to process this URI.
0425:             *
0426:             * @return the processor that should process this URI next.
0427:             */
0428:            public Processor nextProcessor() {
0429:                return nextProcessor;
0430:            }
0431:
0432:            /**
0433:             * Get the processor chain that should be processing this URI after the
0434:             * current chain is finished with it.
0435:             *
0436:             * @return the next processor chain to process this URI.
0437:             */
0438:            public ProcessorChain nextProcessorChain() {
0439:                return nextProcessorChain;
0440:            }
0441:
0442:            /**
0443:             * Set the next processor to process this URI.
0444:             *
0445:             * @param processor the next processor to process this URI.
0446:             */
0447:            public void setNextProcessor(Processor processor) {
0448:                nextProcessor = processor;
0449:            }
0450:
0451:            /**
0452:             * Set the next processor chain to process this URI.
0453:             *
0454:             * @param nextProcessorChain the next processor chain to process this URI.
0455:             */
0456:            public void setNextProcessorChain(ProcessorChain nextProcessorChain) {
0457:                this.nextProcessorChain = nextProcessorChain;
0458:            }
0459:
0460:            /**
0461:             * Do all actions associated with setting a <code>CrawlURI</code> as
0462:             * requiring a prerequisite.
0463:             *
0464:             * @param lastProcessorChain Last processor chain reference.  This chain is
0465:             * where this <code>CrawlURI</code> goes next.
0466:             * @param preq URI, as a String, to set as this URI's prerequisite.
0467:             * @throws URIException
0468:             */
0469:            public void markPrerequisite(String preq,
0470:                    ProcessorChain lastProcessorChain) throws URIException {
0471:                Link link = createLink(preq, Link.PREREQ_MISC, Link.PREREQ_HOP);
0472:                setPrerequisiteUri(link);
0473:                incrementDeferrals();
0474:                setFetchStatus(S_DEFERRED);
0475:                skipToProcessorChain(lastProcessorChain);
0476:            }
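            /*
             * A hedged example of the usual deferral pattern; lastChain is assumed
             * to be the ProcessorChain this URI should rejoin once the prerequisite
             * (here a robots.txt URI) has been crawled.
             *
             *   curi.markPrerequisite("http://www.example.com/robots.txt", lastChain);
             *   // Sets a PREREQ Link, bumps the deferral count, marks the URI
             *   // S_DEFERRED, and skips it to lastChain.
             */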
0477:
0478:            /**
0479:             * Set a prerequisite for this URI.
0480:             * <p>
0481:             * A prerequisite is a URI that must be crawled before this URI can be
0482:             * crawled.
0483:             *
0484:             * @param link Link to set as prereq.
0485:             */
0486:            public void setPrerequisiteUri(Object link) {
0487:                putObject(A_PREREQUISITE_URI, link);
0488:            }
0489:
0490:            /**
0491:             * Get the prerequisite for this URI.
0492:             * <p>
0493:             * A prerequisite is a URI that must be crawled before this URI can be
0494:             * crawled.
0495:             *
0496:             * @return the prerequisite for this URI or null if no prerequisite.
0497:             */
0498:            public Object getPrerequisiteUri() {
0499:                return getObject(A_PREREQUISITE_URI);
0500:            }
0501:
0502:            /**
0503:             * @return True if this CrawlURI has a prerequisite.
0504:             */
0505:            public boolean hasPrerequisiteUri() {
0506:                return containsKey(A_PREREQUISITE_URI);
0507:            }
0508:
0509:            /**
0510:             * Returns true if this CrawlURI is a prerequisite.
0511:             *
0512:             * @return true if this CrawlURI is a prerequisite.
0513:             */
0514:            public boolean isPrerequisite() {
0515:                return this.prerequisite;
0516:            }
0517:
0518:            /**
0519:             * Set if this CrawlURI is itself a prerequisite URI.
0520:             *
0521:             * @param prerequisite True if this CrawlURI is itself a prerequisite uri.
0522:             */
0523:            public void setPrerequisite(boolean prerequisite) {
0524:                this.prerequisite = prerequisite;
0525:            }
0526:
0527:            /**
0528:             * @return This crawl URI as a string wrapped with 'CrawlURI(' +
0529:             * ')'.
0530:             */
0531:            public String getCrawlURIString() {
0532:                if (this.cachedCrawlURIString == null) {
0533:                    synchronized (this) {
0534:                        if (this.cachedCrawlURIString == null) {
0535:                            this.cachedCrawlURIString = "CrawlURI("
0536:                                    + toString() + ")";
0537:                        }
0538:                    }
0539:                }
0540:                return this.cachedCrawlURIString;
0541:            }
0542:
0543:            /**
0544:             * Get the content type of this URI.
0545:             *
0546:             * @return Fetched URI's content type.  May be null.
0547:             */
0548:            public String getContentType() {
0549:                return this.contentType;
0550:            }
0551:
0552:            /**
0553:             * Set a fetched uri's content type.
0554:             *
0555:             * @param ct Content type.  May be null.
0556:             */
0557:            public void setContentType(String ct) {
0558:                this.contentType = ct;
0559:            }
0560:
0561:            /**
0562:             * Set the number of the ToeThread responsible for processing this uri.
0563:             *
0564:             * @param i the ToeThread number.
0565:             */
0566:            public void setThreadNumber(int i) {
0567:                threadNumber = i;
0568:            }
0569:
0570:            /**
0571:             * Get the number of the ToeThread responsible for processing this uri.
0572:             *
0573:             * @return the ToeThread number.
0574:             */
0575:            public int getThreadNumber() {
0576:                return threadNumber;
0577:            }
0578:
0579:            /**
0580:             * Increment the deferral count.
0581:             *
0582:             */
0583:            public void incrementDeferrals() {
0584:                deferrals++;
0585:            }
0586:
0587:            /**
0588:             * Get the deferral count.
0589:             *
0590:             * @return the deferral count.
0591:             */
0592:            public int getDeferrals() {
0593:                return deferrals;
0594:            }
0595:
0596:            /**
0597:             * Remove all attributes set on this uri.
0598:             * <p>
0599:             * This method removes the attribute list.
0600:             */
0601:            public void stripToMinimal() {
0602:                clearAList();
0603:            }
0604:
0605:            /** 
0606:             * Get the size in bytes of this URI's recorded content, inclusive
0607:             * of things like protocol headers. It is the responsibility of the 
0608:             * classes which fetch the URI to set this value accordingly -- it is 
0609:             * not calculated/verified within CrawlURI. 
0610:             * 
0611:             * This value is consulted in reporting/logging/writing-decisions.
0612:             * 
0613:             * @see #setContentSize(long)
0614:             * @return contentSize
0615:             */
0616:            public long getContentSize() {
0617:                return contentSize;
0618:            }
0619:
0620:            /**
0621:             * Make note of a non-fatal error, local to a particular Processor,
0622:             * which should be logged somewhere, but allows processing to continue.
0623:             *
0624:             * This is how you add to the local-error log (the 'localized' in
0625:             * the below is making an error local rather than global, not
0626:             * making a Swiss-French version of the error).
0627:             * 
0628:             * @param processorName Name of processor the exception was thrown
0629:             * in.
0630:             * @param ex Throwable to log.
0631:             * @param message Extra message to log beyond exception message.
0632:             */
0633:            public void addLocalizedError(final String processorName,
0634:                    final Throwable ex, final String message) {
0635:                List<LocalizedError> localizedErrors;
0636:                if (containsKey(A_LOCALIZED_ERRORS)) {
0637:                    @SuppressWarnings("unchecked")
0638:                    List<LocalizedError> temp // to prevent warning on cast
0639:                    = (List<LocalizedError>) getObject(A_LOCALIZED_ERRORS);
0640:                    localizedErrors = temp;
0641:                } else {
0642:                    localizedErrors = new ArrayList<LocalizedError>();
0643:                    putObject(A_LOCALIZED_ERRORS, localizedErrors);
0644:                }
0645:
0646:                localizedErrors.add(new LocalizedError(processorName, ex,
0647:                        message));
0648:                addAnnotation("le:" + getClassSimpleName(ex.getClass()) + "@"
0649:                        + processorName);
0650:            }
0651:
0652:            // TODO: Move to utils.
0653:            protected String getClassSimpleName(final Class c) {
0654:                String classname = c.getName();
0655:                int index = classname.lastIndexOf('.');
0656:                return ((index > 0 && (index + 1) < classname.length()) ? classname
0657:                        .substring(index + 1)
0658:                        : classname);
0659:            }
0660:
0661:            /**
0662:             * Add an annotation: an abbreviated indication of something special
0663:             * about this URI that need not be present in every crawl.log line,
0664:             * but should be noted for future reference. 
0665:             *
0666:             * @param annotation the annotation to add; should not contain 
0667:             * whitespace or a comma
0668:             */
0669:            public void addAnnotation(String annotation) {
0670:                String annotations;
0671:                if (containsKey(A_ANNOTATIONS)) {
0672:                    annotations = getString(A_ANNOTATIONS);
0673:                    annotations += "," + annotation;
0674:                } else {
0675:                    annotations = annotation;
0676:                }
0677:
0678:                putString(A_ANNOTATIONS, annotations);
0679:            }
0680:
0681:            /**
0682:             * TODO: Implement truncation using booleans rather than as this
0683:             * ugly String parse.
0684:             * @return True if fetch was truncated.
0685:             */
0686:            public boolean isTruncatedFetch() {
0687:                return annotationContains(TRUNC_SUFFIX);
0688:            }
0689:
0690:            public boolean isLengthTruncatedFetch() {
0691:                return annotationContains(LENGTH_TRUNC);
0692:            }
0693:
0694:            public boolean isTimeTruncatedFetch() {
0695:                return annotationContains(TIMER_TRUNC);
0696:            }
0697:
0698:            public boolean isHeaderTruncatedFetch() {
0699:                return annotationContains(HEADER_TRUNC);
0700:            }
0701:
0702:            protected boolean annotationContains(final String str2Find) {
0703:                boolean result = false;
0704:                if (!containsKey(A_ANNOTATIONS)) {
0705:                    return result;
0706:                }
0707:                String annotations = getString(A_ANNOTATIONS);
0708:                if (annotations != null && annotations.length() > 0) {
0709:                    result = annotations.indexOf(str2Find) >= 0;
0710:                }
0711:                return result;
0712:            }
0713:
0714:            /**
0715:             * Get the annotations set for this uri.
0716:             *
0717:             * @return the annotations set for this uri.
0718:             */
0719:            public String getAnnotations() {
0720:                return (containsKey(A_ANNOTATIONS)) ? getString(A_ANNOTATIONS)
0721:                        : null;
0722:            }
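            /*
             * A short sketch of how annotations accumulate; the values shown mirror
             * ones produced elsewhere in this class (aboutToLog, linkExtractorFinished).
             *
             *   curi.addAnnotation("3t");      // e.g. third fetch attempt
             *   curi.addAnnotation("dol:12");  // e.g. twelve discarded outlinks
             *   curi.getAnnotations();         // -> "3t,dol:12" (comma-separated)
             */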
0723:
0724:            /**
0725:             * Get the embedded hop count.
0726:             *
0727:             * @return the embedded hop count.
0728:             * @deprecated 
0729:             */
0730:            public int getEmbedHopCount() {
0731:                return embedHopCount;
0732:            }
0733:
0734:            /**
0735:             * Get the link hop count.
0736:             *
0737:             * @return the link hop count.
0738:             * @deprecated 
0739:             */
0740:            public int getLinkHopCount() {
0741:                return linkHopCount;
0742:            }
0743:
0744:            /**
0745:             * Mark this uri as being a seed.
0746:             *
0747:             *
0748:             * @deprecated 
0749:             */
0750:            public void markAsSeed() {
0751:                linkHopCount = 0;
0752:                embedHopCount = 0;
0753:            }
0754:
0755:            /**
0756:             * Get the user agent to use for crawling this URI.
0757:             *
0758:             * If null the global setting should be used.
0759:             *
0760:             * @return user agent or null
0761:             */
0762:            public String getUserAgent() {
0763:                return userAgent;
0764:            }
0765:
0766:            /**
0767:             * Set the user agent to use when crawling this URI.
0768:             *
0769:             * If not set the global settings should be used.
0770:             *
0771:             * @param string user agent to use
0772:             */
0773:            public void setUserAgent(String string) {
0774:                userAgent = string;
0775:            }
0776:
0777:            /**
0778:             * Set which processor should be the next processor to process this uri
0779:             * instead of using the default next processor.
0780:             *
0781:             * @param processorChain the processor chain to skip to.
0782:             * @param processor the processor in the processor chain to skip to.
0783:             */
0784:            public void skipToProcessor(ProcessorChain processorChain,
0785:                    Processor processor) {
0786:                setNextProcessorChain(processorChain);
0787:                setNextProcessor(processor);
0788:            }
0789:
0790:            /**
0791:             * Set which processor chain should be processing this uri next.
0792:             *
0793:             * @param processorChain the processor chain to skip to.
0794:             */
0795:            public void skipToProcessorChain(ProcessorChain processorChain) {
0796:                setNextProcessorChain(processorChain);
0797:                setNextProcessor(null);
0798:            }
0799:
0800:            /**
0801:             * For completed HTTP transactions, the length of the content-body.
0802:             *
0803:             * @return For completed HTTP transactions, the length of the content-body.
0804:             */
0805:            public long getContentLength() {
0806:                if (this.contentLength < 0) {
0807:                    this.contentLength = (getHttpRecorder() != null) ? getHttpRecorder()
0808:                            .getResponseContentLength()
0809:                            : 0;
0810:                }
0811:                return this.contentLength;
0812:            }
0813:
0814:            /**
0815:             * Get size of data recorded (transferred)
0816:             *
0817:             * @return recorded data size
0818:             */
0819:            public long getRecordedSize() {
0820:                return (getHttpRecorder() != null) ? getHttpRecorder()
0821:                        .getRecordedInput().getSize()
0822:                // if unavailable fall back on content-size
0823:                        : getContentSize();
0824:            }
0825:
0826:            /**
0827:             * Sets the 'content size' for the URI, which is considered inclusive
0828:             * of all recorded material (such as protocol headers) or even material
0829:             * 'virtually' considered (as in material from a previous fetch 
0830:             * confirmed unchanged with a server). (In contrast, content-length 
0831:             * matches the HTTP definition, that of the enclosed content-body.)
0832:             * 
0833:             * Should be set by a fetcher or other processor as soon as the final 
0834:             * size of recorded content is known. Setting to an artificial/incorrect
0835:             * value may affect other reporting/processing. 
0836:             * 
0837:             * @param l Content size.
0838:             */
0839:            public void setContentSize(long l) {
0840:                contentSize = l;
0841:            }
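            /*
             * A hedged fetcher-side sketch of the contentSize vs. contentLength
             * distinction documented above; headerBytes and bodyBytes are assumed
             * to come from the fetcher's own accounting.
             *
             *   curi.setContentSize(headerBytes + bodyBytes); // everything recorded, headers included
             *   curi.getContentLength();                      // content-body only, via the HttpRecorder
             */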
0842:
0843:            /**
0844:             * If true then a link extractor has already claimed this CrawlURI and
0845:             * performed link extraction on the document content. This does not
0846:             * preclude other link extractors that may have an interest in this
0847:             * CrawlURI from also doing link extraction but default behavior should
0848:             * be to not run if link extraction has already been done.
0849:             * 
0850:             * <p>There is an onus on link extractors to set this flag if they have
0851:             * run.
0852:             * 
0853:             * <p>The only extractor of the default Heritrix set that does not
0854:             * respect this flag is
0855:             * {@link org.archive.crawler.extractor.ExtractorHTTP}.
0856:             * It runs against HTTP headers, not the document content.
0857:             * 
0858:             * @return True if a processor has performed link extraction on this
0859:             * CrawlURI
0860:             *
0861:             * @see #linkExtractorFinished()
0862:             */
0863:            public boolean hasBeenLinkExtracted() {
0864:                return linkExtractorFinished;
0865:            }
0866:
0867:            /**
0868:             * Note that link extraction has been performed on this CrawlURI. A processor
0869:             * doing link extraction should invoke this method once it has finished its
0870:             * work. It should invoke it even if no links are extracted. It should only
0871:             * invoke this method if the link extraction was performed on the document
0872:             * body (not the HTTP headers etc.).
0873:             *
0874:             * @see #hasBeenLinkExtracted()
0875:             */
0876:            public void linkExtractorFinished() {
0877:                linkExtractorFinished = true;
0878:                if (discardedOutlinks > 0) {
0879:                    addAnnotation("dol:" + discardedOutlinks);
0880:                }
0881:            }
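            /*
             * A sketch of the cooperative-extractor pattern described above;
             * extract() stands in for whatever work a concrete extractor performs.
             *
             *   if (!curi.hasBeenLinkExtracted()) {
             *       extract(curi);                 // hypothetical: pull links from the document body
             *       curi.linkExtractorFinished();  // claim the URI so later extractors skip it
             *   }
             */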
0882:
0883:            /**
0884:             * Notify CrawlURI it is about to be logged; opportunity
0885:             * for self-annotation
0886:             */
0887:            public void aboutToLog() {
0888:                if (fetchAttempts > 1) {
0889:                    addAnnotation(fetchAttempts + "t");
0890:                }
0891:            }
0892:
0893:            /**
0894:             * Get the http recorder associated with this uri.
0895:             *
0896:             * @return Returns the httpRecorder.  May be null, but it is set early in
0897:             * FetchHTTP so there is an issue if it is null.
0898:             */
0899:            public HttpRecorder getHttpRecorder() {
0900:                return httpRecorder;
0901:            }
0902:
0903:            /**
0904:             * Set the http recorder to be associated with this uri.
0905:             *
0906:             * @param httpRecorder The httpRecorder to set.
0907:             */
0908:            public void setHttpRecorder(HttpRecorder httpRecorder) {
0909:                this.httpRecorder = httpRecorder;
0910:            }
0911:
0912:            /**
0913:             * Return true if this is an HTTP transaction.
0914:             *
0915:             * TODO: Compound this and the {@link #isPost()} method so that there is one
0916:             * place to go to find out if this is an HTTP GET, HTTP POST, FTP or DNS fetch.
0917:             *
0918:             * @return True if this is an HTTP transaction.
0919:             */
0920:            public boolean isHttpTransaction() {
0921:                return containsKey(A_HTTP_TRANSACTION);
0922:            }
0923:
0924:            /**
0925:             * Clean up after a run through the processing chain.
0926:             *
0927:             * Called at the end of the processing chain by Frontier#finish.  Null out any
0928:             * state gathered during processing.
0929:             */
0930:            public void processingCleanup() {
0931:                this.httpRecorder = null;
0932:                this.fetchStatus = S_UNATTEMPTED;
0933:                this.setPrerequisite(false);
0934:                this.contentSize = UNCALCULATED;
0935:                this.contentLength = UNCALCULATED;
0936:                // Clear 'links extracted' flag.
0937:                this.linkExtractorFinished = false;
0938:                // Clean the alist of all but registered permanent members.
0939:                setAList(getPersistentAList());
0940:            }
0941:
0942:            public AList getPersistentAList() {
0943:                AList newAList = new HashtableAList();
0944:                // copy declared persistent keys
0945:                if (alistPersistentMember != null
0946:                        && alistPersistentMember.size() > 0) {
0947:                    newAList.copyKeysFrom(alistPersistentMember.iterator(),
0948:                            getAList());
0949:                }
0950:                // also copy declared 'heritable' keys
0951:                List heritableKeys = (List) getObject(A_HERITABLE_KEYS);
0952:                if (heritableKeys != null) {
0953:                    newAList.copyKeysFrom(heritableKeys.iterator(), getAList());
0954:                }
0955:                return newAList;
0956:            }
0957:
0958:            /**
0959:             * Make a <code>CrawlURI</code> from the passed <code>CandidateURI</code>.
0960:             *
0961:             * It's safe to pass a CrawlURI instance.  In this case we just return it
0962:             * as a result. Otherwise, we create a new CrawlURI instance.
0963:             *
0964:             * @param caUri Candidate URI.
0965:             * @param ordinal Monotonically increasing number within a crawl.
0966:             * @return A CrawlURI made from the passed CandidateURI.
0967:             */
0968:            public static CrawlURI from(CandidateURI caUri, long ordinal) {
0969:                return (caUri instanceof CrawlURI) ? (CrawlURI) caUri
0970:                        : new CrawlURI(caUri, ordinal);
0971:            }
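            /*
             * Typical use, e.g. where a frontier hands a scheduled CandidateURI to
             * the processing chain; nextOrdinal is assumed to be the caller's own
             * counter.
             *
             *   CrawlURI curi = CrawlURI.from(candidate, nextOrdinal++);
             *   // If 'candidate' is already a CrawlURI it is returned unchanged.
             */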
0972:
0973:            /**
0974:             * @param avatars Credential avatars to save off.
0975:             */
0976:            private void setCredentialAvatars(Set avatars) {
0977:                putObject(A_CREDENTIAL_AVATARS_KEY, avatars);
0978:            }
0979:
0980:            /**
0981:             * @return Credential avatars.  Null if none set.
0982:             */
0983:            @SuppressWarnings("unchecked")
0984:            public Set<CredentialAvatar> getCredentialAvatars() {
0985:                return (Set) getObject(A_CREDENTIAL_AVATARS_KEY);
0986:            }
0987:
0988:            /**
0989:             * @return True if there are avatars attached to this instance.
0990:             */
0991:            public boolean hasCredentialAvatars() {
0992:                return getCredentialAvatars() != null
0993:                        && getCredentialAvatars().size() > 0;
0994:            }
0995:
0996:            /**
0997:             * Add an avatar.
0998:             *
0999:             * We do lazy instantiation.
1000:             *
1001:             * @param ca Credential avatar to add to set of avatars.
1002:             */
1003:            public void addCredentialAvatar(CredentialAvatar ca) {
1004:                Set<CredentialAvatar> avatars = getCredentialAvatars();
1005:                if (avatars == null) {
1006:                    avatars = new HashSet<CredentialAvatar>();
1007:                    setCredentialAvatars(avatars);
1008:                }
1009:                avatars.add(ca);
1010:            }
1011:
1012:            /**
1013:             * Remove all credential avatars from this crawl uri.
1014:             */
1015:            public void removeCredentialAvatars() {
1016:                if (hasCredentialAvatars()) {
1017:                    remove(A_CREDENTIAL_AVATARS_KEY);
1018:                }
1019:            }
1020:
1021:            /**
1022:             * Remove the passed credential avatar from this crawl uri.
1023:             * @param ca Avatar to remove.
1024:             * @return True if we removed passed parameter.  False if no operation
1025:             * performed.
1026:             */
1027:            public boolean removeCredentialAvatar(CredentialAvatar ca) {
1028:                boolean result = false;
1029:                Set avatars = getCredentialAvatars();
1030:                if (avatars != null && avatars.size() > 0) {
1031:                    result = avatars.remove(ca);
1032:                }
1033:                return result;
1034:            }
1035:
1036:            /**
1037:             * Ask this URI if it was a success or not.
1038:             *
1039:             * Only makes sense to call this method after execution of
1040:             * HttpMethod#execute. Regard any status larger than 0 as success
1041:             * except for below caveat regarding 401s.  Use {@link #is2XXSuccess()} if
1042:             * looking for a status code in the 200 range.
1043:             *
1044:             * <p>401s caveat: If any rfc2617 credential data is present and we got a 401,
1045:             * assume it got loaded in FetchHTTP on expectation that we're to go around
1046:             * the processing chain again. Report this condition as a failure so we
1047:             * get another crack at the processing chain only this time we'll be making
1048:             * use of the loaded credential data.
1049:             *
1050:             * @return True if this URI has been successfully processed.
1051:             * @see #is2XXSuccess()
1052:             */
1053:            public boolean isSuccess() {
1054:                boolean result = false;
1055:                int statusCode = this.fetchStatus;
1056:                if (statusCode == HttpStatus.SC_UNAUTHORIZED
1057:                        && hasRfc2617CredentialAvatar()) {
1058:                    result = false;
1059:                } else {
1060:                    result = (statusCode > 0);
1061:                }
1062:                return result;
1063:            }
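            /*
             * A small sketch of how the two success notions differ; curi is a
             * CrawlURI that has been through the fetch processors.
             *
             *   if (curi.isSuccess() && !curi.is2XXSuccess()) {
             *       // A real response was received, but not in the 200 range
             *       // (e.g. a 301 redirect or a 404), so handle it accordingly.
             *   }
             */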
1064:
1065:            /**
1066:             * @return True if status code is in the 2xx range.
1067:             * @see #isSuccess()
1068:             */
1069:            public boolean is2XXSuccess() {
1070:                return this.fetchStatus >= 200 && this.fetchStatus < 300;
1071:            }
1072:
1073:            /**
1074:             * @return True if we have an rfc2617 payload.
1075:             */
1076:            public boolean hasRfc2617CredentialAvatar() {
1077:                boolean result = false;
1078:                Set avatars = getCredentialAvatars();
1079:                if (avatars != null && avatars.size() > 0) {
1080:                    for (Iterator i = avatars.iterator(); i.hasNext();) {
1081:                        if (((CredentialAvatar) i.next())
1082:                                .match(Rfc2617Credential.class)) {
1083:                            result = true;
1084:                            break;
1085:                        }
1086:                    }
1087:                }
1088:                return result;
1089:            }
1090:
1091:            /**
1092:             * Set whether this URI should be fetched by sending an HTTP POST request.
1093:             * Otherwise an HTTP GET request will be used.
1094:             *
1095:             * @param b True if this curi is to be POST'd.  Else it's to be GET'd.
1096:             */
1097:            public void setPost(boolean b) {
1098:                this.post = b;
1099:            }
1100:
1101:            /**
1102:             * Returns true if this URI should be fetched by sending an HTTP POST request.
1103:             *
1104:             *
1105:             * TODO: Compound this and the {@link #isHttpTransaction()} method so that there
1106:             * is one place to go to find out if this is an HTTP GET, HTTP POST, FTP or DNS fetch.
1107:             *
1108:             * @return True if this CrawlURI instance is to be POST'd.
1109:             */
1110:            public boolean isPost() {
1111:                return this.post;
1112:            }
1113:
1114:            /**
1115:             * Set the retained content-digest value (usually SHA1).
1116:             * 
1117:             * @param digestValue
1118:             * @deprecated Use {@link #setContentDigest(String scheme, byte[])}
1119:             */
1120:            public void setContentDigest(byte[] digestValue) {
1121:                setContentDigest("SHA1", digestValue);
1122:            }
1123:
1124:            public void setContentDigest(final String scheme,
1125:                    final byte[] digestValue) {
1126:                this.contentDigest = digestValue;
1127:                this.contentDigestScheme = scheme;
1128:            }
1129:
1130:            public String getContentDigestSchemeString() {
1131:                if (this.contentDigest == null) {
1132:                    return null;
1133:                }
1134:                return this.contentDigestScheme + ":"
1135:                        + getContentDigestString();
1136:            }
1137:
1138:            /**
1139:             * Return the retained content-digest value, if any.
1140:             * 
1141:             * @return Digest value.
1142:             */
1143:            public Object getContentDigest() {
1144:                return contentDigest;
1145:            }
1146:
1147:            public String getContentDigestString() {
1148:                if (this.contentDigest == null) {
1149:                    return null;
1150:                }
1151:                return Base32.encode(this.contentDigest);
1152:            }
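            /*
             * A hedged sketch of recording a digest; the digestBytes are assumed to
             * come from the fetcher's own java.security.MessageDigest accounting.
             *
             *   curi.setContentDigest("sha1", digestBytes);
             *   curi.getContentDigestSchemeString(); // -> "sha1:" + Base32 of digestBytes
             */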
1153:
1154:            transient Object holder;
1155:            transient Object holderKey;
1156:
1157:            /**
1158:             * Remember a 'holder' to which some enclosing/queueing
1159:             * facility has assigned this CrawlURI.
1160:             *
1161:             * @param obj
1162:             */
1163:            public void setHolder(Object obj) {
1164:                holder = obj;
1165:            }
1166:
1167:            /**
1168:             * Return the 'holder' for the convenience of 
1169:             * an external facility.
1170:             *
1171:             * @return holder
1172:             */
1173:            public Object getHolder() {
1174:                return holder;
1175:            }
1176:
1177:            /**
1178:             * Remember a 'holderKey' which some enclosing/queueing
1179:             * facility has assigned this CrawlURI.
1180:             *
1181:             * @param obj
1182:             */
1183:            public void setHolderKey(Object obj) {
1184:                holderKey = obj;
1185:            }
1186:
1187:            /**
1188:             * Return the 'holderKey' for convenience of 
1189:             * an external facility (Frontier).
1190:             * 
1191:             * @return holderKey 
1192:             */
1193:            public Object getHolderKey() {
1194:                return holderKey;
1195:            }
1196:
1197:            /**
1198:             * Get the ordinal (serial number) assigned at creation.
1199:             * 
1200:             * @return ordinal
1201:             */
1202:            public long getOrdinal() {
1203:                return ordinal;
1204:            }
1205:
1206:            /** spot for an integer cost to be placed by external facility (frontier).
1207:             *  cost is truncated to 8 bits at times, so should not exceed 255 */
1208:            int holderCost = UNCALCULATED;
1209:
1210:            /**
1211:             * Return the 'holderCost' for convenience of external facility (frontier)
1212:             * @return value of holderCost
1213:             */
1214:            public int getHolderCost() {
1215:                return holderCost;
1216:            }
1217:
1218:            /**
1219:             * Remember a 'holderCost' which some enclosing/queueing
1220:             * facility has assigned this CrawlURI
1221:             * @param cost value to remember
1222:             */
1223:            public void setHolderCost(int cost) {
1224:                holderCost = cost;
1225:            }
1226:
1227:            /** 
1228:             * All discovered outbound Links (navlinks, embeds, etc.) 
1229:             * Can either contain Link instances or CandidateURI instances, or both.
1230:             * The LinksScoper processor converts Link instances in this collection
1231:             * to CandidateURI instances. 
1232:             */
1233:            transient Collection<Object> outLinks = new HashSet<Object>();
1234:
1235:            /**
1236:             * Returns discovered links.  The returned collection might be empty if
1237:             * no links were discovered, or if something like LinksScoper promoted
1238:             * the links to CandidateURIs.
1239:             * 
1240:             * Elements can be removed from the returned collection, but not added.
1241:             * To add a discovered link, use one of the createAndAdd methods or
1242:             * {@link #getOutObjects()}.
1243:             * 
1244:             * @return Collection of all discovered outbound Links
1245:             */
1246:            public Collection<Link> getOutLinks() {
1247:                return Transform.subclasses(outLinks, Link.class);
1248:            }
1249:
1250:            /**
1251:             * Returns discovered candidate URIs.  The returned collection will be
1252:             * empty until something like LinksScoper promotes discovered Links
1253:             * into CandidateURIs.
1254:             * 
1255:             * Elements can be removed from the returned collection, but not added.
1256:             * To add a candidate URI, use {@link #replaceOutlinks(Collection)} or
1257:             * {@link #getOutObjects}.
1258:             * 
1259:             * @return  Collection of candidate URIs
1260:             */
1261:            public Collection<CandidateURI> getOutCandidates() {
1262:                return Transform.subclasses(outLinks, CandidateURI.class);
1263:            }
1264:
1265:            /**
1266:             * Returns all of the outbound objects.  The returned Collection will
1267:             * contain Link instances, or CandidateURI instances, or both.  
1268:             * 
1269:             * @return  the collection of Links and/or CandidateURIs
1270:             */
1271:            public Collection<Object> getOutObjects() {
1272:                return outLinks;
1273:            }
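            /*
             * Illustrative sketch (editor's addition): the three accessors above
             * are views over the same backing collection. Before LinksScoper runs,
             * getOutLinks() yields the raw Link instances; afterwards the promoted
             * entries appear via getOutCandidates() instead.
             *
             *   for (Link link : curi.getOutLinks()) {
             *       // raw discovered links
             *   }
             *   for (CandidateURI cand : curi.getOutCandidates()) {
             *       // links already promoted to CandidateURIs
             *   }
             */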
1274:
1275:            /**
1276:             * Add a discovered Link, unless doing so would exceed the maximum
1277:             * number to accept. (If so, the discarded-link counter is incremented.)
1278:             * 
1279:             * @param link the Link to add
1280:             */
1281:            public void addOutLink(Link link) {
1282:                if (outLinks.size() < MAX_OUTLINKS) {
1283:                    outLinks.add(link);
1284:                } else {
1285:                    // note & discard
1286:                    discardedOutlinks++;
1287:                }
1288:            }
1289:
1290:            public void clearOutlinks() {
1291:                this.outLinks.clear();
1292:            }
1293:
1294:            /**
1295:             * Replace the current collection of links with the passed collection.
1296:             * Used by Scopers adjusting the list of links (removing those
1297:             * not in scope and promoting Links to CandidateURIs).
1298:             *
1299:             * @param links collection of CandidateURIs replacing any previously
1300:             *   existing outLinks or outCandidates
1301:             */
1302:            public void replaceOutlinks(Collection<CandidateURI> links) {
1303:                clearOutlinks();
1304:                this.outLinks.addAll(links);
1305:            }
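            /*
             * Illustrative sketch (editor's addition): a scoper could build a
             * filtered collection of in-scope CandidateURIs from getOutLinks()
             * and install it with replaceOutlinks(). 'toCandidate' and 'inScope'
             * are hypothetical helpers, not part of this class.
             *
             *   Collection<CandidateURI> keep = new ArrayList<CandidateURI>();
             *   for (Link link : curi.getOutLinks()) {
             *       CandidateURI cand = toCandidate(curi, link);
             *       if (inScope(cand)) {
             *           keep.add(cand);
             *       }
             *   }
             *   curi.replaceOutlinks(keep);
             */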
1306:
1307:            /**
1308:             * @return Count of outlinks.
1309:             */
1310:            public int outlinksSize() {
1311:                return this.outLinks.size();
1312:            }
1313:
1314:            /**
1315:             * Convenience method for creating a Link discovered at this URI
1316:             * with the given string and context
1317:             * 
1318:             * @param url
1319:             *            String to use to create Link
1320:             * @param context
1321:             *            CharSequence context to use
1322:             * @param hopType char hop-type indicator
1323:             * @return the created Link
1324:             * @throws URIException
1325:             *             if Link UURI cannot be constructed
1326:             */
1327:            public Link createLink(String url, CharSequence context,
1328:                    char hopType) throws URIException {
1329:                return new Link(getUURI(), UURIFactory.getInstance(getUURI(),
1330:                        url), context, hopType);
1331:            }
1332:
1333:            /**
1334:             * Convenience method for creating a Link with the given string and
1335:             * context
1336:             * 
1337:             * @param url
1338:             *            String to use to create Link
1339:             * @param context
1340:             *            CharSequence context to use
1341:             * @param hopType char hop-type indicator
1342:             * @throws URIException
1343:             *             if Link UURI cannot be constructed
1344:             */
1345:            public void createAndAddLink(String url, CharSequence context,
1346:                    char hopType) throws URIException {
1347:                addOutLink(createLink(url, context, hopType));
1348:            }
1349:
1350:            /**
1351:             * Convenience method for creating a Link with the given string and
1352:             * context, relative to a previously set base HREF if available (or
1353:             * relative to the current CrawlURI if no other base has been set)
1354:             * 
1355:             * @param url String URL to add as destination of link
1356:             * @param context String context where link was discovered
1357:             * @param hopType char hop-type indicator
1358:             * @throws URIException
1359:             */
1360:            public void createAndAddLinkRelativeToBase(String url,
1361:                    CharSequence context, char hopType) throws URIException {
1362:                addOutLink(new Link(getUURI(), UURIFactory.getInstance(
1363:                        getBaseURI(), url), context, hopType));
1364:            }
1365:
1366:            /**
1367:             * Convenience method for creating a Link with the given string and
1368:             * context, relative to this CrawlURI's via UURI if available. (If
1369:             * a via is not available, falls back to using
1370:             * {@link #createAndAddLinkRelativeToBase(String, CharSequence, char)}.)
1371:             * 
1372:             * @param url String URL to add as destination of link
1373:             * @param context String context where link was discovered
1374:             * @param hopType char hop-type indicator
1375:             * @throws URIException
1376:             */
1377:            public void createAndAddLinkRelativeToVia(String url,
1378:                    CharSequence context, char hopType) throws URIException {
1379:                if (getVia() != null) {
1380:                    addOutLink(new Link(getUURI(), UURIFactory.getInstance(
1381:                            getVia(), url), context, hopType));
1382:                } else {
1383:                    // if no 'via', fall back to base/self
1384:                    createAndAddLinkRelativeToBase(url, context, hopType);
1385:                }
1386:            }
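            /*
             * Illustrative sketch (editor's addition): an HTML extractor that
             * finds an href would typically record it relative to the page's
             * base URI. The hop-type constant Link.NAVLINK_HOP is assumed to
             * come from org.archive.crawler.extractor.Link; the context string
             * is arbitrary here.
             *
             *   curi.createAndAddLinkRelativeToBase("/about.html",
             *           "a[href]", Link.NAVLINK_HOP);
             */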
1387:
1388:            /**
1389:             * Set the (HTML) Base URI used for derelativizing internal URIs. 
1390:             * 
1391:             * @param baseHref String base href to use
1392:             * @throws URIException if supplied string cannot be interpreted as URI
1393:             */
1394:            public void setBaseURI(String baseHref) throws URIException {
1395:                putObject(A_HTML_BASE, UURIFactory.getInstance(baseHref));
1396:            }
1397:
1398:            /**
1399:             * Get the (HTML) Base URI used for derelativizing internal URIs. 
1400:             *
1401:             * @return UURI base URI previously set 
1402:             */
1403:            public UURI getBaseURI() {
1404:                if (!containsKey(A_HTML_BASE)) {
1405:                    return getUURI();
1406:                }
1407:                return (UURI) getObject(A_HTML_BASE);
1408:            }
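            /*
             * Illustrative sketch (editor's addition): an extractor that meets a
             * <base href="..."> element could record it so that later-discovered
             * relative links resolve against it rather than against the fetched
             * URI itself. The URL below is a placeholder.
             *
             *   curi.setBaseURI("http://www.example.com/docs/");
             *   curi.createAndAddLinkRelativeToBase("intro.html",
             *           "a[href]", Link.NAVLINK_HOP);
             */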
1409:
1410:            /**
1411:             * Add the key of AList items you want to persist across
1412:             * processings.
1413:             * @param key Key to add.
1414:             */
1415:            public static void addAlistPersistentMember(Object key) {
1416:                alistPersistentMember.add(key);
1417:            }
1418:
1419:            /**
1420:             * @param key Key to remove.
1421:             * @return True if list contained the element.
1422:             */
1423:            public static boolean removeAlistPersistentMember(Object key) {
1424:                return alistPersistentMember.remove(key);
1425:            }
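            /*
             * Illustrative sketch (editor's addition): a module that needs one of
             * its AList entries to survive between processing rounds could register
             * the key once at startup. The key name below is hypothetical.
             *
             *   CrawlURI.addAlistPersistentMember("my-module-state");
             *   // ... and unregister it when no longer needed:
             *   CrawlURI.removeAlistPersistentMember("my-module-state");
             */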
1426:
1427:            /**
1428:             * Custom serialization writing an empty 'outLinks' as null. Estimated
1429:             * to save ~20 bytes in serialized form. 
1430:             * 
1431:             * @param stream
1432:             * @throws IOException
1433:             */
1434:            private void writeObject(ObjectOutputStream stream)
1435:                    throws IOException {
1436:                stream.defaultWriteObject();
1437:                stream.writeObject((outLinks.isEmpty()) ? null : outLinks);
1438:            }
1439:
1440:            /**
1441:             * Custom deserialization recreating empty HashSet from null in 'outLinks'
1442:             * slot. 
1443:             * 
1444:             * @param stream
1445:             * @throws IOException
1446:             * @throws ClassNotFoundException
1447:             */
1448:            private void readObject(ObjectInputStream stream)
1449:                    throws IOException, ClassNotFoundException {
1450:                stream.defaultReadObject();
1451:                @SuppressWarnings("unchecked")
1452:                HashSet<Object> ol = (HashSet<Object>) stream.readObject();
1453:                outLinks = (ol == null) ? new HashSet<Object>() : ol;
1454:            }
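            /*
             * Illustrative sketch (editor's addition): the null-for-empty trick in
             * writeObject()/readObject() above only shows up when CrawlURIs are
             * serialized, e.g. by a disk-backed queue. A plain round-trip using
             * standard java.io object streams looks like:
             *
             *   ObjectOutputStream oos = new ObjectOutputStream(out);
             *   oos.writeObject(curi);                       // empty outLinks written as null
             *   ObjectInputStream ois = new ObjectInputStream(in);
             *   CrawlURI copy = (CrawlURI) ois.readObject(); // outLinks restored as empty set
             */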
1455:
1456:        }