Source Code Cross Referenced for RobotsExclusionPolicy.java in  » Web-Crawler » heritrix » org » archive » crawler » datamodel » Java Source Code / Java Documentation | Java Source Code and Java Documentation

Java Source Code / Java Documentation
1. 6.0 JDK Core
2. 6.0 JDK Modules
3. 6.0 JDK Modules com.sun
4. 6.0 JDK Modules com.sun.java
5. 6.0 JDK Modules sun
6. 6.0 JDK Platform
7. Ajax
8. Apache Harmony Java SE
9. Aspect oriented
10. Authentication Authorization
11. Blogger System
12. Build
13. Byte Code
14. Cache
15. Chart
16. Chat
17. Code Analyzer
18. Collaboration
19. Content Management System
20. Database Client
21. Database DBMS
22. Database JDBC Connection Pool
23. Database ORM
24. Development
25. EJB Server geronimo
26. EJB Server GlassFish
27. EJB Server JBoss 4.2.1
28. EJB Server resin 3.1.5
29. ERP CRM Financial
30. ESB
31. Forum
32. GIS
33. Graphic Library
34. Groupware
35. HTML Parser
36. IDE
37. IDE Eclipse
38. IDE Netbeans
39. Installer
40. Internationalization Localization
41. Inversion of Control
42. Issue Tracking
43. J2EE
44. JBoss
45. JMS
46. JMX
47. Library
48. Mail Clients
49. Net
50. Parser
51. PDF
52. Portal
53. Profiler
54. Project Management
55. Report
56. RSS RDF
57. Rule Engine
58. Science
59. Scripting
60. Search Engine
61. Security
62. Servlet Container
63. Source Control
64. Swing Library
65. Template Engine
66. Test Coverage
67. Testing
68. UML
69. Web Crawler
70. Web Framework
71. Web Mail
72. Web Server
73. Web Services
74. Web Services apache cxf 2.0.1
75. Web Services AXIS2
76. Wiki Engine
77. Workflow Engines
78. XML
79. XML UI
Java
Java Tutorial
Java Open Source
Jar File Download
Java Articles
Java Products
Java by API
Photoshop Tutorials
Maya Tutorials
Flash Tutorials
3ds-Max Tutorials
Illustrator Tutorials
GIMP Tutorials
C# / C Sharp
C# / CSharp Tutorial
C# / CSharp Open Source
ASP.Net
ASP.NET Tutorial
JavaScript DHTML
JavaScript Tutorial
JavaScript Reference
HTML / CSS
HTML CSS Reference
C / ANSI-C
C Tutorial
C++
C++ Tutorial
Ruby
PHP
Python
Python Tutorial
Python Open Source
SQL Server / T-SQL
SQL Server / T-SQL Tutorial
Oracle PL / SQL
Oracle PL/SQL Tutorial
PostgreSQL
SQL / MySQL
MySQL Tutorial
VB.Net
VB.Net Tutorial
Flash / Flex / ActionScript
VBA / Excel / Access / Word
XML
XML Tutorial
Microsoft Office PowerPoint 2007 Tutorial
Microsoft Office Excel 2007 Tutorial
Microsoft Office Word 2007 Tutorial
Java Source Code / Java Documentation » Web Crawler » heritrix » org.archive.crawler.datamodel 
Source Cross Referenced  Class Diagram Java Document (Java Doc) 


001:        /* Copyright (C) 2003 Internet Archive.
002:         *
003:         * This file is part of the Heritrix web crawler (crawler.archive.org).
004:         *
005:         * Heritrix is free software; you can redistribute it and/or modify
006:         * it under the terms of the GNU Lesser Public License as published by
007:         * the Free Software Foundation; either version 2.1 of the License, or
008:         * any later version.
009:         *
010:         * Heritrix is distributed in the hope that it will be useful,
011:         * but WITHOUT ANY WARRANTY; without even the implied warranty of
012:         * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
013:         * GNU Lesser Public License for more details.
014:         *
015:         * You should have received a copy of the GNU Lesser Public License
016:         * along with Heritrix; if not, write to the Free Software
017:         * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
018:         *
019:         * RobotsExclusionPolicy.java
020:         * Created on Apr 17, 2003
021:         *
022:         * $Header$
023:         */
024:        package org.archive.crawler.datamodel;
025:
026:        import java.io.BufferedReader;
027:        import java.io.IOException;
028:        import java.io.ObjectInputStream;
029:        import java.io.ObjectOutputStream;
030:        import java.io.Serializable;
031:        import java.util.ArrayList;
032:        import java.util.HashMap;
033:        import java.util.Iterator;
034:        import java.util.LinkedList;
035:        import java.util.List;
036:        import java.util.logging.Level;
037:        import java.util.logging.Logger;
038:
039:        import org.apache.commons.httpclient.URIException;
040:        import org.archive.crawler.settings.CrawlerSettings;
041:
042:        /**
043:         * RobotsExclusionPolicy represents the actual policy adopted with 
044:         * respect to a specific remote server, usually constructed from 
045:         * consulting the robots.txt, if any, the server provided. 
046:         * 
047:         * (The similarly named RobotsHonoringPolicy, on the other hand, 
048:         * describes the strategy used by the crawler to determine to what
049:         * extent it respects exclusion rules.)
050:         * 
051:         * The expiration of policies after a suitable amount of time has
052:         * elapsed since last fetch is handled outside this class, in 
053:         * CrawlServer itself. 
054:         * 
055:         * @author gojomo
056:         *
057:         */
058:        public class RobotsExclusionPolicy implements  Serializable {
059:
060:            private static final long serialVersionUID = 6323907991237383113L;
061:
062:            private static final Logger logger = Logger
063:                    .getLogger(RobotsExclusionPolicy.class.getName());
064:
065:            private final static int NORMAL_TYPE = 0;
066:            private final static int ALLOWALL_TYPE = 1;
067:            private final static int DENYALL_TYPE = 2;
068:            private transient int type = NORMAL_TYPE;
069:
070:            public static RobotsExclusionPolicy ALLOWALL = new RobotsExclusionPolicy(
071:                    ALLOWALL_TYPE);
072:            public static RobotsExclusionPolicy DENYALL = new RobotsExclusionPolicy(
073:                    DENYALL_TYPE);
074:
075:            private LinkedList<String> userAgents = null;
076:            private HashMap<String, List<String>> disallows = null;
077:            transient RobotsHonoringPolicy honoringPolicy = null;
078:
079:            private String lastUsedUserAgent = null;
080:            private List<String> userAgentsToTest = null;
081:
082:            /**
083:             * @param settings 
084:             * @param reader
085:             * @param honoringPolicy
086:             * @return Robot exclusion policy.
087:             * @throws IOException
088:             */
089:            public static RobotsExclusionPolicy policyFor(
090:                    CrawlerSettings settings, BufferedReader reader,
091:                    RobotsHonoringPolicy honoringPolicy) throws IOException {
092:                LinkedList<String> userAgents = new LinkedList<String>();
093:                HashMap<String, List<String>> disallows = new HashMap<String, List<String>>();
094:                Robotstxt.parse(reader, userAgents, disallows);
095:                return (disallows.isEmpty()) ? ALLOWALL
096:                        : new RobotsExclusionPolicy(settings, userAgents,
097:                                disallows, honoringPolicy);
098:            }
099:
100:            /**
101:             * @param settings 
102:             * @param u
103:             * @param d
104:             * @param honoringPolicy
105:             */
106:            public RobotsExclusionPolicy(CrawlerSettings settings,
107:                    LinkedList<String> u, HashMap<String, List<String>> d,
108:                    RobotsHonoringPolicy honoringPolicy) {
109:                userAgents = u;
110:                disallows = d;
111:                this .honoringPolicy = honoringPolicy;
112:
113:                if (honoringPolicy == null)
114:                    return;
115:
116:                // If honoring policy is most favored user agent, all rules should be checked
117:                if (honoringPolicy.isType(settings,
118:                        RobotsHonoringPolicy.MOST_FAVORED)) {
119:                    userAgentsToTest = userAgents;
120:
121:                    // IF honoring policy is most favored of set, then make a list with only the set as members
122:                } else if (honoringPolicy.isType(settings,
123:                        RobotsHonoringPolicy.MOST_FAVORED_SET)) {
124:                    userAgentsToTest = new ArrayList<String>();
125:                    Iterator userAgentSet = honoringPolicy.getUserAgents(
126:                            settings).iterator();
127:                    while (userAgentSet.hasNext()) {
128:                        String userAgent = (String) userAgentSet.next();
129:
130:                        Iterator iter = userAgents.iterator();
131:                        while (iter.hasNext()) {
132:                            String ua = (String) iter.next();
133:                            if (userAgent.indexOf(ua) > -1) {
134:                                userAgentsToTest.add(ua);
135:                                break;
136:                            }
137:                        }
138:                    }
139:                }
140:            }
141:
142:            public RobotsExclusionPolicy(int type) {
143:                this (null, null, null, null);
144:                this .type = type;
145:            }
146:
147:            public boolean disallows(CrawlURI curi, String userAgent) {
148:                if (this  == ALLOWALL)
149:                    return false;
150:                if (this  == DENYALL)
151:                    return true;
152:
153:                // In the common case with policy=Classic, the useragent is remembered from uri to uri on
154:                // the same server
155:                if ((honoringPolicy.isType(curi, RobotsHonoringPolicy.CLASSIC) || honoringPolicy
156:                        .isType(curi, RobotsHonoringPolicy.CUSTOM))
157:                        && (lastUsedUserAgent == null || !lastUsedUserAgent
158:                                .equals(userAgent))) {
159:
160:                    lastUsedUserAgent = userAgent;
161:                    userAgentsToTest = new ArrayList<String>();
162:                    Iterator iter = userAgents.iterator();
163:                    String lowerCaseUserAgent = userAgent.toLowerCase();
164:                    while (iter.hasNext()) {
165:                        String ua = (String) iter.next();
166:                        // ua in below is already lowercase. See Robotstxt.java line 60. 
167:                        if (lowerCaseUserAgent.indexOf(ua) > -1) {
168:                            userAgentsToTest.add(ua);
169:                            break; // consider no more sections
170:                        }
171:                    }
172:                }
173:
174:                boolean disallow = false;
175:                boolean examined = false;
176:                String ua = null;
177:
178:                // Go thru list of all user agents we might act as
179:                Iterator uas = userAgentsToTest.iterator();
180:                while (uas.hasNext() && examined == false) {
181:                    disallow = false;
182:                    ua = (String) uas.next();
183:                    Iterator dis = ((List) disallows.get(ua)).iterator();
184:
185:                    // Check if the current user agent is allowed to crawl
186:                    while (dis.hasNext() && examined == false
187:                            && disallow == false) {
188:                        String disallowedPath = (String) dis.next();
189:                        if (disallowedPath.length() == 0) {
190:                            // blanket allow
191:                            examined = true;
192:                            disallow = false;
193:                            break;
194:                        }
195:                        try {
196:                            String p = curi.getUURI().getPathQuery();
197:                            if (p != null && p.startsWith(disallowedPath)) {
198:                                // the user agent tested isn't allowed to get this uri
199:                                disallow = true;
200:                            }
201:                        } catch (URIException e) {
202:                            logger.log(Level.SEVERE,
203:                                    "Failed getPathQuery from " + curi, e);
204:                        }
205:                    }
206:                    if (disallow == false) {
207:                        // the user agent tested is allowed
208:                        examined = true;
209:                    }
210:                }
211:
212:                // Are we supposed to masquerade as the user agent to which restrictions
213:                // we follow?
214:                if (honoringPolicy.shouldMasquerade(curi) && ua != null
215:                        && !ua.equals("")) {
216:                    curi.setUserAgent(ua);
217:                }
218:                return disallow;
219:            }
220:
221:            // Methods for object serialization.
222:
223:            /** If object is DENYALL or ALLOWALL, only the object identity and type
224:             * is written in the serialization stream.
225:             *
226:             * @param stream the serialization stream.
227:             * @throws IOException 
228:             */
229:            private void writeObject(ObjectOutputStream stream)
230:                    throws IOException {
231:                stream.writeInt(type);
232:                if (type == NORMAL_TYPE) {
233:                    stream.defaultWriteObject();
234:                }
235:            }
236:
237:            /** If object is DENYALL or ALLOWALL, only the object identity and type
238:             * is read from the serialization stream.
239:             *
240:             * @param stream the serialization stream.
241:             * @throws IOException 
242:             * @throws ClassNotFoundException 
243:             */
244:            private void readObject(ObjectInputStream stream)
245:                    throws IOException, ClassNotFoundException {
246:                type = stream.readInt();
247:                if (type == NORMAL_TYPE) {
248:                    stream.defaultReadObject();
249:                }
250:            }
251:
252:            /** If object is DENYALL or ALLOWALL, the object is replaced by constants
253:             * so that check for object equality works.
254:             * @return Object.
255:             */
256:            private Object readResolve() {
257:                if (type == NORMAL_TYPE) {
258:                    return this;
259:                } else if (type == ALLOWALL_TYPE) {
260:                    return ALLOWALL;
261:                } else if (type == DENYALL_TYPE) {
262:                    return DENYALL;
263:                }
264:                return null;
265:            }
266:
267:        }
www.java2java.com | Contact Us
Copyright 2009 - 12 Demo Source and Support. All rights reserved.
All other trademarks are property of their respective owners.