Source Code Cross Referenced for JoBoBase.java in  » Web-Crawler » JoBo » net » matuschek » jobo » Java Source Code / Java Documentation | Java Source Code and Java Documentation

Java Source Code / Java Documentation
1. 6.0 JDK Core
2. 6.0 JDK Modules
3. 6.0 JDK Modules com.sun
4. 6.0 JDK Modules com.sun.java
5. 6.0 JDK Modules sun
6. 6.0 JDK Platform
7. Ajax
8. Apache Harmony Java SE
9. Aspect oriented
10. Authentication Authorization
11. Blogger System
12. Build
13. Byte Code
14. Cache
15. Chart
16. Chat
17. Code Analyzer
18. Collaboration
19. Content Management System
20. Database Client
21. Database DBMS
22. Database JDBC Connection Pool
23. Database ORM
24. Development
25. EJB Server geronimo
26. EJB Server GlassFish
27. EJB Server JBoss 4.2.1
28. EJB Server resin 3.1.5
29. ERP CRM Financial
30. ESB
31. Forum
32. GIS
33. Graphic Library
34. Groupware
35. HTML Parser
36. IDE
37. IDE Eclipse
38. IDE Netbeans
39. Installer
40. Internationalization Localization
41. Inversion of Control
42. Issue Tracking
43. J2EE
44. JBoss
45. JMS
46. JMX
47. Library
48. Mail Clients
49. Net
50. Parser
51. PDF
52. Portal
53. Profiler
54. Project Management
55. Report
56. RSS RDF
57. Rule Engine
58. Science
59. Scripting
60. Search Engine
61. Security
62. Servlet Container
63. Source Control
64. Swing Library
65. Template Engine
66. Test Coverage
67. Testing
68. UML
69. Web Crawler
70. Web Framework
71. Web Mail
72. Web Server
73. Web Services
74. Web Services apache cxf 2.0.1
75. Web Services AXIS2
76. Wiki Engine
77. Workflow Engines
78. XML
79. XML UI
Java
Java Tutorial
Java Open Source
Jar File Download
Java Articles
Java Products
Java by API
Photoshop Tutorials
Maya Tutorials
Flash Tutorials
3ds-Max Tutorials
Illustrator Tutorials
GIMP Tutorials
C# / C Sharp
C# / CSharp Tutorial
C# / CSharp Open Source
ASP.Net
ASP.NET Tutorial
JavaScript DHTML
JavaScript Tutorial
JavaScript Reference
HTML / CSS
HTML CSS Reference
C / ANSI-C
C Tutorial
C++
C++ Tutorial
Ruby
PHP
Python
Python Tutorial
Python Open Source
SQL Server / T-SQL
SQL Server / T-SQL Tutorial
Oracle PL / SQL
Oracle PL/SQL Tutorial
PostgreSQL
SQL / MySQL
MySQL Tutorial
VB.Net
VB.Net Tutorial
Flash / Flex / ActionScript
VBA / Excel / Access / Word
XML
XML Tutorial
Microsoft Office PowerPoint 2007 Tutorial
Microsoft Office Excel 2007 Tutorial
Microsoft Office Word 2007 Tutorial
Java Source Code / Java Documentation » Web Crawler » JoBo » net.matuschek.jobo 
Source Cross Referenced  Class Diagram Java Document (Java Doc) 


001:        package net.matuschek.jobo;
002:
003:        /************************************************
004:         Copyright (c) 2001/2002 by Daniel Matuschek
005:         *************************************************/
006:
007:        import java.io.File;
008:        import java.io.FileWriter;
009:        import java.io.Writer;
010:
011:        import net.matuschek.http.DownloadRuleSet;
012:        import net.matuschek.http.HttpDocToFile;
013:        import net.matuschek.http.HttpToolCallback;
014:        import net.matuschek.spider.RegExpURLCheck;
015:        import net.matuschek.spider.WebRobot;
016:        import net.matuschek.spider.WebRobotCallback;
017:        import net.matuschek.spider.docfilter.FilterChain;
018:        import net.matuschek.spider.docfilter.LinkLocalizer;
019:
020:        import org.apache.log4j.Category;
021:
022:        import org.exolab.castor.mapping.Mapping;
023:        import org.exolab.castor.xml.Marshaller;
024:        import org.exolab.castor.xml.Unmarshaller;
025:
026:        import org.xml.sax.InputSource;
027:
028:        /**
029:         * This is a simple class that contains all needed features for JoBo
030:         * (the web robot, the download rules, RegExpUrlCheck ...)
031:         * 
032:         * @author Daniel Matuschek
033:         * @version $Revision: 1.21 $
034:         */
035:        public class JoBoBase {
036:
037:            /** Log4J logging */
038:            private static Category log = Category.getInstance("");
039:
040:            /** The file used for XML->Java mapping */
041:            private static String mappingfile = "mapping.xml";
042:
043:            /** The jobo configuration in XML */
044:            private static String xmlconfig = "jobo.xml";
045:
046:            /** Start URL for the robot */
047:            //  private static String startUrl=null;
048:            private String storageDirectory = "/tmp";
049:            private WebRobot robot = null;
050:            private RegExpURLCheck urlcheck = null;
051:            private DownloadRuleSet downloadrules = null;
052:            private HttpDocToFile docstore = null;
053:
054:            /** Filter to localize included links */
055:            private LinkLocalizer linkLocalizer = null;
056:
057:            /** FilterChains with all filters */
058:            private FilterChain filters = null;
059:
060:            /**
061:             * @exception ClassNotFoundException if the Robot could not be instantiated
062:             * for some reason
063:             */
064:            public JoBoBase() throws ClassNotFoundException {
065:                log = Category.getInstance(this .getClass());
066:                docstore = new HttpDocToFile(storageDirectory);
067:                initializeFilters();
068:                robot = new WebRobot();
069:                robot.setFilters(filters);
070:            }
071:
072:            /**
073:             * Set the default filter chain
074:             */
075:            public void initializeFilters() {
076:                filters = new FilterChain();
077:                linkLocalizer = new LinkLocalizer();
078:                filters.add(linkLocalizer);
079:            }
080:
081:            /**
082:             * write the settings to an XML file
083:             */
084:            public void saveConfig(String filename) {
085:                File f1 = new File(mappingfile);
086:
087:                if (f1.exists()) {
088:                    Mapping mapping = new Mapping();
089:                    try {
090:                        mapping.loadMapping(mappingfile);
091:                        Writer writer = new FileWriter(filename);
092:                        Marshaller marshaller = new Marshaller(writer);
093:                        marshaller.setMapping(mapping);
094:                        marshaller.marshal(this );
095:                        writer.close();
096:
097:                        log.info("written to XML");
098:                    } catch (Exception e) {
099:                        log.error(e.getMessage());
100:                        e.printStackTrace();
101:                    }
102:                } else {
103:                    log.error("mapping and/or configfile not found");
104:                }
105:            }
106:
107:            public void registerHttpToolCallback(HttpToolCallback cb) {
108:                robot.setHttpToolCallback(cb);
109:            }
110:
111:            public void registerWebRobotCallback(WebRobotCallback cb) {
112:                robot.setWebRobotCallback(cb);
113:            }
114:
115:            /**
116:             * registers the regexpurlcheck and the download rules with the robot
117:             */
118:            public void configureRobot() {
119:                robot.setURLCheck(urlcheck);
120:                robot.setDownloadRuleSet(downloadrules);
121:                robot.setDocManager(docstore);
122:                robot.setFilters(filters);
123:            }
124:
125:            /**
126:             * Get the value of urlcheck.
127:             * @return Value of urlcheck.
128:             */
129:            public RegExpURLCheck getURLCheck() {
130:                return urlcheck;
131:            }
132:
133:            /**
134:             * Set the value of urlcheck.
135:             * @param v  Value to assign to urlcheck.
136:             */
137:            public void setURLCheck(RegExpURLCheck urlcheck) {
138:                this .urlcheck = urlcheck;
139:            }
140:
141:            /**
142:             * Get the value of robot.
143:             * @return Value of robot.
144:             */
145:            public WebRobot getRobot() {
146:                return robot;
147:            }
148:
149:            /**
150:             * Set the value of robot. The new Robot will use the
151:             * filter that are defined in JoBoBase, even if he had
152:             * its own FilterChain before.
153:             *
154:             * @param robot WebRobot object to use
155:             */
156:            public void setRobot(WebRobot robot) {
157:                this .robot = robot;
158:                robot.setFilters(filters);
159:            }
160:
161:            /**
162:             * Localize links ?
163:             *
164:             * @param localize if this is true, JoBo will trz to replace
165:             * absolute links by relative
166:             */
167:            public void setLocalizeLinks(boolean localize) {
168:                if (localize) {
169:                    linkLocalizer.enable();
170:                } else {
171:                    linkLocalizer.disable();
172:                }
173:            }
174:
175:            /**
176:             * is link localization enabled ?
177:             */
178:            public boolean getLocalizeLinks() {
179:                return linkLocalizer.isEnabled();
180:            }
181:
182:            /**
183:             * Get the value of downloadRules.
184:             * @return Value of downloadRules.
185:             */
186:            public DownloadRuleSet getDownloadRuleSet() {
187:                return downloadrules;
188:            }
189:
190:            /**
191:             * Set the value of downloadRules.
192:             * @param v  Value to assign to downloadRules.
193:             */
194:            public void setDownloadRuleSet(DownloadRuleSet downloadRuleSet) {
195:                this .downloadrules = downloadRuleSet;
196:            }
197:
198:            /**
199:             * Get the value of storageDirectory.
200:             * @return Value of storageDirectory.
201:             */
202:            public String getStorageDirectory() {
203:                return storageDirectory;
204:            }
205:
206:            /**
207:             * Set the value of storageDirectory.
208:             * @param v  Value to assign to storageDirectory.
209:             */
210:            public void setStorageDirectory(String storageDirectory) {
211:                this .storageDirectory = storageDirectory;
212:                docstore.setBaseDir(storageDirectory);
213:            }
214:
215:            /**
216:             * Enable/disable storing of dynamic documents (with an "?"
217:             * somewhere in the URL
218:             *
219:             * @param v true: enable storing of <b>all</b> documents, 
220:             *          false: store only documents with an URL without "?"
221:             */
222:            public void setStoreCGI(boolean storeCGI) {
223:                this .docstore.setStoreCGI(storeCGI);
224:            }
225:
226:            /**
227:             * Get the status of storeCGI
228:             *
229:             * @return the current status of storeCGI
230:             * @see #setStoreCGI for more information
231:             */
232:            public boolean getStoreCGI() {
233:                return this .docstore.getStoreCGI();
234:            }
235:
236:            /**
237:             * Unmarshall the object from an XML file (jobo.xml) in the current
238:             * directory
239:             *
240:             * @exception ClassNotFoundException if the Robot could not be instantiated
241:             * for some reason   
242:             */
243:            public static JoBoBase createFromXML()
244:                    throws ClassNotFoundException {
245:                return createFromXML(".");
246:            }
247:
248:            /** 
249:             * Unmarshall the object from an XML file
250:             *
251:             * @param configDirectory name of the directory where jobo.xml and
252:             * mapping.xml should be read from.
253:             * @exception ClassNotFoundException if the Robot could not be instantiated
254:             * for some reason   
255:             */
256:            public static JoBoBase createFromXML(String configDirectory)
257:                    throws ClassNotFoundException {
258:                JoBoBase baseobj = null;
259:
260:                xmlconfig = "jobo.xml";
261:
262:                File f1 = new File(configDirectory + File.separatorChar
263:                        + mappingfile);
264:                File f2 = new File(configDirectory + File.separatorChar
265:                        + xmlconfig);
266:
267:                if (f1.exists() && f2.exists()) {
268:                    Mapping mapping = new Mapping();
269:                    try {
270:                        mapping.loadMapping(f1.getPath());
271:                        Unmarshaller unmar = new Unmarshaller(mapping);
272:                        unmar.setDebug(true);
273:                        baseobj = (JoBoBase) unmar.unmarshal(new InputSource(f2
274:                                .getPath()));
275:
276:                        log.info("configured from XML");
277:
278:                    } catch (Exception e) {
279:                        log.error(e.getMessage());
280:                        e.printStackTrace();
281:                    }
282:                } else {
283:                    log.error("mapping and/or configfile not found");
284:                }
285:
286:                if (baseobj == null) {
287:                    baseobj = new JoBoBase();
288:                }
289:
290:                baseobj.configureRobot();
291:
292:                return baseobj;
293:            }
294:
295:        } // JoBoBase
www.java2java.com | Contact Us
Copyright 2009 - 12 Demo Source and Support. All rights reserved.
All other trademarks are property of their respective owners.