Source Code Cross Referenced for Kw3WriterProcessor.java in » Web-Crawler » heritrix » org » archive » crawler » writer » Java Source Code / Java Documentation | Java Source Code and Java Documentation

Java Source Code / Java Documentation
1. 6.0 JDK Core
2. 6.0 JDK Modules
3. 6.0 JDK Modules com.sun
4. 6.0 JDK Modules com.sun.java
5. 6.0 JDK Modules sun
6. 6.0 JDK Platform
7. Ajax
8. Apache Harmony Java SE
9. Aspect oriented
10. Authentication Authorization
11. Blogger System
12. Build
13. Byte Code
14. Cache
15. Chart
16. Chat
17. Code Analyzer
18. Collaboration
19. Content Management System
20. Database Client
21. Database DBMS
22. Database JDBC Connection Pool
23. Database ORM
24. Development
25. EJB Server geronimo
26. EJB Server GlassFish
27. EJB Server JBoss 4.2.1
28. EJB Server resin 3.1.5
29. ERP CRM Financial
30. ESB
31. Forum
32. GIS
33. Graphic Library
34. Groupware
35. HTML Parser
36. IDE
37. IDE Eclipse
38. IDE Netbeans
39. Installer
40. Internationalization Localization
41. Inversion of Control
42. Issue Tracking
43. J2EE
44. JBoss
45. JMS
46. JMX
47. Library
48. Mail Clients
49. Net
50. Parser
51. PDF
52. Portal
53. Profiler
54. Project Management
55. Report
56. RSS RDF
57. Rule Engine
58. Science
59. Scripting
60. Search Engine
61. Security
62. Servlet Container
63. Source Control
64. Swing Library
65. Template Engine
66. Test Coverage
67. Testing
68. UML
69. Web Crawler
70. Web Framework
71. Web Mail
72. Web Server
73. Web Services
74. Web Services apache cxf 2.0.1
75. Web Services AXIS2
76. Wiki Engine
77. Workflow Engines
78. XML
79. XML UI
Java
Java Tutorial
Java Open Source
Jar File Download
Java Articles
Java Products
Java by API
Photoshop Tutorials
Maya Tutorials
Flash Tutorials
3ds-Max Tutorials
Illustrator Tutorials
GIMP Tutorials
C# / C Sharp
C# / CSharp Tutorial
C# / CSharp Open Source
ASP.Net
ASP.NET Tutorial
JavaScript DHTML
JavaScript Tutorial
JavaScript Reference
HTML / CSS
HTML CSS Reference
C / ANSI-C
C Tutorial
C++
C++ Tutorial
Ruby
PHP
Python
Python Tutorial
Python Open Source
SQL Server / T-SQL
SQL Server / T-SQL Tutorial
Oracle PL / SQL
Oracle PL/SQL Tutorial
PostgreSQL
SQL / MySQL
MySQL Tutorial
VB.Net
VB.Net Tutorial
Flash / Flex / ActionScript
VBA / Excel / Access / Word
XML
XML Tutorial
Microsoft Office PowerPoint 2007 Tutorial
Microsoft Office Excel 2007 Tutorial
Microsoft Office Word 2007 Tutorial
Java Source Code / Java Documentation » Web Crawler » heritrix » org.archive.crawler.writer 
Source Cross Referenced  Class Diagram Java Document (Java Doc) 


001:        /* Created on 2006-okt-03
002:         *
003:         * Copyright (C) 2006 National Library of Sweden.
004:         *
005:         * This program is free software; you can redistribute it and/or
006:         * modify it under the terms of the GNU Lesser General Public License
007:         * as published by the Free Software Foundation; either version 2
008:         * of the License, or (at your option) any later version.
009:         *
010:         * This program is distributed in the hope that it will be useful,
011:         * but WITHOUT ANY WARRANTY; without even the implied warranty of
012:         * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
013:         * GNU Lesser General Public License for more details.
014:         *
015:         * You should have received a copy of the GNU Lesser General Public License
016:         * along with this program; if not, write to the Free Software
017:         * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
018:         */
019:
020:        package org.archive.crawler.writer;
021:
022:        import it.unimi.dsi.fastutil.io.FastBufferedOutputStream;
023:
024:        import java.io.ByteArrayOutputStream;
025:        import java.io.File;
026:        import java.io.FileOutputStream;
027:        import java.io.IOException;
028:        import java.io.OutputStream;
029:        import java.net.InetAddress;
030:        import java.security.MessageDigest;
031:        import java.security.NoSuchAlgorithmException;
032:        import java.util.logging.Level;
033:        import java.util.logging.Logger;
034:
035:        import javax.management.AttributeNotFoundException;
036:        import javax.management.MBeanException;
037:        import javax.management.ReflectionException;
038:
039:        import org.archive.crawler.datamodel.CoreAttributeConstants;
040:        import org.archive.crawler.datamodel.CrawlHost;
041:        import org.archive.crawler.datamodel.CrawlURI;
042:        import org.archive.crawler.framework.Processor;
043:        import org.archive.crawler.settings.SimpleType;
044:        import org.archive.crawler.settings.Type;
045:        import org.archive.io.ReplayInputStream;
046:        import org.archive.crawler.writer.Kw3Constants;
047:
048:        /**
049:         * Processor module that writes the results of successful fetches to
050:         * files on disk. These files are MIME-files of the type used by the
051:         * Swedish National Library's Kulturarw3 web harvesting [http://www.kb.se/kw3/].
052:         *  
053:         * Each URI gets written to its own file and has a path consisting of:
054:         * <ul>
055:         *  <li> A dir named with the first two chars of the website's md5. </li>
056:         *  <li> A dir named after the website. </li>
057:         *  <li> 'current' - a dir indicating that this is the directory being written
058:         *                   to by the ongoing crawl. </li>
059:         *  <li> A file on the format <md5 of url>.<fetchtime in seconds> </li>
060:         * </ul>
061:         * Example: '/53/www.kb.se/current/6879ad79c0ccf886ee8ca55d80e5d6a1.1169211837'
062:         * 
063:         * The MIME-file itself consists of three parts:
064:         * <ul>
065:         *  <li> 1. ArchiveInfo - Metadata about the file and its content. </li>
066:         *  <li> 2. Header - The HTTP response header. </li>
067:         *  <li> 3. Content - The HTTP response content, plus content-type. </li>
068:         * </ul>
069:         * 
070:         * @author oskar
071:         */
072:        public class Kw3WriterProcessor extends Processor implements 
073:                CoreAttributeConstants, Kw3Constants {
074:
075:            private static final long serialVersionUID = 7171448068924684594L;
076:
077:            private static String COLON = ":";
078:            private static String WS = " ";
079:            private static String LF = "\n";
080:
081:            /**
082:             * Logger.
083:             */
084:            private static final Logger logger = Logger
085:                    .getLogger(Kw3WriterProcessor.class.getName());
086:
087:            /**
088:             * Key to use asking settings for arc path value.
089:             */
090:            public static final String ATTR_PATH = "path";
091:
092:            /**
093:             * Default path.
094:             */
095:            private static final String DEFAULT_PATH = "arcs";
096:
097:            /**
098:             * Key to use asking settings for max size value.
099:             */
100:            public static final String ATTR_MAX_SIZE_BYTES = "max-size-bytes";
101:
102:            /**
103:             * Default max file size.
104:             */
105:            public static final int DEFAULT_MAX_FILE_SIZE = 10000000;
106:
107:            /**
108:             * Key to use asking settings if chmod should be execuated .
109:             */
110:            public static final String ATTR_CHMOD = "chmod";
111:
112:            /**
113:             * Key to use asking settings for the new chmod value.
114:             */
115:            public static final String ATTR_CHMOD_VALUE = "chmod-value";
116:
117:            /**
118:             * Default value for permissions.
119:             */
120:            public static final String DEFAULT_CHMOD_VALUE = "777";
121:
122:            /**
123:             * Key for the maximum ARC bytes to write attribute.
124:             */
125:            public static final String ATTR_MAX_BYTES_WRITTEN = "total-bytes-to-write";
126:
127:            /**
128:             * Key for the collection attribute.
129:             */
130:            public static final String ATTR_COLLECTION = "collection";
131:
132:            /**
133:             * Default value for collection.
134:             */
135:            public static final String DEFAULT_COLLECTION_VALUE = "kw3";
136:
137:            /**
138:             * Key for the harvester attribute.
139:             */
140:            public static final String ATTR_HARVESTER = "harvester";
141:
142:            /**
143:             * Default value for harvester.
144:             */
145:            public static final String DEFAULT_HARVESTER_VALUE = "heritrix";
146:
147:            private static String BOUNDARY_START = "KulturArw3_";
148:
149:            /*
150:             * Private members for settings
151:             */
152:            private File arcsDir;
153:
154:            private boolean chmod;
155:
156:            private String chmodValue;
157:
158:            private int maxSize;
159:
160:            private String collection;
161:
162:            private String harvester;
163:
164:            /**
165:             * @param name Name of this processor.
166:             */
167:            public Kw3WriterProcessor(String name) {
168:                super (
169:                        name,
170:                        "Kw3Writer processor. "
171:                                + "A writer that writes files in the MIME format of The "
172:                                + "Swedish National Library.  See this class's javadoc for"
173:                                + "format exposition.");
174:                Type e;
175:                e = addElementToDefinition(new SimpleType(ATTR_PATH,
176:                        "Top-level directory for archive files.", DEFAULT_PATH));
177:                e.setOverrideable(false);
178:                e = addElementToDefinition(new SimpleType(ATTR_COLLECTION,
179:                        "Name of collection.", DEFAULT_COLLECTION_VALUE));
180:                e.setOverrideable(false);
181:                e = addElementToDefinition(new SimpleType(
182:                        ATTR_HARVESTER,
183:                        "Name of the harvester that is used for the web harvesting.",
184:                        DEFAULT_HARVESTER_VALUE));
185:                e.setOverrideable(false);
186:                e = addElementToDefinition(new SimpleType(ATTR_MAX_SIZE_BYTES,
187:                        "Max size of each file", new Integer(
188:                                DEFAULT_MAX_FILE_SIZE)));
189:                e.setOverrideable(false);
190:                e = addElementToDefinition(new SimpleType(
191:                        ATTR_CHMOD,
192:                        "Should permissions be changed for the newly created dirs",
193:                        new Boolean(true)));
194:                e.setOverrideable(false);
195:                e = addElementToDefinition(new SimpleType(
196:                        ATTR_CHMOD_VALUE,
197:                        "What should the permissions be set to."
198:                                + " Given as three octal digits, as to the UNIX 'chmod' command."
199:                                + " Ex. 777 for all permissions to everyone.",
200:                        DEFAULT_CHMOD_VALUE));
201:                e.setOverrideable(false);
202:            }
203:
204:            protected void initialTasks() {
205:                try {
206:                    String arcsDirPath = (String) getAttribute(ATTR_PATH);
207:                    this .arcsDir = new File(arcsDirPath);
208:                    if (!this .arcsDir.isAbsolute())
209:                        this .arcsDir = new File(getController().getDisk(),
210:                                arcsDirPath);
211:
212:                    this .collection = (String) getAttribute(ATTR_COLLECTION);
213:                    this .harvester = (String) getAttribute(ATTR_HARVESTER);
214:                    this .chmod = (Boolean) getAttribute(ATTR_CHMOD);
215:                    this .chmodValue = (String) getAttribute(ATTR_CHMOD_VALUE);
216:                    this .maxSize = (Integer) getAttribute(ATTR_MAX_SIZE_BYTES);
217:                } catch (AttributeNotFoundException e) {
218:                    logger.log(Level.WARNING, "attribute error", e);
219:                } catch (MBeanException e) {
220:                    logger.log(Level.WARNING, "attribute error", e);
221:                } catch (ReflectionException e) {
222:                    logger.log(Level.WARNING, "attribute error", e);
223:                }
224:            }
225:
226:            protected void innerProcess(CrawlURI curi) {
227:                // Only successful fetches are written.
228:                if (!curi.isSuccess())
229:                    return;
230:                // Only http and https schemes are supported.
231:                String scheme = curi.getUURI().getScheme().toLowerCase();
232:                if (!"http".equalsIgnoreCase(scheme)
233:                        && !"https".equalsIgnoreCase(scheme))
234:                    return;
235:
236:                // Write the MIME-file
237:                try {
238:                    writeMimeFile(curi);
239:                } catch (IOException e) {
240:                    logger.log(Level.WARNING, "i/o error", e);
241:                }
242:            }
243:
244:            /*
245:             * The actual writing of the Kulturarw3 MIME-file.
246:             * 
247:             * The MIME-file consists of three parts:
248:             * 1. ArchiveInfo - Metadata about the file and its content.
249:             * 2. Header - The HTTP response header.
250:             * 3. Content - The HTTP response content, plus content-type.
251:             * 
252:             * For more on this format, see '?'.
253:             */
254:            protected void writeMimeFile(CrawlURI curi) throws IOException {
255:                ReplayInputStream ris = null;
256:                OutputStream out = null;
257:
258:                try {
259:                    String boundary = BOUNDARY_START
260:                            + stringToMD5(curi.toString());
261:                    ris = curi.getHttpRecorder().getRecordedInput()
262:                            .getReplayInputStream();
263:                    out = initOutputStream(curi);
264:
265:                    // Part 1: Archive info
266:                    writeArchiveInfoPart(boundary, curi, ris, out);
267:
268:                    // Part 2: Header info + HTTP header
269:                    writeHeaderPart(boundary, ris, out);
270:
271:                    // Part 3: Content info + HTTP content
272:                    writeContentPart(boundary, curi, ris, out);
273:
274:                    // And finally the terminator string
275:                    String terminator = "\n--" + boundary + "--\n";
276:                    out.write(terminator.getBytes());
277:                } finally {
278:                    if (ris != null)
279:                        ris.close();
280:                    if (out != null)
281:                        out.close();
282:                }
283:            }
284:
285:            /*
286:             * Get the OutputStream for the file to write to.
287:             * 
288:             * It has a path consisting of:
289:             * 1. A dir named with the first two chars of the website's md5.
290:             * 2. A dir named after the website.
291:             * 3. 'current' - a dir indicating that this is the directory being written
292:             *                to by the ongoing crawl. 
293:             * 4. A file on the format <md5 of url>.<fetchtime in seconds>
294:             * 
295:             * Example: '/53/www.kb.se/current/6879ad79c0ccf886ee8ca55d80e5d6a1.1169211837'            
296:             */
297:            protected OutputStream initOutputStream(CrawlURI curi)
298:                    throws IOException {
299:                String uri = curi.toString();
300:                int port = curi.getUURI().getPort();
301:                String host = (port == 80 || port <= 0) ? curi.getUURI()
302:                        .getHost() : curi.getUURI().getHost() + ":" + port;
303:                long fetchTime = curi.getLong(A_FETCH_BEGAN_TIME) / 1000;
304:
305:                String md5 = stringToMD5(host);
306:                File dir = new File(this .arcsDir, md5.substring(0, 2) + "/"
307:                        + host + "/current");
308:                if (!dir.exists()) {
309:                    dir.mkdirs();
310:                    if (this .chmod)
311:                        chmods(dir, this .arcsDir);
312:                }
313:                md5 = stringToMD5(uri);
314:                File arcFile = new File(dir, md5 + "." + fetchTime);
315:                return new FastBufferedOutputStream(new FileOutputStream(
316:                        arcFile));
317:            }
318:
319:            protected void writeArchiveInfoPart(String boundary, CrawlURI curi,
320:                    ReplayInputStream ris, OutputStream out) throws IOException {
321:                // Get things we need to write in this part
322:                String uri = curi.toString();
323:                String ip = getHostAddress(curi);
324:                long headerLength = ris.getHeaderSize();
325:                long contentLength = ris.getContentSize();
326:                long archiveTime = System.currentTimeMillis() / 1000; // Fetchtime in seconds
327:                int statusCode = curi.getFetchStatus();
328:                String headerMd5 = null;
329:                Object contentMd5 = null;
330:
331:                // Get headerMd5
332:                ByteArrayOutputStream baos = new ByteArrayOutputStream();
333:                ris.readHeaderTo(baos);
334:                headerMd5 = stringToMD5(baos.toString());
335:
336:                // Get contentMd5
337:                contentMd5 = curi.getContentDigest();
338:                if (contentMd5 != null)
339:                    contentMd5 = getHexString((byte[]) contentMd5);
340:
341:                StringBuffer buffer = new StringBuffer();
342:                buffer.append("MIME-version: 1.1" + LF);
343:                buffer.append("Content-Type: multipart/mixed; boundary="
344:                        + boundary + LF);
345:                buffer.append("HTTP-Part: ArchiveInfo" + LF);
346:                buffer.append(COLLECTION_KEY + COLON + WS + this .collection
347:                        + LF);
348:                buffer.append(HARVESTER_KEY + COLON + WS + this .harvester + LF);
349:                buffer.append(URL_KEY + COLON + WS + uri + LF);
350:                buffer.append(IP_ADDRESS_KEY + COLON + WS + ip + LF);
351:                buffer.append(HEADER_LENGTH_KEY + COLON + WS + headerLength
352:                        + LF);
353:                buffer.append(HEADER_MD5_KEY + COLON + WS + headerMd5 + LF);
354:                buffer.append(CONTENT_LENGTH_KEY + COLON + WS + contentLength
355:                        + LF);
356:                buffer.append(CONTENT_MD5_KEY + COLON + WS + contentMd5 + LF);
357:                buffer.append(ARCHIVE_TIME_KEY + COLON + WS + archiveTime + LF);
358:                buffer.append(STATUS_CODE_KEY + COLON + WS + statusCode + LF
359:                        + LF);
360:                out.write(buffer.toString().getBytes());
361:            }
362:
363:            protected void writeHeaderPart(String boundary,
364:                    ReplayInputStream ris, OutputStream out) throws IOException {
365:                StringBuffer buffer = new StringBuffer();
366:                buffer.append("--" + boundary + LF);
367:                buffer.append("Content-Type: text/plain; charset=\"US-ascii\""
368:                        + LF);
369:                buffer.append("HTTP-Part: Header" + LF + LF);
370:                out.write(buffer.toString().getBytes());
371:                ris.readHeaderTo(out);
372:            }
373:
374:            protected void writeContentPart(String boundary, CrawlURI curi,
375:                    ReplayInputStream ris, OutputStream out) throws IOException {
376:                // Get things we need to write in this part
377:                String uri = curi.toString();
378:                String contentType = curi.getContentType();
379:                long contentLength = ris.getContentSize();
380:                // Only write content if there is some
381:                if (contentLength == 0)
382:                    return;
383:
384:                StringBuffer buffer = new StringBuffer();
385:                buffer.append("--" + boundary + LF);
386:                buffer.append("Content-Type: " + contentType + LF);
387:                buffer.append("HTTP-Part: Content" + LF + LF);
388:                out.write(buffer.toString().getBytes());
389:
390:                if (contentLength > this .maxSize) {
391:                    ris.readContentTo(out, this .maxSize);
392:                    logger.info(" Truncated url: " + uri + ", Size: "
393:                            + contentLength + ", Content-type: " + contentType);
394:                } else {
395:                    ris.readContentTo(out);
396:                }
397:            }
398:
399:            // --- Private helper functions --- //
400:            /*
401:             * Get a MD5 checksum based on a String. 
402:             */
403:            private String stringToMD5(String str) {
404:                try {
405:                    byte b[] = str.getBytes();
406:                    MessageDigest md = MessageDigest.getInstance("MD5");
407:                    md.update(b);
408:                    byte[] digest = md.digest();
409:                    return getHexString(digest);
410:                } catch (NoSuchAlgorithmException e) {
411:                    logger.log(Level.WARNING, "md5 error", e);
412:                }
413:                return null;
414:            }
415:
416:            /* 
417:             * Fast convert a byte array to a hex string with possible leading zero.
418:             */
419:            private String getHexString(byte[] b) {
420:                StringBuffer sb = new StringBuffer();
421:                for (int i = 0; i < b.length; i++) {
422:                    String tmp = Integer.toHexString(b[i] & 0xff);
423:                    if (tmp.length() < 2)
424:                        sb.append("0" + tmp);
425:                    else
426:                        sb.append(tmp);
427:                }
428:                return sb.toString();
429:            }
430:
431:            /* 
432:             * Chmods for all newly created directories.
433:             */
434:            private void chmods(File dir, File arcsDir) {
435:                String topdir = arcsDir.getAbsolutePath();
436:                chmod(dir, this .chmodValue);
437:                File parent = dir.getParentFile();
438:                while (!parent.getAbsolutePath().equalsIgnoreCase((topdir))) {
439:                    chmod(parent, this .chmodValue);
440:                    parent = parent.getParentFile();
441:                }
442:
443:            }
444:
445:            /* 
446:             * Chmod for a specific file or directory.
447:             */
448:            private void chmod(File file, String permissions) {
449:                Process proc = null;
450:                try {
451:                    proc = Runtime.getRuntime().exec(
452:                            "chmod " + permissions + " "
453:                                    + file.getAbsolutePath());
454:                    proc.waitFor();
455:                    proc.getInputStream().close();
456:                    proc.getOutputStream().close();
457:                    proc.getErrorStream().close();
458:                } catch (IOException e) {
459:                    logger.log(Level.WARNING, "chmod failed", e);
460:                } catch (InterruptedException e) {
461:                    logger.log(Level.WARNING, "chmod failed", e);
462:                }
463:            }
464:
465:            private String getHostAddress(CrawlURI curi) {
466:                CrawlHost h = getController().getServerCache().getHostFor(curi);
467:                if (h == null) {
468:                    throw new NullPointerException("Crawlhost is null for "
469:                            + curi + " " + curi.getVia());
470:                }
471:                InetAddress a = h.getIP();
472:                if (a == null) {
473:                    throw new NullPointerException(
474:                            "Address is null for "
475:                                    + curi
476:                                    + " "
477:                                    + curi.getVia()
478:                                    + ". Address "
479:                                    + ((h.getIpFetched() == CrawlHost.IP_NEVER_LOOKED_UP) ? "was never looked up."
480:                                            : (System.currentTimeMillis() - h
481:                                                    .getIpFetched())
482:                                                    + " ms ago."));
483:                }
484:                return h.getIP().getHostAddress();
485:            }
486:        }
www.java2java.com | Contact Us
Copyright 2009 - 12 Demo Source and Support. All rights reserved.
All other trademarks are property of their respective owners.