Source Code Cross Referenced for ExperimentalWARCWriterTest.java in  » Web-Crawler » heritrix » org » archive » io » warc » v10 » Java Source Code / Java DocumentationJava Source Code and Java Documentation

Java Source Code / Java Documentation
1. 6.0 JDK Core
2. 6.0 JDK Modules
3. 6.0 JDK Modules com.sun
4. 6.0 JDK Modules com.sun.java
5. 6.0 JDK Modules sun
6. 6.0 JDK Platform
7. Ajax
8. Apache Harmony Java SE
9. Aspect oriented
10. Authentication Authorization
11. Blogger System
12. Build
13. Byte Code
14. Cache
15. Chart
16. Chat
17. Code Analyzer
18. Collaboration
19. Content Management System
20. Database Client
21. Database DBMS
22. Database JDBC Connection Pool
23. Database ORM
24. Development
25. EJB Server geronimo
26. EJB Server GlassFish
27. EJB Server JBoss 4.2.1
28. EJB Server resin 3.1.5
29. ERP CRM Financial
30. ESB
31. Forum
32. GIS
33. Graphic Library
34. Groupware
35. HTML Parser
36. IDE
37. IDE Eclipse
38. IDE Netbeans
39. Installer
40. Internationalization Localization
41. Inversion of Control
42. Issue Tracking
43. J2EE
44. JBoss
45. JMS
46. JMX
47. Library
48. Mail Clients
49. Net
50. Parser
51. PDF
52. Portal
53. Profiler
54. Project Management
55. Report
56. RSS RDF
57. Rule Engine
58. Science
59. Scripting
60. Search Engine
61. Security
62. Servlet Container
63. Source Control
64. Swing Library
65. Template Engine
66. Test Coverage
67. Testing
68. UML
69. Web Crawler
70. Web Framework
71. Web Mail
72. Web Server
73. Web Services
74. Web Services apache cxf 2.0.1
75. Web Services AXIS2
76. Wiki Engine
77. Workflow Engines
78. XML
79. XML UI
Java
Java Tutorial
Java Open Source
Jar File Download
Java Articles
Java Products
Java by API
Photoshop Tutorials
Maya Tutorials
Flash Tutorials
3ds-Max Tutorials
Illustrator Tutorials
GIMP Tutorials
C# / C Sharp
C# / CSharp Tutorial
C# / CSharp Open Source
ASP.Net
ASP.NET Tutorial
JavaScript DHTML
JavaScript Tutorial
JavaScript Reference
HTML / CSS
HTML CSS Reference
C / ANSI-C
C Tutorial
C++
C++ Tutorial
Ruby
PHP
Python
Python Tutorial
Python Open Source
SQL Server / T-SQL
SQL Server / T-SQL Tutorial
Oracle PL / SQL
Oracle PL/SQL Tutorial
PostgreSQL
SQL / MySQL
MySQL Tutorial
VB.Net
VB.Net Tutorial
Flash / Flex / ActionScript
VBA / Excel / Access / Word
XML
XML Tutorial
Microsoft Office PowerPoint 2007 Tutorial
Microsoft Office Excel 2007 Tutorial
Microsoft Office Word 2007 Tutorial
Java Source Code / Java Documentation » Web Crawler » heritrix » org.archive.io.warc.v10 
Source Cross Referenced  Class Diagram Java Document (Java Doc) 


001:        /*
002:         * ExperimentalWARCWriterTest
003:         *
004:         * $Id: ExperimentalWARCWriterTest.java 4554 2006-08-30 02:35:48Z stack-sf $
005:         *
006:         * Created on July 27th, 2006
007:         *
008:         * Copyright (C) 2006 Internet Archive.
009:         *
010:         * This file is part of the Heritrix web crawler (crawler.archive.org).
011:         *
012:         * Heritrix is free software; you can redistribute it and/or modify
013:         * it under the terms of the GNU Lesser Public License as published by
014:         * the Free Software Foundation; either version 2.1 of the License, or
015:         * any later version.
016:         *
017:         * Heritrix is distributed in the hope that it will be useful,
018:         * but WITHOUT ANY WARRANTY; without even the implied warranty of
019:         * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
020:         * GNU Lesser Public License for more details.
021:         *
022:         * You should have received a copy of the GNU Lesser Public License
023:         * along with Heritrix; if not, write to the Free Software
024:         * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
025:         */
026:        package org.archive.io.warc.v10;
027:
028:        import java.io.ByteArrayInputStream;
029:        import java.io.ByteArrayOutputStream;
030:        import java.io.File;
031:        import java.io.FileNotFoundException;
032:        import java.io.IOException;
033:        import java.net.URI;
034:        import java.net.URISyntaxException;
035:        import java.util.Arrays;
036:        import java.util.Iterator;
037:        import java.util.List;
038:        import java.util.concurrent.atomic.AtomicInteger;
039:
040:        import org.archive.io.ArchiveRecord;
041:        import org.archive.io.ArchiveRecordHeader;
042:        import org.archive.io.UTF8Bytes;
043:        import org.archive.io.WriterPoolMember;
044:        import org.archive.io.warc.WARCConstants;
045:        import org.archive.uid.GeneratorFactory;
046:        import org.archive.util.ArchiveUtils;
047:        import org.archive.util.TmpDirTestCase;
048:        import org.archive.util.anvl.ANVLRecord;
049:
050:        /**
051:         * Test Writer and Reader.
052:         * @author stack
053:         * @version $Date: 2006-08-29 19:35:48 -0700 (Tue, 29 Aug 2006) $ $Version$
054:         */
055:        public class ExperimentalWARCWriterTest extends TmpDirTestCase
056:                implements  WARCConstants {
057:            private static final AtomicInteger SERIAL_NO = new AtomicInteger();
058:
059:            /**
060:             * Prefix to use for ARC files made by JUNIT.
061:             */
062:            private static final String PREFIX = "IAH";
063:
064:            private static final String SOME_URL = "http://www.archive.org/test/";
065:
066:            public void testCheckHeaderLineValue() throws Exception {
067:                ExperimentalWARCWriter writer = new ExperimentalWARCWriter();
068:                writer.checkHeaderLineParameters("one");
069:                IOException exception = null;
070:                try {
071:                    writer.checkHeaderLineParameters("with space");
072:                } catch (IOException e) {
073:                    exception = e;
074:                }
075:                assertNotNull(exception);
076:                exception = null;
077:                try {
078:                    writer
079:                            .checkHeaderLineParameters("with\0x0000controlcharacter");
080:                } catch (IOException e) {
081:                    exception = e;
082:                }
083:                assertNotNull(exception);
084:            }
085:
086:            public void testMimetypes() throws IOException {
087:                ExperimentalWARCWriter writer = new ExperimentalWARCWriter();
088:                writer.checkHeaderLineMimetypeParameter("text/xml");
089:                writer.checkHeaderLineMimetypeParameter("text/xml+rdf");
090:                writer
091:                        .checkHeaderLineMimetypeParameter("text/plain; charset=SHIFT-JIS");
092:                System.out
093:                        .println(writer
094:                                .checkHeaderLineMimetypeParameter("multipart/mixed; \r\n        boundary=\"simple boundary\""));
095:            }
096:
097:            public void testWriteRecord() throws IOException {
098:                File[] files = { getTmpDir() };
099:
100:                // Write uncompressed.
101:                ExperimentalWARCWriter writer = new ExperimentalWARCWriter(
102:                        SERIAL_NO, Arrays.asList(files), this .getClass()
103:                                .getName(), "suffix", false, -1, null);
104:                writeFile(writer);
105:
106:                // Write compressed.
107:                writer = new ExperimentalWARCWriter(SERIAL_NO, Arrays
108:                        .asList(files), this .getClass().getName(), "suffix",
109:                        true, -1, null);
110:                writeFile(writer);
111:            }
112:
113:            private void writeFile(final ExperimentalWARCWriter writer)
114:                    throws IOException {
115:                try {
116:                    writeWarcinfoRecord(writer);
117:                    writeBasicRecords(writer);
118:                } finally {
119:                    writer.close();
120:                    writer.getFile().delete();
121:                }
122:            }
123:
124:            private void writeWarcinfoRecord(ExperimentalWARCWriter writer)
125:                    throws IOException {
126:                ANVLRecord meta = new ANVLRecord();
127:                meta.addLabelValue("size", "1G");
128:                meta.addLabelValue("operator", "igor");
129:                byte[] bytes = meta.getUTF8Bytes();
130:                writer.writeWarcinfoRecord(ANVLRecord.MIMETYPE, null,
131:                        new ByteArrayInputStream(bytes), bytes.length);
132:            }
133:
134:            protected void writeBasicRecords(final ExperimentalWARCWriter writer)
135:                    throws IOException {
136:                ANVLRecord headerFields = new ANVLRecord();
137:                headerFields.addLabelValue("x", "y");
138:                headerFields.addLabelValue("a", "b");
139:
140:                URI rid = null;
141:                try {
142:                    rid = GeneratorFactory.getFactory().getQualifiedRecordID(
143:                            TYPE, METADATA);
144:                } catch (URISyntaxException e) {
145:                    // Convert to IOE so can let it out.
146:                    throw new IOException(e.getMessage());
147:                }
148:                final String content = "Any old content.";
149:                for (int i = 0; i < 10; i++) {
150:                    String body = i + ". " + content;
151:                    byte[] bodyBytes = body.getBytes(UTF8Bytes.UTF8);
152:                    writer.writeRecord(METADATA, "http://www.archive.org/",
153:                            ArchiveUtils.get14DigitDate(), "no/type", rid,
154:                            headerFields, new ByteArrayInputStream(bodyBytes),
155:                            (long) bodyBytes.length);
156:                }
157:            }
158:
159:            /**
160:             * @return Generic HTML Content.
161:             */
162:            protected static String getContent() {
163:                return getContent(null);
164:            }
165:
166:            /**
167:             * @return Generic HTML Content with mention of passed <code>indexStr</code>
168:             * in title and body.
169:             */
170:            protected static String getContent(String indexStr) {
171:                String page = (indexStr != null) ? "Page #" + indexStr
172:                        : "Some Page";
173:                return "HTTP/1.1 200 OK\r\n"
174:                        + "Content-Type: text/html\r\n\r\n"
175:                        + "<html><head><title>" + page + "</title></head>"
176:                        + "<body>" + page + "</body></html>";
177:            }
178:
179:            /**
180:             * Write random HTML Record.
181:             * @param w Where to write.
182:             * @param index An index to put into content.
183:             * @return Length of record written.
184:             * @throws IOException
185:             */
186:            protected int writeRandomHTTPRecord(ExperimentalWARCWriter w,
187:                    int index) throws IOException {
188:                ByteArrayOutputStream baos = new ByteArrayOutputStream();
189:                String indexStr = Integer.toString(index);
190:                byte[] record = (getContent(indexStr)).getBytes();
191:                int recordLength = record.length;
192:                baos.write(record);
193:                // Add named fields for ip, checksum, and relate the metadata
194:                // and request to the resource field.
195:                ANVLRecord r = new ANVLRecord(1);
196:                r.addLabelValue(NAMED_FIELD_IP_LABEL, "127.0.0.1");
197:                w.writeResourceRecord("http://www.one.net/id=" + indexStr,
198:                        ArchiveUtils.get14DigitDate(),
199:                        "text/html; charset=UTF-8", r,
200:                        new ByteArrayInputStream(baos.toByteArray()),
201:                        recordLength);
202:                return recordLength;
203:            }
204:
205:            /**
206:             * Fill a WARC with HTML Records.
207:             * @param baseName WARC basename.
208:             * @param compress Whether to compress or not.
209:             * @param maxSize Maximum WARC size.
210:             * @param recordCount How many records.
211:             * @return The written file.
212:             * @throws IOException
213:             */
214:            private File writeRecords(String baseName, boolean compress,
215:                    int maxSize, int recordCount) throws IOException {
216:                cleanUpOldFiles(baseName);
217:                File[] files = { getTmpDir() };
218:                ExperimentalWARCWriter w = new ExperimentalWARCWriter(
219:                        SERIAL_NO, Arrays.asList(files), baseName + '-'
220:                                + PREFIX, "", compress, maxSize, null);
221:                assertNotNull(w);
222:                for (int i = 0; i < recordCount; i++) {
223:                    writeRandomHTTPRecord(w, i);
224:                }
225:                w.close();
226:                assertTrue("Doesn't exist: " + w.getFile().getAbsolutePath(), w
227:                        .getFile().exists());
228:                return w.getFile();
229:            }
230:
231:            /**
232:             * Run validation of passed file.
233:             * @param f File to validate.
234:             * @param recordCount Expected count of records.
235:             * @throws FileNotFoundException
236:             * @throws IOException
237:             */
238:            private void validate(File f, int recordCount)
239:                    throws FileNotFoundException, IOException {
240:                WARCReader reader = WARCReaderFactory.get(f);
241:                assertNotNull(reader);
242:                List headers = null;
243:                if (recordCount == -1) {
244:                    headers = reader.validate();
245:                } else {
246:                    headers = reader.validate(recordCount);
247:                }
248:                reader.close();
249:
250:                // Now, run through each of the records doing absolute get going from
251:                // the end to start.  Reopen the arc so no context between this test
252:                // and the previous.
253:                reader = WARCReaderFactory.get(f);
254:                for (int i = headers.size() - 1; i >= 0; i--) {
255:                    ArchiveRecordHeader h = (ArchiveRecordHeader) headers
256:                            .get(i);
257:                    ArchiveRecord r = reader.get(h.getOffset());
258:                    String mimeType = r.getHeader().getMimetype();
259:                    assertTrue("Record is bogus", mimeType != null
260:                            && mimeType.length() > 0);
261:                }
262:                reader.close();
263:
264:                assertTrue("Metadatas not equal", headers.size() == recordCount);
265:                for (Iterator i = headers.iterator(); i.hasNext();) {
266:                    ArchiveRecordHeader r = (ArchiveRecordHeader) i.next();
267:                    assertTrue("Record is empty", r.getLength() > 0);
268:                }
269:            }
270:
271:            public void testWriteRecords() throws IOException {
272:                final int recordCount = 2;
273:                File f = writeRecords("writeRecord", false,
274:                        DEFAULT_MAX_WARC_FILE_SIZE, recordCount);
275:                validate(f, recordCount + 1); // Header record.
276:            }
277:
278:            public void testRandomAccess() throws IOException {
279:                final int recordCount = 3;
280:                File f = writeRecords("writeRecord", true,
281:                        DEFAULT_MAX_WARC_FILE_SIZE, recordCount);
282:                WARCReader reader = WARCReaderFactory.get(f);
283:                // Get to second record.  Get its offset for later use.
284:                boolean readFirst = false;
285:                String url = null;
286:                long offset = -1;
287:                long totalRecords = 0;
288:                boolean readSecond = false;
289:                for (final Iterator i = reader.iterator(); i.hasNext(); totalRecords++) {
290:                    WARCRecord ar = (WARCRecord) i.next();
291:                    if (!readFirst) {
292:                        readFirst = true;
293:                        continue;
294:                    }
295:                    if (!readSecond) {
296:                        url = ar.getHeader().getUrl();
297:                        offset = ar.getHeader().getOffset();
298:                        readSecond = true;
299:                    }
300:                }
301:
302:                reader = WARCReaderFactory.get(f, offset);
303:                ArchiveRecord ar = reader.get();
304:                assertEquals(ar.getHeader().getUrl(), url);
305:                ar.close();
306:
307:                // Get reader again.  See how iterator works with offset
308:                reader = WARCReaderFactory.get(f, offset);
309:                int count = 0;
310:                for (final Iterator i = reader.iterator(); i.hasNext(); i
311:                        .next()) {
312:                    count++;
313:                }
314:                reader.close();
315:                assertEquals(totalRecords - 1, count);
316:            }
317:
318:            public void testWriteRecordCompressed() throws IOException {
319:                final int recordCount = 2;
320:                File arcFile = writeRecords("writeRecordCompressed", true,
321:                        DEFAULT_MAX_WARC_FILE_SIZE, recordCount);
322:                validate(arcFile, recordCount + 1 /*Header record*/);
323:            }
324:
325:            protected ExperimentalWARCWriter createWARCWriter(String NAME,
326:                    boolean compress) {
327:                File[] files = { getTmpDir() };
328:                return new ExperimentalWARCWriter(SERIAL_NO, Arrays
329:                        .asList(files), NAME, "", compress,
330:                        DEFAULT_MAX_WARC_FILE_SIZE, null);
331:            }
332:
333:            protected static ByteArrayOutputStream getBaos(String str)
334:                    throws IOException {
335:                ByteArrayOutputStream baos = new ByteArrayOutputStream();
336:                baos.write(str.getBytes());
337:                return baos;
338:            }
339:
340:            protected static void writeRecord(ExperimentalWARCWriter w,
341:                    String url, String mimetype, int len,
342:                    ByteArrayOutputStream baos) throws IOException {
343:                w.writeResourceRecord(url, ArchiveUtils.get14DigitDate(),
344:                        mimetype, null, new ByteArrayInputStream(baos
345:                                .toByteArray()), len);
346:            }
347:
348:            protected int iterateRecords(WARCReader r) throws IOException {
349:                int count = 0;
350:                for (Iterator<ArchiveRecord> i = r.iterator(); i.hasNext();) {
351:                    ArchiveRecord ar = i.next();
352:                    ar.close();
353:                    if (count != 0) {
354:                        assertTrue("Unexpected URL " + ar.getHeader().getUrl(),
355:                                ar.getHeader().getUrl().equals(SOME_URL));
356:                    }
357:                    count++;
358:                }
359:                return count;
360:            }
361:
362:            protected ExperimentalWARCWriter createWithOneRecord(String name,
363:                    boolean compressed) throws IOException {
364:                ExperimentalWARCWriter writer = createWARCWriter(name,
365:                        compressed);
366:                String content = getContent();
367:                writeRecord(writer, SOME_URL, "text/html", content.length(),
368:                        getBaos(content));
369:                return writer;
370:            }
371:
372:            public void testSpaceInURL() {
373:                String eMessage = null;
374:                try {
375:                    holeyUrl("testSpaceInURL-" + PREFIX, false, " ");
376:                } catch (IOException e) {
377:                    eMessage = e.getMessage();
378:                }
379:                assertTrue("Didn't get expected exception: " + eMessage,
380:                        eMessage.startsWith("Contains disallowed"));
381:            }
382:
383:            public void testTabInURL() {
384:                String eMessage = null;
385:                try {
386:                    holeyUrl("testTabInURL-" + PREFIX, false, "\t");
387:                } catch (IOException e) {
388:                    eMessage = e.getMessage();
389:                }
390:                assertTrue("Didn't get expected exception: " + eMessage,
391:                        eMessage.startsWith("Contains illegal"));
392:            }
393:
394:            protected void holeyUrl(String name, boolean compress,
395:                    String urlInsert) throws IOException {
396:                ExperimentalWARCWriter writer = createWithOneRecord(name,
397:                        compress);
398:                // Add some bytes on the end to mess up the record.
399:                String content = getContent();
400:                ByteArrayOutputStream baos = getBaos(content);
401:                writeRecord(writer, SOME_URL + urlInsert + "/index.html",
402:                        "text/html", content.length(), baos);
403:                writer.close();
404:            }
405:
406:            /**
407:             * Write an arc file for other tests to use.
408:             * @param arcdir Directory to write to.
409:             * @param compress True if file should be compressed.
410:             * @return ARC written.
411:             * @throws IOException 
412:             */
413:            public static File createWARCFile(File arcdir, boolean compress)
414:                    throws IOException {
415:                File[] files = { arcdir };
416:                ExperimentalWARCWriter writer = new ExperimentalWARCWriter(
417:                        SERIAL_NO, Arrays.asList(files), "test", "", compress,
418:                        DEFAULT_MAX_WARC_FILE_SIZE, null);
419:                String content = getContent();
420:                writeRecord(writer, SOME_URL, "text/html", content.length(),
421:                        getBaos(content));
422:                writer.close();
423:                return writer.getFile();
424:            }
425:
426:            //    public void testSpeed() throws IOException {
427:            //        ARCWriter writer = createArcWithOneRecord("speed", true);
428:            //        // Add a record with a length that is too long.
429:            //        String content = getContent();
430:            //        final int count = 100000;
431:            //        logger.info("Starting speed write of " + count + " records.");
432:            //        for (int i = 0; i < count; i++) {
433:            //            writeRecord(writer, SOME_URL, "text/html", content.length(),
434:            //                    getBaos(content));
435:            //        }
436:            //        writer.close();
437:            //        logger.info("Finished speed write test.");
438:            //    }
439:
440:            public void testArcRecordOffsetReads() throws Exception {
441:                // Get an ARC with one record.
442:                WriterPoolMember w = createWithOneRecord(
443:                        "testArcRecordInBufferStream", true);
444:                w.close();
445:                // Get reader on said ARC.
446:                WARCReader r = WARCReaderFactory.get(w.getFile());
447:                final Iterator<ArchiveRecord> i = r.iterator();
448:                // Skip first ARC meta record.
449:                ArchiveRecord ar = i.next();
450:                i.hasNext();
451:                // Now we're at first and only record in ARC.
452:                ar = (WARCRecord) i.next();
453:                // Now try getting some random set of bytes out of it 
454:                // at an odd offset (used to fail because we were
455:                // doing bad math to find where in buffer to read).
456:                final byte[] buffer = new byte[17];
457:                final int maxRead = 4;
458:                int totalRead = 0;
459:                while (totalRead < maxRead) {
460:                    totalRead = totalRead
461:                            + ar.read(buffer, 13 + totalRead, maxRead
462:                                    - totalRead);
463:                    assertTrue(totalRead > 0);
464:                }
465:            }
466:        }
www.java2java.com | Contact Us
Copyright 2009 - 12 Demo Source and Support. All rights reserved.
All other trademarks are property of their respective owners.