Source Code Cross Referenced for DocumentCollectionTest.java in » Search-Engine » mg4j » test » it » unimi » dsi » mg4j » document » Java Source Code / Java DocumentationJava Source Code and Java Documentation

Java Source Code / Java Documentation
1.	6.0 JDK Core
2.	6.0 JDK Modules
3.	6.0 JDK Modules com.sun
4.	6.0 JDK Modules com.sun.java
5.	6.0 JDK Modules sun
6.	6.0 JDK Platform
7.	Ajax
8.	Apache Harmony Java SE
9.	Aspect oriented
10.	Authentication Authorization
11.	Blogger System
12.	Build
13.	Byte Code
14.	Cache
15.	Chart
16.	Chat
17.	Code Analyzer
18.	Collaboration
19.	Content Management System
20.	Database Client
21.	Database DBMS
22.	Database JDBC Connection Pool
23.	Database ORM
24.	Development
25.	EJB Server geronimo
26.	EJB Server GlassFish
27.	EJB Server JBoss 4.2.1
28.	EJB Server resin 3.1.5
29.	ERP CRM Financial
30.	ESB
31.	Forum
32.	GIS
33.	Graphic Library
34.	Groupware
35.	HTML Parser
36.	IDE
37.	IDE Eclipse
38.	IDE Netbeans
39.	Installer
40.	Internationalization Localization
41.	Inversion of Control
42.	Issue Tracking
43.	J2EE
44.	JBoss
45.	JMS
46.	JMX
47.	Library
48.	Mail Clients
49.	Net
50.	Parser
51.	PDF
52.	Portal
53.	Profiler
54.	Project Management
55.	Report
56.	RSS RDF
57.	Rule Engine
58.	Science
59.	Scripting
60.	Search Engine
61.	Security
62.	Servlet Container
63.	Source Control
64.	Swing Library
65.	Template Engine
66.	Test Coverage
67.	Testing
68.	UML
69.	Web Crawler
70.	Web Framework
71.	Web Mail
72.	Web Server
73.	Web Services
74.	Web Services apache cxf 2.0.1
75.	Web Services AXIS2
76.	Wiki Engine
77.	Workflow Engines
78.	XML
79.	XML UI
Java
Java Tutorial
Illustrator Tutorials
GIMP Tutorials
C# / C Sharp
C# / CSharp Tutorial
C# / CSharp Open Source
SQL Server / T-SQL Tutorial
Oracle PL / SQL
Oracle PL/SQL Tutorial
Flash / Flex / ActionScript
VBA / Excel / Access / Word
XML
XML Tutorial
Microsoft Office PowerPoint 2007 Tutorial
Microsoft Office Excel 2007 Tutorial
Microsoft Office Word 2007 Tutorial
Java Source Code / Java Documentation » Search Engine » mg4j » test.it.unimi.dsi.mg4j.document
Source Cross Referenced Class Diagram Java Document (Java Doc)
001:        package test.it.unimi.dsi.mg4j.document;
002:
003:        /*		 
004:         * MG4J: Managing Gigabytes for Java
005:         *
006:         * Copyright (C) 2005-2007 Paolo Boldi 
007:         *
008:         *  This library is free software; you can redistribute it and/or modify it
009:         *  under the terms of the GNU Lesser General Public License as published by the Free
010:         *  Software Foundation; either version 2.1 of the License, or (at your option)
011:         *  any later version.
012:         *
013:         *  This library is distributed in the hope that it will be useful, but
014:         *  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
015:         *  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License
016:         *  for more details.
017:         *
018:         *  You should have received a copy of the GNU Lesser General Public License
019:         *  along with this program; if not, write to the Free Software
020:         *  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
021:         *
022:         */
023:
024:        import it.unimi.dsi.fastutil.io.BinIO;
025:        import it.unimi.dsi.io.WordReader;
026:        import it.unimi.dsi.lang.MutableString;
027:        import it.unimi.dsi.logging.ProgressLogger;
028:        import it.unimi.dsi.mg4j.document.Document;
029:        import it.unimi.dsi.mg4j.document.DocumentCollection;
030:        import it.unimi.dsi.mg4j.document.DocumentIterator;
031:        import it.unimi.dsi.mg4j.document.DocumentSequence;
032:        import it.unimi.dsi.mg4j.document.FileSetDocumentCollection;
033:        import it.unimi.dsi.mg4j.document.HtmlDocumentFactory;
034:        import it.unimi.dsi.mg4j.document.IdentityDocumentFactory;
035:        import it.unimi.dsi.mg4j.document.InputStreamDocumentSequence;
036:        import it.unimi.dsi.mg4j.document.PropertyBasedDocumentFactory;
037:        import it.unimi.dsi.mg4j.document.ZipDocumentCollection;
038:        import it.unimi.dsi.mg4j.document.ZipDocumentCollectionBuilder;
039:        import it.unimi.dsi.util.Properties;
040:
041:        import java.io.File;
042:        import java.io.FileInputStream;
043:        import java.io.FileOutputStream;
044:        import java.io.IOException;
045:        import java.io.InputStream;
046:        import java.io.OutputStreamWriter;
047:        import java.io.Reader;
048:        import java.io.Writer;
049:        import java.util.StringTokenizer;
050:
051:        import junit.framework.TestCase;
052:
053:        import org.apache.commons.configuration.ConfigurationException;
054:        import org.apache.commons.io.FileUtils;
055:
056:        import cern.colt.GenericSorting;
057:        import cern.colt.Swapper;
058:        import cern.colt.function.IntComparator;
059:
060:        public class DocumentCollectionTest extends TestCase {
061:
062:            /** The test documents: we consider documents abstractly described by two fields each (index 0 is used as title/subject, index 1 as text/body). */
063:            private final static String[][] document = new String[][] {
064:            //              0   1   2   3      0   1   2   3   4   5   6   7   8   9   10  11  12  13  14
065:            new String[] { "xxx yyy zzz xxx",
066:                    "xxx yyy zzz xxx aaa xxx aaa yyy aaa yyy aaa zzz aaa www aaa" },
067:            /*	new String[] { "aaa xxx aaa aaa", "aaa xxx aaa aaa xxx aaa zzz uuu" },
068:            	new String[] { "aaa uuu aaa"    , "aaa uuu aaa xxx xxx xxx aaa xxx" },
069:            	// This tests that zipped collections handle properly initial spaces
070:            	new String[] { " aaa uuu aaa"    , " aaa uuu aaa xxx xxx xxx aaa xxx" },*/
071:            }; // Only the first document is active: the others are commented out above
072:            private final static Properties DEFAULT_PROPERTIES = new Properties(); // Passed to every document factory created in this class
073:            static { // Sets the encoding metadata key to ASCII once, when the class is initialised
074:                DEFAULT_PROPERTIES.setProperty(
075:                        PropertyBasedDocumentFactory.MetadataKeys.ENCODING,
076:                        "ASCII");
077:            }
078:
079:            /** The number of documents (the length of the document array). */
080:            private final static int ndoc = document.length;
081:            /** The temporary directory where all tests are run; it is created by setUp. */
082:            private File tempDir;
083:            /** The pathnames of the HTML files written by setUp into the html subdirectory, one per document. */
084:            private String[] htmlFileSet;
085:
086:            /** Given a two-field document, produce an HTML document with the first field as title and
087:             *  the second field as body.
088:             *  
089:             *  @param document the document.
090:             *  @return the HTML version of the document.
091:             */
092:            private String getHTMLDocument(String[] document) {
093:                MutableString res = new MutableString();
094:                res
095:                        .append("<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Strict//EN\" \"http://www.w3.org/TR/REC-html40/strict.dtd\">\n");
096:                res.append("<HTML>\n<HEAD>\n<TITLE>" + document[0]
097:                        + "</TITLE>\n");
098:                // Do NOT append the first part of the body
099:                res.append("<BODY>\n"
100:                        + document[1].substring(document[0].length()));
101:                res.append("\n</BODY>\n");
102:                res.append("</HTML>");
103:                return res.toString();
104:            }
105:
106:            /** Given a two-field document, produce a mbox document with the first field as subject and
107:             *  the second field as body.
108:             *  
109:             *  @param document the document.
110:             *  @return the HTML version of the document.
111:             */
112:            private String getMboxDocument(String[] document) {
113:                MutableString res = new MutableString();
114:                res.append("From MAILER-DAEMON Fri Apr 15 16:22:32 2005\n");
115:                res.append("Date: 15 Apr 2005 16:22:32 +0200\n");
116:                res
117:                        .append("From: Mail System Internal Data <MAILER-DAEMON@sliver.usr.dsi.unimi.it>\n");
118:                res.append("Subject: " + document[0] + "\n");
119:                res
120:                        .append("Message-ID: <1113574952@sliver.usr.dsi.unimi.it>\n");
121:                res.append("X-IMAP: 1102967122 0000138458\n");
122:                res.append("Return-Path: <matteo.xxx@unimi.it>\n");
123:                res
124:                        .append("Received: from localhost (localhost.localdomain [127.0.0.1])\n");
125:                res
126:                        .append("\tby sliver.usr.dsi.unimi.it (8.12.11/8.12.11) with ESMTP id iAUNtadn007305\n");
127:                res
128:                        .append("\tfor <vigna@localhost>; Wed, 1 Dec 2004 00:55:36 +0100\n");
129:                res
130:                        .append("Received: from law5.usr.dsi.unimi.it [159.149.146.241]\n");
131:                res.append("\tby localhost with IMAP (fetchmail-6.2.5)\n");
132:                res
133:                        .append("\tfor vigna@localhost (single-drop); Wed, 01 Dec 2004 00:55:36 +0100 (CET)\n");
134:                res.append("To: vigna@dsi.unimi.it\n");
135:                res
136:                        .append("Message-id: <Pine.WNT.4.33.0412010051240.-209505@p233-mmx>\n");
137:                res.append("Content-type: TEXT/PLAIN; charset=iso-8859-15\n");
138:                res.append("X-Warning: UNAuthenticated Sender\n");
139:                res.append("Content-Transfer-Encoding: 8bit\n");
140:                res.append("Content-Length: " + document[1].length() + "\n");
141:                res.append("\n");
142:                res.append(document[1] + "\n");
143:                return res.toString();
144:            }
145:
146:            /** Checks that the tokenizer and the word reader return exactly the same sequence of words. 
147:             * 
148:             * @param wordReader the word reader.
149:             * @param tok the tokenizer.
150:             * @throws IOException
151:             */
152:            private void checkSameWords(WordReader wordReader,
153:                    StringTokenizer tok) throws IOException {
154:                MutableString word = new MutableString();
155:                MutableString nonWord = new MutableString();
156:                boolean aWordInDocum, aWordInDocument;
157:                boolean firstTime = true;
158:                for (;;) {
159:                    aWordInDocum = wordReader.next(word, nonWord);
160:                    if (firstTime) {
161:                        firstTime = false;
162:                        if (word.equals(""))
163:                            continue;
164:                    }
165:                    assertFalse(aWordInDocum && word.equals(""));
166:                    aWordInDocument = tok.hasMoreElements();
167:                    assertEquals(aWordInDocum, aWordInDocument);
168:                    if (!aWordInDocum)
169:                        break;
170:                    assertEquals(tok.nextElement(), word.toString());
171:                }
172:            }
173:
174:            /** Checks, accessing each document by index, that the documents in the collection have the
175:             *  same sequence of words as in document: the names of the fields to be checked are specified
176:             *  in the array. As a side effect, fieldName is reordered in place by increasing field index.
177:             * @param coll the collection; it must contain exactly document.length documents.
178:             * @param fieldName the field names; each must be a field of the factory of the collection.
179:             * @param document documents to be checked against (document[doc][i] is the expected text of fieldName[i]).
180:             * @throws IOException
181:             */
182:            private void checkAllDocuments(final DocumentCollection coll,
183:                    final String[] fieldName, final String[][] document)
184:                    throws IOException {
185:                final int nfields = fieldName.length;
186:                final int[] fieldNumber = new int[nfields];
187:                final int[] arrayIndex = new int[nfields];
188:                // Look for field indices
189:                for (int i = 0; i < nfields; i++) {
190:                    arrayIndex[i] = i;
191:                    int j;
192:                    for (j = 0; j < coll.factory().numberOfFields(); j++)
193:                        if (coll.factory().fieldName(j).equals(fieldName[i])) {
194:                            fieldNumber[i] = j;
195:                            break;
196:                        }
197:                    assertTrue("Unknown field " + fieldName[i], j < coll.factory().numberOfFields()); // a plain assert is skipped without -ea
198:                }
199:                // Sort fields by field index; arrayIndex remembers which column of document each one came from
200:                GenericSorting.quickSort(0, nfields, new IntComparator() {
201:                    public int compare(int x, int y) {
202:                        return fieldNumber[x] - fieldNumber[y];
203:                    }
204:                }, new Swapper() {
205:                    public void swap(int x, int y) {
206:                        int t = fieldNumber[x];
207:                        fieldNumber[x] = fieldNumber[y];
208:                        fieldNumber[y] = t;
209:                        t = arrayIndex[x];
210:                        arrayIndex[x] = arrayIndex[y];
211:                        arrayIndex[y] = t;
212:                        String q = fieldName[x];
213:                        fieldName[x] = fieldName[y];
214:                        fieldName[y] = q;
215:                    }
216:                });
217:                // Start checking: the collection must hold exactly the expected documents
218:                assertEquals(document.length, coll.size());
219:                for (int doc = 0; doc < coll.size(); doc++) {
220:                    Document docum = coll.document(doc);
221:                    for (int i = 0; i < nfields; i++) {
222:                        int field = fieldNumber[i];
223:                        Reader content = (Reader) docum.content(field);
224:                        WordReader wordReader = docum.wordReader(field);
225:                        wordReader.setReader(content);
226:                        StringTokenizer tok = new StringTokenizer(
227:                                document[doc][arrayIndex[i]]);
228:                        System.err.println("Checking document " + doc + " field "
229:                                + fieldName[i] + " (" + field + ")");
230:                        checkSameWords(wordReader, tok);
231:                    }
232:                    docum.close();
233:                }
234:            }
235:
236:            /** Checks, iterating over the sequence, that its documents have the same sequence of words
237:             *  as in <code>document</code>: the names of the fields to be checked are specified in the array.
238:             *  The sequence must yield exactly document.length documents; fieldName is reordered in place.
239:             * @param seq the sequence.
240:             * @param fieldName the field names; each must be a field of the factory of the sequence.
241:             * @param document documents to be checked against (document[doc][i] is the expected text of fieldName[i]).
242:             * @throws IOException
243:             */
244:            private void checkAllDocumentsSeq(final DocumentSequence seq,
245:                    final String[] fieldName, final String[][] document)
246:                    throws IOException {
247:                final int nfields = fieldName.length;
248:                final int[] fieldNumber = new int[nfields];
249:                final int[] arrayIndex = new int[nfields];
250:                // Look for field indices
251:                for (int i = 0; i < nfields; i++) {
252:                    arrayIndex[i] = i;
253:                    int j;
254:                    for (j = 0; j < seq.factory().numberOfFields(); j++)
255:                        if (seq.factory().fieldName(j).equals(fieldName[i])) {
256:                            fieldNumber[i] = j;
257:                            break;
258:                        }
259:                    assertTrue("Unknown field " + fieldName[i], j < seq.factory().numberOfFields()); // a plain assert is skipped without -ea
260:                }
261:                // Sort fields by field index; arrayIndex remembers which column of document each one came from
262:                GenericSorting.quickSort(0, nfields, new IntComparator() {
263:                    public int compare(int x, int y) {
264:                        return fieldNumber[x] - fieldNumber[y];
265:                    }
266:                }, new Swapper() {
267:                    public void swap(int x, int y) {
268:                        int t = fieldNumber[x];
269:                        fieldNumber[x] = fieldNumber[y];
270:                        fieldNumber[y] = t;
271:                        t = arrayIndex[x];
272:                        arrayIndex[x] = arrayIndex[y];
273:                        arrayIndex[y] = t;
274:                        String q = fieldName[x];
275:                        fieldName[x] = fieldName[y];
276:                        fieldName[y] = q;
277:                    }
278:                });
279:                // Start checking
280:                DocumentIterator iterator = seq.iterator();
281:                Document docum;
282:                int doc = 0;
283:                while ((docum = iterator.nextDocument()) != null) {
284:                    for (int i = 0; i < nfields; i++) {
285:                        int field = fieldNumber[i];
286:                        Reader content = (Reader) docum.content(field);
287:                        WordReader wordReader = docum.wordReader(field);
288:                        wordReader.setReader(content);
289:                        StringTokenizer tok = new StringTokenizer(
290:                                document[doc][arrayIndex[i]]);
291:                        System.err.println("Checking sequentially document "
292:                                + doc + " field " + fieldName[i] + " (" + field + ")");
293:                        checkSameWords(wordReader, tok);
294:                    }
295:                    docum.close();
296:                    doc++;
297:                }
298:                iterator.close();
299:                assertEquals("Number of documents in the sequence", document.length, doc); // an empty or short sequence must not pass
300:            }
301:
302:            /** Creates the temporary directory and, inside it, the HTML files, the mbox file and the two serialised zip collections read by the tests. */
303:            protected void setUp() throws IOException, ClassNotFoundException, ConfigurationException {
304:                // Turn a fresh temporary file name into a directory, failing at once if that is not possible
305:                tempDir = File.createTempFile("mg4jtest", null);
306:                if (!tempDir.delete() || !tempDir.mkdir())
307:                    throw new IOException("Cannot create temporary directory " + tempDir);
308:                // Now create the hierarchy for HTML files
309:                File htmlDir = new File(tempDir, "html");
310:                if (!htmlDir.mkdir()) throw new IOException("Cannot create directory " + htmlDir);
311:                System.err.println("Temporary directory: " + tempDir);
312:                htmlFileSet = new String[ndoc];
313:                for (int i = 0; i < ndoc; i++) {
314:                    String docFile = new File(htmlDir, "doc" + i + ".html")
315:                            .toString();
316:                    htmlFileSet[i] = docFile;
317:                    Writer docWriter = new OutputStreamWriter(
318:                            new FileOutputStream(docFile), "ISO-8859-1");
319:                    docWriter.write(getHTMLDocument(document[i]));
320:                    docWriter.close();
321:                }
322:                // Now create the mbox file
323:                Writer mboxWriter = new OutputStreamWriter(
324:                        new FileOutputStream(new File(tempDir, "mbox")),
325:                        "ISO-8859-1");
326:                for (int i = 0; i < ndoc; i++)
327:                    mboxWriter.write(getMboxDocument(document[i]));
328:                mboxWriter.close();
329:                // Now create the zip collections (stored as zip.collection and azip.collection) from the HTML files
330:                FileSetDocumentCollection fileSetDocumentCollection = new FileSetDocumentCollection(
331:                        htmlFileSet,
332:                        new HtmlDocumentFactory(DEFAULT_PROPERTIES));
333:                ZipDocumentCollectionBuilder collBuilder = new ZipDocumentCollectionBuilder(
334:                        new File(tempDir, "zip").toString(),
335:                        fileSetDocumentCollection.factory(), true,
336:                        new ProgressLogger());
337:                ZipDocumentCollection zipDocumentCollection = collBuilder
338:                        .build(fileSetDocumentCollection);
339:                BinIO.storeObject(zipDocumentCollection, new File(tempDir,
340:                        "zip.collection").toString());
341:                zipDocumentCollection.close();
342:
343:                ZipDocumentCollectionBuilder apprCollBuilder = new ZipDocumentCollectionBuilder(
344:                        new File(tempDir, "azip").toString(),
345:                        fileSetDocumentCollection.factory(), false,
346:                        new ProgressLogger());
347:                zipDocumentCollection = apprCollBuilder
348:                        .build(fileSetDocumentCollection);
349:                BinIO.storeObject(zipDocumentCollection, new File(tempDir,
350:                        "azip.collection").toString());
351:                zipDocumentCollection.close();
352:                fileSetDocumentCollection.close();
353:            }
354:
355:            /** Checks the size of the fileset collection over the HTML files, and its documents accessed by index. */
356:            public void testFileSetDocumentCollection() throws IOException, ConfigurationException {
357:                System.err.println("Checking fileset collection");
358:                FileSetDocumentCollection coll = new FileSetDocumentCollection(
359:                        htmlFileSet,
360:                        new HtmlDocumentFactory(DEFAULT_PROPERTIES));
361:                assertEquals(ndoc, coll.size()); // expected value first, so that a failure message reads correctly
362:                checkAllDocuments(coll, new String[] { "title", "text" },
363:                        document);
364:                coll.close();
365:            }
366:
367:            /** Checks the documents of the fileset collection over the HTML files, iterating sequentially. */
368:            public void testFileSetDocumentCollectionSeq() throws IOException, ConfigurationException {
369:                System.err.println("Checking fileset collection sequentially");
370:                final HtmlDocumentFactory factory = new HtmlDocumentFactory(DEFAULT_PROPERTIES);
371:                final FileSetDocumentCollection coll = new FileSetDocumentCollection(
372:                        htmlFileSet, factory);
373:                final String[] fields = { "title", "text" };
374:                checkAllDocumentsSeq(coll, fields, document);
375:                coll.close();
376:            }
377:
378:            /*	public void testMboxDocumentCollection() throws IOException, ConfigurationException, MessagingException {
379:             System.err.println( "Checking mbox collection" );
380:             JavamailDocumentCollection coll = new JavamailDocumentCollection( "mstor:" + tempDir, "mbox", DEFAULT_PROPERTIES );
381:             checkAllDocuments( coll, new String[] { "subject", "body" }, document );
382:             coll.close();
383:             }
384:
385:             public void testMboxDocumentCollectionSeq() throws IOException, ConfigurationException, MessagingException {
386:             System.err.println( "Checking mbox collection sequentially" );
387:             JavamailDocumentCollection coll = new JavamailDocumentCollection( "mstor:" + tempDir, "mbox", DEFAULT_PROPERTIES );
388:             checkAllDocumentsSeq( coll, new String[] { "subject", "body" }, document );
389:             coll.close();
390:             }
391:             */
392:            /** Checks, accessing documents by index, the zipped collection that setUp stored as zip.collection. */
393:            public void testZipDocumentCollection() throws IOException,
394:                    ClassNotFoundException {
395:                System.err.println("Checking zipped collection");
396:                final String serialized = new File(tempDir, "zip.collection").toString();
397:                final ZipDocumentCollection coll = (ZipDocumentCollection) BinIO.loadObject(serialized);
398:                final String[] fields = { "title", "text" };
399:                checkAllDocuments(coll, fields, document);
400:                coll.close();
401:            }
402:
403:            /** Checks, iterating sequentially, the zipped collection that setUp stored as zip.collection. */
404:            public void testZipDocumentCollectionSeq() throws IOException,
405:                    ClassNotFoundException {
406:                System.err.println("Checking zipped collection sequentially");
407:                final String serialized = new File(tempDir, "zip.collection").toString();
408:                final ZipDocumentCollection coll = (ZipDocumentCollection) BinIO.loadObject(serialized);
409:                final String[] fields = { "title", "text" };
410:                checkAllDocumentsSeq(coll, fields, document);
411:                coll.close();
412:            }
413:
414:            /** Checks, accessing documents by index, the zipped collection that setUp stored as azip.collection. */
415:            public void testZipDocumentCollectionAppr() throws IOException,
416:                    ClassNotFoundException {
417:                System.err.println("Checking approximated zipped collection");
418:                final String serialized = new File(tempDir, "azip.collection").toString();
419:                final ZipDocumentCollection coll = (ZipDocumentCollection) BinIO.loadObject(serialized);
420:                final String[] fields = { "title", "text" };
421:                checkAllDocuments(coll, fields, document);
422:                coll.close();
423:            }
424:
425:            /** Checks, iterating sequentially, the zipped collection that setUp stored as azip.collection. */
426:            public void testZipDocumentCollectionApprSeq() throws IOException,
427:                    ClassNotFoundException {
428:                System.err.println("Checking approximated zipped collection sequentially");
429:                final String serialized = new File(tempDir, "azip.collection").toString();
430:                final ZipDocumentCollection coll =
431:                        (ZipDocumentCollection) BinIO.loadObject(serialized);
432:                final String[] fields = { "title", "text" };
433:                checkAllDocumentsSeq(coll, fields, document);
434:                coll.close();
435:            }
436:
437:            public void testInputStreamSequence() throws IOException,
438:                    ConfigurationException {
439:                System.err.println("Checking input stream (text field only)");
440:                // Keep only field number 1 of each document, appending a NUL separator after each one
441:                final File streamFile = new File(tempDir, "stream");
442:                final MutableString concatenated = new MutableString();
443:                final String[][] justSecondField = new String[ndoc][1];
444:                for (int i = 0; i < ndoc; i++) {
445:                    final String text = document[i][1];
446:                    concatenated.append(text + "\u0000");
447:                    justSecondField[i][0] = text;
448:                }
449:                final String payload = concatenated.toString();
450:                // Write the sequence on a file (in UTF-8)
451:                final Writer resWriter = new OutputStreamWriter(
452:                        new FileOutputStream(streamFile), "UTF-8");
453:                resWriter.write(payload);
454:                resWriter.close();
455:                // Read it back as an input stream document sequence split on the same separator
456:                final InputStream is = new FileInputStream(streamFile);
457:                final IdentityDocumentFactory factory = new IdentityDocumentFactory(DEFAULT_PROPERTIES);
458:                final DocumentSequence seq = new InputStreamDocumentSequence(is, '\u0000', factory);
459:                final String[] fields = { "text" };
460:                checkAllDocumentsSeq(seq, fields, justSecondField);
461:                seq.close();
462:            }
463:
464:            protected void tearDown() throws IOException { // Cleans up the temporary directory assigned by setUp
465:                FileUtils.forceDeleteOnExit(tempDir); // NOTE(review): this only schedules deletion for JVM exit, so directories presumably pile up during a run; confirm that is intended
466:            }
467:        }
www.java2java.com | Contact Us
All other trademarks are property of their respective owners.