001: package test.it.unimi.dsi.mg4j.document;
002:
003: /*
004: * MG4J: Managing Gigabytes for Java
005: *
006: * Copyright (C) 2005-2007 Paolo Boldi
007: *
008: * This library is free software; you can redistribute it and/or modify it
009: * under the terms of the GNU Lesser General Public License as published by the Free
010: * Software Foundation; either version 2.1 of the License, or (at your option)
011: * any later version.
012: *
013: * This library is distributed in the hope that it will be useful, but
014: * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
015: * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
016: * for more details.
017: *
018: * You should have received a copy of the GNU Lesser General Public License
019: * along with this program; if not, write to the Free Software
020: * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
021: *
022: */
023:
024: import it.unimi.dsi.fastutil.io.BinIO;
025: import it.unimi.dsi.io.WordReader;
026: import it.unimi.dsi.lang.MutableString;
027: import it.unimi.dsi.logging.ProgressLogger;
028: import it.unimi.dsi.mg4j.document.Document;
029: import it.unimi.dsi.mg4j.document.DocumentCollection;
030: import it.unimi.dsi.mg4j.document.DocumentIterator;
031: import it.unimi.dsi.mg4j.document.DocumentSequence;
032: import it.unimi.dsi.mg4j.document.FileSetDocumentCollection;
033: import it.unimi.dsi.mg4j.document.HtmlDocumentFactory;
034: import it.unimi.dsi.mg4j.document.IdentityDocumentFactory;
035: import it.unimi.dsi.mg4j.document.InputStreamDocumentSequence;
036: import it.unimi.dsi.mg4j.document.PropertyBasedDocumentFactory;
037: import it.unimi.dsi.mg4j.document.ZipDocumentCollection;
038: import it.unimi.dsi.mg4j.document.ZipDocumentCollectionBuilder;
039: import it.unimi.dsi.util.Properties;
040:
041: import java.io.File;
042: import java.io.FileInputStream;
043: import java.io.FileOutputStream;
044: import java.io.IOException;
045: import java.io.InputStream;
046: import java.io.OutputStreamWriter;
047: import java.io.Reader;
048: import java.io.Writer;
049: import java.util.StringTokenizer;
050:
051: import junit.framework.TestCase;
052:
053: import org.apache.commons.configuration.ConfigurationException;
054: import org.apache.commons.io.FileUtils;
055:
056: import cern.colt.GenericSorting;
057: import cern.colt.Swapper;
058: import cern.colt.function.IntComparator;
059:
060: public class DocumentCollectionTest extends TestCase {
061:
062: /** We consider documents abstractly described by two fields each. */
063: private final static String[][] document = new String[][] {
064: // 0 1 2 3 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14
065: new String[] { "xxx yyy zzz xxx",
066: "xxx yyy zzz xxx aaa xxx aaa yyy aaa yyy aaa zzz aaa www aaa" },
067: /* new String[] { "aaa xxx aaa aaa", "aaa xxx aaa aaa xxx aaa zzz uuu" },
068: new String[] { "aaa uuu aaa" , "aaa uuu aaa xxx xxx xxx aaa xxx" },
069: // This tests that zipped collections handle properly initial spaces
070: new String[] { " aaa uuu aaa" , " aaa uuu aaa xxx xxx xxx aaa xxx" },*/
071: };
072: private final static Properties DEFAULT_PROPERTIES = new Properties();
073: static {
074: DEFAULT_PROPERTIES.setProperty(
075: PropertyBasedDocumentFactory.MetadataKeys.ENCODING,
076: "ASCII");
077: }
078:
079: /** The number of documents. */
080: private final static int ndoc = document.length;
081: /** The temporary directory where all tests are run. */
082: private File tempDir;
083: /** The set of files in the HTML directory. */
084: private String[] htmlFileSet;
085:
086: /** Given a two-field document, produce an HTML document with the first field as title and
087: * the second field as body.
088: *
089: * @param document the document.
090: * @return the HTML version of the document.
091: */
092: private String getHTMLDocument(String[] document) {
093: MutableString res = new MutableString();
094: res
095: .append("<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Strict//EN\" \"http://www.w3.org/TR/REC-html40/strict.dtd\">\n");
096: res.append("<HTML>\n<HEAD>\n<TITLE>" + document[0]
097: + "</TITLE>\n");
098: // Do NOT append the first part of the body
099: res.append("<BODY>\n"
100: + document[1].substring(document[0].length()));
101: res.append("\n</BODY>\n");
102: res.append("</HTML>");
103: return res.toString();
104: }
105:
106: /** Given a two-field document, produce a mbox document with the first field as subject and
107: * the second field as body.
108: *
109: * @param document the document.
110: * @return the HTML version of the document.
111: */
112: private String getMboxDocument(String[] document) {
113: MutableString res = new MutableString();
114: res.append("From MAILER-DAEMON Fri Apr 15 16:22:32 2005\n");
115: res.append("Date: 15 Apr 2005 16:22:32 +0200\n");
116: res
117: .append("From: Mail System Internal Data <MAILER-DAEMON@sliver.usr.dsi.unimi.it>\n");
118: res.append("Subject: " + document[0] + "\n");
119: res
120: .append("Message-ID: <1113574952@sliver.usr.dsi.unimi.it>\n");
121: res.append("X-IMAP: 1102967122 0000138458\n");
122: res.append("Return-Path: <matteo.xxx@unimi.it>\n");
123: res
124: .append("Received: from localhost (localhost.localdomain [127.0.0.1])\n");
125: res
126: .append("\tby sliver.usr.dsi.unimi.it (8.12.11/8.12.11) with ESMTP id iAUNtadn007305\n");
127: res
128: .append("\tfor <vigna@localhost>; Wed, 1 Dec 2004 00:55:36 +0100\n");
129: res
130: .append("Received: from law5.usr.dsi.unimi.it [159.149.146.241]\n");
131: res.append("\tby localhost with IMAP (fetchmail-6.2.5)\n");
132: res
133: .append("\tfor vigna@localhost (single-drop); Wed, 01 Dec 2004 00:55:36 +0100 (CET)\n");
134: res.append("To: vigna@dsi.unimi.it\n");
135: res
136: .append("Message-id: <Pine.WNT.4.33.0412010051240.-209505@p233-mmx>\n");
137: res.append("Content-type: TEXT/PLAIN; charset=iso-8859-15\n");
138: res.append("X-Warning: UNAuthenticated Sender\n");
139: res.append("Content-Transfer-Encoding: 8bit\n");
140: res.append("Content-Length: " + document[1].length() + "\n");
141: res.append("\n");
142: res.append(document[1] + "\n");
143: return res.toString();
144: }
145:
146: /** Checks that the tokenizer and the word reader return exactly the same sequence of words.
147: *
148: * @param wordReader the word reader.
149: * @param tok the tokenizer.
150: * @throws IOException
151: */
152: private void checkSameWords(WordReader wordReader,
153: StringTokenizer tok) throws IOException {
154: MutableString word = new MutableString();
155: MutableString nonWord = new MutableString();
156: boolean aWordInDocum, aWordInDocument;
157: boolean firstTime = true;
158: for (;;) {
159: aWordInDocum = wordReader.next(word, nonWord);
160: if (firstTime) {
161: firstTime = false;
162: if (word.equals(""))
163: continue;
164: }
165: assertFalse(aWordInDocum && word.equals(""));
166: aWordInDocument = tok.hasMoreElements();
167: assertEquals(aWordInDocum, aWordInDocument);
168: if (!aWordInDocum)
169: break;
170: assertEquals(tok.nextElement(), word.toString());
171: }
172: }
173:
174: /** Checks that the documents in the collection have the same sequence of words as in
175: * document: the names of the fields to be checked are specified in the array.
176: *
177: * @param coll the collection.
178: * @param fieldName the field names.
179: * @param document documents to be checked against.
180: * @throws IOException
181: */
182: private void checkAllDocuments(final DocumentCollection coll,
183: final String[] fieldName, final String[][] document)
184: throws IOException {
185: final int nfields = fieldName.length;
186: final int[] fieldNumber = new int[nfields];
187: final int[] arrayIndex = new int[nfields];
188: // Look for field indices
189: for (int i = 0; i < nfields; i++) {
190: arrayIndex[i] = i;
191: int j;
192: for (j = 0; j < coll.factory().numberOfFields(); j++)
193: if (coll.factory().fieldName(j).equals(fieldName[i])) {
194: fieldNumber[i] = j;
195: break;
196: }
197: assert j < coll.factory().numberOfFields();
198: }
199: // Sort fields to guarantee that they are correctly numbered
200: GenericSorting.quickSort(0, nfields, new IntComparator() {
201: public int compare(int x, int y) {
202: return fieldNumber[x] - fieldNumber[y];
203: }
204: }, new Swapper() {
205: public void swap(int x, int y) {
206: int t = fieldNumber[x];
207: fieldNumber[x] = fieldNumber[y];
208: fieldNumber[y] = t;
209: t = arrayIndex[x];
210: arrayIndex[x] = arrayIndex[y];
211: arrayIndex[y] = t;
212: String q = fieldName[x];
213: fieldName[x] = fieldName[y];
214: fieldName[y] = q;
215: }
216: });
217: // Start checking
218: for (int doc = 0; doc < coll.size(); doc++) {
219: Document docum = coll.document(doc);
220: for (int i = 0; i < nfields; i++) {
221: int field = fieldNumber[i];
222: Reader content = (Reader) docum.content(field);
223: WordReader wordReader = docum.wordReader(field);
224: wordReader.setReader(content);
225: StringTokenizer tok = new StringTokenizer(
226: document[doc][arrayIndex[i]]);
227: System.err
228: .println("Checking document " + doc + " field "
229: + fieldName[i] + " (" + field + ")");
230: checkSameWords(wordReader, tok);
231: }
232: docum.close();
233: }
234: }
235:
236: /** Checks that the documents in the sequence have the same sequence of words as in
237: * <code>document</code>: the names of the fields to be checked are specified in the array.
238: *
239: * @param seq the sequence.
240: * @param fieldName the field names.
241: * @param document documents to be checked against.
242: * @throws IOException
243: */
244: private void checkAllDocumentsSeq(final DocumentSequence seq,
245: final String[] fieldName, final String[][] document)
246: throws IOException {
247: final int nfields = fieldName.length;
248: final int[] fieldNumber = new int[nfields];
249: final int[] arrayIndex = new int[nfields];
250: // Look for field indices
251: for (int i = 0; i < nfields; i++) {
252: arrayIndex[i] = i;
253: int j;
254: for (j = 0; j < seq.factory().numberOfFields(); j++)
255: if (seq.factory().fieldName(j).equals(fieldName[i])) {
256: fieldNumber[i] = j;
257: break;
258: }
259: assert j < seq.factory().numberOfFields();
260: }
261: // Sort fields to guarantee that they are correctly numbered
262: GenericSorting.quickSort(0, nfields, new IntComparator() {
263: public int compare(int x, int y) {
264: return fieldNumber[x] - fieldNumber[y];
265: }
266: }, new Swapper() {
267: public void swap(int x, int y) {
268: int t = fieldNumber[x];
269: fieldNumber[x] = fieldNumber[y];
270: fieldNumber[y] = t;
271: t = arrayIndex[x];
272: arrayIndex[x] = arrayIndex[y];
273: arrayIndex[y] = t;
274: String q = fieldName[x];
275: fieldName[x] = fieldName[y];
276: fieldName[y] = q;
277: }
278: });
279: // Start checking
280: DocumentIterator iterator = seq.iterator();
281: Document docum;
282: int doc = 0;
283: while ((docum = iterator.nextDocument()) != null) {
284: for (int i = 0; i < nfields; i++) {
285: int field = fieldNumber[i];
286: Reader content = (Reader) docum.content(field);
287: WordReader wordReader = docum.wordReader(field);
288: wordReader.setReader(content);
289: StringTokenizer tok = new StringTokenizer(
290: document[doc][arrayIndex[i]]);
291: System.err.println("Checking sequentially document "
292: + doc + " field " + fieldName[i] + " (" + field
293: + ")");
294: checkSameWords(wordReader, tok);
295: }
296: docum.close();
297: doc++;
298: }
299: iterator.close();
300: }
301:
302: protected void setUp() throws IOException, ClassNotFoundException,
303: ConfigurationException {
304: // Create a new directory under /tmp
305: tempDir = File.createTempFile("mg4jtest", null);
306: tempDir.delete();
307: tempDir.mkdir();
308: // Now create the hierarchy for HTML files
309: File htmlDir = new File(tempDir, "html");
310: htmlDir.mkdir();
311: System.err.println("Temporary directory: " + tempDir);
312: htmlFileSet = new String[ndoc];
313: for (int i = 0; i < ndoc; i++) {
314: String docFile = new File(htmlDir, "doc" + i + ".html")
315: .toString();
316: htmlFileSet[i] = docFile;
317: Writer docWriter = new OutputStreamWriter(
318: new FileOutputStream(docFile), "ISO-8859-1");
319: docWriter.write(getHTMLDocument(document[i]));
320: docWriter.close();
321: }
322: // Now create the mbox file
323: Writer mboxWriter = new OutputStreamWriter(
324: new FileOutputStream(new File(tempDir, "mbox")),
325: "ISO-8859-1");
326: for (int i = 0; i < ndoc; i++)
327: mboxWriter.write(getMboxDocument(document[i]));
328: mboxWriter.close();
329: // Now create the zip collections
330: FileSetDocumentCollection fileSetDocumentCollection = new FileSetDocumentCollection(
331: htmlFileSet,
332: new HtmlDocumentFactory(DEFAULT_PROPERTIES));
333: ZipDocumentCollectionBuilder collBuilder = new ZipDocumentCollectionBuilder(
334: new File(tempDir, "zip").toString(),
335: fileSetDocumentCollection.factory(), true,
336: new ProgressLogger());
337: ZipDocumentCollection zipDocumentCollection = collBuilder
338: .build(fileSetDocumentCollection);
339: BinIO.storeObject(zipDocumentCollection, new File(tempDir,
340: "zip.collection").toString());
341: zipDocumentCollection.close();
342:
343: ZipDocumentCollectionBuilder apprCollBuilder = new ZipDocumentCollectionBuilder(
344: new File(tempDir, "azip").toString(),
345: fileSetDocumentCollection.factory(), false,
346: new ProgressLogger());
347: zipDocumentCollection = apprCollBuilder
348: .build(fileSetDocumentCollection);
349: BinIO.storeObject(zipDocumentCollection, new File(tempDir,
350: "azip.collection").toString());
351: zipDocumentCollection.close();
352: fileSetDocumentCollection.close();
353: }
354:
355: public void testFileSetDocumentCollection() throws IOException,
356: ConfigurationException {
357: System.err.println("Checking fileset collection");
358: FileSetDocumentCollection coll = new FileSetDocumentCollection(
359: htmlFileSet,
360: new HtmlDocumentFactory(DEFAULT_PROPERTIES));
361: assertEquals(coll.size(), ndoc);
362: checkAllDocuments(coll, new String[] { "title", "text" },
363: document);
364: coll.close();
365: }
366:
367: public void testFileSetDocumentCollectionSeq() throws IOException,
368: ConfigurationException {
369: System.err.println("Checking fileset collection sequentially");
370: FileSetDocumentCollection coll = new FileSetDocumentCollection(
371: htmlFileSet,
372: new HtmlDocumentFactory(DEFAULT_PROPERTIES));
373: checkAllDocumentsSeq(coll, new String[] { "title", "text" },
374: document);
375: coll.close();
376: }
377:
378: /* public void testMboxDocumentCollection() throws IOException, ConfigurationException, MessagingException {
379: System.err.println( "Checking mbox collection" );
380: JavamailDocumentCollection coll = new JavamailDocumentCollection( "mstor:" + tempDir, "mbox", DEFAULT_PROPERTIES );
381: checkAllDocuments( coll, new String[] { "subject", "body" }, document );
382: coll.close();
383: }
384:
385: public void testMboxDocumentCollectionSeq() throws IOException, ConfigurationException, MessagingException {
386: System.err.println( "Checking mbox collection sequentially" );
387: JavamailDocumentCollection coll = new JavamailDocumentCollection( "mstor:" + tempDir, "mbox", DEFAULT_PROPERTIES );
388: checkAllDocumentsSeq( coll, new String[] { "subject", "body" }, document );
389: coll.close();
390: }
391: */
392: public void testZipDocumentCollection() throws IOException,
393: ClassNotFoundException {
394: System.err.println("Checking zipped collection");
395: ZipDocumentCollection coll = (ZipDocumentCollection) BinIO
396: .loadObject(new File(tempDir, "zip.collection")
397: .toString());
398: checkAllDocuments(coll, new String[] { "title", "text" },
399: document);
400: coll.close();
401: }
402:
403: public void testZipDocumentCollectionSeq() throws IOException,
404: ClassNotFoundException {
405: System.err.println("Checking zipped collection sequentially");
406: ZipDocumentCollection coll = (ZipDocumentCollection) BinIO
407: .loadObject(new File(tempDir, "zip.collection")
408: .toString());
409: checkAllDocumentsSeq(coll, new String[] { "title", "text" },
410: document);
411: coll.close();
412: }
413:
414: public void testZipDocumentCollectionAppr() throws IOException,
415: ClassNotFoundException {
416: System.err.println("Checking approximated zipped collection");
417: ZipDocumentCollection coll = (ZipDocumentCollection) BinIO
418: .loadObject(new File(tempDir, "azip.collection")
419: .toString());
420: checkAllDocuments(coll, new String[] { "title", "text" },
421: document);
422: coll.close();
423: }
424:
425: public void testZipDocumentCollectionApprSeq() throws IOException,
426: ClassNotFoundException {
427: System.err
428: .println("Checking approximated zipped collection sequentially");
429: ZipDocumentCollection coll = (ZipDocumentCollection) BinIO
430: .loadObject(new File(tempDir, "azip.collection")
431: .toString());
432: checkAllDocumentsSeq(coll, new String[] { "title", "text" },
433: document);
434: coll.close();
435: }
436:
437: public void testInputStreamSequence() throws IOException,
438: ConfigurationException {
439: System.err.println("Checking input stream (text field only)");
440: // Extract only field number 1, and write it out with separator '\u0000'
441: MutableString res = new MutableString();
442: String[][] justSecondField = new String[ndoc][1];
443: for (int i = 0; i < ndoc; i++) {
444: res.append(document[i][1] + "\u0000");
445: justSecondField[i][0] = document[i][1];
446: }
447: String resString = res.toString();
448: // Write the sequence on a file (in UTF-8)
449: Writer resWriter = new OutputStreamWriter(new FileOutputStream(
450: new File(tempDir, "stream")), "UTF-8");
451: resWriter.write(resString);
452: resWriter.close();
453: // Read it as a input stream document sequence
454: InputStream is = new FileInputStream(
455: new File(tempDir, "stream"));
456: DocumentSequence seq = new InputStreamDocumentSequence(is,
457: '\u0000', new IdentityDocumentFactory(
458: DEFAULT_PROPERTIES));
459: checkAllDocumentsSeq(seq, new String[] { "text" },
460: justSecondField);
461: seq.close();
462: }
463:
464: protected void tearDown() throws IOException {
465: FileUtils.forceDeleteOnExit(tempDir);
466: }
467: }
|