001: package it.unimi.dsi.mg4j.tool;
002:
003: /*
004: * MG4J: Managing Gigabytes for Java
005: *
006: * Copyright (C) 2005-2007 Sebastiano Vigna
007: *
008: * This library is free software; you can redistribute it and/or modify it
009: * under the terms of the GNU Lesser General Public License as published by the Free
010: * Software Foundation; either version 2.1 of the License, or (at your option)
011: * any later version.
012: *
013: * This library is distributed in the hope that it will be useful, but
014: * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
015: * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
016: * for more details.
017: *
018: * You should have received a copy of the GNU Lesser General Public License
019: * along with this program; if not, write to the Free Software
020: * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
021: *
022: */
023:
024: import it.unimi.dsi.fastutil.ints.AbstractIntIterator;
025: import it.unimi.dsi.fastutil.ints.IntIterator;
026: import it.unimi.dsi.fastutil.objects.ObjectHeapSemiIndirectPriorityQueue;
027: import it.unimi.dsi.mg4j.index.BitStreamHPIndexWriter;
028: import it.unimi.dsi.mg4j.index.BitStreamIndex;
029: import it.unimi.dsi.mg4j.index.BitStreamIndexWriter;
030: import it.unimi.dsi.mg4j.index.CompressionFlags;
031: import it.unimi.dsi.mg4j.index.DiskBasedIndex;
032: import it.unimi.dsi.mg4j.index.Index;
033: import it.unimi.dsi.mg4j.index.IndexIterator;
034: import it.unimi.dsi.mg4j.index.IndexReader;
035: import it.unimi.dsi.mg4j.index.IndexWriter;
036: import it.unimi.dsi.mg4j.index.SkipBitStreamIndexWriter;
037: import it.unimi.dsi.mg4j.index.TermProcessor;
038: import it.unimi.dsi.mg4j.index.CompressionFlags.Coding;
039: import it.unimi.dsi.mg4j.index.CompressionFlags.Component;
040: import it.unimi.dsi.mg4j.index.cluster.IndexCluster;
041: import it.unimi.dsi.mg4j.index.payload.Payload;
042: import it.unimi.dsi.io.FastBufferedReader;
043: import it.unimi.dsi.io.InputBitStream;
044: import it.unimi.dsi.io.OutputBitStream;
045: import it.unimi.dsi.Util;
046: import it.unimi.dsi.lang.MutableString;
047: import it.unimi.dsi.lang.ObjectParser;
048: import it.unimi.dsi.logging.ProgressLogger;
049: import it.unimi.dsi.util.Properties;
050:
051: import java.io.BufferedWriter;
052: import java.io.Closeable;
053: import java.io.File;
054: import java.io.FileInputStream;
055: import java.io.FileNotFoundException;
056: import java.io.FileOutputStream;
057: import java.io.IOException;
058: import java.io.InputStreamReader;
059: import java.io.OutputStreamWriter;
060: import java.io.PrintStream;
061: import java.io.PrintWriter;
062: import java.lang.reflect.InvocationTargetException;
063: import java.net.URISyntaxException;
064: import java.util.Arrays;
065: import java.util.Map;
066:
067: import org.apache.commons.configuration.ConfigurationException;
068: import org.apache.commons.configuration.ConfigurationMap;
069: import org.apache.log4j.Logger;
070:
071: import com.martiansoftware.jsap.FlaggedOption;
072: import com.martiansoftware.jsap.JSAP;
073: import com.martiansoftware.jsap.JSAPException;
074: import com.martiansoftware.jsap.JSAPResult;
075: import com.martiansoftware.jsap.Parameter;
076: import com.martiansoftware.jsap.SimpleJSAP;
077: import com.martiansoftware.jsap.Switch;
078: import com.martiansoftware.jsap.UnflaggedOption;
079: import com.martiansoftware.jsap.stringparsers.FileStringParser;
080:
081: /** Combines several indices.
082: *
083: * <p>Indices may be combined in several different ways. This abstract class
084: * contains code that is common to classes such as {@link it.unimi.dsi.mg4j.tool.Merge}
085: * or {@link it.unimi.dsi.mg4j.tool.Concatenate}: essentially, command line parsing,
086: * index opening, and term list fusion is taken care of. Then, the template method
087: * {@link #combine(int)} must write into {@link #indexWriter} the combined inverted
088: * list, returning the resulting frequency.
089: *
090: * <p>Note that by combining a single index into a new one you can recompress an index
091: * with different compression parameters (which includes the possibility of eliminating
092: * positions or counts).
093: *
094: * <p>The subclasses of this class must implement {@link #combine(int)} so that indices
095: * with different sets of features are combined keeping the largest set of features requested
096: * by the user. For instance, combining an index with positions and an index with counts, but
097: * no positions, should generate an index with counts but no positions.
098: *
099: * <p><strong>Warning</strong>: a combination requires opening <em>three</em> files per input index,
100: * plus a few more files for the output index. If the combination process is interrupted by
101: * an exception claiming that there are too many open files, check how to increase the
102: * number of files you can open (usually, for instance on UN*X, there is a global and a per-process limit,
103: * so be sure to set both).
104: *
105: * <h2>Read-once indices, readers, and distributed index combination</h2>
106: *
107: * <p>If the {@linkplain it.unimi.dsi.mg4j.index.Index indices} and
108: * {@linkplain it.unimi.dsi.mg4j.index.BitStreamIndexReader bitstream index readers} involved in the
109: * combination are <em>read-once</em> (i.e., opening an index and reading once its contents sequentially
110: * causes each file composing the index to be read exactly once)
111: * <em>then also {@link it.unimi.dsi.mg4j.tool.Combine} implementations should be read-once</em> ({@link it.unimi.dsi.mg4j.tool.Concatenate},
112: * {@link it.unimi.dsi.mg4j.tool.Merge} and {@link it.unimi.dsi.mg4j.tool.Paste} are).
113: *
114: * <p>This means, in particular, that index combination can be performed from <em>pipes</em>, which in
115: * turn can be filled, for instance, with data coming from the network. In other words, albeit this
116: * class is theoretically based on a number of indices existing on a local disk, those indices can be
117: * substituted with suitable pipes filled with remote data without affecting the combination process.
118: * For instance, the following <samp>bash</samp> code creates three sets of pipes:
119: * <pre style="margin: 1em 0">
120: * for i in 0 1 2; do
121: * for e in frequencies globcounts index offsets properties sizes terms; do
122: * mkfifo pipe$i.$e
123: * done
124: * done
125: * </pre>
126: *
127: * <p>Each pipe should be then filled with suitable data, for instance obtained from the net (assuming
128: * you have indices <samp>index0</samp>, <samp>index1</samp> and <samp>index2</samp> on <samp>example.com</samp>):
129: * <pre style="margin: 1em 0">
130: * for i in 0 1 2; do
131: * for e in frequencies globcounts index offsets properties sizes terms; do
132: * (ssh -x example.com cat index$i.$e >pipe$i.$e &)
133: * done
134: * done
135: * </pre>
136: * <p>Now all pipes will be filled with data from the corresponding remote files, and
137: * combining the indices <samp>pipe0</samp>, <samp>pipe1</samp> and <samp>pipe2</samp>
138: * will give the same result as combining <samp>index0</samp>, <samp>index1</samp> and <samp>index2</samp>
139: * on the remote system.
140: *
141: * @author Sebastiano Vigna
142: * @since 1.0
143: */
144:
145: public abstract class Combine {
146: private static final Logger LOGGER = Util.getLogger(Combine.class);
147: private final static boolean DEBUG = false;
148:
149: /** The default buffer size. */
150: public static final int DEFAULT_BUFFER_SIZE = 1024 * 1024;
151:
152: /** The number of indices to be merged. */
153: final protected int numIndices;
154: /** The array of indices to be merged. */
155: final protected BitStreamIndex[] index;
156: /** An array of index readers parallel to {@link #index}. */
157: final protected IndexReader[] indexReader;
158: /** An array of index iterators parallel to {@link #index} (filled by concrete implementations). */
159: final protected IndexIterator[] indexIterator;
160: /** An array of input bit streams, returning the global counts for each index. */
161: private final InputBitStream[] globCounts;
162: /** Whether to output global counts. */
163: private boolean writeGlobCounts;
164: /** Whether to output sizes. */
165: private boolean writeSizes;
166: /** Compute only index metadata (sizes, terms and globcounts). */
167: private final boolean metadataOnly;
168: /** An array of mutable strings, containing the last term read for a given index. */
169: private MutableString[] term;
170: /** An array of fast buffered readers, used to read the terms of each index. */
171: private FastBufferedReader[] termReader;
172: /** The queue containing terms. */
173: protected ObjectHeapSemiIndirectPriorityQueue<MutableString> termQueue;
174: /** The overall number of documents. */
175: protected final int numberOfDocuments;
176: /** The overall number of occurrences. */
177: protected long numberOfOccurrences;
178: /** The maximum count in the merged index. */
179: protected int maxCount;
180: /** The array of input basenames. */
181: protected final String[] inputBasename;
182: /** The output basename. */
183: private final String outputBasename;
184: /** The size of I/O buffers. */
185: private final int bufferSize;
186: /** The logging interval. */
187: private final long logInterval;
188: /** The index writer for the merged index. */
189: protected IndexWriter indexWriter;
190: /** Whether {@link #indexWriter} has counts. */
191: protected final boolean hasCounts;
192: /** Whether {@link #indexWriter} has positions. */
193: protected final boolean hasPositions;
194: /** Whether {@link #indexWriter} has payloads. */
195: protected final boolean hasPayloads;
196: /** Additional properties for the merged index. */
197: private Properties additionalProperties;
198: /** An array partially filled with the indices (as offsets in {@link #index}) participating to the merge process for the current term. */
199: protected int[] usedIndex;
200: /** For each index, the frequency of the current term (given that it is present). */
201: final protected int[] frequency;
202: /** A cache for positions. */
203: protected int[] position;
204: /** The size of each document. */
205: protected int[] size;
206:
207: public Combine(final String outputBasename,
208: final String[] inputBasename, final boolean metadataOnly,
209: final int bufferSize,
210: final Map<Component, Coding> writerFlags,
211: boolean interleaved, final boolean skips,
212: final int quantum, final int height,
213: final int skipBufferSize, final long logInterval)
214: throws IOException, ConfigurationException,
215: URISyntaxException, ClassNotFoundException,
216: SecurityException, InstantiationException,
217: IllegalAccessException, InvocationTargetException,
218: NoSuchMethodException {
219:
220: this .logInterval = logInterval;
221:
222: LOGGER.debug("Combining indices "
223: + Arrays.toString(inputBasename) + " into "
224: + outputBasename);
225:
226: this .inputBasename = inputBasename;
227: this .outputBasename = outputBasename;
228: this .metadataOnly = metadataOnly;
229: this .bufferSize = bufferSize;
230:
231: numIndices = inputBasename.length;
232: index = new BitStreamIndex[numIndices];
233: indexReader = new IndexReader[numIndices];
234: indexIterator = new IndexIterator[numIndices];
235: globCounts = new InputBitStream[numIndices];
236: term = new MutableString[numIndices];
237: termReader = new FastBufferedReader[numIndices];
238: termQueue = new ObjectHeapSemiIndirectPriorityQueue<MutableString>(
239: term, numIndices);
240:
241: // This will remain set if *all* indices to be merged agree
242: boolean haveCounts = true, havePositions = true;
243: /* This will be set if *all* indices to be merged agree. Moreover, if some
244: * indices disagree we will emit a warning. */
245: TermProcessor termProcessor = null;
246: /* This will be set if *all* indices to be merged agree. Moreover, if some
247: * indices disagree we will emit a warning. */
248: Payload payload = null;
249: String field = null;
250: writeGlobCounts = writeSizes = true;
251: boolean someGlobCounts = false, someSizes = false;
252:
253: for (int i = 0; i < numIndices; i++) {
254: index[i] = getIndex(inputBasename[i]);
255: if (i == 0) {
256: termProcessor = index[0].termProcessor.copy();
257: payload = index[0].payload == null ? null
258: : index[0].payload.copy();
259: } else {
260: if (!termProcessor.equals(index[i].termProcessor))
261: throw new IllegalStateException(
262: "The term processor of the first index ("
263: + termProcessor
264: + ") is different from the term processor of index "
265: + i + " (" + index[i].termProcessor
266: + ")");
267: if ((payload == null) != (index[i].payload == null)
268: || payload != null
269: && !payload.compatibleWith(index[i].payload))
270: throw new IllegalStateException(
271: "The payload specification of index "
272: + index[0]
273: + " is not compatible with that of index "
274: + index[i]);
275: }
276:
277: if (index[i].field != null) {
278: if (field == null) {
279: if (i != 0)
280: LOGGER
281: .warn("Not all indices specify the field property");
282: field = index[i].field;
283: } else if (!field.equals(index[i].field))
284: LOGGER.warn("Index fields disagree: \"" + field
285: + "\", \"" + index[i].field + "\"");
286: }
287:
288: haveCounts &= index[i].hasCounts;
289: havePositions &= index[i].hasPositions;
290: maxCount = Math.max(maxCount, index[i].maxCount);
291: if (!metadataOnly)
292: indexReader[i] = index[i].getReader(bufferSize);
293: if (index[i].properties
294: .getLong(Index.PropertyKeys.OCCURRENCES) == -1)
295: numberOfOccurrences = -1;
296: if (numberOfOccurrences != -1)
297: numberOfOccurrences += index[i].properties
298: .getLong(Index.PropertyKeys.OCCURRENCES);
299: final File globCountsFile = new File(inputBasename[i]
300: + DiskBasedIndex.GLOBCOUNTS_EXTENSION);
301: writeGlobCounts &= globCountsFile.exists();
302: someGlobCounts |= globCountsFile.exists();
303: if (writeGlobCounts)
304: globCounts[i] = new InputBitStream(globCountsFile);
305:
306: final File sizesFile = new File(inputBasename[i]
307: + DiskBasedIndex.SIZES_EXTENSION);
308: writeSizes &= sizesFile.exists();
309: someSizes |= sizesFile.exists();
310:
311: term[i] = new MutableString();
312: termReader[i] = new FastBufferedReader(
313: new InputStreamReader(new FileInputStream(
314: inputBasename[i]
315: + DiskBasedIndex.TERMS_EXTENSION),
316: "UTF-8"));
317: if (termReader[i].readLine(term[i]) != null)
318: termQueue.enqueue(i); // If the term list is nonempty, we enqueue it
319: }
320:
321: if (writeGlobCounts != someGlobCounts)
322: LOGGER
323: .warn("Some (but not all) global-counts file missing");
324: if (writeSizes != someSizes)
325: LOGGER.warn("Some (but not all) sizes file missing");
326:
327: additionalProperties = new Properties();
328: additionalProperties.setProperty(
329: Index.PropertyKeys.TERMPROCESSOR, ObjectParser
330: .toSpec(termProcessor));
331: if (payload != null) {
332: additionalProperties.setProperty(
333: Index.PropertyKeys.PAYLOADCLASS, payload.getClass()
334: .getName());
335: //writerFlags.put( Component.PAYLOADS, null );
336: }
337: additionalProperties.setProperty(Index.PropertyKeys.BATCHES,
338: inputBasename.length);
339: if (field != null)
340: additionalProperties.setProperty(Index.PropertyKeys.FIELD,
341: field);
342:
343: usedIndex = new int[numIndices];
344: frequency = new int[numIndices];
345: position = new int[maxCount];
346: numberOfDocuments = combineNumberOfDocuments();
347:
348: if ((hasCounts = writerFlags.containsKey(Component.COUNTS))
349: && !haveCounts)
350: throw new IllegalArgumentException(
351: "Some of the indices to be combined do not have counts.");
352: if ((hasPositions = writerFlags
353: .containsKey(Component.POSITIONS))
354: && !havePositions)
355: throw new IllegalArgumentException(
356: "Some of the indices to be combined do not have positions.");
357: if ((hasPayloads = writerFlags.containsKey(Component.PAYLOADS))
358: && payload == null)
359: throw new IllegalArgumentException(
360: "Indices to be combined do not have payloads.");
361:
362: interleaved |= !hasPositions || hasPayloads;
363:
364: if (!metadataOnly) {
365: if (!interleaved)
366: indexWriter = new BitStreamHPIndexWriter(
367: outputBasename, numberOfDocuments, true,
368: skipBufferSize, writerFlags, quantum, height);
369: else if (!skips)
370: indexWriter = new BitStreamIndexWriter(outputBasename,
371: numberOfDocuments, true, writerFlags);
372: else
373: indexWriter = new SkipBitStreamIndexWriter(
374: outputBasename, numberOfDocuments, true,
375: skipBufferSize, writerFlags, quantum, height);
376: //else indexWriter = new SqrtSkipIndexWriter( outputBasename, numberOfDocuments, true, writerFlags );
377: }
378: }
379:
380: /** Return a index with given basename, loaded with options suitable to perform the combination.
381: *
382: * <p>This basic implementation calls {@link it.unimi.dsi.mg4j.index.Index#getInstance(CharSequence, boolean, boolean)}
383: * with all Boolean parameters set to false. Subclasses can override this
384: * method to load more data.
385: *
386: * @param basename an index basename.
387: * @return an index loaded with the correct options for the combining strategy.
388: */
389: protected BitStreamIndex getIndex(final CharSequence basename)
390: throws ConfigurationException, IOException,
391: URISyntaxException, ClassNotFoundException,
392: SecurityException, InstantiationException,
393: IllegalAccessException, InvocationTargetException,
394: NoSuchMethodException {
395: return (BitStreamIndex) Index.getInstance(basename, false,
396: false, false);
397: }
398:
399: /** Combines the number of documents.
400: *
401: * @return the number of documents of the combined index.
402: */
403: protected abstract int combineNumberOfDocuments();
404:
405: /** A partial {@link IntIterator} implementation based on γ-coded integers.
406: *
407: * <p>Instances of this class adapt an {@link InputBitStream} to an {@link IntIterator}
408: * by reading γ-coded integers. The implementation is partial because {@link #hasNext()}
409: * always returns true—the user must know in advance how many times {@link #nextInt()}
410: * may be safely called.
411: *
412: * @see #sizes(int)
413: */
414: protected static final class GammaCodedIntIterator extends
415: AbstractIntIterator implements Closeable {
416: final private InputBitStream inputBitStream;
417:
418: public GammaCodedIntIterator(final InputBitStream inputBitStream) {
419: this .inputBitStream = inputBitStream;
420: }
421:
422: /** Returns true.
423: * @return true
424: */
425: public boolean hasNext() {
426: return true;
427: }
428:
429: /** Returns the next γ-coded integer in the underlying {@link InputBitStream}.
430: * @return the result of {@link InputBitStream#readGamma()}.
431: */
432: public int nextInt() {
433: try {
434: return inputBitStream.readGamma();
435: } catch (IOException e) {
436: throw new RuntimeException(e);
437: }
438: }
439:
440: /** Delegates to the underlying {@link InputBitStream}. */
441: public void close() throws IOException {
442: inputBitStream.close();
443: }
444: }
445:
446: /** Returns an iterator on sizes.
447: *
448: * <p>The purpose of this method is to provide {@link #combineSizes()} implementations with
449: * a way to access the size list from a disk file or from {@link BitStreamIndex#sizes} transparently.
450: * This mechanism is essential to ensure that size files are read exactly once.
451: *
452: * <p>The caller should check whether the returned object implements {@link Closeable},
453: * and, in this case, invoke {@link Closeable#close()} after usage.
454: *
455: * @param numIndex the number of an index.
456: * @return an iterator on the sizes of the index.
457: */
458:
459: protected IntIterator sizes(int numIndex)
460: throws FileNotFoundException {
461: if (index[numIndex].sizes != null)
462: return index[numIndex].sizes.listIterator();
463: LOGGER.debug("Reading sizes from " + inputBasename[numIndex]
464: + DiskBasedIndex.SIZES_EXTENSION);
465: return new GammaCodedIntIterator(new InputBitStream(
466: inputBasename[numIndex]
467: + DiskBasedIndex.SIZES_EXTENSION));
468: }
469:
470: /** Combines size lists.
471: *
472: * @return the maximum size of a document in the combined index.
473: * @throws IOException
474: */
475: protected abstract int combineSizes() throws IOException;
476:
477: /** Combines several indices.
478: *
479: * <p>When this method is called, exactly <code>numUsedIndices</code> entries
480: * of {@link #usedIndex} contain, in increasing order, the indices containing
481: * inverted lists for the current term. Implementations of this method must
482: * combine the inverted list, save the total global count for the current
483: * term and return the resulting frequency.
484: *
485: * @param numUsedIndices the number of valid entries in {@link #usedIndex}.
486: * @return the frequency of the combined lists.
487: */
488:
489: protected abstract int combine(int numUsedIndices)
490: throws IOException;
491:
492: public void run() throws ConfigurationException, IOException {
493: final Logger logger = Util.getLogger(this .getClass());
494: final ProgressLogger pl = new ProgressLogger(logger,
495: logInterval);
496:
497: final int maxDocSize;
498: long totalSize = 0;
499:
500: if (writeSizes) {
501: size = new int[numberOfDocuments];
502: logger.info("Combining sizes...");
503:
504: maxDocSize = combineSizes();
505:
506: final OutputBitStream outputSizes = new OutputBitStream(
507: outputBasename + DiskBasedIndex.SIZES_EXTENSION,
508: bufferSize);
509: for (int i = 0; i < numberOfDocuments; i++) {
510: totalSize += size[i];
511: outputSizes.writeGamma(size[i]);
512: }
513: outputSizes.close();
514:
515: logger.info("Sizes combined.");
516: } else
517: maxDocSize = -1;
518:
519: // To write the global count of each term
520: final OutputBitStream outputGlobCounts = writeGlobCounts ? new OutputBitStream(
521: outputBasename + DiskBasedIndex.GLOBCOUNTS_EXTENSION)
522: : null;
523: // To write the frequency of each term
524: final OutputBitStream frequencies = metadataOnly ? null
525: : new OutputBitStream(outputBasename
526: + DiskBasedIndex.FREQUENCIES_EXTENSION);
527: // To write the new term list
528: final PrintWriter termFile = new PrintWriter(
529: new BufferedWriter(new OutputStreamWriter(
530: new FileOutputStream(outputBasename
531: + DiskBasedIndex.TERMS_EXTENSION),
532: "UTF-8"), bufferSize));
533:
534: // The current term
535: MutableString currTerm;
536:
537: // Total number of pointers and occurrences
538: long numPointers = 0;
539:
540: pl.expectedUpdates = writeGlobCounts ? numberOfOccurrences : -1;
541: pl.itemsName = "occurrences";
542: pl.logInterval = logInterval;
543: pl.start("Combining lists...");
544:
545: int totalFrequency, numTerms = 0, numUsedIndices, k;
546: long totalGlobCount = 0;
547:
548: // TODO: use the front of the queue?
549: while (!termQueue.isEmpty()) {
550: numUsedIndices = 0;
551: // We read a new word from the queue, copy it and write it to the term file
552: currTerm = term[k = usedIndex[numUsedIndices++] = termQueue
553: .first()].copy();
554:
555: if (DEBUG)
556: System.err.println("Merging term " + currTerm);
557:
558: currTerm.println(termFile);
559: if (termReader[k].readLine(term[k]) == null)
560: termQueue.dequeue();
561: else
562: termQueue.changed();
563:
564: // Then, we extract all equal words from the queue, accumulating the set of indices in inIndex and currIndex
565: while (!termQueue.isEmpty()
566: && term[termQueue.first()].equals(currTerm)) {
567: k = usedIndex[numUsedIndices++] = termQueue.first();
568: if (termReader[k].readLine(term[k]) == null)
569: termQueue.dequeue();
570: else
571: termQueue.changed();
572: }
573:
574: if (numUsedIndices > 1)
575: Arrays.sort(usedIndex, 0, numUsedIndices);
576:
577: // Load index iterators
578: for (int i = numUsedIndices; i-- != 0;)
579: indexIterator[usedIndex[i]] = indexReader[usedIndex[i]]
580: .nextIterator();
581:
582: numTerms++;
583:
584: if (writeGlobCounts) {
585: // Compute and write the total global count. This works for all kind of indices.
586: totalGlobCount = 0;
587: for (int i = 0; i < numUsedIndices; i++)
588: totalGlobCount += globCounts[usedIndex[i]]
589: .readGamma();
590: outputGlobCounts.writeLongGamma(totalGlobCount);
591: }
592:
593: if (!metadataOnly) {
594: totalFrequency = combine(numUsedIndices);
595: frequencies.writeGamma(totalFrequency);
596: numPointers += totalFrequency;
597: }
598:
599: /* A trick to get a correct prediction. */
600: if (writeGlobCounts)
601: pl.count += totalGlobCount - 1;
602: pl.update();
603: }
604: pl.done();
605:
606: if (writeGlobCounts)
607: outputGlobCounts.close();
608: termFile.close();
609:
610: if (!metadataOnly) {
611: frequencies.close();
612: for (int i = numIndices; i-- != 0;) {
613: indexReader[i].close();
614: if (writeGlobCounts)
615: globCounts[i].close();
616: termReader[i].close();
617: }
618: final long indexSize = indexWriter.writtenBits();
619: indexWriter.close();
620: final Properties properties = indexWriter.properties();
621: additionalProperties.setProperty(Index.PropertyKeys.SIZE,
622: indexSize);
623: additionalProperties.setProperty(
624: Index.PropertyKeys.MAXDOCSIZE, maxDocSize);
625: additionalProperties
626: .setProperty(Index.PropertyKeys.OCCURRENCES,
627: numberOfOccurrences);
628: properties.addAll(additionalProperties);
629: logger.debug("Post-merge properties: "
630: + new ConfigurationMap(properties));
631: properties.save(outputBasename
632: + DiskBasedIndex.PROPERTIES_EXTENSION);
633: }
634:
635: final PrintStream stats = new PrintStream(new FileOutputStream(
636: outputBasename + DiskBasedIndex.STATS_EXTENSION));
637: if (writeSizes)
638: stats.println("Average document size: "
639: + Util.format((double) totalSize
640: / numberOfDocuments));
641: if (!metadataOnly)
642: indexWriter.printStats(stats);
643: stats.close();
644: }
645:
646: public static void main(final String[] arg) throws JSAPException,
647: ConfigurationException, IOException, URISyntaxException,
648: ClassNotFoundException, SecurityException,
649: InstantiationException, IllegalAccessException,
650: InvocationTargetException, NoSuchMethodException {
651: main(arg, null);
652: }
653:
654: public static void main(final String[] arg,
655: final Class<? extends Combine> combineClass)
656: throws JSAPException, ConfigurationException, IOException,
657: URISyntaxException, ClassNotFoundException,
658: SecurityException, InstantiationException,
659: IllegalAccessException, InvocationTargetException,
660: NoSuchMethodException {
661:
662: SimpleJSAP jsap = new SimpleJSAP(
663: Combine.class.getName(),
664: "Combines several indices. By default, documents are concatenated, but you can also merge or paste them by choosing the suitable options, or invoking the corresponding subclass instead of "
665: + Combine.class.getName()
666: + ". Note that by combining a single input index you can recompress an index with new parameters.",
667: new Parameter[] {
668: new FlaggedOption(
669: "bufferSize",
670: JSAP.INTSIZE_PARSER,
671: Util
672: .formatBinarySize(DEFAULT_BUFFER_SIZE),
673: JSAP.NOT_REQUIRED, 'b', "buffer-size",
674: "The size of an I/O buffer."),
675: new FlaggedOption("comp", JSAP.STRING_PARSER,
676: JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED,
677: 'c', "comp",
678: "A compression flag for the index (may be specified several times).")
679: .setAllowMultipleDeclarations(true),
680: new Switch(
681: "skips",
682: JSAP.NO_SHORTFLAG,
683: "skips",
684: "Requires skips (which however are present by default, unless you required an interleaved index)."),
685: new Switch("interleaved", JSAP.NO_SHORTFLAG,
686: "interleaved",
687: "Forces an interleaved index."),
688: new FlaggedOption("quantum",
689: JSAP.INTSIZE_PARSER, "64",
690: JSAP.NOT_REQUIRED, 'Q', "quantum",
691: "The skip quantum."),
692: new FlaggedOption("height",
693: JSAP.INTSIZE_PARSER, "8",
694: JSAP.NOT_REQUIRED, 'H', "height",
695: "The skip height."),
696: new Switch("metadataOnly", 'o',
697: "metadata-only",
698: "Combines only metadata (sizes, terms and globcounts)."),
699: new Switch("merge", 'm', "merge",
700: "Merges indices (duplicates cause an error)."),
701: new Switch("duplicates", 'd', "duplicates",
702: "Pastes indices, concatenating the document positions for duplicates."),
703: new Switch(
704: "properties",
705: 'p',
706: "properties",
707: "The only specified inputBasename will be used to load a property file written by the scanning process."),
708: new FlaggedOption("tempFileDir",
709: FileStringParser.getParser(),
710: JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED,
711: JSAP.NO_SHORTFLAG, "temp-file-dir",
712: "The directory for the temporary file used during pasting."),
713: new FlaggedOption(
714: "tempFileBufferSize",
715: JSAP.INTSIZE_PARSER,
716: Util
717: .formatBinarySize(Paste.DEFAULT_MEMORY_BUFFER_SIZE),
718: JSAP.NOT_REQUIRED, JSAP.NO_SHORTFLAG,
719: "temp-file-buffer-size",
720: "The size of the buffer for the temporary file during pasting."),
721: new FlaggedOption(
722: "skipBufferSize",
723: JSAP.INTSIZE_PARSER,
724: Util
725: .formatBinarySize(SkipBitStreamIndexWriter.DEFAULT_TEMP_BUFFER_SIZE),
726: JSAP.NOT_REQUIRED,
727: JSAP.NO_SHORTFLAG,
728: "skip-buffer-size",
729: "The size of the internal temporary buffer used while creating an index with skips."),
730: new FlaggedOption(
731: "logInterval",
732: JSAP.LONG_PARSER,
733: Long
734: .toString(ProgressLogger.DEFAULT_LOG_INTERVAL),
735: JSAP.NOT_REQUIRED, 'l', "log-interval",
736: "The minimum time interval between activity logs in milliseconds."),
737: new UnflaggedOption("outputBasename",
738: JSAP.STRING_PARSER, JSAP.REQUIRED,
739: "The basename of the resulting index."),
740: new UnflaggedOption("inputBasename",
741: JSAP.STRING_PARSER, JSAP.NO_DEFAULT,
742: JSAP.REQUIRED, JSAP.GREEDY,
743: "The basenames of the indices to be merged.") });
744:
745: JSAPResult jsapResult = jsap.parse(arg);
746: if (jsap.messagePrinted())
747: return;
748:
749: final boolean skips = jsapResult.getBoolean("skips");
750: final boolean interleaved = jsapResult
751: .getBoolean("interleaved");
752: if (interleaved
753: && !skips
754: && (jsapResult.userSpecified("quantum") || jsapResult
755: .userSpecified("height"))) {
756: System.err
757: .println("You specified quantum or height, but did not turn on skips.");
758: return;
759: }
760:
761: if (combineClass != null
762: && jsapResult.userSpecified("duplicates")
763: || jsapResult.userSpecified("merge"))
764: throw new IllegalArgumentException("When invoking "
765: + Combine.class.getName() + " from "
766: + combineClass.getName()
767: + " you cannot choose the combination process");
768:
769: final String[] inputBasename;
770: if (jsapResult.getBoolean("properties")) {
771: if (jsapResult.getStringArray("inputBasename").length > 1)
772: throw new IllegalArgumentException(
773: "When using --properties, you must specify exactly one inputBasename");
774: inputBasename = new Properties(jsapResult
775: .getStringArray("inputBasename")[0]
776: + Scan.CLUSTER_PROPERTIES_EXTENSION)
777: .getStringArray(IndexCluster.PropertyKeys.LOCALINDEX);
778: } else
779: inputBasename = jsapResult.getStringArray("inputBasename");
780: // TODO: resolve problem of passing default flag values without knowing type of index
781: (combineClass == Paste.class
782: || jsapResult.getBoolean("duplicates") ? (Combine) new Paste(
783: jsapResult.getString("outputBasename"), inputBasename,
784: jsapResult.getBoolean("metadataOnly"), jsapResult
785: .getInt("bufferSize"), jsapResult
786: .getFile("tempFileDir"), jsapResult
787: .getInt("tempFileBufferSize"),
788: CompressionFlags.valueOf(jsapResult
789: .getStringArray("comp"),
790: CompressionFlags.DEFAULT_STANDARD_INDEX),
791: interleaved, skips, jsapResult.getInt("quantum"),
792: jsapResult.getInt("height"), jsapResult
793: .getInt("skipBufferSize"), jsapResult
794: .getLong("logInterval"))
795: : combineClass == Merge.class
796: || jsapResult.getBoolean("merge") ? (Combine) new Merge(
797: jsapResult.getString("outputBasename"),
798: inputBasename,
799: jsapResult.getBoolean("metadataOnly"),
800: jsapResult.getInt("bufferSize"),
801: CompressionFlags
802: .valueOf(
803: jsapResult
804: .getStringArray("comp"),
805: CompressionFlags.DEFAULT_STANDARD_INDEX),
806: interleaved, skips, jsapResult
807: .getInt("quantum"), jsapResult
808: .getInt("height"), jsapResult
809: .getInt("skipBufferSize"), jsapResult
810: .getLong("logInterval"))
811: : (Combine) new Concatenate(
812: jsapResult.getString("outputBasename"),
813: inputBasename,
814: jsapResult.getBoolean("metadataOnly"),
815: jsapResult.getInt("bufferSize"),
816: CompressionFlags
817: .valueOf(
818: jsapResult
819: .getStringArray("comp"),
820: CompressionFlags.DEFAULT_STANDARD_INDEX),
821: interleaved, skips, jsapResult
822: .getInt("quantum"), jsapResult
823: .getInt("height"), jsapResult
824: .getInt("skipBufferSize"),
825: jsapResult.getLong("logInterval"))
826:
827: ).run();
828: }
829: }
|