001: package it.unimi.dsi.mg4j.tool;
002:
003: /*
004: * MG4J: Managing Gigabytes for Java
005: *
006: * Copyright (C) 2006-2007 Sebastiano Vigna
007: *
008: * This library is free software; you can redistribute it and/or modify it
009: * under the terms of the GNU Lesser General Public License as published by the Free
010: * Software Foundation; either version 2.1 of the License, or (at your option)
011: * any later version.
012: *
013: * This library is distributed in the hope that it will be useful, but
014: * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
015: * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
016: * for more details.
017: *
018: * You should have received a copy of the GNU Lesser General Public License
019: * along with this program; if not, write to the Free Software
020: * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
021: *
022: */
023:
024: import it.unimi.dsi.fastutil.ints.IntList;
025: import it.unimi.dsi.fastutil.io.BinIO;
026: import it.unimi.dsi.mg4j.index.BitStreamIndex;
027: import it.unimi.dsi.mg4j.index.CachingOutputBitStream;
028: import it.unimi.dsi.mg4j.index.CompressionFlags;
029: import it.unimi.dsi.mg4j.index.DiskBasedIndex;
030: import it.unimi.dsi.mg4j.index.BitStreamHPIndexWriter;
031: import it.unimi.dsi.mg4j.index.Index;
032: import it.unimi.dsi.mg4j.index.IndexIterator;
033: import it.unimi.dsi.mg4j.index.IndexReader;
034: import it.unimi.dsi.mg4j.index.BitStreamIndexWriter;
035: import it.unimi.dsi.mg4j.index.IndexWriter;
036: import it.unimi.dsi.mg4j.index.SkipBitStreamIndexWriter;
037: import it.unimi.dsi.mg4j.index.CompressionFlags.Coding;
038: import it.unimi.dsi.mg4j.index.CompressionFlags.Component;
039: import it.unimi.dsi.mg4j.index.cluster.ContiguousDocumentalStrategy;
040: import it.unimi.dsi.mg4j.index.cluster.DocumentalCluster;
041: import it.unimi.dsi.mg4j.index.cluster.DocumentalConcatenatedCluster;
042: import it.unimi.dsi.mg4j.index.cluster.DocumentalMergedCluster;
043: import it.unimi.dsi.mg4j.index.cluster.DocumentalPartitioningStrategy;
044: import it.unimi.dsi.mg4j.index.cluster.DocumentalStrategies;
045: import it.unimi.dsi.mg4j.index.cluster.IndexCluster;
046: import it.unimi.dsi.mg4j.index.payload.Payload;
047: import it.unimi.dsi.io.FastBufferedReader;
048: import it.unimi.dsi.io.InputBitStream;
049: import it.unimi.dsi.io.OutputBitStream;
050: import it.unimi.dsi.Util;
051: import it.unimi.dsi.lang.MutableString;
052: import it.unimi.dsi.logging.ProgressLogger;
053: import it.unimi.dsi.sux4j.util.ShiftAddXorSignedStringMap;
054: import it.unimi.dsi.util.BloomFilter;
055: import it.unimi.dsi.util.ImmutableExternalPrefixMap;
056: import it.unimi.dsi.util.PrefixMap;
057: import it.unimi.dsi.util.Properties;
058: import it.unimi.dsi.util.StringMap;
059:
060: import java.io.BufferedWriter;
061: import java.io.File;
062: import java.io.FileInputStream;
063: import java.io.FileOutputStream;
064: import java.io.IOException;
065: import java.io.InputStreamReader;
066: import java.io.OutputStreamWriter;
067: import java.io.PrintWriter;
068: import java.net.URISyntaxException;
069: import java.util.Map;
070:
071: import org.apache.commons.configuration.ConfigurationException;
072: import org.apache.commons.configuration.ConfigurationMap;
073: import org.apache.log4j.Logger;
074:
075: import com.martiansoftware.jsap.FlaggedOption;
076: import com.martiansoftware.jsap.JSAP;
077: import com.martiansoftware.jsap.JSAPResult;
078: import com.martiansoftware.jsap.Parameter;
079: import com.martiansoftware.jsap.SimpleJSAP;
080: import com.martiansoftware.jsap.Switch;
081: import com.martiansoftware.jsap.UnflaggedOption;
082:
083: /** Partitions an index documentally.
084: *
085: * <p>A global index is partitioned documentally by providing a {@link DocumentalPartitioningStrategy}
086: * that specifies a destination local index for each document, and a local document pointer. The global index
087: * is scanned, and the postings are partitioned among the local indices using the provided strategy. For instance,
088: * a {@link ContiguousDocumentalStrategy} divides an index into blocks of contiguous documents.
089: *
090: * <p>Since each local index contains a (proper) subset of the original set of documents, it contains in general a (proper)
091: * subset of the terms in the global index. Thus, the local term numbers and the global term numbers will not in general coincide.
092: * As a result, when a set of local indices is accessed transparently as a single index
093: * using a {@link it.unimi.dsi.mg4j.index.cluster.DocumentalCluster},
094: * a call to {@link it.unimi.dsi.mg4j.index.Index#documents(int)} will throw an {@link java.lang.UnsupportedOperationException},
095: * because there is no way to map the global term numbers to local term numbers.
096: *
097: * <p>On the other hand, a call to {@link it.unimi.dsi.mg4j.index.Index#documents(CharSequence)} will be passed each local index to
098: * build a global iterator. To speed up this phase for not-so-frequent terms, when partitioning an index you can require
099: * the construction of {@linkplain BloomFilter Bloom filters} that will be used to try to avoid
100: * inquiring indices that do not contain a term. The precision of the filters is settable.
101: *
102: * <p>The property file will use a {@link it.unimi.dsi.mg4j.index.cluster.DocumentalMergedCluster} unless you provide
103: * a {@link ContiguousDocumentalStrategy}, in which case a
104: * {@link it.unimi.dsi.mg4j.index.cluster.DocumentalConcatenatedCluster} will be used instead. Note that there might
105: * be other cases in which the latter is adequate, in which case you can manually edit the property file.
106: *
107: * <p><strong>Important</strong>: this class just partitions the index. No auxiliary files (most notably, {@linkplain StringMap term maps}
108: * or {@linkplain PrefixMap prefix maps}) will be generated. Please refer to a {@link StringMap} implementation (e.g.,
109: * {@link ShiftAddXorSignedStringMap} or {@link ImmutableExternalPrefixMap}).
110: *
111: * <h2>Write-once output and distributed index partitioning</h2>
112: *
113: * Please see {@link it.unimi.dsi.mg4j.tool.PartitionLexically}—the same comments apply.
114: *
115: * @author Alessandro Arrabito
116: * @author Sebastiano Vigna
117: *
118: * @since 1.0.1
119: */
120:
121: public class PartitionDocumentally {
122: private final static Logger LOGGER = Util
123: .getLogger(PartitionDocumentally.class);
124:
125: /** The default buffer size for all involved indices. */
126: public final static int DEFAULT_BUFFER_SIZE = 1024 * 1024;
127:
128: /** The number of local indices. */
129: private final int numIndices;
130: /** The output basenames. */
131: private final String outputBasename;
132: /** The array of local output basenames. */
133: private final String[] localBasename;
134: /** The input basename. */
135: private final String inputBasename;
136: /** The properties of the input index. */
137: private final Properties inputProperties;
138: /** The size of I/O buffers. */
139: private final int bufferSize;
140: /** The filename of the strategy used to partition the index. */
141: private final String strategyFilename;
142: /** The strategy used to perform the partitioning. */
143: private final DocumentalPartitioningStrategy strategy;
144: /** The additional local properties of each local index. */
145: private final Properties[] strategyProperties;
146: /** The logging interval. */
147: private final long logInterval;
148: /** The global index to be partitioned. */
149: private final BitStreamIndex globalIndex;
150: /** A reader on {@link #globalIndex}. */
151: private final IndexReader indexReader;
152: /** A reader for the terms of the global index. */
153: private final FastBufferedReader terms;
154: /** An index writer for each local index. */
155: private final IndexWriter[] indexWriter;
156: /** Whether each {@link #indexWriter} has counts. */
157: private final boolean haveCounts;
158: /** Whether each {@link #indexWriter} has positions. */
159: private final boolean havePositions;
160: /** Whether each {@link #indexWriter} has payloads. */
161: private final boolean havePayloads;
162: /** A bit output stream for global counts of each local index. */
163: private final OutputBitStream[] localGlobCounts;
164: /** A bit output stream for the frequencies of each local index. */
165: private final OutputBitStream[] localFrequencies;
166: /** A print writer for the terms of each local index. */
167: private final PrintWriter[] localTerms;
168: /** The maximum size of a document in each local index. */
169: private final int[] maxDocSize;
170: /** The maximum number of positions in each local index. */
171: private final int[] maxDocPos;
172: /** The number of terms in each local index. */
173: private final int[] numTerms;
174: /** The number of postings in each local index. */
175: private final long[] numPostings;
176: /** The number of occurrences in each local index. */
177: private final long[] numOccurrences;
178: /** The global count for each local index. */
179: private final long[] globCount;
180: /** The required precision for Bloom filters (0 means no filter). */
181: private final int bloomFilterPrecision;
182:
/** Creates a new documental partitioner, opening the global index and every per-local-index output file.
 *
 * <p>The basename of the <var>i</var>-th local index is <code>outputBasename + "-" + i</code>. For each
 * local index an index writer, a frequencies stream, a terms file and (if counts are written) a global-counts
 * stream are created immediately; the actual partitioning is performed by {@link #run()}.
 *
 * @param inputBasename the basename of the global index.
 * @param outputBasename the basename from which the local basenames are derived.
 * @param strategy the strategy assigning each global document to a local index.
 * @param strategyFilename the filename of the strategy, or <code>null</code>.
 * @param bloomFilterPrecision the required precision for Bloom filters (0 means no filter).
 * @param bufferSize the size of I/O buffers.
 * @param writerFlags the compression flags handed to the local index writers; the presence of the
 * {@link Component#COUNTS}, {@link Component#POSITIONS} and {@link Component#PAYLOADS} keys decides which
 * components are written.
 * @param interleaved whether to force interleaved local indices; interleaving is used in any case when
 * positions are not written or payloads are written.
 * @param skips whether interleaved local indices should be written by a {@link SkipBitStreamIndexWriter}
 * rather than a {@link BitStreamIndexWriter}.
 * @param quantum the quantum passed to {@link BitStreamHPIndexWriter} and {@link SkipBitStreamIndexWriter}.
 * @param height the height passed to {@link BitStreamHPIndexWriter} and {@link SkipBitStreamIndexWriter}.
 * @param skipBufferSize the buffer size passed to {@link BitStreamHPIndexWriter} and {@link SkipBitStreamIndexWriter}.
 * @param logInterval the logging interval.
 * @throws IllegalArgumentException if <code>writerFlags</code> requests payloads, counts or positions
 * that the global index does not contain.
 */
public PartitionDocumentally(final String inputBasename,
        final String outputBasename,
        final DocumentalPartitioningStrategy strategy,
        final String strategyFilename,
        final int bloomFilterPrecision, final int bufferSize,
        final Map<Component, Coding> writerFlags,
        boolean interleaved, final boolean skips,
        final int quantum, final int height,
        final int skipBufferSize, final long logInterval)
        throws ConfigurationException, IOException,
        ClassNotFoundException, SecurityException,
        InstantiationException, IllegalAccessException {

    this.inputBasename = inputBasename;
    this.outputBasename = outputBasename;
    this.strategy = strategy;
    this.strategyFilename = strategyFilename;
    this.strategyProperties = strategy.properties();
    this.bufferSize = bufferSize;
    this.logInterval = logInterval;
    this.bloomFilterPrecision = bloomFilterPrecision;
    this.numIndices = strategy.numberOfLocalIndices();

    // The fourth argument of getInstance() depends on how positions are going to be coded.
    final Coding positionCoding = writerFlags.get(Component.POSITIONS);
    final boolean golombOrInterpolativePositions =
            positionCoding == Coding.GOLOMB || positionCoding == Coding.INTERPOLATIVE;

    inputProperties = new Properties(inputBasename + DiskBasedIndex.PROPERTIES_EXTENSION);
    globalIndex = DiskBasedIndex.getInstance(inputBasename, inputProperties,
            false, golombOrInterpolativePositions, false, null);
    indexReader = globalIndex.getReader();

    localBasename = new String[numIndices];
    for (int k = 0; k < numIndices; k++) {
        localBasename[k] = outputBasename + "-" + k;
    }

    // Per-local-index output streams and statistics.
    indexWriter = new IndexWriter[numIndices];
    localGlobCounts = new OutputBitStream[numIndices];
    localFrequencies = new OutputBitStream[numIndices];
    localTerms = new PrintWriter[numIndices];
    maxDocSize = new int[numIndices];
    maxDocPos = new int[numIndices];
    numTerms = new int[numIndices];
    numPostings = new long[numIndices];
    numOccurrences = new long[numIndices];
    globCount = new long[numIndices];

    // A component can be written only if the global index provides it.
    havePayloads = writerFlags.containsKey(Component.PAYLOADS);
    if (havePayloads && !globalIndex.hasPayloads) {
        throw new IllegalArgumentException(
                "You requested payloads, but the global index does not contain them.");
    }
    haveCounts = writerFlags.containsKey(Component.COUNTS);
    if (haveCounts && !globalIndex.hasCounts) {
        throw new IllegalArgumentException(
                "You requested counts, but the global index does not contain them.");
    }
    havePositions = writerFlags.containsKey(Component.POSITIONS);
    if (havePositions && !globalIndex.hasPositions) {
        throw new IllegalArgumentException(
                "You requested positions, but the global index does not contain them.");
    }

    // Without positions, or with payloads, an interleaved index is used regardless of the request.
    final boolean useInterleaved = interleaved | !havePositions | havePayloads;

    for (int k = 0; k < numIndices; k++) {
        final String basename = localBasename[k];
        final int localDocuments = strategy.numberOfDocuments(k);

        if (useInterleaved) {
            if (skips) {
                indexWriter[k] = new SkipBitStreamIndexWriter(basename, localDocuments, true,
                        skipBufferSize, writerFlags, quantum, height);
            } else {
                indexWriter[k] = new BitStreamIndexWriter(basename, localDocuments, true,
                        writerFlags);
            }
        } else {
            indexWriter[k] = new BitStreamHPIndexWriter(basename, localDocuments, true,
                    skipBufferSize, writerFlags, quantum, height);
        }

        // Global counts make sense only if counts are written.
        if (haveCounts) {
            localGlobCounts[k] = new OutputBitStream(basename + DiskBasedIndex.GLOBCOUNTS_EXTENSION);
        }
        localFrequencies[k] = new OutputBitStream(basename + DiskBasedIndex.FREQUENCIES_EXTENSION);

        final FileOutputStream termsFile = new FileOutputStream(basename + DiskBasedIndex.TERMS_EXTENSION);
        localTerms[k] = new PrintWriter(new BufferedWriter(new OutputStreamWriter(termsFile, "UTF-8")));
    }

    final FileInputStream globalTermsFile = new FileInputStream(inputBasename + DiskBasedIndex.TERMS_EXTENSION);
    terms = new FastBufferedReader(new InputStreamReader(globalTermsFile, "UTF-8"));
}
283:
284: private void partitionSizes() throws IOException {
285: final File sizesFile = new File(inputBasename
286: + DiskBasedIndex.SIZES_EXTENSION);
287: if (sizesFile.exists()) {
288: LOGGER.info("Partitioning sizes...");
289: final InputBitStream sizes = new InputBitStream(sizesFile);
290: final OutputBitStream localSizes[] = new OutputBitStream[numIndices];
291: for (int i = 0; i < numIndices; i++)
292: localSizes[i] = new OutputBitStream(localBasename[i]
293: + DiskBasedIndex.SIZES_EXTENSION);
294:
295: // ALERT: for the time being, we decide whether to "fill the gaps" in sizes using as sole indicator the equality between global and local number of documents.
296: int size, localIndex;
297: if (globalIndex.numberOfDocuments == strategy
298: .numberOfDocuments(0)) {
299: for (int i = 0; i < globalIndex.numberOfDocuments; i++) {
300: localSizes[localIndex = strategy.localIndex(i)]
301: .writeGamma(size = sizes.readGamma());
302: if (maxDocSize[localIndex] < size)
303: maxDocSize[localIndex] = size;
304: for (int l = numIndices; l-- != 0;)
305: if (l != localIndex)
306: localSizes[l].writeGamma(0);
307: }
308: } else {
309: for (int i = 0; i < globalIndex.numberOfDocuments; i++) {
310: localSizes[localIndex = strategy.localIndex(i)]
311: .writeGamma(size = sizes.readGamma());
312: if (maxDocSize[localIndex] < size)
313: maxDocSize[localIndex] = size;
314: }
315: }
316:
317: sizes.close();
318: for (int i = 0; i < numIndices; i++)
319: localSizes[i].close();
320: }
321: }
322:
323: public void run() throws Exception {
324: final ProgressLogger pl = new ProgressLogger(LOGGER,
325: logInterval);
326: final IntList sizeList = globalIndex.sizes;
327: partitionSizes();
328:
329: final int[] position = new int[globalIndex.maxCount];
330: final int[] localFrequency = new int[numIndices];
331: final int[] usedIndex = new int[numIndices];
332: final InputBitStream[] direct = new InputBitStream[numIndices];
333: final InputBitStream[] indirect = new InputBitStream[numIndices];
334: final BloomFilter[] bloomFilter = bloomFilterPrecision != 0 ? new BloomFilter[numIndices]
335: : null;
336: final File[] tempFile = new File[numIndices];
337: final CachingOutputBitStream[] temp = new CachingOutputBitStream[numIndices];
338: IndexIterator indexIterator;
339:
340: for (int i = 0; i < numIndices; i++) {
341: tempFile[i] = new File(localBasename[i] + ".temp");
342: temp[i] = new CachingOutputBitStream(tempFile[i],
343: bufferSize);
344: direct[i] = new InputBitStream(temp[i].buffer());
345: indirect[i] = new InputBitStream(tempFile[i]);
346: if (bloomFilterPrecision != 0)
347: bloomFilter[i] = new BloomFilter(
348: globalIndex.numberOfTerms, bloomFilterPrecision);
349: }
350: int usedIndices;
351: MutableString currentTerm = new MutableString();
352: Payload payload = null;
353: int frequency, globalPointer, localIndex, localPointer, count = -1;
354:
355: pl.expectedUpdates = globalIndex.numberOfPostings;
356: pl.itemsName = "postings";
357: pl.logInterval = logInterval;
358: pl.start("Partitioning index...");
359:
360: for (int t = 0; t < globalIndex.numberOfTerms; t++) {
361: terms.readLine(currentTerm);
362: indexIterator = indexReader.nextIterator();
363: usedIndices = 0;
364: frequency = indexIterator.frequency();
365:
366: for (int j = 0; j < frequency; j++) {
367: globalPointer = indexIterator.nextDocument();
368: localIndex = strategy.localIndex(globalPointer);
369:
370: if (localFrequency[localIndex] == 0) {
371: // First time we see a document for this index.
372: currentTerm.println(localTerms[localIndex]);
373: numTerms[localIndex]++;
374: usedIndex[usedIndices++] = localIndex;
375: if (bloomFilterPrecision != 0)
376: bloomFilter[localIndex].add(currentTerm);
377: }
378:
379: /* Store temporarily posting data; note that we save the global pointer as we
380: * will have to access the size list. */
381:
382: localFrequency[localIndex]++;
383: numPostings[localIndex]++;
384: temp[localIndex].writeGamma(globalPointer);
385:
386: if (globalIndex.hasPayloads)
387: payload = indexIterator.payload();
388: if (havePayloads)
389: payload.write(temp[localIndex]);
390:
391: if (haveCounts) {
392: count = indexIterator.count();
393: temp[localIndex].writeGamma(count);
394: globCount[localIndex] += count;
395: if (maxDocPos[localIndex] < count)
396: maxDocPos[localIndex] = count;
397: if (havePositions) {
398: final int[] pos = indexIterator.positionArray();
399: // TODO: compress this stuff
400: for (int p = 0; p < count; p++)
401: temp[localIndex].writeGamma(pos[p]);
402: }
403: }
404: }
405:
406: // We now run through the indices used by this term and copy from the temporary buffer.
407:
408: OutputBitStream obs;
409:
410: for (int k = 0; k < usedIndices; k++) {
411: final int i = usedIndex[k];
412:
413: localFrequencies[i].writeGamma(localFrequency[i]);
414: if (haveCounts)
415: numOccurrences[i] += globCount[i];
416: if (localGlobCounts[i] != null)
417: localGlobCounts[i].writeLongGamma(globCount[i]);
418: globCount[i] = 0;
419:
420: InputBitStream ibs;
421: indexWriter[i].newInvertedList();
422:
423: temp[i].align();
424: if (temp[i].buffer() != null)
425: ibs = direct[i];
426: else {
427: // We cannot read directly from the internal buffer.
428: ibs = indirect[i];
429: ibs.flush();
430: temp[i].flush();
431: }
432:
433: ibs.position(0);
434:
435: indexWriter[i].writeFrequency(localFrequency[i]);
436: for (int j = 0; j < localFrequency[i]; j++) {
437: obs = indexWriter[i].newDocumentRecord();
438: globalPointer = ibs.readGamma();
439: localPointer = strategy.localPointer(globalPointer);
440: indexWriter[i].writeDocumentPointer(obs,
441: localPointer);
442: if (havePayloads) {
443: payload.read(ibs);
444: indexWriter[i].writePayload(obs, payload);
445: }
446: if (haveCounts)
447: indexWriter[i].writePositionCount(obs,
448: count = ibs.readGamma());
449: if (havePositions) {
450: for (int p = 0; p < count; p++)
451: position[p] = ibs.readGamma();
452: indexWriter[i].writeDocumentPositions(obs,
453: position, 0, count,
454: sizeList != null ? sizeList
455: .getInt(globalPointer) : -1);
456: }
457:
458: }
459: temp[i].position(0);
460: temp[i].writtenBits(0);
461: localFrequency[i] = 0;
462: }
463:
464: usedIndices = 0;
465: pl.count += frequency - 1;
466: pl.update();
467: }
468:
469: pl.done();
470:
471: Properties globalProperties = new Properties();
472: globalProperties.setProperty(Index.PropertyKeys.FIELD,
473: inputProperties.getProperty(Index.PropertyKeys.FIELD));
474: globalProperties.setProperty(Index.PropertyKeys.TERMPROCESSOR,
475: inputProperties
476: .getProperty(Index.PropertyKeys.TERMPROCESSOR));
477:
478: for (int i = 0; i < numIndices; i++) {
479: localFrequencies[i].close();
480: if (localGlobCounts[i] != null)
481: localGlobCounts[i].close();
482: localTerms[i].close();
483: indexWriter[i].close();
484: if (bloomFilterPrecision != 0)
485: BinIO.storeObject(bloomFilter[i], localBasename[i]
486: + DocumentalCluster.BLOOM_EXTENSION);
487: temp[i].close();
488: tempFile[i].delete();
489:
490: Properties localProperties = indexWriter[i].properties();
491: localProperties.addAll(globalProperties);
492: localProperties.setProperty(Index.PropertyKeys.MAXCOUNT,
493: String.valueOf(maxDocPos[i]));
494: localProperties.setProperty(Index.PropertyKeys.MAXDOCSIZE,
495: maxDocSize[i]);
496: localProperties.setProperty(Index.PropertyKeys.FIELD,
497: globalProperties
498: .getProperty(Index.PropertyKeys.FIELD));
499: localProperties.setProperty(Index.PropertyKeys.OCCURRENCES,
500: haveCounts ? numOccurrences[i] : -1);
501: localProperties.setProperty(Index.PropertyKeys.POSTINGS,
502: numPostings[i]);
503: localProperties.setProperty(Index.PropertyKeys.TERMS,
504: numTerms[i]);
505: if (havePayloads)
506: localProperties.setProperty(
507: Index.PropertyKeys.PAYLOADCLASS, payload
508: .getClass().getName());
509: if (strategyProperties[i] != null)
510: localProperties.addAll(strategyProperties[i]);
511: localProperties.save(localBasename[i]
512: + DiskBasedIndex.PROPERTIES_EXTENSION);
513: }
514:
515: if (strategyFilename != null)
516: globalProperties.setProperty(
517: IndexCluster.PropertyKeys.STRATEGY,
518: strategyFilename);
519: for (int i = 0; i < numIndices; i++)
520: globalProperties.addProperty(
521: IndexCluster.PropertyKeys.LOCALINDEX,
522: localBasename[i]);
523: globalProperties.setProperty(
524: DocumentalCluster.PropertyKeys.BLOOM,
525: bloomFilterPrecision != 0);
526: // If we partition an index with a single term, by definition we have a flat cluster
527: globalProperties.setProperty(
528: DocumentalCluster.PropertyKeys.FLAT, inputProperties
529: .getInt(Index.PropertyKeys.TERMS) <= 1);
530: globalProperties.setProperty(Index.PropertyKeys.MAXCOUNT,
531: inputProperties
532: .getProperty(Index.PropertyKeys.MAXCOUNT));
533: globalProperties.setProperty(Index.PropertyKeys.MAXDOCSIZE,
534: inputProperties
535: .getProperty(Index.PropertyKeys.MAXDOCSIZE));
536: globalProperties.setProperty(Index.PropertyKeys.POSTINGS,
537: inputProperties
538: .getProperty(Index.PropertyKeys.POSTINGS));
539: globalProperties.setProperty(Index.PropertyKeys.OCCURRENCES,
540: inputProperties
541: .getProperty(Index.PropertyKeys.OCCURRENCES));
542: globalProperties.setProperty(Index.PropertyKeys.DOCUMENTS,
543: inputProperties
544: .getProperty(Index.PropertyKeys.DOCUMENTS));
545: globalProperties.setProperty(Index.PropertyKeys.TERMS,
546: inputProperties.getProperty(Index.PropertyKeys.TERMS));
547: if (havePayloads)
548: globalProperties.setProperty(
549: Index.PropertyKeys.PAYLOADCLASS, payload.getClass()
550: .getName());
551:
552: /* For the general case, we must rely on a merged cluster. However, if we detect a contiguous
553: * strategy we can optimise a bit. */
554:
555: globalProperties
556: .setProperty(
557: Index.PropertyKeys.INDEXCLASS,
558: strategy instanceof ContiguousDocumentalStrategy ? DocumentalConcatenatedCluster.class
559: .getName()
560: : DocumentalMergedCluster.class
561: .getName());
562:
563: globalProperties.save(outputBasename
564: + DiskBasedIndex.PROPERTIES_EXTENSION);
565: LOGGER.debug("Properties for clustered index " + outputBasename
566: + ": " + new ConfigurationMap(globalProperties));
567:
568: }
569:
570: public static void main(final String arg[])
571: throws ConfigurationException, IOException,
572: URISyntaxException, ClassNotFoundException, Exception {
573:
574: SimpleJSAP jsap = new SimpleJSAP(
575: PartitionDocumentally.class.getName(),
576: "Partitions an index documentally.",
577: new Parameter[] {
578: new FlaggedOption(
579: "bufferSize",
580: JSAP.INTSIZE_PARSER,
581: Util
582: .formatBinarySize(DEFAULT_BUFFER_SIZE),
583: JSAP.NOT_REQUIRED, 'b', "buffer-size",
584: "The size of an I/O buffer."),
585: new FlaggedOption(
586: "logInterval",
587: JSAP.LONG_PARSER,
588: Long
589: .toString(ProgressLogger.DEFAULT_LOG_INTERVAL),
590: JSAP.NOT_REQUIRED, 'l', "log-interval",
591: "The minimum time interval between activity logs in milliseconds."),
592: new FlaggedOption("strategy",
593: JSAP.STRING_PARSER, JSAP.NO_DEFAULT,
594: JSAP.NOT_REQUIRED, 's', "strategy",
595: "A serialised documental partitioning strategy."),
596: new FlaggedOption("uniformStrategy",
597: JSAP.INTEGER_PARSER, JSAP.NO_DEFAULT,
598: JSAP.NOT_REQUIRED, 'u', "uniform",
599: "Requires a uniform partitioning in the given number of parts."),
600: new FlaggedOption("bloom", JSAP.INTEGER_PARSER,
601: "0", JSAP.NOT_REQUIRED, 'B', "bloom",
602: "Generates Bloom filters with given precision."),
603: new FlaggedOption("comp", JSAP.STRING_PARSER,
604: JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED,
605: 'c', "comp",
606: "A compression flag for the index (may be specified several times).")
607: .setAllowMultipleDeclarations(true),
608: new Switch(
609: "skips",
610: JSAP.NO_SHORTFLAG,
611: "skips",
612: "Requires skips (which however are present by default, unless you required an interleaved index)."),
613: new Switch("interleaved", JSAP.NO_SHORTFLAG,
614: "interleaved",
615: "Forces an interleaved index."),
616: new FlaggedOption("quantum",
617: JSAP.INTSIZE_PARSER, "64",
618: JSAP.NOT_REQUIRED, 'Q', "quantum",
619: "The skip quantum."),
620: new FlaggedOption("height",
621: JSAP.INTSIZE_PARSER, "8",
622: JSAP.NOT_REQUIRED, 'H', "height",
623: "The skip height."),
624: new FlaggedOption(
625: "skipBufferSize",
626: JSAP.INTSIZE_PARSER,
627: Util
628: .formatBinarySize(SkipBitStreamIndexWriter.DEFAULT_TEMP_BUFFER_SIZE),
629: JSAP.NOT_REQUIRED,
630: JSAP.NO_SHORTFLAG,
631: "skip-buffer-size",
632: "The size of the internal temporary buffer used while creating an index with skips."),
633: new UnflaggedOption("inputBasename",
634: JSAP.STRING_PARSER, JSAP.REQUIRED,
635: "The basename of the global index."),
636: new UnflaggedOption("outputBasename",
637: JSAP.STRING_PARSER, JSAP.REQUIRED,
638: "The basename of the local indices.") });
639:
640: JSAPResult jsapResult = jsap.parse(arg);
641: if (jsap.messagePrinted())
642: return;
643: String inputBasename = jsapResult.getString("inputBasename");
644: String outputBasename = jsapResult.getString("outputBasename");
645: String strategyFilename = jsapResult.getString("strategy");
646: DocumentalPartitioningStrategy strategy = null;
647:
648: if (jsapResult.userSpecified("uniformStrategy")) {
649: strategy = DocumentalStrategies.uniform(jsapResult
650: .getInt("uniformStrategy"), Index
651: .getInstance(inputBasename).numberOfDocuments);
652: BinIO.storeObject(strategy,
653: strategyFilename = outputBasename
654: + IndexCluster.STRATEGY_DEFAULT_EXTENSION);
655: } else if (strategyFilename != null)
656: strategy = (DocumentalPartitioningStrategy) BinIO
657: .loadObject(strategyFilename);
658: else
659: throw new IllegalArgumentException(
660: "You must specify a partitioning strategy");
661:
662: final boolean skips = jsapResult.getBoolean("skips");
663: final boolean interleaved = jsapResult
664: .getBoolean("interleaved");
665: if (interleaved
666: && !skips
667: && (jsapResult.userSpecified("quantum") || jsapResult
668: .userSpecified("height"))) {
669: System.err
670: .println("You specified quantum or height, but did not turn on skips.");
671: return;
672: }
673:
674: new PartitionDocumentally(inputBasename, outputBasename,
675: strategy, strategyFilename, jsapResult.getInt("bloom"),
676: jsapResult.getInt("bufferSize"),
677: CompressionFlags.valueOf(jsapResult
678: .getStringArray("comp"),
679: CompressionFlags.DEFAULT_STANDARD_INDEX),
680: interleaved, skips, jsapResult.getInt("quantum"),
681: jsapResult.getInt("height"), jsapResult
682: .getInt("skipBufferSize"), jsapResult
683: .getLong("logInterval")).run();
684: }
685: }
|