0001: package it.unimi.dsi.mg4j.tool;
0002:
0003: /*
0004: * MG4J: Managing Gigabytes for Java
0005: *
0006: * Copyright (C) 2005-2007 Sebastiano Vigna
0007: *
0008: * This library is free software; you can redistribute it and/or modify it
0009: * under the terms of the GNU Lesser General Public License as published by the Free
0010: * Software Foundation; either version 2.1 of the License, or (at your option)
0011: * any later version.
0012: *
0013: * This library is distributed in the hope that it will be useful, but
0014: * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
0015: * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
0016: * for more details.
0017: *
0018: * You should have received a copy of the GNU Lesser General Public License
0019: * along with this program; if not, write to the Free Software
0020: * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
0021: *
0022: */
0023:
0024: import it.unimi.dsi.fastutil.Hash;
0025: import it.unimi.dsi.fastutil.ints.IntArrayList;
0026: import it.unimi.dsi.fastutil.ints.IntArrays;
0027: import it.unimi.dsi.fastutil.io.BinIO;
0028: import it.unimi.dsi.fastutil.io.FastBufferedOutputStream;
0029: import it.unimi.dsi.fastutil.objects.Object2ReferenceOpenHashMap;
0030: import it.unimi.dsi.fastutil.objects.ObjectList;
0031: import it.unimi.dsi.mg4j.document.Document;
0032: import it.unimi.dsi.mg4j.document.DocumentCollection;
0033: import it.unimi.dsi.mg4j.document.DocumentFactory;
0034: import it.unimi.dsi.mg4j.document.DocumentIterator;
0035: import it.unimi.dsi.mg4j.document.DocumentSequence;
0036: import it.unimi.dsi.mg4j.document.IdentityDocumentFactory;
0037: import it.unimi.dsi.mg4j.document.InputStreamDocumentSequence;
0038: import it.unimi.dsi.mg4j.document.PropertyBasedDocumentFactory;
0039: import it.unimi.dsi.mg4j.document.ZipDocumentCollectionBuilder;
0040: import it.unimi.dsi.mg4j.index.BitStreamIndexWriter;
0041: import it.unimi.dsi.mg4j.index.CompressionFlags;
0042: import it.unimi.dsi.mg4j.index.DiskBasedIndex;
0043: import it.unimi.dsi.mg4j.index.DowncaseTermProcessor;
0044: import it.unimi.dsi.mg4j.index.FileIndex;
0045: import it.unimi.dsi.mg4j.index.Index;
0046: import it.unimi.dsi.mg4j.index.IndexWriter;
0047: import it.unimi.dsi.mg4j.index.NullTermProcessor;
0048: import it.unimi.dsi.mg4j.index.TermProcessor;
0049: import it.unimi.dsi.mg4j.index.CompressionFlags.Coding;
0050: import it.unimi.dsi.mg4j.index.CompressionFlags.Component;
0051: import it.unimi.dsi.mg4j.index.cluster.ContiguousDocumentalStrategy;
0052: import it.unimi.dsi.mg4j.index.cluster.DocumentalCluster;
0053: import it.unimi.dsi.mg4j.index.cluster.DocumentalConcatenatedCluster;
0054: import it.unimi.dsi.mg4j.index.cluster.DocumentalMergedCluster;
0055: import it.unimi.dsi.mg4j.index.cluster.IdentityDocumentalStrategy;
0056: import it.unimi.dsi.mg4j.index.cluster.IndexCluster;
0057: import it.unimi.dsi.mg4j.index.payload.DatePayload;
0058: import it.unimi.dsi.mg4j.index.payload.IntegerPayload;
0059: import it.unimi.dsi.mg4j.index.payload.Payload;
0060: import it.unimi.dsi.mg4j.io.ByteArrayPostingList;
0061: import it.unimi.dsi.io.FastBufferedReader;
0062: import it.unimi.dsi.fastutil.io.FastByteArrayOutputStream;
0063: import it.unimi.dsi.io.InputBitStream;
0064: import it.unimi.dsi.io.OutputBitStream;
0065: import it.unimi.dsi.io.WordReader;
0066: import it.unimi.dsi.Util;
0067: import it.unimi.dsi.mg4j.util.MG4JClassParser;
0068: import it.unimi.dsi.lang.MutableString;
0069: import it.unimi.dsi.lang.ObjectParser;
0070: import it.unimi.dsi.logging.ProgressLogger;
0071: import it.unimi.dsi.util.Properties;
0072: import it.unimi.dsi.mg4j.util.parser.callback.AnchorExtractor;
0073:
0074: import java.io.File;
0075: import java.io.FileNotFoundException;
0076: import java.io.FileOutputStream;
0077: import java.io.FileWriter;
0078: import java.io.IOException;
0079: import java.io.OutputStreamWriter;
0080: import java.io.PrintWriter;
0081: import java.io.Reader;
0082: import java.io.Serializable;
0083: import java.lang.reflect.InvocationTargetException;
0084: import java.util.Arrays;
0085: import java.util.EnumMap;
0086: import java.util.Map;
0087:
0088: import org.apache.commons.configuration.ConfigurationException;
0089: import org.apache.log4j.Logger;
0090:
0091: import cern.colt.GenericSorting;
0092: import cern.colt.Sorting;
0093: import cern.colt.Swapper;
0094: import cern.colt.function.IntComparator;
0095: import cern.colt.function.LongComparator;
0096:
0097: import com.martiansoftware.jsap.FlaggedOption;
0098: import com.martiansoftware.jsap.JSAP;
0099: import com.martiansoftware.jsap.JSAPException;
0100: import com.martiansoftware.jsap.JSAPResult;
0101: import com.martiansoftware.jsap.Parameter;
0102: import com.martiansoftware.jsap.ParseException;
0103: import com.martiansoftware.jsap.SimpleJSAP;
0104: import com.martiansoftware.jsap.Switch;
0105: import com.martiansoftware.jsap.UnflaggedOption;
0106: import com.martiansoftware.jsap.stringparsers.LongSizeStringParser;
0107:
0108: /**
0109: * Scans a document sequence, dividing it in batches of occurrences and writing for each batch a
0110: * corresponding subindex.
0111: *
0112: * <P>This class (more precisely, its
0113: * {@link #run(String, DocumentSequence, TermProcessor, String, int, int, int[], VirtualDocumentResolver[], int[], String, long, String) run()}
0114: * method) reads a document sequence and produces several <em>batches</em>, that is, subindices
0115: * corresponding to subsets of term/document pairs of the collection. A set of batches is generated
0116: * for each indexed field of the collection. A main method invokes the above method setting its
0117: * parameters using suitable options.
0118: *
0119: * <p>Unless a serialised {@link it.unimi.dsi.mg4j.document.DocumentSequence} is specified using
0120: * the suitable option, an implicit {@link it.unimi.dsi.mg4j.document.InputStreamDocumentSequence}
0121: * is created using separator byte (default is 10, i.e., newline). In the latter case, the factory
0122: * and its properties can be set with command-line options.
0123: *
0124: * <P>The only mandatory argument is a <em>basename</em>, which will be used to stem the names
0125: * of all files generated. The first batch of a field named <var>field</var> will use the basename
0126: * <samp><var>basename-field</var>@0</samp>, the second batch <samp><var>basename-field</var>@1</samp>
0127: * and so on. It is also possible to specify a separate directory for batch files (e.g., for easier
0128: * {@linkplain #cleanup(String, int, File) cleanup} when they are no longer necessary).
0129: *
0130: * <P>Since documents are read sequentially, every document has a <em>natural
0131: * index</em> starting
0132: * from 0. If no remapping (i.e., renumbering) is specified, the <em>document
0133: * index</em> of each document
0134: * corresponds to its natural index. If, however, a remapping is specified, under the form of a
0135: * list of integers, the document index of a document is the integer found in the corresponding
0136: * position of the list. More precisely, a remapping for <var>N</var> documents is a list of
0137: * <var>N</var> distinct integers, and a document with natural index <var>i</var> has document
0138: * index given by the <var>i</var>-th element of the list. This is useful when indexing statically
0139: * ranked documents (e.g., if you are indexing a part of the web and would like the index to return
0140: * documents with higher static rank first). If the remapping file is provided, it must be a
0141: * sequence of integers, written using the {@link java.io.DataOutputStream#writeInt(int)} method; if
0142: * <var>N</var> is the number of documents, the file is to contain exactly <var>N</var> distinct
0143: * integers. The integers need not be between 0 and <var>N</var>-1, to allow the remapping of
0144: * subindices (but a warning will be logged in this case, just to be sure you know what you're doing).
0145: *
0146: * <P>Also every term has an associated number starting from 0, assigned in lexicographic order.
0147: *
0148: * <h2>Index types and indexing types</h2>
0149: *
0150: * <p>A <em>standard index</em> contains a list of terms, and for each term a posting list. Each
0151: * posting contains mandatorily a document pointer, and then, optionally, the count and the
0152: * positions of the term (whether the last two elements appear can be specified using suitable
0153: * {@linkplain CompressionFlags compression flags}).
0154: *
0155: * <p>The indexing type of a standard index can be {@link IndexingType#STANDARD},
0156: * {@link IndexingType#REMAPPED} or {@link IndexingType#VIRTUAL}. In the first case, we index the
0157: * words occurring in documents as usual. In the second case, before writing the index all documents
0158: * are renumbered following a provided map. In the third case (used only with
0159: * {@link it.unimi.dsi.mg4j.document.DocumentFactory.FieldType#VIRTUAL} fields) indexing is performed on a virtual document
0160: * obtained by collating a number of {@linkplain VirtualDocumentFragment fragments}.
0161: * Fragments are associated to documents by some key,
0162: * and a {@link VirtualDocumentResolver} turns a key into a document natural number, so that the
0163: * collation process can take place (a settable gap is inserted between fragments).
0164: *
0165: * <p>Besides storing document pointers, document counts, and position, MG4J makes it possible to
0166: * store an arbitrary <em>payload</em> with each posting. This feature is presently used only to
0167: * create <em>payload-based indices</em>—indices without counts and positions that contain
0168: * a single, dummy word <samp>#</samp>. They are actually used to store arbitrary data associated
 * to each document, such as dates and integers: using a special syntax, it is then possible to specify
0170: * <em>range queries</em> on the values of such fields.
0171: *
0172: * <p>The main difference between standard and payload-based indices is that the first type is
0173: * handled by instances of this class, whereas the second type is handled by instances of
0174: * {@link Scan.PayloadAccumulator}. The
0175: * {@link #run(String, DocumentSequence, TermProcessor, String, int, int, int[], VirtualDocumentResolver[], int[], String, long, String) run()}
0176: * method creates a set of suitable instances, one for each indexed field, and feeds them in
0177: * parallel with data from the appropriate field of the same document.
0178: *
0179: * <h2>Batch subdivision and content</h2>
0180: *
0181: * <p>The scanning process uses a user-settable number of documents per batch, and will try to
0182: * build batches containing exactly that number of documents (for all indexed fields). There are of
0183: * course space constraints that could make building exact batches impossible, as the entire data of
 * a batch must fit into core memory. If memory is too low, a batch will be generated with fewer
0185: * documents than expected.
0186: *
0187: * <p>In some extreme cases, it could be impossible to produce cleanly a set of batches for all
0188: * fields: in that case, <em>emergency dumps</em> will create <em>fragmented batches</em>—instead
0189: * of a single batch containing <var>k</var> documents a certain field will generate two separate
 * batches. As a consequence, different fields may have a different number of batches, but a simple
0191: * inspection of the property files (see below) will reveal the details of the emergency dumps (and
0192: * {@link Combine} can be used to rebuild the desired exact batches, if necessary).
0193: *
0194: * <p>The larger the number of documents in a batch is, the quicker index construction will be.
0195: * Usually, some experiments and a look at the logs is all that suffices to find out good parameters
0196: * for the Java virtual machine maximum memory setting and for the number of documents per batch.
0197: *
0198: * <P>These are the files currently generated for each batch (<samp><var>basename</var></samp>
0199: * denotes the basename of the batch, not of the index):
0200: *
0201: * <dl>
0202: *
0203: * <dt><samp><var>basename</var>.terms</samp>
0204: *
0205: * <dd>For each indexed term, the corresponding literal string in UTF-8 encoding. More precisely,
0206: * the <var>i</var>-th line of the file (starting from 0) contains the literal string corresponding
0207: * to term index <var>i</var>.
0208: *
0209: * <dt><samp><var>basename</var>.terms.unsorted</samp>
0210: *
0211: * <dd>The list of indexed terms in the same order in which they were met in the document
0212: * collection. This list is not produced unless you ask for it explicitly with a suitable option.
0213: *
0214: * <dt><samp><var>basename</var>.frequencies</samp>
0215: *
0216: * <dd>For each term, the number of documents in which the term appears in γ coding. More
0217: * precisely, <var>i</var>-th integer of the file (starting from 0) is the number of documents in
0218: * which the term of index <var>i</var> appears.
0219: *
0220: * <dt><samp><var>basename</var>.sizes</samp> (not generated for payload-based indices)
0221: *
0222: * <dd>For each indexed document, the corresponding size (=number of words) in γ coding. More
0223: * precisely, <var>i</var>-th integer of the file (starting from 0) is the size in words of the
0224: * document of index <var>i</var>.
0225: *
0226: * <dt><samp><var>basename</var>.index</samp>
0227: *
0228: * <dd>The inverted index.
0229: *
0230: * <dt><samp><var>basename</var>.offsets</samp> (not generated for payload-based indices)
0231: *
0232: * <dd>For each term, the bit offset in <samp><var>basename</var>.index</samp> at which the
0233: * inverted lists start. More precisely, the first integer is the offset for term 0 in γ
0234: * coding, and then the <var>i</var>-th integer is the difference between the <var>i</var>-th and
0235: * the <var>i</var>−1-th offset in γ coding. If <var>T</var> terms were indexed, this
0236: * file will contain <var>T</var>+1 integers, the last being the difference (in bits) between the
0237: * length of the entire inverted index and the offset of the last inverted list.
0238: *
0239: * <dt><samp><var>basename</var>.globcounts</samp> (not generated for payload-based indices)
0240: *
0241: * <dd>For each term, the number of its occurrences throughout the whole document collection, in
0242: * γ coding. More precisely, the <var>i</var>-th integer of the file (starting from 0) is the
0243: * number of occurrences of the term of index <var>i</var>.
0244: *
0245: *
0246: * <dt><samp><var>basename</var>.properties</samp>
0247: *
0248: * <dd>A Java {@linkplain Properties property file} containing information about the index.
0249: * Currently, the following keys (taken from {@link it.unimi.dsi.mg4j.index.Index.PropertyKeys})
 * are generated:
0251: *
0252: * <dl> <dt>indexclass <dd>the class used to generate the batch (presently, {@link BitStreamIndexWriter});
0253: * <dt>documents <dd>number documents in the collection; <dt>terms <dd>number of indexed terms;
0254: * <dt>occurrences <dd>number of words throughout the whole collection; <dt>postings <dd>number
0255: * of postings (pairs term/document) throughout the whole collection; <dt>maxdocsize <dd>maximum
0256: * size of a document in words; <dt>termprocessor <dd>the term processor (if any) used during the
0257: * index construction; <dt>coding <dd>one or more items, each defining a key/pair value for the
0258: * <em>flag map</em> of the index; each pair is of the form <samp><var>component</var>:<var>coding</var></samp>
0259: * (see {@link it.unimi.dsi.mg4j.index.CompressionFlags}); <dt>field <dd>the name of the field
0260: * that generated this batch (optional) <dt>maxcount <dd>the maximum count in the collection, that
0261: * is, the maximum count of a term maximised on all terms and documents; <dt>size <dd>the index
0262: * size in bits; </dl>
0263: *
0264: * <dt><samp><var>basename</var>.cluster.properties</samp>
0265: *
0266: * <dd>A Java {@linkplain Properties property file} containing information about the set of batches
0267: * seen as a {@link it.unimi.dsi.mg4j.index.cluster.DocumentalCluster}. The keys are same as in the
0268: * previous case, but additionally a number of <samp>localindex</samp> entries specify the basename
0269: * of the batches, and a <samp>splitstrategy</samp>. After creating manually suitable term maps for
0270: * each batch, you will be able to access the set of batches as a single index (but note that
0271: * standard batches have <em>no skip structure</em>, and should not be used
0272: * in production; if you intend to do so, you have to write a customised scanning procedure).
0273: *
0274: * </dl>
0275: *
0276: * @author Sebastiano Vigna
0277: * @since 1.0
0278: */
0279:
0280: public class Scan {
/** The logger used by this class, obtained through the MG4J {@link Util} helper. */
private final static Logger LOGGER = Util.getLogger(Scan.class);

/** Compile-time switch enabling additional internal consistency checks (guarded {@code assert}s). */
private final static boolean ASSERTS = false;
0284:
/** The possible ways in which documents are fed to a {@link Scan} instance. */
public static enum IndexingType {
    /** A standard index: documents will be provided in increasing order. */
    STANDARD,
    /** A remapped index: documents will be provided in separate calls, but in any order. */
    REMAPPED,
    /**
     * A virtual index: documents will be provided in any order, and a document may appear
     * several times.
     */
    VIRTUAL
}
0296:
/** An interface that describes a virtual document fragment.
 *
 * <p>When indexing in {@link IndexingType#VIRTUAL} mode, documents are composed
 * of fragments (typically, some text surrounding an anchor) that are referred
 * to a document by some specification (in the previous case, the content of the <samp>href</samp>
 * attribute of the anchor). This interface is used to describe such fragments (see, e.g.,
 * {@link AnchorExtractor}).
 *
 * @see VirtualDocumentResolver
 */
public static interface VirtualDocumentFragment extends Serializable {
    /** The specification of the document to which this fragment belongs.
     *
     * @return the specification of the document to which this fragment belongs.
     * @see VirtualDocumentResolver
     */
    public MutableString documentSpecifier();

    /** The textual content of this fragment.
     *
     * @return the textual content of this fragment.
     */
    public MutableString text();
}
0323:
/** The extension of the strategy file for the cluster associated to a scan.
 * (Note: the comments on this constant and the next one were swapped in the original.) */
private static final String CLUSTER_STRATEGY_EXTENSION = ".cluster.strategy";

/** The extension of the property file for the cluster associated to a scan. */
public static final String CLUSTER_PROPERTIES_EXTENSION = ".cluster.properties";

/** The frequency (in terms) with which the current number of terms is reported. */
private static final int TERM_REPORT_STEP = 1000000;

/** The initial size of {@link #termMap}, the map from terms to posting lists. */
private static final int INITIAL_TERM_MAP_SIZE = 1000;
0335:
/** A term processor to be applied to terms during the indexing phase. */
private final TermProcessor termProcessor;

/**
 * The current basename of the overall index (usually some basename postfixed with the field
 * name).
 */
private final String basename;

/** The field name, if available. */
private final String field;

/** The size of a buffer, used in all I/O. */
private final int bufferSize;

/** The directory where batch files will be created, or <code>null</code> if batch names
 * are not to be relativised (see {@link #batchBasename(int, String, File)}). */
private final File batchDir;

/** The compression flag map used when writing batches. */
final Map<Component, Coding> flags;

/** A map from each term seen so far to its in-memory posting-list accumulator. */
private Object2ReferenceOpenHashMap<MutableString, ByteArrayPostingList> termMap;

/**
 * The output bit stream for size information. For {@link IndexingType#STANDARD} indexing, the
 * list of γ-coded document sizes. For {@link IndexingType#REMAPPED} indexing, a list of
 * γ-coded document numbers and document sizes.
 */
private OutputBitStream sizes;

/** The total number of occurrences. */
private long totOccurrences;

/** The total number of postings (pairs term/document), accumulated across batches. */
private long totPostings;

/** The total number of documents. */
private int totDocuments;

/** Maximum occurrence count. */
private int maxCount;

/** Maximum size in words of documents seen so far. */
private int globMaxDocSize;

/** The number of documents indexed so far in the current batch. */
private int documentCount;

/** The number of terms seen so far in the current batch. */
private int numTerms;

/** Maximum size in words of documents seen so far in the current batch. */
int maxDocSize;

/** The number of the current batch. */
private int batch;

/** The number of occurrences in the current batch. */
private int numOccurrences;

/** If true, this class experienced an {@link OutOfMemoryError} during some buffer reallocation. */
public boolean outOfMemoryError;

/** The type of indexing for this scan. */
private final IndexingType indexingType;

/** Whether {@link #indexingType} is {@link IndexingType#STANDARD}. */
private final boolean indexingIsStandard;

/** Whether {@link #indexingType} is {@link IndexingType#REMAPPED}. */
private final boolean indexingIsRemapped;

/** Whether {@link #indexingType} is {@link IndexingType#VIRTUAL}. */
private final boolean indexingIsVirtual;

/** The number of occurrences generated by the current document. */
private int occsInCurrDoc;

/** The current maximum position for each document, if the field indexed is virtual. */
protected int[] currMaxPos;

/**
 * The maximum document pointer ever seen (could be different from the last document indexed if
 * {@link #indexingType} is not {@link IndexingType#STANDARD}).
 */
private int maxDocInBatch;

/** The width of the artificial gap introduced between virtual-document fragments. */
protected int virtualDocumentGap;

/** A builder that will be used to zip the document sequence while we pass through it. */
private final ZipDocumentCollectionBuilder builder;

/**
 * The cutpoints of the batches (for building later a
 * {@link it.unimi.dsi.mg4j.index.cluster.ContiguousDocumentalStrategy}).
 */
protected final IntArrayList cutPoints;
0435:
/**
 * Creates a new scanner instance.
 *
 * @param basename the basename (usually a global filename followed by the field name, separated
 * by a dash).
 * @param field the field to be indexed.
 * @param termProcessor the term processor for this index.
 * @param documentsAreInOrder if true, documents will be served in increasing order.
 * @param bufferSize the buffer size used in all I/O.
 * @param builder a builder used to create a compressed document collection on the fly.
 * @param batchDir a directory for batch files; batch names will be relativised to this
 * directory if it is not <code>null</code>.
 * @throws FileNotFoundException
 */
public Scan(final String basename, final String field,
        final TermProcessor termProcessor,
        final boolean documentsAreInOrder, final int bufferSize,
        final ZipDocumentCollectionBuilder builder,
        final File batchDir) throws FileNotFoundException {
    // NOTE(review): when documentsAreInOrder is false this delegates with
    // IndexingType.VIRTUAL and a virtual-document gap of 0, but the main
    // constructor rejects VIRTUAL with a zero gap (IllegalArgumentException).
    // Confirm whether IndexingType.REMAPPED (or a nonzero gap) was intended.
    this(basename, field, termProcessor,
            documentsAreInOrder ? IndexingType.STANDARD
                    : IndexingType.VIRTUAL, 0, 0, bufferSize,
            builder, batchDir);
}
0460:
/**
 * Creates a new scanner instance.
 *
 * @param basename the basename (usually a global filename followed by the field name, separated
 * by a dash).
 * @param field the field to be indexed.
 * @param termProcessor the term processor for this index.
 * @param indexingType the type of indexing procedure; note that passing
 * {@link IndexingType#VIRTUAL} here causes an {@link IllegalArgumentException}, as the
 * virtual-document gap is fixed to 0 by this constructor.
 * @param bufferSize the buffer size used in all I/O.
 * @param builder a builder used to create a compressed document collection on the fly.
 * @param batchDir a directory for batch files; batch names will be relativised to this
 * directory if it is not <code>null</code>.
 * @throws FileNotFoundException
 */
public Scan(final String basename, final String field,
        final TermProcessor termProcessor,
        final IndexingType indexingType, final int bufferSize,
        final ZipDocumentCollectionBuilder builder,
        final File batchDir) throws FileNotFoundException {
    // Delegates to the main constructor with no virtual documents and no gap
    // (both immaterial unless indexingType is VIRTUAL).
    this(basename, field, termProcessor, indexingType, 0, 0,
            bufferSize, builder, batchDir);
}
0475:
/**
 * Creates a new scanner instance.
 *
 * @param basename the basename (usually a global filename followed by the field name, separated
 * by a dash).
 * @param field the field to be indexed.
 * @param termProcessor the term processor for this index.
 * @param indexingType the type of indexing procedure.
 * @param numVirtualDocs the number of virtual documents that will be used, in case of a virtual
 * index; otherwise, immaterial.
 * @param virtualDocumentGap the artificial gap introduced between virtual-document fragments,
 * in case of a virtual index (must be nonzero in that case); otherwise, immaterial.
 * @param bufferSize the buffer size used in all I/O.
 * @param builder a builder used to create a compressed document collection on the fly.
 * @param batchDir a directory for batch files; batch names will be relativised to this
 * directory if it is not <code>null</code>.
 * @throws IllegalArgumentException if <code>indexingType</code> is
 * {@link IndexingType#VIRTUAL} and <code>virtualDocumentGap</code> is zero.
 */
public Scan(final String basename, final String field,
        final TermProcessor termProcessor,
        final IndexingType indexingType, final int numVirtualDocs,
        final int virtualDocumentGap, final int bufferSize,
        final ZipDocumentCollectionBuilder builder,
        final File batchDir) throws FileNotFoundException {
    // Plain field copies.
    this.basename = basename;
    this.field = field;
    this.indexingType = indexingType;
    this.termProcessor = termProcessor;
    this.bufferSize = bufferSize;
    this.builder = builder;
    this.batchDir = batchDir;
    this.virtualDocumentGap = virtualDocumentGap;

    // The first batch always starts at document 0.
    this.cutPoints = new IntArrayList();
    this.cutPoints.add(0);

    termMap = new Object2ReferenceOpenHashMap<MutableString, ByteArrayPostingList>(INITIAL_TERM_MAP_SIZE, Hash.FAST_LOAD_FACTOR);
    flags = new EnumMap<Component, Coding>(CompressionFlags.DEFAULT_STANDARD_INDEX);

    // No document seen yet.
    maxDocInBatch = -1;

    // Precompute convenience flags for the indexing type.
    indexingIsStandard = indexingType == IndexingType.STANDARD;
    indexingIsRemapped = indexingType == IndexingType.REMAPPED;
    indexingIsVirtual = indexingType == IndexingType.VIRTUAL;

    // A virtual index needs a nonzero gap between fragments.
    if (indexingIsVirtual && virtualDocumentGap == 0)
        throw new IllegalArgumentException("Illegal virtual document gap: " + virtualDocumentGap);

    // Position-tracking array is needed only for virtual indexing.
    if (indexingIsVirtual) currMaxPos = new int[numVirtualDocs];
    openSizeBitStream();
}
0530:
0531: /** Cleans all intermediate files generated by a run of this class.
0532: *
0533: * @param basename the basename of the run.
0534: * @param batches the number of generated batches.
0535: * @param batchDir if not <code>null</code>, a temporary directory where the batches are located.
0536: */
0537: public static void cleanup(final String basename,
0538: final int batches, final File batchDir) throws IOException {
0539: final String basepath = (batchDir != null ? new File(basename)
0540: : new File(basename)).getCanonicalPath();
0541: new File(basepath.toString() + CLUSTER_STRATEGY_EXTENSION)
0542: .delete();
0543: new File(basepath.toString() + CLUSTER_PROPERTIES_EXTENSION)
0544: .delete();
0545: for (int i = 0; i < batches; i++) {
0546: final String batchBasename = batchBasename(i, basename,
0547: batchDir);
0548: new File(batchBasename
0549: + DiskBasedIndex.FREQUENCIES_EXTENSION).delete();
0550: new File(batchBasename
0551: + DiskBasedIndex.GLOBCOUNTS_EXTENSION).delete();
0552: new File(batchBasename + DiskBasedIndex.INDEX_EXTENSION)
0553: .delete();
0554: new File(batchBasename + DiskBasedIndex.OFFSETS_EXTENSION)
0555: .delete();
0556: new File(batchBasename + DiskBasedIndex.SIZES_EXTENSION)
0557: .delete();
0558: new File(batchBasename + DiskBasedIndex.STATS_EXTENSION)
0559: .delete();
0560: new File(batchBasename
0561: + DiskBasedIndex.PROPERTIES_EXTENSION).delete();
0562: new File(batchBasename + DiskBasedIndex.TERMS_EXTENSION)
0563: .delete();
0564: new File(batchBasename
0565: + DiskBasedIndex.UNSORTED_TERMS_EXTENSION).delete();
0566: }
0567: }
0568:
0569: /**
0570: * Returns the name of a batch.
0571: *
0572: * <p>You can override this method if you prefer a different batch naming scheme.
0573: *
0574: * @param batch the batch number.
0575: * @param basename the index basename.
0576: * @param batchDir if not <code>null</code>, a temporary directory for batches.
0577: * @return simply <code>basename@batch</code>, if <code>batchDir</code> is
0578: * <code>null</code>; otherwise, we relativise the name to <code>batchDir</code>.
0579: */
0580: protected static String batchBasename(int batch, String basename,
0581: final File batchDir) {
0582: return batchDir != null ? new File(batchDir, basename + "@"
0583: + batch).toString() : basename + "@" + batch;
0584: }
0585:
0586: /**
0587: * Dumps the current batch on disk as an index.
0588: *
0589: * @return the number of occurrences contained in the batch.
0590: */
0591: protected long dumpBatch() throws IOException,
0592: ConfigurationException {
0593:
0594: outOfMemoryError = false;
0595: final String batchBasename = batchBasename(batch, basename,
0596: batchDir);
0597:
0598: LOGGER.debug("Generating index " + batchBasename
0599: + "; documents: " + documentCount + "; terms:"
0600: + numTerms + "; occurrences: " + numOccurrences);
0601:
0602: // We write down all term in appearance order in termArray.
0603: MutableString[] termArray = termMap.keySet().toArray(
0604: new MutableString[numTerms]);
0605:
0606: if (ASSERTS)
0607: assert numTerms == termMap.size();
0608: if (!indexingIsVirtual)
0609: sizes.close();
0610:
0611: // We sort the terms appering in the batch and write them on disk.
0612: Sorting.quickSort(termArray);
0613: final PrintWriter pw = new PrintWriter(new OutputStreamWriter(
0614: new FastBufferedOutputStream(
0615: new FileOutputStream(batchBasename
0616: + DiskBasedIndex.TERMS_EXTENSION),
0617: bufferSize), "UTF-8"));
0618: for (MutableString t : termArray)
0619: t.println(pw);
0620: pw.close();
0621:
0622: try {
0623: final OutputBitStream frequencies = new OutputBitStream(
0624: batchBasename
0625: + DiskBasedIndex.FREQUENCIES_EXTENSION);
0626: final OutputBitStream globCounts = new OutputBitStream(
0627: batchBasename + DiskBasedIndex.GLOBCOUNTS_EXTENSION);
0628:
0629: if (indexingIsStandard) {
0630: final OutputBitStream index = new OutputBitStream(
0631: batchBasename + DiskBasedIndex.INDEX_EXTENSION);
0632: final OutputBitStream offsets = new OutputBitStream(
0633: batchBasename
0634: + DiskBasedIndex.OFFSETS_EXTENSION);
0635:
0636: ByteArrayPostingList baobs;
0637: int maxCount = 0, frequency;
0638: long bitLength, postings = 0, prevOffset = 0;
0639:
0640: offsets.writeGamma(0);
0641:
0642: for (int i = 0; i < numTerms; i++) {
0643: baobs = termMap.get(termArray[i]);
0644: frequency = baobs.frequency;
0645:
0646: baobs.flush();
0647: if (maxCount < baobs.maxCount)
0648: maxCount = baobs.maxCount;
0649: bitLength = baobs.writtenBits();
0650: baobs.align();
0651:
0652: postings += frequency;
0653:
0654: index.writeGamma(frequency - 1);
0655:
0656: // We need special treatment for terms appearing in all documents
0657: if (frequency == documentCount)
0658: baobs.stripPointers(index, bitLength);
0659: else
0660: index.write(baobs.buffer, bitLength);
0661:
0662: frequencies.writeGamma(frequency);
0663: globCounts.writeLongGamma(baobs.globCount);
0664: offsets.writeLongGamma(index.writtenBits()
0665: - prevOffset);
0666: prevOffset = index.writtenBits();
0667: }
0668:
0669: totPostings += postings;
0670:
0671: final Properties properties = new Properties();
0672: properties.setProperty(Index.PropertyKeys.DOCUMENTS,
0673: documentCount);
0674: properties.setProperty(Index.PropertyKeys.TERMS,
0675: numTerms);
0676: properties.setProperty(Index.PropertyKeys.POSTINGS,
0677: postings);
0678: properties.setProperty(Index.PropertyKeys.MAXCOUNT,
0679: maxCount);
0680: properties.setProperty(Index.PropertyKeys.INDEXCLASS,
0681: FileIndex.class.getName());
0682: properties.addProperty(Index.PropertyKeys.CODING,
0683: "FREQUENCIES:GAMMA");
0684: properties.addProperty(Index.PropertyKeys.CODING,
0685: "POINTERS:DELTA");
0686: properties.addProperty(Index.PropertyKeys.CODING,
0687: "COUNTS:GAMMA");
0688: properties.addProperty(Index.PropertyKeys.CODING,
0689: "POSITIONS:DELTA");
0690: properties.setProperty(
0691: Index.PropertyKeys.TERMPROCESSOR, ObjectParser
0692: .toSpec(termProcessor));
0693: properties.setProperty(Index.PropertyKeys.OCCURRENCES,
0694: numOccurrences);
0695: properties.setProperty(Index.PropertyKeys.MAXDOCSIZE,
0696: maxDocSize);
0697: properties.setProperty(Index.PropertyKeys.SIZE, index
0698: .writtenBits());
0699: if (field != null)
0700: properties.setProperty(Index.PropertyKeys.FIELD,
0701: field);
0702: properties.save(batchBasename
0703: + DiskBasedIndex.PROPERTIES_EXTENSION);
0704: index.close();
0705: offsets.close();
0706:
0707: } else {
0708: final IndexWriter indexWriter = new BitStreamIndexWriter(
0709: batchBasename, maxDocInBatch + 1, true, flags);
0710:
0711: ByteArrayPostingList baobs;
0712: OutputBitStream obs;
0713: int maxCount = 0, maxFrequency = 0, frequency, count;
0714: // Compute max frequency and allocate position array.
0715: for (ByteArrayPostingList b : termMap.values()) {
0716: b.flush();
0717: b.align();
0718: if (maxFrequency < b.frequency)
0719: maxFrequency = b.frequency;
0720: if (maxCount < b.maxCount)
0721: maxCount = b.maxCount;
0722: }
0723:
0724: final long[] bitPos = new long[maxFrequency];
0725: final int[] pointer = new int[maxFrequency];
0726: int[] pos = new int[maxCount];
0727:
0728: for (int i = 0; i < numTerms; i++) {
0729: baobs = termMap.get(termArray[i]);
0730: final InputBitStream ibs = new InputBitStream(
0731: baobs.buffer);
0732: frequency = baobs.frequency; // This could be much more than the actual frequency in virtual indices
0733:
0734: // Calculate posting bit positions and corresponding pointers
0735: for (int j = 0; j < frequency; j++) {
0736: bitPos[j] = ibs.readBits(); // Cache bit position
0737: pointer[j] = ibs.readDelta(); // Cache pointer
0738: for (int p = ibs.readGamma() + 1; p-- != 0;)
0739: ibs.readDelta(); // Skip document positions
0740: }
0741:
0742: // Sort stably pointers and positions by increasing pointer
0743: GenericSorting.quickSort(0, frequency,
0744: new IntComparator() {
0745: public int compare(final int i0,
0746: final int i1) {
0747: final int t = pointer[i0]
0748: - pointer[i1];
0749: if (t != 0)
0750: return t;
0751: final long u = bitPos[i0]
0752: - bitPos[i1]; // We need a stable sort
0753: return u < 0 ? -1 : u > 0 ? 1 : 0;
0754: }
0755: }, new Swapper() {
0756: public void swap(final int i0,
0757: final int i1) {
0758: final long t = bitPos[i0];
0759: bitPos[i0] = bitPos[i1];
0760: bitPos[i1] = t;
0761: final int p = pointer[i0];
0762: pointer[i0] = pointer[i1];
0763: pointer[i1] = p;
0764: }
0765: });
0766:
0767: int actualFrequency = frequency;
0768: // Compute actual frequency for virtual indices
0769: if (indexingIsVirtual) {
0770: actualFrequency = 1;
0771: for (int j = 1; j < frequency; j++)
0772: if (pointer[j] != pointer[j - 1])
0773: actualFrequency++;
0774: if (ASSERTS) {
0775: for (int j = 1; j < frequency; j++) {
0776: assert pointer[j] >= pointer[j - 1];
0777: assert pointer[j] != pointer[j - 1]
0778: || bitPos[j] > bitPos[j - 1];
0779: }
0780: }
0781: }
0782:
0783: indexWriter.newInvertedList();
0784: indexWriter.writeFrequency(actualFrequency);
0785:
0786: int currPointer;
0787: for (int j = 0; j < frequency; j++) {
0788: ibs.position(bitPos[j]);
0789: obs = indexWriter.newDocumentRecord();
0790: indexWriter.writeDocumentPointer(obs,
0791: currPointer = ibs.readDelta());
0792: if (ASSERTS)
0793: assert currPointer == pointer[j];
0794: count = ibs.readGamma() + 1;
0795: pos[0] = ibs.readDelta();
0796: for (int p = 1; p < count; p++)
0797: pos[p] = pos[p - 1] + 1 + ibs.readDelta();
0798:
0799: if (indexingIsVirtual) {
0800: while (j < frequency - 1) {
0801: ibs.position(bitPos[j + 1]);
0802: if (currPointer != ibs.readDelta())
0803: break;
0804: j++;
0805: final int moreCount = ibs.readGamma() + 1;
0806: pos = IntArrays.grow(pos, count
0807: + moreCount, count);
0808: pos[count] = ibs.readDelta();
0809: if (ASSERTS)
0810: assert pos[count] > pos[count - 1];
0811: for (int p = 1; p < moreCount; p++)
0812: pos[count + p] = pos[count + p - 1]
0813: + 1 + ibs.readDelta();
0814: count += moreCount;
0815: }
0816: if (maxCount < count)
0817: maxCount = count;
0818: }
0819:
0820: indexWriter.writePositionCount(obs, count);
0821: indexWriter.writeDocumentPositions(obs, pos, 0,
0822: count, -1);
0823: }
0824:
0825: frequencies.writeGamma(actualFrequency);
0826: globCounts.writeLongGamma(baobs.globCount);
0827: }
0828:
0829: indexWriter.close();
0830: final Properties properties = indexWriter.properties();
0831: totPostings += properties.getLong("postings");
0832: properties.setProperty(
0833: Index.PropertyKeys.TERMPROCESSOR, ObjectParser
0834: .toSpec(termProcessor));
0835: properties.setProperty(Index.PropertyKeys.OCCURRENCES,
0836: numOccurrences);
0837: properties.setProperty(Index.PropertyKeys.MAXDOCSIZE,
0838: maxDocSize);
0839: properties.setProperty(Index.PropertyKeys.SIZE,
0840: indexWriter.writtenBits());
0841: if (field != null)
0842: properties.setProperty(Index.PropertyKeys.FIELD,
0843: field);
0844: properties.save(batchBasename
0845: + DiskBasedIndex.PROPERTIES_EXTENSION);
0846:
0847: if (indexingIsRemapped) {
0848: // We must permute sizes
0849: final int[] document = new int[documentCount], size = new int[documentCount];
0850: final InputBitStream sizes = new InputBitStream(
0851: batchBasename
0852: + DiskBasedIndex.SIZES_EXTENSION);
0853: for (int i = 0; i < documentCount; i++) {
0854: document[i] = sizes.readGamma();
0855: size[i] = sizes.readGamma();
0856: }
0857:
0858: GenericSorting.quickSort(0, documentCount,
0859: new IntComparator() {
0860: public int compare(int x, int y) {
0861: return document[x] - document[y];
0862: }
0863: }, new Swapper() {
0864: public void swap(int x, int y) {
0865: int t = document[x];
0866: document[x] = document[y];
0867: document[y] = t;
0868: t = size[x];
0869: size[x] = size[y];
0870: size[y] = t;
0871: }
0872: });
0873:
0874: final OutputBitStream permutedSizes = new OutputBitStream(
0875: batchBasename(batch, basename, batchDir)
0876: + DiskBasedIndex.SIZES_EXTENSION);
0877: for (int i = 0, d = 0; i < documentCount; i++) {
0878: while (d++ < document[i])
0879: permutedSizes.writeGamma(0);
0880: permutedSizes.writeGamma(size[i]);
0881: }
0882: permutedSizes.close();
0883: }
0884: }
0885:
0886: if (indexingIsVirtual) {
0887: final OutputBitStream sizes = new OutputBitStream(
0888: batchBasename(batch, basename, batchDir)
0889: + DiskBasedIndex.SIZES_EXTENSION);
0890: for (int i = 0; i < currMaxPos.length; i++)
0891: sizes.writeGamma(currMaxPos[i]);
0892: sizes.close();
0893: IntArrays.fill(currMaxPos, 0);
0894: }
0895:
0896: globCounts.close();
0897: frequencies.close();
0898: termMap.clear();
0899:
0900: numTerms = 0;
0901: totOccurrences += numOccurrences;
0902: totDocuments += documentCount;
0903: final long result = numOccurrences;
0904: numOccurrences = 0;
0905: globMaxDocSize = Math.max(maxDocSize, globMaxDocSize);
0906: maxDocSize = documentCount = 0;
0907: maxDocInBatch = -1;
0908: if (indexingIsStandard)
0909: cutPoints.add(cutPoints.getInt(cutPoints.size() - 1)
0910: + documentCount);
0911: batch++;
0912:
0913: System.gc(); // This is exactly the right time to do collection and compaction.
0914: return result;
0915: } catch (IOException e) {
0916: LOGGER.fatal("I/O Error on batch " + batch);
0917: throw e;
0918: }
0919: }
0920:
0921: protected void openSizeBitStream() throws FileNotFoundException {
0922: if (!indexingIsVirtual)
0923: sizes = new OutputBitStream(batchBasename(batch, basename,
0924: batchDir)
0925: + DiskBasedIndex.SIZES_EXTENSION);
0926: }
0927:
0928: /**
0929: * Runs in parallel a number of instances.
0930: */
0931: public static void run(final String basename,
0932: final DocumentSequence documentSequence,
0933: final TermProcessor termProcessor,
0934: final String zipCollectionBasename, final int bufferSize,
0935: final int documentsPerBatch, final int[] indexedField,
0936: final String renumberingFile, final long logInterval,
0937: final String tempDirName) throws ConfigurationException,
0938: IOException {
0939: run(basename, documentSequence, termProcessor,
0940: zipCollectionBasename, bufferSize, documentsPerBatch,
0941: indexedField, null, null, renumberingFile, logInterval,
0942: tempDirName);
0943: }
0944:
/**
 * Runs in parallel a number of instances.
 *
 * <p>This commodity method takes care of instantiating one instance per indexed field, and to
 * pass the right information to each instance. All options are common to all fields, except for
 * the number of occurrences in a batch, which can be tuned for each field separately.
 *
 * @param basename the index basename.
 * @param documentSequence a document sequence.
 * @param termProcessor the term processor for this index.
 * @param zipCollectionBasename if not <code>null</code>, the basename of a new GZIP'd
 * collection built using <code>documentSequence</code>.
 * @param bufferSize the buffer size used in all I/O.
 * @param documentsPerBatch the number of documents that we should try to put in each segment.
 * @param indexedField the fields that should be indexed, in increasing order.
 * @param virtualDocumentResolver the array of virtual document resolvers to be used, parallel
 * to <code>indexedField</code>: it can safely contain anything (even <code>null</code>)
 * in correspondence to non-virtual fields, and can safely be <code>null</code> if no fields
 * are virtual.
 * @param virtualGap the array of virtual field gaps to be used, parallel to
 * <code>indexedField</code>: it can safely contain anything in correspondence to non-virtual
 * fields, and can safely be <code>null</code> if no fields are virtual.
 * @param mapFile the name of a file containing a map to be applied to document indices.
 * @param logInterval the minimum time interval between activity logs in milliseconds.
 * @param tempDirName a directory for temporary files.
 * @throws IOException if an I/O error occurs while scanning documents or writing batches.
 * @throws ConfigurationException if saving index properties fails.
 */
@SuppressWarnings("unchecked")
public static void run(final String basename,
		final DocumentSequence documentSequence,
		final TermProcessor termProcessor,
		final String zipCollectionBasename, final int bufferSize,
		final int documentsPerBatch, final int[] indexedField,
		final VirtualDocumentResolver[] virtualDocumentResolver,
		final int[] virtualGap, final String mapFile,
		final long logInterval, final String tempDirName)
		throws ConfigurationException, IOException {

	// NOTE(review): numDocuments is never assigned after this initialisation in
	// this method, so the consistency check at the end is effectively disabled.
	int numDocuments = 0;
	final int numberOfIndexedFields = indexedField.length;
	if (numberOfIndexedFields == 0)
		throw new IllegalArgumentException(
				"You must specify at least one field");
	final DocumentFactory factory = documentSequence.factory();
	final File tempDir = tempDirName == null ? null : new File(
			tempDirName);
	// Every virtual field must come with a resolver mapping fragment
	// specifiers to document pointers.
	for (int i = 0; i < indexedField.length; i++)
		if (factory.fieldType(indexedField[i]) == DocumentFactory.FieldType.VIRTUAL
				&& (virtualDocumentResolver == null || virtualDocumentResolver[i] == null))
			throw new IllegalArgumentException(
					"No resolver was associated with virtual field "
							+ factory.fieldName(indexedField[i]));

	// Optional renumbering map: document i is indexed as map[i].
	final int[] map = mapFile != null ? BinIO.loadInts(mapFile)
			: null;

	final Scan[] scan = new Scan[numberOfIndexedFields]; // To scan textual content
	final PayloadAccumulator[] accumulator = new PayloadAccumulator[numberOfIndexedFields]; // To accumulate
	// document data

	final ProgressLogger pl = new ProgressLogger(LOGGER,
			logInterval, "documents");
	if (documentSequence instanceof DocumentCollection)
		pl.expectedUpdates = ((DocumentCollection) documentSequence)
				.size();
	final boolean zipping = zipCollectionBasename != null;
	final ZipDocumentCollectionBuilder builder = zipping ? new ZipDocumentCollectionBuilder(
			zipCollectionBasename + ".zip", documentSequence
					.factory(), true, pl)
			: null;

	// One Scan per textual/virtual field, one PayloadAccumulator per payload field.
	for (int i = 0; i < numberOfIndexedFields; i++) {
		switch (factory.fieldType(indexedField[i])) {
		case TEXT:
			scan[i] = new Scan(basename + '-'
					+ factory.fieldName(indexedField[i]), factory
					.fieldName(indexedField[i]), termProcessor,
					map != null ? IndexingType.REMAPPED
							: IndexingType.STANDARD, 0, 0,
					bufferSize, builder, tempDir);
			break;
		case VIRTUAL:
			scan[i] = new Scan(basename + '-'
					+ factory.fieldName(indexedField[i]), factory
					.fieldName(indexedField[i]), termProcessor,
					IndexingType.VIRTUAL,
					virtualDocumentResolver[i].numberOfDocuments(),
					virtualGap[i], bufferSize, builder, tempDir);
			break;

		case DATE:
			accumulator[i] = new PayloadAccumulator(basename + '-'
					+ factory.fieldName(indexedField[i]),
					new DatePayload(), factory
							.fieldName(indexedField[i]),
					map != null ? IndexingType.REMAPPED
							: IndexingType.STANDARD,
					documentsPerBatch, tempDir);
			break;
		case INT:
			accumulator[i] = new PayloadAccumulator(basename + '-'
					+ factory.fieldName(indexedField[i]),
					new IntegerPayload(), factory
							.fieldName(indexedField[i]),
					map != null ? IndexingType.REMAPPED
							: IndexingType.STANDARD,
					documentsPerBatch, tempDir);
			break;
		default:

		}
	}

	pl.displayFreeMemory = true;
	pl.start("Indexing documents...");

	DocumentIterator iterator = documentSequence.iterator();
	Reader reader;
	WordReader wordReader;
	ObjectList<VirtualDocumentFragment> fragments;
	Document document;

	int documentPointer = 0, documentsInBatch = 0;
	long batchStartTime = System.currentTimeMillis();
	boolean outOfMemoryError = false, stopCompaction = false;

	// Main loop: feed each document to every per-field indexer, dumping a
	// batch whenever memory runs low or the batch is full.
	while ((document = iterator.nextDocument()) != null) {

		if (zipping)
			builder.startDocument(document.title(), document.uri());
		for (int i = 0; i < numberOfIndexedFields; i++) {
			switch (factory.fieldType(indexedField[i])) {
			case TEXT:
				reader = (Reader) document.content(indexedField[i]);
				wordReader = document.wordReader(indexedField[i]);
				wordReader.setReader(reader);
				if (zipping)
					builder.startTextField();
				scan[i].processDocument(
						map != null ? map[documentPointer]
								: documentPointer, wordReader);
				if (zipping)
					builder.endTextField();
				break;
			case VIRTUAL:
				// Virtual fields are split into fragments, each resolved to a
				// (possibly remapped) target document pointer.
				fragments = (ObjectList<VirtualDocumentFragment>) document
						.content(indexedField[i]);
				wordReader = document.wordReader(indexedField[i]);
				virtualDocumentResolver[i].context(document);
				for (VirtualDocumentFragment fragment : fragments) {
					int virtualDocumentPointer = virtualDocumentResolver[i]
							.resolve(fragment.documentSpecifier());
					if (virtualDocumentPointer < 0)
						continue; // Unresolvable fragments are skipped.
					if (map != null)
						virtualDocumentPointer = map[virtualDocumentPointer];
					wordReader.setReader(new FastBufferedReader(
							fragment.text()));
					scan[i].processDocument(virtualDocumentPointer,
							wordReader);
				}
				if (zipping)
					builder.virtualField(fragments);
				break;
			default:
				// Payload (e.g. DATE/INT) fields.
				Object o = document.content(indexedField[i]);
				accumulator[i].processData(
						map != null ? map[documentPointer]
								: documentPointer, o);
				if (zipping)
					builder.nonTextField(o);
				break;
			}

			// An allocation failure in any posting list forces a batch dump
			// and disables further compaction attempts for this batch.
			if (scan[i] != null && scan[i].outOfMemoryError)
				stopCompaction = outOfMemoryError = true;
		}
		if (zipping)
			builder.endDocument();
		documentPointer++;
		documentsInBatch++;
		document.close();
		pl.update();

		// We try compaction until (after compaction) we have less than 20% memory available
		long percAvailableMemory = Util.percAvailableMemory();
		if (percAvailableMemory < 10 && !stopCompaction) {
			LOGGER.info("Trying compaction... ("
					+ percAvailableMemory + "% available)");
			Util.compactMemory();
			percAvailableMemory = Util.percAvailableMemory();
			if (percAvailableMemory < 20)
				stopCompaction = true;
			LOGGER.info("Compaction completed ("
					+ percAvailableMemory + "% available"
					+ (stopCompaction ? ")" : ", will try again)"));
		}

		// Dump a batch on allocation failure, when the batch is full, or when
		// memory is still scarce after (possible) compaction.
		if (outOfMemoryError
				|| documentsInBatch == documentsPerBatch
				|| percAvailableMemory < 10) {
			if (outOfMemoryError)
				LOGGER
						.warn("OutOfMemoryError during buffer reallocation: writing a batch of "
								+ documentsInBatch + " documents");
			else if (percAvailableMemory < 10)
				LOGGER
						.warn("Available memory below 10%: writing a batch of "
								+ documentsInBatch + " documents");

			long occurrences = 0;
			for (int i = 0; i < numberOfIndexedFields; i++) {
				switch (factory.fieldType(indexedField[i])) {
				case TEXT:
				case VIRTUAL:
					occurrences += scan[i].dumpBatch();
					scan[i].openSizeBitStream();
					break;
				default:
					accumulator[i].writeData();
				}
			}

			LOGGER
					.info("Last set of batches indexed at "
							+ Util
									.format((1000. * occurrences)
											/ (System
													.currentTimeMillis() - batchStartTime))
							+ " occurrences/s");
			batchStartTime = System.currentTimeMillis();
			documentsInBatch = 0;
			stopCompaction = outOfMemoryError = false;
		}
	}

	iterator.close();
	if (builder != null)
		BinIO.storeObject(builder.close(), zipCollectionBasename
				+ DocumentCollection.DEFAULT_EXTENSION);

	// Flush and close all per-field indexers.
	for (int i = 0; i < numberOfIndexedFields; i++) {
		switch (factory.fieldType(indexedField[i])) {
		case TEXT:
		case VIRTUAL:
			scan[i].close();
			break;
		default:
			accumulator[i].close();
			break;
		}

	}

	pl.done();

	if (numDocuments > 0 && documentPointer != numDocuments)
		LOGGER
				.error("The document sequence contains "
						+ documentPointer
						+ " documents, but the ZerothPass property file claims that there are "
						+ numDocuments + " documents");
	if (map != null && documentPointer != map.length)
		LOGGER.warn("The document sequence contains "
				+ documentPointer
				+ " documents, but the map contains " + map.length
				+ " integers");
}
1214:
/** A reusable buffer holding the word most recently returned by the word reader. */
final MutableString word = new MutableString();

/** A reusable buffer holding the nonword (separator) following {@link #word}. */
final MutableString nonWord = new MutableString();

/** The default delimiter separating two documents read from standard input (a newline). */
public static final int DEFAULT_DELIMITER = 10;

/** The default batch size. */
public static final int DEFAULT_BATCH_SIZE = 100000;

/** The default buffer size. */
public static final int DEFAULT_BUFFER_SIZE = 64 * 1024;

/** The default virtual field gap. */
public static final int DEFAULT_VIRTUAL_DOCUMENT_GAP = 64;
1230:
/**
 * Processes a document.
 *
 * <p>Every term surviving the term processor is appended, with its position,
 * to the in-memory posting list associated with it in <code>termMap</code>
 * (the list is created on the term's first occurrence). For standard and
 * remapped indexing the document size is written to the size stream; for
 * virtual indexing, positions are instead carried over per document in
 * <code>currMaxPos</code>.
 *
 * @param documentPointer the integer pointer associated to the document.
 * @param wordReader the word reader associated to the document.
 * @throws IOException if an I/O error occurs while writing sizes or feeding the builder.
 */
public void processDocument(final int documentPointer,
		final WordReader wordReader) throws IOException {
	// Virtual indexing resumes from the last position recorded for this
	// document; otherwise positions restart from 0.
	int pos = indexingIsVirtual ? currMaxPos[documentPointer] : 0;
	// Standard indexing numbers documents by arrival order; otherwise the
	// caller-supplied (possibly remapped or virtual) pointer is used.
	final int actualPointer = indexingIsStandard ? documentCount
			: documentPointer;
	ByteArrayPostingList termBaobs;

	word.length(0);
	nonWord.length(0);

	while (wordReader.next(word, nonWord)) {
		if (builder != null)
			builder.add(word, nonWord);
		if (word.length() == 0)
			continue;
		if (!termProcessor.processTerm(word)) {
			pos++; // We do consider the positions of terms canceled out by the term processor.
			continue;
		}

		// We check whether we have already seen this term. If not, we add it to the term map.
		if ((termBaobs = termMap.get(word)) == null) {
			// word is a reusable buffer, hence the defensive copy() as key.
			termMap.put(word.copy(),
					termBaobs = new ByteArrayPostingList(
							new byte[32], indexingIsStandard));
			numTerms++;
			if (numTerms % TERM_REPORT_STEP == 0)
				LOGGER.info("[" + Util.format(numTerms)
						+ " term(s), "
						+ Util.format(totOccurrences + 1)
						+ " occ(s)]");
		}

		// We now record the occurrence. If a renumbering map has
		// been specified, we have to renumber the document index through it.
		termBaobs.setDocumentPointer(actualPointer);
		termBaobs.addPosition(pos);
		// Record whether this posting list has an out-of-memory-error problem.
		if (termBaobs.outOfMemoryError)
			outOfMemoryError = true;
		occsInCurrDoc++;
		numOccurrences++;
		pos++;
	}

	if (pos > maxDocSize)
		maxDocSize = pos;

	// Standard indexing stores just the size; remapped indexing stores
	// (pointer, size) pairs to be permuted later when the batch is dumped.
	if (indexingIsStandard)
		sizes.writeGamma(pos);
	else if (indexingIsRemapped) {
		sizes.writeGamma(actualPointer);
		sizes.writeGamma(pos);
	}

	// Advance the per-document position watermark past this document's
	// occurrences plus the configured gap between virtual fragments.
	if (indexingIsVirtual)
		currMaxPos[documentPointer] += occsInCurrDoc
				+ virtualDocumentGap;

	pos = occsInCurrDoc = 0;
	documentCount++;
	if (actualPointer > maxDocInBatch)
		maxDocInBatch = actualPointer;
}
1301:
1302: private static void makeEmpty(final String filename)
1303: throws IOException {
1304: final File file = new File(filename);
1305: if (file.exists() && !file.delete())
1306: throw new IOException("Cannot delete file " + file);
1307: file.createNewFile();
1308: }
1309:
/**
 * Closes this pass, releasing all resources.
 *
 * <p>Any pending batch is dumped; if nothing at all was indexed, a single
 * empty batch is generated so downstream tools always find at least one
 * batch. The global property file for the field is then saved and, for
 * non-virtual indexing, a second property file describing the set of
 * batches as a documental cluster is saved together with the serialized
 * clustering strategy.
 *
 * @throws ConfigurationException if saving a property file fails.
 * @throws IOException if an I/O error occurs.
 */
public void close() throws ConfigurationException, IOException {
	if (numOccurrences > 0)
		dumpBatch();

	if (numOccurrences == 0 && batch == 0) {
		// Special case: no term has been indexed. We generate an empty batch.
		final String batchBasename = batchBasename(0, basename,
				batchDir);
		LOGGER.debug("Generating empty index " + batchBasename);
		makeEmpty(batchBasename + DiskBasedIndex.TERMS_EXTENSION);
		makeEmpty(batchBasename
				+ DiskBasedIndex.FREQUENCIES_EXTENSION);
		makeEmpty(batchBasename
				+ DiskBasedIndex.GLOBCOUNTS_EXTENSION);
		makeEmpty(batchBasename + DiskBasedIndex.SIZES_EXTENSION);

		final IndexWriter indexWriter = new BitStreamIndexWriter(
				batchBasename, totDocuments, true, flags);
		indexWriter.close();
		final Properties properties = indexWriter.properties();
		properties.setProperty(Index.PropertyKeys.TERMPROCESSOR,
				ObjectParser.toSpec(termProcessor));
		properties.setProperty(Index.PropertyKeys.OCCURRENCES, 0);
		properties.setProperty(Index.PropertyKeys.MAXCOUNT, 0);
		properties.setProperty(Index.PropertyKeys.MAXDOCSIZE,
				maxDocSize);
		properties.setProperty(Index.PropertyKeys.SIZE, 0);
		if (field != null)
			properties.setProperty(Index.PropertyKeys.FIELD, field);
		properties.save(batchBasename
				+ DiskBasedIndex.PROPERTIES_EXTENSION);
		batch = 1;
	}

	termMap = null; // Allow garbage collection of the in-memory postings.

	// Global property file describing the whole field index.
	final Properties properties = new Properties();
	if (field != null)
		properties.setProperty(Index.PropertyKeys.FIELD, field);
	properties.setProperty(Index.PropertyKeys.BATCHES, batch);
	properties.setProperty(Index.PropertyKeys.DOCUMENTS,
			totDocuments);
	properties.setProperty(Index.PropertyKeys.MAXDOCSIZE,
			globMaxDocSize);
	properties.setProperty(Index.PropertyKeys.MAXCOUNT, maxCount);
	properties.setProperty(Index.PropertyKeys.OCCURRENCES,
			totOccurrences);
	properties
			.setProperty(Index.PropertyKeys.POSTINGS, totPostings);
	properties.setProperty(Index.PropertyKeys.TERMPROCESSOR,
			termProcessor.getClass().getName());

	if (!indexingIsVirtual) {
		// This set of batches can be seen as a documental cluster index.
		final Properties clusterProperties = new Properties();
		clusterProperties.addAll(properties);
		clusterProperties.setProperty(Index.PropertyKeys.TERMS, -1);
		clusterProperties.setProperty(
				DocumentalCluster.PropertyKeys.BLOOM, false);
		clusterProperties.setProperty(
				IndexCluster.PropertyKeys.FLAT, false);

		if (indexingIsStandard) {
			// Standard batches partition documents contiguously at cutPoints.
			clusterProperties.setProperty(
					Index.PropertyKeys.INDEXCLASS,
					DocumentalConcatenatedCluster.class.getName());
			BinIO.storeObject(new ContiguousDocumentalStrategy(
					cutPoints.toIntArray()), basename
					+ CLUSTER_STRATEGY_EXTENSION);
		} else { // Remapped
			clusterProperties.setProperty(
					Index.PropertyKeys.INDEXCLASS,
					DocumentalMergedCluster.class.getName());
			BinIO.storeObject(new IdentityDocumentalStrategy(batch,
					totDocuments), basename
					+ CLUSTER_STRATEGY_EXTENSION);
		}
		clusterProperties.setProperty(
				IndexCluster.PropertyKeys.STRATEGY, basename
						+ CLUSTER_STRATEGY_EXTENSION);
		for (int i = 0; i < batch; i++)
			clusterProperties.addProperty(
					IndexCluster.PropertyKeys.LOCALINDEX,
					batchBasename(i, basename, batchDir));
		clusterProperties.save(basename
				+ CLUSTER_PROPERTIES_EXTENSION);

	}

	properties.save(basename + DiskBasedIndex.PROPERTIES_EXTENSION);
}
1404:
1405: public String toString() {
1406: return this .getClass().getSimpleName() + "(" + basename + ":"
1407: + field + ")";
1408: }
1409:
1410: /**
1411: * An accumulator for payloads.
1412: *
1413: * <P>This class is essentially a stripped-down version of {@link Scan} that just accumulate
1414: * payloads in a bitstream and releases them in batches. The main difference is that neither
1415: * sizes nor globcounts are saved (as they would not make much sense).
1416: */
1417:
1418: protected static class PayloadAccumulator {
1419: /**
1420: * The current basename of the overall index (usually some basename postfixed with the field
1421: * name).
1422: */
1423: private final String basename;
1424:
1425: /** The field name, if available. */
1426: private final String field;
1427:
1428: /** The total number of postings (pairs term/document). */
1429: private long totPostings;
1430:
1431: /** The directory where batches files will be created. */
1432: private final File batchDir;
1433:
1434: /** The flag map for batches. */
1435: final Map<Component, Coding> flags;
1436:
1437: /** The total number of documents. */
1438: private int totDocuments;
1439:
1440: /** The number of documents indexed so far in the current batch. */
1441: private int documentCount;
1442:
1443: /** The current batch. */
1444: private int batch;
1445:
1446: /** The type of indexing for this scan. */
1447: private final IndexingType indexingType;
1448:
1449: /** The pointers into the stream, if {@link #indexingType} is {@link IndexingType#REMAPPED}. */
1450: private long position[];
1451:
1452: /** The output stream underlying this accumulator. */
1453: private FastByteArrayOutputStream accumulatorStream;
1454:
1455: /** The accumulating output bit stream, wrapping {@link #accumulatorStream}. */
1456: private OutputBitStream accumulator;
1457:
1458: /**
1459: * The cutpoints of the batches (for building later a
1460: * {@link it.unimi.dsi.mg4j.index.cluster.ContiguousDocumentalStrategy}).
1461: */
1462: protected final IntArrayList cutPoints;
1463:
1464: /** The payload accumulated by this accumulator. */
1465: private final Payload payload;
1466:
1467: /** The maximum document ever seen in the current batch. */
1468: private int maxDocInBatch;
1469:
1470: /**
1471: * Creates a new accumulator.
1472: *
1473: * @param basename the basename (usually a global filename followed by the field name,
1474: * separated by a dash).
1475: * @param payload the payload stored by this accumulator.
1476: * @param field the name of the accumulated field.
1477: * @param indexingType the type of indexing procedure.
1478: * @param documentsPerBatch the number of documents in each batch.
1479: * @param batchDir a directory for batch files; batch names will be relativised to this
1480: * directory if it is not <code>null</code>.
1481: */
1482: public PayloadAccumulator(final String basename,
1483: final Payload payload, final String field,
1484: final IndexingType indexingType,
1485: final int documentsPerBatch, final File batchDir) {
1486: this .basename = basename;
1487: this .payload = payload;
1488: this .field = field;
1489: this .indexingType = indexingType;
1490: if (indexingType != IndexingType.STANDARD
1491: && indexingType != IndexingType.REMAPPED)
1492: throw new UnsupportedOperationException(
1493: "Non-standard payload-based indices support only standard and remapped indexing");
1494: if (indexingType == IndexingType.REMAPPED)
1495: position = new long[documentsPerBatch];
1496: this .batchDir = batchDir;
1497: this .cutPoints = new IntArrayList();
1498: this .cutPoints.add(0);
1499:
1500: flags = new EnumMap<Component, Coding>(
1501: CompressionFlags.DEFAULT_PAYLOAD_INDEX);
1502: accumulatorStream = new FastByteArrayOutputStream();
1503: accumulator = new OutputBitStream(accumulatorStream);
1504: }
1505:
/**
 * Writes in compressed form the data currently accumulated.
 *
 * <p>The in-memory bitstream of payloads is turned into a one-term batch
 * index whose single inverted list stores one payload per document. For
 * remapped indexing the records are first sorted by document pointer. A
 * fake one-line term file is also generated, and batch bookkeeping
 * (cut points, counters) is updated for the next batch.
 *
 * @throws IOException if an I/O error occurs while writing the batch.
 * @throws ConfigurationException if saving the property file fails.
 */
protected void writeData() throws IOException,
		ConfigurationException {

	final String batchBasename = batchBasename(batch, basename,
			batchDir);

	LOGGER.debug("Generating index " + batchBasename
			+ "; documents: " + documentCount);

	try {
		accumulator.flush();
		final InputBitStream ibs = new InputBitStream(
				accumulatorStream.array);
		final IndexWriter indexWriter = new BitStreamIndexWriter(
				batchBasename,
				indexingType == IndexingType.STANDARD ? documentCount
						: maxDocInBatch + 1, false, flags);
		indexWriter.newInvertedList();
		indexWriter.writeFrequency(documentCount);
		OutputBitStream obs;

		if (indexingType == IndexingType.STANDARD) {
			// Documents arrived in order: pointers are simply 0, 1, 2, ...
			for (int i = 0; i < documentCount; i++) {
				obs = indexWriter.newDocumentRecord();
				indexWriter.writeDocumentPointer(obs, i);
				payload.read(ibs);
				indexWriter.writePayload(obs, payload);
			}
		} else {
			// We sort position by pointed document pointer.
			// NOTE: the comparator repositions ibs and decodes the pointers on
			// the fly, wrapping any IOException into a RuntimeException.
			Sorting.quickSort(position, 0, documentCount,
					new LongComparator() {
						public int compare(
								final long position0,
								final long position1) {
							try {
								ibs.position(position0);
								final int d0 = ibs.readDelta();
								ibs.position(position1);
								return d0 - ibs.readDelta();
							} catch (IOException e) {
								throw new RuntimeException(e);
							}
						}
					});
			for (int i = 0; i < documentCount; i++) {
				obs = indexWriter.newDocumentRecord();
				ibs.position(position[i]);
				indexWriter.writeDocumentPointer(obs, ibs
						.readDelta());
				payload.read(ibs);
				indexWriter.writePayload(obs, payload);
			}

			maxDocInBatch = 0;
		}

		indexWriter.close();

		final Properties properties = indexWriter.properties();
		totPostings += properties.getLong("postings");
		properties.setProperty(Index.PropertyKeys.OCCURRENCES,
				-1);
		properties.setProperty(Index.PropertyKeys.MAXDOCSIZE,
				-1);
		properties.setProperty(Index.PropertyKeys.SIZE,
				indexWriter.writtenBits());
		properties.setProperty(
				Index.PropertyKeys.TERMPROCESSOR,
				NullTermProcessor.class.getName());
		properties.setProperty(Index.PropertyKeys.PAYLOADCLASS,
				payload.getClass().getName());
		if (field != null)
			properties.setProperty(Index.PropertyKeys.FIELD,
					field);
		properties.save(batchBasename
				+ DiskBasedIndex.PROPERTIES_EXTENSION);

		// We *must* generate a fake term file, or index combination won't work.
		final PrintWriter termWriter = new PrintWriter(
				new FileWriter(batchBasename
						+ DiskBasedIndex.TERMS_EXTENSION));
		termWriter.println("#");
		termWriter.close();

		// Reset per-batch state for the next batch.
		cutPoints.add(cutPoints.getInt(cutPoints.size() - 1)
				+ documentCount);
		accumulatorStream.reset();
		accumulator.writtenBits(0);
		documentCount = 0;
		maxDocInBatch = 0;
		batch++;
	} catch (IOException e) {
		LOGGER.fatal("I/O Error on batch " + batch);
		throw e;
	}
}
1604:
1605: /**
1606: * Processes the payload of a given document.
1607: *
1608: * @param documentPointer the document pointer.
1609: * @param content the payload.
1610: */
1611: public void processData(final int documentPointer,
1612: final Object content) throws IOException {
1613: // We write document pointers only for non-standard indices.
1614: if (indexingType != IndexingType.STANDARD) {
1615: position[documentCount] = accumulator.writtenBits();
1616: accumulator.writeDelta(documentPointer);
1617: }
1618: // TODO: devise an out-of-memory-error check mechanism similar to that of ByteArrayPostingList.
1619: payload.set(content);
1620: payload.write(accumulator);
1621:
1622: if (documentPointer > maxDocInBatch)
1623: maxDocInBatch = documentPointer;
1624: documentCount++;
1625: totDocuments++;
1626: }
1627:
1628: /**
1629: * Closes this accumulator, releasing all resources.
1630: */
1631: public void close() throws ConfigurationException, IOException {
1632: if (documentCount > 0)
1633: writeData();
1634:
1635: if (totDocuments == 0) {
1636: // Special case: no document has been indexed. We generate an empty batch.
1637: final String batchBasename = batchBasename(0, basename,
1638: batchDir);
1639: LOGGER.debug("Generating empty index " + batchBasename);
1640:
1641: final IndexWriter indexWriter = new BitStreamIndexWriter(
1642: batchBasename, 0, false, flags);
1643: indexWriter.close();
1644: final Properties properties = indexWriter.properties();
1645: properties.setProperty(Index.PropertyKeys.SIZE, 0);
1646: properties.setProperty(Index.PropertyKeys.OCCURRENCES,
1647: -1);
1648: properties.setProperty(Index.PropertyKeys.MAXCOUNT, -1);
1649: properties.setProperty(Index.PropertyKeys.MAXDOCSIZE,
1650: -1);
1651: properties.setProperty(
1652: Index.PropertyKeys.TERMPROCESSOR,
1653: NullTermProcessor.class.getName());
1654: properties.setProperty(Index.PropertyKeys.PAYLOADCLASS,
1655: payload.getClass().getName());
1656: if (field != null)
1657: properties.setProperty(Index.PropertyKeys.FIELD,
1658: field);
1659: properties.save(batchBasename
1660: + DiskBasedIndex.PROPERTIES_EXTENSION);
1661: new FileOutputStream(batchBasename
1662: + DiskBasedIndex.TERMS_EXTENSION).close();
1663: batch = 1;
1664: }
1665:
1666: accumulator = null;
1667: accumulatorStream = null;
1668: position = null;
1669:
1670: final Properties properties = new Properties();
1671: if (field != null)
1672: properties.setProperty(Index.PropertyKeys.FIELD, field);
1673: properties.setProperty(Index.PropertyKeys.BATCHES, batch);
1674: properties.setProperty(Index.PropertyKeys.DOCUMENTS,
1675: totDocuments);
1676: properties.setProperty(Index.PropertyKeys.POSTINGS,
1677: totPostings);
1678: properties.setProperty(Index.PropertyKeys.OCCURRENCES, -1);
1679: properties.setProperty(Index.PropertyKeys.MAXCOUNT, -1);
1680: properties.setProperty(Index.PropertyKeys.MAXDOCSIZE, -1);
1681: properties.setProperty(Index.PropertyKeys.TERMPROCESSOR,
1682: NullTermProcessor.class.getName());
1683: properties.setProperty(Index.PropertyKeys.PAYLOADCLASS,
1684: payload.getClass().getName());
1685:
1686: // This set of batches can be seen as a documental cluster index.
1687: final Properties clusterProperties = new Properties();
1688: clusterProperties.addAll(properties);
1689: clusterProperties.setProperty(Index.PropertyKeys.TERMS, 1);
1690: clusterProperties.setProperty(
1691: IndexCluster.PropertyKeys.BLOOM, false);
1692: clusterProperties.setProperty(
1693: IndexCluster.PropertyKeys.FLAT, true);
1694:
1695: if (indexingType == IndexingType.STANDARD) {
1696: clusterProperties.setProperty(
1697: Index.PropertyKeys.INDEXCLASS,
1698: DocumentalConcatenatedCluster.class.getName());
1699: BinIO.storeObject(new ContiguousDocumentalStrategy(
1700: cutPoints.toIntArray()), basename
1701: + CLUSTER_STRATEGY_EXTENSION);
1702: } else {
1703: clusterProperties.setProperty(
1704: Index.PropertyKeys.INDEXCLASS,
1705: DocumentalMergedCluster.class.getName());
1706: BinIO.storeObject(new IdentityDocumentalStrategy(batch,
1707: totDocuments), basename
1708: + CLUSTER_STRATEGY_EXTENSION);
1709: }
1710: clusterProperties.setProperty(
1711: IndexCluster.PropertyKeys.STRATEGY, basename
1712: + CLUSTER_STRATEGY_EXTENSION);
1713: for (int i = 0; i < batch; i++)
1714: clusterProperties.addProperty(
1715: IndexCluster.PropertyKeys.LOCALINDEX,
1716: batchBasename(i, basename, batchDir));
1717: clusterProperties.save(basename
1718: + CLUSTER_PROPERTIES_EXTENSION);
1719:
1720: properties.save(basename
1721: + DiskBasedIndex.PROPERTIES_EXTENSION);
1722: }
1723:
1724: }
1725:
1726: public static int[] parseQualifiedSizes(
1727: final String[] qualifiedSizes, final String defaultSize,
1728: final int[] indexedField, final DocumentFactory factory)
1729: throws ParseException {
1730: final int[] size = new int[indexedField.length];
1731: String defaultSpec = defaultSize;
1732: IntArrayList indexedFields = IntArrayList.wrap(indexedField);
1733: for (int i = 0; i < qualifiedSizes.length; i++)
1734: if (qualifiedSizes[i].indexOf(':') == -1)
1735: defaultSpec = qualifiedSizes[i];
1736: for (int i = 0; i < size.length; i++)
1737: size[i] = (int) LongSizeStringParser.parseSize(defaultSpec);
1738: for (int i = 0; i < qualifiedSizes.length; i++) {
1739: final int split = qualifiedSizes[i].indexOf(':');
1740: if (split >= 0) {
1741: final String fieldName = qualifiedSizes[i].substring(0,
1742: split);
1743: final int field = factory.fieldIndex(fieldName);
1744: if (field < 0)
1745: throw new IllegalArgumentException("Field "
1746: + fieldName + " is not part of factory "
1747: + factory.getClass().getName());
1748: if (!indexedFields.contains(field))
1749: throw new IllegalArgumentException("Field "
1750: + factory.fieldName(field)
1751: + " is not being indexed");
1752: size[indexedFields.indexOf(field)] = (int) LongSizeStringParser
1753: .parseSize(qualifiedSizes[i]
1754: .substring(split + 1));
1755: }
1756: }
1757: return size;
1758: }
1759:
1760: public static VirtualDocumentResolver[] parseVirtualDocumentResolver(
1761: final String[] virtualDocumentSpec,
1762: final int[] indexedField, final DocumentFactory factory) {
1763: final VirtualDocumentResolver[] virtualDocumentResolver = new VirtualDocumentResolver[indexedField.length];
1764: VirtualDocumentResolver defaultResolver = null;
1765: IntArrayList indexedFields = IntArrayList.wrap(indexedField);
1766: for (int i = 0; i < virtualDocumentSpec.length; i++)
1767: if (virtualDocumentSpec[i].indexOf(':') == -1)
1768: try {
1769: defaultResolver = (VirtualDocumentResolver) BinIO
1770: .loadObject(virtualDocumentSpec[i]);
1771: } catch (IOException e) {
1772: throw new RuntimeException(
1773: "An I/O error occurred while loading "
1774: + virtualDocumentSpec[i], e);
1775: } catch (ClassNotFoundException e) {
1776: throw new RuntimeException("Cannot load "
1777: + virtualDocumentSpec[i], e);
1778: }
1779: for (int i = 0; i < virtualDocumentResolver.length; i++)
1780: virtualDocumentResolver[i] = defaultResolver;
1781: for (int i = 0; i < virtualDocumentSpec.length; i++) {
1782: final int split = virtualDocumentSpec[i].indexOf(':');
1783: if (split >= 0) {
1784: final String fieldName = virtualDocumentSpec[i]
1785: .substring(0, split);
1786: final int field = factory.fieldIndex(fieldName);
1787: if (field < 0)
1788: throw new IllegalArgumentException("Field "
1789: + fieldName + " is not part of factory "
1790: + factory.getClass().getName());
1791: if (!indexedFields.contains(field))
1792: throw new IllegalArgumentException("Field "
1793: + factory.fieldName(field)
1794: + " is not being indexed");
1795: if (factory.fieldType(field) != DocumentFactory.FieldType.VIRTUAL)
1796: throw new IllegalArgumentException("Field "
1797: + factory.fieldName(field)
1798: + " is not virtual");
1799: try {
1800: virtualDocumentResolver[indexedFields
1801: .indexOf(field)] = (VirtualDocumentResolver) BinIO
1802: .loadObject(virtualDocumentSpec[i]
1803: .substring(split + 1));
1804: } catch (IOException e) {
1805: throw new RuntimeException(
1806: "An I/O error occurred while loading "
1807: + virtualDocumentSpec[i]
1808: .substring(split + 1), e);
1809: } catch (ClassNotFoundException e) {
1810: throw new RuntimeException("Cannot load "
1811: + virtualDocumentSpec[i]
1812: .substring(split + 1), e);
1813: }
1814: }
1815: }
1816: return virtualDocumentResolver;
1817: }
1818:
1819: public static int[] parseVirtualDocumentGap(
1820: final String[] virtualDocumentGapSpec,
1821: final int[] indexedField, final DocumentFactory factory) {
1822: final int[] virtualDocumentGap = new int[indexedField.length];
1823: int defaultGap = DEFAULT_VIRTUAL_DOCUMENT_GAP;
1824: IntArrayList indexedFields = IntArrayList.wrap(indexedField);
1825: for (int i = 0; i < virtualDocumentGapSpec.length; i++)
1826: if (virtualDocumentGapSpec[i].indexOf(':') == -1)
1827: try {
1828: defaultGap = Integer
1829: .parseInt(virtualDocumentGapSpec[i]);
1830: if (defaultGap < 0)
1831: throw new NumberFormatException(
1832: "Gap can't be negative");
1833: } catch (NumberFormatException e) {
1834: throw new RuntimeException(
1835: "Cannot parse gap correctly "
1836: + virtualDocumentGapSpec[i], e);
1837: }
1838: for (int i = 0; i < virtualDocumentGap.length; i++)
1839: virtualDocumentGap[i] = defaultGap;
1840: for (int i = 0; i < virtualDocumentGapSpec.length; i++) {
1841: final int split = virtualDocumentGapSpec[i].indexOf(':');
1842: if (split >= 0) {
1843: final String fieldName = virtualDocumentGapSpec[i]
1844: .substring(0, split);
1845: final int field = factory.fieldIndex(fieldName);
1846: if (field < 0)
1847: throw new IllegalArgumentException("Field "
1848: + fieldName + " is not part of factory "
1849: + factory.getClass().getName());
1850: if (!indexedFields.contains(field))
1851: throw new IllegalArgumentException("Field "
1852: + factory.fieldName(field)
1853: + " is not being indexed");
1854: if (factory.fieldType(field) != DocumentFactory.FieldType.VIRTUAL)
1855: throw new IllegalArgumentException("Field "
1856: + factory.fieldName(field)
1857: + " is not virtual");
1858: try {
1859: virtualDocumentGap[indexedFields.indexOf(field)] = Integer
1860: .parseInt(virtualDocumentGapSpec[i]
1861: .substring(split + 1));
1862: if (virtualDocumentGap[indexedFields.indexOf(field)] < 0)
1863: throw new NumberFormatException(
1864: "Gap can't be negative");
1865: } catch (NumberFormatException e) {
1866: throw new RuntimeException(
1867: "Cannot parse gap correctly "
1868: + virtualDocumentGapSpec[i], e);
1869: }
1870: }
1871: }
1872: return virtualDocumentGap;
1873: }
1874:
1875: public static int[] parseFieldNames(
1876: final String[] indexedFieldName,
1877: final DocumentFactory factory, final boolean allSupported) {
1878: final IntArrayList indexedFields = new IntArrayList();
1879:
1880: if (indexedFieldName.length == 0) {
1881: for (int i = 0; i < factory.numberOfFields(); i++) {
1882: DocumentFactory.FieldType type = factory.fieldType(i);
1883: if (allSupported)
1884: indexedFields.add(i);
1885: else if (type != DocumentFactory.FieldType.VIRTUAL)
1886: indexedFields.add(i);
1887: else
1888: LOGGER
1889: .warn("Virtual field "
1890: + factory.fieldName(i)
1891: + " is not being indexed; use -a or explicitly add field among the indexed ones");
1892: }
1893: } else {
1894: for (int i = 0; i < indexedFieldName.length; i++) {
1895: final int field = factory
1896: .fieldIndex(indexedFieldName[i]);
1897: if (field < 0)
1898: throw new IllegalArgumentException("Field "
1899: + indexedFieldName[i]
1900: + " is not part of factory "
1901: + factory.getClass().getName());
1902: indexedFields.add(field);
1903: }
1904: }
1905:
1906: int[] indexedField = indexedFields.toIntArray();
1907: Arrays.sort(indexedField);
1908: return indexedField;
1909: }
1910:
1911: /**
1912: * Returns the document sequence to be indexed.
1913: *
1914: * @param sequenceName the name of a serialised document sequence, or <code>null</code> for
1915: * standard input.
1916: * @param factoryClass the class of the {@link DocumentFactory} that should be passed to the
1917: * document sequence.
1918: * @param property an array of property strings to be used in the factory initialisation.
1919: * @param delimiter a delimiter in case we want to use standard input.
1920: * @param logger a logger.
1921: * @return the document sequence to be indexed.
1922: */
1923: public static DocumentSequence getSequence(
1924: final String sequenceName, final Class<?> factoryClass,
1925: final String[] property, final int delimiter, Logger logger)
1926: throws IllegalAccessException, InvocationTargetException,
1927: NoSuchMethodException, IOException, ClassNotFoundException,
1928: InstantiationException {
1929: if (sequenceName != null) {
1930: return (DocumentSequence) BinIO.loadObject(sequenceName);
1931: } else {
1932: logger
1933: .debug("Documents will be separated by the Unicode character "
1934: + delimiter);
1935: DocumentFactory factory = PropertyBasedDocumentFactory
1936: .getInstance(factoryClass, property);
1937: return new InputStreamDocumentSequence(System.in,
1938: delimiter, factory);
1939: }
1940: }
1941:
/**
 * Command-line entry point: parses the options below, builds the document
 * sequence and field configuration, and delegates the actual scan to
 * {@code run(...)}.
 */
public static void main(final String[] arg) throws JSAPException,
InvocationTargetException, NoSuchMethodException,
ConfigurationException, ClassNotFoundException,
IOException, IllegalAccessException, InstantiationException {

// Declarative command-line specification (JSAP).
SimpleJSAP jsap = new SimpleJSAP(
Scan.class.getName(),
"Builds a set of batches from a sequence of documents.",
new Parameter[] {
new FlaggedOption("sequence",
JSAP.STRING_PARSER, JSAP.NO_DEFAULT,
JSAP.NOT_REQUIRED, 'S', "sequence",
"A serialised document sequence that will be used instead of stdin."),
new FlaggedOption(
"factory",
MG4JClassParser.getParser(),
IdentityDocumentFactory.class.getName(),
JSAP.NOT_REQUIRED, 'f', "factory",
"A document factory with a standard constructor."),
new FlaggedOption("property",
JSAP.STRING_PARSER, JSAP.NO_DEFAULT,
JSAP.NOT_REQUIRED, 'p', "property",
"A 'key=value' specification, or the name of a property file")
.setAllowMultipleDeclarations(true),
new FlaggedOption("termProcessor",
JSAP.STRING_PARSER,
NullTermProcessor.class.getName(),
JSAP.NOT_REQUIRED, 't',
"term-processor",
"Sets the term processor to the given class."),
new Switch("downcase", JSAP.NO_SHORTFLAG,
"downcase",
"A shortcut for setting the term processor to the downcasing processor."),
new FlaggedOption(
"indexedField",
JSAP.STRING_PARSER,
JSAP.NO_DEFAULT,
JSAP.NOT_REQUIRED,
'I',
"indexed-field",
"The field(s) of the document factory that will be indexed. (default: all non-virtual fields)")
.setAllowMultipleDeclarations(true),
new Switch(
"allFields",
'a',
"all-fields",
"Index also all virtual fields; has no effect if indexedField has been used at least once."),
new FlaggedOption("zipCollection",
JSAP.STRING_PARSER, JSAP.NO_DEFAULT,
JSAP.NOT_REQUIRED, 'z', "zip",
"Creates a support ZipDocumentCollection with given basename."),
new FlaggedOption(
"batchSize",
JSAP.INTSIZE_PARSER,
Integer
.toString(Scan.DEFAULT_BATCH_SIZE),
JSAP.NOT_REQUIRED, 's', "batch-size",
"The size of a batch, in documents. (default: "
+ DEFAULT_BATCH_SIZE + ")"),
new FlaggedOption(
"virtualDocumentResolver",
JSAP.STRING_PARSER,
JSAP.NO_DEFAULT,
JSAP.NOT_REQUIRED,
'v',
"virtual-document-resolver",
"The virtual document resolver. It can be specified several times in the form [<field>:]<filename>. If the field is omitted, it sets the document resolver for all virtual fields.")
.setAllowMultipleDeclarations(true),
new FlaggedOption(
"virtualDocumentGap",
JSAP.STRING_PARSER,
JSAP.NO_DEFAULT,
JSAP.NOT_REQUIRED,
'g',
"virtual-document-gap",
"The virtual document gap. It can be specified several times in the form [<field>:]<gap>. If the field is omitted, it sets the document gap for all virtual fields; the default gap is "
+ DEFAULT_VIRTUAL_DOCUMENT_GAP)
.setAllowMultipleDeclarations(true),
new FlaggedOption(
"bufferSize",
JSAP.INTSIZE_PARSER,
Util
.formatBinarySize(DEFAULT_BUFFER_SIZE),
JSAP.NOT_REQUIRED, 'b', "buffer-size",
"The size of an I/O buffer."),
new FlaggedOption("delimiter",
JSAP.INTEGER_PARSER, Integer
.toString(DEFAULT_DELIMITER),
JSAP.NOT_REQUIRED, 'd', "delimiter",
"The document delimiter."),
new FlaggedOption("renumber",
JSAP.STRING_PARSER, JSAP.NO_DEFAULT,
JSAP.NOT_REQUIRED, 'r', "renumber",
"The filename of a document renumbering."),
new Switch("keepUnsorted", 'u',
"keep-unsorted",
"Keep the unsorted term file."),
new FlaggedOption(
"logInterval",
JSAP.LONG_PARSER,
Long
.toString(ProgressLogger.DEFAULT_LOG_INTERVAL),
JSAP.NOT_REQUIRED, 'l', "log-interval",
"The minimum time interval between activity logs in milliseconds."),
new FlaggedOption("tempDir",
JSAP.STRING_PARSER, JSAP.NO_DEFAULT,
JSAP.NOT_REQUIRED, 'T', "temp-dir",
"A directory for all temporary files (e.g., batches)."),
new UnflaggedOption("basename",
JSAP.STRING_PARSER, JSAP.REQUIRED,
"The basename of the resulting index.") });

// Parse the command line; SimpleJSAP prints help/error messages itself.
JSAPResult jsapResult = jsap.parse(arg);
if (jsap.messagePrinted())
return;

// Build the document sequence (serialised sequence or stdin).
DocumentSequence documentSequence = getSequence(jsapResult
.getString("sequence"), jsapResult.getClass("factory"),
jsapResult.getStringArray("property"), jsapResult
.getInt("delimiter"), LOGGER);

// Resolve the fields to index and their virtual-field configuration.
final DocumentFactory factory = documentSequence.factory();
final int[] indexedField = parseFieldNames(jsapResult
.getStringArray("indexedField"), factory, jsapResult
.getBoolean("allFields"));
final int batchSize = jsapResult.getInt("batchSize");
final VirtualDocumentResolver[] virtualDocumentResolver = parseVirtualDocumentResolver(
jsapResult.getStringArray("virtualDocumentResolver"),
indexedField, factory);
final int[] virtualDocumentGap = parseVirtualDocumentGap(
jsapResult.getStringArray("virtualDocumentGap"),
indexedField, factory);

// Delegate the actual scanning/batch generation.
run(
jsapResult.getString("basename"),
documentSequence,
jsapResult.getBoolean("downcase") ? DowncaseTermProcessor
.getInstance()
: ObjectParser.fromSpec(jsapResult
.getString("termProcessor"),
TermProcessor.class,
MG4JClassParser.PACKAGE,
new String[] { "getInstance" }),
jsapResult.getString("zipCollection"), jsapResult
.getInt("bufferSize"), batchSize, indexedField,
virtualDocumentResolver, virtualDocumentGap, jsapResult
.getString("renumber"), jsapResult
.getLong("logInterval"), jsapResult
.getString("tempDir"));
}
2092: }
|