001: package it.unimi.dsi.mg4j.index;
002:
003: /*
004: * MG4J: Managing Gigabytes for Java
005: *
006: * Copyright (C) 2003-2007 Paolo Boldi and Sebastiano Vigna
007: *
008: * This library is free software; you can redistribute it and/or modify it
009: * under the terms of the GNU Lesser General Public License as published by the Free
010: * Software Foundation; either version 2.1 of the License, or (at your option)
011: * any later version.
012: *
013: * This library is distributed in the hope that it will be useful, but
014: * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
015: * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
016: * for more details.
017: *
018: * You should have received a copy of the GNU Lesser General Public License
019: * along with this program; if not, write to the Free Software
020: * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
021: *
022: */
023:
024: import it.unimi.dsi.bits.Fast;
025: import it.unimi.dsi.fastutil.ints.IntArrays;
026: import it.unimi.dsi.io.OutputBitStream;
027: import it.unimi.dsi.lang.MutableString;
028: import it.unimi.dsi.mg4j.index.CompressionFlags.Coding;
029: import it.unimi.dsi.mg4j.index.CompressionFlags.Component;
030: import it.unimi.dsi.mg4j.index.payload.Payload;
031: import it.unimi.dsi.mg4j.io.InterpolativeCoding;
032: import it.unimi.dsi.util.Properties;
033: import jal.ints.Sorting;
034:
035: import java.io.FileOutputStream;
036: import java.io.IOException;
037: import java.util.Map;
038:
039: /** Writes a bitstream-based interleaved index.
040: *
041: * <H2>Offsets bit stream</H2>
042: *
043: * <P>An inverted index may have an associated {@link OutputBitStream} of
044: * offsets: this file contains <code>T+1</code> integers, where <code>T</code>
045: * is the number of inverted lists (i.e., the number of terms), and the
046: * <code>i</code>-th entry is a suitable coding of the position in bits where
047: * the <code>i</code>-th inverted list starts (the last entry is actually the
048: * length, in bytes, of the inverted index file itself). The coding used for
049: * the offset stream is a γ code of the difference between the current
050: * position and the last position.
051: *
052: * @author Paolo Boldi
053: * @author Sebastiano Vigna
054: * @since 0.6
055: */
056:
057: public class BitStreamIndexWriter extends AbstractBitStreamIndexWriter {
058: private static final boolean ASSERTS = false;
059:
060: /** This value of {@link #state} means that we should call {@link #newInvertedList()}.*/
061: protected static final int BEFORE_INVERTED_LIST = 0;
062:
063: /** This value of {@link #state} means that we are positioned at the start of an inverted list,
064: * and we should call {@link #writeFrequency(int)}.*/
065: protected static final int BEFORE_FREQUENCY = 1;
066:
067: /** This value of {@link #state} means that we are ready to call {@link #newDocumentRecord()}. */
068: protected static final int BEFORE_DOCUMENT_RECORD = 2;
069:
070: /** This value of {@link #state} means that we just started a new document record, and we
071: * should call {@link #writeDocumentPointer(OutputBitStream, int)}. */
072: protected static final int BEFORE_POINTER = 3;
073:
074: /** This value of {@link #state} can be assumed only in indices that contain payloads; it
075: * means that we are positioned just before the payload for the current document record. */
076: protected static final int BEFORE_PAYLOAD = 4;
077:
078: /** This value of {@link #state} can be assumed only in indices that contain counts; it
079: * means that we are positioned just before the count for the current document record. */
080: protected static final int BEFORE_COUNT = 5;
081:
082: /** This value of {@link #state} can be assumed only in indices that contain document positions;
083: * it means that we are positioned just before the position list of the current document record. */
084: protected static final int BEFORE_POSITIONS = 6;
085:
086: /** This is the first unused state. Subclasses may start from this value to define new states. */
087: protected static final int FIRST_UNUSED_STATE = 7;
088:
089: /** The underlying {@link OutputBitStream}. */
090: protected OutputBitStream obs;
091: /** The offset {@link OutputBitStream}. */
092: private OutputBitStream offset;
093: /** The current state of the writer. */
094: protected int state;
095: /** The number of document records that the current inverted list will contain. */
096: protected int frequency;
097: /** The number of document records already written for the current inverted list. */
098: protected int writtenDocuments;
099: /** The current document pointer. */
100: protected int currentDocument;
101: /** The last document pointer in the current list. */
102: protected int lastDocument;
103: /** The position (in bytes) where the last inverted list started. */
104: private long lastInvertedListPos;
105: /** The parameter <code>b</code> for Golomb coding of pointers. */
106: protected int b;
107: /** The parameter <code>log2b</code> for Golomb coding of pointers; it is the most significant bit of {@link #b}. */
108: protected int log2b;
109: /** The maximum number of positions in a document record so far. */
110: public int maxCount;
111: /** A temporary array for skewed Golomb coding. */
112: private int[] sortedDeltas = IntArrays.EMPTY_ARRAY;
113:
114: /** Creates a new index writer, with the specified basename. The index will be written on a file (stemmed with <samp>.index</samp>).
115: * If <code>writeOffsets</code>, also an offset file will be produced (stemmed with <samp>.offsets</samp>).
116: * When {@link #close()} will be called, the property file will also be produced (stemmed with <samp>.properties</samp>),
117: * or enriched if it already exists.
118: *
119: * @param basename the basename.
120: * @param numberOfDocuments the number of documents in the collection to be indexed.
121: * @param writeOffsets if <code>true</code>, the offset file will also be produced.
122: * @param flags a flag map setting the coding techniques to be used (see {@link CompressionFlags}).
123: */
124: public BitStreamIndexWriter(final CharSequence basename,
125: final int numberOfDocuments, final boolean writeOffsets,
126: final Map<Component, Coding> flags) throws IOException {
127: this (new OutputBitStream(new FileOutputStream(basename
128: + DiskBasedIndex.INDEX_EXTENSION)),
129: writeOffsets ? new OutputBitStream(
130: new FileOutputStream(basename
131: + DiskBasedIndex.OFFSETS_EXTENSION))
132: : null, numberOfDocuments, flags);
133: }
134:
135: /** Creates a new index writer with payloads using the specified underlying {@link OutputBitStream}.
136: *
137: * @param obs the underlying output bit stream.
138: * @param offset the offset bit stream, or <code>null</code> if offsets should not be written.
139: * @param numberOfDocuments the number of documents in the collection to be indexed.
140: * @param flags a flag map setting the coding techniques to be used (see {@link CompressionFlags}).
141: */
142: public BitStreamIndexWriter(final OutputBitStream obs,
143: final OutputBitStream offset, final int numberOfDocuments,
144: final Map<Component, Coding> flags) {
145: super (numberOfDocuments, flags);
146: this .obs = obs;
147: this .offset = offset;
148: this .frequency = -1;
149: this .currentTerm = -1;
150: this .maxCount = 0;
151:
152: if (!hasCounts && hasPositions)
153: throw new IllegalArgumentException(
154: "Index would have positions but no counts (this can't happen)");
155: }
156:
157: /** Creates a new index writer, with the specified underlying {@link OutputBitStream},
158: * without an associated offset bit stream.
159: *
160: * @param obs the underlying output bit stream.
161: * @param numberOfDocuments the number of documents in the collection to be indexed.
162: * @param flags a flag map setting the coding techniques to be used (see {@link CompressionFlags}).
163: */
164: public BitStreamIndexWriter(final OutputBitStream obs,
165: final int numberOfDocuments,
166: final Map<Component, Coding> flags) {
167: this (obs, null, numberOfDocuments, flags);
168: }
169:
170: public long newInvertedList() throws IOException {
171: if (frequency >= 0 && frequency != writtenDocuments)
172: throw new IllegalStateException(
173: "The number of document records ("
174: + this .writtenDocuments
175: + ") does not match the frequency ("
176: + this .frequency + ")");
177: if (state != BEFORE_INVERTED_LIST
178: && state != BEFORE_DOCUMENT_RECORD)
179: throw new IllegalStateException(
180: "Trying to start new inverted list in state "
181: + state);
182:
183: // The position (in bits) where the new inverted list starts
184: long pos = obs.writtenBits();
185: // Reset variables
186: writtenDocuments = 0;
187: currentTerm++;
188: currentDocument = -1;
189:
190: // If needed, write the offset
191: if (offset != null)
192: offset.writeLongGamma(pos - lastInvertedListPos);
193: lastInvertedListPos = pos;
194: state = BEFORE_FREQUENCY;
195: return pos;
196: }
197:
198: public int writeFrequency(final int frequency) throws IOException {
199: if (state != BEFORE_FREQUENCY)
200: throw new IllegalStateException(
201: "Trying to write frequency in state " + state);
202:
203: int bitCount;
204: // Write the frequency
205: switch (frequencyCoding) {
206: case SHIFTED_GAMMA:
207: bitCount = obs.writeShiftedGamma(frequency - 1); // frequency cannot be 0
208: break;
209: case GAMMA:
210: bitCount = obs.writeGamma(frequency - 1); // frequency cannot be 0
211: break;
212: case DELTA:
213: bitCount = obs.writeDelta(frequency - 1); // frequency cannot be 0
214: break;
215: default:
216: throw new IllegalStateException(
217: "The required frequency coding (" + frequencyCoding
218: + ") is not supported.");
219: }
220:
221: this .frequency = frequency;
222:
223: // We compute the modulus used for pointer Golomb coding
224: if (pointerCoding == Coding.GOLOMB) {
225: b = BitStreamIndex.golombModulus(frequency,
226: numberOfDocuments);
227: log2b = Fast.mostSignificantBit(b);
228: }
229:
230: state = BEFORE_DOCUMENT_RECORD;
231: bitsForFrequencies += bitCount;
232: return bitCount;
233: }
234:
235: public OutputBitStream newDocumentRecord() throws IOException {
236: if (frequency == writtenDocuments)
237: throw new IllegalStateException(
238: "Document record overflow (written "
239: + this .frequency + " already)");
240: if (state != BEFORE_DOCUMENT_RECORD)
241: throw new IllegalStateException(
242: "Trying to start new document record in state "
243: + state);
244:
245: writtenDocuments++;
246: numberOfPostings++;
247: lastDocument = currentDocument;
248: state = BEFORE_POINTER;
249: return obs;
250: }
251:
252: public int writeDocumentPointer(final OutputBitStream out,
253: final int pointer) throws IOException {
254: if (state != BEFORE_POINTER)
255: throw new IllegalStateException(
256: "Trying to write pointer in state " + state);
257:
258: currentDocument = pointer;
259: int bitCount = 0;
260:
261: if (frequency != numberOfDocuments) { // We do not write pointers for everywhere occurring documents.
262: switch (pointerCoding) {
263: case SHIFTED_GAMMA:
264: bitCount = out.writeShiftedGamma(pointer - lastDocument
265: - 1);
266: break;
267: case GAMMA:
268: bitCount = out.writeGamma(pointer - lastDocument - 1);
269: break;
270: case DELTA:
271: bitCount = out.writeDelta(pointer - lastDocument - 1);
272: break;
273: case GOLOMB:
274: bitCount = out.writeGolomb(pointer - lastDocument - 1,
275: b, log2b);
276: break;
277: default:
278: throw new IllegalStateException(
279: "The required pointer coding (" + pointerCoding
280: + ") is not supported.");
281: }
282: } else if (pointer - lastDocument != 1)
283: throw new IllegalStateException(
284: "Term "
285: + currentTerm
286: + " has frequency equal to the number of documents, but pointers are not consecutive integers");
287:
288: state = hasPayloads ? BEFORE_PAYLOAD : hasCounts ? BEFORE_COUNT
289: : BEFORE_DOCUMENT_RECORD;
290: bitsForPointers += bitCount;
291: return bitCount;
292: }
293:
294: public int writePayload(final OutputBitStream out,
295: final Payload payload) throws IOException {
296: if (frequency < 0)
297: throw new IllegalStateException(
298: "Trying to write payload without calling newInvertedList");
299: if (state != BEFORE_PAYLOAD)
300: throw new IllegalStateException(
301: "Trying to write payload in state " + state);
302: final int count = payload.write(out);
303: bitsForPayloads += count;
304: state = hasCounts ? BEFORE_COUNT : BEFORE_DOCUMENT_RECORD;
305: return count;
306: }
307:
308: public void close() throws IOException {
309: if (state != BEFORE_DOCUMENT_RECORD
310: && state != BEFORE_INVERTED_LIST)
311: throw new IllegalStateException(
312: "Trying to close index in state " + state);
313: if (frequency >= 0 && frequency != writtenDocuments)
314: throw new IllegalStateException(
315: "The number of document records ("
316: + this .writtenDocuments
317: + ") does not match the frequency ("
318: + this .frequency + ")");
319:
320: if (writtenBits() != obs.writtenBits())
321: throw new IllegalStateException(
322: "Written bits count mismatch: we say "
323: + writtenBits() + ", the stream says "
324: + obs.writtenBits());
325:
326: if (offset != null) {
327: offset.writeLongGamma(obs.writtenBits()
328: - lastInvertedListPos);
329: offset.close();
330: }
331:
332: obs.close();
333: }
334:
335: public int writePositionCount(final OutputBitStream out,
336: final int count) throws IOException {
337: if (frequency < 0)
338: throw new IllegalStateException(
339: "Trying to write count without calling newInvertedList");
340: if (state != BEFORE_COUNT)
341: throw new IllegalStateException(
342: "Trying to write count in state " + state);
343: final int bitCount;
344:
345: numberOfOccurrences += count;
346: switch (countCoding) {
347: case SHIFTED_GAMMA:
348: bitCount = out.writeShiftedGamma(count - 1);
349: break;
350: case GAMMA:
351: bitCount = out.writeGamma(count - 1);
352: break;
353: case UNARY:
354: bitCount = out.writeUnary(count - 1);
355: break;
356: case DELTA:
357: bitCount = out.writeDelta(count - 1);
358: break;
359: default:
360: throw new IllegalStateException(
361: "The required count coding (" + countCoding
362: + ") is not supported.");
363: }
364:
365: state = hasPositions ? BEFORE_POSITIONS
366: : BEFORE_DOCUMENT_RECORD;
367: bitsForCounts += bitCount;
368: return bitCount;
369: }
370:
371: public int writeDocumentPositions(final OutputBitStream out,
372: final int[] occ, final int offset, final int len,
373: final int docSize) throws IOException {
374: if (frequency < 0)
375: throw new IllegalStateException(
376: "Trying to write occurrences without calling newInvertedList");
377: if (state != BEFORE_POSITIONS)
378: throw new IllegalStateException(
379: "Trying to write positions in state " + state);
380:
381: if (ASSERTS && docSize > 0)
382: for (int i = 0; i < len; i++)
383: assert occ[offset + i] < docSize : "Position "
384: + occ[offset + i] + " for document "
385: + currentDocument + " is too large; size is "
386: + docSize;
387:
388: int i;
389: int prev = -1;
390: int bitCount = 0;
391: final int end = offset + len;
392:
393: switch (positionCoding) {
394: case GAMMA:
395: for (i = offset; i < end; i++) {
396: bitCount += out.writeGamma(occ[i] - prev - 1);
397: prev = occ[i];
398: }
399: break;
400: case DELTA:
401: for (i = offset; i < end; i++) {
402: bitCount += out.writeDelta(occ[i] - prev - 1);
403: prev = occ[i];
404: }
405: break;
406: case SHIFTED_GAMMA:
407: for (i = offset; i < end; i++) {
408: bitCount += out.writeShiftedGamma(occ[i] - prev - 1);
409: prev = occ[i];
410: }
411: break;
412: case GOLOMB:
413: if (len < 3) {
414: for (i = 0; i < len; i++)
415: bitCount += out.writeMinimalBinary(occ[i], docSize);
416: break;
417: }
418:
419: // We compute b and log2b for positions
420: final int positionB = BitStreamIndex.golombModulus(len,
421: docSize);
422: final int positionLog2b = Fast
423: .mostSignificantBit(positionB);
424:
425: for (i = offset; i < end; i++) {
426: bitCount += out.writeGolomb(occ[i] - prev - 1,
427: positionB, positionLog2b);
428: prev = occ[i];
429: }
430: break;
431: case SKEWED_GOLOMB:
432: if (len < 3) {
433: for (i = 0; i < len; i++)
434: bitCount += out.writeMinimalBinary(occ[i], docSize);
435: break;
436: }
437:
438: if (sortedDeltas.length < len)
439: sortedDeltas = new int[len];
440: System.arraycopy(occ, offset, sortedDeltas, 0, len);
441: i = len - 1;
442: while (i-- != 0)
443: sortedDeltas[i + 1] -= sortedDeltas[i] + 1;
444: Sorting.nth_element(sortedDeltas, 0, len / 2, len);
445: final int sb = sortedDeltas[len / 2] + 1; // The median of deltas
446:
447: bitCount = out.writeMinimalBinary(sb - 1, docSize);
448:
449: for (i = offset; i < end; i++) {
450: bitCount += out
451: .writeSkewedGolomb(occ[i] - prev - 1, sb);
452: prev = occ[i];
453: }
454: break;
455: case INTERPOLATIVE:
456: bitCount = InterpolativeCoding.write(out, occ, 0, len, 0,
457: docSize - 1);
458: break;
459: default:
460: throw new IllegalStateException(
461: "The required position coding (" + positionCoding
462: + ") is not supported.");
463: }
464:
465: state = BEFORE_DOCUMENT_RECORD;
466: bitsForPositions += bitCount;
467: if (len > maxCount)
468: maxCount = len;
469: return bitCount;
470: }
471:
472: public long writtenBits() {
473: return bitsForFrequencies + bitsForPointers + bitsForPayloads
474: + bitsForCounts + bitsForPositions;
475: }
476:
477: public Properties properties() {
478: Properties result = new Properties();
479: result.setProperty(Index.PropertyKeys.DOCUMENTS,
480: numberOfDocuments);
481: result.setProperty(Index.PropertyKeys.TERMS, currentTerm + 1);
482: result.setProperty(Index.PropertyKeys.POSTINGS,
483: numberOfPostings);
484: result.setProperty(Index.PropertyKeys.MAXCOUNT, maxCount);
485: result.setProperty(Index.PropertyKeys.INDEXCLASS,
486: FileIndex.class.getName());
487: // We save all flags, except for the PAYLOAD component, which is just used internally.
488: for (Map.Entry<Component, Coding> e : flags.entrySet())
489: if (e.getKey() != Component.PAYLOADS)
490: result.addProperty(Index.PropertyKeys.CODING,
491: new MutableString().append(e.getKey()).append(
492: ':').append(e.getValue()));
493: return result;
494: }
495: }
|