001: package it.unimi.dsi.mg4j.index;
002:
003: /*
004: * MG4J: Managing Gigabytes for Java
005: *
006: * Copyright (C) 2003-2007 Paolo Boldi and Sebastiano Vigna
007: *
008: * This library is free software; you can redistribute it and/or modify it
009: * under the terms of the GNU Lesser General Public License as published by the Free
010: * Software Foundation; either version 2.1 of the License, or (at your option)
011: * any later version.
012: *
013: * This library is distributed in the hope that it will be useful, but
014: * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
015: * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
016: * for more details.
017: *
018: * You should have received a copy of the GNU Lesser General Public License
019: * along with this program; if not, write to the Free Software
020: * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
021: *
022: */
023:
024: import it.unimi.dsi.bits.Fast;
025: import it.unimi.dsi.fastutil.ints.Int2IntRBTreeMap;
026: import it.unimi.dsi.fastutil.io.FastBufferedInputStream;
027: import it.unimi.dsi.mg4j.index.CompressionFlags.Coding;
028: import it.unimi.dsi.mg4j.index.CompressionFlags.Component;
029: import it.unimi.dsi.fastutil.io.FastByteArrayOutputStream;
030: import it.unimi.dsi.io.NullOutputStream;
031: import it.unimi.dsi.io.OutputBitStream;
032: import it.unimi.dsi.Util;
033: import it.unimi.dsi.util.Properties;
034:
035: import java.io.File;
036: import java.io.FileInputStream;
037: import java.io.FileOutputStream;
038: import java.io.IOException;
039: import java.io.PrintStream;
040: import java.io.PrintWriter;
041: import java.util.Map;
042:
043: /** Provides facilities to write skip inverted indices,
044: * that is, inverted indices with an additional skip structure. A skip inverted index
045: * allows one to skip ahead when reading inverted lists. More specifically,
046: * when reading the inverted list relative to a certain term, one may want to
047: * decide to skip all document records that concern documents with pointer
048: * less than a given integer. In a normal inverted index this is impossible:
049: * one would have to read all document records sequentially.
050: *
051: * <p>The skipping structure used by this class is new: details can be found
052: * <a href="http://vigna.dsi.unimi.it/papers.php#BoVCPESLQIIL">here</a>.
053: *
054: * @author Paolo Boldi
055: * @author Sebastiano Vigna
056: * @since 0.6
057: */
058: public class SkipBitStreamIndexWriter extends BitStreamIndexWriter {
059: /** The size of the buffer for the temporary file used to build an inverted list. Inverted lists
060: * shorter than this number of bytes will be directly rebuilt from the buffer, and never flushed to disk. */
061: public final static int DEFAULT_TEMP_BUFFER_SIZE = 64 * 1024 * 1024;
062:
063: private static final boolean ASSERTS = false;
064: private static final boolean DEBUG = false;
065: private final static boolean STATS = false;
066:
067: /** Maximum number of trials when optimising the entry bit length. */
068: private final static int MAX_TRY = 32;
069:
070: /** The parameter <code>h</code> (the maximum height of a skip tower). */
071: private final int h;
072:
073: /** The parameter <code>q</code> (2<var><sup>h</sup>q</var> documents record are kept in the cache); necessarily a power of two. */
074: private final int q;
075:
076: /** We have <var>w</var>=2<sup><var>h</var></sup><var>q</var>. */
077: private final int w;
078:
079: /** The number of document records written in the cache containing the current block. */
080: private int cache;
081:
082: /** The <var>k</var>-th entry of this array contains the document pointer of the <var>k</var>-th
083: * skip document record within the current block. For sake of simplicity, <code>pointer[cache]</code>
084: * contains the first document pointer within the next block. */
085: private final int[] skipPointer;
086:
087: /** The {@link OutputBitStream}s where cached document pointers are written. */
088: private final OutputBitStream[] cachePointer;
089:
090: /** The {@link FastByteArrayOutputStream}s underlying <code>cachePointer</code> . */
091: private final FastByteArrayOutputStream[] cachePointerByte;
092:
093: /** The {@link OutputBitStream}s where cached skip towers are written. Indices are skip
094: * indices. */
095: private final OutputBitStream[] cacheSkip;
096:
097: /** An array whose entries (as many as those of {@link #cacheSkip}) are all {@link #bitCount}. */
098: private final OutputBitStream[] cacheSkipBitCount;
099:
100: /** The {@link FastByteArrayOutputStream}s underlying <code>cacheSkip</code> . Indices are skip
101: * indices. */
102: private final FastByteArrayOutputStream[] cacheSkipByte;
103:
104: /** The {@link OutputBitStream} where cached document data are written. */
105: private final CachingOutputBitStream cacheDataOut;
106:
107: /** The {@link FastBufferedInputStream} from which cached document data are read. */
108: private final FastBufferedInputStream cacheDataIn;
109:
110: /** The length of the data segment for each quantum. */
111: private final int[] cacheDataLength;
112:
113: /** An {@link OutputBitStream} wrapping a {@link NullOutputStream} for code-length preview. */
114: private final OutputBitStream bitCount;
115:
116: /** The sum of all tower data computed so far. */
117: public final TowerData towerData;
118:
119: /** The number of bits written for quantum lengths. */
120: public long bitsForQuantumBitLengths;
121:
122: /** The number of bits written for entry lenghts. */
123: public long bitsForEntryBitLengths;
124:
125: /** The number of written blocks. */
126: public long numberOfBlocks;
127:
128: /** An estimate on the number of bits occupied per tower entry in the last written cache, or -1 if no cache has been
129: * written for the current inverted list. */
130: public int prevEntryBitLength;
131:
132: /** An estimate on the number of bits occupied per quantum in the last written cache, or -1 if no cache has been
133: * written for the current inverted list. */
134: public int prevQuantumBitLength;
135:
136: /** The Golomb modulus for a top pointer skip, for each level. */
137: private final int[] towerTopB;
138:
139: /** The most significant bit of the Golomb modulus for a top pointer skip, for each level. */
140: private final int[] towerTopLog2B;
141:
142: /** The Golomb modulus for a lower pointer skip, for each level. */
143: private final int[] towerLowerB;
144:
145: /** The most significant bit of the Golomb modulus for a lower pointer skip, for each level. */
146: private final int[] towerLowerLog2B;
147:
148: /** The prediction for a pointer skip, for each level. */
149: private final int[] pointerPrediction;
150:
151: private java.io.PrintWriter pointerSkipStats;
152: private java.io.PrintWriter pointerTopSkipStats;
153: private java.io.PrintWriter bitSkipStats;
154: private java.io.PrintWriter bitTopSkipStats;
155: private String pointerSkipLine, bitSkipLine;
156:
157: /** Creates a new skip index writer, with the specified basename. The index will be written on a file (stemmed with <samp>.index</samp>).
158: * If <code>writeOffsets</code>, also an offset file will be produced (stemmed with <samp>.offsets</samp>).
159: *
160: * <p>The size of the internal temporary buffer will be {@link #DEFAULT_TEMP_BUFFER_SIZE}.
161: *
162: * @param basename the basename.
163: * @param numberOfDocuments the number of documents in the collection to be indexed.
164: * @param writeOffsets if <code>true</code>, the offset file will also be produced.
165: * @param flags a flag map setting the coding techniques to be used (see {@link CompressionFlags}).
166: * @param q the cache contains at most <var>2<sup>h</sup></var> document records.
167: * @param h the maximum height of a skip tower.
168: */
169: public SkipBitStreamIndexWriter(final CharSequence basename,
170: final int numberOfDocuments, final boolean writeOffsets,
171: final Map<Component, Coding> flags, final int q, final int h)
172: throws IOException {
173: this (new OutputBitStream(new FileOutputStream(basename
174: + DiskBasedIndex.INDEX_EXTENSION)),
175: writeOffsets ? new OutputBitStream(
176: new FileOutputStream(basename
177: + DiskBasedIndex.OFFSETS_EXTENSION))
178: : null, numberOfDocuments,
179: DEFAULT_TEMP_BUFFER_SIZE, flags, q, h);
180: }
181:
182: /** Creates a new skip index writer, with the specified basename. The index will be written on a file (stemmed with <samp>.index</samp>).
183: * If <code>writeOffsets</code>, also an offset file will be produced (stemmed with <samp>.offsets</samp>).
184: *
185: * @param basename the basename.
186: * @param numberOfDocuments the number of documents in the collection to be indexed.
187: * @param writeOffsets if <code>true</code>, the offset file will also be produced.
188: * @param tempBufferSize the size in bytes of the internal temporary buffer (inverted lists shorter than this size will never be flushed to disk).
189: * @param flags a flag map setting the coding techniques to be used (see {@link CompressionFlags}).
190: * @param q the cache contains at most <var>2<sup>h</sup></var> document records.
191: * @param h the maximum height of a skip tower.
192: */
193: public SkipBitStreamIndexWriter(final CharSequence basename,
194: final int numberOfDocuments, final boolean writeOffsets,
195: int tempBufferSize, final Map<Component, Coding> flags,
196: final int q, final int h) throws IOException {
197: this (new OutputBitStream(new FileOutputStream(basename
198: + DiskBasedIndex.INDEX_EXTENSION)),
199: writeOffsets ? new OutputBitStream(
200: new FileOutputStream(basename
201: + DiskBasedIndex.OFFSETS_EXTENSION))
202: : null, numberOfDocuments, tempBufferSize,
203: flags, q, h);
204: }
205:
206: /**
207: * Creates a new skip index writer.
208: *
209: * @param obs the underlying output bit stream.
210: * @param offset the offset bit stream.
211: * @param N the number of documents in the collection to be indexed.
212: * @param tempBufferSize the size in bytes of the internal temporary buffer (inverted lists shorter than this size will never be flushed to disk).
213: * @param flags a flag map setting the coding techniques to be used (see {@link CompressionFlags}).
214: * @param q the cache contains at most <var>2<sup>h</sup></var> document records.
215: * @param h the maximum height of a skip tower.
216: * @throws IOException
217: */
218: public SkipBitStreamIndexWriter(final OutputBitStream obs,
219: final OutputBitStream offset, final int N,
220: int tempBufferSize, final Map<Component, Coding> flags,
221: final int q, final int h) throws IOException {
222: super (obs, offset, N, flags);
223: if (h < 0)
224: throw new IllegalArgumentException("Illegal height " + h);
225: if (q <= 0 || (q & -q) != q)
226: throw new IllegalArgumentException("Illegal quantum " + q);
227: this .h = h;
228: this .q = q;
229:
230: int two2h = 1 << h;
231: w = two2h * q;
232:
233: if (DEBUG) {
234: System.err.println("Cache will contain at most " + w
235: + " records (q=" + q + ",h=" + h + ")");
236: System.err.print("Skip records will be ");
237: for (int i = 0; i < two2h; i++)
238: System.err.print((i * q) + " ");
239: System.err.println();
240: }
241:
242: towerData = new TowerData();
243: tempFile = File.createTempFile("MG4J", ".data");
244: cacheDataIn = new FastBufferedInputStream(new FileInputStream(
245: tempFile));
246: cacheDataOut = new CachingOutputBitStream(tempFile,
247: tempBufferSize);
248: cacheDataLength = new int[two2h];
249: cachePointer = new OutputBitStream[two2h];
250: cachePointerByte = new FastByteArrayOutputStream[two2h];
251:
252: for (int i = 0; i < two2h; i++)
253: cachePointer[i] = new OutputBitStream(
254: cachePointerByte[i] = new FastByteArrayOutputStream(),
255: 0);
256:
257: cacheSkip = new OutputBitStream[two2h];
258: cacheSkipBitCount = new OutputBitStream[two2h];
259: cacheSkipByte = new FastByteArrayOutputStream[two2h];
260:
261: for (int i = 0; i < two2h; i++) {
262: cacheSkip[i] = new OutputBitStream(
263: cacheSkipByte[i] = new FastByteArrayOutputStream(),
264: 0);
265: cacheSkipBitCount[i] = new OutputBitStream(NullOutputStream
266: .getInstance(), 0);
267: }
268:
269: skipPointer = new int[two2h + 1];
270: distance = new long[two2h + 1];
271:
272: bitCount = new OutputBitStream(NullOutputStream.getInstance(),
273: 0);
274:
275: towerTopB = new int[h + 1];
276: towerTopLog2B = new int[h + 1];
277: towerLowerB = new int[h + 1];
278: towerLowerLog2B = new int[h + 1];
279: pointerPrediction = new int[h + 1];
280:
281: if (STATS) {
282: try {
283: pointerSkipStats = new PrintWriter(
284: new java.io.FileWriter("pointerSkip.stats"));
285: pointerTopSkipStats = new PrintWriter(
286: new java.io.FileWriter("pointerTopSkip.stats"));
287: bitSkipStats = new PrintWriter(new java.io.FileWriter(
288: "bitSkip.stats"));
289: bitTopSkipStats = new PrintWriter(
290: new java.io.FileWriter("bitTopSkip.stats"));
291:
292: String comment = "# " + System.getProperty("freq")
293: + "\u00b1" + Integer.getInteger("error") + "%";
294: pointerSkipStats.println(comment);
295: pointerTopSkipStats.println(comment);
296: bitSkipStats.println(comment);
297: bitTopSkipStats.println(comment);
298: } catch (IOException e) {
299: System.err.println("Can't open stat files: " + e);
300: }
301: }
302: }
303:
304: public long newInvertedList() throws IOException {
305: if (cache != 0)
306: writeOutCache(-1);
307: return super .newInvertedList();
308: }
309:
310: public int writeFrequency(final int frequency) throws IOException {
311: int res = super .writeFrequency(frequency);
312: prevQuantumBitLength = prevEntryBitLength = -1;
313:
314: if (DEBUG)
315: System.err.println("----------- " + currentTerm + " ("
316: + frequency + ")");
317:
318: final long pointerQuantumSigma = BitStreamIndex.quantumSigma(
319: frequency, numberOfDocuments, q);
320: for (int i = Math
321: .min(h, Fast.mostSignificantBit(frequency / q)); i >= 0; i--) {
322: towerTopB[i] = BitStreamIndex.gaussianGolombModulus(
323: pointerQuantumSigma, i + 1);
324: towerTopLog2B[i] = Fast.mostSignificantBit(towerTopB[i]);
325: towerLowerB[i] = BitStreamIndex.gaussianGolombModulus(
326: pointerQuantumSigma, i);
327: towerLowerLog2B[i] = Fast
328: .mostSignificantBit(towerLowerB[i]);
329: pointerPrediction[i] = (int) ((q * (1L << i)
330: * numberOfDocuments + frequency / 2) / frequency);
331: }
332:
333: return res;
334: }
335:
336: public OutputBitStream newDocumentRecord() throws IOException {
337: super .newDocumentRecord();
338: return cacheDataOut;
339: }
340:
341: public int writeDocumentPointer(final OutputBitStream out,
342: final int pointer) throws IOException {
343: // If the previous block is over, write it out!
344:
345: if (cache == w)
346: writeOutCache(pointer);
347:
348: // Record data pointer if we are on a skip; otherwise, write it to the cache.
349: if (cache % q == 0) {
350: if (cache / q > 0)
351: cacheDataLength[cache / q - 1] = (int) cacheDataOut
352: .writtenBits();
353: cacheDataOut.align();
354: cacheDataOut.writtenBits(0);
355: skipPointer[cache / q] = pointer;
356: return super .writeDocumentPointer(
357: cachePointer[cache++ / q], pointer);
358: } else {
359: cache++;
360: return super .writeDocumentPointer(cacheDataOut, pointer);
361: }
362: }
363:
364: private int writeOutPointer(final OutputBitStream out,
365: final int pointer) throws IOException {
366: if (frequency == numberOfDocuments)
367: return 0; // We do not write pointers for everywhere occurring terms.
368:
369: switch (pointerCoding) {
370: case GAMMA:
371: return out.writeGamma(pointer - lastDocument - 1);
372: case DELTA:
373: return out.writeDelta(pointer - lastDocument - 1);
374: case GOLOMB:
375: return out
376: .writeGolomb(pointer - lastDocument - 1, b, log2b);
377: default:
378: throw new IllegalStateException(
379: "The required pointer coding (" + pointerCoding
380: + ") is not supported.");
381: }
382: }
383:
384: public void close() throws IOException {
385:
386: if (cache != 0)
387: writeOutCache(-1);
388: super .close();
389:
390: cacheDataIn.close();
391: cacheDataOut.close();
392: tempFile.delete();
393:
394: if (STATS) {
395: pointerSkipStats.close();
396: pointerTopSkipStats.close();
397: bitSkipStats.close();
398: bitTopSkipStats.close();
399: }
400: }
401:
402: /** A structure maintaining statistical data about tower construction. */
403:
404: public static class TowerData {
405: /** The number of bits written for bit skips at the top of a tower. */
406: public long bitsForTopBitSkips;
407:
408: /** The number of bits written for skip pointers at the top of a tower. */
409: public long bitsForTopSkipPointers;
410:
411: /** The number of bits written for bit skips in the lower part of a tower. */
412: public long bitsForLowerBitSkips;
413:
414: /** The number of bits written for skip pointers in the lower part of a tower. */
415: public long bitsForLowerSkipPointers;
416:
417: /** The number of bits written for tower lengths. */
418: public long bitsForTowerLengths;
419:
420: /** The number of written skip towers. */
421: public long numberOfSkipTowers;
422:
423: /** The number of written top skip entries. */
424: public long numberOfTopEntries;
425:
426: /** The number of written lower skip entries. */
427: public long numberOfLowerEntries;
428:
429: /** Clear all fields of this tower data. */
430:
431: void clear() {
432: bitsForTopBitSkips = 0;
433: bitsForTopSkipPointers = 0;
434: bitsForLowerBitSkips = 0;
435: bitsForLowerSkipPointers = 0;
436: bitsForTowerLengths = 0;
437: numberOfSkipTowers = 0;
438: numberOfTopEntries = 0;
439: numberOfLowerEntries = 0;
440: }
441:
442: /** Returns the overall number of bits used for skip pointers.
443: * @return the overall number of bits used for skip pointers.
444: */
445: public long bitsForSkipPointers() {
446: return bitsForTopSkipPointers + bitsForLowerSkipPointers;
447: }
448:
449: /** Returns the overall number of bits used for bit skips.
450: * @return the overall number of bits used for bit skips.
451: */
452: public long bitsForBitSkips() {
453: return bitsForTopBitSkips + bitsForLowerBitSkips;
454: }
455:
456: /** Returns the overall number of bits used for tower entries (bits for tower lengths are not included).
457: * @return the overall number of bits used for tower entries.
458: */
459: public long bitsForEntries() {
460: return bitsForSkipPointers() + bitsForBitSkips();
461: }
462:
463: /** Returns the overall number of bits used for towers.
464: * @return the overall number of bits used for towers.
465: */
466: public long bitsForTowers() {
467: return bitsForTowerLengths + bitsForEntries();
468: }
469:
470: /** Returns the overall number of entries.
471: * @return the overall number of entries.
472: */
473: public long numberOfEntries() {
474: return numberOfTopEntries + numberOfLowerEntries;
475: }
476: }
477:
478: /** The <var>k</var>-th entry of this array contains the number of bits from the start of
479: * the <var>k</var>-th skip tower up to the end of the current block (more precisely,
480: * to the point that should be reached via skipping, which is just after the document pointer).
481: * Indices are skip indices. It is used just by {@link #tryTower(int, int, long, OutputBitStream[], TowerData, boolean)},
482: * but it is declared here for efficiency.
483: */
484: final private long[] distance;
485:
486: /** The temporary file dumping the index data contained in a block. */
487: final private File tempFile;
488:
489: /** Whether we should write stats. */
490: private boolean writeStats;
491:
492: /** Computes the towers.
493: *
494: * @param quantumBitLength the length in bits of a quantum.
495: * @param entryBitLength the estimated length in bits of a tower entry.
496: * @param toTheEnd the number of bits that must be skipped to reach the next tower (usually,
497: * the length of the first pointer of the next block or 0 if this is to be the last block).
498: * @param skip an array of output bit stream where the data related to each tower will be written.
499: * @param towerData will be filled with statistical date about the towers.
500: * @param doinIt if true, we are actually writing a tower, not just trying.
501: */
502: private void tryTower(final int quantumBitLength,
503: final int entryBitLength, long toTheEnd,
504: final OutputBitStream[] skip, final TowerData towerData,
505: final boolean doinIt) throws IOException {
506: int i, k, s;
507: long d;
508: int basePointer;
509: // truncated is true only for those towers (in defective blocks) whose height is strictly smaller than the height they should have
510: boolean truncated = false;
511:
512: if (DEBUG && doinIt)
513: System.err.println("Writing out tower for term "
514: + currentTerm + "; quantumBitLength="
515: + quantumBitLength + " entryBitLength="
516: + entryBitLength);
517:
518: for (k = (cache - 1) / q; k >= 0; k--) {
519: // Where are we? At the end of the k-th quantum. So toTheEnd must be increased by
520: // the length of the data contained in the same quantum, moving us...
521: toTheEnd += cacheDataLength[k];
522:
523: // ...just after the k-th skip tower.
524: // We compute the maximum valid index of the skip tower (*MUST* be kept in sync with the subsequent loop).
525: s = (k == 0) ? h : Fast.leastSignificantBit(k);
526:
527: // This test handles defective blocks. In particular, for defective quanta s=-1,
528: // yielding no skipping data at all for such quanta. truncated is true if the
529: // current tower is truncated w.r.t. the infinite skip list.
530: if (cache < w) {
531: final int upperBound = Fast
532: .mostSignificantBit((cache / q) - k);
533: if (s > upperBound) {
534: s = upperBound;
535: truncated = true;
536: } else
537: truncated = false;
538: } else
539: truncated = k == 0;
540:
541: skip[k].writtenBits(0);
542:
543: if (s >= 0) {
544: if (DEBUG && doinIt)
545: System.err.print("% (" + k + ") [" + skipPointer[k]
546: + "] ");
547:
548: basePointer = skipPointer[k];
549:
550: /* If the current tower is truncated, we must actually write the top of the tower.
551: * The top must be forecast in a Bernoullian way: we write it as a difference from the average pointer skip,
552: * which is q 2^s / relativeFrequency. */
553: if (truncated) {
554: towerData.numberOfTopEntries++;
555: towerData.bitsForTopSkipPointers += skip[k]
556: .writeGolomb(Fast.int2nat(skipPointer[k
557: + (1 << s)]
558: - basePointer
559: - pointerPrediction[s]),
560: towerTopB[s], towerTopLog2B[s]);
561: towerData.bitsForTopBitSkips += skip[k]
562: .writeLongDelta(Fast
563: .int2nat((int) ((toTheEnd - distance[k
564: + (1 << s)]) - (quantumBitLength
565: * (1 << s) + entryBitLength
566: * ((1 << s + 1) - s - 2)))));
567: }
568:
569: if (DEBUG && doinIt)
570: System.err.print((truncated ? "" : "(")
571: + (skipPointer[k + (1 << s)] - basePointer)
572: + ":" + (toTheEnd - distance[k + (1 << s)])
573: + (truncated ? " " : ") "));
574:
575: if (STATS && doinIt && writeStats) {
576: // These *MUST* be kept in sync with the writes above.
577: if (truncated) {
578: pointerTopSkipStats
579: .println(s
580: + "\t"
581: + (skipPointer[k + (1 << s)]
582: - basePointer - pointerPrediction[s]));
583: bitTopSkipStats
584: .println(s
585: + "\t"
586: + (int) ((toTheEnd - distance[k
587: + (1 << s)]) - (quantumBitLength
588: * (1 << s) + entryBitLength
589: * ((1 << s + 1) - s - 2))));
590: }
591: pointerSkipLine = "";
592: bitSkipLine = "";
593: }
594:
595: // Produce a (single) tower of height s
596: for (i = s - 1; i >= 0; i--) {
597: towerData.bitsForLowerSkipPointers += skip[k]
598: .writeGolomb(
599: Fast
600: .int2nat((skipPointer[k
601: + (1 << i)] - basePointer)
602: - ((skipPointer[k
603: + (1 << i + 1)] - basePointer) / 2)),
604: towerLowerB[i], towerLowerLog2B[i]);
605:
606: if (STATS && doinIt && writeStats)
607: pointerSkipLine = ((skipPointer[k + (1 << i)] - basePointer) - ((skipPointer[k
608: + (1 << i + 1)] - basePointer) / 2))
609: + "\t" + pointerSkipLine;
610:
611: towerData.bitsForLowerBitSkips += skip[k]
612: .writeLongDelta(Fast
613: .int2nat((int) (((toTheEnd
614: - distance[k
615: + (1 << (i + 1))] - entryBitLength
616: * (i + 1)) / 2) - (toTheEnd - distance[k
617: + (1 << i)]))));
618:
619: if (STATS && doinIt && writeStats)
620: bitSkipLine = (int) (((toTheEnd
621: - distance[k + (1 << (i + 1))] - entryBitLength
622: * (i + 1)) / 2) - (toTheEnd - distance[k
623: + (1 << i)]))
624: + "\t" + bitSkipLine;
625:
626: if (DEBUG && doinIt)
627: System.err
628: .print((skipPointer[k + (1 << i)] - basePointer)
629: + ":"
630: + (toTheEnd - distance[k
631: + (1 << i)]) + " ");
632: }
633:
634: if (s > 0) { // No length for single-entry towers.
635: d = bitCount.writeDelta(Fast.int2nat((int) skip[k]
636: .writtenBits()
637: - (s + 1) * entryBitLength));
638: towerData.bitsForTowerLengths += d;
639: toTheEnd += d;
640: }
641:
642: if (STATS && doinIt && writeStats) {
643: if (pointerSkipLine.length() > 0)
644: pointerSkipStats.println(pointerSkipLine);
645: if (bitSkipLine.length() > 0)
646: bitSkipStats.println(bitSkipLine);
647: }
648:
649: toTheEnd += skip[k].writtenBits();
650:
651: if (DEBUG && doinIt)
652: System.err.print(" (" + (int) skip[k].writtenBits()
653: + " bits)");
654:
655: towerData.numberOfLowerEntries += s;
656: towerData.numberOfSkipTowers++;
657:
658: if (DEBUG && doinIt)
659: System.err.println();
660: }
661:
662: distance[k] = toTheEnd;
663:
664: // Where are we? Just before the beginning of the k-th skip tower
665: toTheEnd += cachePointer[k].writtenBits();
666:
667: // Where are we? Just before the beginning of the k-th document record
668: }
669: }
670:
671: /** Write out the cache content.
672: *
673: * @param nextPointer the first pointer of the next block, or -1 if this is the last block.
674: */
675: private void writeOutCache(final int nextPointer)
676: throws IOException {
677: if (DEBUG)
678: System.err.println("Entered writeOutCache() with cache="
679: + cache + " (H is " + (1 << h) + ", B is " + w
680: + ")");
681:
682: cacheDataLength[(cache + q - 1) / q - 1] = (int) cacheDataOut
683: .writtenBits();
684:
685: /* Number of bits to go after the first pointer of the first record of the next block (or, if there
686: is no other block in the current list, to go to the end of the list). */
687: long toTheEnd;
688:
689: // Record the new document pointer for the highest tower
690: int nextAfter = ((cache + q) - 1) / q; // This is ceil( cache / q )
691:
692: if (nextPointer >= 0) {
693: skipPointer[nextAfter] = nextPointer;
694: toTheEnd = writeOutPointer(bitCount, nextPointer);
695: } else {
696: skipPointer[nextAfter] = currentDocument + 1; // Fake: just for the last block
697: toTheEnd = 0;
698: }
699:
700: distance[nextAfter] = 0;
701:
702: int k, s;
703: long d;
704:
705: // Compute quantum length in bits (without towers)
706: int quantumBitLength = 0, entryBitLength = 0;
707:
708: for (d = k = 0; k <= ((cache - 1) / q); k++)
709: d += (cachePointer[k].writtenBits() + cacheDataLength[k]);
710: quantumBitLength = (int) (((d * q) + (cache - 1)) / cache);
711:
712: final TowerData td = new TowerData();
713: final Int2IntRBTreeMap candidates = new Int2IntRBTreeMap();
714:
715: /* As a first try, we compute the tower costs using 0 as average entry bit length. */
716: tryTower(quantumBitLength, 0, toTheEnd, cacheSkipBitCount, td,
717: false);
718:
719: if (td.numberOfSkipTowers > 0) { // There actually is at least a tower.
720: /* Now we repeat this operation, trying to obtain the best value for the
721: * average entry bit length.
722: */
723:
724: while (candidates.size() < MAX_TRY
725: && !candidates
726: .containsValue(entryBitLength = (int) (td
727: .bitsForTowers() / td
728: .numberOfEntries()))) {
729: td.clear();
730: tryTower(quantumBitLength, entryBitLength, toTheEnd,
731: cacheSkipBitCount, td, false);
732: candidates.put((int) (td.bitsForTowers() / td
733: .numberOfEntries()), entryBitLength);
734: }
735:
736: if (ASSERTS)
737: assert candidates.size() < MAX_TRY;
738:
739: entryBitLength = candidates.get(candidates.firstIntKey());
740:
741: if (STATS && System.getProperty("freq") != null) {
742: final double freq = Double.parseDouble(System
743: .getProperty("freq"));
744: final double error = Integer.getInteger("error")
745: .intValue() / 100.0;
746: final double relativeFrequency = (double) frequency
747: / numberOfDocuments;
748: if ((writeStats = (relativeFrequency >= freq
749: * (1 - error) && relativeFrequency <= freq
750: * (1 + error)))) {
751: pointerSkipStats.println("# " + currentTerm);
752: pointerTopSkipStats.println("# " + currentTerm);
753: bitSkipStats.println("# " + currentTerm + " "
754: + quantumBitLength + " " + entryBitLength);
755: bitTopSkipStats.println("# " + currentTerm + " "
756: + quantumBitLength + " " + entryBitLength);
757: }
758: }
759: if (DEBUG)
760: System.err.println("Going to write tower at position "
761: + obs.writtenBits());
762: tryTower(quantumBitLength, entryBitLength, toTheEnd,
763: cacheSkip, towerData, true);
764: }
765:
766: // Ready to write out cache
767: int maxCacheDataLength = 0;
768: for (k = 0; k <= ((cache - 1) / q); k++)
769: if (cacheDataLength[k] > maxCacheDataLength)
770: maxCacheDataLength = cacheDataLength[k];
771:
772: /* We have two ways of writing out cached data. If all the data is still in the output bit
773: * stream buffer, we just read it directly. Otherwise, we have to pour it into a temporary buffer. */
774:
775: final byte[] buffer;
776: final boolean direct;
777: int pos = 0;
778:
779: cacheDataOut.align();
780:
781: if (cacheDataOut.buffer() != null) {
782: buffer = cacheDataOut.buffer();
783: direct = true;
784: } else {
785: cacheDataOut.flush();
786: buffer = new byte[(maxCacheDataLength + 7) / 8];
787: direct = false;
788: cacheDataIn.flush();
789: cacheDataIn.position(0);
790: }
791:
792: for (k = 0; k <= ((cache - 1) / q); k++) {
793:
794: /* See comments above. */
795: s = (k == 0) ? h : Fast.leastSignificantBit(k);
796:
797: if (cache < w)
798: s = Math.min(s, Fast
799: .mostSignificantBit((cache / q) - k));
800:
801: d = cachePointer[k].writtenBits();
802: cachePointer[k].flush();
803: obs.write(cachePointerByte[k].array, d);
804:
805: d = cacheSkip[k].writtenBits();
806: cacheSkip[k].flush();
807:
808: if (s >= 0) {
809: if (k == 0) {
810: if (prevQuantumBitLength < 0) {
811: bitsForQuantumBitLengths += obs
812: .writeLongDelta(quantumBitLength);
813: bitsForEntryBitLengths += obs
814: .writeLongDelta(entryBitLength);
815: } else {
816: bitsForQuantumBitLengths += obs
817: .writeLongDelta(Fast
818: .int2nat(quantumBitLength
819: - prevQuantumBitLength));
820: bitsForEntryBitLengths += obs
821: .writeLongDelta(Fast
822: .int2nat(entryBitLength
823: - prevEntryBitLength));
824: }
825:
826: prevQuantumBitLength = quantumBitLength;
827: prevEntryBitLength = entryBitLength;
828:
829: numberOfBlocks++;
830: }
831:
832: if (s > 0)
833: obs.writeDelta(Fast.int2nat((int) d
834: - entryBitLength * (s + 1))); // No length for single-entry towers.
835: } else if (ASSERTS)
836: assert d == 0;
837:
838: obs.write(cacheSkipByte[k].array, d);
839:
840: if (direct) {
841: obs.write(buffer, pos * 8, cacheDataLength[k]);
842: pos += (cacheDataLength[k] + 7) / 8;
843: } else {
844: cacheDataIn.read(buffer, 0,
845: (cacheDataLength[k] + 7) / 8);
846: obs.write(buffer, cacheDataLength[k]);
847: }
848: }
849:
850: // Clean used caches
851: for (k = 0; k <= ((cache - 1) / q); k++) {
852: cachePointerByte[k].reset();
853: cachePointer[k].writtenBits(0);
854:
855: cacheSkipByte[k].reset();
856: cacheSkip[k].writtenBits(0);
857:
858: cacheDataOut.position(0);
859: cacheDataOut.writtenBits(0);
860: }
861:
862: cache = 0;
863:
864: if (ASSERTS)
865: assert obs.writtenBits() == writtenBits();
866: }
867:
868: public long writtenBits() {
869: return super .writtenBits() + towerData.bitsForTopSkipPointers
870: + towerData.bitsForTopBitSkips
871: + towerData.bitsForLowerSkipPointers
872: + towerData.bitsForLowerBitSkips
873: + towerData.bitsForTowerLengths
874: + bitsForQuantumBitLengths + bitsForEntryBitLengths;
875: }
876:
877: public Properties properties() {
878: Properties result = super .properties();
879: result.setProperty(Index.PropertyKeys.INDEXCLASS,
880: FileIndex.class.getName());
881: result.setProperty(BitStreamIndex.PropertyKeys.SKIPQUANTUM, q);
882: result.setProperty(BitStreamIndex.PropertyKeys.SKIPHEIGHT, h);
883: return result;
884: }
885:
886: public void printStats(final PrintStream stats) {
887: super .printStats(stats);
888: stats.println("Skip towers: "
889: + Util.format(towerData.numberOfSkipTowers)
890: + " ("
891: + Util.format(towerData.bitsForTowers())
892: + " bits ["
893: + Util.format(towerData.bitsForTowers() * 100.0
894: / writtenBits())
895: + "%], "
896: + Util.format(towerData.bitsForTowers()
897: / (double) towerData.numberOfSkipTowers)
898: + " bits/tower)");
899: stats.println("Skip entries: "
900: + Util.format(towerData.numberOfEntries())
901: + " ("
902: + Util.format(towerData.bitsForEntries()
903: / (double) towerData.numberOfEntries())
904: + " bits/entry)");
905: // Note that lengths are written approximately every other tower.
906: stats.println("Skip tower lengths: "
907: + Util.format(towerData.bitsForTowerLengths)
908: + " bits ("
909: + Util.format(2.0 * towerData.bitsForTowerLengths
910: / towerData.numberOfSkipTowers)
911: + " bits/tower)");
912: stats.println("Quantum bit lengths: "
913: + Util.format(bitsForQuantumBitLengths)
914: + " bits ("
915: + Util.format(bitsForQuantumBitLengths
916: / (double) numberOfBlocks) + " bits/block)");
917: stats.println("Entry bit lengths: "
918: + Util.format(bitsForEntryBitLengths)
919: + " bits ("
920: + Util.format(bitsForEntryBitLengths
921: / (double) numberOfBlocks) + " bits/block)");
922:
923: stats.println("Top bit skips: "
924: + Util.format(towerData.bitsForTopBitSkips)
925: + " bits ("
926: + Util.format(towerData.bitsForTopBitSkips
927: / (double) towerData.numberOfTopEntries)
928: + " bits/skip)");
929: stats.println("Top pointer skips: "
930: + Util.format(towerData.bitsForTopSkipPointers)
931: + " bits ("
932: + Util.format(towerData.bitsForTopSkipPointers
933: / (double) towerData.numberOfTopEntries)
934: + " bits/skip)");
935: stats.println("Lower bit skips: "
936: + Util.format(towerData.bitsForLowerBitSkips)
937: + " bits ("
938: + Util.format(towerData.bitsForLowerBitSkips
939: / (double) towerData.numberOfLowerEntries)
940: + " bits/skip)");
941: stats.println("Lower pointer skips: "
942: + Util.format(towerData.bitsForLowerSkipPointers)
943: + " bits ("
944: + Util.format(towerData.bitsForLowerSkipPointers
945: / (double) towerData.numberOfLowerEntries)
946: + " bits/skip)");
947: stats.println("Bit skips: "
948: + Util.format(towerData.bitsForBitSkips())
949: + " bits ("
950: + Util.format(towerData.bitsForBitSkips()
951: / (double) towerData.numberOfEntries())
952: + " bits/skip)");
953: stats.println("Pointer skips: "
954: + Util.format(towerData.bitsForSkipPointers())
955: + " bits ("
956: + Util.format(towerData.bitsForSkipPointers()
957: / (double) towerData.numberOfEntries())
958: + " bits/skip)");
959: }
960:
961: }
|