0001: package it.unimi.dsi.mg4j.index;
0002:
0003: /*
0004: * MG4J: Managing Gigabytes for Java
0005: *
0006: * Copyright (C) 2007 Paolo Boldi and Sebastiano Vigna
0007: *
0008: * This library is free software; you can redistribute it and/or modify it
0009: * under the terms of the GNU Lesser General Public License as published by the Free
0010: * Software Foundation; either version 2.1 of the License, or (at your option)
0011: * any later version.
0012: *
0013: * This library is distributed in the hope that it will be useful, but
0014: * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
0015: * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
0016: * for more details.
0017: *
0018: * You should have received a copy of the GNU Lesser General Public License
0019: * along with this program; if not, write to the Free Software
0020: * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
0021: *
0022: */
0023:
0024: import it.unimi.dsi.fastutil.ints.IntIterator;
0025: import it.unimi.dsi.fastutil.ints.IntIterators;
0026: import it.unimi.dsi.fastutil.ints.IntSet;
0027: import it.unimi.dsi.fastutil.objects.AbstractObjectIterator;
0028: import it.unimi.dsi.fastutil.objects.Reference2ReferenceMap;
0029: import it.unimi.dsi.fastutil.objects.Reference2ReferenceMaps;
0030: import it.unimi.dsi.fastutil.objects.ReferenceSet;
0031: import it.unimi.dsi.mg4j.index.AbstractIndexIterator;
0032: import it.unimi.dsi.mg4j.index.AbstractIndexReader;
0033: import it.unimi.dsi.mg4j.index.BitStreamHPIndex;
0034: import it.unimi.dsi.mg4j.index.Index;
0035: import it.unimi.dsi.mg4j.index.IndexIterator;
0036: import it.unimi.dsi.mg4j.index.CompressionFlags.Coding;
0037: import it.unimi.dsi.mg4j.index.payload.Payload;
0038: import it.unimi.dsi.io.InputBitStream;
0039: import it.unimi.dsi.util.Interval;
0040: import it.unimi.dsi.mg4j.search.IntervalIterator;
0041: import it.unimi.dsi.mg4j.search.IntervalIterators;
0042: import it.unimi.dsi.bits.Fast;
0043: import it.unimi.dsi.Util;
0044:
0045: import java.io.IOException;
0046: import java.util.NoSuchElementException;
0047:
0048: import org.apache.log4j.Logger;
0049:
0050: /** A bitstream-based {@linkplain IndexReader index reader} for {@linkplain BitStreamHPIndex high-performance indices}. */
0051:
0052: public class BitStreamHPIndexReader extends AbstractIndexReader {
0053: @SuppressWarnings("unused")
0054: private static final Logger LOGGER = Util
0055: .getLogger(BitStreamHPIndexReader.class);
0056:
0057: private final static boolean ASSERTS = false;
0058: private final static boolean DEBUG = false;
0059: private final static boolean COOKIES = false;
0060:
0061: /** The reference index. */
0062: protected final BitStreamHPIndex index;
0063: /** The {@link IndexIterator} view of this reader (returned by {@link #documents(CharSequence)}). */
0064: protected final BitStreamHPIndexReaderIndexIterator indexIterator;
0065:
0066: /**
0067: * Creates a new skip index reader, with the specified underlying {@link Index} and input bit
0068: * stream.
0069: *
0070: * @param index the index.
0071: * @param ibs the underlying bit stream.
0072: */
0073: public BitStreamHPIndexReader(final BitStreamHPIndex index,
0074: final InputBitStream ibs, final InputBitStream positions) {
0075: this .index = index;
0076: this .indexIterator = new BitStreamHPIndexReaderIndexIterator(
0077: this , ibs, positions);
0078: }
0079:
0080: protected static final class BitStreamHPIndexReaderIndexIterator
0081: extends AbstractIndexIterator implements IndexIterator {
0082: /** The enclosing instance. */
0083: private final BitStreamHPIndexReader parent;
0084:
0085: /** The reference index. */
0086: protected final BitStreamHPIndex index;
0087:
0088: /** The underlying input bit stream. */
0089: protected final InputBitStream ibs;
0090:
0091: /** The underlying positions input bit stream. */
0092: private final InputBitStream positions;
0093:
0094: /** The enclosed interval iterator. */
0095: private final IndexIntervalIterator intervalIterator;
0096:
0097: /** A singleton set containing the enclosed interval iterator. */
0098: private final Reference2ReferenceMap<Index, IntervalIterator> singletonIntervalIterator;
0099:
0100: /** The key index. */
0101: private final Index keyIndex;
0102:
0103: /** The cached copy of {@link #index index.pointerCoding}. */
0104: protected final Coding pointerCoding;
0105:
0106: /** The cached copy of {@link #index index.countCoding}. */
0107: protected final Coding countCoding;
0108:
0109: /** The cached copy of {@link #index index.positionCoding}. */
0110: protected final Coding positionCoding;
0111:
0112: /** The parameter <code>b</code> for Golomb coding of pointers. */
0113: protected int b;
0114:
0115: /**
0116: * The parameter <code>log2b</code> for Golomb coding of pointers; it is the most
0117: * significant bit of {@link #b}.
0118: */
0119: protected int log2b;
0120:
0121: /** The current term. */
0122: @SuppressWarnings("hiding")
0123: protected int term = -1;
0124:
0125: /** The current frequency. */
0126: protected int frequency;
0127:
0128: /**
0129: * Whether the current terms has pointers at all (this happens when the {@link #frequency}
0130: * is smaller than the number of documents).
0131: */
0132: protected boolean hasPointers;
0133:
0134: /** The current count (if this index contains counts). */
0135: protected int count;
0136:
0137: /**
0138: * The last document pointer we read from current list, -1 if we just read the frequency,
0139: * {@link Integer#MAX_VALUE} if we are beyond the end of list.
0140: */
0141: protected int currentDocument;
0142:
0143: /** The number of the document record we are going to read inside the current inverted list. */
0144: protected int numberOfDocumentRecord;
0145:
0146: /** This variable tracks the current state of the reader. */
0147: protected int state;
0148:
0149: /** The parameter <code>h</code> (the maximum height of a skip tower). */
0150: public final int height;
0151:
0152: /** The quantum. */
0153: public final int quantum;
0154:
0155: /** The bit mask giving the remainder of the division by {@link #quantum}. */
0156: public final int quantumModuloMask;
0157:
0158: /** The shift giving result of the division by {@link #quantum}. */
0159: public final int quantumDivisionShift;
0160:
0161: /**
0162: * The maximum height of a skip tower in the current block. May be less than {@link #height}
0163: * if the block is defective, and will be -1 on defective quanta (no tower at all).
0164: */
0165: private int maxh;
0166:
0167: /** The maximum valid index of the current skip tower, if any. */
0168: private int s;
0169:
0170: /**
0171: * The minimum valid index of the current skip tower, or {@link Integer#MAX_VALUE}. If
0172: * {@link #maxh} is negative, the value is undefined.
0173: */
0174: private int lowest;
0175:
0176: /** We have <var>w</var> = <var>Hq</var>. */
0177: private int w;
0178:
0179: /** The bit mask giving the remainder of the division by {@link #w}. */
0180: private final int wModuloMask;
0181:
0182: /** The shift giving result of the division by {@link #w}. */
0183: private final int wDivisionShift;
0184:
0185: /** The Golomb modulus for a top pointer skip, for each level. */
0186: private int[] towerTopB;
0187:
0188: /** The most significant bit of the Golomb modulus for a top point[]er skip, for each level. */
0189: private int[] towerTopLog2B;
0190:
0191: /** The Golomb modulus for a lower pointer skip, for each level. */
0192: private int[] towerLowerB;
0193:
0194: /** The most significant bit of the Golomb modulus for a lower pointer skip, for each level. */
0195: private int[] towerLowerLog2B;
0196:
0197: /** The prediction for a pointer skip, for each level. */
0198: private int[] pointerPrediction;
0199:
0200: /** An array to decode bit skips. */
0201: private long[] bitSkip;
0202:
0203: /** An array to decode positions bit skips. */
0204: private long[] positionsBitSkip;
0205:
0206: /** An array to decode the pointer skips. */
0207: private int[] pointerSkip;
0208:
0209: /** The number of bits read just after reading the last skip tower. */
0210: private long readBitsAtLastSkipTower;
0211:
0212: /** The document pointer corresponding to the last skip tower. */
0213: private int pointerAtLastSkipTower;
0214:
0215: /** The number of positions that should be skipped manually
0216: * once the last read tower has been reached by skipping {@link #positionsToReadToReachCurrentPosition}.
0217: */
0218: private int positionsToReadToReachCurrentPosition;
0219:
0220: /** The offset in the positions stream from the start of the current quantum
0221: * to the start of the last tower.
0222: */
0223: private long positionsBitsOffset;
0224:
0225: /** The last increment added to {@link #positionsBitSkip}. */
0226: private long lastPositionsIncrement;
0227:
0228: /** The current quantum bit length, as provided by the index. */
0229: private int quantumBitLength;
0230:
0231: /** The current positions quantum bit length, as provided by the index. */
0232: private int positionsQuantumBitLength;
0233:
0234: /** The current entry bit length, as provided by the index. */
0235: private int entryBitLength;
0236:
0237: /** This value of {@link #state} means that we are positioned just before a tower. */
0238: private static final int BEFORE_TOWER = 0;
0239:
0240: /**
0241: * This value of {@link #state} can be assumed only in indices that contain counts; it means
0242: * that we are positioned just before the count for the current document record.
0243: */
0244: private static final int BEFORE_COUNT = 1;
0245:
0246: /**
0247: * This value of {@link #state} means that we are at the start of a new document record,
0248: * unless we already read all documents (i.e., {@link #numberOfDocumentRecord} ==
0249: * {@link #frequency}), in which case we are at the end of the inverted list, and
0250: * {@link #endOfList()} is true.
0251: */
0252: private static final int BEFORE_POINTER = 2;
0253:
0254: /** Whether the positions for the current document pointer have not been fetched yet. */
0255: private boolean positionsUnread;
0256:
0257: /** The cached position array. */
0258: protected int[] positionCache = new int[16];
0259:
0260: /** The offset of the positions of the current {@link #term}. */
0261: protected long lastPositionsOffset;
0262:
0263: public BitStreamHPIndexReaderIndexIterator(
0264: final BitStreamHPIndexReader parent,
0265: final InputBitStream ibs, final InputBitStream positions) {
0266: this .parent = parent;
0267: this .ibs = ibs;
0268: this .positions = positions;
0269: index = parent.index;
0270: keyIndex = index.keyIndex;
0271: pointerCoding = index.pointerCoding;
0272: countCoding = index.countCoding;
0273: positionCoding = index.positionCoding;
0274: intervalIterator = index.hasPositions ? new IndexIntervalIterator()
0275: : null;
0276: singletonIntervalIterator = index.hasPositions ? Reference2ReferenceMaps
0277: .singleton(keyIndex,
0278: (IntervalIterator) intervalIterator)
0279: : null;
0280: quantum = index.quantum;
0281: quantumModuloMask = quantum - 1;
0282: quantumDivisionShift = Fast.mostSignificantBit(quantum);
0283: height = index.height;
0284: w = (1 << height) * quantum;
0285: wModuloMask = w - 1;
0286: wDivisionShift = Fast.mostSignificantBit(w);
0287: bitSkip = new long[height + 1];
0288: positionsBitSkip = new long[height + 1];
0289: pointerSkip = new int[height + 1];
0290: towerTopB = new int[height + 1];
0291: towerTopLog2B = new int[height + 1];
0292: towerLowerB = new int[height + 1];
0293: towerLowerLog2B = new int[height + 1];
0294: pointerPrediction = new int[height + 1];
0295: }
0296:
0297: /** Debug variable, usually unused, that keeps track of the end of the positions stream for the current term (but not for term 0!). */
0298: private long positionsLength;
0299:
0300: /**
0301: * Positions the index on the inverted list of a given term.
0302: *
0303: * <p>This method can be called at any time. Note that it is <em>always</em> possible to
0304: * call this method with argument 0, even if offsets have not been loaded.
0305: *
0306: * @param term a term.
0307: */
0308: protected void position(final int term) throws IOException {
0309: if (term == 0) {
0310: if (ASSERTS) {
0311: // Get end of positions
0312: ibs.position(index.offsets.getLong(1));
0313: positionsLength = ibs.readLongDelta();
0314: }
0315:
0316: ibs.position(0);
0317: ibs.readBits(0);
0318: lastPositionsOffset = ibs.readLongDelta(); // This is 0 for sure
0319: positions.position(0);
0320: } else {
0321: if (index.offsets == null)
0322: throw new IllegalStateException(
0323: "You cannot position an index without offsets");
0324:
0325: if (ASSERTS) {
0326: // Get end of positions
0327: if (term < index.numberOfTerms - 1) {
0328: ibs.position(index.offsets.getLong(term + 1));
0329: positionsLength = ibs.readLongDelta();
0330: } else
0331: positionsLength = Long.MAX_VALUE; // Presently, no way to check
0332: }
0333:
0334: ibs.position(index.offsets.getLong(term));
0335: ibs.readBits(index.offsets.getLong(term));
0336: // Let us position the positions bitstream
0337: lastPositionsOffset = ibs.readLongDelta();
0338: positions.position(lastPositionsOffset);
0339: if (ASSERTS && positionsLength != Long.MAX_VALUE)
0340: positionsLength -= lastPositionsOffset;
0341: if (DEBUG && ASSERTS)
0342: System.err.println(this + ": positions for term "
0343: + term + " start at bit "
0344: + lastPositionsOffset + " ("
0345: + positionsLength + " bits)");
0346: }
0347: positions.readBits(0);
0348: this .term = term;
0349: readFrequency();
0350: }
0351:
0352: public int termNumber() {
0353: return term;
0354: }
0355:
0356: protected IndexIterator advance() throws IOException {
0357: if (term == index.numberOfTerms - 1)
0358: return null;
0359: if (term != -1) {
0360: skipTo(Integer.MAX_VALUE);
0361: // This guarantees we have no garbage before the frequency
0362: nextDocument();
0363: positionsUnread = false;
0364: }
0365: // We must skip the offset into the positions bitstream
0366: final long nextPositionsOffset = ibs.readLongDelta();
0367: positions.skip(nextPositionsOffset - lastPositionsOffset
0368: - positions.readBits());
0369: lastPositionsOffset = nextPositionsOffset;
0370: positions.readBits(0);
0371: term++;
0372: if (ASSERTS)
0373: positionsLength = -1; // Invalidate
0374: readFrequency();
0375: return this ;
0376: }
0377:
0378: private void readFrequency() throws IOException {
0379: // Read the frequency
0380:
0381: switch (index.frequencyCoding) {
0382: case GAMMA:
0383:
0384: frequency = ibs.readGamma() + 1;
0385:
0386: break;
0387: case SHIFTED_GAMMA:
0388:
0389: frequency = ibs.readShiftedGamma() + 1;
0390:
0391: break;
0392: case DELTA:
0393:
0394: frequency = ibs.readDelta() + 1;
0395:
0396: break;
0397: default:
0398: throw new IllegalStateException(
0399: "The required frequency coding ("
0400: + index.frequencyCoding
0401: + ") is not supported.");
0402: }
0403:
0404: if (DEBUG)
0405: System.err.println(this + ": Frequency for term "
0406: + term + " is " + frequency);
0407:
0408: hasPointers = frequency < index.numberOfDocuments;
0409:
0410: // We compute the modulus used for pointer Golomb coding
0411: if (pointerCoding == Coding.GOLOMB) {
0412:
0413: if (hasPointers) {
0414: b = BitStreamIndex.golombModulus(frequency,
0415: index.numberOfDocuments);
0416: log2b = Fast.mostSignificantBit(b);
0417: }
0418:
0419: }
0420:
0421: quantumBitLength = positionsQuantumBitLength = entryBitLength = -1;
0422: lowest = Integer.MAX_VALUE;
0423: if (ASSERTS)
0424: for (int i = height; i > Math
0425: .min(
0426: height,
0427: Fast
0428: .mostSignificantBit(frequency >> quantumDivisionShift)); i--)
0429: towerTopB[i] = towerLowerB[i] = pointerPrediction[i] = -1;
0430: final long pointerQuantumSigma = BitStreamIndex
0431: .quantumSigma(frequency, index.numberOfDocuments,
0432: quantum);
0433: for (int i = Math
0434: .min(
0435: height,
0436: Fast
0437: .mostSignificantBit(frequency >> quantumDivisionShift)); i >= 0; i--) {
0438: towerTopB[i] = BitStreamIndex.gaussianGolombModulus(
0439: pointerQuantumSigma, i + 1);
0440: towerTopLog2B[i] = Fast
0441: .mostSignificantBit(towerTopB[i]);
0442: towerLowerB[i] = BitStreamIndex.gaussianGolombModulus(
0443: pointerQuantumSigma, i);
0444: towerLowerLog2B[i] = Fast
0445: .mostSignificantBit(towerLowerB[i]);
0446: pointerPrediction[i] = (int) ((quantum * (1L << i)
0447: * index.numberOfDocuments + frequency / 2) / frequency);
0448: }
0449: count = -1;
0450: currentDocument = -1;
0451: numberOfDocumentRecord = -1;
0452: positionsBitsOffset = 0;
0453: positionsBitSkip[0] = 0; // To avoid spurious tower updates on the first tower
0454: positionsToReadToReachCurrentPosition = 0;
0455: lastPositionsIncrement = 0;
0456: state = BEFORE_POINTER;
0457: }
0458:
0459: public Index index() {
0460: return keyIndex;
0461: }
0462:
0463: public int frequency() {
0464: return frequency;
0465: }
0466:
0467: private void ensureCurrentDocument() {
0468: if (currentDocument < 0)
0469: throw new IllegalStateException(
0470: "nextDocument() has never been called for (term="
0471: + term + ")");
0472: if (currentDocument == Integer.MAX_VALUE)
0473: throw new IllegalStateException(
0474: "This reader is positioned beyond the end of list of (term="
0475: + term + ")");
0476: }
0477:
0478: /**
0479: * Returns whether there are no more document records in the current inverted list.
0480: *
0481: * <p>This method returns true if the last document pointer of the current inverted list
0482: * has been read. It makes no distinction as to where (inside the last document record) this
0483: * reader is currently positioned. In particular, this method will return true independently
0484: * of whether count and positions have been read or not (we note by passing that this is the
0485: * only sensible behaviour, as you can build indices with or without counts/positions).
0486: *
0487: * <p>This method will return true also when this reader is positioned <em>beyond</em>
0488: * the last document pointer. In this case, {@link #currentDocumentPointer()} will return
0489: * {@link Integer#MAX_VALUE}.
0490: *
0491: * @return true whether there are no more document records in the current inverted list.
0492: */
0493: private boolean endOfList() {
0494: if (ASSERTS)
0495: assert numberOfDocumentRecord <= frequency;
0496: return numberOfDocumentRecord >= frequency - 1;
0497: }
0498:
0499: public int document() {
0500: if (ASSERTS)
0501: ensureCurrentDocument();
0502: return currentDocument;
0503: }
0504:
0505: public Payload payload() throws IOException {
0506: throw new UnsupportedOperationException("This index ("
0507: + index + ") does not contain payloads");
0508: }
0509:
0510: public int count() throws IOException {
0511: if (DEBUG)
0512: System.err.println(this + ".count()");
0513: if (count != -1)
0514: return count;
0515: if (ASSERTS)
0516: ensureCurrentDocument();
0517: if (state == BEFORE_TOWER)
0518: readTower();
0519: if (ASSERTS && state != BEFORE_COUNT)
0520: throw new IllegalStateException();
0521: state = BEFORE_POINTER;
0522:
0523: switch (countCoding) {
0524: case UNARY:
0525:
0526: count = ibs.readUnary() + 1;
0527:
0528: break;
0529: case SHIFTED_GAMMA:
0530:
0531: count = ibs.readShiftedGamma() + 1;
0532:
0533: break;
0534: case GAMMA:
0535:
0536: count = ibs.readGamma() + 1;
0537:
0538: break;
0539: case DELTA:
0540:
0541: count = ibs.readDelta() + 1;
0542:
0543: break;
0544: default:
0545: throw new IllegalStateException(
0546: "The required count coding (" + countCoding
0547: + ") is not supported.");
0548: }
0549:
0550: return count;
0551: }
0552:
0553: protected void updatePositionCache() throws IOException {
0554: if (DEBUG)
0555: System.err.println(this + ".updatePositionCache()");
0556: positionsUnread = false;
0557: count(); // This will force reading the tower and updating positionsBitsOffset, if necessary
0558: if (positionsBitsOffset > positions.readBits()) {
0559: if (DEBUG)
0560: System.err.println(this
0561: + ": positionsBitsOffset="
0562: + positionsBitsOffset
0563: + ", positions.readBits()="
0564: + positions.readBits()
0565: + ", skipping by "
0566: + (positionsBitsOffset - positions
0567: .readBits()));
0568: positions.skip(positionsBitsOffset
0569: - positions.readBits());
0570: }
0571:
0572: if (ASSERTS)
0573: assert positionsToReadToReachCurrentPosition >= 0 : positionsToReadToReachCurrentPosition
0574: + " < 0";
0575:
0576: if (positionsToReadToReachCurrentPosition > 0) {
0577: if (DEBUG)
0578: System.err.println(this + ":Skipping sequentially "
0579: + positionsToReadToReachCurrentPosition
0580: + " positions...");
0581: // We skip, inside the current quantum, the positions we haven't read
0582:
0583: switch (positionCoding) {
0584: case SHIFTED_GAMMA:
0585:
0586: if (COOKIES) {
0587: positionsToReadToReachCurrentPosition--;
0588: if (positions.readShiftedGamma() != Integer.MAX_VALUE)
0589: throw new AssertionError();
0590: }
0591: positions
0592: .skipShiftedGammas(positionsToReadToReachCurrentPosition);
0593:
0594: break;
0595: case GAMMA:
0596:
0597: if (COOKIES) {
0598: positionsToReadToReachCurrentPosition--;
0599: if (positions.readGamma() != Integer.MAX_VALUE)
0600: throw new AssertionError();
0601: }
0602: positions
0603: .skipGammas(positionsToReadToReachCurrentPosition);
0604:
0605: break;
0606: case DELTA:
0607:
0608: if (COOKIES) {
0609: positionsToReadToReachCurrentPosition--;
0610: if (positions.readDelta() != Integer.MAX_VALUE)
0611: throw new AssertionError();
0612: }
0613: positions
0614: .skipDeltas(positionsToReadToReachCurrentPosition);
0615:
0616: break;
0617: default:
0618: throw new IllegalStateException(
0619: "The required position coding ("
0620: + positionCoding
0621: + ") is not supported.");
0622: }
0623:
0624: }
0625:
0626: // We must fix it so that nextDocument() will restore it to 0
0627: positionsToReadToReachCurrentPosition = -count;
0628: if (COOKIES)
0629: positionsToReadToReachCurrentPosition--;
0630: if (count > positionCache.length)
0631: positionCache = new int[Math.max(
0632: positionCache.length * 2, count)];
0633: final int[] occ = positionCache;
0634:
0635: switch (positionCoding) {
0636: case SHIFTED_GAMMA:
0637:
0638: if (COOKIES
0639: && positions.readShiftedGamma() != Integer.MAX_VALUE)
0640: throw new AssertionError();
0641: positions.readShiftedGammas(occ, count);
0642: for (int i = 1; i < count; i++)
0643: occ[i] += occ[i - 1] + 1;
0644:
0645: return;
0646: case GAMMA:
0647:
0648: if (COOKIES
0649: && positions.readGamma() != Integer.MAX_VALUE)
0650: throw new AssertionError();
0651: positions.readGammas(occ, count);
0652: for (int i = 1; i < count; i++)
0653: occ[i] += occ[i - 1] + 1;
0654:
0655: return;
0656: case DELTA:
0657:
0658: if (COOKIES
0659: && positions.readDelta() != Integer.MAX_VALUE)
0660: throw new AssertionError();
0661: positions.readDeltas(occ, count);
0662: for (int i = 1; i < count; i++)
0663: occ[i] += occ[i - 1] + 1;
0664:
0665: return;
0666: default:
0667: throw new IllegalStateException(
0668: "The required position coding ("
0669: + index.positionCoding
0670: + ") is not supported.");
0671: }
0672:
0673: }
0674:
0675: public IntIterator positions() throws IOException {
0676: if (ASSERTS)
0677: ensureCurrentDocument();
0678: if (positionsUnread)
0679: updatePositionCache();
0680: return IntIterators.wrap(positionCache, 0, count);
0681: }
0682:
0683: public int[] positionArray() throws IOException {
0684: if (ASSERTS)
0685: ensureCurrentDocument();
0686: if (positionsUnread)
0687: updatePositionCache();
0688: return positionCache;
0689: }
0690:
0691: // TODO: check who's using this (positionArray() is actually faster now)
0692: public int positions(final int[] position) throws IOException {
0693: if (ASSERTS)
0694: ensureCurrentDocument();
0695: if (positionsUnread)
0696: updatePositionCache(); // And also that positions have
0697: // been read
0698: if (position.length < count)
0699: return -count;
0700: for (int i = count; i-- != 0;)
0701: position[i] = this .positionCache[i];
0702: return count;
0703: }
0704:
0705: public int nextDocument() throws IOException {
0706: if (DEBUG)
0707: System.err.println("{" + this + "} nextDocument()");
0708: if (state != BEFORE_POINTER) {
0709: if (state == BEFORE_TOWER)
0710: readTower();
0711: if (state == BEFORE_COUNT) {
0712:
0713: switch (countCoding) {
0714: case UNARY:
0715:
0716: count = ibs.readUnary() + 1;
0717:
0718: break;
0719: case SHIFTED_GAMMA:
0720:
0721: count = ibs.readShiftedGamma() + 1;
0722:
0723: break;
0724: case GAMMA:
0725:
0726: count = ibs.readGamma() + 1;
0727:
0728: break;
0729: case DELTA:
0730:
0731: count = ibs.readDelta() + 1;
0732:
0733: break;
0734: default:
0735: throw new IllegalStateException(
0736: "The required count coding ("
0737: + countCoding
0738: + ") is not supported.");
0739: }
0740:
0741: state = BEFORE_POINTER;
0742: }
0743: }
0744: if (endOfList())
0745: return -1;
0746: if (hasPointers) {// We do not write pointers for everywhere occurring terms.
0747:
0748: switch (pointerCoding) {
0749: case SHIFTED_GAMMA:
0750:
0751: currentDocument += ibs.readShiftedGamma() + 1;
0752:
0753: break;
0754: case GAMMA:
0755:
0756: currentDocument += ibs.readGamma() + 1;
0757:
0758: break;
0759: case DELTA:
0760:
0761: currentDocument += ibs.readDelta() + 1;
0762:
0763: break;
0764: case GOLOMB:
0765:
0766: currentDocument += ibs.readGolomb(b, log2b) + 1;
0767:
0768: break;
0769: default:
0770: throw new IllegalStateException(
0771: "The required pointer coding ("
0772: + pointerCoding
0773: + ") is not supported.");
0774: }
0775:
0776: } else
0777: currentDocument++;
0778: numberOfDocumentRecord++;
0779:
0780: if (ASSERTS && numberOfDocumentRecord > quantum)
0781: assert positionsBitsOffset > 0;
0782:
0783: if ((numberOfDocumentRecord & quantumModuloMask) == 0) {
0784: state = BEFORE_TOWER;
0785: positionsToReadToReachCurrentPosition = 0;
0786: } else {
0787: state = BEFORE_COUNT;
0788: if (ASSERTS)
0789: assert count > 0 : count + " <= " + 0;
0790: positionsToReadToReachCurrentPosition += count;
0791: if (COOKIES)
0792: positionsToReadToReachCurrentPosition++;
0793: }
0794: count = -1;
0795: positionsUnread = true;
0796: return currentDocument;
0797: }
0798:
0799: /**
0800: * Reads the entire skip tower for the current position. This method assumes that the tower
0801: * has been passed over sequentially, and correpondingly sets {@link #lastPositionsIncrement}
0802: * to the number of positions bits of the last quantum.
0803: */
0804: private void readTower() throws IOException {
0805: lastPositionsIncrement = maxh >= 0 ? positionsBitSkip[0]
0806: : 0;
0807: if (DEBUG)
0808: System.err.println(this
0809: + ": Setting lastPositionsIncrement to "
0810: + lastPositionsIncrement + " in readTower()");
0811: readTower(-1);
0812: if (DEBUG)
0813: System.err.println(this
0814: + ": Incrementing positionsBitsOffset by "
0815: + lastPositionsIncrement + " in readTower()");
0816: // TODO: this should be moved into readTower(int)
0817: positionsBitsOffset += lastPositionsIncrement;
0818: }
0819:
0820: /**
0821: * Reads the skip tower for the current position, possibly skipping part of the tower.
0822: *
0823: * <P>Note that this method will update {@link #state} only if it reads the entire tower,
0824: * otherwise the state remains {@link #BEFORE_TOWER}.
0825: *
0826: * @param pointer the tower will be read up to the first entry smaller than or equal to this
0827: * pointer; use -1 to guarantee that the entire tower will be read.
0828: */
0829: private void readTower(final int pointer) throws IOException {
0830: int i, j, k, cacheOffset, cache, towerLength = 0;
0831: long bitsAtTowerStart = 0;
0832: boolean truncated = false;
0833: if (ASSERTS)
0834: assert numberOfDocumentRecord % quantum == 0;
0835: if (ASSERTS && state != BEFORE_TOWER)
0836: throw new IllegalStateException(
0837: "readTower() called in state " + state);
0838: cacheOffset = (numberOfDocumentRecord & wModuloMask);
0839: k = cacheOffset >> quantumDivisionShift;
0840: if (ASSERTS && k == 0) { // Invalidate current tower data
0841: it.unimi.dsi.fastutil.ints.IntArrays.fill(pointerSkip,
0842: Integer.MAX_VALUE);
0843: it.unimi.dsi.fastutil.longs.LongArrays.fill(bitSkip,
0844: Integer.MAX_VALUE);
0845: it.unimi.dsi.fastutil.longs.LongArrays.fill(
0846: positionsBitSkip, Integer.MAX_VALUE);
0847: }
0848: // Compute the height of the current skip tower.
0849: s = (k == 0) ? height : Fast.leastSignificantBit(k);
0850: cache = frequency - w
0851: * (numberOfDocumentRecord >> wDivisionShift);
0852: if (cache < w) {
0853: maxh = Fast
0854: .mostSignificantBit((cache >> quantumDivisionShift)
0855: - k);
0856: if (maxh < s) {
0857: s = maxh;
0858: truncated = true;
0859: } else
0860: truncated = false;
0861: } else {
0862: cache = w;
0863: maxh = height;
0864: truncated = k == 0;
0865: }
0866: // assert w == cache || k == 0 || lastMaxh == Fast.mostSignificantBit( k ^ (
0867: // cache/quantum ) ) : lastMaxh +","+ (Fast.mostSignificantBit( k ^ ( cache/quantum )
0868: // ));
0869: i = s;
0870: if (s >= 0) {
0871: if (k == 0) {
0872: if (quantumBitLength < 0) {
0873: quantumBitLength = ibs.readDelta();
0874: positionsQuantumBitLength = ibs.readDelta();
0875: entryBitLength = ibs.readDelta();
0876: } else {
0877: quantumBitLength += Fast.nat2int(ibs
0878: .readDelta());
0879: positionsQuantumBitLength += Fast.nat2int(ibs
0880: .readDelta());
0881: entryBitLength += Fast.nat2int(ibs.readDelta());
0882: }
0883: if (DEBUG)
0884: System.err
0885: .println("{"
0886: + this
0887: + "} quantum bit length="
0888: + quantumBitLength
0889: + " positions quantum bit length="
0890: + positionsQuantumBitLength
0891: + " entry bit length="
0892: + entryBitLength);
0893: }
0894: if (DEBUG)
0895: System.err.println("{" + this
0896: + "} Reading tower; pointer=" + pointer
0897: + " maxh=" + maxh + " s=" + s);
0898: if (s > 0) {
0899: towerLength = entryBitLength * (s + 1)
0900: + Fast.nat2int(ibs.readDelta());
0901: if (DEBUG)
0902: System.err.println("{" + this
0903: + "} Tower length=" + towerLength);
0904: }
0905: // We store the number of bits read at the start of the tower (just after the
0906: // length).
0907: bitsAtTowerStart = ibs.readBits();
0908: if (truncated) {
0909: if (DEBUG)
0910: System.err.println("{" + this
0911: + "} Truncated--reading tops");
0912: // We read the tower top.
0913: pointerSkip[s] = Fast.nat2int(ibs.readGolomb(
0914: towerTopB[s], towerTopLog2B[s]))
0915: + pointerPrediction[s];
0916: bitSkip[s] = quantumBitLength * (1 << s)
0917: + entryBitLength * ((1 << s + 1) - s - 2)
0918: + Fast.nat2int(ibs.readLongDelta());
0919: positionsBitSkip[s] = positionsQuantumBitLength
0920: * (1 << s)
0921: + Fast.nat2int(ibs.readLongDelta());
0922: } else {
0923: // We copy the tower top from the lowest inherited entry suitably updated.
0924: pointerSkip[s] = pointerSkip[s + 1]
0925: - (currentDocument - pointerAtLastSkipTower);
0926: bitSkip[s] = bitSkip[s + 1]
0927: - (bitsAtTowerStart - readBitsAtLastSkipTower)
0928: - towerLength;
0929: // TODO: note that we could use the same method for pointers and bit skips!
0930: positionsBitSkip[s] = positionsBitSkip[s + 1]
0931: - positionsBitSkip[s];
0932: if (ASSERTS)
0933: assert positionsBitSkip[s] > 0 : positionsBitSkip[s]
0934: + " <= " + 0;
0935: }
0936: // We read the remaining part of the tower, at least until we point after pointer.
0937: if (currentDocument + pointerSkip[i] > pointer) {
0938: for (i = s - 1; i >= 0; i--) {
0939: pointerSkip[i] = Fast.nat2int(ibs.readGolomb(
0940: towerLowerB[i], towerLowerLog2B[i]))
0941: + pointerSkip[i + 1] / 2;
0942: bitSkip[i] = (bitSkip[i + 1] - entryBitLength
0943: * (i + 1))
0944: / 2 - Fast.nat2int(ibs.readLongDelta());
0945: positionsBitSkip[i] = positionsBitSkip[i + 1]
0946: / 2 - Fast.nat2int(ibs.readLongDelta());
0947: if (DEBUG
0948: && currentDocument + pointerSkip[i] <= pointer)
0949: System.err
0950: .println("{"
0951: + this
0952: + "} stopping reading at i="
0953: + i
0954: + " as currentDocument ("
0955: + currentDocument
0956: + ") plus pointer skip ("
0957: + pointerSkip[i]
0958: + ") is smaller than or equal target ("
0959: + pointer + ")");
0960: if (currentDocument + pointerSkip[i] <= pointer)
0961: break;
0962: }
0963: }
0964: }
0965: /*
0966: * If we did not read the entire tower, we need to fix the skips we read (as they are
0967: * offsets from the *end* of the tower) and moreover we must make unusable the rest of
0968: * the tower (for asserts).
0969: */
0970: if (i > 0) {
0971: final long fix = ibs.readBits() - bitsAtTowerStart;
0972: for (j = s; j >= i; j--)
0973: bitSkip[j] += towerLength - fix;
0974: if (ASSERTS)
0975: for (; j >= 0; j--)
0976: pointerSkip[j] = Integer.MAX_VALUE;
0977: } else
0978: state = BEFORE_COUNT;
0979: // We update the inherited tower.
0980: final long deltaBits = ibs.readBits()
0981: - readBitsAtLastSkipTower;
0982: final int deltaPointers = currentDocument
0983: - pointerAtLastSkipTower;
0984:
0985: if (ASSERTS)
0986: assert lastPositionsIncrement >= 0 : lastPositionsIncrement
0987: + " < " + 0;
0988:
0989: for (j = Fast.mostSignificantBit(k
0990: ^ (cache >> quantumDivisionShift)); j >= s + 1; j--) {
0991: bitSkip[j] -= deltaBits;
0992: positionsBitSkip[j] -= lastPositionsIncrement;
0993: pointerSkip[j] -= deltaPointers;
0994: }
0995:
0996: readBitsAtLastSkipTower = ibs.readBits();
0997: pointerAtLastSkipTower = currentDocument;
0998: lowest = i < 0 ? 0 : i;
0999:
1000: if (DEBUG) {
1001: System.err.println("{" + this + "} "
1002: + "Computed skip tower (lowest: " + lowest
1003: + ") for document record number "
1004: + numberOfDocumentRecord + " (pointer "
1005: + currentDocument + ") from " + Math.max(i, 0)
1006: + ": ");
1007: System.err.print("% ");
1008: for (j = 0; j <= s; j++)
1009: System.err.print(pointerSkip[j] + ":" + bitSkip[j]
1010: + ":" + positionsBitSkip[j] + " ");
1011: System.err.print(" [");
1012: for (; j <= height; j++)
1013: System.err.print(pointerSkip[j] + ":" + bitSkip[j]
1014: + ":" + positionsBitSkip[j] + " ");
1015: System.err.print("]");
1016: System.err.println();
1017: }
1018:
1019: if (ASSERTS) {
1020: for (j = Fast.mostSignificantBit(k
1021: ^ (cache >> quantumDivisionShift)); j >= s + 1; j--) {
1022: assert positionsBitSkip[j] >= 0 : "positionsBitSkip["
1023: + j
1024: + "] = "
1025: + positionsBitSkip[j]
1026: + " < "
1027: + 0;
1028: assert positionsBitSkip[j] >= positionsBitSkip[j - 1] : "positionsBitSkip["
1029: + j
1030: + "] = "
1031: + positionsBitSkip[j]
1032: + " < "
1033: + positionsBitSkip[j - 1]
1034: + " = positionsBitSkip[" + (j - 1) + "]";
1035: }
1036: }
1037:
1038: }
1039:
1040: public int skipTo(final int p) throws IOException {
1041: if (DEBUG)
1042: System.err.println(this + ".skipTo(" + p
1043: + ") [currentDocument=" + currentDocument
1044: + ", numberOfDocumentRecord="
1045: + numberOfDocumentRecord
1046: + ", positionsBitsOffset="
1047: + positionsBitsOffset + "]");
1048: // If we are just at the start of a list, let us read the first pointer.
1049: if (numberOfDocumentRecord == -1)
1050: nextDocument(); // TODO: shouldn't we just read the
1051: // tower?
1052: if (currentDocument >= p) {
1053: if (DEBUG)
1054: System.err.println(this
1055: + ": No skip necessary, returning "
1056: + currentDocument);
1057: return currentDocument;
1058: }
1059: if (state == BEFORE_TOWER) {
1060: lastPositionsIncrement = maxh >= 0 ? positionsBitSkip[0]
1061: : 0;
1062: readTower(p);
1063: if (DEBUG)
1064: System.err.println(this
1065: + ": Incrementing positionsBitsOffset by "
1066: + lastPositionsIncrement
1067: + " in skipTo() initial phase");
1068: positionsBitsOffset += lastPositionsIncrement;
1069: }
1070:
1071: final int[] pointerSkip = this .pointerSkip;
1072: for (;;) {
1073: if (ASSERTS)
1074: assert maxh < 0 || lowest > 0
1075: || pointerSkip[0] != Integer.MAX_VALUE;
1076: // If on a defective quantum (no tower) or p is inside the current quantum (no
1077: // need to scan the tower) we bail out.
1078: if (maxh < 0 || lowest == 0
1079: && pointerAtLastSkipTower + pointerSkip[0] > p)
1080: break;
1081: if (DEBUG)
1082: System.err.println(this
1083: + ": In for loop, currentDocument="
1084: + currentDocument + ", maxh=" + maxh
1085: + ", numberOfDocumentRecord="
1086: + numberOfDocumentRecord + ", p=" + p);
1087: final int cacheOffset = (numberOfDocumentRecord & wModuloMask);
1088: final int k = cacheOffset >> quantumDivisionShift;
1089: final int top = Fast
1090: .mostSignificantBit(k
1091: ^ (Math.min(w, frequency
1092: - numberOfDocumentRecord
1093: + cacheOffset) >> quantumDivisionShift));
1094: int i;
1095: for (i = lowest; i <= top; i++) {
1096: if (ASSERTS && (k & 1 << i) != 0)
1097: assert pointerSkip[i] == pointerSkip[i + 1];
1098: if (ASSERTS)
1099: assert pointerSkip[i] != Integer.MAX_VALUE : "Invalid pointer skip "
1100: + i
1101: + " (lowest="
1102: + lowest
1103: + ", top="
1104: + top + ")";
1105: if (pointerAtLastSkipTower + pointerSkip[i] > p)
1106: break;
1107: }
1108: if (--i < 0)
1109: break;
1110: if (ASSERTS)
1111: assert i >= lowest : i + " < " + lowest;
1112: if (DEBUG)
1113: System.err.println(this
1114: + ": Safely after for with i=" + i
1115: + ", P[i]=" + pointerSkip[i] + ", A[i]="
1116: + bitSkip[i] + ", B[i]="
1117: + positionsBitSkip[i]);
1118: if (DEBUG)
1119: System.err
1120: .println(this
1121: + ": ["
1122: + ibs.readBits()
1123: + "] Skipping "
1124: + (bitSkip[i] - (ibs.readBits() - readBitsAtLastSkipTower))
1125: + " bits ("
1126: + (((k & -(1 << i)) + (1 << i))
1127: * quantum - cacheOffset)
1128: + " records) to get to document pointer "
1129: + (currentDocument + pointerSkip[i]));
1130: ibs.skip(bitSkip[i]
1131: - (ibs.readBits() - readBitsAtLastSkipTower));
1132: /* We accumulate the number of bits that must be skipped in the positions stream to reach the position
1133: * corresponding to the current tower. */
1134: if (DEBUG)
1135: System.err.println(this
1136: + ": Incrementing positionsBitsOffset by "
1137: + positionsBitSkip[i] + " (height " + i
1138: + ")");
1139: lastPositionsIncrement = positionsBitSkip[i];
1140: positionsBitsOffset += lastPositionsIncrement;
1141: if (ASSERTS && positionsLength != -1)
1142: assert positionsBitsOffset <= positionsLength : positionsBitsOffset
1143: + " > " + positionsLength;
1144: state = BEFORE_TOWER;
1145: currentDocument = pointerSkip[i]
1146: + pointerAtLastSkipTower;
1147: numberOfDocumentRecord += ((k & -(1 << i)) + (1 << i))
1148: * quantum - cacheOffset;
1149: // If we skipped beyond the end of the list, we invalidate the current document.
1150: if (numberOfDocumentRecord == frequency) {
1151: currentDocument = Integer.MAX_VALUE;
1152: positionsUnread = false;
1153: state = BEFORE_POINTER; // We are actually before a frequency, but we must
1154: // avoid that calls to nextDocument() read anything
1155: } else {
1156: positionsUnread = true;
1157: readTower(p); // Note that if we are exactly on the destination pointer, we will read the entire tower.
1158: }
1159:
1160: count = -1; // We must invalidate count as readDocumentPointer() would do.
1161: positionsToReadToReachCurrentPosition = 0;
1162: if (endOfList()) {
1163: if (DEBUG)
1164: System.err.println(this
1165: + ".toSkip(): end-of-list, returning "
1166: + currentDocument + ", state=" + state);
1167: // Note that if p == Integer.MAX_VALUE, we are certainly beyond end-of-list
1168: return p == Integer.MAX_VALUE ? Integer.MAX_VALUE
1169: : currentDocument;
1170: }
1171: }
1172: if (DEBUG)
1173: System.err.println(this
1174: + ": Completing sequentially, currentDocument="
1175: + currentDocument + ", numberOfDocumentRecord="
1176: + numberOfDocumentRecord + ", p=" + p);
1177: while (currentDocument < p) {
1178: if (DEBUG)
1179: System.err
1180: .println(this
1181: + ": Skipping sequentially (second), currentDocument="
1182: + currentDocument
1183: + ", numberOfDocumentRecord="
1184: + numberOfDocumentRecord + ", p="
1185: + p);
1186: if (nextDocument() == -1) {
1187: if (DEBUG)
1188: System.err.println(this
1189: + ": end-of-list, returning MAX_VALUE");
1190: return Integer.MAX_VALUE;
1191: }
1192: }
1193: if (DEBUG)
1194: System.err.println(this + ".toSkip(): Returning "
1195: + currentDocument);
1196: return currentDocument;
1197: }
1198:
1199: public void dispose() throws IOException {
1200: parent.close();
1201: }
1202:
1203: public boolean hasNext() {
1204: return !endOfList();
1205: }
1206:
1207: public int nextInt() {
1208: if (!hasNext())
1209: throw new NoSuchElementException();
1210: try {
1211: return nextDocument();
1212: } catch (IOException e) {
1213: throw new RuntimeException(e);
1214: }
1215: }
1216:
1217: public String toString() {
1218: return index + " [" + term + "]";
1219: }
1220:
1221: /**
1222: * An interval iterator returning the positions of the current document as singleton
1223: * intervals.
1224: */
1225: private final class IndexIntervalIterator extends
1226: AbstractObjectIterator<Interval> implements
1227: IntervalIterator {
1228: int pos = -1;
1229:
1230: public void reset() throws IOException {
1231: pos = -1;
1232: if (positionsUnread)
1233: updatePositionCache(); // This guarantees the position cache is ok
1234: }
1235:
1236: public void intervalTerms(final IntSet terms) {
1237: terms
1238: .add(BitStreamHPIndexReaderIndexIterator.this .term);
1239: }
1240:
1241: public boolean hasNext() {
1242: return pos < count - 1;
1243: }
1244:
1245: public Interval next() {
1246: if (!hasNext())
1247: throw new NoSuchElementException();
1248: return Interval.valueOf(positionCache[++pos]);
1249: }
1250:
1251: public Interval nextInterval() {
1252: return pos < count - 1 ? Interval
1253: .valueOf(positionCache[++pos]) : null;
1254: }
1255:
1256: public int extent() {
1257: return 1;
1258: }
1259:
1260: public String toString() {
1261: return index + ": " + term + "[doc=" + currentDocument
1262: + ", count=" + count + ", pos=" + pos + "]";
1263: }
1264: };
1265:
1266: public Reference2ReferenceMap<Index, IntervalIterator> intervalIterators()
1267: throws IOException {
1268: intervalIterator();
1269: return singletonIntervalIterator;
1270: }
1271:
1272: public IntervalIterator intervalIterator() throws IOException {
1273: return intervalIterator(keyIndex);
1274: }
1275:
1276: public IntervalIterator intervalIterator(final Index index)
1277: throws IOException {
1278: if (ASSERTS)
1279: ensureCurrentDocument();
1280: // TODO: this was if ( index != keyIndex || hasPayloads )
1281: if (index != keyIndex)
1282: return IntervalIterators.TRUE;
1283: if (ASSERTS)
1284: assert intervalIterator != null;
1285: intervalIterator.reset();
1286: return intervalIterator;
1287: }
1288:
1289: public ReferenceSet<Index> indices() {
1290: return index.singletonSet;
1291: }
1292: }
1293:
1294: private IndexIterator documents(final CharSequence term,
1295: final int termNumber) throws IOException {
1296: indexIterator.term(term);
1297: indexIterator.position(termNumber);
1298: return indexIterator;
1299: }
1300:
1301: public IndexIterator documents(final int term) throws IOException {
1302: return documents(null, term);
1303: }
1304:
1305: public IndexIterator documents(final CharSequence term)
1306: throws IOException {
1307: if (closed)
1308: throw new IllegalStateException("This "
1309: + getClass().getSimpleName() + " has been closed");
1310: if (index.termMap != null) {
1311: final int termIndex = (int) index.termMap.getLong(term);
1312: if (termIndex == -1)
1313: return index.emptyIndexIterator;
1314: return documents(term, termIndex);
1315: }
1316: throw new UnsupportedOperationException("Index " + index
1317: + " has no term map");
1318: }
1319:
1320: @Override
1321: public IndexIterator nextIterator() throws IOException {
1322: return indexIterator.advance();
1323: }
1324:
1325: public String toString() {
1326: return getClass().getSimpleName() + "[" + index + "]";
1327: }
1328:
1329: public void close() throws IOException {
1330: super.close();
1331: indexIterator.ibs.close();
1332: indexIterator.positions.close();
1333: }
1334: }
|