0001: package it.unimi.dsi.mg4j.index;
0002:
0003: /*
0004: * MG4J: Managing Gigabytes for Java
0005: *
0006: * Copyright (C) 2003-2006 Paolo Boldi and Sebastiano Vigna
0007: *
0008: * This library is free software; you can redistribute it and/or modify it
0009: * under the terms of the GNU Lesser General Public License as published by the Free
0010: * Software Foundation; either version 2.1 of the License, or (at your option)
0011: * any later version.
0012: *
0013: * This library is distributed in the hope that it will be useful, but
0014: * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
0015: * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
0016: * for more details.
0017: *
0018: * You should have received a copy of the GNU Lesser General Public License
0019: * along with this program; if not, write to the Free Software
0020: * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
0021: *
0022: */
0023:
0024: import it.unimi.dsi.fastutil.ints.IntIterator;
0025: import it.unimi.dsi.fastutil.ints.IntIterators;
0026: import it.unimi.dsi.fastutil.ints.IntSet;
0027: import it.unimi.dsi.fastutil.objects.AbstractObjectIterator;
0028: import it.unimi.dsi.fastutil.objects.Reference2ReferenceMap;
0029: import it.unimi.dsi.fastutil.objects.Reference2ReferenceMaps;
0030: import it.unimi.dsi.fastutil.objects.ReferenceSet;
0031: import it.unimi.dsi.mg4j.index.AbstractIndexIterator;
0032: import it.unimi.dsi.mg4j.index.AbstractIndexReader;
0033: import it.unimi.dsi.mg4j.index.BitStreamIndex;
0034: import it.unimi.dsi.mg4j.index.Index;
0035: import it.unimi.dsi.mg4j.index.IndexIterator;
0036: import it.unimi.dsi.mg4j.index.CompressionFlags.Coding;
0037: import it.unimi.dsi.mg4j.index.payload.Payload;
0038: import it.unimi.dsi.io.InputBitStream;
0039: import it.unimi.dsi.util.Interval;
0040: import it.unimi.dsi.mg4j.search.IntervalIterator;
0041: import it.unimi.dsi.mg4j.search.IntervalIterators;
0042: import it.unimi.dsi.bits.Fast;
0043: import it.unimi.dsi.Util;
0044:
0045: import java.io.IOException;
0046: import java.util.NoSuchElementException;
0047:
0048: import org.apache.log4j.Logger;
0049:
0050: /** A bitstream-based {@linkplain IndexReader index reader}. */
0051:
0052: public class BitStreamIndexReader extends AbstractIndexReader {
0053: @SuppressWarnings("unused")
0054: private static final Logger LOGGER = Util
0055: .getLogger(BitStreamIndexReader.class);
0056:
0057: /** The reference index. */
0058: protected final BitStreamIndex index;
0059:
0060: private final static boolean ASSERTS = false;
0061: private final static boolean DEBUG = false;
0062:
0063: /** The {@link IndexIterator} view of this reader (returned by {@link #documents(CharSequence)}). */
0064: protected final BitStreamIndexReaderIndexIterator indexIterator;
0065:
0066: /** Creates a new skip index reader, with the specified underlying {@link Index} and input bit stream.
0067: *
0068: * @param index the index.
0069: * @param ibs the underlying bit stream.
0070: */
0071: public BitStreamIndexReader(final BitStreamIndex index,
0072: final InputBitStream ibs) {
0073: this .index = index;
0074: this .indexIterator = new BitStreamIndexReaderIndexIterator(
0075: this , ibs);
0076: }
0077:
0078: protected static final class BitStreamIndexReaderIndexIterator
0079: extends AbstractIndexIterator implements IndexIterator {
0080: /** The enclosing instance. */
0081: private final BitStreamIndexReader parent;
0082: /** The reference index. */
0083: protected final BitStreamIndex index;
0084: /** The underlying input bit stream. */
0085: protected final InputBitStream ibs;
0086: /** The enclosed interval iterator. */
0087: private final IndexIntervalIterator intervalIterator;
0088: /** A singleton set containing the enclosed interval iterator. */
0089: private final Reference2ReferenceMap<Index, IntervalIterator> singletonIntervalIterator;
0090: /** The key index. */
0091: private final Index keyIndex;
0092:
0093: /** The cached copy of {@link #index index.hasPositions}. */
0094: protected final boolean hasPositions;
0095: /** The cached copy of {@link #index index.hasCounts}. */
0096: protected final boolean hasCounts;
0097: /** The cached copy of {@link #index index.hasPayloads}. */
0098: protected final boolean hasPayloads;
0099: /** Whether the underlying index has skips. */
0100: protected final boolean hasSkips;
0101:
0102: /** The cached copy of {@link #index index.pointerCoding}. */
0103: protected final Coding pointerCoding;
0104:
0105: /** The cached copy of {@link #index index.countCoding}. */
0106: protected final Coding countCoding;
0107:
0108: /** The cached copy of {@link #index index.positionCoding}. */
0109: protected final Coding positionCoding;
0110:
0111: /** The payload, in case the index of this reader has payloads, or <code>null</code>. */
0112: protected final Payload payload;
0113:
0114: /** The parameter <code>b</code> for Golomb coding of pointers. */
0115: protected int b;
0116: /** The parameter <code>log2b</code> for Golomb coding of pointers; it is the most significant bit of {@link #b}. */
0117: protected int log2b;
0118:
0119: /** The current term. */
0120: protected int currentTerm = -1;
0121: /** The current frequency. */
0122: protected int frequency;
0123: /** Whether the current terms has pointers at all (this happens when the {@link #frequency} is smaller than the number of documents). */
0124: protected boolean hasPointers;
0125: /** The current count (if this index contains counts). */
0126: protected int count;
0127: /** The last document pointer we read from current list, -1 if we just read the frequency,
0128: * {@link Integer#MAX_VALUE} if we are beyond the end of list. */
0129: protected int currentDocument;
0130: /** The number of the document record we are going to read inside the current inverted list. */
0131: protected int numberOfDocumentRecord;
0132: /** This variable tracks the current state of the reader. */
0133: protected int state;
0134:
0135: /** The parameter <code>h</code> (the maximum height of a skip tower). */
0136: public final int height;
0137: /** The quantum. */
0138: public final int quantum;
0139: /** The bit mask giving the remainder of the division by {@link #quantum}. */
0140: public final int quantumModuloMask;
0141: /** The shift giving result of the division by {@link #quantum}. */
0142: public final int quantumDivisionShift;
0143: /** The maximum height of a skip tower in the current block. May be less than {@link #height} if the block is defective,
0144: * and will be -1 on defective quanta (no tower at all). */
0145: private int maxh;
0146: /** The maximum valid index of the current skip tower, if any. */
0147: private int s;
0148: /** The minimum valid index of the current skip tower, or {@link Integer#MAX_VALUE}. If {@link #maxh} is negative, the value is undefined. */
0149: private int lowest;
0150: /** We have <var>w</var> = <var>Hq</var>. */
0151: private final int w;
0152: /** The bit mask giving the remainder of the division by {@link #w}. */
0153: private final int wModuloMask;
0154: /** The shift giving result of the division by {@link #w}. */
0155: private final int wDivisionShift;
0156: /** The Golomb modulus for a top pointer skip, for each level. */
0157: private int[] towerTopB;
0158: /** The most significant bit of the Golomb modulus for a top point[]er skip, for each level. */
0159: private int[] towerTopLog2B;
0160: /** The Golomb modulus for a lower pointer skip, for each level. */
0161: private int[] towerLowerB;
0162: /** The most significant bit of the Golomb modulus for a lower pointer skip, for each level. */
0163: private int[] towerLowerLog2B;
0164: /** The prediction for a pointer skip, for each level. */
0165: private int[] pointerPrediction;
0166: /** An array to decode bit skips. */
0167: private long[] bitSkip;
0168: /** An array to decode the pointer skips. */
0169: private int[] pointerSkip;
0170: /** The number of bits read just after reading the last skip tower. */
0171: private long readBitsAtLastSkipTower;
0172: /** The document pointer corresponding to the last skip tower. */
0173: private int pointerAtLastSkipTower;
0174: /** The current quantum bit length, as provided by the index. */
0175: private int quantumBitLength;
0176: /** The current entry bit length, as provided by the index. */
0177: private int entryBitLength;
0178: /** This value of {@link #state} means that we are positioned just before a tower. */
0179: private static final int BEFORE_TOWER = 0;
0180:
0181: /** This value of {@link #state} can be assumed only in indices that contain a payload; it
0182: * means that we are positioned just before the payload for the current document record. */
0183: private static final int BEFORE_PAYLOAD = 1;
0184:
0185: /** This value of {@link #state} can be assumed only in indices that contain counts; it
0186: * means that we are positioned just before the count for the current document record. */
0187: private static final int BEFORE_COUNT = 2;
0188:
0189: /** This value of {@link #state} can be assumed only in indices that contain document positions;
0190: * it means that we are positioned just before the position list of the current document record. */
0191: private static final int BEFORE_POSITIONS = 3;
0192:
0193: /** This value of {@link #state} means that we are at the start of a new document record,
0194: * unless we already read all documents (i.e., {@link #numberOfDocumentRecord} == {@link #frequency}),
0195: * in which case we are at the end of the inverted list, and {@link #endOfList()} is true. */
0196: private static final int BEFORE_POINTER = 4;
0197:
0198: /** The cached position array. */
0199: protected int[] positionCache = new int[16];
0200:
0201: public BitStreamIndexReaderIndexIterator(
0202: final BitStreamIndexReader parent,
0203: final InputBitStream ibs) {
0204: this .parent = parent;
0205: this .ibs = ibs;
0206: index = parent.index;
0207: keyIndex = index.keyIndex;
0208: pointerCoding = index.pointerCoding;
0209:
0210: hasPayloads = index.hasPayloads;
0211: payload = hasPayloads ? index.payload.copy() : null;
0212: hasCounts = index.hasCounts;
0213: countCoding = index.countCoding;
0214: hasPositions = index.hasPositions;
0215: positionCoding = index.positionCoding;
0216: intervalIterator = index.hasPositions ? new IndexIntervalIterator()
0217: : null;
0218: singletonIntervalIterator = index.hasPositions ? Reference2ReferenceMaps
0219: .singleton(keyIndex,
0220: (IntervalIterator) intervalIterator)
0221: : null;
0222: quantum = index.quantum;
0223: height = index.height;
0224: if ((quantum == -1) != (height == -1))
0225: throw new IllegalArgumentException();
0226: hasSkips = quantum != -1 && height != -1;
0227: if (hasSkips) {
0228: w = (1 << height) * quantum;
0229: quantumModuloMask = quantum - 1;
0230: wModuloMask = w - 1;
0231: quantumDivisionShift = Fast.mostSignificantBit(quantum);
0232: wDivisionShift = Fast.mostSignificantBit(w);
0233: bitSkip = new long[height + 1];
0234: pointerSkip = new int[height + 1];
0235: towerTopB = new int[height + 1];
0236: towerTopLog2B = new int[height + 1];
0237: towerLowerB = new int[height + 1];
0238: towerLowerLog2B = new int[height + 1];
0239: pointerPrediction = new int[height + 1];
0240: } else {
0241: w = quantumModuloMask = wModuloMask = quantumDivisionShift = wDivisionShift = 0;
0242: bitSkip = null;
0243: pointerSkip = towerTopB = towerTopLog2B = towerLowerB = towerLowerLog2B = pointerPrediction = null;
0244: }
0245: }
0246:
0247: /** Positions the index on the inverted list of a given term.
0248: *
0249: * <p>This method can be called at any time. Note that it is <em>always</em> possible
0250: * to call this method with argument 0, even if offsets have not been loaded.
0251: *
0252: * @param term a term.
0253: */
0254: protected void position(final int term) throws IOException {
0255: if (term == 0) {
0256: ibs.position(0);
0257: ibs.readBits(0);
0258: } else {
0259: if (index.offsets == null)
0260: throw new IllegalStateException(
0261: "You cannot position an index without offsets");
0262: final long offset = index.offsets.getLong(term);
0263: ibs.position(offset);
0264: // TODO: Can't we set this to 0?
0265: ibs.readBits(offset);
0266: }
0267: currentTerm = term;
0268: readFrequency();
0269: }
0270:
0271: public int termNumber() {
0272: return currentTerm;
0273: }
0274:
0275: protected IndexIterator advance() throws IOException {
0276: if (currentTerm == index.numberOfTerms - 1)
0277: return null;
0278: if (currentTerm != -1) {
0279: skipTo(Integer.MAX_VALUE);
0280: nextDocument(); // This guarantees we have no garbage before the frequency
0281: }
0282: currentTerm++;
0283: readFrequency();
0284: return this ;
0285: }
0286:
0287: private void readFrequency() throws IOException {
0288: // Read the frequency
0289: switch (index.frequencyCoding) {
0290: case GAMMA:
0291: frequency = ibs.readGamma() + 1;
0292: break;
0293: case SHIFTED_GAMMA:
0294: frequency = ibs.readShiftedGamma() + 1;
0295: break;
0296: case DELTA:
0297: frequency = ibs.readDelta() + 1;
0298: break;
0299: default:
0300: throw new IllegalStateException(
0301: "The required frequency coding ("
0302: + index.frequencyCoding
0303: + ") is not supported.");
0304: }
0305: hasPointers = frequency < index.numberOfDocuments;
0306: // We compute the modulus used for pointer Golomb coding
0307: if (pointerCoding == Coding.GOLOMB) {
0308: if (hasPointers) {
0309: b = BitStreamIndex.golombModulus(frequency,
0310: index.numberOfDocuments);
0311: log2b = Fast.mostSignificantBit(b);
0312: }
0313: }
0314: if (hasSkips) {
0315: quantumBitLength = entryBitLength = -1;
0316: lowest = Integer.MAX_VALUE;
0317: if (ASSERTS)
0318: for (int i = height; i > Math
0319: .min(
0320: height,
0321: Fast
0322: .mostSignificantBit(frequency >> quantumDivisionShift)); i--)
0323: towerTopB[i] = towerLowerB[i] = pointerPrediction[i] = -1;
0324: final long pointerQuantumSigma = BitStreamIndex
0325: .quantumSigma(frequency,
0326: index.numberOfDocuments, quantum);
0327: for (int i = Math
0328: .min(
0329: height,
0330: Fast
0331: .mostSignificantBit(frequency >> quantumDivisionShift)); i >= 0; i--) {
0332: towerTopB[i] = BitStreamIndex
0333: .gaussianGolombModulus(pointerQuantumSigma,
0334: i + 1);
0335: towerTopLog2B[i] = Fast
0336: .mostSignificantBit(towerTopB[i]);
0337: towerLowerB[i] = BitStreamIndex
0338: .gaussianGolombModulus(pointerQuantumSigma,
0339: i);
0340: towerLowerLog2B[i] = Fast
0341: .mostSignificantBit(towerLowerB[i]);
0342: pointerPrediction[i] = (int) ((quantum * (1L << i)
0343: * index.numberOfDocuments + frequency / 2) / frequency);
0344: }
0345: }
0346: count = -1;
0347: currentDocument = -1;
0348: numberOfDocumentRecord = -1;
0349: state = BEFORE_POINTER;
0350: }
0351:
0352: public Index index() {
0353: return keyIndex;
0354: }
0355:
0356: public int frequency() {
0357: return frequency;
0358: }
0359:
0360: private void ensureCurrentDocument() {
0361: if (currentDocument < 0)
0362: throw new IllegalStateException(
0363: "nextDocument() has never been called for (term="
0364: + currentTerm + ")");
0365: if (currentDocument == Integer.MAX_VALUE)
0366: throw new IllegalStateException(
0367: "This reader is positioned beyond the end of list of (term="
0368: + currentTerm + ")");
0369: }
0370:
0371: /** Returns whether there are no more document records in the current inverted list.
0372: *
0373: * <p>This method returns true if the last document pointer of the current inverted
0374: * list has been read. It makes no distinction as to where (inside the last document
0375: * record) this reader is currently positioned. In particular, this method will
0376: * return true independently of whether count and positions have been read or not (we
0377: * note by passing that this is the only sensible behaviour, as you can build indices
0378: * with or without counts/positions).
0379: *
0380: * <p>This method will return true also when this reader is positioned <em>beyond</em>
0381: * the last document pointer. In this case, {@link #currentDocumentPointer()} will
0382: * return {@link Integer#MAX_VALUE}.
0383: *
0384: * @return true whether there are no more document records in the current inverted list.
0385: */
0386: private boolean endOfList() {
0387: if (ASSERTS)
0388: assert numberOfDocumentRecord <= frequency;
0389: return numberOfDocumentRecord >= frequency - 1;
0390: }
0391:
0392: public int document() {
0393: if (ASSERTS)
0394: ensureCurrentDocument();
0395: return currentDocument;
0396: }
0397:
0398: public Payload payload() throws IOException {
0399: if (DEBUG)
0400: System.err.println(this + ".payload()");
0401: if (ASSERTS)
0402: ensureCurrentDocument();
0403: if (!hasPayloads)
0404: throw new UnsupportedOperationException("This index ("
0405: + index + ") does not contain payloads");
0406: if (state <= BEFORE_PAYLOAD) {
0407: if (state == BEFORE_TOWER)
0408: readTower();
0409: payload.read(ibs);
0410: state = hasCounts ? BEFORE_COUNT : BEFORE_POINTER;
0411: }
0412: return payload;
0413: }
0414:
0415: public int count() throws IOException {
0416: if (DEBUG)
0417: System.err.println(this + ".count()");
0418: if (count != -1)
0419: return count;
0420: if (ASSERTS)
0421: ensureCurrentDocument();
0422: if (!hasCounts)
0423: throw new UnsupportedOperationException("This index ("
0424: + index + ") does not contain counts");
0425: if (state == BEFORE_TOWER)
0426: readTower();
0427: if (state == BEFORE_PAYLOAD)
0428: payload.read(ibs);
0429: {
0430: if (ASSERTS && state != BEFORE_COUNT)
0431: throw new IllegalStateException();
0432: state = hasPositions ? BEFORE_POSITIONS
0433: : BEFORE_POINTER;
0434: switch (countCoding) {
0435: case UNARY:
0436: count = ibs.readUnary() + 1;
0437: break;
0438: case SHIFTED_GAMMA:
0439: count = ibs.readShiftedGamma() + 1;
0440: break;
0441: case GAMMA:
0442: count = ibs.readGamma() + 1;
0443: break;
0444: case DELTA:
0445: count = ibs.readDelta() + 1;
0446: break;
0447: default:
0448: throw new IllegalStateException(
0449: "The required count coding (" + countCoding
0450: + ") is not supported.");
0451: }
0452: }
0453: return count;
0454: }
0455:
0456: /** We read positions, assuming state <= BEFORE_POSITIONS */
0457: @SuppressWarnings("unused")
0458: protected void updatePositionCache() throws IOException {
0459: if (ASSERTS)
0460: assert state <= BEFORE_POSITIONS;
0461: if (!hasPositions)
0462: throw new UnsupportedOperationException("This index("
0463: + index + ") does not contain positions");
0464: if (state < BEFORE_POSITIONS) {
0465: if (state == BEFORE_TOWER)
0466: readTower();
0467: if (state == BEFORE_PAYLOAD)
0468: payload.read(ibs);
0469: if (state == BEFORE_COUNT) {
0470: if (ASSERTS && state != BEFORE_COUNT)
0471: throw new IllegalStateException();
0472: switch (countCoding) {
0473: case UNARY:
0474: count = ibs.readUnary() + 1;
0475: break;
0476: case SHIFTED_GAMMA:
0477: count = ibs.readShiftedGamma() + 1;
0478: break;
0479: case GAMMA:
0480: count = ibs.readGamma() + 1;
0481: break;
0482: case DELTA:
0483: count = ibs.readDelta() + 1;
0484: break;
0485: default:
0486: throw new IllegalStateException(
0487: "The required count coding ("
0488: + countCoding
0489: + ") is not supported.");
0490: }
0491: }
0492: }
0493: if (count > positionCache.length)
0494: positionCache = new int[Math.max(
0495: positionCache.length * 2, count)];
0496: final int[] occ = positionCache;
0497: state = BEFORE_POINTER;
0498: switch (positionCoding) {
0499: case SHIFTED_GAMMA:
0500: ibs.readShiftedGammas(occ, count);
0501: for (int i = 1; i < count; i++)
0502: occ[i] += occ[i - 1] + 1;
0503: return;
0504: case GAMMA:
0505: ibs.readGammas(occ, count);
0506: for (int i = 1; i < count; i++)
0507: occ[i] += occ[i - 1] + 1;
0508: return;
0509: case DELTA:
0510: ibs.readDeltas(occ, count);
0511: for (int i = 1; i < count; i++)
0512: occ[i] += occ[i - 1] + 1;
0513: return;
0514: case GOLOMB:
0515: if (ASSERTS)
0516: assert index.sizes != null;
0517: int docSize = index.sizes.getInt(currentDocument);
0518: if (count < 3)
0519: for (int i = 0; i < count; i++)
0520: occ[i] = ibs.readMinimalBinary(docSize);
0521: else {
0522: final int bb = BitStreamIndex.golombModulus(count,
0523: docSize);
0524: int prev = -1;
0525: if (bb != 0) {
0526: final int log2bb = Fast.mostSignificantBit(bb);
0527: for (int i = 0; i < count; i++)
0528: occ[i] = prev = ibs.readGolomb(bb, log2bb)
0529: + prev + 1;
0530: } else
0531: for (int i = 0; i < count; i++)
0532: occ[i] = i;
0533: }
0534: return;
0535: case SKEWED_GOLOMB:
0536: if (ASSERTS)
0537: assert index.sizes != null;
0538: int docSize2 = index.sizes.getInt(currentDocument);
0539: if (count < 3)
0540: for (int i = 0; i < count; i++)
0541: occ[i] = ibs.readMinimalBinary(docSize2);
0542: else {
0543: final int sb = ibs.readMinimalBinary(docSize2) + 1;
0544: int prev2 = -1;
0545: for (int i = 0; i < count; i++)
0546: occ[i] = prev2 = ibs.readSkewedGolomb(sb)
0547: + prev2 + 1;
0548: }
0549: return;
0550: case INTERPOLATIVE:
0551: it.unimi.dsi.mg4j.io.InterpolativeCoding.read(ibs, occ,
0552: 0, count, 0, index.sizes
0553: .getInt(currentDocument) - 1);
0554: return;
0555: default:
0556: throw new IllegalStateException(
0557: "The required position coding ("
0558: + index.positionCoding
0559: + ") is not supported.");
0560: }
0561: }
0562:
0563: public IntIterator positions() throws IOException {
0564: if (ASSERTS)
0565: ensureCurrentDocument();
0566: if (state <= BEFORE_POSITIONS)
0567: updatePositionCache();
0568: return IntIterators.wrap(positionCache, 0, count);
0569: }
0570:
0571: public int[] positionArray() throws IOException {
0572: if (ASSERTS)
0573: ensureCurrentDocument();
0574: if (state <= BEFORE_POSITIONS)
0575: updatePositionCache();
0576: return positionCache;
0577: }
0578:
0579: // TODO: check who's using this (positionArray() is actually faster now)
0580: public int positions(final int[] position) throws IOException {
0581: if (ASSERTS)
0582: ensureCurrentDocument();
0583: if (state <= BEFORE_POSITIONS)
0584: updatePositionCache(); // And also that positions have been read
0585: if (position.length < count)
0586: return -count;
0587: for (int i = count; i-- != 0;)
0588: position[i] = this .positionCache[i];
0589: return count;
0590: }
0591:
0592: public int nextDocument() throws IOException {
0593: if (DEBUG)
0594: System.err.println("{" + this + "} nextDocument()");
0595: if (state != BEFORE_POINTER) {
0596: if (state == BEFORE_TOWER)
0597: readTower();
0598: if (state == BEFORE_PAYLOAD)
0599: payload.read(ibs);
0600: if (state == BEFORE_COUNT) {
0601: if (ASSERTS && state != BEFORE_COUNT)
0602: throw new IllegalStateException();
0603: state = hasPositions ? BEFORE_POSITIONS
0604: : BEFORE_POINTER;
0605: switch (countCoding) {
0606: case UNARY:
0607: count = ibs.readUnary() + 1;
0608: break;
0609: case SHIFTED_GAMMA:
0610: count = ibs.readShiftedGamma() + 1;
0611: break;
0612: case GAMMA:
0613: count = ibs.readGamma() + 1;
0614: break;
0615: case DELTA:
0616: count = ibs.readDelta() + 1;
0617: break;
0618: default:
0619: throw new IllegalStateException(
0620: "The required count coding ("
0621: + countCoding
0622: + ") is not supported.");
0623: }
0624: }
0625: if (state == BEFORE_POSITIONS) {
0626: // Here we just skip; note that the state change is necessary if endOfList() is true
0627: state = BEFORE_POINTER;
0628: switch (positionCoding) {
0629: case SHIFTED_GAMMA:
0630: ibs.skipShiftedGammas(count);
0631: break;
0632: case GAMMA:
0633: ibs.skipGammas(count);
0634: break;
0635: case DELTA:
0636: ibs.skipDeltas(count);
0637: break;
0638: case GOLOMB:
0639: if (ASSERTS)
0640: assert index.sizes != null;
0641: int docSize = index.sizes
0642: .getInt(currentDocument);
0643: if (count < 3)
0644: for (int i = 0; i < count; i++)
0645: ibs.readMinimalBinary(docSize);
0646: else {
0647: final int bb = BitStreamIndex
0648: .golombModulus(count, docSize);
0649: if (bb != 0) {
0650: final int log2bb = Fast
0651: .mostSignificantBit(bb);
0652: for (int i = 0; i < count; i++)
0653: ibs.readGolomb(bb, log2bb);
0654: }
0655: }
0656: break;
0657: case SKEWED_GOLOMB:
0658: if (ASSERTS)
0659: assert index.sizes != null;
0660: docSize = index.sizes.getInt(currentDocument);
0661: if (count < 3)
0662: for (int i = 0; i < count; i++)
0663: ibs.readMinimalBinary(docSize);
0664: else {
0665: final int sb = ibs
0666: .readMinimalBinary(docSize) + 1;
0667: for (int i = 0; i < count; i++)
0668: ibs.readSkewedGolomb(sb);
0669: }
0670: break;
0671: case INTERPOLATIVE:
0672: it.unimi.dsi.mg4j.io.InterpolativeCoding.read(
0673: ibs, null, 0, count, 0, index.sizes
0674: .getInt(currentDocument) - 1);
0675: break;
0676: default:
0677: throw new IllegalStateException(
0678: "The required position coding ("
0679: + positionCoding
0680: + ") is not supported.");
0681: }
0682: }
0683: }
0684: if (endOfList())
0685: return -1;
0686: if (hasPointers) {// We do not write pointers for everywhere occurring terms.
0687: switch (pointerCoding) {
0688: case SHIFTED_GAMMA:
0689: currentDocument += ibs.readShiftedGamma() + 1;
0690: break;
0691: case GAMMA:
0692: currentDocument += ibs.readGamma() + 1;
0693: break;
0694: case DELTA:
0695: currentDocument += ibs.readDelta() + 1;
0696: break;
0697: case GOLOMB:
0698: currentDocument += ibs.readGolomb(b, log2b) + 1;
0699: break;
0700: default:
0701: throw new IllegalStateException(
0702: "The required pointer coding ("
0703: + pointerCoding
0704: + ") is not supported.");
0705: }
0706: } else
0707: currentDocument++;
0708: numberOfDocumentRecord++;
0709: if (hasPayloads)
0710: state = BEFORE_PAYLOAD;
0711: else if (hasCounts)
0712: state = BEFORE_COUNT;
0713: count = -1;
0714: if (hasSkips
0715: && (numberOfDocumentRecord & quantumModuloMask) == 0)
0716: state = BEFORE_TOWER;
0717: return currentDocument;
0718: }
0719:
0720: /** Reads the entire skip tower for the current position.
0721: */
0722: private void readTower() throws IOException {
0723: readTower(-1);
0724: }
0725:
0726: /** Reads the skip tower for the current position, possibly skipping part of the tower.
0727: *
0728: * <P>Note that this method will update {@link #state} only if it reads the entire tower,
0729: * otherwise the state remains {@link #BEFORE_TOWER}.
0730: *
0731: * @param pointer the tower will be read up to the first entry smaller than or equal to this pointer; use
0732: * -1 to guarantee that the entire tower will be read.
0733: */
0734: private void readTower(final int pointer) throws IOException {
0735: int i, j, k, cacheOffset, cache, towerLength = 0;
0736: long bitsAtTowerStart = 0;
0737: boolean truncated = false;
0738: if (ASSERTS)
0739: assert numberOfDocumentRecord % quantum == 0;
0740: if (ASSERTS && state != BEFORE_TOWER)
0741: throw new IllegalStateException(
0742: "readTower() called in state " + state);
0743: cacheOffset = (numberOfDocumentRecord & wModuloMask);
0744: k = cacheOffset >> quantumDivisionShift;
0745: if (ASSERTS && k == 0) { // Invalidate current tower data
0746: it.unimi.dsi.fastutil.ints.IntArrays.fill(pointerSkip,
0747: Integer.MAX_VALUE);
0748: it.unimi.dsi.fastutil.longs.LongArrays.fill(bitSkip,
0749: Integer.MAX_VALUE);
0750: }
0751: // Compute the height of the current skip tower.
0752: s = (k == 0) ? height : Fast.leastSignificantBit(k);
0753: cache = frequency - w
0754: * (numberOfDocumentRecord >> wDivisionShift);
0755: if (cache < w) {
0756: maxh = Fast
0757: .mostSignificantBit((cache >> quantumDivisionShift)
0758: - k);
0759: if (maxh < s) {
0760: s = maxh;
0761: truncated = true;
0762: } else
0763: truncated = false;
0764: } else {
0765: cache = w;
0766: maxh = height;
0767: truncated = k == 0;
0768: }
0769: //assert w == cache || k == 0 || lastMaxh == Fast.mostSignificantBit( k ^ ( cache/quantum ) ) : lastMaxh +","+ (Fast.mostSignificantBit( k ^ ( cache/quantum ) ));
0770: i = s;
0771: if (s >= 0) {
0772: if (k == 0) {
0773: if (quantumBitLength < 0) {
0774: quantumBitLength = ibs.readDelta();
0775: entryBitLength = ibs.readDelta();
0776: } else {
0777: quantumBitLength += Fast.nat2int(ibs
0778: .readDelta());
0779: entryBitLength += Fast.nat2int(ibs.readDelta());
0780: }
0781: if (DEBUG)
0782: System.err
0783: .println("{" + this
0784: + "} quantum bit length="
0785: + quantumBitLength
0786: + " entry bit length="
0787: + entryBitLength);
0788: }
0789: if (DEBUG)
0790: System.err.println("{" + this
0791: + "} Reading tower; pointer=" + pointer
0792: + " maxh=" + maxh + " s=" + s);
0793: if (s > 0) {
0794: towerLength = entryBitLength * (s + 1)
0795: + Fast.nat2int(ibs.readDelta());
0796: if (DEBUG)
0797: System.err.println("{" + this
0798: + "} Tower length=" + towerLength);
0799: }
0800: // We store the number of bits read at the start of the tower (just after the length).
0801: bitsAtTowerStart = ibs.readBits();
0802: if (truncated) {
0803: if (DEBUG)
0804: System.err.println("{" + this
0805: + "} Truncated--reading tops");
0806: // We read the tower top.
0807: pointerSkip[s] = Fast.nat2int(ibs.readGolomb(
0808: towerTopB[s], towerTopLog2B[s]))
0809: + pointerPrediction[s];
0810: bitSkip[s] = quantumBitLength * (1 << s)
0811: + entryBitLength * ((1 << s + 1) - s - 2)
0812: + Fast.nat2int(ibs.readLongDelta());
0813: } else {
0814: // We copy the tower top from the lowest inherited entry suitably updated.
0815: pointerSkip[s] = pointerSkip[s + 1]
0816: - (currentDocument - pointerAtLastSkipTower);
0817: bitSkip[s] = bitSkip[s + 1]
0818: - (bitsAtTowerStart - readBitsAtLastSkipTower)
0819: - towerLength;
0820: }
0821: // We read the remaining part of the tower, at least until we point after pointer.
0822: if (currentDocument + pointerSkip[i] > pointer) {
0823: for (i = s - 1; i >= 0; i--) {
0824: pointerSkip[i] = Fast.nat2int(ibs.readGolomb(
0825: towerLowerB[i], towerLowerLog2B[i]))
0826: + pointerSkip[i + 1] / 2;
0827: bitSkip[i] = (bitSkip[i + 1] - entryBitLength
0828: * (i + 1))
0829: / 2 - Fast.nat2int(ibs.readLongDelta());
0830: if (DEBUG
0831: && currentDocument + pointerSkip[i] <= pointer)
0832: System.err
0833: .println("{"
0834: + this
0835: + "} stopping reading at i="
0836: + i
0837: + " as currentDocument ("
0838: + currentDocument
0839: + ") plus pointer skip ("
0840: + pointerSkip[i]
0841: + ") is smaller than or equal target ("
0842: + pointer + ")");
0843: if (currentDocument + pointerSkip[i] <= pointer)
0844: break;
0845: }
0846: }
0847: }
0848: /* If we did not read the entire tower, we need to fix the skips we read (as they
0849: * are offsets from the *end* of the tower) and moreover we must make unusable the
0850: * rest of the tower (for asserts). */
0851: if (i > 0) {
0852: final long fix = ibs.readBits() - bitsAtTowerStart;
0853: for (j = s; j >= i; j--)
0854: bitSkip[j] += towerLength - fix;
0855: if (ASSERTS)
0856: for (; j >= 0; j--)
0857: pointerSkip[j] = Integer.MAX_VALUE;
0858: } else
0859: state = hasPayloads ? BEFORE_PAYLOAD
0860: : hasCounts ? BEFORE_COUNT : BEFORE_POINTER;
0861: // We update the inherited tower.
0862: final long deltaBits = ibs.readBits()
0863: - readBitsAtLastSkipTower;
0864: final int deltaPointers = currentDocument
0865: - pointerAtLastSkipTower;
0866: for (j = Fast.mostSignificantBit(k
0867: ^ (cache >> quantumDivisionShift)); j >= s + 1; j--) {
0868: bitSkip[j] -= deltaBits;
0869: pointerSkip[j] -= deltaPointers;
0870: }
0871: readBitsAtLastSkipTower = ibs.readBits();
0872: pointerAtLastSkipTower = currentDocument;
0873: lowest = i < 0 ? 0 : i;
0874: if (DEBUG) {
0875: System.err.println("{" + this + "} "
0876: + "Computed skip tower (lowest: " + lowest
0877: + ") for document record number "
0878: + numberOfDocumentRecord + " (pointer "
0879: + currentDocument + ") from " + Math.max(i, 0)
0880: + ": ");
0881: System.err.print("% ");
0882: for (j = 0; j <= s; j++)
0883: System.err.print(pointerSkip[j] + ":" + bitSkip[j]
0884: + " ");
0885: System.err.print(" [");
0886: for (; j <= height; j++)
0887: System.err.print(pointerSkip[j] + ":" + bitSkip[j]
0888: + " ");
0889: System.err.print("]");
0890: System.err.println();
0891: }
0892: }
0893:
0894: /*
0895: public int skip( final int n ) throws IOException {
0896: int i, k, cacheOffset, start = numberOfDocumentRecord, skip = 0;
0897:
0898: if ( DEBUG ) System.err.println( "{" + this + "} " + "Going to enter linear skip code with lastDoc=" + currentDocument + ", numberOfDocumentRecord=" + numberOfDocumentRecord + ", n=" + n + ", endOfList()=" + endOfList() );
0899: if ( n < 0 ) throw new IllegalArgumentException();
0900: if ( n == 0 ) return 0;
0901:
0902: // If we are just at the start of a list, let us read the first pointer.
0903: if ( numberOfDocumentRecord == -1 ) readDocumentPointer();
0904: if ( state == BEFORE_TOWER ) readTower( -1 );
0905:
0906: if ( DEBUG ) System.err.println( "{" + this + "} " + "Entering skip code with lastDoc=" + currentDocument + ", numberOfDocumentRecord=" + numberOfDocumentRecord + ", n=" + n + ", endOfList()=" + endOfList() );
0907:
0908: for(;;) {
0909: if ( DEBUG ) System.err.println( "{" + this + "} " + "In for loop, lastDoc=" + currentDocument + ", maxh=" + maxh + ", numberOfDocumentRecord=" + numberOfDocumentRecord + ", n=" + n );
0910:
0911: cacheOffset = ( numberOfDocumentRecord & wModuloMask );
0912: k = cacheOffset >> quantumDivisionShift;
0913:
0914: if ( maxh < 0 ) break; // Defective quantum--no tower.
0915:
0916: for( i = Fast.mostSignificantBit( k ^ ( Math.min( w, frequency - numberOfDocumentRecord + cacheOffset ) >> quantumDivisionShift ) ); i >= 0; i-- )
0917: if ( ( skip = ( ( k & - ( 1 << i ) ) + ( 1 << i ) ) * quantum - cacheOffset ) <= n ) break;
0918:
0919: if ( i >= 0 ) {
0920: ibs.skip( bitSkip[ i ] - ( ibs.readBits() - readBitsAtLastSkipTower ) );
0921: state = BEFORE_TOWER;
0922: currentDocument = pointerSkip[ i ] + pointerAtLastSkipTower;
0923: numberOfDocumentRecord += skip;
0924: // If we skipped beyond the end of the list, we invalidate the current document.
0925: if ( numberOfDocumentRecord == frequency ) currentDocument = -1;
0926: readTower( -1 );
0927: count = -1; // We must invalidate count as readDocumentPointer() would do.
0928: if ( endOfList() ) return numberOfDocumentRecord - start;
0929: }
0930: else break;
0931: }
0932:
0933: if ( DEBUG ) System.err.println( "{" + this + "} " + "Completing sequentially, lastDoc=" + currentDocument + ", numberOfDocumentRecord=" + numberOfDocumentRecord + ", n=" + n );
0934:
0935: while( numberOfDocumentRecord - start < n ) {
0936: if ( endOfList() ) break;
0937: readDocumentPointer();
0938: }
0939: return numberOfDocumentRecord - start;
0940: }
0941: */
0942: public int skipTo(final int p) throws IOException {
0943: if (DEBUG)
0944: System.err.println(this + ".skipTo(" + p
0945: + ") [currentDocument=" + currentDocument
0946: + ", numberOfDocumentRecord="
0947: + numberOfDocumentRecord + ", endOfList()="
0948: + endOfList());
0949: // If we are just at the start of a list, let us read the first pointer.
0950: if (numberOfDocumentRecord == -1)
0951: nextDocument(); // TODO: shouldn't we just read the tower?
0952: if (currentDocument >= p) {
0953: if (DEBUG)
0954: System.err.println(this
0955: + ": No skip necessary, returning "
0956: + currentDocument);
0957: return currentDocument;
0958: }
0959: if (hasSkips) {
0960: if (state == BEFORE_TOWER)
0961: readTower(p);
0962: final int[] pointerSkip = this .pointerSkip;
0963: for (;;) {
0964: if (ASSERTS)
0965: assert maxh < 0 || lowest > 0
0966: || pointerSkip[0] != Integer.MAX_VALUE;
0967: // If on a defective quantum (no tower) or p is inside the current quantum (no need to scan the tower) we bail out.
0968: if (maxh < 0
0969: || lowest == 0
0970: && pointerAtLastSkipTower + pointerSkip[0] > p)
0971: break;
0972: if (DEBUG)
0973: System.err.println(this
0974: + ": In for loop, currentDocument="
0975: + currentDocument + ", maxh=" + maxh
0976: + ", numberOfDocumentRecord="
0977: + numberOfDocumentRecord + ", p=" + p);
0978: final int cacheOffset = (numberOfDocumentRecord & wModuloMask);
0979: final int k = cacheOffset >> quantumDivisionShift;
0980: final int top = Fast
0981: .mostSignificantBit(k
0982: ^ (Math.min(w, frequency
0983: - numberOfDocumentRecord
0984: + cacheOffset) >> quantumDivisionShift));
0985: int i;
0986: for (i = lowest; i <= top; i++) {
0987: if (ASSERTS && (k & 1 << i) != 0)
0988: assert pointerSkip[i] == pointerSkip[i + 1];
0989: if (ASSERTS)
0990: assert pointerSkip[i] != Integer.MAX_VALUE : "Invalid pointer skip "
0991: + i
0992: + " (lowest="
0993: + lowest
0994: + ", top=" + top + ")";
0995: if (pointerAtLastSkipTower + pointerSkip[i] > p)
0996: break;
0997: }
0998: if (--i < 0)
0999: break;
1000: if (ASSERTS)
1001: assert i >= lowest : i + " < " + lowest;
1002: if (DEBUG)
1003: System.err.println(this
1004: + ": Safely after for with i=" + i
1005: + ", P[i]=" + pointerSkip[i]
1006: + ", A[i]=" + bitSkip[i]);
1007: if (DEBUG)
1008: System.err
1009: .println(this
1010: + ": ["
1011: + ibs.readBits()
1012: + "] Skipping "
1013: + (bitSkip[i] - (ibs.readBits() - readBitsAtLastSkipTower))
1014: + " bits ("
1015: + (((k & -(1 << i)) + (1 << i))
1016: * quantum - cacheOffset)
1017: + " records) to get to document pointer "
1018: + (currentDocument + pointerSkip[i]));
1019: ibs
1020: .skip(bitSkip[i]
1021: - (ibs.readBits() - readBitsAtLastSkipTower));
1022: state = BEFORE_TOWER;
1023: currentDocument = pointerSkip[i]
1024: + pointerAtLastSkipTower;
1025: numberOfDocumentRecord += ((k & -(1 << i)) + (1 << i))
1026: * quantum - cacheOffset;
1027: // If we skipped beyond the end of the list, we invalidate the current document.
1028: if (numberOfDocumentRecord == frequency) {
1029: currentDocument = Integer.MAX_VALUE;
1030: state = BEFORE_POINTER; // We are actually before a frequency, but we must avoid that calls to nextDocument() read anything
1031: } else
1032: readTower(p); // Note that if we are exactly on the destination pointer, we will read the entire tower.
1033: count = -1; // We must invalidate count as readDocumentPointer() would do.
1034: if (endOfList()) {
1035: if (DEBUG)
1036: System.err
1037: .println(this
1038: + ".toSkip(): end-of-list, returning "
1039: + currentDocument);
1040: // Note that if p == Integer.MAX_VALUE, we are certainly beyond end-of-list
1041: return p == Integer.MAX_VALUE ? Integer.MAX_VALUE
1042: : currentDocument;
1043: }
1044: }
1045: if (DEBUG)
1046: System.err
1047: .println(this
1048: + ": Completing sequentially, currentDocument="
1049: + currentDocument
1050: + ", numberOfDocumentRecord="
1051: + numberOfDocumentRecord + ", p="
1052: + p);
1053: }
1054: while (currentDocument < p) {
1055: if (DEBUG)
1056: System.err
1057: .println(this
1058: + ": Skipping sequentially (second), currentDocument="
1059: + currentDocument
1060: + ", numberOfDocumentRecord="
1061: + numberOfDocumentRecord + ", p="
1062: + p);
1063: if (nextDocument() == -1) {
1064: if (DEBUG)
1065: System.err.println(this
1066: + ": end-of-list, returning MAX_VALUE");
1067: return Integer.MAX_VALUE;
1068: }
1069: }
1070: if (DEBUG)
1071: System.err.println(this + ".toSkip(): Returning "
1072: + currentDocument);
1073: return currentDocument;
1074: }
1075:
1076: public void dispose() throws IOException {
1077: parent.close();
1078: }
1079:
1080: public boolean hasNext() {
1081: return !endOfList();
1082: }
1083:
1084: public int nextInt() {
1085: if (!hasNext())
1086: throw new NoSuchElementException();
1087: try {
1088: return nextDocument();
1089: } catch (IOException e) {
1090: throw new RuntimeException(e);
1091: }
1092: }
1093:
1094: public String toString() {
1095: return index + " [" + currentTerm + "]";
1096: }
1097:
1098: /** An interval iterator returning the positions of the current document as singleton intervals. */
1099: private final class IndexIntervalIterator extends
1100: AbstractObjectIterator<Interval> implements
1101: IntervalIterator {
1102: int pos = -1;
1103:
1104: public void reset() throws IOException {
1105: pos = -1;
1106: if (state <= BEFORE_POSITIONS)
1107: updatePositionCache(); // This guarantees the position cache is ok
1108: }
1109:
1110: public void intervalTerms(final IntSet terms) {
1111: terms
1112: .add(BitStreamIndexReaderIndexIterator.this .currentTerm);
1113: }
1114:
1115: public boolean hasNext() {
1116: return pos < count - 1;
1117: }
1118:
1119: public Interval next() {
1120: if (!hasNext())
1121: throw new NoSuchElementException();
1122: return Interval.valueOf(positionCache[++pos]);
1123: }
1124:
1125: public Interval nextInterval() {
1126: return pos < count - 1 ? Interval
1127: .valueOf(positionCache[++pos]) : null;
1128: }
1129:
1130: public int extent() {
1131: return 1;
1132: }
1133:
1134: public String toString() {
1135: return index + ": " + term + "[doc=" + currentDocument
1136: + ", count=" + count + ", pos=" + pos + "]";
1137: }
1138: };
1139:
1140: public Reference2ReferenceMap<Index, IntervalIterator> intervalIterators()
1141: throws IOException {
1142: intervalIterator();
1143: return singletonIntervalIterator;
1144: }
1145:
1146: public IntervalIterator intervalIterator() throws IOException {
1147: return intervalIterator(keyIndex);
1148: }
1149:
1150: public IntervalIterator intervalIterator(final Index index)
1151: throws IOException {
1152: if (ASSERTS)
1153: ensureCurrentDocument();
1154: // TODO: this was if ( index != keyIndex || hasPayloads )
1155: if (index != keyIndex)
1156: return IntervalIterators.TRUE;
1157: if (ASSERTS)
1158: assert intervalIterator != null;
1159: intervalIterator.reset();
1160: return intervalIterator;
1161: }
1162:
1163: public ReferenceSet<Index> indices() {
1164: return index.singletonSet;
1165: }
1166: }
1167:
1168: private IndexIterator documents(final CharSequence term,
1169: final int termNumber) throws IOException {
1170: indexIterator.term(term);
1171: indexIterator.position(termNumber);
1172: return indexIterator;
1173: }
1174:
1175: public IndexIterator documents(final int term) throws IOException {
1176: return documents(null, term);
1177: }
1178:
1179: public IndexIterator documents(final CharSequence term)
1180: throws IOException {
1181: if (closed)
1182: throw new IllegalStateException("This "
1183: + getClass().getSimpleName() + " has been closed");
1184: if (index.termMap != null) {
1185: final int termIndex = (int) index.termMap.getLong(term);
1186: if (termIndex == -1)
1187: return index.emptyIndexIterator;
1188: return documents(term, termIndex);
1189: }
1190: throw new UnsupportedOperationException("Index " + index
1191: + " has no term map");
1192: }
1193:
1194: @Override
1195: public IndexIterator nextIterator() throws IOException {
1196: return indexIterator.advance();
1197: }
1198:
1199: public String toString() {
1200: return getClass().getSimpleName() + "[" + index + "]";
1201: }
1202:
1203: public void close() throws IOException {
1204: super.close();
1205: indexIterator.ibs.close();
1206: }
1207: }
|