001: package it.unimi.dsi.mg4j.index;
002:
003: /*
004: * MG4J: Managing Gigabytes for Java
005: *
006: * Copyright (C) 2003-2007 Paolo Boldi and Sebastiano Vigna
007: *
008: * This library is free software; you can redistribute it and/or modify it
009: * under the terms of the GNU Lesser General Public License as published by the Free
010: * Software Foundation; either version 2.1 of the License, or (at your option)
011: * any later version.
012: *
013: * This library is distributed in the hope that it will be useful, but
014: * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
015: * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
016: * for more details.
017: *
018: * You should have received a copy of the GNU Lesser General Public License
019: * along with this program; if not, write to the Free Software
020: * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
021: *
022: */
023:
024: import it.unimi.dsi.fastutil.ints.IntArrays;
025: import it.unimi.dsi.fastutil.ints.IntHeapSemiIndirectPriorityQueue;
026: import it.unimi.dsi.fastutil.ints.IntIterator;
027: import it.unimi.dsi.fastutil.ints.IntIterators;
028: import it.unimi.dsi.fastutil.ints.IntSet;
029: import it.unimi.dsi.mg4j.index.payload.Payload;
030: import it.unimi.dsi.mg4j.search.AbstractUnionDocumentIterator;
031: import it.unimi.dsi.mg4j.search.DocumentIterator;
032: import it.unimi.dsi.util.Interval;
033: import it.unimi.dsi.mg4j.search.IntervalIterator;
034: import it.unimi.dsi.mg4j.search.OrDocumentIterator;
035: import it.unimi.dsi.mg4j.search.score.BM25Scorer;
036: import it.unimi.dsi.mg4j.search.visitor.DocumentIteratorVisitor;
037:
038: import java.io.IOException;
039:
040: /** A virtual {@linkplain IndexIterator index iterator} that merges several component index iterators.
041: *
042: * <P>This class adds to {@link it.unimi.dsi.mg4j.search.AbstractUnionDocumentIterator}
043: * an interval interator generating the OR of the intervals returned for each of the documents involved.
044: * The main difference with an {@link OrDocumentIterator} built on the same array of component iterators
045: * is that this class implements {@link IndexIterator} and hence provides a {@link #count()} (the sum
046: * of counts of those component iterators positioned on the current document) and a {@link #frequency()}. The
047: * latter is by default the maximum frequency of a component iterator, but it can be set
048: * at {@link MultiTermIndexIterator#getInstance(int, Index, IndexIterator[]) construction time}.
049: *
050: * <p>The main <i>raison d'être</i> of this class is support for query expansion: a blind application
051: * of {@link OrDocumentIterator} to an array of index iterators would mislead scorers such as {@link BM25Scorer}
052: * because low-frequency terms (e.g., <i>hapax legomena</i>) would be responsible for most of the score.
053: */
054:
055: public class MultiTermIndexIterator extends
056: AbstractUnionDocumentIterator implements IndexIterator {
057: @SuppressWarnings("unused")
058: private static final boolean ASSERTS = false;
059:
060: /** Value to be used for term frequency, or {@link Integer#MIN_VALUE} to use the max; in any case, this attribute is used to cache
061: * frequency after the first call to {@link #frequency()}. */
062: private int frequency;
063: /** The term of this iterator. */
064: protected String term;
065: /** The id of this iterator. */
066: protected int id;
067: /** The count of the last returned document. */
068: private int count = -1;
069: /** Whether all underlying index iterators have counts. */
070: private final boolean hasCounts;
071: /** Whether all underlying index iterators have positions. */
072: private final boolean hasPositions;
073:
074: /** Returns an index iterator that merges the given array of iterators.
075: * This method requires that at least one iterator is provided. The frequency is computed as a max,
076: * and {@link #index()} will return the result of the same method on the first iterator.
077: *
078: * @param indexIterator the iterators to be joined (at least one).
079: * @return a merged index iterator.
080: * @throws IllegalArgumentException if no iterators were provided.
081: */
082: public static IndexIterator getInstance(
083: final IndexIterator... indexIterator) throws IOException {
084: return getInstance(Integer.MIN_VALUE, indexIterator);
085: }
086:
087: /** Returns an index iterator that merges the given array of iterators.
088: *
089: * <P>Note that the special case of the empty and of the singleton arrays
090: * are handled efficiently. The frequency is computed as a max, and
091: * {@link #index()} will return <code>index</code>.
092: *
093: * @param index the index that wil be returned by {@link #index()}.
094: * @param indexIterator the iterators to be joined.
095: * @return a merged index iterator.
096: */
097: public static IndexIterator getInstance(final Index index,
098: final IndexIterator... indexIterator) throws IOException {
099: return getInstance(Integer.MIN_VALUE, index, indexIterator);
100: }
101:
102: /** Returns an index iterator that merges the given array of iterators.
103: * This method requires that at least one iterator is provided.
104: *
105: * @param defaultFrequency the default term frequency (or {@link Integer#MIN_VALUE} for the max).
106: * @param indexIterator the iterators to be joined (at least one).
107: * @return a merged index iterator.
108: * @throws IllegalArgumentException if no iterators were provided, or they run on different indices.
109: */
110: public static IndexIterator getInstance(final int defaultFrequency,
111: final IndexIterator... indexIterator) throws IOException {
112: if (indexIterator.length == 0)
113: throw new IllegalArgumentException();
114: return getInstance(defaultFrequency, indexIterator[0].index(),
115: indexIterator);
116: }
117:
118: /** Returns an index iterator that merges the given array of iterators.
119: *
120: * <P>Note that the special case of the empty and of the singleton arrays
121: * are handled efficiently.
122: *
123: * @param defaultFrequency the default term frequency (or {@link Integer#MIN_VALUE} for the max).
124: * @param index the index that wil be returned by {@link #index()}.
125: * @param indexIterator the iterators to be joined.
126: * @return a merged index iterator.
127: * @throws IllegalArgumentException if there is some iterator on an index different from <code>index</code>.
128: */
129: public static IndexIterator getInstance(final int defaultFrequency,
130: final Index index, final IndexIterator... indexIterator)
131: throws IOException {
132: if (indexIterator.length == 0)
133: return index.emptyIndexIterator;
134: if (indexIterator.length == 1)
135: return indexIterator[0];
136: return new MultiTermIndexIterator(defaultFrequency,
137: indexIterator);
138: }
139:
140: /** Creates a new document iterator that merges the given array of iterators.
141: *
142: * @param defaultFrequency the default term frequency (or {@link Integer#MIN_VALUE} for the max).
143: * @param indexIterator the iterators to be joined.
144: */
145: @SuppressWarnings("cast")
146: protected MultiTermIndexIterator(final int defaultFrequency,
147: final IndexIterator... indexIterator) throws IOException {
148: super ((DocumentIterator[]) indexIterator);
149: this .frequency = defaultFrequency;
150: boolean havePositions = true, haveCounts = true;
151: for (IndexIterator i : indexIterator) {
152: if (!i.index().hasCounts)
153: haveCounts = false;
154: if (!i.index().hasPositions)
155: havePositions = false;
156:
157: }
158:
159: hasCounts = haveCounts;
160: hasPositions = havePositions;
161: }
162:
163: protected IntervalIterator getComposedIntervalIterator(
164: final Index index) {
165: return new MultiTermIntervalIterator();
166: }
167:
168: @Override
169: public int skipTo(final int n) throws IOException {
170: if (last >= n)
171: return last;
172: // We invalidate count before calling the superclass method.
173: count = -1;
174: return super .skipTo(n);
175: }
176:
177: public int nextDocument() throws IOException {
178: // We invalidate count before calling the superclass method.
179: count = -1;
180: return super .nextDocument();
181: }
182:
183: /** The count is the sum of counts of those component iterators positioned on the current document.
184: *
185: * @return the sum of counts.
186: */
187: public int count() throws IOException {
188: if (!hasCounts)
189: throw new IllegalStateException(
190: "Some of the underlying iterators do not have counts");
191: if (last == -1)
192: throw new IllegalStateException();
193: if (count == -1) {
194: int count = 0;
195: for (int i = computeFront(); i-- != 0;)
196: count += indexIterator[front[i]].count();
197: this .count = count;
198: }
199: return count;
200: }
201:
202: /** The frequency is either the default frequency set at construction time, or the maximum frequency of the component iterators.
203: *
204: * @return the frequency.
205: */
206: public int frequency() throws IOException {
207: if (frequency != Integer.MIN_VALUE)
208: return frequency;
209: int frequency = Integer.MIN_VALUE;
210: for (int i = n; i-- != 0;)
211: frequency = Math.max(frequency, indexIterator[i]
212: .frequency());
213: return this .frequency = frequency; // caching it!
214: }
215:
216: public void term(final CharSequence term) {
217: this .term = term == null ? null : term.toString();
218: }
219:
220: public String term() {
221: return term;
222: }
223:
224: public int termNumber() {
225: // TODO: this is not particularly sensible
226: return indexIterator[0].termNumber();
227: }
228:
229: public void id(final int id) {
230: this .id = id;
231: }
232:
233: public int id() {
234: return id;
235: }
236:
237: public Index index() {
238: return soleIndex;
239: }
240:
241: /** This method is not implemented by this class.
242: */
243: public Payload payload() {
244: throw new UnsupportedOperationException();
245: }
246:
247: public int[] positionArray() throws IOException {
248: if (!hasPositions)
249: throw new IllegalStateException(
250: "Some of the underlying iterators do not have positions");
251:
252: // If the front contains a single element, we can just use its position array.
253: if (computeFront() == 1)
254: return indexIterator[front[0]].positionArray();
255:
256: final MultiTermIntervalIterator multiTermIntervalIterator = (MultiTermIntervalIterator) intervalIterator();
257: multiTermIntervalIterator.drain();
258: return multiTermIntervalIterator.cache;
259: }
260:
261: public IntIterator positions() throws IOException {
262: return IntIterators.wrap(positionArray(), 0, count);
263: }
264:
265: public int positions(int[] position) throws IOException {
266: int c = count;
267: if (position.length < c)
268: return -c;
269: final int[] cache = positionArray();
270: for (int i = c; i-- != 0;)
271: position[i] = cache[i];
272: return c;
273: }
274:
275: @Override
276: public boolean accept(DocumentIteratorVisitor visitor)
277: throws IOException {
278: return visitor.visit(this );
279: }
280:
281: @Override
282: public boolean acceptOnTruePaths(DocumentIteratorVisitor visitor)
283: throws IOException {
284: return visitor.visit(this );
285: }
286:
287: /** An optimised interval iterator with the same semantics as that implemented
288: * by {@link OrDocumentIterator}, but not allowing duplicate positions.
289: s *
290: * <p>This iterator provides an additional {@link #drain()} method that exhausts the
291: * merge queue, leaving however the returned elements in the {@link #cache} array. Moreover,
292: * the internal state of the iterator is modified so that it continues to behave normally,
293: * returning however its results from {@link #cache}. In this way we can easily provide
294: * efficient implementations for {@link IndexIterator#positions()}, {@link IndexIterator#positionArray()},
295: * and {@link IndexIterator#positions(int[])}.
296: */
297: private class MultiTermIntervalIterator extends
298: AbstractCompositeIndexIntervalIterator implements
299: IntervalIterator {
300: @SuppressWarnings({"unused","hiding"})
301: private final static boolean DEBUG = false;
302: @SuppressWarnings("hiding")
303: private final static boolean ASSERTS = false;
304:
305: /** A heap-based indirect priority queue used to keep track of the currently scanned positions. */
306: private final IntHeapSemiIndirectPriorityQueue positionQueue;
307: /** The cached results of this iterator. */
308: public int[] cache;
309: /** The number of results emitted by this iterator since the last call to {@link #reset()}. */
310: private int emitted;
311: /** The number of results extracted in {@link #cache} since the last call to {@link #reset()}. */
312: private int extracted;
313:
314: public MultiTermIntervalIterator() {
315: super (n);
316: positionQueue = new IntHeapSemiIndirectPriorityQueue(curr);
317: cache = new int[4];
318: }
319:
320: public void reset() throws IOException {
321: emitted = extracted = 0;
322: next = null;
323: positionQueue.clear();
324:
325: for (int i = computeFront(), k; i-- != 0;) {
326: k = front[i];
327: position[k] = indexIterator[k].positionArray();
328: count[k] = indexIterator[k].count();
329: curr[k] = position[k][0];
330: currPos[k] = 0;
331: positionQueue.enqueue(k);
332: }
333:
334: if (ASSERTS)
335: assert !positionQueue.isEmpty();
336: }
337:
338: public void intervalTerms(final IntSet terms) {
339: // TODO: this is not particularly sensible
340: terms.add(indexIterator[0].termNumber());
341: }
342:
343: public Interval nextInterval() {
344: if (next != null) {
345: final Interval result = next;
346: next = null;
347: return result;
348: }
349:
350: if (emitted < extracted)
351: return Interval.valueOf(cache[emitted++]);
352:
353: if (positionQueue.isEmpty())
354: return null;
355:
356: final int first = positionQueue.first();
357:
358: if (extracted == cache.length)
359: cache = IntArrays.grow(cache, extracted + 1);
360: cache[extracted++] = curr[first];
361:
362: if (++currPos[first] < count[first]) {
363: curr[first] = position[first][currPos[first]];
364: positionQueue.changed();
365: if (curr[positionQueue.first()] == cache[extracted - 1])
366: throw new IllegalArgumentException(
367: "Duplicate positions in " + this );
368: } else
369: positionQueue.dequeue();
370:
371: return Interval.valueOf(cache[emitted++]);
372: }
373:
374: public int extent() {
375: return 1;
376: }
377:
378: /** Drains all elements from the queue, stores them in {@link #cache} and
379: * restores {@link #emitted} so that the iterators continues to work transparently.
380: */
381:
382: public void drain() {
383: final int emittedNow = emitted - (next != null ? 1 : 0);
384: next = null;
385: while (nextInterval() != null)
386: ;
387: emitted = emittedNow;
388: }
389: }
390: }
|