001: package it.unimi.dsi.mg4j.index.cluster;
002:
003: /*
004: * MG4J: Managing Gigabytes for Java
005: *
006: * Copyright (C) 2006-2007 Sebastiano Vigna
007: *
008: * This library is free software; you can redistribute it and/or modify it
009: * under the terms of the GNU Lesser General Public License as published by the Free
010: * Software Foundation; either version 2.1 of the License, or (at your option)
011: * any later version.
012: *
013: * This library is distributed in the hope that it will be useful, but
014: * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
015: * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
016: * for more details.
017: *
018: * You should have received a copy of the GNU Lesser General Public License
019: * along with this program; if not, write to the Free Software
020: * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
021: *
022: */
023:
024: import it.unimi.dsi.fastutil.ints.IntHeapSemiIndirectPriorityQueue;
025: import it.unimi.dsi.fastutil.objects.Reference2ReferenceMap;
026: import it.unimi.dsi.fastutil.objects.ReferenceArraySet;
027: import it.unimi.dsi.fastutil.objects.ReferenceSet;
028: import it.unimi.dsi.mg4j.index.Index;
029: import it.unimi.dsi.mg4j.search.AbstractDocumentIterator;
030: import it.unimi.dsi.mg4j.search.DocumentIterator;
031: import it.unimi.dsi.mg4j.search.IntervalIterator;
032: import it.unimi.dsi.mg4j.search.IntervalIterators;
033: import it.unimi.dsi.mg4j.search.visitor.DocumentIteratorVisitor;
034:
035: import java.io.IOException;
036:
037: /** A document iterator merging iterators from local indices.
038: *
039: * @author Sebastiano Vigna
040: */
041:
042: public class DocumentalMergedClusterDocumentIterator extends
043: AbstractDocumentIterator implements DocumentIterator {
044: /** The component document iterators. */
045: final protected DocumentIterator[] documentIterator;
046: /** The number of component iterators. */
047: final protected int n;
048: /** The indices corresponding to each underlying document iterator. */
049: protected final int[] usedIndex;
050: /** The cached strategy of the index we refer to. */
051: protected final DocumentalClusteringStrategy strategy;
052: /** The queue of document iterator indices (offsets into {@link #documentIterator} and {@link #usedIndex}). */
053: protected final IntHeapSemiIndirectPriorityQueue queue;
054: /** The reference array for the queue (containing <em>global</em> document pointers). */
055: protected final int[] globalDocumentPointer;
056: /** The set of indices involved in this iterator. */
057: protected final ReferenceSet<Index> indices = new ReferenceArraySet<Index>();
058:
059: /** The underlying index reader. */
060: private final DocumentalClusterIndexReader indexReader;
061:
062: /** The current iterator. */
063: protected int currentIterator = -1;
064: /** Whether there are no more documents to be returned. */
065: protected boolean exhausted;
066:
067: /** Creates a new document iterator for a documental cluster.
068: *
069: * <p>This constructor uses an array of document iterators that it is not required to be full.
070: * This is very useful with rare terms.
071: *
072: * @param indexReader the underlying index reader.
073: * @param documentIterator an array of document iterators.
074: * @param usedIndex an array parallel to <code>documentIterator</code> containing the ordinal numbers
075: * of the indices corresponding to the iterators.
076: */
077:
078: public DocumentalMergedClusterDocumentIterator(
079: final DocumentalClusterIndexReader indexReader,
080: final DocumentIterator[] documentIterator, int[] usedIndex)
081: throws IOException {
082: this .documentIterator = documentIterator;
083: this .n = documentIterator.length;
084: this .indexReader = indexReader;
085: this .usedIndex = usedIndex;
086:
087: strategy = indexReader.index.strategy;
088: globalDocumentPointer = new int[n];
089: queue = new IntHeapSemiIndirectPriorityQueue(
090: globalDocumentPointer, n);
091:
092: int result;
093: for (int i = n; i-- != 0;) {
094: if ((result = documentIterator[i].nextDocument()) != -1) {
095: indices.addAll(documentIterator[i].indices());
096: globalDocumentPointer[i] = strategy.globalPointer(
097: usedIndex[i], result);
098: queue.enqueue(i);
099: }
100: }
101:
102: if (queue.isEmpty())
103: exhausted = true;
104: else {
105: currentIterator = queue.first();
106: next = globalDocumentPointer[currentIterator];
107: }
108: }
109:
110: public IntervalIterator intervalIterator() throws IOException {
111: if (last == -1)
112: throw new IllegalStateException();
113: return documentIterator[currentIterator].intervalIterator();
114: }
115:
116: public IntervalIterator intervalIterator(Index index)
117: throws IOException {
118: if (last == -1)
119: throw new IllegalStateException();
120: if (!indices.contains(index))
121: return IntervalIterators.TRUE;
122: return documentIterator[currentIterator]
123: .intervalIterator(index);
124: }
125:
126: public Reference2ReferenceMap<Index, IntervalIterator> intervalIterators()
127: throws IOException {
128: if (last == -1)
129: throw new IllegalStateException();
130: return documentIterator[currentIterator].intervalIterators();
131: }
132:
133: public ReferenceSet<Index> indices() {
134: return indices;
135: }
136:
137: // TODO: this needs tests
138: public int skipTo(final int p) throws IOException {
139: int i, d;
140:
141: if (p <= last)
142: return last;
143:
144: //System.err.println( "Advancing to " + n + " doc: " + Arrays.toString( doc ) + " first: " + queue.first() );
145: next = -1;
146: while (!queue.isEmpty()
147: && globalDocumentPointer[i = queue.first()] < p) {
148: d = documentIterator[i].skipTo(strategy.localPointer(p));
149: if (d == Integer.MAX_VALUE)
150: queue.dequeue();
151: else {
152: globalDocumentPointer[i] = strategy.globalPointer(
153: usedIndex[i], d);
154: if (globalDocumentPointer[i] < p)
155: queue.dequeue(); // This covers the case of getting to the end of list without finding p
156: else
157: queue.changed();
158: }
159: }
160:
161: if (queue.isEmpty()) {
162: exhausted = true;
163: last = -1;
164: return Integer.MAX_VALUE;
165: }
166:
167: return last = globalDocumentPointer[currentIterator = queue
168: .first()];
169: }
170:
171: public int nextDocument() throws IOException {
172: if (next >= 0) {
173: last = next;
174: next = -1;
175: return last;
176: }
177:
178: if (exhausted)
179: return last = -1;
180:
181: final int result;
182: if ((result = documentIterator[currentIterator].nextDocument()) != -1) {
183: globalDocumentPointer[currentIterator] = strategy
184: .globalPointer(usedIndex[currentIterator], result);
185: queue.changed();
186: } else
187: queue.dequeue();
188:
189: if (queue.isEmpty()) {
190: exhausted = true;
191: return last = -1;
192: }
193:
194: currentIterator = queue.first();
195: return last = globalDocumentPointer[currentIterator];
196: }
197:
198: public boolean accept(DocumentIteratorVisitor visitor)
199: throws IOException {
200: boolean result = true;
201: for (DocumentIterator d : documentIterator)
202: if (!(result &= d.accept(visitor)))
203: break;
204: return result;
205: }
206:
207: public boolean acceptOnTruePaths(DocumentIteratorVisitor visitor)
208: throws IOException {
209: return documentIterator[currentIterator]
210: .acceptOnTruePaths(visitor);
211: }
212:
213: public void dispose() throws IOException {
214: indexReader.close();
215: }
216: }
|