Source Code Cross Referenced for MultiTermIndexIterator.java in » Search-Engine » mg4j » it » unimi » dsi » mg4j » index » Java Source Code / Java DocumentationJava Source Code and Java Documentation

Java Source Code / Java Documentation

1.	6.0 JDK Core
2.	6.0 JDK Modules
3.	6.0 JDK Modules com.sun
4.	6.0 JDK Modules com.sun.java
5.	6.0 JDK Modules sun
6.	6.0 JDK Platform
7.	Ajax
8.	Apache Harmony Java SE
9.	Aspect oriented
10.	Authentication Authorization
11.	Blogger System
12.	Build
13.	Byte Code
14.	Cache
15.	Chart
16.	Chat
17.	Code Analyzer
18.	Collaboration
19.	Content Management System
20.	Database Client
21.	Database DBMS
22.	Database JDBC Connection Pool
23.	Database ORM
24.	Development
25.	EJB Server geronimo
26.	EJB Server GlassFish
27.	EJB Server JBoss 4.2.1
28.	EJB Server resin 3.1.5
29.	ERP CRM Financial
30.	ESB
31.	Forum
32.	GIS
33.	Graphic Library
34.	Groupware
35.	HTML Parser
36.	IDE
37.	IDE Eclipse
38.	IDE Netbeans
39.	Installer
40.	Internationalization Localization
41.	Inversion of Control
42.	Issue Tracking
43.	J2EE
44.	JBoss
45.	JMS
46.	JMX
47.	Library
48.	Mail Clients
49.	Net
50.	Parser
51.	PDF
52.	Portal
53.	Profiler
54.	Project Management
55.	Report
56.	RSS RDF
57.	Rule Engine
58.	Science
59.	Scripting
60.	Search Engine
61.	Security
62.	Sevlet Container
63.	Source Control
64.	Swing Library
65.	Template Engine
66.	Test Coverage
67.	Testing
68.	UML
69.	Web Crawler
70.	Web Framework
71.	Web Mail
72.	Web Server
73.	Web Services
74.	Web Services apache cxf 2.0.1
75.	Web Services AXIS2
76.	Wiki Engine
77.	Workflow Engines
78.	XML
79.	XML UI

Java

Java Tutorial

Illustrator Tutorials

GIMP Tutorials

C# / C Sharp

C# / CSharp Tutorial

C# / CSharp Open Source

SQL Server / T-SQL Tutorial

Oracle PL / SQL

Oracle PL/SQL Tutorial

Flash / Flex / ActionScript

VBA / Excel / Access / Word

XML

XML Tutorial

Microsoft Office PowerPoint 2007 Tutorial

Microsoft Office Excel 2007 Tutorial

Microsoft Office Word 2007 Tutorial

Java Source Code / Java Documentation » Search Engine » mg4j » it.unimi.dsi.mg4j.index

Source Cross Referenced Class Diagram Java Document (Java Doc)

001:        package it.unimi.dsi.mg4j.index;
002:
003:        /*		 
004:         * MG4J: Managing Gigabytes for Java
005:         *
006:         * Copyright (C) 2003-2007 Paolo Boldi and Sebastiano Vigna 
007:         *
008:         *  This library is free software; you can redistribute it and/or modify it
009:         *  under the terms of the GNU Lesser General Public License as published by the Free
010:         *  Software Foundation; either version 2.1 of the License, or (at your option)
011:         *  any later version.
012:         *
013:         *  This library is distributed in the hope that it will be useful, but
014:         *  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
015:         *  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License
016:         *  for more details.
017:         *
018:         *  You should have received a copy of the GNU Lesser General Public License
019:         *  along with this program; if not, write to the Free Software
020:         *  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
021:         *
022:         */
023:
024:        import it.unimi.dsi.fastutil.ints.IntArrays;
025:        import it.unimi.dsi.fastutil.ints.IntHeapSemiIndirectPriorityQueue;
026:        import it.unimi.dsi.fastutil.ints.IntIterator;
027:        import it.unimi.dsi.fastutil.ints.IntIterators;
028:        import it.unimi.dsi.fastutil.ints.IntSet;
029:        import it.unimi.dsi.mg4j.index.payload.Payload;
030:        import it.unimi.dsi.mg4j.search.AbstractUnionDocumentIterator;
031:        import it.unimi.dsi.mg4j.search.DocumentIterator;
032:        import it.unimi.dsi.util.Interval;
033:        import it.unimi.dsi.mg4j.search.IntervalIterator;
034:        import it.unimi.dsi.mg4j.search.OrDocumentIterator;
035:        import it.unimi.dsi.mg4j.search.score.BM25Scorer;
036:        import it.unimi.dsi.mg4j.search.visitor.DocumentIteratorVisitor;
037:
038:        import java.io.IOException;
039:
040:        /** A virtual {@linkplain IndexIterator index iterator} that merges several component index iterators.
041:         *
042:         * <P>This class adds to {@link it.unimi.dsi.mg4j.search.AbstractUnionDocumentIterator}
043:         * an interval interator generating the OR of the intervals returned for each of the documents involved.
044:         * The main difference with an {@link OrDocumentIterator} built on the same array of component iterators
045:         * is that this class implements {@link IndexIterator} and hence provides a {@link #count()} (the sum
046:         * of counts of those component iterators positioned on the current document) and a {@link #frequency()}. The
047:         * latter is by default the maximum frequency of a component iterator, but it can be set 
048:         * at {@link MultiTermIndexIterator#getInstance(int, Index, IndexIterator[]) construction time}.
049:         * 
050:         * <p>The main <i>raison d'&ecirc;tre</i> of this class is support for query expansion: a blind application
051:         * of {@link OrDocumentIterator} to an array of index iterators would mislead scorers such as {@link BM25Scorer}
052:         * because low-frequency terms (e.g., <i>hapax legomena</i>) would be responsible for most of the score. 
053:         */
054:
055:        public class MultiTermIndexIterator extends
056:                AbstractUnionDocumentIterator implements  IndexIterator {
057:            @SuppressWarnings("unused")
058:            private static final boolean ASSERTS = false;
059:
060:            /** Value to be used for term frequency, or {@link Integer#MIN_VALUE} to use the max; in any case, this attribute is used to cache
061:             *  frequency after the first call to {@link #frequency()}. */
062:            private int frequency;
063:            /** The term of this iterator. */
064:            protected String term;
065:            /** The id of this iterator. */
066:            protected int id;
067:            /** The count of the last returned document. */
068:            private int count = -1;
069:            /** Whether all underlying index iterators have counts. */
070:            private final boolean hasCounts;
071:            /** Whether all underlying index iterators have positions. */
072:            private final boolean hasPositions;
073:
074:            /** Returns an index iterator that merges the given array of iterators.
075:             *  This method requires that at least one iterator is provided. The frequency is computed as a max,
076:             *  and {@link #index()} will return the result of the same method on the first iterator.
077:             * 
078:             * @param indexIterator the iterators to be joined (at least one).
079:             * @return a merged index iterator. 
080:             * @throws IllegalArgumentException if no iterators were provided.
081:             */
082:            public static IndexIterator getInstance(
083:                    final IndexIterator... indexIterator) throws IOException {
084:                return getInstance(Integer.MIN_VALUE, indexIterator);
085:            }
086:
087:            /** Returns an index iterator that merges the given array of iterators.
088:             * 
089:             * <P>Note that the special case of the empty and of the singleton arrays
090:             * are handled efficiently. The frequency is computed as a max, and
091:             * {@link #index()} will return <code>index</code>.
092:             * 
093:             * @param index the index that wil be returned by {@link #index()}.
094:             * @param indexIterator the iterators to be joined.
095:             * @return a merged index iterator. 
096:             */
097:            public static IndexIterator getInstance(final Index index,
098:                    final IndexIterator... indexIterator) throws IOException {
099:                return getInstance(Integer.MIN_VALUE, index, indexIterator);
100:            }
101:
102:            /** Returns an index iterator that merges the given array of iterators.
103:             *  This method requires that at least one iterator is provided.
104:             * 
105:             * @param defaultFrequency the default term frequency (or {@link Integer#MIN_VALUE} for the max).
106:             * @param indexIterator the iterators to be joined (at least one).
107:             * @return a merged index iterator. 
108:             * @throws IllegalArgumentException if no iterators were provided, or they run on different indices.
109:             */
110:            public static IndexIterator getInstance(final int defaultFrequency,
111:                    final IndexIterator... indexIterator) throws IOException {
112:                if (indexIterator.length == 0)
113:                    throw new IllegalArgumentException();
114:                return getInstance(defaultFrequency, indexIterator[0].index(),
115:                        indexIterator);
116:            }
117:
118:            /** Returns an index iterator that merges the given array of iterators.
119:             * 
120:             * <P>Note that the special case of the empty and of the singleton arrays
121:             * are handled efficiently. 
122:             * 
123:             * @param defaultFrequency the default term frequency (or {@link Integer#MIN_VALUE} for the max).
124:             * @param index the index that wil be returned by {@link #index()}.
125:             * @param indexIterator the iterators to be joined.
126:             * @return a merged index iterator. 
127:             * @throws IllegalArgumentException if there is some iterator on an index different from <code>index</code>.
128:             */
129:            public static IndexIterator getInstance(final int defaultFrequency,
130:                    final Index index, final IndexIterator... indexIterator)
131:                    throws IOException {
132:                if (indexIterator.length == 0)
133:                    return index.emptyIndexIterator;
134:                if (indexIterator.length == 1)
135:                    return indexIterator[0];
136:                return new MultiTermIndexIterator(defaultFrequency,
137:                        indexIterator);
138:            }
139:
140:            /** Creates a new document iterator that merges the given array of iterators. 
141:             * 
142:             * @param defaultFrequency the default term frequency (or {@link Integer#MIN_VALUE} for the max).
143:             * @param indexIterator the iterators to be joined.
144:             */
145:            @SuppressWarnings("cast")
146:            protected MultiTermIndexIterator(final int defaultFrequency,
147:                    final IndexIterator... indexIterator) throws IOException {
148:                super ((DocumentIterator[]) indexIterator);
149:                this .frequency = defaultFrequency;
150:                boolean havePositions = true, haveCounts = true;
151:                for (IndexIterator i : indexIterator) {
152:                    if (!i.index().hasCounts)
153:                        haveCounts = false;
154:                    if (!i.index().hasPositions)
155:                        havePositions = false;
156:
157:                }
158:
159:                hasCounts = haveCounts;
160:                hasPositions = havePositions;
161:            }
162:
163:            protected IntervalIterator getComposedIntervalIterator(
164:                    final Index index) {
165:                return new MultiTermIntervalIterator();
166:            }
167:
168:            @Override
169:            public int skipTo(final int n) throws IOException {
170:                if (last >= n)
171:                    return last;
172:                // We invalidate count before calling the superclass method.
173:                count = -1;
174:                return super .skipTo(n);
175:            }
176:
177:            public int nextDocument() throws IOException {
178:                // We invalidate count before calling the superclass method.
179:                count = -1;
180:                return super .nextDocument();
181:            }
182:
183:            /** The count is the sum of counts of those component iterators positioned on the current document.
184:             * 
185:             *  @return the sum of counts.
186:             */
187:            public int count() throws IOException {
188:                if (!hasCounts)
189:                    throw new IllegalStateException(
190:                            "Some of the underlying iterators do not have counts");
191:                if (last == -1)
192:                    throw new IllegalStateException();
193:                if (count == -1) {
194:                    int count = 0;
195:                    for (int i = computeFront(); i-- != 0;)
196:                        count += indexIterator[front[i]].count();
197:                    this .count = count;
198:                }
199:                return count;
200:            }
201:
202:            /** The frequency is either the default frequency set at construction time, or the maximum frequency of the component iterators. 
203:             * 
204:             * @return the frequency.
205:             */
206:            public int frequency() throws IOException {
207:                if (frequency != Integer.MIN_VALUE)
208:                    return frequency;
209:                int frequency = Integer.MIN_VALUE;
210:                for (int i = n; i-- != 0;)
211:                    frequency = Math.max(frequency, indexIterator[i]
212:                            .frequency());
213:                return this .frequency = frequency; // caching it!
214:            }
215:
216:            public void term(final CharSequence term) {
217:                this .term = term == null ? null : term.toString();
218:            }
219:
220:            public String term() {
221:                return term;
222:            }
223:
224:            public int termNumber() {
225:                // TODO: this is not particularly sensible
226:                return indexIterator[0].termNumber();
227:            }
228:
229:            public void id(final int id) {
230:                this .id = id;
231:            }
232:
233:            public int id() {
234:                return id;
235:            }
236:
237:            public Index index() {
238:                return soleIndex;
239:            }
240:
241:            /** This method is not implemented by this class.
242:             */
243:            public Payload payload() {
244:                throw new UnsupportedOperationException();
245:            }
246:
247:            public int[] positionArray() throws IOException {
248:                if (!hasPositions)
249:                    throw new IllegalStateException(
250:                            "Some of the underlying iterators do not have positions");
251:
252:                // If the front contains a single element, we can just use its position array.
253:                if (computeFront() == 1)
254:                    return indexIterator[front[0]].positionArray();
255:
256:                final MultiTermIntervalIterator multiTermIntervalIterator = (MultiTermIntervalIterator) intervalIterator();
257:                multiTermIntervalIterator.drain();
258:                return multiTermIntervalIterator.cache;
259:            }
260:
261:            public IntIterator positions() throws IOException {
262:                return IntIterators.wrap(positionArray(), 0, count);
263:            }
264:
265:            public int positions(int[] position) throws IOException {
266:                int c = count;
267:                if (position.length < c)
268:                    return -c;
269:                final int[] cache = positionArray();
270:                for (int i = c; i-- != 0;)
271:                    position[i] = cache[i];
272:                return c;
273:            }
274:
275:            @Override
276:            public boolean accept(DocumentIteratorVisitor visitor)
277:                    throws IOException {
278:                return visitor.visit(this );
279:            }
280:
281:            @Override
282:            public boolean acceptOnTruePaths(DocumentIteratorVisitor visitor)
283:                    throws IOException {
284:                return visitor.visit(this );
285:            }
286:
287:            /** An optimised interval iterator with the same semantics as that implemented
288:             *  by {@link OrDocumentIterator}, but not allowing duplicate positions.
289:            s	 *  
290:             *  <p>This iterator provides an additional {@link #drain()} method that exhausts the
291:             *  merge queue, leaving however the returned elements in the {@link #cache} array. Moreover,
292:             *  the internal state of the iterator is modified so that it continues to behave normally,
293:             *  returning however its results from {@link #cache}. In this way we can easily provide
294:             *  efficient implementations for {@link IndexIterator#positions()}, {@link IndexIterator#positionArray()},
295:             *  and {@link IndexIterator#positions(int[])}.
296:             */
297:            private class MultiTermIntervalIterator extends
298:                    AbstractCompositeIndexIntervalIterator implements 
299:                    IntervalIterator {
300:                @SuppressWarnings({"unused","hiding"})
301:                private final static boolean DEBUG = false;
302:                @SuppressWarnings("hiding")
303:                private final static boolean ASSERTS = false;
304:
305:                /** A heap-based indirect priority queue used to keep track of the currently scanned positions. */
306:                private final IntHeapSemiIndirectPriorityQueue positionQueue;
307:                /** The cached results of this iterator. */
308:                public int[] cache;
309:                /** The number of results emitted by this iterator since the last call to {@link #reset()}. */
310:                private int emitted;
311:                /** The number of results extracted in {@link #cache} since the last call to {@link #reset()}. */
312:                private int extracted;
313:
314:                public MultiTermIntervalIterator() {
315:                    super (n);
316:                    positionQueue = new IntHeapSemiIndirectPriorityQueue(curr);
317:                    cache = new int[4];
318:                }
319:
320:                public void reset() throws IOException {
321:                    emitted = extracted = 0;
322:                    next = null;
323:                    positionQueue.clear();
324:
325:                    for (int i = computeFront(), k; i-- != 0;) {
326:                        k = front[i];
327:                        position[k] = indexIterator[k].positionArray();
328:                        count[k] = indexIterator[k].count();
329:                        curr[k] = position[k][0];
330:                        currPos[k] = 0;
331:                        positionQueue.enqueue(k);
332:                    }
333:
334:                    if (ASSERTS)
335:                        assert !positionQueue.isEmpty();
336:                }
337:
338:                public void intervalTerms(final IntSet terms) {
339:                    // TODO: this is not particularly sensible
340:                    terms.add(indexIterator[0].termNumber());
341:                }
342:
343:                public Interval nextInterval() {
344:                    if (next != null) {
345:                        final Interval result = next;
346:                        next = null;
347:                        return result;
348:                    }
349:
350:                    if (emitted < extracted)
351:                        return Interval.valueOf(cache[emitted++]);
352:
353:                    if (positionQueue.isEmpty())
354:                        return null;
355:
356:                    final int first = positionQueue.first();
357:
358:                    if (extracted == cache.length)
359:                        cache = IntArrays.grow(cache, extracted + 1);
360:                    cache[extracted++] = curr[first];
361:
362:                    if (++currPos[first] < count[first]) {
363:                        curr[first] = position[first][currPos[first]];
364:                        positionQueue.changed();
365:                        if (curr[positionQueue.first()] == cache[extracted - 1])
366:                            throw new IllegalArgumentException(
367:                                    "Duplicate positions in " + this );
368:                    } else
369:                        positionQueue.dequeue();
370:
371:                    return Interval.valueOf(cache[emitted++]);
372:                }
373:
374:                public int extent() {
375:                    return 1;
376:                }
377:
378:                /** Drains all elements from the queue, stores them in {@link #cache} and
379:                 * restores {@link #emitted} so that the iterators continues to work transparently. 
380:                 */
381:
382:                public void drain() {
383:                    final int emittedNow = emitted - (next != null ? 1 : 0);
384:                    next = null;
385:                    while (nextInterval() != null)
386:                        ;
387:                    emitted = emittedNow;
388:                }
389:            }
390:        }

www.java2java.com | Contact Us

All other trademarks are property of their respective owners.