001: /*
002: * regain - A file search engine providing plenty of formats
003: * Copyright (C) 2004 Til Schneider
004: *
005: * This library is free software; you can redistribute it and/or
006: * modify it under the terms of the GNU Lesser General Public
007: * License as published by the Free Software Foundation; either
008: * version 2.1 of the License, or (at your option) any later version.
009: *
010: * This library is distributed in the hope that it will be useful,
011: * but WITHOUT ANY WARRANTY; without even the implied warranty of
012: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
013: * Lesser General Public License for more details.
014: *
015: * You should have received a copy of the GNU Lesser General Public
016: * License along with this library; if not, write to the Free Software
017: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
018: *
019: * Contact: Til Schneider, info@murfman.de
020: *
021: * CVS information:
022: * $RCSfile$
023: * $Source$
024: * $Date: 2005-08-08 10:40:41 +0200 (Mo, 08 Aug 2005) $
025: * $Author: til132 $
026: * $Revision: 151 $
027: */
028: package net.sf.regain.search.results;
029:
030: import java.io.IOException;
031: import java.util.ArrayList;
032:
033: import net.sf.regain.RegainException;
034:
035: import org.apache.lucene.document.Document;
036: import org.apache.lucene.search.Hits;
037:
038: /**
039: * Merges several search hits.
040: * <p>
041: * The hits should be disjunct. In other words: If two of the hits contain the
042: * same document, it will come up twice in the results. This class is designed
043: * to merge hits from different search indexes, which are disjunct and therefor
044: * don't contain the same document.
045: * <p>
046: * The hits are merged after the zipper principle: The documents are merged
047: * after their score. The merging is done on demand.
048: *
049: * @author Til Schneider, www.murfman.de
050: */
051: public class MergedHits {
052:
053: /** The hits to merge. */
054: private Hits[] mHitsArr;
055:
056: /** The cursors where the merge currently is in each hits object. */
057: private int[] mHitCursorArr;
058:
059: /** The hits that have been merged so far. */
060: private ArrayList mMergedHitList;
061:
062: /** The total length. Is -1 util it is calculated the first time. */
063: private int mLength;
064:
065: /**
066: * Creates a new instance of MergedHits.
067: *
068: * @param hitsArr The hits to merge.
069: */
070: public MergedHits(Hits[] hitsArr) {
071: mHitsArr = hitsArr;
072: mLength = -1;
073:
074: // NOTE: In Java int arrays are initialized with 0
075: mHitCursorArr = new int[hitsArr.length];
076:
077: mMergedHitList = new ArrayList();
078: }
079:
080: /**
081: * Gets the length of the merged hits.
082: *
083: * @return The length of the merged hits.
084: */
085: public int length() {
086: if (mLength == -1) {
087: mLength = 0;
088: for (int i = 0; i < mHitsArr.length; i++) {
089: mLength += mHitsArr[i].length();
090: }
091: }
092:
093: return mLength;
094: }
095:
096: /**
097: * Gets a hit.
098: *
099: * @param n The index of the hit to get.
100: * @return The wanted hit.
101: * @throws IOException If getting the hit failed.
102: */
103: private Hit getHit(int n) throws IOException {
104: mergeUntil(n);
105: return (Hit) mMergedHitList.get(n);
106: }
107:
108: /**
109: * Gets a certain document.
110: *
111: * @param n The index of the document to get.
112: * @return The wanted document.
113: * @throws IOException If getting the document failed.
114: */
115: public Document doc(int n) throws IOException {
116: return getHit(n).getDocument();
117: }
118:
119: /**
120: * Gets the score of a certain document.
121: *
122: * @param n The index of the document to get the score of.
123: * @return The score of the document
124: * @throws IOException If getting the score failed.
125: */
126: public float score(int n) throws IOException {
127: return getHit(n).getScore();
128: }
129:
130: /**
131: * Gets the index of the Hits object a hit comes from. (The index in the
132: * hitsArr that was passed to the constructor).
133: *
134: * @param n The index of the hit to get the Hits index for.
135: * @return The index of the Hits object a hit comes from.
136: * @throws RegainException If getting the hits index failed.
137: */
138: public int getHitsIndex(int n) throws RegainException {
139: try {
140: return getHit(n).getHitsIndex();
141: } catch (IOException exc) {
142: throw new RegainException("Getting the hit #" + n
143: + " failed", exc);
144: }
145: }
146:
147: /**
148: * Gets the position of a hit in the Hits object it comes from.
149: *
150: * @param n The index of the hit to get the Hits position for.
151: * @return The position of a hit in the Hits object it comes from.
152: * @throws RegainException If getting the hits position failed.
153: */
154: public int getHitsPosition(int n) throws RegainException {
155: try {
156: return getHit(n).getHitsPosition();
157: } catch (IOException exc) {
158: throw new RegainException("Getting the hit #" + n
159: + " failed", exc);
160: }
161: }
162:
163: /**
164: * Merges the hits until a certain index.
165: *
166: * @param n The index to merge to.
167: * @throws IOException If getting a score failed.
168: */
169: private void mergeUntil(int n) throws IOException {
170: mMergedHitList.ensureCapacity(n + 1);
171:
172: while (mMergedHitList.size() < (n + 1)) {
173: // Find the document with the highest score
174: int maxHitsIndex = -1;
175: float maxScore = -1;
176: for (int i = 0; i < mHitsArr.length; i++) {
177: Hits currHits = mHitsArr[i];
178: int currCursor = mHitCursorArr[i];
179: if (currCursor < currHits.length()) {
180: float currScore = currHits.score(currCursor);
181: if (currScore > maxScore) {
182: // This is the better score -> Remember it
183: maxScore = currScore;
184: maxHitsIndex = i;
185: } else if (currScore == maxScore) {
186: // This score is the same as the max. In order to have a fair
187: // algorithm (which means that every Hits is used at some time), we
188: // use the Hits with the lower cursor (which is the one where the
189: // fewest documents were used until now)
190: if (currCursor < mHitCursorArr[maxHitsIndex]) {
191: maxScore = currScore;
192: maxHitsIndex = i;
193: }
194: }
195: }
196: }
197:
198: // Add the best document
199: mMergedHitList.add(new Hit(maxHitsIndex,
200: mHitCursorArr[maxHitsIndex], maxScore));
201:
202: // Move the max cursor
203: mHitCursorArr[maxHitsIndex]++;
204: }
205: }
206:
207: /**
208: * Gets the hits to merge.
209: *
210: * @return The hits to merge.
211: */
212: Hits[] getHitsArr() {
213: return mHitsArr;
214: }
215:
216: /**
217: * Contains the data of one hit.
218: */
219: private class Hit {
220: /** The index of the Hits object this hit comes from. (The index in the mHitsArr) */
221: private int mHitsIndex;
222: /** The position of this hit in the Hits object it comes from. */
223: private int mHitsPosition;
224: /** The score of this hit. */
225: private float mScore;
226: /** The cached document. Is null until the first time requested. */
227: private Document mDocument;
228:
229: /**
230: * Creates a new instance of Hit.
231: *
232: * @param hitsIndex The index of the Hits object this hit comes from.
233: * (The index in the mHitsArr)
234: * @param hitsPosition The position of this hit in the Hits object it comes from.
235: * @param score The score of this hit.
236: */
237: public Hit(int hitsIndex, int hitsPosition, float score) {
238: mHitsIndex = hitsIndex;
239: mHitsPosition = hitsPosition;
240: mScore = score;
241: }
242:
243: /**
244: * Gets the index of the Hits object this hit comes from. (The index in th
245: * mHitsArr)
246: *
247: * @return The index of the Hits object this hit comes from.
248: */
249: public int getHitsIndex() {
250: return mHitsIndex;
251: }
252:
253: /**
254: * Gets the position of this hit in the Hits object it comes from.
255: *
256: * @return The position of this hit in the Hits object it comes from.
257: */
258: public int getHitsPosition() {
259: return mHitsPosition;
260: }
261:
262: /**
263: * Gets the score of this hit.
264: *
265: * @return The score of this hit.
266: */
267: public float getScore() {
268: return mScore;
269: }
270:
271: /**
272: * Gets the document.
273: *
274: * @return The document.
275: * @throws IOException If getting the document failed.
276: */
277: public Document getDocument() throws IOException {
278: if (mDocument == null) {
279: mDocument = getHitsArr()[mHitsIndex].doc(mHitsPosition);
280: }
281:
282: return mDocument;
283: }
284:
285: } // inner class Hit
286:
287: }
|