001: package org.apache.lucene.search;
002:
003: /**
004: * Licensed to the Apache Software Foundation (ASF) under one or more
005: * contributor license agreements. See the NOTICE file distributed with
006: * this work for additional information regarding copyright ownership.
007: * The ASF licenses this file to You under the Apache License, Version 2.0
008: * (the "License"); you may not use this file except in compliance with
009: * the License. You may obtain a copy of the License at
010: *
011: * http://www.apache.org/licenses/LICENSE-2.0
012: *
013: * Unless required by applicable law or agreed to in writing, software
014: * distributed under the License is distributed on an "AS IS" BASIS,
015: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
016: * See the License for the specific language governing permissions and
017: * limitations under the License.
018: */
019:
020: import java.io.IOException;
021: import java.util.HashSet;
022:
023: import junit.framework.TestCase;
024:
025: import org.apache.lucene.analysis.standard.StandardAnalyzer;
026: import org.apache.lucene.document.Document;
027: import org.apache.lucene.document.Field;
028: import org.apache.lucene.index.IndexReader;
029: import org.apache.lucene.index.IndexWriter;
030: import org.apache.lucene.index.Term;
031: import org.apache.lucene.index.TermDocs;
032: import org.apache.lucene.store.RAMDirectory;
033:
034: public class DuplicateFilterTest extends TestCase {
035: private static final String KEY_FIELD = "url";
036: private RAMDirectory directory;
037: private IndexReader reader;
038: TermQuery tq = new TermQuery(new Term("text", "lucene"));
039: private IndexSearcher searcher;
040:
041: protected void setUp() throws Exception {
042: directory = new RAMDirectory();
043: IndexWriter writer = new IndexWriter(directory,
044: new StandardAnalyzer(), true);
045:
046: //Add series of docs with filterable fields : url, text and dates flags
047: addDoc(writer, "http://lucene.apache.org",
048: "lucene 1.4.3 available", "20040101");
049: addDoc(writer, "http://lucene.apache.org",
050: "New release pending", "20040102");
051: addDoc(writer, "http://lucene.apache.org",
052: "Lucene 1.9 out now", "20050101");
053: addDoc(writer, "http://www.bar.com", "Local man bites dog",
054: "20040101");
055: addDoc(writer, "http://www.bar.com", "Dog bites local man",
056: "20040102");
057: addDoc(writer, "http://www.bar.com", "Dog uses Lucene",
058: "20050101");
059: addDoc(writer, "http://lucene.apache.org", "Lucene 2.0 out",
060: "20050101");
061: addDoc(writer, "http://lucene.apache.org",
062: "Oops. Lucene 2.1 out", "20050102");
063:
064: writer.close();
065: reader = IndexReader.open(directory);
066: searcher = new IndexSearcher(reader);
067:
068: }
069:
070: protected void tearDown() throws Exception {
071: reader.close();
072: searcher.close();
073: directory.close();
074: }
075:
076: private void addDoc(IndexWriter writer, String url, String text,
077: String date) throws IOException {
078: Document doc = new Document();
079: doc.add(new Field(KEY_FIELD, url, Field.Store.YES,
080: Field.Index.UN_TOKENIZED));
081: doc.add(new Field("text", text, Field.Store.YES,
082: Field.Index.TOKENIZED));
083: doc.add(new Field("date", date, Field.Store.YES,
084: Field.Index.TOKENIZED));
085: writer.addDocument(doc);
086: }
087:
088: public void testDefaultFilter() throws Throwable {
089: DuplicateFilter df = new DuplicateFilter(KEY_FIELD);
090: HashSet results = new HashSet();
091: Hits h = searcher.search(tq, df);
092: for (int i = 0; i < h.length(); i++) {
093: Document d = h.doc(i);
094: String url = d.get(KEY_FIELD);
095: assertFalse("No duplicate urls should be returned", results
096: .contains(url));
097: results.add(url);
098: }
099: }
100:
101: public void testNoFilter() throws Throwable {
102: HashSet results = new HashSet();
103: Hits h = searcher.search(tq);
104: assertTrue("Default searching should have found some matches",
105: h.length() > 0);
106: boolean dupsFound = false;
107: for (int i = 0; i < h.length(); i++) {
108: Document d = h.doc(i);
109: String url = d.get(KEY_FIELD);
110: if (!dupsFound)
111: dupsFound = results.contains(url);
112: results.add(url);
113: }
114: assertTrue(
115: "Default searching should have found duplicate urls",
116: dupsFound);
117: }
118:
119: public void testFastFilter() throws Throwable {
120: DuplicateFilter df = new DuplicateFilter(KEY_FIELD);
121: df.setProcessingMode(DuplicateFilter.PM_FAST_INVALIDATION);
122: HashSet results = new HashSet();
123: Hits h = searcher.search(tq, df);
124: assertTrue("Filtered searching should have found some matches",
125: h.length() > 0);
126: for (int i = 0; i < h.length(); i++) {
127: Document d = h.doc(i);
128: String url = d.get(KEY_FIELD);
129: assertFalse("No duplicate urls should be returned", results
130: .contains(url));
131: results.add(url);
132: }
133: assertEquals("Two urls found", 2, results.size());
134: }
135:
136: public void testKeepsLastFilter() throws Throwable {
137: DuplicateFilter df = new DuplicateFilter(KEY_FIELD);
138: df.setKeepMode(DuplicateFilter.KM_USE_LAST_OCCURRENCE);
139: Hits h = searcher.search(tq, df);
140: assertTrue("Filtered searching should have found some matches",
141: h.length() > 0);
142: for (int i = 0; i < h.length(); i++) {
143: Document d = h.doc(i);
144: String url = d.get(KEY_FIELD);
145: TermDocs td = reader.termDocs(new Term(KEY_FIELD, url));
146: int lastDoc = 0;
147: while (td.next()) {
148: lastDoc = td.doc();
149: }
150: assertEquals("Duplicate urls should return last doc",
151: lastDoc, h.id((i)));
152: }
153: }
154:
155: public void testKeepsFirstFilter() throws Throwable {
156: DuplicateFilter df = new DuplicateFilter(KEY_FIELD);
157: df.setKeepMode(DuplicateFilter.KM_USE_FIRST_OCCURRENCE);
158: Hits h = searcher.search(tq, df);
159: assertTrue("Filtered searching should have found some matches",
160: h.length() > 0);
161: for (int i = 0; i < h.length(); i++) {
162: Document d = h.doc(i);
163: String url = d.get(KEY_FIELD);
164: TermDocs td = reader.termDocs(new Term(KEY_FIELD, url));
165: int lastDoc = 0;
166: td.next();
167: lastDoc = td.doc();
168: assertEquals("Duplicate urls should return first doc",
169: lastDoc, h.id((i)));
170: }
171: }
172:
173: }
|