001: package org.pdfbox.searchengine.lucene;
002:
003: /*
004: * This source was originally written as an example for the lucene project.
005: * It has been modified to use PDFBox as a lucene document creator.
006: * -Ben Litchfield
007: *
008: *====================================================================
009: * The Apache Software License, Version 1.1
010: *
011: * Copyright (c) 2001 The Apache Software Foundation. All rights
012: * reserved.
013: *
014: * Redistribution and use in source and binary forms, with or without
015: * modification, are permitted provided that the following conditions
016: * are met:
017: *
018: * 1. Redistributions of source code must retain the above copyright
019: * notice, this list of conditions and the following disclaimer.
020: *
021: * 2. Redistributions in binary form must reproduce the above copyright
022: * notice, this list of conditions and the following disclaimer in
023: * the documentation and/or other materials provided with the
024: * distribution.
025: *
026: * 3. The end-user documentation included with the redistribution,
027: * if any, must include the following acknowledgment:
028: * "This product includes software developed by the
029: * Apache Software Foundation (http://www.apache.org/)."
030: * Alternately, this acknowledgment may appear in the software itself,
031: * if and wherever such third-party acknowledgments normally appear.
032: *
033: * 4. The names "Apache" and "Apache Software Foundation" and
034: * "Apache Lucene" must not be used to endorse or promote products
035: * derived from this software without prior written permission. For
036: * written permission, please contact apache@apache.org.
037: *
038: * 5. Products derived from this software may not be called "Apache",
039: * "Apache Lucene", nor may "Apache" appear in their name, without
040: * prior written permission of the Apache Software Foundation.
041: *
042: * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
043: * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
044: * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
045: * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
046: * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
047: * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
048: * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
049: * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
050: * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
051: * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
052: * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
053: * SUCH DAMAGE.
054: * ====================================================================
055: *
056: * This software consists of voluntary contributions made by many
057: * individuals on behalf of the Apache Software Foundation. For more
058: * information on the Apache Software Foundation, please see
059: * <http://www.apache.org/>.
060: */
061:
062: import org.apache.lucene.analysis.standard.StandardAnalyzer;
063:
064: import org.apache.lucene.demo.HTMLDocument;
065:
066: import org.apache.lucene.document.Document;
067:
068: import org.apache.lucene.index.IndexReader;
069: import org.apache.lucene.index.IndexWriter;
070: import org.apache.lucene.index.Term;
071: import org.apache.lucene.index.TermEnum;
072:
073: import java.util.Arrays;
074:
075: import java.io.File;
076: import java.io.IOException;
077:
078: import java.util.Date;
079:
080: /**
081: * This is a class that will index some files on a local filesystem. This code
082: * was modified from a demo that comes with the lucene search engine.
083: *
084: * @author Lucene team
085: * @author <a href="mailto:ben@benlitchfield.com">Ben Litchfield</a>
086: *
087: * @version $Revision: 1.8 $
088: */
089: public class IndexFiles {
090: private boolean deleting = false; // true during deletion pass
091: private IndexReader reader; // existing index
092: private IndexWriter writer; // new index being built
093: private TermEnum uidIter; // document id iterator
094:
095: /**
096: * This is the main entry point for the indexer.
097: *
098: * @param argv The command line arguments.
099: */
100: public static void main(String[] argv) {
101:
102: String index = "index";
103: boolean create = false;
104: File root = null;
105:
106: String usage = "org.pdfbox.searchengine.lucene.IndexFiles [-create] [-index <index>] <root_directory>";
107:
108: if (argv.length == 0) {
109: System.err.println("Usage: " + usage);
110: return;
111: }
112:
113: for (int i = 0; i < argv.length; i++) {
114: if (argv[i].equals("-index")) { // parse -index option
115: index = argv[++i];
116: } else if (argv[i].equals("-create")) { // parse -create option
117: create = true;
118: } else if (i != argv.length - 1) {
119: System.err.println("Usage: " + usage);
120: return;
121: } else {
122: System.out.println("root=" + argv[i]);
123: root = new File(argv[i]);
124: }
125: }
126: IndexFiles indexer = new IndexFiles();
127: indexer.index(root, create, index);
128: }
129:
130: /**
131: * This will index a directory.
132: *
133: * @param root The root directory to start indexing.
134: * @param create Should we create a new index?
135: * @param index The name of the index.
136: */
137: public void index(File root, boolean create, String index) {
138:
139: try {
140: Date start = new Date();
141:
142: writer = new IndexWriter(index, new StandardAnalyzer(),
143: create);
144:
145: if (!create) { // delete stale docs
146: deleting = true;
147: indexDocs(root, index, create);
148: }
149:
150: indexDocs(root, index, create); // add new docs
151:
152: System.out.println("Optimizing index...");
153: writer.optimize();
154: writer.close();
155:
156: Date end = new Date();
157:
158: System.out.print(end.getTime() - start.getTime());
159: System.out.println(" total milliseconds");
160:
161: } catch (Exception e) {
162: e.printStackTrace();
163: }
164: }
165:
166: /**
167: * Walk directory hierarchy in uid order, while keeping uid iterator from
168: * existing index in sync. Mismatches indicate one of: (a) old documents to
169: * be deleted; (b) unchanged documents, to be left alone; or (c) new
170: * documents, to be indexed.
171: *
172: * @param file The directory to index.
173: * @param index The index to add the file to.
174: * @param create A flag telling if we should create the index.
175: *
176: * @throws Exception If there is any error indexing the directory.
177: */
178: private void indexDocs(File file, String index, boolean create)
179: throws Exception {
180: if (!create) { // incrementally update
181:
182: reader = IndexReader.open(index); // open existing index
183: uidIter = reader.terms(new Term("uid", "")); // init uid iterator
184:
185: indexDocs(file);
186:
187: if (deleting) { // delete rest of stale docs
188: while (uidIter.term() != null
189: && uidIter.term().field().equals("uid")) {
190: System.out.println("deleting "
191: + HTMLDocument.uid2url(uidIter.term()
192: .text()));
193: reader.deleteDocuments(uidIter.term());
194: uidIter.next();
195: }
196: deleting = false;
197: }
198:
199: uidIter.close(); // close uid iterator
200: reader.close(); // close existing index
201:
202: } else {
203: indexDocs(file);
204: }
205: }
206:
207: private void indexDocs(File file) throws Exception {
208: if (file.isDirectory()) { // if a directory
209: String[] files = file.list(); // list its files
210: Arrays.sort(files); // sort the files
211: for (int i = 0; i < files.length; i++) // recursively index them
212: {
213: indexDocs(new File(file, files[i]));
214: }
215: } else {
216: if (uidIter != null) {
217: String uid = HTMLDocument.uid(file); // construct uid for doc
218:
219: while (uidIter.term() != null
220: && uidIter.term().field().equals("uid")
221: && uidIter.term().text().compareTo(uid) < 0) {
222: if (deleting) { // delete stale docs
223: System.out.println("deleting "
224: + HTMLDocument.uid2url(uidIter.term()
225: .text()));
226: reader.deleteDocuments(uidIter.term());
227: }
228: uidIter.next();
229: }
230: if (uidIter.term() != null
231: && uidIter.term().field().equals("uid")
232: && uidIter.term().text().compareTo(uid) == 0) {
233: System.out.println("Next uid=" + uidIter);
234: uidIter.next(); // keep matching docs
235: }
236: } else {
237: try {
238: addDocument(file);
239: } catch (IOException e) {
240: //catch exception and move onto the next document
241: System.out.println(e.getMessage());
242: }
243: }
244: }
245: }
246:
247: private void addDocument(File file) throws IOException,
248: InterruptedException {
249: String path = file.getName().toUpperCase();
250: Document doc = null;
251: //Gee, this would be a great place for a command pattern
252: if (path.endsWith(".HTML") || // index .html files
253: path.endsWith(".HTM") || // index .htm files
254: path.endsWith(".TXT")) {
255: System.out.println("Indexing Text document: " + file);
256: doc = HTMLDocument.Document(file);
257: } else if (path.endsWith(".PDF")) {
258: System.out.println("Indexing PDF document: " + file);
259: doc = LucenePDFDocument.getDocument(file);
260: } else {
261: System.out.println("Skipping " + file);
262: }
263:
264: if (doc != null) {
265: writer.addDocument(doc);
266: }
267: }
268: }
|