Source Code Cross Referenced for IndexFiles.java in » PDF » PDFBox-0.7.3 » org » pdfbox » searchengine » lucene » Java Source Code / Java Documentation | Java Source Code and Java Documentation

Java Source Code / Java Documentation

1.	6.0 JDK Core
2.	6.0 JDK Modules
3.	6.0 JDK Modules com.sun
4.	6.0 JDK Modules com.sun.java
5.	6.0 JDK Modules sun
6.	6.0 JDK Platform
7.	Ajax
8.	Apache Harmony Java SE
9.	Aspect oriented
10.	Authentication Authorization
11.	Blogger System
12.	Build
13.	Byte Code
14.	Cache
15.	Chart
16.	Chat
17.	Code Analyzer
18.	Collaboration
19.	Content Management System
20.	Database Client
21.	Database DBMS
22.	Database JDBC Connection Pool
23.	Database ORM
24.	Development
25.	EJB Server geronimo
26.	EJB Server GlassFish
27.	EJB Server JBoss 4.2.1
28.	EJB Server resin 3.1.5
29.	ERP CRM Financial
30.	ESB
31.	Forum
32.	GIS
33.	Graphic Library
34.	Groupware
35.	HTML Parser
36.	IDE
37.	IDE Eclipse
38.	IDE Netbeans
39.	Installer
40.	Internationalization Localization
41.	Inversion of Control
42.	Issue Tracking
43.	J2EE
44.	JBoss
45.	JMS
46.	JMX
47.	Library
48.	Mail Clients
49.	Net
50.	Parser
51.	PDF
52.	Portal
53.	Profiler
54.	Project Management
55.	Report
56.	RSS RDF
57.	Rule Engine
58.	Science
59.	Scripting
60.	Search Engine
61.	Security
62.	Servlet Container
63.	Source Control
64.	Swing Library
65.	Template Engine
66.	Test Coverage
67.	Testing
68.	UML
69.	Web Crawler
70.	Web Framework
71.	Web Mail
72.	Web Server
73.	Web Services
74.	Web Services apache cxf 2.0.1
75.	Web Services AXIS2
76.	Wiki Engine
77.	Workflow Engines
78.	XML
79.	XML UI

Java

Java Tutorial

Illustrator Tutorials

GIMP Tutorials

C# / C Sharp

C# / CSharp Tutorial

C# / CSharp Open Source

SQL Server / T-SQL Tutorial

Oracle PL / SQL

Oracle PL/SQL Tutorial

Flash / Flex / ActionScript

VBA / Excel / Access / Word

XML

XML Tutorial

Microsoft Office PowerPoint 2007 Tutorial

Microsoft Office Excel 2007 Tutorial

Microsoft Office Word 2007 Tutorial

Java Source Code / Java Documentation » PDF » PDFBox 0.7.3 » org.pdfbox.searchengine.lucene

Source Cross Referenced Class Diagram Java Document (Java Doc)

001:        package org.pdfbox.searchengine.lucene;
002:
003:        /*
004:         * This source was originally written as an example for the lucene project.
005:         * It has been modified to use PDFBox as a  lucene document creator.
006:         * -Ben Litchfield
007:         *
008:         *====================================================================
009:         * The Apache Software License, Version 1.1
010:         *
011:         * Copyright (c) 2001 The Apache Software Foundation.  All rights
012:         * reserved.
013:         *
014:         * Redistribution and use in source and binary forms, with or without
015:         * modification, are permitted provided that the following conditions
016:         * are met:
017:         *
018:         * 1. Redistributions of source code must retain the above copyright
019:         *    notice, this list of conditions and the following disclaimer.
020:         *
021:         * 2. Redistributions in binary form must reproduce the above copyright
022:         *    notice, this list of conditions and the following disclaimer in
023:         *    the documentation and/or other materials provided with the
024:         *    distribution.
025:         *
026:         * 3. The end-user documentation included with the redistribution,
027:         *    if any, must include the following acknowledgment:
028:         *       "This product includes software developed by the
029:         *        Apache Software Foundation (http://www.apache.org/)."
030:         *    Alternately, this acknowledgment may appear in the software itself,
031:         *    if and wherever such third-party acknowledgments normally appear.
032:         *
033:         * 4. The names "Apache" and "Apache Software Foundation" and
034:         *    "Apache Lucene" must not be used to endorse or promote products
035:         *    derived from this software without prior written permission. For
036:         *    written permission, please contact apache@apache.org.
037:         *
038:         * 5. Products derived from this software may not be called "Apache",
039:         *    "Apache Lucene", nor may "Apache" appear in their name, without
040:         *    prior written permission of the Apache Software Foundation.
041:         *
042:         * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
043:         * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
044:         * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
045:         * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
046:         * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
047:         * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
048:         * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
049:         * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
050:         * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
051:         * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
052:         * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
053:         * SUCH DAMAGE.
054:         * ====================================================================
055:         *
056:         * This software consists of voluntary contributions made by many
057:         * individuals on behalf of the Apache Software Foundation.  For more
058:         * information on the Apache Software Foundation, please see
059:         * <http://www.apache.org/>.
060:         */
061:
062:        import org.apache.lucene.analysis.standard.StandardAnalyzer;
063:
064:        import org.apache.lucene.demo.HTMLDocument;
065:
066:        import org.apache.lucene.document.Document;
067:
068:        import org.apache.lucene.index.IndexReader;
069:        import org.apache.lucene.index.IndexWriter;
070:        import org.apache.lucene.index.Term;
071:        import org.apache.lucene.index.TermEnum;
072:
073:        import java.util.Arrays;
074:
075:        import java.io.File;
076:        import java.io.IOException;
077:
078:        import java.util.Date;
079:
080:        /**
081:         * This is a class that will index some files on a local filesystem.  This code
082:         * was modified from a demo that comes with the lucene search engine.
083:         *
084:         * @author Lucene team
085:         * @author <a href="mailto:ben@benlitchfield.com">Ben Litchfield</a>
086:         *
087:         * @version $Revision: 1.8 $
088:         */
089:        public class IndexFiles {
090:            private boolean deleting = false; // true during deletion pass
091:            private IndexReader reader; // existing index
092:            private IndexWriter writer; // new index being built
093:            private TermEnum uidIter; // document id iterator
094:
095:            /**
096:             * This is the main entry point for the indexer.
097:             *
098:             * @param argv The command line arguments.
099:             */
100:            public static void main(String[] argv) {
101:
102:                String index = "index";
103:                boolean create = false;
104:                File root = null;
105:
106:                String usage = "org.pdfbox.searchengine.lucene.IndexFiles [-create] [-index <index>] <root_directory>";
107:
108:                if (argv.length == 0) {
109:                    System.err.println("Usage: " + usage);
110:                    return;
111:                }
112:
113:                for (int i = 0; i < argv.length; i++) {
114:                    if (argv[i].equals("-index")) { // parse -index option
115:                        index = argv[++i];
116:                    } else if (argv[i].equals("-create")) { // parse -create option
117:                        create = true;
118:                    } else if (i != argv.length - 1) {
119:                        System.err.println("Usage: " + usage);
120:                        return;
121:                    } else {
122:                        System.out.println("root=" + argv[i]);
123:                        root = new File(argv[i]);
124:                    }
125:                }
126:                IndexFiles indexer = new IndexFiles();
127:                indexer.index(root, create, index);
128:            }
129:
130:            /**
131:             * This will index a directory.
132:             *
133:             * @param root The root directory to start indexing.
134:             * @param create Should we create a new index?
135:             * @param index The name of the index.
136:             */
137:            public void index(File root, boolean create, String index) {
138:
139:                try {
140:                    Date start = new Date();
141:
142:                    writer = new IndexWriter(index, new StandardAnalyzer(),
143:                            create);
144:
145:                    if (!create) { // delete stale docs
146:                        deleting = true;
147:                        indexDocs(root, index, create);
148:                    }
149:
150:                    indexDocs(root, index, create); // add new docs
151:
152:                    System.out.println("Optimizing index...");
153:                    writer.optimize();
154:                    writer.close();
155:
156:                    Date end = new Date();
157:
158:                    System.out.print(end.getTime() - start.getTime());
159:                    System.out.println(" total milliseconds");
160:
161:                } catch (Exception e) {
162:                    e.printStackTrace();
163:                }
164:            }
165:
166:            /**
167:             * Walk directory hierarchy in uid order, while keeping uid iterator from
168:             * existing index in sync.  Mismatches indicate one of: (a) old documents to
169:             * be deleted; (b) unchanged documents, to be left alone; or (c) new
170:             * documents, to be indexed.
171:             *
172:             * @param file The directory to index.
173:             * @param index The index to add the file to.
174:             * @param create A flag telling if we should create the index.
175:             *
176:             * @throws Exception If there is any error indexing the directory.
177:             */
178:            private void indexDocs(File file, String index, boolean create)
179:                    throws Exception {
180:                if (!create) { // incrementally update
181:
182:                    reader = IndexReader.open(index); // open existing index
183:                    uidIter = reader.terms(new Term("uid", "")); // init uid iterator
184:
185:                    indexDocs(file);
186:
187:                    if (deleting) { // delete rest of stale docs
188:                        while (uidIter.term() != null
189:                                && uidIter.term().field().equals("uid")) {
190:                            System.out.println("deleting "
191:                                    + HTMLDocument.uid2url(uidIter.term()
192:                                            .text()));
193:                            reader.deleteDocuments(uidIter.term());
194:                            uidIter.next();
195:                        }
196:                        deleting = false;
197:                    }
198:
199:                    uidIter.close(); // close uid iterator
200:                    reader.close(); // close existing index
201:
202:                } else {
203:                    indexDocs(file);
204:                }
205:            }
206:
207:            private void indexDocs(File file) throws Exception {
208:                if (file.isDirectory()) { // if a directory
209:                    String[] files = file.list(); // list its files
210:                    Arrays.sort(files); // sort the files
211:                    for (int i = 0; i < files.length; i++) // recursively index them
212:                    {
213:                        indexDocs(new File(file, files[i]));
214:                    }
215:                } else {
216:                    if (uidIter != null) {
217:                        String uid = HTMLDocument.uid(file); // construct uid for doc
218:
219:                        while (uidIter.term() != null
220:                                && uidIter.term().field().equals("uid")
221:                                && uidIter.term().text().compareTo(uid) < 0) {
222:                            if (deleting) { // delete stale docs
223:                                System.out.println("deleting "
224:                                        + HTMLDocument.uid2url(uidIter.term()
225:                                                .text()));
226:                                reader.deleteDocuments(uidIter.term());
227:                            }
228:                            uidIter.next();
229:                        }
230:                        if (uidIter.term() != null
231:                                && uidIter.term().field().equals("uid")
232:                                && uidIter.term().text().compareTo(uid) == 0) {
233:                            System.out.println("Next uid=" + uidIter);
234:                            uidIter.next(); // keep matching docs
235:                        }
236:                    } else {
237:                        try {
238:                            addDocument(file);
239:                        } catch (IOException e) {
240:                            //catch exception and move onto the next document
241:                            System.out.println(e.getMessage());
242:                        }
243:                    }
244:                }
245:            }
246:
247:            private void addDocument(File file) throws IOException,
248:                    InterruptedException {
249:                String path = file.getName().toUpperCase();
250:                Document doc = null;
251:                //Gee, this would be a great place for a command pattern
252:                if (path.endsWith(".HTML") || // index .html files
253:                        path.endsWith(".HTM") || // index .htm files
254:                        path.endsWith(".TXT")) {
255:                    System.out.println("Indexing Text document: " + file);
256:                    doc = HTMLDocument.Document(file);
257:                } else if (path.endsWith(".PDF")) {
258:                    System.out.println("Indexing PDF document: " + file);
259:                    doc = LucenePDFDocument.getDocument(file);
260:                } else {
261:                    System.out.println("Skipping " + file);
262:                }
263:
264:                if (doc != null) {
265:                    writer.addDocument(doc);
266:                }
267:            }
268:        }

www.java2java.com | Contact Us

All other trademarks are property of their respective owners.