001: /**
002: * Copyright (c) 2003-2006, www.pdfbox.org
003: * All rights reserved.
004: *
005: * Redistribution and use in source and binary forms, with or without
006: * modification, are permitted provided that the following conditions are met:
007: *
008: * 1. Redistributions of source code must retain the above copyright notice,
009: * this list of conditions and the following disclaimer.
010: * 2. Redistributions in binary form must reproduce the above copyright notice,
011: * this list of conditions and the following disclaimer in the documentation
012: * and/or other materials provided with the distribution.
013: * 3. Neither the name of pdfbox; nor the names of its
014: * contributors may be used to endorse or promote products derived from this
015: * software without specific prior written permission.
016: *
017: * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
018: * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
019: * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
020: * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY
021: * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
022: * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
023: * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
024: * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
025: * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
026: * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
027: *
028: * http://www.pdfbox.org
029: *
030: */package org.pdfbox.searchengine.lucene;
031:
032: import java.io.File;
033: import java.io.FileInputStream;
034: import java.io.InputStream;
035: import java.io.IOException;
036: import java.io.Reader;
037: import java.io.StringReader;
038: import java.io.StringWriter;
039: import java.util.Calendar;
040:
041: import java.net.URL;
042: import java.net.URLConnection;
043:
044: import java.util.Date;
045:
046: import org.apache.lucene.document.DateTools;
047: import org.apache.lucene.document.Document;
048: import org.apache.lucene.document.Field;
049:
050: import org.pdfbox.pdmodel.PDDocument;
051: import org.pdfbox.pdmodel.PDDocumentInformation;
052:
053: import org.pdfbox.exceptions.CryptographyException;
054: import org.pdfbox.exceptions.InvalidPasswordException;
055:
056: import org.pdfbox.util.PDFTextStripper;
057:
058: /**
059: * This class is used to create a document for the lucene search engine.
060: * This should easily plug into the IndexHTML or IndexFiles that comes with
061: * the lucene project. This class will populate the following fields.
062: * <table>
063: * <tr>
064: * <th>Lucene Field Name</th>
065: * <th>Description</th>
066: * </tr>
067: * <tr>
068: * <td>path</td>
069: * <td>File system path if loaded from a file</td>
070: * </tr>
071: * <tr>
072: * <td>url</td>
073: * <td>URL to PDF document</td>
074: * </tr>
075: * <tr>
076: * <td>contents</td>
077: * <td>Entire contents of PDF document, indexed but not stored</td>
078: * </tr>
079: * <tr>
080: * <td>summary</td>
081: * <td>First 500 characters of content</td>
082: * </tr>
083: * <tr>
084: * <td>modified</td>
085: * <td>The modified date/time according to the url or path</td>
086: * </tr>
087: * <tr>
088: * <td>uid</td>
089: * <td>A unique identifier for the Lucene document.</td>
090: * </tr>
091: * <tr>
092: * <td>CreationDate</td>
093: * <td>From PDF meta-data if available</td>
094: * </tr>
095: * <tr>
096: * <td>Creator</td>
097: * <td>From PDF meta-data if available</td>
098: * </tr>
099: * <tr>
100: * <td>Keywords</td>
101: * <td>From PDF meta-data if available</td>
102: * </tr>
103: * <tr>
104: * <td>ModificationDate</td>
105: * <td>From PDF meta-data if available</td>
106: * </tr>
107: * <tr>
108: * <td>Producer</td>
109: * <td>From PDF meta-data if available</td>
110: * </tr>
111: * <tr>
112: * <td>Subject</td>
113: * <td>From PDF meta-data if available</td>
114: * </tr>
115: * <tr>
116: * <td>Trapped</td>
117: * <td>From PDF meta-data if available</td>
118: * </tr>
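* <tr>
* <td>Author</td>
* <td>From PDF meta-data if available</td>
* </tr>
* <tr>
* <td>Title</td>
* <td>From PDF meta-data if available</td>
* </tr>
* </table>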
120: *
121: * @author <a href="mailto:ben@benlitchfield.com">Ben Litchfield</a>
122: * @version $Revision: 1.22 $
123: */
124: public final class LucenePDFDocument {
125: private static final char FILE_SEPARATOR = System.getProperty(
126: "file.separator").charAt(0);
127:
128: // given caveat of increased search times when using
129: //MICROSECOND, only use SECOND by default
130: private DateTools.Resolution dateTimeResolution = DateTools.Resolution.SECOND;
131:
132: private PDFTextStripper stripper = null;
133:
134: /**
135: * Constructor.
136: */
137: public LucenePDFDocument() {
138: }
139:
140: /**
141: * Set the text stripper that will be used during extraction.
142: *
143: * @param aStripper The new pdf text stripper.
144: */
145: public void setTextStripper(PDFTextStripper aStripper) {
146: stripper = aStripper;
147: }
148:
149: /**
150: * Get the Lucene data time resolution.
151: *
152: * @return current date/time resolution
153: */
154: public DateTools.Resolution getDateTimeResolution() {
155: return dateTimeResolution;
156: }
157:
158: /**
159: * Set the Lucene data time resolution.
160: *
161: * @param resolution set new date/time resolution
162: */
163: public void setDateTimeResolution(DateTools.Resolution resolution) {
164: dateTimeResolution = resolution;
165: }
166:
167: //
168: // compatibility methods for lucene-1.9+
169: //
170: private String timeToString(long time) {
171: return DateTools.timeToString(time, dateTimeResolution);
172: }
173:
174: private void addKeywordField(Document document, String name,
175: String value) {
176: if (value != null) {
177: document.add(new Field(name, value, Field.Store.YES,
178: Field.Index.UN_TOKENIZED));
179: }
180: }
181:
182: private void addTextField(Document document, String name,
183: Reader value) {
184: if (value != null) {
185: document.add(new Field(name, value));
186: }
187: }
188:
189: private void addTextField(Document document, String name,
190: String value) {
191: if (value != null) {
192: document.add(new Field(name, value, Field.Store.YES,
193: Field.Index.TOKENIZED));
194: }
195: }
196:
197: private void addTextField(Document document, String name, Date value) {
198: if (value != null) {
199: addTextField(document, name, DateTools.dateToString(value,
200: dateTimeResolution));
201: }
202: }
203:
204: private void addTextField(Document document, String name,
205: Calendar value) {
206: if (value != null) {
207: addTextField(document, name, value.getTime());
208: }
209: }
210:
211: private static void addUnindexedField(Document document,
212: String name, String value) {
213: if (value != null) {
214: document.add(new Field(name, value, Field.Store.YES,
215: Field.Index.NO));
216: }
217: }
218:
219: private void addUnstoredKeywordField(Document document,
220: String name, String value) {
221: if (value != null) {
222: document.add(new Field(name, value, Field.Store.NO,
223: Field.Index.UN_TOKENIZED));
224: }
225: }
226:
227: /**
228: * Convert the PDF stream to a lucene document.
229: *
230: * @param is The input stream.
231: * @return The input stream converted to a lucene document.
232: * @throws IOException If there is an error converting the PDF.
233: */
234: public Document convertDocument(InputStream is) throws IOException {
235: Document document = new Document();
236: addContent(document, is, "<inputstream>");
237: return document;
238:
239: }
240:
241: /**
242: * This will take a reference to a PDF document and create a lucene document.
243: *
244: * @param file A reference to a PDF document.
245: * @return The converted lucene document.
246: *
247: * @throws IOException If there is an exception while converting the document.
248: */
249: public Document convertDocument(File file) throws IOException {
250: Document document = new Document();
251:
252: // Add the url as a field named "url". Use an UnIndexed field, so
253: // that the url is just stored with the document, but is not searchable.
254: addUnindexedField(document, "path", file.getPath());
255: addUnindexedField(document, "url", file.getPath().replace(
256: FILE_SEPARATOR, '/'));
257:
258: // Add the last modified date of the file a field named "modified". Use a
259: // Keyword field, so that it's searchable, but so that no attempt is made
260: // to tokenize the field into words.
261: addKeywordField(document, "modified", timeToString(file
262: .lastModified()));
263:
264: String uid = file.getPath().replace(FILE_SEPARATOR, '\u0000')
265: + "\u0000" + timeToString(file.lastModified());
266:
267: // Add the uid as a field, so that index can be incrementally maintained.
268: // This field is not stored with document, it is indexed, but it is not
269: // tokenized prior to indexing.
270: addUnstoredKeywordField(document, "uid", uid);
271:
272: FileInputStream input = null;
273: try {
274: input = new FileInputStream(file);
275: addContent(document, input, file.getPath());
276: } finally {
277: if (input != null) {
278: input.close();
279: }
280: }
281:
282: // return the document
283:
284: return document;
285: }
286:
287: /**
288: * Convert the document from a PDF to a lucene document.
289: *
290: * @param url A url to a PDF document.
291: * @return The PDF converted to a lucene document.
292: * @throws IOException If there is an error while converting the document.
293: */
294: public Document convertDocument(URL url) throws IOException {
295: Document document = new Document();
296: URLConnection connection = url.openConnection();
297: connection.connect();
298: // Add the url as a field named "url". Use an UnIndexed field, so
299: // that the url is just stored with the document, but is not searchable.
300: addUnindexedField(document, "url", url.toExternalForm());
301:
302: // Add the last modified date of the file a field named "modified". Use a
303: // Keyword field, so that it's searchable, but so that no attempt is made
304: // to tokenize the field into words.
305: addKeywordField(document, "modified", timeToString(connection
306: .getLastModified()));
307:
308: String uid = url.toExternalForm().replace(FILE_SEPARATOR,
309: '\u0000')
310: + "\u0000" + timeToString(connection.getLastModified());
311:
312: // Add the uid as a field, so that index can be incrementally maintained.
313: // This field is not stored with document, it is indexed, but it is not
314: // tokenized prior to indexing.
315: addUnstoredKeywordField(document, "uid", uid);
316:
317: InputStream input = null;
318: try {
319: input = connection.getInputStream();
320: addContent(document, input, url.toExternalForm());
321: } finally {
322: if (input != null) {
323: input.close();
324: }
325: }
326:
327: // return the document
328: return document;
329: }
330:
331: /**
332: * This will get a lucene document from a PDF file.
333: *
334: * @param is The stream to read the PDF from.
335: *
336: * @return The lucene document.
337: *
338: * @throws IOException If there is an error parsing or indexing the document.
339: */
340: public static Document getDocument(InputStream is)
341: throws IOException {
342: LucenePDFDocument converter = new LucenePDFDocument();
343: return converter.convertDocument(is);
344: }
345:
346: /**
347: * This will get a lucene document from a PDF file.
348: *
349: * @param file The file to get the document for.
350: *
351: * @return The lucene document.
352: *
353: * @throws IOException If there is an error parsing or indexing the document.
354: */
355: public static Document getDocument(File file) throws IOException {
356: LucenePDFDocument converter = new LucenePDFDocument();
357: return converter.convertDocument(file);
358: }
359:
360: /**
361: * This will get a lucene document from a PDF file.
362: *
363: * @param url The file to get the document for.
364: *
365: * @return The lucene document.
366: *
367: * @throws IOException If there is an error parsing or indexing the document.
368: */
369: public static Document getDocument(URL url) throws IOException {
370: LucenePDFDocument converter = new LucenePDFDocument();
371: return converter.convertDocument(url);
372: }
373:
374: /**
375: * This will add the contents to the lucene document.
376: *
377: * @param document The document to add the contents to.
378: * @param is The stream to get the contents from.
379: * @param documentLocation The location of the document, used just for debug messages.
380: *
381: * @throws IOException If there is an error parsing the document.
382: */
383: private void addContent(Document document, InputStream is,
384: String documentLocation) throws IOException {
385: PDDocument pdfDocument = null;
386: try {
387: pdfDocument = PDDocument.load(is);
388:
389: if (pdfDocument.isEncrypted()) {
390: //Just try using the default password and move on
391: pdfDocument.decrypt("");
392: }
393:
394: //create a writer where to append the text content.
395: StringWriter writer = new StringWriter();
396: if (stripper == null) {
397: stripper = new PDFTextStripper();
398: } else {
399: stripper.resetEngine();
400: }
401: stripper.writeText(pdfDocument, writer);
402:
403: // Note: the buffer to string operation is costless;
404: // the char array value of the writer buffer and the content string
405: // is shared as long as the buffer content is not modified, which will
406: // not occur here.
407: String contents = writer.getBuffer().toString();
408:
409: StringReader reader = new StringReader(contents);
410:
411: // Add the tag-stripped contents as a Reader-valued Text field so it will
412: // get tokenized and indexed.
413: addTextField(document, "contents", reader);
414:
415: PDDocumentInformation info = pdfDocument
416: .getDocumentInformation();
417: if (info != null) {
418: addTextField(document, "Author", info.getAuthor());
419: addTextField(document, "CreationDate", info
420: .getCreationDate());
421: addTextField(document, "Creator", info.getCreator());
422: addTextField(document, "Keywords", info.getKeywords());
423: addTextField(document, "ModificationDate", info
424: .getModificationDate());
425: addTextField(document, "Producer", info.getProducer());
426: addTextField(document, "Subject", info.getSubject());
427: addTextField(document, "Title", info.getTitle());
428: addTextField(document, "Trapped", info.getTrapped());
429: }
430: int summarySize = Math.min(contents.length(), 500);
431: String summary = contents.substring(0, summarySize);
432: // Add the summary as an UnIndexed field, so that it is stored and returned
433: // with hit documents for display.
434: addUnindexedField(document, "summary", summary);
435: } catch (CryptographyException e) {
436: throw new IOException("Error decrypting document("
437: + documentLocation + "): " + e);
438: } catch (InvalidPasswordException e) {
439: //they didn't suppply a password and the default of "" was wrong.
440: throw new IOException("Error: The document("
441: + documentLocation
442: + ") is encrypted and will not be indexed.");
443: } finally {
444: if (pdfDocument != null) {
445: pdfDocument.close();
446: }
447: }
448: }
449:
450: /**
451: * This will test creating a document.
452: *
453: * usage: java pdfparser.searchengine.lucene.LucenePDFDocument <pdf-document>
454: *
455: * @param args command line arguments.
456: *
457: * @throws IOException If there is an error.
458: */
459: public static void main(String[] args) throws IOException {
460: if (args.length != 1) {
461: String us = LucenePDFDocument.class.getName();
462: System.err.println("usage: java " + us + " <pdf-document>");
463: System.exit(1);
464: }
465: System.out
466: .println("Document=" + getDocument(new File(args[0])));
467: }
468: }
|