01: /*
02: * Copyright 2003-2004 Michael Franken, Zilverline.
03: *
04: * The contents of this file, or the files included with this file, are subject to
05: * the current version of ZILVERLINE Collaborative Source License for the
06: * Zilverline Search Engine (the "License"); You may not use this file except in
07: * compliance with the License.
08: *
09: * You may obtain a copy of the License at
10: *
11: * http://www.zilverline.org.
12: *
13: * See the License for the rights, obligations and
14: * limitations governing use of the contents of the file.
15: *
16: * The Original and Upgraded Code is the Zilverline Search Engine. The developer of
17: * the Original and Upgraded Code is Michael Franken. Michael Franken owns the
18: * copyrights in the portions it created. All Rights Reserved.
19: *
20: */
21:
22: package org.zilverline.core;
23:
24: import java.io.File;
25: import java.io.InputStream;
26:
27: /**
28: * This interface defines the common type for the family of extractors. Extractors extract all relevant info from a File, and return the info in
29: * a ParsedFileInfo Object. These are mappings used by zilverline to plugin extractors based on file extensions. The plugin is a
30: * java class that implements the Extractor interface and needs to be available on the classpath.
31: *
32: * <p>
33: * So if for example you specify the mapping "pdf => org.zilverline.extractors.PDFExtractor" make sure
34: * org.zilverline.extractors.PDFExtractor is available, otherwise an Exception will be raised and handled by zilverline.
35: * </p>
36: *
37: * <p>
38: * Right now you can use the TEXT, HTML, WORD, EXCEL, POWERPOINT and PDF extractors, and define the extensions you want to map. You
39: * can not use wildcards, but you can define multiple extensions for one Extractor. By default the extensions are treated case
40: * insensitively, but you can change that. Note that you can use an empty extension as well.
41: * </p>
42: *
43: * @author Michael Franken
44: * @version $Revision: 1.5 $
45: *
46: * @see org.zilverline.core.ParsedFileInfo
47: */
48: public interface Extractor {
49: /**
50: * Extracts all relevant info of the given file and returns it as a ParsedFileInfo object.
51: *
52: * @param f the File to extract content from
53: *
54: * @return the ParsedFileInfo object containing relevant info of the provided file
55: */
56: ParsedFileInfo extractInfo(final File f);
57:
58: /**
59: * Extracts the content from the given InputStream and returns it as a String.
60: * @param is the InputStream to extract content from
61: */
62: String getContent(final InputStream is);
63:
64: // /**
65: // * Extract the content from the given String.
66: // *
67: // */
68: // String getContent(final String s);
69:
70: }
|