001: /*
002: * Copyright 2003-2004 Michael Franken, Zilverline.
003: *
004: * The contents of this file, or the files included with this file, are subject to
005: * the current version of ZILVERLINE Collaborative Source License for the
006: * Zilverline Search Engine (the "License"); You may not use this file except in
007: * compliance with the License.
008: *
009: * You may obtain a copy of the License at
010: *
011: * http://www.zilverline.org.
012: *
013: * See the License for the rights, obligations and
014: * limitations governing use of the contents of the file.
015: *
016: * The Original and Upgraded Code is the Zilverline Search Engine. The developer of
017: * the Original and Upgraded Code is Michael Franken. Michael Franken owns the
018: * copyrights in the portions it created. All Rights Reserved.
019: *
020: */
021:
022: package org.zilverline.core;
023:
import java.io.File;
import java.util.Iterator;
import java.util.Locale;
import java.util.Map;
import java.util.TreeMap;

import javax.activation.MimetypesFileTypeMap;

import net.sf.jmimemagic.Magic;
import net.sf.jmimemagic.MagicException;
import net.sf.jmimemagic.MagicMatch;
import net.sf.jmimemagic.MagicMatchNotFoundException;
import net.sf.jmimemagic.MagicParseException;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.springframework.util.StringUtils;
import org.zilverline.extractors.ExcelExtractor;
import org.zilverline.extractors.FileInfoExtractor;
import org.zilverline.extractors.HTMLExtractor;
import org.zilverline.extractors.PDFExtractor;
import org.zilverline.extractors.PowerPointExtractor;
import org.zilverline.extractors.RTFExtractor;
import org.zilverline.extractors.TextExtractor;
import org.zilverline.extractors.WordExtractor;
import org.zilverline.util.FileUtils;
049:
050: /**
051: * Factory for creating Extractors based on file's extension.
052: *
053: * @author Michael Franken
054: * @version $Revision: 1.19 $
055: *
056: * @see Extractor
057: */
058: public final class ExtractorFactory {
059: /** logger for Commons logging. */
060: private static Log log = LogFactory.getLog(ExtractorFactory.class);
061:
062: /** Map holding mappings from file extension to extractor classname. */
063: private Map mappings = new TreeMap();
064:
065: /** Map holding mappings from mime types to extractor classname. */
066: private Map mimeMappings = new TreeMap();
067:
068: /** The Factory ignores case by default. */
069: private boolean caseSensitive = false;
070:
071: /** The Factory does not store file info for unknown formats by default. */
072: private boolean defaultFileinfo = false;
073:
074: /**
075: * Create a factory with defaults set with the extractors provided by Zilverline.
076: *
077: */
078: public ExtractorFactory() {
079: mappings.put("pdf", "org.zilverline.extractors.PDFExtractor");
080: mappings.put("doc", "org.zilverline.extractors.WordExtractor");
081: mappings.put("rtf", "org.zilverline.extractors.RTFExtractor");
082: mappings.put("html", "org.zilverline.extractors.HTMLExtractor");
083: mappings.put("htm", "org.zilverline.extractors.HTMLExtractor");
084: mappings.put("txt", "org.zilverline.extractors.TextExtractor");
085: mappings.put("xls", "org.zilverline.extractors.ExcelExtractor");
086: mappings.put("ppt",
087: "org.zilverline.extractors.PowerPointExtractor");
088:
089: addMimeMappings();
090: }
091:
092: private void addMimeMappings() {
093: mimeMappings.put("application/pdf",
094: "org.zilverline.extractors.PDFExtractor");
095: mimeMappings.put("*.pdf",
096: "org.zilverline.extractors.PDFExtractor");
097: mimeMappings.put("*.pdf/octet-stream",
098: "org.zilverline.extractors.PDFExtractor");
099: mimeMappings.put("application/msword",
100: "org.zilverline.extractors.WordExtractor");
101: mimeMappings.put("application/rtf",
102: "org.zilverline.extractors.RTFExtractor");
103: mimeMappings.put("text/html",
104: "org.zilverline.extractors.HTMLExtractor");
105: mimeMappings.put("text/plain",
106: "org.zilverline.extractors.TextExtractor");
107: mimeMappings.put("application/vnd.ms-excel",
108: "org.zilverline.extractors.ExcelExtractor");
109: mimeMappings.put("application/vnd.ms-powerpoint",
110: "org.zilverline.extractors.PowerPointExtractor");
111: }
112:
113: /**
114: * Returns whether a given File can be extracted based on its extension.
115: *
116: * @param f File that needs an Extractor
117: *
118: * @return Extractor for File, or null if mapping is unknown
119: */
120: public boolean canExtract(final File f) {
121: log.debug("Can we extract: " + f.getName() + "?");
122:
123: String extension = FileUtils.getExtension(f);
124:
125: if (!caseSensitive) {
126: extension = extension.toLowerCase();
127: }
128:
129: log.debug("" + mappings.containsKey(extension));
130:
131: return mappings.containsKey(extension);
132: }
133:
134: /**
135: * Returns whether a MIME-type can be extracted.
136: *
137: * @param type MIME-type that needs an Extractor
138: *
139: * @return Extractor for File, or null if mapping is unknown
140: */
141: public boolean canExtract(final String type) {
142: log.debug("Can we extract: " + type + "?");
143: String theType = type;
144: if (!caseSensitive) {
145: theType = type.toLowerCase();
146: }
147: theType = theType.split(";")[0];
148: boolean canExtract = mimeMappings.containsKey(theType);
149: log.debug("" + canExtract);
150: return canExtract;
151: }
152:
153: /**
154: * Returns an Extractor for a given File, based on its extension. A new Extractor object is created everytime, preventing any
155: * threadsafety issues
156: *
157: * @param f File that needs an Extractor
158: *
159: * @return Extractor for File, or null if mapping is unknown
160: */
161: public Extractor createExtractor(final File f) {
162: log.debug("returning Extractor for: " + f.getName());
163:
164: String extension = FileUtils.getExtension(f);
165:
166: if (!caseSensitive) {
167: extension = extension.toLowerCase();
168: }
169:
170: String className = (String) mappings.get(extension);
171: // if we found nothing return the FileInfoExtractor if that's the default
172: if (!StringUtils.hasText(className) && defaultFileinfo)
173: className = "org.zilverline.extractors.FileInfoExtractor";
174:
175: try {
176: if (className != null) {
177: Class c = Class.forName(className);
178:
179: if (c != null) {
180: log.debug("Returning Extractor: " + className);
181:
182: return (Extractor) c.newInstance();
183: }
184: }
185: } catch (InstantiationException e1) {
186: log.debug("Can not initiate Extractor '" + className
187: + "' for " + f.getName(), e1);
188: } catch (IllegalAccessException e1) {
189: log.debug("Can not access Extractor " + className
190: + "' for " + f.getName(), e1);
191: } catch (ClassNotFoundException e) {
192: log.debug("Class not found: " + className + "' for "
193: + f.getName(), e);
194: }
195:
196: log.debug("Unknown format: " + f.getName());
197:
198: return null;
199: }
200:
201: /**
202: * Get the MIME-type of a given file.
203: *
204: * @param f the File
205: * @return the MIME-type of String
206: */
207: public static String getMimeType(final File f) {
208: String type = new MimetypesFileTypeMap().getContentType(f);
209: if ("application/octet-stream".equalsIgnoreCase(type)) {
210: try {
211: Magic parser = new Magic();
212: // getMagicMatch accepts Files or byte[],
213: // which is nice if you want to test streams
214: MagicMatch match = parser.getMagicMatch(f);
215: return match.getMimeType();
216: } catch (MagicParseException e) {
217: log.warn("Can't parse " + f.getName(), e);
218: } catch (MagicMatchNotFoundException e) {
219: log.warn("Can't find type for " + f.getName(), e);
220: } catch (MagicException e) {
221: log.warn("Can't find type for " + f.getName(), e);
222: }
223: }
224: return type;
225: }
226:
227: /**
228: * Returns an Extractor for a given MIME-type. A new Extractor object is created everytime, preventing any threadsafety issues
229: *
230: * @param type File that needs an Extractor
231: *
232: * @return Extractor for MIME-type, or null if mapping is unknown
233: */
234: public Extractor createExtractor(final String type) {
235: log.debug("returning Extractor for: " + type);
236:
237: String theType = type;
238:
239: if (!caseSensitive) {
240: theType = theType.toLowerCase();
241: }
242: theType = theType.split(";")[0];
243:
244: String className = (String) mimeMappings.get(theType);
245:
246: try {
247: if (className != null) {
248: Class c = Class.forName(className);
249:
250: if (c != null) {
251: log.debug("Returning Extractor: " + className);
252:
253: return (Extractor) c.newInstance();
254: }
255: }
256: } catch (InstantiationException e1) {
257: log.debug("Can not initiate Extractor '" + className
258: + "' for " + theType, e1);
259: } catch (IllegalAccessException e1) {
260: log.debug("Can not access Extractor " + className
261: + "' for " + theType, e1);
262: } catch (ClassNotFoundException e) {
263: log.debug("Class not found: " + className + "' for "
264: + theType, e);
265: }
266:
267: log.warn("Unknown format: " + theType);
268:
269: return null;
270: }
271:
272: /**
273: * Set mappings from a Map object. The mappings are file extensions with commands as values. For instance
274: * 'pdf=org.zilverline.core.PDFExtractor'.
275: *
276: * @param props properties as a Map with extension as key and command as value
277: */
278: public void setMappings(final Map props) {
279: mappings.clear();
280:
281: if (caseSensitive) {
282: // copy as-is
283: mappings.putAll(props);
284: } else {
285: // convert the keys to lowercase
286: Iterator iter = props.entrySet().iterator();
287:
288: while (iter.hasNext()) {
289: Map.Entry element = (Map.Entry) iter.next();
290:
291: mappings.put(((String) element.getKey()).toLowerCase(),
292: element.getValue());
293: }
294: }
295:
296: log.debug("Map now is: " + mappings);
297: }
298:
299: /**
300: * Check whether the Factory ignores case or not.
301: *
302: * @return value indicating case sensitivity
303: */
304: public boolean isCaseSensitive() {
305: return caseSensitive;
306: }
307:
308: /**
309: * Sets whether the Factory ignores case or not.
310: *
311: * @param b indicates whether to handle mappings casesensitively
312: */
313: public void setCaseSensitive(final boolean b) {
314: caseSensitive = b;
315: }
316:
317: /**
318: * Get the mappings for the Factory.
319: *
320: * @return the mappings
321: */
322: public Map getMappings() {
323: return mappings;
324: }
325:
326: /**
327: * Find all Extractors on the classpath. This is an expensive operation, use with care.
328: *
329: * @return array of names of found Extractors
330: */
331: public static String[] findExtractorsOnClasspath() {
332: log.debug("Known Extractors on classpath");
333: String[] extractorNames = null;
334: Class[] extractors = { FileInfoExtractor.class,
335: PDFExtractor.class, WordExtractor.class,
336: RTFExtractor.class, HTMLExtractor.class,
337: TextExtractor.class, ExcelExtractor.class,
338: PowerPointExtractor.class };
339: extractorNames = new String[extractors.length];
340: for (int i = 0; i < extractors.length; i++) {
341: extractorNames[i] = extractors[i].getName();
342: log.debug("Extractor: " + extractors[i].getName());
343: }
344: return extractorNames;
345: }
346:
347: public Map getMimeMappings() {
348: return mimeMappings;
349: }
350:
351: public void setMimeMappings(Map mimeMappings) {
352: this .mimeMappings = mimeMappings;
353: }
354:
355: /**
356: * @return the defaultFileinfo
357: */
358: public boolean isDefaultFileinfo() {
359: return defaultFileinfo;
360: }
361:
362: /**
363: * @param defaultFileinfo the defaultFileinfo to set
364: */
365: public void setDefaultFileinfo(boolean defaultFileinfo) {
366: this.defaultFileinfo = defaultFileinfo;
367: }
368:
369: }
|