001: //mimeTypeParser.java
002: //------------------------
003: //part of YaCy
004: //(C) by Michael Peter Christen; mc@anomic.de
005: //first published on http://www.anomic.de
006: //Frankfurt, Germany, 2005
007: //
008: //this file is contributed by Martin Thelian
009: //last major change: 16.05.2005
010: //
011: //This program is free software; you can redistribute it and/or modify
012: //it under the terms of the GNU General Public License as published by
013: //the Free Software Foundation; either version 2 of the License, or
014: //(at your option) any later version.
015: //
016: //This program is distributed in the hope that it will be useful,
017: //but WITHOUT ANY WARRANTY; without even the implied warranty of
018: //MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
019: //GNU General Public License for more details.
020: //
021: //You should have received a copy of the GNU General Public License
022: //along with this program; if not, write to the Free Software
023: //Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
024: //
025: //Using this software in any meaning (reading, learning, copying, compiling,
026: //running) means that you agree that the Author(s) is (are) not responsible
027: //for cost, loss of data or any harm that may be caused directly or indirectly
028: //by usage of this softare or this documentation. The usage of this software
029: //is on your own risk. The installation and usage (starting/running) of this
030: //software may allow other people or application to access your computer and
031: //any attached devices and is highly dependent on the configuration of the
032: //software which must be done by the user of the software; the author(s) is
033: //(are) also not responsible for proper configuration and usage of the
034: //software, even if provoked by documentation provided together with
035: //the software.
036: //
037: //Any changes to this file according to the GPL as documented in the file
038: //gpl.txt aside this file in the shipment you received can be done to the
039: //lines that follows this copyright notice here, but changes must not be
040: //done inside the copyright notive above. A re-distribution must contain
041: //the intact and unchanged copyright notice.
042: //Contributions and changes to the program code must be marked as such.
043:
044: package de.anomic.plasma.parser.mimeType;
045:
046: import java.io.File;
047: import java.io.IOException;
048: import java.io.InputStream;
049: import java.util.Collection;
050: import java.util.Hashtable;
051:
052: import net.sf.jmimemagic.Magic;
053: import net.sf.jmimemagic.MagicMatch;
054: import net.sf.jmimemagic.MagicMatchNotFoundException;
055:
056: import org.apache.log4j.Level;
057: import org.apache.log4j.Logger;
058:
059: import de.anomic.plasma.plasmaParser;
060: import de.anomic.plasma.plasmaParserDocument;
061: import de.anomic.plasma.parser.AbstractParser;
062: import de.anomic.plasma.parser.Parser;
063: import de.anomic.plasma.parser.ParserException;
064: import de.anomic.server.serverFileUtils;
065: import de.anomic.yacy.yacyURL;
066:
067: public class mimeTypeParser extends AbstractParser implements Parser {
068:
069: /**
070: * a list of mime types that are supported by this parser class
071: * @see #getSupportedMimeTypes()
072: */
073: public static final Hashtable<String, String> SUPPORTED_MIME_TYPES = new Hashtable<String, String>();
074: static {
075: SUPPORTED_MIME_TYPES.put("text/xml", "xml");
076: SUPPORTED_MIME_TYPES.put("application/xml", "xml");
077: SUPPORTED_MIME_TYPES.put("application/x-xml", "xml");
078: SUPPORTED_MIME_TYPES.put("application/octet-stream", "");
079: SUPPORTED_MIME_TYPES.put("application/x-compress", "");
080: SUPPORTED_MIME_TYPES.put("application/x-compressed", "");
081: }
082:
083: /**
084: * a list of library names that are needed by this parser
085: * @see Parser#getLibxDependences()
086: */
087: private static final String[] LIBX_DEPENDENCIES = new String[] {
088: "commons-logging.jar", "jmimemagic-0.1.0.jar",
089: "jakarta-oro-2.0.7.jar", "log4j-1.2.9.jar", "xerces.jar" };
090:
091: /**
092: * Helping structure used to detect loops in the mimeType detection
093: * process
094: */
095: private static Hashtable<Thread, Integer> threadLoopDetection = new Hashtable<Thread, Integer>();
096:
097: public mimeTypeParser() {
098: super (LIBX_DEPENDENCIES);
099: this .parserName = "MimeType Parser";
100: }
101:
102: @SuppressWarnings("unchecked")
103: public String getMimeType(File sourceFile) {
104: String mimeType = null;
105:
106: try {
107: MagicMatch match = Magic.getMagicMatch(sourceFile, true);
108:
109: // if a match was found we can return the new mimeType
110: if (match != null) {
111: Collection<MagicMatch> subMatches = match
112: .getSubMatches();
113: if ((subMatches != null) && (!subMatches.isEmpty())) {
114: mimeType = subMatches.iterator().next()
115: .getMimeType();
116: } else {
117: mimeType = match.getMimeType();
118: }
119: return mimeType;
120: }
121: } catch (Exception e) {
122: /* ignore this */
123: }
124: return null;
125: }
126:
127: @SuppressWarnings("unchecked")
128: public plasmaParserDocument parse(yacyURL location,
129: String mimeType, String charset, File sourceFile)
130: throws ParserException, InterruptedException {
131:
132: String orgMimeType = mimeType;
133:
134: // determining the mime type of the file ...
135: try {
136: // adding current thread to loop detection list
137: Integer loopDepth = null;
138: if (threadLoopDetection.containsKey(Thread.currentThread())) {
139: loopDepth = threadLoopDetection.get(Thread
140: .currentThread());
141: } else {
142: loopDepth = new Integer(0);
143: }
144: if (loopDepth.intValue() > 5)
145: return null;
146: threadLoopDetection.put(Thread.currentThread(),
147: new Integer(loopDepth.intValue() + 1));
148:
149: // deactivating the logging for jMimeMagic
150: Logger jmimeMagicLogger = Logger
151: .getLogger("net.sf.jmimemagic");
152: jmimeMagicLogger.setLevel(Level.OFF);
153:
154: MagicMatch match = Magic.getMagicMatch(sourceFile, true,
155: false);
156:
157: // if a match was found we can return the new mimeType
158: if (match != null) {
159: Collection<MagicMatch> subMatches = match
160: .getSubMatches();
161: if ((subMatches != null) && (!subMatches.isEmpty())) {
162: mimeType = subMatches.iterator().next()
163: .getMimeType();
164: if ((mimeType == null) || (mimeType.length() == 0))
165: mimeType = match.getMimeType();
166: } else {
167: mimeType = match.getMimeType();
168: }
169:
170: // to avoid loops we have to test if the mimetype has changed ...
171: if (this .getSupportedMimeTypes().containsKey(mimeType))
172: throw new ParserException(
173: "Unable to detect mimetype of resource (1).",
174: location);
175: if (orgMimeType.equals(mimeType))
176: throw new ParserException(
177: "Unable to detect mimetype of resource (2).",
178: location);
179:
180: // check for interruption
181: checkInterruption();
182:
183: // parsing the content using the determined mimetype
184: plasmaParser theParser = new plasmaParser();
185: return theParser.parseSource(location, mimeType,
186: charset, sourceFile);
187: }
188: throw new ParserException(
189: "Unable to detect mimetype of resource (3).",
190: location);
191: } catch (MagicMatchNotFoundException e) {
192: throw new ParserException(
193: "Unable to detect mimetype of resource (4).",
194: location);
195: } catch (Exception e) {
196: if (e instanceof InterruptedException)
197: throw (InterruptedException) e;
198: if (e instanceof ParserException)
199: throw (ParserException) e;
200:
201: throw new ParserException(
202: "Unexpected error while detect mimetype of resource. "
203: + e.getMessage(), location);
204: } finally {
205: Integer loopDepth = threadLoopDetection.get(Thread
206: .currentThread());
207: if (loopDepth.intValue() <= 1) {
208: threadLoopDetection.remove(Thread.currentThread());
209: } else {
210: threadLoopDetection.put(Thread.currentThread(),
211: new Integer(loopDepth.intValue() - 1));
212: }
213: }
214: }
215:
216: public plasmaParserDocument parse(yacyURL location,
217: String mimeType, String charset, InputStream source)
218: throws ParserException, InterruptedException {
219: File dstFile = null;
220: try {
221: dstFile = File.createTempFile("mimeTypeParser", ".tmp");
222: serverFileUtils.copy(source, dstFile);
223: return parse(location, mimeType, charset, dstFile);
224: } catch (IOException e) {
225: throw new ParserException(
226: "Unexpected error while detect mimetype of resource. "
227: + e.getMessage(), location);
228: } finally {
229: if (dstFile != null) {
230: dstFile.delete();
231: }
232: }
233:
234: }
235:
236: public java.util.Hashtable<String, String> getSupportedMimeTypes() {
237: return mimeTypeParser.SUPPORTED_MIME_TYPES;
238: }
239:
240: public void reset() {
241: // Nothing todo here at the moment
242: super.reset();
243: }
244:
245: }
|