01: /* MimetypeUtils
02: *
03: * $Id: MimetypeUtils.java 3119 2005-02-17 20:39:21Z stack-sf $
04: *
05: * Created on Sep 22, 2004
06: *
07: * Copyright (C) 2004 Internet Archive.
08: *
09: * This file is part of the Heritrix web crawler (crawler.archive.org).
10: *
11: * Heritrix is free software; you can redistribute it and/or modify
12: * it under the terms of the GNU Lesser Public License as published by
13: * the Free Software Foundation; either version 2.1 of the License, or
14: * any later version.
15: *
16: * Heritrix is distributed in the hope that it will be useful,
17: * but WITHOUT ANY WARRANTY; without even the implied warranty of
18: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19: * GNU Lesser Public License for more details.
20: *
21: * You should have received a copy of the GNU Lesser Public License
22: * along with Heritrix; if not, write to the Free Software
23: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
24: */
25: package org.archive.util;
26:
27: import java.util.regex.Matcher;
28: import java.util.regex.Pattern;
29:
/**
 * Utilities for normalizing mimetype (Content-Type) strings.
 *
 * @author stack
 */
public class MimetypeUtils {
    /**
     * The 'no-type' content-type.
     *
     * Defined in the ARC file spec at
     * http://www.archive.org/web/researcher/ArcFileFormat.php.
     */
    public static final String NO_TYPE_MIMETYPE = "no-type";

    /**
     * Truncation regex.
     *
     * Group 1 captures the leading run of characters that are neither
     * whitespace nor one of the delimiters ';' and ','. The group must be
     * non-empty, so input that starts with whitespace or a delimiter does
     * not match at all.
     */
    final static Pattern TRUNCATION_REGEX = Pattern
        .compile("^([^\\s;,]+).*");

    /**
     * Truncate passed mimetype.
     *
     * Ensure no spaces. Strip encoding. Truncation required by
     * ARC files.
     *
     * <p>Truncate at delimiters [;, ].
     * Truncate multi-part content type header at ';'.
     * Apache httpclient collapses values of multiple instances of the
     * header into one comma-separated value, therefore truncate at ','.
     * Current ia_tools that work with arc files expect 5-column
     * space-separated meta-lines, therefore truncate at ' '.
     *
     * <p>Anything after the leading token is ignored, including line
     * terminators (e.g. a folded header value such as
     * <code>"multipart/form-data;\r\n boundary=x"</code> yields
     * <code>"multipart/form-data"</code>).
     *
     * @param contentType Raw content-type. May be null.
     *
     * @return The leading token of the passed content-type, or
     * {@link #NO_TYPE_MIMETYPE} if the passed value is null, empty, or
     * begins with whitespace or a delimiter.
     */
    public static String truncate(String contentType) {
        if (contentType == null) {
            return NO_TYPE_MIMETYPE;
        }

        Matcher matcher = TRUNCATION_REGEX.matcher(contentType);
        // Use lookingAt() (prefix match) rather than matches() (whole-input
        // match): '.' in the pattern does not match line terminators, so
        // matches() rejected any value containing a newline after the
        // leading token even though that token is perfectly usable.
        if (matcher.lookingAt()) {
            return matcher.group(1);
        }
        return NO_TYPE_MIMETYPE;
    }
}
|