001: //
002: // Informa -- RSS Library for Java
003: // Copyright (c) 2002 by Niko Schmuck
004: //
005: // Niko Schmuck
006: // http://sourceforge.net/projects/informa
007: // mailto:niko_schmuck@users.sourceforge.net
008: //
009: // This library is free software.
010: //
011: // You may redistribute it and/or modify it under the terms of the GNU
012: // Lesser General Public License as published by the Free Software Foundation.
013: //
014: // Version 2.1 of the license should be included with this distribution in
015: // the file LICENSE. If the license is not included with this distribution,
016: // you may find a copy at the FSF web site at 'www.gnu.org' or 'www.fsf.org',
017: // or you may write to the Free Software Foundation, 675 Mass Ave, Cambridge,
018: // MA 02139 USA.
019: //
020: // This library is distributed in the hope that it will be useful,
021: // but WITHOUT ANY WARRANTY; without even the implied warranty of
022: // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
023: // Lesser General Public License for more details.
024: //
025:
026: // $Id: FormatDetector.java,v 1.24 2005/06/17 13:48:22 spyromus Exp $
027:
028: package de.nava.informa.utils;
029:
030: import java.io.BufferedInputStream;
031: import java.io.InputStream;
032: import java.io.IOException;
033: import java.net.URL;
034:
035: import org.apache.commons.logging.Log;
036: import org.apache.commons.logging.LogFactory;
037:
038: import de.nava.informa.core.ChannelFormat;
039: import de.nava.informa.core.UnsupportedFormatException;
040:
041: /**
042: * Utility class for analysing the news channel syntax and mapping to
043: * known format to ease further processing.
044: *
045: * @author Niko Schmuck (niko@nava.de)
046: */
047: public final class FormatDetector {
048:
049: private static Log logger = LogFactory.getLog(FormatDetector.class);
050:
051: private static final int NR_FIRST_BYTES = 2048;
052:
053: /**
054: * Guess the format of the specified news channel. For performance
055: * reason it is wise to minimize the number of format guesses.
056: *
057: * @param url a url to the news channel.
058: * @return The news channel synatx format, currently only RSS 0.91
059: * ({@link de.nava.informa.core.ChannelFormat#RSS_0_91})
060: * and RSS/RDF 1.0
061: * ({@link de.nava.informa.core.ChannelFormat#RSS_1_0})
062: * are recognized.
063: * @throws UnsupportedFormatException in case a news channel format
064: * could not be guessed.
065: * @throws IOException if the given url cannot be read in.
066: */
067: public static ChannelFormat getFormat(URL url) throws IOException,
068: UnsupportedFormatException {
069:
070: logger.info("Trying to retrieve stream from " + url);
071: BufferedInputStream in = new BufferedInputStream(url
072: .openStream(), NR_FIRST_BYTES);
073: return getFormat(in);
074: }
075:
076: /**
077: * Guess the format of the specified news channel. For performance
078: * reason it is wise to minimize the number of format guesses.
079: *
080: * @param in an InputStream to the news channel.
081: * @return The news channel synatx format, currently only RSS 0.91
082: * ({@link de.nava.informa.core.ChannelFormat#RSS_0_91})
083: * and RSS/RDF 1.0
084: * ({@link de.nava.informa.core.ChannelFormat#RSS_1_0})
085: * are recognized.
086: * @throws UnsupportedFormatException in case a news channel format
087: * could not be guessed.
088: * @throws IOException if the given url cannot be read in.
089: */
090: public static ChannelFormat getFormat(InputStream in)
091: throws IOException, UnsupportedFormatException {
092:
093: byte[] b = new byte[NR_FIRST_BYTES];
094:
095: int bytesRead = 0;
096: while (bytesRead < NR_FIRST_BYTES) {
097: int bytes = in.read(b, bytesRead, NR_FIRST_BYTES
098: - bytesRead);
099: if (bytes == -1)
100: break;
101: bytesRead += bytes;
102: }
103:
104: String rootElement = getRootElement(b);
105: logger.debug("Detected [" + rootElement + "].");
106: if (rootElement.startsWith("rss")) {
107: if (rootElement.indexOf("0.91") > 0) {
108: logger
109: .info("Channel uses RSS root element (Version 0.91).");
110: return ChannelFormat.RSS_0_91;
111: } else if (rootElement.indexOf("0.92") > 0) {
112: logger
113: .info("Channel uses RSS root element (Version 0.92).");
114: // FIXME: should really return ChannelFormat.RSS_0_92
115: // when aware of all subtle differences.
116: return ChannelFormat.RSS_0_92;
117: } else if (rootElement.indexOf("0.93") > 0) {
118: logger
119: .info("Channel uses RSS root element (Version 0.93).");
120: logger
121: .warn("RSS 0.93 not fully supported yet, fall back to 0.92.");
122: // FIXME: should really return ChannelFormat.RSS_0_93
123: // when aware of all subtle differences.
124: return ChannelFormat.RSS_0_92;
125: } else if (rootElement.indexOf("0.94") > 0) {
126: logger
127: .info("Channel uses RSS root element (Version 0.94).");
128: logger
129: .warn("RSS 0.94 not fully supported yet, fall back to 0.92.");
130: // FIXME: should really return ChannelFormat.RSS_0_94
131: // when aware of all subtle differences.
132: return ChannelFormat.RSS_0_92;
133: } else if (rootElement.indexOf("2.0") > 0) {
134: logger
135: .info("Channel uses RSS root element (Version 2.0).");
136: return ChannelFormat.RSS_2_0;
137: } else {
138: throw new UnsupportedFormatException(
139: "Unsupported RSS version [" + rootElement
140: + "].");
141: }
142: } else if (rootElement.indexOf("rdf") >= 0) {
143: logger.info("Channel uses RDF root element.");
144: return ChannelFormat.RSS_1_0;
145: } else if (rootElement.indexOf("feed") >= 0) {
146: if (rootElement.indexOf("0.1") >= 0) {
147: return ChannelFormat.ATOM_0_1;
148: } else if (rootElement.indexOf("0.2") >= 0) {
149: return ChannelFormat.ATOM_0_2;
150: } else if (rootElement.indexOf("0.3") >= 0) {
151: return ChannelFormat.ATOM_0_3;
152: } else {
153: throw new UnsupportedFormatException(
154: "Unsupported ATOM version [" + rootElement
155: + "].");
156: }
157: } else {
158: throw new UnsupportedFormatException(
159: "Not able to parse document "
160: + "with root element [" + rootElement
161: + "].");
162: }
163: }
164:
165: /**
166: * Gets the name of the root element and the attributes (inclusive
167: * namespace declarations).
168: */
169: private static final String getRootElement(byte[] b) {
170: String s = new String(b);
171: int startPos = 0;
172: int endPos = 0;
173: boolean inComment = false;
174: for (int i = 0; i < s.length(); i++) {
175: if (s.charAt(i) == '<'
176: && Character.isLetter(s.charAt(i + 1))
177: && !inComment) {
178: startPos = i + 1;
179: for (int j = i + 1; j < s.length(); j++) {
180: if (s.charAt(j) == '>') {
181: endPos = j;
182: break;
183: }
184: }
185: break;
186: } else if (!inComment && s.charAt(i) == '<'
187: && s.charAt(i + 1) == '!' && s.charAt(i + 2) == '-'
188: && s.charAt(i + 3) == '-')
189: inComment = true;
190: else if (inComment && s.charAt(i) == '-'
191: && s.charAt(i + 1) == '-' && s.charAt(i + 2) == '>')
192: inComment = false;
193: } // for i
194: if (startPos >= 0 && endPos >= 0 && endPos > startPos) {
195: return s.substring(startPos, endPos);
196: } else {
197: throw new IllegalArgumentException(
198: "Unable to retrieve root " + "element from " + s);
199: }
200: }
201:
202: }
|