001: /*---------------------------------------------------------------------------*\
002: $Id: FeedDownloadThread.java 7041 2007-09-09 01:04:47Z bmc $
003: ---------------------------------------------------------------------------
004: This software is released under a BSD-style license:
005:
006: Copyright (c) 2004-2007 Brian M. Clapper. All rights reserved.
007:
008: Redistribution and use in source and binary forms, with or without
009: modification, are permitted provided that the following conditions are
010: met:
011:
012: 1. Redistributions of source code must retain the above copyright notice,
013: this list of conditions and the following disclaimer.
014:
015: 2. The end-user documentation included with the redistribution, if any,
016: must include the following acknowlegement:
017:
018: "This product includes software developed by Brian M. Clapper
019: (bmc@clapper.org, http://www.clapper.org/bmc/). That software is
020: copyright (c) 2004-2007 Brian M. Clapper."
021:
022: Alternately, this acknowlegement may appear in the software itself,
023: if wherever such third-party acknowlegements normally appear.
024:
025: 3. Neither the names "clapper.org", "curn", nor any of the names of the
026: project contributors may be used to endorse or promote products
027: derived from this software without prior written permission. For
028: written permission, please contact bmc@clapper.org.
029:
030: 4. Products derived from this software may not be called "curn", nor may
031: "clapper.org" appear in their names without prior written permission
032: of Brian M. Clapper.
033:
034: THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
035: WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
036: MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN
037: NO EVENT SHALL BRIAN M. CLAPPER BE LIABLE FOR ANY DIRECT, INDIRECT,
038: INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
039: NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
040: DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
041: THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
042: (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
043: THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
044: \*---------------------------------------------------------------------------*/
045:
046: package org.clapper.curn;
047:
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.Reader;
import java.io.Writer;

import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;

import java.util.Collection;
import java.util.Collections;
import java.util.Date;
import java.util.Iterator;
import java.util.Queue;
import java.util.StringTokenizer;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.zip.GZIPInputStream;

import org.clapper.curn.parser.RSSChannel;
import org.clapper.curn.parser.RSSItem;
import org.clapper.curn.parser.RSSLink;
import org.clapper.curn.parser.RSSParser;
import org.clapper.curn.parser.RSSParserException;

import org.clapper.util.io.FileUtil;
import org.clapper.util.logging.Logger;
import org.clapper.util.text.TextUtil;
079:
/**
 * A <tt>FeedDownloadThread</tt> repeatedly pulls {@link FeedInfo} objects
 * off a shared queue and downloads (and optionally parses) each
 * corresponding feed until the queue is empty. It can be run on its own
 * thread (via {@link Runnable#run}) or driven synchronously (via
 * {@link #processFeed}).
 */
class FeedDownloadThread implements Runnable {
/*----------------------------------------------------------------------*\
Private Constants
\*----------------------------------------------------------------------*/

// Name and length of the "charset=" parameter within an HTTP
// Content-Type header; used to extract a feed's character-set encoding.
private static final String HTTP_CONTENT_TYPE_CHARSET_FIELD = "charset=";
private static final int HTTP_CONTENT_TYPE_CHARSET_FIELD_LEN = HTTP_CONTENT_TYPE_CHARSET_FIELD
.length();

/*----------------------------------------------------------------------*\
Private Instance Data
\*----------------------------------------------------------------------*/

private final Logger log; //NOPMD
// Unique identifier for this downloader, drawn from nextThreadID.
private final String id;
// Parsed curn configuration; used when building download-error messages.
private final CurnConfig configuration;
// Parser used to turn downloaded XML into RSSChannel objects.
// May be null, in which case the parse phase is skipped.
private final RSSParser rssParser;
// Feed cache used to suppress already-seen items; may be null.
private final FeedCache cache;
// Shared queue of feeds still to be processed. Assumed thread-safe.
private final Queue<FeedInfo> feedQueue;
// Exception from the most recent processFeed() call, or null on success.
private FeedException exception = null;
private final MetaPlugIn metaPlugIn = MetaPlugIn.getMetaPlugIn();
// Channel parsed by the most recent processFeed() call, or null.
private RSSChannel channel = null;
// Callback invoked when a feed finishes downloading and parsing.
private FeedDownloadDoneHandler feedDownloadDoneHandler = null;

// Source of unique instance IDs, shared across all instances.
private static AtomicInteger nextThreadID = new AtomicInteger(0);
105:
106: /*----------------------------------------------------------------------*\
107: Inner Classes
108: \*----------------------------------------------------------------------*/
109:
110: /**
111: * Encapsulates information about a downloaded feed.
112: */
113: private class DownloadedTempFile {
114: File file;
115: String encoding;
116: int bytesDownloaded;
117:
118: DownloadedTempFile(File tempFile, String encoding,
119: int bytesDownloaded) {
120: this .file = tempFile;
121: this .encoding = encoding;
122: this .bytesDownloaded = bytesDownloaded;
123: }
124: }
125:
126: /*----------------------------------------------------------------------*\
127: Constructor
128: \*----------------------------------------------------------------------*/
129:
/**
 * Create a new <tt>FeedDownloadThread</tt> object to download feeds.
 *
 * @param parser the RSS parser to use
 * @param feedCache the feed cache to save cache data to
 * @param configFile the parsed configuration file
 * @param feedQueue list of feeds to be processed. The list is
 * assumed to be shared across multiple threads,
 * and must be thread safe.
 * @param feedDoneHandler called when a feed is finished downloading
 */
FeedDownloadThread(RSSParser parser, FeedCache feedCache,
CurnConfig configFile, Queue<FeedInfo> feedQueue,
FeedDownloadDoneHandler feedDoneHandler) {
// Each instance gets a unique, monotonically increasing ID, used for
// the thread name and the per-instance logger.
this .id = String.valueOf(nextThreadID.getAndIncrement());

String name = "FeedDownloadThread-" + this .id;

// NOTE(review): this renames the thread that is *constructing* this
// object, not necessarily the thread that will later call run(). If
// construction and execution happen on different threads, the worker
// thread keeps its original name — confirm whether that is intended.
Thread.currentThread().setName(name);
this .log = new Logger(name);
this .configuration = configFile;
this .rssParser = parser;
this .cache = feedCache;
this .feedQueue = feedQueue;
this .feedDownloadDoneHandler = feedDoneHandler;

//setPriority (getPriority() + 1);
}
158:
159: /*----------------------------------------------------------------------*\
160: Public Methods
161: \*----------------------------------------------------------------------*/
162:
163: /**
164: * Run the thread. Pulls the next <tt>FeedInfo</tt> object from the
165: * feed queue (the list passed to the constructor) and processes it.
166: * The thread stops running when it has finished downloading a feed and
167: * it finds that the feed queue is empty.
168: */
169: public void run() {
170: boolean done = false;
171:
172: log.info("Thread is alive at priority "
173: + Thread.currentThread().getPriority());
174:
175: while (!done) {
176: FeedInfo feed = null;
177:
178: log.debug("Checking feed queue.");
179: feed = feedQueue.poll();
180:
181: if (feed == null) {
182: log
183: .info("Queue of feeds is empty. Nothing left to do.");
184: done = true;
185: }
186:
187: else {
188: processFeed(feed);
189: }
190: }
191:
192: log.debug("Thread is finishing.");
193: }
194:
195: /*----------------------------------------------------------------------*\
196: Package-visible Methods
197: \*----------------------------------------------------------------------*/
198:
199: /**
200: * Processes the specified feed. This method is called by {@link #run}.
201: * It's also intended to be called directly, when <i>curn</i> is
202: * running in non-threaded mode. Once this method returns, use the
203: * {@link #errorOccurred} method to determine whether a feed-processing
204: * error occurred, and the {@link #getException} method to receive the
205: * exception if an error did occur. (If an error does occur, this method
206: * logs it regardless.)
207: *
208: * @param feed The <tt>FeedInfo</tt> object for the feed to be processed
209: *
210: * @see #errorOccurred
211: * @see #getException
212: */
213: void processFeed(final FeedInfo feed) {
214: this .exception = null;
215: this .channel = null;
216:
217: try {
218: log.info("Processing feed: " + feed.getURL().toString());
219:
220: channel = handleFeed(feed, rssParser);
221: if (channel != null)
222: feedDownloadDoneHandler.feedFinished(feed, channel);
223: }
224:
225: catch (FeedException ex) {
226: this .exception = new FeedException(feed,
227: Constants.BUNDLE_NAME,
228: "FeedDownloadThread.downloadError",
229: "(Config file \"{0}\") error downloading feed",
230: new Object[] { configuration
231: .getConfigurationFileURL(), }, ex);
232: log.error(ex.getMessages(true), this .exception);
233: }
234:
235: catch (CurnException ex) {
236: this .exception = new FeedException(feed,
237: Constants.BUNDLE_NAME,
238: "FeedDownloadThread.downloadError",
239: "(Config file \"{0}\") error downloading feed",
240: new Object[] { configuration
241: .getConfigurationFileURL(), }, ex);
242: log.error(ex.getMessages(true), this .exception);
243: }
244: }
245:
246: /**
247: * Get the parsed channel data.
248: *
249: * @return the channel data
250: */
251: RSSChannel getParsedChannelData() {
252: return channel;
253: }
254:
255: /**
256: * Determine whether an error occurred during processing of the most
257: * recent feed. If an error did occur, you can use {@link #getException}
258: * to get the corresponding exception.
259: *
260: * @return <tt>true</tt> if an error occurred while processing the last
261: * feed, <tt>false</tt> if no error occurred
262: *
263: * @see #processFeed
264: * @see #getException
265: */
266: boolean errorOccurred() {
267: return (this .exception != null);
268: }
269:
270: /**
271: * If an error occurred during processing of the most recent feed,
272: * this method will return the exception associated with the error.
273: *
274: * @return the exception associated with the most recent error, or
275: * null if no error has occurred
276: *
277: * @see #processFeed
278: * @see #errorOccurred
279: */
280: FeedException getException() {
281: return this .exception;
282: }
283:
284: /*----------------------------------------------------------------------*\
285: Private Methods
286: \*----------------------------------------------------------------------*/
287:
288: /**
289: * Actually processes a feed. This method is called by checkFeed()
290: * after checkFeed() determines that there's a reason to try to download
291: * the feed (i.e., the feed has a "save as" setting, and/or parsing is
292: * desired.
293: * @param feedInfo the info about the feed
294: * @param parser the RSS parser to use, or null if parsing is to
295: * be skipped
296: *
297: * @return the <tt>RSSChannel</tt> representing the parsed feed, if
298: * parsing was enabled; otherwise, null.
299: *
300: * @throws FeedException feed download error
301: * @throws CurnException some other error (e.g., plug-in error)
302: */
303: private RSSChannel handleFeed(final FeedInfo feedInfo,
304: final RSSParser parser) throws FeedException, CurnException {
305: URL feedURL = feedInfo.getURL();
306: String feedURLString = feedURL.toString();
307: RSSChannel resultChannel = null;
308:
309: try {
310: log.info("Checking for new data from RSS feed "
311: + feedURLString);
312:
313: boolean forceDownload = metaPlugIn.forceFeedDownload(
314: feedInfo, cache);
315: log.debug("Feed \"" + feedURLString + "\": forceDownload="
316: + forceDownload);
317:
318: // Open the connection.
319:
320: URLConnection conn = feedURL.openConnection();
321:
322: if (!metaPlugIn.runPreFeedDownloadPlugIn(feedInfo, conn)) {
323: log.debug("Feed " + feedInfo.getURL().toString()
324: + ": A plug-in disabled the feed.");
325: }
326:
327: else {
328: resultChannel = downloadAndProcessFeed(feedInfo,
329: parser, conn, forceDownload);
330: }
331: }
332:
333: catch (MalformedURLException ex) {
334: throw new FeedException(feedInfo, ex);
335: }
336:
337: catch (IOException ex) {
338: throw new FeedException(feedInfo, ex);
339: }
340:
341: return resultChannel;
342: }
343:
344: /**
345: * Unconditionally download and process a feed. Only called by
346: * handleFeed().
347: *
348: * @param feedInfo the info about the feed
349: * @param parser the RSS parser to use, or null if parsing is to
350: * be skipped
351: * @param urlConn open URLConnection for the feed
352: * @param forceDownload <tt>true</tt> to force the download even if the
353: * feed hasn't changed, <tt>false</tt> to observe
354: * the normal rules
355: *
356: * @return the <tt>RSSChannel</tt> representing the parsed feed, if
357: * parsing was enabled; otherwise, null.
358: *
359: * @throws FeedException feed download error
360: * @throws CurnException some other error (e.g., plug-in error)
361: */
362: private RSSChannel downloadAndProcessFeed(final FeedInfo feedInfo,
363: final RSSParser parser, final URLConnection urlConn,
364: final boolean forceDownload) throws FeedException,
365: CurnException {
366: RSSChannel resultChannel = null;
367: URL feedURL = feedInfo.getURL();
368:
369: try {
370: // Don't download the channel if it hasn't been modified since
371: // we last checked it. We set the If-Modified-Since header, to
372: // tell the web server not to return the content if it's not
373: // newer than what we saw before. However, as a double-check
374: // (for web servers that ignore the header), we also check the
375: // Last-Modified header, if any, that's returned; if it's not
376: // newer, we don't bother to parse and process the returned
377: // XML.
378:
379: if (!forceDownload)
380: setIfModifiedSinceHeader(urlConn, feedInfo, cache);
381:
382: // If the feed has actually changed, or if downloading is force,
383: // process it.
384:
385: if ((!forceDownload)
386: && (!feedHasChanged(urlConn, feedInfo, cache))) {
387: log.info("Feed has not changed. Skipping it.");
388: }
389:
390: else {
391: log.debug("Feed may have changed. "
392: + "Downloading and processing it.");
393:
394: // Download the feed to a file. We'll parse the file.
395:
396: DownloadedTempFile tempFile = downloadFeed(urlConn,
397: feedInfo);
398:
399: if (tempFile.bytesDownloaded == 0) {
400: log.debug("Feed \"" + feedURL
401: + "\" returned no data.");
402: }
403:
404: else {
405: metaPlugIn.runPostFeedDownloadPlugIn(feedInfo,
406: tempFile.file, tempFile.encoding);
407:
408: if (parser == null) {
409: log
410: .debug("No RSS parser. Skipping XML parse phase.");
411: }
412:
413: else {
414: log.debug("Using RSS parser "
415: + parser.getClass().getName()
416: + " to parse \"" + feedURL + "\"");
417:
418: InputStream is = new FileInputStream(
419: tempFile.file);
420: resultChannel = parser.parseRSSFeed(feedURL,
421: is, tempFile.encoding);
422: is.close();
423:
424: // Make sure the channel has a link.
425:
426: Collection<RSSLink> links = resultChannel
427: .getLinks();
428: if ((links == null) || (links.size() == 0)) {
429: RSSLink link = new RSSLink(feedURL,
430: "text/xml", RSSLink.Type.SELF);
431: resultChannel.setLinks(Collections
432: .singleton(link));
433: }
434:
435: if (resultChannel != null) {
436: metaPlugIn.runPostFeedParsePlugIn(feedInfo,
437: cache, resultChannel);
438: }
439:
440: processChannelItems(resultChannel, feedInfo);
441: if (resultChannel.getItems().size() == 0)
442: resultChannel = null;
443: }
444: }
445:
446: tempFile.file.delete();
447: if (cache != null) {
448: cache.addToCache(null, feedURL, new Date(urlConn
449: .getLastModified()), feedInfo);
450: }
451: }
452: }
453:
454: catch (IOException ex) {
455: throw new FeedException(feedInfo, ex);
456: }
457:
458: catch (RSSParserException ex) {
459: throw new FeedException(feedInfo, ex);
460: }
461:
462: log.debug("downloadAndProcessFeed(): Feed="
463: + feedInfo.getURL()
464: + ", returning "
465: + ((resultChannel == null) ? "null" : resultChannel
466: .toString()));
467:
468: return resultChannel;
469: }
470:
471: /**
472: * Download a feed.
473: *
474: * @param conn the <tt>URLConnection</tt> for the feed
475: * @param feedInfo the <tt>FeedInfo</tt> object for the feed
476: *
477: * @return the <tt>DownloadedTempFile</tt> object that captures the
478: * details about the downloaded file
479: *
480: * @throws IOException I/O error
481: * @throws CurnException some other error
482: */
483: private DownloadedTempFile downloadFeed(final URLConnection conn,
484: final FeedInfo feedInfo) throws CurnException, IOException {
485: URL feedURL = feedInfo.getURL();
486: String feedURLString = feedURL.toString();
487: int totalBytes = 0;
488: File tempFile = CurnUtil.createTempXMLFile();
489:
490: log.debug("Downloading \"" + feedURLString + "\" to file \""
491: + tempFile.getPath());
492:
493: InputStream urlStream = getURLInputStream(conn);
494: Reader reader;
495: Writer writer;
496:
497: // Determine the character set encoding to use.
498:
499: String protocol = feedURL.getProtocol();
500: String encoding = null;
501:
502: if (protocol.equals("http") || protocol.equals("https")) {
503: String contentTypeHeader = conn.getContentType();
504:
505: if (contentTypeHeader != null) {
506: encoding = contentTypeCharSet(contentTypeHeader);
507: log.debug("HTTP server says encoding for \""
508: + feedURLString + "\" is \""
509: + ((encoding == null) ? "<null>" : encoding)
510: + "\"");
511: }
512: }
513:
514: else if (protocol.equals("file")) {
515: // Assume the same default encoding used by "SaveAsEncoding",
516: // unless explicitly specified.
517:
518: encoding = Constants.DEFAULT_SAVE_AS_ENCODING;
519: log.debug("Default encoding for \"" + feedURLString
520: + "\" is \"" + encoding + "\"");
521: }
522:
523: // Set the forced encoding, if specified. Note: This is done after
524: // we check the HTTP encoding, so we can log any discrepancies
525: // between the config-specified encoding and the HTTP
526: // server-specified encoding.
527:
528: String forcedEncoding = feedInfo.getForcedCharacterEncoding();
529: if (forcedEncoding != null) {
530: log.debug("URL \"" + feedURLString
531: + "\": Forcing encoding to be \"" + forcedEncoding
532: + "\"");
533: encoding = forcedEncoding;
534: }
535:
536: if (encoding != null) {
537: log.debug("Encoding is \"" + encoding + "\"");
538: reader = new InputStreamReader(urlStream, encoding);
539: writer = new OutputStreamWriter(new FileOutputStream(
540: tempFile), encoding);
541:
542: /*
543: // Cheat by writing an encoding line to the temp file.
544: writer.write ("<?xml version=\"1.0\" encoding=\""
545: encoding
546: + "\"> ");
547: */
548: }
549:
550: else {
551: InputStreamReader isr = new InputStreamReader(urlStream);
552: reader = isr;
553: writer = new FileWriter(tempFile);
554: log.debug("No encoding for \"" + feedURLString
555: + "\". Using VM default of \"" + isr.getEncoding()
556: + "\"");
557: }
558:
559: totalBytes = FileUtil.copyReader(reader, writer);
560: log.debug("Total bytes downloaded: " + totalBytes);
561: writer.close();
562: urlStream.close();
563:
564: // It's possible for totalBytes to be zero if, for instance, the
565: // use of the If-Modified-Since header caused an HTTP server to
566: // return no content.
567:
568: return new DownloadedTempFile(tempFile, encoding, totalBytes);
569: }
570:
571: /**
572: * Given a content-type header, extract the character set information.
573: *
574: * @param contentType the content type header
575: *
576: * @return the character set, or null if not available
577: */
578: private String contentTypeCharSet(final String contentType) {
579: String result = null;
580: String[] fields = TextUtil.split(contentType, "; \t");
581:
582: for (int i = 0; i < fields.length; i++) {
583: // Compare in a case-insensitive fashion. Some servers (e.g.,
584: // versions of Microsoft's IIS) will specify "Charset=", not
585: // "charset=".
586:
587: String s = fields[i].toLowerCase();
588: if (s.startsWith(HTTP_CONTENT_TYPE_CHARSET_FIELD)
589: && (s.length() > HTTP_CONTENT_TYPE_CHARSET_FIELD_LEN)) {
590: // Strip any quotes from the beginning and end of the field.
591: // Some web servers tack them on, some don't. This isn't,
592: // strictly speaking, kosher, according to the HTTP spec.
593: // But curn has to deal with real-life, including server
594: // brokenness.
595:
596: result = fields[i].substring(
597: HTTP_CONTENT_TYPE_CHARSET_FIELD_LEN).replace(
598: "\"", "");
599:
600: break;
601: }
602: }
603:
604: return result;
605: }
606:
607: /**
608: * Get the input stream for a URL. Handles compressed data.
609: *
610: * @param conn the <tt>URLConnection</tt> to process
611: *
612: * @return the <tt>InputStream</tt>
613: *
614: * @throws IOException I/O error
615: */
616: private InputStream getURLInputStream(final URLConnection conn)
617: throws IOException {
618: InputStream is = conn.getInputStream();
619: String ce = conn.getHeaderField("content-encoding");
620:
621: if (ce != null) {
622: String urlString = conn.getURL().toString();
623:
624: log.debug("URL \"" + urlString + "\" -> Content-Encoding: "
625: + ce);
626: if (ce.indexOf("gzip") != -1) {
627: log.debug("URL \"" + urlString
628: + "\" is compressed. Using GZIPInputStream.");
629: is = new GZIPInputStream(is);
630: }
631: }
632:
633: return is;
634: }
635:
636: /**
637: * Conditionally set the header that "If-Modified-Since" header for a
638: * feed. Must be called on a <tt>URLConnection</tt> before the
639: * <tt>InputStream</tt> is retrieved. Uses the feed cache to set the
640: * value.
641: *
642: * @param conn the <tt>URLConnection</tt> on which to set the
643: * header
644: * @param feedInfo the information on the feed
645: * @param cache the cache
646: */
647: private void setIfModifiedSinceHeader(final URLConnection conn,
648: final FeedInfo feedInfo, final FeedCache cache) {
649: long lastSeen = 0;
650: URL feedURL = feedInfo.getURL();
651:
652: if (cache != null) {
653: FeedCacheEntry entry = cache.getEntryByURL(feedURL);
654:
655: if (entry != null) {
656: lastSeen = entry.getTimestamp();
657:
658: if (lastSeen > 0) {
659: if (log.isDebugEnabled()) {
660: log
661: .debug("Setting If-Modified-Since header for "
662: + "feed \""
663: + feedURL.toString()
664: + "\" to: "
665: + String.valueOf(lastSeen)
666: + " ("
667: + new Date(lastSeen).toString()
668: + ")");
669: }
670:
671: conn.setIfModifiedSince(lastSeen);
672: }
673: }
674: }
675: }
676:
677: /**
678: * Query the appropriate URL connection headers to determine whether
679: * the remote server thinks feed data has changed since the last time
680: * the feed was downloaded. Must be called on a <tt>URLConnection</tt>
681: * after the <tt>InputStream</tt> is retrieved. Uses the feed cache to
682: * set the value.
683: *
684: * @param conn the <tt>URLConnection</tt> whose headers are to be
685: * checked
686: * @param feedInfo the information on the feed
687: * @param cache the cache
688: *
689: * @throws IOException I/O error
690: */
691: private boolean feedHasChanged(final URLConnection conn,
692: final FeedInfo feedInfo, final FeedCache cache)
693: throws IOException {
694: long lastSeen = 0;
695: long lastModified = 0;
696: boolean hasChanged = false;
697: URL feedURL = feedInfo.getURL();
698:
699: if (cache != null) {
700: FeedCacheEntry entry = cache.getEntryByURL(feedURL);
701:
702: if (entry != null)
703: lastSeen = entry.getTimestamp();
704: }
705:
706: if (lastSeen == 0) {
707: log.debug("Feed \"" + feedURL.toString()
708: + "\" has no recorded last-seen time.");
709: hasChanged = true;
710: }
711:
712: else if ((lastModified = conn.getLastModified()) == 0) {
713: log.debug("Feed \"" + feedURL.toString()
714: + "\" provides no last-modified time.");
715: hasChanged = true;
716: }
717:
718: else if (lastSeen >= lastModified) {
719: log.debug("Feed \"" + feedURL.toString()
720: + "\" has Last-Modified time of "
721: + new Date(lastModified).toString()
722: + ", which is not newer than last-seen time of "
723: + new Date(lastSeen).toString()
724: + ". Feed has no new data.");
725: }
726:
727: else {
728: log.debug("Feed \"" + feedURL.toString()
729: + "\" has Last-Modified time of "
730: + new Date(lastModified).toString()
731: + ", which is newer than last-seen time of "
732: + new Date(lastSeen).toString()
733: + ". Feed might have new data.");
734: hasChanged = true;
735: }
736:
737: return hasChanged;
738: }
739:
/**
 * Process all the items for a channel: weed out items without URLs and
 * items already recorded in the cache, normalize the URLs of the
 * survivors, record the survivors in the cache, and store them back in
 * the channel.
 *
 * @param channel the channel
 * @param feedInfo the feed information for the channel
 *
 * @throws RSSParserException parser exception
 * @throws MalformedURLException bad URL
 */
private void processChannelItems(final RSSChannel channel,
final FeedInfo feedInfo) throws RSSParserException,
MalformedURLException {
Collection<RSSItem> items;
String channelName;

// Prefer the channel's self-link as its display name; fall back on
// the configured feed URL when the feed doesn't provide one.
RSSLink selfLink = channel.getLink(RSSLink.Type.SELF);
if (selfLink == null)
channelName = feedInfo.getURL().toString();
else
channelName = selfLink.getURL().toString();

items = channel.getItems();

// First, weed out the ones we don't care about.

log.info("Channel \"" + channelName + "\": "
+ String.valueOf(items.size()) + " total items");
// Explicit Iterator so items can be removed during traversal.
for (Iterator<RSSItem> it = items.iterator(); it.hasNext();) {
RSSItem item = it.next();
RSSLink itemLink = item.getURL();

if (itemLink == null) {
log.debug("Skipping item with null URL.");
it.remove();
continue;
}

URL itemURL = itemLink.getURL();

// Normalize the URL and save it. (Normalization happens before the
// cache check below, so the cache sees the canonical form.)

itemURL = CurnUtil.normalizeURL(itemURL);
itemLink.setURL(itemURL);

// Skip it if it's cached. Note:
//
// 1. If the item has a unique ID, then the ID alone is used to
// determine whether it's been cached. This is the preferred
// strategy, since it handles the case where a feed has
// unique item IDs that change every day, but has URLs
// that are re-used.
//
// 2. If the item has no unique ID, then determine whether the
// URL is cached. If it is, then compare the publication date
// of the item with the cached publication date, to see whether
// the item is new. If the publication date is missing from
// one of them, then use the URL alone and assume/hope that
// the item's URL is unique.

String itemID = item.getID();
log.debug("Item link: " + itemURL);
log.debug("Item ID: "
+ ((itemID == null) ? "<null>" : itemID));

// With no cache configured, every item is treated as new.
if ((cache != null) && (!itemIsNew(item, itemURL))) {
log.debug("Discarding old, cached item.");
it.remove();
}
}

// Add all the items to the cache, and adjust whatever items are to
// be adjusted.

if ((items.size() > 0) && (cache != null)) {
for (RSSItem item : items) {
// Items with null links were removed in the loop above.
RSSLink itemLink = item.getURL();
assert (itemLink != null);
URL itemURL = itemLink.getURL();

log.debug("Caching URL: " + itemURL);
cache.addToCache(item.getID(), itemURL, item
.getPublicationDate(), feedInfo);
}
}

// Change the channel's items to the ones that are left.

log.debug("Setting channel items: total=" + items.size());
channel.setItems(items);
}
830:
831: /**
832: * Determine whether an item is cached.
833: *
834: * @param item the item to test
835: * @param itemURL the item's normalized URL (which might not be the
836: * same as the URL in the item itself)
837: *
838: * @return true if cached, false if not
839: */
840: private boolean itemIsNew(final RSSItem item, final URL itemURL) {
841: String itemURLString = itemURL.toString();
842: boolean isNew = true;
843: FeedCacheEntry cacheEntry = cache.getEntryForItem(item);
844:
845: if (cacheEntry == null) {
846: log.debug("URL \"" + itemURLString
847: + "\" is not in the cache. It's new.");
848: }
849:
850: else if (cacheEntry.isSticky()) {
851: log.debug("URL \"" + itemURLString
852: + "\" is marked sticky. " + "Treating it as new.");
853: isNew = true;
854: }
855:
856: else {
857: Date cachePubDate = cacheEntry.getPublicationDate();
858: Date itemPubDate = item.getPublicationDate();
859:
860: if ((cachePubDate == null) || (itemPubDate == null)) {
861: log
862: .debug("Missing publication date in item and/or "
863: + "cache for URL \""
864: + itemURLString
865: + "\". Assuming URL is old, since it is in the "
866: + "cache. Skipping it.");
867: isNew = false;
868: }
869:
870: else {
871: log
872: .debug("URL \"" + itemURLString
873: + "\": Cached publication date is "
874: + cachePubDate.toString()
875: + "\", item publication date is "
876: + itemPubDate);
877: if (itemPubDate.after(cachePubDate)) {
878: log
879: .debug("URL \""
880: + itemURLString
881: + "\" is newer than cached publication date. "
882: + "Keeping it.");
883: }
884:
885: else {
886: log
887: .debug("URL \""
888: + itemURLString
889: + "\" is not newer than cached publication date. "
890: + "Skipping it.");
891: isNew = false;
892: }
893: }
894: }
895:
896: return isNew;
897: }
898: }
|