001: /*---------------------------------------------------------------------------*\
002: $Id: FeedCache.java 7041 2007-09-09 01:04:47Z bmc $
003: ---------------------------------------------------------------------------
004: This software is released under a BSD-style license:
005:
006: Copyright (c) 2004-2007 Brian M. Clapper. All rights reserved.
007:
008: Redistribution and use in source and binary forms, with or without
009: modification, are permitted provided that the following conditions are
010: met:
011:
012: 1. Redistributions of source code must retain the above copyright notice,
013: this list of conditions and the following disclaimer.
014:
015: 2. The end-user documentation included with the redistribution, if any,
016: must include the following acknowlegement:
017:
018: "This product includes software developed by Brian M. Clapper
019: (bmc@clapper.org, http://www.clapper.org/bmc/). That software is
020: copyright (c) 2004-2007 Brian M. Clapper."
021:
022: Alternately, this acknowlegement may appear in the software itself,
023: if wherever such third-party acknowlegements normally appear.
024:
025: 3. Neither the names "clapper.org", "curn", nor any of the names of the
026: project contributors may be used to endorse or promote products
027: derived from this software without prior written permission. For
028: written permission, please contact bmc@clapper.org.
029:
030: 4. Products derived from this software may not be called "curn", nor may
031: "clapper.org" appear in their names without prior written permission
032: of Brian M. Clapper.
033:
034: THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
035: WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
036: MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN
037: NO EVENT SHALL BRIAN M. CLAPPER BE LIABLE FOR ANY DIRECT, INDIRECT,
038: INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
039: NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
040: DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
041: THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
042: (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
043: THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
044: \*---------------------------------------------------------------------------*/
045:
046: package org.clapper.curn;
047:
048: import java.net.URL;
049: import java.util.ArrayList;
050: import java.util.Collection;
051: import java.util.Collections;
052: import java.util.Date;
053: import java.util.HashMap;
054: import java.util.LinkedList;
055: import java.util.List;
056: import java.util.Map;
057: import org.clapper.curn.parser.RSSItem;
058: import org.clapper.util.logging.Logger;
059:
060: /**
* Defines the in-memory format of the <i>curn</i> cache, and provides
* methods for looking up, adding, and loading cache entries.
063: *
064: * @see Curn
065: * @see org.clapper.curn.parser.RSSChannel
066: *
067: * @version <tt>$Revision: 7041 $</tt>
068: */
069: public class FeedCache {
070: /*----------------------------------------------------------------------*\
071: Private Constants
072: \*----------------------------------------------------------------------*/
073:
074: /*----------------------------------------------------------------------*\
075: Private Classes
076: \*----------------------------------------------------------------------*/
077:
078: /*----------------------------------------------------------------------*\
079: Private Data Items
080: \*----------------------------------------------------------------------*/
081:
082: /**
083: * The configuration
084: */
085: private final CurnConfig config;
086:
087: /**
088: * The actual cache, indexed by unique ID.
089: */
090: private Map<String, FeedCacheEntry> cacheByID = null;
091:
092: /**
093: * Alternate cache (not saved, but regenerated on the fly), indexed
094: * by URL.
095: */
096: private Map<String, FeedCacheEntry> cacheByURL = null;
097:
098: /**
099: * A list of feed entries, used only during load.
100: */
101: private List<FeedCacheEntry> loadedEntries = new LinkedList<FeedCacheEntry>();
102:
103: /**
104: * Current time
105: */
106: private long currentTime = System.currentTimeMillis();
107:
108: /**
109: * For log messages
110: */
111: private static final Logger log = new Logger(FeedCache.class);
112:
113: /*----------------------------------------------------------------------*\
114: Constructor
115: \*----------------------------------------------------------------------*/
116:
/**
 * Create a cache that initially holds no entries.
 *
 * @param config  the <i>curn</i> configuration; it is retained and
 *                consulted when loaded entries are pruned
 */
FeedCache(CurnConfig config) {
    this.config = config;
}
125:
126: /*----------------------------------------------------------------------*\
127: Public Methods
128: \*----------------------------------------------------------------------*/
129:
130: /**
131: * Determine whether the cache contains an entry with the specified
132: * unique ID.
133: *
134: * @param id the ID to check.
135: *
136: * @return <tt>true</tt> if the ID is present in the cache,
137: * <tt>false</tt> if not
138: */
139: public boolean containsID(final String id) {
140: boolean hasKey = cacheByID.containsKey(id);
141: log.debug("Cache contains \"" + id + "\"? " + hasKey);
142: return hasKey;
143: }
144:
145: /**
146: * Determine whether the cache contains the specified URL.
147: *
148: * @param url the URL to check. This method normalizes it.
149: *
150: * @return <tt>true</tt> if the ID is present in the cache,
151: * <tt>false</tt> if not
152: */
153: public boolean containsURL(final URL url) {
154: boolean hasURL = false;
155:
156: if (cacheByURL != null) {
157: String urlKey = CurnUtil.urlToLookupKey(url);
158: hasURL = cacheByURL.containsKey(urlKey);
159: log.debug("Cache contains \"" + urlKey + "\"? " + hasURL);
160: }
161:
162: return hasURL;
163: }
164:
165: /**
166: * Get an entry from the cache by its unique ID.
167: *
168: * @param id the unique ID to check
169: *
170: * @return the corresponding <tt>FeedCacheEntry</tt> object, or null if
171: * not found
172: */
173: public FeedCacheEntry getEntry(final String id) {
174: FeedCacheEntry result = null;
175:
176: if (cacheByID != null)
177: result = cacheByID.get(id);
178:
179: return result;
180: }
181:
182: /**
183: * Get an entry from the cache by its URL.
184: *
185: * @param url the URL
186: *
187: * @return the corresponding <tt>FeedCacheEntry</tt> object, or null if
188: * not found
189: */
190: public FeedCacheEntry getEntryByURL(final URL url) {
191: FeedCacheEntry result = null;
192:
193: if (cacheByURL != null)
194: result = cacheByURL.get(CurnUtil.urlToLookupKey(url));
195:
196: return result;
197: }
198:
199: /**
200: * Get an entry for an {@link RSSItem} from the cache. This method
201: * attempts to find the item by its unique ID. If the item has no ID,
202: * then this method attempts to find the item by its URL.
203: *
204: * @param item the {@link RSSItem} to find in the cache
205: *
206: * @return the corresponding {@link FeedCacheEntry} object, or null if
207: * not found
208: */
209: public FeedCacheEntry getEntryForItem(RSSItem item) {
210: FeedCacheEntry entry = null;
211: String itemID = item.getID();
212: URL itemURL = CurnUtil.normalizeURL(item.getURL().getURL());
213:
214: if (itemID != null) {
215: log.debug("Attempting to find item \"" + item.toString()
216: + "\" in cache.");
217: entry = getEntry(itemID);
218: }
219:
220: else {
221: log.debug("Item has no Unique ID. Locating it by URL (\""
222: + itemURL.toString() + "\")");
223: entry = getEntryByURL(itemURL);
224: }
225:
226: return entry;
227: }
228:
229: /**
230: * Add (or replace) a cached URL.
231: *
232: * @param uniqueID the unique ID string for the cache entry, or null.
233: * If null, the URL is used as the unique ID.
234: * @param url the URL to cache. May be an individual item URL, or
235: * the URL for an entire feed.
236: * @param pubDate the publication date, if known; or null
237: * @param parentFeed the associated feed
238: *
239: * @see CurnUtil#normalizeURL
240: */
241: public void addToCache(String uniqueID, final URL url,
242: final Date pubDate, final FeedInfo parentFeed) {
243: synchronized (this ) {
244: if (cacheByID == null) {
245: cacheByID = new HashMap<String, FeedCacheEntry>();
246: cacheByURL = new HashMap<String, FeedCacheEntry>();
247: }
248: }
249:
250: if (uniqueID == null)
251: uniqueID = url.toExternalForm();
252:
253: URL parentURL = parentFeed.getURL();
254: FeedCacheEntry entry = new FeedCacheEntry(uniqueID, parentURL,
255: url, pubDate, System.currentTimeMillis());
256:
257: log.debug("Adding cache entry for URL \""
258: + entry.getEntryURL().toExternalForm() + "\". ID=\""
259: + uniqueID + "\", channel URL: \""
260: + entry.getChannelURL().toExternalForm() + "\"");
261:
262: cacheByID.put(uniqueID, entry);
263: cacheByURL.put(CurnUtil.urlToLookupKey(url), entry);
264: }
265:
266: /**
267: * Get all entries in the cache, in no particular order.
268: *
269: * @return a <tt>Collection</tt> of entries
270: */
271: public Collection<FeedCacheEntry> getAllEntries() {
272: Collection<FeedCacheEntry> result = null;
273:
274: if (cacheByID != null)
275: result = Collections.unmodifiableCollection(cacheByID
276: .values());
277: else
278: result = new ArrayList<FeedCacheEntry>();
279:
280: return result;
281: }
282:
283: /**
284: * Set the cache's notion of the current time, which affects how elements
285: * are pruned when loaded from the cache. Only meaningful if set before
286: * the <tt>load()</tt> method is called. If this method is never
287: * called, then the cache uses the current time.
288: *
289: * @param datetime the time to use
290: */
291: public void setCurrentTime(final Date datetime) {
292: this .currentTime = datetime.getTime();
293: }
294:
295: /*----------------------------------------------------------------------*\
296: Package-visible Methods
297: \*----------------------------------------------------------------------*/
298:
299: /**
300: * Add a {@link FeedCacheEntry} to the cache. This method exists primarily
301: * for use during deserialization of the cache.
302: *
303: * @param entry the entry
304: */
305: void loadFeedCacheEntry(FeedCacheEntry entry) {
306: // Load onto the end of a list, for speed. pruneCache()
307: // will sift through the list when we're done.
308:
309: loadedEntries.add(entry);
310: }
311:
312: /**
313: * Signify that the cache is finished loading (i.e., that all calls to
314: * loadFeedCacheEntry are done).
315: */
316: void optimizeAfterLoad() {
317: pruneCache();
318: }
319:
320: /*----------------------------------------------------------------------*\
321: Private Methods
322: \*----------------------------------------------------------------------*/
323:
324: /**
325: * Prune the loaded cache of out-of-date data.
326: */
327: private void pruneCache() {
328: log.debug("PRUNING CACHE");
329: log.debug("Cache's notion of current time: "
330: + new Date(currentTime));
331: Map<URL, FeedInfo> feedInfoMap = config.getFeedInfoMap();
332:
333: int maxEntries = loadedEntries.size();
334: if (maxEntries == 0)
335: maxEntries = 100;
336:
337: // Use default load factor (0.75), and rely on the HashMap class's
338: // documented behavior: "If the initial capacity is greater than the
339: // maximum number of entries divided by the load factor, no rehash
340: // operations will ever occur."
341:
342: int initialCapacity = (int) (((float) maxEntries) / 0.75f);
343:
344: log.debug("HashMap sizing: Max entries=" + maxEntries + ", "
345: + "initialCapacity=" + initialCapacity);
346: cacheByID = new HashMap<String, FeedCacheEntry>(initialCapacity);
347: cacheByURL = new HashMap<String, FeedCacheEntry>(
348: initialCapacity);
349:
350: for (FeedCacheEntry entry : loadedEntries) {
351: boolean removed = false;
352: URL channelURL = entry.getChannelURL();
353: String itemKey = entry.getUniqueID();
354:
355: if (log.isDebugEnabled())
356: dumpCacheEntry(itemKey, entry, "");
357:
358: FeedInfo feedInfo = feedInfoMap.get(channelURL);
359:
360: if (feedInfo == null) {
361: // Cached URL no longer corresponds to a configured site
362: // URL. Kill it.
363:
364: log
365: .debug("Cached item \""
366: + itemKey
367: + "\", with base URL \""
368: + channelURL.toString()
369: + "\" no longer corresponds to a configured feed. "
370: + "Tossing it.");
371: removed = true;
372: }
373:
374: else {
375: long timestamp = entry.getTimestamp();
376: long maxCacheMS = feedInfo.getMillisecondsToCache();
377: long expires = timestamp + maxCacheMS;
378:
379: if (log.isDebugEnabled()) {
380: log.debug(" Cache time: "
381: + feedInfo.getDaysToCache() + " days ("
382: + maxCacheMS + " ms)");
383: log.debug(" Expires: "
384: + new Date(expires).toString());
385: }
386:
387: if (timestamp > currentTime) {
388: log
389: .debug("Cache time for item \""
390: + itemKey
391: + "\" is in the future, relative to cache's "
392: + "notion of current time. Setting its "
393: + "timestamp to the current time.");
394: entry.setTimestamp(currentTime);
395: }
396:
397: else if (expires < currentTime) {
398: log.debug("Cache time for item \"" + itemKey
399: + "\" has expired. Deleting cache entry.");
400: removed = true;
401: }
402: }
403:
404: if (!removed) {
405: // Add to URL cache.
406:
407: URL url = CurnUtil.normalizeURL(entry.getEntryURL());
408: String strURL = url.toString();
409: log.debug("Loading entry for URL \"" + strURL
410: + "\" into in-memory URL lookup cache.");
411: cacheByURL.put(strURL, entry);
412: log.debug("Loading entry for URL \"" + strURL
413: + "\" into in-memory ID lookup cache.");
414: cacheByID.put(itemKey, entry);
415: log.debug("Insert complete.");
416: }
417: }
418:
419: log.debug("DONE PRUNING CACHE");
420: }
421:
422: /**
423: * Dump a single cache entry via the "debug" log facility.
424: *
425: * @param itemKey the hash table key for the item
426: * @param entry the cache entry
427: * @param indent string to use to indent output, if desired
428: */
429: private void dumpCacheEntry(final Object itemKey,
430: final FeedCacheEntry entry, String indent) {
431: long timestamp = entry.getTimestamp();
432:
433: if (indent == null)
434: indent = "";
435:
436: log
437: .debug(indent + "Cached item \"" + itemKey.toString()
438: + "\"");
439: log.debug(indent + " Item URL: "
440: + entry.getEntryURL().toString());
441: log.debug(indent + " Channel URL: "
442: + entry.getChannelURL().toString());
443: log.debug(indent + " Cached on: "
444: + new Date(timestamp).toString());
445: }
446:
447: }
|