001: package net.javacoding.jspider.core.storage.memory;
002:
003: import net.javacoding.jspider.core.event.impl.*;
004: import net.javacoding.jspider.core.model.*;
005: import net.javacoding.jspider.core.storage.spi.ResourceDAOSPI;
006: import net.javacoding.jspider.core.storage.spi.StorageSPI;
007: import net.javacoding.jspider.core.storage.exception.InvalidStateTransitionException;
008: import net.javacoding.jspider.core.util.URLUtil;
009:
010: import java.net.URL;
011: import java.util.*;
012:
013: /**
014: * $Id: ResourceDAOImpl.java,v 1.12 2003/04/11 16:37:07 vanrogu Exp $
015: */
016: class ResourceDAOImpl implements ResourceDAOSPI {
017:
018: protected StorageSPI storage;
019:
020: protected Map knownURLs;
021: protected Map byId;
022:
023: protected Set spideredResources; /* urls visited by a spider, but not yet parsed */
024:
025: protected Set ignoredForFetchingResources; /* urls ignored because of rule decisions */
026: protected Set ignoredForParsingResources; /* urls ignored because non-HTML */
027: protected Set forbiddenResources; /* forbidden urls */
028: protected Set fetchErrorResources; /* urls that could not be visited by the spider */
029: protected Set parseErrorResources; /* resources that could not be parsed correctly */
030: protected Set parsedResources; /* urls that were spidered AND interpreted */
031:
032: protected Map referers;
033: protected Map referees;
034:
035: protected Map byFolder;
036: protected Map rootResources;
037:
038: public ResourceDAOImpl(StorageSPI storage) {
039: this .storage = storage;
040: spideredResources = new HashSet();
041: ignoredForFetchingResources = new HashSet();
042: ignoredForParsingResources = new HashSet();
043: forbiddenResources = new HashSet();
044: fetchErrorResources = new HashSet();
045: parseErrorResources = new HashSet();
046: parsedResources = new HashSet();
047: knownURLs = new HashMap();
048: this .byId = new HashMap();
049: this .referees = new HashMap();
050: this .referers = new HashMap();
051: this .byFolder = new HashMap();
052: this .rootResources = new HashMap();
053: }
054:
055: public void create(int id, ResourceInternal resource) {
056: URL url = resource.getURL();
057: knownURLs.put(url, resource);
058: byId.put(new Integer(id), resource);
059:
060: if (resource.getFolder() == null) {
061: Set set = (Set) rootResources.get(URLUtil.getSiteURL(url));
062: if (set == null) {
063: set = new HashSet();
064: rootResources.put(URLUtil.getSiteURL(url), set);
065: }
066: set.add(resource);
067: } else {
068: Set set = (Set) byFolder.get(resource.getFolder());
069: if (set == null) {
070: set = new HashSet();
071: byFolder.put(resource.getFolder(), set);
072: }
073: set.add(resource);
074: }
075: }
076:
077: public void registerURLReference(URL url, URL refererURL) {
078: ResourceInternal resource = (ResourceInternal) knownURLs
079: .get(url);
080: if (refererURL != null) {
081: ResourceInternal referer = (ResourceInternal) knownURLs
082: .get(refererURL);
083: storeRef(referers, resource, referer, refererURL, url);
084: storeRef(referees, referer, resource, refererURL, url);
085: }
086: }
087:
088: public ResourceInternal[] findByFolder(FolderInternal folder) {
089: Set set = (Set) byFolder.get(folder);
090: if (set == null) {
091: return new ResourceInternal[0];
092: }
093: return (ResourceInternal[]) set
094: .toArray(new ResourceInternal[set.size()]);
095: }
096:
097: protected void storeRef(Map map, ResourceInternal key,
098: ResourceInternal data, URL referer, URL referee) {
099: Map refmap = (Map) map.get(key.getURL());
100: if (refmap == null) {
101: refmap = new HashMap();
102: map.put(key.getURL(), refmap);
103: }
104: ResourceReferenceInternal rri = (ResourceReferenceInternal) refmap
105: .get(data.getURL());
106: if (rri == null) {
107: rri = new ResourceReferenceInternal(storage, referer,
108: referee, 0);
109: refmap.put(data.getURL(), rri);
110: }
111: rri.incrementCount();
112: }
113:
114: public ResourceInternal[] findAllResources() {
115: return (ResourceInternal[]) knownURLs.values().toArray(
116: new ResourceInternal[knownURLs.size()]);
117: }
118:
119: public ResourceInternal[] getRefereringResources(
120: ResourceInternal resource) {
121: ResourceReferenceInternal[] refs = getIncomingReferences(resource);
122: ArrayList al = new ArrayList();
123: for (int i = 0; i < refs.length; i++) {
124: ResourceReferenceInternal ref = refs[i];
125: al.add(ref.getReferer());
126: }
127: return (ResourceInternal[]) al.toArray(new ResourceInternal[al
128: .size()]);
129: }
130:
131: public ResourceReferenceInternal[] getOutgoingReferences(
132: ResourceInternal resource) {
133: Map map = (Map) referees.get(resource.getURL());
134: if (map == null) {
135: return new ResourceReferenceInternal[0];
136: } else {
137: return (ResourceReferenceInternal[]) map.values().toArray(
138: new ResourceReferenceInternal[map.size()]);
139: }
140: }
141:
142: public ResourceReferenceInternal[] getIncomingReferences(
143: ResourceInternal resource) {
144: Map map = (Map) referers.get(resource.getURL());
145: if (map == null) {
146: return new ResourceReferenceInternal[0];
147: } else {
148: return (ResourceReferenceInternal[]) map.values().toArray(
149: new ResourceReferenceInternal[map.size()]);
150: }
151: }
152:
153: public ResourceInternal[] getReferencedResources(
154: ResourceInternal resource) {
155: ResourceReferenceInternal[] refs = getOutgoingReferences(resource);
156: ArrayList al = new ArrayList();
157: for (int i = 0; i < refs.length; i++) {
158: ResourceReferenceInternal ref = refs[i];
159: al.add(ref.getReferee());
160: }
161: return (ResourceInternal[]) al.toArray(new ResourceInternal[al
162: .size()]);
163: }
164:
165: public ResourceInternal[] getBySite(SiteInternal site) {
166: ArrayList al = new ArrayList();
167: Iterator it = knownURLs.keySet().iterator();
168: while (it.hasNext()) {
169: URL url = (URL) it.next();
170: URL siteURL = URLUtil.getSiteURL(url);
171: if (site.getURL().equals(siteURL)) {
172: al.add(getResource(url));
173: }
174: }
175: return (ResourceInternal[]) al.toArray(new ResourceInternal[al
176: .size()]);
177: }
178:
179: public ResourceInternal[] getRootResources(SiteInternal site) {
180: Set set = (Set) rootResources.get(site.getURL());
181: if (set == null) {
182: return new ResourceInternal[0];
183: } else {
184: return (ResourceInternal[]) set
185: .toArray(new ResourceInternal[set.size()]);
186: }
187: }
188:
189: public ResourceInternal getResource(int id) {
190: return (ResourceInternal) byId.get(new Integer(id));
191: }
192:
193: public ResourceInternal getResource(URL url) {
194: return (ResourceInternal) knownURLs.get(url);
195: }
196:
197: public synchronized void setSpidered(URL url,
198: URLSpideredOkEvent event) {
199: ResourceInternal resource = getResource(url);
200: resource.setFetched(event.getHttpStatus(), event.getSize(),
201: event.getTimeMs(), event.getMimeType(), null, event
202: .getHeaders());
203: resource.setBytes(event.getBytes());
204: }
205:
206: public synchronized void setIgnoredForParsing(URL url)
207: throws InvalidStateTransitionException {
208: ResourceInternal resource = getResource(url);
209: resource.setParseIgnored();
210: ignoredForParsingResources.add(url);
211: }
212:
213: public synchronized void setIgnoredForFetching(URL url,
214: URLFoundEvent event) throws InvalidStateTransitionException {
215: ResourceInternal resource = getResource(url);
216: resource.setFetchIgnored();
217: ignoredForFetchingResources.add(event.getFoundURL());
218: }
219:
220: public synchronized void setForbidden(URL url, URLFoundEvent event)
221: throws InvalidStateTransitionException {
222: ResourceInternal resource = getResource(url);
223: resource.setForbidden();
224: forbiddenResources.add(event.getFoundURL());
225: }
226:
227: public synchronized void setError(URL url,
228: ResourceParsedErrorEvent event)
229: throws InvalidStateTransitionException {
230: ResourceInternal resource = getResource(url);
231: resource.setParseError();
232: parseErrorResources.add(url);
233: }
234:
235: public synchronized void setParsed(URL url,
236: ResourceParsedOkEvent event)
237: throws InvalidStateTransitionException {
238: ResourceInternal resource = getResource(url);
239: resource.setParsed();
240: parsedResources.add(resource);
241: }
242:
243: public synchronized void setError(URL url,
244: URLSpideredErrorEvent event)
245: throws InvalidStateTransitionException {
246: ResourceInternal resource = getResource(url);
247: resource.setFetchError(event.getHttpStatus(), event
248: .getHeaders());
249: fetchErrorResources.add(url);
250: }
251:
252: }
|