Source Code Cross Referenced for AdaptiveRevisitHostQueueTest.java in  » Web-Crawler » heritrix » org » archive » crawler » frontier » Java Source Code / Java DocumentationJava Source Code and Java Documentation

Java Source Code / Java Documentation
1. 6.0 JDK Core
2. 6.0 JDK Modules
3. 6.0 JDK Modules com.sun
4. 6.0 JDK Modules com.sun.java
5. 6.0 JDK Modules sun
6. 6.0 JDK Platform
7. Ajax
8. Apache Harmony Java SE
9. Aspect oriented
10. Authentication Authorization
11. Blogger System
12. Build
13. Byte Code
14. Cache
15. Chart
16. Chat
17. Code Analyzer
18. Collaboration
19. Content Management System
20. Database Client
21. Database DBMS
22. Database JDBC Connection Pool
23. Database ORM
24. Development
25. EJB Server geronimo
26. EJB Server GlassFish
27. EJB Server JBoss 4.2.1
28. EJB Server resin 3.1.5
29. ERP CRM Financial
30. ESB
31. Forum
32. GIS
33. Graphic Library
34. Groupware
35. HTML Parser
36. IDE
37. IDE Eclipse
38. IDE Netbeans
39. Installer
40. Internationalization Localization
41. Inversion of Control
42. Issue Tracking
43. J2EE
44. JBoss
45. JMS
46. JMX
47. Library
48. Mail Clients
49. Net
50. Parser
51. PDF
52. Portal
53. Profiler
54. Project Management
55. Report
56. RSS RDF
57. Rule Engine
58. Science
59. Scripting
60. Search Engine
61. Security
62. Servlet Container
63. Source Control
64. Swing Library
65. Template Engine
66. Test Coverage
67. Testing
68. UML
69. Web Crawler
70. Web Framework
71. Web Mail
72. Web Server
73. Web Services
74. Web Services apache cxf 2.0.1
75. Web Services AXIS2
76. Wiki Engine
77. Workflow Engines
78. XML
79. XML UI
Java
Java Tutorial
Java Open Source
Jar File Download
Java Articles
Java Products
Java by API
Photoshop Tutorials
Maya Tutorials
Flash Tutorials
3ds-Max Tutorials
Illustrator Tutorials
GIMP Tutorials
C# / C Sharp
C# / CSharp Tutorial
C# / CSharp Open Source
ASP.Net
ASP.NET Tutorial
JavaScript DHTML
JavaScript Tutorial
JavaScript Reference
HTML / CSS
HTML CSS Reference
C / ANSI-C
C Tutorial
C++
C++ Tutorial
Ruby
PHP
Python
Python Tutorial
Python Open Source
SQL Server / T-SQL
SQL Server / T-SQL Tutorial
Oracle PL / SQL
Oracle PL/SQL Tutorial
PostgreSQL
SQL / MySQL
MySQL Tutorial
VB.Net
VB.Net Tutorial
Flash / Flex / ActionScript
VBA / Excel / Access / Word
XML
XML Tutorial
Microsoft Office PowerPoint 2007 Tutorial
Microsoft Office Excel 2007 Tutorial
Microsoft Office Word 2007 Tutorial
Java Source Code / Java Documentation » Web Crawler » heritrix » org.archive.crawler.frontier 
Source Cross Referenced  Class Diagram Java Document (Java Doc) 


001:        /* AdaptiveRevisitHostQueueTest.java
002:         *
003:         * Created on Sep 13, 2004
004:         *
005:         * Copyright (C) 2004 Kristinn Sigurðsson.
006:         *
007:         * This file is part of the Heritrix web crawler (crawler.archive.org).
008:         *
009:         * Heritrix is free software; you can redistribute it and/or modify
010:         * it under the terms of the GNU Lesser Public License as published by
011:         * the Free Software Foundation; either version 2.1 of the License, or
012:         * any later version.
013:         *
014:         * Heritrix is distributed in the hope that it will be useful,
015:         * but WITHOUT ANY WARRANTY; without even the implied warranty of
016:         * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
017:         * GNU Lesser Public License for more details.
018:         *
019:         * You should have received a copy of the GNU Lesser Public License
020:         * along with Heritrix; if not, write to the Free Software
021:         * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
022:         */
023:        package org.archive.crawler.frontier;
024:
025:        import java.io.File;
026:
027:        import org.archive.crawler.datamodel.CrawlURI;
028:        import org.archive.net.UURI;
029:        import org.archive.net.UURIFactory;
030:        import org.archive.util.TmpDirTestCase;
031:        import org.archive.util.FileUtils;
032:
033:        import com.sleepycat.bind.serial.StoredClassCatalog;
034:        import com.sleepycat.je.DatabaseConfig;
035:        import com.sleepycat.je.Environment;
036:        import com.sleepycat.je.EnvironmentConfig;
037:
038:        /**
039:         * A JUnit test for {@link AdaptiveRevisitHostQueue AdaptiveRevisitHostQueue}
040:         * class. 
041:         * <p>
042:         * Since the ARHostQueue maintains significant state information there is only
043:         * one Unit test described here that tests various different transitions.
044:         *
045:         * @author Kristinn Sigurdsson
046:         */
047:        public class AdaptiveRevisitHostQueueTest extends TmpDirTestCase
048:                implements  AdaptiveRevisitAttributeConstants {
049:            public void testHQ() throws Exception {
050:                EnvironmentConfig envConfig = new EnvironmentConfig();
051:                envConfig.setTransactional(true);
052:                envConfig.setAllowCreate(true);
053:                File envDir = new File(getTmpDir(), "AR");
054:                if (envDir.exists()) {
055:                    FileUtils.deleteDir(envDir);
056:                }
057:                envDir.mkdirs();
058:                Environment env = new Environment(envDir, envConfig);
059:                // Open the class catalog database. Create it if it does not
060:                // already exist. 
061:                DatabaseConfig dbConfig = new DatabaseConfig();
062:                dbConfig.setAllowCreate(true);
063:                StoredClassCatalog catalog = new StoredClassCatalog(env
064:                        .openDatabase(null, "classes", dbConfig));
065:                AdaptiveRevisitHostQueue hq = new AdaptiveRevisitHostQueue(
066:                        "bok.hi.is", env, catalog, 1);
067:
068:                // Make the CrawlUris
069:                CrawlURI[] curis = { null, null, null, null };
070:
071:                UURI uuri = UURIFactory.getInstance("http://bok.hi.is/1.html");
072:                curis[0] = new CrawlURI(uuri);
073:                curis[0].setVia(null);
074:
075:                uuri = UURIFactory.getInstance("http://bok.hi.is/2.html");
076:                curis[1] = new CrawlURI(uuri);
077:                curis[1].setVia(null);
078:
079:                uuri = UURIFactory.getInstance("http://bok.hi.is/3.html");
080:                curis[2] = new CrawlURI(uuri);
081:                curis[2].setVia(null);
082:
083:                uuri = UURIFactory.getInstance("http://bok.hi.is/4.html");
084:                curis[3] = new CrawlURI(uuri);
085:                curis[3].setVia(null);
086:
087:                assertTrue("HQ should be empty initially",
088:                        hq.getState() == AdaptiveRevisitHostQueue.HQSTATE_EMPTY);
089:                assertEquals("Incorrect nextReadyTime on Empty",
090:                        Long.MAX_VALUE, hq.getNextReadyTime());
091:                assertEquals("Initial size of HQ should be 0", 0, hq.getSize());
092:
093:                assertEquals(
094:                        "Peek should return null when 'ready queue' is empty",
095:                        null, hq.peek());
096:
097:                /*
098:                 * Add three CrawlURIs and ensures that the correct one is reported by 
099:                 * peek(); All are added later then current time!
100:                 */
101:
102:                curis[0].putLong(A_TIME_OF_NEXT_PROCESSING, System
103:                        .currentTimeMillis()); // now
104:                curis[1].putLong(A_TIME_OF_NEXT_PROCESSING, System
105:                        .currentTimeMillis() + 5000); // in 5 sec
106:                curis[2].putLong(A_TIME_OF_NEXT_PROCESSING, System
107:                        .currentTimeMillis() + 20000); // in 20 sec.
108:
109:                hq.add(curis[0], false);
110:                assertEquals("First CrawlURI should be top", curis[0]
111:                        .toString(), hq.peek().toString());
112:                assertTrue("HQ should no longer be empty",
113:                        hq.getState() != AdaptiveRevisitHostQueue.HQSTATE_EMPTY);
114:                assertEquals("Size of HQ should now be 1", 1, hq.getSize());
115:
116:                /*
117:                 * Invoke next and ensure that the HQ is now busy (initial valence was
118:                 * set to 1). Also check for proper errors for a busy HQ. Such as when
119:                 * trying to reinvoke next().
120:                 *
121:                 */
122:                CrawlURI curi = hq.next(); // Should return curis[2]
123:                assertEquals("next() did not return 'top' URI", curis[0]
124:                        .toString(), curi.toString());
125:                assertTrue("HQ should now be busy, is " + hq.getStateByName(),
126:                        hq.getState() == AdaptiveRevisitHostQueue.HQSTATE_BUSY);
127:                try {
128:                    hq.next();
129:                    assertTrue(
130:                            "next() should throw an IllegalStateException if HQ "
131:                                    + "not ready", false);
132:                } catch (IllegalStateException e) {
133:                    // This is supposed to happen.
134:                }
135:                assertEquals("New top URI should be null", null, hq.peek());
136:
137:                hq.add(curis[1], false);
138:                assertEquals("Second CrawlURI should be top", curis[1]
139:                        .toString(), hq.peek().toString());
140:                assertEquals("Size of HQ should now be 2", 2, hq.getSize());
141:
142:                // Return it with next fetch time in the future.
143:                curi.putLong(A_TIME_OF_NEXT_PROCESSING, hq.peek().getLong(
144:                        A_TIME_OF_NEXT_PROCESSING) + 100000); // 100 sec behind current top.
145:                hq.update(curi, false, 0);
146:                assertEquals("Second CrawlURI should be still be top", curis[1]
147:                        .toString(), hq.peek().toString());
148:                assertEquals("Size of HQ should still be 2", 2, hq.getSize());
149:
150:                hq.add(curis[2], false);
151:                assertEquals("Second CrawlURI should still be top", curis[1]
152:                        .toString(), hq.peek().toString());
153:                assertEquals("Size of HQ should now be 3", 3, hq.getSize());
154:
155:                /*
156:                 * If there are no URIs ready, the queue should snooze, even though no
157:                 * politeness demand has been made.
158:                 * <p>
159:                 * Confirms this and that it wakes up.
160:                 */
161:                assertTrue(
162:                        "HQ should be snoozed, is " + hq.getStateByName(),
163:                        hq.getState() == AdaptiveRevisitHostQueue.HQSTATE_SNOOZED);
164:                // Wait past wakeup time        
165:                synchronized (this ) {
166:                    wait(hq.getNextReadyTime() - System.currentTimeMillis()
167:                            + 100);
168:                }
169:                assertTrue("HQ should now be ready, is " + hq.getStateByName(),
170:                        hq.getState() == AdaptiveRevisitHostQueue.HQSTATE_READY);
171:
172:                /*
173:                 * Re-adds a URI with a lower ready time which should promote it to the
174:                 * top of the queue. Checks if this happens correctly.
175:                 * 
176:                 * Then tests an add override which would demote it back, ensures that 
177:                 * this fails as it should (i.e. URIs time of next processing remains 
178:                 * unchanged).
179:                 */
180:                curis[2].putLong(A_TIME_OF_NEXT_PROCESSING, curis[1]
181:                        .getLong(A_TIME_OF_NEXT_PROCESSING) - 1000); // 1 sec. prior to current top 
182:                hq.add(curis[2], true);
183:                assertEquals("Size of HQ should still be 3", hq.getSize(), 3);
184:                assertEquals("Third CrawlURI should be now be top", curis[2]
185:                        .toString(), hq.peek().toString());
186:                curis[2].putLong(A_TIME_OF_NEXT_PROCESSING, curis[1]
187:                        .getLong(A_TIME_OF_NEXT_PROCESSING) + 10000); // 10 sec. later 
188:                hq.add(curis[2], true);
189:                assertEquals("Size of HQ should still be 3", hq.getSize(), 3);
190:                assertEquals("Third CrawlURI should still top", curis[2]
191:                        .toString(), hq.peek().toString());
192:
193:                /*
194:                 * Invoke next and ensure that the HQ is now busy (initial valence was
195:                 * set to 1). Also check for proper errors for a busy HQ. Such as when
196:                 * trying to reinvoke next().
197:                 *
198:                 */
199:                curi = hq.next(); // Should return curis[2]
200:                assertEquals("next() did not return 'top' URI", curis[2]
201:                        .toString(), curi.toString());
202:                assertTrue("HQ should now be busy, is " + hq.getStateByName(),
203:                        hq.getState() == AdaptiveRevisitHostQueue.HQSTATE_BUSY);
204:                try {
205:                    hq.next();
206:                    assertTrue(
207:                            "next() should throw an IllegalStateException if HQ "
208:                                    + "not ready", false);
209:                } catch (IllegalStateException e) {
210:                    // This is supposed to happen.
211:                }
212:                assertEquals("New top URI", curis[1].toString(), hq.peek()
213:                        .toString());
214:
215:                /*
216:                 * Add a URI while HQ is busy. Check if this succeeds normally.
217:                 *
218:                 */
219:
220:                curis[3].putLong(A_TIME_OF_NEXT_PROCESSING, curis[1]
221:                        .getLong(A_TIME_OF_NEXT_PROCESSING) - 1); // 1 msec. ahead of current top (order [2] 3 1 0) 
222:                hq.add(curis[3], false);
223:                assertEquals("Size of HQ should now be 4", 4, hq.getSize());
224:
225:                /*
226:                 * Invoke update, first with an invalid URI (not the one issued by 
227:                 * next() earlier), this should fail. Then with the correct one, this  
228:                 * should succeed. Then finally test update again with an invalid URI 
229:                 * (i.e. when no HQ has no outstanding URIs, that should fail.
230:                 * 
231:                 * At each step, proper checks are made of state and that  methods give  
232:                 * appropriate errors.
233:                 * 
234:                 * Updated URI is given low time of next processing to put it 'in front'
235:                 */
236:
237:                try {
238:                    hq.update(curis[1], false, 0);
239:                    assertTrue("update() should not accept URI", false);
240:                } catch (IllegalStateException e) {
241:                    // This is supposed to happen
242:                }
243:
244:                // We do not change the 'time of next processing' on update
245:                // so curis[2] should again be at top of queue. 
246:                long timeOfPolitenessWakeUp = System.currentTimeMillis() + 2000;
247:                hq.update(curi, true, timeOfPolitenessWakeUp); // Wake in 5 sec.
248:                assertTrue(
249:                        "HQ should be snoozed, is " + hq.getStateByName(),
250:                        hq.getState() == AdaptiveRevisitHostQueue.HQSTATE_SNOOZED);
251:
252:                try {
253:                    hq.update(curis[2], false, 0);
254:                    assertTrue("update() should not accept URI", false);
255:                } catch (IllegalStateException e) {
256:                    // This is supposed to happen
257:                }
258:                assertEquals(
259:                        "HQs time of next ready should reflect set wait time ",
260:                        timeOfPolitenessWakeUp, hq.getNextReadyTime());
261:
262:                /*
263:                 * Check if the HQ wakes up from it's 'snoozing'
264:                 *
265:                 */
266:                // Wait past wakeup time        
267:                synchronized (this ) {
268:                    wait(hq.getNextReadyTime() - System.currentTimeMillis()
269:                            + 100);
270:                }
271:                assertTrue("HQ should now be ready, is " + hq.getStateByName(),
272:                        hq.getState() == AdaptiveRevisitHostQueue.HQSTATE_READY);
273:                assertEquals(
274:                        "HQs time of next ready should still be when it 'woken' "
275:                                + "up.", timeOfPolitenessWakeUp, hq
276:                                .getNextReadyTime());
277:
278:                /*
279:                 * Invoke next so that the HQ has a URI being processed. Then
280:                 * close the HQ and reopen it to ensure that this happens normally, i.e.
281:                 * state is recovered properly, including the restoration of the URI
282:                 * being processed, back to the regular queue (where it should be 
283:                 * first).
284:                 * 
285:                 * On recreating the HQ, set valence to 2.
286:                 */
287:                curi = hq.next(); // Should return curis[2]
288:                assertEquals("next() did not return 'top' URI", curis[2]
289:                        .toString(), curi.toString());
290:                assertTrue("HQ should now be busy, is " + hq.getStateByName(),
291:                        hq.getState() == AdaptiveRevisitHostQueue.HQSTATE_BUSY);
292:                hq.close();
293:
294:                hq = new AdaptiveRevisitHostQueue("bok.hi.is", env, catalog, 2);
295:
296:                assertEquals("Size of HQ after reopening should now be 4", 4,
297:                        hq.getSize());
298:                assertTrue("HQ should be ready on reopen, is "
299:                        + hq.getStateByName(),
300:                        hq.getState() == AdaptiveRevisitHostQueue.HQSTATE_READY);
301:                assertEquals("CrawlURI 'in processing' before should be top",
302:                        curi.toString(), hq.peek().toString());
303:
304:                /* Check if valence higher then 1 is properly handled.
305:                 * 
306:                 * Invoke next(), check if still ready and new top URI.
307:                 */
308:                curi = hq.next(); // Should return curis[2]
309:                assertEquals("next() did not return 'top' URI", curis[2]
310:                        .toString(), curi.toString());
311:                assertTrue("HQ should still be ready, is "
312:                        + hq.getStateByName(),
313:                        hq.getState() == AdaptiveRevisitHostQueue.HQSTATE_READY);
314:
315:                /* Invoke next() again, check if now busy.
316:                 */
317:                curi = hq.next(); // Should return curis[3]
318:                assertEquals("next() did not return 'top' URI", curis[3]
319:                        .toString(), curi.toString());
320:                assertTrue("HQ should be busy, is " + hq.getStateByName(), hq
321:                        .getState() == AdaptiveRevisitHostQueue.HQSTATE_BUSY);
322:                assertEquals("Size of HQ should still be 4", 4, hq.getSize());
323:
324:                /* Update() second URI issued. Confirm HQ is now ready again. URI is 
325:                 * given same time of next processing to put it 'in front'. (no snooze)
326:                 */
327:                hq.update(curi, false, 0);
328:                assertTrue("HQ should now be ready, is " + hq.getStateByName(),
329:                        hq.getState() == AdaptiveRevisitHostQueue.HQSTATE_READY);
330:                assertEquals("'updated' CrawlURI before should be top", curi
331:                        .toString(), hq.peek().toString());
332:
333:                /* Update() again, ensure proper state. URI is NOT placed at front of 
334:                 * queue and snooze time is given. But the HQ should not enter a 
335:                 * snoozed state because the 'other' slot is free.
336:                 */
337:
338:                hq.update(curis[2], true, System.currentTimeMillis() + 1000000); // 10sec
339:                curis[3].putLong(A_TIME_OF_NEXT_PROCESSING, curis[1]
340:                        .getLong(A_TIME_OF_NEXT_PROCESSING) + 1000); // 1 sec. behind of current top 
341:                assertTrue("HQ should still be ready, is "
342:                        + hq.getStateByName(),
343:                        hq.getState() == AdaptiveRevisitHostQueue.HQSTATE_READY);
344:                assertEquals("Top CrawlURI before should be unchanged", curi
345:                        .toString(), hq.peek().toString());
346:
347:                // TODO: Test sorting with scheduling directives.
348:
349:                /*
350:                 * Close the ARHostQueue and the Environment
351:                 */
352:                hq.close();
353:                catalog.close();
354:                env.close();
355:                cleanUpOldFiles("AR");
356:            }
357:
358:        }
www.java2java.com | Contact Us
Copyright 2009 - 12 Demo Source and Support. All rights reserved.
All other trademarks are property of their respective owners.