1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23 package org.archive.crawler.frontier;
24
25 import java.io.File;
26
27 import org.archive.crawler.datamodel.CrawlURI;
28 import org.archive.net.UURI;
29 import org.archive.net.UURIFactory;
30 import org.archive.util.TmpDirTestCase;
31 import org.archive.util.FileUtils;
32
33 import com.sleepycat.bind.serial.StoredClassCatalog;
34 import com.sleepycat.je.DatabaseConfig;
35 import com.sleepycat.je.Environment;
36 import com.sleepycat.je.EnvironmentConfig;
37
38 /***
39 * A JUnit test for {@link AdaptiveRevisitHostQueue AdaptiveRevisitHostQueue}
40 * class.
41 * <p>
42 * Since the ARHostQueue maintains significant state information there is only
43 * one Unit test described here that tests various different transitions.
44 *
45 * @author Kristinn Sigurdsson
46 */
47 public class AdaptiveRevisitHostQueueTest
48 extends TmpDirTestCase
49 implements AdaptiveRevisitAttributeConstants {
50 public void testHQ() throws Exception {
51 EnvironmentConfig envConfig = new EnvironmentConfig();
52 envConfig.setTransactional(true);
53 envConfig.setAllowCreate(true);
54 File envDir = new File(getTmpDir(), "AR");
55 if (envDir.exists()) {
56 FileUtils.deleteDir(envDir);
57 }
58 envDir.mkdirs();
59 Environment env = new Environment(envDir, envConfig);
60
61
62 DatabaseConfig dbConfig = new DatabaseConfig();
63 dbConfig.setAllowCreate(true);
64 StoredClassCatalog catalog =
65 new StoredClassCatalog(env.openDatabase(null, "classes", dbConfig));
66 AdaptiveRevisitHostQueue hq =
67 new AdaptiveRevisitHostQueue("bok.hi.is", env, catalog, 1);
68
69
70
71 CrawlURI[] curis = {null,null,null,null};
72
73 UURI uuri = UURIFactory.getInstance("http://bok.hi.is/1.html");
74 curis[0] = new CrawlURI(uuri);
75 curis[0].setVia(null);
76
77 uuri = UURIFactory.getInstance("http://bok.hi.is/2.html");
78 curis[1] = new CrawlURI(uuri);
79 curis[1].setVia(null);
80
81 uuri = UURIFactory.getInstance("http://bok.hi.is/3.html");
82 curis[2] = new CrawlURI(uuri);
83 curis[2].setVia(null);
84
85 uuri = UURIFactory.getInstance("http://bok.hi.is/4.html");
86 curis[3] = new CrawlURI(uuri);
87 curis[3].setVia(null);
88
89 assertTrue("HQ should be empty initially",
90 hq.getState() == AdaptiveRevisitHostQueue.HQSTATE_EMPTY);
91 assertEquals("Incorrect nextReadyTime on Empty",
92 Long.MAX_VALUE,hq.getNextReadyTime());
93 assertEquals("Initial size of HQ should be 0",0,hq.getSize());
94
95 assertEquals("Peek should return null when 'ready queue' is empty",
96 null, hq.peek());
97
98
99
100
101
102
103 curis[0].putLong(
104 A_TIME_OF_NEXT_PROCESSING,
105 System.currentTimeMillis());
106 curis[1].putLong(
107 A_TIME_OF_NEXT_PROCESSING,
108 System.currentTimeMillis()+5000);
109 curis[2].putLong(
110 A_TIME_OF_NEXT_PROCESSING,
111 System.currentTimeMillis()+20000);
112
113 hq.add(curis[0],false);
114 assertEquals("First CrawlURI should be top",curis[0].toString(),
115 hq.peek().toString());
116 assertTrue("HQ should no longer be empty",
117 hq.getState()!=AdaptiveRevisitHostQueue.HQSTATE_EMPTY);
118 assertEquals("Size of HQ should now be 1",1,hq.getSize());
119
120
121
122
123
124
125
126 CrawlURI curi = hq.next();
127 assertEquals("next() did not return 'top' URI",
128 curis[0].toString(),curi.toString());
129 assertTrue("HQ should now be busy, is " + hq.getStateByName(),
130 hq.getState()==AdaptiveRevisitHostQueue.HQSTATE_BUSY);
131 try{
132 hq.next();
133 assertTrue("next() should throw an IllegalStateException if HQ " +
134 "not ready",false);
135 } catch(IllegalStateException e){
136
137 }
138 assertEquals("New top URI should be null",
139 null,hq.peek());
140
141 hq.add(curis[1],false);
142 assertEquals("Second CrawlURI should be top",curis[1].toString(),
143 hq.peek().toString());
144 assertEquals("Size of HQ should now be 2",2,hq.getSize());
145
146
147 curi.putLong(A_TIME_OF_NEXT_PROCESSING,
148 hq.peek().getLong(A_TIME_OF_NEXT_PROCESSING)
149 +100000);
150 hq.update(curi,false,0);
151 assertEquals("Second CrawlURI should be still be top",
152 curis[1].toString(),hq.peek().toString());
153 assertEquals("Size of HQ should still be 2",2,hq.getSize());
154
155 hq.add(curis[2],false);
156 assertEquals("Second CrawlURI should still be top",
157 curis[1].toString(), hq.peek().toString());
158 assertEquals("Size of HQ should now be 3",3,hq.getSize());
159
160
161
162
163
164
165
166 assertTrue("HQ should be snoozed, is " + hq.getStateByName(),
167 hq.getState()==AdaptiveRevisitHostQueue.HQSTATE_SNOOZED);
168
169 synchronized(this){
170 wait(hq.getNextReadyTime()-System.currentTimeMillis()+100);
171 }
172 assertTrue("HQ should now be ready, is " + hq.getStateByName(),
173 hq.getState()==AdaptiveRevisitHostQueue.HQSTATE_READY);
174
175
176
177
178
179
180
181
182
183 curis[2].putLong(
184 A_TIME_OF_NEXT_PROCESSING,
185 curis[1].getLong(A_TIME_OF_NEXT_PROCESSING)
186 -1000);
187 hq.add(curis[2],true);
188 assertEquals("Size of HQ should still be 3",hq.getSize(),3);
189 assertEquals("Third CrawlURI should be now be top",
190 curis[2].toString(), hq.peek().toString());
191 curis[2].putLong(A_TIME_OF_NEXT_PROCESSING,
192 curis[1].getLong(A_TIME_OF_NEXT_PROCESSING)
193 +10000);
194 hq.add(curis[2],true);
195 assertEquals("Size of HQ should still be 3",hq.getSize(),3);
196 assertEquals("Third CrawlURI should still top",
197 curis[2].toString(), hq.peek().toString());
198
199
200
201
202
203
204
205
206 curi = hq.next();
207 assertEquals("next() did not return 'top' URI",
208 curis[2].toString(),curi.toString());
209 assertTrue("HQ should now be busy, is " + hq.getStateByName(),
210 hq.getState()==AdaptiveRevisitHostQueue.HQSTATE_BUSY);
211 try{
212 hq.next();
213 assertTrue("next() should throw an IllegalStateException if HQ " +
214 "not ready",false);
215 } catch(IllegalStateException e){
216
217 }
218 assertEquals("New top URI",
219 curis[1].toString(),hq.peek().toString());
220
221
222
223
224
225
226 curis[3].putLong(A_TIME_OF_NEXT_PROCESSING,
227 curis[1].getLong(A_TIME_OF_NEXT_PROCESSING)
228 - 1);
229 hq.add(curis[3],false);
230 assertEquals("Size of HQ should now be 4",4,hq.getSize());
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245 try {
246 hq.update(curis[1],false,0);
247 assertTrue("update() should not accept URI",false);
248 } catch(IllegalStateException e){
249
250 }
251
252
253
254 long timeOfPolitenessWakeUp = System.currentTimeMillis()+2000;
255 hq.update(curi,true,timeOfPolitenessWakeUp);
256 assertTrue("HQ should be snoozed, is " + hq.getStateByName(),
257 hq.getState()==AdaptiveRevisitHostQueue.HQSTATE_SNOOZED);
258
259 try {
260 hq.update(curis[2],false,0);
261 assertTrue("update() should not accept URI",false);
262 } catch(IllegalStateException e){
263
264 }
265 assertEquals("HQs time of next ready should reflect set wait time ",
266 timeOfPolitenessWakeUp, hq.getNextReadyTime());
267
268
269
270
271
272
273
274 synchronized(this){
275 wait(hq.getNextReadyTime()-System.currentTimeMillis()+100);
276 }
277 assertTrue("HQ should now be ready, is " + hq.getStateByName(),
278 hq.getState()==AdaptiveRevisitHostQueue.HQSTATE_READY);
279 assertEquals("HQs time of next ready should still be when it 'woken' " +
280 "up.", timeOfPolitenessWakeUp, hq.getNextReadyTime());
281
282
283
284
285
286
287
288
289
290
291 curi = hq.next();
292 assertEquals("next() did not return 'top' URI",
293 curis[2].toString(),curi.toString());
294 assertTrue("HQ should now be busy, is " + hq.getStateByName(),
295 hq.getState()==AdaptiveRevisitHostQueue.HQSTATE_BUSY);
296 hq.close();
297
298 hq = new AdaptiveRevisitHostQueue("bok.hi.is", env, catalog, 2);
299
300 assertEquals("Size of HQ after reopening should now be 4",
301 4, hq.getSize());
302 assertTrue("HQ should be ready on reopen, is " + hq.getStateByName(),
303 hq.getState()==AdaptiveRevisitHostQueue.HQSTATE_READY);
304 assertEquals("CrawlURI 'in processing' before should be top",
305 curi.toString(), hq.peek().toString());
306
307
308
309
310
311 curi = hq.next();
312 assertEquals("next() did not return 'top' URI",
313 curis[2].toString(),curi.toString());
314 assertTrue("HQ should still be ready, is " + hq.getStateByName(),
315 hq.getState()==AdaptiveRevisitHostQueue.HQSTATE_READY);
316
317
318
319 curi = hq.next();
320 assertEquals("next() did not return 'top' URI",
321 curis[3].toString(),curi.toString());
322 assertTrue("HQ should be busy, is " + hq.getStateByName(),
323 hq.getState()==AdaptiveRevisitHostQueue.HQSTATE_BUSY);
324 assertEquals("Size of HQ should still be 4",
325 4, hq.getSize());
326
327
328
329
330
331 hq.update(curi,false,0);
332 assertTrue("HQ should now be ready, is " + hq.getStateByName(),
333 hq.getState()==AdaptiveRevisitHostQueue.HQSTATE_READY);
334 assertEquals("'updated' CrawlURI before should be top",
335 curi.toString(), hq.peek().toString());
336
337
338
339
340
341
342
343 hq.update(curis[2],true,System.currentTimeMillis() + 1000000);
344 curis[3].putLong(A_TIME_OF_NEXT_PROCESSING,
345 curis[1].getLong(A_TIME_OF_NEXT_PROCESSING)
346 + 1000);
347 assertTrue("HQ should still be ready, is " + hq.getStateByName(),
348 hq.getState()==AdaptiveRevisitHostQueue.HQSTATE_READY);
349 assertEquals("Top CrawlURI before should be unchanged",
350 curi.toString(), hq.peek().toString());
351
352
353
354
355
356
357
358 hq.close();
359 catalog.close();
360 env.close();
361 cleanUpOldFiles("AR");
362 }
363
364 }