View Javadoc

1   /* ARHostQueueTest.java
2   *
3   * Created on Sep 13, 2004
4   *
5   * Copyright (C) 2004 Kristinn Sigurðsson.
6   *
7   * This file is part of the Heritrix web crawler (crawler.archive.org).
8   *
9   * Heritrix is free software; you can redistribute it and/or modify
10  * it under the terms of the GNU Lesser Public License as published by
11  * the Free Software Foundation; either version 2.1 of the License, or
12  * any later version.
13  *
14  * Heritrix is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17  * GNU Lesser Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser Public License
20  * along with Heritrix; if not, write to the Free Software
21  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
22  */
23  package org.archive.crawler.frontier;
24  
25  import java.io.File;
26  
27  import org.archive.crawler.datamodel.CrawlURI;
28  import org.archive.net.UURI;
29  import org.archive.net.UURIFactory;
30  import org.archive.util.TmpDirTestCase;
31  import org.archive.util.FileUtils;
32  
33  import com.sleepycat.bind.serial.StoredClassCatalog;
34  import com.sleepycat.je.DatabaseConfig;
35  import com.sleepycat.je.Environment;
36  import com.sleepycat.je.EnvironmentConfig;
37  
38  /***
39   * A JUnit test for {@link AdaptiveRevisitHostQueue AdaptiveRevisitHostQueue}
40   * class. 
41   * <p>
42   * Since the ARHostQueue maintains significant state information there is only
43   * one Unit test described here that tests various different transitions.
44   *
45   * @author Kristinn Sigurdsson
46   */
47  public class AdaptiveRevisitHostQueueTest
48  extends TmpDirTestCase
49  implements AdaptiveRevisitAttributeConstants {
50      public void testHQ() throws Exception {
51          EnvironmentConfig envConfig = new EnvironmentConfig();
52          envConfig.setTransactional(true); 
53          envConfig.setAllowCreate(true);    
54          File envDir = new File(getTmpDir(), "AR");
55          if (envDir.exists()) {
56              FileUtils.deleteDir(envDir);
57          }
58          envDir.mkdirs();
59          Environment env = new Environment(envDir, envConfig);
60          // Open the class catalog database. Create it if it does not
61          // already exist. 
62          DatabaseConfig dbConfig = new DatabaseConfig();
63          dbConfig.setAllowCreate(true);
64          StoredClassCatalog catalog =
65              new StoredClassCatalog(env.openDatabase(null, "classes", dbConfig));
66          AdaptiveRevisitHostQueue hq =
67              new AdaptiveRevisitHostQueue("bok.hi.is", env, catalog, 1);
68  
69  
70          // Make the CrawlUris
71          CrawlURI[] curis = {null,null,null,null};
72  
73          UURI uuri = UURIFactory.getInstance("http://bok.hi.is/1.html");
74          curis[0] = new CrawlURI(uuri);
75          curis[0].setVia(null);
76          
77          uuri = UURIFactory.getInstance("http://bok.hi.is/2.html");
78          curis[1] = new CrawlURI(uuri);
79          curis[1].setVia(null);
80  
81          uuri = UURIFactory.getInstance("http://bok.hi.is/3.html");
82          curis[2] = new CrawlURI(uuri);
83          curis[2].setVia(null);
84  
85          uuri = UURIFactory.getInstance("http://bok.hi.is/4.html");
86          curis[3] = new CrawlURI(uuri);
87          curis[3].setVia(null);
88  
89          assertTrue("HQ should be empty initially",
90                  hq.getState() == AdaptiveRevisitHostQueue.HQSTATE_EMPTY);
91          assertEquals("Incorrect nextReadyTime on Empty",
92                  Long.MAX_VALUE,hq.getNextReadyTime());
93          assertEquals("Initial size of HQ should be 0",0,hq.getSize());
94          
95          assertEquals("Peek should return null when 'ready queue' is empty", 
96                  null, hq.peek());
97      
98          /*
99           * Add three CrawlURIs and ensures that the correct one is reported by 
100          * peek(); All are added later then current time!
101          */
102 
103         curis[0].putLong(
104                 A_TIME_OF_NEXT_PROCESSING,
105                 System.currentTimeMillis()); // now
106         curis[1].putLong(
107                 A_TIME_OF_NEXT_PROCESSING,
108                 System.currentTimeMillis()+5000); // in 5 sec
109         curis[2].putLong(
110                 A_TIME_OF_NEXT_PROCESSING,
111                 System.currentTimeMillis()+20000); // in 20 sec.
112         
113         hq.add(curis[0],false);
114         assertEquals("First CrawlURI should be top",curis[0].toString(),
115                 hq.peek().toString());
116         assertTrue("HQ should no longer be empty",
117                 hq.getState()!=AdaptiveRevisitHostQueue.HQSTATE_EMPTY);
118         assertEquals("Size of HQ should now be 1",1,hq.getSize());
119         
120         /*
121          * Invoke next and ensure that the HQ is now busy (initial valence was
122          * set to 1). Also check for proper errors for a busy HQ. Such as when
123          * trying to reinvoke next().
124          *
125          */
126         CrawlURI curi = hq.next(); // Should return curis[2]
127         assertEquals("next() did not return 'top' URI",
128                 curis[0].toString(),curi.toString());
129         assertTrue("HQ should now be busy, is " + hq.getStateByName(),
130                 hq.getState()==AdaptiveRevisitHostQueue.HQSTATE_BUSY);
131         try{
132             hq.next();
133             assertTrue("next() should throw an IllegalStateException if HQ " +
134                     "not ready",false);
135         } catch(IllegalStateException e){
136             // This is supposed to happen.
137         }
138         assertEquals("New top URI should be null",
139                 null,hq.peek());
140         
141         hq.add(curis[1],false);
142         assertEquals("Second CrawlURI should be top",curis[1].toString(),
143                 hq.peek().toString());
144         assertEquals("Size of HQ should now be 2",2,hq.getSize());
145 
146         // Return it with next fetch time in the future.
147         curi.putLong(A_TIME_OF_NEXT_PROCESSING,
148             hq.peek().getLong(A_TIME_OF_NEXT_PROCESSING)
149                         +100000); // 100 sec behind current top.
150         hq.update(curi,false,0);
151         assertEquals("Second CrawlURI should be still be top",
152                 curis[1].toString(),hq.peek().toString());
153         assertEquals("Size of HQ should still be 2",2,hq.getSize());
154         
155         hq.add(curis[2],false);
156         assertEquals("Second CrawlURI should still be top",
157                 curis[1].toString(), hq.peek().toString());
158         assertEquals("Size of HQ should now be 3",3,hq.getSize());
159 
160         /*
161          * If there are no URIs ready, the queue should snooze, even though no
162          * politeness demand has been made.
163          * <p>
164          * Confirms this and that it wakes up.
165          */
166         assertTrue("HQ should be snoozed, is " + hq.getStateByName(),
167                 hq.getState()==AdaptiveRevisitHostQueue.HQSTATE_SNOOZED);
168         // Wait past wakeup time        
169         synchronized(this){
170             wait(hq.getNextReadyTime()-System.currentTimeMillis()+100);
171         }
172         assertTrue("HQ should now be ready, is " + hq.getStateByName(),
173                 hq.getState()==AdaptiveRevisitHostQueue.HQSTATE_READY);
174     
175         /*
176          * Re-adds a URI with a lower ready time which should promote it to the
177          * top of the queue. Checks if this happens correctly.
178          * 
179          * Then tests an add override which would demote it back, ensures that 
180          * this fails as it should (i.e. URIs time of next processing remains 
181          * unchanged).
182          */
183         curis[2].putLong(
184                 A_TIME_OF_NEXT_PROCESSING,
185                 curis[1].getLong(A_TIME_OF_NEXT_PROCESSING)
186                             -1000); // 1 sec. prior to current top 
187         hq.add(curis[2],true);
188         assertEquals("Size of HQ should still be 3",hq.getSize(),3);
189         assertEquals("Third CrawlURI should be now be top",
190                 curis[2].toString(), hq.peek().toString());
191         curis[2].putLong(A_TIME_OF_NEXT_PROCESSING,
192                 curis[1].getLong(A_TIME_OF_NEXT_PROCESSING)
193                             +10000); // 10 sec. later 
194         hq.add(curis[2],true);
195         assertEquals("Size of HQ should still be 3",hq.getSize(),3);
196         assertEquals("Third CrawlURI should still top",
197                 curis[2].toString(), hq.peek().toString());
198 
199     
200         /*
201          * Invoke next and ensure that the HQ is now busy (initial valence was
202          * set to 1). Also check for proper errors for a busy HQ. Such as when
203          * trying to reinvoke next().
204          *
205          */
206         curi = hq.next(); // Should return curis[2]
207         assertEquals("next() did not return 'top' URI",
208                 curis[2].toString(),curi.toString());
209         assertTrue("HQ should now be busy, is " + hq.getStateByName(),
210                 hq.getState()==AdaptiveRevisitHostQueue.HQSTATE_BUSY);
211         try{
212             hq.next();
213             assertTrue("next() should throw an IllegalStateException if HQ " +
214                     "not ready",false);
215         } catch(IllegalStateException e){
216             // This is supposed to happen.
217         }
218         assertEquals("New top URI",
219                 curis[1].toString(),hq.peek().toString());
220         
221         /*
222          * Add a URI while HQ is busy. Check if this succeeds normally.
223          *
224          */
225         
226         curis[3].putLong(A_TIME_OF_NEXT_PROCESSING,
227                 curis[1].getLong(A_TIME_OF_NEXT_PROCESSING) 
228                         - 1); // 1 msec. ahead of current top (order [2] 3 1 0) 
229         hq.add(curis[3],false);
230         assertEquals("Size of HQ should now be 4",4,hq.getSize());
231         
232         
233         /*
234          * Invoke update, first with an invalid URI (not the one issued by 
235          * next() earlier), this should fail. Then with the correct one, this  
236          * should succeed. Then finally test update again with an invalid URI 
237          * (i.e. when no HQ has no outstanding URIs, that should fail.
238          * 
239          * At each step, proper checks are made of state and that  methods give  
240          * appropriate errors.
241          * 
242          * Updated URI is given low time of next processing to put it 'in front'
243          */
244     
245         try {
246             hq.update(curis[1],false,0);
247             assertTrue("update() should not accept URI",false);
248         } catch(IllegalStateException e){
249             // This is supposed to happen
250         }
251         
252         // We do not change the 'time of next processing' on update
253         // so curis[2] should again be at top of queue. 
254         long timeOfPolitenessWakeUp = System.currentTimeMillis()+2000;
255         hq.update(curi,true,timeOfPolitenessWakeUp); // Wake in 5 sec.
256         assertTrue("HQ should be snoozed, is " + hq.getStateByName(),
257                 hq.getState()==AdaptiveRevisitHostQueue.HQSTATE_SNOOZED);
258         
259         try {
260             hq.update(curis[2],false,0);
261             assertTrue("update() should not accept URI",false);
262         } catch(IllegalStateException e){
263             // This is supposed to happen
264         }
265         assertEquals("HQs time of next ready should reflect set wait time ",
266                 timeOfPolitenessWakeUp, hq.getNextReadyTime());
267         
268         
269         /*
270          * Check if the HQ wakes up from it's 'snoozing'
271          *
272          */
273         // Wait past wakeup time        
274         synchronized(this){
275             wait(hq.getNextReadyTime()-System.currentTimeMillis()+100);
276         }
277         assertTrue("HQ should now be ready, is " + hq.getStateByName(),
278                 hq.getState()==AdaptiveRevisitHostQueue.HQSTATE_READY);
279         assertEquals("HQs time of next ready should still be when it 'woken' " +
280                 "up.", timeOfPolitenessWakeUp, hq.getNextReadyTime());
281    
282         /*
283          * Invoke next so that the HQ has a URI being processed. Then
284          * close the HQ and reopen it to ensure that this happens normally, i.e.
285          * state is recovered properly, including the restoration of the URI
286          * being processed, back to the regular queue (where it should be 
287          * first).
288          * 
289          * On recreating the HQ, set valence to 2.
290          */
291         curi = hq.next(); // Should return curis[2]
292         assertEquals("next() did not return 'top' URI",
293                 curis[2].toString(),curi.toString());
294         assertTrue("HQ should now be busy, is " + hq.getStateByName(),
295                 hq.getState()==AdaptiveRevisitHostQueue.HQSTATE_BUSY);
296         hq.close();
297         
298         hq = new AdaptiveRevisitHostQueue("bok.hi.is", env, catalog, 2);
299         
300         assertEquals("Size of HQ after reopening should now be 4",
301                 4, hq.getSize());
302         assertTrue("HQ should be ready on reopen, is " + hq.getStateByName(),
303                 hq.getState()==AdaptiveRevisitHostQueue.HQSTATE_READY);
304         assertEquals("CrawlURI 'in processing' before should be top",
305                 curi.toString(), hq.peek().toString());
306     
307         /* Check if valence higher then 1 is properly handled.
308          * 
309          * Invoke next(), check if still ready and new top URI.
310          */ 
311         curi = hq.next(); // Should return curis[2]
312         assertEquals("next() did not return 'top' URI",
313                 curis[2].toString(),curi.toString());
314         assertTrue("HQ should still be ready, is " + hq.getStateByName(),
315                 hq.getState()==AdaptiveRevisitHostQueue.HQSTATE_READY);
316         
317         /* Invoke next() again, check if now busy.
318          */ 
319         curi = hq.next(); // Should return curis[3]
320         assertEquals("next() did not return 'top' URI",
321                 curis[3].toString(),curi.toString());
322         assertTrue("HQ should be busy, is " + hq.getStateByName(),
323                 hq.getState()==AdaptiveRevisitHostQueue.HQSTATE_BUSY);
324         assertEquals("Size of HQ should still be 4",
325                 4, hq.getSize());
326 
327         
328         /* Update() second URI issued. Confirm HQ is now ready again. URI is 
329          * given same time of next processing to put it 'in front'. (no snooze)
330          */ 
331         hq.update(curi,false,0);
332         assertTrue("HQ should now be ready, is " + hq.getStateByName(),
333                 hq.getState()==AdaptiveRevisitHostQueue.HQSTATE_READY);
334         assertEquals("'updated' CrawlURI before should be top",
335                 curi.toString(), hq.peek().toString());
336         
337         
338         /* Update() again, ensure proper state. URI is NOT placed at front of 
339          * queue and snooze time is given. But the HQ should not enter a 
340          * snoozed state because the 'other' slot is free.
341          */
342         
343         hq.update(curis[2],true,System.currentTimeMillis() + 1000000); // 10sec
344         curis[3].putLong(A_TIME_OF_NEXT_PROCESSING,
345                 curis[1].getLong(A_TIME_OF_NEXT_PROCESSING) 
346                         + 1000); // 1 sec. behind of current top 
347         assertTrue("HQ should still be ready, is " + hq.getStateByName(),
348                 hq.getState()==AdaptiveRevisitHostQueue.HQSTATE_READY);
349         assertEquals("Top CrawlURI before should be unchanged",
350                 curi.toString(), hq.peek().toString());
351         
352 
353         // TODO: Test sorting with scheduling directives.
354         
355         /*
356          * Close the ARHostQueue and the Environment
357          */
358         hq.close();
359         catalog.close();
360         env.close();
361         cleanUpOldFiles("AR");
362     }
363     
364 }