1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23 package org.archive.crawler.deciderules;
24
25 import java.io.File;
26
27 import javax.management.Attribute;
28 import javax.management.AttributeNotFoundException;
29 import javax.management.InvalidAttributeValueException;
30 import javax.management.MBeanException;
31 import javax.management.ReflectionException;
32
33 import org.apache.commons.httpclient.URIException;
34 import org.archive.crawler.datamodel.CandidateURI;
35 import org.archive.crawler.datamodel.CrawlOrder;
36 import org.archive.crawler.datamodel.CrawlURI;
37 import org.archive.crawler.filter.ContentTypeRegExpFilter;
38 import org.archive.crawler.settings.MapType;
39 import org.archive.crawler.settings.SettingsHandler;
40 import org.archive.crawler.settings.XMLSettingsHandler;
41 import org.archive.net.UURI;
42 import org.archive.net.UURIFactory;
43 import org.archive.util.SurtPrefixSet;
44 import org.archive.util.TmpDirTestCase;
45
46 /***
47 * @author stack
48 * @version $Date: 2007-04-06 01:13:26 +0000 (Fri, 06 Apr 2007) $, $Revision: 5041 $
49 */
50 public class DecideRuleSequenceTest extends TmpDirTestCase {
51 /***
52 * Gets setup by {@link #setUp()}.
53 */
54 private DecideRuleSequence rule = null;
55
56 protected void setUp() throws Exception {
57 super.setUp();
58 final String name = this.getClass().getName();
59 SettingsHandler settingsHandler = new XMLSettingsHandler(
60 new File(getTmpDir(), name + ".order.xml"));
61 settingsHandler.initialize();
62
63
64
65 this.rule = (DecideRuleSequence)((MapType)settingsHandler.getOrder().
66 getAttribute(CrawlOrder.ATTR_RULES)).addElement(settingsHandler.
67 getSettingsObject(null), new DecideRuleSequence(name));
68 }
69
70 public void testEmptySequence() {
71 Object decision = this.rule.decisionFor("test");
72 assertTrue("Expect PASS but got " + decision,
73 decision == DecideRule.PASS);
74 }
75
76 public void testSingleACCEPT() throws InvalidAttributeValueException {
77 Object decision = addDecideRule(new AcceptDecideRule("ACCEPT")).
78 decisionFor("test");
79 assertTrue("Expect ACCEPT but got " + decision,
80 decision == DecideRule.ACCEPT);
81 }
82
83 public void testSingleREJECT() throws InvalidAttributeValueException {
84 Object decision = addDecideRule(new RejectDecideRule("REJECT")).
85 decisionFor("test");
86 assertTrue("Expect REJECT but got " + decision,
87 decision == DecideRule.REJECT);
88 }
89
90 public void testSinglePASS() throws InvalidAttributeValueException {
91 Object decision = addDecideRule(new DecideRule("PASS")).
92 decisionFor("test");
93 assertTrue("Expect PASS but got " + decision,
94 decision == DecideRule.PASS);
95 }
96
97
98 public void testACCEPTWins() throws InvalidAttributeValueException {
99 addDecideRule(new DecideRule("PASS1"));
100 addDecideRule(new RejectDecideRule("REJECT1"));
101 addDecideRule(new DecideRule("PASS2"));
102 addDecideRule(new AcceptDecideRule("ACCEPT1"));
103 addDecideRule(new RejectDecideRule("REJECT2"));
104 addDecideRule(new DecideRule("PASS3"));
105 addDecideRule(new AcceptDecideRule("ACCEPT2"));
106 addDecideRule(new DecideRule("PASS4"));
107 Object decision = this.rule.decisionFor("test");
108 assertTrue("Expect ACCEPT but got " + decision,
109 decision == DecideRule.ACCEPT);
110 }
111
112 public void testREJECTWins() throws InvalidAttributeValueException {
113 addDecideRule(new DecideRule("PASS1"));
114 addDecideRule(new RejectDecideRule("REJECT1"));
115 addDecideRule(new DecideRule("PASS2"));
116 addDecideRule(new AcceptDecideRule("ACCEPT1"));
117 addDecideRule(new RejectDecideRule("REJECT2"));
118 addDecideRule(new DecideRule("PASS3"));
119 addDecideRule(new AcceptDecideRule("ACCEPT2"));
120 addDecideRule(new DecideRule("PASS4"));
121 addDecideRule(new RejectDecideRule("REJECT3"));
122 Object decision = this.rule.decisionFor("test");
123 assertTrue("Expect REJECT but got " + decision,
124 decision == DecideRule.REJECT);
125 }
126
127 public void testRegex()
128 throws InvalidAttributeValueException, AttributeNotFoundException,
129 MBeanException, ReflectionException {
130 final String regexName = "REGEX";
131 DecideRule r = addDecideRule(new MatchesRegExpDecideRule(regexName));
132
133 r.setAttribute(new Attribute(MatchesRegExpDecideRule.ATTR_REGEXP,
134 "^.*//.archive//.org"));
135 Object decision = this.rule.decisionFor("http://google.com");
136 assertTrue("Expect PASS but got " + decision,
137 decision == DecideRule.PASS);
138 decision = this.rule.decisionFor("http://archive.org");
139 assertTrue("Expect PASS but got " + decision,
140 decision == DecideRule.PASS);
141 decision = this.rule.decisionFor("http://www.archive.org");
142 assertTrue("Expect ACCEPT but got " + decision,
143 decision == DecideRule.ACCEPT);
144 }
145
146 public void testNotRegex()
147 throws InvalidAttributeValueException, AttributeNotFoundException,
148 MBeanException, ReflectionException {
149 final String regexName = "NOT_REGEX";
150 DecideRule r = addDecideRule(new NotMatchesRegExpDecideRule(regexName));
151
152 r.setAttribute(new Attribute(MatchesRegExpDecideRule.ATTR_REGEXP,
153 "^.*//.archive//.org"));
154 Object decision = this.rule.decisionFor("http://google.com");
155 assertTrue("Expect ACCEPT but got " + decision,
156 decision == DecideRule.ACCEPT);
157 decision = this.rule.decisionFor("http://www.archive.org");
158 assertTrue("Expect PASS but got " + decision,
159 decision == DecideRule.PASS);
160 }
161
162
163 public void testPrerequisite()
164 throws InvalidAttributeValueException, URIException {
165 addDecideRule(new PrerequisiteAcceptDecideRule("PREREQUISITE"));
166 UURI uuri = UURIFactory.getInstance("http://archive.org");
167 CandidateURI candidate = new CandidateURI(uuri);
168 Object decision = this.rule.decisionFor(candidate);
169 assertTrue("Expect PASS but got " + decision,
170 decision == DecideRule.PASS);
171 candidate = new CandidateURI(uuri, "LLP", null, null);
172 decision = this.rule.decisionFor(candidate);
173 assertTrue("Expect ACCEPT but got " + decision,
174 decision == DecideRule.ACCEPT);
175 }
176
177 public void testHops()
178 throws InvalidAttributeValueException, URIException {
179 addDecideRule(new TooManyHopsDecideRule("HOPS"));
180 testHopLimit(TooManyHopsDecideRule.DEFAULT_MAX_HOPS.intValue(), 'L',
181 DecideRule.PASS, DecideRule.REJECT);
182 }
183
184 public void testTransclusion()
185 throws InvalidAttributeValueException, URIException {
186 addDecideRule(new TransclusionDecideRule("TRANSCLUSION"));
187 final int max =
188 TransclusionDecideRule.DEFAULT_MAX_TRANS_HOPS.intValue();
189 final char pathExpansion = 'E';
190 UURI uuri = UURIFactory.getInstance("http://archive.org");
191 CandidateURI candidate = new CandidateURI(uuri);
192 Object decision = this.rule.decisionFor(candidate);
193 assertTrue("Expect " + DecideRule.PASS + " but got " + decision,
194 decision == DecideRule.PASS);
195 StringBuffer path = new StringBuffer(max);
196 for (int i = 0; i < (max - 1); i++) {
197 path.append(pathExpansion);
198 }
199 candidate = new CandidateURI(uuri, path.toString(), null, null);
200 decision = this.rule.decisionFor(candidate);
201 assertTrue("Expect " + DecideRule.ACCEPT + " but got " + decision,
202 decision == DecideRule.ACCEPT);
203 String pathCopy = path.toString();
204 path.append(pathExpansion);
205 candidate = new CandidateURI(uuri, path.toString(), null, null);
206 decision = this.rule.decisionFor(candidate);
207 assertTrue("Expect " + DecideRule.ACCEPT + " but got " + decision,
208 decision == DecideRule.ACCEPT);
209 path.append(pathExpansion);
210 candidate = new CandidateURI(uuri, path.toString(), null, null);
211 decision = this.rule.decisionFor(candidate);
212 assertTrue("Expect " + DecideRule.PASS + " but got " + decision,
213 decision == DecideRule.PASS);
214 candidate = new CandidateURI(uuri, pathCopy + 'L', null, null);
215 decision = this.rule.decisionFor(candidate);
216 assertTrue("Expect " + DecideRule.PASS + " but got " + decision,
217 decision == DecideRule.PASS);
218 }
219
220 public void testPathologicalPath()
221 throws InvalidAttributeValueException, URIException {
222 addDecideRule(new PathologicalPathDecideRule("PATHOLOGICAL"));
223 final int max =
224 PathologicalPathDecideRule.DEFAULT_REPETITIONS.intValue();
225 String uri = "http://archive.org/";
226 final String segment = "abc/";
227 for (int i = 1; i < max; i++) {
228 uri = uri + segment;
229 }
230 final String baseUri = uri;
231 UURI uuri = UURIFactory.getInstance(uri);
232 CandidateURI candidate = new CandidateURI(uuri);
233 Object decision = this.rule.decisionFor(candidate);
234 assertTrue("Expect " + DecideRule.PASS + " but got " + decision,
235 decision == DecideRule.PASS);
236 uuri = UURIFactory.getInstance(baseUri + segment);
237 candidate = new CandidateURI(uuri);
238 decision = this.rule.decisionFor(candidate);
239 assertTrue("Expect " + DecideRule.PASS + " but got " + decision,
240 decision == DecideRule.PASS);
241 uuri = UURIFactory.getInstance(baseUri + segment + segment);
242 candidate = new CandidateURI(uuri);
243 decision = this.rule.decisionFor(candidate);
244 assertTrue("Expect " + DecideRule.REJECT + " but got " + decision,
245 decision == DecideRule.REJECT);
246 }
247
248 public void testTooManyPathSegments()
249 throws InvalidAttributeValueException, URIException {
250 addDecideRule(new TooManyPathSegmentsDecideRule("SEGMENTS"));
251 final int max =
252 TooManyPathSegmentsDecideRule.DEFAULT_MAX_PATH_DEPTH.intValue();
253 StringBuffer baseUri = new StringBuffer("http://archive.org");
254 for (int i = 0; i < max; i++) {
255 baseUri.append('/');
256 baseUri.append(Integer.toString(i + 1));
257 }
258 UURI uuri = UURIFactory.getInstance(baseUri.toString());
259 CandidateURI candidate = new CandidateURI(uuri);
260 Object decision = this.rule.decisionFor(candidate);
261 assertTrue("Expect " + DecideRule.PASS + " but got " + decision,
262 decision == DecideRule.PASS);
263 baseUri.append("/x");
264 uuri = UURIFactory.getInstance(baseUri.toString());
265 candidate = new CandidateURI(uuri);
266 decision = this.rule.decisionFor(candidate);
267 assertTrue("Expect " + DecideRule.REJECT + " but got " + decision,
268 decision == DecideRule.REJECT);
269 }
270
271 public void testMatchesFilePattern()
272 throws InvalidAttributeValueException, URIException {
273 addDecideRule(new MatchesFilePatternDecideRule("FILE_PATTERN"));
274 StringBuffer baseUri = new StringBuffer("http://archive.org/");
275 UURI uuri = UURIFactory.getInstance(baseUri.toString() + "ms.doc");
276 CandidateURI candidate = new CandidateURI(uuri);
277 Object decision = this.rule.decisionFor(candidate);
278 assertTrue("Expect " + DecideRule.ACCEPT + " but got " + decision,
279 decision == DecideRule.ACCEPT);
280 uuri = UURIFactory.getInstance(baseUri.toString() + "index.html");
281 candidate = new CandidateURI(uuri);
282 decision = this.rule.decisionFor(candidate);
283 assertTrue("Expect " + DecideRule.PASS + " but got " + decision,
284 decision == DecideRule.PASS);
285 }
286
287 public void testNotMatchesFilePattern()
288 throws InvalidAttributeValueException, URIException {
289 addDecideRule(new NotMatchesFilePatternDecideRule("NOT_FILE_PATTERN"));
290 StringBuffer baseUri = new StringBuffer("http://archive.org/");
291 UURI uuri = UURIFactory.getInstance(baseUri.toString() + "ms.doc");
292 CandidateURI candidate = new CandidateURI(uuri);
293 Object decision = this.rule.decisionFor(candidate);
294 assertTrue("Expect " + DecideRule.PASS + " but got " + decision,
295 decision == DecideRule.PASS);
296 uuri = UURIFactory.getInstance(baseUri.toString() + "index.html");
297 candidate = new CandidateURI(uuri);
298 decision = this.rule.decisionFor(candidate);
299 assertTrue("Expect " + DecideRule.ACCEPT + " but got " + decision,
300 decision == DecideRule.ACCEPT);
301 }
302
303 protected void testHopLimit(final int max, final char pathExpansion,
304 final String defaultDecision, final String overLimitDecision)
305 throws URIException {
306 UURI uuri = UURIFactory.getInstance("http://archive.org");
307 CandidateURI candidate = new CandidateURI(uuri);
308 Object decision = this.rule.decisionFor(candidate);
309 assertTrue("Expect " + defaultDecision + " but got " + decision,
310 decision == defaultDecision);
311 StringBuffer path = new StringBuffer(max);
312 for (int i = 0; i < (max - 1); i++) {
313 path.append(pathExpansion);
314 }
315 candidate = new CandidateURI(uuri, path.toString(), null, null);
316 decision = this.rule.decisionFor(candidate);
317 assertTrue("Expect " + defaultDecision + " but got " + decision,
318 decision == defaultDecision);
319 path.append(pathExpansion);
320 candidate = new CandidateURI(uuri, path.toString(), null, null);
321 decision = this.rule.decisionFor(candidate);
322 assertTrue("Expect " + defaultDecision + " but got " + decision,
323 decision == defaultDecision);
324 path.append(pathExpansion);
325 candidate = new CandidateURI(uuri, path.toString(), null, null);
326 decision = this.rule.decisionFor(candidate);
327 assertTrue("Expect " + overLimitDecision + " but got " + decision,
328 decision == overLimitDecision);
329 }
330
331 public void testScopePlusOne()
332 throws URIException, InvalidAttributeValueException,
333 AttributeNotFoundException, MBeanException,
334 ReflectionException {
335
336 ScopePlusOneDecideRule t = new ScopePlusOneDecideRule("host");
337 SurtPrefixSet mSet = new SurtPrefixSet();
338 mSet.add(SurtPrefixSet.prefixFromPlain("http://audio.archive.org"));
339 mSet.convertAllPrefixesToHosts();
340 t.surtPrefixes = mSet;
341 DecideRule s = addDecideRule(t);
342 s.setAttribute(new Attribute(ScopePlusOneDecideRule.ATTR_SCOPE,
343 ScopePlusOneDecideRule.HOST));
344
345
346 UURI uuri =
347 UURIFactory.getInstance("http://audio.archive.org/examples");
348 CandidateURI candidate = new CandidateURI(uuri);
349 Object decision = this.rule.decisionFor(candidate);
350 assertTrue("URI Expect " + DecideRule.ACCEPT + " for " + candidate +
351 " but got " + decision, decision == DecideRule.ACCEPT);
352 UURI uuriOne = UURIFactory.getInstance("http://movies.archive.org");
353 CandidateURI plusOne = new CandidateURI(uuriOne);
354 plusOne.setVia(uuri);
355 decision = this.rule.decisionFor(plusOne);
356 assertTrue("PlusOne Expect " + DecideRule.ACCEPT + " for " + plusOne +
357 " with via " + plusOne.flattenVia() + " but got " + decision,
358 decision == DecideRule.ACCEPT);
359 UURI uuriTwo = UURIFactory.getInstance("http://sloan.archive.org");
360 CandidateURI plusTwo = new CandidateURI(uuriTwo);
361 plusTwo.setVia(uuriOne);
362 decision = this.rule.decisionFor(plusTwo);
363 assertTrue("PlusTwo Expect " + DecideRule.PASS + " for " + plusTwo +
364 " with via " + plusTwo.flattenVia() + " but got " + decision,
365 decision == DecideRule.PASS);
366
367
368
369 ScopePlusOneDecideRule u = new ScopePlusOneDecideRule("domain");
370 SurtPrefixSet mSet1 = new SurtPrefixSet();
371 mSet1.add(SurtPrefixSet.prefixFromPlain("archive.org"));
372 mSet1.convertAllPrefixesToDomains();
373 u.surtPrefixes = mSet1;
374 DecideRule v = addDecideRule(u);
375 v.setAttribute(new Attribute(ScopePlusOneDecideRule.ATTR_SCOPE,
376 ScopePlusOneDecideRule.DOMAIN));
377
378 decision = this.rule.decisionFor(candidate);
379 assertTrue("Domain: URI Expect " + DecideRule.ACCEPT + " for " +
380 candidate + " but got " + decision, decision == DecideRule.ACCEPT);
381 decision = this.rule.decisionFor(plusOne);
382 assertTrue("Domain: PlusOne Expect " + DecideRule.ACCEPT + " for " +
383 plusOne + " with via " + plusOne.flattenVia() + " but got " +
384 decision, decision == DecideRule.ACCEPT);
385 decision = this.rule.decisionFor(plusTwo);
386 assertTrue("Domain: PlusTwo Expect " + DecideRule.ACCEPT + " for " +
387 plusTwo + " with via " + plusTwo.flattenVia() + " but got " +
388 decision, decision == DecideRule.ACCEPT);
389 UURI uuriThree = UURIFactory.getInstance("http://sloan.org");
390 CandidateURI plusThree = new CandidateURI(uuriThree);
391 plusThree.setVia(uuriTwo);
392 decision = this.rule.decisionFor(plusThree);
393 assertTrue("Domain: PlusThree Expect " + DecideRule.ACCEPT + " for " +
394 plusThree + " with via " + plusThree.flattenVia() + " but got " +
395 decision, decision == DecideRule.ACCEPT);
396 UURI uuriFour = UURIFactory.getInstance("http://example.com");
397 CandidateURI plusFour = new CandidateURI(uuriFour);
398 plusFour.setVia(uuriThree);
399 decision = this.rule.decisionFor(plusFour);
400 assertTrue("Domain: PlusFour Expect " + DecideRule.PASS + " for " +
401 plusFour + " with via " + plusFour.flattenVia() + " but got " +
402 decision, decision == DecideRule.PASS);
403 }
404
405 public void testFilter()
406 throws InvalidAttributeValueException, URIException, AttributeNotFoundException, MBeanException, ReflectionException {
407 FilterDecideRule dr = new FilterDecideRule(
408 "FilterDecideRule(ContentTypeRegExpFilter)");
409 addDecideRule(dr);
410 StringBuffer baseUri = new StringBuffer();
411 UURI uuri = UURIFactory.getInstance("http://example.com/foo");
412 CrawlURI curi = new CrawlURI(uuri);
413 curi.setContentType("text/html");
414 Object decision = this.rule.decisionFor(curi);
415
416
417 assertTrue("Expect " + DecideRule.ACCEPT + " but got " + decision,
418 decision == DecideRule.ACCEPT);
419 ContentTypeRegExpFilter filt =
420 new ContentTypeRegExpFilter("ContentTypeRegExpFilter","app.*");
421 dr.filters.addElement(null,filt);
422 decision = this.rule.decisionFor(curi);
423
424 assertTrue("Expect " + DecideRule.REJECT + " but got " + decision,
425 decision == DecideRule.REJECT);
426 curi.setContentType("application/octet-stream");
427 decision = this.rule.decisionFor(curi);
428
429 assertTrue("Expect " + DecideRule.ACCEPT + " but got " + decision,
430 decision == DecideRule.ACCEPT);
431
432 dr.setAttribute(new Attribute(FilterDecideRule.ATTR_TRUE_DECISION,"PASS"));
433 decision = this.rule.decisionFor(curi);
434 assertTrue("Expect " + DecideRule.PASS + " but got " + decision,
435 decision == DecideRule.PASS);
436 }
437
438 protected DecideRule addDecideRule(DecideRule dr)
439 throws InvalidAttributeValueException {
440 MapType rules = this.rule.getRules(null);
441 rules.addElement(null, dr);
442 return dr;
443 }
444
445 public void testContentTypeMatchesRegexpDecideRule() throws Exception{
446 ContentTypeMatchesRegExpDecideRule dr = new ContentTypeMatchesRegExpDecideRule("CTMREDRtest");
447 DecideRule v = addDecideRule(dr);
448
449 v.setAttribute(new Attribute(MatchesRegExpDecideRule.ATTR_REGEXP,"text/html"));
450 UURI uuri = UURIFactory.getInstance("http://www.archive.org");
451 CrawlURI crawlUri = new CrawlURI(uuri);
452
453
454 Object decision = this.rule.decisionFor(crawlUri);
455 assertTrue("URI Expect " + DecideRule.PASS + " for " + crawlUri +
456 " but got " + decision, decision == DecideRule.PASS);
457
458
459 crawlUri.setContentType("application/pdf");
460 decision = this.rule.decisionFor(crawlUri);
461 assertTrue("URI Expect " + DecideRule.PASS + " for " + crawlUri +
462 " but got " + decision, decision == DecideRule.PASS);
463
464
465 crawlUri.setContentType("text/html");
466 decision = this.rule.decisionFor(crawlUri);
467 assertTrue("URI Expect " + DecideRule.ACCEPT + " for " + crawlUri +
468 " but got " + decision, decision == DecideRule.ACCEPT);
469 }
470
471 public void testContentTypeNotMatchesRegexpDecideRule() throws Exception{
472 ContentTypeNotMatchesRegExpDecideRule dr = new ContentTypeNotMatchesRegExpDecideRule("CTNMREDRtest");
473 DecideRule v = addDecideRule(dr);
474
475 v.setAttribute(new Attribute(MatchesRegExpDecideRule.ATTR_REGEXP,"text/html"));
476 UURI uuri = UURIFactory.getInstance("http://www.archive.org");
477 CrawlURI crawlUri = new CrawlURI(uuri);
478
479
480 Object decision = this.rule.decisionFor(crawlUri);
481 assertTrue("URI Expect " + DecideRule.PASS + " for " + crawlUri +
482 " but got " + decision, decision == DecideRule.PASS);
483
484
485 crawlUri.setContentType("text/html");
486 decision = this.rule.decisionFor(crawlUri);
487 assertTrue("URI Expect " + DecideRule.PASS + " for " + crawlUri +
488 " but got " + decision, decision == DecideRule.PASS);
489
490
491 crawlUri.setContentType("application/pdf");
492 decision = this.rule.decisionFor(crawlUri);
493 assertTrue("URI Expect " + DecideRule.ACCEPT + " for " + crawlUri +
494 " but got " + decision, decision == DecideRule.ACCEPT);
495 }
496 }