1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25 package org.archive.crawler.datamodel;
26
27 import java.io.BufferedReader;
28 import java.io.IOException;
29 import java.io.StringReader;
30
31 import junit.framework.TestCase;
32
33 public class RobotstxtTest extends TestCase {
34 public void testParseRobots() throws IOException {
35 BufferedReader reader = new BufferedReader(new StringReader("BLAH"));
36 Robotstxt r = new Robotstxt(reader);
37 assertFalse(r.hasErrors);
38 assertTrue(r.getUserAgents().size() == 0);
39
40 String agent = "archive.org_bot";
41 reader = new BufferedReader(
42 new StringReader("User-agent: " + agent + "\n" +
43 "Disallow: /cgi-bin/\n" +
44 "Disallow: /details/software\n"));
45 r = new Robotstxt(reader);
46 assertFalse(r.hasErrors);
47 assertTrue(r.getUserAgents().size() == 1);
48 assertTrue(r.agentsToDirectives.size() == 1);
49 assertEquals(r.getUserAgents().get(0), agent);
50
51 agent = "*";
52 reader = new BufferedReader(
53 new StringReader("User-agent: " + agent + "\n" +
54 "Disallow: /cgi-bin/\n" +
55 "Disallow: /details/software\n"));
56 r = new Robotstxt(reader);
57 assertFalse(r.hasErrors);
58 assertTrue(r.getUserAgents().size() == 1);
59 assertTrue(r.agentsToDirectives.size() == 1);
60 assertEquals(r.getUserAgents().get(0), "");
61 }
62
63 Robotstxt sampleRobots1() throws IOException {
64 BufferedReader reader = new BufferedReader(
65 new StringReader(
66 "User-agent: *\n" +
67 "Disallow: /cgi-bin/\n" +
68 "Disallow: /details/software\n" +
69 "\n"+
70 "User-agent: denybot\n" +
71 "Disallow: /\n" +
72 "\n"+
73 "User-agent: allowbot1\n" +
74 "Disallow: \n" +
75 "\n"+
76 "User-agent: allowbot2\n" +
77 "Disallow: /foo\n" +
78 "Allow: /\n"+
79 "\n"+
80 "User-agent: delaybot\n" +
81 "Disallow: /\n" +
82 "Crawl-Delay: 20\n"+
83 "Allow: /images/\n"
84 ));
85 return new Robotstxt(reader);
86 }
87
88 public void testDirectives() throws IOException {
89 Robotstxt r = sampleRobots1();
90
91 assertTrue(r.getDirectivesFor("Mozilla allowbot1 99.9").allows("/path"));
92 assertTrue(r.getDirectivesFor("Mozilla allowbot1 99.9").allows("/"));
93
94 assertTrue(r.getDirectivesFor("Mozilla allowbot2 99.9").allows("/path"));
95 assertTrue(r.getDirectivesFor("Mozilla allowbot2 99.9").allows("/"));
96
97 assertFalse(r.getDirectivesFor("Mozilla allowbot2 99.9").allows("/foo"));
98
99 assertFalse(r.getDirectivesFor("Mozilla denybot 99.9").allows("/path"));
100 assertFalse(r.getDirectivesFor("Mozilla denybot 99.9").allows("/"));
101
102 assertTrue(r.getDirectivesFor("Mozilla anonbot 99.9").allows("/path"));
103 assertFalse(r.getDirectivesFor("Mozilla anonbot 99.9").allows("/cgi-bin/foo.pl"));
104
105 assertEquals(-1f,r.getDirectivesFor("Mozilla denybot 99.9").getCrawlDelay());
106
107 assertEquals(20f,r.getDirectivesFor("Mozilla delaybot 99.9").getCrawlDelay());
108 }
109
110 Robotstxt htmlMarkupRobots() throws IOException {
111 BufferedReader reader = new BufferedReader(
112 new StringReader(
113 "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 3.2 Final//EN\"><HTML>\n"
114 +"<HEAD>\n"
115 +"<TITLE>/robots.txt</TITLE>\n"
116 +"<HEAD>\n"
117 +"<BODY>\n"
118 +"User-agent: *<BR>\n"
119 +"Disallow: /<BR>\n"
120 +"Crawl-Delay: 30<BR>\n"
121 +"\n"
122 +"</BODY>\n"
123 +"</HTML>\n"
124 ));
125 return new Robotstxt(reader);
126 }
127
128 /***
129 * Test handling of a robots.txt with extraneous HTML markup
130 * @throws IOException
131 */
132 public void testHtmlMarkupRobots() throws IOException {
133 Robotstxt r = htmlMarkupRobots();
134 assertFalse(r.getDirectivesFor("anybot").allows("/index.html"));
135 assertEquals(30f,r.getDirectivesFor("anybot").getCrawlDelay());
136 }
137 }