/* RobotstxtTest
 *
 * $Id: RobotstxtTest.java 7135 2011-04-16 19:18:44Z nlevitt $
 *
 * Created Sep 1, 2005
 *
 * Copyright (C) 2005 Internet Archive.
 *
 * This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 * Heritrix is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or
 * any later version.
 *
 * Heritrix is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser Public License for more details.
 *
 * You should have received a copy of the GNU Lesser Public License
 * along with Heritrix; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */
package org.archive.crawler.datamodel;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.StringReader;

import junit.framework.TestCase;

public class RobotstxtTest extends TestCase {
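    /**
     * Parse a non-robots.txt body, then two minimal robots.txt bodies
     * (a named agent and the star agent), checking the recorded
     * user-agent lists in each case.
     */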
    public void testParseRobots() throws IOException {
        // A non-robots.txt body parses without errors and yields no agents.
        BufferedReader reader = new BufferedReader(new StringReader("BLAH"));
        Robotstxt r = new Robotstxt(reader);
        assertFalse(r.hasErrors);
        assertEquals(0, r.getUserAgents().size());
        // Parse archive robots.txt with heritrix agent.
        String agent = "archive.org_bot";
        reader = new BufferedReader(
            new StringReader("User-agent: " + agent + "\n" +
            "Disallow: /cgi-bin/\n" +
            "Disallow: /details/software\n"));
        r = new Robotstxt(reader);
        assertFalse(r.hasErrors);
        assertEquals(1, r.getUserAgents().size());
        assertEquals(1, r.agentsToDirectives.size());
        assertEquals(agent, r.getUserAgents().get(0));
        // Parse archive robots.txt with star agent.
        agent = "*";
        reader = new BufferedReader(
            new StringReader("User-agent: " + agent + "\n" +
            "Disallow: /cgi-bin/\n" +
            "Disallow: /details/software\n"));
        r = new Robotstxt(reader);
        assertFalse(r.hasErrors);
        assertEquals(1, r.getUserAgents().size());
        assertEquals(1, r.agentsToDirectives.size());
        // The star agent is recorded as the empty string.
        assertEquals("", r.getUserAgents().get(0));
    }

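    /**
     * Build a sample robots.txt exercising blanket deny, empty disallow,
     * explicit allow, and crawl-delay directives across several agents.
     */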
    Robotstxt sampleRobots1() throws IOException {
        BufferedReader reader = new BufferedReader(
            new StringReader(
                "User-agent: *\n" +
                "Disallow: /cgi-bin/\n" +
                "Disallow: /details/software\n" +
                "\n"+
                "User-agent: denybot\n" +
                "Disallow: /\n" +
                "\n"+
                "User-agent: allowbot1\n" +
                "Disallow: \n" +
                "\n"+
                "User-agent: allowbot2\n" +
                "Disallow: /foo\n" +
                "Allow: /\n"+
                "\n"+
                "User-agent: delaybot\n" +
                "Disallow: /\n" +
                "Crawl-Delay: 20\n"+
                "Allow: /images/\n"
            ));
        return new Robotstxt(reader);
    }

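    /**
     * Check that getDirectivesFor() matches each agent in the sample
     * robots.txt and that allows() and getCrawlDelay() honor its rules.
     */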
    public void testDirectives() throws IOException {
        Robotstxt r = sampleRobots1();
        // bot allowed with empty disallows
        assertTrue(r.getDirectivesFor("Mozilla allowbot1 99.9").allows("/path"));
        assertTrue(r.getDirectivesFor("Mozilla allowbot1 99.9").allows("/"));
        // bot allowed with explicit allow
        assertTrue(r.getDirectivesFor("Mozilla allowbot2 99.9").allows("/path"));
        assertTrue(r.getDirectivesFor("Mozilla allowbot2 99.9").allows("/"));
        // bot denied with specific disallow overriding general allow
        assertFalse(r.getDirectivesFor("Mozilla allowbot2 99.9").allows("/foo"));
        // bot denied with blanket deny
        assertFalse(r.getDirectivesFor("Mozilla denybot 99.9").allows("/path"));
        assertFalse(r.getDirectivesFor("Mozilla denybot 99.9").allows("/"));
        // unnamed bot falls back to the catchall (*) allow/deny rules
        assertTrue(r.getDirectivesFor("Mozilla anonbot 99.9").allows("/path"));
        assertFalse(r.getDirectivesFor("Mozilla anonbot 99.9").allows("/cgi-bin/foo.pl"));
        // a missing crawl-delay is reported as -1
        assertEquals(-1f, r.getDirectivesFor("Mozilla denybot 99.9").getCrawlDelay(), 0f);
        // with crawl-delay
        assertEquals(20f, r.getDirectivesFor("Mozilla delaybot 99.9").getCrawlDelay(), 0f);
    }

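    /**
     * Build a robots.txt wrapped in (deliberately sloppy) HTML markup, as
     * served by some misconfigured sites; the unclosed HEAD tag is part of
     * the test data.
     */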
    Robotstxt htmlMarkupRobots() throws IOException {
        BufferedReader reader = new BufferedReader(
            new StringReader(
                "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 3.2 Final//EN\"><HTML>\n"
                +"<HEAD>\n"
                +"<TITLE>/robots.txt</TITLE>\n"
                +"<HEAD>\n"
                +"<BODY>\n"
                +"User-agent: *<BR>\n"
                +"Disallow: /<BR>\n"
                +"Crawl-Delay: 30<BR>\n"
                +"\n"
                +"</BODY>\n"
                +"</HTML>\n"
            ));
        return new Robotstxt(reader);
    }

    /**
     * Test handling of a robots.txt with extraneous HTML markup.
     * @throws IOException on reader failure
     */
    public void testHtmlMarkupRobots() throws IOException {
        Robotstxt r = htmlMarkupRobots();
        assertFalse(r.getDirectivesFor("anybot").allows("/index.html"));
        assertEquals(30f, r.getDirectivesFor("anybot").getCrawlDelay(), 0f);
    }
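
    /**
     * Not part of the original suite: a minimal usage sketch showing how a
     * caller might consult a parsed Robotstxt before fetching a path. It
     * relies only on the API exercised above (getDirectivesFor(), allows(),
     * getCrawlDelay()); the agent and path strings are illustrative.
     */
    public void testUsageSketch() throws IOException {
        Robotstxt r = sampleRobots1();
        String userAgent = "Mozilla allowbot2 99.9";
        String path = "/some/page.html";
        // allowbot2 carries "Allow: /" with only "/foo" disallowed, so any
        // path outside /foo should be fetchable (mirrors testDirectives).
        assertTrue(r.getDirectivesFor(userAgent).allows(path));
        // No Crawl-Delay is declared for allowbot2, so the parser reports
        // -1 and a polite crawler would fall back to its own default pause.
        float delay = r.getDirectivesFor(userAgent).getCrawlDelay();
        assertEquals(-1f, delay, 0f);
    }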
}