View Javadoc

/* Robotstxt.java
2    *
3    * $Id: Robotstxt.java 6824 2010-04-13 22:43:44Z gojomo $
4    *
5    * Created Sep 1, 2005
6    *
7    * Copyright (C) 2005 Internet Archive.
8    *
9    * This file is part of the Heritrix web crawler (crawler.archive.org).
10   *
11   * Heritrix is free software; you can redistribute it and/or modify
12   * it under the terms of the GNU Lesser Public License as published by
13   * the Free Software Foundation; either version 2.1 of the License, or
14   * any later version.
15   *
16   * Heritrix is distributed in the hope that it will be useful,
17   * but WITHOUT ANY WARRANTY; without even the implied warranty of
18   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19   * GNU Lesser Public License for more details.
20   *
21   * You should have received a copy of the GNU Lesser Public License
22   * along with Heritrix; if not, write to the Free Software
23   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
24   */
25  package org.archive.crawler.datamodel;
26  
27  import java.io.BufferedReader;
28  import java.io.IOException;
29  import java.io.Serializable;
30  import java.util.HashMap;
31  import java.util.LinkedList;
32  import java.util.List;
33  import java.util.Map;
34  
/**
 * Utility class for parsing and representing 'robots.txt' format
 * directives as a list of named user-agents and a map from each
 * user-agent to its RobotsDirectives.
 */
40  public class Robotstxt implements Serializable {
41      static final long serialVersionUID = 7025386509301303890L;
42      
43      // all user agents contained in this robots.txt
44      // may be thinned of irrelevant entries
45      LinkedList<String> userAgents = new LinkedList<String>();
46      // map user-agents to directives
47      Map<String,RobotsDirectives> agentsToDirectives = 
48          new HashMap<String,RobotsDirectives>();
49      // 
50      boolean hasErrors = false;
51      
52      static RobotsDirectives NO_DIRECTIVES = new RobotsDirectives();
53      
54      public Robotstxt(BufferedReader reader) throws IOException {
55          String read;
56          // current is the disallowed paths for the preceding User-Agent(s)
57          RobotsDirectives current = null;
58          // whether a non-'User-Agent' directive has been encountered
59          boolean hasDirectivesYet = false; 
60          String catchall = null;
61          while (reader != null) {
62              do {
63                  read = reader.readLine();
64                  // Skip comments & blanks
65              } while ((read != null) && ((read = read.trim()).startsWith("#") ||
66                  read.length() == 0));
67              if (read == null) {
68                  reader.close();
69                  reader = null;
70              } else {
71                  // remove any html markup
72                  read = read.replaceAll("<[^>]+>","");
73                  int commentIndex = read.indexOf("#");
74                  if (commentIndex > -1) {
75                      // Strip trailing comment
76                      read = read.substring(0, commentIndex);
77                  }
78                  read = read.trim();
79                  if (read.matches("(?i)^User-agent:.*")) {
80                      String ua = read.substring(11).trim().toLowerCase();
81                      if (current == null || hasDirectivesYet ) {
82                          // only create new rules-list if necessary
83                          // otherwise share with previous user-agent
84                          current = new RobotsDirectives();
85                          hasDirectivesYet = false; 
86                      }
87                      if (ua.equals("*")) {
88                          ua = "";
89                          catchall = ua;
90                      } else {
91                          userAgents.addLast(ua);
92                      }
93                      agentsToDirectives.put(ua, current);
94                      continue;
95                  }
96                  if (read.matches("(?i)Disallow:.*")) {
97                      if (current == null) {
98                          // buggy robots.txt
99                          hasErrors = true;
100                         continue;
101                     }
102                     String path = read.substring(9).trim();
103                     // tolerate common error of ending path with '*' character
104                     // (not allowed by original spec; redundant but harmless with 
105                     // Google's wildcarding extensions -- which we don't yet fully
106                     // support). 
107                     if(path.endsWith("*")) {
108                         path = path.substring(0,path.length()-1); 
109                     }
110                     current.addDisallow(path);
111                     hasDirectivesYet = true; 
112                     continue;
113                 }
114                 if (read.matches("(?i)Crawl-delay:.*")) {
115                     if (current == null) {
116                         // buggy robots.txt
117                         hasErrors = true;
118                         continue;
119                     }
120                     // consider a crawl-delay, even though we don't 
121                     // yet understand it, as sufficient to end a 
122                     // grouping of User-Agent lines
123                     hasDirectivesYet = true;
124                     String val = read.substring(12).trim();
125                     val = val.split("[^//d//.]+")[0];
126                     try {
127                         current.setCrawlDelay(Float.parseFloat(val));
128                     } catch (NumberFormatException nfe) {
129                         // ignore
130                     }
131                     continue;
132                 }
133                 if (read.matches("(?i)Allow:.*")) {
134                     if (current == null) {
135                         // buggy robots.txt
136                         hasErrors = true;
137                         continue;
138                     }
139                     String path = read.substring(6).trim();
140                     // tolerate common error of ending path with '*' character
141                     // (not allowed by original spec; redundant but harmless with 
142                     // Google's wildcarding extensions -- which we don't yet fully
143                     // support). 
144                     if(path.endsWith("*")) {
145                         path = path.substring(0,path.length()-1); 
146                     }
147                     current.addAllow(path);
148                     hasDirectivesYet = true;
149                     continue;
150                 }
151                 // unknown line; do nothing for now
152             }
153         }
154 
155         if (catchall != null) {
156             userAgents.addLast(catchall);
157         }
158     }
159 
160 
161     /***
162      * Does this policy effectively allow everything? (No 
163      * disallows or timing (crawl-delay) directives?)
164      * @return
165      */
166     public boolean allowsAll() {
167         // TODO: refine so directives that are all empty are also 
168         // recognized as allowing all
169         return agentsToDirectives.isEmpty();
170     }
171     
172     public List<String> getUserAgents() {
173         return userAgents;
174     }
175 
176     public RobotsDirectives getDirectivesFor(String ua) {
177         // find matching ua
178         for(String uaListed : userAgents) {
179             if(ua.indexOf(uaListed)>-1) {
180                 return agentsToDirectives.get(uaListed);
181             }
182         }
183         // no applicable user-agents, so empty directives
184         return NO_DIRECTIVES; 
185     }
186 }