1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25 package org.archive.crawler.datamodel;
26
import java.io.BufferedReader;
import java.io.IOException;
import java.io.Serializable;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Locale;
import java.util.Map;
34
35 /***
36 * Utility class for parsing and representing 'robots.txt' format
37 * directives, into a list of named user-agents and map from user-agents
38 * to RobotsDirectives.
39 */
40 public class Robotstxt implements Serializable {
41 static final long serialVersionUID = 7025386509301303890L;
42
43
44
45 LinkedList<String> userAgents = new LinkedList<String>();
46
47 Map<String,RobotsDirectives> agentsToDirectives =
48 new HashMap<String,RobotsDirectives>();
49
50 boolean hasErrors = false;
51
52 static RobotsDirectives NO_DIRECTIVES = new RobotsDirectives();
53
54 public Robotstxt(BufferedReader reader) throws IOException {
55 String read;
56
57 RobotsDirectives current = null;
58
59 boolean hasDirectivesYet = false;
60 String catchall = null;
61 while (reader != null) {
62 do {
63 read = reader.readLine();
64
65 } while ((read != null) && ((read = read.trim()).startsWith("#") ||
66 read.length() == 0));
67 if (read == null) {
68 reader.close();
69 reader = null;
70 } else {
71
72 read = read.replaceAll("<[^>]+>","");
73 int commentIndex = read.indexOf("#");
74 if (commentIndex > -1) {
75
76 read = read.substring(0, commentIndex);
77 }
78 read = read.trim();
79 if (read.matches("(?i)^User-agent:.*")) {
80 String ua = read.substring(11).trim().toLowerCase();
81 if (current == null || hasDirectivesYet ) {
82
83
84 current = new RobotsDirectives();
85 hasDirectivesYet = false;
86 }
87 if (ua.equals("*")) {
88 ua = "";
89 catchall = ua;
90 } else {
91 userAgents.addLast(ua);
92 }
93 agentsToDirectives.put(ua, current);
94 continue;
95 }
96 if (read.matches("(?i)Disallow:.*")) {
97 if (current == null) {
98
99 hasErrors = true;
100 continue;
101 }
102 String path = read.substring(9).trim();
103
104
105
106
107 if(path.endsWith("*")) {
108 path = path.substring(0,path.length()-1);
109 }
110 current.addDisallow(path);
111 hasDirectivesYet = true;
112 continue;
113 }
114 if (read.matches("(?i)Crawl-delay:.*")) {
115 if (current == null) {
116
117 hasErrors = true;
118 continue;
119 }
120
121
122
123 hasDirectivesYet = true;
124 String val = read.substring(12).trim();
125 val = val.split("[^//d//.]+")[0];
126 try {
127 current.setCrawlDelay(Float.parseFloat(val));
128 } catch (NumberFormatException nfe) {
129
130 }
131 continue;
132 }
133 if (read.matches("(?i)Allow:.*")) {
134 if (current == null) {
135
136 hasErrors = true;
137 continue;
138 }
139 String path = read.substring(6).trim();
140
141
142
143
144 if(path.endsWith("*")) {
145 path = path.substring(0,path.length()-1);
146 }
147 current.addAllow(path);
148 hasDirectivesYet = true;
149 continue;
150 }
151
152 }
153 }
154
155 if (catchall != null) {
156 userAgents.addLast(catchall);
157 }
158 }
159
160
161 /***
162 * Does this policy effectively allow everything? (No
163 * disallows or timing (crawl-delay) directives?)
164 * @return
165 */
166 public boolean allowsAll() {
167
168
169 return agentsToDirectives.isEmpty();
170 }
171
172 public List<String> getUserAgents() {
173 return userAgents;
174 }
175
176 public RobotsDirectives getDirectivesFor(String ua) {
177
178 for(String uaListed : userAgents) {
179 if(ua.indexOf(uaListed)>-1) {
180 return agentsToDirectives.get(uaListed);
181 }
182 }
183
184 return NO_DIRECTIVES;
185 }
186 }