package org.archive.crawler.datamodel;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.List;
import java.util.logging.Level;
import java.util.logging.Logger;

import org.apache.commons.httpclient.URIException;
import org.archive.crawler.settings.CrawlerSettings;

/**
 * RobotsExclusionPolicy represents the actual exclusion policy adopted
 * with respect to a specific remote server, usually constructed by
 * consulting the robots.txt file, if any, that the server provides.
 *
 * (The similarly named RobotsHonoringPolicy, on the other hand,
 * describes the strategy used by the crawler to determine to what
 * extent it respects exclusion rules.)
 *
 * The expiration of policies after a suitable amount of time has
 * elapsed since the last fetch is handled outside this class, in
 * CrawlServer itself.
 *
 * TODO: refactor RobotsHonoringPolicy to be a class-per-policy, and
 * then see if a CrawlServer with a RobotsHonoringPolicy and a Robotstxt
 * makes this mediating class unnecessary.
 *
 * @author gojomo
 */
public class RobotsExclusionPolicy implements Serializable {

    private static final long serialVersionUID = 6323907991237383113L;

    private static final Logger logger =
        Logger.getLogger(RobotsExclusionPolicy.class.getName());

    private final static int NORMAL_TYPE = 0;
    private final static int ALLOWALL_TYPE = 1;
    private final static int DENYALL_TYPE = 2;
    private transient int type = NORMAL_TYPE;

    public static final RobotsExclusionPolicy ALLOWALL =
        new RobotsExclusionPolicy(ALLOWALL_TYPE);
    public static final RobotsExclusionPolicy DENYALL =
        new RobotsExclusionPolicy(DENYALL_TYPE);

    private Robotstxt robotstxt = null;

    transient RobotsHonoringPolicy honoringPolicy = null;

    private String lastUsedUserAgent = null;
    private List<String> userAgentsToTest = null;

    /**
     * Construct an appropriate RobotsExclusionPolicy from the given
     * robots.txt content, sharing the canonical ALLOWALL instance when
     * the rules permit everything.
     *
     * @param settings crawler settings context
     * @param reader reader over the robots.txt content
     * @param honoringPolicy the crawler's robots honoring policy
     * @return robots exclusion policy
     * @throws IOException
     */
    public static RobotsExclusionPolicy policyFor(CrawlerSettings settings,
            BufferedReader reader, RobotsHonoringPolicy honoringPolicy)
            throws IOException {
        Robotstxt robots = new Robotstxt(reader);
        return robots.allowsAll()
            ? ALLOWALL
            : new RobotsExclusionPolicy(settings, robots, honoringPolicy);
    }
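
    // Illustrative usage sketch (not from this codebase): how a fetch
    // processor might obtain and consult a policy. The 'settings',
    // 'honoringPolicy', and 'curi' variables are assumed to come from the
    // surrounding crawl context; 'robotsBody' is a hypothetical String
    // holding previously fetched robots.txt content.
    //
    //   BufferedReader reader =
    //       new BufferedReader(new java.io.StringReader(robotsBody));
    //   RobotsExclusionPolicy policy =
    //       RobotsExclusionPolicy.policyFor(settings, reader, honoringPolicy);
    //   if (policy.disallows(curi, "heritrix")) {
    //       // skip fetching curi
    //   }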

    /**
     * @param settings crawler settings context
     * @param robotstxt parsed robots.txt rules for the server
     * @param honoringPolicy the crawler's robots honoring policy
     */
    public RobotsExclusionPolicy(CrawlerSettings settings,
            Robotstxt robotstxt,
            RobotsHonoringPolicy honoringPolicy) {
        this.robotstxt = robotstxt;
        this.honoringPolicy = honoringPolicy;

        if (honoringPolicy == null) return;

        if (honoringPolicy.isType(settings, RobotsHonoringPolicy.MOST_FAVORED)) {
            // Most-favored: test every user-agent named in the robots.txt.
            userAgentsToTest = robotstxt.getUserAgents();
        } else if (honoringPolicy.isType(settings, RobotsHonoringPolicy.MOST_FAVORED_SET)) {
            // Most-favored-set: test only those robots.txt user-agents that
            // occur as a substring of one of the configured user-agents.
            userAgentsToTest = new ArrayList<String>();
            for (Object agent : honoringPolicy.getUserAgents(settings)) {
                String userAgent = (String) agent;
                for (String ua : robotstxt.getUserAgents()) {
                    if (userAgent.indexOf(ua) > -1) {
                        userAgentsToTest.add(ua);
                        break;
                    }
                }
            }
        }
    }
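
    // Illustrative note (hypothetical example): given a robots.txt with
    // groups "User-agent: *" and "User-agent: googlebot", the MOST_FAVORED
    // policy tests both groups and follows whichever allows a given path,
    // while MOST_FAVORED_SET with a configured agent set of
    // ["googlebot/2.1"] tests only the "googlebot" group, since that
    // robots.txt name occurs as a substring of the configured agent.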

    public RobotsExclusionPolicy(int type) {
        this(null, null, null);
        this.type = type;
    }

    public boolean disallows(CrawlURI curi, String userAgent) {
        if (this == ALLOWALL)
            return false;
        if (this == DENYALL)
            return true;

        // In the CLASSIC and CUSTOM cases, the crawler's own user-agent
        // determines which robots.txt group applies; recalculate the
        // user-agents to test whenever the crawler's user-agent changes.
        if ((honoringPolicy.isType(curi, RobotsHonoringPolicy.CLASSIC)
                || honoringPolicy.isType(curi, RobotsHonoringPolicy.CUSTOM))
                && (lastUsedUserAgent == null
                || !lastUsedUserAgent.equals(userAgent))) {

            lastUsedUserAgent = userAgent;
            userAgentsToTest = new ArrayList<String>();
            String lowerCaseUserAgent = userAgent.toLowerCase();
            for (String ua : robotstxt.getUserAgents()) {
                if (lowerCaseUserAgent.indexOf(ua) > -1) {
                    userAgentsToTest.add(ua);
                    break;
                }
            }
        }

        boolean disallow = false;
        String ua = null;

        // Test the path against the directives for each candidate
        // user-agent; the first group that allows the path wins.
        for (String candidate : userAgentsToTest) {
            ua = candidate;
            String path = null;
            try {
                path = curi.getUURI().getPathQuery();
            } catch (URIException e) {
                logger.log(Level.SEVERE, "Failed getPathQuery from " + curi, e);
                disallow = false;
                break;
            }
            if (robotstxt.getDirectivesFor(ua).allows(path)) {
                // A tested group allows this path; no need to look further.
                disallow = false;
                break;
            } else {
                // Disallowed so far, but a later candidate group may
                // still allow the path.
                disallow = true;
            }
        }

        // When masquerading, report the user-agent whose rules were
        // actually followed.
        if (honoringPolicy.shouldMasquerade(curi) && ua != null && !ua.equals("")) {
            curi.setUserAgent(ua);
        }
        return disallow;
    }
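
    // Worked example (hypothetical robots.txt): with the groups
    //   User-agent: *        -> Disallow: /private
    //   User-agent: goodbot  -> Disallow:
    // and both groups in userAgentsToTest, disallows(curi, ...) for a
    // "/private/x" path returns false, because the "goodbot" group allows
    // the path even though the "*" group does not.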

    /**
     * If the object is DENYALL or ALLOWALL, only the object identity and
     * type are written to the serialization stream.
     *
     * @param stream the serialization stream.
     * @throws IOException
     */
    private void writeObject(ObjectOutputStream stream) throws IOException {
        stream.writeInt(type);
        if (type == NORMAL_TYPE) {
            stream.defaultWriteObject();
        }
    }

    /**
     * If the object is DENYALL or ALLOWALL, only the object identity and
     * type are read from the serialization stream.
     *
     * @param stream the serialization stream.
     * @throws IOException
     * @throws ClassNotFoundException
     */
    private void readObject(ObjectInputStream stream)
            throws IOException, ClassNotFoundException {
        type = stream.readInt();
        if (type == NORMAL_TYPE) {
            stream.defaultReadObject();
        }
    }

    /**
     * If the object is DENYALL or ALLOWALL, it is replaced by the
     * corresponding canonical constant on deserialization, so that checks
     * for object identity keep working.
     *
     * @return the canonical instance for this policy's type.
     */
    private Object readResolve() {
        if (type == NORMAL_TYPE) {
            return this;
        } else if (type == ALLOWALL_TYPE) {
            return ALLOWALL;
        } else if (type == DENYALL_TYPE) {
            return DENYALL;
        }
        return null;
    }
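
    // Illustrative note: because readResolve() returns the shared constants,
    // identity comparisons such as those at the top of disallows()
    // (this == ALLOWALL, this == DENYALL) remain valid even for instances
    // that have been serialized and deserialized.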

    /**
     * Get the crawl-delay that applies to the given user-agent, or
     * -1 (indicating no crawl-delay is known) if there is no internal
     * Robotstxt instance.
     *
     * @param userAgent the user-agent to look up
     * @return float Crawl-Delay value, or -1 if none is available
     */
    public float getCrawlDelay(String userAgent) {
        if (robotstxt == null) {
            return -1;
        }
        return robotstxt.getDirectivesFor(userAgent).getCrawlDelay();
    }
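
    // Illustrative usage sketch (hypothetical caller): a politeness policy
    // might consult the robots.txt crawl-delay before scheduling, e.g.
    //
    //   float delay = policy.getCrawlDelay("heritrix");
    //   if (delay >= 0) {
    //       // honor the server-requested delay (in seconds) between fetches
    //   }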
}