/* Copyright (C) 2003 Internet Archive.
 *
 * This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 * Heritrix is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or
 * any later version.
 *
 * Heritrix is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser Public License for more details.
 *
 * You should have received a copy of the GNU Lesser Public License
 * along with Heritrix; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 *
 * RobotsExclusionPolicy.java
 * Created on Apr 17, 2003
 *
 * $Header$
 */
package org.archive.crawler.datamodel;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.logging.Level;
import java.util.logging.Logger;

import org.apache.commons.httpclient.URIException;
import org.archive.crawler.settings.CrawlerSettings;

/**
 * RobotsExclusionPolicy represents the actual policy adopted with
 * respect to a specific remote server, usually constructed by
 * consulting the robots.txt, if any, that the server provided.
 *
 * (The similarly named RobotsHonoringPolicy, on the other hand,
 * describes the strategy used by the crawler to determine to what
 * extent it respects exclusion rules.)
 *
 * The expiration of policies after a suitable amount of time has
 * elapsed since the last fetch is handled outside this class, in
 * CrawlServer itself.
 *
 * TODO: refactor RobotsHonoringPolicy to be a class-per-policy, and
 * then see if a CrawlServer with a RobotsHonoringPolicy and a Robotstxt
 * makes this mediating class unnecessary.
 *
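 * <p>Minimal usage sketch (hypothetical variable names; assumes a
 * BufferedReader over fetched robots.txt content and already-configured
 * CrawlerSettings and RobotsHonoringPolicy instances):
 * <pre>
 * RobotsExclusionPolicy policy =
 *     RobotsExclusionPolicy.policyFor(settings, robotsReader, honoringPolicy);
 * if (policy.disallows(curi, userAgent)) {
 *     // skip fetching this CrawlURI
 * }
 * </pre>
 *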
 * @author gojomo
 *
 */
public class RobotsExclusionPolicy implements Serializable {

    private static final long serialVersionUID = 6323907991237383113L;

    private static final Logger logger =
        Logger.getLogger(RobotsExclusionPolicy.class.getName());

    private final static int NORMAL_TYPE = 0;
    private final static int ALLOWALL_TYPE = 1;
    private final static int DENYALL_TYPE = 2;
    private transient int type = NORMAL_TYPE;

    public static RobotsExclusionPolicy ALLOWALL =
        new RobotsExclusionPolicy(ALLOWALL_TYPE);
    public static RobotsExclusionPolicy DENYALL =
        new RobotsExclusionPolicy(DENYALL_TYPE);

    private Robotstxt robotstxt = null;
    // FIXME?: this 'transient' seems wrong -- likely to cause
    // all non-normal policies to break when CrawlServer instances
    // go through a serialization/deserialization cycle
    transient RobotsHonoringPolicy honoringPolicy = null;

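    // User-agent most recently tested against this policy, and the
    // robots.txt user-agent sections considered applicable to it
    // (precomputed for most-favored policies, rebuilt per agent otherwise).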
    private String lastUsedUserAgent = null;
    private List<String> userAgentsToTest = null;

    /**
     * Returns a policy built from the given robots.txt content, honoring
     * rules as configured by the given RobotsHonoringPolicy.
     *
     * @param settings crawl settings in effect.
     * @param reader reader over the fetched robots.txt content.
     * @param honoringPolicy the crawler's robots honoring policy.
     * @return robots exclusion policy.
     * @throws IOException
     */
    public static RobotsExclusionPolicy policyFor(CrawlerSettings settings,
            BufferedReader reader, RobotsHonoringPolicy honoringPolicy)
    throws IOException {
        Robotstxt robots = new Robotstxt(reader);
        return robots.allowsAll()
            ? ALLOWALL
            : new RobotsExclusionPolicy(settings, robots, honoringPolicy);
    }

    /**
     * @param settings crawl settings in effect.
     * @param robotstxt parsed robots.txt rules for the server.
     * @param honoringPolicy the crawler's robots honoring policy.
     */
    public RobotsExclusionPolicy(CrawlerSettings settings,
            Robotstxt robotstxt,
            RobotsHonoringPolicy honoringPolicy) {
        this.robotstxt = robotstxt;
        this.honoringPolicy = honoringPolicy;

        if (honoringPolicy == null) return;

        // If honoring policy is most-favored user-agent, all rules should be checked
        if (honoringPolicy.isType(settings, RobotsHonoringPolicy.MOST_FAVORED)) {
            userAgentsToTest = robotstxt.getUserAgents();

        // If honoring policy is most-favored-of-set, then make a list with only the set as members
        } else if (honoringPolicy.isType(settings, RobotsHonoringPolicy.MOST_FAVORED_SET)) {
            userAgentsToTest = new ArrayList<String>();
            Iterator userAgentSet = honoringPolicy.getUserAgents(settings).iterator();
            while (userAgentSet.hasNext()) {
                String userAgent = (String) userAgentSet.next();

                Iterator iter = robotstxt.getUserAgents().iterator();
                while (iter.hasNext()) {
                    String ua = (String) iter.next();
                    if (userAgent.indexOf(ua) > -1) {
                        userAgentsToTest.add(ua);
                        break;
                    }
                }
            }
        }
    }

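    /**
     * Creates a policy of the given special type; used to build the shared
     * ALLOWALL and DENYALL constants.
     *
     * @param type one of NORMAL_TYPE, ALLOWALL_TYPE or DENYALL_TYPE.
     */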
    public RobotsExclusionPolicy(int type) {
        this(null, null, null);
        this.type = type;
    }

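    /**
     * Checks whether fetching the given URI with the given user-agent is
     * disallowed by this policy, as interpreted under the associated
     * RobotsHonoringPolicy. When masquerading is enabled, this may also
     * set the matched user-agent on the CrawlURI.
     *
     * @param curi URI to test.
     * @param userAgent user-agent the crawler intends to send.
     * @return true if the URI is disallowed for this user-agent.
     */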
    public boolean disallows(CrawlURI curi, String userAgent) {
        if (this == ALLOWALL)
            return false;
        if (this == DENYALL)
            return true;

        // In the common case with policy=Classic, the user-agent is
        // remembered from URI to URI on the same server
        if ((honoringPolicy.isType(curi, RobotsHonoringPolicy.CLASSIC)
                || honoringPolicy.isType(curi, RobotsHonoringPolicy.CUSTOM))
            && (lastUsedUserAgent == null
            || !lastUsedUserAgent.equals(userAgent))) {

            lastUsedUserAgent = userAgent;
            userAgentsToTest = new ArrayList<String>();
            Iterator iter = robotstxt.getUserAgents().iterator();
            String lowerCaseUserAgent = userAgent.toLowerCase();
            while (iter.hasNext()) {
                String ua = (String) iter.next();
                if (lowerCaseUserAgent.indexOf(ua) > -1) {
                    userAgentsToTest.add(ua);
                    break; // consider no more sections
                }
            }
        }

        boolean disallow = false;
        String ua = null;

        // Go through the list of all user-agents we might act as
        Iterator uas = userAgentsToTest.iterator();
        while (uas.hasNext()) {
            ua = (String) uas.next();
            String path = null;
            try {
                path = curi.getUURI().getPathQuery();
            } catch (URIException e) {
                logger.log(Level.SEVERE, "Failed getPathQuery from " + curi, e);
                disallow = false;
                break;
            }
            if (robotstxt.getDirectivesFor(ua).allows(path)) {
                // at least one applicable set of rules allows
                disallow = false;
                break;
            } else {
                // at least one applicable set of rules disallows
                // so disallow unless later test allows
                disallow = true;
            }
        }

        // Are we supposed to masquerade as the user-agent whose
        // restrictions we followed?
        if (honoringPolicy.shouldMasquerade(curi) && ua != null && !ua.equals("")) {
            curi.setUserAgent(ua);
        }
        return disallow;
    }

    // Methods for object serialization.

    /** If object is DENYALL or ALLOWALL, only the object identity and type
     * are written to the serialization stream.
     *
     * @param stream the serialization stream.
     * @throws IOException
     */
    private void writeObject(ObjectOutputStream stream) throws IOException {
        stream.writeInt(type);
        if (type == NORMAL_TYPE) {
            stream.defaultWriteObject();
        }
    }

    /** If object is DENYALL or ALLOWALL, only the object identity and type
     * are read from the serialization stream.
     *
     * @param stream the serialization stream.
     * @throws IOException
     * @throws ClassNotFoundException
     */
    private void readObject(ObjectInputStream stream)
            throws IOException, ClassNotFoundException {
        type = stream.readInt();
        if (type == NORMAL_TYPE) {
            stream.defaultReadObject();
        }
    }

    /** If object is DENYALL or ALLOWALL, the object is replaced by the shared
     * constants so that checks for object identity (==) work.
     * @return Object.
     */
    private Object readResolve() {
        if (type == NORMAL_TYPE) {
            return this;
        } else if (type == ALLOWALL_TYPE) {
            return ALLOWALL;
        } else if (type == DENYALL_TYPE) {
            return DENYALL;
        }
        return null;
    }

    /**
     * Get the crawl-delay that applies to the given user-agent, or
     * -1 (indicating no crawl-delay is known) if there is no internal
     * Robotstxt instance.
     *
     * @param userAgent
     * @return float crawl-delay value, or -1 if none available.
     */
    public float getCrawlDelay(String userAgent) {
        if (robotstxt == null) {
            return -1;
        }
        return robotstxt.getDirectivesFor(userAgent).getCrawlDelay();
    }

}