View Javadoc

1   /* Copyright (C) 2003 Internet Archive.
2    *
3    * This file is part of the Heritrix web crawler (crawler.archive.org).
4    *
5    * Heritrix is free software; you can redistribute it and/or modify
6    * it under the terms of the GNU Lesser Public License as published by
7    * the Free Software Foundation; either version 2.1 of the License, or
8    * any later version.
9    *
10   * Heritrix is distributed in the hope that it will be useful,
11   * but WITHOUT ANY WARRANTY; without even the implied warranty of
12   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13   * GNU Lesser Public License for more details.
14   *
15   * You should have received a copy of the GNU Lesser Public License
16   * along with Heritrix; if not, write to the Free Software
17   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
18   *
19   * RobotsHonoringPolicy.java
20   * Created on Oct 30, 2003
21   *
22   * $Header$
23   */
24  package org.archive.crawler.datamodel;
25  
import java.util.logging.Level;
import java.util.logging.Logger;

import javax.management.AttributeNotFoundException;

import org.archive.crawler.settings.CrawlerSettings;
import org.archive.crawler.settings.ModuleType;
import org.archive.crawler.settings.SimpleType;
import org.archive.crawler.settings.StringList;
import org.archive.crawler.settings.TextField;
35  
36  /***
37   * RobotsHonoringPolicy represent the strategy used by the crawler 
38   * for determining how robots.txt files will be honored. 
39   *
40   * Five kinds of policies exist:
41   * <dl>
42   * <dt>classic:</dt>
43   *   <dd>obey the first set of robots.txt directives that apply to your 
44   *   current user-agent</dd>
45   * <dt>ignore:</dt>
46   *   <dd>ignore robots.txt directives entirely</dd>
47   * <dt>custom:</dt>
48   *   <dd>obey a specific operator-entered set of robots.txt directives 
49   *   for a given host</dd>
50   * <dt>most-favored:</dt>
51   *   <dd>obey the most liberal restrictions offered (if *any* crawler is 
52   *   allowed to get a page, get it)</dd>
53   * <dt>most-favored-set:</dt>
54   *   <dd>given some set of user-agent patterns, obey the most liberal 
55   *   restriction offered to any</dd>
56   * </dl>
57   *
58   * The two last ones has the opportunity of adopting a different user-agent 
59   * to reflect the restrictions we've opted to use.
60   *
61   * @author John Erik Halse
62   *
63   */
64  public class RobotsHonoringPolicy  extends ModuleType {
65  
66      private static final long serialVersionUID = 8850011643923116605L;
67  
68      private static Logger logger =
69          Logger.getLogger("org.archive.crawler.datamodel.RobotsHonoringPolicy");
70  
71      public final static int CLASSIC = 0;
72      public final static int IGNORE = 1;
73      public final static int CUSTOM = 2;
74      public final static int MOST_FAVORED = 3;
75      public final static int MOST_FAVORED_SET = 4;
76  
77      public final static String ATTR_NAME = "robots-honoring-policy";
78      public final static String ATTR_TYPE = "type";
79      public final static String ATTR_MASQUERADE = "masquerade";
80      public final static String ATTR_CUSTOM_ROBOTS = "custom-robots";
81      public final static String ATTR_USER_AGENTS = "user-agents";
82  
83  
84      /***
85       * Creates a new instance of RobotsHonoringPolicy.
86       *
87       * @param name the name of the RobotsHonoringPolicy attirubte.
88       */
89      public RobotsHonoringPolicy(String name) {
90          super(name, "Robots honoring policy");
91  
92          String[] allowedTypes = new String[] {
93                  "classic", "ignore", "custom", 
94                  "most-favored", "most-favored-set"};
95  
96          addElementToDefinition(new SimpleType(ATTR_TYPE,
97                  "Policy type. The 'classic' policy simply obeys all " +
98                  "robots.txt rules for the configured user-agent. The " +
99                  "'ignore' policy ignores all robots rules. The 'custom' " +
100                 "policy allows you to specify a policy, in robots.txt " +
101                 "format, as a setting. The 'most-favored' policy will " +
102                 "crawl an URL if the robots.txt allows any user-agent to " +
103                 "crawl it. The 'most-favored-set' policy requires you " +
104                 "to supply an list of alternate user-agents, and for" +
105                 "every page, if any agent of the set is allowed, the" +
106                 "page will be crawled.", "classic", allowedTypes));
107         addElementToDefinition(new SimpleType(ATTR_MASQUERADE,
108                 "Should we masquerade as another user agent when obeying " +
109                 "the rules declared for it. Only relevant if the " +
110                 "policy type is 'most-favored' or 'most-favored-set'.", 
111                 new Boolean(false)));
112         addElementToDefinition(new SimpleType(ATTR_CUSTOM_ROBOTS,
113                 "Custom robots to use if policy type is 'custom'. " +
114                 "Compose as if an actual robots.txt file.", 
115                 new TextField("")));
116         addElementToDefinition(new StringList(ATTR_USER_AGENTS, 
117                 "Alternate user-agent values to consider using for " +
118                 "the 'most-favored-set' policy."));
119     }
120 
121     public RobotsHonoringPolicy() {
122         this(ATTR_NAME);
123     }
124 
125     /***
126      * If policy-type is most favored crawler of set, then this method
127      * gets a list of all useragents in that set.
128      *
129      * @return List of Strings with user agents
130      */
131     public StringList getUserAgents(CrawlerSettings settings) {
132         if (isType(settings, RobotsHonoringPolicy.MOST_FAVORED_SET)) {
133             try {
134                 return (StringList) getAttribute(settings, ATTR_USER_AGENTS);
135             } catch (AttributeNotFoundException e) {
136                 logger.severe(e.getMessage());
137             }
138         }
139         return null;
140     }
141 
142     /***
143      * This method returns true if the crawler should masquerade as the user agent
144      * which restrictions it opted to use.
145      *
146      * (Only relevant for  policy-types: most-favored and most-favored-set).
147      *
148      * @return true if we should masquerade
149      */
150     public boolean shouldMasquerade(CrawlURI curi) {
151         try {
152             return ((Boolean) getAttribute(curi, ATTR_MASQUERADE)).booleanValue();
153         } catch (AttributeNotFoundException e) {
154             logger.severe(e.getMessage());
155             return false;
156         }
157     }
158 
159     /***
160      * Get the supplied custom robots.txt
161      *
162      * @return String with content of alternate robots.txt
163      */
164     public String getCustomRobots(CrawlerSettings settings) {
165         if(isType(settings, RobotsHonoringPolicy.CUSTOM)) {
166             try {
167                 return getAttribute(settings, ATTR_CUSTOM_ROBOTS).toString();
168             } catch (AttributeNotFoundException e) {
169                 logger.severe(e.getMessage());
170             }
171         }
172         return null;
173     }
174 
175     /***
176      * Get the policy-type.
177      *
178      * @see #CLASSIC
179      * @see #IGNORE
180      * @see #CUSTOM
181      * @see #MOST_FAVORED
182      * @see #MOST_FAVORED_SET
183      *
184      * @return policy type
185      */
186     public int getType(Object context) {
187         int type = CLASSIC;
188         try {
189             String typeName = (String) getAttribute(context, "type");
190             if(typeName.equals("classic")) {
191                 type = RobotsHonoringPolicy.CLASSIC;
192             } else if(typeName.equals("ignore")) {
193                 type = RobotsHonoringPolicy.IGNORE;
194             } else if(typeName.equals("custom")) {
195                 type = RobotsHonoringPolicy.CUSTOM;
196             } else if(typeName.equals("most-favored")) {
197                 type = RobotsHonoringPolicy.MOST_FAVORED;
198             } else if(typeName.equals("most-favored-set")) {
199                 type = RobotsHonoringPolicy.MOST_FAVORED_SET;
200             } else {
201                 throw new IllegalArgumentException();
202             }
203         } catch (AttributeNotFoundException e) {
204             logger.severe(e.getMessage());
205         }
206         return type;
207     }
208 
209     /***
210      * Check if policy is of a certain type.
211      *
212      * @param o An object that can be resolved into a settings object.
213      * @param type the type to check against.
214      * @return true if the policy is of the submitted type
215      */
216     public boolean isType(Object o, int type) {
217         return type == getType(o);
218     }
219 
220 }