1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24 package org.archive.crawler.datamodel;
25
26 import java.util.logging.Logger;
27
28 import javax.management.AttributeNotFoundException;
29
30 import org.archive.crawler.settings.CrawlerSettings;
31 import org.archive.crawler.settings.ModuleType;
32 import org.archive.crawler.settings.SimpleType;
33 import org.archive.crawler.settings.StringList;
34 import org.archive.crawler.settings.TextField;
35
36 /***
37 * RobotsHonoringPolicy represent the strategy used by the crawler
38 * for determining how robots.txt files will be honored.
39 *
40 * Five kinds of policies exist:
41 * <dl>
42 * <dt>classic:</dt>
43 * <dd>obey the first set of robots.txt directives that apply to your
44 * current user-agent</dd>
45 * <dt>ignore:</dt>
46 * <dd>ignore robots.txt directives entirely</dd>
47 * <dt>custom:</dt>
48 * <dd>obey a specific operator-entered set of robots.txt directives
49 * for a given host</dd>
50 * <dt>most-favored:</dt>
51 * <dd>obey the most liberal restrictions offered (if *any* crawler is
52 * allowed to get a page, get it)</dd>
53 * <dt>most-favored-set:</dt>
54 * <dd>given some set of user-agent patterns, obey the most liberal
55 * restriction offered to any</dd>
56 * </dl>
57 *
58 * The two last ones has the opportunity of adopting a different user-agent
59 * to reflect the restrictions we've opted to use.
60 *
61 * @author John Erik Halse
62 *
63 */
64 public class RobotsHonoringPolicy extends ModuleType {
65
66 private static final long serialVersionUID = 8850011643923116605L;
67
68 private static Logger logger =
69 Logger.getLogger("org.archive.crawler.datamodel.RobotsHonoringPolicy");
70
71 public final static int CLASSIC = 0;
72 public final static int IGNORE = 1;
73 public final static int CUSTOM = 2;
74 public final static int MOST_FAVORED = 3;
75 public final static int MOST_FAVORED_SET = 4;
76
77 public final static String ATTR_NAME = "robots-honoring-policy";
78 public final static String ATTR_TYPE = "type";
79 public final static String ATTR_MASQUERADE = "masquerade";
80 public final static String ATTR_CUSTOM_ROBOTS = "custom-robots";
81 public final static String ATTR_USER_AGENTS = "user-agents";
82
83
84 /***
85 * Creates a new instance of RobotsHonoringPolicy.
86 *
87 * @param name the name of the RobotsHonoringPolicy attirubte.
88 */
89 public RobotsHonoringPolicy(String name) {
90 super(name, "Robots honoring policy");
91
92 String[] allowedTypes = new String[] {
93 "classic", "ignore", "custom",
94 "most-favored", "most-favored-set"};
95
96 addElementToDefinition(new SimpleType(ATTR_TYPE,
97 "Policy type. The 'classic' policy simply obeys all " +
98 "robots.txt rules for the configured user-agent. The " +
99 "'ignore' policy ignores all robots rules. The 'custom' " +
100 "policy allows you to specify a policy, in robots.txt " +
101 "format, as a setting. The 'most-favored' policy will " +
102 "crawl an URL if the robots.txt allows any user-agent to " +
103 "crawl it. The 'most-favored-set' policy requires you " +
104 "to supply an list of alternate user-agents, and for" +
105 "every page, if any agent of the set is allowed, the" +
106 "page will be crawled.", "classic", allowedTypes));
107 addElementToDefinition(new SimpleType(ATTR_MASQUERADE,
108 "Should we masquerade as another user agent when obeying " +
109 "the rules declared for it. Only relevant if the " +
110 "policy type is 'most-favored' or 'most-favored-set'.",
111 new Boolean(false)));
112 addElementToDefinition(new SimpleType(ATTR_CUSTOM_ROBOTS,
113 "Custom robots to use if policy type is 'custom'. " +
114 "Compose as if an actual robots.txt file.",
115 new TextField("")));
116 addElementToDefinition(new StringList(ATTR_USER_AGENTS,
117 "Alternate user-agent values to consider using for " +
118 "the 'most-favored-set' policy."));
119 }
120
121 public RobotsHonoringPolicy() {
122 this(ATTR_NAME);
123 }
124
125 /***
126 * If policy-type is most favored crawler of set, then this method
127 * gets a list of all useragents in that set.
128 *
129 * @return List of Strings with user agents
130 */
131 public StringList getUserAgents(CrawlerSettings settings) {
132 if (isType(settings, RobotsHonoringPolicy.MOST_FAVORED_SET)) {
133 try {
134 return (StringList) getAttribute(settings, ATTR_USER_AGENTS);
135 } catch (AttributeNotFoundException e) {
136 logger.severe(e.getMessage());
137 }
138 }
139 return null;
140 }
141
142 /***
143 * This method returns true if the crawler should masquerade as the user agent
144 * which restrictions it opted to use.
145 *
146 * (Only relevant for policy-types: most-favored and most-favored-set).
147 *
148 * @return true if we should masquerade
149 */
150 public boolean shouldMasquerade(CrawlURI curi) {
151 try {
152 return ((Boolean) getAttribute(curi, ATTR_MASQUERADE)).booleanValue();
153 } catch (AttributeNotFoundException e) {
154 logger.severe(e.getMessage());
155 return false;
156 }
157 }
158
159 /***
160 * Get the supplied custom robots.txt
161 *
162 * @return String with content of alternate robots.txt
163 */
164 public String getCustomRobots(CrawlerSettings settings) {
165 if(isType(settings, RobotsHonoringPolicy.CUSTOM)) {
166 try {
167 return getAttribute(settings, ATTR_CUSTOM_ROBOTS).toString();
168 } catch (AttributeNotFoundException e) {
169 logger.severe(e.getMessage());
170 }
171 }
172 return null;
173 }
174
175 /***
176 * Get the policy-type.
177 *
178 * @see #CLASSIC
179 * @see #IGNORE
180 * @see #CUSTOM
181 * @see #MOST_FAVORED
182 * @see #MOST_FAVORED_SET
183 *
184 * @return policy type
185 */
186 public int getType(Object context) {
187 int type = CLASSIC;
188 try {
189 String typeName = (String) getAttribute(context, "type");
190 if(typeName.equals("classic")) {
191 type = RobotsHonoringPolicy.CLASSIC;
192 } else if(typeName.equals("ignore")) {
193 type = RobotsHonoringPolicy.IGNORE;
194 } else if(typeName.equals("custom")) {
195 type = RobotsHonoringPolicy.CUSTOM;
196 } else if(typeName.equals("most-favored")) {
197 type = RobotsHonoringPolicy.MOST_FAVORED;
198 } else if(typeName.equals("most-favored-set")) {
199 type = RobotsHonoringPolicy.MOST_FAVORED_SET;
200 } else {
201 throw new IllegalArgumentException();
202 }
203 } catch (AttributeNotFoundException e) {
204 logger.severe(e.getMessage());
205 }
206 return type;
207 }
208
209 /***
210 * Check if policy is of a certain type.
211 *
212 * @param o An object that can be resolved into a settings object.
213 * @param type the type to check against.
214 * @return true if the policy is of the submitted type
215 */
216 public boolean isType(Object o, int type) {
217 return type == getType(o);
218 }
219
220 }