1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25 package org.archive.crawler.datamodel;
26
27 import java.io.Serializable;
28 import java.util.concurrent.ConcurrentSkipListSet;
29
/**
 * Represents the directives that apply to a user-agent (or set of
 * user-agents).
 *
 * <p>Allow and disallow rules are stored as plain path prefixes. A path is
 * permitted unless the longest disallow prefix matching it is strictly longer
 * than the longest allow prefix matching it.
 */
public class RobotsDirectives implements Serializable {
    private static final long serialVersionUID = 5386542759286155383L;

    /** Disallowed path prefixes; {@link #addDisallow(String)} never stores the empty string. */
    ConcurrentSkipListSet<String> disallows = new ConcurrentSkipListSet<String>();

    /** Explicitly allowed path prefixes. */
    ConcurrentSkipListSet<String> allows = new ConcurrentSkipListSet<String>();

    /** Crawl delay; stays at -1 until {@link #setCrawlDelay(float)} is called. */
    float crawlDelay = -1;

    /**
     * Decides whether {@code path} may be fetched under these directives.
     *
     * @param path the path to test
     * @return {@code false} only when the longest matching disallow prefix is
     *         strictly longer than the longest matching allow prefix; ties
     *         (including "no rule matches at all") resolve to {@code true}
     */
    public boolean allows(String path) {
        int disallowLength = longestPrefixLength(disallows, path);
        int allowLength = longestPrefixLength(allows, path);
        return allowLength >= disallowLength;
    }

    /**
     * Finds the longest entry of {@code prefixSet} that is a prefix of
     * {@code str}.
     *
     * <p>Looking only at {@code prefixSet.floor(str)} is not enough: the floor
     * may be a sibling entry (e.g. {@code "/a/b"} for {@code "/a/c"}) that
     * sorts between a genuine shorter prefix ({@code "/a"}) and {@code str}.
     * When the floor is not a prefix, no entry longer than the common prefix
     * of the floor and {@code str} can be a prefix either, so the search is
     * retried with that common prefix as the new probe. The probe gets
     * strictly shorter on every retry, so the loop terminates.
     *
     * @param prefixSet sorted set of candidate prefixes
     * @param str string to match against
     * @return length of longest entry in {@code prefixSet} that prefixes {@code str}, or zero
     *         if no entry prefixes {@code str}
     */
    protected int longestPrefixLength(ConcurrentSkipListSet<String> prefixSet,
            String str) {
        String probe = str;
        while (true) {
            String candidate = prefixSet.floor(probe);
            if (candidate == null) {
                // nothing sorts at or below the probe, so nothing can prefix str
                return 0;
            }
            if (str.startsWith(candidate)) {
                return candidate.length();
            }
            // candidate diverges from str; any real prefix in the set must be
            // a prefix of the part they share, so narrow the probe to that
            probe = str.substring(0, commonPrefixLength(candidate, str));
        }
    }

    /** Returns the number of leading chars that {@code a} and {@code b} share. */
    private static int commonPrefixLength(String a, String b) {
        int limit = Math.min(a.length(), b.length());
        int i = 0;
        while (i < limit && a.charAt(i) == b.charAt(i)) {
            i++;
        }
        return i;
    }

    /**
     * Registers a disallowed path prefix.
     *
     * @param path prefix to disallow; an empty string is ignored and not stored
     */
    public void addDisallow(String path) {
        if (path.length() == 0) {
            // an empty disallow value is deliberately not recorded as a rule
            return;
        }
        disallows.add(path);
    }

    /**
     * Registers an allowed path prefix.
     *
     * @param path prefix to allow
     */
    public void addAllow(String path) {
        allows.add(path);
    }

    /**
     * Sets the crawl delay.
     *
     * @param i the new crawl-delay value
     */
    public void setCrawlDelay(float i) {
        crawlDelay = i;
    }

    /**
     * @return the crawl delay, or -1 if none has been set
     */
    public float getCrawlDelay() {
        return crawlDelay;
    }
}