1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24 package org.archive.crawler.prefetch;
25
26 import javax.management.AttributeNotFoundException;
27
28 import org.archive.crawler.datamodel.CrawlURI;
29 import org.archive.crawler.datamodel.FetchStatusCodes;
30 import org.archive.crawler.framework.Scoper;
31 import org.archive.crawler.settings.SimpleType;
32 import org.archive.crawler.settings.Type;
33 import org.archive.util.TextUtils;
34
35 /***
36 * If set to recheck the crawl's scope, gives a yes/no on whether
37 * a CrawlURI should be processed at all. If not, its status
38 * will be marked OUT_OF_SCOPE and the URI will skip directly
39 * to the first "postprocessor".
40 *
41 *
42 * @author gojomo
43 *
44 */
45 public class Preselector extends Scoper
46 implements FetchStatusCodes {
47
48 private static final long serialVersionUID = 3738560264369561017L;
49
50 /*** whether to reapply crawl scope at this step */
51 public static final String ATTR_RECHECK_SCOPE = "recheck-scope";
52 /*** indicator allowing all URIs (of a given host, typically) to
53 * be blocked at this step*/
54 public static final String ATTR_BLOCK_ALL = "block-all";
55 /*** indicator allowing all matching URIs to be blocked at this step */
56 public static final String ATTR_BLOCK_BY_REGEXP = "block-by-regexp";
57 /*** indicator allowing all matching URIs */
58 public static final String ATTR_ALLOW_BY_REGEXP = "allow-by-regexp";
59
60 /***
61 * Constructor.
62 * @param name Name of this processor.
63 */
64 public Preselector(String name) {
65 super(name, "Preselector. Does one last bit of checking to make " +
66 "sure that the current URI should be fetched.");
67 Type e;
68 e = addElementToDefinition(new SimpleType(ATTR_RECHECK_SCOPE,
69 "Recheck if uri is in scope. This is meaningful if the scope" +
70 " is altered during a crawl. URIs are checked against the" +
71 " scope when they are added to queues. Setting this value to" +
72 " true forces the URI to be checked against the scope when it" +
73 " is comming out of the queue, possibly after the scope is" +
74 " altered.", new Boolean(false)));
75 e.setExpertSetting(true);
76
77 e = addElementToDefinition(new SimpleType(ATTR_BLOCK_ALL,
78 "Block all URIs from being processed. This is most likely to" +
79 " be used in overrides to easily reject certain hosts from" +
80 " being processed.", new Boolean(false)));
81 e.setExpertSetting(true);
82
83 e = addElementToDefinition(new SimpleType(ATTR_BLOCK_BY_REGEXP,
84 "Block all URIs matching the regular expression from being" +
85 " processed.", ""));
86 e.setExpertSetting(true);
87
88 e = addElementToDefinition(new SimpleType(ATTR_ALLOW_BY_REGEXP,
89 "Allow only URIs matching the regular expression to be" +
90 " processed.", ""));
91 e.setExpertSetting(true);
92 }
93
94 protected void innerProcess(CrawlURI curi) {
95
96 try {
97 if (((Boolean) getAttribute(ATTR_BLOCK_ALL, curi)).booleanValue()) {
98 curi.setFetchStatus(S_BLOCKED_BY_USER);
99 curi.skipToProcessorChain(getController().
100 getPostprocessorChain());
101 }
102 } catch (AttributeNotFoundException e) {
103
104 }
105
106
107 try {
108 String regexp = (String) getAttribute(ATTR_ALLOW_BY_REGEXP, curi);
109 if (regexp != null && !regexp.equals("")) {
110 if (!TextUtils.matches(regexp, curi.toString())) {
111 curi.setFetchStatus(S_BLOCKED_BY_USER);
112 curi.skipToProcessorChain(getController().
113 getPostprocessorChain());
114 }
115 }
116 } catch (AttributeNotFoundException e) {
117
118 }
119
120
121
122 try {
123 String regexp = (String) getAttribute(ATTR_BLOCK_BY_REGEXP, curi);
124 if (regexp != null && !regexp.equals("")) {
125 if (TextUtils.matches(regexp, curi.toString())) {
126 curi.setFetchStatus(S_BLOCKED_BY_USER);
127 curi.skipToProcessorChain(getController().
128 getPostprocessorChain());
129 }
130 }
131 } catch (AttributeNotFoundException e) {
132
133 }
134
135
136 try {
137 if (((Boolean) getAttribute(ATTR_RECHECK_SCOPE, curi)).
138 booleanValue()) {
139 if (!isInScope(curi)) {
140
141 curi.setFetchStatus(S_OUT_OF_SCOPE);
142 curi.skipToProcessorChain(getController().
143 getPostprocessorChain());
144 }
145 }
146 } catch (AttributeNotFoundException e) {
147
148 }
149 }
150 }