1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25 package org.archive.crawler.postprocessor;
26
27 import javax.management.AttributeNotFoundException;
28
29 import org.archive.crawler.datamodel.CrawlURI;
30 import org.archive.crawler.settings.SimpleType;
31 import org.archive.util.TextUtils;
32
33 /***
34 * A WaitEvaluator that compares the CrawlURIs content type to a configurable
35 * regular expression. If it matches, then the wait evaluation is performed.
36 * Otherwise the processor passes on the CrawlURI, doing nothing.
37 *
38 * @author Kristinn Sigurdsson
39 *
40 * @see org.archive.crawler.postprocessor.WaitEvaluator
41 */
42 public class ContentBasedWaitEvaluator extends WaitEvaluator {
43
44 private static final long serialVersionUID = 1623347208782997347L;
45
46 /*** The regular expression that we limit this evaluator to. */
47 public final static String ATTR_CONTENT_REGEXPR =
48 "content-regular-expression";
49 protected final static String DEFAULT_CONTENT_REGEXPR = "^.*$";
50
51 /***
52 * Constructor
53 *
54 * @param name The name of the module
55 */
56 public ContentBasedWaitEvaluator(String name) {
57 this(name,"Evaluates how long to wait before fetching a URI again. " +
58 "Only handles CrawlURIs whose content type matches the " +
59 "regular expression set. " +
60 "Typically, this processor should be in the post processing " +
61 "chain. It will pass if another wait evaluator has already " +
62 "processed the CrawlURI.", DEFAULT_CONTENT_REGEXPR,
63 DEFAULT_INITIAL_WAIT_INTERVAL,
64 DEFAULT_MAX_WAIT_INTERVAL,
65 DEFAULT_MIN_WAIT_INTERVAL,
66 DEFAULT_UNCHANGED_FACTOR,
67 DEFAULT_CHANGED_FACTOR);
68 }
69
70 /***
71 * Constructor
72 *
73 * @param name The name of the module
74 * @param description Description of the module
75 * @param default_inital_wait_interval The default value for initial wait
76 * time
77 * @param default_max_wait_interval The maximum value for wait time
78 * @param default_min_wait_interval The minimum value for wait time
79 * @param default_unchanged_factor The factor for changing wait times of
80 * unchanged documents (will be multiplied by this value)
81 * @param default_changed_factor The factor for changing wait times of
82 * changed documents (will be divided by this value)
83 */
84 public ContentBasedWaitEvaluator(String name, String description,
85 String defaultRegExpr,
86 Long default_inital_wait_interval,
87 Long default_max_wait_interval,
88 Long default_min_wait_interval,
89 Double default_unchanged_factor,
90 Double default_changed_factor){
91 super(name,description,
92 default_inital_wait_interval,
93 default_max_wait_interval,
94 default_min_wait_interval,
95 default_unchanged_factor,
96 default_changed_factor);
97
98 addElementToDefinition(new SimpleType(ATTR_CONTENT_REGEXPR,
99 "Only URIs whose content type matches this regular " +
100 "expression will be evaluated.",
101 defaultRegExpr));
102
103 }
104
105 protected void innerProcess(CrawlURI curi) throws InterruptedException {
106
107 String content_type = curi.getContentType();
108 if(content_type==null){
109
110 return;
111 }
112 String regexpr;
113 try {
114 regexpr = (String)getAttribute(curi,ATTR_CONTENT_REGEXPR);
115 } catch (AttributeNotFoundException e) {
116 logger.warning("Regular expression for content type not found");
117 return;
118 }
119
120 if(TextUtils.matches(regexpr, content_type) == false){
121
122 return;
123 }
124
125
126 super.innerProcess(curi);
127 }
128 }