1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25 package org.archive.crawler.postprocessor;
26
27 import java.util.logging.Level;
28 import java.util.logging.Logger;
29
30 import javax.management.AttributeNotFoundException;
31
32 import org.archive.crawler.datamodel.CrawlURI;
33 import org.archive.crawler.framework.Processor;
34 import org.archive.crawler.frontier.AdaptiveRevisitAttributeConstants;
35 import org.archive.crawler.settings.SimpleType;
36
37 /***
38 * A processor that determines when a URI should be revisited next. Does
39 * <b>not</b> account for DNS and robots.txt expiration. That should be
40 * handled seperately by the Frontiers.
41 *
42 * @author Kristinn Sigurdsson
43 */
44 public class WaitEvaluator extends Processor
45 implements AdaptiveRevisitAttributeConstants {
46
47 private static final long serialVersionUID = 7452762726125458413L;
48
49 Logger logger = Logger.getLogger(WaitEvaluator.class.getName());
50
51 /*** Default wait time after initial visit. */
52 public final static String ATTR_INITIAL_WAIT_INTERVAL =
53 "initial-wait-interval-seconds";
54 protected final static Long DEFAULT_INITIAL_WAIT_INTERVAL =
55 new Long(86400);
56 /*** Maximum wait between visits */
57 public final static String ATTR_MAX_WAIT_INTERVAL =
58 "max-wait-interval-seconds";
59 protected final static Long DEFAULT_MAX_WAIT_INTERVAL =
60 new Long(2419200);
61 /*** Minimum wait between visits */
62 public final static String ATTR_MIN_WAIT_INTERVAL =
63 "min-wait-interval-seconds";
64 protected final static Long DEFAULT_MIN_WAIT_INTERVAL =
65 new Long(3600);
66 /*** Factor increase on wait when unchanged */
67 public final static String ATTR_UNCHANGED_FACTOR = "unchanged-factor";
68 protected final static Double DEFAULT_UNCHANGED_FACTOR = new Double(1.5);
69 /*** Factor decrease on wait when changed */
70 public final static String ATTR_CHANGED_FACTOR = "changed-factor";
71 protected final static Double DEFAULT_CHANGED_FACTOR = new Double(1.5);
72 /*** Fixed wait time for 'unknown' change status. I.e. wait time for URIs
73 * whose content change detection is not available. */
74 public final static String ATTR_DEFAULT_WAIT_INTERVAL =
75 "default-wait-interval-seconds";
76 protected final static Long DEFAULT_DEFAULT_WAIT_INTERVAL =
77 new Long(259200);
78 /*** Indicates if the amount of time the URI was overdue should be added
79 * to the wait time before the new wait time is calculated. */
80 public final static String ATTR_USE_OVERDUE_TIME = "use-overdue-time";
81 protected final static Boolean DEFAULT_USE_OVERDUE_TIME = new Boolean(false);
82
83 /***
84 * Constructor
85 *
86 * @param name The name of the module
87 */
88 public WaitEvaluator(String name) {
89 this(name,
90 "Evaluates how long to wait before fetching a URI again. " +
91 "Typically, this processor should be in the post processing " +
92 "chain. It will pass if another wait evaluator has already " +
93 "processed the CrawlURI.",
94 DEFAULT_INITIAL_WAIT_INTERVAL,
95 DEFAULT_MAX_WAIT_INTERVAL,
96 DEFAULT_MIN_WAIT_INTERVAL,
97 DEFAULT_UNCHANGED_FACTOR,
98 DEFAULT_CHANGED_FACTOR);
99 }
100
101 /***
102 * Constructor
103 *
104 * @param name The name of the module
105 * @param description Description of the module
106 * @param default_inital_wait_interval The default value for initial wait
107 * time
108 * @param default_max_wait_interval The maximum value for wait time
109 * @param default_min_wait_interval The minimum value for wait time
110 * @param default_unchanged_factor The factor for changing wait times of
111 * unchanged documents (will be multiplied by this value)
112 * @param default_changed_factor The factor for changing wait times of
113 * changed documents (will be divided by this value)
114 */
115 public WaitEvaluator(String name, String description,
116 Long default_inital_wait_interval,
117 Long default_max_wait_interval,
118 Long default_min_wait_interval,
119 Double default_unchanged_factor,
120 Double default_changed_factor){
121 super(name, description);
122
123 addElementToDefinition(new SimpleType(ATTR_INITIAL_WAIT_INTERVAL,
124 "The initial wait time between revisits. Will then be " +
125 "updated according to crawler experiance. I.e. shorter " +
126 "wait, visit more often, if document has changed between " +
127 "visits, and vica versa.",
128 default_inital_wait_interval));
129 addElementToDefinition(new SimpleType(ATTR_MAX_WAIT_INTERVAL,
130 "The maximum settable wait time between revisits. Once a " +
131 "URIs wait time reaches this value, it will not grow " +
132 "further, regardless of subsequent visits that discover " +
133 "no changes. Note that this does not ensure that the URI " +
134 "does not wait any longer, since the crawler might be " +
135 "'behind,' forcing a URI to wait until other URIs, " +
136 "scheduled for earlier are completed..",
137 default_max_wait_interval));
138 addElementToDefinition(new SimpleType(ATTR_MIN_WAIT_INTERVAL,
139 "The minum settable wait time between revisits. Once a " +
140 "URIs wait time reaches this value, it will not be shortened " +
141 "further, regardlesss of subsequent visits that discover " +
142 "changes.",
143 default_min_wait_interval));
144 addElementToDefinition(new SimpleType(ATTR_DEFAULT_WAIT_INTERVAL,
145 "Fixed wait time for 'unknown' change status. I.e. wait time " +
146 "for URIs whose content change detection is not available.",
147 DEFAULT_DEFAULT_WAIT_INTERVAL));
148 addElementToDefinition(new SimpleType(ATTR_UNCHANGED_FACTOR,
149 "The factor by which a URIs wait time is increased when a " +
150 "revisit reveals an unchanged document. A value of 1 will " +
151 "leave it unchanged, a value of 2 will double it etc.",
152 default_unchanged_factor));
153 addElementToDefinition(new SimpleType(ATTR_CHANGED_FACTOR,
154 "The factor by which a URIs wait time is decreased when a " +
155 "revisit reveals a changed document. A value of 1 will leave " +
156 "it unchanged, a value of two will half it etc.",
157 default_changed_factor));
158 addElementToDefinition(new SimpleType(ATTR_USE_OVERDUE_TIME,
159 "Indicates if the amount of time the URI was overdue should " +
160 "be added to the wait time before the new wait time is " +
161 "calculated.",
162 DEFAULT_USE_OVERDUE_TIME));
163
164
165 CrawlURI.addAlistPersistentMember(A_WAIT_INTERVAL);
166 }
167
168 protected void innerProcess(CrawlURI curi) throws InterruptedException {
169
170 if(curi.isSuccess()==false){
171
172
173 return;
174 }
175
176 if(curi.containsKey(A_WAIT_REEVALUATED) &&
177 ((Boolean)curi.getObject(A_WAIT_REEVALUATED)).booleanValue()){
178
179
180 return;
181 }
182
183 long min;
184 try {
185 min = ((Long)getAttribute(curi, ATTR_MIN_WAIT_INTERVAL)).
186 longValue() * 1000;
187 } catch (AttributeNotFoundException e1) {
188 min = DEFAULT_MIN_WAIT_INTERVAL.longValue();
189 logger.fine("Unable to load minimum wait interval for " +
190 curi.toString());
191 }
192
193 long max;
194 try {
195 max = ((Long)getAttribute(curi, ATTR_MAX_WAIT_INTERVAL)).
196 longValue() * 1000;
197 } catch (AttributeNotFoundException e1) {
198 max = DEFAULT_MAX_WAIT_INTERVAL.longValue();
199 logger.fine("Unable to load maximum wait interval for " +
200 curi.toString());
201 }
202
203
204 long waitInterval;
205 if (!curi.containsKey(A_CONTENT_STATE_KEY) ||
206 curi.getInt(A_CONTENT_STATE_KEY) == CONTENT_UNKNOWN) {
207 try {
208 waitInterval = ((Long)getAttribute(curi,
209 ATTR_DEFAULT_WAIT_INTERVAL)).longValue() * 1000;
210 } catch (AttributeNotFoundException e1) {
211 waitInterval = DEFAULT_DEFAULT_WAIT_INTERVAL.longValue();
212 logger.fine("Unable to load default wait interval for "
213 + curi.toString());
214 }
215 } else {
216
217 waitInterval = DEFAULT_INITIAL_WAIT_INTERVAL.longValue()*1000;
218
219
220 if(curi.containsKey(A_WAIT_INTERVAL)){
221 waitInterval = curi.getLong(A_WAIT_INTERVAL);
222
223
224 boolean useOverrideTime = DEFAULT_USE_OVERDUE_TIME.booleanValue();
225 try {
226 useOverrideTime = ((Boolean)getAttribute(
227 curi,ATTR_USE_OVERDUE_TIME)).booleanValue();
228 } catch (AttributeNotFoundException e1) {
229 useOverrideTime = DEFAULT_USE_OVERDUE_TIME.booleanValue();
230 logger.fine("Unable to load use-overdue-time for " +
231 curi.toString());
232 }
233
234 if(useOverrideTime){
235 waitInterval += curi.getLong(A_FETCH_OVERDUE);
236 }
237
238
239 if(curi.getInt(A_CONTENT_STATE_KEY) == CONTENT_CHANGED){
240
241 double factor;
242 try {
243 factor = ((Double)getAttribute(
244 curi,ATTR_CHANGED_FACTOR)).doubleValue();
245 } catch (AttributeNotFoundException e2) {
246 factor = DEFAULT_CHANGED_FACTOR.doubleValue();
247 logger.fine("Unable to load changed factor for " +
248 curi.toString());
249 }
250 waitInterval = (long)(waitInterval / factor);
251 } else if(curi.getInt(A_CONTENT_STATE_KEY) ==
252 CONTENT_UNCHANGED) {
253
254 double factor;
255 try {
256 factor = ((Double)getAttribute(
257 curi,ATTR_UNCHANGED_FACTOR)).doubleValue();
258 } catch (AttributeNotFoundException e2) {
259 factor = DEFAULT_UNCHANGED_FACTOR.doubleValue();
260 logger.fine("Unable to load unchanged factor for " +
261 curi.toString());
262 }
263 waitInterval = (long)(waitInterval*factor);
264 }
265 } else {
266
267 try {
268 waitInterval = ((Long)getAttribute(
269 curi,ATTR_INITIAL_WAIT_INTERVAL)).longValue()*1000;
270 } catch (AttributeNotFoundException e1) {
271
272 logger.fine("Unable to load initial wait interval for " +
273 curi.toString());
274 }
275 }
276 }
277
278 if(waitInterval < min){
279 waitInterval = min;
280 } else if(waitInterval > max){
281 waitInterval = max;
282 }
283
284 if (logger.isLoggable(Level.FINE)) {
285 logger.fine("URI " + curi.toString() + ", change: "
286 + curi.getInt(A_CONTENT_STATE_KEY) + " new wait interval: "
287 + waitInterval);
288 }
289
290 curi.putLong(A_WAIT_INTERVAL,waitInterval);
291 curi.putObject(A_WAIT_REEVALUATED,new Boolean(true));
292 }
293 }