View Javadoc

1   /* WaitEvaluator
2    * 
3    * $Id: WaitEvaluator.java 4654 2006-09-25 20:19:54Z paul_jack $
4    * 
5    * Created on 26.11.2004
6    *
7    * Copyright (C) 2004 Internet Archive.
8    * 
9    * This file is part of the Heritrix web crawler (crawler.archive.org).
10   * 
11   * Heritrix is free software; you can redistribute it and/or modify
12   * it under the terms of the GNU Lesser Public License as published by
13   * the Free Software Foundation; either version 2.1 of the License, or
14   * any later version.
15   * 
16   * Heritrix is distributed in the hope that it will be useful, 
17   * but WITHOUT ANY WARRANTY; without even the implied warranty of
18   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19   * GNU Lesser Public License for more details.
20   * 
21   * You should have received a copy of the GNU Lesser Public License
22   * along with Heritrix; if not, write to the Free Software
23   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
24   */
25  package org.archive.crawler.postprocessor;
26  
27  import java.util.logging.Level;
28  import java.util.logging.Logger;
29  
30  import javax.management.AttributeNotFoundException;
31  
32  import org.archive.crawler.datamodel.CrawlURI;
33  import org.archive.crawler.framework.Processor;
34  import org.archive.crawler.frontier.AdaptiveRevisitAttributeConstants;
35  import org.archive.crawler.settings.SimpleType;
36  
37  /***
38   * A processor that determines when a URI should be revisited next. Does
39   * <b>not</b> account for DNS and robots.txt expiration. That should be 
40   * handled seperately by the Frontiers.
41   *
42   * @author Kristinn Sigurdsson
43   */
44  public class WaitEvaluator extends Processor
45  implements AdaptiveRevisitAttributeConstants {
46      
47      private static final long serialVersionUID = 7452762726125458413L;
48  
49      Logger logger = Logger.getLogger(WaitEvaluator.class.getName());
50      
51      /*** Default wait time after initial visit. */
52      public final static String ATTR_INITIAL_WAIT_INTERVAL =
53          "initial-wait-interval-seconds";
54      protected final static Long DEFAULT_INITIAL_WAIT_INTERVAL =
55          new Long(86400); // 1 day
56      /*** Maximum wait between visits */
57      public final static String ATTR_MAX_WAIT_INTERVAL =
58          "max-wait-interval-seconds";
59      protected final static Long DEFAULT_MAX_WAIT_INTERVAL =
60          new Long(2419200); // 4 weeks
61      /*** Minimum wait between visits */
62      public final static String ATTR_MIN_WAIT_INTERVAL =
63          "min-wait-interval-seconds";
64      protected final static Long DEFAULT_MIN_WAIT_INTERVAL =
65          new Long(3600); // 1 hour
66      /*** Factor increase on wait when unchanged */
67      public final static String ATTR_UNCHANGED_FACTOR = "unchanged-factor";
68      protected final static Double DEFAULT_UNCHANGED_FACTOR = new Double(1.5); 
69      /*** Factor decrease on wait when changed */
70      public final static String ATTR_CHANGED_FACTOR = "changed-factor";
71      protected final static Double DEFAULT_CHANGED_FACTOR = new Double(1.5); 
72      /*** Fixed wait time for 'unknown' change status. I.e. wait time for URIs 
73       *  whose content change detection is not available. */
74      public final static String ATTR_DEFAULT_WAIT_INTERVAL =
75          "default-wait-interval-seconds";
76      protected final static Long DEFAULT_DEFAULT_WAIT_INTERVAL =
77          new Long(259200); // 3 days
78      /*** Indicates if the amount of time the URI was overdue should be added
79       *  to the wait time before the new wait time is calculated.  */
80      public final static String ATTR_USE_OVERDUE_TIME = "use-overdue-time";
81      protected final static Boolean DEFAULT_USE_OVERDUE_TIME = new Boolean(false); 
82  
83      /***
84       * Constructor
85       * 
86       * @param name The name of the module
87       */
88      public WaitEvaluator(String name) {
89          this(name, 
90                  "Evaluates how long to wait before fetching a URI again. " +
91                  "Typically, this processor should be in the post processing " +
92                  "chain. It will pass if another wait evaluator has already " +
93                  "processed the CrawlURI.",
94                  DEFAULT_INITIAL_WAIT_INTERVAL,
95                  DEFAULT_MAX_WAIT_INTERVAL,
96                  DEFAULT_MIN_WAIT_INTERVAL,
97                  DEFAULT_UNCHANGED_FACTOR,
98                  DEFAULT_CHANGED_FACTOR);
99      }
100 
101     /***
102      * Constructor
103      * 
104      * @param name The name of the module
105      * @param description Description of the module
106      * @param default_inital_wait_interval The default value for initial wait
107      *           time
108      * @param default_max_wait_interval The maximum value for wait time
109      * @param default_min_wait_interval The minimum value for wait time
110      * @param default_unchanged_factor The factor for changing wait times of
111      *           unchanged documents (will be multiplied by this value)
112      * @param default_changed_factor The factor for changing wait times of
113      *           changed documents (will be divided by this value)
114      */
115     public WaitEvaluator(String name, String description,
116             Long default_inital_wait_interval,
117             Long default_max_wait_interval,
118             Long default_min_wait_interval,
119             Double default_unchanged_factor,
120             Double default_changed_factor){
121         super(name, description);
122         
123         addElementToDefinition(new SimpleType(ATTR_INITIAL_WAIT_INTERVAL,
124                 "The initial wait time between revisits. Will then be " +
125                 "updated according to crawler experiance. I.e. shorter " +
126                 "wait, visit more often, if document has changed between " +
127                 "visits, and vica versa.",
128                 default_inital_wait_interval));
129         addElementToDefinition(new SimpleType(ATTR_MAX_WAIT_INTERVAL,
130                 "The maximum settable wait time between revisits. Once a " +
131                 "URIs wait time reaches this value, it will not grow " +
132                 "further, regardless of subsequent visits that discover " +
133                 "no changes. Note that this does not ensure that the URI " +
134                 "does not wait any longer, since the crawler might be " +
135                 "'behind,' forcing a URI to wait until other URIs, " +
136                 "scheduled for earlier are completed..",
137                 default_max_wait_interval));
138         addElementToDefinition(new SimpleType(ATTR_MIN_WAIT_INTERVAL,
139                 "The minum settable wait time between revisits. Once a " +
140                 "URIs wait time reaches this value, it will not be shortened " +
141                 "further, regardlesss of subsequent visits that discover " +
142                 "changes.",
143                 default_min_wait_interval));
144         addElementToDefinition(new SimpleType(ATTR_DEFAULT_WAIT_INTERVAL,
145                 "Fixed wait time for 'unknown' change status. I.e. wait time " +
146                 "for URIs whose content change detection is not available.",
147                 DEFAULT_DEFAULT_WAIT_INTERVAL));
148         addElementToDefinition(new SimpleType(ATTR_UNCHANGED_FACTOR,
149                 "The factor by which a URIs wait time is increased when a " +
150                 "revisit reveals an unchanged document. A value of 1 will " +
151                 "leave it unchanged, a value of 2 will double it etc.",
152                 default_unchanged_factor));
153         addElementToDefinition(new SimpleType(ATTR_CHANGED_FACTOR,
154                 "The factor by which a URIs wait time is decreased when a " +
155                 "revisit reveals a changed document. A value of 1 will leave " +
156                 "it unchanged, a value of two will half it etc.",
157                 default_changed_factor));
158         addElementToDefinition(new SimpleType(ATTR_USE_OVERDUE_TIME,
159                 "Indicates if the amount of time the URI was overdue should " +
160                 "be added to the wait time before the new wait time is " +
161                 "calculated.",
162                 DEFAULT_USE_OVERDUE_TIME));
163 
164         // Register persistent CrawlURI items 
165         CrawlURI.addAlistPersistentMember(A_WAIT_INTERVAL);
166     }
167 
168     protected void innerProcess(CrawlURI curi) throws InterruptedException {
169         
170         if(curi.isSuccess()==false){
171             // If the URI was not crawled successfully, we can not reevaluate
172             // the wait interval.
173             return;
174         }
175         
176         if(curi.containsKey(A_WAIT_REEVALUATED) && 
177                 ((Boolean)curi.getObject(A_WAIT_REEVALUATED)).booleanValue()){
178             // This CrawlURIs wait interval has already been reevaluted during
179             // this processing round.
180             return;
181         }
182             
183         long min;
184         try {
185             min = ((Long)getAttribute(curi, ATTR_MIN_WAIT_INTERVAL)).
186                 longValue() * 1000;
187         } catch (AttributeNotFoundException e1) {
188             min = DEFAULT_MIN_WAIT_INTERVAL.longValue();
189             logger.fine("Unable to load minimum wait interval for " + 
190                     curi.toString());
191         }
192 
193         long max;
194         try {
195             max = ((Long)getAttribute(curi, ATTR_MAX_WAIT_INTERVAL)).
196                 longValue() * 1000;
197         } catch (AttributeNotFoundException e1) {
198             max = DEFAULT_MAX_WAIT_INTERVAL.longValue();
199             logger.fine("Unable to load maximum wait interval for " + 
200                     curi.toString());
201         }
202 
203         
204         long waitInterval;
205         if (!curi.containsKey(A_CONTENT_STATE_KEY) ||
206                 curi.getInt(A_CONTENT_STATE_KEY) == CONTENT_UNKNOWN) {
207             try {
208                 waitInterval = ((Long)getAttribute(curi,
209                         ATTR_DEFAULT_WAIT_INTERVAL)).longValue() * 1000;
210             } catch (AttributeNotFoundException e1) {
211                 waitInterval = DEFAULT_DEFAULT_WAIT_INTERVAL.longValue();
212                 logger.fine("Unable to load default wait interval for "
213                         + curi.toString());
214             }
215         } else {
216             /* Calculate curi's time of next processing */ 
217             waitInterval = DEFAULT_INITIAL_WAIT_INTERVAL.longValue()*1000;
218 
219             // Retrieve wait interval
220             if(curi.containsKey(A_WAIT_INTERVAL)){
221                 waitInterval =  curi.getLong(A_WAIT_INTERVAL); 
222 
223                 // Should override time be taken into account?
224                 boolean useOverrideTime = DEFAULT_USE_OVERDUE_TIME.booleanValue();
225                 try {
226                     useOverrideTime = ((Boolean)getAttribute(
227                             curi,ATTR_USE_OVERDUE_TIME)).booleanValue();
228                 } catch (AttributeNotFoundException e1) {
229                     useOverrideTime = DEFAULT_USE_OVERDUE_TIME.booleanValue();
230                     logger.fine("Unable to load use-overdue-time for " + 
231                             curi.toString());
232                 }
233                 
234                 if(useOverrideTime){
235                     waitInterval += curi.getLong(A_FETCH_OVERDUE);
236                 }
237 
238                 // Revise the wait interval
239                 if(curi.getInt(A_CONTENT_STATE_KEY) == CONTENT_CHANGED){
240                     // Had changed. Decrease wait interval time.
241                     double factor;
242                     try {
243                         factor = ((Double)getAttribute(
244                                 curi,ATTR_CHANGED_FACTOR)).doubleValue();
245                     } catch (AttributeNotFoundException e2) {
246                         factor = DEFAULT_CHANGED_FACTOR.doubleValue();
247                         logger.fine("Unable to load changed factor for " + 
248                                 curi.toString());
249                     }
250                     waitInterval = (long)(waitInterval / factor);
251                 } else if(curi.getInt(A_CONTENT_STATE_KEY) ==
252                         CONTENT_UNCHANGED) {
253                     // Had not changed. Increase wait interval time
254                     double factor;
255                     try {
256                         factor = ((Double)getAttribute(
257                                 curi,ATTR_UNCHANGED_FACTOR)).doubleValue();
258                     } catch (AttributeNotFoundException e2) {
259                         factor = DEFAULT_UNCHANGED_FACTOR.doubleValue();
260                         logger.fine("Unable to load unchanged factor for " + 
261                                 curi.toString());
262                     }
263                     waitInterval = (long)(waitInterval*factor);
264                 }
265             } else {
266                 // If wait element not found, use initial wait interval 
267                 try {
268                     waitInterval = ((Long)getAttribute(
269                             curi,ATTR_INITIAL_WAIT_INTERVAL)).longValue()*1000;
270                 } catch (AttributeNotFoundException e1) {
271                     // If this fails use default (already set) and log error.
272                     logger.fine("Unable to load initial wait interval for " + 
273                             curi.toString());
274                 }        
275             }
276         }
277         
278         if(waitInterval < min){
279             waitInterval = min;
280         } else if(waitInterval > max){
281             waitInterval = max;
282         }
283         
284         if (logger.isLoggable(Level.FINE)) {
285             logger.fine("URI " + curi.toString() + ", change: "
286                     + curi.getInt(A_CONTENT_STATE_KEY) + " new wait interval: "
287                     + waitInterval);
288         }
289         // Update wait interval
290         curi.putLong(A_WAIT_INTERVAL,waitInterval);
291         curi.putObject(A_WAIT_REEVALUATED,new Boolean(true));
292     }
293 }