View Javadoc

1   /* ContentBasedWaitEvaluator
2    * 
3    * $Id: ContentBasedWaitEvaluator.java 4654 2006-09-25 20:19:54Z paul_jack $
4    * 
5    * Created on 1.4.2005
6    *
7    * Copyright (C) 2005 Kristinn Sigurdsson
8    * 
9    * This file is part of the Heritrix web crawler (crawler.archive.org).
10   * 
11   * Heritrix is free software; you can redistribute it and/or modify
12   * it under the terms of the GNU Lesser Public License as published by
13   * the Free Software Foundation; either version 2.1 of the License, or
14   * any later version.
15   * 
16   * Heritrix is distributed in the hope that it will be useful, 
17   * but WITHOUT ANY WARRANTY; without even the implied warranty of
18   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19   * GNU Lesser Public License for more details.
20   * 
21   * You should have received a copy of the GNU Lesser Public License
22   * along with Heritrix; if not, write to the Free Software
23   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
24   */
25  package org.archive.crawler.postprocessor;
26  
27  import javax.management.AttributeNotFoundException;
28  
29  import org.archive.crawler.datamodel.CrawlURI;
30  import org.archive.crawler.settings.SimpleType;
31  import org.archive.util.TextUtils;
32  
33  /***
34   * A WaitEvaluator that compares the CrawlURIs content type to a configurable
35   * regular expression. If it matches, then the wait evaluation is performed.
36   * Otherwise the processor passes on the CrawlURI, doing nothing. 
37   *
38   * @author Kristinn Sigurdsson
39   * 
40   * @see org.archive.crawler.postprocessor.WaitEvaluator
41   */
42  public class ContentBasedWaitEvaluator extends WaitEvaluator {
43      
44      private static final long serialVersionUID = 1623347208782997347L;
45  
46      /*** The regular expression that we limit this evaluator to. */
47      public final static String ATTR_CONTENT_REGEXPR =
48          "content-regular-expression";
49      protected final static String DEFAULT_CONTENT_REGEXPR = "^.*$"; //Everything
50  
51      /***
52       * Constructor
53       * 
54       * @param name The name of the module
55       */
56      public ContentBasedWaitEvaluator(String name) {
57          this(name,"Evaluates how long to wait before fetching a URI again. " +
58                  "Only handles CrawlURIs whose content type matches the " +
59                  "regular expression set. " +
60                  "Typically, this processor should be in the post processing " +
61                  "chain. It will pass if another wait evaluator has already " +
62                  "processed the CrawlURI.", DEFAULT_CONTENT_REGEXPR,
63                  DEFAULT_INITIAL_WAIT_INTERVAL,
64                  DEFAULT_MAX_WAIT_INTERVAL,
65                  DEFAULT_MIN_WAIT_INTERVAL,
66                  DEFAULT_UNCHANGED_FACTOR,
67                  DEFAULT_CHANGED_FACTOR);
68      }
69  
70      /***
71       * Constructor
72       * 
73       * @param name The name of the module
74       * @param description Description of the module
75       * @param default_inital_wait_interval The default value for initial wait
76       *           time
77       * @param default_max_wait_interval The maximum value for wait time
78       * @param default_min_wait_interval The minimum value for wait time
79       * @param default_unchanged_factor The factor for changing wait times of
80       *           unchanged documents (will be multiplied by this value)
81       * @param default_changed_factor The factor for changing wait times of
82       *           changed documents (will be divided by this value)
83       */
84      public ContentBasedWaitEvaluator(String name, String description,
85              String defaultRegExpr,
86              Long default_inital_wait_interval,
87              Long default_max_wait_interval,
88              Long default_min_wait_interval,
89              Double default_unchanged_factor,
90              Double default_changed_factor){
91          super(name,description,
92                  default_inital_wait_interval,
93                  default_max_wait_interval,
94                  default_min_wait_interval,
95                  default_unchanged_factor,
96                  default_changed_factor);
97  
98          addElementToDefinition(new SimpleType(ATTR_CONTENT_REGEXPR,
99                  "Only URIs whose content type matches this regular " +
100                 "expression will be evaluated.",
101                 defaultRegExpr));
102 
103     }
104     
105     protected void innerProcess(CrawlURI curi) throws InterruptedException {
106         // Check if content type is available and if it matches the reg.expr.
107         String content_type = curi.getContentType();
108         if(content_type==null){
109             // No content type, exit
110             return;
111         }
112         String regexpr;
113         try {
114             regexpr = (String)getAttribute(curi,ATTR_CONTENT_REGEXPR);
115         } catch (AttributeNotFoundException e) {
116             logger.warning("Regular expression for content type not found");
117             return;
118         }
119 
120         if(TextUtils.matches(regexpr, content_type) == false){
121             // Content type does not match reg.expr. Exit
122             return;
123         }
124         // Ok, it matches, invoke parent method.
125 
126         super.innerProcess(curi);
127     }
128 }