View Javadoc

1   /* Copyright (C) 2003 Internet Archive.
2    *
3    * This file is part of the Heritrix web crawler (crawler.archive.org).
4    *
5    * Heritrix is free software; you can redistribute it and/or modify
6    * it under the terms of the GNU Lesser Public License as published by
7    * the Free Software Foundation; either version 2.1 of the License, or
8    * any later version.
9    *
10   * Heritrix is distributed in the hope that it will be useful,
11   * but WITHOUT ANY WARRANTY; without even the implied warranty of
12   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13   * GNU Lesser Public License for more details.
14   *
15   * You should have received a copy of the GNU Lesser Public License
16   * along with Heritrix; if not, write to the Free Software
17   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
18   *
19   * Processor.java
20   * Created on Apr 16, 2003
21   *
22   * $Header$
23   */
24  package org.archive.crawler.framework;
25  
26  import java.lang.reflect.Constructor;
27  import java.util.logging.Logger;
28  
29  import javax.management.AttributeNotFoundException;
30  
31  import org.archive.crawler.datamodel.CrawlURI;
32  import org.archive.crawler.deciderules.DecideRule;
33  import org.archive.crawler.deciderules.DecideRuleSequence;
34  import org.archive.crawler.settings.ModuleType;
35  import org.archive.crawler.settings.SimpleType;
36  
37  /***
38   * Base class for URI processing classes.
39   *
40   * <p> Each URI is processed by a user defined series of processors. This class
41   * provides the basic infrastructure for these but does not actually do
42   * anything. New processors can be easily created by subclassing this class.
43   *
44   * <p> Classes subclassing this one should not trap InterruptedExceptions.
45   * They should be allowed to propagate to the ToeThread executing the processor.
46   * Also they should immediately exit their main method (<tt>innerProcess()</tt>)
47   * if the <tt>interrupted</tt> flag is set.
48   *
49   * @author Gordon Mohr
50   *
51   * @see org.archive.crawler.framework.ToeThread
52   */
53  public class Processor extends ModuleType {
54  
55      private static final long serialVersionUID = 6248563827413710226L;
56  
57      /***
58       * Key to use asking settings for decide-rules value.
59       */
60      public static final String ATTR_DECIDE_RULES = "decide-rules";
61      /*** local name for decide-rules */
62      protected String attrDecideRules; 
63  
64      /***
65       * Key to use asking settings for enabled value.
66       */
67      public final static String ATTR_ENABLED = "enabled";
68  
69      private Processor defaultNextProcessor = null;
70  
71      private static Logger logger =
72          Logger.getLogger("org.archive.crawler.framework.Processor");
73  
74      /***
75       * @param name
76       * @param description
77       */
78      public Processor(String name, String description) {
79          super(name, description);
80          addElementToDefinition(new SimpleType(ATTR_ENABLED,
81              "Is processor enabled", new Boolean(true)));
82          attrDecideRules = getName()+"#"+ATTR_DECIDE_RULES;
83          addElementToDefinition(
84              new DecideRuleSequence(attrDecideRules,
85                  "DecideRules which, if their final decision is REJECT, " +
86                  "prevent this Processor from running."));
87      }
88  
89      /***
90       * Perform processing on the given CrawlURI.
91       *
92       * @param curi
93       * @throws InterruptedException
94       */
95      public final void process(CrawlURI curi) throws InterruptedException {
96          // by default, arrange for curi to proceed to next processor
97          curi.setNextProcessor(getDefaultNextProcessor(curi));
98  
99          // Check if this processor is enabled before processing
100         try {
101             if (!((Boolean) getAttribute(ATTR_ENABLED, curi)).booleanValue()) {
102                 return;
103             }
104         } catch (AttributeNotFoundException e) {
105             logger.severe(e.getMessage());
106         }
107 
108         if(rulesAccept(curi)) {
109             innerProcess(curi);
110         } else {
111             innerRejectProcess(curi);
112         }
113     }
114 
115     protected void checkForInterrupt() throws InterruptedException {
116         if (Thread.interrupted()) {
117             throw new InterruptedException("interrupted");
118         }
119     }
120 
121     /***
122      * @param curi CrawlURI instance.
123      * @throws InterruptedException
124      */
125     protected void innerRejectProcess(CrawlURI curi)
126     throws InterruptedException {
127         // by default do nothing
128     }
129 
130     /***
131      * Classes subclassing this one should override this method to perform
132      * their custom actions on the CrawlURI.
133      *
134      * @param curi The CrawlURI being processed.
135      * @throws InterruptedException
136      */
137     protected void innerProcess(CrawlURI curi)
138     throws InterruptedException {
139         // by default do nothing
140     }
141 
142     /***
143      * Classes subclassing this one should override this method to perform
144      * processor specific actions.
145      * <p>
146      *
147      * This method is garanteed to be called after the crawl is set up, but
148      * before any URI-processing has occured.
149      */
150     protected void initialTasks () {
151         // by default do nothing
152     }
153 
154     /***
155      * Classes subclassing this one should override this method to perform
156      * processor specific actions.
157      *
158      */
159     protected void finalTasks () {
160         // by default do nothing
161     }
162 
163     protected DecideRule getDecideRule(Object o) {
164         try {
165             return (DecideRule)getAttribute(o, attrDecideRules);
166         } catch (AttributeNotFoundException e) {
167             throw new RuntimeException(e);
168         }
169     }
170 
171     protected boolean rulesAccept(Object o) {
172         return rulesAccept(getDecideRule(o),o);
173     }
174 
175     protected boolean rulesAccept(DecideRule rule, Object o) {
176         return rule.decisionFor(o) != DecideRule.REJECT;
177     }
178     /***
179      * Returns the next processor for the given CrawlURI in the processor chain.
180      * @param curi The CrawlURI that we want to find the next processor for.
181      * @return The next processor for the given CrawlURI in the processor chain.
182      */
183     public Processor getDefaultNextProcessor(CrawlURI curi) {
184         return defaultNextProcessor;
185     }
186 
187     /*** Set the default next processor in the chain.
188      *
189      * @param nextProcessor the default next processor in the chain.
190      */
191     public void setDefaultNextProcessor(Processor nextProcessor) {
192         defaultNextProcessor = nextProcessor;
193     }
194 
195     /*** 
196      * Get the controller object.
197      *
198      * @return the controller object.
199      */
200     public CrawlController getController() {
201         return getSettingsHandler().getOrder().getController();
202     }
203 
204     public Processor spawn(int serialNum) {
205         Processor newInst = null;
206         try {
207             Constructor co =
208                 getClass().getConstructor(new Class[] { String.class });
209             newInst =
210                 (Processor) co.newInstance(new Object[] {
211                     getName() + serialNum
212                     });
213             getParent().setAttribute(newInst);
214             newInst.setTransient(true);
215         } catch (Exception e) {
216             // TODO Auto-generated catch block
217             e.printStackTrace();
218         }
219         return newInst;
220     }
221 
222     /***
223      * Compiles and returns a report (in human readable form) about the status
224      * of the processor.  The processor's name (of implementing class) should
225      * always be included.
226      * <p>
227      * Examples of stats declared would include:<br>
228      * * Number of CrawlURIs handled.<br>
229      * * Number of links extracted (for link extractors)<br>
230      * etc.
231      *
232      * @return A human readable report on the processor's state.
233      */
234     public String report(){
235         return ""; // Default behavior.
236     }
237     
238     /***
239      * @param curi CrawlURI to examine.
240      * @return True if content to process -- content length is > 0 
241      */
242     protected boolean isContentToProcess(CrawlURI curi) {
243         return curi.getContentLength() > 0;
244     }
245     
246     /***
247      * @param curi CrawlURI to examine.
248      * @return True if {@link #isContentToProcess(CrawlURI)} and
249      * the CrawlURI represents a successful http transaction.
250      */
251     protected boolean isHttpTransactionContentToProcess(CrawlURI curi) {
252         return isContentToProcess(curi) &&
253             curi.isHttpTransaction() &&
254             curi.isSuccess();
255     }
256     
257     /***
258      * @param contentType Found content type.
259      * @param expectedPrefix String to find at start of contenttype: e.g.
260      * <code>text/html</code>.
261      * @return True if passed content-type begins with
262      * expected mimetype.
263      */
264     protected boolean isExpectedMimeType(String contentType,
265             String expectedPrefix) {
266         return contentType != null &&
267             contentType.toLowerCase().startsWith(expectedPrefix);
268     }
269 
270     public void kickUpdate() {
271         // by default do nothing
272     }
273     
274     public boolean isEnabled() {
275         return ((Boolean)getUncheckedAttribute(null, ATTR_ENABLED)).booleanValue();
276     }
277 }