View Javadoc

1   /* Extractor
2   *
3   * $Id: Extractor.java 6977 2010-10-26 23:46:15Z nlevitt $
4   *
5   * Created on Sep 22, 2005
6   *
7   * Copyright (C) 2005 Internet Archive.
8   *
9   * This file is part of the Heritrix web crawler (crawler.archive.org).
10  *
11  * Heritrix is free software; you can redistribute it and/or modify
12  * it under the terms of the GNU Lesser Public License as published by
13  * the Free Software Foundation; either version 2.1 of the License, or
14  * any later version.
15  *
16  * Heritrix is distributed in the hope that it will be useful,
17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19  * GNU Lesser Public License for more details.
20  *
21  * You should have received a copy of the GNU Lesser Public License
22  * along with Heritrix; if not, write to the Free Software
23  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
24  */ 
25  package org.archive.crawler.extractor;
26  
27  import java.util.logging.Level;
28  import java.util.logging.Logger;
29  
30  import org.archive.crawler.datamodel.CrawlOrder;
31  import org.archive.crawler.datamodel.CrawlURI;
32  import org.archive.crawler.framework.Processor;
33  
34  /***
35   * Convenience shared superclass for Extractor Processors.
36   * 
37   * Currently only wraps Extractor-specific extract() action with
38   * a StackOverflowError catch/log/proceed handler, so that any
39   * extractors that recurse too deep on problematic input will
40   * only suffer a local error, and other normal CrawlURI processing
41   * can continue. See:
42   *  [ 1122836 ] Localize StackOverflowError in Extractors
43   *  http://sourceforge.net/tracker/index.php?func=detail&aid=1122836&group_id=73833&atid=539099
44   * 
45   * This class could also become home to common utility features
46   * of extractors, like a running tally of the URIs examined/discovered,
47   * etc.
48   * 
49   * @author gojomo
50   */
51  public abstract class Extractor extends Processor {
52      private static final Logger logger = Logger
53          .getLogger(Extractor.class.getName());
54  
55      /***
56       * Passthrough constructor.
57       * 
58       * @param name
59       * @param description
60       */
61      public Extractor(String name, String description) {
62          super(name, description);
63          // TODO Auto-generated constructor stub
64      }
65  
66      public void innerProcess(CrawlURI curi) {
67          try {
68              extract(curi);
69          } catch (NullPointerException npe) {
70              // both annotate (to highlight in crawl log) & add as local-error
71              curi.addAnnotation("err=" + npe.getClass().getName());
72              curi.addLocalizedError(getName(), npe, "");
73              // also log as warning
74              logger.log(Level.WARNING, getName() + ": NullPointerException",
75                  npe);
76          } catch (StackOverflowError soe) {
77              // both annotate (to highlight in crawl log) & add as local-error
78              curi.addAnnotation("err=" + soe.getClass().getName());
79              curi.addLocalizedError(getName(), soe, "");
80              // also log as warning
81              logger.log(Level.WARNING, getName() + ": StackOverflowError", soe);
82          } catch (java.nio.charset.CoderMalfunctionError cme) {
83              // See http://sourceforge.net/tracker/index.php?func=detail&aid=1540222&group_id=73833&atid=539099
84              // Both annotate (to highlight in crawl log) & add as local-error
85              curi.addAnnotation("err=" + cme.getClass().getName());
86              curi.addLocalizedError(getName(), cme, ""); // <-- Message field ignored when logging.
87              logger.log(Level.WARNING, getName() + ": CoderMalfunctionError",
88                  cme);
89          }
90      }
91  
92      protected boolean isIndependentExtractors() {
93          try {
94              return ((Boolean) getController().getOrder().getAttribute(
95                      CrawlOrder.ATTR_INDEPENDENT_EXTRACTORS)).booleanValue();
96          } catch (Exception e) {
97              return false;
98          }
99      }
100 
101     /***
102      * @return true if the setting
103      *         {@link CrawlOrder#ATTR_INDEPENDENT_EXTRACTORS} is disabled or
104      *         {@link CrawlURI#hasBeenLinkExtracted()} is false, and
105      *         {@link Processor#isHttpTransactionContentToProcess(CrawlURI)} is
106      *         true.
107      */
108     @Override
109     protected boolean isHttpTransactionContentToProcess(CrawlURI curi) {
110         return (isIndependentExtractors() || !curi.hasBeenLinkExtracted())
111                 && super.isHttpTransactionContentToProcess(curi);
112     }
113 
114     protected abstract void extract(CrawlURI curi);
115 }