1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25 package org.archive.crawler.extractor;
26
27 import java.util.logging.Level;
28 import java.util.logging.Logger;
29
30 import org.archive.crawler.datamodel.CrawlOrder;
31 import org.archive.crawler.datamodel.CrawlURI;
32 import org.archive.crawler.framework.Processor;
33
34 /***
35 * Convenience shared superclass for Extractor Processors.
36 *
37 * Currently only wraps Extractor-specific extract() action with
38 * a StackOverflowError catch/log/proceed handler, so that any
39 * extractors that recurse too deep on problematic input will
40 * only suffer a local error, and other normal CrawlURI processing
41 * can continue. See:
42 * [ 1122836 ] Localize StackOverflowError in Extractors
43 * http://sourceforge.net/tracker/index.php?func=detail&aid=1122836&group_id=73833&atid=539099
44 *
45 * This class could also become home to common utility features
46 * of extractors, like a running tally of the URIs examined/discovered,
47 * etc.
48 *
49 * @author gojomo
50 */
51 public abstract class Extractor extends Processor {
52 private static final Logger logger = Logger
53 .getLogger(Extractor.class.getName());
54
55 /***
56 * Passthrough constructor.
57 *
58 * @param name
59 * @param description
60 */
61 public Extractor(String name, String description) {
62 super(name, description);
63
64 }
65
66 public void innerProcess(CrawlURI curi) {
67 try {
68 extract(curi);
69 } catch (NullPointerException npe) {
70
71 curi.addAnnotation("err=" + npe.getClass().getName());
72 curi.addLocalizedError(getName(), npe, "");
73
74 logger.log(Level.WARNING, getName() + ": NullPointerException",
75 npe);
76 } catch (StackOverflowError soe) {
77
78 curi.addAnnotation("err=" + soe.getClass().getName());
79 curi.addLocalizedError(getName(), soe, "");
80
81 logger.log(Level.WARNING, getName() + ": StackOverflowError", soe);
82 } catch (java.nio.charset.CoderMalfunctionError cme) {
83
84
85 curi.addAnnotation("err=" + cme.getClass().getName());
86 curi.addLocalizedError(getName(), cme, "");
87 logger.log(Level.WARNING, getName() + ": CoderMalfunctionError",
88 cme);
89 }
90 }
91
92 protected boolean isIndependentExtractors() {
93 try {
94 return ((Boolean) getController().getOrder().getAttribute(
95 CrawlOrder.ATTR_INDEPENDENT_EXTRACTORS)).booleanValue();
96 } catch (Exception e) {
97 return false;
98 }
99 }
100
101 /***
102 * @return true if the setting
103 * {@link CrawlOrder#ATTR_INDEPENDENT_EXTRACTORS} is disabled or
104 * {@link CrawlURI#hasBeenLinkExtracted()} is false, and
105 * {@link Processor#isHttpTransactionContentToProcess(CrawlURI)} is
106 * true.
107 */
108 @Override
109 protected boolean isHttpTransactionContentToProcess(CrawlURI curi) {
110 return (isIndependentExtractors() || !curi.hasBeenLinkExtracted())
111 && super.isHttpTransactionContentToProcess(curi);
112 }
113
114 protected abstract void extract(CrawlURI curi);
115 }