1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24 package org.archive.crawler.framework;
25
26 import java.lang.reflect.Constructor;
27 import java.util.logging.Logger;
28
29 import javax.management.AttributeNotFoundException;
30
31 import org.archive.crawler.datamodel.CrawlURI;
32 import org.archive.crawler.deciderules.DecideRule;
33 import org.archive.crawler.deciderules.DecideRuleSequence;
34 import org.archive.crawler.settings.ModuleType;
35 import org.archive.crawler.settings.SimpleType;
36
37 /***
38 * Base class for URI processing classes.
39 *
40 * <p> Each URI is processed by a user defined series of processors. This class
41 * provides the basic infrastructure for these but does not actually do
42 * anything. New processors can be easily created by subclassing this class.
43 *
44 * <p> Classes subclassing this one should not trap InterruptedExceptions.
45 * They should be allowed to propagate to the ToeThread executing the processor.
46 * Also they should immediately exit their main method (<tt>innerProcess()</tt>)
47 * if the <tt>interrupted</tt> flag is set.
48 *
49 * @author Gordon Mohr
50 *
51 * @see org.archive.crawler.framework.ToeThread
52 */
53 public class Processor extends ModuleType {
54
55 private static final long serialVersionUID = 6248563827413710226L;
56
57 /***
58 * Key to use asking settings for decide-rules value.
59 */
60 public static final String ATTR_DECIDE_RULES = "decide-rules";
61 /*** local name for decide-rules */
62 protected String attrDecideRules;
63
64 /***
65 * Key to use asking settings for enabled value.
66 */
67 public final static String ATTR_ENABLED = "enabled";
68
69 private Processor defaultNextProcessor = null;
70
71 private static Logger logger =
72 Logger.getLogger("org.archive.crawler.framework.Processor");
73
74 /***
75 * @param name
76 * @param description
77 */
78 public Processor(String name, String description) {
79 super(name, description);
80 addElementToDefinition(new SimpleType(ATTR_ENABLED,
81 "Is processor enabled", new Boolean(true)));
82 attrDecideRules = getName()+"#"+ATTR_DECIDE_RULES;
83 addElementToDefinition(
84 new DecideRuleSequence(attrDecideRules,
85 "DecideRules which, if their final decision is REJECT, " +
86 "prevent this Processor from running."));
87 }
88
89 /***
90 * Perform processing on the given CrawlURI.
91 *
92 * @param curi
93 * @throws InterruptedException
94 */
95 public final void process(CrawlURI curi) throws InterruptedException {
96
97 curi.setNextProcessor(getDefaultNextProcessor(curi));
98
99
100 try {
101 if (!((Boolean) getAttribute(ATTR_ENABLED, curi)).booleanValue()) {
102 return;
103 }
104 } catch (AttributeNotFoundException e) {
105 logger.severe(e.getMessage());
106 }
107
108 if(rulesAccept(curi)) {
109 innerProcess(curi);
110 } else {
111 innerRejectProcess(curi);
112 }
113 }
114
115 protected void checkForInterrupt() throws InterruptedException {
116 if (Thread.interrupted()) {
117 throw new InterruptedException("interrupted");
118 }
119 }
120
121 /***
122 * @param curi CrawlURI instance.
123 * @throws InterruptedException
124 */
125 protected void innerRejectProcess(CrawlURI curi)
126 throws InterruptedException {
127
128 }
129
130 /***
131 * Classes subclassing this one should override this method to perform
132 * their custom actions on the CrawlURI.
133 *
134 * @param curi The CrawlURI being processed.
135 * @throws InterruptedException
136 */
137 protected void innerProcess(CrawlURI curi)
138 throws InterruptedException {
139
140 }
141
142 /***
143 * Classes subclassing this one should override this method to perform
144 * processor specific actions.
145 * <p>
146 *
147 * This method is garanteed to be called after the crawl is set up, but
148 * before any URI-processing has occured.
149 */
150 protected void initialTasks () {
151
152 }
153
154 /***
155 * Classes subclassing this one should override this method to perform
156 * processor specific actions.
157 *
158 */
159 protected void finalTasks () {
160
161 }
162
163 protected DecideRule getDecideRule(Object o) {
164 try {
165 return (DecideRule)getAttribute(o, attrDecideRules);
166 } catch (AttributeNotFoundException e) {
167 throw new RuntimeException(e);
168 }
169 }
170
171 protected boolean rulesAccept(Object o) {
172 return rulesAccept(getDecideRule(o),o);
173 }
174
175 protected boolean rulesAccept(DecideRule rule, Object o) {
176 return rule.decisionFor(o) != DecideRule.REJECT;
177 }
178 /***
179 * Returns the next processor for the given CrawlURI in the processor chain.
180 * @param curi The CrawlURI that we want to find the next processor for.
181 * @return The next processor for the given CrawlURI in the processor chain.
182 */
183 public Processor getDefaultNextProcessor(CrawlURI curi) {
184 return defaultNextProcessor;
185 }
186
187 /*** Set the default next processor in the chain.
188 *
189 * @param nextProcessor the default next processor in the chain.
190 */
191 public void setDefaultNextProcessor(Processor nextProcessor) {
192 defaultNextProcessor = nextProcessor;
193 }
194
195 /***
196 * Get the controller object.
197 *
198 * @return the controller object.
199 */
200 public CrawlController getController() {
201 return getSettingsHandler().getOrder().getController();
202 }
203
204 public Processor spawn(int serialNum) {
205 Processor newInst = null;
206 try {
207 Constructor co =
208 getClass().getConstructor(new Class[] { String.class });
209 newInst =
210 (Processor) co.newInstance(new Object[] {
211 getName() + serialNum
212 });
213 getParent().setAttribute(newInst);
214 newInst.setTransient(true);
215 } catch (Exception e) {
216
217 e.printStackTrace();
218 }
219 return newInst;
220 }
221
222 /***
223 * Compiles and returns a report (in human readable form) about the status
224 * of the processor. The processor's name (of implementing class) should
225 * always be included.
226 * <p>
227 * Examples of stats declared would include:<br>
228 * * Number of CrawlURIs handled.<br>
229 * * Number of links extracted (for link extractors)<br>
230 * etc.
231 *
232 * @return A human readable report on the processor's state.
233 */
234 public String report(){
235 return "";
236 }
237
238 /***
239 * @param curi CrawlURI to examine.
240 * @return True if content to process -- content length is > 0
241 */
242 protected boolean isContentToProcess(CrawlURI curi) {
243 return curi.getContentLength() > 0;
244 }
245
246 /***
247 * @param curi CrawlURI to examine.
248 * @return True if {@link #isContentToProcess(CrawlURI)} and
249 * the CrawlURI represents a successful http transaction.
250 */
251 protected boolean isHttpTransactionContentToProcess(CrawlURI curi) {
252 return isContentToProcess(curi) &&
253 curi.isHttpTransaction() &&
254 curi.isSuccess();
255 }
256
257 /***
258 * @param contentType Found content type.
259 * @param expectedPrefix String to find at start of contenttype: e.g.
260 * <code>text/html</code>.
261 * @return True if passed content-type begins with
262 * expected mimetype.
263 */
264 protected boolean isExpectedMimeType(String contentType,
265 String expectedPrefix) {
266 return contentType != null &&
267 contentType.toLowerCase().startsWith(expectedPrefix);
268 }
269
270 public void kickUpdate() {
271
272 }
273
274 public boolean isEnabled() {
275 return ((Boolean)getUncheckedAttribute(null, ATTR_ENABLED)).booleanValue();
276 }
277 }