1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26 package org.archive.crawler.postprocessor;
27
28
29 import java.util.logging.Level;
30 import java.util.logging.Logger;
31
32 import org.archive.crawler.datamodel.CandidateURI;
33 import org.archive.crawler.datamodel.CrawlURI;
34 import org.archive.crawler.datamodel.FetchStatusCodes;
35 import org.archive.crawler.framework.Processor;
36
37 /***
38 * 'Schedule' with the Frontier CandidateURIs being carried by the passed
39 * CrawlURI.
40 * Adds either prerequisites or whatever is in CrawlURI outlinks to the
41 * Frontier. Run a Scoper ahead of this processor so only links that
42 * are in-scope get scheduled.
43 * @author stack
44 */
45 public class FrontierScheduler extends Processor
46 implements FetchStatusCodes {
47
48 private static final long serialVersionUID = -5178775477602250542L;
49
50 private static Logger LOGGER =
51 Logger.getLogger(FrontierScheduler.class.getName());
52
53 /***
54 * @param name Name of this filter.
55 */
56 public FrontierScheduler(String name) {
57 super(name, "FrontierScheduler. 'Schedule' with the Frontier " +
58 "any CandidateURIs carried by the passed CrawlURI. " +
59 "Run a Scoper before this " +
60 "processor so links that are not in-scope get bumped from the " +
61 "list of links (And so those in scope get promoted from Link " +
62 "to CandidateURI).");
63 }
64
65 protected void innerProcess(final CrawlURI curi) {
66 if (LOGGER.isLoggable(Level.FINEST)) {
67 LOGGER.finest(getName() + " processing " + curi);
68 }
69
70
71 if (curi.hasPrerequisiteUri() && curi.getFetchStatus() == S_DEFERRED) {
72 handlePrerequisites(curi);
73 return;
74 }
75
76 synchronized(this) {
77 for (CandidateURI cauri: curi.getOutCandidates()) {
78 schedule(cauri);
79 }
80 }
81 }
82
83 protected void handlePrerequisites(CrawlURI curi) {
84 schedule((CandidateURI)curi.getPrerequisiteUri());
85 }
86
87 /***
88 * Schedule the given {@link CandidateURI CandidateURI} with the Frontier.
89 * @param caUri The CandidateURI to be scheduled.
90 */
91 protected void schedule(CandidateURI caUri) {
92 getController().getFrontier().schedule(caUri);
93 }
94 }