1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24 package org.archive.crawler.postprocessor;
25
26 import java.io.File;
27 import java.lang.reflect.Constructor;
28 import java.util.Iterator;
29 import java.util.logging.FileHandler;
30 import java.util.logging.Formatter;
31 import java.util.logging.Level;
32 import java.util.logging.Logger;
33
34 import javax.management.AttributeNotFoundException;
35
36 import org.apache.commons.httpclient.URIException;
37 import org.archive.crawler.Heritrix;
38 import org.archive.crawler.datamodel.CandidateURI;
39 import org.archive.crawler.datamodel.CoreAttributeConstants;
40 import org.archive.crawler.datamodel.CrawlURI;
41 import org.archive.crawler.datamodel.FetchStatusCodes;
42 import org.archive.crawler.datamodel.UURI;
43 import org.archive.crawler.datamodel.UURIFactory;
44 import org.archive.crawler.extractor.Link;
45 import org.archive.crawler.framework.Filter;
46 import org.archive.crawler.framework.Processor;
47 import org.archive.crawler.settings.MapType;
48 import org.archive.crawler.settings.SimpleType;
49 import org.archive.crawler.settings.Type;
50
51 /***
52 * Determine which extracted links etc get fed back into Frontier.
53 *
54 * Could in the future also control whether current URI is retried.
55 *
56 * @author gojomo
57 */
58 public class Postselector extends Processor
59 implements CoreAttributeConstants, FetchStatusCodes {
60
61 private static Logger logger =
62 Logger.getLogger(Postselector.class.getName());
63
64 private final static Boolean DEFAULT_SEED_REDIRECTS_NEW_SEEDS =
65 new Boolean(true);
66 private final static String ATTR_SEED_REDIRECTS_NEW_SEEDS =
67 "seed-redirects-new-seed";
68
69 public static final String ATTR_LOG_REJECTS_ENABLED = "override-logger";
70
71 public static final String ATTR_LOG_REJECT_FILTERS =
72 "scope-rejected-uri-log-filters";
73
74 public static final String ATTR_SCHEDULE_EMBEDDED_LINKS =
75 "schedule-embedded-links";
76 private final static Boolean DEFAULT_SCHEDULE_EMBEDDED_LINKS =
77 new Boolean(true);
78
79 /***
80 * Instance of rejected uris log filters.
81 */
82 private MapType rejectLogFilters = null;
83
84 /***
85 * @param name Name of this filter.
86 */
87 public Postselector(String name) {
88 super(name, "Post selector. Determines which extracted links and " +
89 "other related information gets fed back to the Frontier.");
90 Type t;
91 t = addElementToDefinition(
92 new SimpleType(ATTR_SEED_REDIRECTS_NEW_SEEDS,
93 "If enabled, any URL found because a seed redirected to it " +
94 "(seed returned 301 or 302) will be treated as a seed.",
95 DEFAULT_SEED_REDIRECTS_NEW_SEEDS));
96 t.setExpertSetting(true);
97
98 t = addElementToDefinition(new SimpleType(ATTR_LOG_REJECTS_ENABLED,
99 "If enabled, all logging goes to a file named for this class in" +
100 " the job log" +
101 " directory. Set the logging level in heritrix.properites." +
102 " Logging at level INFO will log URIs rejected by scope.",
103 new Boolean(true)));
104 t.setExpertSetting(true);
105 this.rejectLogFilters = (MapType)addElementToDefinition(
106 new MapType(ATTR_LOG_REJECT_FILTERS, "Filters applied after" +
107 " an URI has been rejected. If any filter returns" +
108 " TRUE, the URI is logged if the logging level is INFO.",
109 Filter.class));
110 this.rejectLogFilters.setExpertSetting(true);
111
112 t = addElementToDefinition(new SimpleType(ATTR_SCHEDULE_EMBEDDED_LINKS,
113 "If enabled, embeded links (images etc.) are scheduled for " +
114 "crawling.", DEFAULT_SCHEDULE_EMBEDDED_LINKS));
115 t.setExpertSetting(true);
116 }
117
118 protected void initialTasks() {
119 super.initialTasks();
120
121
122 if (isOverrideEnabled(null)) {
123 int limit = Heritrix.getIntProperty(
124 "java.util.logging.FileHandler.limit",
125 1024 * 1024 * 1024 * 1024);
126 int count = Heritrix.getIntProperty(
127 "java.util.logging.FileHandler.count", 1);
128 try {
129 File logsDir = getController().getLogsDir();
130 String tmp = Heritrix.
131 getProperty("java.util.logging.FileHandler.pattern");
132 File logFile = new File(logsDir,
133 this.getClass().getName() +
134 ((tmp != null && tmp.length() > 0)? tmp: ".log"));
135 FileHandler fh = new FileHandler(logFile.getAbsolutePath(),
136 limit, count, true);
137
138 tmp = Heritrix.
139 getProperty("java.util.logging.FileHandler.formatter");
140 if (tmp != null && tmp.length() > 0) {
141 Constructor co = Class.forName(tmp).
142 getConstructor(new Class [] {});
143 Formatter f = (Formatter)co.
144 newInstance(new Object [] {});
145 fh.setFormatter(f);
146 }
147 logger.addHandler(fh);
148 logger.setUseParentHandlers(false);
149 } catch (Exception e) {
150 logger.severe("Failed customization of logger: " +
151 e.getMessage());
152 }
153 }
154 }
155
156 protected void innerProcess(final CrawlURI curi) {
157 if (logger.isLoggable(Level.FINEST)) {
158 logger.finest(getName() + " processing " + curi);
159 }
160
161
162 if (curi.containsKey(A_PREREQUISITE_URI)) {
163 handlePrerequisites(curi);
164 return;
165 }
166
167 if (curi.getFetchStatus() < 200 || curi.getFetchStatus() >= 400) {
168
169 return;
170 }
171
172 final boolean scheduleEmbeds = ((Boolean)getUncheckedAttribute(curi,
173 ATTR_SCHEDULE_EMBEDDED_LINKS)).booleanValue();
174 final boolean redirectsNewSeeds = ((Boolean)getUncheckedAttribute(curi,
175 ATTR_SEED_REDIRECTS_NEW_SEEDS)).booleanValue();
176
177 for (final Iterator iter = curi.getOutLinks().iterator();
178 iter.hasNext();) {
179 final Link wref = (Link)iter.next();
180 try {
181 final int directive = getSchedulingFor(wref, scheduleEmbeds);
182 if(directive != CandidateURI.DONT_SCHEDULE) {
183 final CandidateURI caURI = createCandidateURI(curi, wref);
184 caURI.setSchedulingDirective(directive);
185 caURI.setIsSeed(considerAsSeed(curi, wref,
186 redirectsNewSeeds));
187 schedule(caURI);
188 }
189 } catch (URIException e) {
190 getController().logUriError(e,curi.getUURI(),wref.getDestination().toString());
191 }
192 }
193 }
194
195 private boolean considerAsSeed(final CrawlURI curi, final Link wref, final boolean redirectsNewSeeds) {
196
197 if (curi.isSeed()
198 && (curi.getFetchStatus() == 301 || curi.getFetchStatus() == 302)
199 && wref.getHopType() == Link.REFER_HOP) {
200
201
202 if (redirectsNewSeeds) {
203 return true;
204 }
205 }
206 return false;
207 }
208
209 private int getSchedulingFor(final Link wref,
210 final boolean scheduleEmbeds) {
211 final char c = wref.getHopType();
212 switch (c) {
213 case Link.REFER_HOP:
214
215 return CandidateURI.MEDIUM;
216 case Link.EMBED_HOP:
217 if(!scheduleEmbeds) {
218 return CandidateURI.DONT_SCHEDULE;
219 }
220 default:
221
222 return CandidateURI.NORMAL;
223 }
224 }
225
226 protected void handlePrerequisites(CrawlURI curi) {
227 try {
228
229 CandidateURI caUri = createCandidateURI(curi,
230 curi.getPrerequisiteUri());
231 int prereqPriority = curi.getSchedulingDirective() - 1;
232 if (prereqPriority < 0) {
233 prereqPriority = 0;
234 logger.severe("unable to promote prerequisite " + caUri +
235 " above " + curi);
236 }
237 caUri.setSchedulingDirective(curi.getSchedulingDirective() - 1);
238 caUri.setForceFetch(true);
239 if (!schedule(caUri)) {
240
241
242 curi.setFetchStatus(S_PREREQUISITE_UNSCHEDULABLE_FAILURE);
243 return;
244 }
245
246 } catch (URIException ex) {
247 Object[] array = {curi, curi.getPrerequisiteUri()};
248 getController().uriErrors.log(Level.INFO,ex.getMessage(), array);
249 } catch (NumberFormatException e) {
250
251 Object[] array = {curi, curi.getPrerequisiteUri()};
252 getController().uriErrors.log(Level.INFO,e.getMessage(), array);
253 }
254 }
255
256 /***
257 * Schedule the given {@link CandidateURI CandidateURI} with the Frontier.
258 * @param caUri The CandidateURI to be scheduled.
259 * @return true if CandidateURI was accepted by crawl scope, false
260 * otherwise.
261 */
262 protected boolean schedule(CandidateURI caUri) {
263 if(getController().getScope().accepts(caUri)) {
264 if (logger.isLoggable(Level.FINER)) {
265 logger.finer("Accepted: " + caUri);
266 }
267 getController().getFrontier().schedule(caUri);
268 return true;
269 }
270
271
272
273 if (logger.isLoggable(Level.INFO)) {
274 CrawlURI curi = (caUri instanceof CrawlURI)?
275 (CrawlURI)caUri: new CrawlURI(caUri.getUURI());
276 if (filtersAccept(this.rejectLogFilters, curi)) {
277 logger.info("Rejected " + curi.getUURI().toString());
278 }
279 }
280 return false;
281 }
282
283
284 public boolean isOverrideEnabled(Object context) {
285 boolean result = true;
286 try {
287 Boolean b = (Boolean)getAttribute(context,
288 ATTR_LOG_REJECTS_ENABLED);
289 if (b != null) {
290 result = b.booleanValue();
291 }
292 } catch (AttributeNotFoundException e) {
293 logger.warning("Failed get of 'enabled' attribute.");
294 }
295
296 return result;
297 }
298
299 protected CandidateURI createCandidateURI(CrawlURI curi, Link link)
300 throws URIException {
301 UURI uuri;
302 if (link.getDestination() instanceof UURI) {
303 uuri = (UURI) link.getDestination();
304 } else {
305 uuri = UURIFactory.getInstance(curi.getBaseURI(), link
306 .getDestination().toString());
307 }
308 CandidateURI caURI = new CandidateURI(uuri, curi.getPathFromSeed()
309 + link.getHopType(), curi.getUURI(), link.getContext());
310 return caURI;
311 }
312 }