1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26 package org.archive.crawler.postprocessor;
27
28 import java.util.ArrayList;
29 import java.util.Collection;
30 import java.util.Iterator;
31 import java.util.logging.Level;
32 import java.util.logging.Logger;
33
34 import javax.management.AttributeNotFoundException;
35
36 import org.apache.commons.httpclient.URIException;
37 import org.archive.crawler.datamodel.CandidateURI;
38 import org.archive.crawler.datamodel.CrawlURI;
39 import org.archive.crawler.datamodel.FetchStatusCodes;
40 import org.archive.crawler.deciderules.DecideRule;
41 import org.archive.crawler.deciderules.DecideRuleSequence;
42 import org.archive.crawler.extractor.Link;
43 import org.archive.crawler.framework.Scoper;
44 import org.archive.crawler.settings.SimpleType;
45 import org.archive.crawler.settings.Type;
46
/**
 * Determine which extracted links are within scope.
 * TODO: Testing scope currently requires that a Link be converted to
 * a CandidateURI. Make it so that we don't have to create a CandidateURI
 * to test whether a Link is in scope.
 * <p>Since this scoper has to create CandidateURIs, there is no sense in
 * discarding them: later in the processing chain, CandidateURIs rather
 * than Links are what is needed when scheduling extracted links with the
 * Frontier (Frontier#schedule expects a CandidateURI, not a Link). This
 * class therefore replaces each Link held by the CrawlURI with the
 * CandidateURI that wraps that Link.
 *
 * @author gojomo
 * @author stack
 */
61 public class LinksScoper extends Scoper
62 implements FetchStatusCodes {
63
64 private static final long serialVersionUID = -4074442117992496793L;
65
66 private static Logger LOGGER =
67 Logger.getLogger(LinksScoper.class.getName());
68
69 private final static String ATTR_SEED_REDIRECTS_NEW_SEEDS =
70 "seed-redirects-new-seed";
71
72 private final static Boolean DEFAULT_SEED_REDIRECTS_NEW_SEEDS =
73 new Boolean(true);
74
75 public static final String ATTR_REJECTLOG_DECIDE_RULES =
76 "scope-rejected-url-rules";
77
78 public static final String ATTR_PREFERENCE_DEPTH_HOPS =
79 "preference-depth-hops";
80
81 private final static Integer DEFAULT_PREFERENCE_DEPTH_HOPS =
82 new Integer(-1);
83
84 /***
85 * @param name Name of this filter.
86 */
87 public LinksScoper(String name) {
88 super(name, "LinksScoper. Rules on which extracted links " +
89 "are within configured scope.");
90
91 Type t;
92 t = addElementToDefinition(
93 new SimpleType(ATTR_SEED_REDIRECTS_NEW_SEEDS,
94 "If enabled, any URL found because a seed redirected to it " +
95 "(original seed returned 301 or 302), will also be treated " +
96 "as a seed.", DEFAULT_SEED_REDIRECTS_NEW_SEEDS));
97 t.setExpertSetting(true);
98
99 t = addElementToDefinition(new SimpleType(ATTR_PREFERENCE_DEPTH_HOPS,
100 "Number of hops (of any sort) from a seed up to which a URI has higher " +
101 "priority scheduling than any remaining seed. For example, if set to 1 items one " +
102 "hop (link, embed, redirect, etc.) away from a seed will be scheduled " +
103 "with HIGH priority. If set to -1, no " +
104 "preferencing will occur, and a breadth-first search with seeds " +
105 "processed before discovered links will proceed. If set to zero, a " +
106 "purely depth-first search will proceed, with all discovered links processed " +
107 "before remaining seeds. Seed redirects are treated as one hop from a seed.",
108 DEFAULT_PREFERENCE_DEPTH_HOPS));
109 t.setExpertSetting(true);
110
111 addElementToDefinition(
112 new DecideRuleSequence(ATTR_REJECTLOG_DECIDE_RULES,
113 "DecideRules which, if their final decision on a link is " +
114 "not REJECT, cause the otherwise scope-rejected links to " +
115 "be logged"));
116
117 }
118
119 protected void innerProcess(final CrawlURI curi) {
120 if (LOGGER.isLoggable(Level.FINEST)) {
121 LOGGER.finest(getName() + " processing " + curi);
122 }
123
124
125 if (curi.hasPrerequisiteUri()) {
126 handlePrerequisite(curi);
127 return;
128 }
129
130
131 if (curi.getFetchStatus() < 200 || curi.getFetchStatus() >= 400) {
132 curi.clearOutlinks();
133 return;
134 }
135
136 if (curi.outlinksSize() <= 0) {
137
138 return;
139 }
140
141 final boolean redirectsNewSeeds = ((Boolean)getUncheckedAttribute(curi,
142 ATTR_SEED_REDIRECTS_NEW_SEEDS)).booleanValue();
143 int preferenceDepthHops = ((Integer)getUncheckedAttribute(curi,
144 ATTR_PREFERENCE_DEPTH_HOPS)).intValue();
145 Collection<CandidateURI> inScopeLinks = new ArrayList<CandidateURI>();
146 for (final Iterator i = curi.getOutObjects().iterator(); i.hasNext();) {
147 Object o = i.next();
148 if(o instanceof Link){
149 final Link wref = (Link)o;
150 try {
151 final int directive = getSchedulingFor(curi, wref,
152 preferenceDepthHops);
153 final CandidateURI caURI =
154 curi.createCandidateURI(curi.getBaseURI(), wref,
155 directive,
156 considerAsSeed(curi, wref, redirectsNewSeeds));
157 if (isInScope(caURI)) {
158 inScopeLinks.add(caURI);
159 }
160 } catch (URIException e) {
161 getController().logUriError(e, curi.getUURI(),
162 wref.getDestination().toString());
163 }
164 } else if(o instanceof CandidateURI){
165 CandidateURI caURI = (CandidateURI)o;
166 if(isInScope(caURI)){
167 inScopeLinks.add(caURI);
168 }
169 } else {
170 LOGGER.severe("Unexpected type: " + o);
171 }
172 }
173
174
175 curi.replaceOutlinks(inScopeLinks);
176 }
177
178 /***
179 * The CrawlURI has a prerequisite; apply scoping and update
180 * Link to CandidateURI in manner analogous to outlink handling.
181 * @param curi CrawlURI with prereq to consider
182 */
183 protected void handlePrerequisite(CrawlURI curi) {
184 try {
185
186 CandidateURI caUri =
187 curi.createCandidateURI(curi.getBaseURI(),
188 (Link) curi.getPrerequisiteUri());
189 int prereqPriority = curi.getSchedulingDirective() - 1;
190 if (prereqPriority < 0) {
191 prereqPriority = 0;
192 LOGGER.severe("Unable to promote prerequisite " + caUri +
193 " above " + curi);
194 }
195 caUri.setSchedulingDirective(prereqPriority);
196 caUri.setForceFetch(true);
197 if(isInScope(caUri)) {
198
199 curi.setPrerequisiteUri(caUri);
200 } else {
201
202
203 curi.setFetchStatus(S_PREREQUISITE_UNSCHEDULABLE_FAILURE);
204 }
205 } catch (URIException ex) {
206 Object[] array = {curi, curi.getPrerequisiteUri()};
207 getController().uriErrors.log(Level.INFO,ex.getMessage(), array);
208 } catch (NumberFormatException e) {
209
210 Object[] array = {curi, curi.getPrerequisiteUri()};
211 getController().uriErrors.log(Level.INFO,e.getMessage(), array);
212 }
213 }
214
215 protected void outOfScope(CandidateURI caUri) {
216 super.outOfScope(caUri);
217 if (!LOGGER.isLoggable(Level.INFO)) {
218 return;
219 }
220
221 CrawlURI curi = (caUri instanceof CrawlURI)?
222 (CrawlURI)caUri:
223 new CrawlURI(caUri.getUURI());
224 if (rulesAccept(getRejectLogRules(curi), curi)) {
225 LOGGER.info(curi.getUURI().toString());
226 }
227 }
228
229 protected DecideRule getRejectLogRules(Object o) {
230 try {
231 return (DecideRule)getAttribute(o, ATTR_REJECTLOG_DECIDE_RULES);
232 } catch (AttributeNotFoundException e) {
233 throw new RuntimeException(e);
234 }
235 }
236
237 private boolean considerAsSeed(final CrawlURI curi, final Link wref,
238 final boolean redirectsNewSeeds) {
239 return redirectsNewSeeds && curi.isSeed()
240 && wref.getHopType() == Link.REFER_HOP;
241 }
242
243 /***
244 * Determine scheduling for the <code>curi</code>.
245 * As with the LinksScoper in general, this only handles extracted links,
246 * seeds do not pass through here, but are given MEDIUM priority.
247 * Imports into the frontier similarly do not pass through here,
248 * but are given NORMAL priority.
249 */
250 protected int getSchedulingFor(final CrawlURI curi, final Link wref,
251 final int preferenceDepthHops) {
252 final char c = wref.getHopType();
253 if (LOGGER.isLoggable(Level.FINEST)) {
254 LOGGER.finest(curi + " with path=" + curi.getPathFromSeed() +
255 " isSeed=" + curi.isSeed() + " with fetchStatus=" +
256 curi.getFetchStatus() + " -> " + wref.getDestination() +
257 " type " + c + " with context=" + wref.getContext());
258 }
259
260 switch (c) {
261 case Link.REFER_HOP:
262
263
264 return (preferenceDepthHops >= 0 ? CandidateURI.HIGH :
265 CandidateURI.MEDIUM);
266 default:
267 if (preferenceDepthHops == 0)
268 return CandidateURI.HIGH;
269
270
271
272
273 if (preferenceDepthHops > 0 &&
274 curi.getPathFromSeed().length() + 1 <= preferenceDepthHops)
275 return CandidateURI.HIGH;
276
277 return CandidateURI.NORMAL;
278 }
279 }
280 }