View Javadoc

1   /* LinksScoper
2    * 
3    * $Id: LinksScoper.java 6777 2010-02-22 23:41:57Z gojomo $
4    *
5    * Created on Oct 2, 2003
6    * 
7    * Copyright (C) 2003 Internet Archive.
8    *
9    * This file is part of the Heritrix web crawler (crawler.archive.org).
10   *
11   * Heritrix is free software; you can redistribute it and/or modify
12   * it under the terms of the GNU Lesser Public License as published by
13   * the Free Software Foundation; either version 2.1 of the License, or
14   * any later version.
15   *
16   * Heritrix is distributed in the hope that it will be useful,
17   * but WITHOUT ANY WARRANTY; without even the implied warranty of
18   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19   * GNU Lesser Public License for more details.
20   *
21   * You should have received a copy of the GNU Lesser Public License
22   * along with Heritrix; if not, write to the Free Software
23   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
24   *
25   */
26  package org.archive.crawler.postprocessor;
27  
28  import java.util.ArrayList;
29  import java.util.Collection;
30  import java.util.Iterator;
31  import java.util.logging.Level;
32  import java.util.logging.Logger;
33  
34  import javax.management.AttributeNotFoundException;
35  
36  import org.apache.commons.httpclient.URIException;
37  import org.archive.crawler.datamodel.CandidateURI;
38  import org.archive.crawler.datamodel.CrawlURI;
39  import org.archive.crawler.datamodel.FetchStatusCodes;
40  import org.archive.crawler.deciderules.DecideRule;
41  import org.archive.crawler.deciderules.DecideRuleSequence;
42  import org.archive.crawler.extractor.Link;
43  import org.archive.crawler.framework.Scoper;
44  import org.archive.crawler.settings.SimpleType;
45  import org.archive.crawler.settings.Type;
46  
47  /***
48   * Determine which extracted links are within scope.
49   * TODO: To test scope, requires that Link be converted to
50   * a CandidateURI.  Make it so don't have to make a CandidateURI to test
51   * if Link is in scope.
52   * <p>Since this scoper has to create CandidateURIs, no sense
53   * discarding them since later in the processing chain CandidateURIs rather
54   * than Links are whats needed scheduling extracted links w/ the
55   * Frontier (Frontier#schedule expects CandidateURI, not Link).  This class
56   * replaces Links w/ the CandidateURI that wraps the Link in the CrawlURI.
57   *
58   * @author gojomo
59   * @author stack
60   */
61  public class LinksScoper extends Scoper
62  implements FetchStatusCodes {
63  
64      private static final long serialVersionUID = -4074442117992496793L;
65  
66      private static Logger LOGGER =
67          Logger.getLogger(LinksScoper.class.getName());
68  
69      private final static String ATTR_SEED_REDIRECTS_NEW_SEEDS =
70          "seed-redirects-new-seed";
71      
72      private final static Boolean DEFAULT_SEED_REDIRECTS_NEW_SEEDS =
73          new Boolean(true);
74      
75      public static final String ATTR_REJECTLOG_DECIDE_RULES =
76          "scope-rejected-url-rules";
77      
78      public static final String ATTR_PREFERENCE_DEPTH_HOPS =
79          "preference-depth-hops";
80  
81      private final static Integer DEFAULT_PREFERENCE_DEPTH_HOPS =
82          new Integer(-1);
83      
84      /***
85       * @param name Name of this filter.
86       */
87      public LinksScoper(String name) {
88          super(name, "LinksScoper. Rules on which extracted links " +
89              "are within configured scope.");
90          
91          Type t;
92          t = addElementToDefinition(
93              new SimpleType(ATTR_SEED_REDIRECTS_NEW_SEEDS,
94              "If enabled, any URL found because a seed redirected to it " +
95              "(original seed returned 301 or 302), will also be treated " +
96              "as a seed.", DEFAULT_SEED_REDIRECTS_NEW_SEEDS));
97          t.setExpertSetting(true);
98  
99          t = addElementToDefinition(new SimpleType(ATTR_PREFERENCE_DEPTH_HOPS,
100             "Number of hops (of any sort) from a seed up to which a URI has higher " +
101         "priority scheduling than any remaining seed. For example, if set to 1 items one " + 
102         "hop (link, embed, redirect, etc.) away from a seed will be scheduled " + 
103         "with HIGH priority. If set to -1, no " + 
104         "preferencing will occur, and a breadth-first search with seeds " + 
105         "processed before discovered links will proceed. If set to zero, a " + 
106         "purely depth-first search will proceed, with all discovered links processed " + 
107         "before remaining seeds.  Seed redirects are treated as one hop from a seed.",
108         DEFAULT_PREFERENCE_DEPTH_HOPS));
109         t.setExpertSetting(true);
110         
111         addElementToDefinition(
112             new DecideRuleSequence(ATTR_REJECTLOG_DECIDE_RULES,
113                 "DecideRules which, if their final decision on a link is " +
114                 "not REJECT, cause the otherwise scope-rejected links to " +
115                 "be logged"));
116 
117     }
118 
119     protected void innerProcess(final CrawlURI curi) {
120         if (LOGGER.isLoggable(Level.FINEST)) {
121             LOGGER.finest(getName() + " processing " + curi);
122         }
123         
124         // If prerequisites, nothing to be done in here.
125         if (curi.hasPrerequisiteUri()) {
126             handlePrerequisite(curi);
127             return;
128         }
129         
130         // Don't extract links of error pages.
131         if (curi.getFetchStatus() < 200 || curi.getFetchStatus() >= 400) {
132             curi.clearOutlinks();
133             return;
134         }
135         
136         if (curi.outlinksSize() <= 0) {
137             // No outlinks to process.
138             return;
139         }
140 
141         final boolean redirectsNewSeeds = ((Boolean)getUncheckedAttribute(curi,
142             ATTR_SEED_REDIRECTS_NEW_SEEDS)).booleanValue();
143         int preferenceDepthHops = ((Integer)getUncheckedAttribute(curi,
144             ATTR_PREFERENCE_DEPTH_HOPS)).intValue();
145         Collection<CandidateURI> inScopeLinks = new ArrayList<CandidateURI>();
146         for (final Iterator i = curi.getOutObjects().iterator(); i.hasNext();) {
147             Object o = i.next();
148             if(o instanceof Link){
149                 final Link wref = (Link)o;
150                 try {
151                     final int directive = getSchedulingFor(curi, wref, 
152                         preferenceDepthHops);
153                     final CandidateURI caURI =
154                         curi.createCandidateURI(curi.getBaseURI(), wref, 
155                             directive, 
156                             considerAsSeed(curi, wref, redirectsNewSeeds));
157                     if (isInScope(caURI)) {
158                         inScopeLinks.add(caURI);
159                     }
160                 } catch (URIException e) {
161                     getController().logUriError(e, curi.getUURI(), 
162                         wref.getDestination().toString());
163                 }
164             } else if(o instanceof CandidateURI){
165                 CandidateURI caURI = (CandidateURI)o;
166                 if(isInScope(caURI)){
167                     inScopeLinks.add(caURI);
168                 }
169             } else {
170                 LOGGER.severe("Unexpected type: " + o);
171             }
172         }
173         // Replace current links collection w/ inscopeLinks.  May be
174         // an empty collection.
175         curi.replaceOutlinks(inScopeLinks);
176     }
177     
178     /***
179      * The CrawlURI has a prerequisite; apply scoping and update
180      * Link to CandidateURI in manner analogous to outlink handling. 
181      * @param curi CrawlURI with prereq to consider
182      */
183     protected void handlePrerequisite(CrawlURI curi) {
184         try {
185             // Create prerequisite CandidateURI
186             CandidateURI caUri =
187                 curi.createCandidateURI(curi.getBaseURI(),
188                     (Link) curi.getPrerequisiteUri());
189             int prereqPriority = curi.getSchedulingDirective() - 1;
190             if (prereqPriority < 0) {
191                 prereqPriority = 0;
192                 LOGGER.severe("Unable to promote prerequisite " + caUri +
193                     " above " + curi);
194             }
195             caUri.setSchedulingDirective(prereqPriority);
196             caUri.setForceFetch(true);
197             if(isInScope(caUri)) {
198                 // replace link with CandidateURI
199                 curi.setPrerequisiteUri(caUri);
200             } else {
201                 // prerequisite is out-of-scope; mark CrawlURI as error,
202                 // preventinting normal S_DEFERRED handling
203                 curi.setFetchStatus(S_PREREQUISITE_UNSCHEDULABLE_FAILURE);
204             }
205        } catch (URIException ex) {
206             Object[] array = {curi, curi.getPrerequisiteUri()};
207             getController().uriErrors.log(Level.INFO,ex.getMessage(), array);
208         } catch (NumberFormatException e) {
209             // UURI.createUURI will occasionally throw this error.
210             Object[] array = {curi, curi.getPrerequisiteUri()};
211             getController().uriErrors.log(Level.INFO,e.getMessage(), array);
212         }
213     }
214 
215     protected void outOfScope(CandidateURI caUri) {
216         super.outOfScope(caUri);
217         if (!LOGGER.isLoggable(Level.INFO)) {
218             return;
219         }
220         // TODO: Fix filters so work on CandidateURI.
221         CrawlURI curi = (caUri instanceof CrawlURI)?
222             (CrawlURI)caUri:
223             new CrawlURI(caUri.getUURI());
224         if (rulesAccept(getRejectLogRules(curi), curi)) {
225             LOGGER.info(curi.getUURI().toString());
226         }
227     }
228     
229     protected DecideRule getRejectLogRules(Object o) {
230         try {
231             return (DecideRule)getAttribute(o, ATTR_REJECTLOG_DECIDE_RULES);
232         } catch (AttributeNotFoundException e) {
233             throw new RuntimeException(e);
234         }
235     }
236     
237     private boolean considerAsSeed(final CrawlURI curi, final Link wref,
238             final boolean redirectsNewSeeds) {
239         return redirectsNewSeeds && curi.isSeed()
240                 && wref.getHopType() == Link.REFER_HOP;
241     }
242     
243     /***
244      * Determine scheduling for the  <code>curi</code>.
245      * As with the LinksScoper in general, this only handles extracted links,
246      * seeds do not pass through here, but are given MEDIUM priority.  
247      * Imports into the frontier similarly do not pass through here, 
248      * but are given NORMAL priority.
249      */
250     protected int getSchedulingFor(final CrawlURI curi, final Link wref,
251             final int preferenceDepthHops) {
252         final char c = wref.getHopType();
253         if (LOGGER.isLoggable(Level.FINEST)) {
254             LOGGER.finest(curi + " with path=" + curi.getPathFromSeed() +
255                 " isSeed=" + curi.isSeed() + " with fetchStatus=" +
256                 curi.getFetchStatus() + " -> " + wref.getDestination() +
257                 " type " + c + " with context=" + wref.getContext());
258         }
259 
260         switch (c) {
261             case Link.REFER_HOP:
262                 // Treat redirects somewhat urgently
263                 // This also ensures seed redirects remain seed priority
264                 return (preferenceDepthHops >= 0 ? CandidateURI.HIGH :
265                     CandidateURI.MEDIUM);
266             default:
267                 if (preferenceDepthHops == 0)
268                     return CandidateURI.HIGH;
269                     // this implies seed redirects are treated as path
270                     // length 1, which I belive is standard.
271                     // curi.getPathFromSeed() can never be null here, because
272                     // we're processing a link extracted from curi
273                 if (preferenceDepthHops > 0 && 
274                     curi.getPathFromSeed().length() + 1 <= preferenceDepthHops)
275                     return CandidateURI.HIGH;
276                 // Everything else normal (at least for now)
277                 return CandidateURI.NORMAL;
278         }
279     }
280 }