Postselector xref

View Javadoc

1   /* Copyright (C) 2003 Internet Archive.
2    *
3    * This file is part of the Heritrix web crawler (crawler.archive.org).
4    *
5    * Heritrix is free software; you can redistribute it and/or modify
6    * it under the terms of the GNU Lesser Public License as published by
7    * the Free Software Foundation; either version 2.1 of the License, or
8    * any later version.
9    *
10   * Heritrix is distributed in the hope that it will be useful,
11   * but WITHOUT ANY WARRANTY; without even the implied warranty of
12   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13   * GNU Lesser Public License for more details.
14   *
15   * You should have received a copy of the GNU Lesser Public License
16   * along with Heritrix; if not, write to the Free Software
17   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
18   *
19   * Postselector.java
20   * Created on Oct 2, 2003
21   *
22   * $Header: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/java/org/archive/crawler/postprocessor/Postselector.java,v 1.23 2005/05/05 21:30:43 stack-sf Exp $
23   */
24  package org.archive.crawler.postprocessor;
25  
26  import java.io.File;
27  import java.lang.reflect.Constructor;
28  import java.util.Iterator;
29  import java.util.logging.FileHandler;
30  import java.util.logging.Formatter;
31  import java.util.logging.Level;
32  import java.util.logging.Logger;
33  
34  import javax.management.AttributeNotFoundException;
35  
36  import org.apache.commons.httpclient.URIException;
37  import org.archive.crawler.Heritrix;
38  import org.archive.crawler.datamodel.CandidateURI;
39  import org.archive.crawler.datamodel.CoreAttributeConstants;
40  import org.archive.crawler.datamodel.CrawlURI;
41  import org.archive.crawler.datamodel.FetchStatusCodes;
42  import org.archive.crawler.datamodel.UURI;
43  import org.archive.crawler.datamodel.UURIFactory;
44  import org.archive.crawler.extractor.Link;
45  import org.archive.crawler.framework.Filter;
46  import org.archive.crawler.framework.Processor;
47  import org.archive.crawler.settings.MapType;
48  import org.archive.crawler.settings.SimpleType;
49  import org.archive.crawler.settings.Type;
50  
51  /***
52   * Determine which extracted links etc get fed back into Frontier.
53   *
54   * Could in the future also control whether current URI is retried.
55   *
56   * @author gojomo
57   */
58  public class Postselector extends Processor
59  implements CoreAttributeConstants, FetchStatusCodes {
60  
61      private static Logger logger =
62          Logger.getLogger(Postselector.class.getName());
63  
64      private final static Boolean DEFAULT_SEED_REDIRECTS_NEW_SEEDS =
65          new Boolean(true);
66      private final static String ATTR_SEED_REDIRECTS_NEW_SEEDS =
67          "seed-redirects-new-seed";
68      
69      public static final String ATTR_LOG_REJECTS_ENABLED = "override-logger";
70      
71      public static final String ATTR_LOG_REJECT_FILTERS =
72          "scope-rejected-uri-log-filters";
73      
74      public static final String ATTR_SCHEDULE_EMBEDDED_LINKS =
75          "schedule-embedded-links";
76      private final static Boolean DEFAULT_SCHEDULE_EMBEDDED_LINKS =
77          new Boolean(true);
78      
79      /***
80       * Instance of rejected uris log filters.
81       */
82      private MapType rejectLogFilters = null;
83      
84      /***
85       * @param name Name of this filter.
86       */
87      public Postselector(String name) {
88          super(name, "Post selector. Determines which extracted links and " +
89                  "other related information gets fed back to the Frontier.");
90          Type t;
91          t = addElementToDefinition(
92                  new SimpleType(ATTR_SEED_REDIRECTS_NEW_SEEDS,
93                  "If enabled, any URL found because a seed redirected to it " +
94                  "(seed returned 301 or 302) will be treated as a seed.",
95                  DEFAULT_SEED_REDIRECTS_NEW_SEEDS));
96          t.setExpertSetting(true);
97          
98          t = addElementToDefinition(new SimpleType(ATTR_LOG_REJECTS_ENABLED,
99              "If enabled, all logging goes to a file named for this class in" +
100             " the job log" +
101             " directory. Set the logging level in heritrix.properites." +
102             " Logging at level INFO will log URIs rejected by scope.",
103             new Boolean(true)));
104         t.setExpertSetting(true);
105         this.rejectLogFilters = (MapType)addElementToDefinition(
106             new MapType(ATTR_LOG_REJECT_FILTERS, "Filters applied after" +
107                 " an URI has been rejected. If any filter returns" +
108                " TRUE, the URI is logged if the logging level is INFO.",
109             Filter.class));
110         this.rejectLogFilters.setExpertSetting(true);
111         
112         t = addElementToDefinition(new SimpleType(ATTR_SCHEDULE_EMBEDDED_LINKS,
113             "If enabled, embeded links (images etc.) are scheduled for " +
114             "crawling.", DEFAULT_SCHEDULE_EMBEDDED_LINKS));
115         t.setExpertSetting(true);
116     }
117    
118     protected void initialTasks() {
119         super.initialTasks();
120         // Set up logger for this instance.  May have special directives
121         // since this class can log scope-rejected URLs.
122         if (isOverrideEnabled(null))    {
123             int limit = Heritrix.getIntProperty(
124                 "java.util.logging.FileHandler.limit",
125                 1024 * 1024 * 1024 * 1024);
126             int count = Heritrix.getIntProperty(
127                 "java.util.logging.FileHandler.count", 1);
128             try {
129                 File logsDir = getController().getLogsDir();
130                 String tmp = Heritrix.
131                     getProperty("java.util.logging.FileHandler.pattern");
132                 File logFile = new File(logsDir,
133                     this.getClass().getName() +
134                         ((tmp != null && tmp.length() > 0)? tmp: ".log"));
135                 FileHandler fh = new FileHandler(logFile.getAbsolutePath(),
136                     limit, count, true);
137                 // Manage the formatter to use.
138                 tmp = Heritrix.
139                     getProperty("java.util.logging.FileHandler.formatter");
140                 if (tmp != null && tmp.length() > 0) {
141                         Constructor co = Class.forName(tmp).
142                             getConstructor(new Class [] {});
143                         Formatter f = (Formatter)co.
144                             newInstance(new Object [] {});
145                         fh.setFormatter(f);
146                 }
147                 logger.addHandler(fh);
148                 logger.setUseParentHandlers(false);
149             } catch (Exception e) {
150                 logger.severe("Failed customization of logger: " +
151                     e.getMessage());
152             }
153         }
154     }
155 
156     protected void innerProcess(final CrawlURI curi) {
157         if (logger.isLoggable(Level.FINEST)) {
158             logger.finest(getName() + " processing " + curi);
159         }
160         
161         // handle any prerequisites
162         if (curi.containsKey(A_PREREQUISITE_URI)) {
163             handlePrerequisites(curi);
164             return;
165         }
166         
167         if (curi.getFetchStatus() < 200 || curi.getFetchStatus() >= 400) {
168             // do not follow links of error pages
169             return;
170         }
171 
172         final boolean scheduleEmbeds = ((Boolean)getUncheckedAttribute(curi,
173             ATTR_SCHEDULE_EMBEDDED_LINKS)).booleanValue();
174         final boolean redirectsNewSeeds = ((Boolean)getUncheckedAttribute(curi,
175             ATTR_SEED_REDIRECTS_NEW_SEEDS)).booleanValue();
176             
177         for (final Iterator iter = curi.getOutLinks().iterator();
178                 iter.hasNext();) {
179             final Link wref = (Link)iter.next();
180             try {
181                 final int directive = getSchedulingFor(wref, scheduleEmbeds);
182                 if(directive != CandidateURI.DONT_SCHEDULE) {
183                     final CandidateURI caURI = createCandidateURI(curi, wref);
184                     caURI.setSchedulingDirective(directive);
185                     caURI.setIsSeed(considerAsSeed(curi, wref,
186                         redirectsNewSeeds));
187                     schedule(caURI);
188                 }
189             } catch (URIException e) {
190                 getController().logUriError(e,curi.getUURI(),wref.getDestination().toString());
191             }
192         }
193     }
194     
195     private boolean considerAsSeed(final CrawlURI curi, final Link wref, final boolean redirectsNewSeeds) {
196         // Check if this is a seed with a 301 or 302.
197         if (curi.isSeed()
198                 && (curi.getFetchStatus() == 301 || curi.getFetchStatus() == 302)
199                 && wref.getHopType() == Link.REFER_HOP) {
200             
201             // Check if redirects from seeds should be treated as seeds.
202             if (redirectsNewSeeds) {
203                 return true;
204             }
205         }
206         return false;
207     }
208 
209     private int getSchedulingFor(final Link wref,
210             final boolean scheduleEmbeds) {
211         final char c = wref.getHopType();
212         switch (c) {
213             case Link.REFER_HOP:
214                 // treat redirects somewhat urgently
215                 return CandidateURI.MEDIUM;
216             case Link.EMBED_HOP:
217                 if(!scheduleEmbeds) {
218                     return CandidateURI.DONT_SCHEDULE;
219                 }
220             default:
221                 // everything else normal (at least for now)
222                 return CandidateURI.NORMAL;
223         }
224     }
225 
226     protected void handlePrerequisites(CrawlURI curi) {
227         try {
228             // create and schedule prerequisite
229             CandidateURI caUri = createCandidateURI(curi,
230                 curi.getPrerequisiteUri());
231             int prereqPriority = curi.getSchedulingDirective() - 1;
232             if (prereqPriority < 0) {
233                 prereqPriority = 0;
234                 logger.severe("unable to promote prerequisite " + caUri +
235                     " above " + curi);
236             }
237             caUri.setSchedulingDirective(curi.getSchedulingDirective() - 1);
238             caUri.setForceFetch(true);
239             if (!schedule(caUri)) {
240                 // prerequisite cannot be scheduled (perhaps excluded by scope)
241                 // must give up on
242                 curi.setFetchStatus(S_PREREQUISITE_UNSCHEDULABLE_FAILURE);
243                 return;
244             }
245             // leave PREREQ in place so frontier can properly defer this curi
246        } catch (URIException ex) {
247             Object[] array = {curi, curi.getPrerequisiteUri()};
248             getController().uriErrors.log(Level.INFO,ex.getMessage(), array);
249         } catch (NumberFormatException e) {
250             // UURI.createUURI will occasionally throw this error.
251             Object[] array = {curi, curi.getPrerequisiteUri()};
252             getController().uriErrors.log(Level.INFO,e.getMessage(), array);
253         }
254     }
255 
256     /***
257      * Schedule the given {@link CandidateURI CandidateURI} with the Frontier.
258      * @param caUri The CandidateURI to be scheduled.
259      * @return true if CandidateURI was accepted by crawl scope, false
260      * otherwise.
261      */
262     protected boolean schedule(CandidateURI caUri) {
263         if(getController().getScope().accepts(caUri)) {
264             if (logger.isLoggable(Level.FINER)) {
265                 logger.finer("Accepted: " + caUri);
266             }
267             getController().getFrontier().schedule(caUri);
268             return true;
269         }
270 
271         // Run the curi through another set of filters to see
272         // if we should log it to the scope rejection log.
273         if (logger.isLoggable(Level.INFO)) {
274             CrawlURI curi = (caUri instanceof CrawlURI)?
275                 (CrawlURI)caUri: new CrawlURI(caUri.getUURI());
276             if (filtersAccept(this.rejectLogFilters, curi)) {
277                 logger.info("Rejected " + curi.getUURI().toString());
278             }
279         }
280         return false;
281     }
282     
283     
284     public boolean isOverrideEnabled(Object context) {
285         boolean result = true;
286         try {
287             Boolean b = (Boolean)getAttribute(context,
288                 ATTR_LOG_REJECTS_ENABLED);
289             if (b != null) {
290                 result = b.booleanValue();
291             }
292         } catch (AttributeNotFoundException e) {
293             logger.warning("Failed get of 'enabled' attribute.");
294         }
295 
296         return result;
297     }
298     
299     protected CandidateURI createCandidateURI(CrawlURI curi, Link link)
300             throws URIException {
301         UURI uuri;
302         if (link.getDestination() instanceof UURI) {
303             uuri = (UURI) link.getDestination();
304         } else {
305             uuri = UURIFactory.getInstance(curi.getBaseURI(), link
306                     .getDestination().toString());
307         }
308         CandidateURI caURI = new CandidateURI(uuri, curi.getPathFromSeed()
309                 + link.getHopType(), curi.getUURI(), link.getContext());
310         return caURI;
311     }
312 }