View Javadoc

1   /* FrontierScheduler
2    * 
3    * $Id: FrontierScheduler.java 4671 2006-09-26 23:47:15Z paul_jack $
4    *
5    * Created on June 6, 2005
6    * 
7    * Copyright (C) 2005 Internet Archive.
8    *
9    * This file is part of the Heritrix web crawler (crawler.archive.org).
10   *
11   * Heritrix is free software; you can redistribute it and/or modify
12   * it under the terms of the GNU Lesser Public License as published by
13   * the Free Software Foundation; either version 2.1 of the License, or
14   * any later version.
15   *
16   * Heritrix is distributed in the hope that it will be useful,
17   * but WITHOUT ANY WARRANTY; without even the implied warranty of
18   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19   * GNU Lesser Public License for more details.
20   *
21   * You should have received a copy of the GNU Lesser Public License
22   * along with Heritrix; if not, write to the Free Software
23   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
24   *
25   */
26  package org.archive.crawler.postprocessor;
27  
28  
29  import java.util.logging.Level;
30  import java.util.logging.Logger;
31  
32  import org.archive.crawler.datamodel.CandidateURI;
33  import org.archive.crawler.datamodel.CrawlURI;
34  import org.archive.crawler.datamodel.FetchStatusCodes;
35  import org.archive.crawler.framework.Processor;
36  
37  /***
38   * 'Schedule' with the Frontier CandidateURIs being carried by the passed
39   * CrawlURI.
40   * Adds either prerequisites or whatever is in CrawlURI outlinks to the
41   * Frontier.  Run a Scoper ahead of this processor so only links that
42   * are in-scope get scheduled.
43   * @author stack
44   */
45  public class FrontierScheduler extends Processor
46  implements FetchStatusCodes {
47  
48      private static final long serialVersionUID = -5178775477602250542L;
49  
50      private static Logger LOGGER =
51          Logger.getLogger(FrontierScheduler.class.getName());
52      
53      /***
54       * @param name Name of this filter.
55       */
56      public FrontierScheduler(String name) {
57          super(name, "FrontierScheduler. 'Schedule' with the Frontier " +
58              "any CandidateURIs carried by the passed CrawlURI. " +
59              "Run a Scoper before this " +
60              "processor so links that are not in-scope get bumped from the " +
61              "list of links (And so those in scope get promoted from Link " +
62              "to CandidateURI).");
63      }
64  
65      protected void innerProcess(final CrawlURI curi) {
66          if (LOGGER.isLoggable(Level.FINEST)) {
67              LOGGER.finest(getName() + " processing " + curi);
68          }
69          
70          // Handle any prerequisites when S_DEFERRED for prereqs
71          if (curi.hasPrerequisiteUri() && curi.getFetchStatus() == S_DEFERRED) {
72              handlePrerequisites(curi);
73              return;
74          }
75  
76          synchronized(this) {
77              for (CandidateURI cauri: curi.getOutCandidates()) {
78                  schedule(cauri);
79              }
80          }
81      }
82  
83      protected void handlePrerequisites(CrawlURI curi) {
84          schedule((CandidateURI)curi.getPrerequisiteUri());
85      }
86  
87      /***
88       * Schedule the given {@link CandidateURI CandidateURI} with the Frontier.
89       * @param caUri The CandidateURI to be scheduled.
90       */
91      protected void schedule(CandidateURI caUri) {
92          getController().getFrontier().schedule(caUri);
93      }
94  }