CrawlStateUpdater xref

View Javadoc

1   /* CrawlStateUpdater
2    *
3    * Created on Jun 5, 2003
4    *
5    * Copyright (C) 2003 Internet Archive.
6    *
7    * This file is part of the Heritrix web crawler (crawler.archive.org).
8    *
9    * Heritrix is free software; you can redistribute it and/or modify
10   * it under the terms of the GNU Lesser Public License as published by
11   * the Free Software Foundation; either version 2.1 of the License, or
12   * any later version.
13   *
14   * Heritrix is distributed in the hope that it will be useful,
15   * but WITHOUT ANY WARRANTY; without even the implied warranty of
16   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17   * GNU Lesser Public License for more details.
18   *
19   * You should have received a copy of the GNU Lesser Public License
20   * along with Heritrix; if not, write to the Free Software
21   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
22   */
23  package org.archive.crawler.postprocessor;
24  
25  
26  import java.util.logging.Logger;
27  
28  import org.apache.commons.httpclient.URIException;
29  import org.archive.crawler.datamodel.CoreAttributeConstants;
30  import org.archive.crawler.datamodel.CrawlServer;
31  import org.archive.crawler.datamodel.CrawlURI;
32  import org.archive.crawler.datamodel.FetchStatusCodes;
33  import org.archive.crawler.framework.Processor;
34  
35  
36  /***
37   * A step, late in the processing of a CrawlURI, for updating the per-host
38   * information that may have been affected by the fetch. This will initially
39   * be robots and ip address info; it could include other per-host stats that
40   * would affect the crawl (like total pages visited at the site) as well.
41   *
42   * @author gojomo
43   * @version $Date: 2010-04-02 01:03:46 +0000 (Fri, 02 Apr 2010) $, $Revision: 6803 $
44   */
45  public class CrawlStateUpdater extends Processor implements
46          CoreAttributeConstants, FetchStatusCodes {
47  
48      private static final long serialVersionUID = -1072728147960180091L;
49  
50      private static final Logger logger =
51          Logger.getLogger(CrawlStateUpdater.class.getName());
52  
53      public CrawlStateUpdater(String name) {
54          super(name, "Crawl state updater");
55      }
56  
57      protected void innerProcess(CrawlURI curi) {
58          CrawlServer server =
59              getController().getServerCache().getServerFor(curi);
60          
61          String scheme = curi.getUURI().getScheme().toLowerCase();
62          if (scheme.equals("http") || scheme.equals("https") &&
63                  server != null) {
64              // Update connection problems counter
65              if( curi.getFetchStatus() == S_CONNECT_FAILED || curi.getFetchStatus() == S_CONNECT_LOST) {
66                  server.incrementConsecutiveConnectionErrors();
67              } else if (curi.getFetchStatus() > 0){
68                  server.resetConsecutiveConnectionErrors();
69              }
70  
71              // Update robots info
72              try {
73                  if ("/robots.txt".equals(curi.getUURI().getPath())) {
74                      // Update server with robots info
75                      // NOTE, this *can* change the curi's fetchStatus from a connection
76                      // problem to S_DEEMED_NOT_FOUND to prevent further retries
77                      server.updateRobots(curi);
78                  }
79              }
80              catch (URIException e) {
81                  logger.severe("Failed get path on " + curi.getUURI());
82              }
83          }
84      }
85  }