/*
 * CrawlStateUpdater.java
 *
 * NOTE(review): the original file header (license/provenance comment) was
 * garbled during extraction, leaving only stray line numbers. Restore the
 * project's standard license header from a sibling Heritrix source file.
 */
23 package org.archive.crawler.postprocessor;
24
25
26 import java.util.logging.Logger;
27
28 import org.apache.commons.httpclient.URIException;
29 import org.archive.crawler.datamodel.CoreAttributeConstants;
30 import org.archive.crawler.datamodel.CrawlServer;
31 import org.archive.crawler.datamodel.CrawlURI;
32 import org.archive.crawler.datamodel.FetchStatusCodes;
33 import org.archive.crawler.framework.Processor;
34
35
36 /***
37 * A step, late in the processing of a CrawlURI, for updating the per-host
38 * information that may have been affected by the fetch. This will initially
39 * be robots and ip address info; it could include other per-host stats that
40 * would affect the crawl (like total pages visited at the site) as well.
41 *
42 * @author gojomo
43 * @version $Date: 2010-04-02 01:03:46 +0000 (Fri, 02 Apr 2010) $, $Revision: 6803 $
44 */
45 public class CrawlStateUpdater extends Processor implements
46 CoreAttributeConstants, FetchStatusCodes {
47
48 private static final long serialVersionUID = -1072728147960180091L;
49
50 private static final Logger logger =
51 Logger.getLogger(CrawlStateUpdater.class.getName());
52
53 public CrawlStateUpdater(String name) {
54 super(name, "Crawl state updater");
55 }
56
57 protected void innerProcess(CrawlURI curi) {
58 CrawlServer server =
59 getController().getServerCache().getServerFor(curi);
60
61 String scheme = curi.getUURI().getScheme().toLowerCase();
62 if (scheme.equals("http") || scheme.equals("https") &&
63 server != null) {
64
65 if( curi.getFetchStatus() == S_CONNECT_FAILED || curi.getFetchStatus() == S_CONNECT_LOST) {
66 server.incrementConsecutiveConnectionErrors();
67 } else if (curi.getFetchStatus() > 0){
68 server.resetConsecutiveConnectionErrors();
69 }
70
71
72 try {
73 if ("/robots.txt".equals(curi.getUURI().getPath())) {
74
75
76
77 server.updateRobots(curi);
78 }
79 }
80 catch (URIException e) {
81 logger.severe("Failed get path on " + curi.getUURI());
82 }
83 }
84 }
85 }