View Javadoc

1   /* CrawlSubstats
2   *
3   * $Id: CrawlSubstats.java 6534 2009-10-01 02:54:34Z nlevitt $
4   *
5   * Created on Nov 4, 2005
6   *
7   * Copyright (C) 2005 Internet Archive.
8   *
9   * This file is part of the Heritrix web crawler (crawler.archive.org).
10  *
11  * Heritrix is free software; you can redistribute it and/or modify
12  * it under the terms of the GNU Lesser Public License as published by
13  * the Free Software Foundation; either version 2.1 of the License, or
14  * any later version.
15  *
16  * Heritrix is distributed in the hope that it will be useful,
17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19  * GNU Lesser Public License for more details.
20  *
21  * You should have received a copy of the GNU Lesser Public License
22  * along with Heritrix; if not, write to the Free Software
23  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
24  */ 
25  package org.archive.crawler.datamodel;
26  
27  import java.io.Serializable;
28  
29  import org.apache.commons.httpclient.HttpStatus;
30  import org.archive.crawler.deciderules.recrawl.IdenticalDigestDecideRule;
31  
32  /***
33   * Collector of statistics for a 'subset' of a crawl,
34   * such as a server (host:port), host, or frontier group 
35   * (eg queue). 
36   * 
37   * @author gojomo
38   */
39  public class CrawlSubstats implements Serializable, FetchStatusCodes {
40      private static final long serialVersionUID = 8624425657056569036L;
41  
42      public enum Stage {SCHEDULED, SUCCEEDED, RETRIED, DISREGARDED, FAILED};
43      
44      public interface HasCrawlSubstats {
45          public CrawlSubstats getSubstats();
46      }
47      
48      long totalScheduled;   // anything initially scheduled
49                             // (totalScheduled - (fetchSuccesses + fetchFailures)
50      long fetchSuccesses;   // anything disposed-success 
51                             // (HTTP 2XX response codes, other non-errors)
52      long fetchFailures;    // anything disposed-failure
53      long fetchDisregards;  // anything disposed-disregard
54      long fetchResponses;   // all positive responses (incl. 3XX, 4XX, 5XX)
55      long robotsDenials;    // all robots-precluded failures
56      long successBytes;     // total size of all success responses
57      long totalBytes;       // total size of all responses
58      long fetchNonResponses; // processing attempts resulting in no response
59                             // (both failures and temp deferrals)
60      long novelBytes;
61      long novelUrls;
62      long notModifiedBytes;
63      long notModifiedUrls;
64      long dupByHashBytes;
65      long dupByHashUrls;
66      
67      /***
68       * Examing the CrawlURI and based on its status and internal values,
69       * update tallies. 
70       * 
71       * @param curi
72       */
73      public synchronized void tally(CrawlURI curi, Stage stage) {
74          switch(stage) {
75              case SCHEDULED:
76                  totalScheduled++;
77                  break;
78              case RETRIED:
79                  if(curi.getFetchStatus()<=0) {
80                      fetchNonResponses++;
81                  }
82                  break;
83              case SUCCEEDED:
84                  fetchSuccesses++;
85                  fetchResponses++;
86                  totalBytes += curi.getContentSize();
87                  successBytes += curi.getContentSize();
88                  
89                  if (curi.getFetchStatus() == HttpStatus.SC_NOT_MODIFIED) {
90                      notModifiedBytes += curi.getContentSize();
91                      notModifiedUrls++;
92                  } else if (IdenticalDigestDecideRule.hasIdenticalDigest(curi)) {
93                      dupByHashBytes += curi.getContentSize();
94                      dupByHashUrls++;
95                  } else {
96                      novelBytes += curi.getContentSize();
97                      novelUrls++;
98                  }
99                  
100                 break;
101             case DISREGARDED:
102                 fetchDisregards++;
103                 if(curi.getFetchStatus()==S_ROBOTS_PRECLUDED) {
104                     robotsDenials++;
105                 }
106                 break;
107             case FAILED:
108                 if(curi.getFetchStatus()<=0) {
109                     fetchNonResponses++;
110                 } else {
111                     fetchResponses++;
112                     totalBytes += curi.getContentSize();
113                     if (curi.getFetchStatus() == HttpStatus.SC_NOT_MODIFIED) {
114                         notModifiedBytes += curi.getContentSize();
115                         notModifiedUrls++;
116                     } else if (IdenticalDigestDecideRule.hasIdenticalDigest(curi)) {
117                         dupByHashBytes += curi.getContentSize();
118                         dupByHashUrls++;
119                     } else {
120                         novelBytes += curi.getContentSize();
121                         novelUrls++;
122                     }
123                 }
124                 fetchFailures++;
125                 break;
126         }
127     }
128     
129     public long getFetchSuccesses() {
130         return fetchSuccesses;
131     }
132     public long getFetchResponses() {
133         return fetchResponses;
134     }
135     public long getSuccessBytes() {
136         return successBytes;
137     }
138     public long getTotalBytes() {
139         return totalBytes;
140     }
141     public long getFetchNonResponses() {
142         return fetchNonResponses;
143     }
144     public long getTotalScheduled() {
145         return totalScheduled;
146     }
147     public long getFetchDisregards() {
148         return fetchDisregards;
149     }
150     public long getRobotsDenials() {
151         return robotsDenials;
152     }
153     
154     public long getRemaining() {
155         return totalScheduled - (fetchSuccesses + fetchFailures + fetchDisregards);
156     }
157 
158     public long getRecordedFinishes() {
159         return fetchSuccesses + fetchFailures;
160     }
161 
162     public long getNovelBytes() {
163         return novelBytes;
164     }
165 
166     public long getNovelUrls() {
167         return novelUrls;
168     }
169 
170     public long getNotModifiedBytes() {
171         return notModifiedBytes;
172     }
173 
174     public long getNotModifiedUrls() {
175         return notModifiedUrls;
176     }
177 
178     public long getDupByHashBytes() {
179         return dupByHashBytes;
180     }
181 
182     public long getDupByHashUrls() {
183         return dupByHashUrls;
184     }
185 }