1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25 package org.archive.crawler.datamodel;
26
27 import java.io.Serializable;
28
29 import org.apache.commons.httpclient.HttpStatus;
30 import org.archive.crawler.deciderules.recrawl.IdenticalDigestDecideRule;
31
32 /***
33 * Collector of statistics for a 'subset' of a crawl,
34 * such as a server (host:port), host, or frontier group
35 * (eg queue).
36 *
37 * @author gojomo
38 */
39 public class CrawlSubstats implements Serializable, FetchStatusCodes {
40 private static final long serialVersionUID = 8624425657056569036L;
41
42 public enum Stage {SCHEDULED, SUCCEEDED, RETRIED, DISREGARDED, FAILED};
43
44 public interface HasCrawlSubstats {
45 public CrawlSubstats getSubstats();
46 }
47
48 long totalScheduled;
49
50 long fetchSuccesses;
51
52 long fetchFailures;
53 long fetchDisregards;
54 long fetchResponses;
55 long robotsDenials;
56 long successBytes;
57 long totalBytes;
58 long fetchNonResponses;
59
60 long novelBytes;
61 long novelUrls;
62 long notModifiedBytes;
63 long notModifiedUrls;
64 long dupByHashBytes;
65 long dupByHashUrls;
66
67 /***
68 * Examing the CrawlURI and based on its status and internal values,
69 * update tallies.
70 *
71 * @param curi
72 */
73 public synchronized void tally(CrawlURI curi, Stage stage) {
74 switch(stage) {
75 case SCHEDULED:
76 totalScheduled++;
77 break;
78 case RETRIED:
79 if(curi.getFetchStatus()<=0) {
80 fetchNonResponses++;
81 }
82 break;
83 case SUCCEEDED:
84 fetchSuccesses++;
85 fetchResponses++;
86 totalBytes += curi.getContentSize();
87 successBytes += curi.getContentSize();
88
89 if (curi.getFetchStatus() == HttpStatus.SC_NOT_MODIFIED) {
90 notModifiedBytes += curi.getContentSize();
91 notModifiedUrls++;
92 } else if (IdenticalDigestDecideRule.hasIdenticalDigest(curi)) {
93 dupByHashBytes += curi.getContentSize();
94 dupByHashUrls++;
95 } else {
96 novelBytes += curi.getContentSize();
97 novelUrls++;
98 }
99
100 break;
101 case DISREGARDED:
102 fetchDisregards++;
103 if(curi.getFetchStatus()==S_ROBOTS_PRECLUDED) {
104 robotsDenials++;
105 }
106 break;
107 case FAILED:
108 if(curi.getFetchStatus()<=0) {
109 fetchNonResponses++;
110 } else {
111 fetchResponses++;
112 totalBytes += curi.getContentSize();
113 if (curi.getFetchStatus() == HttpStatus.SC_NOT_MODIFIED) {
114 notModifiedBytes += curi.getContentSize();
115 notModifiedUrls++;
116 } else if (IdenticalDigestDecideRule.hasIdenticalDigest(curi)) {
117 dupByHashBytes += curi.getContentSize();
118 dupByHashUrls++;
119 } else {
120 novelBytes += curi.getContentSize();
121 novelUrls++;
122 }
123 }
124 fetchFailures++;
125 break;
126 }
127 }
128
129 public long getFetchSuccesses() {
130 return fetchSuccesses;
131 }
132 public long getFetchResponses() {
133 return fetchResponses;
134 }
135 public long getSuccessBytes() {
136 return successBytes;
137 }
138 public long getTotalBytes() {
139 return totalBytes;
140 }
141 public long getFetchNonResponses() {
142 return fetchNonResponses;
143 }
144 public long getTotalScheduled() {
145 return totalScheduled;
146 }
147 public long getFetchDisregards() {
148 return fetchDisregards;
149 }
150 public long getRobotsDenials() {
151 return robotsDenials;
152 }
153
154 public long getRemaining() {
155 return totalScheduled - (fetchSuccesses + fetchFailures + fetchDisregards);
156 }
157
158 public long getRecordedFinishes() {
159 return fetchSuccesses + fetchFailures;
160 }
161
162 public long getNovelBytes() {
163 return novelBytes;
164 }
165
166 public long getNovelUrls() {
167 return novelUrls;
168 }
169
170 public long getNotModifiedBytes() {
171 return notModifiedBytes;
172 }
173
174 public long getNotModifiedUrls() {
175 return notModifiedUrls;
176 }
177
178 public long getDupByHashBytes() {
179 return dupByHashBytes;
180 }
181
182 public long getDupByHashUrls() {
183 return dupByHashUrls;
184 }
185 }