package org.archive.crawler.framework;

import java.util.Iterator;
import java.util.Map;

import org.archive.crawler.framework.exceptions.FatalConfigurationException;

/**
 * An interface for objects that want to collect statistics on
 * running crawls. An implementation of this is referenced in the
 * crawl order and loaded when the crawl begins.
 *
 * <p>It will be given a reference to the relevant CrawlController.
 * The CrawlController will contain any additional configuration
 * information needed.
 *
 * <p>Any class that implements this interface can be specified as a
 * statistics tracker in a crawl order. The CrawlController will
 * then create and initialize a copy of it and call its start()
 * method.
 *
 * <p>This interface also specifies several methods to access data that
 * the CrawlController or the URIFrontier may be interested in at
 * run time but do not want to keep track of themselves.
 * {@link org.archive.crawler.framework.AbstractTracker AbstractTracker}
 * implements these. If more than one StatisticsTracking class is
 * defined in the crawl order, only the first one will be used to
 * access this data.
 *
 * <p>It is recommended that implementations register for
 * {@link org.archive.crawler.event.CrawlStatusListener CrawlStatus} events and
 * {@link org.archive.crawler.event.CrawlURIDispositionListener CrawlURIDisposition}
 * events to be able to properly monitor a crawl. Both are registered with the
 * CrawlController.
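 *
 * <p>A minimal sketch of such an implementation (the class name is
 * illustrative, and the addCrawlStatusListener /
 * addCrawlURIDispositionListener registration methods are assumed to be
 * the ones exposed by CrawlController; verify against its actual API):
 * <pre>
 * public class MyTracker extends AbstractTracker
 *         implements CrawlStatusListener, CrawlURIDispositionListener {
 *     public void initialize(CrawlController c)
 *             throws FatalConfigurationException {
 *         super.initialize(c);
 *         // Register for the recommended events (see above).
 *         c.addCrawlStatusListener(this);
 *         c.addCrawlURIDispositionListener(this);
 *     }
 *     // Implement the listener callbacks to update internal counters.
 * }
 * </pre>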
 *
 * @author Kristinn Sigurdsson
 *
 * @see AbstractTracker
 * @see org.archive.crawler.event.CrawlStatusListener
 * @see org.archive.crawler.event.CrawlURIDispositionListener
 * @see org.archive.crawler.framework.CrawlController
 */
public interface StatisticsTracking extends Runnable {
    /** Seed successfully crawled. */
    public static final String SEED_DISPOSITION_SUCCESS =
        "Seed successfully crawled";
    /** Failed to crawl seed. */
    public static final String SEED_DISPOSITION_FAILURE =
        "Failed to crawl seed";
    /** Failed to crawl seed, will retry. */
    public static final String SEED_DISPOSITION_RETRY =
        "Failed to crawl seed, will retry";
    /** Seed was disregarded. */
    public static final String SEED_DISPOSITION_DISREGARD =
        "Seed was disregarded";
    /** Seed has not been processed. */
    public static final String SEED_DISPOSITION_NOT_PROCESSED =
        "Seed has not been processed";

    /**
     * Do initialization.
     *
     * The CrawlController will call this method before calling the start()
     * method.
     *
     * @param c The {@link CrawlController CrawlController} running the crawl
     * that this class is to gather statistics on.
     * @throws FatalConfigurationException
     */
    public void initialize(CrawlController c)
        throws FatalConfigurationException;

    /**
     * Returns how long the current crawl has been running, excluding any
     * time spent paused, suspended, or stopped.
     *
     * @return The length of time, in milliseconds, that this crawl has
     * been running.
     */
    public long crawlDuration();

    /**
     * Start the tracker's crawl timing.
     */
    public void noteStart();

    /**
     * Returns the total number of uncompressed bytes processed. Stored
     * data may be much smaller due to compression or duplicate-reduction
     * policies.
     *
     * @return The total number of uncompressed bytes written to disk
     * @deprecated Misnomer; use {@link #totalBytesCrawled()} instead.
     */
    public long totalBytesWritten();

    /**
     * Returns the total number of uncompressed bytes crawled. Stored
     * data may be much smaller due to compression or duplicate-reduction
     * policies.
     *
     * @return The total number of uncompressed bytes crawled
     */
    public long totalBytesCrawled();

    /**
     * Total amount of time spent actively crawling so far.
     *
     * <p>Returns the total time, in milliseconds, that has elapsed between
     * the start of the crawl and either the current time or, if the crawl
     * has ended, the end of the crawl, <b>minus</b> any time spent paused.
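     *
     * <p>A sketch of how a caller might derive an overall throughput figure
     * from this value together with {@link #totalBytesCrawled()}
     * (illustrative only; {@link #processedKBPerSec()} reports a comparable
     * figure directly):
     * <pre>
     * long elapsedMs = tracker.getCrawlerTotalElapsedTime();
     * double kbPerSec = (elapsedMs > 0)
     *     ? (tracker.totalBytesCrawled() / 1024.0) / (elapsedMs / 1000.0)
     *     : 0.0;
     * </pre>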
     *
     * @return Total amount of time (in msec.) spent crawling so far.
     */
    public long getCrawlerTotalElapsedTime();

    /**
     * Returns an estimate of recent document download rates
     * based on a queue of recently seen CrawlURIs (as of last snapshot).
     *
     * @return The rate per second of documents gathered during the last
     * snapshot
     */
    public double currentProcessedDocsPerSec();

    /**
     * Returns the number of documents that have been processed per second
     * over the life of the crawl (as of last snapshot).
     *
     * @return The rate per second of documents gathered so far
     */
    public double processedDocsPerSec();

    /**
     * Calculates the rate, in KB, at which data has been processed
     * over the life of the crawl (as of last snapshot).
     *
     * @return The rate per second of KB gathered so far
     */
    public long processedKBPerSec();

    /**
     * Calculates an estimate of the rate, in KB, at which documents
     * are currently being processed by the crawler. For more accurate
     * estimates, set a larger queue size, or sample and average multiple
     * values (as of last snapshot).
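     *
     * <p>A sketch of averaging several samples for a smoother estimate (the
     * sample count and interval are arbitrary illustrative choices):
     * <pre>
     * long sum = 0;
     * int samples = 5;
     * for (int i = 0; i &lt; samples; i++) {
     *     sum += tracker.currentProcessedKBPerSec();
     *     Thread.sleep(1000); // caller must handle InterruptedException
     * }
     * long avgKBPerSec = sum / samples;
     * </pre>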
     *
     * @return The rate per second of KB gathered during the last snapshot
     */
    public int currentProcessedKBPerSec();

    /**
     * Get the number of active (non-paused) threads.
     *
     * @return The number of active (non-paused) threads
     */
    public int activeThreadCount();

    /**
     * Number of <i>successfully</i> processed URIs.
     *
     * <p>If the crawl is not running (paused or stopped), this will return
     * the value of the last snapshot.
     *
     * @return The number of successfully fetched URIs
     *
     * @see org.archive.crawler.framework.Frontier#succeededFetchCount()
     */
    public long successfullyFetchedCount();

    /**
     * @return Total number of URIs (processed + queued +
     * currently being processed)
     */
    public long totalCount();

    /** @return A measure of how congested the frontier's queues are. */
    public float congestionRatio();
    /** @return Depth of the deepest frontier queue. */
    public long deepestUri();
    /** @return Average depth of the frontier's queues. */
    public long averageDepth();

    /**
     * Get a SeedRecord iterator for the job being monitored. If the job is
     * no longer running, stored values will be returned. If the job is
     * running, the current seed iterator will be fetched and stored values
     * will be updated.
     * <p>
     * Sort order is:<br>
     * No status code (not processed)<br>
     * Status codes smaller than 0 (largest to smallest)<br>
     * Status codes larger than 0 (largest to smallest)<br>
     * <p>
     * <b>Note:</b> This iterator will iterate over a list of
     * <i>SeedRecords</i>.
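     * <p>A sketch of consuming the iterator (the SeedRecord accessor names
     * used here, getUri() and getStatusCode(), are assumptions; check the
     * SeedRecord class):
     * <pre>
     * Iterator i = tracker.getSeedRecordsSortedByStatusCode();
     * while (i.hasNext()) {
     *     SeedRecord sr = (SeedRecord) i.next();
     *     System.out.println(sr.getUri() + " " + sr.getStatusCode());
     * }
     * </pre>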
     * @return the seed iterator
     */
    public Iterator getSeedRecordsSortedByStatusCode();

    /**
     * @return Legend of progress statistics
     */
    public String progressStatisticsLegend();

    /**
     * @return Line of progress statistics
     */
    public String getProgressStatisticsLine();

    /**
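     * Get the most recent progress statistics as key/value pairs.
     * <p>A sketch of dumping the map (the key names are implementation
     * specific; this only assumes the standard java.util.Map API):
     * <pre>
     * Map stats = tracker.getProgressStatistics();
     * for (Iterator i = stats.entrySet().iterator(); i.hasNext();) {
     *     Map.Entry e = (Map.Entry) i.next();
     *     System.out.println(e.getKey() + "=" + e.getValue());
     * }
     * </pre>
     *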
     * @return Map of progress-statistics.
     */
    public Map getProgressStatistics();
}