/* Copyright (C) 2003 Internet Archive.
 *
 * This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 * Heritrix is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or
 * any later version.
 *
 * Heritrix is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser Public License for more details.
 *
 * You should have received a copy of the GNU Lesser Public License
 * along with Heritrix; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */
package org.archive.crawler.framework;

import java.util.Iterator;
import java.util.Map;

import org.archive.crawler.framework.exceptions.FatalConfigurationException;
/**
 * An interface for objects that want to collect statistics on
 * running crawls. An implementation of this is referenced in the
 * crawl order and loaded when the crawl begins.
 *
 * <p>It will be given a reference to the relevant CrawlController.
 * The CrawlController will contain any additional configuration
 * information needed.
 *
 * <p>Any class that implements this interface can be specified as a
 * statistics tracker in a crawl order.  The CrawlController will
 * then create and initialize a copy of it and call its start()
 * method.
 *
 * <p>This interface also specifies several methods to access data that
 * the CrawlController or the URIFrontier may be interested in at
 * run time but do not want to keep track of themselves.
 * {@link org.archive.crawler.framework.AbstractTracker AbstractTracker}
 * implements these. If more than one StatisticsTracking
 * class is defined in the crawl order, only the first one will be
 * used to access this data.
 *
 * <p>It is recommended that implementations register for
 * {@link org.archive.crawler.event.CrawlStatusListener CrawlStatus} events and
 * {@link org.archive.crawler.event.CrawlURIDispositionListener CrawlURIDisposition}
 * events to be able to properly monitor a crawl. Both are registered with the
 * CrawlController.
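 *
 * <p>As an illustration, a minimal tracker might register for both kinds of
 * events during initialization (a sketch only: the registration method names,
 * and AbstractTracker supplying the CrawlStatusListener implementation, are
 * assumptions; MyTracker is hypothetical):
 * <pre>
 * public class MyTracker extends AbstractTracker
 *         implements CrawlURIDispositionListener {
 *     public void initialize(CrawlController c)
 *             throws FatalConfigurationException {
 *         super.initialize(c);
 *         // assumed registration calls, per the description above
 *         c.addCrawlStatusListener(this);
 *         c.addCrawlURIDispositionListener(this);
 *     }
 *     // ... listener callbacks update counters used by the accessors below
 * }
 * </pre>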
 *
 * @author Kristinn Sigurdsson
 *
 * @see AbstractTracker
 * @see org.archive.crawler.event.CrawlStatusListener
 * @see org.archive.crawler.event.CrawlURIDispositionListener
 * @see org.archive.crawler.framework.CrawlController
 */
public interface StatisticsTracking extends Runnable {
    /** Seed successfully crawled */
    public static final String SEED_DISPOSITION_SUCCESS =
        "Seed successfully crawled";
    /** Failed to crawl seed */
    public static final String SEED_DISPOSITION_FAILURE =
        "Failed to crawl seed";
    /** Failed to crawl seed, will retry */
    public static final String SEED_DISPOSITION_RETRY =
        "Failed to crawl seed, will retry";
    /** Seed was disregarded */
    public static final String SEED_DISPOSITION_DISREGARD =
        "Seed was disregarded";
    /** Seed has not been processed */
    public static final String SEED_DISPOSITION_NOT_PROCESSED =
        "Seed has not been processed";

    /**
     * Do initialization.
     *
     * The CrawlController will call this method before calling the start()
     * method.
     *
     * @param c The {@link CrawlController CrawlController} running the crawl
     * that this class is to gather statistics on.
     * @throws FatalConfigurationException if the tracker cannot be configured
     */
    public void initialize(CrawlController c)
    throws FatalConfigurationException;

    /**
     * Returns how long the current crawl has been running since it began,
     * excluding any time spent paused/suspended/stopped.
     *
     * @return The length of time - in msec - that this crawl has been running.
     */
    public long crawlDuration();

    /**
     * Start the tracker's crawl timing.
     */
    public void noteStart();

    /**
     * Returns the total number of uncompressed bytes processed. Stored
     * data may be much smaller due to compression or duplicate-reduction
     * policies.
     *
     * @return The total number of uncompressed bytes processed
     * @deprecated Misnomer; use {@link #totalBytesCrawled()} instead
     */
    public long totalBytesWritten();

    /**
     * Returns the total number of uncompressed bytes crawled. Stored
     * data may be much smaller due to compression or duplicate-reduction
     * policies.
     *
     * @return The total number of uncompressed bytes crawled
     */
    public long totalBytesCrawled();

    /**
     * Total amount of time spent actively crawling so far.
     *
     * <p>Returns the total amount of time (in milliseconds) that has elapsed
     * from the start of the crawl until the current time (or, if the crawl
     * has ended, until the end of the crawl), <b>minus</b> any time spent
     * paused.
     *
     * @return Total amount of time (in msec.) spent crawling so far.
     */
    public long getCrawlerTotalElapsedTime();

    /**
     * Returns an estimate of recent document download rates
     * based on a queue of recently seen CrawlURIs (as of last snapshot).
     *
     * @return The rate per second of documents gathered during the last
     * snapshot
     */
    public double currentProcessedDocsPerSec();

    /**
     * Returns the number of documents that have been processed
     * per second over the life of the crawl (as of last snapshot)
     *
     * @return The rate per second of documents gathered so far
     */
    public double processedDocsPerSec();

    /**
     * Calculates the rate, in KB, at which data has been processed
     * over the life of the crawl (as of last snapshot).
     *
     * @return The rate per second of KB gathered so far
     */
    public long processedKBPerSec();

    /**
     * Calculates an estimate of the rate, in KB, at which documents
     * are currently being processed by the crawler. For more
     * accurate estimates, set a larger queue size, or gather
     * and average multiple values (as of last snapshot).
     *
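     * <p>A minimal averaging sketch (the {@code tracker} reference, sample
     * count, and sleep interval are illustrative assumptions only):
     * <pre>
     * int sum = 0;
     * int samples = 5;
     * for (int i = 0; i &lt; samples; i++) {
     *     sum += tracker.currentProcessedKBPerSec();
     *     Thread.sleep(20000); // assumed snapshot interval; may throw InterruptedException
     * }
     * int avgKBPerSec = sum / samples;
     * </pre>
     *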
     * @return The rate per second of KB gathered during the last snapshot
     */
    public int currentProcessedKBPerSec();

    /**
     * Get the number of active (non-paused) threads.
     *
     * @return The number of active (non-paused) threads
     */
    public int activeThreadCount();

    /**
     * Number of <i>successfully</i> processed URIs.
     *
     * <p>If the crawl is not running (paused or stopped), this will return
     * the value of the last snapshot.
     *
     * @return The number of successfully fetched URIs
     *
     * @see org.archive.crawler.framework.Frontier#succeededFetchCount()
     */
    public long successfullyFetchedCount();

    /**
     * @return Total number of URIs (processed + queued +
     * currently being processed)
     */
    public long totalCount();

    /** @return the most recent congestion-ratio figure */
    public float congestionRatio();
    /** @return the depth of the deepest queue */
    public long deepestUri();
    /** @return the average queue depth */
    public long averageDepth();

    /**
     * Get a SeedRecord iterator for the job being monitored. If the job is
     * no longer running, stored values will be returned. If the job is
     * running, the current seed iterator will be fetched and stored values
     * will be updated.
     * <p>
     * Sort order is:<br>
     * No status code (not processed)<br>
     * Status codes smaller than 0 (largest to smallest)<br>
     * Status codes larger than 0 (largest to smallest)<br>
     * <p>
     * <b>Note:</b> This iterator will iterate over a list of
     * <i>SeedRecords</i>.
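     * <p>A minimal consumption sketch (pre-generics cast; the {@code tracker}
     * reference and the SeedRecord accessor names are assumptions):
     * <pre>
     * Iterator iter = tracker.getSeedRecordsSortedByStatusCode();
     * while (iter.hasNext()) {
     *     SeedRecord sr = (SeedRecord) iter.next();
     *     System.out.println(sr.getStatusCode() + " " + sr.getUri());
     * }
     * </pre>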
     * @return the seed iterator
     */
    public Iterator getSeedRecordsSortedByStatusCode();

    /**
     * @return legend of progress-statistics
     */
    public String progressStatisticsLegend();

    /**
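     * <p>The legend describes the fields of each line, so the two are
     * typically paired when writing a progress log (a sketch; {@code tracker}
     * is a hypothetical reference):
     * <pre>
     * System.out.println(tracker.progressStatisticsLegend());
     * System.out.println(tracker.getProgressStatisticsLine());
     * </pre>
     *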
     * @return line of progress-statistics
     */
    public String getProgressStatisticsLine();

    /**
     * @return Map of progress-statistics.
     */
    public Map getProgressStatistics();
}