View Javadoc

1   /* Copyright (C) 2003 Internet Archive.
2    *
3    * This file is part of the Heritrix web crawler (crawler.archive.org).
4    *
5    * Heritrix is free software; you can redistribute it and/or modify
6    * it under the terms of the GNU Lesser Public License as published by
7    * the Free Software Foundation; either version 2.1 of the License, or
8    * any later version.
9    *
10   * Heritrix is distributed in the hope that it will be useful,
11   * but WITHOUT ANY WARRANTY; without even the implied warranty of
12   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13   * GNU Lesser Public License for more details.
14   *
15   * You should have received a copy of the GNU Lesser Public License
16   * along with Heritrix; if not, write to the Free Software
17   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
18   */
19  package org.archive.crawler.framework;
20  
21  import java.io.Serializable;
22  import java.util.Date;
23  import java.util.EventObject;
24  import java.util.logging.Level;
25  
26  import javax.management.AttributeNotFoundException;
27  
28  import org.archive.crawler.event.CrawlStatusListener;
29  import org.archive.crawler.framework.exceptions.FatalConfigurationException;
30  import org.archive.crawler.settings.ModuleType;
31  import org.archive.crawler.settings.SimpleType;
32  import org.archive.crawler.settings.Type;
33  import org.archive.util.ArchiveUtils;
34  import org.archive.util.PaddingStringBuffer;
35  import org.xbill.DNS.DClass;
36  import org.xbill.DNS.Lookup;
37  
38  /***
39   * A partial implementation of the StatisticsTracking interface.
40   * <p>
41   * It covers the thread handling. (Launching, pausing etc.)  Included in this is
42   * keeping track of the total time spent (actually) crawling.  Several methods
43   * to access the time started, finished etc. are provided.
44   * <p>
45   * To handle the thread work the class implements the CrawlStatusListener and
46   * uses it's events to pause, resume and stop logging of statistics. The run()
47   * method will call logActivity() at intervals specified in the crawl order.
48   * <p>
49   * Implementation of logActivity (the actual logging) as well as listening for
50   * CrawlURIDisposition events is not addressed.
51   *
52   * @author Kristinn Sigurdsson
53   *
54   * @see org.archive.crawler.framework.StatisticsTracking
55   * @see org.archive.crawler.admin.StatisticsTracker
56   */
57  public abstract class AbstractTracker extends ModuleType
58  implements StatisticsTracking, CrawlStatusListener, Serializable {
59      /*** Default period between logging stat values */
60      public static final Integer DEFAULT_STATISTICS_REPORT_INTERVAL =
61          new Integer(20);
62      /*** Attribute name for logging interval in seconds setting
63       */
64      public static final String ATTR_STATS_INTERVAL = "interval-seconds";
65  
66      /*** A reference to the CrawlContoller of the crawl that we are to track
67       * statistics for.
68       */
69      protected transient CrawlController controller;
70  
71      // Keep track of time.
72      protected long crawlerStartTime;
73      protected long crawlerEndTime = -1; // Until crawl ends, this value is -1.
74      protected long crawlerPauseStarted = 0;
75      protected long crawlerTotalPausedTime = 0;
76  
77      /*** Timestamp of when this logger last wrote something to the log */
78      protected long lastLogPointTime;
79  
80      volatile protected boolean shouldrun = true;
81  
82      /***
83       * @param name
84       * @param description
85       */
86      public AbstractTracker(String name, String description) {
87          super(name, description);
88          Type e = addElementToDefinition(new SimpleType(ATTR_STATS_INTERVAL,
89                  "The interval between writing progress information to log.",
90                  DEFAULT_STATISTICS_REPORT_INTERVAL));
91          e.setOverrideable(false);
92      }
93  
94      /***
95       * Sets up the Logger (including logInterval) and registers with the
96       * CrawlController for CrawlStatus and CrawlURIDisposition events.
97       *
98       * @param c A crawl controller instance.
99       * @throws FatalConfigurationException Not thrown here. For overrides that
100      * go to settings system for configuration.
101      * @see CrawlStatusListener
102      * @see org.archive.crawler.event.CrawlURIDispositionListener
103      */
104     public void initialize(CrawlController c)
105     throws FatalConfigurationException {
106         this.controller = c;
107 
108         // Add listeners
109         this.controller.addCrawlStatusListener(this);
110     }
111     
112     /***
113      * Start thread.  Will call logActivity() at intervals specified by
114      * logInterval
115      *
116      */
117     public void run() {
118         // Don't start logging if we have no logger
119         if (this.controller == null) {
120             return;
121         }
122 
123         shouldrun = true; //If we are starting, this should always be true.
124 
125         // Log the legend
126         this.controller.logProgressStatistics(progressStatisticsLegend());
127         lastLogPointTime = System.currentTimeMillis(); // The first interval begins now.
128 
129         // Keep logging until someone calls stop()
130         while (shouldrun) {
131             // Pause before writing the first entry (so we have real numbers)
132             // and then pause between entries
133             try {
134                 Thread.sleep(getLogWriteInterval() * 1000);
135             } catch (InterruptedException e) {
136                 e.printStackTrace();
137                 controller.runtimeErrors.log(Level.INFO,
138                     "Periodic stat logger interrupted while sleeping.");
139             }
140 
141             // In case stop() was invoked while the thread was sleeping or we
142             // are paused.
143             if (shouldrun && getCrawlPauseStartedTime() == 0) {
144                 progressStatisticsEvent(new EventObject(this));
145             }
146         }
147     }
148 
149     /***
150      * @return legend for progress-statistics lines/log
151      */
152     public String progressStatisticsLegend() {
153         return "           timestamp" +
154             "  discovered   " +
155             "   queued   downloaded       doc/s(avg)  KB/s(avg) " +
156             "  dl-failures   busy-thread   mem-use-KB  heap-size-KB " +
157             "  congestion   max-depth   avg-depth";
158     }
159 
160     /***
161      * Notify tracker that crawl has begun. Must be called
162      * outside tracker's own thread, to ensure it is noted
163      * before other threads start interacting with tracker. 
164      */
165     public void noteStart() {
166         if (this.crawlerStartTime == 0) {
167             // Note the time the crawl starts (only if not already set)
168             this.crawlerStartTime = System.currentTimeMillis();
169         }
170     }
171 
172     /***
173      * A method for logging current crawler state.
174      *
175      * This method will be called by run() at intervals specified in
176      * the crawl order file.  It is also invoked when pausing or
177      * stopping a crawl to capture the state at that point.  Default behavior is
178      * call to {@link CrawlController#logProgressStatistics} so CrawlController
179      * can act on progress statistics event.
180      * <p>
181      * It is recommended that for implementations of this method it be
182      * carefully considered if it should be synchronized in whole or in
183      * part
184      * @param e Progress statistics event.
185      */
186     protected synchronized void progressStatisticsEvent(final EventObject e) {
187         this.controller.progressStatisticsEvent(e);
188         // temporary workaround for 
189         // [ 996161 ] Fix DNSJava issues (memory) -- replace with JNDI-DNS?
190         // http://sourceforge.net/support/tracker.php?aid=996161
191         Lookup.getDefaultCache(DClass.IN).clearCache();
192     }
193 
194     /***
195      * Get the starting time of the crawl (as given by
196      * <code>System.currentTimeMillis()</code> when the crawl started).
197      * @return time fo the crawl's start
198      */
199     public long getCrawlStartTime() {
200         return this.crawlerStartTime;
201     }
202 
203     /***
204      * If crawl has ended it will return the time it ended (given by
205      * <code>System.currentTimeMillis()</code> at that time).
206      * <br>
207      * If crawl is still going on it will return the same as
208      * <code>System.currentTimeMillis()</code> at the time of the call.
209      * @return The time of the crawl ending or the current time if the crawl has
210      *         not ended.
211      */
212     public long getCrawlEndTime() {
213         return (this.crawlerEndTime == -1)?
214             System.currentTimeMillis(): this.crawlerEndTime;
215     }
216 
217     /***
218      * Returns the number of milliseconds that the crawl spent paused or
219      * otherwise in a nonactive state.
220      * @return the number of msec. that the crawl was paused or otherwise
221      *         suspended.
222      */
223     public long getCrawlTotalPauseTime() {
224         return this.crawlerTotalPausedTime;
225     }
226 
227     /***
228      * Get the time when the the crawl was last paused/suspended (as given by
229      * <code>System.currentTimeMillis()</code> at that time). Will be 0 if the
230      * crawl is not currently paused.
231      * @return time of the crawl's last pause/suspend or 0 if the crawl is not
232      *         currently paused.
233      */
234     public long getCrawlPauseStartedTime() {
235         return this.crawlerPauseStarted;
236     }
237 
238     public long getCrawlerTotalElapsedTime() {
239         if (getCrawlStartTime() == 0) {
240             // if no start time set yet, consider elapsed time zero
241             return 0;
242         }
243         
244         return (getCrawlPauseStartedTime() != 0)?
245             // Are currently paused, calculate time up to last pause
246             (getCrawlPauseStartedTime() - getCrawlTotalPauseTime() -
247                 getCrawlStartTime()):
248             // Not paused, calculate total time.
249             (getCrawlEndTime() - getCrawlTotalPauseTime() - getCrawlStartTime());
250     }
251 
252     /***
253      * The number of seconds to wait between writing snapshot data to log file.
254      * @return the number of seconds to wait between writing snapshot data to
255      * log file.
256      */
257     protected int getLogWriteInterval() {
258         int logInterval;
259         try {
260             logInterval =
261                 ((Integer) getAttribute(null, ATTR_STATS_INTERVAL)).intValue();
262         } catch (AttributeNotFoundException e) {
263             logInterval = 10;
264         }
265         return logInterval;
266     }
267 
268     /***
269      * @see org.archive.crawler.event.CrawlStatusListener#crawlPausing(java.lang.String)
270      */
271     public void crawlPausing(String statusMessage) {
272         logNote("CRAWL WAITING - " + statusMessage);
273     }
274 
275     protected void logNote(final String note) {
276         this.controller.logProgressStatistics(new PaddingStringBuffer()
277                      .append(ArchiveUtils.get14DigitDate())
278                      .append(" ")
279                      .append(note)
280                      .toString());
281     }
282 
283     public void crawlPaused(String statusMessage) {
284         crawlerPauseStarted = System.currentTimeMillis();
285         progressStatisticsEvent(new EventObject(this));
286         logNote("CRAWL PAUSED - " + statusMessage);
287     }
288 
289     public void crawlResuming(String statusMessage) {
290         tallyCurrentPause();
291         logNote("CRAWL RESUMED - " + statusMessage);
292         lastLogPointTime = System.currentTimeMillis();
293     }
294 
295     /***
296      * For a current pause (if any), add paused time to total and reset
297      */
298     protected void tallyCurrentPause() {
299         if (this.crawlerPauseStarted > 0) {
300             // Ok, we managed to actually pause before resuming.
301             this.crawlerTotalPausedTime
302                 += (System.currentTimeMillis() - this.crawlerPauseStarted);
303         }
304         this.crawlerPauseStarted = 0;
305     }
306 
307     public void crawlEnding(String sExitMessage) {
308         logNote("CRAWL ENDING - " + sExitMessage);
309     }
310 
311     /***
312      * @see org.archive.crawler.event.CrawlStatusListener#crawlEnded(java.lang.String)
313      */
314     public void crawlEnded(String sExitMessage) {
315         // Note the time when the crawl stops.
316         crawlerEndTime = System.currentTimeMillis();
317         progressStatisticsEvent(new EventObject(this));
318         logNote("CRAWL ENDED - " + sExitMessage);
319         shouldrun = false;
320         dumpReports();
321         finalCleanup();
322     }
323 
324     public void crawlStarted(String message) {
325         tallyCurrentPause();
326         noteStart();
327     }
328     
329     /***
330      * Dump reports, if any, on request or at crawl end. 
331      */
332     protected void dumpReports() {
333         // by default do nothing; subclasses may override
334     }
335 
336     /***
337      * Cleanup resources used, at crawl end. 
338      */
339     protected void finalCleanup() {
340         controller = null; // Facilitate GC.
341     }
342 
343     /***
344      * @see org.archive.crawler.framework.StatisticsTracking#crawlDuration()
345      */
346     public long crawlDuration() {
347         return getCrawlerTotalElapsedTime();
348     }
349 }