1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19 package org.archive.crawler.framework;
20
21 import java.io.Serializable;
22 import java.util.Date;
23 import java.util.EventObject;
24 import java.util.logging.Level;
25
26 import javax.management.AttributeNotFoundException;
27
28 import org.archive.crawler.event.CrawlStatusListener;
29 import org.archive.crawler.framework.exceptions.FatalConfigurationException;
30 import org.archive.crawler.settings.ModuleType;
31 import org.archive.crawler.settings.SimpleType;
32 import org.archive.crawler.settings.Type;
33 import org.archive.util.ArchiveUtils;
34 import org.archive.util.PaddingStringBuffer;
35 import org.xbill.DNS.DClass;
36 import org.xbill.DNS.Lookup;
37
38 /***
39 * A partial implementation of the StatisticsTracking interface.
40 * <p>
41 * It covers the thread handling. (Launching, pausing etc.) Included in this is
42 * keeping track of the total time spent (actually) crawling. Several methods
43 * to access the time started, finished etc. are provided.
44 * <p>
45 * To handle the thread work the class implements the CrawlStatusListener and
46 * uses it's events to pause, resume and stop logging of statistics. The run()
47 * method will call logActivity() at intervals specified in the crawl order.
48 * <p>
49 * Implementation of logActivity (the actual logging) as well as listening for
50 * CrawlURIDisposition events is not addressed.
51 *
52 * @author Kristinn Sigurdsson
53 *
54 * @see org.archive.crawler.framework.StatisticsTracking
55 * @see org.archive.crawler.admin.StatisticsTracker
56 */
57 public abstract class AbstractTracker extends ModuleType
58 implements StatisticsTracking, CrawlStatusListener, Serializable {
59 /*** Default period between logging stat values */
60 public static final Integer DEFAULT_STATISTICS_REPORT_INTERVAL =
61 new Integer(20);
62 /*** Attribute name for logging interval in seconds setting
63 */
64 public static final String ATTR_STATS_INTERVAL = "interval-seconds";
65
66 /*** A reference to the CrawlContoller of the crawl that we are to track
67 * statistics for.
68 */
69 protected transient CrawlController controller;
70
71
72 protected long crawlerStartTime;
73 protected long crawlerEndTime = -1;
74 protected long crawlerPauseStarted = 0;
75 protected long crawlerTotalPausedTime = 0;
76
77 /*** Timestamp of when this logger last wrote something to the log */
78 protected long lastLogPointTime;
79
80 volatile protected boolean shouldrun = true;
81
82 /***
83 * @param name
84 * @param description
85 */
86 public AbstractTracker(String name, String description) {
87 super(name, description);
88 Type e = addElementToDefinition(new SimpleType(ATTR_STATS_INTERVAL,
89 "The interval between writing progress information to log.",
90 DEFAULT_STATISTICS_REPORT_INTERVAL));
91 e.setOverrideable(false);
92 }
93
94 /***
95 * Sets up the Logger (including logInterval) and registers with the
96 * CrawlController for CrawlStatus and CrawlURIDisposition events.
97 *
98 * @param c A crawl controller instance.
99 * @throws FatalConfigurationException Not thrown here. For overrides that
100 * go to settings system for configuration.
101 * @see CrawlStatusListener
102 * @see org.archive.crawler.event.CrawlURIDispositionListener
103 */
104 public void initialize(CrawlController c)
105 throws FatalConfigurationException {
106 this.controller = c;
107
108
109 this.controller.addCrawlStatusListener(this);
110 }
111
112 /***
113 * Start thread. Will call logActivity() at intervals specified by
114 * logInterval
115 *
116 */
117 public void run() {
118
119 if (this.controller == null) {
120 return;
121 }
122
123 shouldrun = true;
124
125
126 this.controller.logProgressStatistics(progressStatisticsLegend());
127 lastLogPointTime = System.currentTimeMillis();
128
129
130 while (shouldrun) {
131
132
133 try {
134 Thread.sleep(getLogWriteInterval() * 1000);
135 } catch (InterruptedException e) {
136 e.printStackTrace();
137 controller.runtimeErrors.log(Level.INFO,
138 "Periodic stat logger interrupted while sleeping.");
139 }
140
141
142
143 if (shouldrun && getCrawlPauseStartedTime() == 0) {
144 progressStatisticsEvent(new EventObject(this));
145 }
146 }
147 }
148
149 /***
150 * @return legend for progress-statistics lines/log
151 */
152 public String progressStatisticsLegend() {
153 return " timestamp" +
154 " discovered " +
155 " queued downloaded doc/s(avg) KB/s(avg) " +
156 " dl-failures busy-thread mem-use-KB heap-size-KB " +
157 " congestion max-depth avg-depth";
158 }
159
160 /***
161 * Notify tracker that crawl has begun. Must be called
162 * outside tracker's own thread, to ensure it is noted
163 * before other threads start interacting with tracker.
164 */
165 public void noteStart() {
166 if (this.crawlerStartTime == 0) {
167
168 this.crawlerStartTime = System.currentTimeMillis();
169 }
170 }
171
172 /***
173 * A method for logging current crawler state.
174 *
175 * This method will be called by run() at intervals specified in
176 * the crawl order file. It is also invoked when pausing or
177 * stopping a crawl to capture the state at that point. Default behavior is
178 * call to {@link CrawlController#logProgressStatistics} so CrawlController
179 * can act on progress statistics event.
180 * <p>
181 * It is recommended that for implementations of this method it be
182 * carefully considered if it should be synchronized in whole or in
183 * part
184 * @param e Progress statistics event.
185 */
186 protected synchronized void progressStatisticsEvent(final EventObject e) {
187 this.controller.progressStatisticsEvent(e);
188
189
190
191 Lookup.getDefaultCache(DClass.IN).clearCache();
192 }
193
194 /***
195 * Get the starting time of the crawl (as given by
196 * <code>System.currentTimeMillis()</code> when the crawl started).
197 * @return time fo the crawl's start
198 */
199 public long getCrawlStartTime() {
200 return this.crawlerStartTime;
201 }
202
203 /***
204 * If crawl has ended it will return the time it ended (given by
205 * <code>System.currentTimeMillis()</code> at that time).
206 * <br>
207 * If crawl is still going on it will return the same as
208 * <code>System.currentTimeMillis()</code> at the time of the call.
209 * @return The time of the crawl ending or the current time if the crawl has
210 * not ended.
211 */
212 public long getCrawlEndTime() {
213 return (this.crawlerEndTime == -1)?
214 System.currentTimeMillis(): this.crawlerEndTime;
215 }
216
217 /***
218 * Returns the number of milliseconds that the crawl spent paused or
219 * otherwise in a nonactive state.
220 * @return the number of msec. that the crawl was paused or otherwise
221 * suspended.
222 */
223 public long getCrawlTotalPauseTime() {
224 return this.crawlerTotalPausedTime;
225 }
226
227 /***
228 * Get the time when the the crawl was last paused/suspended (as given by
229 * <code>System.currentTimeMillis()</code> at that time). Will be 0 if the
230 * crawl is not currently paused.
231 * @return time of the crawl's last pause/suspend or 0 if the crawl is not
232 * currently paused.
233 */
234 public long getCrawlPauseStartedTime() {
235 return this.crawlerPauseStarted;
236 }
237
238 public long getCrawlerTotalElapsedTime() {
239 if (getCrawlStartTime() == 0) {
240
241 return 0;
242 }
243
244 return (getCrawlPauseStartedTime() != 0)?
245
246 (getCrawlPauseStartedTime() - getCrawlTotalPauseTime() -
247 getCrawlStartTime()):
248
249 (getCrawlEndTime() - getCrawlTotalPauseTime() - getCrawlStartTime());
250 }
251
252 /***
253 * The number of seconds to wait between writing snapshot data to log file.
254 * @return the number of seconds to wait between writing snapshot data to
255 * log file.
256 */
257 protected int getLogWriteInterval() {
258 int logInterval;
259 try {
260 logInterval =
261 ((Integer) getAttribute(null, ATTR_STATS_INTERVAL)).intValue();
262 } catch (AttributeNotFoundException e) {
263 logInterval = 10;
264 }
265 return logInterval;
266 }
267
268 /***
269 * @see org.archive.crawler.event.CrawlStatusListener#crawlPausing(java.lang.String)
270 */
271 public void crawlPausing(String statusMessage) {
272 logNote("CRAWL WAITING - " + statusMessage);
273 }
274
275 protected void logNote(final String note) {
276 this.controller.logProgressStatistics(new PaddingStringBuffer()
277 .append(ArchiveUtils.get14DigitDate())
278 .append(" ")
279 .append(note)
280 .toString());
281 }
282
283 public void crawlPaused(String statusMessage) {
284 crawlerPauseStarted = System.currentTimeMillis();
285 progressStatisticsEvent(new EventObject(this));
286 logNote("CRAWL PAUSED - " + statusMessage);
287 }
288
289 public void crawlResuming(String statusMessage) {
290 tallyCurrentPause();
291 logNote("CRAWL RESUMED - " + statusMessage);
292 lastLogPointTime = System.currentTimeMillis();
293 }
294
295 /***
296 * For a current pause (if any), add paused time to total and reset
297 */
298 protected void tallyCurrentPause() {
299 if (this.crawlerPauseStarted > 0) {
300
301 this.crawlerTotalPausedTime
302 += (System.currentTimeMillis() - this.crawlerPauseStarted);
303 }
304 this.crawlerPauseStarted = 0;
305 }
306
307 public void crawlEnding(String sExitMessage) {
308 logNote("CRAWL ENDING - " + sExitMessage);
309 }
310
311 /***
312 * @see org.archive.crawler.event.CrawlStatusListener#crawlEnded(java.lang.String)
313 */
314 public void crawlEnded(String sExitMessage) {
315
316 crawlerEndTime = System.currentTimeMillis();
317 progressStatisticsEvent(new EventObject(this));
318 logNote("CRAWL ENDED - " + sExitMessage);
319 shouldrun = false;
320 dumpReports();
321 finalCleanup();
322 }
323
324 public void crawlStarted(String message) {
325 tallyCurrentPause();
326 noteStart();
327 }
328
329 /***
330 * Dump reports, if any, on request or at crawl end.
331 */
332 protected void dumpReports() {
333
334 }
335
336 /***
337 * Cleanup resources used, at crawl end.
338 */
339 protected void finalCleanup() {
340 controller = null;
341 }
342
343 /***
344 * @see org.archive.crawler.framework.StatisticsTracking#crawlDuration()
345 */
346 public long crawlDuration() {
347 return getCrawlerTotalElapsedTime();
348 }
349 }