1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25 package org.archive.crawler.framework;
26
27 import java.io.IOException;
28 import java.util.ArrayList;
29
30 import org.archive.crawler.datamodel.CandidateURI;
31 import org.archive.crawler.datamodel.CrawlSubstats;
32 import org.archive.crawler.datamodel.CrawlURI;
33 import org.archive.crawler.framework.exceptions.EndedException;
34 import org.archive.crawler.framework.exceptions.FatalConfigurationException;
35 import org.archive.crawler.framework.exceptions.InvalidFrontierMarkerException;
36 import org.archive.crawler.frontier.FrontierJournal;
37 import org.archive.net.UURI;
38 import org.archive.util.Reporter;
39
40
41 /***
42 * An interface for URI Frontiers.
43 *
44 * <p>A URI Frontier is a pluggable module in Heritrix that maintains the
45 * internal state of the crawl. This includes (but is not limited to):
46 * <ul>
47 * <li>What URIs have been discovered
48 * <li>What URIs are being processed (fetched)
49 * <li>What URIs have been processed
50 * <li>In what order unprocessed URIs will be processed
51 * </ul>
52 *
53 * <p>The Frontier is also responsible for enforcing any politeness restrictions
54 * that may have been applied to the crawl. Such as limiting simultaneous
55 * connection to the same host, server or IP number to 1 (or any other fixed
56 * amount), delays between connections etc.
57 *
58 * <p>A URIFrontier is created by the
59 * {@link org.archive.crawler.framework.CrawlController CrawlController} which
60 * is in turn responsible for providing access to it. Most significant among
61 * those modules interested in the Frontier are the
62 * {@link org.archive.crawler.framework.ToeThread ToeThreads} who perform the
63 * actual work of processing a URI.
64 *
65 * <p>The methods defined in this interface are those required to get URIs for
66 * processing, report the results of processing back (ToeThreads) and to get
67 * access to various statistical data along the way. The statistical data is
68 * of interest to {@link org.archive.crawler.framework.StatisticsTracking
69 * Statistics Tracking} modules. A couple of additional methods are provided
70 * to be able to inspect and manipulate the Frontier at runtime.
71 *
72 * <p>The statistical data exposed by this interface is:
73 * <ul>
74 * <li> {@link #discoveredUriCount() Discovered URIs}
75 * <li> {@link #queuedUriCount() Queued URIs}
76 * <li> {@link #finishedUriCount() Finished URIs}
77 * <li> {@link #succeededFetchCount() Successfully processed URIs}
78 * <li> {@link #failedFetchCount() Failed to process URIs}
79 * <li> {@link #disregardedUriCount() Disregarded URIs}
80 * <li> {@link #totalBytesWritten() Total bytes written}
81 * </ul>
82 *
83 * <p>In addition the frontier may optionally implement an interface that
84 * exposes information about hosts.
85 *
86 * <p>Furthermore any implementation of the URI Frontier should trigger
87 * {@link org.archive.crawler.event.CrawlURIDispositionListener
88 * CrawlURIDispostionEvents} by invoking the proper methods on the
89 * {@link org.archive.crawler.framework.CrawlController CrawlController}.
90 * Doing this allows a custom built
91 * {@link org.archive.crawler.framework.StatisticsTracking
92 * Statistics Tracking} module to gather any other additional data it might be
93 * interested in by examining the completed URIs.
94 *
95 * <p>All URI Frontiers inherit from
96 * {@link org.archive.crawler.settings.ModuleType ModuleType}
97 * and therefore creating settings follows the usual pattern of pluggable modules
98 * in Heritrix.
99 *
100 * @author Gordon Mohr
101 * @author Kristinn Sigurdsson
102 *
103 * @see org.archive.crawler.framework.CrawlController
104 * @see org.archive.crawler.framework.CrawlController#fireCrawledURIDisregardEvent(CrawlURI)
105 * @see org.archive.crawler.framework.CrawlController#fireCrawledURIFailureEvent(CrawlURI)
106 * @see org.archive.crawler.framework.CrawlController#fireCrawledURINeedRetryEvent(CrawlURI)
107 * @see org.archive.crawler.framework.CrawlController#fireCrawledURISuccessfulEvent(CrawlURI)
108 * @see org.archive.crawler.framework.StatisticsTracking
109 * @see org.archive.crawler.framework.ToeThread
110 * @see org.archive.crawler.framework.FrontierHostStatistics
111 * @see org.archive.crawler.settings.ModuleType
112 */
113 public interface Frontier extends Reporter {
114 /***
115 * All URI Frontiers should have the same 'name' attribute. This constant
116 * defines that name. This is a name used to reference the Frontier being
117 * used in a given crawl order and since there can only be one Frontier
118 * per crawl order a fixed, unique name for Frontiers is optimal.
119 *
120 * @see org.archive.crawler.settings.ModuleType#ModuleType(String)
121 */
122 public static final String ATTR_NAME = "frontier";
123
124 /***
125 * Initialize the Frontier.
126 *
127 * <p> This method is invoked by the CrawlController once it has
128 * created the Frontier. The constructor of the Frontier should
129 * only contain code for setting up it's settings framework. This
130 * method should contain all other 'startup' code.
131 *
132 * @param c The CrawlController that created the Frontier.
133 *
134 * @throws FatalConfigurationException If provided settings are illegal or
135 * otherwise unusable.
136 * @throws IOException If there is a problem reading settings or seeds file
137 * from disk.
138 */
139 public void initialize(CrawlController c)
140 throws FatalConfigurationException, IOException;
141
142 /***
143 * Get the next URI that should be processed. If no URI becomes availible
144 * during the time specified null will be returned.
145 *
146 * @return the next URI that should be processed.
147 * @throws InterruptedException
148 * @throws EndedException
149 */
150 CrawlURI next() throws InterruptedException, EndedException;
151
152 /***
153 * Returns true if the frontier contains no more URIs to crawl.
154 *
155 * <p>That is to say that there are no more URIs either currently availible
156 * (ready to be emitted), URIs belonging to deferred hosts or pending URIs
157 * in the Frontier. Thus this method may return false even if there is no
158 * currently availible URI.
159 *
160 * @return true if the frontier contains no more URIs to crawl.
161 */
162 boolean isEmpty();
163
164 /***
165 * Schedules a CandidateURI.
166 *
167 * <p>This method accepts one URI and schedules it immediately. This has
168 * nothing to do with the priority of the URI being scheduled. Only that
169 * it will be placed in it's respective queue at once. For priority
170 * scheduling see {@link CandidateURI#setSchedulingDirective(int)}
171 *
172 * <p>This method should be synchronized in all implementing classes.
173 *
174 * @param caURI The URI to schedule.
175 *
176 * @see CandidateURI#setSchedulingDirective(int)
177 */
178 public void schedule(CandidateURI caURI);
179
180 /***
181 * Report a URI being processed as having finished processing.
182 *
183 * <p>ToeThreads will invoke this method once they have completed work on
184 * their assigned URI.
185 *
186 * <p>This method is synchronized.
187 *
188 * @param cURI The URI that has finished processing.
189 */
190 public void finished(CrawlURI cURI);
191
192 /***
193 * Number of <i>discovered</i> URIs.
194 *
195 * <p>That is any URI that has been confirmed be within 'scope'
196 * (i.e. the Frontier decides that it should be processed). This
197 * includes those that have been processed, are being processed
198 * and have finished processing. Does not include URIs that have
199 * been 'forgotten' (deemed out of scope when trying to fetch,
200 * most likely due to operator changing scope definition).
201 *
202 * <p><b>Note:</b> This only counts discovered URIs. Since the same
203 * URI can (at least in most frontiers) be fetched multiple times, this
204 * number may be somewhat lower then the combined <i>queued</i>,
205 * <i>in process</i> and <i>finished</i> items combined due to duplicate
206 * URIs being queued and processed. This variance is likely to be especially
207 * high in Frontiers implementing 'revist' strategies.
208 *
209 * @return Number of discovered URIs.
210 */
211 public long discoveredUriCount();
212
213 /***
214 * Number of URIs <i>queued</i> up and waiting for processing.
215 *
216 * <p>This includes any URIs that failed but will be retried. Basically this
217 * is any <i>discovered</i> URI that has not either been processed or is
218 * being processed. The same discovered URI can be queued multiple times.
219 *
220 * @return Number of queued URIs.
221 */
222 public long queuedUriCount();
223
224 public long deepestUri();
225 public long averageDepth();
226 public float congestionRatio();
227
228 /***
229 * Number of URIs that have <i>finished</i> processing.
230 *
231 * <p>Includes both those that were processed successfully and failed to be
232 * processed (excluding those that failed but will be retried). Does not
233 * include those URIs that have been 'forgotten' (deemed out of scope when
234 * trying to fetch, most likely due to operator changing scope definition).
235 *
236 * @return Number of finished URIs.
237 */
238 public long finishedUriCount();
239
240 /***
241 * Number of <i>successfully</i> processed URIs.
242 *
243 * <p>Any URI that was processed successfully. This includes URIs that
244 * returned 404s and other error codes that do not originate within the
245 * crawler.
246 *
247 * @return Number of <i>successfully</i> processed URIs.
248 */
249 public long succeededFetchCount();
250
251 /***
252 * Number of URIs that <i>failed</i> to process.
253 *
254 * <p>URIs that could not be processed because of some error or failure in
255 * the processing chain. Can include failure to acquire prerequisites, to
256 * establish a connection with the host and any number of other problems.
257 * Does not count those that will be retried, only those that have
258 * permenantly failed.
259 *
260 * @return Number of URIs that failed to process.
261 */
262 public long failedFetchCount();
263
264 /***
265 * Number of URIs that were scheduled at one point but have been
266 * <i>disregarded</i>.
267 *
268 * <p>Counts any URI that is scheduled only to be disregarded
269 * because it is determined to lie outside the scope of the crawl. Most
270 * commonly this will be due to robots.txt exclusions.
271 *
272 * @return The number of URIs that have been disregarded.
273 */
274 public long disregardedUriCount();
275
276 /***
277 * Total number of bytes contained in all URIs that have been processed.
278 *
279 * @return The total amounts of bytes in all processed URIs.
280 * @deprecated misnomer; consult StatisticsTracker instead
281 */
282 public long totalBytesWritten();
283
284 /***
285 * Recover earlier state by reading a recovery log.
286 *
287 * <p>Some Frontiers are able to write detailed logs that can be loaded
288 * after a system crash to recover the state of the Frontier prior to the
289 * crash. This method is the one used to achive this.
290 *
291 * @param pathToLog The name (with full path) of the recover log.
292 * @param retainFailures If true, failures in log should count as
293 * having been included. (If false, failures will be ignored, meaning
294 * the corresponding URIs will be retried in the recovered crawl.)
295 * @throws IOException If problems occur reading the recover log.
296 */
297 public void importRecoverLog(String pathToLog, boolean retainFailures)
298 throws IOException;
299
300 /***
301 * Get a <code>URIFrontierMarker</code> initialized with the given
302 * regular expression at the 'start' of the Frontier.
303 * @param regexpr The regular expression that URIs within the frontier must
304 * match to be considered within the scope of this marker
305 * @param inCacheOnly If set to true, only those URIs within the frontier
306 * that are stored in cache (usually this means in memory
307 * rather then on disk, but that is an implementation
308 * detail) will be considered. Others will be entierly
309 * ignored, as if they dont exist. This is usefull for quick
310 * peeks at the top of the URI list.
311 * @return A URIFrontierMarker that is set for the 'start' of the frontier's
312 * URI list.
313 */
314 public FrontierMarker getInitialMarker(String regexpr,
315 boolean inCacheOnly);
316
317 /***
318 * Returns a list of all uncrawled URIs starting from a specified marker
319 * until <code>numberOfMatches</code> is reached.
320 *
321 * <p>Any encountered URI that has not been successfully crawled, terminally
322 * failed, disregarded or is currently being processed is included. As
323 * there may be duplicates in the frontier, there may also be duplicates
324 * in the report. Thus this includes both discovered and pending URIs.
325 *
326 * <p>The list is a set of strings containing the URI strings. If verbose is
327 * true the string will include some additional information (path to URI
328 * and parent).
329 *
330 * <p>The <code>URIFrontierMarker</code> will be advanced to the position at
331 * which it's maximum number of matches found is reached. Reusing it for
332 * subsequent calls will thus effectively get the 'next' batch. Making
333 * any changes to the frontier can invalidate the marker.
334 *
335 * <p>While the order returned is consistent, it does <i>not</i> have any
336 * explicit relation to the likely order in which they may be processed.
337 *
338 * <p><b>Warning:</b> It is unsafe to make changes to the frontier while
339 * this method is executing. The crawler should be in a paused state before
340 * invoking it.
341 *
342 * @param marker
343 * A marker specifing from what position in the Frontier the
344 * list should begin.
345 * @param numberOfMatches
346 * how many URIs to add at most to the list before returning it
347 * @param verbose
348 * if set to true the strings returned will contain additional
349 * information about each URI beyond their names.
350 * @return a list of all pending URIs falling within the specification
351 * of the marker
352 * @throws InvalidFrontierMarkerException when the
353 * <code>URIFronterMarker</code> does not match the internal
354 * state of the frontier. Tolerance for this can vary
355 * considerably from one URIFrontier implementation to the next.
356 * @see FrontierMarker
357 * @see #getInitialMarker(String, boolean)
358 */
359 public ArrayList<String> getURIsList(FrontierMarker marker,
360 int numberOfMatches,
361 boolean verbose)
362 throws InvalidFrontierMarkerException;
363
364 /***
365 * Delete any URI that matches the given regular expression from the list
366 * of discovered and pending URIs. This does not prevent them from being
367 * rediscovered.
368 *
369 * <p>Any encountered URI that has not been successfully crawled, terminally
370 * failed, disregarded or is currently being processed is considered to be
371 * a pending URI.
372 *
373 * <p><b>Warning:</b> It is unsafe to make changes to the frontier while
374 * this method is executing. The crawler should be in a paused state before
375 * invoking it.
376 *
377 * @param match A regular expression, any URIs that matches it will be
378 * deleted.
379 * @return The number of URIs deleted
380 */
381 public long deleteURIs(String match);
382
383 /***
384 * Delete any URI that matches the given regular expression from the list
385 * of discovered and pending URIs, if it is in a queue with a name matching
386 * the second regular expression. This does not prevent them from being
387 * rediscovered.
388 *
389 * <p>Any encountered URI that has not been successfully crawled, terminally
390 * failed, disregarded or is currently being processed is considered to be
391 * a pending URI.
392 *
393 * <p><b>Warning:</b> It is unsafe to make changes to the frontier while
394 * this method is executing. The crawler should be in a paused state before
395 * invoking it.
396 *
397 * @param uriMatch A regular expression, any URIs that matches will be
398 * deleted from the affected queues.
399 * @param queueMatch A regular expression, any queues matching will have
400 * their URIs checked. A null value means all queues.
401 * @return The number of URIs deleted
402 */
403 public long deleteURIs(String uriMatch, String queueMatch);
404
405 /***
406 * Notify Frontier that a CrawlURI has been deleted outside of the
407 * normal next()/finished() lifecycle.
408 *
409 * @param curi Deleted CrawlURI.
410 */
411 public void deleted(CrawlURI curi);
412
413 /***
414 * Notify Frontier that it should consider the given UURI as if
415 * already scheduled.
416 *
417 * @param u UURI instance to add to the Already Included set.
418 */
419 public void considerIncluded(UURI u);
420
421 /***
422 * Notify Frontier that it should consider updating configuration
423 * info that may have changed in external files.
424 */
425 public void kickUpdate();
426
427 /***
428 * Notify Frontier that it should not release any URIs, instead
429 * holding all threads, until instructed otherwise.
430 */
431 public void pause();
432
433 /***
434 * Resumes the release of URIs to crawl, allowing worker
435 * ToeThreads to proceed.
436 */
437 public void unpause();
438
439 /***
440 * Notify Frontier that it should end the crawl, giving
441 * any worker ToeThread that askss for a next() an
442 * EndedException.
443 */
444 public void terminate();
445
446 /***
447 * @return Return the instance of {@link FrontierJournal} that
448 * this Frontier is using. May be null if no journaling.
449 */
450 public FrontierJournal getFrontierJournal();
451
452 /***
453 * @param cauri CandidateURI for which we're to calculate and
454 * set class key.
455 * @return Classkey for <code>cauri</code>.
456 */
457 public String getClassKey(CandidateURI cauri);
458
459 /***
460 * Request that the Frontier load (or reload) crawl seeds,
461 * typically by contacting the Scope.
462 */
463 public void loadSeeds();
464
465 /***
466 * Request that Frontier allow crawling to begin. Usually
467 * just unpauses Frontier, if paused.
468 */
469 public void start();
470
471 /***
472 * Get the 'frontier group' (usually queue) for the given
473 * CrawlURI.
474 * @param curi CrawlURI to find matching group
475 * @return FrontierGroup for the CrawlURI
476 */
477 public FrontierGroup getGroup(CrawlURI curi);
478
479 /***
480 * Generic interface representing the internal groupings
481 * of a Frontier's URIs -- usually queues. Currently only
482 * offers the HasCrawlSubstats interface.
483 */
484 public interface FrontierGroup extends CrawlSubstats.HasCrawlSubstats {
485
486 }
487
488 /***
489 * Perform any final tasks *before* notification crawl has
490 * reached 'FINISHED' status. (For example, anything that needs to
491 * dump final data to disk/logs.)
492 */
493 public void finalTasks();
494 }