View Javadoc

1   /* Copyright (C) 2003 Internet Archive.
2    *
3    * This file is part of the Heritrix web crawler (crawler.archive.org).
4    *
5    * Heritrix is free software; you can redistribute it and/or modify
6    * it under the terms of the GNU Lesser Public License as published by
7    * the Free Software Foundation; either version 2.1 of the License, or
8    * any later version.
9    *
10   * Heritrix is distributed in the hope that it will be useful,
11   * but WITHOUT ANY WARRANTY; without even the implied warranty of
12   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13   * GNU Lesser Public License for more details.
14   *
15   * You should have received a copy of the GNU Lesser Public License
16   * along with Heritrix; if not, write to the Free Software
17   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
18   */
19  package org.archive.crawler.event;
20  
21  import org.archive.crawler.datamodel.CrawlURI;
22  
23  /***
24   * An interface for objects that want to be notified
25   * of a CrawlURI disposition (happens each time a
26   * curi has been through the processors).
27   * Classes implementing this interface can register with
28   * the CrawlController to receive these events.
29   * <p>
30   * This interface is to facilitate the gathering of
31   * statistics on a running crawl.
32   * <p>
33   * <b>WARNING:</b> One of these methods <i>will</i> be
34   * called for <b>each</b> CrawlURI that is processed.
35   * It is therefor imperative that the methods execute
36   * quickly!
37   * <p>
38   * Also note that the object implementing this interface
39   * must under <b>no circumstances</b> maintain a reference
40   * to the CrawlURI beyond the scope of the relevant method
41   * body!
42   *
43   * @author Kristinn Sigurdsson
44   *
45   * @see org.archive.crawler.framework.CrawlController
46   */
47  public interface CrawlURIDispositionListener
48  {
49      /***
50       * Notification of a successfully crawled URI
51       *
52       * @param curi The relevant CrawlURI
53       */
54      public void crawledURISuccessful(CrawlURI curi);
55  
56      /***
57       * Notification of a failed crawl of a URI that
58       * will be retried (failure due to possible transient
59       * problems).
60       *
61       * @param curi The relevant CrawlURI
62       */
63      public void crawledURINeedRetry(CrawlURI curi);
64  
65      /***
66       * Notification of a crawled URI that is to be disregarded.
67       * Usually this means that the robots.txt file for the
68       * relevant site forbids this from being crawled and we are
69       * therefor not going to keep it.  Other reasons may apply.
70       * In all cases this means that it <i>was</i> successfully
71       * downloaded but will not be stored.
72       *
73       * @param curi The relevant CrawlURI
74       */
75      public void crawledURIDisregard(CrawlURI curi);
76  
77      /***
78       * Notification of a failed crawling of a URI. The failure
79       * is of a type that precludes retries (either by it's very
80       * nature or because it has been retried to many times)
81       *
82       * @param curi The relevant CrawlURI
83       */
84      public void crawledURIFailure(CrawlURI curi);
85  
86  }