View Javadoc

1   /* FrontierHostStatistics
2    *
3    * $Id: FrontierHostStatistics.java 2509 2004-09-02 02:16:11Z gojomo $
4    *
5    * Created on Mar 30, 2004
6    *
7    * Copyright (C) 2004 Internet Archive.
8    *
9    * This file is part of the Heritrix web crawler (crawler.archive.org).
10   *
11   * Heritrix is free software; you can redistribute it and/or modify
12   * it under the terms of the GNU Lesser Public License as published by
13   * the Free Software Foundation; either version 2.1 of the License, or
14   * any later version.
15   *
16   * Heritrix is distributed in the hope that it will be useful,
17   * but WITHOUT ANY WARRANTY; without even the implied warranty of
18   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19   * GNU Lesser Public License for more details.
20   *
21   * You should have received a copy of the GNU Lesser Public License
22   * along with Heritrix; if not, write to the Free Software
23   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
24   */
25  package org.archive.crawler.framework;
26  
27  
/**
 * An optional interface that Frontiers can implement to provide information
 * about specific hosts.
 *
 * <p>Some Frontier implementations will want to provide a number of
 * statistics relating to the progress of particular hosts. This only applies
 * to those Frontiers whose internal structure uses hosts to split up the
 * workload and (for example) implement politeness. Some other Frontiers may
 * also provide this info based on calculations.
 *
 * <ul>
 *     <li> {@link #activeHosts() Active hosts}
 *     <li> {@link #inactiveHosts() Inactive hosts}
 *     <li> {@link #deferredHosts() Deferred hosts}
 *     <li> {@link #inProcessHosts() In process hosts}
 *     <li> {@link #readyHosts() Ready hosts}
 *     <li> {@link #hostStatus(String) Host status}
 * </ul>
 *
 * @author Kristinn Sigurdsson
 *
 * @see org.archive.crawler.framework.Frontier
 */
public interface FrontierHostStatistics {

    // Interface fields are implicitly public, static and final, and interface
    // methods are implicitly public, so those modifiers are omitted below.

    /**
     * Host has not been encountered by the Frontier, or has been encountered
     * but has been inactive so long that it has expired.
     */
    int HOST_UNKNOWN = 0;

    /** Host has URIs ready to be emitted. */
    int HOST_READY = 1;

    /** Host has URIs currently being processed. */
    int HOST_INPROCESS = 2;

    /**
     * Host has been deferred for some amount of time and will become ready
     * once that time has elapsed. This is most likely due to politeness or
     * waiting between retries. Other conditions may exist.
     */
    int HOST_DEFERRED = 3;

    /**
     * Host has been encountered and all available URIs for it have been
     * processed already. More URIs may become available later or not.
     * Inactive hosts may eventually become 'forgotten'.
     */
    int HOST_INACTIVE = 4;

    /**
     * Total number of hosts that are currently active.
     *
     * <p>Active hosts are considered to be those that are ready, deferred or
     * in process.
     *
     * @return Total number of hosts that are currently active.
     */
    int activeHosts();

    /**
     * Total number of inactive hosts.
     *
     * <p>Inactive hosts are those hosts that have been active but have now
     * been exhausted and contain no more additional URIs.
     *
     * @return Total number of inactive hosts.
     */
    int inactiveHosts();

    /**
     * Total number of deferred hosts.
     *
     * <p>Deferred hosts are currently active hosts that have been deferred
     * from processing for the time being (because of politeness or waiting
     * before retrying).
     *
     * @return Total number of deferred hosts.
     */
    int deferredHosts();

    /**
     * Total number of hosts with URIs in process.
     *
     * <p>It is generally assumed that each host can have only 1 URI in
     * process at the same time. However, some frontiers may implement
     * politeness differently, meaning that the same host is both ready and
     * in process. {@link #activeHosts() activeHosts()} will not count them
     * twice though.
     *
     * @return Total number of hosts with URIs in process.
     */
    int inProcessHosts();

    /**
     * Total number of hosts that have a URI ready for processing.
     *
     * @return Total number of hosts that have a URI ready for processing.
     */
    int readyHosts();

    /**
     * Get the status of a host.
     *
     * <p>Hosts can be in one of the following states:
     * <ul>
     *     <li> {@link #HOST_READY Ready}
     *     <li> {@link #HOST_INPROCESS In process}
     *     <li> {@link #HOST_DEFERRED Deferred}
     *     <li> {@link #HOST_INACTIVE Inactive}
     *     <li> {@link #HOST_UNKNOWN Unknown}
     * </ul>
     *
     * <p>Some Frontiers may allow a host to have more than one URI in process
     * at the same time. In those cases it will be reported as
     * {@link #HOST_READY Ready} as long as it has more URIs ready for
     * processing. Only once it has no more possible URIs for processing will
     * it be reported as {@link #HOST_INPROCESS In process}.
     *
     * @param host The name of the host to look up the status for.
     * @return The status of the specified host.
     *
     * @see #HOST_DEFERRED
     * @see #HOST_INACTIVE
     * @see #HOST_INPROCESS
     * @see #HOST_READY
     * @see #HOST_UNKNOWN
     */
    int hostStatus(String host);

}