1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25 package org.archive.crawler.framework;
26
27
/**
 * An optional interface that Frontiers can implement to provide information
 * about specific hosts.
 *
 * <p>Some URIFrontier implementations will want to provide a number of
 * statistics relating to the progress of particular hosts. This only applies
 * to those Frontiers whose internal structure uses hosts to split up the
 * workload and (for example) implement politeness. Some other Frontiers may
 * also provide this info based on calculations.
 *
 * <ul>
 * <li> {@link #activeHosts() Active hosts}
 * <li> {@link #inactiveHosts() Inactive hosts}
 * <li> {@link #deferredHosts() Deferred hosts}
 * <li> {@link #inProcessHosts() In process hosts}
 * <li> {@link #readyHosts() Ready hosts}
 * <li> {@link #hostStatus(String) Host status}
 * </ul>
 *
 * @author Kristinn Sigurdsson
 *
 * @see org.archive.crawler.framework.Frontier
 */
public interface FrontierHostStatistics {

    // Interface fields are implicitly public static final, and interface
    // methods are implicitly public, so the modifiers are omitted below.

    /**
     * Host has not been encountered by the Frontier, or has been encountered
     * but has been inactive so long that it has expired.
     */
    int HOST_UNKNOWN = 0;

    /** Host has URIs ready to be emitted. */
    int HOST_READY = 1;

    /** Host has URIs currently being processed. */
    int HOST_INPROCESS = 2;

    /**
     * Host has been deferred for some amount of time and will become ready
     * once that time has elapsed. This is most likely due to politeness or
     * waiting between retries. Other conditions may exist.
     */
    int HOST_DEFERRED = 3;

    /**
     * Host has been encountered and all available URIs for it have been
     * processed already. More URIs may become available later or not.
     * Inactive hosts may eventually become 'forgotten'.
     */
    int HOST_INACTIVE = 4;

    /**
     * Total number of hosts that are currently active.
     *
     * <p>Active hosts are considered to be those that are ready, deferred or
     * in process.
     *
     * @return Total number of hosts that are currently active.
     */
    int activeHosts();

    /**
     * Total number of inactive hosts.
     *
     * <p>Inactive hosts are those hosts that have been active but have now
     * been exhausted and contain no more additional URIs.
     *
     * @return Total number of inactive hosts.
     */
    int inactiveHosts();

    /**
     * Total number of deferred hosts.
     *
     * <p>Deferred hosts are currently active hosts that have been deferred
     * from processing for the time being (because of politeness or waiting
     * before retrying).
     *
     * @return Total number of deferred hosts.
     */
    int deferredHosts();

    /**
     * Total number of hosts with URIs in process.
     *
     * <p>It is generally assumed that each host can have only 1 URI in
     * process at the same time. However, some frontiers may implement
     * politeness differently, meaning that the same host is both ready and
     * in process. {@link #activeHosts() activeHosts()} will not count them
     * twice though.
     *
     * @return Total number of hosts with URIs in process.
     */
    int inProcessHosts();

    /**
     * Total number of hosts that have a URI ready for processing.
     *
     * @return Total number of hosts that have a URI ready for processing.
     */
    int readyHosts();

    /**
     * Get the status of a host.
     *
     * <p>Hosts can be in one of the following states:
     * <ul>
     * <li> {@link #HOST_READY Ready}
     * <li> {@link #HOST_INPROCESS In process}
     * <li> {@link #HOST_DEFERRED Deferred}
     * <li> {@link #HOST_INACTIVE Inactive}
     * <li> {@link #HOST_UNKNOWN Unknown}
     * </ul>
     *
     * <p>Some Frontiers may allow a host to have more than one URI in process
     * at the same time. In those cases it will be reported as
     * {@link #HOST_READY Ready} as long as it has more URIs ready for
     * processing. Only once it has no more possible URIs for processing will
     * it be reported as {@link #HOST_INPROCESS In process}.
     *
     * @param host The name of the host to look up the status for.
     * @return The status of the specified host.
     *
     * @see #HOST_DEFERRED
     * @see #HOST_INACTIVE
     * @see #HOST_INPROCESS
     * @see #HOST_READY
     * @see #HOST_UNKNOWN
     */
    int hostStatus(String host);

}