1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23 package org.archive.crawler.datamodel;
24
25 import java.util.logging.Level;
26 import java.util.logging.Logger;
27
28 import org.apache.commons.collections.Closure;
29 import org.apache.commons.httpclient.URIException;
30 import org.archive.crawler.framework.CrawlController;
31 import org.archive.crawler.settings.SettingsHandler;
32 import org.archive.util.ObjectIdentityCache;
33 import org.archive.util.ObjectIdentityMemCache;
34 import org.archive.util.Supplier;
35
36 /***
37 * Server and Host cache.
38 * @author stack
39 * @version $Date: 2010-10-12 00:39:07 +0000 (Tue, 12 Oct 2010) $, $Revision: 6967 $
40 */
41 public class ServerCache {
42 private static Logger logger =
43 Logger.getLogger(ServerCache.class.getName());
44
45 protected SettingsHandler settingsHandler = null;
46
47 /***
48 * hostname[:port] -> CrawlServer.
49 * Set in the initialization.
50 */
51 protected ObjectIdentityCache<String,CrawlServer> servers = null;
52
53 /***
54 * hostname -> CrawlHost.
55 * Set in the initialization.
56 */
57 protected ObjectIdentityCache<String,CrawlHost> hosts = null;
58
59 /***
60 * Constructor.
61 * Shutdown access to the default constructor by making it protected.
62 */
63 protected ServerCache() {
64 super();
65 }
66
67 /***
68 * This constructor creates a ServerCache that is all memory-based using
69 * Hashtables. Used for unit testing only
70 * (Use {@link #ServerCache(CrawlController)} when crawling).
71 * @param sh
72 * @throws Exception
73 */
74 public ServerCache(final SettingsHandler sh)
75 throws Exception {
76 this.settingsHandler = sh;
77 this.servers = new ObjectIdentityMemCache<CrawlServer>();
78 this.hosts = new ObjectIdentityMemCache<CrawlHost>();
79 }
80
81 /***
82 * Create a ServerCache that uses the given CrawlController to initialize the
83 * maps of servers and hosts.
84 * @param c
85 * @throws Exception
86 */
87 public ServerCache(final CrawlController c)
88 throws Exception {
89 this.settingsHandler = c.getSettingsHandler();
90 this.servers = c.getBigMap("servers", CrawlServer.class);
91 this.hosts = c.getBigMap("hosts", CrawlHost.class);
92 }
93
94 /***
95 * Get the {@link CrawlServer} associated with <code>name</code>,
96 * creating if necessary.
97 * @param serverKey Server name we're to return server for.
98 * @return CrawlServer instance that matches the passed server name.
99 */
100 public CrawlServer getServerFor(final String serverKey) {
101 CrawlServer cserver = servers.getOrUse(
102 serverKey,
103 new Supplier<CrawlServer>() {
104 public CrawlServer get() {
105 String skey = new String(serverKey);
106 CrawlServer cs = new CrawlServer(skey);
107 cs.setSettingsHandler(settingsHandler);
108 return cs;
109 }});
110 return cserver;
111 }
112
113 /***
114 * Get the {@link CrawlServer} associated with <code>curi</code>.
115 * @param cauri CandidateURI we're to get server from.
116 * @return CrawlServer instance that matches the passed CandidateURI.
117 */
118 public CrawlServer getServerFor(CandidateURI cauri) {
119 CrawlServer cs = null;
120 try {
121 String key = CrawlServer.getServerKey(cauri);
122
123
124 if (key != null) {
125 cs = getServerFor(key);
126 }
127 } catch (URIException e) {
128 logger.log(Level.FINE, "No server key for: "+cauri.toString(), e);
129 } catch (NullPointerException npe) {
130 logger.log(Level.FINE, "No server key for: "+cauri.toString(), npe);
131 }
132 return cs;
133 }
134
135 /***
136 * Get the {@link CrawlHost} associated with <code>name</code>.
137 * @param hostname Host name we're to return Host for.
138 * @return CrawlHost instance that matches the passed Host name.
139 */
140 public CrawlHost getHostFor(final String hostname) {
141 if (hostname == null || hostname.length() == 0) {
142 return null;
143 }
144 CrawlHost host = hosts.getOrUse(
145 hostname,
146 new Supplier<CrawlHost>() {
147 public CrawlHost get() {
148 String hkey = new String(hostname);
149 return new CrawlHost(hkey);
150 }});
151 return host;
152 }
153
154 /***
155 * Get the {@link CrawlHost} associated with <code>curi</code>.
156 * @param cauri CandidateURI we're to return Host for.
157 * @return CandidateURI instance that matches the passed Host name.
158 */
159 public CrawlHost getHostFor(CandidateURI cauri) {
160 CrawlHost h = null;
161 try {
162 String hostKey;
163 if (cauri.getUURI().getScheme().equals("dns")) {
164 hostKey = "dns:";
165 } else {
166 hostKey = cauri.getUURI().getReferencedHost();
167 }
168 h = getHostFor(hostKey);
169 } catch (URIException e) {
170 e.printStackTrace();
171 }
172 return h;
173 }
174
175 /***
176 * @param serverKey Key to use doing lookup.
177 * @return True if a server instance exists.
178 */
179 public boolean containsServer(String serverKey) {
180 return (CrawlServer) servers.get(serverKey) != null;
181 }
182
183 /***
184 * @param hostKey Key to use doing lookup.
185 * @return True if a host instance exists.
186 */
187 public boolean containsHost(String hostKey) {
188 return (CrawlHost) hosts.get(hostKey) != null;
189 }
190
191 /***
192 * Called when shutting down the cache so we can do clean up.
193 */
194 public void cleanup() {
195 if (this.hosts != null) {
196
197
198 this.hosts.close();
199 this.hosts = null;
200 }
201 if (this.servers != null) {
202 this.servers.close();
203 this.servers = null;
204 }
205 }
206
207 public void forAllHostsDo(Closure c) {
208 for(String host : hosts.keySet()) {
209 c.execute(hosts.get(host));
210 }
211 }
212 }