View Javadoc

1   /* ServerCache 
2    * 
3    * Created on Nov 19, 2004
4    *
5    * Copyright (C) 2009 Internet Archive.
6    * 
7    * This file is part of the Heritrix web crawler (crawler.archive.org).
8    * 
9    * Heritrix is free software; you can redistribute it and/or modify
10   * it under the terms of the GNU Lesser Public License as published by
11   * the Free Software Foundation; either version 2.1 of the License, or
12   * any later version.
13   * 
14   * Heritrix is distributed in the hope that it will be useful, 
15   * but WITHOUT ANY WARRANTY; without even the implied warranty of
16   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17   * GNU Lesser Public License for more details.
18   * 
19   * You should have received a copy of the GNU Lesser Public License
20   * along with Heritrix; if not, write to the Free Software
21   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
22   */
23  package org.archive.crawler.datamodel;
24  
25  import java.util.logging.Level;
26  import java.util.logging.Logger;
27  
28  import org.apache.commons.collections.Closure;
29  import org.apache.commons.httpclient.URIException;
30  import org.archive.crawler.framework.CrawlController;
31  import org.archive.crawler.settings.SettingsHandler;
32  import org.archive.util.ObjectIdentityCache;
33  import org.archive.util.ObjectIdentityMemCache;
34  import org.archive.util.Supplier;
35  
36  /***
37   * Server and Host cache.
38   * @author stack
39   * @version $Date: 2010-10-12 00:39:07 +0000 (Tue, 12 Oct 2010) $, $Revision: 6967 $
40   */
41  public class ServerCache {
42      private static Logger logger =
43          Logger.getLogger(ServerCache.class.getName());
44      
45      protected SettingsHandler settingsHandler = null;
46      
47      /***
48       * hostname[:port] -> CrawlServer.
49       * Set in the initialization.
50       */
51      protected ObjectIdentityCache<String,CrawlServer> servers = null;
52      
53      /***
54       * hostname -> CrawlHost.
55       * Set in the initialization.
56       */
57      protected ObjectIdentityCache<String,CrawlHost> hosts = null;
58      
59      /***
60       * Constructor.
61       * Shutdown access to the default constructor by making it protected.
62       */
63      protected ServerCache() {
64          super();
65      }
66      
67      /***
68       * This constructor creates a ServerCache that is all memory-based using
69       * Hashtables.  Used for unit testing only
70       * (Use {@link #ServerCache(CrawlController)} when crawling).
71       * @param sh
72       * @throws Exception
73       */
74      public ServerCache(final SettingsHandler sh)
75      throws Exception {
76          this.settingsHandler = sh;
77          this.servers = new ObjectIdentityMemCache<CrawlServer>();
78          this.hosts = new ObjectIdentityMemCache<CrawlHost>();
79      }
80      
81      /***
82       * Create a ServerCache that uses the given CrawlController to initialize the
83       * maps of servers and hosts.
84       * @param c 
85       * @throws Exception
86       */
87      public ServerCache(final CrawlController c)
88      throws Exception {
89          this.settingsHandler = c.getSettingsHandler();
90          this.servers = c.getBigMap("servers", CrawlServer.class);
91          this.hosts = c.getBigMap("hosts", CrawlHost.class);
92      }
93      
94      /***
95       * Get the {@link CrawlServer} associated with <code>name</code>,
96       * creating if necessary. 
97       * @param serverKey Server name we're to return server for.
98       * @return CrawlServer instance that matches the passed server name.
99       */
100     public CrawlServer getServerFor(final String serverKey) {
101         CrawlServer cserver = servers.getOrUse(
102                 serverKey,
103                 new Supplier<CrawlServer>() {
104                     public CrawlServer get() {
105                         String skey = new String(serverKey); // ensure private minimal key
106                         CrawlServer cs = new CrawlServer(skey);
107                         cs.setSettingsHandler(settingsHandler);
108                         return cs;
109                     }});
110         return cserver;
111     }
112 
113     /***
114      * Get the {@link CrawlServer} associated with <code>curi</code>.
115      * @param cauri CandidateURI we're to get server from.
116      * @return CrawlServer instance that matches the passed CandidateURI.
117      */
118     public CrawlServer getServerFor(CandidateURI cauri) {
119         CrawlServer cs = null;
120         try {
121             String key = CrawlServer.getServerKey(cauri);
122             // TODOSOMEDAY: make this robust against those rare cases
123             // where authority is not a hostname.
124             if (key != null) {
125                 cs = getServerFor(key);
126             }
127         }  catch (URIException e) {
128             logger.log(Level.FINE, "No server key for: "+cauri.toString(), e);
129         } catch (NullPointerException npe) {
130             logger.log(Level.FINE, "No server key for: "+cauri.toString(), npe);
131         }
132         return cs;
133     }
134     
135     /***
136      * Get the {@link CrawlHost} associated with <code>name</code>.
137      * @param hostname Host name we're to return Host for.
138      * @return CrawlHost instance that matches the passed Host name.
139      */
140     public CrawlHost getHostFor(final String hostname) {
141         if (hostname == null || hostname.length() == 0) {
142             return null;
143         }
144         CrawlHost host = hosts.getOrUse(
145                 hostname,
146                 new Supplier<CrawlHost>() {
147                     public CrawlHost get() {
148                         String hkey = new String(hostname); // ensure private minimal key
149                         return new CrawlHost(hkey);
150                     }});
151         return host;
152     }
153     
154     /***
155      * Get the {@link CrawlHost} associated with <code>curi</code>.
156      * @param cauri CandidateURI we're to return Host for.
157      * @return CandidateURI instance that matches the passed Host name.
158      */
159     public CrawlHost getHostFor(CandidateURI cauri) {
160         CrawlHost h = null;
161         try {
162             String hostKey;
163             if (cauri.getUURI().getScheme().equals("dns")) {
164                 hostKey = "dns:";
165             } else {
166                 hostKey = cauri.getUURI().getReferencedHost();
167             }
168             h = getHostFor(hostKey);
169         } catch (URIException e) {
170             e.printStackTrace();
171         }
172         return h;
173     }
174 
175     /***
176      * @param serverKey Key to use doing lookup.
177      * @return True if a server instance exists.
178      */
179     public boolean containsServer(String serverKey) {
180         return (CrawlServer) servers.get(serverKey) != null; 
181     }
182 
183     /***
184      * @param hostKey Key to use doing lookup.
185      * @return True if a host instance exists.
186      */
187     public boolean containsHost(String hostKey) {
188         return (CrawlHost) hosts.get(hostKey) != null; 
189     }
190 
191     /***
192      * Called when shutting down the cache so we can do clean up.
193      */
194     public void cleanup() {
195         if (this.hosts != null) {
196             // If we're using a bdb bigmap, the call to clear will
197             // close down the bdb database.
198             this.hosts.close();
199             this.hosts = null;
200         }
201         if (this.servers != null) { 
202             this.servers.close();
203             this.servers = null;
204         }
205     }
206 
207     public void forAllHostsDo(Closure c) {
208         for(String host : hosts.keySet()) {
209             c.execute(hosts.get(host));
210         }
211     }
212 }