1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
package org.archive.crawler.datamodel;

import java.io.File;
import java.io.Serializable;
import java.util.logging.Logger;

import javax.management.AttributeNotFoundException;

import org.archive.crawler.framework.CrawlController;
import org.archive.crawler.framework.CrawlScope;
import org.archive.crawler.framework.Frontier;
import org.archive.crawler.framework.Processor;
import org.archive.crawler.framework.exceptions.FatalConfigurationException;
import org.archive.crawler.settings.MapType;
import org.archive.crawler.settings.ModuleType;
import org.archive.crawler.settings.SimpleType;
import org.archive.crawler.settings.Type;
import org.archive.crawler.url.canonicalize.BaseRule;
/**
 * Represents the 'root' of the settings hierarchy. Contains those settings that
 * do not belong to any specific module, but rather relate to the crawl as a
 * whole (much of this is used by the CrawlController directly or indirectly).
 *
 * @see org.archive.crawler.settings.ModuleType
 */
54 public class CrawlOrder extends ModuleType implements Serializable {
55
56 private static final long serialVersionUID = -6715840285961511669L;
57
58 private static Logger logger =
59 Logger.getLogger("org.archive.crawler.datamodel.CrawlOrder");
60
61 public static final String ATTR_NAME = "crawl-order";
62 public static final String ATTR_SETTINGS_DIRECTORY = "settings-directory";
63 public static final String ATTR_DISK_PATH = "disk-path";
64 public static final String ATTR_LOGS_PATH = "logs-path";
65 public static final String ATTR_CHECKPOINTS_PATH = "checkpoints-path";
66 public static final String ATTR_STATE_PATH = "state-path";
67 public static final String ATTR_SCRATCH_PATH = "scratch-path";
68 public static final String ATTR_RECOVER_PATH = "recover-path";
69 public static final String ATTR_RECOVER_RETAIN_FAILURES =
70 "recover-retain-failures";
71 public static final String ATTR_RECOVER_SCOPE_INCLUDES =
72 "recover-scope-includes";
73 public static final String ATTR_RECOVER_SCOPE_ENQUEUES =
74 "recover-scope-enqueues";
75 public static final String ATTR_MAX_BYTES_DOWNLOAD = "max-bytes-download";
76 public static final String ATTR_MAX_DOCUMENT_DOWNLOAD =
77 "max-document-download";
78 public static final String ATTR_MAX_TIME_SEC = "max-time-sec";
79 public static final String ATTR_MAX_TOE_THREADS = "max-toe-threads";
80 public static final String ATTR_HTTP_HEADERS = "http-headers";
81 public static final String ATTR_USER_AGENT = "user-agent";
82 public static final String ATTR_FROM = "from";
83 public static final String ATTR_PRE_FETCH_PROCESSORS =
84 "pre-fetch-processors";
85 public static final String ATTR_FETCH_PROCESSORS = "fetch-processors";
86 public static final String ATTR_EXTRACT_PROCESSORS = "extract-processors";
87 public static final String ATTR_WRITE_PROCESSORS = "write-processors";
88 public static final String ATTR_POST_PROCESSORS = "post-processors";
89 public static final String ATTR_LOGGERS = "loggers";
90 public static final String ATTR_RULES = "uri-canonicalization-rules";
91 public static final String ATTR_RECORDER_OUT_BUFFER =
92 "recorder-out-buffer-bytes";
93 public static final String ATTR_RECORDER_IN_BUFFER =
94 "recorder-in-buffer-bytes";
95
96 public static final String ATTR_INDEPENDENT_EXTRACTORS =
97 "independent-extractors";
98
99 /*** Percentage of heap to allocate to bdb cache */
100 public static final String ATTR_BDB_CACHE_PERCENT =
101 "bdb-cache-percent";
102
103 /***
104 * When checkpointing, copy the bdb logs.
105 * Default is true. If false, then we do not copy logs on checkpoint AND
106 * we tell bdbje never to delete log files; instead it renames
107 * files-to-delete with a '.del' extension. Assumption is that when this
108 * setting is false, an external process is managing the removing of
109 * bdbje log files and that come time to recover from a checkpoint, the
110 * files that comprise a checkpoint are manually assembled.
111 */
112 public static final String ATTR_CHECKPOINT_COPY_BDBJE_LOGS =
113 "checkpoint-copy-bdbje-logs";
114 public static final Boolean DEFAULT_CHECKPOINT_COPY_BDBJE_LOGS =
115 Boolean.TRUE;
116
117 /***
118 * Default size of bdb cache.
119 */
120 private final static Integer DEFAULT_BDB_CACHE_PERCENT = new Integer(0);
121
122 private transient MapType httpHeaders;
123 private transient MapType loggers;
124
125 private transient CrawlController controller;
126
127 /***
128 * Regex for acceptable user-agent format.
129 */
130 private static String ACCEPTABLE_USER_AGENT =
131 "//S+.*//(.*//+http(s)?:////S+//.//S+.*//).*";
132
133 /***
134 * Regex for acceptable from address.
135 */
136 private static String ACCEPTABLE_FROM = "//S+@//S+//.//S+";
137
138
139 /*** Construct a CrawlOrder.
140 */
141 public CrawlOrder() {
142 super(ATTR_NAME, "Heritrix crawl order. This forms the root of " +
143 "the settings framework.");
144 Type e;
145
146 e = addElementToDefinition(new SimpleType(ATTR_SETTINGS_DIRECTORY,
147 "Directory where override settings are kept. The settings " +
148 "for many modules can be overridden based on the domain or " +
149 "subdomain of the URI being processed. This setting specifies" +
150 " a file level directory to store those settings. The path" +
151 " is relative to 'disk-path' unless" +
152 " an absolute path is provided.", "settings"));
153 e.setOverrideable(false);
154 e.setExpertSetting(true);
155
156 e = addElementToDefinition(new SimpleType(ATTR_DISK_PATH,
157 "Directory where logs, arcs and other run time files will " +
158 "be kept. If this path is a relative path, it will be " +
159 "relative to the crawl order.", ""));
160 e.setOverrideable(false);
161 e.setExpertSetting(true);
162
163 e = addElementToDefinition(new SimpleType(ATTR_LOGS_PATH,
164 "Directory where crawler log files will be kept. If this path " +
165 "is a relative path, it will be relative to the 'disk-path'.",
166 "logs"));
167 e.setOverrideable(false);
168 e.setExpertSetting(true);
169
170 e = addElementToDefinition(new SimpleType(ATTR_CHECKPOINTS_PATH,
171 "Directory where crawler checkpoint files will be kept. " +
172 "If this path " +
173 "is a relative path, it will be relative to the 'disk-path'.",
174 "checkpoints"));
175 e.setOverrideable(false);
176 e.setExpertSetting(true);
177
178 e = addElementToDefinition(new SimpleType(ATTR_STATE_PATH,
179 "Directory where crawler-state files will be kept. If this path " +
180 "is a relative path, it will be relative to the 'disk-path'.",
181 "state"));
182 e.setOverrideable(false);
183 e.setExpertSetting(true);
184
185 e = addElementToDefinition(new SimpleType(ATTR_SCRATCH_PATH,
186 "Directory where discardable temporary files will be kept. " +
187 "If this path " +
188 "is a relative path, it will be relative to the 'disk-path'.",
189 "scratch"));
190 e.setOverrideable(false);
191 e.setExpertSetting(true);
192
193 e = addElementToDefinition(new SimpleType(ATTR_MAX_BYTES_DOWNLOAD,
194 "Maximum number of bytes to download. Once this number is" +
195 " exceeded the crawler will stop. " +
196 "A value of zero means no upper limit.", new Long(0)));
197 e.setOverrideable(false);
198
199 e = addElementToDefinition(new SimpleType(ATTR_MAX_DOCUMENT_DOWNLOAD,
200 "Maximum number of documents to download. Once this number" +
201 " is exceeded the crawler will stop. " +
202 "A value of zero means no upper limit.", new Long(0)));
203 e.setOverrideable(false);
204
205 e = addElementToDefinition(new SimpleType(ATTR_MAX_TIME_SEC,
206 "Maximum amount of time to crawl (in seconds). Once this" +
207 " much time has elapsed the crawler will stop. A value of" +
208 " zero means no upper limit.",
209 new Long(0)));
210 e.setOverrideable(false);
211
212 e = addElementToDefinition(new SimpleType(ATTR_MAX_TOE_THREADS,
213 "Maximum number of threads processing URIs at the same time.",
214 new Integer(100)));
215 e.setOverrideable(false);
216
217 e = addElementToDefinition(new SimpleType(ATTR_RECORDER_OUT_BUFFER,
218 "Size in bytes of in-memory buffer to record outbound " +
219 "traffic. One such buffer is reserved for every ToeThread.",
220 new Integer(4096)));
221 e.setOverrideable(false);
222 e.setExpertSetting(true);
223
224 e = addElementToDefinition(new SimpleType(ATTR_RECORDER_IN_BUFFER,
225 "Size in bytes of in-memory buffer to record inbound " +
226 "traffic. One such buffer is reserved for every ToeThread.",
227 new Integer(65536)));
228 e.setOverrideable(false);
229 e.setExpertSetting(true);
230
231 e = addElementToDefinition(new SimpleType(ATTR_BDB_CACHE_PERCENT,
232 "Percentage of heap to allocate to BerkeleyDB JE cache. " +
233 "Default of zero means no preference (accept BDB's default, " +
234 "usually 60%, or the je.maxMemoryPercent property value).",
235 DEFAULT_BDB_CACHE_PERCENT));
236 e.setExpertSetting(true);
237 e.setOverrideable(false);
238
239 e = addElementToDefinition(new SimpleType(ATTR_INDEPENDENT_EXTRACTORS,
240 "Whether an extractor's decision to run on a url is " +
241 "independent of other extractors. When set to false, " +
242 "most extractors will only run if no other " +
243 "extractor has run on the url.",
244 false));
245 e.setExpertSetting(true);
246 e.setOverrideable(false);
247
248 addElementToDefinition(new CrawlScope());
249
250 httpHeaders = (MapType) addElementToDefinition(new MapType(
251 ATTR_HTTP_HEADERS, "HTTP headers. Information that will " +
252 "be used when constructing the HTTP headers of " +
253 "the crawler's HTTP requests."));
254
255 e = httpHeaders.addElementToDefinition(new SimpleType(ATTR_USER_AGENT,
256 "User agent to act as. Field must contain valid URL " +
257 "that links to website of person or organization " +
258 "running the crawl. Replace 'PROJECT_URL_HERE' in " +
259 "initial template. E.g. If organization " +
260 "is Library of Congress, a valid user agent would be:" +
261 "'Mozilla/5.0 (compatible; loc-crawler/0.11.0 " +
262 "+http://loc.gov)'. " +
263 "Note, you must preserve the '+' before the 'http'.",
264 "Mozilla/5.0 (compatible; heritrix/@VERSION@ +PROJECT_URL_HERE)"));
265
266 e = httpHeaders.addElementToDefinition(new SimpleType(ATTR_FROM,
267 "Contact information. This field must contain a valid " +
268 "e-mail address for the person or organization responsible" +
269 "for this crawl: e.g. 'webmaster@loc.gov'",
270 "CONTACT_EMAIL_ADDRESS_HERE"));
271
272 addElementToDefinition(new RobotsHonoringPolicy());
273
274 e = addElementToDefinition(new ModuleType(
275 Frontier.ATTR_NAME, "Frontier"));
276 e.setLegalValueType(Frontier.class);
277
278 e = (MapType) addElementToDefinition(new MapType(ATTR_RULES,
279 "Ordered list of url canonicalization rules. " +
280 "Rules are applied in the order listed from top to bottom.",
281 BaseRule.class));
282 e.setOverrideable(true);
283 e.setExpertSetting(true);
284
285 e = addElementToDefinition(new MapType(
286 ATTR_PRE_FETCH_PROCESSORS, "Processors to run prior to" +
287 " fetching anything from the network.",
288 Processor.class));
289 e.setOverrideable(false);
290
291 e = addElementToDefinition(new MapType(
292 ATTR_FETCH_PROCESSORS, "Processors that fetch documents."
293 , Processor.class));
294 e.setOverrideable(false);
295
296 e = addElementToDefinition(new MapType(
297 ATTR_EXTRACT_PROCESSORS, "Processors that extract new URIs" +
298 " from fetched documents.", Processor.class));
299 e.setOverrideable(false);
300
301 e = addElementToDefinition(new MapType(
302 ATTR_WRITE_PROCESSORS, "Processors that write documents" +
303 " to archives.", Processor.class));
304 e.setOverrideable(false);
305
306 e = addElementToDefinition(new MapType(
307 ATTR_POST_PROCESSORS, "Processors that do cleanup and feed" +
308 " the frontier with new URIs.", Processor.class));
309 e.setOverrideable(false);
310
311 loggers = (MapType) addElementToDefinition(new MapType(ATTR_LOGGERS,
312 "Statistics tracking modules. Any number of specialized " +
313 "statistics tracker that monitor a crawl and write logs, " +
314 "reports and/or provide information to the user interface."));
315
316 e = addElementToDefinition(new SimpleType(ATTR_RECOVER_PATH,
317 "Optional. Points at recover log (or recover.gz log) OR " +
318 "the checkpoint directory to use recovering a crawl.", ""));
319 e.setOverrideable(false);
320 e.setExpertSetting(true);
321
322 e = addElementToDefinition(new SimpleType(
323 ATTR_CHECKPOINT_COPY_BDBJE_LOGS,
324 "When true, on a checkpoint, we copy off the bdbje log files to " +
325 "the checkpoint directory. To recover a checkpoint, just " +
326 "set the " + ATTR_RECOVER_PATH + " to point at the checkpoint " +
327 "directory to recover. This is default setting. " +
328 "But if crawl is large, " +
329 "copying bdbje log files can take tens of minutes and even " +
330 "upwards of an hour (Copying bdbje log files will consume bulk " +
331 "of time checkpointing). If this setting is false, we do NOT copy " +
332 "bdbje logs on checkpoint AND we set bdbje to NEVER delete log " +
333 "files (instead we have it rename files-to-delete with a '.del'" +
334 "extension). Assumption is that when this setting is false, " +
335 "an external process is managing the removal of bdbje log files " +
336 "and that come time to recover from a checkpoint, the files that " +
337 "comprise a checkpoint are manually assembled. This is an expert " +
338 "setting.",
339 DEFAULT_CHECKPOINT_COPY_BDBJE_LOGS));
340 e.setOverrideable(false);
341 e.setExpertSetting(true);
342
343 e = addElementToDefinition(new SimpleType(ATTR_RECOVER_RETAIN_FAILURES,
344 "When recovering via the recover.log, should failures " +
345 "in the log be retained in the recovered crawl, " +
346 "preventing the corresponding URIs from being retried. " +
347 "Default is false, meaning failures are forgotten, and " +
348 "the corresponding URIs will be retried in the recovered " +
349 "crawl.", Boolean.FALSE));
350 e.setOverrideable(false);
351 e.setExpertSetting(true);
352 e = addElementToDefinition(new SimpleType(ATTR_RECOVER_SCOPE_INCLUDES,
353 "When recovering via the recover.log, should URIs " +
354 "be checked against scope before considered included " +
355 "during the first phase which primes the already-seen " +
356 "set. " +
357 "Default is true, meaning scope changes in a recovered " +
358 "crawl can slim the already-seen size. ", Boolean.TRUE));
359 e.setOverrideable(false);
360 e.setExpertSetting(true);
361 e = addElementToDefinition(new SimpleType(ATTR_RECOVER_SCOPE_ENQUEUES,
362 "When recovering via the recover.log, should URIs " +
363 "be checked against scope before reenqueued during " +
364 "the second phase which fills the to-be-fetched queues. " +
365 "Default is true, meaning scope changes in a recovered " +
366 "crawl can slim the pending queues. ", Boolean.TRUE));
367 e.setOverrideable(false);
368 e.setExpertSetting(true);
369
370
371 e = addElementToDefinition(
372 new CredentialStore(CredentialStore.ATTR_NAME));
373 e.setOverrideable(true);
374 e.setExpertSetting(true);
375 }
376
377 /***
378 * @param curi
379 * @return user-agent header value to use
380 */
381 public String getUserAgent(CrawlURI curi) {
382 return ((String) httpHeaders.getUncheckedAttribute(curi, ATTR_USER_AGENT));
383 }
384
385 /***
386 * @param curi
387 * @return from header value to use
388 */
389 public String getFrom(CrawlURI curi) {
390 String res = null;
391 try {
392 res = (String) httpHeaders.getAttribute(ATTR_FROM, curi);
393 } catch (AttributeNotFoundException e) {
394 logger.severe(e.getMessage());
395 }
396 return res;
397 }
398
399 /***
400 * Returns the set number of maximum toe threads.
401 * @return Number of maximum toe threads
402 */
403 public int getMaxToes() {
404 Integer res = null;
405 try {
406 res = (Integer) getAttribute(null, ATTR_MAX_TOE_THREADS);
407 } catch (AttributeNotFoundException e) {
408 logger.severe(e.getMessage());
409 }
410 return res.intValue();
411 }
412
413 /***
414 * This method gets the RobotsHonoringPolicy object from the orders file.
415 *
416 * @return the new RobotsHonoringPolicy
417 */
418 public RobotsHonoringPolicy getRobotsHonoringPolicy() {
419 try {
420 return (RobotsHonoringPolicy) getAttribute(null, RobotsHonoringPolicy.ATTR_NAME);
421 } catch (AttributeNotFoundException e) {
422 logger.severe(e.getMessage());
423 return null;
424 }
425 }
426
427 /*** Get the name of the order file.
428 *
429 * @return the name of the order file.
430 */
431 public String getCrawlOrderName() {
432 return getSettingsHandler().getSettingsObject(null).getName();
433 }
434
435 /***
436 * @return The crawl controller.
437 */
438 public CrawlController getController() {
439 return controller;
440 }
441
442 /***
443 * @param controller
444 */
445 public void setController(CrawlController controller) {
446 this.controller = controller;
447 }
448
449 /***
450 * Returns the Map of the StatisticsTracking modules that are included in the
451 * configuration that the current instance of this class is representing.
452 * @return Map of the StatisticsTracking modules
453 */
454 public MapType getLoggers() {
455 return loggers;
456 }
457
458 /***
459 * Checks if the User Agent and From field are set 'correctly' in
460 * the specified Crawl Order.
461 *
462 * @throws FatalConfigurationException
463 */
464 public void checkUserAgentAndFrom() throws FatalConfigurationException {
465
466 String userAgent = this.getUserAgent(null);
467 String from = this.getFrom(null);
468 if (!(userAgent.matches(ACCEPTABLE_USER_AGENT)
469 && from.matches(ACCEPTABLE_FROM))) {
470 throw new FatalConfigurationException("unacceptable 'user-agent' " +
471 " or 'from' (correct your configuration).");
472 }
473 }
474
475 /***
476 * @return Checkpoint directory.
477 */
478 public File getCheckpointsDirectory() {
479 try {
480 return getDirectoryRelativeToDiskPath((String) getAttribute(null,
481 CrawlOrder.ATTR_CHECKPOINTS_PATH));
482 } catch (AttributeNotFoundException e) {
483
484 e.printStackTrace();
485 return null;
486 }
487 }
488
489 private File getDirectoryRelativeToDiskPath(String subpath) {
490 File disk;
491 try {
492 disk = getSettingsHandler().getPathRelativeToWorkingDirectory(
493 (String) getAttribute(null, CrawlOrder.ATTR_DISK_PATH));
494 return new File(disk, subpath);
495 } catch (AttributeNotFoundException e) {
496
497 e.printStackTrace();
498 return null;
499 }
500 }
501
502 /***
503 * Return fullpath to the directory named by <code>key</code>
504 * in settings.
505 * If directory does not exist, it and all intermediary dirs
506 * will be created.
507 * @param key Key to use going to settings.
508 * @return Full path to directory named by <code>key</code>.
509 * @throws AttributeNotFoundException
510 */
511 public File getSettingsDir(String key)
512 throws AttributeNotFoundException {
513 String path = (String)getAttribute(null, key);
514 File f = new File(path);
515 if (!f.isAbsolute()) {
516 f = getDirectoryRelativeToDiskPath(path);
517 }
518 if (!f.exists()) {
519 f.mkdirs();
520 }
521 return f;
522 }
523
524
525 }