1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
package org.archive.crawler.datamodel;

import java.io.File;
import java.io.Serializable;
import java.util.logging.Logger;

import javax.management.AttributeNotFoundException;

import org.archive.crawler.framework.CrawlController;
import org.archive.crawler.framework.CrawlScope;
import org.archive.crawler.framework.Frontier;
import org.archive.crawler.framework.Processor;
import org.archive.crawler.framework.exceptions.FatalConfigurationException;
import org.archive.crawler.settings.MapType;
import org.archive.crawler.settings.ModuleType;
import org.archive.crawler.settings.SimpleType;
import org.archive.crawler.settings.Type;
import org.archive.crawler.url.canonicalize.BaseRule;
/**
 * Represents the 'root' of the settings hierarchy. Contains those settings that
 * do not belong to any specific module, but rather relate to the crawl as a
 * whole (much of this is used by the CrawlController directly or indirectly).
 *
 * @see org.archive.crawler.settings.ModuleType
 */
54 public class CrawlOrder extends ModuleType implements Serializable {
55
56 private static final long serialVersionUID = -6715840285961511669L;
57
58 private static Logger logger =
59 Logger.getLogger("org.archive.crawler.datamodel.CrawlOrder");
60
61 public static final String ATTR_NAME = "crawl-order";
62 public static final String ATTR_SETTINGS_DIRECTORY = "settings-directory";
63 public static final String ATTR_DISK_PATH = "disk-path";
64 public static final String ATTR_LOGS_PATH = "logs-path";
65 public static final String ATTR_CHECKPOINTS_PATH = "checkpoints-path";
66 public static final String ATTR_STATE_PATH = "state-path";
67 public static final String ATTR_SCRATCH_PATH = "scratch-path";
68 public static final String ATTR_RECOVER_PATH = "recover-path";
69 public static final String ATTR_RECOVER_RETAIN_FAILURES =
70 "recover-retain-failures";
71 public static final String ATTR_RECOVER_SCOPE_INCLUDES =
72 "recover-scope-includes";
73 public static final String ATTR_RECOVER_SCOPE_ENQUEUES =
74 "recover-scope-enqueues";
75 public static final String ATTR_MAX_BYTES_DOWNLOAD = "max-bytes-download";
76 public static final String ATTR_MAX_DOCUMENT_DOWNLOAD =
77 "max-document-download";
78 public static final String ATTR_MAX_TIME_SEC = "max-time-sec";
79 public static final String ATTR_MAX_TOE_THREADS = "max-toe-threads";
80 public static final String ATTR_HTTP_HEADERS = "http-headers";
81 public static final String ATTR_USER_AGENT = "user-agent";
82 public static final String ATTR_FROM = "from";
83 public static final String ATTR_PRE_FETCH_PROCESSORS =
84 "pre-fetch-processors";
85 public static final String ATTR_FETCH_PROCESSORS = "fetch-processors";
86 public static final String ATTR_EXTRACT_PROCESSORS = "extract-processors";
87 public static final String ATTR_WRITE_PROCESSORS = "write-processors";
88 public static final String ATTR_POST_PROCESSORS = "post-processors";
89 public static final String ATTR_LOGGERS = "loggers";
90 public static final String ATTR_RULES = "uri-canonicalization-rules";
91 public static final String ATTR_RECORDER_OUT_BUFFER =
92 "recorder-out-buffer-bytes";
93 public static final String ATTR_RECORDER_IN_BUFFER =
94 "recorder-in-buffer-bytes";
95
96 public static final String ATTR_INDEPENDENT_EXTRACTORS =
97 "independent-extractors";
98
99 /*** Percentage of heap to allocate to bdb cache */
100 public static final String ATTR_BDB_CACHE_PERCENT =
101 "bdb-cache-percent";
102
103 /***
104 * When checkpointing, copy the bdb logs.
105 * Default is true. If false, then we do not copy logs on checkpoint AND
106 * we tell bdbje never to delete log files; instead it renames
107 * files-to-delete with a '.del' extension. Assumption is that when this
108 * setting is false, an external process is managing the removing of
109 * bdbje log files and that come time to recover from a checkpoint, the
110 * files that comprise a checkpoint are manually assembled.
111 */
112 public static final String ATTR_CHECKPOINT_COPY_BDBJE_LOGS =
113 "checkpoint-copy-bdbje-logs";
114 public static final Boolean DEFAULT_CHECKPOINT_COPY_BDBJE_LOGS =
115 Boolean.TRUE;
116
117 /***
118 * Default size of bdb cache.
119 */
120 private final static Integer DEFAULT_BDB_CACHE_PERCENT = new Integer(0);
121
122 private transient MapType httpHeaders;
123 private transient MapType loggers;
124
125 private transient CrawlController controller;
126
127 /***
128 * Regex for acceptable user-agent format.
129 */
130 private static String ACCEPTABLE_USER_AGENT =
131 "//S+.*//(.*//+http(s)?:////S+//.//S+.*//).*";
132
133 /***
134 * Regex for acceptable from address.
135 */
136 private static String ACCEPTABLE_FROM = "//S+@//S+//.//S+";
137
138
139 /*** Construct a CrawlOrder.
140 */
141 public CrawlOrder() {
142 super(ATTR_NAME, "Heritrix crawl order. This forms the root of " +
143 "the settings framework.");
144 Type e;
145
146 e = addElementToDefinition(new SimpleType(ATTR_SETTINGS_DIRECTORY,
147 "Directory where override settings are kept. The settings " +
148 "for many modules can be overridden based on the domain or " +
149 "subdomain of the URI being processed. This setting specifies" +
150 " a file level directory to store those settings. The path" +
151 " is relative to 'disk-path' unless" +
152 " an absolute path is provided.", "settings"));
153 e.setOverrideable(false);
154 e.setExpertSetting(true);
155
156 e = addElementToDefinition(new SimpleType(ATTR_DISK_PATH,
157 "Directory where logs, arcs and other run time files will " +
158 "be kept. If this path is a relative path, it will be " +
159 "relative to the crawl order.", ""));
160 e.setOverrideable(false);
161 e.setExpertSetting(true);
162
163 e = addElementToDefinition(new SimpleType(ATTR_LOGS_PATH,
164 "Directory where crawler log files will be kept. If this path " +
165 "is a relative path, it will be relative to the 'disk-path'.",
166 "logs"));
167 e.setOverrideable(false);
168 e.setExpertSetting(true);
169
170 e = addElementToDefinition(new SimpleType(ATTR_CHECKPOINTS_PATH,
171 "Directory where crawler checkpoint files will be kept. " +
172 "If this path " +
173 "is a relative path, it will be relative to the 'disk-path'.",
174 "checkpoints"));
175 e.setOverrideable(false);
176 e.setExpertSetting(true);
177
178 e = addElementToDefinition(new SimpleType(ATTR_STATE_PATH,
179 "Directory where crawler-state files will be kept. If this path " +
180 "is a relative path, it will be relative to the 'disk-path'.",
181 "state"));
182 e.setOverrideable(false);
183 e.setExpertSetting(true);
184
185 e = addElementToDefinition(new SimpleType(ATTR_SCRATCH_PATH,
186 "Directory where discardable temporary files will be kept. " +
187 "If this path " +
188 "is a relative path, it will be relative to the 'disk-path'.",
189 "scratch"));
190 e.setOverrideable(false);
191 e.setExpertSetting(true);
192
193 e = addElementToDefinition(new SimpleType(ATTR_MAX_BYTES_DOWNLOAD,
194 "Maximum number of bytes to download. Once this number is" +
195 " exceeded the crawler will stop. " +
196 "A value of zero means no upper limit.", new Long(0)));
197 e.setOverrideable(false);
198
199 e = addElementToDefinition(new SimpleType(ATTR_MAX_DOCUMENT_DOWNLOAD,
200 "Maximum number of documents to download. Once this number" +
201 " is exceeded the crawler will stop. " +
202 "A value of zero means no upper limit.", new Long(0)));
203 e.setOverrideable(false);
204
205 e = addElementToDefinition(new SimpleType(ATTR_MAX_TIME_SEC,
206 "Maximum amount of time to crawl (in seconds). Once this" +
207 " much time has elapsed the crawler will stop. A value of" +
208 " zero means no upper limit.",
209 new Long(0)));
210 e.setOverrideable(false);
211
212 e = addElementToDefinition(new SimpleType(ATTR_MAX_TOE_THREADS,
213 "Maximum number of threads processing URIs at the same time.",
214 new Integer(100)));
215 e.setOverrideable(false);
216
217 e = addElementToDefinition(new SimpleType(ATTR_RECORDER_OUT_BUFFER,
218 "Size in bytes of in-memory buffer to record outbound " +
219 "traffic. One such buffer is reserved for every ToeThread.",
220 new Integer(4096)));
221 e.setOverrideable(false);
222 e.setExpertSetting(true);
223
224 e = addElementToDefinition(new SimpleType(ATTR_RECORDER_IN_BUFFER,
225 "Size in bytes of in-memory buffer to record inbound " +
226 "traffic. One such buffer is reserved for every ToeThread.",
227 new Integer(65536)));
228 e.setOverrideable(false);
229 e.setExpertSetting(true);
230
231 e = addElementToDefinition(new SimpleType(ATTR_BDB_CACHE_PERCENT,
232 "Percentage of heap to allocate to BerkeleyDB JE cache. " +
233 "Default of zero means no preference (accept BDB's default, " +
234 "usually 60%, or the je.maxMemoryPercent property value).",
235 DEFAULT_BDB_CACHE_PERCENT));
236 e.setExpertSetting(true);
237 e.setOverrideable(false);
238
239 e = addElementToDefinition(new SimpleType(ATTR_INDEPENDENT_EXTRACTORS,
240 "Whether an extractor's decision to run on a url is " +
241 "independent of other extractors. When set to false, " +
242 "most extractors will only run if no other " +
243 "extractor has run on the url.",
244 false));
245 e.setExpertSetting(true);
246 e.setOverrideable(false);
247
248 addElementToDefinition(new CrawlScope());
249
250 httpHeaders = (MapType) addElementToDefinition(new MapType(
251 ATTR_HTTP_HEADERS, "HTTP headers. Information that will " +
252 "be used when constructing the HTTP headers of " +
253 "the crawler's HTTP requests."));
254
255 e = httpHeaders.addElementToDefinition(new SimpleType(ATTR_USER_AGENT,
256 "User agent to act as. Field must contain valid URL " +
257 "that links to website of person or organization " +
258 "running the crawl. Replace 'PROJECT_URL_HERE' in " +
259 "initial template. E.g. If organization " +
260 "is Library of Congress, a valid user agent would be:" +
261 "'Mozilla/5.0 (compatible; loc-crawler/0.11.0 " +
262 "+http://loc.gov)'. " +
263 "Note, you must preserve the '+' before the 'http'.",
264 "Mozilla/5.0 (compatible; heritrix/@VERSION@ +PROJECT_URL_HERE)"));
265
266 e = httpHeaders.addElementToDefinition(new SimpleType(ATTR_FROM,
267 "Contact information. This field must contain a valid " +
268 "e-mail address for the person or organization responsible" +
269 "for this crawl: e.g. 'webmaster@loc.gov'",
270 "CONTACT_EMAIL_ADDRESS_HERE"));
271
272 addElementToDefinition(new RobotsHonoringPolicy());
273
274 e = addElementToDefinition(new ModuleType(
275 Frontier.ATTR_NAME, "Frontier"));
276 e.setLegalValueType(Frontier.class);
277
278 e = (MapType) addElementToDefinition(new MapType(ATTR_RULES,
279 "Ordered list of url canonicalization rules. " +
280 "Rules are applied in the order listed from top to bottom.",
281 BaseRule.class));
282 e.setOverrideable(true);
283 e.setExpertSetting(true);
284
285 e = addElementToDefinition(new MapType(
286 ATTR_PRE_FETCH_PROCESSORS, "Processors to run prior to" +
287 " fetching anything from the network.",
288 Processor.class));
289 e.setOverrideable(false);
290
291 e = addElementToDefinition(new MapType(
292 ATTR_FETCH_PROCESSORS, "Processors that fetch documents."
293 , Processor.class));
294 e.setOverrideable(false);
295
296 e = addElementToDefinition(new MapType(
297 ATTR_EXTRACT_PROCESSORS, "Processors that extract new URIs" +
298 " from fetched documents.", Processor.class));
299 e.setOverrideable(false);
300
301 e = addElementToDefinition(new MapType(
302 ATTR_WRITE_PROCESSORS, "Processors that write documents" +
303 " to archives.", Processor.class));
304 e.setOverrideable(false);
305
306 e = addElementToDefinition(new MapType(
307 ATTR_POST_PROCESSORS, "Processors that do cleanup and feed" +
308 " the frontier with new URIs.", Processor.class));
309 e.setOverrideable(false);
310
311 loggers = (MapType) addElementToDefinition(new MapType(ATTR_LOGGERS,
312 "Statistics tracking modules. Any number of specialized " +
313 "statistics tracker that monitor a crawl and write logs, " +
314 "reports and/or provide information to the user interface."));
315
316 e = addElementToDefinition(new SimpleType(ATTR_RECOVER_PATH,
317 "Optional. Points at recover log (or recover.gz log) OR " +
318 "the checkpoint directory to use recovering a crawl.", ""));
319 e.setOverrideable(false);
320 e.setExpertSetting(true);
321
322 e = addElementToDefinition(new SimpleType(
323 ATTR_CHECKPOINT_COPY_BDBJE_LOGS,
324 "When true, on a checkpoint, we copy off the bdbje log files to " +
325 "the checkpoint directory. To recover a checkpoint, just " +
326 "set the " + ATTR_RECOVER_PATH + " to point at the checkpoint " +
327 "directory to recover. This is default setting. " +
328 "But if crawl is large, " +
329 "copying bdbje log files can take tens of minutes and even " +
330 "upwards of an hour (Copying bdbje log files will consume bulk " +
331 "of time checkpointing). If this setting is false, we do NOT copy " +
332 "bdbje logs on checkpoint AND we set bdbje to NEVER delete log " +
333 "files (instead we have it rename files-to-delete with a '.del'" +
334 "extension). Assumption is that when this setting is false, " +
335 "an external process is managing the removal of bdbje log files " +
336 "and that come time to recover from a checkpoint, the files that " +
337 "comprise a checkpoint are manually assembled. This is an expert " +
338 "setting.",
339 DEFAULT_CHECKPOINT_COPY_BDBJE_LOGS));
340 e.setOverrideable(false);
341 e.setExpertSetting(true);
342
343 e = addElementToDefinition(new SimpleType(ATTR_RECOVER_RETAIN_FAILURES,
344 "When recovering via the recover.log, should failures " +
345 "in the log be retained in the recovered crawl, " +
346 "preventing the corresponding URIs from being retried. " +
347 "Default is false, meaning failures are forgotten, and " +
348 "the corresponding URIs will be retried in the recovered " +
349 "crawl.", Boolean.FALSE));
350 e.setOverrideable(false);
351 e.setExpertSetting(true);
352 e = addElementToDefinition(new SimpleType(ATTR_RECOVER_SCOPE_INCLUDES,
353 "When recovering via the recover.log, should URIs " +
354 "be checked against scope before considered included " +
355 "during the first phase which primes the already-seen " +
356 "set. " +
357 "Default is true, meaning scope changes in a recovered " +
358 "crawl can slim the already-seen size. ", Boolean.TRUE));
359 e.setOverrideable(false);
360 e.setExpertSetting(true);
361 e = addElementToDefinition(new SimpleType(ATTR_RECOVER_SCOPE_ENQUEUES,
362 "When recovering via the recover.log, should URIs " +
363 "be checked against scope before reenqueued during " +
364 "the second phase which fills the to-be-fetched queues. " +
365 "Default is true, meaning scope changes in a recovered " +
366 "crawl can slim the pending queues. ", Boolean.TRUE));
367 e.setOverrideable(false);
368 e.setExpertSetting(true);
369
370
371 e = addElementToDefinition(
372 new CredentialStore(CredentialStore.ATTR_NAME));
373 e.setOverrideable(true);
374 e.setExpertSetting(true);
375 }
376
377 /***
378 * @param curi
379 * @return user-agent header value to use
380 */
381 public String getUserAgent(CrawlURI curi) {
382 return ((String) httpHeaders.getUncheckedAttribute(curi, ATTR_USER_AGENT));
383 }
384
385 /***
386 * @param curi
387 * @return from header value to use
388 */
389 public String getFrom(CrawlURI curi) {
390 String res = null;
391 try {
392 res = (String) httpHeaders.getAttribute(ATTR_FROM, curi);
393 } catch (AttributeNotFoundException e) {
394 logger.severe(e.getMessage());
395 }
396 return res;
397 }
398
399 /***
400 * Returns the set number of maximum toe threads.
401 * @return Number of maximum toe threads
402 */
403 public int getMaxToes() {
404 Integer res = null;
405 try {
406 res = (Integer) getAttribute(null, ATTR_MAX_TOE_THREADS);
407 } catch (AttributeNotFoundException e) {
408 logger.severe(e.getMessage());
409 }
410 return res.intValue();
411 }
412
413 /***
414 * This method gets the RobotsHonoringPolicy object from the orders file.
415 *
416 * @return the new RobotsHonoringPolicy
417 */
418 public RobotsHonoringPolicy getRobotsHonoringPolicy() {
419 try {
420 return (RobotsHonoringPolicy) getAttribute(null, RobotsHonoringPolicy.ATTR_NAME);
421 } catch (AttributeNotFoundException e) {
422 logger.severe(e.getMessage());
423 return null;
424 }
425 }
426
427 /*** Get the name of the order file.
428 *
429 * @return the name of the order file.
430 */
431 public String getCrawlOrderName() {
432 return getSettingsHandler().getSettingsObject(null).getName();
433 }
434
435 /***
436 * @return The crawl controller.
437 */
438 public CrawlController getController() {
439 return controller;
440 }
441
442 /***
443 * @param controller
444 */
445 public void setController(CrawlController controller) {
446 this.controller = controller;
447 }
448
449 /***
450 * Returns the Map of the StatisticsTracking modules that are included in the
451 * configuration that the current instance of this class is representing.
452 * @return Map of the StatisticsTracking modules
453 */
454 public MapType getLoggers() {
455 return loggers;
456 }
457
458 /***
459 * Checks if the User Agent and From field are set 'correctly' in
460 * the specified Crawl Order.
461 *
462 * @throws FatalConfigurationException
463 */
464 public void checkUserAgentAndFrom() throws FatalConfigurationException {
465
466 String userAgent = this.getUserAgent(null);
467 String from = this.getFrom(null);
468 if (!(userAgent.matches(ACCEPTABLE_USER_AGENT)
469 && from.matches(ACCEPTABLE_FROM))) {
470 throw new FatalConfigurationException("unacceptable 'user-agent' " +
471 " or 'from' (correct your configuration).");
472 }
473 }
474
475 /***
476 * @return Checkpoint directory.
477 */
478 public File getCheckpointsDirectory() {
479 try {
480 return getDirectoryRelativeToDiskPath((String) getAttribute(null,
481 CrawlOrder.ATTR_CHECKPOINTS_PATH));
482 } catch (AttributeNotFoundException e) {
483
484 e.printStackTrace();
485 return null;
486 }
487 }
488
489 private File getDirectoryRelativeToDiskPath(String subpath) {
490 File disk;
491 try {
492 disk = getSettingsHandler().getPathRelativeToWorkingDirectory(
493 (String) getAttribute(null, CrawlOrder.ATTR_DISK_PATH));
494 return new File(disk, subpath);
495 } catch (AttributeNotFoundException e) {
496
497 e.printStackTrace();
498 return null;
499 }
500 }
501
502 /***
503 * Return fullpath to the directory named by <code>key</code>
504 * in settings.
505 * If directory does not exist, it and all intermediary dirs
506 * will be created.
507 * @param key Key to use going to settings.
508 * @return Full path to directory named by <code>key</code>.
509 * @throws AttributeNotFoundException
510 */
511 public File getSettingsDir(String key)
512 throws AttributeNotFoundException {
513 String path = (String)getAttribute(null, key);
514 File f = new File(path);
515 if (!f.isAbsolute()) {
516 f = getDirectoryRelativeToDiskPath(path);
517 }
518 if (!f.exists()) {
519 f.mkdirs();
520 }
521 return f;
522 }
523
524
525 }