1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25 package org.archive.crawler.settings;
26
27 import java.io.File;
28 import java.lang.reflect.Constructor;
29 import java.lang.reflect.InvocationTargetException;
30 import java.text.ParseException;
31 import java.util.Collection;
32 import java.util.Collections;
33 import java.util.HashMap;
34 import java.util.HashSet;
35 import java.util.Iterator;
36 import java.util.List;
37 import java.util.Map;
38 import java.util.Set;
39 import java.util.logging.Level;
40
41 import javax.management.AttributeNotFoundException;
42 import javax.management.InvalidAttributeValueException;
43
44 import org.archive.crawler.datamodel.CrawlOrder;
45 import org.archive.crawler.framework.Checkpointer;
46 import org.archive.crawler.framework.ToeThread;
47 import org.archive.crawler.settings.refinements.Refinement;
48 import org.archive.net.UURI;
49 import org.archive.util.ArchiveUtils;
50
51 /*** An instance of this class holds a hierarchy of settings.
52 *
53 * More than one instance in memory is allowed so that a new CrawlJob could
54 * be configured while another job is running.
55 *
56 * This class should be subclassed to adapt to a persistent storage.
57 *
58 * @author John Erik Halse
59 */
60 public abstract class SettingsHandler {
/** Cache of the CrawlerSettings objects managed by this handler. */
private SettingsCache settingsCache =
    new SettingsCache(new CrawlerSettings(this, null));

/** The order module; created by the constructor, dropped by cleanup(). */
private CrawlOrder order;

/** Handlers notified by fireValueErrorHandlers; wrapped in a synchronized set. */
private Set<ValueErrorHandler> valueErrorHandlers =
    Collections.synchronizedSet(new HashSet<ValueErrorHandler>());

/** Failed checks below this level value are not passed on to the handlers. */
private int errorReportingLevel = Level.ALL.intValue();

/* Names of the data types supported by the settings framework. */
final static String INTEGER = "integer";
final static String LONG = "long";
final static String FLOAT = "float";
final static String DOUBLE = "double";
final static String BOOLEAN = "boolean";
final static String STRING = "string";
final static String TEXT = "text";
final static String OBJECT = "object";
final static String TIMESTAMP = "timestamp";
final static String MAP = "map";
final static String INTEGER_LIST = "integerList";
final static String LONG_LIST = "longList";
final static String FLOAT_LIST = "floatList";
final static String DOUBLE_LIST = "doubleList";
final static String STRING_LIST = "stringList";

/** Rows of { type name, fully qualified class name }. */
private final static String[][] names = {
    { INTEGER, "java.lang.Integer" },
    { LONG, "java.lang.Long" },
    { FLOAT, "java.lang.Float" },
    { DOUBLE, "java.lang.Double" },
    { BOOLEAN, "java.lang.Boolean" },
    { STRING, "java.lang.String" },
    { TEXT, "org.archive.crawler.settings.TextField" },
    { OBJECT, "org.archive.crawler.settings.ModuleType" },
    { TIMESTAMP, "java.util.Date" },
    { MAP, "org.archive.crawler.settings.MapType" },
    { INTEGER_LIST, "org.archive.crawler.settings.IntegerList" },
    { LONG_LIST, "org.archive.crawler.settings.LongList" },
    { FLOAT_LIST, "org.archive.crawler.settings.FloatList" },
    { DOUBLE_LIST, "org.archive.crawler.settings.DoubleList" },
    { STRING_LIST, "org.archive.crawler.settings.StringList" }
};

/** Lookup from type name to class name, filled from {@code names}. */
private final static Map<String, String> name2class =
    new HashMap<String, String>();

/** Lookup from class name to type name, filled from {@code names}. */
private final static Map<String, String> class2name =
    new HashMap<String, String>();

static {
    // Populate both directions of the mapping from the single table above.
    for (String[] pair : names) {
        final String typeName = pair[0];
        final String className = pair[1];
        name2class.put(typeName, className);
        class2name.put(className, typeName);
    }
}
115
116 /*** Create a new SettingsHandler object.
117 *
118 * @throws InvalidAttributeValueException
119 */
120 public SettingsHandler() throws InvalidAttributeValueException {
121 order = new CrawlOrder();
122 order.setAsOrder(this);
123 }
124
125 /*** Initialize the SettingsHandler.
126 *
127 * This method reads the default settings from the persistent storage.
128 */
129 public void initialize() {
130 readSettingsObject(settingsCache.getGlobalSettings());
131 }
132
133 public void cleanup() {
134 this.settingsCache = null;
135 if (this.order != null) {
136 this.order.setController(null);
137 }
138 this.order = null;
139 }
140
141 /*** Strip off the leftmost part of a domain name.
142 *
143 * @param scope the domain name.
144 * @return scope with everything before the first dot ripped off.
145 */
146 protected String getParentScope(String scope) {
147 int split = scope.indexOf('.');
148 return (split == -1)? null: scope.substring(split + 1);
149 }
150
151 /*** Get a module by name.
152 *
153 * All modules in the order should have unique names. This method makes it
154 * possible to get the modules of the order by its name.
155 *
156 * @param name the modules name.
157 * @return the module the name references.
158 */
159 public ModuleType getModule(String name) {
160 return settingsCache.getGlobalSettings().getModule(name);
161 }
162
163 /*** Get a complex type by its absolute name.
164 *
165 * The absolute name is the complex types name and the path leading to
166 * it.
167 *
168 * @param settings the settings object to query.
169 * @param absoluteName the absolute name of the complex type to get.
170 * @return the complex type referenced by the absolute name or null if
171 * the complex type could not be found in this settings object.
172 * @throws AttributeNotFoundException is thrown if no ComplexType by this
173 * name exist.
174 */
175 public ComplexType getComplexTypeByAbsoluteName(
176 CrawlerSettings settings, String absoluteName)
177 throws AttributeNotFoundException {
178
179 settings = settings == null ? settingsCache.getGlobalSettings() : settings;
180
181 DataContainer data = settings.getData(absoluteName);
182 if (data == null) {
183 CrawlerSettings parentSettings = settings.getParent();
184 if (parentSettings == null) {
185 throw new AttributeNotFoundException(absoluteName);
186 }
187 return getComplexTypeByAbsoluteName(parentSettings, absoluteName);
188 }
189 return data.getComplexType();
190 }
191
192 protected static String getTypeName(String className) {
193 return (String) class2name.get(className);
194 }
195
196 protected static String getClassName(String typeName) {
197 return (String) name2class.get(typeName);
198 }
199
200 /*** Convert a String object to an object of <code>typeName</code>.
201 *
202 * @param stringValue string to convert.
203 * @param typeName type to convert to. typeName should be one of the
204 * supported types represented by constants in this class.
205 * @return the new value object.
206 * @throws ClassCastException is thrown if string could not be converted.
207 */
208 protected static Object StringToType(String stringValue, String typeName) {
209 Object value;
210 if (typeName == SettingsHandler.STRING) {
211 value = stringValue;
212 } else if (typeName == SettingsHandler.TEXT) {
213 value = new TextField(stringValue);
214 } else if (typeName == SettingsHandler.INTEGER) {
215 value = Integer.decode(stringValue);
216 } else if (typeName == SettingsHandler.LONG) {
217 value = Long.decode(stringValue);
218 } else if (typeName == SettingsHandler.BOOLEAN) {
219 value = Boolean.valueOf(stringValue);
220 } else if (typeName == SettingsHandler.DOUBLE) {
221 value = Double.valueOf(stringValue);
222 } else if (typeName == SettingsHandler.FLOAT) {
223 value = Float.valueOf(stringValue);
224 } else if (typeName == SettingsHandler.TIMESTAMP) {
225 try {
226 value = ArchiveUtils.parse14DigitDate(stringValue);
227 } catch (ParseException e) {
228 throw new ClassCastException(
229 "Cannot convert '"
230 + stringValue
231 + "' to type '"
232 + typeName
233 + "'");
234 }
235 } else {
236 throw new ClassCastException(
237 "Cannot convert '"
238 + stringValue
239 + "' to type '"
240 + typeName
241 + "'");
242 }
243 return value;
244 }
245
246 /*** Get CrawlerSettings object in effect for a host or domain.
247 *
248 * If there is no specific settings for the host/domain, it will recursively
249 * go up the hierarchy to find the settings object that should be used for
250 * this host/domain.
251 *
252 * @param host the host or domain to get the settings for.
253 * @return settings object in effect for the host/domain.
254 * @see #getSettingsObject(String)
255 * @see #getOrCreateSettingsObject(String)
256 */
257 public CrawlerSettings getSettings(String host) {
258 return getRefinementsForSettings(getSettingsForHost(host), null);
259 }
260
261 /*** Get CrawlerSettings object in effect for a host or domain.
262 *
263 * If there is no specific settings for the host/domain, it will recursively
264 * go up the hierarchy to find the settings object that should be used for
265 * this host/domain.
266 * <p/>
267 * This method passes around a URI that refinement are checked against.
268 *
269 * @param host the host or domain to get the settings for.
270 * @param uuri UURI for context.
271 * @return settings object in effect for the host/domain.
272 * @see #getSettingsObject(String)
273 * @see #getOrCreateSettingsObject(String)
274 */
275 public CrawlerSettings getSettings(String host, UURI uuri) {
276 return getRefinementsForSettings(getSettingsForHost(host), uuri);
277 }
278
279 protected CrawlerSettings getSettingsForHost(String host) {
280 CrawlerSettings settings = settingsCache.getSettings(host, null);
281
282 if (settings == null) {
283 String tmpHost = host;
284 settings = getSettingsObject(tmpHost);
285 while (settings == null && tmpHost != null) {
286 tmpHost = getParentScope(tmpHost);
287 settings = getSettingsObject(tmpHost);
288 }
289
290 settingsCache.putSettings(host, settings);
291 }
292
293 return settings;
294 }
295
296 private CrawlerSettings getRefinementsForSettings(CrawlerSettings settings,
297 UURI uri) {
298 if (settings.hasRefinements()) {
299 for(Iterator it = settings.refinementsIterator(); it.hasNext();) {
300 Refinement refinement = (Refinement) it.next();
301 if (refinement.isWithinRefinementBounds(uri)) {
302 settings = getSettingsObject(settings.getScope(),
303 refinement.getReference());
304 }
305 }
306 }
307
308 return settings;
309 }
310
311 /*** Get CrawlerSettings object for a host or domain.
312 *
313 * The difference between this method and the
314 * <code>getSettings(String host)</code> is that this method will return
315 * null if there is no settings for particular host or domain.
316 *
317 * @param scope the host or domain to get the settings for.
318 * @return settings object for the host/domain or null if no
319 * settings exist for the host/domain.
320 * @see #getSettings(String)
321 * @see #getOrCreateSettingsObject(String)
322 */
323 public CrawlerSettings getSettingsObject(String scope) {
324 return getSettingsObject(scope, null);
325 }
326
327 /***
328 * Get CrawlerSettings object for a host/domain and a particular refinement.
329 *
330 * @param scope the host or domain to get the settings for.
331 * @param refinement the refinement reference to get.
332 * @return CrawlerSettings object for a host/domain and a particular
333 * refinement or null if no settings exist for the host/domain.
334 */
335 public CrawlerSettings getSettingsObject(String scope, String refinement) {
336 CrawlerSettings settings =
337 settingsCache.getSettingsObject(scope, refinement);
338
339 if (settings == null) {
340
341 settings = new CrawlerSettings(this, scope, refinement);
342
343
344 settings = readSettingsObject(settings);
345 if (settings != null) {
346 settingsCache.putSettings(scope, settings);
347 }
348 }
349 return settings;
350 }
351
352 /*** Get or create CrawlerSettings object for a host or domain.
353 *
354 * This method is similar to {@link #getSettingsObject(String)} except that
355 * if there is no settings for this particular host or domain a new settings
356 * object will be returned.
357 *
358 * @param scope the host or domain to get or create the settings for.
359 * @return settings object for the host/domain.
360 * @see #getSettings(String)
361 * @see #getSettingsObject(String)
362 */
363 public CrawlerSettings getOrCreateSettingsObject(String scope) {
364 return getOrCreateSettingsObject(scope, null);
365 }
366
367 public CrawlerSettings getOrCreateSettingsObject(String scope,
368 String refinement) {
369 CrawlerSettings settings;
370 settings = getSettingsObject(scope, refinement);
371 if (settings == null) {
372 scope = scope.intern();
373
374
375 settings = new CrawlerSettings(this, scope, refinement);
376 settingsCache.refreshHostToSettings();
377 settingsCache.putSettings(scope, settings);
378 }
379 return settings;
380 }
381
/**
 * Write the CrawlerSettings object to persistent storage.
 *
 * Abstract: the concrete storage is supplied by the subclass.
 *
 * @param settings the settings object to write.
 */
public abstract void writeSettingsObject(CrawlerSettings settings);
387
/**
 * Read the CrawlerSettings object from persistent storage.
 *
 * Abstract: the concrete storage is supplied by the subclass. Callers in
 * this class treat a <code>null</code> result as "nothing stored".
 *
 * @param settings the settings object to be updated with data from the
 *        persistent storage.
 * @return the updated settings object or null if there was no data for this
 *         in the persistent storage.
 */
protected abstract CrawlerSettings readSettingsObject(CrawlerSettings settings);
396
397 /*** Delete a settings object from persistent storage.
398 *
399 * @param settings the settings object to delete.
400 */
401 public void deleteSettingsObject(CrawlerSettings settings) {
402 settingsCache.deleteSettingsObject(settings);
403 }
404
405 /*** Get the CrawlOrder.
406 *
407 * @return the CrawlOrder
408 */
409 public CrawlOrder getOrder() {
410 return order;
411 }
412
413 /*** Instatiate a new ModuleType given its name and className.
414 *
415 * @param name the name for the new ComplexType.
416 * @param className the class name of the new ComplexType.
417 * @return an instance of the class identified by className.
418 *
419 * @throws InvocationTargetException
420 */
421 @SuppressWarnings("unchecked")
422 public static ModuleType instantiateModuleTypeFromClassName(
423 String name, String className)
424 throws InvocationTargetException {
425
426 Class cl;
427 try {
428 cl = Class.forName(className);
429 } catch (ClassNotFoundException e) {
430 throw new InvocationTargetException(e);
431 }
432
433 ModuleType module;
434 try {
435 Constructor co =
436 cl.getConstructor(new Class[] { String.class });
437 module = (ModuleType) co.newInstance(new Object[] { name });
438 } catch (IllegalArgumentException e) {
439 throw new InvocationTargetException(e);
440 } catch (InstantiationException e) {
441 throw new InvocationTargetException(e);
442 } catch (IllegalAccessException e) {
443 throw new InvocationTargetException(e);
444 } catch (SecurityException e) {
445 throw new InvocationTargetException(e);
446 } catch (NoSuchMethodException e) {
447 throw new InvocationTargetException(e);
448 }
449 return module;
450 }
451
/**
 * Transforms a relative path so that it is relative to a location that is
 * regarded as a working dir for these settings. If an absolute path is given,
 * it will be returned unchanged.
 *
 * Abstract: which location counts as the working directory is decided by
 * the subclass.
 *
 * @param path A relative path to a file (or directory)
 * @return The same path modified so that it is relative to the file level
 *         location that is considered the working directory for these settings.
 */
public abstract File getPathRelativeToWorkingDirectory(String path);
461
/**
 * Will return a Collection of strings with domains that contain 'per'
 * domain overrides (or their subdomains contain them).
 *
 * The domains considered are
 * limited to those that are subdomains of the supplied domain. If null or
 * empty string is supplied the TLDs will be considered.
 *
 * Abstract: the lookup is implemented by the subclass.
 *
 * @param rootDomain The domain to get domain overrides for. Examples:
 *        'org', 'archive.org', 'crawler.archive.org' etc.
 * @return A collection of domains that contain overrides. If rootDomain
 *         does not exist an empty collection will be returned.
 */
public abstract Collection getDomainOverrides(String rootDomain);
475
476 /***
477 * Unregister an instance of {@link ValueErrorHandler}.
478 *
479 * @param errorHandler the <code>CalueErrorHandler</code> to unregister.
480 *
481 * @see ValueErrorHandler
482 * @see #setErrorReportingLevel(Level)
483 * @see #registerValueErrorHandler(ValueErrorHandler)
484 *
485 */
486 public void unregisterValueErrorHandler(ValueErrorHandler errorHandler) {
487 valueErrorHandlers.remove(errorHandler);
488 }
489
490 /***
491 * Register an instance of {@link ValueErrorHandler}.
492 * <p>
493 * If a ValueErrorHandler is registered, only constraints with level
494 * {@link Level#SEVERE}will throw an {@link InvalidAttributeValueException}.
495 * The ValueErrorHandler will recieve a notification for all failed checks
496 * with level equal or greater than the error reporting level.
497 *
498 * @param errorHandler the <code>CalueErrorHandler</code> to register.
499 *
500 * @see ValueErrorHandler
501 * @see #setErrorReportingLevel(Level)
502 * @see #unregisterValueErrorHandler(ValueErrorHandler)
503 */
504 public void registerValueErrorHandler(ValueErrorHandler errorHandler) {
505 if (errorHandler != null) {
506 valueErrorHandlers.add(errorHandler);
507 }
508 }
509
510 /***
511 * Fire events on all registered {@link ValueErrorHandler}.
512 *
513 * @param error the failed constraints return value.
514 * @return true if there was any registered ValueErrorHandlers to notify.
515 */
516 boolean fireValueErrorHandlers(Constraint.FailedCheck error) {
517 if (error.getLevel().intValue() >= errorReportingLevel) {
518 for (Iterator it = valueErrorHandlers.iterator(); it.hasNext();) {
519 ((ValueErrorHandler) it.next()).handleValueError(error);
520 }
521 }
522 return valueErrorHandlers.size() > 0;
523 }
524
525 /***
526 * Set the level for which notification of failed constraints will be fired.
527 *
528 * @param level the error reporting level.
529 */
530 public void setErrorReportingLevel(Level level) {
531 errorReportingLevel = level.intValue();
532 }
533
/**
 * Creates and returns a <tt>List</tt> of all files comprising the current
 * settings framework.
 *
 * <p>The List contains the absolute String path of each file.
 *
 * <p>The list should contain any configurable files, including such files
 * as seed file and any other files used by the various settings modules.
 *
 * <p>Implementations of the SettingsHandler that do not use files for
 * permanent storage should return an empty list.
 *
 * @return <code>List</code> of framework files.
 */
public abstract List getListOfAllFiles();
548
549 /***
550 * Clear any per-host settings cached in memory; allows editting of
551 * per-host settings files on disk, perhaps in bulk/automated fashion,
552 * to take effect in running crawl.
553 */
554 public void clearPerHostSettingsCache() {
555 settingsCache.clear();
556 }
557
558 static ThreadLocal<SettingsHandler> threadContextSettingsHandler =
559 new ThreadLocal<SettingsHandler>();
560 public static void setThreadContextSettingsHandler(SettingsHandler settingsHandler) {
561 threadContextSettingsHandler.set(settingsHandler);
562 }
563 public static SettingsHandler getThreadContextSettingsHandler() {
564 Thread t = Thread.currentThread();
565 if (t instanceof Checkpointer.CheckpointingThread) {
566 return ((Checkpointer.CheckpointingThread)t)
567 .getController().getSettingsHandler();
568 }
569 if (t instanceof ToeThread) {
570 return ((ToeThread) Thread.currentThread())
571 .getController().getSettingsHandler();
572 }
573 if(threadContextSettingsHandler.get()!=null) {
574 return threadContextSettingsHandler.get();
575 }
576
577
578
579
580
581
582
583
584
585 return null;
586 }
587 }