View Javadoc

1   /* SettingsHandler
2    *
3    * $Id: SettingsHandler.java 6703 2009-11-25 01:28:49Z gojomo $
4    *
5    * Created on Dec 16, 2003
6    *
7    * Copyright (C) 2004 Internet Archive.
8    *
9    * This file is part of the Heritrix web crawler (crawler.archive.org).
10   *
11   * Heritrix is free software; you can redistribute it and/or modify
12   * it under the terms of the GNU Lesser Public License as published by
13   * the Free Software Foundation; either version 2.1 of the License, or
14   * any later version.
15   *
16   * Heritrix is distributed in the hope that it will be useful,
17   * but WITHOUT ANY WARRANTY; without even the implied warranty of
18   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19   * GNU Lesser Public License for more details.
20   *
21   * You should have received a copy of the GNU Lesser Public License
22   * along with Heritrix; if not, write to the Free Software
23   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
24   */
25  package org.archive.crawler.settings;
26  
27  import java.io.File;
28  import java.lang.reflect.Constructor;
29  import java.lang.reflect.InvocationTargetException;
30  import java.text.ParseException;
31  import java.util.Collection;
32  import java.util.Collections;
33  import java.util.HashMap;
34  import java.util.HashSet;
35  import java.util.Iterator;
36  import java.util.List;
37  import java.util.Map;
38  import java.util.Set;
39  import java.util.logging.Level;
40  
41  import javax.management.AttributeNotFoundException;
42  import javax.management.InvalidAttributeValueException;
43  
44  import org.archive.crawler.datamodel.CrawlOrder;
45  import org.archive.crawler.framework.Checkpointer;
46  import org.archive.crawler.framework.ToeThread;
47  import org.archive.crawler.settings.refinements.Refinement;
48  import org.archive.net.UURI;
49  import org.archive.util.ArchiveUtils;
50  
51  /*** An instance of this class holds a hierarchy of settings.
52   *
53   * More than one instance in memory is allowed so that a new CrawlJob could
54   * be configured while another job is running.
55   *
56   * This class should be subclassed to adapt to a persistent storage.
57   *
58   * @author John Erik Halse
59   */
60  public abstract class SettingsHandler {
61      /*** Cached CrawlerSettings objects */
62      private SettingsCache settingsCache =
63          new SettingsCache(new CrawlerSettings(this, null));
64  
65      /*** Reference to the order module */
66      private CrawlOrder order;
67  
68      private Set<ValueErrorHandler> valueErrorHandlers 
69       = Collections.synchronizedSet(new HashSet<ValueErrorHandler>());
70      private int errorReportingLevel = Level.ALL.intValue();
71  
72      /*** Datatypes supported by the settings framwork */
73      final static String INTEGER = "integer";
74      final static String LONG = "long";
75      final static String FLOAT = "float";
76      final static String DOUBLE = "double";
77      final static String BOOLEAN = "boolean";
78      final static String STRING = "string";
79      final static String TEXT = "text";
80      final static String OBJECT = "object";
81      final static String TIMESTAMP = "timestamp";
82      final static String MAP = "map";
83      final static String INTEGER_LIST = "integerList";
84      final static String LONG_LIST = "longList";
85      final static String FLOAT_LIST = "floatList";
86      final static String DOUBLE_LIST = "doubleList";
87      final static String STRING_LIST = "stringList";
88      private final static String names[][] = new String[][] {
89              { INTEGER, "java.lang.Integer"},
90              { LONG, "java.lang.Long"},
91              { FLOAT, "java.lang.Float"},
92              { DOUBLE, "java.lang.Double"},
93              { BOOLEAN, "java.lang.Boolean"},
94              { STRING, "java.lang.String"},
95              { TEXT, "org.archive.crawler.settings.TextField"},
96              { OBJECT, "org.archive.crawler.settings.ModuleType"},
97              { TIMESTAMP, "java.util.Date"},
98              { MAP, "org.archive.crawler.settings.MapType"},
99              { INTEGER_LIST,
100                     "org.archive.crawler.settings.IntegerList"},
101             { LONG_LIST, "org.archive.crawler.settings.LongList"},
102             { FLOAT_LIST, "org.archive.crawler.settings.FloatList"},
103             { DOUBLE_LIST, "org.archive.crawler.settings.DoubleList"},
104             { STRING_LIST, "org.archive.crawler.settings.StringList"}};
105     private final static Map<String,String> name2class
106      = new HashMap<String,String>();
107     private final static Map<String,String> class2name
108      = new HashMap<String,String>();
109     static {
110         for (int i = 0; i < names.length; i++) {
111             name2class.put(names[i][0], names[i][1]);
112             class2name.put(names[i][1], names[i][0]);
113         }
114     }
115 
116     /*** Create a new SettingsHandler object.
117      *
118      * @throws InvalidAttributeValueException
119      */
120     public SettingsHandler() throws InvalidAttributeValueException {
121         order = new CrawlOrder();
122         order.setAsOrder(this);
123     }
124 
125     /*** Initialize the SettingsHandler.
126      *
127      * This method reads the default settings from the persistent storage.
128      */
129     public void initialize() {
130         readSettingsObject(settingsCache.getGlobalSettings());
131     }
132     
133     public void cleanup() {
134         this.settingsCache = null;
135         if (this.order != null) {
136             this.order.setController(null);
137         }
138         this.order =  null;
139     }
140 
141     /*** Strip off the leftmost part of a domain name.
142      *
143      * @param scope the domain name.
144      * @return scope with everything before the first dot ripped off.
145      */
146     protected String getParentScope(String scope) {
147         int split = scope.indexOf('.');
148         return (split == -1)? null: scope.substring(split + 1);
149     }
150 
151     /*** Get a module by name.
152      *
153      * All modules in the order should have unique names. This method makes it
154      * possible to get the modules of the order by its name.
155      *
156      * @param name the modules name.
157      * @return the module the name references.
158      */
159     public ModuleType getModule(String name) {
160         return settingsCache.getGlobalSettings().getModule(name);
161     }
162 
163     /*** Get a complex type by its absolute name.
164      *
165      * The absolute name is the complex types name and the path leading to
166      * it.
167      *
168      * @param settings the settings object to query.
169      * @param absoluteName the absolute name of the complex type to get.
170      * @return the complex type referenced by the absolute name or null if
171      *         the complex type could not be found in this settings object.
172      * @throws AttributeNotFoundException is thrown if no ComplexType by this
173      *         name exist.
174      */
175     public ComplexType getComplexTypeByAbsoluteName(
176             CrawlerSettings settings, String absoluteName)
177             throws AttributeNotFoundException {
178 
179         settings = settings == null ? settingsCache.getGlobalSettings() : settings;
180 
181         DataContainer data = settings.getData(absoluteName);
182         if (data == null) {
183             CrawlerSettings parentSettings = settings.getParent();
184             if (parentSettings == null) {
185                 throw new AttributeNotFoundException(absoluteName);
186             }
187             return getComplexTypeByAbsoluteName(parentSettings, absoluteName);
188         }
189         return data.getComplexType();
190     }
191 
192     protected static String getTypeName(String className) {
193         return (String) class2name.get(className);
194     }
195 
196     protected static String getClassName(String typeName) {
197         return (String) name2class.get(typeName);
198     }
199 
200     /*** Convert a String object to an object of <code>typeName</code>.
201      *
202      * @param stringValue string to convert.
203      * @param typeName type to convert to. typeName should be one of the
204      *        supported types represented by constants in this class.
205      * @return the new value object.
206      * @throws ClassCastException is thrown if string could not be converted.
207      */
208     protected static Object StringToType(String stringValue, String typeName) {
209         Object value;
210         if (typeName == SettingsHandler.STRING) {
211             value = stringValue;
212         } else if (typeName == SettingsHandler.TEXT) {
213             value = new TextField(stringValue);
214         } else if (typeName == SettingsHandler.INTEGER) {
215             value = Integer.decode(stringValue);
216         } else if (typeName == SettingsHandler.LONG) {
217             value = Long.decode(stringValue);
218         } else if (typeName == SettingsHandler.BOOLEAN) {
219             value = Boolean.valueOf(stringValue);
220         } else if (typeName == SettingsHandler.DOUBLE) {
221             value = Double.valueOf(stringValue);
222         } else if (typeName == SettingsHandler.FLOAT) {
223             value = Float.valueOf(stringValue);
224         } else if (typeName == SettingsHandler.TIMESTAMP) {
225             try {
226                 value = ArchiveUtils.parse14DigitDate(stringValue);
227             } catch (ParseException e) {
228                 throw new ClassCastException(
229                     "Cannot convert '"
230                         + stringValue
231                         + "' to type '"
232                         + typeName
233                         + "'");
234             }
235         } else {
236             throw new ClassCastException(
237                 "Cannot convert '"
238                     + stringValue
239                     + "' to type '"
240                     + typeName
241                     + "'");
242         }
243         return value;
244     }
245 
246     /*** Get CrawlerSettings object in effect for a host or domain.
247      *
248      * If there is no specific settings for the host/domain, it will recursively
249      * go up the hierarchy to find the settings object that should be used for
250      * this host/domain.
251      *
252      * @param host the host or domain to get the settings for.
253      * @return settings object in effect for the host/domain.
254      * @see #getSettingsObject(String)
255      * @see #getOrCreateSettingsObject(String)
256      */
257     public CrawlerSettings getSettings(String host) {
258         return getRefinementsForSettings(getSettingsForHost(host), null);
259     }
260 
261     /*** Get CrawlerSettings object in effect for a host or domain.
262     *
263     * If there is no specific settings for the host/domain, it will recursively
264     * go up the hierarchy to find the settings object that should be used for
265     * this host/domain.
266     * <p/>
267     * This method passes around a URI that refinement are checked against.
268     *
269     * @param host the host or domain to get the settings for.
270     * @param uuri UURI for context.
271     * @return settings object in effect for the host/domain.
272     * @see #getSettingsObject(String)
273     * @see #getOrCreateSettingsObject(String)
274     */
275     public CrawlerSettings getSettings(String host, UURI uuri) {
276         return getRefinementsForSettings(getSettingsForHost(host), uuri);
277     }
278 
279     protected CrawlerSettings getSettingsForHost(String host) {
280         CrawlerSettings settings = settingsCache.getSettings(host, null);
281 
282         if (settings == null) {
283             String tmpHost = host;
284             settings = getSettingsObject(tmpHost);
285             while (settings == null && tmpHost != null) {
286                 tmpHost = getParentScope(tmpHost);
287                 settings = getSettingsObject(tmpHost);
288             }
289 
290             settingsCache.putSettings(host, settings);
291         }
292 
293         return settings;
294     }
295 
296     private CrawlerSettings getRefinementsForSettings(CrawlerSettings settings,
297             UURI uri) {
298         if (settings.hasRefinements()) {
299             for(Iterator it = settings.refinementsIterator(); it.hasNext();) {
300                 Refinement refinement = (Refinement) it.next();
301                 if (refinement.isWithinRefinementBounds(uri)) {
302                     settings = getSettingsObject(settings.getScope(),
303                             refinement.getReference());
304                 }
305             }
306         }
307 
308         return settings;
309     }
310 
311     /*** Get CrawlerSettings object for a host or domain.
312      *
313      * The difference between this method and the
314      * <code>getSettings(String host)</code> is that this method will return
315      * null if there is no settings for particular host or domain.
316      *
317      * @param scope the host or domain to get the settings for.
318      * @return settings object for the host/domain or null if no
319      *         settings exist for the host/domain.
320      * @see #getSettings(String)
321      * @see #getOrCreateSettingsObject(String)
322      */
323     public CrawlerSettings getSettingsObject(String scope) {
324         return getSettingsObject(scope, null);
325     }
326 
327     /***
328      * Get CrawlerSettings object for a host/domain and a particular refinement.
329      *
330      * @param scope the host or domain to get the settings for.
331      * @param refinement the refinement reference to get.
332      * @return CrawlerSettings object for a host/domain and a particular
333      * refinement or null if no settings exist for the host/domain.
334      */
335     public CrawlerSettings getSettingsObject(String scope, String refinement) {
336         CrawlerSettings settings =
337             settingsCache.getSettingsObject(scope, refinement);
338 
339         if (settings == null) {
340             // Reference not found
341             settings = new CrawlerSettings(this, scope, refinement);
342             // Try to read settings from persisten storage. If its not there
343             // it will be set to null.
344             settings = readSettingsObject(settings);
345             if (settings != null) {
346                 settingsCache.putSettings(scope, settings);
347             }
348         }
349         return settings;
350     }
351 
352     /*** Get or create CrawlerSettings object for a host or domain.
353      *
354      * This method is similar to {@link #getSettingsObject(String)} except that
355      * if there is no settings for this particular host or domain a new settings
356      * object will be returned.
357      *
358      * @param scope the host or domain to get or create the settings for.
359      * @return settings object for the host/domain.
360      * @see #getSettings(String)
361      * @see #getSettingsObject(String)
362      */
363     public CrawlerSettings getOrCreateSettingsObject(String scope) {
364         return getOrCreateSettingsObject(scope, null);
365     }
366 
367     public CrawlerSettings getOrCreateSettingsObject(String scope,
368             String refinement) {
369         CrawlerSettings settings;
370         settings = getSettingsObject(scope, refinement);
371         if (settings == null) {
372             scope = scope.intern();
373 
374             // No existing settings object found, create one
375             settings = new CrawlerSettings(this, scope, refinement);
376             settingsCache.refreshHostToSettings();
377             settingsCache.putSettings(scope, settings);
378         }
379         return settings;
380     }
381 
382     /*** Write the CrawlerSettings object to persistent storage.
383      *
384      * @param settings the settings object to write.
385      */
386     public abstract void writeSettingsObject(CrawlerSettings settings);
387 
388     /*** Read the CrawlerSettings object from persistent storage.
389      *
390      * @param settings the settings object to be updated with data from the
391      *                 persistent storage.
392      * @return the updated settings object or null if there was no data for this
393      *         in the persistent storage.
394      */
395     protected abstract CrawlerSettings readSettingsObject(CrawlerSettings settings);
396 
397     /*** Delete a settings object from persistent storage.
398      *
399      * @param settings the settings object to delete.
400      */
401     public void deleteSettingsObject(CrawlerSettings settings) {
402         settingsCache.deleteSettingsObject(settings);
403     }
404 
405     /*** Get the CrawlOrder.
406      *
407      * @return the CrawlOrder
408      */
409     public CrawlOrder getOrder() {
410         return order;
411     }
412 
413     /*** Instatiate a new ModuleType given its name and className.
414      *
415      * @param name the name for the new ComplexType.
416      * @param className the class name of the new ComplexType.
417      * @return an instance of the class identified by className.
418      *
419      * @throws InvocationTargetException
420      */
421     @SuppressWarnings("unchecked")
422     public static ModuleType instantiateModuleTypeFromClassName(
423             String name, String className)
424             throws InvocationTargetException {
425 
426         Class cl;
427         try {
428             cl = Class.forName(className);
429         } catch (ClassNotFoundException e) {
430             throw new InvocationTargetException(e);
431         }
432 
433         ModuleType module;
434         try {
435             Constructor co =
436                 cl.getConstructor(new Class[] { String.class });
437             module = (ModuleType) co.newInstance(new Object[] { name });
438         } catch (IllegalArgumentException e) {
439             throw new InvocationTargetException(e);
440         } catch (InstantiationException e) {
441             throw new InvocationTargetException(e);
442         } catch (IllegalAccessException e) {
443             throw new InvocationTargetException(e);
444         } catch (SecurityException e) {
445             throw new InvocationTargetException(e);
446         } catch (NoSuchMethodException e) {
447             throw new InvocationTargetException(e);
448         }
449         return module;
450     }
451 
452     /***
453      * Transforms a relative path so that it is relative to a location that is
454      * regarded as a working dir for these settings. If an absolute path is given,
455      * it will be returned unchanged.
456      * @param path A relative path to a file (or directory)
457      * @return The same path modified so that it is relative to the file level
458      *         location that is considered the working directory for these settings.
459      */
460     public abstract File getPathRelativeToWorkingDirectory(String path);
461 
462     /***
463      * Will return a Collection of strings with domains that contain 'per'
464      * domain overrides (or their subdomains contain them). 
465      * 
466      * The domains considered are
467      * limited to those that are subdomains of the supplied domain. If null or
468      * empty string is supplied the TLDs will be considered.
469      * @param rootDomain The domain to get domain overrides for. Examples:
470      *                   'org', 'archive.org', 'crawler.archive.org' etc.
471      * @return An array of domains that contain overrides. If rootDomain does not
472      *         exist an empty array will be returned.
473      */
474     public abstract Collection getDomainOverrides(String rootDomain);
475 
476     /***
477      * Unregister an instance of {@link ValueErrorHandler}.
478      *
479      * @param errorHandler the <code>CalueErrorHandler</code> to unregister.
480      *
481      * @see ValueErrorHandler
482      * @see #setErrorReportingLevel(Level)
483      * @see #registerValueErrorHandler(ValueErrorHandler)
484      *
485      */
486     public void unregisterValueErrorHandler(ValueErrorHandler errorHandler) {
487         valueErrorHandlers.remove(errorHandler);
488     }
489 
490     /***
491      * Register an instance of {@link ValueErrorHandler}.
492      * <p>
493      * If a ValueErrorHandler is registered, only constraints with level
494      * {@link Level#SEVERE}will throw an {@link InvalidAttributeValueException}.
495      * The ValueErrorHandler will recieve a notification for all failed checks
496      * with level equal or greater than the error reporting level.
497      *
498      * @param errorHandler the <code>CalueErrorHandler</code> to register.
499      *
500      * @see ValueErrorHandler
501      * @see #setErrorReportingLevel(Level)
502      * @see #unregisterValueErrorHandler(ValueErrorHandler)
503      */
504     public void registerValueErrorHandler(ValueErrorHandler errorHandler) {
505         if (errorHandler != null) {
506             valueErrorHandlers.add(errorHandler);
507         }
508     }
509 
510     /***
511      * Fire events on all registered {@link ValueErrorHandler}.
512      *
513      * @param error the failed constraints return value.
514      * @return true if there was any registered ValueErrorHandlers to notify.
515      */
516     boolean fireValueErrorHandlers(Constraint.FailedCheck error) {
517         if (error.getLevel().intValue() >= errorReportingLevel) {
518             for (Iterator it = valueErrorHandlers.iterator(); it.hasNext();) {
519                 ((ValueErrorHandler) it.next()).handleValueError(error);
520             }
521         }
522         return valueErrorHandlers.size() > 0;
523     }
524 
525     /***
526      * Set the level for which notification of failed constraints will be fired.
527      *
528      * @param level the error reporting level.
529      */
530     public void setErrorReportingLevel(Level level) {
531         errorReportingLevel = level.intValue();
532     }
533 
534     /***
535      * Creates and returns a <tt>List</tt> of all files comprising the current
536      * settings framework.
537      *
538      * <p>The List contains the absolute String path of each file.
539      *
540      * <p>The list should contain any configurable files, including such files
541      * as seed file and any other files use by the various settings modules.
542      *
543      * <p>Implementations of the SettingsHandler that do not use files for
544      * permanent storage should return an empty list.
545      * @return <code>List</code> of framework files.
546      */
547     public abstract List getListOfAllFiles();
548     
549     /***
550      * Clear any per-host settings cached in memory; allows editting of 
551      * per-host settings files on disk, perhaps in bulk/automated fashion,
552      * to take effect in running crawl. 
553      */
554     public void clearPerHostSettingsCache() {
555         settingsCache.clear();
556     }
557 
558     static ThreadLocal<SettingsHandler> threadContextSettingsHandler = 
559         new ThreadLocal<SettingsHandler>();
560     public static void setThreadContextSettingsHandler(SettingsHandler settingsHandler) {
561         threadContextSettingsHandler.set(settingsHandler);
562     }
563     public static SettingsHandler getThreadContextSettingsHandler() {
564         Thread t = Thread.currentThread();
565         if (t instanceof Checkpointer.CheckpointingThread) {
566             return ((Checkpointer.CheckpointingThread)t)
567                 .getController().getSettingsHandler();
568         } 
569         if (t instanceof ToeThread) {
570             return ((ToeThread) Thread.currentThread())
571                 .getController().getSettingsHandler();
572         } 
573         if(threadContextSettingsHandler.get()!=null) {
574             return threadContextSettingsHandler.get();
575         }
576         
577         // in most cases, returning a null means an NPE soon, 
578         // so perhaps this should log/raise differently
579         
580         // however, requesting object *might* just be transiently
581         // instantiated (as in momentary deserialization with some 
582         // Stored** operations), including in finalization thread 
583         // (which will never be linked to a usable settingsHandler).
584         // So, don't raise/log a noisy error for now. 
585         return null;
586     }
587 }