View Javadoc

1   /* CrawlerSettings
2    *
3    * $Id: CrawlerSettings.java 4662 2006-09-25 23:45:21Z paul_jack $
4    *
5    * Created on Dec 16, 2003
6    *
7    * Copyright (C) 2004 Internet Archive.
8    *
9    * This file is part of the Heritrix web crawler (crawler.archive.org).
10   *
11   * Heritrix is free software; you can redistribute it and/or modify
12   * it under the terms of the GNU Lesser Public License as published by
13   * the Free Software Foundation; either version 2.1 of the License, or
14   * any later version.
15   *
16   * Heritrix is distributed in the hope that it will be useful,
17   * but WITHOUT ANY WARRANTY; without even the implied warranty of
18   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19   * GNU Lesser Public License for more details.
20   *
21   * You should have received a copy of the GNU Lesser Public License
22   * along with Heritrix; if not, write to the Free Software
23   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
24   */
25  package org.archive.crawler.settings;
26  
27  import java.util.ArrayList;
28  import java.util.Date;
29  import java.util.HashMap;
30  import java.util.Iterator;
31  import java.util.List;
32  import java.util.ListIterator;
33  import java.util.Map;
34  
35  import org.archive.crawler.settings.refinements.Refinement;
36  import org.archive.net.UURI;
37  
38  /***
39   * Class representing a settings file.
40   *
41   * More precisely it represents a collection of settings valid in a particular
42   * scope. The scope is either the global settings, or the settings to be used
43   * for a particular domain or host. For scopes other than global, the instance
44   * will only contain those settings that are different from the global.
45   *
46   * In the default implementation this is a one to one mapping from a file to
47   * an instance of this class, but in other implementations the information in
48   * an instance of this class might be stored in a different way (for example
49   * in a RDBMS).
50   *
51   * @author John Erik Halse
52   */
53  public class CrawlerSettings {
54      /*** Registry of DataContainers for ComplexTypes in this settings object
55       *  indexed on absolute name */
56      private final Map<String,DataContainer> localComplexTypes
57       = new HashMap<String,DataContainer>();
58  
59      /*** Registry of top level ModuleTypes in this settings object indexed on
60       * module name. These are modules that doesn't have parents in this
61       * settings object
62       */
63      private final Map<String,ModuleType> topLevelModules
64       = new HashMap<String,ModuleType>();
65  
66      /*** Registry of all ModuleTypes in this settings object indexed on
67       * module name.
68       */
69      private final Map<String,ComplexType> localModules 
70       = new HashMap<String,ComplexType>();
71  
72      /*** Reference to the settings handler this settings object belongs to */
73      private final SettingsHandler settingsHandler;
74  
75      /*** Scope for this collection of settings (hostname) */
76      private final String scope;
77  
78      /*** List of refinements applied to this settings object */
79      private List<Refinement> refinements;
80  
81      /*** True if this settings object is a refinement */
82      private boolean isRefinement = false;
83  
84      /*** Name of this collection of settings */
85      private String name = "";
86  
87      /*** Description of this collection of settings */
88      private String description = "";
89  
90      /***
91       * Operator of this crawl job.
92       */
93      private String operator = "Admin";
94  
95      /***
96       * Organization running this crawl job.
97       */
98      private String organization = "";
99  
100     /***
101      * Audience/recipient/customer on whose behalf this crawl is being run.
102      */
103     private String audience = "";
104 
105 
106     /*** Time when this collection was last saved to persistent storage */
107     private Date lastSaved = null;
108 
109     /***
110      * Constructs a new CrawlerSettings object.
111      *
112      * Application code should not call the constructor directly, but use the
113      * methods in SettingsHandler instead.
114      *
115      * @param handler The SettingsHandler this object belongs to.
116      * @param scope The scope of this settings object (ie. host or domain).
117      *
118      * @see SettingsHandler#getSettings(String)
119      * @see SettingsHandler#getSettingsObject(String)
120      */
121     public CrawlerSettings(SettingsHandler handler, String scope) {
122         this.settingsHandler = handler;
123         this.scope = scope;
124     }
125 
126     /***
127     * Constructs a new CrawlerSettings object which is a refinement of another
128     * settings object.
129     *
130     * Application code should not call the constructor directly, but use the
131     * methods in SettingsHandler instead.
132     *
133     * @param handler The SettingsHandler this object belongs to.
134     * @param scope The scope of this settings object (ie. host or domain).
135     * @param refinement the name or reference to the refinement.
136     *
137     * @see SettingsHandler#getSettings(String)
138     * @see SettingsHandler#getSettingsObject(String)
139     */
140     public CrawlerSettings(SettingsHandler handler, String scope,
141             String refinement) {
142         this(handler, scope);
143         if (refinement != null && !refinement.equals("")) {
144             this.isRefinement = true;
145             this.name = refinement;
146         }
147     }
148 
149     /*** Get the description of this CrawlerSettings object.
150      *
151      * @return the description of this CrawlerSettings object.
152      */
153     public String getDescription() {
154         return description;
155     }
156 
157     /*** Get the name of this CrawlerSettings object.
158      *
159      * @return the name of this CrawlerSettings object.
160      */
161     public String getName() {
162         return name;
163     }
164 
165     /***
166      * Get the name of operator of this crawl from this CrawlerSettings object.
167      *
168      * @return the name of this CrawlerSettings object.
169      */
170     public String getOperator() {
171         return operator;
172     }
173 
174     /***
175      * Get the name of the organization running this crawl from this
176      * CrawlerSettings object.
177      *
178      * @return the name of the organization running this crawl.
179      */
180     public String getOrganization() {
181         return organization;
182     }
183 
184     /***
185      * Get the audience/customer/recipient of the crawl job product from
186      * this CrawlerSettings object.
187      *
188      * @return the audience/customer/recipient of the crawl job product.
189      */
190     public String getAudience() {
191         return audience;
192     }
193 
194     /*** Get the scope of this CrawlerSettings object.
195      *
196      * @return the scope of this CrawlerSettings object.
197      */
198     public String getScope() {
199         return scope;
200     }
201 
202     /*** Set the description of this CrawlerSettings object.
203      *
204      * @param string the description to be set for this CrawlerSettings object.
205      */
206     public void setDescription(String string) {
207         description = string;
208     }
209 
210     /***
211      * Set the operator of this crawl job.
212      * @param name Operator running this crawl.
213      */
214     public void setOperator(String name) {
215         this.operator = name;
216     }
217 
218     /***
219      * Set the name of the organization who is running this crawl.
220      * @param name Name of organization running this crawl.
221      */
222     public void setOrganization(String name) {
223         this.organization = name;
224     }
225 
226     /***
227      * Set the recipient/customer for the crawl job product.
228      * @param name Recipient of crawl job product.
229      */
230     public void setAudience(String name) {
231         this.audience = name;
232     }
233 
234     /*** Set the name of this CrawlerSettings object.
235      *
236      * @param string the name to be set for this CrawlerSettings object.
237      */
238     public void setName(String string) {
239         name = string;
240     }
241 
242     /***
243      * Get the time when this CrawlerSettings was last saved to persistent
244      * storage.
245      *
246      * @return the time when this CrawlerSettings was last saved to persistent
247      * storage. Null if it has not been saved.
248      */
249     public Date getLastSavedTime() {
250         return lastSaved;
251     }
252 
253     /***
254      * Set the time when this CrawlerSettings was last saved to persistent
255      * storage.
256      *
257      * @param lastSaved the time when this CrawlerSettings was last saved to
258      * persistent storage.
259      */
260     protected void setLastSavedTime(Date lastSaved) {
261         this.lastSaved = lastSaved;
262     }
263 
264     protected void addTopLevelModule(ModuleType module) {
265 //        if (topLevelModules.containsKey(module.getName())) {
266 //            throw new IllegalArgumentException(
267 //                "Duplicate module name: " + module.getName());
268 //        } else {
269             topLevelModules.put(module.getName(), module);
270 //        }
271     }
272 
273     protected DataContainer addComplexType(ComplexType type) {
274         DataContainer data = new DataContainer(this, type);
275         localComplexTypes.put(type.getAbsoluteName(), data);
276         if (type instanceof ModuleType) {
277             localModules.put(type.getName(), type);
278         }
279         return data;
280     }
281 
282     protected DataContainer getData(ComplexType complex) {
283         return getData(complex.getAbsoluteName());
284     }
285 
286     protected DataContainer getData(String absoluteName) {
287         return (DataContainer) localComplexTypes.get(absoluteName);
288     }
289 
290     protected ModuleType getTopLevelModule(String name) {
291         return (ModuleType) topLevelModules.get(name);
292     }
293 
294     public ModuleType getModule(String name) {
295         return (ModuleType) localModules.get(name);
296     }
297 
298     protected Iterator topLevelModules() {
299         return topLevelModules.values().iterator();
300     }
301 
302     /*** Get the parent of this CrawlerSettings object.
303      *
304      * @return the parent of this CrawlerSettings object.
305      */
306     public CrawlerSettings getParent() {
307         return getParent(null);
308     }
309 
310     /***
311      * Get the parent of this CrawlerSettings object.
312      * This method passes around a URI so that refinements could be checked.
313      *
314      * @param uri The uri for which parents of this object shoul be found.
315      * @return the parent of this CrawlerSettings object.
316      */
317     public CrawlerSettings getParent(UURI uri) {
318         return (isRefinement())?
319             settingsHandler.getSettingsForHost(scope):
320             (scope == null || scope.equals(""))?
321                 null: 
322                 settingsHandler.
323                     getSettings(settingsHandler.getParentScope(scope), uri);
324     }
325 
326     /*** Get the SettingHandler this CrawlerSettings object belongs to.
327      *
328      * @return the SettingHandler this CrawlerSettings object belongs to.
329      */
330     public SettingsHandler getSettingsHandler() {
331         return settingsHandler;
332     }
333 
334     /***
335      * Get an <code>ListIterator</code> over the refinements for this
336      * settings object.
337      *
338      * @return Returns an iterator over the refinements.
339      */
340     public ListIterator refinementsIterator() {
341         if (refinements == null) {
342             refinements = new ArrayList<Refinement>();
343         }
344         return refinements.listIterator();
345     }
346 
347     /***
348      * Add a refinement to this settings object.
349      *
350      * @param refinement The refinements to set.
351      */
352     public void addRefinement(Refinement refinement) {
353         if (refinements == null) {
354             refinements = new ArrayList<Refinement>();
355         }
356         this.refinements.remove(refinement);
357         this.refinements.add(refinement);
358     }
359 
360     /***
361      * Remove a refinement from this settings object.
362      *
363      * @param reference the reference (name) to the refinement to be removed.
364      * @return true if something was removed, false if the refinement was not
365      *         found.
366      */
367     public boolean removeRefinement(String reference) {
368         if (hasRefinements()) {
369             for(Iterator it = refinements.iterator(); it.hasNext();) {
370                 if (((Refinement) it.next()).getReference().equals(reference)) {
371                     it.remove();
372                     return true;
373                 }
374             }
375         }
376         return false;
377     }
378 
379     /***
380      * Get a refinement with a given reference.
381      *
382      * @param reference the reference (name) to the refinement to get.
383      * @return the refinement having the specified reference or null if no
384      *         refinement matches it.
385      */
386     public Refinement getRefinement(String reference) {
387         if (hasRefinements()) {
388             for(Iterator it = refinements.iterator(); it.hasNext();) {
389                 Refinement tmp = (Refinement) it.next();
390                 if (tmp.getReference().equals(reference)) {
391                     return tmp;
392                 }
393             }
394         }
395         return null;
396     }
397 
398     /***
399      * Returns true if this settings object has refinements attached to it.
400      *
401      * @return true if this settings object has refinements attached to it.
402      */
403     public boolean hasRefinements() {
404         return refinements != null && !refinements.isEmpty();
405     }
406 
407     /***
408      * Returns true if this settings object is a refinement.
409      *
410      * @return true if this settings object is a refinement.
411      */
412     public boolean isRefinement() {
413         return isRefinement;
414     }
415 
416     /***
417      * Mark this settings object as an refinement.
418      *
419      * @param isRefinement Set this to true if this settings object is a
420      *            refinement.
421      */
422     public void setRefinement(boolean isRefinement) {
423         this.isRefinement = isRefinement;
424     }
425 }