CrawlScope xref

View Javadoc

1   /* Copyright (C) 2003 Internet Archive.
2    *
3    * This file is part of the Heritrix web crawler (crawler.archive.org).
4    *
5    * Heritrix is free software; you can redistribute it and/or modify
6    * it under the terms of the GNU Lesser Public License as published by
7    * the Free Software Foundation; either version 2.1 of the License, or
8    * any later version.
9    *
10   * Heritrix is distributed in the hope that it will be useful,
11   * but WITHOUT ANY WARRANTY; without even the implied warranty of
12   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13   * GNU Lesser Public License for more details.
14   *
15   * You should have received a copy of the GNU Lesser Public License
16   * along with Heritrix; if not, write to the Free Software
17   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
18   *
19   * CrawlScope.java
20   * Created on Oct 1, 2003
21   *
22   * $Header$
23   */
24  package org.archive.crawler.framework;
25  
26  import java.io.BufferedReader;
27  import java.io.File;
28  import java.io.FileInputStream;
29  import java.io.FileOutputStream;
30  import java.io.IOException;
31  import java.io.InputStreamReader;
32  import java.io.OutputStreamWriter;
33  import java.io.Writer;
34  import java.util.HashSet;
35  import java.util.Iterator;
36  import java.util.List;
37  import java.util.Set;
38  import java.util.logging.Logger;
39  
40  import javax.management.AttributeNotFoundException;
41  import javax.management.MBeanException;
42  import javax.management.ReflectionException;
43  
44  import org.apache.commons.httpclient.URIException;
45  import org.archive.crawler.datamodel.CandidateURI;
46  import org.archive.crawler.scope.SeedFileIterator;
47  import org.archive.crawler.scope.SeedListener;
48  import org.archive.crawler.settings.CrawlerSettings;
49  import org.archive.crawler.settings.SimpleType;
50  import org.archive.crawler.settings.Type;
51  import org.archive.net.UURI;
52  import org.archive.util.ArchiveUtils;
53  import org.archive.util.DevUtils;
54  
55  /***
56   * A CrawlScope instance defines which URIs are "in"
57   * a particular crawl.
58   *
59   * It is essentially a Filter which determines, looking at
60   * the totality of information available about a
61   * CandidateURI/CrawlURI instamce, if that URI should be
62   * scheduled for crawling.
63   *
64   * Dynamic information inherent in the discovery of the
65   * URI -- such as the path by which it was discovered --
66   * may be considered.
67   *
68   * Dynamic information which requires the consultation
69   * of external and potentially volatile information --
70   * such as current robots.txt requests and the history
71   * of attempts to crawl the same URI -- should NOT be
72   * considered. Those potentially high-latency decisions
73   * should be made at another step.
74   *
75   * @author gojomo
76   *
77   */
78  public class CrawlScope extends Filter {
79  
80      private static final long serialVersionUID = -3321533224526211277L;
81  
82      private static final Logger logger =
83          Logger.getLogger(CrawlScope.class.getName());
84      public static final String ATTR_NAME = "scope";
85      public static final String ATTR_SEEDS = "seedsfile";
86      
87      /***
88       * Whether every configu change should trigger a 
89       * rereading of the original seeds spec/file.
90       */
91      public static final String 
92          ATTR_REREAD_SEEDS_ON_CONFIG = "reread-seeds-on-config";
93      public static final Boolean
94          DEFAULT_REREAD_SEEDS_ON_CONFIG = Boolean.TRUE;
95      
96      protected Set<SeedListener> seedListeners = new HashSet<SeedListener>();
97  
98      /*** Constructs a new CrawlScope.
99       *
100      * @param name the name is ignored since it always have to be the value of
101      *        the constant ATT_NAME.
102      */
103     public CrawlScope(String name) {
104         // 'name' is never used.
105         super(ATTR_NAME, "Crawl scope");
106         Type t;
107         t = addElementToDefinition(new SimpleType(ATTR_SEEDS,
108                 "File from which to extract seeds.", "seeds.txt"));
109         t.setOverrideable(false);
110         t.setExpertSetting(true);
111         t = addElementToDefinition(new SimpleType(ATTR_REREAD_SEEDS_ON_CONFIG,
112                 "Whether to reread the seeds specification, whether it has " +
113                 "changed or not, every time any configuration change occurs. " +
114                 "If true, seeds are reread even when (for example) new " +
115                 "domain overrides are set. Rereading the seeds can take a " +
116                 "long time with large seed lists.", 
117                 DEFAULT_REREAD_SEEDS_ON_CONFIG));
118         t.setOverrideable(false);
119         t.setExpertSetting(true);
120 
121     }
122 
123     /*** Default constructor.
124      */
125     public CrawlScope() {
126         this(ATTR_NAME);
127     }
128 
129     /***
130      * Initialize is called just before the crawler starts to run.
131      *
132      * The settings system is up and initialized so can be used.  This
133      * initialize happens after {@link #earlyInitialize(CrawlerSettings)}.
134      *
135      * @param controller Controller object.
136      */
137     public void initialize(CrawlController controller) {
138         // by default do nothing (subclasses override)
139     }
140 
141     public String toString() {
142         return "CrawlScope<" + getName() + ">";
143     }
144 
145     /***
146      * Refresh seeds.
147      *
148      */
149     public void refreshSeeds() {
150         // by default do nothing (subclasses which cache should override)
151     }
152 
153     /***
154      * @return Seed list file or null if problem getting settings file.
155      */
156     public File getSeedfile() {
157         File file = null;
158         try {
159             file = getSettingsHandler().getPathRelativeToWorkingDirectory(
160                 (String)getAttribute(ATTR_SEEDS));
161             if (!file.exists() || !file.canRead()) {
162                 throw new IOException("Seeds file " +
163                     file.getAbsolutePath() + " does not exist or unreadable.");
164             }
165         } catch (IOException e) {
166             DevUtils.warnHandle(e, "problem reading seeds");
167         } catch (AttributeNotFoundException e) {
168             DevUtils.warnHandle(e, "problem reading seeds");
169         } catch (MBeanException e) {
170             DevUtils.warnHandle(e, "problem reading seeds");
171             e.printStackTrace();
172         } catch (ReflectionException e) {
173             DevUtils.warnHandle(e, "problem reading seeds");
174             e.printStackTrace();
175         }
176 
177         return file;
178     }
179 
180     /*** Check if a URI is in the seeds.
181      *
182      * @param o the URI to check.
183      * @return true if URI is a seed.
184      */
185     protected boolean isSeed(Object o) {
186         return o instanceof CandidateURI && ((CandidateURI) o).isSeed();
187     }
188 
189     /***
190      * @param a First UURI of compare.
191      * @param b Second UURI of compare.
192      * @return True if UURIs are of same host.
193      */
194     protected boolean isSameHost(UURI a, UURI b) {
195         boolean isSameHost = false;
196         if (a != null && b != null) {
197             // getHost can come back null.  See
198             // "[ 910120 ] java.net.URI#getHost fails when leading digit"
199             try {
200                 if (a.getReferencedHost() != null && b.getReferencedHost() != null) {
201                     if (a.getReferencedHost().equals(b.getReferencedHost())) {
202                         isSameHost = true;
203                     }
204                 }
205             }
206             catch (URIException e) {
207                 logger.severe("Failed compare of " + a + " " + b + ": " +
208                     e.getMessage());
209             }
210         }
211         return isSameHost;
212     }
213 
214 
215 
216     /* (non-Javadoc)
217      * @see org.archive.crawler.settings.ModuleType#listUsedFiles(java.util.List)
218      */
219     public void listUsedFiles(List<String> list){
220         // Add seed file
221         try {
222             File file = getSettingsHandler().getPathRelativeToWorkingDirectory(
223                     (String)getAttribute(ATTR_SEEDS));
224             list.add(file.getAbsolutePath());
225         } catch (AttributeNotFoundException e) {
226             // TODO Auto-generated catch block
227             e.printStackTrace();
228         } catch (MBeanException e) {
229             // TODO Auto-generated catch block
230             e.printStackTrace();
231         } catch (ReflectionException e) {
232             // TODO Auto-generated catch block
233             e.printStackTrace();
234         }
235     }
236 
237     /***
238      * Take note of a situation (such as settings edit) where
239      * involved reconfiguration (such as reading from external
240      * files) may be necessary.
241      */
242     public void kickUpdate() {
243         // TODO: further improve this so that case with hundreds of
244         // thousands or millions of seeds works better without requiring
245         // this specific settings check 
246         if (((Boolean) getUncheckedAttribute(null, ATTR_REREAD_SEEDS_ON_CONFIG))
247                 .booleanValue()) {
248             refreshSeeds();
249             getSettingsHandler().getOrder().getController().getFrontier().loadSeeds();
250         }
251     }
252 
253     /***
254      * Gets an iterator over all configured seeds. Subclasses
255      * which cache seeds in memory can override with more
256      * efficient implementation. 
257      *
258      * @return Iterator, perhaps over a disk file, of seeds
259      */
260     public Iterator<UURI> seedsIterator() {
261         return seedsIterator(null);
262     }
263     
264     /***
265      * Gets an iterator over all configured seeds. Subclasses
266      * which cache seeds in memory can override with more
267      * efficient implementation. 
268      *
269      * @param ignoredItemWriter optional writer to get ignored seed items report
270      * @return Iterator, perhaps over a disk file, of seeds
271      */
272     public Iterator<UURI> seedsIterator(Writer ignoredItemWriter) {
273         BufferedReader br;
274         try {
275             br = new BufferedReader(
276                 new InputStreamReader(
277                     new FileInputStream(getSeedfile()),
278                     "UTF-8"));
279         } catch (IOException e) {
280             throw new RuntimeException(e);
281         }
282         return new SeedFileIterator(br,ignoredItemWriter);
283     }
284     
285     /***
286      * Convenience method to close SeedFileIterator, if appropriate.
287      * 
288      * @param iter Iterator to check if SeedFileIterator needing closing
289      */
290     protected void checkClose(Iterator iter) {
291         if(iter instanceof SeedFileIterator) {
292             ((SeedFileIterator)iter).close();
293         }
294     }
295     
296     /***
297      * Add a new seed to scope. By default, simply appends
298      * to seeds file, though subclasses may handle differently.
299      *
300      * <p>This method is *not* sufficient to get the new seed 
301      * scheduled in the Frontier for crawling -- it only 
302      * affects the Scope's seed record (and decisions which
303      * flow from seeds). 
304      *
305      * @param curi CandidateUri to add
306      * @return true if successful, false if add failed for any reason
307      */
308     public boolean addSeed(final CandidateURI curi) {
309         File f = getSeedfile();
310         if (f != null) {
311             try {
312                 OutputStreamWriter fw = 
313                     new OutputStreamWriter(new FileOutputStream(f, true),"UTF-8");
314                 // Write to new (last) line the URL.
315                 fw.write("\n");
316                 fw.write("# Heritrix added seed ");
317                 fw.write((curi.getVia() != null) 
318                             ? "redirect from " + curi.getVia() 
319                             : "(JMX)");
320                 fw.write(" " + ArchiveUtils.get17DigitDate() + ".\n");
321                 fw.write(curi.toString());
322                 fw.flush();
323                 fw.close();
324                 Iterator iter = seedListeners.iterator();
325                 while(iter.hasNext()) {
326                     ((SeedListener)iter.next()).addedSeed(curi);
327                 }
328                 return true;
329             } catch (IOException e) {
330                 DevUtils.warnHandle(e, "problem writing new seed");
331             }
332         }
333         return false; 
334     }
335     
336     public void addSeedListener(SeedListener sl) {
337         seedListeners.add(sl);
338     }
339 }