View Javadoc

1   /* Copyright (C) 2003 Internet Archive.
2    *
3    * This file is part of the Heritrix web crawler (crawler.archive.org).
4    *
5    * Heritrix is free software; you can redistribute it and/or modify
6    * it under the terms of the GNU Lesser Public License as published by
7    * the Free Software Foundation; either version 2.1 of the License, or
8    * any later version.
9    *
10   * Heritrix is distributed in the hope that it will be useful,
11   * but WITHOUT ANY WARRANTY; without even the implied warranty of
12   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13   * GNU Lesser Public License for more details.
14   *
15   * You should have received a copy of the GNU Lesser Public License
16   * along with Heritrix; if not, write to the Free Software
17   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
18   *
19   * SurtPrefixScope.java
20   * Created on Oct 1, 2003
21   *
22   * $Header$
23   */
24  package org.archive.crawler.scope;
25  
26  import java.io.File;
27  import java.io.FileOutputStream;
28  import java.io.FileReader;
29  import java.io.IOException;
30  import java.io.OutputStreamWriter;
31  
32  import org.archive.crawler.datamodel.CandidateURI;
33  import org.archive.crawler.deciderules.DecidingScope;
34  import org.archive.crawler.framework.CrawlController;
35  import org.archive.crawler.settings.SimpleType;
36  import org.archive.crawler.settings.Type;
37  import org.archive.util.SurtPrefixSet;
38  
39  /***
40   * A specialized CrawlScope suitable for the most common crawl needs.
41   * 
42   * Roughly, as with other existing CrawlScope variants, SurtPrefixScope's logic
43   * is that a URI is included if:
44   * <pre>
45   *  ( isSeed(uri) || focusFilter.accepts(uri) ) ||
46   *     transitiveFilter.accepts(uri) ) && ! excludeFilter.accepts(uri)
47   * </pre>
48   * Specifically, SurtPrefixScope uses a SurtFilter to test for focus-inclusion.
49   * 
50   * @author gojomo
51   * @deprecated As of release 1.10.0.  Replaced by {@link DecidingScope}.
52   */
53  public class SurtPrefixScope extends RefinedScope {
54  
55      private static final long serialVersionUID = 2652008287322770123L;
56  
57      public static final String ATTR_SURTS_SOURCE_FILE = "surts-source-file";
58      public static final String ATTR_SEEDS_AS_SURT_PREFIXES = "seeds-as-surt-prefixes";
59      public static final String ATTR_SURTS_DUMP_FILE = "surts-dump-file";
60      
61      private static final Boolean DEFAULT_SEEDS_AS_SURT_PREFIXES = new Boolean(true);
62  
63      /***
64       * Whether the 'via' of CrawlURIs should also be checked
65       * to see if it is prefixed by the set of SURT prefixes
66       */
67      public static final String 
68          ATTR_ALSO_CHECK_VIA = "also-check-via";
69      public static final Boolean
70          DEFAULT_ALSO_CHECK_VIA = Boolean.FALSE;
71      
72      SurtPrefixSet surtPrefixes = null;
73  
74      public SurtPrefixScope(String name) {
75          super(name);
76          setDescription(
77                  "SurtPrefixScope: A scope for crawls limited to regions of " +
78                  "the web defined by a set of SURT prefixes *Deprecated* " +
79                  "Use DecidingScope instead. (The SURT form of " +
80                  "a URI has its hostname reordered to ease sorting and "
81                  + "grouping by domain hierarchies.)");
82          addElementToDefinition(
83                  new SimpleType(ATTR_SURTS_SOURCE_FILE, 
84                  		"Source file from which to infer SURT prefixes. Any URLs " +
85                          "in file will be converted to the implied SURT prefix, and " +
86                          "literal SURT prefixes may be listed on lines beginning " +
87                          "with a '+' character.", 
88                          ""));
89          addElementToDefinition(
90                  new SimpleType(ATTR_SEEDS_AS_SURT_PREFIXES, 
91                          "Should seeds also be interpreted as SURT prefixes.", 
92                          DEFAULT_SEEDS_AS_SURT_PREFIXES));
93          
94          Type t = addElementToDefinition(
95                  new SimpleType(ATTR_SURTS_DUMP_FILE, 
96                          "Dump file to save SURT prefixes actually used.", 
97                          ""));
98          t.setExpertSetting(true);
99          t = addElementToDefinition(new SimpleType(ATTR_ALSO_CHECK_VIA,
100                 "Whether to also rule URI in-scope if a " +
101                 "URI's 'via' URI (the URI from which it was discovered) " +
102                 "in SURT form begins with any of the established prefixes. " +
103                 "For example, can be used to accept URIs that are 'one hop " +
104                 "off' URIs fitting the SURT prefixes. Default is false.",
105                 DEFAULT_ALSO_CHECK_VIA));
106         t.setOverrideable(false);
107         t.setExpertSetting(true);
108 
109     }
110 
111     
112     /* (non-Javadoc)
113      * @see org.archive.crawler.framework.CrawlScope#initialize(org.archive.crawler.framework.CrawlController)
114      */
115     public void initialize(CrawlController controller) {
116         super.initialize(controller);
117         readPrefixes();
118     }
119     
120     /***
121      * Check if a URI is part of this scope.
122      * 
123      * @param object
124      *            An instance of UURI or of CandidateURI.
125      * @return True if focus filter accepts passed object.
126      */
127     protected synchronized boolean focusAccepts(Object object) {
128         // TODO: eliminate duplication wrt/SurtPrefixedDecideRule.evaluate
129         if (surtPrefixes == null) {
130             readPrefixes();
131         }
132         if ( (object instanceof CandidateURI) && 
133                 ((Boolean) getUncheckedAttribute(null, ATTR_ALSO_CHECK_VIA))
134                     .booleanValue()) {
135             if(focusAccepts(((CandidateURI)object).getVia())) {
136                 return true;
137             }
138         }
139         String candidateSurt = SurtPrefixSet.getCandidateSurt(object);
140         if(candidateSurt == null) {
141             return false; 
142         }
143         return surtPrefixes.containsPrefixOf(candidateSurt);
144     }
145     
146     private void readPrefixes() {
147         surtPrefixes = new SurtPrefixSet(); 
148         FileReader fr = null;
149         
150         // read SURTs from file, if appropriate 
151         String sourcePath = (String) getUncheckedAttribute(null,
152                 ATTR_SURTS_SOURCE_FILE);
153         if(sourcePath.length()>0) {
154             File source = new File(sourcePath);
155             if (!source.isAbsolute()) {
156                 source = new File(getSettingsHandler().getOrder()
157                         .getController().getDisk(), sourcePath);
158             }
159             try {
160                 fr = new FileReader(source);
161                 try {
162                     surtPrefixes.importFromMixed(fr,true);
163                 } finally {
164                     fr.close();
165                 }
166         
167             } catch (IOException e) {
168                 e.printStackTrace();
169                 throw new RuntimeException(e);
170             } 
171         }
172         
173         // interpret seeds as surts, if appropriate
174         boolean deduceFromSeeds = 
175             ((Boolean) getUncheckedAttribute(null, ATTR_SEEDS_AS_SURT_PREFIXES))
176             .booleanValue();
177         try {
178             fr = new FileReader(getSeedfile());
179             try {
180                 surtPrefixes.importFromMixed(fr,deduceFromSeeds);
181             } finally {
182                 fr.close();
183             }
184         } catch (IOException e) {
185             e.printStackTrace();
186             throw new RuntimeException(e);
187         }  
188 
189         // dump surts to file, if appropriate
190         String dumpPath = (String) getUncheckedAttribute(null,
191                 ATTR_SURTS_DUMP_FILE);
192         if(dumpPath.length()>0) {
193             File dump = new File(dumpPath);
194             if (!dump.isAbsolute()) {
195                 dump = new File(getSettingsHandler().getOrder()
196                         .getController().getDisk(), dumpPath);
197             }
198             try {
199                 OutputStreamWriter fw = new OutputStreamWriter(
200                         new FileOutputStream(dump),"UTF-8");
201                 try {
202                     surtPrefixes.exportTo(fw);
203                 } finally {
204                     fw.close();
205                 }
206             } catch (IOException e) {
207                 e.printStackTrace();
208                 throw new RuntimeException(e);
209             }
210         }
211     }
212 
213     /***
214      * Re-read prefixes after an update. 
215      * 
216      * @see org.archive.crawler.framework.CrawlScope#kickUpdate()
217      */
218     public synchronized void kickUpdate() {
219         super.kickUpdate();
220         // TODO: make conditional on file having actually changed,
221         // perhaps by remembering mod-time
222         readPrefixes();
223     }
224 }