1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24 package org.archive.crawler.scope;
25
26 import java.io.File;
27 import java.io.FileOutputStream;
28 import java.io.FileReader;
29 import java.io.IOException;
30 import java.io.OutputStreamWriter;
31
32 import org.archive.crawler.datamodel.CandidateURI;
33 import org.archive.crawler.deciderules.DecidingScope;
34 import org.archive.crawler.framework.CrawlController;
35 import org.archive.crawler.settings.SimpleType;
36 import org.archive.crawler.settings.Type;
37 import org.archive.util.SurtPrefixSet;
38
39 /***
40 * A specialized CrawlScope suitable for the most common crawl needs.
41 *
42 * Roughly, as with other existing CrawlScope variants, SurtPrefixScope's logic
43 * is that a URI is included if:
44 * <pre>
45 * ( isSeed(uri) || focusFilter.accepts(uri) ) ||
46 * transitiveFilter.accepts(uri) ) && ! excludeFilter.accepts(uri)
47 * </pre>
48 * Specifically, SurtPrefixScope uses a SurtFilter to test for focus-inclusion.
49 *
50 * @author gojomo
51 * @deprecated As of release 1.10.0. Replaced by {@link DecidingScope}.
52 */
53 public class SurtPrefixScope extends RefinedScope {
54
55 private static final long serialVersionUID = 2652008287322770123L;
56
57 public static final String ATTR_SURTS_SOURCE_FILE = "surts-source-file";
58 public static final String ATTR_SEEDS_AS_SURT_PREFIXES = "seeds-as-surt-prefixes";
59 public static final String ATTR_SURTS_DUMP_FILE = "surts-dump-file";
60
61 private static final Boolean DEFAULT_SEEDS_AS_SURT_PREFIXES = new Boolean(true);
62
63 /***
64 * Whether the 'via' of CrawlURIs should also be checked
65 * to see if it is prefixed by the set of SURT prefixes
66 */
67 public static final String
68 ATTR_ALSO_CHECK_VIA = "also-check-via";
69 public static final Boolean
70 DEFAULT_ALSO_CHECK_VIA = Boolean.FALSE;
71
72 SurtPrefixSet surtPrefixes = null;
73
74 public SurtPrefixScope(String name) {
75 super(name);
76 setDescription(
77 "SurtPrefixScope: A scope for crawls limited to regions of " +
78 "the web defined by a set of SURT prefixes *Deprecated* " +
79 "Use DecidingScope instead. (The SURT form of " +
80 "a URI has its hostname reordered to ease sorting and "
81 + "grouping by domain hierarchies.)");
82 addElementToDefinition(
83 new SimpleType(ATTR_SURTS_SOURCE_FILE,
84 "Source file from which to infer SURT prefixes. Any URLs " +
85 "in file will be converted to the implied SURT prefix, and " +
86 "literal SURT prefixes may be listed on lines beginning " +
87 "with a '+' character.",
88 ""));
89 addElementToDefinition(
90 new SimpleType(ATTR_SEEDS_AS_SURT_PREFIXES,
91 "Should seeds also be interpreted as SURT prefixes.",
92 DEFAULT_SEEDS_AS_SURT_PREFIXES));
93
94 Type t = addElementToDefinition(
95 new SimpleType(ATTR_SURTS_DUMP_FILE,
96 "Dump file to save SURT prefixes actually used.",
97 ""));
98 t.setExpertSetting(true);
99 t = addElementToDefinition(new SimpleType(ATTR_ALSO_CHECK_VIA,
100 "Whether to also rule URI in-scope if a " +
101 "URI's 'via' URI (the URI from which it was discovered) " +
102 "in SURT form begins with any of the established prefixes. " +
103 "For example, can be used to accept URIs that are 'one hop " +
104 "off' URIs fitting the SURT prefixes. Default is false.",
105 DEFAULT_ALSO_CHECK_VIA));
106 t.setOverrideable(false);
107 t.setExpertSetting(true);
108
109 }
110
111
112
113
114
115 public void initialize(CrawlController controller) {
116 super.initialize(controller);
117 readPrefixes();
118 }
119
120 /***
121 * Check if a URI is part of this scope.
122 *
123 * @param object
124 * An instance of UURI or of CandidateURI.
125 * @return True if focus filter accepts passed object.
126 */
127 protected synchronized boolean focusAccepts(Object object) {
128
129 if (surtPrefixes == null) {
130 readPrefixes();
131 }
132 if ( (object instanceof CandidateURI) &&
133 ((Boolean) getUncheckedAttribute(null, ATTR_ALSO_CHECK_VIA))
134 .booleanValue()) {
135 if(focusAccepts(((CandidateURI)object).getVia())) {
136 return true;
137 }
138 }
139 String candidateSurt = SurtPrefixSet.getCandidateSurt(object);
140 if(candidateSurt == null) {
141 return false;
142 }
143 return surtPrefixes.containsPrefixOf(candidateSurt);
144 }
145
146 private void readPrefixes() {
147 surtPrefixes = new SurtPrefixSet();
148 FileReader fr = null;
149
150
151 String sourcePath = (String) getUncheckedAttribute(null,
152 ATTR_SURTS_SOURCE_FILE);
153 if(sourcePath.length()>0) {
154 File source = new File(sourcePath);
155 if (!source.isAbsolute()) {
156 source = new File(getSettingsHandler().getOrder()
157 .getController().getDisk(), sourcePath);
158 }
159 try {
160 fr = new FileReader(source);
161 try {
162 surtPrefixes.importFromMixed(fr,true);
163 } finally {
164 fr.close();
165 }
166
167 } catch (IOException e) {
168 e.printStackTrace();
169 throw new RuntimeException(e);
170 }
171 }
172
173
174 boolean deduceFromSeeds =
175 ((Boolean) getUncheckedAttribute(null, ATTR_SEEDS_AS_SURT_PREFIXES))
176 .booleanValue();
177 try {
178 fr = new FileReader(getSeedfile());
179 try {
180 surtPrefixes.importFromMixed(fr,deduceFromSeeds);
181 } finally {
182 fr.close();
183 }
184 } catch (IOException e) {
185 e.printStackTrace();
186 throw new RuntimeException(e);
187 }
188
189
190 String dumpPath = (String) getUncheckedAttribute(null,
191 ATTR_SURTS_DUMP_FILE);
192 if(dumpPath.length()>0) {
193 File dump = new File(dumpPath);
194 if (!dump.isAbsolute()) {
195 dump = new File(getSettingsHandler().getOrder()
196 .getController().getDisk(), dumpPath);
197 }
198 try {
199 OutputStreamWriter fw = new OutputStreamWriter(
200 new FileOutputStream(dump),"UTF-8");
201 try {
202 surtPrefixes.exportTo(fw);
203 } finally {
204 fw.close();
205 }
206 } catch (IOException e) {
207 e.printStackTrace();
208 throw new RuntimeException(e);
209 }
210 }
211 }
212
213 /***
214 * Re-read prefixes after an update.
215 *
216 * @see org.archive.crawler.framework.CrawlScope#kickUpdate()
217 */
218 public synchronized void kickUpdate() {
219 super.kickUpdate();
220
221
222 readPrefixes();
223 }
224 }