1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24 package org.archive.crawler.framework;
25
26 import java.io.BufferedReader;
27 import java.io.File;
28 import java.io.FileInputStream;
29 import java.io.FileOutputStream;
30 import java.io.IOException;
31 import java.io.InputStreamReader;
32 import java.io.OutputStreamWriter;
33 import java.io.Writer;
34 import java.util.HashSet;
35 import java.util.Iterator;
36 import java.util.List;
37 import java.util.Set;
38 import java.util.logging.Logger;
39
40 import javax.management.AttributeNotFoundException;
41 import javax.management.MBeanException;
42 import javax.management.ReflectionException;
43
44 import org.apache.commons.httpclient.URIException;
45 import org.archive.crawler.datamodel.CandidateURI;
46 import org.archive.crawler.scope.SeedFileIterator;
47 import org.archive.crawler.scope.SeedListener;
48 import org.archive.crawler.settings.CrawlerSettings;
49 import org.archive.crawler.settings.SimpleType;
50 import org.archive.crawler.settings.Type;
51 import org.archive.net.UURI;
52 import org.archive.util.ArchiveUtils;
53 import org.archive.util.DevUtils;
54
55 /***
56 * A CrawlScope instance defines which URIs are "in"
57 * a particular crawl.
58 *
 * It is essentially a Filter which determines, looking at
 * the totality of information available about a
 * CandidateURI/CrawlURI instance, if that URI should be
 * scheduled for crawling.
63 *
64 * Dynamic information inherent in the discovery of the
65 * URI -- such as the path by which it was discovered --
66 * may be considered.
67 *
68 * Dynamic information which requires the consultation
69 * of external and potentially volatile information --
70 * such as current robots.txt requests and the history
71 * of attempts to crawl the same URI -- should NOT be
72 * considered. Those potentially high-latency decisions
73 * should be made at another step.
74 *
75 * @author gojomo
76 *
77 */
78 public class CrawlScope extends Filter {
79
80 private static final long serialVersionUID = -3321533224526211277L;
81
82 private static final Logger logger =
83 Logger.getLogger(CrawlScope.class.getName());
84 public static final String ATTR_NAME = "scope";
85 public static final String ATTR_SEEDS = "seedsfile";
86
87 /***
88 * Whether every configu change should trigger a
89 * rereading of the original seeds spec/file.
90 */
91 public static final String
92 ATTR_REREAD_SEEDS_ON_CONFIG = "reread-seeds-on-config";
93 public static final Boolean
94 DEFAULT_REREAD_SEEDS_ON_CONFIG = Boolean.TRUE;
95
96 protected Set<SeedListener> seedListeners = new HashSet<SeedListener>();
97
98 /*** Constructs a new CrawlScope.
99 *
100 * @param name the name is ignored since it always have to be the value of
101 * the constant ATT_NAME.
102 */
103 public CrawlScope(String name) {
104
105 super(ATTR_NAME, "Crawl scope");
106 Type t;
107 t = addElementToDefinition(new SimpleType(ATTR_SEEDS,
108 "File from which to extract seeds.", "seeds.txt"));
109 t.setOverrideable(false);
110 t.setExpertSetting(true);
111 t = addElementToDefinition(new SimpleType(ATTR_REREAD_SEEDS_ON_CONFIG,
112 "Whether to reread the seeds specification, whether it has " +
113 "changed or not, every time any configuration change occurs. " +
114 "If true, seeds are reread even when (for example) new " +
115 "domain overrides are set. Rereading the seeds can take a " +
116 "long time with large seed lists.",
117 DEFAULT_REREAD_SEEDS_ON_CONFIG));
118 t.setOverrideable(false);
119 t.setExpertSetting(true);
120
121 }
122
123 /*** Default constructor.
124 */
125 public CrawlScope() {
126 this(ATTR_NAME);
127 }
128
129 /***
130 * Initialize is called just before the crawler starts to run.
131 *
132 * The settings system is up and initialized so can be used. This
133 * initialize happens after {@link #earlyInitialize(CrawlerSettings)}.
134 *
135 * @param controller Controller object.
136 */
137 public void initialize(CrawlController controller) {
138
139 }
140
141 public String toString() {
142 return "CrawlScope<" + getName() + ">";
143 }
144
145 /***
146 * Refresh seeds.
147 *
148 */
149 public void refreshSeeds() {
150
151 }
152
153 /***
154 * @return Seed list file or null if problem getting settings file.
155 */
156 public File getSeedfile() {
157 File file = null;
158 try {
159 file = getSettingsHandler().getPathRelativeToWorkingDirectory(
160 (String)getAttribute(ATTR_SEEDS));
161 if (!file.exists() || !file.canRead()) {
162 throw new IOException("Seeds file " +
163 file.getAbsolutePath() + " does not exist or unreadable.");
164 }
165 } catch (IOException e) {
166 DevUtils.warnHandle(e, "problem reading seeds");
167 } catch (AttributeNotFoundException e) {
168 DevUtils.warnHandle(e, "problem reading seeds");
169 } catch (MBeanException e) {
170 DevUtils.warnHandle(e, "problem reading seeds");
171 e.printStackTrace();
172 } catch (ReflectionException e) {
173 DevUtils.warnHandle(e, "problem reading seeds");
174 e.printStackTrace();
175 }
176
177 return file;
178 }
179
180 /*** Check if a URI is in the seeds.
181 *
182 * @param o the URI to check.
183 * @return true if URI is a seed.
184 */
185 protected boolean isSeed(Object o) {
186 return o instanceof CandidateURI && ((CandidateURI) o).isSeed();
187 }
188
189 /***
190 * @param a First UURI of compare.
191 * @param b Second UURI of compare.
192 * @return True if UURIs are of same host.
193 */
194 protected boolean isSameHost(UURI a, UURI b) {
195 boolean isSameHost = false;
196 if (a != null && b != null) {
197
198
199 try {
200 if (a.getReferencedHost() != null && b.getReferencedHost() != null) {
201 if (a.getReferencedHost().equals(b.getReferencedHost())) {
202 isSameHost = true;
203 }
204 }
205 }
206 catch (URIException e) {
207 logger.severe("Failed compare of " + a + " " + b + ": " +
208 e.getMessage());
209 }
210 }
211 return isSameHost;
212 }
213
214
215
216
217
218
219 public void listUsedFiles(List<String> list){
220
221 try {
222 File file = getSettingsHandler().getPathRelativeToWorkingDirectory(
223 (String)getAttribute(ATTR_SEEDS));
224 list.add(file.getAbsolutePath());
225 } catch (AttributeNotFoundException e) {
226
227 e.printStackTrace();
228 } catch (MBeanException e) {
229
230 e.printStackTrace();
231 } catch (ReflectionException e) {
232
233 e.printStackTrace();
234 }
235 }
236
237 /***
238 * Take note of a situation (such as settings edit) where
239 * involved reconfiguration (such as reading from external
240 * files) may be necessary.
241 */
242 public void kickUpdate() {
243
244
245
246 if (((Boolean) getUncheckedAttribute(null, ATTR_REREAD_SEEDS_ON_CONFIG))
247 .booleanValue()) {
248 refreshSeeds();
249 getSettingsHandler().getOrder().getController().getFrontier().loadSeeds();
250 }
251 }
252
253 /***
254 * Gets an iterator over all configured seeds. Subclasses
255 * which cache seeds in memory can override with more
256 * efficient implementation.
257 *
258 * @return Iterator, perhaps over a disk file, of seeds
259 */
260 public Iterator<UURI> seedsIterator() {
261 return seedsIterator(null);
262 }
263
264 /***
265 * Gets an iterator over all configured seeds. Subclasses
266 * which cache seeds in memory can override with more
267 * efficient implementation.
268 *
269 * @param ignoredItemWriter optional writer to get ignored seed items report
270 * @return Iterator, perhaps over a disk file, of seeds
271 */
272 public Iterator<UURI> seedsIterator(Writer ignoredItemWriter) {
273 BufferedReader br;
274 try {
275 br = new BufferedReader(
276 new InputStreamReader(
277 new FileInputStream(getSeedfile()),
278 "UTF-8"));
279 } catch (IOException e) {
280 throw new RuntimeException(e);
281 }
282 return new SeedFileIterator(br,ignoredItemWriter);
283 }
284
285 /***
286 * Convenience method to close SeedFileIterator, if appropriate.
287 *
288 * @param iter Iterator to check if SeedFileIterator needing closing
289 */
290 protected void checkClose(Iterator iter) {
291 if(iter instanceof SeedFileIterator) {
292 ((SeedFileIterator)iter).close();
293 }
294 }
295
296 /***
297 * Add a new seed to scope. By default, simply appends
298 * to seeds file, though subclasses may handle differently.
299 *
300 * <p>This method is *not* sufficient to get the new seed
301 * scheduled in the Frontier for crawling -- it only
302 * affects the Scope's seed record (and decisions which
303 * flow from seeds).
304 *
305 * @param curi CandidateUri to add
306 * @return true if successful, false if add failed for any reason
307 */
308 public boolean addSeed(final CandidateURI curi) {
309 File f = getSeedfile();
310 if (f != null) {
311 try {
312 OutputStreamWriter fw =
313 new OutputStreamWriter(new FileOutputStream(f, true),"UTF-8");
314
315 fw.write("\n");
316 fw.write("# Heritrix added seed ");
317 fw.write((curi.getVia() != null)
318 ? "redirect from " + curi.getVia()
319 : "(JMX)");
320 fw.write(" " + ArchiveUtils.get17DigitDate() + ".\n");
321 fw.write(curi.toString());
322 fw.flush();
323 fw.close();
324 Iterator iter = seedListeners.iterator();
325 while(iter.hasNext()) {
326 ((SeedListener)iter.next()).addedSeed(curi);
327 }
328 return true;
329 } catch (IOException e) {
330 DevUtils.warnHandle(e, "problem writing new seed");
331 }
332 }
333 return false;
334 }
335
336 public void addSeedListener(SeedListener sl) {
337 seedListeners.add(sl);
338 }
339 }