1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25 package org.archive.crawler.scope;
26
import java.io.BufferedReader;
import java.io.IOException;
import java.io.Writer;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Pattern;

import org.apache.commons.httpclient.URIException;
import org.archive.net.UURI;
import org.archive.net.UURIFactory;
import org.archive.util.iterator.LineReadingIterator;
import org.archive.util.iterator.RegexpLineIterator;
import org.archive.util.iterator.TransformingIteratorWrapper;
39
40
41 /***
42 * Iterator wrapper for seeds file on disk.
43 *
44 * @author gojomo
45 */
46 public class SeedFileIterator extends TransformingIteratorWrapper<String,UURI> {
47 private static Logger logger =
48 Logger.getLogger(SeedFileIterator.class.getName());
49
50 BufferedReader input;
51 Writer ignored;
52
53 /***
54 * Construct a SeedFileIterator over the input available
55 * from the supplied BufferedReader.
56 * @param br BufferedReader from which to get seeds
57 */
58 public SeedFileIterator(BufferedReader br) {
59 this(br,null);
60 }
61
62 /***
63 * Construct a SeedFileIterator over the input available
64 * from the supplied BufferedReader, reporting any nonblank
65 * noncomment entries which don't generate a valid seed to
66 * the supplied BufferedWriter.
67 *
68 * @param inputReader BufferedReader from which to get seeds
69 * @param ignoredWriter BufferedWriter to report any ignored input
70 */
71 public SeedFileIterator(BufferedReader inputReader, Writer ignoredWriter) {
72 super();
73 inner = new RegexpLineIterator(
74 new LineReadingIterator(inputReader),
75 RegexpLineIterator.COMMENT_LINE,
76 RegexpLineIterator.NONWHITESPACE_ENTRY_TRAILING_COMMENT,
77 RegexpLineIterator.ENTRY);
78 input = inputReader;
79 ignored = ignoredWriter;
80 }
81
82 protected UURI transform(String uri) {
83 if(! uri.matches("[a-zA-Z][//w+//-]+:.*")) {
84
85
86 uri = "http://"+uri;
87 }
88 try {
89
90 return UURIFactory.getInstance(uri);
91 } catch (URIException e) {
92 logger.log(Level.INFO, "line in seed file ignored: "
93 + e.getMessage(), e);
94 if(ignored!=null) {
95 try {
96 ignored.write(uri+"\n");
97 } catch (IOException e1) {
98
99 e1.printStackTrace();
100 }
101 }
102 return null;
103 }
104 }
105
106
107 /***
108 * Clean-up when hasNext() has returned null: close open files.
109 *
110 * @see org.archive.util.iterator.TransformingIteratorWrapper#noteExhausted()
111 */
112 protected void noteExhausted() {
113 super.noteExhausted();
114 close();
115 }
116
117 public void close() {
118 try {
119 if(input!=null) {
120 input.close();
121 }
122 if(ignored!=null) {
123 ignored.close();
124 }
125 } catch (IOException e) {
126
127 e.printStackTrace();
128 }
129 }
130 }