View Javadoc

1   /* SeedFileIterator
2   *
3   * $Id: SeedFileIterator.java 4651 2006-09-25 18:31:13Z paul_jack $
4   *
5   * Created on Mar 28, 2005
6   *
7   * Copyright (C) 2005 Internet Archive.
8   *
9   * This file is part of the Heritrix web crawler (crawler.archive.org).
10  *
11  * Heritrix is free software; you can redistribute it and/or modify
12  * it under the terms of the GNU Lesser Public License as published by
13  * the Free Software Foundation; either version 2.1 of the License, or
14  * any later version.
15  *
16  * Heritrix is distributed in the hope that it will be useful,
17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19  * GNU Lesser Public License for more details.
20  *
21  * You should have received a copy of the GNU Lesser Public License
22  * along with Heritrix; if not, write to the Free Software
23  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
24  */ 
25  package org.archive.crawler.scope;
26  
import java.io.BufferedReader;
import java.io.IOException;
import java.io.Writer;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Pattern;

import org.apache.commons.httpclient.URIException;
import org.archive.net.UURI;
import org.archive.net.UURIFactory;
import org.archive.util.iterator.LineReadingIterator;
import org.archive.util.iterator.RegexpLineIterator;
import org.archive.util.iterator.TransformingIteratorWrapper;
39  
40  
41  /***
42   * Iterator wrapper for seeds file on disk. 
43   * 
44   * @author gojomo
45   */
46  public class SeedFileIterator extends TransformingIteratorWrapper<String,UURI> {
47      private static Logger logger =
48          Logger.getLogger(SeedFileIterator.class.getName());
49      
50      BufferedReader input;
51      Writer ignored;
52      
53      /***
54       * Construct a SeedFileIterator over the input available
55       * from the supplied BufferedReader.
56       * @param br BufferedReader from which to get seeds
57       */
58      public SeedFileIterator(BufferedReader br) {
59          this(br,null);
60      }
61  
62      /***
63       * Construct a SeedFileIterator over the input available
64       * from the supplied BufferedReader, reporting any nonblank
65       * noncomment entries which don't generate a valid seed to
66       * the supplied BufferedWriter.
67       * 
68       * @param inputReader BufferedReader from which to get seeds
69       * @param ignoredWriter BufferedWriter to report any ignored input 
70       */
71      public SeedFileIterator(BufferedReader inputReader, Writer ignoredWriter) {
72          super();
73          inner = new RegexpLineIterator(
74                      new LineReadingIterator(inputReader),
75                      RegexpLineIterator.COMMENT_LINE,
76                      RegexpLineIterator.NONWHITESPACE_ENTRY_TRAILING_COMMENT,
77                      RegexpLineIterator.ENTRY);
78          input = inputReader;
79          ignored = ignoredWriter;
80      }
81      
82      protected UURI transform(String uri) {
83          if(! uri.matches("[a-zA-Z][//w+//-]+:.*")) { // Rfc2396 s3.1 scheme, 
84                                                       // minus '.'
85              // Does not begin with scheme, so try http://
86              uri = "http://"+uri;
87          }
88          try {
89              // TODO: ignore lines beginning with non-word char
90              return UURIFactory.getInstance(uri);
91          } catch (URIException e) {
92              logger.log(Level.INFO, "line in seed file ignored: "
93                      + e.getMessage(), e);
94              if(ignored!=null) {
95                  try {
96                      ignored.write(uri+"\n");
97                  } catch (IOException e1) {
98                      // TODO Auto-generated catch block
99                      e1.printStackTrace();
100                 }
101             }
102             return null;
103         }
104     }
105     
106     
107     /***
108      * Clean-up when hasNext() has returned null: close open files. 
109      *
110      * @see org.archive.util.iterator.TransformingIteratorWrapper#noteExhausted()
111      */
112     protected void noteExhausted() {
113         super.noteExhausted();
114         close();
115     }
116     
117     public void close() {
118         try {
119             if(input!=null) {
120                 input.close();
121             }
122             if(ignored!=null) {
123                 ignored.close();
124             }
125         } catch (IOException e) {
126             // TODO Auto-generated catch block
127             e.printStackTrace();
128         }
129     }
130 }