View Javadoc

1   /* SurtPrefixFilter
2   *
3   * $Id: SurtPrefixFilter.java 4652 2006-09-25 18:41:10Z paul_jack $
4   *
5   * Created on Jul 22, 2004
6   *
7   * Copyright (C) 2004 Internet Archive.
8   *
9   * This file is part of the Heritrix web crawler (crawler.archive.org).
10  *
11  * Heritrix is free software; you can redistribute it and/or modify
12  * it under the terms of the GNU Lesser Public License as published by
13  * the Free Software Foundation; either version 2.1 of the License, or
14  * any later version.
15  *
16  * Heritrix is distributed in the hope that it will be useful,
17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19  * GNU Lesser Public License for more details.
20  *
21  * You should have received a copy of the GNU Lesser Public License
22  * along with Heritrix; if not, write to the Free Software
23  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
24  */ 
25  package org.archive.crawler.filter;
26  
27  import java.io.File;
28  import java.io.FileReader;
29  import java.io.IOException;
30  
31  import org.archive.crawler.deciderules.DecideRule;
32  import org.archive.crawler.deciderules.DecidingFilter;
33  import org.archive.crawler.framework.Filter;
34  import org.archive.crawler.settings.SimpleType;
35  import org.archive.util.SURT;
36  import org.archive.util.SurtPrefixSet;
37  /***
38   * A filter which tests a URI against a set of SURT 
39   * prefixes, and if the URI's prefix is in the set,
40   * returns the chosen true/false accepts value. 
41   * 
42   * @author gojomo
43   * @deprecated As of release 1.10.0.  Replaced by {@link DecidingFilter} and
44   * equivalent {@link DecideRule}.
45   */
46  public class SurtPrefixFilter extends Filter {
47  
48      private static final long serialVersionUID = -6933592892325852022L;
49  
50      public static final String ATTR_SURTS_SOURCE_FILE = "surts-source-file";
51      public static final String ATTR_MATCH_RETURN_VALUE = "if-match-return";
52  
53      SurtPrefixSet surtPrefixes = null;
54      
55      /***
56       * @param name
57       */
58      public SurtPrefixFilter(String name) {
59          super(name, "SURT prefix filter *Deprecated* Use" +
60          		"DecidingFilter and equivalent DecideRule instead.");
61          addElementToDefinition(
62              new SimpleType(ATTR_MATCH_RETURN_VALUE, "What to return when " +
63                      "a prefix matches.\n", new Boolean(true)));
64          addElementToDefinition(
65                  new SimpleType(ATTR_SURTS_SOURCE_FILE, 
66                  		"Source file from which to infer SURT prefixes. Any URLs " +
67                          "in file will be converted to the implied SURT prefix, and " +
68                          "literal SURT prefixes may be listed on lines beginning " +
69                          "with a '+' character.", 
70                          ""));
71      }
72      
73      /* (non-Javadoc)
74       * @see org.archive.crawler.framework.Filter#accepts(java.lang.Object)
75       */
76      protected synchronized boolean innerAccepts(Object o) {
77          if (surtPrefixes == null) {
78              readPrefixes();
79          }
80          String s = SURT.fromURI(o.toString());
81          // also want to treat https as http
82          if(s.startsWith("https:")) {
83              s = "http:"+s.substring(6);
84          }
85          // TODO: consider other cases of scheme-indifference?
86          return surtPrefixes.containsPrefixOf(s);
87      }
88  
89      private void readPrefixes() {
90          surtPrefixes = new SurtPrefixSet(); 
91          String sourcePath = (String) getUncheckedAttribute(null,
92                  ATTR_SURTS_SOURCE_FILE);
93          File source = new File(sourcePath);
94          if (!source.isAbsolute()) {
95              source = new File(getSettingsHandler().getOrder()
96                      .getController().getDisk(), sourcePath);
97          }
98          FileReader fr = null;
99          try {
100             fr = new FileReader(source);
101             try {
102                 surtPrefixes.importFromMixed(fr,true);
103             } finally {
104                 fr.close();
105             }
106         } catch (IOException e) {
107             e.printStackTrace();
108             throw new RuntimeException(e);
109         } 
110     }
111     
112     /***
113      * Re-read prefixes after a settings update.
114      * 
115      */
116     public synchronized void kickUpdate() {
117         super.kickUpdate();
118         // TODO: make conditional on file having actually changed,
119         // perhaps by remembering mod-time
120         readPrefixes();
121     }
122 }