PathologicalPathFilter xref

View Javadoc

1   /* PathologicalPathFilter
2    *
3    * $Id: PathologicalPathFilter.java 4652 2006-09-25 18:41:10Z paul_jack $
4    *
5    * Created on Feb 20, 2004
6    *
7    * Copyright (C) 2004 Internet Archive.
8    *
9    * This file is part of the Heritrix web crawler (crawler.archive.org).
10   *
11   * Heritrix is free software; you can redistribute it and/or modify
12   * it under the terms of the GNU Lesser Public License as published by
13   * the Free Software Foundation; either version 2.1 of the License, or
14   * any later version.
15   *
16   * Heritrix is distributed in the hope that it will be useful,
17   * but WITHOUT ANY WARRANTY; without even the implied warranty of
18   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19   * GNU Lesser Public License for more details.
20   *
21   * You should have received a copy of the GNU Lesser Public License
22   * along with Heritrix; if not, write to the Free Software
23   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
24   */
25  package org.archive.crawler.filter;
26  
27  import java.util.logging.Logger;
28  
29  import javax.management.AttributeNotFoundException;
30  
31  import org.archive.crawler.datamodel.CrawlURI;
32  import org.archive.crawler.deciderules.DecideRule;
33  import org.archive.crawler.deciderules.DecidingFilter;
34  import org.archive.crawler.settings.SimpleType;
35  import org.archive.crawler.settings.Type;
36  
37  /*** 
38   * Checks if a URI contains a repeated pattern.
39   *
40   * This filter is checking if a pattern is repeated a specific number of times.
41   * The use is to avoid crawler traps where the server adds the same pattern to
42   * the requested URI like: <code>http://host/img/img/img/img....</code>. This
43   * filter returns TRUE if the path is pathological.  FALSE otherwise.
44   *
45   * @author John Erik Halse
46   * @deprecated As of release 1.10.0.  Replaced by {@link DecidingFilter} and
47   * equivalent {@link DecideRule}.
48   */
49  public class PathologicalPathFilter extends URIRegExpFilter {
50  
51      private static final long serialVersionUID = 2797805167250054353L;
52  
53      private static final Logger logger =
54          Logger.getLogger(PathologicalPathFilter.class.getName());
55  
56      public static final String ATTR_REPETITIONS = "repetitions";
57  
58      public static final Integer DEFAULT_REPETITIONS = new Integer(3);
59      
60      private final String REGEX_PREFIX = ".*?/(.*?/)//1{";
61      private final String REGEX_SUFFIX = ",}.*";
62  
63      /*** Constructs a new PathologicalPathFilter.
64       *
65       * @param name the name of the filter.
66       */
67      public PathologicalPathFilter(String name) {
68          super(name);
69          setDescription("Pathological path filter *Deprecated* Use" +
70          		"DecidingFilter and equivalent DecideRule instead. " +
71          		"The Pathologicalpath filter" +
72                  " is used to avoid crawler traps by adding a constraint on" +
73                  " how many times a pattern in the URI could be repeated." +
74                  " Returns false if the path is NOT pathological (There" +
75                  " are no subpath reptitions or reptitions are less than" +
76                  " the '" + ATTR_REPETITIONS + "' limit).");
77  
78          Type type = getElementFromDefinition(ATTR_MATCH_RETURN_VALUE);
79          type.setTransient(true);
80  
81          type = getElementFromDefinition(ATTR_REGEXP);
82          type.setTransient(true);
83  
84          addElementToDefinition(new SimpleType(ATTR_REPETITIONS,
85                  "Number of times the pattern should be allowed to occur. \n" +
86                  "This filter returns true if number of repetitions of a" +
87                  " pattern exceeds this value",
88                  DEFAULT_REPETITIONS));
89      }
90  
91      /*** 
92       * Construct the regexp string to be matched aginst the URI.
93       * @param o an object to extract a URI from.
94       * @return the regexp pattern.
95       */
96      protected String getRegexp(Object o) {
97          int rep = 0;
98          try {
99              rep = ((Integer)getAttribute(o, ATTR_REPETITIONS)).intValue();
100         } catch (AttributeNotFoundException e) {
101             logger.severe(e.getMessage());
102         }
103         return rep == 0? null: REGEX_PREFIX + (rep - 1) + REGEX_SUFFIX;
104     }
105     
106     protected boolean getFilterOffPosition(CrawlURI curi) {
107         return false;
108     }
109 }