25 package org.archive.crawler.filter;
26
27 import java.util.logging.Logger;
28
29 import javax.management.AttributeNotFoundException;
30
31 import org.archive.crawler.datamodel.CrawlURI;
32 import org.archive.crawler.deciderules.DecideRule;
33 import org.archive.crawler.deciderules.DecidingFilter;
34 import org.archive.crawler.settings.SimpleType;
35 import org.archive.crawler.settings.Type;
36
37 /***
38 * Checks if a URI contains a repeated pattern.
39 *
40 * This filter is checking if a pattern is repeated a specific number of times.
41 * The use is to avoid crawler traps where the server adds the same pattern to
42 * the requested URI like: <code>http://host/img/img/img/img....</code>. This
43 * filter returns TRUE if the path is pathological. FALSE otherwise.
44 *
45 * @author John Erik Halse
46 * @deprecated As of release 1.10.0. Replaced by {@link DecidingFilter} and
47 * equivalent {@link DecideRule}.
48 */
49 public class PathologicalPathFilter extends URIRegExpFilter {
50
51 private static final long serialVersionUID = 2797805167250054353L;
52
53 private static final Logger logger =
54 Logger.getLogger(PathologicalPathFilter.class.getName());
55
56 public static final String ATTR_REPETITIONS = "repetitions";
57
58 public static final Integer DEFAULT_REPETITIONS = new Integer(3);
59
60 private final String REGEX_PREFIX = ".*?/(.*?/)//1{";
61 private final String REGEX_SUFFIX = ",}.*";
62
63 /*** Constructs a new PathologicalPathFilter.
64 *
65 * @param name the name of the filter.
66 */
67 public PathologicalPathFilter(String name) {
68 super(name);
69 setDescription("Pathological path filter *Deprecated* Use" +
70 "DecidingFilter and equivalent DecideRule instead. " +
71 "The Pathologicalpath filter" +
72 " is used to avoid crawler traps by adding a constraint on" +
73 " how many times a pattern in the URI could be repeated." +
74 " Returns false if the path is NOT pathological (There" +
75 " are no subpath reptitions or reptitions are less than" +
76 " the '" + ATTR_REPETITIONS + "' limit).");
77
78 Type type = getElementFromDefinition(ATTR_MATCH_RETURN_VALUE);
79 type.setTransient(true);
80
81 type = getElementFromDefinition(ATTR_REGEXP);
82 type.setTransient(true);
83
84 addElementToDefinition(new SimpleType(ATTR_REPETITIONS,
85 "Number of times the pattern should be allowed to occur. \n" +
86 "This filter returns true if number of repetitions of a" +
87 " pattern exceeds this value",
88 DEFAULT_REPETITIONS));
89 }
90
91 /***
92 * Construct the regexp string to be matched aginst the URI.
93 * @param o an object to extract a URI from.
94 * @return the regexp pattern.
95 */
96 protected String getRegexp(Object o) {
97 int rep = 0;
98 try {
99 rep = ((Integer)getAttribute(o, ATTR_REPETITIONS)).intValue();
100 } catch (AttributeNotFoundException e) {
101 logger.severe(e.getMessage());
102 }
103 return rep == 0? null: REGEX_PREFIX + (rep - 1) + REGEX_SUFFIX;
104 }
105
106 protected boolean getFilterOffPosition(CrawlURI curi) {
107 return false;
108 }
109 }