1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25 package org.archive.crawler.filter;
26
27 import java.util.Iterator;
28 import java.util.List;
29 import java.util.logging.Level;
30 import java.util.logging.Logger;
31
32 import javax.management.AttributeNotFoundException;
33
34 import org.archive.crawler.deciderules.DecideRule;
35 import org.archive.crawler.deciderules.DecidingFilter;
36 import org.archive.crawler.framework.Filter;
37 import org.archive.crawler.settings.SimpleType;
38 import org.archive.crawler.settings.StringList;
39 import org.archive.util.TextUtils;
40
41
42 /***
43 * Compares passed object -- a CrawlURI, UURI, or String --
44 * against regular expressions, accepting matches.
45 * <p>
46 * Can be configured to logically OR or AND the regular expressions.
47 *
48 * @author Kristinn Sigurdsson
49 *
50 * @see org.archive.crawler.filter.URIRegExpFilter
51 * @deprecated As of release 1.10.0. Replaced by {@link DecidingFilter} and
52 * equivalent {@link DecideRule}.
53 */
54 public class URIListRegExpFilter
55 extends Filter {
56
57 private static final long serialVersionUID = -2587977969340783677L;
58
59 private static final Logger logger =
60 Logger.getLogger(URIListRegExpFilter.class.getName());
61 public static final String ATTR_REGEXP_LIST = "regexp-list";
62 public static final String ATTR_LIST_LOGIC= "list-logic";
63 public static final String ATTR_MATCH_RETURN_VALUE = "if-match-return";
64
65 public static final String DEFAULT_LIST_LOGIC = "OR";
66 public static final String[] LEGAL_LIST_LOGIC = {"OR","AND"};
67 public static final Boolean DEFAULT_MATCH_RETURN_VALUE = new Boolean(true);
68
69 /***
70 * @param name Filter name.
71 */
72 public URIListRegExpFilter(String name) {
73 super(name, "A filter that uses a list of regular expressions " +
74 "*Deprecated* Use DecidingFilter and equivalent DecideRule " +
75 "instead. Can be " +
76 "optionally either OR or AND based in its evaluation.");
77 addElementToDefinition(
78 new SimpleType(ATTR_MATCH_RETURN_VALUE, "What to return when" +
79 " regular expression matches. \n",
80 DEFAULT_MATCH_RETURN_VALUE));
81 addElementToDefinition(
82 new SimpleType(ATTR_LIST_LOGIC, "Should the list of regular " +
83 "expressions be considered as logically AND or OR when " +
84 "matching.",
85 DEFAULT_LIST_LOGIC, LEGAL_LIST_LOGIC));
86 addElementToDefinition(new StringList(ATTR_REGEXP_LIST,"The list of " +
87 "regular expressions to evalute against the URI."));
88 }
89
90 protected boolean innerAccepts(Object o) {
91 List regexps = getRegexp(o);
92 if(regexps.size()==0){
93 return false;
94 }
95 String str = o.toString();
96 Iterator it = regexps.iterator();
97
98 boolean listLogicOR = isListLogicOR(o);
99
100
101
102 boolean result = listLogicOR == false;
103
104 while(it.hasNext()){
105 String regexp = (String)it.next();
106 boolean matches = TextUtils.matches(regexp, str);
107
108 if (logger.isLoggable(Level.FINER)) {
109 logger.finer("Tested '" + str + "' match with regex '" +
110 regexp + " and result was " + matches);
111 }
112
113 if(matches){
114 if(listLogicOR){
115
116 result = true;
117 break;
118 }
119 } else {
120 if(listLogicOR == false){
121
122 result = false;
123 break;
124 }
125 }
126 }
127
128 result = getMatchReturnValue(o) ? result : !result;
129
130 if (logger.isLoggable(Level.FINE) && result){
131 logger.fine("Matched: " + str);
132 }
133
134 return result;
135 }
136
137 /***
138 * Get the regular expressions list to match the URI against.
139 *
140 * @param o the object for which the regular expression should be
141 * matched against.
142 * @return the regular expression to match against.
143 */
144 protected List getRegexp(Object o) {
145 try {
146 return (StringList) getAttribute(o, ATTR_REGEXP_LIST);
147 } catch (AttributeNotFoundException e) {
148 logger.severe(e.getMessage());
149
150
151 return null;
152 }
153 }
154
155 protected boolean getMatchReturnValue(Object o){
156 try {
157 return ((Boolean) getAttribute(o, ATTR_MATCH_RETURN_VALUE)).booleanValue();
158 } catch (AttributeNotFoundException e) {
159 logger.severe(e.getMessage());
160 return DEFAULT_MATCH_RETURN_VALUE.booleanValue();
161 }
162 }
163
164 protected boolean isListLogicOR(Object o){
165 String logic = DEFAULT_LIST_LOGIC;
166 try {
167 logic = (String) getAttribute(o, ATTR_LIST_LOGIC);
168 } catch (AttributeNotFoundException e) {
169 logger.severe(e.getMessage());
170 }
171 return logic.equals("OR") ? true : false;
172 }
173
174 }