View Javadoc

1   /* URIListRegExpFilter
2    * 
3    * $Id: URIListRegExpFilter.java 4652 2006-09-25 18:41:10Z paul_jack $
4    * 
5    * Created on 30.5.2005
6    *
7    * Copyright (C) 2004 Kristinn Sigurdsson.
8    * 
9    * This file is part of the Heritrix web crawler (crawler.archive.org).
10   * 
11   * Heritrix is free software; you can redistribute it and/or modify
12   * it under the terms of the GNU Lesser Public License as published by
13   * the Free Software Foundation; either version 2.1 of the License, or
14   * any later version.
15   * 
16   * Heritrix is distributed in the hope that it will be useful, 
17   * but WITHOUT ANY WARRANTY; without even the implied warranty of
18   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19   * GNU Lesser Public License for more details.
20   * 
21   * You should have received a copy of the GNU Lesser Public License
22   * along with Heritrix; if not, write to the Free Software
23   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
24   */
25  package org.archive.crawler.filter;
26  
27  import java.util.Iterator;
28  import java.util.List;
29  import java.util.logging.Level;
30  import java.util.logging.Logger;
31  
32  import javax.management.AttributeNotFoundException;
33  
34  import org.archive.crawler.deciderules.DecideRule;
35  import org.archive.crawler.deciderules.DecidingFilter;
36  import org.archive.crawler.framework.Filter;
37  import org.archive.crawler.settings.SimpleType;
38  import org.archive.crawler.settings.StringList;
39  import org.archive.util.TextUtils;
40  
41  
42  /**
43  * Compares passed object -- a CrawlURI, UURI, or String --
44  * against regular expressions, accepting matches.
45  * <p>
46  * Can be configured to logically OR or AND the regular expressions.
47  *
48  * @author Kristinn Sigurdsson
49  * 
50  * @see org.archive.crawler.filter.URIRegExpFilter
51  * @deprecated As of release 1.10.0.  Replaced by {@link DecidingFilter} and
52  * equivalent {@link DecideRule}.
53  */
54  public class URIListRegExpFilter
55  extends Filter {
56  
57      private static final long serialVersionUID = -2587977969340783677L;
58      
59      private static final Logger logger =
60         Logger.getLogger(URIListRegExpFilter.class.getName());
61     public static final String ATTR_REGEXP_LIST = "regexp-list";
62     public static final String ATTR_LIST_LOGIC= "list-logic";
63     public static final String ATTR_MATCH_RETURN_VALUE = "if-match-return";
64     
65     public static final String DEFAULT_LIST_LOGIC = "OR";
66     public static final String[] LEGAL_LIST_LOGIC = {"OR","AND"};
67     public static final Boolean DEFAULT_MATCH_RETURN_VALUE = new Boolean(true);
68  
69     /***
70      * @param name Filter name.
71      */
72     public URIListRegExpFilter(String name) {
73         super(name, "A filter that uses a list of regular expressions " +
74         		"*Deprecated* Use DecidingFilter and equivalent DecideRule " +
75         		"instead. Can be " +
76               "optionally either OR or AND based in its evaluation.");
77         addElementToDefinition(
78                 new SimpleType(ATTR_MATCH_RETURN_VALUE, "What to return when" +
79                     " regular expression matches. \n", 
80                     DEFAULT_MATCH_RETURN_VALUE));
81         addElementToDefinition(
82                 new SimpleType(ATTR_LIST_LOGIC, "Should the list of regular " +
83                     "expressions be considered as logically AND or OR when " +
84                     "matching.", 
85                     DEFAULT_LIST_LOGIC, LEGAL_LIST_LOGIC));
86         addElementToDefinition(new StringList(ATTR_REGEXP_LIST,"The list of " +
87              "regular expressions to evalute against the URI."));
88     }
89  
90     protected boolean innerAccepts(Object o) {
91         List regexps = getRegexp(o);
92         if(regexps.size()==0){
93             return false;
94         }
95         String str = o.toString();
96         Iterator it = regexps.iterator();
97         
98         boolean listLogicOR = isListLogicOR(o);
99         // Result is initialized so that if OR based the default assumption is
100        // false (find no matches) but if AND based the default assumption is
101        // true (finds no non-matches)
102        boolean result = listLogicOR == false;
103        
104        while(it.hasNext()){
105            String regexp = (String)it.next();
106            boolean matches = TextUtils.matches(regexp, str);
107 
108            if (logger.isLoggable(Level.FINER)) {
109                logger.finer("Tested '" + str + "' match with regex '" +
110                    regexp + " and result was " + matches);
111            }
112            
113            if(matches){
114                if(listLogicOR){
115                    // OR based and we just got a match, done!
116                    result = true;
117                    break;
118                }
119            } else {
120                if(listLogicOR == false){
121                    // AND based and we just found a non-match, done!
122                    result = false;
123                    break;
124                }
125            }
126        }
127        
128        result = getMatchReturnValue(o) ? result : !result;
129        
130        if (logger.isLoggable(Level.FINE) && result){
131            logger.fine("Matched: " + str);
132        }
133        
134        return result;
135    }
136 
137    /*** 
138     * Get the regular expressions list to match the URI against.
139     *
140     * @param o the object for which the regular expression should be
141     *          matched against.
142     * @return the regular expression to match against.
143     */
144    protected List getRegexp(Object o) {
145        try {
146            return (StringList) getAttribute(o, ATTR_REGEXP_LIST);
147        } catch (AttributeNotFoundException e) {
148            logger.severe(e.getMessage());
149            // Basically the filter is inactive if this occurs
150            // (The caller should be returning false when regexp is null).
151            return null;  
152        }
153    }
154    
155    protected boolean getMatchReturnValue(Object o){
156        try {
157            return ((Boolean) getAttribute(o, ATTR_MATCH_RETURN_VALUE)).booleanValue();
158        } catch (AttributeNotFoundException e) {
159            logger.severe(e.getMessage());
160            return DEFAULT_MATCH_RETURN_VALUE.booleanValue();  
161        }
162    }
163 
164    protected boolean isListLogicOR(Object o){
165        String logic = DEFAULT_LIST_LOGIC;
166        try {
167            logic = (String) getAttribute(o, ATTR_LIST_LOGIC);
168        } catch (AttributeNotFoundException e) {
169            logger.severe(e.getMessage());
170        }
171        return logic.equals("OR") ? true : false;
172    }
173 
174 }