View Javadoc

1   /* Copyright (C) 2003 Internet Archive.
2    *
3    * This file is part of the Heritrix web crawler (crawler.archive.org).
4    *
5    * Heritrix is free software; you can redistribute it and/or modify
6    * it under the terms of the GNU Lesser Public License as published by
7    * the Free Software Foundation; either version 2.1 of the License, or
8    * any later version.
9    *
10   * Heritrix is distributed in the hope that it will be useful,
11   * but WITHOUT ANY WARRANTY; without even the implied warranty of
12   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13   * GNU Lesser Public License for more details.
14   *
15   * You should have received a copy of the GNU Lesser Public License
16   * along with Heritrix; if not, write to the Free Software
17   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
18   *
19   * RegExpFilter.java
20   * Created on Apr 16, 2003
21   *
22   * $Header$
23   */
24  package org.archive.crawler.filter;
25  
26  import java.util.logging.Level;
27  import java.util.logging.Logger;
28  
29  import javax.management.AttributeNotFoundException;
30  
31  import org.archive.crawler.datamodel.CrawlURI;
32  import org.archive.crawler.deciderules.DecideRule;
33  import org.archive.crawler.deciderules.DecidingFilter;
34  import org.archive.crawler.framework.Filter;
35  import org.archive.crawler.settings.SimpleType;
36  import org.archive.util.TextUtils;
37  
38  
39  /***
40   * Compares passed object -- a CrawlURI, UURI, or String --
41   * against a regular expression, accepting matches.
42   *
43   * @author Gordon Mohr
44   * @deprecated As of release 1.10.0.  Replaced by {@link DecidingFilter} and
45   * equivalent {@link DecideRule}.
46   */
47  public class URIRegExpFilter
48  extends Filter {
49  
50      private static final long serialVersionUID = 1878356276332865537L;
51  
52      private static final Logger logger =
53          Logger.getLogger(URIRegExpFilter.class.getName());
54      public static final String ATTR_REGEXP = "regexp";
55      public static final String ATTR_MATCH_RETURN_VALUE = "if-match-return";
56  
57      /***
58       * @param name Filter name.
59       */
60      public URIRegExpFilter(String name) {
61          this(name, "URI regexp filter *Deprecated* Use DecidingFilter and " +
62          	"equivalent DecideRule instead. ", "");
63          addElementToDefinition(
64              new SimpleType(ATTR_MATCH_RETURN_VALUE, "What to return when" +
65                  " regular expression matches. \n", new Boolean(true)));
66          addElementToDefinition(
67              new SimpleType(ATTR_REGEXP, "Java regular expression.", ""));
68      }
69  
70      public URIRegExpFilter(String name, String regexp) {
71          this(name, "URI regexp filter.", regexp);
72      }
73  
74      protected URIRegExpFilter(String name, String description, String regexp) {
75          super(name, description);
76          addElementToDefinition(new SimpleType(ATTR_MATCH_RETURN_VALUE,
77              "What to return when" + " regular expression matches. \n",
78              new Boolean(true)));
79          addElementToDefinition(new SimpleType(ATTR_REGEXP,
80              "Java regular expression.", regexp)); 
81      }
82  
83      protected boolean innerAccepts(Object o) {
84          String regexp = getRegexp(o);
85          String str = o.toString();
86          boolean result = (regexp == null)?
87              false: TextUtils.matches(regexp, str);
88          if (logger.isLoggable(Level.FINE)) {
89              logger.fine("Tested '" + str + "' match with regex '" +
90                  getRegexp(o) + " and result was " + result);
91          }
92          return result;
93      }
94  
95      /*** 
96       * Get the regular expression string to match the URI against.
97       *
98       * @param o the object for which the regular expression should be
99       *          matched against.
100      * @return the regular expression to match against.
101      */
102     protected String getRegexp(Object o) {
103         try {
104             return (String) getAttribute(o, ATTR_REGEXP);
105         } catch (AttributeNotFoundException e) {
106             logger.severe(e.getMessage());
107             // Basically the filter is inactive if this occurs
108             // (The caller should be returning false when regexp is null).
109             return null;  
110         }
111     }
112 
113     protected boolean returnTrueIfMatches(CrawlURI curi) {
114         try {
115             return ((Boolean)getAttribute(ATTR_MATCH_RETURN_VALUE, curi)).
116                 booleanValue();
117         } catch (AttributeNotFoundException e) {
118             logger.severe(e.getMessage());
119             return true;
120         }
121     }
122 }