View Javadoc

1   /* Copyright (C) 2003 Internet Archive.
2    *
3    * This file is part of the Heritrix web crawler (crawler.archive.org).
4    *
5    * Heritrix is free software; you can redistribute it and/or modify
6    * it under the terms of the GNU Lesser Public License as published by
7    * the Free Software Foundation; either version 2.1 of the License, or
8    * any later version.
9    *
10   * Heritrix is distributed in the hope that it will be useful,
11   * but WITHOUT ANY WARRANTY; without even the implied warranty of
12   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13   * GNU Lesser Public License for more details.
14   *
15   * You should have received a copy of the GNU Lesser Public License
16   * along with Heritrix; if not, write to the Free Software
17   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
18   *
19   * Filter.java
20   * Created on Apr 16, 2003
21   *
22   * $Header$
23   */
24  package org.archive.crawler.framework;
25  
26  import java.util.logging.Level;
27  import java.util.logging.Logger;
28  
29  import javax.management.AttributeNotFoundException;
30  
31  import org.archive.crawler.datamodel.CrawlURI;
32  import org.archive.crawler.settings.ComplexType;
33  import org.archive.crawler.settings.MapType;
34  import org.archive.crawler.settings.ModuleType;
35  import org.archive.crawler.settings.SimpleType;
36  
37  /***
38   * Base class for filter classes.
39   * <p>
40   * Several classes allow 'filters' to be applied to them. Filters are classes
41   * that, based on an arbitrary object passed to them, return a boolean stating
42   * if if passes the filter. Thus applying filters can affect the behavior of
43   * those classes. This class provides the basic framework for filters. All
44   * detailed implementation of filters inherit from it and it is considered to
45   * be a 'null' filter (always returns true).
46   *
47   * @author Gordon Mohr
48   *
49   * @see org.archive.crawler.framework.Processor
50   */
51  public class Filter extends ModuleType {
52  
53      private static final long serialVersionUID = -356718306794776802L;
54  
55      private static Logger logger =
56          Logger.getLogger("org.archive.crawler.framework.Filter");
57  
58      public static final String ATTR_ENABLED = "enabled";
59  
60      /***
61       * Creates a new 'null' filter.
62       * @param name the name of the filter.
63       * @param description an description of the filter suitable for showing in
64       * the user interface.
65       */
66      public Filter(String name, String description) {
67          super(name, description);
68          addElementToDefinition(
69              new SimpleType(ATTR_ENABLED,
70                  "Filter is enabled.", new Boolean(true)));
71      }
72  
73      /***
74       * Creates a new 'null' filter.
75       * @param name the name of the filter.
76       */
77      public Filter(String name) {
78          this(name, "Null filter - accepts everything.");
79      }
80  
81      public boolean accepts(Object o) {
82          CrawlURI curi = (o instanceof CrawlURI) ? (CrawlURI) o : null;
83  
84          // Skip the evaluation if the filter is disabled
85          try {
86              if (!((Boolean)getAttribute(ATTR_ENABLED, curi)).booleanValue()) {
87                  return getFilterOffPosition(curi);
88              }
89          } catch (AttributeNotFoundException e) {
90              logger.severe(e.getMessage());
91          }
92  
93          boolean accept = returnTrueIfMatches(curi) == innerAccepts(o);
94          if (accept && logger.isLoggable(Level.FINEST)) {
95              // Log if filter returns true
96              ComplexType p = this.getParent();
97              if (p instanceof MapType) {
98                  p = p.getParent();
99              }
100             String msg = this.toString() + " belonging to " + p.toString()
101                 + " accepted " + o.toString();
102             logger.finest(msg);
103         }
104         return accept;
105     }
106     
107     /***
108      * If the filter is disabled, the value returned by this method is
109      * what filters return as their disabled setting.
110      * Default is that we return 'true', continue processing, but some
111      * filters -- the exclude filters for example -- will want to return
112      * false if disabled so processing can continue.
113      * @param curi CrawlURI to use as context. Passed curi can be null.
114      * @return This filters 'off' position.
115      */
116     protected boolean getFilterOffPosition(CrawlURI curi) {
117         return true;
118     }
119 
120     /***
121      * Checks to see if filter functionality should be inverted for this
122      * curi.<p>
123      *
124      * All filters will by default return true if curi is accepted by the
125      * filter. If this method returns false, then the filter will return true
126      * if doesn't match.<p>
127      *
128      * Classes extending this class should override this method with
129      * appropriate code.
130      *
131      * @param curi Current CrawlURI
132      * @return true for default behaviour, false otherwise.
133      */
134     protected boolean returnTrueIfMatches(CrawlURI curi){
135         return true;
136     }
137 
138     /***
139      * Classes subclassing this one should override this method to perfrom
140      * their custom determination of whether or not the object given to it.
141      *
142      * @param o The object
143      * @return True if it passes the filter.
144      */
145     protected boolean innerAccepts(Object o) {
146         return true;
147     }
148 
149     public String toString() {
150         return "Filter<" + getName() + ">";
151     }
152 
153     public void kickUpdate() {
154         // by default, do nothing
155     }
156 }