View Javadoc

1   /* Copyright (C) 2003 Internet Archive.
2    *
3    * This file is part of the Heritrix web crawler (crawler.archive.org).
4    *
5    * Heritrix is free software; you can redistribute it and/or modify
6    * it under the terms of the GNU Lesser Public License as published by
7    * the Free Software Foundation; either version 2.1 of the License, or
8    * any later version.
9    *
10   * Heritrix is distributed in the hope that it will be useful,
11   * but WITHOUT ANY WARRANTY; without even the implied warranty of
12   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13   * GNU Lesser Public License for more details.
14   *
15   * You should have received a copy of the GNU Lesser Public License
16   * along with Heritrix; if not, write to the Free Software
17   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
18   *
19   * PathDepthFilter.java
20   * Created on Oct 3, 2003
21   *
22   * $Header$
23   */
24  package org.archive.crawler.filter;
25  
26  import java.util.logging.Logger;
27  
28  import javax.management.AttributeNotFoundException;
29  
30  import org.apache.commons.httpclient.URIException;
31  import org.archive.crawler.datamodel.CandidateURI;
32  import org.archive.crawler.datamodel.CrawlURI;
33  import org.archive.crawler.deciderules.DecideRule;
34  import org.archive.crawler.deciderules.DecidingFilter;
35  import org.archive.crawler.framework.Filter;
36  import org.archive.crawler.settings.SimpleType;
37  import org.archive.net.UURI;
38  
39  /***
40   * Accepts all urls passed in with a path depth
41   * less or equal than the max-path-depth
42   * value.
43   *
44   * @author Igor Ranitovic
45   * @deprecated As of release 1.10.0.  Replaced by {@link DecidingFilter} and
46   * equivalent {@link DecideRule}.
47   */
48  public class PathDepthFilter extends Filter {
49  
50      private static final long serialVersionUID = 1626115117327154205L;
51  
52      private static final Logger logger =
53          Logger.getLogger(PathDepthFilter.class.getName());
54      public static final String ATTR_MATCH_RETURN_VALUE =
55          "path-less-or-equal-return";
56      public static final String ATTR_MAX_PATH_DEPTH = "max-path-depth";
57      Integer maxPathDepth = new Integer(Integer.MAX_VALUE);
58      final static char slash = '/';
59  
60      /***
61       * @param name
62       */
63      public PathDepthFilter(String name) {
64          super(name, "Path depth less or equal filter  *Deprecated* Use" +
65          		"DecidingFilter and equivalent DecideRule instead.");
66          addElementToDefinition(new SimpleType(ATTR_MAX_PATH_DEPTH, "Max path" +
67                  " depth for which this filter will match", maxPathDepth));
68          addElementToDefinition(new SimpleType(ATTR_MATCH_RETURN_VALUE,
69                  "What to return when path depth is less or equal to max path" +
70                  " depth. \n", new Boolean(true)));
71      }
72  
73      protected boolean innerAccepts(Object o) {
74          String path = null;
75          if (o == null) {
76              return false;
77          }
78          
79          if (o instanceof CandidateURI) {
80              try {
81                  if (((CandidateURI)o).getUURI() != null) {
82                      path = ((CandidateURI)o).getUURI().getPath();
83                  }
84              }
85              catch (URIException e) {
86                  logger.severe("Failed getpath for " +
87                      ((CandidateURI)o).getUURI());
88              }
89          } else if (o instanceof UURI) {
90              try {
91                  path = ((UURI)o).getPath();
92              }
93              catch (URIException e) {
94                  logger.severe("Failed getpath for " + o);
95              }
96          }
97  
98          if (path == null) {
99              return true;
100         }
101 
102         int count = 0;
103         for (int i = path.indexOf(slash); i != -1;
104         		i = path.indexOf(slash, i + 1)) {
105             count++;
106         }
107         
108         if (o instanceof CrawlURI) {
109             try {
110                 this.maxPathDepth = (Integer) getAttribute(
111                         ATTR_MAX_PATH_DEPTH, (CrawlURI) o);
112             } catch (AttributeNotFoundException e) {
113                 logger.severe(e.getMessage());
114             }
115         }
116         
117         return (this.maxPathDepth != null) ?
118             count <= this.maxPathDepth.intValue():
119             false;
120     }
121 
122     protected boolean returnTrueIfMatches(CrawlURI curi) {
123        try {
124            return ((Boolean) getAttribute(ATTR_MATCH_RETURN_VALUE, curi)).
125                booleanValue();
126        } catch (AttributeNotFoundException e) {
127            logger.severe(e.getMessage());
128            return true;
129        }
130     }
131     
132     protected boolean getFilterOffPosition(CrawlURI curi) {
133         return returnTrueIfMatches(curi);
134     }
135 }