View Javadoc

1   /* Copyright (C) 2003 Internet Archive.
2    *
3    * This file is part of the Heritrix web crawler (crawler.archive.org).
4    *
5    * Heritrix is free software; you can redistribute it and/or modify
6    * it under the terms of the GNU Lesser Public License as published by
7    * the Free Software Foundation; either version 2.1 of the License, or
8    * any later version.
9    *
10   * Heritrix is distributed in the hope that it will be useful,
11   * but WITHOUT ANY WARRANTY; without even the implied warranty of
12   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13   * GNU Lesser Public License for more details.
14   *
15   * You should have received a copy of the GNU Lesser Public License
16   * along with Heritrix; if not, write to the Free Software
17   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
18   *
19   * Preselector.java
20   * Created on Sep 22, 2003
21   *
22   * $Header$
23   */
24  package org.archive.crawler.prefetch;
25  
26  import javax.management.AttributeNotFoundException;
27  
28  import org.archive.crawler.datamodel.CrawlURI;
29  import org.archive.crawler.datamodel.FetchStatusCodes;
30  import org.archive.crawler.framework.Scoper;
31  import org.archive.crawler.settings.SimpleType;
32  import org.archive.crawler.settings.Type;
33  import org.archive.util.TextUtils;
34  
35  /***
36   * If set to recheck the crawl's scope, gives a yes/no on whether
37   * a CrawlURI should be processed at all. If not, its status
38   * will be marked OUT_OF_SCOPE and the URI will skip directly
39   * to the first "postprocessor".
40   *
41   *
42   * @author gojomo
43   *
44   */
45  public class Preselector extends Scoper
46  implements FetchStatusCodes {
47  
48      private static final long serialVersionUID = 3738560264369561017L;
49  
50      /*** whether to reapply crawl scope at this step */
51      public static final String ATTR_RECHECK_SCOPE = "recheck-scope";
52      /*** indicator allowing all URIs (of a given host, typically) to
53       * be blocked at this step*/
54      public static final String ATTR_BLOCK_ALL = "block-all";
55      /*** indicator allowing all matching URIs to be blocked at this step */
56      public static final String ATTR_BLOCK_BY_REGEXP = "block-by-regexp";
57      /*** indicator allowing all matching URIs */
58      public static final String ATTR_ALLOW_BY_REGEXP = "allow-by-regexp";
59  
60      /***
61       * Constructor.
62       * @param name Name of this processor.
63       */
64      public Preselector(String name) {
65          super(name, "Preselector. Does one last bit of checking to make " +
66              "sure that the current URI should be fetched.");
67          Type e;
68          e = addElementToDefinition(new SimpleType(ATTR_RECHECK_SCOPE,
69                  "Recheck if uri is in scope. This is meaningful if the scope" +
70                  " is altered during a crawl. URIs are checked against the" +
71                  " scope when they are added to queues. Setting this value to" +
72                  " true forces the URI to be checked against the scope when it" +
73                  " is comming out of the queue, possibly after the scope is" +
74                  " altered.", new Boolean(false)));
75          e.setExpertSetting(true);
76  
77          e = addElementToDefinition(new SimpleType(ATTR_BLOCK_ALL,
78                  "Block all URIs from being processed. This is most likely to" +
79                  " be used in overrides to easily reject certain hosts from" +
80                  " being processed.", new Boolean(false)));
81          e.setExpertSetting(true);
82  
83          e = addElementToDefinition(new SimpleType(ATTR_BLOCK_BY_REGEXP,
84                  "Block all URIs matching the regular expression from being" +
85                  " processed.", ""));
86          e.setExpertSetting(true);
87  
88          e = addElementToDefinition(new SimpleType(ATTR_ALLOW_BY_REGEXP,
89                  "Allow only URIs matching the regular expression to be" +
90                  " processed.", ""));
91          e.setExpertSetting(true);
92      }
93  
94      protected void innerProcess(CrawlURI curi) {
95          // Check if uris should be blocked
96          try {
97              if (((Boolean) getAttribute(ATTR_BLOCK_ALL, curi)).booleanValue()) {
98                  curi.setFetchStatus(S_BLOCKED_BY_USER);
99                  curi.skipToProcessorChain(getController().
100                     getPostprocessorChain());
101             }
102         } catch (AttributeNotFoundException e) {
103             // Act as attribute was false, that is: do nothing.
104         }
105 
106         // Check if allowed by regular expression
107         try {
108             String regexp = (String) getAttribute(ATTR_ALLOW_BY_REGEXP, curi);
109             if (regexp != null && !regexp.equals("")) {
110                 if (!TextUtils.matches(regexp, curi.toString())) {
111                     curi.setFetchStatus(S_BLOCKED_BY_USER);
112                     curi.skipToProcessorChain(getController().
113                         getPostprocessorChain());
114                 }
115             }
116         } catch (AttributeNotFoundException e) {
117             // Act as regexp was null, that is: do nothing.
118         }
119 
120 
121         // Check if blocked by regular expression
122         try {
123             String regexp = (String) getAttribute(ATTR_BLOCK_BY_REGEXP, curi);
124             if (regexp != null && !regexp.equals("")) {
125                 if (TextUtils.matches(regexp, curi.toString())) {
126                     curi.setFetchStatus(S_BLOCKED_BY_USER);
127                     curi.skipToProcessorChain(getController().
128                         getPostprocessorChain());
129                 }
130             }
131         } catch (AttributeNotFoundException e) {
132             // Act as regexp was null, that is: do nothing.
133         }
134 
135         // Possibly recheck scope
136         try {
137             if (((Boolean) getAttribute(ATTR_RECHECK_SCOPE, curi)).
138                     booleanValue()) {
139                 if (!isInScope(curi)) {
140                     // Scope rejected
141                     curi.setFetchStatus(S_OUT_OF_SCOPE);
142                     curi.skipToProcessorChain(getController().
143                         getPostprocessorChain());
144                 }
145             }
146         } catch (AttributeNotFoundException e) {
147             // Act as attribute was false, that is: do nothing.
148         }
149     }
150 }