View Javadoc

1   /* Scoper
2    * 
3    * Created on Jun 6, 2005
4    *
5    * Copyright (C) 2005 Internet Archive.
6    * 
7    * This file is part of the Heritrix web crawler (crawler.archive.org).
8    * 
9    * Heritrix is free software; you can redistribute it and/or modify
10   * it under the terms of the GNU Lesser Public License as published by
11   * the Free Software Foundation; either version 2.1 of the License, or
12   * any later version.
13   * 
14   * Heritrix is distributed in the hope that it will be useful, 
15   * but WITHOUT ANY WARRANTY; without even the implied warranty of
16   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17   * GNU Lesser Public License for more details.
18   * 
19   * You should have received a copy of the GNU Lesser Public License
20   * along with Heritrix; if not, write to the Free Software
21   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
22   */
23  package org.archive.crawler.framework;
24  
25  import java.util.logging.Handler;
26  import java.util.logging.Level;
27  import java.util.logging.Logger;
28  
29  import javax.management.AttributeNotFoundException;
30  
31  import org.archive.crawler.datamodel.CandidateURI;
32  import org.archive.crawler.settings.SimpleType;
33  import org.archive.crawler.settings.Type;
34  import org.archive.crawler.util.LogUtils;
35  
36  /***
37   * Base class for Scopers.
38   * Scopers test CandidateURIs against a scope.
39   * Scopers allow logging of rejected CandidateURIs.
40   * @author stack
41   * @version $Date: 2010-05-11 22:15:04 +0000 (Tue, 11 May 2010) $, $Revision: 6867 $
42   */
43  public abstract class Scoper extends Processor {
44      private static Logger LOGGER =
45          Logger.getLogger(Scoper.class.getName());
46      
47      /***
48       * Protected so avaiilable to subclasses.
49       */
50      protected static final String ATTR_OVERRIDE_LOGGER_ENABLED =
51          "override-logger";
52  
53      /***
54       * Constructor.
55       * @param name
56       * @param description
57       */
58      public Scoper(String name, String description) {
59          super(name, description);
60          Type t = addElementToDefinition(
61              new SimpleType(ATTR_OVERRIDE_LOGGER_ENABLED,
62              "If enabled, override default logger for this class (Default " +
63              "logger writes the console).  Override " +
64              "logger will instead send all logging to a file named for this " +
65              "class in the job log directory. Set the logging level and " +
66              "other " +
67              "characteristics of the override logger such as rotation size, " +
68              "suffix pattern, etc. in heritrix.properties. This attribute " +
69              "is only checked once, on startup of a job.",
70              new Boolean(false)));
71          t.setExpertSetting(true);
72      }
73      
74      protected void initialTasks() {
75          super.initialTasks();
76          if (!isOverrideLogger(null)) {
77              return;
78          }
79          // Set up logger for this instance.  May have special directives
80          // since this class can log scope-rejected URLs.
81          LogUtils.createFileLogger(getController().getLogsDir(),
82              this.getClass().getName(),
83              Logger.getLogger(this.getClass().getName()));
84      }
85      
86      @Override
87      protected void finalTasks() {
88          super.finalTasks();
89          if (isOverrideLogger(null)) {
90              Logger logger = Logger.getLogger(this.getClass().getName());
91              logger.setUseParentHandlers(true);
92              for (Handler handler: logger.getHandlers()) {
93                  // XXX is there any chance this logger will have another extra
94                  // handler that we shouldn't remove?
95                  logger.removeHandler(handler);
96              }
97          }
98      }
99  
100     /***
101      * @param context Context to use looking up attribute.
102      * @return True if we are to override default logger (default logs
103      * to console) with a logger that writes all loggings to a file
104      * named for this class.
105      */
106     protected boolean isOverrideLogger(Object context) {
107         boolean result = true;
108         try {
109             Boolean b = (Boolean)getAttribute(context,
110                 ATTR_OVERRIDE_LOGGER_ENABLED);
111             if (b != null) {
112                 result = b.booleanValue();
113             }
114         } catch (AttributeNotFoundException e) {
115             LOGGER.warning("Failed get of 'enabled' attribute.");
116         }
117 
118         return result;
119     }
120     
121     /***
122      * Schedule the given {@link CandidateURI CandidateURI} with the Frontier.
123      * @param caUri The CandidateURI to be scheduled.
124      * @return true if CandidateURI was accepted by crawl scope, false
125      * otherwise.
126      */
127     protected boolean isInScope(CandidateURI caUri) {
128         boolean result = false;
129         if (getController().getScope().accepts(caUri)) {
130             result = true;
131             if (LOGGER.isLoggable(Level.FINER)) {
132                 LOGGER.finer("Accepted: " + caUri);
133             }
134         } else {
135             outOfScope(caUri);
136         }
137         return result;
138     }
139     
140     /***
141      * Called when a CandidateUri is ruled out of scope.
142      * Override if you don't want logs as coming from this class.
143      * @param caUri CandidateURI that is out of scope.
144      */
145     protected void outOfScope(CandidateURI caUri) {
146         if (!LOGGER.isLoggable(Level.INFO)) {
147             return;
148         }
149         LOGGER.info(caUri.getUURI().toString());
150     }
151 }