View Javadoc

1   /* $Id: NotExceedsDocumentLengthTresholdDecideRule.java 4649 2006-09-25 17:16:55Z paul_jack $
2    * 
3    * Created on 28.8.2006
4    *
5    * Copyright (C) 2006 Olaf Freyer
6    * 
7    * This file is part of the Heritrix web crawler (crawler.archive.org).
8    * 
9    * Heritrix is free software; you can redistribute it and/or modify
10   * it under the terms of the GNU Lesser Public License as published by
11   * the Free Software Foundation; either version 2.1 of the License, or
12   * any later version.
13   * 
14   * Heritrix is distributed in the hope that it will be useful, 
15   * but WITHOUT ANY WARRANTY; without even the implied warranty of
16   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17   * GNU Lesser Public License for more details.
18   * 
19   * You should have received a copy of the GNU Lesser Public License
20   * along with Heritrix; if not, write to the Free Software
21   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
22   */
23  package org.archive.crawler.deciderules;
24  
25  import java.util.logging.Level;
26  import java.util.logging.Logger;
27  
28  import org.apache.commons.httpclient.HttpMethod;
29  import org.archive.crawler.datamodel.CoreAttributeConstants;
30  import org.archive.crawler.datamodel.CrawlURI;
31  import org.archive.crawler.settings.SimpleType;
32  
33  public class NotExceedsDocumentLengthTresholdDecideRule
34  extends PredicatedDecideRule implements CoreAttributeConstants {
35  	
36  
37      private static final long serialVersionUID = -8774160016195991876L;
38  
39      private static final Logger logger = Logger.
40      	getLogger(NotExceedsDocumentLengthTresholdDecideRule.class.getName());
41      public static final String ATTR_CONTENT_LENGTH_TRESHOLD =
42      	"content-length-treshold";
43      static final Integer DEFAULT_CONTENT_LENGTH_TRESHOLD = -1;
44      public static final String ATTR_USE_AS_MIDFETCH = "use-as-midfetch-filter";
45      static final Boolean DEFAULT_USE_AS_MIDFETCH = new Boolean(true);
46      
47      
48      // Header predictor state constants
49      public static final int HEADER_PREDICTS_MISSING = -1;
50  	
51      public NotExceedsDocumentLengthTresholdDecideRule(String name){
52      	super(name);
53      	setDescription("NotExceedsDocumentLengthTresholdDecideRule. " +
54      			"REJECTs URIs "+
55                  "with content length exceeding a given treshold. "+
56                  "Either examines HTTP header content length or " +
57                  "actual downloaded content length and returns false " +
58                  "for documents exceeding a given length treshold.");
59      	
60          addElementToDefinition(new SimpleType(ATTR_USE_AS_MIDFETCH,
61                  "Shall this rule be used as a midfetch rule? If true, " +
62                  "this rule will determine content length based on HTTP " +
63                  "header information, otherwise the size of the already " +
64                  "downloaded content will be used.",
65                  DEFAULT_USE_AS_MIDFETCH));
66  
67          addElementToDefinition(new SimpleType(ATTR_CONTENT_LENGTH_TRESHOLD,
68          	"Max " +
69  	        "content-length this filter will allow to pass through. If -1, " +
70  	        "then no limit.",
71  	        DEFAULT_CONTENT_LENGTH_TRESHOLD));
72      }
73      
74      protected boolean evaluate(Object object) {
75          try {
76              CrawlURI curi = (CrawlURI)object;
77              
78              int contentlength = HEADER_PREDICTS_MISSING;
79  
80              //filter used as midfetch filter
81          	if (getIsMidfetchRule(object)){
82          		
83                  	if(curi.containsKey(A_HTTP_TRANSACTION) == false){
84                  		// Missing header info, let pass
85                  		if (logger.isLoggable(Level.INFO)) {
86                  			logger.info("Error: Missing HttpMethod object in " +
87                  				"CrawlURI. " + curi.toString());
88                  		}
89                  		return false;
90                  	}
91          		
92                      // Initially assume header info is missing
93                      HttpMethod method =
94                      	(HttpMethod)curi.getObject(A_HTTP_TRANSACTION);
95  
96                      // get content-length 
97                      String newContentlength = null;
98                      if (method.getResponseHeader("content-length") != null) {
99                          newContentlength = method.
100                         	getResponseHeader("content-length").getValue();
101                     }
102                 
103                     if (newContentlength != null &&
104                     		newContentlength.length() > 0) {
105             	        try {
106             	        	contentlength = Integer.parseInt(newContentlength);
107             	        } catch (NumberFormatException nfe) {
108             	        	// Ignore.
109             	        }
110                     }
111                 
112                     // If no document length was reported or format was wrong, 
113                     // let pass
114                     if (contentlength == HEADER_PREDICTS_MISSING) {
115                         return false;
116                     }
117         	} else {
118         	    contentlength = (int)curi.getContentSize();
119         	}
120 
121             return makeDecision(contentlength, object);
122                 
123         } catch (ClassCastException e) {
124             // if not CrawlURI, always disregard
125             return false; 
126         }
127     }
128     
129     /***
130      * @param contentLength content length to check against treshold
131      * @param obj Context object.
132      * @return contentLength not exceeding treshold?
133      */
134     protected Boolean makeDecision(int contentLength, Object obj) {
135     	return contentLength < getContentLengthTreshold(obj);
136     }
137     
138     /***
139      * @param obj Context object.
140      * @return content length threshold
141      */
142     protected int getContentLengthTreshold(Object obj) {
143         int len = ((Integer)getUncheckedAttribute(obj,
144         	ATTR_CONTENT_LENGTH_TRESHOLD)).intValue();
145         return len == -1? Integer.MAX_VALUE: len;
146     }
147 
148     /***
149      * @param obj Context object.
150      * @return to be used as midfetch rule?
151      */
152     private Boolean getIsMidfetchRule(Object obj) {
153         return ((Boolean)getUncheckedAttribute(obj,ATTR_USE_AS_MIDFETCH)).
154         	booleanValue();
155     }
156 }