1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23 package org.archive.crawler.deciderules;
24
25 import java.util.logging.Level;
26 import java.util.logging.Logger;
27
28 import org.apache.commons.httpclient.HttpMethod;
29 import org.archive.crawler.datamodel.CoreAttributeConstants;
30 import org.archive.crawler.datamodel.CrawlURI;
31 import org.archive.crawler.settings.SimpleType;
32
33 public class NotExceedsDocumentLengthTresholdDecideRule
34 extends PredicatedDecideRule implements CoreAttributeConstants {
35
36
37 private static final long serialVersionUID = -8774160016195991876L;
38
39 private static final Logger logger = Logger.
40 getLogger(NotExceedsDocumentLengthTresholdDecideRule.class.getName());
41 public static final String ATTR_CONTENT_LENGTH_TRESHOLD =
42 "content-length-treshold";
43 static final Integer DEFAULT_CONTENT_LENGTH_TRESHOLD = -1;
44 public static final String ATTR_USE_AS_MIDFETCH = "use-as-midfetch-filter";
45 static final Boolean DEFAULT_USE_AS_MIDFETCH = new Boolean(true);
46
47
48
49 public static final int HEADER_PREDICTS_MISSING = -1;
50
51 public NotExceedsDocumentLengthTresholdDecideRule(String name){
52 super(name);
53 setDescription("NotExceedsDocumentLengthTresholdDecideRule. " +
54 "REJECTs URIs "+
55 "with content length exceeding a given treshold. "+
56 "Either examines HTTP header content length or " +
57 "actual downloaded content length and returns false " +
58 "for documents exceeding a given length treshold.");
59
60 addElementToDefinition(new SimpleType(ATTR_USE_AS_MIDFETCH,
61 "Shall this rule be used as a midfetch rule? If true, " +
62 "this rule will determine content length based on HTTP " +
63 "header information, otherwise the size of the already " +
64 "downloaded content will be used.",
65 DEFAULT_USE_AS_MIDFETCH));
66
67 addElementToDefinition(new SimpleType(ATTR_CONTENT_LENGTH_TRESHOLD,
68 "Max " +
69 "content-length this filter will allow to pass through. If -1, " +
70 "then no limit.",
71 DEFAULT_CONTENT_LENGTH_TRESHOLD));
72 }
73
74 protected boolean evaluate(Object object) {
75 try {
76 CrawlURI curi = (CrawlURI)object;
77
78 int contentlength = HEADER_PREDICTS_MISSING;
79
80
81 if (getIsMidfetchRule(object)){
82
83 if(curi.containsKey(A_HTTP_TRANSACTION) == false){
84
85 if (logger.isLoggable(Level.INFO)) {
86 logger.info("Error: Missing HttpMethod object in " +
87 "CrawlURI. " + curi.toString());
88 }
89 return false;
90 }
91
92
93 HttpMethod method =
94 (HttpMethod)curi.getObject(A_HTTP_TRANSACTION);
95
96
97 String newContentlength = null;
98 if (method.getResponseHeader("content-length") != null) {
99 newContentlength = method.
100 getResponseHeader("content-length").getValue();
101 }
102
103 if (newContentlength != null &&
104 newContentlength.length() > 0) {
105 try {
106 contentlength = Integer.parseInt(newContentlength);
107 } catch (NumberFormatException nfe) {
108
109 }
110 }
111
112
113
114 if (contentlength == HEADER_PREDICTS_MISSING) {
115 return false;
116 }
117 } else {
118 contentlength = (int)curi.getContentSize();
119 }
120
121 return makeDecision(contentlength, object);
122
123 } catch (ClassCastException e) {
124
125 return false;
126 }
127 }
128
129 /***
130 * @param contentLength content length to check against treshold
131 * @param obj Context object.
132 * @return contentLength not exceeding treshold?
133 */
134 protected Boolean makeDecision(int contentLength, Object obj) {
135 return contentLength < getContentLengthTreshold(obj);
136 }
137
138 /***
139 * @param obj Context object.
140 * @return content length threshold
141 */
142 protected int getContentLengthTreshold(Object obj) {
143 int len = ((Integer)getUncheckedAttribute(obj,
144 ATTR_CONTENT_LENGTH_TRESHOLD)).intValue();
145 return len == -1? Integer.MAX_VALUE: len;
146 }
147
148 /***
149 * @param obj Context object.
150 * @return to be used as midfetch rule?
151 */
152 private Boolean getIsMidfetchRule(Object obj) {
153 return ((Boolean)getUncheckedAttribute(obj,ATTR_USE_AS_MIDFETCH)).
154 booleanValue();
155 }
156 }