1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24 package org.archive.crawler.filter;
25
26 import javax.management.AttributeNotFoundException;
27
28 import org.archive.crawler.datamodel.CandidateURI;
29 import org.archive.crawler.deciderules.DecideRule;
30 import org.archive.crawler.deciderules.DecidingFilter;
31 import org.archive.crawler.extractor.Link;
32 import org.archive.crawler.framework.CrawlScope;
33 import org.archive.crawler.framework.Filter;
34 import org.archive.crawler.scope.ClassicScope;
35 import org.archive.crawler.settings.SimpleType;
36
37 /***
38 * Filter which accepts CandidateURI/CrawlURI instances which contain more
39 * than zero but fewer than max-trans-hops entries at the end of their
40 * discovery path.
41 *
42 * @author Gordon Mohr
43 * @deprecated As of release 1.10.0. Replaced by {@link DecidingFilter} and
44 * equivalent {@link DecideRule}.
45 */
46 public class TransclusionFilter extends Filter {
47
48 private static final long serialVersionUID = 4251767672778714051L;
49
50 private static final String ATTR_MAX_SPECULATIVE_HOPS =
51 "max-speculative-hops";
52 private static final String ATTR_MAX_REFERRAL_HOPS = "max-referral-hops";
53 private static final String ATTR_MAX_EMBED_HOPS = "max-embed-hops";
54 private static final int DEFAULT_MAX_TRANS_HOPS = 4;
55
56 /***
57 * Default speculative hops.
58 *
59 * No more than 1
60 */
61 private static final int DEFAULT_MAX_SPECULATIVE_HOPS = 1;
62
63 /***
64 * Default maximum referral hops.
65 *
66 * No limit beside the overall trans limit
67 */
68 private static final int DEFAULT_MAX_REFERRAL_HOPS = -1;
69
70 /***
71 * Default embedded link hops.
72 *
73 * No limit beside the overall trans limit
74 */
75 private static final int DEFAULT_MAX_EMBED_HOPS = -1;
76
77 int maxTransHops = DEFAULT_MAX_TRANS_HOPS;
78 int maxSpeculativeHops = DEFAULT_MAX_SPECULATIVE_HOPS;
79 int maxReferralHops = DEFAULT_MAX_REFERRAL_HOPS;
80 int maxEmbedHops = DEFAULT_MAX_EMBED_HOPS;
81
82
83
84
85 /***
86 * @param name
87 */
88 public TransclusionFilter(String name) {
89 super(name, "Transclusion filter *Deprecated* Use" +
90 "DecidingFilter and equivalent DecideRule instead.");
91
92 addElementToDefinition(
93 new SimpleType(
94 ATTR_MAX_SPECULATIVE_HOPS,
95 "Maximum number of consecutive speculative (i.e. URIs" +
96 " extracted that we are not sure if they are embeds or" +
97 " not) hops to allow.\nA value of -1 means no upper limit.",
98 new Integer(DEFAULT_MAX_SPECULATIVE_HOPS)));
99 addElementToDefinition(
100 new SimpleType(
101 ATTR_MAX_REFERRAL_HOPS,
102 "Maximum number of consecutive referral hops to allow.\n" +
103 "A value of -1 means no upper limit.",
104 new Integer(DEFAULT_MAX_REFERRAL_HOPS)));
105 addElementToDefinition(
106 new SimpleType(
107 ATTR_MAX_EMBED_HOPS,
108 "Maximum number of consecutive embed hops to allow.\n" +
109 "A value of -1 means no upper limit.",
110 new Integer(DEFAULT_MAX_EMBED_HOPS)));
111 }
112
113
114
115
116 protected boolean innerAccepts(Object o) {
117 if(! (o instanceof CandidateURI)) {
118 return false;
119 }
120 String path = ((CandidateURI)o).getPathFromSeed();
121 int transCount = 0;
122 int specCount = 0;
123 int refCount = 0;
124 int embedCount = 0;
125 loop: for(int i=path.length()-1;i>=0;i--) {
126
127 switch (path.charAt(i)) {
128 case Link.NAVLINK_HOP: {
129 break loop;
130 }
131 case Link.PREREQ_HOP: {
132 if(transCount==0) {
133
134 transCount++;
135 break loop;
136 }
137
138 break;
139 }
140 case Link.SPECULATIVE_HOP: {
141 specCount++;
142 break;
143 }
144 case Link.REFER_HOP: {
145 refCount++;
146 break;
147 }
148 case Link.EMBED_HOP: {
149 embedCount++;
150 break;
151 }
152
153
154 }
155 transCount++;
156 }
157
158 readMaxValues(o);
159
160
161 return (transCount > 0)
162
163 && (transCount <= this.maxTransHops)
164
165 && (this.maxSpeculativeHops < 0 || specCount <= this.maxSpeculativeHops)
166
167 && (this.maxReferralHops < 0 || refCount <= this.maxReferralHops)
168
169 && (this.maxEmbedHops < 0 || embedCount <= this.maxEmbedHops);
170 }
171
172 public void readMaxValues(Object o) {
173 try {
174 CrawlScope scope =
175 (CrawlScope) globalSettings().getModule(CrawlScope.ATTR_NAME);
176 this.maxTransHops = ((Integer) scope.getAttribute(o, ClassicScope.ATTR_MAX_TRANS_HOPS)).intValue();
177 this.maxSpeculativeHops = ((Integer) getAttribute(o, ATTR_MAX_SPECULATIVE_HOPS)).intValue();
178 this.maxReferralHops = ((Integer) getAttribute(o, ATTR_MAX_REFERRAL_HOPS)).intValue();
179 this.maxEmbedHops = ((Integer) getAttribute(o, ATTR_MAX_EMBED_HOPS)).intValue();
180 } catch (AttributeNotFoundException e) {
181
182 e.printStackTrace();
183 }
184 }
185
186 }