1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24 package org.archive.crawler.scope;
25
26 import java.util.Iterator;
27 import java.util.logging.Logger;
28
29 import org.apache.commons.httpclient.URIException;
30 import org.archive.crawler.deciderules.DecidingScope;
31 import org.archive.crawler.filter.FilePatternFilter;
32 import org.archive.crawler.filter.TransclusionFilter;
33 import org.archive.crawler.framework.Filter;
34 import org.archive.net.UURI;
35
36 /***
37 * A core CrawlScope suitable for the most common
38 * crawl needs.
39 *
40 * Roughly, its logic is that a URI is included if:
41 *
42 * (( isSeed(uri) || focusFilter.accepts(uri) )
43 * || transitiveFilter.accepts(uri) )
44 * && ! excludeFilter.accepts(uri)
45 *
46 * The focusFilter may be specified by either:
47 * - adding a 'mode' attribute to the
48 * <code>scope</code> element. mode="broad" is equivalent
49 * to no focus; modes "path", "host", and "domain"
50 * imply a SeedExtensionFilter will be used, with
51 * the <code>scope</code> element providing its configuration
52 * - adding a <code>focus</code> subelement
53 * If unspecified, the focusFilter will default to
54 * an accepts-all filter.
55 *
56 * The transitiveFilter may be specified by supplying
57 * a <code>transitive</code> subelement. If unspecified, a
58 * TransclusionFilter will be used, with the <code>scope</code>
59 * element providing its configuration.
60 *
61 * The excludeFilter may be specified by supplying
62 * an <code>exclude</code> subelement. If unspecified, an
63 * accepts-none filter will be used -- meaning that
64 * no URIs will match the filter, so none will be excluded.
65 *
66 * @author gojomo
67 * @deprecated As of release 1.10.0. Replaced by {@link DecidingScope}.
68 */
69 public class DomainScope extends SeedCachingScope {
70
71 private static final long serialVersionUID = 648062105277258820L;
72
73 private static final Logger logger =
74 Logger.getLogger(DomainScope.class.getName());
75
76 public static final String ATTR_TRANSITIVE_FILTER = "transitiveFilter";
77 public static final String ATTR_ADDITIONAL_FOCUS_FILTER =
78 "additionalScopeFocus";
79 public static final String DOT = ".";
80
81 Filter additionalFocusFilter;
82 Filter transitiveFilter;
83
84 public DomainScope(String name) {
85 super(name);
86 setDescription(
87 "DomainScope: A scope for domain crawls *Deprecated* Use " +
88 "DecidingScope instead. Crawls made with this" +
89 " scope will be limited to the domain of its seeds. It will" +
90 " however reach subdomains of the seeds' original domains." +
91 " www[#].host is considered to be the same as host.");
92 this.additionalFocusFilter = (Filter) addElementToDefinition(
93 new FilePatternFilter(ATTR_ADDITIONAL_FOCUS_FILTER));
94 this.transitiveFilter = (Filter) addElementToDefinition(
95 new TransclusionFilter(ATTR_TRANSITIVE_FILTER));
96 }
97
98 /***
99 * @param o
100 * @return True if transitive filter accepts passed object.
101 */
102 protected boolean transitiveAccepts(Object o) {
103 return this.transitiveFilter.accepts(o);
104 }
105
106 /***
107 * Check if an URI is part of this scope.
108 *
109 * @param o An instance of UURI or of CandidateURI.
110 * @return True if focus filter accepts passed object.
111 */
112 protected boolean focusAccepts(Object o) {
113 UURI u = UURI.from(o);
114 if (u == null) {
115 return false;
116 }
117
118
119
120
121 String seedDomain = null;
122 String candidateDomain =null;
123
124
125 try {
126 candidateDomain = u.getHostBasename();
127 }
128 catch (URIException e1) {
129 logger.severe(
130 "UURI getHostBasename failed for candidate URI: " + u);
131 }
132 if (candidateDomain == null) {
133
134 return false;
135 }
136
137 Iterator iter = seedsIterator();
138 while(iter.hasNext()) {
139 UURI s = (UURI)iter.next();
140
141 try {
142 seedDomain = s.getHostBasename();
143 }
144 catch (URIException e) {
145 logger.severe("UURI getHostBasename failed for seed: " +
146 s);
147 }
148 if (seedDomain == null) {
149
150
151 continue;
152 }
153
154
155 if (seedDomain.equals(candidateDomain)) {
156 checkClose(iter);
157 return true;
158 }
159
160
161
162 seedDomain = DOT + seedDomain;
163 if (seedDomain.regionMatches(0, candidateDomain,
164 candidateDomain.length() - seedDomain.length(),
165 seedDomain.length())) {
166
167 checkClose(iter);
168 return true;
169 }
170 }
171
172 checkClose(iter);
173 return false;
174 }
175
176 protected boolean additionalFocusAccepts(Object o) {
177 return additionalFocusFilter.accepts(o);
178 }
179 }