1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25 package org.archive.util;
26
27 import java.util.logging.Level;
28 import java.util.logging.Logger;
29 import java.util.regex.Matcher;
30
31 import org.apache.commons.codec.DecoderException;
32 import org.apache.commons.httpclient.URIException;
33 import org.archive.net.LaxURLCodec;
34 import org.archive.net.UURI;
35
36
37 /***
38 * URI-related utilities.
39 *
40 * Primarily, a place to centralize and better document and test certain URI-related heuristics
41 * that may be useful in many places.
42 *
43 * The choice of when to consider a string likely enough to be a URI that we try crawling it
44 * is, so far, based on rather arbitrary rules-of-thumb. We have not quantitatively tested
45 * how often the strings that pass these tests yield meaningful (not 404, non-soft-404,
46 * non-garbage) replies. We are willing to accept some level of mistaken requests, knowing
47 * that their cost is usually negligible, if that allows us to discover meaningful content
 * that could not be discovered via other heuristics.
49 *
50 * Our intuitive understanding so far is that: strings that appear to have ./.. relative-path
51 * prefixes, dot-extensions, or path-slashes are good candidates for trying as URIs, even
52 * though with some Javascript/HTML-VALUE-attributes, this yields a lot of false positives.
53 *
54 * We want to get strings like....
55 *
56 * photo.jpg
57 * /photos
58 * /photos/
59 * ./photos
60 * ../../photos
61 * photos/index.html
62 *
63 * ...but we will thus also sometimes try strings that were other kinds of variables/
64 * parameters, like...
65 *
66 * rectangle.x
67 * 11.2px
68 * text/xml
69 * width:6.33
70 *
71 * Until better rules, exception-blacklists or even site-sensitive dynamic adjustment of
 * heuristics (eg: this site, guesses are yielding 200s, keep guessing; this site, guesses are
 * all 404s, stop guessing) are developed, crawl operators should monitor their crawls
 * (and contact email) for cases where speculative crawling is generating many errors, and
75 * use settings like ExtractorHTML's 'extract-javascript' and 'extract-value-attributes' or
76 * disable of ExtractorJS entirely when they want to curtail those errors.
77 *
78 * The 'legacy' tests are those used in H1 at least through 1.14.4. They have
79 * some known problems, but are not yet being dropped until more experience
80 * with the 'new' isLikelyUri() test is collected (in H3). Enable the 'xest'
81 * methods of the UriUtilsTest class for details.
82 *
83 * @contributor gojomo
84 */
85 public class UriUtils {
86 private static final Logger LOGGER = Logger.getLogger(UriUtils.class.getName());
87
88
89
90
91
92
93
94
95 static final String NAIVE_LIKELY_URI_PATTERN = "[^<>//s]*[//./][^<>//s]*(?<!//.)";
96
97
98
99 protected final static String[] NAIVE_URI_EXCEPTIONS = {
100 "text/javascript"
101 };
102
103 public static boolean isLikelyUri(CharSequence candidate) {
104
105 if(!TextUtils.matches(NAIVE_LIKELY_URI_PATTERN, candidate)) {
106 return false;
107 }
108
109 for (String s : NAIVE_URI_EXCEPTIONS) {
110 if (s.contentEquals(candidate))
111 return false;
112 }
113
114 if(TextUtils.matches("//d+//.//d+", candidate)) {
115 return false;
116 }
117 return true;
118 }
119
120
121 /***
122 * Perform additional fixup of likely-URI Strings
123 *
124 * @param string detected candidate String
125 * @return String changed/decoded to increase likelihood it is a
126 * meaningful non-404 URI
127 */
128 public static String speculativeFixup(String candidate, UURI base) {
129 String retVal = candidate;
130
131
132 retVal = TextUtils.replaceAll("&", retVal, "&");
133
134
135 Matcher m = TextUtils.getMatcher("(?i)^https?%3A.*",retVal);
136 if(m.matches()) {
137 try {
138 retVal = LaxURLCodec.DEFAULT.decode(retVal);
139 } catch (DecoderException e) {
140 LOGGER.log(Level.INFO,"unable to decode",e);
141 }
142 }
143 TextUtils.recycleMatcher(m);
144
145
146
147
148
149
150 m = TextUtils.getMatcher(
151 "^[^//./://s%]+//.[^/://s%]+//.([^//./://s%]+)(/.*|)$",
152 retVal);
153 if(m.matches()) {
154 if(ArchiveUtils.isTld(m.group(1))) {
155 String schemePlus = "http://";
156
157 try {
158 if (retVal.startsWith(base.getHost())) {
159 schemePlus = base.getScheme() + "://";
160 }
161 } catch (URIException e) {
162
163 }
164 retVal = schemePlus + retVal;
165 }
166 }
167 TextUtils.recycleMatcher(m);
168
169 return retVal;
170 }
171
172
173
174
175
176
177
178
179 static final String STRING_URI_DETECTOR =
180 "(?://w|[//.]{0,2}/)[//S&&[^<>]]*(?://.|/)[//S&&[^<>]]*(?://w|/)";
181
182
183
184
185
186 protected final static String[] STRING_URI_DETECTOR_EXCEPTIONS = {
187 "text/javascript"
188 };
189
190 public static boolean isLikelyUriJavascriptContextLegacy(CharSequence candidate) {
191 if(!TextUtils.matches(STRING_URI_DETECTOR,candidate)) {
192 return false;
193 }
194 for (String s : STRING_URI_DETECTOR_EXCEPTIONS) {
195 if (s.contentEquals(candidate))
196 return false;
197 }
198
199 return true;
200
201 }
202
203
204
205
206
207
208
209
210
211 static final String LIKELY_URI_PATH =
212 "(//.{0,2}[^//.//n//r//s\"']*(//.[^//.//n//r//s\"']+)+)";
213
214 public static boolean isLikelyUriHtmlContextLegacy(CharSequence candidate) {
215 return TextUtils.matches(LIKELY_URI_PATH, candidate);
216 }
217 }