1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25 package org.archive.crawler.datamodel;
26
27 import java.io.Serializable;
28
29 import org.apache.commons.httpclient.URI;
30 import org.apache.commons.httpclient.URIException;
31 import org.archive.util.SURT;
32 import org.archive.util.TextUtils;
33
34
35 /***
36 * Usable URI.
37 *
38 * This class wraps {@link org.apache.commons.httpclient.URI} adding caching
39 * and methods. It cannot be instantiated directly. Go via UURIFactory.
40 *
 * <p>We used to use {@link java.net.URI} for parsing URIs but ran across
 * quirky behaviors and bugs. {@link java.net.URI} is not subclassable --
 * it's final -- and it's unlikely that java.net.URI will change any time soon
 * (See Gordon's considered petition here:
 * <a href="http://developer.java.sun.com/developer/bugParade/bugs/4939847.html">java.net.URI
 * should have loose/tolerant/compatibility option (or allow reuse)</a>).
47 *
 * <p>This class tries to cache calculated strings, such as the extracted host
 * and this class as a string, rather than have the parent class rerun its
 * calculation every time.
51 *
52 * @author gojomo
53 * @author stack
54 *
55 * @see org.apache.commons.httpclient.URI
56 */
57 public class UURI extends URI
58 implements CharSequence, Serializable {
59 /***
60 * Consider URIs too long for IE as illegal.
61 */
62 public final static int MAX_URL_LENGTH = 2083;
63
64 public static final String MASSAGEHOST_PATTERN = "^www//d*//.";
65
/**
 * Cache of the host name.
 *
 * Super class calculates on every call. Profiling shows us spend 30% of
 * total elapsed time in URI class.
 *
 * Transient, so it is not written by default serialization; getHost()
 * fills it in again on demand.
 */
private transient String cachedHost = null;

/**
 * Cache of the host base name (the host with any leading www variant
 * removed). Filled in lazily by cacheHostBasename().
 */
private transient String cachedHostBasename = null;

/**
 * Cache of this uuri escaped as a string, as returned by the super class's
 * getEscapedURI().
 *
 * Super class calculates on every call. Profiling shows us spend 30% of
 * total elapsed time in URI class.
 */
private transient String cachedEscapedURI = null;

/**
 * Cache of the super class's toString() result for this uuri.
 *
 * Super class calculates on every call. Profiling shows us spend 30% of
 * total elapsed time in URI class.
 */
private transient String cachedString = null;

/**
 * Cached authority minus userinfo. Filled in lazily by
 * getAuthorityMinusUserinfo().
 */
private transient String cachedAuthorityMinusUserinfo = null;

/**
 * Cache of this uuri in SURT format. Filled in lazily by getSurtForm().
 */
private transient String surtForm = null;
104
105
106
107
108
// Sets the bit for '_' in the inherited 'hostname' set once, at class-load
// time. NOTE(review): 'hostname' is not declared in this file -- presumably it
// is the super class's set of characters allowed in host names, in which case
// this lets hosts containing underscores validate. Confirm against the
// super class.
static {
    hostname.set('_');
}
112
113
/**
 * Shutdown access to default constructor.
 *
 * Declared protected so that only subclasses and code in this package can
 * call it. Does nothing beyond invoking the super class's no-argument
 * constructor.
 */
protected UURI() {
    super();
}
120
/**
 * Builds a UURI from a string, then normalizes it.
 *
 * @param uri String representation of an absolute URI.
 * @param escaped If escaped.
 * @param charset Charset to use.
 * @throws org.apache.commons.httpclient.URIException
 */
protected UURI(String uri, boolean escaped, String charset)
    throws URIException {
    super(uri, escaped, charset);
    // Normalize right after the super class constructor has run.
    normalize();
}
132
/**
 * Builds a UURI by resolving a relative UURI against a base, then
 * normalizes the result.
 *
 * @param base Parent UURI to use derelativizing.
 * @param relative UURI to resolve against <code>base</code>.
 * @throws org.apache.commons.httpclient.URIException
 */
protected UURI(UURI base, UURI relative) throws URIException {
    super(base, relative);
    // Normalize right after the super class constructor has run.
    normalize();
}
142
143 /***
144 * @param uri URI as string that is resolved relative to this UURI.
145 * @return UURI that uses this UURI as base.
146 * @throws URIException
147 */
148 public UURI resolve(String uri)
149 throws URIException {
150 return resolve(uri, UURIFactory.isEscaped(uri),
151 this.getProtocolCharset());
152 }
153
154 /***
155 * @param uri URI as string that is resolved relative to this UURI.
156 * @param e True if escaped.
157 * @return UURI that uses this UURI as base.
158 * @throws URIException
159 */
160 public UURI resolve(String uri, boolean e)
161 throws URIException {
162 return resolve(uri, e, this.getProtocolCharset());
163 }
164
165 /***
166 * @param uri URI as string that is resolved relative to this UURI.
167 * @param e True if uri is escaped.
168 * @param charset Charset to use.
169 * @return UURI that uses this UURI as base.
170 * @throws URIException
171 */
172 public UURI resolve(String uri, boolean e, String charset)
173 throws URIException {
174 return new UURI(this, new UURI(uri, e, charset));
175 }
176
177 /***
178 * Test an object if this UURI is equal to another.
179 *
180 * @param obj an object to compare
181 * @return true if two URI objects are equal
182 */
183 public boolean equals(Object obj) {
184
185
186 if (obj == this) {
187 return true;
188 }
189 if (!(obj instanceof UURI)) {
190 return false;
191 }
192 UURI another = (UURI) obj;
193
194 if (!equals(this._scheme, another._scheme)) {
195 return false;
196 }
197
198 if (!equals(this._opaque, another._opaque)) {
199 return false;
200 }
201
202
203 if (!equals(this._authority, another._authority)) {
204 return false;
205 }
206
207 if (!equals(this._path, another._path)) {
208 return false;
209 }
210
211 if (!equals(this._query, another._query)) {
212 return false;
213 }
214
215 return true;
216 }
217
218 /***
219 * Strips www variants from the host.
220 *
221 * Strips www[0-9]*\. from the host. If calling getHostBaseName becomes a
222 * performance issue we should consider adding the hostBasename member that
223 * is set on initialization.
224 *
225 * @return Host's basename.
226 * @throws URIException
227 */
228 public String getHostBasename() throws URIException {
229 if (this.cachedHostBasename == null) {
230 cacheHostBasename();
231 }
232 return this.cachedHostBasename;
233 }
234
235 protected synchronized void cacheHostBasename() throws URIException {
236 if (this.cachedHostBasename != null) {
237 return;
238 }
239 if (this.getHost() != null) {
240 this.cachedHostBasename = TextUtils.
241 replaceFirst(MASSAGEHOST_PATTERN, this.getHost(),
242 UURIFactory.EMPTY_STRING);
243 }
244 }
245
246 /***
247 * Override to cache result
248 * @return String representation of this URI
249 */
250 public synchronized String toString() {
251 if (this.cachedString == null) {
252 this.cachedString = super.toString();
253 }
254 return this.cachedString;
255 }
256
257 public String getEscapedURI() {
258 if (this.cachedEscapedURI == null) {
259 synchronized (this) {
260 if (this.cachedEscapedURI == null) {
261 this.cachedEscapedURI = super.getEscapedURI();
262 }
263 }
264 }
265 return this.cachedEscapedURI;
266 }
267
268 public synchronized String getHost() throws URIException {
269 if (this.cachedHost == null) {
270
271
272 if (this._host != null) {
273 this.cachedHost = super.getHost();
274 }
275 }
276 return this.cachedHost;
277 }
278
279 /***
280 * Return the referenced host in the UURI, if any, also extracting the
281 * host of a DNS-lookup URI where necessary.
282 *
283 * @return the target or topic host of the URI
284 * @throws URIException
285 */
286 public String getReferencedHost() throws URIException {
287 String referencedHost = this.getHost();
288 if(referencedHost==null && this.getScheme().equals("dns")) {
289
290 String possibleHost = this.getCurrentHierPath();
291 if(possibleHost != null && possibleHost.matches("[-_//w//.:]+")) {
292 referencedHost = possibleHost;
293 }
294 }
295 return referencedHost;
296 }
297
298 /***
299 * @return Return the 'SURT' format of this UURI
300 */
301 public String getSurtForm() {
302 if (surtForm == null) {
303 surtForm = SURT.fromURI(this.toString());
304 }
305 return surtForm;
306 }
307
308 /***
309 * Return the authority minus userinfo (if any).
310 *
311 * If no userinfo present, just returns the authority.
312 *
313 * @return The authority stripped of any userinfo if present.
314 * @throws URIException
315 */
316 public String getAuthorityMinusUserinfo()
317 throws URIException {
318 if (this.cachedAuthorityMinusUserinfo != null) {
319 return this.cachedAuthorityMinusUserinfo;
320 }
321 String tmp = getAuthority();
322 if (tmp != null && tmp.length() > 0) {
323 int index = tmp.indexOf('@');
324 if (index >= 0 && index < tmp.length()) {
325 tmp = tmp.substring(index + 1);
326 }
327 }
328 this.cachedAuthorityMinusUserinfo = tmp;
329 return this.cachedAuthorityMinusUserinfo;
330 }
331
332
333
334
335 public int length() {
336 return getEscapedURI().length();
337 }
338
339
340
341
342 public char charAt(int index) {
343 return getEscapedURI().charAt(index);
344 }
345
346
347
348
349 public CharSequence subSequence(int start, int end) {
350 return getEscapedURI().subSequence(start,end);
351 }
352
353
354
355
356 public int compareTo(Object arg0) {
357 return getEscapedURI().compareTo(arg0);
358 }
359
360 /***
361 * Convenience method for finding the UURI inside an
362 * Object likely to have one.
363 *
364 * @param o Object that has a UURI
365 * @return the UURI found
366 */
367 public static UURI from(Object o) {
368 UURI u = null;
369 if (o instanceof UURI) {
370 u = (UURI)o;
371 } else if (o instanceof CandidateURI) {
372 u = ((CandidateURI) o).getUURI();
373 } else {
374
375
376 if (o != null) {
377 throw new IllegalArgumentException("Passed wrong type: " + o);
378 }
379 }
380 return u;
381 }
382
/**
 * Parses a URI reference string into this instance's component fields
 * (scheme, authority, path, query, fragment) and the related flags.
 *
 * Overridden from superclass to apply fixes to the two
 * marked lines, preventing the misinterpretation of URI
 * strings which begin with a ':' as absolute URIs.
 *
 * See also HTTPClient bug #35148
 * http://issues.apache.org/bugzilla/show_bug.cgi?id=35148
 *
 * @param original URI reference to parse; must not be null.
 * @param escaped True if <code>original</code> is already escaped. Path,
 * query and fragment are then stored as given rather than encoded.
 * @throws URIException If <code>original</code> is null or the scheme
 * fails validation; the helper methods called here may presumably raise
 * it as well.
 * @see org.apache.commons.httpclient.URI#parseUriReference(java.lang.String, boolean)
 */
protected void parseUriReference(String original, boolean escaped)
    throws URIException {

    // A reference is mandatory.
    if (original == null) {
        throw new URIException("URI-Reference required");
    }

    // Work on a copy with leading and trailing whitespace removed.
    String tmp = original.trim();

    // Length of the working string; kept in step with tmp below.
    int length = tmp.length();

    // If both the first and the last character pass validation against
    // 'delims', drop that surrounding pair. NOTE(review): 'delims' is
    // declared outside this file -- presumably enclosing delimiters such
    // as angle brackets or quotes; confirm.
    if (length > 0) {
        char[] firstDelimiter = { tmp.charAt(0) };
        if (validate(firstDelimiter, delims)) {
            if (length >= 2) {
                char[] lastDelimiter = { tmp.charAt(length - 1) };
                if (validate(lastDelimiter, delims)) {
                    tmp = tmp.substring(1, length - 1);
                    length = length - 2;
                }
            }
        }
    }

    // Index at which the next component to be parsed starts.
    int from = 0;

    // Decide whether the string can start with a scheme at all. It is
    // treated as starting from the path when there is no colon, when the
    // colon is the very first character, or when a slash comes before the
    // first colon.
    boolean isStartedFromPath = false;
    int atColon = tmp.indexOf(':');
    int atSlash = tmp.indexOf('/');

    // NOTE(review): the '<= 0' (which covers a leading ':') looks like one
    // of the two fixed lines the method comment refers to -- confirm.
    if (atColon <= 0 || (atSlash >= 0 && atSlash < atColon)) {
        isStartedFromPath = true;
    }

    // Locate the first component delimiter. ':' only counts as one when a
    // scheme is possible. No delimiter at all is treated as position 0.
    int at = indexFirstOf(tmp, isStartedFromPath ? "/?#" : ":/?#", from);
    if (at == -1) {
        at = 0;
    }

    // Scheme: everything before the ':' delimiter, lower-cased. Requiring
    // 'at > 0' means an empty scheme is never accepted.
    // NOTE(review): presumably the second of the two fixed lines -- confirm.
    if (at > 0 && at < length && tmp.charAt(at) == ':') {
        char[] target = tmp.substring(0, at).toLowerCase().toCharArray();
        if (validate(target, scheme)) {
            _scheme = target;
        } else {
            throw new URIException("incorrect scheme");
        }
        // Step over the ':'; the next component starts right after it.
        from = ++at;
    }

    // Reset the path-kind flags before classifying what follows.
    // NOTE(review): _is_opaque_part is not among the flags cleared here.
    _is_net_path = _is_abs_path = _is_rel_path = _is_hier_part = false;
    if (0 <= at && at < length && tmp.charAt(at) == '/') {

        // A '/' at the current position marks a hierarchical part.
        _is_hier_part = true;
        if (at + 2 < length && tmp.charAt(at + 1) == '/') {

            // "//" followed by at least one more character: an authority
            // follows and runs up to the next '/', '?' or '#'.
            int next = indexFirstOf(tmp, "/?#", at + 2);
            if (next == -1) {
                // No terminator found. Since at + 2 < length was checked
                // above, the remaining substring is non-empty here, so this
                // resolves to the end of the string.
                next = (tmp.substring(at + 2).length() == 0) ? at + 2
                    : tmp.length();
            }
            parseAuthority(tmp.substring(at + 2, next), escaped);
            // Both cursors move to the end of the authority.
            from = at = next;

            _is_net_path = true;
        }
        if (from == at) {

            // The path begins exactly at the current position.
            _is_abs_path = true;
        }
    }

    // Path: from the current start up to the first '?' or '#', or to the
    // end of the string if neither occurs.
    if (from < length) {

        int next = indexFirstOf(tmp, "?#", from);
        if (next == -1) {
            next = tmp.length();
        }
        if (!_is_abs_path) {
            // Classify a non-absolute path. '&&' binds tighter than '||',
            // so each condition reads as
            // (!escaped && prevalidate(...)) || (escaped && validate(...)):
            // unescaped input goes through prevalidate, escaped input
            // through validate.
            if (!escaped
                && prevalidate(tmp.substring(from, next), disallowed_rel_path)
                || escaped
                && validate(tmp.substring(from, next).toCharArray(), rel_path)) {

                _is_rel_path = true;
            } else if (!escaped
                && prevalidate(tmp.substring(from, next), disallowed_opaque_part)
                || escaped
                && validate(tmp.substring(from, next).toCharArray(), opaque_part)) {

                _is_opaque_part = true;
            } else {

                // Neither a relative path nor an opaque part.
                _path = null;
            }
        }
        // The path is stored in every case, including after the
        // '_path = null' branch above: raw when already escaped, through
        // setPath otherwise.
        if (escaped) {
            setRawPath(tmp.substring(from, next).toCharArray());
        } else {
            setPath(tmp.substring(from, next));
        }
        at = next;
    }

    // Charset passed to encode() for an unescaped query and fragment.
    String charset = getProtocolCharset();

    // Query: only taken when at least one character follows the '?'.
    // It runs up to the '#', or to the end of the string if there is none.
    if (0 <= at && at + 1 < length && tmp.charAt(at) == '?') {
        int next = tmp.indexOf('#', at + 1);
        if (next == -1) {
            next = tmp.length();
        }
        _query = (escaped) ? tmp.substring(at + 1, next).toCharArray()
            : encode(tmp.substring(at + 1, next), allowed_query, charset);
        at = next;
    }

    // Fragment: everything after the '#'. A '#' as the last character
    // yields an empty, non-null fragment.
    if (0 <= at && at + 1 <= length && tmp.charAt(at) == '#') {
        if (at + 1 == length) {
            _fragment = "".toCharArray();
        } else {
            _fragment = (escaped) ? tmp.substring(at + 1).toCharArray()
                : encode(tmp.substring(at + 1), allowed_fragment, charset);
        }
    }

    // NOTE(review): setURI() is defined in the super class -- presumably it
    // rebuilds the whole-URI value from the components set above; confirm.
    setURI();
}
586 }