View Javadoc

1   /* UURI
2    *
3    * $Id: UURI.java,v 1.51 2005/06/01 01:16:07 gojomo Exp $
4    *
5    * Created on Apr 18, 2003
6    *
7    * Copyright (C) 2003 Internet Archive.
8    *
9    * This file is part of the Heritrix web crawler (crawler.archive.org).
10   *
11   * Heritrix is free software; you can redistribute it and/or modify
12   * it under the terms of the GNU Lesser Public License as published by
13   * the Free Software Foundation; either version 2.1 of the License, or
14   * any later version.
15   *
16   * Heritrix is distributed in the hope that it will be useful,
17   * but WITHOUT ANY WARRANTY; without even the implied warranty of
18   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19   * GNU Lesser Public License for more details.
20   *
21   * You should have received a copy of the GNU Lesser Public License
22   * along with Heritrix; if not, write to the Free Software
23   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
24   */
25  package org.archive.crawler.datamodel;
26  
27  import java.io.Serializable;
28  
29  import org.apache.commons.httpclient.URI;
30  import org.apache.commons.httpclient.URIException;
31  import org.archive.util.SURT;
32  import org.archive.util.TextUtils;
33  
34  
35  /***
36   * Usable URI.
37   * 
38   * This class wraps {@link org.apache.commons.httpclient.URI} adding caching
39   * and methods. It cannot be instantiated directly.  Go via UURIFactory.
40   * 
41   *  <p>We used to use {@link java.net.URI} for parsing URIs but ran across
42   * quirky behaviors and bugs.  {@link java.net.URI} is not subclassable --
43   * it's final -- and it's unlikely that java.net.URI will change any time soon
44   * (See Gordon's considered petition here:
45   * <a href="http://developer.java.sun.com/developer/bugParade/bugs/4939847.html">java.net.URI
46   * should have loose/tolerant/compatibility option (or allow reuse)</a>).
47   *
48   * <p>This class tries to cache calculated strings such as the extracted host
49   * and this class as a string rather than have the parent class rerun its
50   * calculation every time.
51   *
52   * @author gojomo
53   * @author stack
54   *
55   * @see org.apache.commons.httpclient.URI
56   */
57  public class UURI extends URI
58  implements CharSequence, Serializable {
59      /***
60       * Consider URIs too long for IE as illegal.
61       */
62      public final static int MAX_URL_LENGTH = 2083;
63      
64      public static final String MASSAGEHOST_PATTERN = "^www//d*//.";
65  
66      /***
67       * Cache of the host name.
68       *
69       * Super class calculates on every call.  Profiling shows us spend 30% of
70       * total elapsed time in URI class.
71       */
72      private transient String cachedHost = null;
73      
74      /***
75       * Cache of the host base name.
76       */
77      private transient String cachedHostBasename = null;
78  
79      /***
80       * Cache of this uuri escaped as a string.
81       *
82       * Super class calculates on every call.  Profiling shows us spend 30% of
83       * total elapsed time in URI class.
84       */
85      private transient String cachedEscapedURI = null;
86  
87      /***
88       * Cache of this uuri escaped as a string.
89       *
90       * Super class calculates on every call.  Profiling shows us spend 30% of
91       * total elapsed time in URI class.
92       */
93      private transient String cachedString = null;
94      
95      /***
96       * Cached authority minus userinfo.
97       */
98      private transient String cachedAuthorityMinusUserinfo = null;
99  
100     /***
101      * Cache of this uuri in SURT format
102      */
103     private transient String surtForm = null;
104     
105     // Technically, underscores are disallowed in the domainlabel
106     // portion of hostname according to rfc2396 but we'll be more
107     // loose and allow them. See: [ 1072035 ] [uuri] Underscore in
108     // host messes up port parsing.
109     static {
110         hostname.set('_');
111     }
112 
113 
114     /***
115      * Shutdown access to default constructor.
116      */
117     protected UURI() {
118         super();
119     }
120     
121     /***
122      * @param uri String representation of an absolute URI.
123      * @param escaped If escaped.
124      * @param charset Charset to use.
125      * @throws org.apache.commons.httpclient.URIException
126      */
127     protected UURI(String uri, boolean escaped, String charset)
128     throws URIException {
129         super(uri, escaped, charset);
130         normalize();
131     }
132     
133     /***
134      * @param relative String representation of URI.
135      * @param base Parent UURI to use derelativizing.
136      * @throws org.apache.commons.httpclient.URIException
137      */
138     protected UURI(UURI base, UURI relative) throws URIException {
139         super(base, relative);
140         normalize();
141     }
142 
143     /***
144      * @param uri URI as string that is resolved relative to this UURI.
145      * @return UURI that uses this UURI as base.
146      * @throws URIException
147      */
148     public UURI resolve(String uri)
149     throws URIException {
150         return resolve(uri, UURIFactory.isEscaped(uri),
151             this.getProtocolCharset());
152     }
153 
154     /***
155      * @param uri URI as string that is resolved relative to this UURI.
156      * @param e True if escaped.
157      * @return UURI that uses this UURI as base.
158      * @throws URIException
159      */
160     public UURI resolve(String uri, boolean e)
161     throws URIException {
162         return resolve(uri, e, this.getProtocolCharset());
163     }
164     
165     /***
166      * @param uri URI as string that is resolved relative to this UURI.
167      * @param e True if uri is escaped.
168      * @param charset Charset to use.
169      * @return UURI that uses this UURI as base.
170      * @throws URIException
171      */
172     public UURI resolve(String uri, boolean e, String charset)
173     throws URIException {
174         return new UURI(this, new UURI(uri, e, charset));
175     }
176 
177     /***
178      * Test an object if this UURI is equal to another.
179      *
180      * @param obj an object to compare
181      * @return true if two URI objects are equal
182      */
183     public boolean equals(Object obj) {
184 
185         // normalize and test each components
186         if (obj == this) {
187             return true;
188         }
189         if (!(obj instanceof UURI)) {
190             return false;
191         }
192         UURI another = (UURI) obj;
193         // scheme
194         if (!equals(this._scheme, another._scheme)) {
195             return false;
196         }
197         // is_opaque_part or is_hier_part?  and opaque
198         if (!equals(this._opaque, another._opaque)) {
199             return false;
200         }
201         // is_hier_part
202         // has_authority
203         if (!equals(this._authority, another._authority)) {
204             return false;
205         }
206         // path
207         if (!equals(this._path, another._path)) {
208             return false;
209         }
210         // has_query
211         if (!equals(this._query, another._query)) {
212             return false;
213         }
214         // UURIs do not have fragments
215         return true;
216     }
217 
218     /***
219      * Strips www variants from the host.
220      *
221      * Strips www[0-9]*\. from the host.  If calling getHostBaseName becomes a
222      * performance issue we should consider adding the hostBasename member that
223      * is set on initialization.
224      *
225      * @return Host's basename.
226      * @throws URIException
227      */
228     public String getHostBasename() throws URIException {
229         if (this.cachedHostBasename == null) {
230             cacheHostBasename();
231         }
232         return this.cachedHostBasename;
233     }
234     
235     protected synchronized void cacheHostBasename() throws URIException {
236         if (this.cachedHostBasename != null) {
237             return;
238         }
239         if (this.getHost() != null) {
240             this.cachedHostBasename = TextUtils.
241                 replaceFirst(MASSAGEHOST_PATTERN, this.getHost(),
242                 UURIFactory.EMPTY_STRING);
243         }
244     }
245 
246     /***
247      * Override to cache result
248      * @return String representation of this URI 
249      */
250     public synchronized String toString() {
251         if (this.cachedString == null) {
252             this.cachedString = super.toString();
253         }
254         return this.cachedString;
255     }
256 
257     public String getEscapedURI() {
258         if (this.cachedEscapedURI == null) {
259             synchronized (this) {
260                 if (this.cachedEscapedURI == null) {
261                     this.cachedEscapedURI = super.getEscapedURI();
262                 }
263             }
264         }
265         return this.cachedEscapedURI;
266     }
267 
268     public synchronized String getHost() throws URIException {
269         if (this.cachedHost == null) {
270             // If this._host is null, 3.0 httpclient throws
271             // illegalargumentexception.  Don't go there.
272             if (this._host != null) {
273             	this.cachedHost = super.getHost();
274             }
275         }
276         return this.cachedHost;
277     }
278     
279     /***
280      * Return the referenced host in the UURI, if any, also extracting the 
281      * host of a DNS-lookup URI where necessary. 
282      * 
283      * @return the target or topic host of the URI
284      * @throws URIException
285      */
286     public String getReferencedHost() throws URIException {
287         String referencedHost = this.getHost();
288         if(referencedHost==null && this.getScheme().equals("dns")) {
289             // extract target domain of DNS lookup
290             String possibleHost = this.getCurrentHierPath();
291             if(possibleHost != null && possibleHost.matches("[-_//w//.:]+")) {
292                 referencedHost = possibleHost;
293             }
294         }
295         return referencedHost;
296     }
297 
298     /***
299      * @return Return the 'SURT' format of this UURI
300      */
301     public String getSurtForm() {
302         if (surtForm == null) {
303             surtForm = SURT.fromURI(this.toString());
304         }
305         return surtForm;
306     }
307     
308     /***
309      * Return the authority minus userinfo (if any).
310      * 
311      * If no userinfo present, just returns the authority.
312      * 
313      * @return The authority stripped of any userinfo if present.
314      * @throws URIException
315      */
316 	public String getAuthorityMinusUserinfo()
317     throws URIException {
318         if (this.cachedAuthorityMinusUserinfo != null) {
319             return this.cachedAuthorityMinusUserinfo;
320         }
321         String tmp = getAuthority();
322         if (tmp != null && tmp.length() > 0) {
323         	int index = tmp.indexOf('@');
324             if (index >= 0 && index < tmp.length()) {
325                 tmp = tmp.substring(index + 1);
326             }
327         }
328         this.cachedAuthorityMinusUserinfo = tmp;
329         return this.cachedAuthorityMinusUserinfo;
330 	}
331 
332     /* (non-Javadoc)
333      * @see java.lang.CharSequence#length()
334      */
335     public int length() {
336         return getEscapedURI().length();
337     }
338 
339     /* (non-Javadoc)
340      * @see java.lang.CharSequence#charAt(int)
341      */
342     public char charAt(int index) {
343         return getEscapedURI().charAt(index);
344     }
345 
346     /* (non-Javadoc)
347      * @see java.lang.CharSequence#subSequence(int, int)
348      */
349     public CharSequence subSequence(int start, int end) {
350         return getEscapedURI().subSequence(start,end);
351     }
352 
353     /* (non-Javadoc)
354      * @see java.lang.Comparable#compareTo(java.lang.Object)
355      */
356     public int compareTo(Object arg0) {
357         return getEscapedURI().compareTo(arg0);
358     }
359     
360     /***
361      * Convenience method for finding the UURI inside an
362      * Object likely to have one.
363      * 
364      * @param o Object that has a UURI
365      * @return the UURI found
366      */
367     public static UURI from(Object o) {
368         UURI u = null;
369         if (o instanceof UURI) {
370             u = (UURI)o;
371         } else if (o instanceof CandidateURI) {
372             u = ((CandidateURI) o).getUURI();
373         } else {
374             // TODO: build UURI from a String?
375             // possibly fail with null rather than exception?
376             if (o != null) {
377                 throw new IllegalArgumentException("Passed wrong type: " + o);
378             }
379         }
380         return u;
381     }
382     
383     /***
384      * Overridden from superclass to apply fixes to the two 
385      * marked lines, preventing the misinterpretation of URI
386      * strings which begin with a ':' as absolute URIs. 
387      *
388      * See also HTTPClient bug #35148
389      *  http://issues.apache.org/bugzilla/show_bug.cgi?id=35148
390      * 
         * <p>Works through the passed string from left to right, picking out
         * the scheme, authority, path, query and fragment in that order, and
         * finishes by calling setURI().
         *
         * @param original URI reference to parse; must not be null.
         * @param escaped True if original is already escaped: the path is
         * then set with setRawPath rather than setPath, and the query and
         * fragment are stored without being passed through encode.
         * @throws URIException If original is null or the scheme fails
         * validation.
         *
391      * @see org.apache.commons.httpclient.URI#parseUriReference(java.lang.String, boolean)
392      */
393     protected void parseUriReference(String original, boolean escaped)
394         throws URIException {
395 
396         // validate and construct the URI character sequence
397         if (original == null) {
398             throw new URIException("URI-Reference required");
399         }
400 
401         /* @
402          *  ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?
403          */
404         String tmp = original.trim();
405         
406         /*
407          * The length of the string sequence of characters.
408          * It may not be equal to the length of the byte array.
409          */
410         int length = tmp.length();
411 
412         /*
413          * Remove the delimiters like angle brackets around an URI.
414          */
            // The surrounding characters are dropped only when both the first
            // and the last character validate against delims; a lone leading
            // delimiter is left in place.
415         if (length > 0) {
416             char[] firstDelimiter = { tmp.charAt(0) };
417             if (validate(firstDelimiter, delims)) {
418                 if (length >= 2) {
419                     char[] lastDelimiter = { tmp.charAt(length - 1) };
420                     if (validate(lastDelimiter, delims)) {
421                         tmp = tmp.substring(1, length - 1);
422                         length = length - 2;
423                     }
424                 }
425             }
426         }
427 
428         /*
429          * The starting index
430          */
431         int from = 0;
432 
433         /*
434          * The test flag whether the URI is started from the path component.
435          */
436         boolean isStartedFromPath = false;
437         int atColon = tmp.indexOf(':');
438         int atSlash = tmp.indexOf('/');
            // No scheme is looked for when there is no ':' at all, when ':' is
            // the very first character (atColon == 0), or when a '/' occurs
            // before the first ':'.
439 // THIS NEXT LINE IS A CHANGE FROM SUPERCLASS
440         if (atColon <= 0 || (atSlash >= 0 && atSlash < atColon)) {
441             isStartedFromPath = true;
442         }
443 
444         /*
445          * <p><blockquote><pre>
446          *     @@@@@@@@
447          *  ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?
448          * </pre></blockquote><p>
449          */
            // ':' is only among the characters searched for when a scheme is
            // still possible.
450         int at = indexFirstOf(tmp, isStartedFromPath ? "/?#" : ":/?#", from);
            // None of the searched characters present: continue from index 0.
451         if (at == -1) { 
452             at = 0;
453         }
454 
455         /*
456          * Parse the scheme.
457          * <p><blockquote><pre>
458          *  scheme    =  $2 = http
459          *              @
460          *  ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?
461          * </pre></blockquote><p>
462          */
            // at > 0 rules out an empty scheme; the scheme is lower-cased
            // before it is validated and stored.
463 // THIS NEXT LINE IS A CHANGE FROM SUPERCLASS
464         if (at > 0 && at < length && tmp.charAt(at) == ':') {
465             char[] target = tmp.substring(0, at).toLowerCase().toCharArray();
466             if (validate(target, scheme)) {
467                 _scheme = target;
468             } else {
469                 throw new URIException("incorrect scheme");
470             }
                // Step over the ':'; parsing resumes right after it.
471             from = ++at;
472         }
473 
474         /*
475          * Parse the authority component.
476          * <p><blockquote><pre>
477          *  authority =  $4 = jakarta.apache.org
478          *                  @@
479          *  ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?
480          * </pre></blockquote><p>
481          */
482         // Reset flags
483         _is_net_path = _is_abs_path = _is_rel_path = _is_hier_part = false;
484         if (0 <= at && at < length && tmp.charAt(at) == '/') {
485             // Set flag
486             _is_hier_part = true;
                // A second '/' with at least one character after it starts an
                // authority, which runs up to the next '/', '?' or '#', or to
                // the end of the string.
487             if (at + 2 < length && tmp.charAt(at + 1) == '/') {
488                 // the temporary index to start the search from
489                 int next = indexFirstOf(tmp, "/?#", at + 2);
490                 if (next == -1) {
491                     next = (tmp.substring(at + 2).length() == 0) ? at + 2 
492                         : tmp.length();
493                 }
494                 parseAuthority(tmp.substring(at + 2, next), escaped);
495                 from = at = next;
496                 // Set flag
497                 _is_net_path = true;
498             }
                // from == at holds when the '/' found above is at the start of
                // the unparsed remainder, and always after an authority was
                // parsed (both were just set to next).
499             if (from == at) {
500                 // Set flag
501                 _is_abs_path = true;
502             }
503         }
504 
505         /*
506          * Parse the path component.
507          * <p><blockquote><pre>
508          *  path      =  $5 = /ietf/uri/
509          *                                @@@@@@
510          *  ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?
511          * </pre></blockquote><p>
512          */
513         if (from < length) {
514             // rel_path = rel_segment [ abs_path ]
                // The path runs from 'from' up to the first '?' or '#', or to
                // the end of the string.
515             int next = indexFirstOf(tmp, "?#", from);
516             if (next == -1) {
517                 next = tmp.length();
518             }
                // For a non-absolute path, classify it as relative or opaque:
                // unescaped input is checked with prevalidate against the
                // disallowed_* sets, escaped input with validate against the
                // rel_path / opaque_part sets.
519             if (!_is_abs_path) {
520                 if (!escaped 
521                     && prevalidate(tmp.substring(from, next), disallowed_rel_path) 
522                     || escaped 
523                     && validate(tmp.substring(from, next).toCharArray(), rel_path)) {
524                     // Set flag
525                     _is_rel_path = true;
526                 } else if (!escaped 
527                     && prevalidate(tmp.substring(from, next), disallowed_opaque_part) 
528                     || escaped 
529                     && validate(tmp.substring(from, next).toCharArray(), opaque_part)) {
530                     // Set flag
531                     _is_opaque_part = true;
532                 } else {
533                     // the path component may be empty
534                     _path = null;
535                 }
536             }
537             if (escaped) {
538                 setRawPath(tmp.substring(from, next).toCharArray());
539             } else {
540                 setPath(tmp.substring(from, next));
541             }
542             at = next;
543         }
544 
545         // set the charset to do escape encoding
546         String charset = getProtocolCharset();
547 
548         /*
549          * Parse the query component.
550          * <p><blockquote><pre>
551          *  query     =  $7 = <undefined>
552          *                                        @@@@@@@@@
553          *  ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?
554          * </pre></blockquote><p>
555          */
            // at + 1 < length: at least one character must follow the '?';
            // a '?' that is the last character does not set _query.
556         if (0 <= at && at + 1 < length && tmp.charAt(at) == '?') {
557             int next = tmp.indexOf('#', at + 1);
558             if (next == -1) {
559                 next = tmp.length();
560             }
561             _query = (escaped) ? tmp.substring(at + 1, next).toCharArray() 
562                 : encode(tmp.substring(at + 1, next), allowed_query, charset);
563             at = next;
564         }
565 
566         /*
567          * Parse the fragment component.
568          * <p><blockquote><pre>
569          *  fragment  =  $9 = Related
570          *                                                   @@@@@@@@
571          *  ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?
572          * </pre></blockquote><p>
573          */
            // Unlike the query, a '#' that is the last character is accepted
            // and gives an empty fragment.
574         if (0 <= at && at + 1 <= length && tmp.charAt(at) == '#') {
575             if (at + 1 == length) { // empty fragment
576                 _fragment = "".toCharArray();
577             } else {
578                 _fragment = (escaped) ? tmp.substring(at + 1).toCharArray() 
579                     : encode(tmp.substring(at + 1), allowed_fragment, charset);
580             }
581         }
582 
583         // set this URI.
584         setURI();
585     }
586 }