View Javadoc

1   /* UURI
2    *
3    * $Id: UURI.java 5407 2007-08-16 17:51:21Z gojomo $
4    *
5    * Created on Apr 18, 2003
6    *
7    * Copyright (C) 2003 Internet Archive.
8    *
9    * This file is part of the Heritrix web crawler (crawler.archive.org).
10   *
11   * Heritrix is free software; you can redistribute it and/or modify
12   * it under the terms of the GNU Lesser Public License as published by
13   * the Free Software Foundation; either version 2.1 of the License, or
14   * any later version.
15   *
16   * Heritrix is distributed in the hope that it will be useful,
17   * but WITHOUT ANY WARRANTY; without even the implied warranty of
18   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19   * GNU Lesser Public License for more details.
20   *
21   * You should have received a copy of the GNU Lesser Public License
22   * along with Heritrix; if not, write to the Free Software
23   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
24   */
25  package org.archive.net;
26  
27  import java.io.File;
28  import java.io.Serializable;
29  import java.net.URI;
30  import java.net.URISyntaxException;
31  import java.util.logging.Level;
32  import java.util.logging.Logger;
33  
34  import org.apache.commons.httpclient.URIException;
35  import org.archive.crawler.datamodel.CandidateURI;
36  import org.archive.util.SURT;
37  import org.archive.util.TextUtils;
38  
39  
40  /***
41   * Usable URI.
42   * 
43   * This class wraps {@link org.apache.commons.httpclient.URI} adding caching
44   * and methods. It cannot be instantiated directly.  Go via UURIFactory.
45   * 
46   *  <p>We used to use {@link java.net.URI} for parsing URIs but ran across
47   * quirky behaviors and bugs.  {@link java.net.URI} is not subclassable --
48   * its final -- and its unlikely that java.net.URI will change any time soon
49   * (See Gordon's considered petition here:
50   * <a href="http://developer.java.sun.com/developer/bugParade/bugs/4939847.html">java.net.URI
51   * should have loose/tolerant/compatibility option (or allow reuse)</a>).
52   *
53   * <p>This class tries to cache calculated strings such as the extracted host
54   * and this class as a string rather than have the parent class rerun its
55   * calculation everytime.
56   *
57   * @author gojomo
58   * @author stack
59   *
60   * @see org.apache.commons.httpclient.URI
61   */
62  public class UURI extends LaxURI
63  implements CharSequence, Serializable {
64  
65      private static final long serialVersionUID = -1277570889914647093L;
66  
67      private static Logger LOGGER =
68          Logger.getLogger(UURI.class.getName());
69      
70      /***
71       * Consider URIs too long for IE as illegal.
72       */
73      public final static int MAX_URL_LENGTH = 2083;
74      
75      public static final String MASSAGEHOST_PATTERN = "^www//d*//.";
76  
77      /***
78       * Cache of the host name.
79       *
80       * Super class calculates on every call.  Profiling shows us spend 30% of
81       * total elapsed time in URI class.
82       */
83      private transient String cachedHost = null;
84  
85      /***
86       * Cache of this uuri escaped as a string.
87       *
88       * Super class calculates on every call.  Profiling shows us spend 30% of
89       * total elapsed time in URI class.
90       */
91      private transient String cachedEscapedURI = null;
92  
93      /***
94       * Cache of this uuri escaped as a string.
95       *
96       * Super class calculates on every call.  Profiling shows us spend 30% of
97       * total elapsed time in URI class.
98       */
99      private transient String cachedString = null;
100     
101     /***
102      * Cached authority minus userinfo.
103      */
104     private transient String cachedAuthorityMinusUserinfo = null;
105 
106     /***
107      * Cache of this uuri in SURT format
108      */
109     private transient String surtForm = null;
110     
111     // Technically, underscores are disallowed in the domainlabel
112     // portion of hostname according to rfc2396 but we'll be more
113     // loose and allow them. See: [ 1072035 ] [uuri] Underscore in
114     // host messes up port parsing.
115     static {
116         hostname.set('_');
117     }
118 
119 
120     /***
121      * Shutdown access to default constructor.
122      */
123     protected UURI() {
124         super();
125     }
126     
127     /***
128      * @param uri String representation of an absolute URI.
129      * @param escaped If escaped.
130      * @param charset Charset to use.
131      * @throws org.apache.commons.httpclient.URIException
132      */
133     protected UURI(String uri, boolean escaped, String charset)
134     throws URIException {
135         super(uri, escaped, charset);
136         normalize();
137     }
138     
139     /***
140      * @param relative String representation of URI.
141      * @param base Parent UURI to use derelativizing.
142      * @throws org.apache.commons.httpclient.URIException
143      */
144     protected UURI(UURI base, UURI relative) throws URIException {
145         super(base, relative);
146         normalize();
147     }
148 
149     /***
150      * @param uri String representation of a URI.
151      * @param escaped If escaped.
152      * @throws NullPointerException
153      * @throws URIException
154      */
155     protected UURI(String uri, boolean escaped) throws URIException, NullPointerException {
156         super(uri,escaped);
157         normalize();
158     }
159 
160     /***
161      * @param uri URI as string that is resolved relative to this UURI.
162      * @return UURI that uses this UURI as base.
163      * @throws URIException
164      */
165     public UURI resolve(String uri)
166     throws URIException {
167         return resolve(uri, false, // assume not escaped
168             this.getProtocolCharset());
169     }
170 
171     /***
172      * @param uri URI as string that is resolved relative to this UURI.
173      * @param e True if escaped.
174      * @return UURI that uses this UURI as base.
175      * @throws URIException
176      */
177     public UURI resolve(String uri, boolean e)
178     throws URIException {
179         return resolve(uri, e, this.getProtocolCharset());
180     }
181     
182     /***
183      * @param uri URI as string that is resolved relative to this UURI.
184      * @param e True if uri is escaped.
185      * @param charset Charset to use.
186      * @return UURI that uses this UURI as base.
187      * @throws URIException
188      */
189     public UURI resolve(String uri, boolean e, String charset)
190     throws URIException {
191         return new UURI(this, new UURI(uri, e, charset));
192     }
193 
194     /***
195      * Test an object if this UURI is equal to another.
196      *
197      * @param obj an object to compare
198      * @return true if two URI objects are equal
199      */
200     public boolean equals(Object obj) {
201 
202         // normalize and test each components
203         if (obj == this) {
204             return true;
205         }
206         if (!(obj instanceof UURI)) {
207             return false;
208         }
209         UURI another = (UURI) obj;
210         // scheme
211         if (!equals(this._scheme, another._scheme)) {
212             return false;
213         }
214         // is_opaque_part or is_hier_part?  and opaque
215         if (!equals(this._opaque, another._opaque)) {
216             return false;
217         }
218         // is_hier_part
219         // has_authority
220         if (!equals(this._authority, another._authority)) {
221             return false;
222         }
223         // path
224         if (!equals(this._path, another._path)) {
225             return false;
226         }
227         // has_query
228         if (!equals(this._query, another._query)) {
229             return false;
230         }
231         // UURIs do not have fragments
232         return true;
233     }
234 
235     /***
236      * Strips www variants from the host.
237      *
238      * Strips www[0-9]*\. from the host.  If calling getHostBaseName becomes a
239      * performance issue we should consider adding the hostBasename member that
240      * is set on initialization.
241      *
242      * @return Host's basename.
243      * @throws URIException
244      */
245     public String getHostBasename() throws URIException {
246         // caching eliminated because this is rarely used
247         // (only benefits legacy DomainScope, which should
248         // be retired). Saves 4-byte object pointer in UURI
249         // instances.
250         return (this.getReferencedHost() == null) 
251             ? null 
252             : TextUtils.replaceFirst(MASSAGEHOST_PATTERN, 
253                     this.getReferencedHost(), UURIFactory.EMPTY_STRING);
254     }
255 
256     /***
257      * Override to cache result
258      * 
259      * @return String representation of this URI
260      */
261     public synchronized String toString() {
262         if (this.cachedString == null) {
263             this.cachedString = super.toString();
264             coalesceUriStrings();
265         }
266         return this.cachedString;
267     }
268 
269     public synchronized String getEscapedURI() {
270         if (this.cachedEscapedURI == null) {
271             this.cachedEscapedURI = super.getEscapedURI();
272             coalesceUriStrings();
273         }
274         return this.cachedEscapedURI;
275     }
276 
277     /***
278      * The two String fields cachedString and cachedEscapedURI are 
279      * usually identical; if so, coalesce into a single instance. 
280      */
281     protected void coalesceUriStrings() {
282         if (this.cachedString != null && this.cachedEscapedURI != null
283                 && this.cachedString.length() == this.cachedEscapedURI.length()) {
284             // lengths will only be identical if contents are identical
285             // (deescaping will always shrink length), so coalesce to
286             // use only single cached instance
287             this.cachedString = this.cachedEscapedURI;
288         }
289     }
290     
291     public synchronized String getHost() throws URIException {
292         if (this.cachedHost == null) {
293             // If this._host is null, 3.0 httpclient throws
294             // illegalargumentexception.  Don't go there.
295             if (this._host != null) {
296             	this.cachedHost = super.getHost();
297                 coalesceHostAuthorityStrings();
298             }
299         }
300         return this.cachedHost;
301     }
302     
303     /***
304      * The two String fields cachedHost and cachedAuthorityMinusUserInfo are 
305      * usually identical; if so, coalesce into a single instance. 
306      */
307     protected void coalesceHostAuthorityStrings() {
308         if (this.cachedAuthorityMinusUserinfo != null
309                 && this.cachedHost != null
310                 && this.cachedHost.length() ==
311                     this.cachedAuthorityMinusUserinfo.length()) {
312             // lengths can only be identical if contents
313             // are identical; use only one instance
314             this.cachedAuthorityMinusUserinfo = this.cachedHost;
315         }
316     }
317 
318     /***
319      * Return the referenced host in the UURI, if any, also extracting the 
320      * host of a DNS-lookup URI where necessary. 
321      * 
322      * @return the target or topic host of the URI
323      * @throws URIException
324      */
325     public String getReferencedHost() throws URIException {
326         String referencedHost = this.getHost();
327         if(referencedHost==null && this.getScheme().equals("dns")) {
328             // extract target domain of DNS lookup
329             String possibleHost = this.getCurrentHierPath();
330             if(possibleHost != null && possibleHost.matches("[-_//w//.:]+")) {
331                 referencedHost = possibleHost;
332             }
333         }
334         return referencedHost;
335     }
336 
337     /***
338      * @return Return the 'SURT' format of this UURI
339      */
340     public String getSurtForm() {
341         if (surtForm == null) {
342             surtForm = SURT.fromURI(this.toString());
343         }
344         return surtForm;
345     }
346     
347     /***
348      * Return the authority minus userinfo (if any).
349      * 
350      * If no userinfo present, just returns the authority.
351      * 
352      * @return The authority stripped of any userinfo if present.
353      * @throws URIException
354      */
355 	public String getAuthorityMinusUserinfo()
356     throws URIException {
357         if (this.cachedAuthorityMinusUserinfo == null) {
358             String tmp = getAuthority();
359             if (tmp != null && tmp.length() > 0) {
360             	int index = tmp.indexOf('@');
361                 if (index >= 0 && index < tmp.length()) {
362                     tmp = tmp.substring(index + 1);
363                 }
364             }
365             this.cachedAuthorityMinusUserinfo = tmp;
366             coalesceHostAuthorityStrings();
367         }
368         return this.cachedAuthorityMinusUserinfo;
369 	}
370 
371     /* (non-Javadoc)
372      * @see java.lang.CharSequence#length()
373      */
374     public int length() {
375         return getEscapedURI().length();
376     }
377 
378     /* (non-Javadoc)
379      * @see java.lang.CharSequence#charAt(int)
380      */
381     public char charAt(int index) {
382         return getEscapedURI().charAt(index);
383     }
384 
385     /* (non-Javadoc)
386      * @see java.lang.CharSequence#subSequence(int, int)
387      */
388     public CharSequence subSequence(int start, int end) {
389         return getEscapedURI().subSequence(start,end);
390     }
391 
392     /* (non-Javadoc)
393      * @see java.lang.Comparable#compareTo(java.lang.Object)
394      */
395     public int compareTo(Object arg0) {
396         return getEscapedURI().compareTo(arg0.toString());
397     }
398     
399     /***
400      * Convenience method for finding the UURI inside an
401      * Object likely to have (or be/imply) one.
402      * 
403      * @param o Object that is, has, or implies a UURI
404      * @return the UURI found, or null if none
405      */
406     public static UURI from(Object o) {
407         UURI u = null;
408         if (o instanceof UURI) {
409             u = (UURI)o;
410         } else if (o instanceof CandidateURI) {
411             u = ((CandidateURI) o).getUURI();
412         } else if (o instanceof CharSequence) {
413             String s = o.toString();
414             try {
415                 u = UURIFactory.getInstance(s);
416             } catch (URIException e) {
417                 LOGGER.log(Level.FINE,"bad URI",e);
418             }
419         } 
420         return u;
421     }
422     
423     /***
424      * Test if passed String has likely URI scheme prefix.
425      * @param possibleUrl URL string to examine.
426      * @return True if passed string looks like it could be an URL.
427      */
428     public static boolean hasScheme(String possibleUrl) {
429         boolean result = false;
430         for (int i = 0; i < possibleUrl.length(); i++) {
431             char c = possibleUrl.charAt(i);
432             if (c == ':') {
433                 if (i != 0) {
434                     result = true;
435                 }
436                 break;
437             }
438             if (!scheme.get(c)) {
439                 break;
440             }
441         }
442         return result;
443     }
444     
445     /***
446      * @param pathOrUri A file path or a URI.
447      * @return Path parsed from passed <code>pathOrUri</code>.
448      * @throws URISyntaxException
449      */
450     public static String parseFilename(final String pathOrUri)
451     throws URISyntaxException {
452         String path = pathOrUri;
453         if (UURI.hasScheme(pathOrUri)) {
454             URI url = new URI(pathOrUri);
455             path = url.getPath();
456         }
457         return (new File(path)).getName();
458     }
459 }