View Javadoc

1   /* UURIFactory
2    *
3    * $Id: UURIFactory.java 6539 2009-10-03 01:08:25Z szznax $
4    *
5    * Created on July 16, 2004
6    *
7    * Copyright (C) 2003 Internet Archive.
8    *
9    * This file is part of the Heritrix web crawler (crawler.archive.org).
10   *
11   * Heritrix is free software; you can redistribute it and/or modify
12   * it under the terms of the GNU Lesser Public License as published by
13   * the Free Software Foundation; either version 2.1 of the License, or
14   * any later version.
15   *
16   * Heritrix is distributed in the hope that it will be useful,
17   * but WITHOUT ANY WARRANTY; without even the implied warranty of
18   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19   * GNU Lesser Public License for more details.
20   *
21   * You should have received a copy of the GNU Lesser Public License
22   * along with Heritrix; if not, write to the Free Software
23   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
24   */
25  package org.archive.net;
26  
27  import gnu.inet.encoding.IDNA;
28  import gnu.inet.encoding.IDNAException;
29  import it.unimi.dsi.mg4j.util.MutableString;
30  
31  import java.io.UnsupportedEncodingException;
32  import java.util.Arrays;
33  import java.util.BitSet;
34  import java.util.logging.Level;
35  import java.util.logging.Logger;
36  import java.util.regex.Matcher;
37  import java.util.regex.Pattern;
38  
39  import org.apache.commons.httpclient.URI;
40  import org.apache.commons.httpclient.URIException;
41  import org.archive.util.TextUtils;
42  
43  
44  /***
45   * Factory that returns UURIs.
46   * 
47   * Does escaping and fixup on URIs massaging in accordance with RFC2396
48   * and to match browser practice. For example, it removes any
49   * '..' if first thing in the path as per IE,  converts backslashes to forward
50   * slashes, and discards any 'fragment'/anchor portion of the URI. This
51   * class will also fail URIs if they are longer than IE's allowed maximum
52   * length.
53   * 
54   * <p>TODO: Test logging.
55   * 
56   * @author stack
57   */
58  public class UURIFactory extends URI {
59      
60      private static final long serialVersionUID = -6146295130382209042L;
61  
62      /***
63       * Logging instance.
64       */
65      private static Logger logger =
66          Logger.getLogger(UURIFactory.class.getName());
67      
68      /***
69       * The single instance of this factory.
70       */
71      private static final UURIFactory factory = new UURIFactory();
72      
73      /***
74       * RFC 2396-inspired regex.
75       *
76       * From the RFC Appendix B:
77       * <pre>
78       * URI Generic Syntax                August 1998
79       *
80       * B. Parsing a URI Reference with a Regular Expression
81       *
82       * As described in Section 4.3, the generic URI syntax is not sufficient
83       * to disambiguate the components of some forms of URI.  Since the
84       * "greedy algorithm" described in that section is identical to the
85       * disambiguation method used by POSIX regular expressions, it is
86       * natural and commonplace to use a regular expression for parsing the
87       * potential four components and fragment identifier of a URI reference.
88       *
89       * The following line is the regular expression for breaking-down a URI
90       * reference into its components.
91       *
92       * ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?
93       * 12            3  4          5       6  7        8 9
94       *
95       * The numbers in the second line above are only to assist readability;
96       * they indicate the reference points for each subexpression (i.e., each
97       * paired parenthesis).  We refer to the value matched for subexpression
98       * <n> as $<n>.  For example, matching the above expression to
99       *
100      * http://www.ics.uci.edu/pub/ietf/uri/#Related
101      *
102      * results in the following subexpression matches:
103      *
104      * $1 = http:
105      * $2 = http
106      * $3 = //www.ics.uci.edu
107      * $4 = www.ics.uci.edu
108      * $5 = /pub/ietf/uri/
109      * $6 = <undefined>
110      * $7 = <undefined>
111      * $8 = #Related
112      * $9 = Related
113      *
114      * where <undefined> indicates that the component is not present, as is
115      * the case for the query component in the above example.  Therefore, we
116      * can determine the value of the four components and fragment as
117      *
118      * scheme    = $2
119      * authority = $4
120      * path      = $5
121      * query     = $7
122      * fragment  = $9
123      * </pre>
124      *
125      * -- 
126      * <p>Below differs from the rfc regex in that... 
127      * (1) it has java escaping of regex characters 
128      * (2) we allow a URI made of a fragment only (Added extra
129      * group so indexing is off by one after scheme).
130      * (3) scheme is limited to legal scheme characters 
131      */
132     final static Pattern RFC2396REGEX = Pattern.compile(
133         "^(([a-zA-Z][a-zA-Z//+//-//.]*):)?((//([^/?#]*))?([^?#]*)(//?([^#]*))?)?(#(.*))?");
134     //    12                             34  5          6       7   8          9 A
135     //                                2 1             54        6          87 3      A9
136     // 1: scheme
137     // 2: scheme:
138     // 3: //authority/path
139     // 4: //authority
140     // 5: authority
141     // 6: path
142     // 7: ?query
143     // 8: query 
144     // 9: #fragment
145     // A: fragment
146 
147     public static final String SLASHDOTDOTSLASH = "^(///.//./)+";
148     public static final String SLASH = "/";
149     public static final String HTTP = "http";
150     public static final String HTTP_PORT = ":80";
151     public static final String HTTPS = "https";
152     public static final String HTTPS_PORT = ":443";
153     public static final String DOT = ".";
154     public static final String EMPTY_STRING = "";
155     public static final String NBSP = "\u00A0";
156     public static final String SPACE = " ";
157     public static final String ESCAPED_SPACE = "%20";
158     public static final String TRAILING_ESCAPED_SPACE = "^(.*)(%20)+$";
159     public static final String PIPE = "|";
160     public static final String PIPE_PATTERN = "//|";
161     public static final String ESCAPED_PIPE = "%7C";
162     public static final String CIRCUMFLEX = "^";
163     public static final String CIRCUMFLEX_PATTERN = "//^";
164     public static final String ESCAPED_CIRCUMFLEX = "%5E";
165     public static final String QUOT = "\"";
166     public static final String ESCAPED_QUOT = "%22";
167     public static final String SQUOT = "'";
168     public static final String ESCAPED_SQUOT = "%27";
169     public static final String APOSTROPH = "`";
170     public static final String ESCAPED_APOSTROPH = "%60";
171     public static final String LSQRBRACKET = "[";
172     public static final String LSQRBRACKET_PATTERN = "//[";
173     public static final String ESCAPED_LSQRBRACKET = "%5B";
174     public static final String RSQRBRACKET = "]";
175     public static final String RSQRBRACKET_PATTERN = "//]";
176     public static final String ESCAPED_RSQRBRACKET = "%5D";
177     public static final String LCURBRACKET = "{";
178     public static final String LCURBRACKET_PATTERN = "//{";
179     public static final String ESCAPED_LCURBRACKET = "%7B";
180     public static final String RCURBRACKET = "}";
181     public static final String RCURBRACKET_PATTERN = "//}";
182     public static final String ESCAPED_RCURBRACKET = "%7D";
183     public static final String BACKSLASH = "//";
184     public static final String BACKSLASH_PATTERN = "////";
185     public static final String ESCAPED_BACKSLASH = "%5C";
186     public static final String STRAY_SPACING = "[\n\r\t]+";
187     public static final String IMPROPERESC_REPLACE = "%25$1";
188     public static final String IMPROPERESC =
189         "%((?:[^//p{XDigit}])|(?:.[^//p{XDigit}])|(?://z))";
190     public static final String COMMERCIAL_AT = "@";
191     public static final char PERCENT_SIGN = '%';
192     public static final char COLON = ':';
193     
194     /***
195      * First percent sign in string followed by two hex chars.
196      */
197     public static final String URI_HEX_ENCODING =
198         "^[^%]*%[//p{XDigit}][//p{XDigit}].*";
199     
200     /***
201      * Authority port number regex.
202      */
203     final static Pattern PORTREGEX = Pattern.compile("(.*:)([0-9]+)$");
204     
205     /***
206      * Characters we'll accept in the domain label part of a URI
207      * authority: ASCII letters-digits-hyphen (LDH) plus underscore,
208      * with single intervening '.' characters.
209      * 
210      * (We accept '_' because DNS servers have tolerated for many
211      * years counter to spec; we also accept dash patterns and ACE
212      * prefixes that will be rejected by IDN-punycoding attempt.)
213      */
214     final static String ACCEPTABLE_ASCII_DOMAIN =
215         "^(?:[a-zA-Z0-9_-]++(?://.)?)++$";
216     
217     /***
218      * Pattern that looks for case of three or more slashes after the 
219      * scheme.  If found, we replace them with two only as mozilla does.
220      */
221     final static Pattern HTTP_SCHEME_SLASHES =
222         Pattern.compile("^(https?://)/+(.*)");
223     
224     /***
225      * Pattern that looks for case of two or more slashes in a path.
226      */
227     final static Pattern MULTIPLE_SLASHES = Pattern.compile("//+");
228     
229     /***
230      * System property key for list of supported schemes.
231      */
232     private static final String SCHEMES_KEY = ".schemes";
233     
234     /***
235      * System property key for list of purposefully-ignored schemes.
236      */
237     private static final String IGNORED_SCHEMES_KEY = ".ignored-schemes";
238 
239     private String[] schemes = null;
240     private String[] ignoredSchemes = null;
241 
242     public static final int IGNORED_SCHEME = 9999999;
243     
244     /***
245      * Private constructor.  Reads the optional supported and ignored scheme lists from system properties.
246      */
247     private UURIFactory() {
248         super();
249         String s = System.getProperty(this.getClass().getName() + SCHEMES_KEY);
250         if (s != null && s.length() > 0) {
251             schemes = s.split("[, ]+");
252             Arrays.sort(schemes);
253         }
254         String ignored = System.getProperty(this.getClass().getName() + IGNORED_SCHEMES_KEY);
255         if (ignored != null && ignored.length() > 0) {
256             ignoredSchemes  = ignored.split("[, ]+");
257             Arrays.sort(ignoredSchemes);
258         }
259     }
260     
261     /***
262      * @param uri URI as string.
263      * @return An instance of UURI
264      * @throws URIException
265      */
266     public static UURI getInstance(String uri) throws URIException {
267         return UURIFactory.factory.create(uri);
268     }
269     
270     /***
271      * @param uri URI as string.
272      * @param charset Character encoding of the passed uri string.
273      * @return An instance of UURI
274      * @throws URIException
275      */
276     public static UURI getInstance(String uri, String charset)
277     		throws URIException {
278         return UURIFactory.factory.create(uri, charset);
279     }
280     
281     /***
282      * @param base Base uri to use resolving passed relative uri.
283      * @param relative URI as string.
284      * @return An instance of UURI
285      * @throws URIException
286      */
287     public static UURI getInstance(UURI base, String relative)
288     		throws URIException {
289         return UURIFactory.factory.create(base, relative);
290     }
291     
292     /***
293      * Test of whether passed String has an allowed URI scheme.
294      * First tests if likely scheme suffix.  If so, we then test if its one of
295      * the supported schemes.
296      * @param possibleUrl URL string to examine.
297      * @return True if passed string looks like it could be an URL.
298      */
299     public static boolean hasSupportedScheme(String possibleUrl) {
300         boolean hasScheme = UURI.hasScheme(possibleUrl);
301         if (!hasScheme || UURIFactory.factory.schemes == null) {
302             return hasScheme;
303         }
304         String tmpStr = possibleUrl.substring(0, possibleUrl.indexOf(':'));
305         return Arrays.binarySearch(UURIFactory.factory.schemes, tmpStr) >= 0;
306     }
307 
308     /***
309      * @param uri URI as string.
310      * @return Instance of UURI.
311      * @throws URIException
312      */
313     private UURI create(String uri) throws URIException {
314         return create(uri, UURI.getDefaultProtocolCharset());
315     }
316     
317     /***
318      * @param uri URI as string.
319      * @param charset Original encoding of the string.
320      * @return Instance of UURI.
321      * @throws URIException
322      */
323     private UURI create(String uri, String charset) throws URIException {
324         UURI uuri  = new UURI(fixup(uri, null, charset), true, charset);
325         if (logger.isLoggable(Level.FINE)) {
326             logger.fine("URI " + uri +
327                 " PRODUCT " + uuri.toString() +
328                 " CHARSET " + charset);
329         }
330         return validityCheck(uuri);
331     }
332     
333     /***
334      * @param base UURI to use as a base resolving <code>relative</code>.
335      * @param relative Relative URI.
336      * @return Instance of UURI.
337      * @throws URIException
338      */
339     private UURI create(UURI base, String relative) throws URIException {
340         UURI uuri = new UURI(base, new UURI(fixup(relative, base, base.getProtocolCharset()),
341             true, base.getProtocolCharset()));
342         if (logger.isLoggable(Level.FINE)) {
343             logger.fine(" URI " + relative +
344                 " PRODUCT " + uuri.toString() +
345                 " CHARSET " + base.getProtocolCharset() +
346                 " BASE " + base);
347         }
348         return validityCheck(uuri);
349     }
350 
351     /***
352      * Check the generated UURI.
353      * 
354      * At the least look at length of uuri string.  We were seeing case
355      * where before escaping, string was &lt; MAX_URL_LENGTH but after was
356      * &gt;.  Letting out a too-big message was causing us troubles later
357      * down the processing chain.
358      * @param uuri Created uuri to check.
359      * @return The passed <code>uuri</code> so can easily inline this check.
360      * @throws URIException
361      */
362     protected UURI validityCheck(UURI uuri) throws URIException {
363         if (uuri.getRawURI().length > UURI.MAX_URL_LENGTH) {
364            throw new URIException("Created (escaped) uuri > " +
365               UURI.MAX_URL_LENGTH +": "+uuri.toString());
366         }
367         return uuri;
368     }
369     
370     /***
371      * Do heritrix fix-up on passed uri string.
372      *
373      * Does heritrix escaping; usually escaping done to make our behavior align
374      * with IEs.  This method codifies our experience pulling URIs from the
375      * wilds.  It does all the escaping we want; its output can always be
376      * assumed to be 'escaped' (though perhaps to a laxer standard than the 
377      * vanilla HttpClient URI class or official specs might suggest). 
378      *
379      * @param uri URI as string.
380      * @param base May be null.
381      * @param charset Character encoding handed to the escaping steps.
382      * @return A fixed up URI string.
383      * @throws URIException
384      */
385     private String fixup(String uri, final URI base, final String charset)
386     throws URIException {
387         if (uri == null) {
388             throw new NullPointerException();
389         } else if (uri.length() == 0 && base == null) {
390             throw new URIException("URI length is zero (and not relative).");
391         }
392         
393         if (uri.length() > UURI.MAX_URL_LENGTH) {
394             // We check length here and again later after all convertions.
395             throw new URIException("URI length > " + UURI.MAX_URL_LENGTH +
396                 ": " + uri);
397         }
398         
399         // Replace nbsp with normal spaces (so that they get stripped if at
400         // ends, or encoded if in middle)
401         if (uri.indexOf(NBSP) >= 0) {
402             uri = TextUtils.replaceAll(NBSP, uri, SPACE);
403         }
404         
405         // Get rid of any trailing spaces or new-lines. 
406         uri = uri.trim();
407         
408         // IE actually converts backslashes to slashes rather than to %5C.
409         // Since URIs that have backslashes usually work only with IE, we will
410         // convert backslashes to slashes as well.
411         // TODO: Maybe we can first convert backslashes by specs and than by IE
412         // so that we fetch both versions.
413         if (uri.indexOf(BACKSLASH) >= 0) {
414             uri = TextUtils.replaceAll(BACKSLASH_PATTERN, uri, SLASH);
415         }
416         
417         // Remove stray TAB/CR/LF
418         uri = TextUtils.replaceAll(STRAY_SPACING, uri, EMPTY_STRING);
419         
420         // Test for the case of more than two slashes after the http(s) scheme.
421         // Replace with two slashes as mozilla does if found.
422         // See [ 788219 ] URI Syntax Errors stop page parsing.
423         Matcher matcher = HTTP_SCHEME_SLASHES.matcher(uri);
424         if (matcher.matches()) {
425             uri = matcher.group(1) + matcher.group(2);
426         }
427 
428         // now, minimally escape any whitespace
429         uri = escapeWhitespace(uri);
430         
431         // For further processing, get uri elements.  See the RFC2396REGEX
432         // comment above for explaination of group indices used in the below.
433         matcher = RFC2396REGEX.matcher(uri);
434         if (!matcher.matches()) {
435             throw new URIException("Failed parse of " + uri);
436         }
437         String uriScheme = checkUriElementAndLowerCase(matcher.group(2));
438         String uriSchemeSpecificPart = checkUriElement(matcher.group(3));
439         String uriAuthority = checkUriElement(matcher.group(5));
440         String uriPath = checkUriElement(matcher.group(6));
441         String uriQuery = checkUriElement(matcher.group(8));
442         // UNUSED String uriFragment = checkUriElement(matcher.group(10));
443         
444         // If a scheme, is it a supported scheme?
445         if (uriScheme != null && uriScheme.length() > 0 &&
446                 this.schemes != null) {
447             if (!(Arrays.binarySearch(schemes,uriScheme)>=0)) {
448                 // unsupported; see if silently ignored
449                 if((Arrays.binarySearch(ignoredSchemes,uriScheme)>=0)) {
450                     throw new URIException(
451                             IGNORED_SCHEME, "Ignored scheme: " + uriScheme);
452                 } else {
453                     throw new URIException("Unsupported scheme: " + uriScheme);
454                 }
455             }
456         }
457         
458         // Test if relative URI. If so, need a base to resolve against.
459         if (uriScheme == null || uriScheme.length() <= 0) {
460             if (base == null) {
461                 throw new URIException("Relative URI but no base: " + uri);
462             }
463         } else {
464         	checkHttpSchemeSpecificPartSlashPrefix(base, uriScheme,
465         		uriSchemeSpecificPart);
466         }
467         
468         // fixup authority portion: lowercase/IDN-punycode any domain; 
469         // remove stray trailing spaces
470         uriAuthority = fixupAuthority(uriAuthority, charset);
471 
472         // Do some checks if absolute path.
473         if (uriSchemeSpecificPart != null &&
474                 uriSchemeSpecificPart.startsWith(SLASH)) {
475             if (uriPath != null) {
476                 // Eliminate '..' if its first thing in the path.  IE does this.
477                 uriPath = TextUtils.replaceFirst(SLASHDOTDOTSLASH, uriPath,
478                     SLASH);
479             }
480             // Ensure root URLs end with '/': browsers always send "/"
481             // on the request-line, so we should consider "http://host"
482             // to be "http://host/".
483             if (uriPath == null || EMPTY_STRING.equals(uriPath)) {
484                 uriPath = SLASH;
485             }
486         }
487 
488         if (uriAuthority != null) {
489             if (uriScheme != null && uriScheme.length() > 0 &&
490                     uriScheme.equals(HTTP)) {
491                 uriAuthority = checkPort(uriAuthority);
492                 uriAuthority = stripTail(uriAuthority, HTTP_PORT);
493             } else if (uriScheme != null && uriScheme.length() > 0 &&
494                     uriScheme.equals(HTTPS)) {
495                 uriAuthority = checkPort(uriAuthority);
496                 uriAuthority = stripTail(uriAuthority, HTTPS_PORT);
497             }
498             // Strip any prefix dot or tail dots from the authority.
499             uriAuthority = stripTail(uriAuthority, DOT);
500             uriAuthority = stripPrefix(uriAuthority, DOT);
501         } else {
502             // no authority; may be relative. consider stripping scheme
503             // to work-around org.apache.commons.httpclient.URI bug
504             // ( http://issues.apache.org/jira/browse/HTTPCLIENT-587 )
505             if (uriScheme != null && base != null
506                     && uriScheme.equals(base.getScheme())) {
507                 // uriScheme redundant and will only confound httpclient.URI
508                 uriScheme = null; 
509             }
510         }
511         
512         // Ensure minimal escaping. Use of 'lax' URI and URLCodec 
513         // means minimal escaping isn't necessarily complete/consistent.
514         // There is a chance such lax encoding will throw exceptions
515         // later at inconvenient times. 
516         //
517         // One reason for these bad escapings -- though not the only --
518         // is that the page is using an encoding other than the ASCII or the
519         // UTF-8 that is our default URI encoding.  In this case the parent
520         // class is burping on the passed URL encoding.  If the page encoding
521         // was passed into this factory, the encoding seems to be parsed
522         // correctly (See the testEscapedEncoding unit test).
523         //
524         // This fixup may cause us to miss content.  There is the charset case
525         // noted above.  TODO: Look out for cases where we fail other than for
526         // the above given reason which will be fixed when we address
527         // '[ 913687 ] Make extractors interrogate for charset'.
528 
529         uriPath = ensureMinimalEscaping(uriPath, charset);
530         uriQuery = ensureMinimalEscaping(uriQuery, charset,
531             LaxURLCodec.QUERY_SAFE);
532 
533         // Preallocate.  The '1's and '2's in below are space for ':',
534         // '//', etc. URI characters.
535         MutableString s = new MutableString(
536             ((uriScheme != null)? uriScheme.length(): 0)
537             + 1 // ';' 
538             + ((uriAuthority != null)? uriAuthority.length(): 0)
539             + 2 // '//'
540             + ((uriPath != null)? uriPath.length(): 0)
541             + 1 // '?'
542             + ((uriQuery != null)? uriQuery.length(): 0));
543         appendNonNull(s, uriScheme, ":", true);
544         appendNonNull(s, uriAuthority, "//", false);
545         appendNonNull(s, uriPath, "", false);
546         appendNonNull(s, uriQuery, "?", false);
547         return s.toString();
548     }
549     
550     /***
551      * If http(s) scheme, check scheme specific part begins '//'.
552      * @throws URIException 
553      * @see http://www.faqs.org/rfcs/rfc1738.html Section 3.1. Common Internet
554      * Scheme Syntax
555      */
556     protected void checkHttpSchemeSpecificPartSlashPrefix(final URI base,
557     		final String scheme, final String schemeSpecificPart)
558     throws URIException {
559     	if (scheme == null || scheme.length() <= 0) {
560     		return;
561     	}
562     	if (!scheme.equals("http") && !scheme.equals("https")) {
563     		return;
564     	}
565     	if ( schemeSpecificPart == null 
566     	        || !schemeSpecificPart.startsWith("//")) {
567     	    // only acceptable if schemes match
568     	    if (base == null || !scheme.equals(base.getScheme())) {
569     	        throw new URIException(
570     	                "relative URI with scheme only allowed for " +
571     	                "scheme matching base");
572     	    } 
573     	    return; 
574     	}
575     	if (schemeSpecificPart.length() <= 2) {
576     		throw new URIException("http scheme specific part is " +
577         		"too short: " + schemeSpecificPart);
578     	}
579     }
580     
581     /***
582      * Fixup 'authority' portion of URI, by removing any stray 
583      * encoded spaces, lowercasing any domain names, and applying
584      * IDN-punycoding to Unicode domains. 
585      * 
586      * @param uriAuthority the authority string to fix
587      * @return fixed version
588      * @throws URIException
589      */
590     private String fixupAuthority(String uriAuthority, String charset) throws URIException {
591         // Lowercase the host part of the uriAuthority; don't destroy any
592         // userinfo capitalizations.  Make sure no illegal characters in
593         // domainlabel substring of the uri authority.
594         if (uriAuthority != null) {
595             // Get rid of any trailing escaped spaces:
596             // http://www.archive.org%20.  Rare but happens.
597             // TODO: reevaluate: do IE or firefox do such mid-URI space-removal?
598             // if not, we shouldn't either. 
599             while(uriAuthority.endsWith(ESCAPED_SPACE)) {
600                 uriAuthority = uriAuthority.substring(0,uriAuthority.length()-3);
601             }
602 
603             // lowercase & IDN-punycode only the domain portion
604             int atIndex = uriAuthority.indexOf(COMMERCIAL_AT);
605             int portColonIndex = uriAuthority.indexOf(COLON,(atIndex<0)?0:atIndex);
606             if(atIndex<0 && portColonIndex<0) {
607                 // most common case: neither userinfo nor port
608                 return fixupDomainlabel(uriAuthority);
609             } else if (atIndex<0 && portColonIndex>-1) {
610                 // next most common: port but no userinfo
611                 String domain = fixupDomainlabel(uriAuthority.substring(0,portColonIndex));
612                 String port = uriAuthority.substring(portColonIndex);
613                 return domain + port;
614             } else if (atIndex>-1 && portColonIndex<0) {
615                 // uncommon: userinfo, no port
616                 String userinfo = ensureMinimalEscaping(uriAuthority.substring(0,atIndex+1),charset);
617                 String domain = fixupDomainlabel(uriAuthority.substring(atIndex+1));
618                 return userinfo + domain;
619             } else {
620                 // uncommon: userinfo, port
621                 String userinfo = ensureMinimalEscaping(uriAuthority.substring(0,atIndex+1),charset);
622                 String domain = fixupDomainlabel(uriAuthority.substring(atIndex+1,portColonIndex));
623                 String port = uriAuthority.substring(portColonIndex);
624                 return userinfo + domain + port;
625             }
626         }
627         return uriAuthority;
628     }
629     
630     /***
631      * Fixup the domain label part of the authority.
632      * 
633      * We're more lax than the spec. in that we allow underscores.
634      * 
635      * @param label Domain label to fix.
636      * @return Return fixed domain label.
637      * @throws URIException
638      */
639     private String fixupDomainlabel(String label)
640     throws URIException {
641         
642         // apply IDN-punycoding, as necessary
643         try {
644             // TODO: optimize: only apply when necessary, or
645             // keep cache of recent encodings
646             label = IDNA.toASCII(label);
647         } catch (IDNAException e) {
648             if(TextUtils.matches(ACCEPTABLE_ASCII_DOMAIN,label)) {
649                 // domain name has ACE prefix, leading/trailing dash, or 
650                 // underscore -- but is still a name we wish to tolerate;
651                 // simply continue
652             } else {
653                 // problematic domain: neither ASCII acceptable characters
654                 // nor IDN-punycodable, so throw exception 
655                 // TODO: change to HeritrixURIException so distinguishable
656                 // from URIExceptions in library code
657                 URIException ue = new URIException(e+" "+label);
658                 ue.initCause(e);
659                 throw ue;
660             }
661         }
662         label = label.toLowerCase();
663         return label;
664     }
665     
666     /***
667      * Ensure that all characters needing escaping
668      * in the passed-in String are escaped. Stray '%' characters
669      * are *not* escaped, as per browser behavior. 
670      * 
671      * @param u String to escape
672      * @param charset 
673      * @return string with any necessary escaping applied
674      */
675     private String ensureMinimalEscaping(String u, final String charset) {
676         return ensureMinimalEscaping(u, charset, LaxURLCodec.EXPANDED_URI_SAFE);
677     }
678     
679     /***
680      * Ensure that all characters needing escaping
681      * in the passed-in String are escaped. Stray '%' characters
682      * are *not* escaped, as per browser behavior. 
683      * 
684      * @param u String to escape
685      * @param charset 
686      * @param bitset 
687      * @return string with any necessary escaping applied
688      */
689     private String ensureMinimalEscaping(String u, final String charset,
690             final BitSet bitset) {
691         if (u == null) {
692             return null;
693         }
694         for (int i = 0; i < u.length(); i++) {
695             char c = u.charAt(i);
696             if (!bitset.get(c)) {
697                 try {
698                     u = LaxURLCodec.DEFAULT.encode(bitset, u, charset);
699                 } catch (UnsupportedEncodingException e) {
700                     e.printStackTrace();
701                 }
702                 break;
703             }
704         }
705         return u;
706     }
707 
708     /***
709      * Escape any whitespace found.
710      * 
711      * The parent class takes care of the bulk of escaping.  But if any
712      * instance of escaping is found in the URI, then we ask for parent
713      * to do NO escaping.  Here we escape any whitespace found irrespective
714      * of whether the uri has already been escaped.  We do this for
715      * case where uri has been judged already-escaped only, it's been
716      * incompletely done and whitespace remains.  Spaces, etc., in the URI are
717      * a real pain.  Their presence will break log file and ARC parsing.
718      * @param uri URI string to check.
719      * @return uri with spaces escaped if any found.
720      */
721     protected String escapeWhitespace(String uri) {
722         // Just write a new string anyways.  The perl '\s' is not
723         // as inclusive as the Character.isWhitespace so there are
724         // whitespace characters we could miss.  So, rather than
725         // write some awkward regex, just go through the string
726         // a character at a time.  Only create buffer first time
727         // we find a space.
728         MutableString buffer = null;
729         for (int i = 0; i < uri.length(); i++) {
730             char c = uri.charAt(i);
731             if (Character.isWhitespace(c)) {
732                 if (buffer == null) {
733                     buffer = new MutableString(uri.length() +
734                         2 /*If space, two extra characters (at least)*/);
735                     buffer.append(uri.substring(0, i));
736                 }
737                 buffer.append("%");
738                 String hexStr = Integer.toHexString(c);
739                 if ((hexStr.length() % 2) > 0) {
740                     buffer.append("0");
741                 }
742                 buffer.append(hexStr);
743                 
744             } else {
745                 if (buffer != null) {
746                     buffer.append(c);
747                 }
748             }
749         }
750         return (buffer !=  null)? buffer.toString(): uri;
751     }
752 
753     /***
754      * Check port on passed http authority.  Make sure the size is not larger
755      * than allowed: See the 'port' definition on this
756      * page, http://www.kerio.com/manual/wrp/en/418.htm.
757      * Also, we've seen port numbers of '0080' whose leading zeros confuse
758      * the parent class. Strip the leading zeros.
759      *
760      * @param uriAuthority
761      * @return Null or an amended port number.
762      * @throws URIException
763      */
764     private String checkPort(String uriAuthority)
765     throws URIException {
766         Matcher m = PORTREGEX.matcher(uriAuthority);
767         if (m.matches()) {
768             String no = m.group(2);
769             if (no != null && no.length() > 0) {
770                 // First check if the port has leading zeros
771                 // as in '0080'.  Strip them if it has and
772                 // then reconstitute the uriAuthority.  Be careful
773                 // of cases where port is '0' or '000'.
774                 while (no.charAt(0) == '0' && no.length() > 1) {
775                     no = no.substring(1);
776                 }
777                 uriAuthority = m.group(1) + no;
778                 // Now makesure the number is legit.
779                 int portNo = 0;
780                 try {
781                     portNo = Integer.parseInt(no);
782                 } catch (NumberFormatException nfe) {
783                     // just catch and leave portNo at illegal 0
784                 }
785                 if (portNo <= 0 || portNo > 65535) {
786                     throw new URIException("Port out of bounds: " +
787                         uriAuthority);
788                 }
789             }
790         }
791         return uriAuthority;
792     }
793 
794     /***
795      * @param b Buffer to append to.
796      * @param str String to append if not null.
797      * @param substr Suffix or prefix to use if <code>str</code> is not null.
798      * @param suffix True if <code>substr</code> is a suffix.
799      */
800     private void appendNonNull(MutableString b, String str, String substr,
801             boolean suffix) {
802         if (str != null && str.length() > 0) {
803             if (!suffix) {
804                 b.append(substr);
805             }
806             b.append(str);
807             if (suffix) {
808                 b.append(substr);
809             }
810         }
811     }
812 
813     /***
814      * @param str String to work on.
815      * @param prefix Prefix to strip if present.
816      * @return <code>str</code> w/o <code>prefix</code>.
817      */
818     private String stripPrefix(String str, String prefix) {
819         return str.startsWith(prefix)?
820             str.substring(prefix.length(), str.length()):
821             str;
822     }
823 
824     /***
825      * @param str String to work on.
826      * @param tail Tail to strip if present.
827      * @return <code>str</code> w/o <code>tail</code>.
828      */
829     private static String stripTail(String str, String tail) {
830         return str.endsWith(tail)?
831             str.substring(0, str.length() - tail.length()):
832             str;
833     }
834 
835     /***
836      * @param element to examine.
837      * @return Null if passed null or an empty string otherwise
838      * <code>element</code>.
839      */
840     private String checkUriElement(String element) {
841         return (element == null || element.length() <= 0)? null: element;
842     }
843 
844     /***
845      * @param element to examine and lowercase if non-null.
846      * @return Null if passed null or an empty string otherwise
847      * <code>element</code> lowercased.
848      */
849     private String checkUriElementAndLowerCase(String element) {
850         String tmp = checkUriElement(element);
851         return (tmp != null)? tmp.toLowerCase(): tmp;
852     }
853 }