1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25 package org.archive.net;
26
import gnu.inet.encoding.IDNA;
import gnu.inet.encoding.IDNAException;
import it.unimi.dsi.mg4j.util.MutableString;

import java.io.UnsupportedEncodingException;
import java.util.Arrays;
import java.util.BitSet;
import java.util.Locale;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.httpclient.URI;
import org.apache.commons.httpclient.URIException;
import org.archive.util.TextUtils;
42
43
/**
 * Factory that returns UURIs.
 *
 * Escapes and fixes up URIs, massaging them in accordance with RFC 2396
 * and to match browser practice. For example, it removes any leading
 * '..' segments from the path as IE does, converts backslashes to forward
 * slashes, and discards any 'fragment'/anchor portion of the URI. This
 * class will also fail URIs that are longer than IE's allowed maximum
 * length.
 *
 * <p>TODO: Test logging.
 *
 * @author stack
 */
58 public class UURIFactory extends URI {
59
60 private static final long serialVersionUID = -6146295130382209042L;
61
62 /***
63 * Logging instance.
64 */
65 private static Logger logger =
66 Logger.getLogger(UURIFactory.class.getName());
67
68 /***
69 * The single instance of this factory.
70 */
71 private static final UURIFactory factory = new UURIFactory();
72
73 /***
74 * RFC 2396-inspired regex.
75 *
76 * From the RFC Appendix B:
77 * <pre>
78 * URI Generic Syntax August 1998
79 *
80 * B. Parsing a URI Reference with a Regular Expression
81 *
82 * As described in Section 4.3, the generic URI syntax is not sufficient
83 * to disambiguate the components of some forms of URI. Since the
84 * "greedy algorithm" described in that section is identical to the
85 * disambiguation method used by POSIX regular expressions, it is
86 * natural and commonplace to use a regular expression for parsing the
87 * potential four components and fragment identifier of a URI reference.
88 *
89 * The following line is the regular expression for breaking-down a URI
90 * reference into its components.
91 *
92 * ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?
93 * 12 3 4 5 6 7 8 9
94 *
95 * The numbers in the second line above are only to assist readability;
96 * they indicate the reference points for each subexpression (i.e., each
97 * paired parenthesis). We refer to the value matched for subexpression
98 * <n> as $<n>. For example, matching the above expression to
99 *
100 * http://www.ics.uci.edu/pub/ietf/uri/#Related
101 *
102 * results in the following subexpression matches:
103 *
104 * $1 = http:
105 * $2 = http
106 * $3 = //www.ics.uci.edu
107 * $4 = www.ics.uci.edu
108 * $5 = /pub/ietf/uri/
109 * $6 = <undefined>
110 * $7 = <undefined>
111 * $8 = #Related
112 * $9 = Related
113 *
114 * where <undefined> indicates that the component is not present, as is
115 * the case for the query component in the above example. Therefore, we
116 * can determine the value of the four components and fragment as
117 *
118 * scheme = $2
119 * authority = $4
120 * path = $5
121 * query = $7
122 * fragment = $9
123 * </pre>
124 *
125 * --
126 * <p>Below differs from the rfc regex in that...
127 * (1) it has java escaping of regex characters
128 * (2) we allow a URI made of a fragment only (Added extra
129 * group so indexing is off by one after scheme).
130 * (3) scheme is limited to legal scheme characters
131 */
132 final static Pattern RFC2396REGEX = Pattern.compile(
133 "^(([a-zA-Z][a-zA-Z//+//-//.]*):)?((//([^/?#]*))?([^?#]*)(//?([^#]*))?)?(#(.*))?");
134
135
136
137
138
139
140
141
142
143
144
145
146
147 public static final String SLASHDOTDOTSLASH = "^(///.//./)+";
148 public static final String SLASH = "/";
149 public static final String HTTP = "http";
150 public static final String HTTP_PORT = ":80";
151 public static final String HTTPS = "https";
152 public static final String HTTPS_PORT = ":443";
153 public static final String DOT = ".";
154 public static final String EMPTY_STRING = "";
155 public static final String NBSP = "\u00A0";
156 public static final String SPACE = " ";
157 public static final String ESCAPED_SPACE = "%20";
158 public static final String TRAILING_ESCAPED_SPACE = "^(.*)(%20)+$";
159 public static final String PIPE = "|";
160 public static final String PIPE_PATTERN = "//|";
161 public static final String ESCAPED_PIPE = "%7C";
162 public static final String CIRCUMFLEX = "^";
163 public static final String CIRCUMFLEX_PATTERN = "//^";
164 public static final String ESCAPED_CIRCUMFLEX = "%5E";
165 public static final String QUOT = "\"";
166 public static final String ESCAPED_QUOT = "%22";
167 public static final String SQUOT = "'";
168 public static final String ESCAPED_SQUOT = "%27";
169 public static final String APOSTROPH = "`";
170 public static final String ESCAPED_APOSTROPH = "%60";
171 public static final String LSQRBRACKET = "[";
172 public static final String LSQRBRACKET_PATTERN = "//[";
173 public static final String ESCAPED_LSQRBRACKET = "%5B";
174 public static final String RSQRBRACKET = "]";
175 public static final String RSQRBRACKET_PATTERN = "//]";
176 public static final String ESCAPED_RSQRBRACKET = "%5D";
177 public static final String LCURBRACKET = "{";
178 public static final String LCURBRACKET_PATTERN = "//{";
179 public static final String ESCAPED_LCURBRACKET = "%7B";
180 public static final String RCURBRACKET = "}";
181 public static final String RCURBRACKET_PATTERN = "//}";
182 public static final String ESCAPED_RCURBRACKET = "%7D";
183 public static final String BACKSLASH = "//";
184 public static final String BACKSLASH_PATTERN = "////";
185 public static final String ESCAPED_BACKSLASH = "%5C";
186 public static final String STRAY_SPACING = "[\n\r\t]+";
187 public static final String IMPROPERESC_REPLACE = "%25$1";
188 public static final String IMPROPERESC =
189 "%((?:[^//p{XDigit}])|(?:.[^//p{XDigit}])|(?://z))";
190 public static final String COMMERCIAL_AT = "@";
191 public static final char PERCENT_SIGN = '%';
192 public static final char COLON = ':';
193
194 /***
195 * First percent sign in string followed by two hex chars.
196 */
197 public static final String URI_HEX_ENCODING =
198 "^[^%]*%[//p{XDigit}][//p{XDigit}].*";
199
200 /***
201 * Authority port number regex.
202 */
203 final static Pattern PORTREGEX = Pattern.compile("(.*:)([0-9]+)$");
204
205 /***
206 * Characters we'll accept in the domain label part of a URI
207 * authority: ASCII letters-digits-hyphen (LDH) plus underscore,
208 * with single intervening '.' characters.
209 *
210 * (We accept '_' because DNS servers have tolerated for many
211 * years counter to spec; we also accept dash patterns and ACE
212 * prefixes that will be rejected by IDN-punycoding attempt.)
213 */
214 final static String ACCEPTABLE_ASCII_DOMAIN =
215 "^(?:[a-zA-Z0-9_-]++(?://.)?)++$";
216
217 /***
218 * Pattern that looks for case of three or more slashes after the
219 * scheme. If found, we replace them with two only as mozilla does.
220 */
221 final static Pattern HTTP_SCHEME_SLASHES =
222 Pattern.compile("^(https?://)/+(.*)");
223
224 /***
225 * Pattern that looks for case of two or more slashes in a path.
226 */
227 final static Pattern MULTIPLE_SLASHES = Pattern.compile("//+");
228
/**
 * Suffix appended to this class's name to form the system property that
 * lists the supported schemes.
 */
private static final String SCHEMES_KEY = ".schemes";

/**
 * Suffix appended to this class's name to form the system property that
 * lists the purposefully-ignored schemes.
 */
private static final String IGNORED_SCHEMES_KEY = ".ignored-schemes";

/** Sorted supported schemes, or null when the property is unset/empty. */
private String[] schemes = null;

/** Sorted ignored schemes, or null when the property is unset/empty. */
private String[] ignoredSchemes = null;

/** Reason code passed to URIException when a scheme is on the ignored list. */
public static final int IGNORED_SCHEME = 9999999;

/**
 * Private constructor; reads the scheme lists from system properties
 * keyed by this class's name plus the key suffixes above.
 */
private UURIFactory() {
    super();
    final String supported =
        System.getProperty(this.getClass().getName() + SCHEMES_KEY);
    this.schemes = toSortedSchemes(supported);
    final String ignored =
        System.getProperty(this.getClass().getName() + IGNORED_SCHEMES_KEY);
    this.ignoredSchemes = toSortedSchemes(ignored);
}

/**
 * Splits a comma- and/or space-separated list and sorts it so that it
 * can be searched with Arrays.binarySearch.
 *
 * @param list Raw property value; may be null.
 * @return Sorted tokens, or null if <code>list</code> is null or empty.
 */
private static String[] toSortedSchemes(final String list) {
    if (list == null || list.length() == 0) {
        return null;
    }
    final String[] tokens = list.split("[, ]+");
    Arrays.sort(tokens);
    return tokens;
}
260
261 /***
262 * @param uri URI as string.
263 * @return An instance of UURI
264 * @throws URIException
265 */
266 public static UURI getInstance(String uri) throws URIException {
267 return UURIFactory.factory.create(uri);
268 }
269
270 /***
271 * @param uri URI as string.
272 * @param charset Character encoding of the passed uri string.
273 * @return An instance of UURI
274 * @throws URIException
275 */
276 public static UURI getInstance(String uri, String charset)
277 throws URIException {
278 return UURIFactory.factory.create(uri, charset);
279 }
280
281 /***
282 * @param base Base uri to use resolving passed relative uri.
283 * @param relative URI as string.
284 * @return An instance of UURI
285 * @throws URIException
286 */
287 public static UURI getInstance(UURI base, String relative)
288 throws URIException {
289 return UURIFactory.factory.create(base, relative);
290 }
291
292 /***
293 * Test of whether passed String has an allowed URI scheme.
294 * First tests if likely scheme suffix. If so, we then test if its one of
295 * the supported schemes.
296 * @param possibleUrl URL string to examine.
297 * @return True if passed string looks like it could be an URL.
298 */
299 public static boolean hasSupportedScheme(String possibleUrl) {
300 boolean hasScheme = UURI.hasScheme(possibleUrl);
301 if (!hasScheme || UURIFactory.factory.schemes == null) {
302 return hasScheme;
303 }
304 String tmpStr = possibleUrl.substring(0, possibleUrl.indexOf(':'));
305 return Arrays.binarySearch(UURIFactory.factory.schemes, tmpStr) >= 0;
306 }
307
308 /***
309 * @param uri URI as string.
310 * @return Instance of UURI.
311 * @throws URIException
312 */
313 private UURI create(String uri) throws URIException {
314 return create(uri, UURI.getDefaultProtocolCharset());
315 }
316
317 /***
318 * @param uri URI as string.
319 * @param charset Original encoding of the string.
320 * @return Instance of UURI.
321 * @throws URIException
322 */
323 private UURI create(String uri, String charset) throws URIException {
324 UURI uuri = new UURI(fixup(uri, null, charset), true, charset);
325 if (logger.isLoggable(Level.FINE)) {
326 logger.fine("URI " + uri +
327 " PRODUCT " + uuri.toString() +
328 " CHARSET " + charset);
329 }
330 return validityCheck(uuri);
331 }
332
333 /***
334 * @param base UURI to use as a base resolving <code>relative</code>.
335 * @param relative Relative URI.
336 * @return Instance of UURI.
337 * @throws URIException
338 */
339 private UURI create(UURI base, String relative) throws URIException {
340 UURI uuri = new UURI(base, new UURI(fixup(relative, base, base.getProtocolCharset()),
341 true, base.getProtocolCharset()));
342 if (logger.isLoggable(Level.FINE)) {
343 logger.fine(" URI " + relative +
344 " PRODUCT " + uuri.toString() +
345 " CHARSET " + base.getProtocolCharset() +
346 " BASE " + base);
347 }
348 return validityCheck(uuri);
349 }
350
351 /***
352 * Check the generated UURI.
353 *
354 * At the least look at length of uuri string. We were seeing case
355 * where before escaping, string was < MAX_URL_LENGTH but after was
356 * >. Letting out a too-big message was causing us troubles later
357 * down the processing chain.
358 * @param uuri Created uuri to check.
359 * @return The passed <code>uuri</code> so can easily inline this check.
360 * @throws URIException
361 */
362 protected UURI validityCheck(UURI uuri) throws URIException {
363 if (uuri.getRawURI().length > UURI.MAX_URL_LENGTH) {
364 throw new URIException("Created (escaped) uuri > " +
365 UURI.MAX_URL_LENGTH +": "+uuri.toString());
366 }
367 return uuri;
368 }
369
370 /***
371 * Do heritrix fix-up on passed uri string.
372 *
373 * Does heritrix escaping; usually escaping done to make our behavior align
374 * with IEs. This method codifies our experience pulling URIs from the
375 * wilds. Its does all the escaping we want; its output can always be
376 * assumed to be 'escaped' (though perhaps to a laxer standard than the
377 * vanilla HttpClient URI class or official specs might suggest).
378 *
379 * @param uri URI as string.
380 * @param base May be null.
381 * @param e True if the uri is already escaped.
382 * @return A fixed up URI string.
383 * @throws URIException
384 */
385 private String fixup(String uri, final URI base, final String charset)
386 throws URIException {
387 if (uri == null) {
388 throw new NullPointerException();
389 } else if (uri.length() == 0 && base == null) {
390 throw new URIException("URI length is zero (and not relative).");
391 }
392
393 if (uri.length() > UURI.MAX_URL_LENGTH) {
394
395 throw new URIException("URI length > " + UURI.MAX_URL_LENGTH +
396 ": " + uri);
397 }
398
399
400
401 if (uri.indexOf(NBSP) >= 0) {
402 uri = TextUtils.replaceAll(NBSP, uri, SPACE);
403 }
404
405
406 uri = uri.trim();
407
408
409
410
411
412
413 if (uri.indexOf(BACKSLASH) >= 0) {
414 uri = TextUtils.replaceAll(BACKSLASH_PATTERN, uri, SLASH);
415 }
416
417
418 uri = TextUtils.replaceAll(STRAY_SPACING, uri, EMPTY_STRING);
419
420
421
422
423 Matcher matcher = HTTP_SCHEME_SLASHES.matcher(uri);
424 if (matcher.matches()) {
425 uri = matcher.group(1) + matcher.group(2);
426 }
427
428
429 uri = escapeWhitespace(uri);
430
431
432
433 matcher = RFC2396REGEX.matcher(uri);
434 if (!matcher.matches()) {
435 throw new URIException("Failed parse of " + uri);
436 }
437 String uriScheme = checkUriElementAndLowerCase(matcher.group(2));
438 String uriSchemeSpecificPart = checkUriElement(matcher.group(3));
439 String uriAuthority = checkUriElement(matcher.group(5));
440 String uriPath = checkUriElement(matcher.group(6));
441 String uriQuery = checkUriElement(matcher.group(8));
442
443
444
445 if (uriScheme != null && uriScheme.length() > 0 &&
446 this.schemes != null) {
447 if (!(Arrays.binarySearch(schemes,uriScheme)>=0)) {
448
449 if((Arrays.binarySearch(ignoredSchemes,uriScheme)>=0)) {
450 throw new URIException(
451 IGNORED_SCHEME, "Ignored scheme: " + uriScheme);
452 } else {
453 throw new URIException("Unsupported scheme: " + uriScheme);
454 }
455 }
456 }
457
458
459 if (uriScheme == null || uriScheme.length() <= 0) {
460 if (base == null) {
461 throw new URIException("Relative URI but no base: " + uri);
462 }
463 } else {
464 checkHttpSchemeSpecificPartSlashPrefix(base, uriScheme,
465 uriSchemeSpecificPart);
466 }
467
468
469
470 uriAuthority = fixupAuthority(uriAuthority, charset);
471
472
473 if (uriSchemeSpecificPart != null &&
474 uriSchemeSpecificPart.startsWith(SLASH)) {
475 if (uriPath != null) {
476
477 uriPath = TextUtils.replaceFirst(SLASHDOTDOTSLASH, uriPath,
478 SLASH);
479 }
480
481
482
483 if (uriPath == null || EMPTY_STRING.equals(uriPath)) {
484 uriPath = SLASH;
485 }
486 }
487
488 if (uriAuthority != null) {
489 if (uriScheme != null && uriScheme.length() > 0 &&
490 uriScheme.equals(HTTP)) {
491 uriAuthority = checkPort(uriAuthority);
492 uriAuthority = stripTail(uriAuthority, HTTP_PORT);
493 } else if (uriScheme != null && uriScheme.length() > 0 &&
494 uriScheme.equals(HTTPS)) {
495 uriAuthority = checkPort(uriAuthority);
496 uriAuthority = stripTail(uriAuthority, HTTPS_PORT);
497 }
498
499 uriAuthority = stripTail(uriAuthority, DOT);
500 uriAuthority = stripPrefix(uriAuthority, DOT);
501 } else {
502
503
504
505 if (uriScheme != null && base != null
506 && uriScheme.equals(base.getScheme())) {
507
508 uriScheme = null;
509 }
510 }
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529 uriPath = ensureMinimalEscaping(uriPath, charset);
530 uriQuery = ensureMinimalEscaping(uriQuery, charset,
531 LaxURLCodec.QUERY_SAFE);
532
533
534
535 MutableString s = new MutableString(
536 ((uriScheme != null)? uriScheme.length(): 0)
537 + 1
538 + ((uriAuthority != null)? uriAuthority.length(): 0)
539 + 2
540 + ((uriPath != null)? uriPath.length(): 0)
541 + 1
542 + ((uriQuery != null)? uriQuery.length(): 0));
543 appendNonNull(s, uriScheme, ":", true);
544 appendNonNull(s, uriAuthority, "//", false);
545 appendNonNull(s, uriPath, "", false);
546 appendNonNull(s, uriQuery, "?", false);
547 return s.toString();
548 }
549
550 /***
551 * If http(s) scheme, check scheme specific part begins '//'.
552 * @throws URIException
553 * @see http://www.faqs.org/rfcs/rfc1738.html Section 3.1. Common Internet
554 * Scheme Syntax
555 */
556 protected void checkHttpSchemeSpecificPartSlashPrefix(final URI base,
557 final String scheme, final String schemeSpecificPart)
558 throws URIException {
559 if (scheme == null || scheme.length() <= 0) {
560 return;
561 }
562 if (!scheme.equals("http") && !scheme.equals("https")) {
563 return;
564 }
565 if ( schemeSpecificPart == null
566 || !schemeSpecificPart.startsWith("//")) {
567
568 if (base == null || !scheme.equals(base.getScheme())) {
569 throw new URIException(
570 "relative URI with scheme only allowed for " +
571 "scheme matching base");
572 }
573 return;
574 }
575 if (schemeSpecificPart.length() <= 2) {
576 throw new URIException("http scheme specific part is " +
577 "too short: " + schemeSpecificPart);
578 }
579 }
580
581 /***
582 * Fixup 'authority' portion of URI, by removing any stray
583 * encoded spaces, lowercasing any domain names, and applying
584 * IDN-punycoding to Unicode domains.
585 *
586 * @param uriAuthority the authority string to fix
587 * @return fixed version
588 * @throws URIException
589 */
590 private String fixupAuthority(String uriAuthority, String charset) throws URIException {
591
592
593
594 if (uriAuthority != null) {
595
596
597
598
599 while(uriAuthority.endsWith(ESCAPED_SPACE)) {
600 uriAuthority = uriAuthority.substring(0,uriAuthority.length()-3);
601 }
602
603
604 int atIndex = uriAuthority.indexOf(COMMERCIAL_AT);
605 int portColonIndex = uriAuthority.indexOf(COLON,(atIndex<0)?0:atIndex);
606 if(atIndex<0 && portColonIndex<0) {
607
608 return fixupDomainlabel(uriAuthority);
609 } else if (atIndex<0 && portColonIndex>-1) {
610
611 String domain = fixupDomainlabel(uriAuthority.substring(0,portColonIndex));
612 String port = uriAuthority.substring(portColonIndex);
613 return domain + port;
614 } else if (atIndex>-1 && portColonIndex<0) {
615
616 String userinfo = ensureMinimalEscaping(uriAuthority.substring(0,atIndex+1),charset);
617 String domain = fixupDomainlabel(uriAuthority.substring(atIndex+1));
618 return userinfo + domain;
619 } else {
620
621 String userinfo = ensureMinimalEscaping(uriAuthority.substring(0,atIndex+1),charset);
622 String domain = fixupDomainlabel(uriAuthority.substring(atIndex+1,portColonIndex));
623 String port = uriAuthority.substring(portColonIndex);
624 return userinfo + domain + port;
625 }
626 }
627 return uriAuthority;
628 }
629
630 /***
631 * Fixup the domain label part of the authority.
632 *
633 * We're more lax than the spec. in that we allow underscores.
634 *
635 * @param label Domain label to fix.
636 * @return Return fixed domain label.
637 * @throws URIException
638 */
639 private String fixupDomainlabel(String label)
640 throws URIException {
641
642
643 try {
644
645
646 label = IDNA.toASCII(label);
647 } catch (IDNAException e) {
648 if(TextUtils.matches(ACCEPTABLE_ASCII_DOMAIN,label)) {
649
650
651
652 } else {
653
654
655
656
657 URIException ue = new URIException(e+" "+label);
658 ue.initCause(e);
659 throw ue;
660 }
661 }
662 label = label.toLowerCase();
663 return label;
664 }
665
666 /***
667 * Ensure that there all characters needing escaping
668 * in the passed-in String are escaped. Stray '%' characters
669 * are *not* escaped, as per browser behavior.
670 *
671 * @param u String to escape
672 * @param charset
673 * @return string with any necessary escaping applied
674 */
675 private String ensureMinimalEscaping(String u, final String charset) {
676 return ensureMinimalEscaping(u, charset, LaxURLCodec.EXPANDED_URI_SAFE);
677 }
678
679 /***
680 * Ensure that there all characters needing escaping
681 * in the passed-in String are escaped. Stray '%' characters
682 * are *not* escaped, as per browser behavior.
683 *
684 * @param u String to escape
685 * @param charset
686 * @param bitset
687 * @return string with any necessary escaping applied
688 */
689 private String ensureMinimalEscaping(String u, final String charset,
690 final BitSet bitset) {
691 if (u == null) {
692 return null;
693 }
694 for (int i = 0; i < u.length(); i++) {
695 char c = u.charAt(i);
696 if (!bitset.get(c)) {
697 try {
698 u = LaxURLCodec.DEFAULT.encode(bitset, u, charset);
699 } catch (UnsupportedEncodingException e) {
700 e.printStackTrace();
701 }
702 break;
703 }
704 }
705 return u;
706 }
707
708 /***
709 * Escape any whitespace found.
710 *
711 * The parent class takes care of the bulk of escaping. But if any
712 * instance of escaping is found in the URI, then we ask for parent
713 * to do NO escaping. Here we escape any whitespace found irrespective
714 * of whether the uri has already been escaped. We do this for
715 * case where uri has been judged already-escaped only, its been
716 * incompletly done and whitespace remains. Spaces, etc., in the URI are
717 * a real pain. Their presence will break log file and ARC parsing.
718 * @param uri URI string to check.
719 * @return uri with spaces escaped if any found.
720 */
721 protected String escapeWhitespace(String uri) {
722
723
724
725
726
727
728 MutableString buffer = null;
729 for (int i = 0; i < uri.length(); i++) {
730 char c = uri.charAt(i);
731 if (Character.isWhitespace(c)) {
732 if (buffer == null) {
733 buffer = new MutableString(uri.length() +
734 2
735 buffer.append(uri.substring(0, i));
736 }
737 buffer.append("%");
738 String hexStr = Integer.toHexString(c);
739 if ((hexStr.length() % 2) > 0) {
740 buffer.append("0");
741 }
742 buffer.append(hexStr);
743
744 } else {
745 if (buffer != null) {
746 buffer.append(c);
747 }
748 }
749 }
750 return (buffer != null)? buffer.toString(): uri;
751 }
752
753 /***
754 * Check port on passed http authority. Make sure the size is not larger
755 * than allowed: See the 'port' definition on this
756 * page, http://www.kerio.com/manual/wrp/en/418.htm.
757 * Also, we've seen port numbers of '0080' whose leading zeros confuse
758 * the parent class. Strip the leading zeros.
759 *
760 * @param uriAuthority
761 * @return Null or an amended port number.
762 * @throws URIException
763 */
764 private String checkPort(String uriAuthority)
765 throws URIException {
766 Matcher m = PORTREGEX.matcher(uriAuthority);
767 if (m.matches()) {
768 String no = m.group(2);
769 if (no != null && no.length() > 0) {
770
771
772
773
774 while (no.charAt(0) == '0' && no.length() > 1) {
775 no = no.substring(1);
776 }
777 uriAuthority = m.group(1) + no;
778
779 int portNo = 0;
780 try {
781 portNo = Integer.parseInt(no);
782 } catch (NumberFormatException nfe) {
783
784 }
785 if (portNo <= 0 || portNo > 65535) {
786 throw new URIException("Port out of bounds: " +
787 uriAuthority);
788 }
789 }
790 }
791 return uriAuthority;
792 }
793
794 /***
795 * @param b Buffer to append to.
796 * @param str String to append if not null.
797 * @param substr Suffix or prefix to use if <code>str</code> is not null.
798 * @param suffix True if <code>substr</code> is a suffix.
799 */
800 private void appendNonNull(MutableString b, String str, String substr,
801 boolean suffix) {
802 if (str != null && str.length() > 0) {
803 if (!suffix) {
804 b.append(substr);
805 }
806 b.append(str);
807 if (suffix) {
808 b.append(substr);
809 }
810 }
811 }
812
813 /***
814 * @param str String to work on.
815 * @param prefix Prefix to strip if present.
816 * @return <code>str</code> w/o <code>prefix</code>.
817 */
818 private String stripPrefix(String str, String prefix) {
819 return str.startsWith(prefix)?
820 str.substring(prefix.length(), str.length()):
821 str;
822 }
823
824 /***
825 * @param str String to work on.
826 * @param tail Tail to strip if present.
827 * @return <code>str</code> w/o <code>tail</code>.
828 */
829 private static String stripTail(String str, String tail) {
830 return str.endsWith(tail)?
831 str.substring(0, str.length() - tail.length()):
832 str;
833 }
834
835 /***
836 * @param element to examine.
837 * @return Null if passed null or an empty string otherwise
838 * <code>element</code>.
839 */
840 private String checkUriElement(String element) {
841 return (element == null || element.length() <= 0)? null: element;
842 }
843
844 /***
845 * @param element to examine and lowercase if non-null.
846 * @return Null if passed null or an empty string otherwise
847 * <code>element</code> lowercased.
848 */
849 private String checkUriElementAndLowerCase(String element) {
850 String tmp = checkUriElement(element);
851 return (tmp != null)? tmp.toLowerCase(): tmp;
852 }
853 }