View Javadoc

1   /* LaxURI
2   *
3   * $Id: LaxURI.java 5919 2008-07-30 22:46:02Z gojomo $
4   *
5   * Created on Aug 3, 2005
6   *
7   * Copyright (C) 2005 Internet Archive.
8   *
9   * This file is part of the Heritrix web crawler (crawler.archive.org).
10  *
11  * Heritrix is free software; you can redistribute it and/or modify
12  * it under the terms of the GNU Lesser Public License as published by
13  * the Free Software Foundation; either version 2.1 of the License, or
14  * any later version.
15  *
16  * Heritrix is distributed in the hope that it will be useful,
17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19  * GNU Lesser Public License for more details.
20  *
21  * You should have received a copy of the GNU Lesser Public License
22  * along with Heritrix; if not, write to the Free Software
23  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
24  */ 
25  package org.archive.net;
26  
27  import java.util.Arrays;
28  import java.util.BitSet;
29  
30  import org.apache.commons.httpclient.URI;
31  import org.apache.commons.httpclient.URIException;
32  import org.apache.commons.httpclient.util.EncodingUtil;
33  
34  /***
35   * URI subclass which allows partial/inconsistent encoding, matching
36   * the URIs which will be relayed in requests from popular web
37   * browsers (esp. Mozilla Firefox and MS IE).
38   * 
39   * @author gojomo
40   */
41  public class LaxURI extends URI {
42  
43      private static final long serialVersionUID = 5273922211722239537L;
44      
45      final protected static char[] HTTP_SCHEME = {'h','t','t','p'};
46      final protected static char[] HTTPS_SCHEME = {'h','t','t','p','s'};
47      
48      protected static final BitSet lax_rel_segment = new BitSet(256);
49      // Static initializer for lax_rel_segment
50      static {
51          lax_rel_segment.or(rel_segment);
52          lax_rel_segment.set(':'); // allow ':'
53          // TODO: add additional allowances as need is demonstrated
54      }
55  
56      protected static final BitSet lax_abs_path = new BitSet(256);
57      static {
58          lax_abs_path.or(abs_path);
59          lax_abs_path.set('|'); // tests indicate Firefox (1.0.6) doesn't escape.
60      }
61      
62      protected static final BitSet lax_query = new BitSet(256);
63      static {
64          lax_query.or(query);
65          lax_query.set('{'); // tests indicate FF doesn't escape { in query
66          lax_query.set('}'); // tests indicate FF doesn't escape } in query
67          lax_query.set('|'); // tests indicate FF doesn't escape | in query
68          lax_query.set('['); // tests indicate FF doesn't escape [ in query
69          lax_query.set(']'); // tests indicate FF doesn't escape ] in query
70          lax_query.set('^'); // tests indicate FF doesn't escape ^ in query
71      }
72      
73      // passthrough initializers
74      public LaxURI(String uri, boolean escaped, String charset)
75      throws URIException {
76          super(uri,escaped,charset);
77      }
78      public LaxURI(URI base, URI relative) throws URIException {
79          super(base,relative);
80      }
81      public LaxURI(String uri, boolean escaped) throws URIException {
82          super(uri,escaped);
83      }
84      public LaxURI() {
85          super();
86      }
87  
88      // overridden to use this class's static decode()
89      public String getURI() throws URIException {
90          return (_uri == null) ? null : decode(_uri, getProtocolCharset());
91      }
92      
93      // overridden to use this class's static decode()
94      public String getPath() throws URIException {
95          char[] p = getRawPath();
96          return (p == null) ? null : decode(p, getProtocolCharset());
97      }
98  
99      // overridden to use this class's static decode()
100     public String getPathQuery() throws URIException {
101         char[] rawPathQuery = getRawPathQuery();
102         return (rawPathQuery == null) ? null : decode(rawPathQuery,
103                 getProtocolCharset());
104     }
105     // overridden to use this class's static decode()
106     protected static String decode(char[] component, String charset)
107             throws URIException {
108         if (component == null) {
109             throw new IllegalArgumentException(
110                     "Component array of chars may not be null");
111         }
112         return decode(new String(component), charset);
113     }
114 
115     // overridden to use IA's LaxURLCodec, which never throws DecoderException
116     protected static String decode(String component, String charset)
117             throws URIException {
118         if (component == null) {
119             throw new IllegalArgumentException(
120                     "Component array of chars may not be null");
121         }
122         byte[] rawdata = null;
123         //     try {
124         rawdata = LaxURLCodec.decodeUrlLoose(EncodingUtil
125                 .getAsciiBytes(component));
126         //     } catch (DecoderException e) {
127         //         throw new URIException(e.getMessage());
128         //     }
129         return EncodingUtil.getString(rawdata, charset);
130     }
131     
132     // overidden to lax() the acceptable-char BitSet passed in
133     protected boolean validate(char[] component, BitSet generous) {
134         return super.validate(component, lax(generous));
135     }
136 
137     // overidden to lax() the acceptable-char BitSet passed in
138     protected boolean validate(char[] component, int soffset, int eoffset,
139             BitSet generous) {
140         return super.validate(component, soffset, eoffset, lax(generous));
141     }
142     
143     /***
144      * Given a BitSet -- typically one of the URI superclass's
145      * predefined static variables -- possibly replace it with
146      * a more-lax version to better match the character sets
147      * actually left unencoded in web browser requests
148      * 
149      * @param generous original BitSet
150      * @return (possibly more lax) BitSet to use
151      */
152     protected BitSet lax(BitSet generous) {
153         if (generous == rel_segment) {
154             // Swap in more lax allowable set
155             return lax_rel_segment;
156         }
157         if (generous == abs_path) {
158             return lax_abs_path;
159         }
160         if (generous == query) {
161             return lax_query;
162         }
163         // otherwise, leave as is
164         return generous;
165     }
166     
167     /*** 
168      * Coalesce the _host and _authority fields where 
169      * possible.
170      * 
171      * In the web crawl/http domain, most URIs have an 
172      * identical _host and _authority. (There is no port
173      * or user info.) However, the superclass always 
174      * creates two separate char[] instances. 
175      * 
176      * Notably, the lengths of these char[] fields are 
177      * equal if and only if their values are identical.
178      * This method makes use of this fact to reduce the
179      * two instances to one where possible, slimming 
180      * instances.  
181      * 
182      * @see org.apache.commons.httpclient.URI#parseAuthority(java.lang.String, boolean)
183      */
184     protected void parseAuthority(String original, boolean escaped)
185             throws URIException {
186         super.parseAuthority(original, escaped);
187         if (_host != null && _authority != null
188                 && _host.length == _authority.length) {
189             _host = _authority;
190         }
191     }
192     
193     
194     /*** 
195      * Coalesce _scheme to existing instances, where appropriate.
196      * 
197      * In the web-crawl domain, most _schemes are 'http' or 'https',
198      * but the superclass always creates a new char[] instance. For
199      * these two cases, we replace the created instance with a 
200      * long-lived instance from a static field, saving 12-14 bytes
201      * per instance. 
202      * 
203      * @see org.apache.commons.httpclient.URI#setURI()
204      */
205     protected void setURI() {
206         if (_scheme != null) {
207             if (_scheme.length == 4 && Arrays.equals(_scheme, HTTP_SCHEME)) {
208                 _scheme = HTTP_SCHEME;
209             } else if (_scheme.length == 5
210                     && Arrays.equals(_scheme, HTTP_SCHEME)) {
211                 _scheme = HTTPS_SCHEME;
212             }
213         }
214         super.setURI();
215     }
216     
217     /***
218      * IA OVERRIDDEN IN LaxURI TO INCLUDE FIX FOR 
219      * http://issues.apache.org/jira/browse/HTTPCLIENT-588
220      * AND
221      * http://webteam.archive.org/jira/browse/HER-1268
222      * 
223      * In order to avoid any possilbity of conflict with non-ASCII characters,
224      * Parse a URI reference as a <code>String</code> with the character
225      * encoding of the local system or the document.
226      * <p>
227      * The following line is the regular expression for breaking-down a URI
228      * reference into its components.
229      * <p><blockquote><pre>
230      *   ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?
231      *    12            3  4          5       6  7        8 9
232      * </pre></blockquote><p>
233      * For example, matching the above expression to
234      *   http://jakarta.apache.org/ietf/uri/#Related
235      * results in the following subexpression matches:
236      * <p><blockquote><pre>
237      *               $1 = http:
238      *  scheme    =  $2 = http
239      *               $3 = //jakarta.apache.org
240      *  authority =  $4 = jakarta.apache.org
241      *  path      =  $5 = /ietf/uri/
242      *               $6 = <undefined>
243      *  query     =  $7 = <undefined>
244      *               $8 = #Related
245      *  fragment  =  $9 = Related
246      * </pre></blockquote><p>
247      *
248      * @param original the original character sequence
249      * @param escaped <code>true</code> if <code>original</code> is escaped
250      * @throws URIException If an error occurs.
251      */
252     protected void parseUriReference(String original, boolean escaped)
253         throws URIException {
254 
255         // validate and contruct the URI character sequence
256         if (original == null) {
257             throw new URIException("URI-Reference required");
258         }
259 
260         /* @
261          *  ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?
262          */
263         String tmp = original.trim();
264         
265         /*
266          * The length of the string sequence of characters.
267          * It may not be equal to the length of the byte array.
268          */
269         int length = tmp.length();
270 
271         /*
272          * Remove the delimiters like angle brackets around an URI.
273          */
274         if (length > 0) {
275             char[] firstDelimiter = { tmp.charAt(0) };
276             if (validate(firstDelimiter, delims)) {
277                 if (length >= 2) {
278                     char[] lastDelimiter = { tmp.charAt(length - 1) };
279                     if (validate(lastDelimiter, delims)) {
280                         tmp = tmp.substring(1, length - 1);
281                         length = length - 2;
282                     }
283                 }
284             }
285         }
286 
287         /*
288          * The starting index
289          */
290         int from = 0;
291 
292         /*
293          * The test flag whether the URI is started from the path component.
294          */
295         boolean isStartedFromPath = false;
296         int atColon = tmp.indexOf(':');
297         int atSlash = tmp.indexOf('/');
298         if ((atColon <= 0 && !tmp.startsWith("//"))
299             || (atSlash >= 0 && atSlash < atColon)) {
300             isStartedFromPath = true;
301         }
302 
303         /*
304          * <p><blockquote><pre>
305          *     @@@@@@@@
306          *  ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?
307          * </pre></blockquote><p>
308          */
309         int at = indexFirstOf(tmp, isStartedFromPath ? "/?#" : ":/?#", from);
310         if (at == -1) { 
311             at = 0;
312         }
313 
314         /*
315          * Parse the scheme.
316          * <p><blockquote><pre>
317          *  scheme    =  $2 = http
318          *              @
319          *  ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?
320          * </pre></blockquote><p>
321          */
322         if (at > 0 && at < length && tmp.charAt(at) == ':') {
323             char[] target = tmp.substring(0, at).toLowerCase().toCharArray();
324             if (validate(target, scheme)) {
325                 _scheme = target;
326                 from = ++at;
327             } else {
328                 // IA CHANGE:
329                 // do nothing; allow interpretation as URI with 
330                 // later colon in other syntactical component
331             }
332             
333         }
334 
335         /*
336          * Parse the authority component.
337          * <p><blockquote><pre>
338          *  authority =  $4 = jakarta.apache.org
339          *                  @@
340          *  ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?
341          * </pre></blockquote><p>
342          */
343         // Reset flags
344         _is_net_path = _is_abs_path = _is_rel_path = _is_hier_part = false;
345         if (0 <= at && at < length && tmp.charAt(at) == '/') {
346             // Set flag
347             _is_hier_part = true;
348             if (at + 2 < length && tmp.charAt(at + 1) == '/' 
349                 && !isStartedFromPath) {
350                 // the temporary index to start the search from
351                 int next = indexFirstOf(tmp, "/?#", at + 2);
352                 if (next == -1) {
353                     next = (tmp.substring(at + 2).length() == 0) ? at + 2 
354                         : tmp.length();
355                 }
356                 parseAuthority(tmp.substring(at + 2, next), escaped);
357                 from = at = next;
358                 // Set flag
359                 _is_net_path = true;
360             }
361             if (from == at) {
362                 // Set flag
363                 _is_abs_path = true;
364             }
365         }
366 
367         /*
368          * Parse the path component.
369          * <p><blockquote><pre>
370          *  path      =  $5 = /ietf/uri/
371          *                                @@@@@@
372          *  ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?
373          * </pre></blockquote><p>
374          */
375         if (from < length) {
376             // rel_path = rel_segment [ abs_path ]
377             int next = indexFirstOf(tmp, "?#", from);
378             if (next == -1) {
379                 next = tmp.length();
380             }
381             if (!_is_abs_path) {
382                 if (!escaped 
383                     && prevalidate(tmp.substring(from, next), disallowed_rel_path) 
384                     || escaped 
385                     && validate(tmp.substring(from, next).toCharArray(), rel_path)) {
386                     // Set flag
387                     _is_rel_path = true;
388                 } else if (!escaped 
389                     && prevalidate(tmp.substring(from, next), disallowed_opaque_part) 
390                     || escaped 
391                     && validate(tmp.substring(from, next).toCharArray(), opaque_part)) {
392                     // Set flag
393                     _is_opaque_part = true;
394                 } else {
395                     // the path component may be empty
396                     _path = null;
397                 }
398             }
399             String s = tmp.substring(from, next);
400             if (escaped) {
401                 setRawPath(s.toCharArray());
402             } else {
403                 setPath(s);
404             }
405             at = next;
406         }
407 
408         // set the charset to do escape encoding
409         String charset = getProtocolCharset();
410 
411         /*
412          * Parse the query component.
413          * <p><blockquote><pre>
414          *  query     =  $7 = <undefined>
415          *                                        @@@@@@@@@
416          *  ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?
417          * </pre></blockquote><p>
418          */
419         if (0 <= at && at + 1 < length && tmp.charAt(at) == '?') {
420             int next = tmp.indexOf('#', at + 1);
421             if (next == -1) {
422                 next = tmp.length();
423             }
424             if (escaped) {
425                 _query = tmp.substring(at + 1, next).toCharArray();
426                 if (!validate(_query, query)) {
427                     throw new URIException("Invalid query");
428                 }
429             } else {
430                 _query = encode(tmp.substring(at + 1, next), allowed_query, charset);
431             }
432             at = next;
433         }
434 
435         /*
436          * Parse the fragment component.
437          * <p><blockquote><pre>
438          *  fragment  =  $9 = Related
439          *                                                   @@@@@@@@
440          *  ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?
441          * </pre></blockquote><p>
442          */
443         if (0 <= at && at + 1 <= length && tmp.charAt(at) == '#') {
444             if (at + 1 == length) { // empty fragment
445                 _fragment = "".toCharArray();
446             } else {
447                 _fragment = (escaped) ? tmp.substring(at + 1).toCharArray() 
448                     : encode(tmp.substring(at + 1), allowed_fragment, charset);
449             }
450         }
451 
452         // set this URI.
453         setURI();
454     }
455     
456 }