1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25 package org.archive.net;
26
27 import java.util.Arrays;
28 import java.util.BitSet;
29
30 import org.apache.commons.httpclient.URI;
31 import org.apache.commons.httpclient.URIException;
32 import org.apache.commons.httpclient.util.EncodingUtil;
33
34 /***
35 * URI subclass which allows partial/inconsistent encoding, matching
36 * the URIs which will be relayed in requests from popular web
37 * browsers (esp. Mozilla Firefox and MS IE).
38 *
39 * @author gojomo
40 */
41 public class LaxURI extends URI {
42
43 private static final long serialVersionUID = 5273922211722239537L;
44
45 final protected static char[] HTTP_SCHEME = {'h','t','t','p'};
46 final protected static char[] HTTPS_SCHEME = {'h','t','t','p','s'};
47
48 protected static final BitSet lax_rel_segment = new BitSet(256);
49
50 static {
51 lax_rel_segment.or(rel_segment);
52 lax_rel_segment.set(':');
53
54 }
55
56 protected static final BitSet lax_abs_path = new BitSet(256);
57 static {
58 lax_abs_path.or(abs_path);
59 lax_abs_path.set('|');
60 }
61
62 protected static final BitSet lax_query = new BitSet(256);
63 static {
64 lax_query.or(query);
65 lax_query.set('{');
66 lax_query.set('}');
67 lax_query.set('|');
68 lax_query.set('[');
69 lax_query.set(']');
70 lax_query.set('^');
71 }
72
73
74 public LaxURI(String uri, boolean escaped, String charset)
75 throws URIException {
76 super(uri,escaped,charset);
77 }
78 public LaxURI(URI base, URI relative) throws URIException {
79 super(base,relative);
80 }
81 public LaxURI(String uri, boolean escaped) throws URIException {
82 super(uri,escaped);
83 }
84 public LaxURI() {
85 super();
86 }
87
88
89 public String getURI() throws URIException {
90 return (_uri == null) ? null : decode(_uri, getProtocolCharset());
91 }
92
93
94 public String getPath() throws URIException {
95 char[] p = getRawPath();
96 return (p == null) ? null : decode(p, getProtocolCharset());
97 }
98
99
100 public String getPathQuery() throws URIException {
101 char[] rawPathQuery = getRawPathQuery();
102 return (rawPathQuery == null) ? null : decode(rawPathQuery,
103 getProtocolCharset());
104 }
105
106 protected static String decode(char[] component, String charset)
107 throws URIException {
108 if (component == null) {
109 throw new IllegalArgumentException(
110 "Component array of chars may not be null");
111 }
112 return decode(new String(component), charset);
113 }
114
115
116 protected static String decode(String component, String charset)
117 throws URIException {
118 if (component == null) {
119 throw new IllegalArgumentException(
120 "Component array of chars may not be null");
121 }
122 byte[] rawdata = null;
123
124 rawdata = LaxURLCodec.decodeUrlLoose(EncodingUtil
125 .getAsciiBytes(component));
126
127
128
129 return EncodingUtil.getString(rawdata, charset);
130 }
131
132
133 protected boolean validate(char[] component, BitSet generous) {
134 return super.validate(component, lax(generous));
135 }
136
137
138 protected boolean validate(char[] component, int soffset, int eoffset,
139 BitSet generous) {
140 return super.validate(component, soffset, eoffset, lax(generous));
141 }
142
143 /***
144 * Given a BitSet -- typically one of the URI superclass's
145 * predefined static variables -- possibly replace it with
146 * a more-lax version to better match the character sets
147 * actually left unencoded in web browser requests
148 *
149 * @param generous original BitSet
150 * @return (possibly more lax) BitSet to use
151 */
152 protected BitSet lax(BitSet generous) {
153 if (generous == rel_segment) {
154
155 return lax_rel_segment;
156 }
157 if (generous == abs_path) {
158 return lax_abs_path;
159 }
160 if (generous == query) {
161 return lax_query;
162 }
163
164 return generous;
165 }
166
167 /***
168 * Coalesce the _host and _authority fields where
169 * possible.
170 *
171 * In the web crawl/http domain, most URIs have an
172 * identical _host and _authority. (There is no port
173 * or user info.) However, the superclass always
174 * creates two separate char[] instances.
175 *
176 * Notably, the lengths of these char[] fields are
177 * equal if and only if their values are identical.
178 * This method makes use of this fact to reduce the
179 * two instances to one where possible, slimming
180 * instances.
181 *
182 * @see org.apache.commons.httpclient.URI#parseAuthority(java.lang.String, boolean)
183 */
184 protected void parseAuthority(String original, boolean escaped)
185 throws URIException {
186 super.parseAuthority(original, escaped);
187 if (_host != null && _authority != null
188 && _host.length == _authority.length) {
189 _host = _authority;
190 }
191 }
192
193
194 /***
195 * Coalesce _scheme to existing instances, where appropriate.
196 *
197 * In the web-crawl domain, most _schemes are 'http' or 'https',
198 * but the superclass always creates a new char[] instance. For
199 * these two cases, we replace the created instance with a
200 * long-lived instance from a static field, saving 12-14 bytes
201 * per instance.
202 *
203 * @see org.apache.commons.httpclient.URI#setURI()
204 */
205 protected void setURI() {
206 if (_scheme != null) {
207 if (_scheme.length == 4 && Arrays.equals(_scheme, HTTP_SCHEME)) {
208 _scheme = HTTP_SCHEME;
209 } else if (_scheme.length == 5
210 && Arrays.equals(_scheme, HTTP_SCHEME)) {
211 _scheme = HTTPS_SCHEME;
212 }
213 }
214 super.setURI();
215 }
216
217 /***
218 * IA OVERRIDDEN IN LaxURI TO INCLUDE FIX FOR
219 * http://issues.apache.org/jira/browse/HTTPCLIENT-588
220 * AND
221 * http://webteam.archive.org/jira/browse/HER-1268
222 *
223 * In order to avoid any possilbity of conflict with non-ASCII characters,
224 * Parse a URI reference as a <code>String</code> with the character
225 * encoding of the local system or the document.
226 * <p>
227 * The following line is the regular expression for breaking-down a URI
228 * reference into its components.
229 * <p><blockquote><pre>
230 * ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?
231 * 12 3 4 5 6 7 8 9
232 * </pre></blockquote><p>
233 * For example, matching the above expression to
234 * http://jakarta.apache.org/ietf/uri/#Related
235 * results in the following subexpression matches:
236 * <p><blockquote><pre>
237 * $1 = http:
238 * scheme = $2 = http
239 * $3 = //jakarta.apache.org
240 * authority = $4 = jakarta.apache.org
241 * path = $5 = /ietf/uri/
242 * $6 = <undefined>
243 * query = $7 = <undefined>
244 * $8 = #Related
245 * fragment = $9 = Related
246 * </pre></blockquote><p>
247 *
248 * @param original the original character sequence
249 * @param escaped <code>true</code> if <code>original</code> is escaped
250 * @throws URIException If an error occurs.
251 */
252 protected void parseUriReference(String original, boolean escaped)
253 throws URIException {
254
255
256 if (original == null) {
257 throw new URIException("URI-Reference required");
258 }
259
260
261
262
263 String tmp = original.trim();
264
265
266
267
268
269 int length = tmp.length();
270
271
272
273
274 if (length > 0) {
275 char[] firstDelimiter = { tmp.charAt(0) };
276 if (validate(firstDelimiter, delims)) {
277 if (length >= 2) {
278 char[] lastDelimiter = { tmp.charAt(length - 1) };
279 if (validate(lastDelimiter, delims)) {
280 tmp = tmp.substring(1, length - 1);
281 length = length - 2;
282 }
283 }
284 }
285 }
286
287
288
289
290 int from = 0;
291
292
293
294
295 boolean isStartedFromPath = false;
296 int atColon = tmp.indexOf(':');
297 int atSlash = tmp.indexOf('/');
298 if ((atColon <= 0 && !tmp.startsWith("//"))
299 || (atSlash >= 0 && atSlash < atColon)) {
300 isStartedFromPath = true;
301 }
302
303
304
305
306
307
308
309 int at = indexFirstOf(tmp, isStartedFromPath ? "/?#" : ":/?#", from);
310 if (at == -1) {
311 at = 0;
312 }
313
314
315
316
317
318
319
320
321
322 if (at > 0 && at < length && tmp.charAt(at) == ':') {
323 char[] target = tmp.substring(0, at).toLowerCase().toCharArray();
324 if (validate(target, scheme)) {
325 _scheme = target;
326 from = ++at;
327 } else {
328
329
330
331 }
332
333 }
334
335
336
337
338
339
340
341
342
343
344 _is_net_path = _is_abs_path = _is_rel_path = _is_hier_part = false;
345 if (0 <= at && at < length && tmp.charAt(at) == '/') {
346
347 _is_hier_part = true;
348 if (at + 2 < length && tmp.charAt(at + 1) == '/'
349 && !isStartedFromPath) {
350
351 int next = indexFirstOf(tmp, "/?#", at + 2);
352 if (next == -1) {
353 next = (tmp.substring(at + 2).length() == 0) ? at + 2
354 : tmp.length();
355 }
356 parseAuthority(tmp.substring(at + 2, next), escaped);
357 from = at = next;
358
359 _is_net_path = true;
360 }
361 if (from == at) {
362
363 _is_abs_path = true;
364 }
365 }
366
367
368
369
370
371
372
373
374
375 if (from < length) {
376
377 int next = indexFirstOf(tmp, "?#", from);
378 if (next == -1) {
379 next = tmp.length();
380 }
381 if (!_is_abs_path) {
382 if (!escaped
383 && prevalidate(tmp.substring(from, next), disallowed_rel_path)
384 || escaped
385 && validate(tmp.substring(from, next).toCharArray(), rel_path)) {
386
387 _is_rel_path = true;
388 } else if (!escaped
389 && prevalidate(tmp.substring(from, next), disallowed_opaque_part)
390 || escaped
391 && validate(tmp.substring(from, next).toCharArray(), opaque_part)) {
392
393 _is_opaque_part = true;
394 } else {
395
396 _path = null;
397 }
398 }
399 String s = tmp.substring(from, next);
400 if (escaped) {
401 setRawPath(s.toCharArray());
402 } else {
403 setPath(s);
404 }
405 at = next;
406 }
407
408
409 String charset = getProtocolCharset();
410
411
412
413
414
415
416
417
418
419 if (0 <= at && at + 1 < length && tmp.charAt(at) == '?') {
420 int next = tmp.indexOf('#', at + 1);
421 if (next == -1) {
422 next = tmp.length();
423 }
424 if (escaped) {
425 _query = tmp.substring(at + 1, next).toCharArray();
426 if (!validate(_query, query)) {
427 throw new URIException("Invalid query");
428 }
429 } else {
430 _query = encode(tmp.substring(at + 1, next), allowed_query, charset);
431 }
432 at = next;
433 }
434
435
436
437
438
439
440
441
442
443 if (0 <= at && at + 1 <= length && tmp.charAt(at) == '#') {
444 if (at + 1 == length) {
445 _fragment = "".toCharArray();
446 } else {
447 _fragment = (escaped) ? tmp.substring(at + 1).toCharArray()
448 : encode(tmp.substring(at + 1), allowed_fragment, charset);
449 }
450 }
451
452
453 setURI();
454 }
455
456 }