1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25 package org.archive.net;
26
27 import java.io.File;
28 import java.io.Serializable;
29 import java.net.URI;
30 import java.net.URISyntaxException;
31 import java.util.logging.Level;
32 import java.util.logging.Logger;
33
34 import org.apache.commons.httpclient.URIException;
35 import org.archive.crawler.datamodel.CandidateURI;
36 import org.archive.util.SURT;
37 import org.archive.util.TextUtils;
38
39
40 /***
41 * Usable URI.
42 *
43 * This class wraps {@link org.apache.commons.httpclient.URI} adding caching
44 * and methods. It cannot be instantiated directly. Go via UURIFactory.
45 *
46 * <p>We used to use {@link java.net.URI} for parsing URIs but ran across
47 * quirky behaviors and bugs. {@link java.net.URI} is not subclassable --
48 * its final -- and its unlikely that java.net.URI will change any time soon
49 * (See Gordon's considered petition here:
50 * <a href="http://developer.java.sun.com/developer/bugParade/bugs/4939847.html">java.net.URI
51 * should have loose/tolerant/compatibility option (or allow reuse)</a>).
52 *
53 * <p>This class tries to cache calculated strings such as the extracted host
54 * and this class as a string rather than have the parent class rerun its
55 * calculation everytime.
56 *
57 * @author gojomo
58 * @author stack
59 *
60 * @see org.apache.commons.httpclient.URI
61 */
62 public class UURI extends LaxURI
63 implements CharSequence, Serializable {
64
65 private static final long serialVersionUID = -1277570889914647093L;
66
67 private static Logger LOGGER =
68 Logger.getLogger(UURI.class.getName());
69
70 /***
71 * Consider URIs too long for IE as illegal.
72 */
73 public final static int MAX_URL_LENGTH = 2083;
74
75 public static final String MASSAGEHOST_PATTERN = "^www//d*//.";
76
77 /***
78 * Cache of the host name.
79 *
80 * Super class calculates on every call. Profiling shows us spend 30% of
81 * total elapsed time in URI class.
82 */
83 private transient String cachedHost = null;
84
85 /***
86 * Cache of this uuri escaped as a string.
87 *
88 * Super class calculates on every call. Profiling shows us spend 30% of
89 * total elapsed time in URI class.
90 */
91 private transient String cachedEscapedURI = null;
92
93 /***
94 * Cache of this uuri escaped as a string.
95 *
96 * Super class calculates on every call. Profiling shows us spend 30% of
97 * total elapsed time in URI class.
98 */
99 private transient String cachedString = null;
100
101 /***
102 * Cached authority minus userinfo.
103 */
104 private transient String cachedAuthorityMinusUserinfo = null;
105
106 /***
107 * Cache of this uuri in SURT format
108 */
109 private transient String surtForm = null;
110
111
112
113
114
115 static {
116 hostname.set('_');
117 }
118
119
120 /***
121 * Shutdown access to default constructor.
122 */
123 protected UURI() {
124 super();
125 }
126
127 /***
128 * @param uri String representation of an absolute URI.
129 * @param escaped If escaped.
130 * @param charset Charset to use.
131 * @throws org.apache.commons.httpclient.URIException
132 */
133 protected UURI(String uri, boolean escaped, String charset)
134 throws URIException {
135 super(uri, escaped, charset);
136 normalize();
137 }
138
139 /***
140 * @param relative String representation of URI.
141 * @param base Parent UURI to use derelativizing.
142 * @throws org.apache.commons.httpclient.URIException
143 */
144 protected UURI(UURI base, UURI relative) throws URIException {
145 super(base, relative);
146 normalize();
147 }
148
149 /***
150 * @param uri String representation of a URI.
151 * @param escaped If escaped.
152 * @throws NullPointerException
153 * @throws URIException
154 */
155 protected UURI(String uri, boolean escaped) throws URIException, NullPointerException {
156 super(uri,escaped);
157 normalize();
158 }
159
160 /***
161 * @param uri URI as string that is resolved relative to this UURI.
162 * @return UURI that uses this UURI as base.
163 * @throws URIException
164 */
165 public UURI resolve(String uri)
166 throws URIException {
167 return resolve(uri, false,
168 this.getProtocolCharset());
169 }
170
171 /***
172 * @param uri URI as string that is resolved relative to this UURI.
173 * @param e True if escaped.
174 * @return UURI that uses this UURI as base.
175 * @throws URIException
176 */
177 public UURI resolve(String uri, boolean e)
178 throws URIException {
179 return resolve(uri, e, this.getProtocolCharset());
180 }
181
182 /***
183 * @param uri URI as string that is resolved relative to this UURI.
184 * @param e True if uri is escaped.
185 * @param charset Charset to use.
186 * @return UURI that uses this UURI as base.
187 * @throws URIException
188 */
189 public UURI resolve(String uri, boolean e, String charset)
190 throws URIException {
191 return new UURI(this, new UURI(uri, e, charset));
192 }
193
194 /***
195 * Test an object if this UURI is equal to another.
196 *
197 * @param obj an object to compare
198 * @return true if two URI objects are equal
199 */
200 public boolean equals(Object obj) {
201
202
203 if (obj == this) {
204 return true;
205 }
206 if (!(obj instanceof UURI)) {
207 return false;
208 }
209 UURI another = (UURI) obj;
210
211 if (!equals(this._scheme, another._scheme)) {
212 return false;
213 }
214
215 if (!equals(this._opaque, another._opaque)) {
216 return false;
217 }
218
219
220 if (!equals(this._authority, another._authority)) {
221 return false;
222 }
223
224 if (!equals(this._path, another._path)) {
225 return false;
226 }
227
228 if (!equals(this._query, another._query)) {
229 return false;
230 }
231
232 return true;
233 }
234
235 /***
236 * Strips www variants from the host.
237 *
238 * Strips www[0-9]*\. from the host. If calling getHostBaseName becomes a
239 * performance issue we should consider adding the hostBasename member that
240 * is set on initialization.
241 *
242 * @return Host's basename.
243 * @throws URIException
244 */
245 public String getHostBasename() throws URIException {
246
247
248
249
250 return (this.getReferencedHost() == null)
251 ? null
252 : TextUtils.replaceFirst(MASSAGEHOST_PATTERN,
253 this.getReferencedHost(), UURIFactory.EMPTY_STRING);
254 }
255
256 /***
257 * Override to cache result
258 *
259 * @return String representation of this URI
260 */
261 public synchronized String toString() {
262 if (this.cachedString == null) {
263 this.cachedString = super.toString();
264 coalesceUriStrings();
265 }
266 return this.cachedString;
267 }
268
269 public synchronized String getEscapedURI() {
270 if (this.cachedEscapedURI == null) {
271 this.cachedEscapedURI = super.getEscapedURI();
272 coalesceUriStrings();
273 }
274 return this.cachedEscapedURI;
275 }
276
277 /***
278 * The two String fields cachedString and cachedEscapedURI are
279 * usually identical; if so, coalesce into a single instance.
280 */
281 protected void coalesceUriStrings() {
282 if (this.cachedString != null && this.cachedEscapedURI != null
283 && this.cachedString.length() == this.cachedEscapedURI.length()) {
284
285
286
287 this.cachedString = this.cachedEscapedURI;
288 }
289 }
290
291 public synchronized String getHost() throws URIException {
292 if (this.cachedHost == null) {
293
294
295 if (this._host != null) {
296 this.cachedHost = super.getHost();
297 coalesceHostAuthorityStrings();
298 }
299 }
300 return this.cachedHost;
301 }
302
303 /***
304 * The two String fields cachedHost and cachedAuthorityMinusUserInfo are
305 * usually identical; if so, coalesce into a single instance.
306 */
307 protected void coalesceHostAuthorityStrings() {
308 if (this.cachedAuthorityMinusUserinfo != null
309 && this.cachedHost != null
310 && this.cachedHost.length() ==
311 this.cachedAuthorityMinusUserinfo.length()) {
312
313
314 this.cachedAuthorityMinusUserinfo = this.cachedHost;
315 }
316 }
317
318 /***
319 * Return the referenced host in the UURI, if any, also extracting the
320 * host of a DNS-lookup URI where necessary.
321 *
322 * @return the target or topic host of the URI
323 * @throws URIException
324 */
325 public String getReferencedHost() throws URIException {
326 String referencedHost = this.getHost();
327 if(referencedHost==null && this.getScheme().equals("dns")) {
328
329 String possibleHost = this.getCurrentHierPath();
330 if(possibleHost != null && possibleHost.matches("[-_//w//.:]+")) {
331 referencedHost = possibleHost;
332 }
333 }
334 return referencedHost;
335 }
336
337 /***
338 * @return Return the 'SURT' format of this UURI
339 */
340 public String getSurtForm() {
341 if (surtForm == null) {
342 surtForm = SURT.fromURI(this.toString());
343 }
344 return surtForm;
345 }
346
347 /***
348 * Return the authority minus userinfo (if any).
349 *
350 * If no userinfo present, just returns the authority.
351 *
352 * @return The authority stripped of any userinfo if present.
353 * @throws URIException
354 */
355 public String getAuthorityMinusUserinfo()
356 throws URIException {
357 if (this.cachedAuthorityMinusUserinfo == null) {
358 String tmp = getAuthority();
359 if (tmp != null && tmp.length() > 0) {
360 int index = tmp.indexOf('@');
361 if (index >= 0 && index < tmp.length()) {
362 tmp = tmp.substring(index + 1);
363 }
364 }
365 this.cachedAuthorityMinusUserinfo = tmp;
366 coalesceHostAuthorityStrings();
367 }
368 return this.cachedAuthorityMinusUserinfo;
369 }
370
371
372
373
374 public int length() {
375 return getEscapedURI().length();
376 }
377
378
379
380
381 public char charAt(int index) {
382 return getEscapedURI().charAt(index);
383 }
384
385
386
387
388 public CharSequence subSequence(int start, int end) {
389 return getEscapedURI().subSequence(start,end);
390 }
391
392
393
394
395 public int compareTo(Object arg0) {
396 return getEscapedURI().compareTo(arg0.toString());
397 }
398
399 /***
400 * Convenience method for finding the UURI inside an
401 * Object likely to have (or be/imply) one.
402 *
403 * @param o Object that is, has, or implies a UURI
404 * @return the UURI found, or null if none
405 */
406 public static UURI from(Object o) {
407 UURI u = null;
408 if (o instanceof UURI) {
409 u = (UURI)o;
410 } else if (o instanceof CandidateURI) {
411 u = ((CandidateURI) o).getUURI();
412 } else if (o instanceof CharSequence) {
413 String s = o.toString();
414 try {
415 u = UURIFactory.getInstance(s);
416 } catch (URIException e) {
417 LOGGER.log(Level.FINE,"bad URI",e);
418 }
419 }
420 return u;
421 }
422
423 /***
424 * Test if passed String has likely URI scheme prefix.
425 * @param possibleUrl URL string to examine.
426 * @return True if passed string looks like it could be an URL.
427 */
428 public static boolean hasScheme(String possibleUrl) {
429 boolean result = false;
430 for (int i = 0; i < possibleUrl.length(); i++) {
431 char c = possibleUrl.charAt(i);
432 if (c == ':') {
433 if (i != 0) {
434 result = true;
435 }
436 break;
437 }
438 if (!scheme.get(c)) {
439 break;
440 }
441 }
442 return result;
443 }
444
445 /***
446 * @param pathOrUri A file path or a URI.
447 * @return Path parsed from passed <code>pathOrUri</code>.
448 * @throws URISyntaxException
449 */
450 public static String parseFilename(final String pathOrUri)
451 throws URISyntaxException {
452 String path = pathOrUri;
453 if (UURI.hasScheme(pathOrUri)) {
454 URI url = new URI(pathOrUri);
455 path = url.getPath();
456 }
457 return (new File(path)).getName();
458 }
459 }