View Javadoc

1   /* UriUtils
2    * 
3    * $Id: MimetypeUtils.java 3119 2005-02-17 20:39:21Z stack-sf $
4    * 
5    * Created on April 15, 2010
6    *
7    * Copyright (C) 2010 Internet Archive.
8    * 
9    * This file is part of the Heritrix web crawler (crawler.archive.org).
10   * 
11   * Heritrix is free software; you can redistribute it and/or modify
12   * it under the terms of the GNU Lesser Public License as published by
13   * the Free Software Foundation; either version 2.1 of the License, or
14   * any later version.
15   * 
16   * Heritrix is distributed in the hope that it will be useful, 
17   * but WITHOUT ANY WARRANTY; without even the implied warranty of
18   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19   * GNU Lesser Public License for more details.
20   * 
21   * You should have received a copy of the GNU Lesser Public License
22   * along with Heritrix; if not, write to the Free Software
23   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
24   */
25  package org.archive.util;
26  
27  import java.util.logging.Level;
28  import java.util.logging.Logger;
29  import java.util.regex.Matcher;
30  
31  import org.apache.commons.codec.DecoderException;
32  import org.apache.commons.httpclient.URIException;
33  import org.archive.net.LaxURLCodec;
34  import org.archive.net.UURI;
35  
36  
37  /***
38   * URI-related utilities. 
39   * 
40   * Primarily, a place to centralize and better document and test certain URI-related heuristics
41   * that may be useful in many places. 
42   * 
43   * The choice of when to consider a string likely enough to be a URI that we try crawling it 
44   * is, so far, based on rather arbitrary rules-of-thumb. We have not quantitatively tested 
45   * how often the strings that pass these tests yield meaningful (not 404, non-soft-404, 
46   * non-garbage) replies. We are willing to accept some level of mistaken requests, knowing
47   * that their cost is usually negligible, if that allows us to discover meaningful content
48   * that could be not be discovered via other heuristics. 
49   * 
50   *  Our intuitive understanding so far is that: strings that appear to have ./.. relative-path
51   *  prefixes, dot-extensions,  or path-slashes are good candidates for trying as URIs, even 
52   *  though with some Javascript/HTML-VALUE-attributes, this yields a lot of false positives. 
53   *  
54   *  We want to get strings like....
55   *  
56   *    photo.jpg
57   *    /photos
58   *    /photos/
59   *    ./photos
60   *    ../../photos
61   *    photos/index.html
62   *  
63   *  ...but we will thus also sometimes try strings that were other kinds of variables/
64   *  parameters, like...
65   *  
66   *    rectangle.x
67   *    11.2px
68   *    text/xml
69   *    width:6.33
70   * 
71   *  Until better rules, exception-blacklists or even site-sensitive dynamic adjustment of 
72   *  heuristics (eg: this site, guesses are yield 200s, keep guessing; this site, guesses are
73   *  all 404s, stop guessing) are developed, crawl operators should monitor their crawls 
74   *  (and contact email) for cases where speculative crawling are generating many errors, and
75   *  use settings like ExtractorHTML's 'extract-javascript' and 'extract-value-attributes' or
76   *  disable of ExtractorJS entirely when they want to curtail those errors. 
77   *  
78   *  The 'legacy' tests are those used in H1 at least through 1.14.4. They have
79   *  some known problems, but are not yet being dropped until more experience 
80   *  with the 'new' isLikelyUri() test is collected (in H3). Enable the 'xest'
81   *  methods of the UriUtilsTest class for details. 
82   *  
83   * @contributor gojomo
84   */
85  public class UriUtils {
86      private static final Logger LOGGER = Logger.getLogger(UriUtils.class.getName());
87  
88  //
89  // new combined test
90  //
91      // naive likely-uri test: 
92      //    no whitespace or '<' or '>'; 
93      //    at least one '.' or '/';
94      //    not ending with '.'
95      static final String NAIVE_LIKELY_URI_PATTERN = "[^<>//s]*[//./][^<>//s]*(?<!//.)";
96      
97      // blacklist of strings that NAIVE_LIKELY_URI_PATTERN picks up as URIs,
98      // which are known to be problematic, and NOT to be tried as URIs
99      protected final static String[] NAIVE_URI_EXCEPTIONS = {
100         "text/javascript"
101         };
102     
103     public static boolean isLikelyUri(CharSequence candidate) {
104         // naive test
105         if(!TextUtils.matches(NAIVE_LIKELY_URI_PATTERN, candidate)) {
106             return false; 
107         }
108         // eliminate common false-positives: by blacklist
109         for (String s : NAIVE_URI_EXCEPTIONS) {
110             if (s.contentEquals(candidate)) 
111                 return false;
112         }
113         // ...and simple numbers
114         if(TextUtils.matches("//d+//.//d+", candidate)) {
115             return false; 
116         }
117         return true; 
118     }
119     
120     
121     /***
122      * Perform additional fixup of likely-URI Strings
123      * 
124      * @param string detected candidate String
125      * @return String changed/decoded to increase likelihood it is a 
126      * meaningful non-404 URI
127      */
128     public static String speculativeFixup(String candidate, UURI base) {
129         String retVal = candidate;
130         
131         // unescape ampersands
132         retVal = TextUtils.replaceAll("&amp;", retVal, "&");
133         
134         // uri-decode if begins with encoded 'http(s)?%3A'
135         Matcher m = TextUtils.getMatcher("(?i)^https?%3A.*",retVal); 
136         if(m.matches()) {
137             try {
138                 retVal = LaxURLCodec.DEFAULT.decode(retVal);
139             } catch (DecoderException e) {
140                 LOGGER.log(Level.INFO,"unable to decode",e);
141             }
142         }
143         TextUtils.recycleMatcher(m);
144         
145         // TODO: more URI-decoding if there are %-encoded parts?
146         
147         // detect scheme-less intended-absolute-URI
148         // intent: "opens with what looks like a dotted-domain, and 
149         // last segment is a top-level-domain (eg "com", "org", etc)" 
150         m = TextUtils.getMatcher(
151                 "^[^//./://s%]+//.[^/://s%]+//.([^//./://s%]+)(/.*|)$", 
152                 retVal);
153         if(m.matches()) {
154             if(ArchiveUtils.isTld(m.group(1))) { 
155                 String schemePlus = "http://";       
156                 // if on exact same host preserve scheme (eg https)
157                 try {
158                     if (retVal.startsWith(base.getHost())) {
159                         schemePlus = base.getScheme() + "://";
160                     }
161                 } catch (URIException e) {
162                     // error retrieving source host - ignore it
163                 }
164                 retVal = schemePlus + retVal; 
165             }
166         }
167         TextUtils.recycleMatcher(m);
168         
169         return retVal; 
170     }
171     
172     
173 //
174 // legacy likely-URI test from ExtractorJS
175 //
176     // determines whether a string is likely URI
177     // (no whitespace or '<' '>',  has an internal dot or some slash,
178     // begins and ends with either '/' or a word-char)
179     static final String STRING_URI_DETECTOR =
180         "(?://w|[//.]{0,2}/)[//S&&[^<>]]*(?://.|/)[//S&&[^<>]]*(?://w|/)";
181 
182  
183     // blacklist of strings that STRING_URI_DETECTOR picks up as URIs,
184     // which are known to be problematic, and NOT to be 
185     // added to outLinks
186     protected final static String[] STRING_URI_DETECTOR_EXCEPTIONS = {
187         "text/javascript"
188         };
189     
190     public static boolean isLikelyUriJavascriptContextLegacy(CharSequence candidate) {
191     	if(!TextUtils.matches(STRING_URI_DETECTOR,candidate)) {
192     		return false; 
193     	}
194     	for (String s : STRING_URI_DETECTOR_EXCEPTIONS) {
195             if (s.contentEquals(candidate)) 
196                 return false;
197         }
198     	// matches detector and not an exception: so a likely URI
199     	return true; 
200     	
201     }
202     
203 //
204 // legacy likely-URI test from ExtractorHTML
205 // 
206 	
207     // much like the javascript likely-URI extractor, but
208     // without requiring quotes -- this can indicate whether
209     // an HTML tag attribute that isn't definitionally a
210     // URI might be one anyway, as in form-tag VALUE attributes
211     static final String LIKELY_URI_PATH =
212      "(//.{0,2}[^//.//n//r//s\"']*(//.[^//.//n//r//s\"']+)+)";
213 	
214 	public static boolean isLikelyUriHtmlContextLegacy(CharSequence candidate) {
215 		return TextUtils.matches(LIKELY_URI_PATH, candidate);
216 	}
217 }