View Javadoc

1   /* SURT
2   *
3   * $Id: SURT.java 4919 2007-02-20 23:25:20Z gojomo $
4   *
5   * Created on Jul 16, 2004
6   *
7   * Copyright (C) 2004 Internet Archive.
8   *
9   * This file is part of the Heritrix web crawler (crawler.archive.org).
10  *
11  * Heritrix is free software; you can redistribute it and/or modify
12  * it under the terms of the GNU Lesser Public License as published by
13  * the Free Software Foundation; either version 2.1 of the License, or
14  * any later version.
15  *
16  * Heritrix is distributed in the hope that it will be useful,
17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19  * GNU Lesser Public License for more details.
20  *
21  * You should have received a copy of the GNU Lesser Public License
22  * along with Heritrix; if not, write to the Free Software
23  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
24  */ 
25  package org.archive.util;
26  
27  import java.io.BufferedInputStream;
28  import java.io.BufferedOutputStream;
29  import java.io.BufferedReader;
30  import java.io.FileInputStream;
31  import java.io.FileOutputStream;
32  import java.io.IOException;
33  import java.io.InputStream;
34  import java.io.InputStreamReader;
35  import java.io.PrintStream;
36  import java.util.regex.Matcher;
37  
38  /***
39   * Sort-friendly URI Reordering Transform.
40   * 
41   * Converts URIs of the form:
42   * 
43   *   scheme://userinfo@domain.tld:port/path?query#fragment
44   * 
45   * ...into...
46   * 
47   *   scheme://(tld,domain,:port@userinfo)/path?query#fragment
48   * 
49   * The '(' ')' characters serve as an unambiguous notice that the so-called 
50   * 'authority' portion of the URI ([userinfo@]host[:port] in http URIs) has 
51   * been transformed; the commas prevent confusion with regular hostnames.
52   * 
53   * This remedies the 'problem' with standard URIs that the host portion of a 
54   * regular URI, with its dotted-domains, is actually in reverse order from 
55   * the natural hierarchy that's usually helpful for grouping and sorting.
56   * 
57   * The value of respecting URI case variance is considered negligible: it
58   * is vanishingly rare for case-variance to be meaningful, while URI case-
59   * variance often arises from people's confusion or sloppiness, and they
60   * only correct it insofar as necessary to avoid blatant problems. Thus 
61   * the usual SURT form is considered to be flattened to all lowercase, and 
62   * not completely reversible. 
63   * 
64   * @author gojomo
65   */
66  public class SURT {
67      static char DOT = '.';
68      static String BEGIN_TRANSFORMED_AUTHORITY = "(";
69      static String TRANSFORMED_HOST_DELIM = ",";
70      static String END_TRANSFORMED_AUTHORITY = ")";
71      
72      // 1: scheme://
73      // 2: userinfo (if present)
74      // 3: @ (if present)
75      // 4: dotted-quad host
76      // 5: other host
77      // 6: :port
78      // 7: path
79      static String URI_SPLITTER = 
80              "^(//w+://)(?:([-//w//.!~//*'//(//)%;:&=+$,]+?)(@))?"+
81      //        1           2                                 3    
82              "(?:((?://d{1,3}//.){3}//d{1,3})|(//S+?))(://d+)?(///S*)?$";
83      //           4                            5       6       7
84      
85      // RFC2396 
86      //       reserved    = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |
87      //                     "$" | ","
88      //       unreserved  = alphanum | mark
89      //       mark        = "-" | "_" | "." | "!" | "~" | "*" | "'" | "(" | ")"
90      //       userinfo    = *( unreserved | escaped |
91      //                     ";" | ":" | "&" | "=" | "+" | "$" | "," )
92      //       escaped     = "%" hex hex
93  
94  
95      /***
96       * Utility method for creating the SURT form of the URI in the
97       * given String.
98       * 
99       * By default, does not preserve casing. 
100      * 
101      * @param s String URI to be converted to SURT form
102      * @return SURT form 
103      */
104     public static String fromURI(String s) {
105         return fromURI(s,false);
106     }
107     
108     /***
109      * Utility method for creating the SURT form of the URI in the
110      * given String.
111      * 
112      * If it appears a bit convoluted in its approach, note that it was
113      * optimized to minimize object-creation after allocation-sites profiling 
114      * indicated this method was a top source of garbage in long-running crawls.
115      * 
116      * Assumes that the String URI has already been cleaned/fixed (eg
117      * by UURI fixup) in ways that put it in its crawlable form for 
118      * evaluation.
119      * 
120      * @param s String URI to be converted to SURT form
121      * @param preserveCase whether original case should be preserved
122      * @return SURT form 
123      */
124     public static String fromURI(String s, boolean preserveCase) {
125         Matcher m = TextUtils.getMatcher(URI_SPLITTER,s);
126         if(!m.matches()) {
127             // not an authority-based URI scheme; return unchanged
128             TextUtils.recycleMatcher(m);
129             return s;
130         }
131         // preallocate enough space for SURT form, which includes
132         // 3 extra characters ('(', ')', and one more ',' than '.'s
133         // in original)
134         StringBuffer builder = new StringBuffer(s.length()+3);
135         append(builder,s,m.start(1),m.end(1)); // scheme://
136         builder.append(BEGIN_TRANSFORMED_AUTHORITY); // '('
137         
138         if(m.start(4)>-1) {
139             // dotted-quad ip match: don't reverse
140             append(builder,s,m.start(4),m.end(4));
141         } else {
142             // other hostname match: do reverse
143             int hostSegEnd = m.end(5);
144             int hostStart = m.start(5); 
145             for(int i = m.end(5)-1; i>=hostStart; i--) {
146                 if(s.charAt(i-1)!=DOT && i > hostStart) {
147                     continue;
148                 }
149                 append(builder,s,i,hostSegEnd); // rev host segment
150                 builder.append(TRANSFORMED_HOST_DELIM);     // ','
151                 hostSegEnd = i-1;
152             }
153         }
154 
155         append(builder,s,m.start(6),m.end(6)); // :port
156         append(builder,s,m.start(3),m.end(3)); // at
157         append(builder,s,m.start(2),m.end(2)); // userinfo
158         builder.append(END_TRANSFORMED_AUTHORITY); // ')'
159         append(builder,s,m.start(7),m.end(7)); // path
160         if (!preserveCase) {
161             for(int i = 0; i < builder.length(); i++) {
162                 builder.setCharAt(i,Character.toLowerCase(builder.charAt((i))));
163             }
164         }
165         TextUtils.recycleMatcher(m);
166         return builder.toString();
167     }
168     
169     private static void append(StringBuffer b, CharSequence cs, int start, 
170             int end) {
171         if (start < 0) {
172             return;
173         }
174         b.append(cs, start, end);
175     }
176         
177     /***
178      * Allow class to be used as a command-line tool for converting 
179      * URL lists (or naked host or host/path fragments implied
180      * to be HTTP URLs) to SURT form. Lines that cannot be converted
181      * are returned unchanged. 
182      * 
183      *
184      * Read from stdin or first file argument. Writes to stdout or 
185      * second argument filename
186      * 
187      * @param args cmd-line arguments
188      * @throws IOException
189      */
190     public static void main(String[] args) throws IOException {
191         InputStream in = args.length > 0 ? new BufferedInputStream(
192                 new FileInputStream(args[0])) : System.in;
193         PrintStream out = args.length > 1 ? new PrintStream(
194                 new BufferedOutputStream(new FileOutputStream(args[1])))
195                 : System.out;
196         BufferedReader br =
197             new BufferedReader(new InputStreamReader(in));
198         String line;
199         while((line = br.readLine())!=null) {
200             if(line.indexOf("#")>0) line=line.substring(0,line.indexOf("#"));
201             line = line.trim();
202             if(line.length()==0) continue;
203             line = ArchiveUtils.addImpliedHttpIfNecessary(line);
204             out.println(SURT.fromURI(line));
205         }
206         br.close();
207         out.close();
208     }
209 }