1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25 package org.archive.util;
26
27 import java.io.BufferedInputStream;
28 import java.io.BufferedOutputStream;
29 import java.io.BufferedReader;
30 import java.io.FileInputStream;
31 import java.io.FileOutputStream;
32 import java.io.IOException;
33 import java.io.InputStream;
34 import java.io.InputStreamReader;
35 import java.io.PrintStream;
36 import java.util.regex.Matcher;
37
38 /***
39 * Sort-friendly URI Reordering Transform.
40 *
41 * Converts URIs of the form:
42 *
43 * scheme://userinfo@domain.tld:port/path?query#fragment
44 *
45 * ...into...
46 *
47 * scheme://(tld,domain,:port@userinfo)/path?query#fragment
48 *
49 * The '(' ')' characters serve as an unambiguous notice that the so-called
50 * 'authority' portion of the URI ([userinfo@]host[:port] in http URIs) has
51 * been transformed; the commas prevent confusion with regular hostnames.
52 *
53 * This remedies the 'problem' with standard URIs that the host portion of a
54 * regular URI, with its dotted-domains, is actually in reverse order from
55 * the natural hierarchy that's usually helpful for grouping and sorting.
56 *
57 * The value of respecting URI case variance is considered negligible: it
58 * is vanishingly rare for case-variance to be meaningful, while URI case-
59 * variance often arises from people's confusion or sloppiness, and they
60 * only correct it insofar as necessary to avoid blatant problems. Thus
61 * the usual SURT form is considered to be flattened to all lowercase, and
62 * not completely reversible.
63 *
64 * @author gojomo
65 */
66 public class SURT {
67 static char DOT = '.';
68 static String BEGIN_TRANSFORMED_AUTHORITY = "(";
69 static String TRANSFORMED_HOST_DELIM = ",";
70 static String END_TRANSFORMED_AUTHORITY = ")";
71
72
73
74
75
76
77
78
79 static String URI_SPLITTER =
80 "^(//w+://)(?:([-//w//.!~//*'//(//)%;:&=+$,]+?)(@))?"+
81
82 "(?:((?://d{1,3}//.){3}//d{1,3})|(//S+?))(://d+)?(///S*)?$";
83
84
85
86
87
88
89
90
91
92
93
94
95 /***
96 * Utility method for creating the SURT form of the URI in the
97 * given String.
98 *
99 * By default, does not preserve casing.
100 *
101 * @param s String URI to be converted to SURT form
102 * @return SURT form
103 */
104 public static String fromURI(String s) {
105 return fromURI(s,false);
106 }
107
108 /***
109 * Utility method for creating the SURT form of the URI in the
110 * given String.
111 *
112 * If it appears a bit convoluted in its approach, note that it was
113 * optimized to minimize object-creation after allocation-sites profiling
114 * indicated this method was a top source of garbage in long-running crawls.
115 *
116 * Assumes that the String URI has already been cleaned/fixed (eg
117 * by UURI fixup) in ways that put it in its crawlable form for
118 * evaluation.
119 *
120 * @param s String URI to be converted to SURT form
121 * @param preserveCase whether original case should be preserved
122 * @return SURT form
123 */
124 public static String fromURI(String s, boolean preserveCase) {
125 Matcher m = TextUtils.getMatcher(URI_SPLITTER,s);
126 if(!m.matches()) {
127
128 TextUtils.recycleMatcher(m);
129 return s;
130 }
131
132
133
134 StringBuffer builder = new StringBuffer(s.length()+3);
135 append(builder,s,m.start(1),m.end(1));
136 builder.append(BEGIN_TRANSFORMED_AUTHORITY);
137
138 if(m.start(4)>-1) {
139
140 append(builder,s,m.start(4),m.end(4));
141 } else {
142
143 int hostSegEnd = m.end(5);
144 int hostStart = m.start(5);
145 for(int i = m.end(5)-1; i>=hostStart; i--) {
146 if(s.charAt(i-1)!=DOT && i > hostStart) {
147 continue;
148 }
149 append(builder,s,i,hostSegEnd);
150 builder.append(TRANSFORMED_HOST_DELIM);
151 hostSegEnd = i-1;
152 }
153 }
154
155 append(builder,s,m.start(6),m.end(6));
156 append(builder,s,m.start(3),m.end(3));
157 append(builder,s,m.start(2),m.end(2));
158 builder.append(END_TRANSFORMED_AUTHORITY);
159 append(builder,s,m.start(7),m.end(7));
160 if (!preserveCase) {
161 for(int i = 0; i < builder.length(); i++) {
162 builder.setCharAt(i,Character.toLowerCase(builder.charAt((i))));
163 }
164 }
165 TextUtils.recycleMatcher(m);
166 return builder.toString();
167 }
168
169 private static void append(StringBuffer b, CharSequence cs, int start,
170 int end) {
171 if (start < 0) {
172 return;
173 }
174 b.append(cs, start, end);
175 }
176
177 /***
178 * Allow class to be used as a command-line tool for converting
179 * URL lists (or naked host or host/path fragments implied
180 * to be HTTP URLs) to SURT form. Lines that cannot be converted
181 * are returned unchanged.
182 *
183 *
184 * Read from stdin or first file argument. Writes to stdout or
185 * second argument filename
186 *
187 * @param args cmd-line arguments
188 * @throws IOException
189 */
190 public static void main(String[] args) throws IOException {
191 InputStream in = args.length > 0 ? new BufferedInputStream(
192 new FileInputStream(args[0])) : System.in;
193 PrintStream out = args.length > 1 ? new PrintStream(
194 new BufferedOutputStream(new FileOutputStream(args[1])))
195 : System.out;
196 BufferedReader br =
197 new BufferedReader(new InputStreamReader(in));
198 String line;
199 while((line = br.readLine())!=null) {
200 if(line.indexOf("#")>0) line=line.substring(0,line.indexOf("#"));
201 line = line.trim();
202 if(line.length()==0) continue;
203 line = ArchiveUtils.addImpliedHttpIfNecessary(line);
204 out.println(SURT.fromURI(line));
205 }
206 br.close();
207 out.close();
208 }
209 }