1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23 package org.archive.io;
24
25 import it.unimi.dsi.fastutil.io.RepositionableStream;
26
27 import java.io.File;
28 import java.io.IOException;
29 import java.io.InputStream;
30 import java.net.HttpURLConnection;
31 import java.net.MalformedURLException;
32 import java.net.URL;
33 import java.net.URLConnection;
34
35 import org.archive.io.arc.ARCReaderFactory;
36 import org.archive.io.warc.WARCReaderFactory;
37 import org.archive.net.UURI;
38 import org.archive.net.md5.Md5URLConnection;
39 import org.archive.net.rsync.RsyncURLConnection;
40 import org.archive.util.FileUtils;
41 import org.archive.util.IoUtils;
42
43
44 /***
45 * Factory that returns an Archive file Reader.
46 * Returns Readers for ARCs or WARCs.
47 * @author stack
48 * @version $Date: 2007-03-09 23:57:28 +0000 (Fri, 09 Mar 2007) $ $Revision: 4977 $
49 */
50 public class ArchiveReaderFactory implements ArchiveFileConstants {
51 /***
52 * Offset value for when we want to stream all.
53 */
54 private final static int STREAM_ALL = -1;
55
56 private static final ArchiveReaderFactory factory =
57 new ArchiveReaderFactory();
58
59 /***
60 * Shutdown any public access to default constructor.
61 */
62 protected ArchiveReaderFactory() {
63 super();
64 }
65
66 /***
67 * Get an Archive file Reader on passed path or url.
68 * Does primitive heuristic figuring if path or URL.
69 * @param arcFileOrUrl File path or URL pointing at an Archive file.
70 * @return An Archive file Reader.
71 * @throws IOException
72 * @throws MalformedURLException
73 * @throws IOException
74 */
75 public static ArchiveReader get(final String arcFileOrUrl)
76 throws MalformedURLException, IOException {
77 return ArchiveReaderFactory.factory.getArchiveReader(arcFileOrUrl);
78 }
79
80 protected ArchiveReader getArchiveReader(final String arcFileOrUrl)
81 throws MalformedURLException, IOException {
82 return getArchiveReader(arcFileOrUrl, STREAM_ALL);
83 }
84
85 protected ArchiveReader getArchiveReader(final String arcFileOrUrl,
86 final long offset)
87 throws MalformedURLException, IOException {
88 return UURI.hasScheme(arcFileOrUrl)?
89 get(new URL(arcFileOrUrl), offset):
90 get(new File(arcFileOrUrl), offset);
91 }
92
93 /***
94 * @param f An Archive file to read.
95 * @return An ArchiveReader
96 * @throws IOException
97 */
98 public static ArchiveReader get(final File f) throws IOException {
99 return ArchiveReaderFactory.factory.getArchiveReader(f);
100 }
101
102 protected ArchiveReader getArchiveReader(final File f)
103 throws IOException {
104 return getArchiveReader(f, 0);
105 }
106
107 /***
108 * @param f An Archive file to read.
109 * @param offset Have returned Reader set to start reading at this offset.
110 * @return An ArchiveReader
111 * @throws IOException
112 */
113 public static ArchiveReader get(final File f, final long offset)
114 throws IOException {
115 return ArchiveReaderFactory.factory.getArchiveReader(f, offset);
116 }
117
118 protected ArchiveReader getArchiveReader(final File f,
119 final long offset)
120 throws IOException {
121 if (ARCReaderFactory.isARCSuffix(f.getName())) {
122 return ARCReaderFactory.get(f, true, offset);
123 } else if (WARCReaderFactory.isWARCSuffix(f.getName())) {
124 return WARCReaderFactory.get(f, offset);
125 }
126 throw new IOException("Unknown file extension (Not ARC nor WARC): "
127 + f.getName());
128 }
129
130 /***
131 * Wrap a Reader around passed Stream.
132 * @param s Identifying String for this Stream used in error messages.
133 * Must be a string that ends with the name of the file we're to put
134 * an ArchiveReader on. This code looks at file endings to figure
135 * whether to return an ARC or WARC reader.
136 * @param is Stream. Stream will be wrapped with implementation of
137 * RepositionableStream unless already supported.
138 * @param atFirstRecord Are we at first Record?
139 * @return ArchiveReader.
140 * @throws IOException
141 */
142 public static ArchiveReader get(final String s, final InputStream is,
143 final boolean atFirstRecord)
144 throws IOException {
145 return ArchiveReaderFactory.factory.getArchiveReader(s, is,
146 atFirstRecord);
147 }
148
149 /***
150 * @param is
151 * @return If passed <code>is</code> is
152 * {@link RepositionableInputStream}, returns <code>is</code>, else we
153 * wrap <code>is</code> with {@link RepositionableStream}.
154 */
155 protected InputStream asRepositionable(final InputStream is) {
156 if (is instanceof RepositionableStream) {
157 return is;
158 }
159
160
161
162 return new RepositionableInputStream(is, 16 * 1024);
163 }
164
165 protected ArchiveReader getArchiveReader(final String id,
166 final InputStream is, final boolean atFirstRecord)
167 throws IOException {
168 final InputStream stream = asRepositionable(is);
169 if (ARCReaderFactory.isARCSuffix(id)) {
170 return ARCReaderFactory.get(id, stream, atFirstRecord);
171 } else if (WARCReaderFactory.isWARCSuffix(id)) {
172 return WARCReaderFactory.get(id, stream, atFirstRecord);
173 }
174 throw new IOException("Unknown extension (Not ARC nor WARC): " + id);
175 }
176
177 /***
178 * Get an Archive Reader aligned at <code>offset</code>.
179 * This version of get will not bring the file local but will try to
180 * stream across the net making an HTTP 1.1 Range request on remote
181 * http server (RFC1435 Section 14.35).
182 * @param u HTTP URL for an Archive file.
183 * @param offset Offset into file at which to start fetching.
184 * @return An ArchiveReader aligned at offset.
185 * @throws IOException
186 */
187 public static ArchiveReader get(final URL u, final long offset)
188 throws IOException {
189 return ArchiveReaderFactory.factory.getArchiveReader(u, offset);
190 }
191
192 protected ArchiveReader getArchiveReader(final URL f, final long offset)
193 throws IOException {
194
195 URLConnection connection = f.openConnection();
196 if (!(connection instanceof HttpURLConnection)) {
197 throw new IOException("This method only handles HTTP connections.");
198 }
199 addUserAgent((HttpURLConnection)connection);
200 if (offset != STREAM_ALL) {
201
202
203
204 connection.addRequestProperty("Range", "bytes=" + offset + "-");
205 }
206
207 return getArchiveReader(f.toString(), connection.getInputStream(),
208 (offset == 0));
209 }
210
211 /***
212 * Get an ARCReader.
213 * Pulls the ARC local into whereever the System Property
214 * <code>java.io.tmpdir</code> points. It then hands back an ARCReader that
215 * points at this local copy. A close on this ARCReader instance will
216 * remove the local copy.
217 * @param u An URL that points at an ARC.
218 * @return An ARCReader.
219 * @throws IOException
220 */
221 public static ArchiveReader get(final URL u)
222 throws IOException {
223 return ArchiveReaderFactory.factory.getArchiveReader(u);
224 }
225
226 protected ArchiveReader getArchiveReader(final URL u)
227 throws IOException {
228
229 if (u.getPath() != null) {
230
231 File f = new File(u.getPath());
232 if (f.exists()) {
233 return get(f, 0);
234 }
235 }
236
237 String scheme = u.getProtocol();
238 if (scheme.startsWith("http") || scheme.equals("s3")) {
239
240
241
242 return get(u, STREAM_ALL);
243 }
244
245 return makeARCLocal(u.openConnection());
246 }
247
248 protected ArchiveReader makeARCLocal(final URLConnection connection)
249 throws IOException {
250 File localFile = null;
251 if (connection instanceof HttpURLConnection) {
252
253 String p = connection.getURL().getPath();
254 int index = p.lastIndexOf('/');
255 if (index >= 0) {
256
257 localFile = new File(FileUtils.TMPDIR, p.substring(index + 1));
258 if (localFile.exists()) {
259
260
261
262 localFile.delete();
263 }
264 } else {
265 localFile = File.createTempFile(ArchiveReader.class.getName(),
266 ".tmp", FileUtils.TMPDIR);
267 }
268 addUserAgent((HttpURLConnection)connection);
269 connection.connect();
270 try {
271 IoUtils.readFullyToFile(connection.getInputStream(), localFile,
272 new byte[16 * 1024]);
273 } catch (IOException ioe) {
274 localFile.delete();
275 throw ioe;
276 }
277 } else if (connection instanceof RsyncURLConnection) {
278
279
280 connection.connect();
281 localFile = ((RsyncURLConnection)connection).getFile();
282 } else if (connection instanceof Md5URLConnection) {
283
284
285 connection.connect();
286 localFile = ((Md5URLConnection)connection).getFile();
287 } else {
288 throw new UnsupportedOperationException("No support for " +
289 connection);
290 }
291
292 ArchiveReader reader = null;
293 try {
294 reader = get(localFile, 0);
295 } catch (IOException e) {
296 localFile.delete();
297 throw e;
298 }
299
300
301 return reader.getDeleteFileOnCloseReader(localFile);
302 }
303
304 protected void addUserAgent(final HttpURLConnection connection) {
305 connection.addRequestProperty("User-Agent", this.getClass().getName());
306 }
307
308 /***
309 * @param f File to test.
310 * @return True if <code>f</code> is compressed.
311 * @throws IOException
312 */
313 protected boolean isCompressed(final File f) throws IOException {
314 return f.getName().toLowerCase().
315 endsWith(DOT_COMPRESSED_FILE_EXTENSION);
316 }
317 }