1   /* $Id: ArchiveReaderFactory.java 4977 2007-03-09 23:57:28Z stack-sf $
2    *
3    * Created on August 18th, 2006
4    *
5    * Copyright (C) 2004 Internet Archive.
6    *
7    * This file is part of the Heritrix web crawler (crawler.archive.org).
8    *
9    * Heritrix is free software; you can redistribute it and/or modify
10   * it under the terms of the GNU Lesser Public License as published by
11   * the Free Software Foundation; either version 2.1 of the License, or
12   * any later version.
13   *
14   * Heritrix is distributed in the hope that it will be useful,
15   * but WITHOUT ANY WARRANTY; without even the implied warranty of
16   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17   * GNU Lesser Public License for more details.
18   *
19   * You should have received a copy of the GNU Lesser Public License
20   * along with Heritrix; if not, write to the Free Software
21   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
22   */
23  package org.archive.io;
24  
25  import it.unimi.dsi.fastutil.io.RepositionableStream;
26  
27  import java.io.File;
28  import java.io.IOException;
29  import java.io.InputStream;
30  import java.net.HttpURLConnection;
31  import java.net.MalformedURLException;
32  import java.net.URL;
33  import java.net.URLConnection;
34  
35  import org.archive.io.arc.ARCReaderFactory;
36  import org.archive.io.warc.WARCReaderFactory;
37  import org.archive.net.UURI;
38  import org.archive.net.md5.Md5URLConnection;
39  import org.archive.net.rsync.RsyncURLConnection;
40  import org.archive.util.FileUtils;
41  import org.archive.util.IoUtils;
42  
43  
44  /***
45   * Factory that returns an Archive file Reader.
46   * Returns Readers for ARCs or WARCs.
47   * @author stack
48   * @version $Date: 2007-03-09 23:57:28 +0000 (Fri, 09 Mar 2007) $ $Revision: 4977 $
49   */
50  public class ArchiveReaderFactory implements ArchiveFileConstants {
51  	/***
52  	 * Offset value for when we want to stream all.
53  	 */
54  	private final static int STREAM_ALL = -1;
55  	
56  	private static final ArchiveReaderFactory factory =
57  		new ArchiveReaderFactory();
58  	
59      /***
60       * Shutdown any public access to default constructor.
61       */
62      protected ArchiveReaderFactory() {
63          super();
64      }
65      
66      /***
67       * Get an Archive file Reader on passed path or url.
68       * Does primitive heuristic figuring if path or URL.
69       * @param arcFileOrUrl File path or URL pointing at an Archive file.
70       * @return An Archive file Reader.
71       * @throws IOException 
72       * @throws MalformedURLException 
73       * @throws IOException 
74       */
75      public static ArchiveReader get(final String arcFileOrUrl)
76      throws MalformedURLException, IOException {
77      	return ArchiveReaderFactory.factory.getArchiveReader(arcFileOrUrl);
78      }
79      
80      protected ArchiveReader getArchiveReader(final String arcFileOrUrl)
81      throws MalformedURLException, IOException {
82      	return getArchiveReader(arcFileOrUrl, STREAM_ALL);
83      }
84      
85      protected ArchiveReader getArchiveReader(final String arcFileOrUrl,
86      	final long offset)
87      throws MalformedURLException, IOException {
88      	return UURI.hasScheme(arcFileOrUrl)?
89      		get(new URL(arcFileOrUrl), offset):
90      			get(new File(arcFileOrUrl), offset);
91      }
92      
93      /***
94       * @param f An Archive file to read.
95       * @return An ArchiveReader
96       * @throws IOException 
97       */
98      public static ArchiveReader get(final File f) throws IOException {
99      	return ArchiveReaderFactory.factory.getArchiveReader(f);
100     }
101     
102     protected ArchiveReader getArchiveReader(final File f)
103     throws IOException {
104     	return getArchiveReader(f, 0);
105     }
106     
107     /***
108      * @param f An Archive file to read.
109      * @param offset Have returned Reader set to start reading at this offset.
110      * @return An ArchiveReader
111      * @throws IOException 
112      */
113     public static ArchiveReader get(final File f, final long offset)
114     throws IOException {
115     	return ArchiveReaderFactory.factory.getArchiveReader(f, offset);
116 	}
117     
118     protected ArchiveReader getArchiveReader(final File f,
119     	final long offset)
120     throws IOException {
121     	if (ARCReaderFactory.isARCSuffix(f.getName())) {
122     		return ARCReaderFactory.get(f, true, offset);
123     	} else if (WARCReaderFactory.isWARCSuffix(f.getName())) {
124     		return WARCReaderFactory.get(f, offset);
125     	}
126     	throw new IOException("Unknown file extension (Not ARC nor WARC): "
127     		+ f.getName());
128     }
129     
130     /***
131      * Wrap a Reader around passed Stream.
132      * @param s Identifying String for this Stream used in error messages.
133      * Must be a string that ends with the name of the file we're to put
134      * an ArchiveReader on.  This code looks at file endings to figure
135      * whether to return an ARC or WARC reader.
136      * @param is Stream.  Stream will be wrapped with implementation of
137      * RepositionableStream unless already supported.
138      * @param atFirstRecord Are we at first Record?
139      * @return ArchiveReader.
140      * @throws IOException
141      */
142     public static ArchiveReader get(final String s, final InputStream is,
143         final boolean atFirstRecord)
144     throws IOException {
145         return ArchiveReaderFactory.factory.getArchiveReader(s, is,
146         	atFirstRecord);
147     }
148     
149     /***
150      * @param is
151      * @return If passed <code>is</code> is
152      * {@link RepositionableInputStream}, returns <code>is</code>, else we
153      * wrap <code>is</code> with {@link RepositionableStream}.
154      */
155     protected InputStream asRepositionable(final InputStream is) {
156         if (is instanceof RepositionableStream) {
157             return is;
158         }
159         // RepositionableInputStream calls mark on each read so can back up at
160         // least the read amount.  Needed for gzip inflater overinflations
161         // reading into the next gzip member.
162         return new RepositionableInputStream(is, 16 * 1024);
163     }
164     
165     protected ArchiveReader getArchiveReader(final String id, 
166     		final InputStream is, final boolean atFirstRecord)
167     throws IOException {
168     	final InputStream stream = asRepositionable(is);
169         if (ARCReaderFactory.isARCSuffix(id)) {
170             return ARCReaderFactory.get(id, stream, atFirstRecord);
171         } else if (WARCReaderFactory.isWARCSuffix(id)) {
172             return WARCReaderFactory.get(id, stream, atFirstRecord);
173         }
174         throw new IOException("Unknown extension (Not ARC nor WARC): " + id);
175     }
176     
177     /***
178      * Get an Archive Reader aligned at <code>offset</code>.
179      * This version of get will not bring the file local but will try to
180      * stream across the net making an HTTP 1.1 Range request on remote
181      * http server (RFC1435 Section 14.35).
182      * @param u HTTP URL for an Archive file.
183      * @param offset Offset into file at which to start fetching.
184      * @return An ArchiveReader aligned at offset.
185      * @throws IOException
186      */
187     public static ArchiveReader get(final URL u, final long offset)
188     throws IOException {
189     	return ArchiveReaderFactory.factory.getArchiveReader(u, offset);
190     }
191     
192     protected ArchiveReader getArchiveReader(final URL f, final long offset)
193     throws IOException {
194         // Get URL connection.
195         URLConnection connection = f.openConnection();
196         if (!(connection instanceof HttpURLConnection)) {
197             throw new IOException("This method only handles HTTP connections.");
198         }
199         addUserAgent((HttpURLConnection)connection);
200         if (offset != STREAM_ALL) {
201         	// Use a Range request (Assumes HTTP 1.1 on other end). If
202         	// length >= 0, add open-ended range header to the request.  Else,
203         	// because end-byte is inclusive, subtract 1.
204         	connection.addRequestProperty("Range", "bytes=" + offset + "-");
205         }
206         
207         return getArchiveReader(f.toString(), connection.getInputStream(),
208             (offset == 0));
209     }
210     
211     /***
212      * Get an ARCReader.
213      * Pulls the ARC local into whereever the System Property
214      * <code>java.io.tmpdir</code> points. It then hands back an ARCReader that
215      * points at this local copy.  A close on this ARCReader instance will
216      * remove the local copy.
217      * @param u An URL that points at an ARC.
218      * @return An ARCReader.
219      * @throws IOException 
220      */
221     public static ArchiveReader get(final URL u)
222     throws IOException {
223     	return ArchiveReaderFactory.factory.getArchiveReader(u);
224     }
225     
226     protected ArchiveReader getArchiveReader(final URL u)
227     throws IOException {
228         // If url represents a local file then return file it points to.
229         if (u.getPath() != null) {
230             // TODO: Add scheme check and host check.
231             File f = new File(u.getPath());
232             if (f.exists()) {
233                 return get(f, 0);
234             }
235         }
236        
237         String scheme = u.getProtocol();
238         if (scheme.startsWith("http") || scheme.equals("s3")) {
239             // Try streaming if http or s3 URLs rather than copying local
240         	// and then reading (Passing an offset will get us an Reader
241         	// that wraps a Stream).
242             return get(u, STREAM_ALL);
243         }
244         
245         return makeARCLocal(u.openConnection());
246     }
247     
248     protected ArchiveReader makeARCLocal(final URLConnection connection)
249     throws IOException {
250         File localFile = null;
251         if (connection instanceof HttpURLConnection) {
252             // If http url connection, bring down the resource local.
253             String p = connection.getURL().getPath();
254             int index = p.lastIndexOf('/');
255             if (index >= 0) {
256                 // Name file for the file we're making local.
257                 localFile = new File(FileUtils.TMPDIR, p.substring(index + 1));
258                 if (localFile.exists()) {
259                     // If file of same name already exists in TMPDIR, then
260                     // clean it up (Assuming only reason a file of same name in
261                     // TMPDIR is because we failed a previous download).
262                     localFile.delete();
263                 }
264             } else {
265                 localFile = File.createTempFile(ArchiveReader.class.getName(),
266                     ".tmp", FileUtils.TMPDIR);
267             }
268             addUserAgent((HttpURLConnection)connection);
269             connection.connect();
270             try {
271                 IoUtils.readFullyToFile(connection.getInputStream(), localFile,
272                     new byte[16 * 1024]);
273             } catch (IOException ioe) {
274                 localFile.delete();
275                 throw ioe;
276             }
277         } else if (connection instanceof RsyncURLConnection) {
278             // Then, connect and this will create a local file.
279             // See implementation of the rsync handler.
280             connection.connect();
281             localFile = ((RsyncURLConnection)connection).getFile();
282         } else if (connection instanceof Md5URLConnection) {
283             // Then, connect and this will create a local file.
284             // See implementation of the md5 handler.
285             connection.connect();
286             localFile = ((Md5URLConnection)connection).getFile();
287         } else {
288             throw new UnsupportedOperationException("No support for " +
289                 connection);
290         }
291         
292         ArchiveReader reader = null;
293         try {
294             reader = get(localFile, 0);
295         } catch (IOException e) {
296             localFile.delete();
297             throw e;
298         }
299         
300         // Return a delegate that does cleanup of downloaded file on close.
301         return reader.getDeleteFileOnCloseReader(localFile);
302     }
303     
304     protected void addUserAgent(final HttpURLConnection connection) {
305         connection.addRequestProperty("User-Agent", this.getClass().getName());
306     }
307     
308     /***
309      * @param f File to test.
310      * @return True if <code>f</code> is compressed.
311      * @throws IOException
312      */
313     protected boolean isCompressed(final File f) throws IOException {
314         return f.getName().toLowerCase().
315         	endsWith(DOT_COMPRESSED_FILE_EXTENSION);
316     }
317 }