View Javadoc

1   /* HTTPRecorder
2    *
3    * $Id: HttpRecorder.java 7149 2011-05-03 17:19:15Z nlevitt $
4    *
5    * Created on Sep 22, 2003
6    *
7    * Copyright (C) 2003 Internet Archive.
8    *
9    * This file is part of the Heritrix web crawler (crawler.archive.org).
10   *
11   * Heritrix is free software; you can redistribute it and/or modify
12   * it under the terms of the GNU Lesser Public License as published by
13   * the Free Software Foundation; either version 2.1 of the License, or
14   * any later version.
15   *
16   * Heritrix is distributed in the hope that it will be useful,
17   * but WITHOUT ANY WARRANTY; without even the implied warranty of
18   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19   * GNU Lesser Public License for more details.
20   *
21   * You should have received a copy of the GNU Lesser Public License
22   * along with Heritrix; if not, write to the Free Software
23   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
24   */
25  package org.archive.util;
26  
27  import java.io.BufferedInputStream;
28  import java.io.File;
29  import java.io.IOException;
30  import java.io.InputStream;
31  import java.io.InputStreamReader;
32  import java.io.OutputStream;
33  import java.nio.charset.Charset;
34  import java.util.logging.Level;
35  import java.util.logging.Logger;
36  
37  import org.archive.io.RecordingInputStream;
38  import org.archive.io.RecordingOutputStream;
39  import org.archive.io.ReplayCharSequence;
40  import org.archive.io.ReplayInputStream;
41  
42  import com.google.common.base.Charsets;
43  
44  
45  /***
46   * Pairs together a RecordingInputStream and RecordingOutputStream
47   * to capture exactly a single HTTP transaction.
48   *
49   * Initially only supports HTTP/1.0 (one request, one response per stream)
50   *
51   * Call {@link #markContentBegin()} to demarcate the transition between HTTP
52   * header and body.
53   *
54   * @author gojomo
55   */
56  public class HttpRecorder {
57      protected static Logger logger =
58          Logger.getLogger("org.archive.util.HttpRecorder");
59  
60      private static final int DEFAULT_OUTPUT_BUFFER_SIZE = 4096;
61      private static final int DEFAULT_INPUT_BUFFER_SIZE = 65536;
62  
63      private RecordingInputStream ris = null;
64      private RecordingOutputStream ros = null;
65  
66      /***
67       * Backing file basename.
68       *
69       * Keep it around so can clean up backing files left on disk.
70       */
71      private String backingFileBasename = null;
72  
73      /***
74       * Backing file output stream suffix.
75       */
76      private static final String RECORDING_OUTPUT_STREAM_SUFFIX = ".ros";
77  
78     /***
79      * Backing file input stream suffix.
80      */
81      private static final String RECORDING_INPUT_STREAM_SUFFIX = ".ris";
82  
83      /***
84       * Response character encoding.
85       */
86      private String characterEncoding = null;
87  
88      /***
89       * Constructor with limited access.
90       * Used internally for case where we're wrapping an already
91       * downloaded stream with a HttpRecorder.
92       */
93      protected HttpRecorder() {
94          super();
95      }
96      
97      /***
98       * Create an HttpRecorder.
99       *
100      * @param tempDir Directory into which we drop backing files for
101      * recorded input and output.
102      * @param backingFilenameBase Backing filename base to which we'll append
103      * suffices <code>ris</code> for recorded input stream and
104      * <code>ros</code> for recorded output stream.
105      * @param outBufferSize Size of output buffer to use.
106      * @param inBufferSize Size of input buffer to use.
107      */
108     public HttpRecorder(File tempDir, String backingFilenameBase, 
109             int outBufferSize, int inBufferSize) {
110         super();
111         tempDir.mkdirs();
112         this.backingFileBasename =
113             (new File(tempDir.getPath(), backingFilenameBase))
114                 .getAbsolutePath();
115         this.ris = new RecordingInputStream(inBufferSize,
116             this.backingFileBasename + RECORDING_INPUT_STREAM_SUFFIX);
117         this.ros = new RecordingOutputStream(outBufferSize,
118             this.backingFileBasename + RECORDING_OUTPUT_STREAM_SUFFIX);
119     }
120 
121     /***
122      * Create an HttpRecorder.
123      * 
124      * @param tempDir
125      *            Directory into which we drop backing files for recorded input
126      *            and output.
127      * @param backingFilenameBase
128      *            Backing filename base to which we'll append suffices
129      *            <code>ris</code> for recorded input stream and
130      *            <code>ros</code> for recorded output stream.
131      */
132     public HttpRecorder(File tempDir, String backingFilenameBase) {
133         this(tempDir, backingFilenameBase, DEFAULT_INPUT_BUFFER_SIZE,
134                 DEFAULT_OUTPUT_BUFFER_SIZE);
135     }
136 
137     /***
138      * Wrap the provided stream with the internal RecordingInputStream
139      *
140      * open() throws an exception if RecordingInputStream is already open.
141      *
142      * @param is InputStream to wrap.
143      *
144      * @return The input stream wrapper which itself is an input stream.
145      * Pass this in place of the passed stream so input can be recorded.
146      *
147      * @throws IOException
148      */
149     public InputStream inputWrap(InputStream is) 
150     throws IOException {
151         logger.fine(Thread.currentThread().getName() + " wrapping input");
152         this.ris.open(is);
153         return this.ris;
154     }
155 
156     /***
157      * Wrap the provided stream with the internal RecordingOutputStream
158      *
159      * open() throws an exception if RecordingOutputStream is already open.
160      * 
161      * @param os The output stream to wrap.
162      *
163      * @return The output stream wrapper which is itself an output stream.
164      * Pass this in place of the passed stream so output can be recorded.
165      *
166      * @throws IOException
167      */
168     public OutputStream outputWrap(OutputStream os) 
169     throws IOException {
170         this.ros.open(os);
171         return this.ros;
172     }
173 
174     /***
175      * Close all streams.
176      */
177     public void close() {
178         logger.fine(Thread.currentThread().getName() + " closing");
179         try {
180             this.ris.close();
181         } catch (IOException e) {
182             // TODO: Can we not let the exception out of here and report it
183             // higher up in the caller?
184             DevUtils.logger.log(Level.SEVERE, "close() ris" +
185                 DevUtils.extraInfo(), e);
186         }
187         try {
188             this.ros.close();
189         } catch (IOException e) {
190             DevUtils.logger.log(Level.SEVERE, "close() ros" +
191                 DevUtils.extraInfo(), e);
192         }
193     }
194 
195     /***
196      * Return the internal RecordingInputStream
197      *
198      * @return A RIS.
199      */
200     public RecordingInputStream getRecordedInput() {
201         return this.ris;
202     }
203 
204     /***
205      * @return The RecordingOutputStream.
206      */
207     public RecordingOutputStream getRecordedOutput() {
208         return this.ros;
209     }
210 
211     /***
212      * Mark current position as the point where the HTTP headers end.
213      */
214     public void markContentBegin() {
215         this.ris.markContentBegin();
216     }
217 
218     public long getResponseContentLength() {
219         return this.ris.getResponseContentLength();
220     }
221 
222     /***
223      * Close both input and output recorders.
224      *
225      * Recorders are the output streams to which we are recording.
226      * {@link #close()} closes the stream that is being recorded and the
227      * recorder. This method explicitly closes the recorder only.
228      */
229     public void closeRecorders() {
230         try {
231             this.ris.closeRecorder();
232             this.ros.closeRecorder();
233         } catch (IOException e) {
234             DevUtils.warnHandle(e, "Convert to runtime exception?");
235         }
236     }
237 
238     /***
239      * Cleanup backing files.
240      *
241      * Call when completely done w/ recorder.  Removes any backing files that
242      * may have been dropped.
243      */
244     public void cleanup() {
245         this.close();
246         this.delete(this.backingFileBasename + RECORDING_OUTPUT_STREAM_SUFFIX);
247         this.delete(this.backingFileBasename + RECORDING_INPUT_STREAM_SUFFIX);
248     }
249 
250     /***
251      * Delete file if exists.
252      *
253      * @param name Filename to delete.
254      */
255     private void delete(String name) {
256         File f = new File(name);
257         if (f.exists()) {
258             f.delete();
259         }
260     }
261 
262     /***
263      * Get the current threads' HttpRecorder.
264      *
265      * @return This threads' HttpRecorder.  Returns null if can't find a
266      * HttpRecorder in current instance.
267      */
268     public static HttpRecorder getHttpRecorder() {
269         HttpRecorder recorder = null;
270         Thread thread = Thread.currentThread();
271         if (thread instanceof HttpRecorderMarker) {
272             recorder = ((HttpRecorderMarker)thread).getHttpRecorder();
273         }
274         return recorder;
275     }
276 
277     /***
278      * @param characterEncoding Character encoding of recording.
279      */
280     public void setCharacterEncoding(String characterEncoding) {
281         this.characterEncoding = characterEncoding;
282     }
283 
284     /***
285      * @return Returns the characterEncoding.
286      */
287     public String getCharacterEncoding() {
288         return this.characterEncoding;
289     }
290 
291     /***
292      * @return A ReplayCharSequence.  Call close on the RCS when done w/ it.
293      * Will return indeterminate results if the underlying recording streams
294      * have not been closed first.
295      * @throws IOException
296      * @throws IOException
297      */
298     public ReplayCharSequence getReplayCharSequence() throws IOException {
299         return getRecordedInput().
300             getReplayCharSequence(this.characterEncoding);
301     }
302     
303     /***
304      * @return A replay input stream.
305      * @throws IOException
306      */
307     public ReplayInputStream getReplayInputStream() throws IOException {
308         return getRecordedInput().getReplayInputStream();
309     }
310 
311     /***
312      * Return a short prefix of the presumed-textual content as a String.
313      * 
314      * @param size max length of String to return 
315      * @return String prefix, or empty String (with logged exception) on any error
316      */
317     public String getContentReplayPrefixString(int size) {
318         Charset charset = Charsets.ISO_8859_1;
319         if (characterEncoding != null) {
320             try {
321                 charset = Charset.forName(characterEncoding);
322             } catch (IllegalArgumentException e) {
323                 // revert to single-byte for unknown encodings
324             }
325         }
326 
327         try {
328             InputStreamReader isr =  new InputStreamReader(getRecordedInput().getContentReplayInputStream(), charset);
329             char[] chars = new char[size];
330             int count = isr.read(chars);
331             isr.close(); 
332             return new String(chars,0,count);
333         } catch (IOException e) {
334             logger.log(Level.SEVERE,"unable to get replay prefix string", e);
335             return ""; 
336         } 
337     }
338     
339     /***
340      * Record the input stream for later playback by an extractor, etc.
341      * This is convenience method used to setup an artificial HttpRecorder
342      * scenario used in unit tests, etc.
343      * @param dir Directory to write backing file to.
344      * @param basename of what we're recording.
345      * @param in Stream to read.
346      * @param encoding Stream encoding.
347      * @throws IOException
348      * @return An {@link org.archive.util.HttpRecorder}.
349      */
350     public static HttpRecorder wrapInputStreamWithHttpRecord(File dir,
351         String basename, InputStream in, String encoding)
352     throws IOException {
353         HttpRecorder rec = new HttpRecorder(dir, basename);
354         if (encoding != null && encoding.length() > 0) {
355             rec.setCharacterEncoding(encoding);
356         }
357         // Do not use FastBufferedInputStream here.  It does not
358         // support mark.
359         InputStream is = rec.inputWrap(new BufferedInputStream(in));
360         rec.markContentBegin();
361         
362         final int BUFFER_SIZE = 1024 * 4;
363         byte [] buffer = new byte[BUFFER_SIZE];
364         while(true) {
365             // Just read it all down.
366             int x = is.read(buffer);
367             if (x == -1) {
368                 break;
369             }
370         }
371         is.close();
372         return rec;
373     }
374 }