View Javadoc

1   /* UriProcessingFormatter.java
2    *
3    * $Id: UriProcessingFormatter.java 6905 2010-06-22 00:56:45Z nlevitt $
4    * 
5    * Created on Jun 10, 2003
6    *
7    * Copyright (C) 2003 Internet Archive.
8    *
9    * This file is part of the Heritrix web crawler (crawler.archive.org).
10   *
11   * Heritrix is free software; you can redistribute it and/or modify
12   * it under the terms of the GNU Lesser Public License as published by
13   * the Free Software Foundation; either version 2.1 of the License, or
14   * any later version.
15   *
16   * Heritrix is distributed in the hope that it will be useful,
17   * but WITHOUT ANY WARRANTY; without even the implied warranty of
18   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19   * GNU Lesser Public License for more details.
20   *
21   * You should have received a copy of the GNU Lesser Public License
22   * along with Heritrix; if not, write to the Free Software
23   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
24   */
25  package org.archive.crawler.io;
26  
27  import it.unimi.dsi.mg4j.util.MutableString;
28  
29  import java.util.logging.Formatter;
30  import java.util.logging.LogRecord;
31  
32  import org.archive.crawler.datamodel.CoreAttributeConstants;
33  import org.archive.crawler.datamodel.CrawlURI;
34  import org.archive.util.ArchiveUtils;
35  import org.archive.util.MimetypeUtils;
36  
37  /***
38   * Formatter for 'crawl.log'. Expects completed CrawlURI as parameter.
39   *
40   * @author gojomo
41   */
42  public class UriProcessingFormatter
43  extends Formatter implements CoreAttributeConstants {
44      private final static String NA = "-";
45      /***
46       * Guess at line length (URIs are assumed avg. of 128 bytes).
47       * Used to preallocated the buffer we accumulate the log line
48       * in.  Hopefully we get it right most of the time and no need
49       * to enlarge except in the rare case.
50       */
51      private final static int GUESS_AT_LOG_LENGTH =
52          17 + 1 + 3 + 1 + 10 + 128 + + 1 + 10 + 1 + 128 + 1 + 10 + 1 + 3 +
53          14 + 1 + 32 + 4 + 128 + 1 + 64 + 1;
54      
55      /***
56       * Reuseable assembly buffer.
57       */
58      private final MutableString buffer =
59          new MutableString(GUESS_AT_LOG_LENGTH);
60      
61      public String format(LogRecord lr) {
62          CrawlURI curi = (CrawlURI)lr.getParameters()[0];
63          String length = NA;
64          String mime = null;
65          if (curi.isHttpTransaction()) {
66              if(curi.getContentLength() >= 0) {
67                  length = Long.toString(curi.getContentLength());
68              } else if (curi.getContentSize() > 0) {
69                  length = Long.toString(curi.getContentSize());
70              }
71              mime = curi.getContentType();
72          } else {
73              if (curi.getContentSize() > 0) {
74                  length = Long.toString(curi.getContentSize());
75              } 
76              mime = curi.getContentType();
77          }
78          mime = MimetypeUtils.truncate(mime);
79  
80          long time = System.currentTimeMillis();
81          String arcTimeAndDuration;
82          if(curi.containsKey(A_FETCH_COMPLETED_TIME)) {
83              long completedTime = curi.getLong(A_FETCH_COMPLETED_TIME);
84              long beganTime = curi.getLong(A_FETCH_BEGAN_TIME);
85              arcTimeAndDuration = ArchiveUtils.get17DigitDate(beganTime) + "+"
86                      + Long.toString(completedTime - beganTime);
87          } else {
88              arcTimeAndDuration = NA;
89          }
90  
91          String via = curi.flattenVia();
92          
93          String digest = curi.getContentDigestSchemeString();
94  
95          String sourceTag = curi.containsKey(A_SOURCE_TAG) 
96                  ? curi.getString(A_SOURCE_TAG)
97                  : null;
98                  
99          String warc = curi.containsKey(A_WRITTEN_TO_WARC) 
100                 ? curi.getString(A_WRITTEN_TO_WARC) 
101                 : null;
102                 
103         this.buffer.length(0);
104         return this.buffer.append(ArchiveUtils.getLog17Date(time))
105             .append(" ")
106             .append(ArchiveUtils.padTo(curi.getFetchStatus(), 5))
107             .append(" ")
108             .append(ArchiveUtils.padTo(length, 10))
109             .append(" ")
110             .append(curi.getUURI().toString())
111             .append(" ")
112             .append(checkForNull(curi.getPathFromSeed()))
113             .append(" ")
114             .append(checkForNull(via))
115             .append(" ")
116             .append(mime)
117             .append(" ")
118             .append("#")
119             // Pad threads to be 3 digits.  For Igor.
120             .append(ArchiveUtils.padTo(
121                 Integer.toString(curi.getThreadNumber()), 3, '0'))
122             .append(" ")
123             .append(arcTimeAndDuration)
124             .append(" ")
125             .append(checkForNull(digest))
126             .append(" ")
127             .append(checkForNull(sourceTag))
128             .append(" ")
129             .append(checkForNull(curi.getAnnotations()))
130             .append(" ")
131             .append(checkForNull(warc))
132             .append("\n").toString();
133     }
134     
135     /***
136      * @param str String to check.
137      * @return Return passed string or <code>NA</code> if null.
138      */
139     protected String checkForNull(String str) {
140         return (str == null || str.length() <= 0)? NA: str;
141     }
142 }
143 
144