1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25 package org.archive.crawler.io;
26
27 import it.unimi.dsi.mg4j.util.MutableString;
28
29 import java.util.logging.Formatter;
30 import java.util.logging.LogRecord;
31
32 import org.archive.crawler.datamodel.CoreAttributeConstants;
33 import org.archive.crawler.datamodel.CrawlURI;
34 import org.archive.util.ArchiveUtils;
35 import org.archive.util.MimetypeUtils;
36
37 /***
38 * Formatter for 'crawl.log'. Expects completed CrawlURI as parameter.
39 *
40 * @author gojomo
41 */
42 public class UriProcessingFormatter
43 extends Formatter implements CoreAttributeConstants {
44 private final static String NA = "-";
45 /***
46 * Guess at line length (URIs are assumed avg. of 128 bytes).
47 * Used to preallocated the buffer we accumulate the log line
48 * in. Hopefully we get it right most of the time and no need
49 * to enlarge except in the rare case.
50 */
51 private final static int GUESS_AT_LOG_LENGTH =
52 17 + 1 + 3 + 1 + 10 + 128 + + 1 + 10 + 1 + 128 + 1 + 10 + 1 + 3 +
53 14 + 1 + 32 + 4 + 128 + 1 + 64 + 1;
54
55 /***
56 * Reuseable assembly buffer.
57 */
58 private final MutableString buffer =
59 new MutableString(GUESS_AT_LOG_LENGTH);
60
61 public String format(LogRecord lr) {
62 CrawlURI curi = (CrawlURI)lr.getParameters()[0];
63 String length = NA;
64 String mime = null;
65 if (curi.isHttpTransaction()) {
66 if(curi.getContentLength() >= 0) {
67 length = Long.toString(curi.getContentLength());
68 } else if (curi.getContentSize() > 0) {
69 length = Long.toString(curi.getContentSize());
70 }
71 mime = curi.getContentType();
72 } else {
73 if (curi.getContentSize() > 0) {
74 length = Long.toString(curi.getContentSize());
75 }
76 mime = curi.getContentType();
77 }
78 mime = MimetypeUtils.truncate(mime);
79
80 long time = System.currentTimeMillis();
81 String arcTimeAndDuration;
82 if(curi.containsKey(A_FETCH_COMPLETED_TIME)) {
83 long completedTime = curi.getLong(A_FETCH_COMPLETED_TIME);
84 long beganTime = curi.getLong(A_FETCH_BEGAN_TIME);
85 arcTimeAndDuration = ArchiveUtils.get17DigitDate(beganTime) + "+"
86 + Long.toString(completedTime - beganTime);
87 } else {
88 arcTimeAndDuration = NA;
89 }
90
91 String via = curi.flattenVia();
92
93 String digest = curi.getContentDigestSchemeString();
94
95 String sourceTag = curi.containsKey(A_SOURCE_TAG)
96 ? curi.getString(A_SOURCE_TAG)
97 : null;
98
99 String warc = curi.containsKey(A_WRITTEN_TO_WARC)
100 ? curi.getString(A_WRITTEN_TO_WARC)
101 : null;
102
103 this.buffer.length(0);
104 return this.buffer.append(ArchiveUtils.getLog17Date(time))
105 .append(" ")
106 .append(ArchiveUtils.padTo(curi.getFetchStatus(), 5))
107 .append(" ")
108 .append(ArchiveUtils.padTo(length, 10))
109 .append(" ")
110 .append(curi.getUURI().toString())
111 .append(" ")
112 .append(checkForNull(curi.getPathFromSeed()))
113 .append(" ")
114 .append(checkForNull(via))
115 .append(" ")
116 .append(mime)
117 .append(" ")
118 .append("#")
119
120 .append(ArchiveUtils.padTo(
121 Integer.toString(curi.getThreadNumber()), 3, '0'))
122 .append(" ")
123 .append(arcTimeAndDuration)
124 .append(" ")
125 .append(checkForNull(digest))
126 .append(" ")
127 .append(checkForNull(sourceTag))
128 .append(" ")
129 .append(checkForNull(curi.getAnnotations()))
130 .append(" ")
131 .append(checkForNull(warc))
132 .append("\n").toString();
133 }
134
135 /***
136 * @param str String to check.
137 * @return Return passed string or <code>NA</code> if null.
138 */
139 protected String checkForNull(String str) {
140 return (str == null || str.length() <= 0)? NA: str;
141 }
142 }
143
144