1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23 package org.archive.hcc.util;
24
25 import java.io.ByteArrayInputStream;
26 import java.io.ByteArrayOutputStream;
27 import java.io.File;
28 import java.io.FileInputStream;
29 import java.io.FileNotFoundException;
30 import java.io.FileOutputStream;
31 import java.io.FileWriter;
32 import java.io.IOException;
33 import java.io.InputStream;
34 import java.io.InputStreamReader;
35 import java.text.SimpleDateFormat;
36 import java.util.Collection;
37 import java.util.Date;
38 import java.util.HashMap;
39 import java.util.List;
40 import java.util.Map;
41 import java.util.Properties;
42 import java.util.jar.JarEntry;
43 import java.util.jar.JarOutputStream;
44 import java.util.logging.Level;
45 import java.util.logging.Logger;
46
47 public class OrderJarFactory {
48 public static String SETTINGS_DIRECTORY_PROPERTY= OrderJarFactory.class.getName() + ".settingsDefaultsDir";
49 private static Logger log =
50 Logger.getLogger(OrderJarFactory.class.getName());
51 public static final String NAME_KEY = "name";
52 public static final String OPERATOR_KEY = "operator";
53
54 public static final String DURATION_KEY = "duration";
55 public static final String TEST_CRAWL_KEY = "isTest";
56 public static final String ONE_HOP_OFF_KEY = "oneHopOff";
57
58 public static final String DOCUMENT_LIMIT_KEY = "documentLimitKey";
59 public static final String USER_AGENT_KEY = "userAgent";
60 public static final String FROM_EMAIL_KEY = "fromEmail";
61 public static final String DISK_PATH_KEY = "diskPath";
62 public static final String DESCRIPTION = "description";
63 public static final String ORGANIZATION = "organization";
64
65 public static final String SEEDS_KEY = "seeds";
66 public static final String HOST_CONSTRAINTS_KEY = "hostConstraints";
67
68
69 public OrderJarFactory() {
70 super();
71
72 }
73
74 public static File createOrderJar(Map parameters) {
75 try {
76
77
78 Map<String, InputStream> map = new HashMap<String, InputStream>();
79 InputStream orderPrototype = OrderJarFactory.class
80 .getResourceAsStream("/order.xml");
81 InputStreamReader reader = new InputStreamReader(orderPrototype);
82 char[] cbuf = new char[1024];
83 int read = -1;
84 StringBuffer b = new StringBuffer();
85 while ((read = reader.read(cbuf)) > -1) {
86 b.append(cbuf, 0, read);
87 }
88
89
90 String order = b.toString();
91 String date = new SimpleDateFormat("yyyyMMddhhmmss")
92 .format(new Date());
93
94 order = order.replace("$name", parameters.get(NAME_KEY).toString());
95 Object operator = parameters.get(OPERATOR_KEY);
96 if(operator == null){
97 operator = "No Operator Specified";
98 }
99 order = order.replace("$operator", operator.toString());
100
101 order = order.replace("$arcPrefix", parameters
102 .get(NAME_KEY)
103 .toString());
104 order = order.replace("$date", date);
105 Object isTest = parameters.get(TEST_CRAWL_KEY);
106
107 boolean isTestFlag = isTest != null && new Boolean(isTest.toString()).booleanValue();
108 order = order.replace("$writeEnabled", String.valueOf(!isTestFlag));
109
110 Object oneHopOff = parameters.get(ONE_HOP_OFF_KEY);
111
112 order = order.replace(
113 "$oneHopOff",
114 String.valueOf(
115 oneHopOff != null && new Boolean(oneHopOff.toString())));
116
117
118 order = order.replace("$date", date);
119
120 int duration = 60*60*24*3;
121 Object durationStr = parameters.get(DURATION_KEY);
122 if(durationStr != null){
123 duration = Integer.parseInt(durationStr.toString())/1000;
124 }
125
126 order = order.replace("$duration", duration +"");
127
128 int documentLimit = 0;
129
130 Object documentLimitStr = parameters.get(DOCUMENT_LIMIT_KEY);
131 if(documentLimitStr != null){
132 documentLimit = Integer.parseInt(documentLimitStr.toString());
133 }
134 order = order.replace("$documentLimit", documentLimit+"");
135
136 order = order.replace("$userAgent", parameters.get(USER_AGENT_KEY).toString());
137 order = order.replace("$fromEmail", parameters.get(FROM_EMAIL_KEY).toString());
138
139 String diskPath = (String)parameters.get(DISK_PATH_KEY);
140 if(diskPath == null){
141 diskPath = "";
142 }
143
144 order = order.replace("$diskPath", diskPath);
145
146 String organization = (String)parameters.get(ORGANIZATION);
147
148 if(organization == null){
149 organization = "";
150 }
151
152 order = order.replace("$organization", organization);
153
154 String description = (String)parameters.get(DESCRIPTION);
155
156 if(description == null){
157 description = "";
158 }
159
160 order = order.replace("$description", description);
161
162 ByteArrayOutputStream orderFileOs = new ByteArrayOutputStream();
163 orderFileOs.write(order.getBytes());
164 map.put("order.xml", new ByteArrayInputStream(orderFileOs
165 .toByteArray()));
166 orderFileOs.close();
167
168
169 ByteArrayOutputStream seedsOs = new ByteArrayOutputStream();
170 int count = 0;
171 Collection<String> seeds = (Collection<String>) parameters
172 .get(SEEDS_KEY);
173 for (String seed : seeds) {
174 if (count++ > 0) {
175 seedsOs.write("\n".getBytes());
176 }
177 seedsOs.write(seed.getBytes());
178
179 }
180
181 seedsOs.flush();
182 map.put(
183 "seeds.txt",
184 new ByteArrayInputStream(seedsOs.toByteArray()));
185 seedsOs.close();
186
187
188 File jarFile = File.createTempFile("order", ".jar");
189 JarOutputStream jos = new JarOutputStream(new FileOutputStream(
190 jarFile));
191 byte[] buf = new byte[1024];
192
193
194
195
196 Properties p =null;
197 try{
198 p = SmartPropertiesResolver.getProperties("hcc.properties");
199
200 }catch(RuntimeException ex){
201 log.info("hcc.properties not found");
202 }
203
204 if(p != null){
205 String defaultSettingsDirectoryRoot = p.getProperty(SETTINGS_DIRECTORY_PROPERTY);
206 addFilesFromSettingsDirectory(map, defaultSettingsDirectoryRoot);
207 }
208
209
210
211 File tempCrawlSettingsDirectoryRoot =
212 new File(System.getProperty("java.io.tmpdir") + File.separator + new Date().getTime());
213
214 tempCrawlSettingsDirectoryRoot.deleteOnExit();
215
216 Map<String,InputStream> files =
217 writeHostConstraints(
218 (List<HostConstraint>)parameters.get(HOST_CONSTRAINTS_KEY),
219 tempCrawlSettingsDirectoryRoot);
220
221
222
223 map.putAll(files);
224
225
226
227
228 for (String filename : map.keySet()) {
229
230 jos.putNextEntry(new JarEntry(filename));
231
232
233 InputStream in = map.get(filename);
234
235 int len;
236 while ((len = in.read(buf)) > 0) {
237 jos.write(buf, 0, len);
238 }
239
240
241 jos.closeEntry();
242 in.close();
243 }
244
245 jos.close();
246
247 if (log.isLoggable(Level.FINE)) {
248 log.fine("created jar file" + jarFile.getAbsolutePath());
249 }
250
251 return jarFile;
252
253 } catch (Exception e) {
254 if (log.isLoggable(Level.SEVERE)) {
255 log.severe(e.getMessage());
256 }
257
258 e.printStackTrace();
259 throw new RuntimeException(e);
260 }
261 }
262
263
264 protected static Map<String, InputStream> writeHostConstraints(List<HostConstraint> hostConstraints, File crawlSettingsDirectoryRoot) throws IOException{
265
266 Map<String, InputStream> files = new HashMap<String, InputStream>();
267 if(hostConstraints != null){
268
269 for(HostConstraint hc : hostConstraints){
270
271 File file = writeSettingsFile(hc, crawlSettingsDirectoryRoot);
272 files.put(hc.getSettingsFilePath(), new FileInputStream(file));
273
274
275 }
276 }
277
278 return files;
279
280 }
281
282 protected static File writeSettingsFile(HostConstraint hc, File crawlSettingsDirectoryRoot) throws IOException{
283
284 File directory = new File(crawlSettingsDirectoryRoot, hc.getSettingsFileDirectory());
285 directory.mkdirs();
286 File file = File.createTempFile("order", "xml", directory);
287 file.deleteOnExit();
288
289 FileWriter w = new FileWriter(file);
290
291 w.append("<?xml version=\"1.0\" encoding=\"UTF-8\"?>");
292 w.append("<crawl-settings xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" xsi:noNamespaceSchemaLocation=\"heritrix_settings.xsd\">");
293
294
295
296
297
298
299
300
301
302
303 if(hc.getIgnoreRobots() != null && hc.getIgnoreRobots()){
304 w.append("<object name=\"robots-honoring-policy\"><string name=\"type\">ignore</string></object>");
305 }
306
307 if(hc.getBlock() != null && hc.getBlock()){
308
309 w.append("<object name=\"Preselector\"><boolean name=\"block-all\">true</boolean></object>");
310 }
311
312 if(hc.getRegex() != null){
313 w.append("<object name=\"rejectIfRegexMatch\">" +
314 "<string name=\"regexp\">"+ hc.getRegex() +"</string>" +
315 "</object>");
316 }
317
318 Long docLimit = hc.getDocumentLimit();
319 if(docLimit != null && docLimit > 0){
320 int errorPenalty = docLimit.intValue()/100;
321 if(errorPenalty == 0){
322 errorPenalty = 1;
323 }
324 w.append("<object name=\"frontier\">");
325 w.append("<integer name=\"error-penalty-amount\">"+errorPenalty+"</integer>");
326 w.append("<long name=\"queue-total-budget\">"+docLimit+"</long>");
327 w.append("</object>");
328 }
329
330 w.append("</crawl-settings>");
331
332 w.close();
333
334 return file;
335 }
336
337 protected static void addFilesFromSettingsDirectory(Map<String,InputStream> files, String settingsDirectoryRoot) throws IOException{
338 if(settingsDirectoryRoot != null){
339 File settingsDirectory = new File(settingsDirectoryRoot);
340 if(settingsDirectory.exists()){
341 log.info("Settings directory parameter specified: " + settingsDirectoryRoot);
342 recursivelyAddChildren(files, settingsDirectory, null);
343 }else{
344 log.warning("Settings directory parameter points to a non-existent directory: " + settingsDirectoryRoot);
345 }
346 }else{
347 log.info("Settings directory property is null: no settings directory specified.");
348 }
349 }
350
351 protected static void recursivelyAddChildren(
352 Map<String,InputStream> files,
353 File settingDirectoryRoot,
354 String path) throws IOException{
355 File file;
356 if(path == null){
357 file = settingDirectoryRoot;
358 }else{
359 file = new File(path);
360 }
361 if(file.exists()){
362 if(file.isFile()){
363 FileInputStream fis = new FileInputStream(file);
364 String filePath = settingDirectoryRoot.getCanonicalPath();
365
366 String key = file.getCanonicalPath().replace(filePath + File.separator, "");
367 log.fine("adding file to settings map: " + file.getCanonicalPath() + " keyed as " + key);
368 files.put(key, fis);
369 }else{
370 String[] children = file.list();
371 if(children != null){
372 for(String child : children){
373 recursivelyAddChildren(files, settingDirectoryRoot, file.getCanonicalPath() + File.separator + child);
374 }
375 }
376 }
377 }
378 }
379 }