View Javadoc

1   /* $Id: OrderJarFactory.java 5907 2008-07-28 21:21:44Z dbernstein $
2    *
3    * Created on Dec 12, 2005
4    *
5    * Copyright (C) 2005 Internet Archive.
6    *  
7    * This file is part of the Heritrix Cluster Controller (crawler.archive.org).
8    *  
9    * HCC is free software; you can redistribute it and/or modify
10   * it under the terms of the GNU Lesser Public License as published by
11   * the Free Software Foundation; either version 2.1 of the License, or
12   * any later version.
13   * 
14   * Heritrix is distributed in the hope that it will be useful, 
15   * but WITHOUT ANY WARRANTY; without even the implied warranty of
16   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17   * GNU Lesser Public License for more details.
18   * 
19   * You should have received a copy of the GNU Lesser Public License
20   * along with Heritrix; if not, write to the Free Software
21   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
22   */
23  package org.archive.hcc.util;
24  
25  import java.io.ByteArrayInputStream;
26  import java.io.ByteArrayOutputStream;
27  import java.io.File;
28  import java.io.FileInputStream;
29  import java.io.FileNotFoundException;
30  import java.io.FileOutputStream;
31  import java.io.FileWriter;
32  import java.io.IOException;
33  import java.io.InputStream;
34  import java.io.InputStreamReader;
35  import java.text.SimpleDateFormat;
36  import java.util.Collection;
37  import java.util.Date;
38  import java.util.HashMap;
39  import java.util.List;
40  import java.util.Map;
41  import java.util.Properties;
42  import java.util.jar.JarEntry;
43  import java.util.jar.JarOutputStream;
44  import java.util.logging.Level;
45  import java.util.logging.Logger;
46  
47  public class OrderJarFactory {
48  	public static String SETTINGS_DIRECTORY_PROPERTY= OrderJarFactory.class.getName() + ".settingsDefaultsDir";
49      private static Logger log =
50          Logger.getLogger(OrderJarFactory.class.getName());
51      public static final String NAME_KEY = "name";
52      public static final String OPERATOR_KEY = "operator";
53      
54      public static final String DURATION_KEY = "duration";
55      public static final String TEST_CRAWL_KEY = "isTest";
56      public static final String ONE_HOP_OFF_KEY = "oneHopOff";
57  
58      public static final String DOCUMENT_LIMIT_KEY = "documentLimitKey";
59      public static final String USER_AGENT_KEY = "userAgent";
60      public static final String FROM_EMAIL_KEY = "fromEmail";
61      public static final String DISK_PATH_KEY = "diskPath";
62      public static final String DESCRIPTION = "description";
63      public static final String ORGANIZATION = "organization";
64      
65      public static final String SEEDS_KEY = "seeds";
66      public static final String HOST_CONSTRAINTS_KEY = "hostConstraints";
67      
68  
69      public OrderJarFactory() {
70          super();
71          // TODO Auto-generated constructor stub
72      }
73  
74      public static File createOrderJar(Map parameters) {
75          try {
76          	
77          	//read in order xml prototype to string buffer
78              Map<String, InputStream> map = new HashMap<String, InputStream>();
79              InputStream orderPrototype = OrderJarFactory.class
80                      .getResourceAsStream("/order.xml");
81              InputStreamReader reader = new InputStreamReader(orderPrototype);
82              char[] cbuf = new char[1024];
83              int read = -1;
84              StringBuffer b = new StringBuffer();
85              while ((read = reader.read(cbuf)) > -1) {
86                  b.append(cbuf, 0, read);
87              }
88  
89              //replace values
90              String order = b.toString();
91              String date = new SimpleDateFormat("yyyyMMddhhmmss")
92                      .format(new Date());
93  
94              order = order.replace("$name", parameters.get(NAME_KEY).toString());
95              Object operator = parameters.get(OPERATOR_KEY);
96              if(operator == null){
97              	operator = "No Operator Specified";
98              }
99              order = order.replace("$operator", operator.toString());
100 
101             order = order.replace("$arcPrefix", parameters
102                     .get(NAME_KEY)
103                     .toString());
104             order = order.replace("$date", date);
105             Object isTest = parameters.get(TEST_CRAWL_KEY);
106     
107             boolean isTestFlag = isTest != null && new Boolean(isTest.toString()).booleanValue();
108             order = order.replace("$writeEnabled", String.valueOf(!isTestFlag));
109 	        
110             Object oneHopOff = parameters.get(ONE_HOP_OFF_KEY);
111             
112             order = order.replace(
113             			"$oneHopOff", 
114             			String.valueOf(
115             					oneHopOff != null && new Boolean(oneHopOff.toString())));
116 	       
117             
118             order = order.replace("$date", date);
119 
120             int duration = 60*60*24*3;
121             Object durationStr = parameters.get(DURATION_KEY);
122             if(durationStr != null){
123                 duration = Integer.parseInt(durationStr.toString())/1000;
124             }
125             
126             order = order.replace("$duration", duration +"");
127             
128             int documentLimit = 0;
129             
130             Object documentLimitStr = parameters.get(DOCUMENT_LIMIT_KEY);
131             if(documentLimitStr != null){
132                 documentLimit = Integer.parseInt(documentLimitStr.toString());
133             }
134             order = order.replace("$documentLimit", documentLimit+"");
135 
136             order = order.replace("$userAgent", parameters.get(USER_AGENT_KEY).toString());
137             order = order.replace("$fromEmail", parameters.get(FROM_EMAIL_KEY).toString());
138 
139             String diskPath = (String)parameters.get(DISK_PATH_KEY);
140             if(diskPath == null){
141             	diskPath = "";
142             }
143             
144             order = order.replace("$diskPath", diskPath);
145 
146             String organization = (String)parameters.get(ORGANIZATION);
147             
148             if(organization == null){
149             	organization = "";
150             }
151             
152             order = order.replace("$organization", organization);
153 
154             String description = (String)parameters.get(DESCRIPTION);
155             
156             if(description == null){
157             	description = "";
158             }
159             
160             order = order.replace("$description", description);
161             
162             ByteArrayOutputStream orderFileOs = new ByteArrayOutputStream();
163             orderFileOs.write(order.getBytes());
164             map.put("order.xml", new ByteArrayInputStream(orderFileOs
165                     .toByteArray()));
166             orderFileOs.close();
167 
168             // write seeds
169             ByteArrayOutputStream seedsOs = new ByteArrayOutputStream();
170             int count = 0;
171             Collection<String> seeds = (Collection<String>) parameters
172                     .get(SEEDS_KEY);
173             for (String seed : seeds) {
174                 if (count++ > 0) {
175                     seedsOs.write("\n".getBytes());
176                 }
177                 seedsOs.write(seed.getBytes());
178 
179             }
180 
181             seedsOs.flush();
182             map.put(
183                     "seeds.txt",
184                     new ByteArrayInputStream(seedsOs.toByteArray()));
185             seedsOs.close();
186 
187             // write jar file.
188             File jarFile = File.createTempFile("order", ".jar");
189             JarOutputStream jos = new JarOutputStream(new FileOutputStream(
190                     jarFile));
191             byte[] buf = new byte[1024];
192             
193 
194             //if a settings directory defaults has been specified,
195             //add the contents of the settings directory defaults
196             Properties p =null;
197             try{
198             	p = SmartPropertiesResolver.getProperties("hcc.properties");
199 
200             }catch(RuntimeException ex){
201             	log.info("hcc.properties not found");
202             }
203             
204             if(p != null){
205                 String defaultSettingsDirectoryRoot = p.getProperty(SETTINGS_DIRECTORY_PROPERTY);
206                 addFilesFromSettingsDirectory(map, defaultSettingsDirectoryRoot);
207             }            
208             
209             
210             //create a unique temp work directory for crawlSettings
211             File tempCrawlSettingsDirectoryRoot = 
212             	new File(System.getProperty("java.io.tmpdir") + File.separator + new Date().getTime());
213             
214             tempCrawlSettingsDirectoryRoot.deleteOnExit();
215             //create hostConstraints hierarchy
216             Map<String,InputStream> files = 
217             	writeHostConstraints(
218             			(List<HostConstraint>)parameters.get(HOST_CONSTRAINTS_KEY),
219             			tempCrawlSettingsDirectoryRoot);
220             
221             //add files to map - any name clashes with the defaults will be overwritten by
222             //the user specified constrants.
223             map.putAll(files);
224 
225 
226             
227             // for each map entry
228             for (String filename : map.keySet()) {
229                 // Add ZIP entry to output stream.
230                 jos.putNextEntry(new JarEntry(filename));
231 
232                 // Transfer bytes from the file to the jar file
233                 InputStream in = map.get(filename);
234 
235                 int len;
236                 while ((len = in.read(buf)) > 0) {
237                     jos.write(buf, 0, len);
238                 }
239 
240                 // Complete the entry
241                 jos.closeEntry();
242                 in.close();
243             }
244 
245             jos.close();
246             
247             if (log.isLoggable(Level.FINE)) {
248                 log.fine("created jar file" + jarFile.getAbsolutePath());
249             }
250 
251             return jarFile;
252 
253         } catch (Exception e) {
254             if (log.isLoggable(Level.SEVERE)) {
255                 log.severe(e.getMessage());
256             }
257 
258             e.printStackTrace();
259             throw new RuntimeException(e);
260         }
261     }
262     
263     
264     protected static Map<String, InputStream> writeHostConstraints(List<HostConstraint> hostConstraints, File crawlSettingsDirectoryRoot) throws IOException{
265     		
266     	Map<String, InputStream> files = new HashMap<String, InputStream>();
267     	if(hostConstraints != null){
268     		//for each constraint
269     		for(HostConstraint hc : hostConstraints){
270     			//put filename path into list.
271     			File file = writeSettingsFile(hc, crawlSettingsDirectoryRoot);
272     			files.put(hc.getSettingsFilePath(), new FileInputStream(file));
273     			
274     			
275     		}
276     	}
277     	
278     	return files;
279     
280     }
281     
282     protected static File writeSettingsFile(HostConstraint hc, File crawlSettingsDirectoryRoot) throws IOException{
283 		//create directory hierarchy if doesn't exist
284 		File directory = new File(crawlSettingsDirectoryRoot, hc.getSettingsFileDirectory());
285 		directory.mkdirs();
286 		File file = File.createTempFile("order", "xml", directory);
287 		file.deleteOnExit();
288 		//write order file
289 		FileWriter w = new FileWriter(file);
290 		
291 		w.append("<?xml version=\"1.0\" encoding=\"UTF-8\"?>");
292 		w.append("<crawl-settings xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" xsi:noNamespaceSchemaLocation=\"heritrix_settings.xsd\">");
293 		/*
294 		w.append("<meta>");
295 		w.append("<name></name>");
296 		w.append("<description></description>");
297 		w.append("<operator>Admin</operator>");
298 		w.append("<audience></audience>");
299 		w.append("<organization></organization>");
300 		w.append("<date></date>");
301 		w.append("</meta>");
302 		*/
303 		if(hc.getIgnoreRobots() != null && hc.getIgnoreRobots()){
304 			w.append("<object name=\"robots-honoring-policy\"><string name=\"type\">ignore</string></object>");
305 		}
306 		
307 		if(hc.getBlock()  != null && hc.getBlock()){
308 			/*w.append("<object name=\"scope\"><boolean name=\"enabled\">false</boolean></object>");*/
309             w.append("<object name=\"Preselector\"><boolean name=\"block-all\">true</boolean></object>");
310 		}
311 		
312 		if(hc.getRegex() != null){
313 			w.append("<object name=\"rejectIfRegexMatch\">" +
314                     "<string name=\"regexp\">"+ hc.getRegex() +"</string>" +
315                      "</object>");
316 		}
317 		
318 		Long docLimit = hc.getDocumentLimit();
319 		if(docLimit != null && docLimit > 0){	
320 			int errorPenalty = docLimit.intValue()/100;
321 			if(errorPenalty == 0){
322 				errorPenalty = 1;
323 			}
324 		   w.append("<object name=\"frontier\">");
325 		   w.append("<integer name=\"error-penalty-amount\">"+errorPenalty+"</integer>"); 
326 		   w.append("<long name=\"queue-total-budget\">"+docLimit+"</long>"); 
327 		   w.append("</object>");
328 		}  
329 			  
330 		w.append("</crawl-settings>");
331 			
332 		w.close();
333 
334 		return file;
335     }
336     
337     protected static void addFilesFromSettingsDirectory(Map<String,InputStream> files, String settingsDirectoryRoot) throws IOException{
338         if(settingsDirectoryRoot != null){
339             File settingsDirectory = new File(settingsDirectoryRoot);
340             if(settingsDirectory.exists()){
341             	log.info("Settings directory parameter specified: " + settingsDirectoryRoot);
342                 recursivelyAddChildren(files, settingsDirectory, null);
343             }else{
344             	log.warning("Settings directory parameter points to a non-existent directory: " + settingsDirectoryRoot);
345             }
346         }else{
347         	log.info("Settings directory property is null: no settings directory specified.");
348         }
349     }
350     
351     protected static void recursivelyAddChildren(
352     						Map<String,InputStream> files, 
353     						File settingDirectoryRoot, 
354     						String path) throws IOException{
355 		File file;
356 		if(path == null){
357 			file = settingDirectoryRoot;
358 		}else{
359 			file = new File(path);
360 		}
361 		if(file.exists()){
362 			if(file.isFile()){
363 				FileInputStream fis = new FileInputStream(file);
364 				String filePath = settingDirectoryRoot.getCanonicalPath();
365 				
366 				String key = file.getCanonicalPath().replace(filePath + File.separator, "");
367 				log.fine("adding file to settings map: " + file.getCanonicalPath() + " keyed as " + key);
368 				files.put(key, fis);
369 			}else{
370 				String[] children = file.list();
371 				if(children != null){
372 					for(String child : children){
373 						recursivelyAddChildren(files, settingDirectoryRoot, file.getCanonicalPath() + File.separator + child);
374 					}
375 				}
376 			}
377 		}
378 	}
379 }