View Javadoc

1   /* BenchmarkUriUniqFilters
2    *
3    * $Id: BenchmarkUriUniqFilters.java 4647 2006-09-22 18:39:39Z paul_jack $
4    *
5    * Created on Jun 22, 2005.
6    *
7    * Copyright (C) 2005 Internet Archive.
8    *
9    * This file is part of the Heritrix web crawler (crawler.archive.org).
10   *
11   * Heritrix is free software; you can redistribute it and/or modify
12   * it under the terms of the GNU Lesser Public License as published by
13   * the Free Software Foundation; either version 2.1 of the License, or
14   * any later version.
15   *
16   * Heritrix is distributed in the hope that it will be useful,
17   * but WITHOUT ANY WARRANTY; without even the implied warranty of
18   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19   * GNU Lesser Public License for more details.
20   *
21   * You should have received a copy of the GNU Lesser Public License
22   * along with Heritrix; if not, write to the Free Software
23   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
24   */
25  package org.archive.crawler.util;
26  import java.io.BufferedReader;
27  import java.io.BufferedWriter;
28  import java.io.File;
29  import java.io.FileReader;
30  import java.io.FileWriter;
31  import java.io.IOException;
32  
33  import org.archive.crawler.datamodel.CandidateURI;
34  import org.archive.crawler.datamodel.UriUniqFilter;
35  import org.archive.util.fingerprint.MemLongFPSet;
36  
37  
38  /***
39   * BenchmarkUriUniqFilters
40   * 
41   * @author gojomo
42   */
43  public class BenchmarkUriUniqFilters implements UriUniqFilter.HasUriReceiver {
44  //    private Logger LOGGER =
45  //        Logger.getLogger(BenchmarkUriUniqFilters.class.getName());
46      
47      private BufferedWriter out; // optional to dump uniq items
48      String current; // current line/URI being checked
49      
50      /***
51       * Test the UriUniqFilter implementation (MemUriUniqFilter,
52       * BloomUriUniqFilter, or BdbUriUniqFilter) named in first
53       * argument against the file of one-per-line URIs named
54       * in the second argument. 
55       * 
56       * @param args from cmd-line
57       * @throws IOException
58       */
59      public static void main(String[] args) throws IOException {
60          (new BenchmarkUriUniqFilters()).instanceMain(args);
61      }
62      
63      public void instanceMain(String[] args) throws IOException {
64          String testClass = args[0];
65          String inputFilename = args[1];
66          long start = System.currentTimeMillis();
67          UriUniqFilter uniq = createUriUniqFilter(testClass);
68          long created = System.currentTimeMillis();
69          BufferedReader br = new BufferedReader(new FileReader(inputFilename));
70          if(args.length>2) {
71              String outputFilename = args[2];
72              out = new BufferedWriter(new FileWriter(outputFilename));
73          }
74          int added = 0;
75          while((current=br.readLine())!=null) {
76              added++;
77              uniq.add(current,null);
78          }
79          uniq.close();
80          long finished = System.currentTimeMillis();
81          if(out!=null) {
82              out.close();
83          }
84          System.out.println(added+" adds");
85          System.out.println(uniq.count()+" retained");
86          System.out.println((created-start)+"ms to setup UUF");
87          System.out.println((finished-created)+"ms to perform all adds");
88      }
89      
90      private UriUniqFilter createUriUniqFilter(String testClass) throws IOException {
91          UriUniqFilter uniq = null;
92          if(BdbUriUniqFilter.class.getName().endsWith(testClass)) {;
93              // BDB setup
94              File tmpDir = File.createTempFile("uuf","benchmark");
95              tmpDir.delete();
96              tmpDir.mkdir();
97              uniq = new BdbUriUniqFilter(tmpDir, 50);
98          } else if(BloomUriUniqFilter.class.getName().endsWith(testClass)) {
99              // bloom setup
100             uniq = new BloomUriUniqFilter();
101         } else if(MemUriUniqFilter.class.getName().endsWith(testClass)) {
102             // mem hashset
103             uniq = new MemUriUniqFilter();
104         } else if (FPUriUniqFilter.class.getName().endsWith(testClass)) {
105             // mem fp set (open-addressing) setup
106             uniq = new FPUriUniqFilter(new MemLongFPSet(21,0.75f));
107         }
108         uniq.setDestination(this);
109         return uniq;
110     }
111 
112     /* (non-Javadoc)
113      * @see org.archive.crawler.datamodel.UriUniqFilter.HasUriReceiver#receive(org.archive.crawler.datamodel.CandidateURI)
114      */
115     public void receive(CandidateURI item) {
116         if(out!=null) {
117             try {
118                 // we assume all tested filters are immediate passthrough so
119                 // we can use 'current'; a buffering filter would change this
120                 // assumption
121                 out.write(current);
122                 out.write("\n");
123             } catch (IOException e) {
124                 // TODO Auto-generated catch block
125                 e.printStackTrace();
126             }
127         }
128     }
129 }