View Javadoc

1   /* SetBasedUriUniqFilter
2   *
3   * $Id: SetBasedUriUniqFilter.java 4036 2005-12-16 03:10:54Z gojomo $
4   *
5   * Created on Sep 29, 2005
6   *
7   * Copyright (C) 2005 Internet Archive.
8   *
9   * This file is part of the Heritrix web crawler (crawler.archive.org).
10  *
11  * Heritrix is free software; you can redistribute it and/or modify
12  * it under the terms of the GNU Lesser Public License as published by
13  * the Free Software Foundation; either version 2.1 of the License, or
14  * any later version.
15  *
16  * Heritrix is distributed in the hope that it will be useful,
17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19  * GNU Lesser Public License for more details.
20  *
21  * You should have received a copy of the GNU Lesser Public License
22  * along with Heritrix; if not, write to the Free Software
23  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
24  */ 
25  package org.archive.crawler.util;
26  
27  import java.io.BufferedOutputStream;
28  import java.io.File;
29  import java.io.FileNotFoundException;
30  import java.io.FileOutputStream;
31  import java.io.PrintWriter;
32  import java.util.logging.Level;
33  import java.util.logging.Logger;
34  
35  import org.archive.crawler.datamodel.CandidateURI;
36  import org.archive.crawler.datamodel.UriUniqFilter;
37  
38  /***
39   * UriUniqFilter based on an underlying UriSet (essentially a Set).
40   * 
41   * @author gojomo
42   */
43  public abstract class SetBasedUriUniqFilter implements UriUniqFilter {
44      private static Logger LOGGER =
45          Logger.getLogger(SetBasedUriUniqFilter.class.getName());
46  
47      protected HasUriReceiver receiver;
48      protected PrintWriter profileLog;
49      protected long duplicateCount = 0;
50      protected long duplicatesAtLastSample = 0;
51      
52      public SetBasedUriUniqFilter() {
53          super();
54          String profileLogFile = 
55              System.getProperty(SetBasedUriUniqFilter.class.getName()
56                  + ".profileLogFile");
57          if (profileLogFile != null) {
58              setProfileLog(new File(profileLogFile));
59          }
60      }
61      
62      protected abstract boolean setAdd(CharSequence key);
63  
64      protected abstract boolean setRemove(CharSequence key);
65  
66      protected abstract long setCount();
67      
68      public long count() {
69          return setCount();
70      }
71  
72      public long pending() {
73          // no items pile up in this implementation
74          return 0;
75      }
76  
77      public void setDestination(HasUriReceiver receiver) {
78          this.receiver = receiver;
79      }
80  
81      protected void profileLog(String key) {
82          if (profileLog != null) {
83              profileLog.println(key);
84          }
85      }
86      
87      public void add(String key, CandidateURI value) {
88          profileLog(key);
89          if (setAdd(key)) {
90              this.receiver.receive(value);
91              if (setCount() % 50000 == 0) {
92                  LOGGER.log(Level.FINE, "count: " + setCount() + " totalDups: "
93                          + duplicateCount + " recentDups: "
94                          + (duplicateCount - duplicatesAtLastSample));
95                  duplicatesAtLastSample = duplicateCount;
96              }
97          } else {
98              duplicateCount++;
99          }
100     }
101 
102     public void addNow(String key, CandidateURI value) {
103         add(key, value);
104     }
105     
106     public void addForce(String key, CandidateURI value) {
107         profileLog(key);
108         setAdd(key);
109         this.receiver.receive(value);
110     }
111 
112     public void note(String key) {
113         profileLog(key);
114         setAdd(key);
115     }
116 
117     public void forget(String key, CandidateURI value) {
118         setRemove(key);
119     }
120 
121     public long requestFlush() {
122         // unnecessary; all actions with set-based uniqfilter are immediate
123         return 0;
124     }
125 
126     public void close() {
127         if (profileLog != null) {
128             profileLog.close();
129         }
130     }
131 
132     public void setProfileLog(File logfile) {
133         try {
134             profileLog = new PrintWriter(new BufferedOutputStream(
135                     new FileOutputStream(logfile)));
136         } catch (FileNotFoundException e) {
137             throw new RuntimeException(e);
138         }
139     }
140 }