1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25 package org.archive.crawler.util;
26
27 import java.io.BufferedOutputStream;
28 import java.io.File;
29 import java.io.FileNotFoundException;
30 import java.io.FileOutputStream;
31 import java.io.PrintWriter;
32 import java.util.logging.Level;
33 import java.util.logging.Logger;
34
35 import org.archive.crawler.datamodel.CandidateURI;
36 import org.archive.crawler.datamodel.UriUniqFilter;
37
38 /***
39 * UriUniqFilter based on an underlying UriSet (essentially a Set).
40 *
41 * @author gojomo
42 */
43 public abstract class SetBasedUriUniqFilter implements UriUniqFilter {
44 private static Logger LOGGER =
45 Logger.getLogger(SetBasedUriUniqFilter.class.getName());
46
47 protected HasUriReceiver receiver;
48 protected PrintWriter profileLog;
49 protected long duplicateCount = 0;
50 protected long duplicatesAtLastSample = 0;
51
52 public SetBasedUriUniqFilter() {
53 super();
54 String profileLogFile =
55 System.getProperty(SetBasedUriUniqFilter.class.getName()
56 + ".profileLogFile");
57 if (profileLogFile != null) {
58 setProfileLog(new File(profileLogFile));
59 }
60 }
61
62 protected abstract boolean setAdd(CharSequence key);
63
64 protected abstract boolean setRemove(CharSequence key);
65
66 protected abstract long setCount();
67
68 public long count() {
69 return setCount();
70 }
71
72 public long pending() {
73
74 return 0;
75 }
76
77 public void setDestination(HasUriReceiver receiver) {
78 this.receiver = receiver;
79 }
80
81 protected void profileLog(String key) {
82 if (profileLog != null) {
83 profileLog.println(key);
84 }
85 }
86
87 public void add(String key, CandidateURI value) {
88 profileLog(key);
89 if (setAdd(key)) {
90 this.receiver.receive(value);
91 if (setCount() % 50000 == 0) {
92 LOGGER.log(Level.FINE, "count: " + setCount() + " totalDups: "
93 + duplicateCount + " recentDups: "
94 + (duplicateCount - duplicatesAtLastSample));
95 duplicatesAtLastSample = duplicateCount;
96 }
97 } else {
98 duplicateCount++;
99 }
100 }
101
102 public void addNow(String key, CandidateURI value) {
103 add(key, value);
104 }
105
106 public void addForce(String key, CandidateURI value) {
107 profileLog(key);
108 setAdd(key);
109 this.receiver.receive(value);
110 }
111
112 public void note(String key) {
113 profileLog(key);
114 setAdd(key);
115 }
116
117 public void forget(String key, CandidateURI value) {
118 setRemove(key);
119 }
120
121 public long requestFlush() {
122
123 return 0;
124 }
125
126 public void close() {
127 if (profileLog != null) {
128 profileLog.close();
129 }
130 }
131
132 public void setProfileLog(File logfile) {
133 try {
134 profileLog = new PrintWriter(new BufferedOutputStream(
135 new FileOutputStream(logfile)));
136 } catch (FileNotFoundException e) {
137 throw new RuntimeException(e);
138 }
139 }
140 }