1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20 package org.archive.util;
21
22 import java.util.Random;
23
24 import junit.framework.TestCase;
25
26
27 /***
28 * BloomFilter tests.
29 *
30 * @contributor gojomo
31 * @version $Date: 2009-11-19 14:39:53 -0800 (Thu, 19 Nov 2009) $, $Revision: 6674 $
32 */
33 public abstract class BloomFilterTestBase extends TestCase {
34
35 abstract BloomFilter createBloom(long n, int d, Random random);
36
37 protected void trialWithParameters(long targetSize, int hashCount, long addCount, long containsCount) {
38 BloomFilter bloom = createBloom(targetSize,hashCount,new Random(1996L));
39
40 int addFalsePositives = checkAdds(bloom,addCount);
41 checkDistribution(bloom);
42
43 long maxTolerableDuringAdds = addCount / (1<<hashCount);
44 assertTrue(
45 "excessive false positives ("+addFalsePositives+">"+maxTolerableDuringAdds+") during adds",
46 addFalsePositives<10);
47
48 if(containsCount==0) {
49 return;
50 }
51 int containsFalsePositives = checkContains(bloom,containsCount);
52
53
54 long minTolerableDuringContains = (addCount < targetSize) ? 0 : containsCount / ((1<<hashCount) * 4);
55
56 long maxTolerableDuringContains = containsCount * 4 / (1<<hashCount);
57 assertTrue(
58 "excessive false positives ("+containsFalsePositives+">"+maxTolerableDuringContains+") during contains",
59 containsFalsePositives<=maxTolerableDuringContains);
60 assertTrue(
61 "missing false positives ("+containsFalsePositives+"<"+minTolerableDuringContains+") during contains",
62 containsFalsePositives>=minTolerableDuringContains);
63 }
64
65 /***
66 * Test very-large (almost 800MB, spanning more than Integer.MAX_VALUE bit
67 * indexes) bloom at saturation for expected behavior and level of
68 * false-positives.
69 *
70 * Renamed to non-'test' name so not automatically run, because can
71 * take 15+ minutes to complete.
72 */
73 public void xestOversized() {
74 trialWithParameters(200000000,22,200000000,32000000);
75 }
76
77 /***
78 * Test large (495MB), default-sized bloom at saturation for
79 * expected behavior and level of false-positives.
80 *
81 * Renamed to non-'test' name so not automatically run, because can
82 * take 15+ minutes to complete.
83 */
84 public void xestDefaultFull() {
85 trialWithParameters(125000000,22,125000000,34000000);
86 }
87
88 public void xestDefaultAbbreviated() {
89 trialWithParameters(125000000,22,17000000,0);
90 }
91
92 public void testSmall() {
93 trialWithParameters(10000000, 20, 10000000, 10000000);
94 }
95
96 /***
97 * Check that the given filter behaves properly as a large number of
98 * constructed unique strings are added: responding positively to
99 * contains, and negatively to redundant adds. Assuming that the filter
100 * was empty before it was called, any add()s that report the string was
101 * already present are false-positives; report the total of same so the
102 * caller can evaluate if that level was suspiciously out of the expected
103 * error rate.
104 *
105 * @param bloom BloomFilter to check
106 * @param count int number of unique strings to check
107 * @return
108 */
109 protected int checkAdds(BloomFilter bloom, long count) {
110 int falsePositives = 0;
111 for(int i = 0; i < count; i++) {
112 String str = "add"+Integer.toString(i);
113 if(!bloom.add(str)) {
114 falsePositives++;
115 }
116 assertTrue(bloom.contains(str));
117 assertFalse(str+" not present on re-add",bloom.add(str));
118 }
119 return falsePositives;
120 }
121
122 /***
123 * Check if the given filter contains any of the given constructed
124 * strings. Since the previously-added strings (of checkAdds) were
125 * different from these, *any* positive contains results are
126 * false-positives. Return the total count so that the calling method
127 * can determine if the false-positive rate is outside the expected
128 * range.
129 *
130 * @param bloom BloomFilter to check
131 * @param count int number of unique strings to check
132 * @return
133 */
134 protected int checkContains(BloomFilter bloom, long count) {
135 int falsePositives = 0;
136 for(int i = 0; i < count; i++) {
137 String str = "contains"+Integer.toString(i);
138 if(bloom.contains(str)) {
139 falsePositives++;
140 }
141 }
142 return falsePositives;
143 }
144
145 /***
146 * Check that the given bloom filter, assumed to have already had a
147 * significant number of items added, has bits set in the lower and upper
148 * 10% of its bit field.
149 *
150 * (This would have caught previous int/long bugs in the filter hashing
151 * or conversion of bit indexes into array indexes and bit masks.)
152 *
153 * @param bloom BloomFilter to check
154 */
155 public void checkDistribution(BloomFilter bloom) {
156 long bitLength = bloom.getSizeBytes() * 8L;
157 for(long i = 0; i<bitLength; i++) {
158
159 if(bloom.getBit(i)) {
160 assertTrue("set bits not as expected in early positions",(i/(double)bitLength)<0.1d);
161 break;
162 }
163 }
164 for(long i = bitLength-1; i>=0; i--) {
165
166 if(bloom.getBit(i)) {
167 assertTrue("set bits not as expected in late positions",(i/(double)bitLength)>0.1d);
168 break;
169 }
170 }
171 }
172 }