View Javadoc

1   /*
2    *  This file is part of the Heritrix web crawler (crawler.archive.org).
3    *
4    *  Licensed to the Internet Archive (IA) by one or more individual 
5    *  contributors. 
6    *
7    *  The IA licenses this file to You under the Apache License, Version 2.0
8    *  (the "License"); you may not use this file except in compliance with
9    *  the License.  You may obtain a copy of the License at
10   *
11   *      http://www.apache.org/licenses/LICENSE-2.0
12   *
13   *  Unless required by applicable law or agreed to in writing, software
14   *  distributed under the License is distributed on an "AS IS" BASIS,
15   *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16   *  See the License for the specific language governing permissions and
17   *  limitations under the License.
18   */
19  
20  package org.archive.util;
21  
22  import java.util.Random;
23  
24  import junit.framework.TestCase;
25  
26  
27  /***
28   * BloomFilter tests. 
29   * 
30   * @contributor gojomo
31   * @version $Date: 2009-11-19 14:39:53 -0800 (Thu, 19 Nov 2009) $, $Revision: 6674 $
32   */
33  public abstract class BloomFilterTestBase extends TestCase {
34      
35      abstract BloomFilter createBloom(long n, int d, Random random);
36  
37      protected void trialWithParameters(long targetSize, int hashCount, long addCount, long containsCount) {
38          BloomFilter bloom = createBloom(targetSize,hashCount,new Random(1996L));
39          
40          int addFalsePositives = checkAdds(bloom,addCount);
41          checkDistribution(bloom); 
42          // this is a *very* rough and *very* lenient upper bound for adds <= targetSize
43          long maxTolerableDuringAdds = addCount / (1<<hashCount);
44          assertTrue(
45                  "excessive false positives ("+addFalsePositives+">"+maxTolerableDuringAdds+") during adds",
46                  addFalsePositives<10);
47          
48          if(containsCount==0) {
49              return; 
50          }
51          int containsFalsePositives = checkContains(bloom,containsCount); 
52          // expect at least 0 if bloom wasn't saturated in add phase
53          // if was saturated, expect at least 1/4th of the theoretical 1-in-every-(2<<hashCount) 
54          long minTolerableDuringContains = (addCount < targetSize) ? 0 : containsCount / ((1<<hashCount) * 4);
55          // expect no more than 4 times the theoretical-at-saturation
56          long maxTolerableDuringContains = containsCount * 4 / (1<<hashCount);
57          assertTrue(
58                  "excessive false positives ("+containsFalsePositives+">"+maxTolerableDuringContains+") during contains",
59                  containsFalsePositives<=maxTolerableDuringContains); // no more than double expected 1-in-4mil
60          assertTrue(
61                  "missing false positives ("+containsFalsePositives+"<"+minTolerableDuringContains+") during contains",
62                  containsFalsePositives>=minTolerableDuringContains);  // should be at least a couple
63      }
64                  
65      /***
66       * Test very-large (almost 800MB, spanning more than Integer.MAX_VALUE bit 
67       * indexes) bloom at saturation for expected behavior and level of 
68       * false-positives. 
69       * 
70       * Renamed to non-'test' name so not automatically run, because can 
71       * take 15+ minutes to complete.
72       */
73      public void xestOversized() {
74          trialWithParameters(200000000,22,200000000,32000000);
75      }
76      
77      /***
78       * Test large (495MB), default-sized bloom at saturation for 
79       * expected behavior and level of false-positives. 
80       * 
81       * Renamed to non-'test' name so not automatically run, because can 
82       * take 15+ minutes to complete.
83       */
84      public void xestDefaultFull() {
85          trialWithParameters(125000000,22,125000000,34000000);
86      }
87    
88      public void xestDefaultAbbreviated() {
89          trialWithParameters(125000000,22,17000000,0);
90      }
91      
92      public void testSmall() {
93          trialWithParameters(10000000, 20, 10000000, 10000000);
94      }
95      
96      /***
97       * Check that the given filter behaves properly as a large number of
98       * constructed unique strings are added: responding positively to 
99       * contains, and negatively to redundant adds. Assuming that the filter
100      * was empty before it was called, any add()s that report the string was
101      * already present are false-positives; report the total of same so the
102      * caller can evaluate if that level was suspiciously out of the expected
103      * error rate. 
104      * 
105      * @param bloom BloomFilter to check
106      * @param count int number of unique strings to check
107      * @return
108      */
109     protected int checkAdds(BloomFilter bloom, long count) {
110         int falsePositives = 0; 
111         for(int i = 0; i < count; i++) {
112             String str = "add"+Integer.toString(i);
113             if(!bloom.add(str)) {
114                 falsePositives++;
115             }
116             assertTrue(bloom.contains(str));
117             assertFalse(str+" not present on re-add",bloom.add(str));
118         }
119         return falsePositives;
120     }
121     
122     /***
123      * Check if the given filter contains any of the given constructed
124      * strings. Since the previously-added strings (of checkAdds) were
125      * different from these, *any* positive contains results are 
126      * false-positives. Return the total count so that the calling method
127      * can determine if the false-positive rate is outside the expected 
128      * range. 
129      * 
130      * @param bloom BloomFilter to check
131      * @param count int number of unique strings to check
132      * @return
133      */
134     protected int checkContains(BloomFilter bloom, long count) {
135         int falsePositives = 0; 
136         for(int i = 0; i < count; i++) {
137             String str = "contains"+Integer.toString(i);
138             if(bloom.contains(str)) {
139                 falsePositives++;
140             }
141         }
142         return falsePositives;
143     }
144     
145     /***
146      * Check that the given bloom filter, assumed to have already had a 
147      * significant number of items added, has bits set in the lower and upper
148      * 10% of its bit field. 
149      * 
150      * (This would have caught previous int/long bugs in the filter hashing 
151      * or conversion of bit indexes into array indexes and bit masks.)
152      * 
153      * @param bloom BloomFilter to check
154      */
155     public void checkDistribution(BloomFilter bloom) {
156         long bitLength = bloom.getSizeBytes() * 8L;
157         for(long i = 0; i<bitLength; i++) {
158             // verify that first set bit is in first 20% of bitfield
159             if(bloom.getBit(i)) {
160                 assertTrue("set bits not as expected in early positions",(i/(double)bitLength)<0.1d); 
161                 break; 
162             }
163         }
164         for(long i = bitLength-1; i>=0; i--) {
165             // verify that first set bit is in first 20% of bitfield
166             if(bloom.getBit(i)) {
167                 assertTrue("set bits not as expected in late positions",(i/(double)bitLength)>0.1d); 
168                 break; 
169             }
170         }
171     }
172 }