1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27 package org.archive.util;
28
29 import java.io.Serializable;
30 import java.security.SecureRandom;
31 import java.util.Random;
32
33 /*** A Bloom filter.
34 *
35 * ADAPTED/IMPROVED VERSION OF MG4J it.unimi.dsi.mg4j.util.BloomFilter
36 *
37 * <p>KEY CHANGES:
38 *
39 * <ul>
40 * <li>NUMBER_OF_WEIGHTS is 2083, to better avoid collisions between
41 * similar strings (common in the domain of URIs)</li>
42 *
43 * <li>Removed dependence on cern.colt MersenneTwister (replaced with
44 * SecureRandom) and QuickBitVector (replaced with local methods).</li>
45 *
46 * <li>Adapted to allow long bit indices</li>
47 *
48 * <li>Stores bitfield in an array of up to 2^22 arrays of 2^26 longs. Thus,
49 * bitfield may grow to 2^48 longs in size -- 2PiB, 2*54 bitfield indexes.
50 * (I expect this will outstrip available RAM for the next few years.)</li>
51 * </ul>
52 *
53 * <hr>
54 *
55 * <P>Instances of this class represent a set of character sequences (with
56 * false positives) using a Bloom filter. Because of the way Bloom filters work,
57 * you cannot remove elements.
58 *
59 * <P>Bloom filters have an expected error rate, depending on the number
60 * of hash functions used, on the filter size and on the number of elements in
61 * the filter. This implementation uses a variable optimal number of hash
62 * functions, depending on the expected number of elements. More precisely, a
63 * Bloom filter for <var>n</var> character sequences with <var>d</var> hash
64 * functions will use ln 2 <var>d</var><var>n</var> ≈
65 * 1.44 <var>d</var><var>n</var> bits; false positives will happen with
66 * probability 2<sup>-<var>d</var></sup>.
67 *
68 * <P>Hash functions are generated at creation time using universal hashing.
69 * Each hash function uses {@link #NUMBER_OF_WEIGHTS} random integers, which
70 * are cyclically multiplied by the character codes in a character sequence.
71 * The resulting integers are XOR-ed together.
72 *
73 * <P>This class exports access methods that are very similar to those of
74 * {@link java.util.Set}, but it does not implement that interface, as too
75 * many non-optional methods would be unimplementable (e.g., iterators).
76 *
77 * @author Sebastiano Vigna
78 * @contributor Gordon Mohr
79 */
80 public class BloomFilter64bit implements Serializable, BloomFilter {
81 private static final long serialVersionUID = 2L;
82
83 /*** The number of weights used to create hash functions. */
84 final static int NUMBER_OF_WEIGHTS = 2083;
85 /*** The number of bits in this filter. */
86 final protected long m;
87 /*** if bitfield is an exact power of 2 in length, it is this power */
88 protected int power = -1;
89 /*** The expected number of inserts; determines calculated size */
90 final protected long expectedInserts;
91 /*** The number of hash functions used by this filter. */
92 final protected int d;
93 /*** The underlying bit vector */
94 final protected long[][] bits;
95 /*** The random integers used to generate the hash functions. */
96 final protected long[][] weight;
97
98 /*** The number of elements currently in the filter. It may be
99 * smaller than the actual number of additions of distinct character
100 * sequences because of false positives.
101 */
102 int size;
103
104 /*** The natural logarithm of 2, used in the computation of the number of bits. */
105 final static double NATURAL_LOG_OF_2 = Math.log( 2 );
106
107 /*** power-of-two to use as maximum size of bitfield subarrays */
108 protected final static int SUBARRAY_POWER_OF_TWO = 26;
109 /*** number of longs in one subarray */
110 protected final static int SUBARRAY_LENGTH_IN_LONGS = 1 << SUBARRAY_POWER_OF_TWO;
111 /*** mask for lowest SUBARRAY_POWER_OF_TWO bits */
112 protected final static int SUBARRAY_MASK = SUBARRAY_LENGTH_IN_LONGS - 1;
113
114 final static boolean DEBUG = false;
115
116 /*** Creates a new Bloom filter with given number of hash functions and
117 * expected number of elements.
118 *
119 * @param n the expected number of elements.
120 * @param d the number of hash functions; if the filter add not more
121 * than <code>n</code> elements, false positives will happen with
122 * probability 2<sup>-<var>d</var></sup>.
123 */
124 public BloomFilter64bit( final long n, final int d) {
125 this(n,d, new SecureRandom(), false);
126 }
127
128 public BloomFilter64bit( final long n, final int d, boolean roundUp) {
129 this(n,d, new SecureRandom(), roundUp);
130 }
131
132 /*** Creates a new Bloom filter with given number of hash functions and
133 * expected number of elements.
134 *
135 * @param n the expected number of elements.
136 * @param d the number of hash functions; if the filter add not more
137 * than <code>n</code> elements, false positives will happen with
138 * probability 2<sup>-<var>d</var></sup>.
139 * @param Random weightsGenerator may provide a seeded Random for reproducible
140 * internal universal hash function weighting
141 * @param roundUp if true, round bit size up to next-nearest-power-of-2
142 */
143 public BloomFilter64bit( final long n, final int d, Random weightsGenerator, boolean roundUp ) {
144 this.expectedInserts = n;
145 this.d = d;
146 long lenInLongs = (long)Math.ceil( ( (long)n * (long)d / NATURAL_LOG_OF_2 ) / 64L );
147 if ( lenInLongs > (1L<<48) ) {
148 throw new IllegalArgumentException(
149 "This filter would require " + lenInLongs + " longs, " +
150 "greater than this classes maximum of 2^48 longs (2PiB)." );
151 }
152 long lenInBits = lenInLongs * 64L;
153
154 if(roundUp) {
155 int pow = 0;
156 while((1L<<pow) < lenInBits) {
157 pow++;
158 }
159 this.power = pow;
160 this.m = 1L<<pow;
161 lenInLongs = m/64L;
162 } else {
163 this.m = lenInBits;
164 }
165
166
167 int arrayOfArraysLength = (int)((lenInLongs+SUBARRAY_LENGTH_IN_LONGS-1)/SUBARRAY_LENGTH_IN_LONGS);
168 bits = new long[ (int)(arrayOfArraysLength) ][];
169
170 long lenInLongsRemaining = lenInLongs;
171 for(int i = 0; i < bits.length; i++) {
172 bits[i] = new long[(int)Math.min(lenInLongsRemaining,SUBARRAY_LENGTH_IN_LONGS)];
173 lenInLongsRemaining -= bits[i].length;
174 }
175
176 if ( DEBUG ) System.err.println( "Number of bits: " + m );
177
178 weight = new long[ d ][];
179 for( int i = 0; i < d; i++ ) {
180 weight[ i ] = new long[ NUMBER_OF_WEIGHTS ];
181 for( int j = 0; j < NUMBER_OF_WEIGHTS; j++ )
182 weight[ i ][ j ] = weightsGenerator.nextLong();
183 }
184 }
185
186 /*** The number of character sequences in the filter.
187 *
188 * @return the number of character sequences in the filter (but
189 * see {@link #contains(CharSequence)}).
190 */
191
192 public int size() {
193 return size;
194 }
195
196 /*** Hashes the given sequence with the given hash function.
197 *
198 * @param s a character sequence.
199 * @param l the length of <code>s</code>.
200 * @param k a hash function index (smaller than {@link #d}).
201 * @return the position in the filter corresponding to <code>s</code> for the hash function <code>k</code>.
202 */
203 protected long hash( final CharSequence s, final int l, final int k ) {
204 final long[] w = weight[ k ];
205 long h = 0;
206 int i = l;
207 while( i-- != 0 ) h ^= s.charAt( i ) * w[ i % NUMBER_OF_WEIGHTS ];
208 long retVal;
209 if(power>0) {
210 retVal = h >>> (64-power);
211 } else {
212
213 retVal = ( h & 0x7FFFFFFFFFFFFFFFL ) % m;
214 }
215 return retVal;
216 }
217
218 public long[] bitIndexesFor(CharSequence s) {
219 long[] ret = new long[d];
220 for(int i = 0; i < d; i++) {
221 ret[i] = hash(s,s.length(),i);
222 }
223 return ret;
224 }
225
226 /*** Checks whether the given character sequence is in this filter.
227 *
228 * <P>Note that this method may return true on a character sequence that is has
229 * not been added to the filter. This will happen with probability 2<sub>-<var>d</var></sub>,
230 * where <var>d</var> is the number of hash functions specified at creation time, if
231 * the number of the elements in the filter is less than <var>n</var>, the number
232 * of expected elements specified at creation time.
233 *
234 * @param s a character sequence.
235 * @return true if the sequence is in the filter (or if a sequence with the
236 * same hash sequence is in the filter).
237 */
238
239 public boolean contains( final CharSequence s ) {
240 int i = d, l = s.length();
241 while( i-- != 0 ) if ( ! getBit( hash( s, l, i ) ) ) return false;
242 return true;
243 }
244
245 /*** Adds a character sequence to the filter.
246 *
247 * @param s a character sequence.
248 * @return true if the character sequence was not in the filter (but see {@link #contains(CharSequence)}).
249 */
250
251 public boolean add( final CharSequence s ) {
252 boolean result = false;
253 int i = d, l = s.length();
254 long h;
255 while( i-- != 0 ) {
256 h = hash( s, l, i );
257 if ( ! setGetBit( h ) ) {
258 result = true;
259 }
260 }
261 if ( result ) size++;
262 return result;
263 }
264
265 protected final static long ADDRESS_BITS_PER_UNIT = 6;
266 protected final static long BIT_INDEX_MASK = (1<<6)-1;
267
268 /***
269 * Returns from the local bitvector the value of the bit with
270 * the specified index. The value is <tt>true</tt> if the bit
271 * with the index <tt>bitIndex</tt> is currently set; otherwise,
272 * returns <tt>false</tt>.
273 *
274 * (adapted from cern.colt.bitvector.QuickBitVector)
275 *
276 * @param bitIndex the bit index.
277 * @return the value of the bit with the specified index.
278 */
279 public boolean getBit(long bitIndex) {
280 long longIndex = bitIndex >>> ADDRESS_BITS_PER_UNIT;
281 int arrayIndex = (int) (longIndex >>> SUBARRAY_POWER_OF_TWO);
282 int subarrayIndex = (int) (longIndex & SUBARRAY_MASK);
283 return ((bits[arrayIndex][subarrayIndex] & (1L << (bitIndex & BIT_INDEX_MASK))) != 0);
284 }
285
286 /***
287 * Changes the bit with index <tt>bitIndex</tt> in local bitvector.
288 *
289 * (adapted from cern.colt.bitvector.QuickBitVector)
290 *
291 * @param bitIndex the index of the bit to be set.
292 */
293 protected void setBit( long bitIndex) {
294 long longIndex = bitIndex >>> ADDRESS_BITS_PER_UNIT;
295 int arrayIndex = (int) (longIndex >>> SUBARRAY_POWER_OF_TWO);
296 int subarrayIndex = (int) (longIndex & SUBARRAY_MASK);
297 bits[arrayIndex][subarrayIndex] |= (1L << (bitIndex & BIT_INDEX_MASK));
298 }
299
300 /***
301 * Sets the bit with index <tt>bitIndex</tt> in local bitvector --
302 * returning the old value.
303 *
304 * (adapted from cern.colt.bitvector.QuickBitVector)
305 *
306 * @param bitIndex the index of the bit to be set.
307 */
308 protected boolean setGetBit( long bitIndex) {
309 long longIndex = bitIndex >>> ADDRESS_BITS_PER_UNIT;
310 int arrayIndex = (int) (longIndex >>> SUBARRAY_POWER_OF_TWO);
311 int subarrayIndex = (int) (longIndex & SUBARRAY_MASK);
312 long mask = 1L << (bitIndex & BIT_INDEX_MASK);
313 boolean ret = (bits[arrayIndex][subarrayIndex] & mask)!=0;
314 bits[arrayIndex][subarrayIndex] |= mask;
315 return ret;
316 }
317
318
319
320
321 public long getSizeBytes() {
322
323 return 8*(((bits.length-1)*bits[0].length)+bits[bits.length-1].length);
324 }
325
326 @Override
327 public long getExpectedInserts() {
328 return expectedInserts;
329 }
330
331 @Override
332 public long getHashCount() {
333 return d;
334 }
335 }