View Javadoc

1   /* BloomFilter
2   *
3   * $Id: BloomFilter64bit.java 6891 2010-06-15 22:22:25Z gojomo $
4   *
5   * Created on Jun 21, 2005
6   *
7   * Copyright (C) 2005 Internet Archive; a slight adaptation of
8   * LGPL work (C) Sebastiano Vigna
9   *
10  * This file is part of the Heritrix web crawler (crawler.archive.org).
11  *
12  * Heritrix is free software; you can redistribute it and/or modify
13  * it under the terms of the GNU Lesser Public License as published by
14  * the Free Software Foundation; either version 2.1 of the License, or
15  * any later version.
16  *
17  * Heritrix is distributed in the hope that it will be useful,
18  * but WITHOUT ANY WARRANTY; without even the implied warranty of
19  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
20  * GNU Lesser Public License for more details.
21  *
22  * You should have received a copy of the GNU Lesser Public License
23  * along with Heritrix; if not, write to the Free Software
24  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
25  */
26  
27  package org.archive.util;
28  
29  import java.io.Serializable;
30  import java.security.SecureRandom;
31  import java.util.Random;
32  
33  /*** A Bloom filter.
34   *
35   * ADAPTED/IMPROVED VERSION OF MG4J it.unimi.dsi.mg4j.util.BloomFilter
36   * 
37   * <p>KEY CHANGES:
38   *
39   * <ul>
40   * <li>NUMBER_OF_WEIGHTS is 2083, to better avoid collisions between 
41   * similar strings (common in the domain of URIs)</li>
42   * 
43   * <li>Removed dependence on cern.colt MersenneTwister (replaced with
44   * SecureRandom) and QuickBitVector (replaced with local methods).</li>
45   * 
46   * <li>Adapted to allow long bit indices</li>
47   * 
48   * <li>Stores bitfield in an array of up to 2^22 arrays of 2^26 longs. Thus, 
49   * bitfield may grow to 2^48 longs in size -- 2PiB, 2*54 bitfield indexes.
50   * (I expect this will outstrip available RAM for the next few years.)</li>
51   * </ul>
52   * 
53   * <hr>
54   * 
55   * <P>Instances of this class represent a set of character sequences (with 
56   * false positives) using a Bloom filter. Because of the way Bloom filters work,
57   * you cannot remove elements.
58   *
59   * <P>Bloom filters have an expected error rate, depending on the number
60   * of hash functions used, on the filter size and on the number of elements in 
61   * the filter. This implementation uses a variable optimal number of hash 
62   * functions, depending on the expected number of elements. More precisely, a 
63   * Bloom filter for <var>n</var> character sequences with <var>d</var> hash 
64   * functions will use ln 2 <var>d</var><var>n</var> &#8776; 
65   * 1.44 <var>d</var><var>n</var> bits; false positives will happen with 
66   * probability 2<sup>-<var>d</var></sup>.
67   *
68   * <P>Hash functions are generated at creation time using universal hashing. 
69   * Each hash function uses {@link #NUMBER_OF_WEIGHTS} random integers, which 
70   * are cyclically multiplied by the character codes in a character sequence. 
71   * The resulting integers are XOR-ed together.
72   *
73   * <P>This class exports access methods that are very similar to those of 
74   * {@link java.util.Set}, but it does not implement that interface, as too 
75   * many non-optional methods would be unimplementable (e.g., iterators).
76   *
77   * @author Sebastiano Vigna
78   * @contributor Gordon Mohr
79   */
80  public class BloomFilter64bit implements Serializable, BloomFilter {
81      private static final long serialVersionUID = 2L;
82  
83      /*** The number of weights used to create hash functions. */
84      final static int NUMBER_OF_WEIGHTS = 2083; // CHANGED FROM 16
85      /*** The number of bits in this filter. */
86      final protected long m;
87      /*** if bitfield is an exact power of 2 in length, it is this power */ 
88      protected int power = -1; 
89      /*** The expected number of inserts; determines calculated size */ 
90      final protected long expectedInserts; 
91      /*** The number of hash functions used by this filter. */
92      final protected int d;
93      /*** The underlying bit vector */
94      final protected long[][] bits;
95      /*** The random integers used to generate the hash functions. */
96      final protected long[][] weight;
97  
98      /*** The number of elements currently in the filter. It may be
99       * smaller than the actual number of additions of distinct character
100      * sequences because of false positives.
101      */
102     int size;
103 
104     /*** The natural logarithm of 2, used in the computation of the number of bits. */
105     final static double NATURAL_LOG_OF_2 = Math.log( 2 );
106 
107     /*** power-of-two to use as maximum size of bitfield subarrays */
108     protected final static int SUBARRAY_POWER_OF_TWO = 26; // 512MiB of longs
109     /*** number of longs in one subarray */
110     protected final static int SUBARRAY_LENGTH_IN_LONGS = 1 << SUBARRAY_POWER_OF_TWO; 
111     /*** mask for lowest SUBARRAY_POWER_OF_TWO bits */
112     protected final static int SUBARRAY_MASK = SUBARRAY_LENGTH_IN_LONGS - 1; //0x0FFFFFFF
113 
114     final static boolean DEBUG = false;
115 
116     /*** Creates a new Bloom filter with given number of hash functions and 
117      * expected number of elements.
118      *
119      * @param n the expected number of elements.
120      * @param d the number of hash functions; if the filter add not more 
121      * than <code>n</code> elements, false positives will happen with 
122      * probability 2<sup>-<var>d</var></sup>.
123      */
124     public BloomFilter64bit( final long n, final int d) {
125         this(n,d, new SecureRandom(), false);
126     }
127     
128     public BloomFilter64bit( final long n, final int d, boolean roundUp) {
129         this(n,d, new SecureRandom(), roundUp);
130     }
131     
132     /*** Creates a new Bloom filter with given number of hash functions and 
133      * expected number of elements.
134      *
135      * @param n the expected number of elements.
136      * @param d the number of hash functions; if the filter add not more 
137      * than <code>n</code> elements, false positives will happen with 
138      * probability 2<sup>-<var>d</var></sup>.
139      * @param Random weightsGenerator may provide a seeded Random for reproducible
140      * internal universal hash function weighting
141      * @param roundUp if true, round bit size up to next-nearest-power-of-2
142      */
143     public BloomFilter64bit( final long n, final int d, Random weightsGenerator, boolean roundUp ) {
144         this.expectedInserts = n; 
145         this.d = d;
146         long lenInLongs = (long)Math.ceil( ( (long)n * (long)d / NATURAL_LOG_OF_2 ) / 64L );
147         if ( lenInLongs > (1L<<48) ) {
148             throw new IllegalArgumentException(
149                     "This filter would require " + lenInLongs + " longs, " +
150                     "greater than this classes maximum of 2^48 longs (2PiB)." );
151         }
152         long lenInBits = lenInLongs * 64L;
153         
154         if(roundUp) {
155             int pow = 0;
156             while((1L<<pow) < lenInBits) {
157                 pow++;
158             }
159             this.power = pow;
160             this.m = 1L<<pow;
161             lenInLongs = m/64L;
162         } else {
163             this.m = lenInBits;
164         }
165 
166         
167         int arrayOfArraysLength = (int)((lenInLongs+SUBARRAY_LENGTH_IN_LONGS-1)/SUBARRAY_LENGTH_IN_LONGS);
168         bits = new long[ (int)(arrayOfArraysLength) ][];
169         // ensure last subarray is no longer than necessary
170         long lenInLongsRemaining = lenInLongs; 
171         for(int i = 0; i < bits.length; i++) {
172             bits[i] = new long[(int)Math.min(lenInLongsRemaining,SUBARRAY_LENGTH_IN_LONGS)];
173             lenInLongsRemaining -= bits[i].length;
174         }
175 
176         if ( DEBUG ) System.err.println( "Number of bits: " + m );
177 
178         weight = new long[ d ][];
179         for( int i = 0; i < d; i++ ) {
180             weight[ i ] = new long[ NUMBER_OF_WEIGHTS ];
181             for( int j = 0; j < NUMBER_OF_WEIGHTS; j++ )
182                  weight[ i ][ j ] = weightsGenerator.nextLong();
183         }
184     }
185 
186     /*** The number of character sequences in the filter.
187      *
188      * @return the number of character sequences in the filter (but 
189      * see {@link #contains(CharSequence)}).
190      */
191 
192     public int size() {
193         return size;
194     }
195 
196     /*** Hashes the given sequence with the given hash function.
197      *
198      * @param s a character sequence.
199      * @param l the length of <code>s</code>.
200      * @param k a hash function index (smaller than {@link #d}).
201      * @return the position in the filter corresponding to <code>s</code> for the hash function <code>k</code>.
202      */
203     protected long hash( final CharSequence s, final int l, final int k ) {
204         final long[] w = weight[ k ];
205         long h = 0;
206         int i = l;
207         while( i-- != 0 ) h ^= s.charAt( i ) * w[ i % NUMBER_OF_WEIGHTS ];
208         long retVal; 
209         if(power>0) {
210             retVal =  h >>> (64-power); 
211         } else { 
212             //                ####----####----
213             retVal =  ( h & 0x7FFFFFFFFFFFFFFFL ) % m;
214         }
215         return retVal; 
216     }
217     
218     public long[] bitIndexesFor(CharSequence s) {
219         long[] ret = new long[d];
220         for(int i = 0; i < d; i++) {
221              ret[i] = hash(s,s.length(),i); 
222         }
223         return ret;
224     }
225     
226     /*** Checks whether the given character sequence is in this filter.
227      *
228      * <P>Note that this method may return true on a character sequence that is has
229      * not been added to the filter. This will happen with probability 2<sub>-<var>d</var></sub>,
230      * where <var>d</var> is the number of hash functions specified at creation time, if
231      * the number of the elements in the filter is less than <var>n</var>, the number
232      * of expected elements specified at creation time.
233      *
234      * @param s a character sequence.
235      * @return true if the sequence is in the filter (or if a sequence with the
236      * same hash sequence is in the filter).
237      */
238 
239     public boolean contains( final CharSequence s ) {
240         int i = d, l = s.length();
241         while( i-- != 0 ) if ( ! getBit( hash( s, l, i ) ) ) return false;
242         return true;
243     }
244 
245     /*** Adds a character sequence to the filter.
246      *
247      * @param s a character sequence.
248      * @return true if the character sequence was not in the filter (but see {@link #contains(CharSequence)}).
249      */
250 
251     public boolean add( final CharSequence s ) {
252         boolean result = false;
253         int i = d, l = s.length();
254         long h;
255         while( i-- != 0 ) {
256             h = hash( s, l, i );
257             if ( ! setGetBit( h ) ) {
258                 result = true;
259             }
260         }
261         if ( result ) size++;
262         return result;
263     }
264     
265     protected final static long ADDRESS_BITS_PER_UNIT = 6; // 64=2^6
266     protected final static long BIT_INDEX_MASK = (1<<6)-1; // = 63 = 2^BITS_PER_UNIT - 1;
267 
268     /***
269      * Returns from the local bitvector the value of the bit with 
270      * the specified index. The value is <tt>true</tt> if the bit 
271      * with the index <tt>bitIndex</tt> is currently set; otherwise, 
272      * returns <tt>false</tt>.
273      *
274      * (adapted from cern.colt.bitvector.QuickBitVector)
275      * 
276      * @param     bitIndex   the bit index.
277      * @return    the value of the bit with the specified index.
278      */
279     public boolean getBit(long bitIndex) {
280         long longIndex = bitIndex >>> ADDRESS_BITS_PER_UNIT;
281         int arrayIndex = (int) (longIndex >>> SUBARRAY_POWER_OF_TWO); 
282         int subarrayIndex = (int) (longIndex & SUBARRAY_MASK); 
283         return ((bits[arrayIndex][subarrayIndex] & (1L << (bitIndex & BIT_INDEX_MASK))) != 0);
284     }
285 
286     /***
287      * Changes the bit with index <tt>bitIndex</tt> in local bitvector.
288      *
289      * (adapted from cern.colt.bitvector.QuickBitVector)
290      * 
291      * @param     bitIndex   the index of the bit to be set.
292      */
293     protected void setBit( long bitIndex) {
294         long longIndex = bitIndex >>> ADDRESS_BITS_PER_UNIT;
295         int arrayIndex = (int) (longIndex >>> SUBARRAY_POWER_OF_TWO); 
296         int subarrayIndex = (int) (longIndex & SUBARRAY_MASK); 
297         bits[arrayIndex][subarrayIndex] |= (1L << (bitIndex & BIT_INDEX_MASK));
298     }
299     
300     /***
301      * Sets the bit with index <tt>bitIndex</tt> in local bitvector -- 
302      * returning the old value. 
303      *
304      * (adapted from cern.colt.bitvector.QuickBitVector)
305      * 
306      * @param     bitIndex   the index of the bit to be set.
307      */
308     protected boolean setGetBit( long bitIndex) {
309         long longIndex = bitIndex >>> ADDRESS_BITS_PER_UNIT;
310         int arrayIndex = (int) (longIndex >>> SUBARRAY_POWER_OF_TWO); 
311         int subarrayIndex = (int) (longIndex & SUBARRAY_MASK); 
312         long mask = 1L << (bitIndex & BIT_INDEX_MASK);
313         boolean ret = (bits[arrayIndex][subarrayIndex] & mask)!=0;
314         bits[arrayIndex][subarrayIndex] |= mask;
315         return ret; 
316     }
317     
318 	/* (non-Javadoc)
319 	 * @see org.archive.util.BloomFilter#getSizeBytes()
320 	 */
321 	public long getSizeBytes() {
322 	    // account for ragged-sized last array
323 	    return 8*(((bits.length-1)*bits[0].length)+bits[bits.length-1].length);
324 	}
325 
326     @Override
327     public long getExpectedInserts() {
328         return expectedInserts;
329     }
330 
331     @Override
332     public long getHashCount() {
333         return d;
334     }
335 }