View Javadoc

1   /* BloomFilter
2   *
3   * Copyright (C) 2010 Internet Archive; an adaptation of
4   * LGPL work (C) Sebastiano Vigna
5   *
6   * This file is part of the Heritrix web crawler (crawler.archive.org).
7   *
8   * This class is free software; you can redistribute it and/or modify
9   * it under the terms of the GNU Lesser Public License as published by
10  * the Free Software Foundation; either version 2.1 of the License, or
11  * any later version.
12  *
13  * Heritrix is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16  * GNU Lesser Public License for more details.
17  *
18  * You should have received a copy of the GNU Lesser Public License
19  * along with Heritrix; if not, write to the Free Software
20  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
21  */
22  
23  package org.archive.util;
24  
25  /***
26   * Common interface for different Bloom filter implementations
27   * 
28   * @author Gordon Mohr
29   */
30  public interface BloomFilter {
31  	/*** The number of character sequences in the filter (considered to be the 
32  	 * number of add()s that returned 'true')
33  	 *
34  	 * @return the number of character sequences in the filter (but see {@link #contains(CharSequence)}).
35  	 */
36  	public abstract int size();
37  
38  	/*** Checks whether the given character sequence is in this filter.
39  	 *
40  	 * <P>Note that this method may return true on a character sequence that is has
41  	 * not been added to the filter. This will happen with probability 2<sub>-<var>d</var></sub>,
42  	 * where <var>d</var> is the number of hash functions specified at creation time, if
43  	 * the number of the elements in the filter is less than <var>n</var>, the number
44  	 * of expected elements specified at creation time.
45  	 *
46  	 * @param s a character sequence.
47  	 * @return true if the sequence is in the filter (or if a sequence with the
48  	 * same hash sequence is in the filter).
49  	 */
50  	public abstract boolean contains(final CharSequence s);
51  
52  	/*** Adds a character sequence to the filter.
53  	 *
54  	 * @param s a character sequence.
55  	 * @return true if the character sequence was not in the filter (but see {@link #contains(CharSequence)}).
56  	 */
57  	public abstract boolean add(final CharSequence s);
58  
59  	/***
60       * The amount of memory in bytes consumed by the bloom 
61       * bitfield.
62       *
63  	 * @return memory used by bloom bitfield, in bytes
64  	 */
65  	public abstract long getSizeBytes();
66  	
67  	/***
68  	 * Report the number of expected inserts used at instantiation time to 
69  	 * calculate the bitfield size. 
70  	 * 
71  	 * @return long number of inserts expected at instantiation
72  	 */
73      public abstract long getExpectedInserts();
74      
75      /***
76       * Report the number of internal independent hash function (and thus the
77       * number of bits set/checked for each item presented). 
78       * 
79       * @return long count of hash functions
80       */
81      public abstract long getHashCount(); 
82      
83      // public for white-box unit testing
84      public boolean getBit(long bitIndex);
85  }