View Javadoc

1   /* AbstractLongFPSet
2    *
3    * $Id: AbstractLongFPSet.java 3437 2005-05-06 02:49:04Z stack-sf $
4    *
5    * Created on Oct 20, 2003
6    *
7    * Copyright (C) 2003 Internet Archive.
8    *
9    * This file is part of the Heritrix web crawler (crawler.archive.org).
10   *
11   * Heritrix is free software; you can redistribute it and/or modify
12   * it under the terms of the GNU Lesser Public License as published by
13   * the Free Software Foundation; either version 2.1 of the License, or
14   * any later version.
15   *
16   * Heritrix is distributed in the hope that it will be useful,
17   * but WITHOUT ANY WARRANTY; without even the implied warranty of
18   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19   * GNU Lesser Public License for more details.
20   *
21   * You should have received a copy of the GNU Lesser Public License
22   * along with Heritrix; if not, write to the Free Software
23   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
24   */
25  package org.archive.util;
26  
27  import java.io.Serializable;
28  import java.util.logging.Logger;
29  
30  import org.archive.util.fingerprint.LongFPSet;
31  
32  /***
33   * Shell of functionality for a Set of primitive long fingerprints, held
34   * in an array of possibly-empty slots.
35   * 
36   * The implementation of that holding array is delegated to subclasses.
37   *
38   * <p>Capacity is always a power of 2.
39   *
40   * <p>Fingerprints are already assumed to be well-distributed, so the
41   * hashed position for a value is just its high-order bits.
42   *
43   * @author gojomo
44   * @version $Date: 2005-05-06 02:49:04 +0000 (Fri, 06 May 2005) $, $Revision: 3437 $
45   */
46  public abstract class AbstractLongFPSet implements LongFPSet, Serializable {
47      private static Logger logger =
48          Logger.getLogger("org.archive.util.AbstractLongFPSet");
49  
50      /***
51       * A constant used to indicate that a slot in the set storage is empty.
52       * A zero or positive value means slot is filled
53       */
54      protected static byte EMPTY = -1;
55  
56      /*** the capacity of this set, specified as the exponent of a power of 2 */
57      protected int capacityPowerOfTwo;
58  
59      /*** The load factor, as a fraction.  This gives the amount of free space
60       * to keep in the Set. */
61      protected float loadFactor;
62  
63      /*** The current number of elements in the set */
64      protected long count;
65  
66      /***
67       * To support serialization
68       * TODO: verify needed?
69       */
70      public AbstractLongFPSet() {
71          super();
72      }
73      
74      /***
75       * Create a new AbstractLongFPSet with a given capacity and load Factor
76       *
77       * @param capacityPowerOfTwo The capacity as the exponent of a power of 2.
78       *  e.g if the capacity is <code>4</code> this means <code>2^^4</code>
79       * entries
80       * @param loadFactor The load factor as a fraction.  This gives the amount
81       * of free space to keep in the Set.
82       */
83      public AbstractLongFPSet(final int capacityPowerOfTwo, float loadFactor) {
84          this.capacityPowerOfTwo = capacityPowerOfTwo;
85          this.loadFactor = loadFactor;
86          this.count = 0;
87      }
88  
89      /***
90       * Does this set contain the given value?
91       *
92       * @see org.archive.util.fingerprint.LongFPSet#contains(long)
93       */
94      public boolean contains(long val) {
95          long i = indexFor(val);
96          if (slotHasData(i)) {
97              noteAccess(i);
98              return true;
99          }
100         return false;
101     }
102 
103     /***
104      * Check the state of a slot in the storage.
105      *
106      * @param i the index of the slot to check
107      * @return -1 if slot is filled; nonegative if full.
108      */
109     protected abstract int getSlotState(long i);
110 
111     /***
112      * Note access (hook for subclass cache-replacement strategies)
113      *
114      * @param index The index of the slot to check.
115      */
116     private void noteAccess(long index) {
117         // by default do nothing
118         // cache subclasses may use to update access counts, etc.
119     }
120 
121     /***
122      * Return the number of entries in this set.
123      *
124      * @see org.archive.util.fingerprint.LongFPSet#count()
125      */
126     public long count() {
127         return count;
128     }
129 
130     /***
131      * Add the given value to this set
132      *
133      * @see org.archive.util.fingerprint.LongFPSet#add(long)
134      */
135     public boolean add(long val) {
136         logger.finest("Adding " + val);
137         long i = indexFor(val);
138         if (slotHasData(i)) {
139             // positive index indicates already in set
140             return false;
141         }
142         // we have a possible slot now, which is encoded as a negative number
143 
144         // check for space, and grow if needed
145         if ((count + 1) > (loadFactor * (1 << capacityPowerOfTwo))) {
146             makeSpace();
147             // find new i
148             i = indexFor(val);
149             assert i < 0 : "slot should be empty";
150         }
151 
152         i = asDataSlot(i); // convert to positive index
153         setAt(i, val);
154         count++;
155         noteAccess(i);
156         return true;
157     }
158 
159     /***
160      * Make additional space to keep the load under the target
161      * loadFactor level.
162      * 
163      * Subclasses may grow or discard entries to satisfy.
164      */
165     protected abstract void makeSpace();
166 
167     /***
168      * Set the stored value at the given slot.
169      *
170      * @param i the slot index
171      * @param l the value to set
172      */
173     protected abstract void setAt(long i, long l);
174 
175     /*** 
176      * Get the stored value at the given slot.
177      *
178      * @param i the slot index
179      * @return The stored value at the given slot.
180      */
181     protected abstract long getAt(long i);
182 
183     /*** 
184      * Given a value, check the store for its existence. If it exists, it
185      * will return the index where the value resides.  Otherwise it return
186      * an encoded index, which is a possible storage location for the value.
187      *
188      * <p>Note, if we have a loading factor less than 1.0, there should always
189      * be an empty location where we can store the value
190      *
191      * @param val the fingerprint value to check for
192      * @return The (positive) index where the value already resides,
193      * or an empty index where it could be inserted (encoded as a
194      * negative number).
195      */
196     private long indexFor(long val) {
197         long candidateIndex = startIndexFor(val);
198         while (true) {
199             if (getSlotState(candidateIndex) < 0) {
200                 // slot empty; return negative number encoding index
201                 return asEmptySlot(candidateIndex);
202             }
203             if (getAt(candidateIndex) == val) {
204                 // already present; return positive index
205                 return candidateIndex;
206             }
207             candidateIndex++;
208             if (candidateIndex == 1 << capacityPowerOfTwo) {
209                 candidateIndex = 0; // wraparound
210             }
211         }
212     }
213 
214     /***
215      * Return the recommended storage index for the given value.
216      * Assumes values are already well-distributed; merely uses
217      * high-order bits.
218      *
219      * @param val
220      * @return The recommended storage index for the given value.
221      */
222     private long startIndexFor(long val) {
223         return (val >>> (64 - capacityPowerOfTwo));
224     }
225 
226     public boolean remove(long l) {
227         long i = indexFor(l);
228         if (!slotHasData(i)) {
229             // not present, not changed
230             return false;
231         }
232         removeAt(i);
233         return true;
234     }
235 
236     /***
237      * Remove the value at the given index, relocating its
238      * successors as necessary.
239      *
240      *  @param index
241      */
242     protected void removeAt(long index) {
243         count--;
244         clearAt(index);
245         long probeIndex = index + 1;
246         while (true) {
247             if (probeIndex == 1 << capacityPowerOfTwo) {
248                 probeIndex = 0; //wraparound
249             }
250             if (getSlotState(probeIndex) < 0) {
251                 // vacant
252                 break;
253             }
254             long val = getAt(probeIndex);
255             long newIndex = indexFor(val);
256             if (newIndex != probeIndex) {
257                 // value must shift down
258                 newIndex = asDataSlot(newIndex); // positivize
259                 relocate(val, probeIndex, newIndex);
260             }
261             probeIndex++;
262         }
263     }
264 
265     protected abstract void clearAt(long index);
266 
267     protected abstract void relocate(long value, long fromIndex, long toIndex);
268 
269     /***
270      * Low-cost, non-definitive (except when true) contains
271      * test. Default answer of false is acceptable.
272      *
273      * @see org.archive.util.fingerprint.LongFPSet#quickContains(long)
274      */
275     public boolean quickContains(long fp) {
276         return false;
277     }
278 
279     /***
280      * given a slot index, which could or could not be empty, return it as
281      * a slot index indicating an non-empty slot
282      *
283      * @param index the slot index to convert
284      * @return the index, converted to represent an slot with data
285      */
286     private long asDataSlot(final long index) {
287         if (slotHasData(index)) { // slot already has data
288             return index;
289         }
290         return - (index + 1);
291     }
292 
293     /*** 
294      * Given a slot index, which could or could not be empty, return it as
295      * a slot index indicating an empty slot
296      * @param index the slot index to convert
297      * @return the index, converted to represent an empty slot
298      */
299     private long asEmptySlot(final long index) {
300         if (!slotHasData(index)) { // already empty slot
301             return index;
302         }
303         return -index - 1;
304     }
305 
306     /*** 
307      * Does this index represent a slot with data?
308      *
309      * @param index the index to check
310      * @return <code>true</code> if the slot has data
311      */
312     private boolean slotHasData(final long index) {
313         return index >= 0;
314     }
315 }