001: /* BloomFilter
002: *
003: * $Id: BloomFilter32bp2Split.java 4644 2006-09-20 22:40:21Z paul_jack $
004: *
005: * Created on Jun 21, 2005
006: *
007: * Copyright (C) 2005 Internet Archive; a slight adaptation of
008: * LGPL work (C) Sebastiano Vigna
009: *
010: * This file is part of the Heritrix web crawler (crawler.archive.org).
011: *
012: * Heritrix is free software; you can redistribute it and/or modify
013: * it under the terms of the GNU Lesser Public License as published by
014: * the Free Software Foundation; either version 2.1 of the License, or
015: * any later version.
016: *
017: * Heritrix is distributed in the hope that it will be useful,
018: * but WITHOUT ANY WARRANTY; without even the implied warranty of
019: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
020: * GNU Lesser Public License for more details.
021: *
022: * You should have received a copy of the GNU Lesser Public License
023: * along with Heritrix; if not, write to the Free Software
024: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
025: */
026:
027: package org.archive.util;
028:
029: import java.io.Serializable;
030: import java.security.SecureRandom;
031:
032: /** A Bloom filter.
033: *
034: * SLIGHTLY ADAPTED VERSION OF MG4J it.unimi.dsi.mg4j.util.BloomFilter
035: *
036: * <p>KEY CHANGES:
037: *
038: * <ul>
039: * <li>Adapted to use 32bit ops as much as possible... may be slightly
040: * faster on 32bit hardware/OS</li>
041: * <li>Changed to use bitfield that is a power-of-two in size, allowing
042: * hash() to use bitshifting rather than modulus... may be slightly
043: * faster</li>
044: * <li>NUMBER_OF_WEIGHTS is 2083, to better avoid collisions between
045: * similar strings</li>
046: * <li>Removed dependence on cern.colt MersenneTwister (replaced with
047: * SecureRandom) and QuickBitVector (replaced with local methods).</li>
048: * </ul>
049: *
050: * <hr>
051: *
052: * <P>Instances of this class represent a set of character sequences (with false positives)
053: * using a Bloom filter. Because of the way Bloom filters work,
054: * you cannot remove elements.
055: *
056: * <P>Bloom filters have an expected error rate, depending on the number
057: * of hash functions used, on the filter size and on the number of elements in the filter. This implementation
058: * uses a variable optimal number of hash functions, depending on the expected
059: * number of elements. More precisely, a Bloom
060: * filter for <var>n</var> character sequences with <var>d</var> hash functions will use
061: * ln 2 <var>d</var><var>n</var> ≈ 1.44 <var>d</var><var>n</var> bits;
062: * false positives will happen with probability 2<sup>-<var>d</var></sup>.
063: *
064: * <P>Hash functions are generated at creation time using universal hashing. Each hash function
065: * uses {@link #NUMBER_OF_WEIGHTS} random integers, which are cyclically multiplied by
066: * the character codes in a character sequence. The resulting integers are XOR-ed together.
067: *
068: * <P>This class exports access methods that are very similar to those of {@link java.util.Set},
069: * but it does not implement that interface, as too many non-optional methods
070: * would be unimplementable (e.g., iterators).
071: *
072: * @author Sebastiano Vigna
073: */
074: public class BloomFilter32bp2Split implements Serializable, BloomFilter {
075:
076: private static final long serialVersionUID = -1504889954381695129L;
077:
078: /** The number of weights used to create hash functions. */
079: final public static int NUMBER_OF_WEIGHTS = 2083; // CHANGED FROM 16
080: /** The number of bits in this filter. */
081: final public long m;
082: /** the power-of-two that m is */
083: final public long power; // 1<<power == m
084: /** The number of hash functions used by this filter. */
085: final public int d;
086: /** The underlying bit vectorS. */
087: final private int[][] bits;
088: /** Bitshift to get first index */
089: final private int aShift;
090: /** Mask to get second index */
091: final private int bMask;
092: /** The random integers used to generate the hash functions. */
093: final private int[][] weight;
094:
095: /** The number of elements currently in the filter. It may be
096: * smaller than the actual number of additions of distinct character
097: * sequences because of false positives.
098: */
099: private int size;
100:
101: /** The natural logarithm of 2, used in the computation of the number of bits. */
102: private final static double NATURAL_LOG_OF_2 = Math.log(2);
103:
104: private final static boolean DEBUG = false;
105:
106: /** Creates a new Bloom filter with given number of hash functions and expected number of elements.
107: *
108: * @param n the expected number of elements.
109: * @param d the number of hash functions; if the filter add not more than <code>n</code> elements,
110: * false positives will happen with probability 2<sup>-<var>d</var></sup>.
111: */
112: public BloomFilter32bp2Split(final int n, final int d) {
113: this .d = d;
114: long minBits = (long) ((long) n * (long) d / NATURAL_LOG_OF_2);
115: long pow = 0;
116: while ((1L << pow) < minBits) {
117: pow++;
118: }
119: this .power = pow;
120: this .m = 1L << pow;
121: int len = (int) (m / 32);
122: if (m > 1L << 32) {
123: throw new IllegalArgumentException(
124: "This filter would require " + m + " bits");
125: }
126:
127: aShift = (int) (pow - ADDRESS_BITS_PER_UNIT - 8);
128: bMask = (1 << aShift) - 1;
129: bits = new int[256][1 << aShift];
130:
131: System.out.println("power " + power + " bits " + m + " len "
132: + len);
133: System.out.println("aShift " + aShift + " bMask " + bMask);
134:
135: if (DEBUG)
136: System.err.println("Number of bits: " + m);
137:
138: // seeded for reproduceable behavior in repeated runs; BUT:
139: // SecureRandom's default implementation (as of 1.5)
140: // seems to mix in its own seeding.
141: final SecureRandom random = new SecureRandom(new byte[] { 19,
142: 96 });
143: weight = new int[d][];
144: for (int i = 0; i < d; i++) {
145: weight[i] = new int[NUMBER_OF_WEIGHTS];
146: for (int j = 0; j < NUMBER_OF_WEIGHTS; j++)
147: weight[i][j] = random.nextInt();
148: }
149: }
150:
151: /** The number of character sequences in the filter.
152: *
153: * @return the number of character sequences in the filter (but see {@link #contains(CharSequence)}).
154: */
155:
156: public int size() {
157: return size;
158: }
159:
160: /** Hashes the given sequence with the given hash function.
161: *
162: * @param s a character sequence.
163: * @param l the length of <code>s</code>.
164: * @param k a hash function index (smaller than {@link #d}).
165: * @return the position in the filter corresponding to <code>s</code> for the hash function <code>k</code>.
166: */
167: private int hash(final CharSequence s, final int l, final int k) {
168: final int[] w = weight[k];
169: int h = 0, i = l;
170: while (i-- != 0)
171: h ^= s.charAt(i) * w[i % NUMBER_OF_WEIGHTS];
172: return h >>> (32 - power);
173: }
174:
175: /** Checks whether the given character sequence is in this filter.
176: *
177: * <P>Note that this method may return true on a character sequence that is has
178: * not been added to the filter. This will happen with probability 2<sub>-<var>d</var></sub>,
179: * where <var>d</var> is the number of hash functions specified at creation time, if
180: * the number of the elements in the filter is less than <var>n</var>, the number
181: * of expected elements specified at creation time.
182: *
183: * @param s a character sequence.
184: * @return true if the sequence is in the filter (or if a sequence with the
185: * same hash sequence is in the filter).
186: */
187:
188: public boolean contains(final CharSequence s) {
189: int i = d, l = s.length();
190: while (i-- != 0)
191: if (!getBit(hash(s, l, i)))
192: return false;
193: return true;
194: }
195:
196: /** Adds a character sequence to the filter.
197: *
198: * @param s a character sequence.
199: * @return true if the character sequence was not in the filter (but see {@link #contains(CharSequence)}).
200: */
201:
202: public boolean add(final CharSequence s) {
203: boolean result = false;
204: int i = d, l = s.length();
205: int h;
206: while (i-- != 0) {
207: h = hash(s, l, i);
208: if (!setGetBit(h))
209: result = true;
210: }
211: if (result)
212: size++;
213: return result;
214: }
215:
216: protected final static int ADDRESS_BITS_PER_UNIT = 5; // 32=2^5
217: protected final static int BIT_INDEX_MASK = 31; // = BITS_PER_UNIT - 1;
218:
219: /**
220: * Returns from the local bitvector the value of the bit with
221: * the specified index. The value is <tt>true</tt> if the bit
222: * with the index <tt>bitIndex</tt> is currently set; otherwise,
223: * returns <tt>false</tt>.
224: *
225: * (adapted from cern.colt.bitvector.QuickBitVector)
226: *
227: * @param bitIndex the bit index.
228: * @return the value of the bit with the specified index.
229: */
230: protected boolean getBit(int bitIndex) {
231: int intIndex = (int) (bitIndex >>> ADDRESS_BITS_PER_UNIT);
232: return ((bits[intIndex >>> aShift][intIndex & bMask] & (1 << (bitIndex & BIT_INDEX_MASK))) != 0);
233: }
234:
235: /**
236: * Changes the bit with index <tt>bitIndex</tt> in local bitvector.
237: *
238: * (adapted from cern.colt.bitvector.QuickBitVector)
239: *
240: * @param bitIndex the index of the bit to be set.
241: */
242: protected void setBit(int bitIndex) {
243: int intIndex = (int) (bitIndex >>> ADDRESS_BITS_PER_UNIT);
244: bits[intIndex >>> aShift][intIndex & bMask] |= 1 << (bitIndex & BIT_INDEX_MASK);
245: }
246:
247: /**
248: * Sets the bit with index <tt>bitIndex</tt> in local bitvector --
249: * returning the old value.
250: *
251: * (adapted from cern.colt.bitvector.QuickBitVector)
252: *
253: * @param bitIndex the index of the bit to be set.
254: */
255: protected boolean setGetBit(int bitIndex) {
256: int intIndex = (int) (bitIndex >>> ADDRESS_BITS_PER_UNIT);
257: int a = intIndex >>> aShift;
258: int b = intIndex & bMask;
259: int mask = 1 << (bitIndex & BIT_INDEX_MASK);
260: boolean ret = ((bits[a][b] & (mask)) != 0);
261: bits[a][b] |= mask;
262: return ret;
263: }
264:
265: /* (non-Javadoc)
266: * @see org.archive.util.BloomFilter#getSizeBytes()
267: */
268: public long getSizeBytes() {
269: return bits.length * bits[0].length * 4;
270: }
271: }
|