001: /* BloomFilter
002: *
003: * $Id: BloomFilter32bp2.java 4644 2006-09-20 22:40:21Z paul_jack $
004: *
005: * Created on Jun 21, 2005
006: *
007: * Copyright (C) 2005 Internet Archive; a slight adaptation of
008: * LGPL work (C) Sebastiano Vigna
009: *
010: * This file is part of the Heritrix web crawler (crawler.archive.org).
011: *
012: * Heritrix is free software; you can redistribute it and/or modify
013: * it under the terms of the GNU Lesser Public License as published by
014: * the Free Software Foundation; either version 2.1 of the License, or
015: * any later version.
016: *
017: * Heritrix is distributed in the hope that it will be useful,
018: * but WITHOUT ANY WARRANTY; without even the implied warranty of
019: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
020: * GNU Lesser Public License for more details.
021: *
022: * You should have received a copy of the GNU Lesser Public License
023: * along with Heritrix; if not, write to the Free Software
024: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
025: */
026:
027: package org.archive.util;
028:
029: import java.io.Serializable;
030: import java.security.SecureRandom;
031:
032: /** A Bloom filter.
033: *
034: * SLIGHTLY ADAPTED VERSION OF MG4J it.unimi.dsi.mg4j.util.BloomFilter
035: *
036: * <p>KEY CHANGES:
037: *
038: * <ul>
039: * <li>Adapted to use 32bit ops as much as possible... may be slightly
040: * faster on 32bit hardware/OS</li>
041: * <li>Changed to use bitfield that is a power-of-two in size, allowing
042: * hash() to use bitshifting rather than modulus... may be slightly
043: * faster</li>
044: * <li>NUMBER_OF_WEIGHTS is 2083, to better avoid collisions between
045: * similar strings</li>
046: * <li>Removed dependence on cern.colt MersenneTwister (replaced with
047: * SecureRandom) and QuickBitVector (replaced with local methods).</li>
048: * </ul>
049: *
050: * <hr>
051: *
052: * <P>Instances of this class represent a set of character sequences (with false positives)
053: * using a Bloom filter. Because of the way Bloom filters work,
054: * you cannot remove elements.
055: *
056: * <P>Bloom filters have an expected error rate, depending on the number
057: * of hash functions used, on the filter size and on the number of elements in the filter. This implementation
058: * uses a variable optimal number of hash functions, depending on the expected
059: * number of elements. More precisely, a Bloom
060: * filter for <var>n</var> character sequences with <var>d</var> hash functions will use
061: * ln 2 <var>d</var><var>n</var> ≈ 1.44 <var>d</var><var>n</var> bits;
062: * false positives will happen with probability 2<sup>-<var>d</var></sup>.
063: *
064: * <P>Hash functions are generated at creation time using universal hashing. Each hash function
065: * uses {@link #NUMBER_OF_WEIGHTS} random integers, which are cyclically multiplied by
066: * the character codes in a character sequence. The resulting integers are XOR-ed together.
067: *
068: * <P>This class exports access methods that are very similar to those of {@link java.util.Set},
069: * but it does not implement that interface, as too many non-optional methods
070: * would be unimplementable (e.g., iterators).
071: *
072: * @author Sebastiano Vigna
073: */
074: public class BloomFilter32bp2 implements Serializable, BloomFilter {
075:
076: private static final long serialVersionUID = -2292902803681146635L;
077:
078: /** The number of weights used to create hash functions. */
079: final public static int NUMBER_OF_WEIGHTS = 2083; // CHANGED FROM 16
080: /** The number of bits in this filter. */
081: final public long m;
082: /** the power-of-two that m is */
083: final public long power; // 1<<power == m
084: /** The number of hash functions used by this filter. */
085: final public int d;
086: /** The underlying bit vectorS. */
087: final private int[] bits;
088: /** The random integers used to generate the hash functions. */
089: final private int[][] weight;
090:
091: /** The number of elements currently in the filter. It may be
092: * smaller than the actual number of additions of distinct character
093: * sequences because of false positives.
094: */
095: private int size;
096:
097: /** The natural logarithm of 2, used in the computation of the number of bits. */
098: private final static double NATURAL_LOG_OF_2 = Math.log(2);
099:
100: private final static boolean DEBUG = false;
101:
102: /** Creates a new Bloom filter with given number of hash functions and expected number of elements.
103: *
104: * @param n the expected number of elements.
105: * @param d the number of hash functions; if the filter add not more than <code>n</code> elements,
106: * false positives will happen with probability 2<sup>-<var>d</var></sup>.
107: */
108: public BloomFilter32bp2(final int n, final int d) {
109: this .d = d;
110: long minBits = (long) ((long) n * (long) d / NATURAL_LOG_OF_2);
111: long pow = 0;
112: while ((1L << pow) < minBits) {
113: pow++;
114: }
115: this .power = pow;
116: this .m = 1L << pow;
117: int len = (int) (m / 32);
118: if (m > 1L << 32) {
119: throw new IllegalArgumentException(
120: "This filter would require " + m + " bits");
121: }
122: System.out.println("power " + power + " bits " + m + " len "
123: + len);
124:
125: bits = new int[len];
126:
127: if (DEBUG)
128: System.err.println("Number of bits: " + m);
129:
130: // seeded for reproduceable behavior in repeated runs; BUT:
131: // SecureRandom's default implementation (as of 1.5)
132: // seems to mix in its own seeding.
133: final SecureRandom random = new SecureRandom(new byte[] { 19,
134: 96 });
135: weight = new int[d][];
136: for (int i = 0; i < d; i++) {
137: weight[i] = new int[NUMBER_OF_WEIGHTS];
138: for (int j = 0; j < NUMBER_OF_WEIGHTS; j++)
139: weight[i][j] = random.nextInt();
140: }
141: }
142:
143: /** The number of character sequences in the filter.
144: *
145: * @return the number of character sequences in the filter (but see {@link #contains(CharSequence)}).
146: */
147:
148: public int size() {
149: return size;
150: }
151:
152: /** Hashes the given sequence with the given hash function.
153: *
154: * @param s a character sequence.
155: * @param l the length of <code>s</code>.
156: * @param k a hash function index (smaller than {@link #d}).
157: * @return the position in the filter corresponding to <code>s</code> for the hash function <code>k</code>.
158: */
159: private int hash(final CharSequence s, final int l, final int k) {
160: final int[] w = weight[k];
161: int h = 0, i = l;
162: while (i-- != 0)
163: h ^= s.charAt(i) * w[i % NUMBER_OF_WEIGHTS];
164: return h >>> (32 - power);
165: }
166:
167: /** Checks whether the given character sequence is in this filter.
168: *
169: * <P>Note that this method may return true on a character sequence that is has
170: * not been added to the filter. This will happen with probability 2<sub>-<var>d</var></sub>,
171: * where <var>d</var> is the number of hash functions specified at creation time, if
172: * the number of the elements in the filter is less than <var>n</var>, the number
173: * of expected elements specified at creation time.
174: *
175: * @param s a character sequence.
176: * @return true if the sequence is in the filter (or if a sequence with the
177: * same hash sequence is in the filter).
178: */
179:
180: public boolean contains(final CharSequence s) {
181: int i = d, l = s.length();
182: while (i-- != 0)
183: if (!getBit(hash(s, l, i)))
184: return false;
185: return true;
186: }
187:
188: /** Adds a character sequence to the filter.
189: *
190: * @param s a character sequence.
191: * @return true if the character sequence was not in the filter (but see {@link #contains(CharSequence)}).
192: */
193:
194: public boolean add(final CharSequence s) {
195: boolean result = false;
196: int i = d, l = s.length();
197: int h;
198: while (i-- != 0) {
199: h = hash(s, l, i);
200: if (!getBit(h))
201: result = true;
202: setBit(h);
203: }
204: if (result)
205: size++;
206: return result;
207: }
208:
209: protected final static int ADDRESS_BITS_PER_UNIT = 5; // 32=2^5
210: protected final static int BIT_INDEX_MASK = 31; // = BITS_PER_UNIT - 1;
211:
212: /**
213: * Returns from the local bitvector the value of the bit with
214: * the specified index. The value is <tt>true</tt> if the bit
215: * with the index <tt>bitIndex</tt> is currently set; otherwise,
216: * returns <tt>false</tt>.
217: *
218: * (adapted from cern.colt.bitvector.QuickBitVector)
219: *
220: * @param bitIndex the bit index.
221: * @return the value of the bit with the specified index.
222: */
223: protected boolean getBit(int bitIndex) {
224: return ((bits[(int) (bitIndex >>> ADDRESS_BITS_PER_UNIT)] & (1 << (bitIndex & BIT_INDEX_MASK))) != 0);
225: }
226:
227: /**
228: * Changes the bit with index <tt>bitIndex</tt> in local bitvector.
229: *
230: * (adapted from cern.colt.bitvector.QuickBitVector)
231: *
232: * @param bitIndex the index of the bit to be set.
233: */
234: protected void setBit(int bitIndex) {
235: bits[(int) (bitIndex >>> ADDRESS_BITS_PER_UNIT)] |= 1 << (bitIndex & BIT_INDEX_MASK);
236: }
237:
238: /* (non-Javadoc)
239: * @see org.archive.util.BloomFilter#getSizeBytes()
240: */
241: public long getSizeBytes() {
242: return bits.length * 4;
243: }
244: }
|