001: /* BloomFilter32bit
002: *
003: * $Id: BloomFilter32bitSplit.java 4644 2006-09-20 22:40:21Z paul_jack $
004: *
005: * Created on Jun 21, 2005
006: *
007: * Copyright (C) 2005 Internet Archive; a slight adaptation of
008: * LGPL work (C) Sebastiano Vigna
009: *
010: * This file is part of the Heritrix web crawler (crawler.archive.org).
011: *
012: * Heritrix is free software; you can redistribute it and/or modify
013: * it under the terms of the GNU Lesser Public License as published by
014: * the Free Software Foundation; either version 2.1 of the License, or
015: * any later version.
016: *
017: * Heritrix is distributed in the hope that it will be useful,
018: * but WITHOUT ANY WARRANTY; without even the implied warranty of
019: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
020: * GNU Lesser Public License for more details.
021: *
022: * You should have received a copy of the GNU Lesser Public License
023: * along with Heritrix; if not, write to the Free Software
024: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
025: */
026:
027: package org.archive.util;
028:
029: import java.io.Serializable;
030: import java.security.SecureRandom;
031:
032: /** A Bloom filter.
033: *
034: * SLIGHTLY ADAPTED VERSION OF MG4J it.unimi.dsi.mg4j.util.BloomFilter
035: *
036: * <p>KEY CHANGES:
037: *
038: * <ul>
039: * <li>Adapted to use 32bit ops as much as possible... may be slightly
040: * faster on 32bit hardware/OS</li>
041: * <li>NUMBER_OF_WEIGHTS is 2083, to better avoid collisions between
042: * similar strings</li>
043: * <li>Removed dependence on cern.colt MersenneTwister (replaced with
044: * SecureRandom) and QuickBitVector (replaced with local methods).</li>
045: * </ul>
046: *
047: * <hr>
048: *
049: * <P>Instances of this class represent a set of character sequences (with false positives)
050: * using a Bloom filter. Because of the way Bloom filters work,
051: * you cannot remove elements.
052: *
053: * <P>Bloom filters have an expected error rate, depending on the number
054: * of hash functions used, on the filter size and on the number of elements in the filter. This implementation
055: * uses a variable optimal number of hash functions, depending on the expected
056: * number of elements. More precisely, a Bloom
057: * filter for <var>n</var> character sequences with <var>d</var> hash functions will use
058: * ln 2 <var>d</var><var>n</var> ≈ 1.44 <var>d</var><var>n</var> bits;
059: * false positives will happen with probability 2<sup>-<var>d</var></sup>.
060: *
061: * <P>Hash functions are generated at creation time using universal hashing. Each hash function
062: * uses {@link #NUMBER_OF_WEIGHTS} random integers, which are cyclically multiplied by
063: * the character codes in a character sequence. The resulting integers are XOR-ed together.
064: *
065: * <P>This class exports access methods that are very similar to those of {@link java.util.Set},
066: * but it does not implement that interface, as too many non-optional methods
067: * would be unimplementable (e.g., iterators).
068: *
069: * @author Sebastiano Vigna
070: */
071: public class BloomFilter32bitSplit implements Serializable, BloomFilter {
072:
073: private static final long serialVersionUID = -164106965277863971L;
074:
075: /** The number of weights used to create hash functions. */
076: final public static int NUMBER_OF_WEIGHTS = 2083; // CHANGED FROM 16
077: /** The number of bits in this filter. */
078: final public long m;
079: /** The number of hash functions used by this filter. */
080: final public int d;
081: /** The underlying bit vectorS. */
082: // final private int[] bits;
083: final private int[][] bits;
084: /** The random integers used to generate the hash functions. */
085: final private int[][] weight;
086:
087: /** The number of elements currently in the filter. It may be
088: * smaller than the actual number of additions of distinct character
089: * sequences because of false positives.
090: */
091: private int size;
092:
093: /** The natural logarithm of 2, used in the computation of the number of bits. */
094: private final static double NATURAL_LOG_OF_2 = Math.log(2);
095:
096: /** number of ints in 1MB. */
097: private final static int ONE_MB_INTS = 1 << 18; //
098:
099: private final static boolean DEBUG = false;
100:
101: /** Creates a new Bloom filter with given number of hash functions and expected number of elements.
102: *
103: * @param n the expected number of elements.
104: * @param d the number of hash functions; if the filter add not more than <code>n</code> elements,
105: * false positives will happen with probability 2<sup>-<var>d</var></sup>.
106: */
107: public BloomFilter32bitSplit(final int n, final int d) {
108: this .d = d;
109: int len = (int) Math
110: .ceil(((long) n * (long) d / NATURAL_LOG_OF_2) / 32);
111: // round up to ensure divisible into 1MiB chunks
112: len = ((len / ONE_MB_INTS) + 1) * ONE_MB_INTS;
113: this .m = len * 32L;
114: if (m >= 1L << 32) {
115: throw new IllegalArgumentException(
116: "This filter would require " + m + " bits");
117: }
118: // bits = new int[ len ];
119: bits = new int[len / ONE_MB_INTS][ONE_MB_INTS];
120:
121: if (DEBUG)
122: System.err.println("Number of bits: " + m);
123:
124: // seeded for reproduceable behavior in repeated runs; BUT:
125: // SecureRandom's default implementation (as of 1.5)
126: // seems to mix in its own seeding.
127: final SecureRandom random = new SecureRandom(new byte[] { 19,
128: 96 });
129: weight = new int[d][];
130: for (int i = 0; i < d; i++) {
131: weight[i] = new int[NUMBER_OF_WEIGHTS];
132: for (int j = 0; j < NUMBER_OF_WEIGHTS; j++)
133: weight[i][j] = random.nextInt();
134: }
135: }
136:
137: /** The number of character sequences in the filter.
138: *
139: * @return the number of character sequences in the filter (but see {@link #contains(CharSequence)}).
140: */
141:
142: public int size() {
143: return size;
144: }
145:
146: /** Hashes the given sequence with the given hash function.
147: *
148: * @param s a character sequence.
149: * @param l the length of <code>s</code>.
150: * @param k a hash function index (smaller than {@link #d}).
151: * @return the position in the filter corresponding to <code>s</code> for the hash function <code>k</code>.
152: */
153: private long hash(final CharSequence s, final int l, final int k) {
154: final int[] w = weight[k];
155: int h = 0, i = l;
156: while (i-- != 0)
157: h ^= s.charAt(i) * w[i % NUMBER_OF_WEIGHTS];
158: return ((long) h - Integer.MIN_VALUE) % m;
159: }
160:
161: /** Checks whether the given character sequence is in this filter.
162: *
163: * <P>Note that this method may return true on a character sequence that is has
164: * not been added to the filter. This will happen with probability 2<sub>-<var>d</var></sub>,
165: * where <var>d</var> is the number of hash functions specified at creation time, if
166: * the number of the elements in the filter is less than <var>n</var>, the number
167: * of expected elements specified at creation time.
168: *
169: * @param s a character sequence.
170: * @return true if the sequence is in the filter (or if a sequence with the
171: * same hash sequence is in the filter).
172: */
173:
174: public boolean contains(final CharSequence s) {
175: int i = d, l = s.length();
176: while (i-- != 0)
177: if (!getBit(hash(s, l, i)))
178: return false;
179: return true;
180: }
181:
182: /** Adds a character sequence to the filter.
183: *
184: * @param s a character sequence.
185: * @return true if the character sequence was not in the filter (but see {@link #contains(CharSequence)}).
186: */
187:
188: public boolean add(final CharSequence s) {
189: boolean result = false;
190: int i = d, l = s.length();
191: long h;
192: while (i-- != 0) {
193: h = hash(s, l, i);
194: if (!setGetBit(h))
195: result = true;
196: }
197: if (result)
198: size++;
199: return result;
200: }
201:
202: protected final static long ADDRESS_BITS_PER_UNIT = 5; // 32=2^5
203: protected final static long BIT_INDEX_MASK = 31; // = BITS_PER_UNIT - 1;
204:
205: /**
206: * Returns from the local bitvector the value of the bit with
207: * the specified index. The value is <tt>true</tt> if the bit
208: * with the index <tt>bitIndex</tt> is currently set; otherwise,
209: * returns <tt>false</tt>.
210: *
211: * (adapted from cern.colt.bitvector.QuickBitVector)
212: *
213: * @param bitIndex the bit index.
214: * @return the value of the bit with the specified index.
215: */
216: protected boolean getBit(long bitIndex) {
217: int intIndex = (int) (bitIndex >>> ADDRESS_BITS_PER_UNIT);
218: return ((bits[intIndex / ONE_MB_INTS][intIndex % ONE_MB_INTS] & (1 << (bitIndex & BIT_INDEX_MASK))) != 0);
219: }
220:
221: /**
222: * Changes the bit with index <tt>bitIndex</tt> in local bitvector.
223: *
224: * (adapted from cern.colt.bitvector.QuickBitVector)
225: *
226: * @param bitIndex the index of the bit to be set.
227: */
228: protected void setBit(long bitIndex) {
229: int intIndex = (int) (bitIndex >>> ADDRESS_BITS_PER_UNIT);
230: bits[intIndex / ONE_MB_INTS][intIndex % ONE_MB_INTS] |= 1 << (bitIndex & BIT_INDEX_MASK);
231: }
232:
233: /**
234: * Sets the bit with index <tt>bitIndex</tt> in local bitvector --
235: * returning the old value.
236: *
237: * (adapted from cern.colt.bitvector.QuickBitVector)
238: *
239: * @param bitIndex the index of the bit to be set.
240: */
241: protected boolean setGetBit(long bitIndex) {
242: int intIndex = (int) (bitIndex >>> ADDRESS_BITS_PER_UNIT);
243: int a = intIndex / ONE_MB_INTS;
244: int b = intIndex % ONE_MB_INTS;
245: int mask = 1 << (bitIndex & BIT_INDEX_MASK);
246: boolean ret = ((bits[a][b] & (mask)) != 0);
247: bits[a][b] |= mask;
248: return ret;
249: }
250:
251: /* (non-Javadoc)
252: * @see org.archive.util.BloomFilter#getSizeBytes()
253: */
254: public long getSizeBytes() {
255: return bits.length * bits[0].length * 4;
256: }
257: }
|