|
| 1 | +/* |
| 2 | + * Licensed to the Apache Software Foundation (ASF) under one or more |
| 3 | + * contributor license agreements. See the NOTICE file distributed with |
| 4 | + * this work for additional information regarding copyright ownership. |
| 5 | + * The ASF licenses this file to You under the Apache License, Version 2.0 |
| 6 | + * (the "License"); you may not use this file except in compliance with |
| 7 | + * the License. You may obtain a copy of the License at |
| 8 | + * |
| 9 | + * http://www.apache.org/licenses/LICENSE-2.0 |
| 10 | + * |
| 11 | + * Unless required by applicable law or agreed to in writing, software |
| 12 | + * distributed under the License is distributed on an "AS IS" BASIS, |
| 13 | + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 14 | + * See the License for the specific language governing permissions and |
| 15 | + * limitations under the License. |
| 16 | + */ |
| 17 | + |
| 18 | +package org.apache.spark.util.sketch; |
| 19 | + |
| 20 | +import java.io.IOException; |
| 21 | +import java.io.InputStream; |
| 22 | +import java.io.OutputStream; |
| 23 | + |
| 24 | +/** |
| 25 | + * A Bloom filter is a space-efficient probabilistic data structure that offers an approximate |
| 26 | + * containment test with one-sided error: if it claims that an item is contained in it, this |
| 27 | + * might be in error, but if it claims that an item is <i>not</i> contained in it, then this is |
| 28 | + * definitely true. Currently supported data types include: |
| 29 | + * <ul> |
| 30 | + * <li>{@link Byte}</li> |
| 31 | + * <li>{@link Short}</li> |
| 32 | + * <li>{@link Integer}</li> |
| 33 | + * <li>{@link Long}</li> |
| 34 | + * <li>{@link String}</li> |
| 35 | + * </ul> |
| 36 | + * The false positive probability ({@code FPP}) of a Bloom filter is defined as the probability that |
| 37 | + * {@linkplain #mightContain(Object)} will erroneously return {@code true} for an object that has |
| 38 | + * not actually been put in the {@code BloomFilter}. |
| 39 | + * |
| 40 | + * The implementation is largely based on the {@code BloomFilter} class from Guava. |
| 41 | + */ |
| 42 | +public abstract class BloomFilter { |
| 43 | + |
| 44 | + public enum Version { |
| 45 | + /** |
| 46 | + * {@code BloomFilter} binary format version 1. All values written in big-endian order: |
| 47 | + * <ul> |
| 48 | + * <li>Version number, always 1 (32 bit)</li> |
| 49 | + * <li>Number of hash functions (32 bit)</li> |
| 50 | + * <li>Total number of words of the underlying bit array (32 bit)</li> |
| 51 | + * <li>The words/longs (numWords * 64 bit)</li> |
| 52 | + * </ul> |
| 53 | + */ |
| 54 | + V1(1); |
| 55 | + |
| 56 | + private final int versionNumber; |
| 57 | + |
| 58 | + Version(int versionNumber) { |
| 59 | + this.versionNumber = versionNumber; |
| 60 | + } |
| 61 | + |
| 62 | + int getVersionNumber() { |
| 63 | + return versionNumber; |
| 64 | + } |
| 65 | + } |
| 66 | + |
| 67 | + /** |
| 68 | + * Returns the probability that {@linkplain #mightContain(Object)} erroneously return {@code true} |
| 69 | + * for an object that has not actually been put in the {@code BloomFilter}. |
| 70 | + * |
| 71 | + * Ideally, this number should be close to the {@code fpp} parameter passed in |
| 72 | + * {@linkplain #create(long, double)}, or smaller. If it is significantly higher, it is usually |
| 73 | + * the case that too many items (more than expected) have been put in the {@code BloomFilter}, |
| 74 | + * degenerating it. |
| 75 | + */ |
| 76 | + public abstract double expectedFpp(); |
| 77 | + |
| 78 | + /** |
| 79 | + * Returns the number of bits in the underlying bit array. |
| 80 | + */ |
| 81 | + public abstract long bitSize(); |
| 82 | + |
| 83 | + /** |
| 84 | + * Puts an item into this {@code BloomFilter}. Ensures that subsequent invocations of |
| 85 | + * {@linkplain #mightContain(Object)} with the same item will always return {@code true}. |
| 86 | + * |
| 87 | + * @return true if the bloom filter's bits changed as a result of this operation. If the bits |
| 88 | + * changed, this is <i>definitely</i> the first time {@code object} has been added to the |
| 89 | + * filter. If the bits haven't changed, this <i>might</i> be the first time {@code object} |
| 90 | + * has been added to the filter. Note that {@code put(t)} always returns the |
| 91 | + * <i>opposite</i> result to what {@code mightContain(t)} would have returned at the time |
| 92 | + * it is called. |
| 93 | + */ |
| 94 | + public abstract boolean put(Object item); |
| 95 | + |
| 96 | + /** |
| 97 | + * A specialized variant of {@link #put(Object)} that only supports {@code String} items. |
| 98 | + */ |
| 99 | + public abstract boolean putString(String item); |
| 100 | + |
| 101 | + /** |
| 102 | + * A specialized variant of {@link #put(Object)} that only supports {@code long} items. |
| 103 | + */ |
| 104 | + public abstract boolean putLong(long item); |
| 105 | + |
| 106 | + /** |
| 107 | + * A specialized variant of {@link #put(Object)} that only supports byte array items. |
| 108 | + */ |
| 109 | + public abstract boolean putBinary(byte[] item); |
| 110 | + |
| 111 | + /** |
| 112 | + * Determines whether a given bloom filter is compatible with this bloom filter. For two |
| 113 | + * bloom filters to be compatible, they must have the same bit size. |
| 114 | + * |
| 115 | + * @param other The bloom filter to check for compatibility. |
| 116 | + */ |
| 117 | + public abstract boolean isCompatible(BloomFilter other); |
| 118 | + |
| 119 | + /** |
| 120 | + * Combines this bloom filter with another bloom filter by performing a bitwise OR of the |
| 121 | + * underlying data. The mutations happen to <b>this</b> instance. Callers must ensure the |
| 122 | + * bloom filters are appropriately sized to avoid saturating them. |
| 123 | + * |
| 124 | + * @param other The bloom filter to combine this bloom filter with. It is not mutated. |
| 125 | + * @throws IncompatibleMergeException if {@code isCompatible(other) == false} |
| 126 | + */ |
| 127 | + public abstract BloomFilter mergeInPlace(BloomFilter other) throws IncompatibleMergeException; |
| 128 | + |
| 129 | + /** |
| 130 | + * Combines this bloom filter with another bloom filter by performing a bitwise AND of the |
| 131 | + * underlying data. The mutations happen to <b>this</b> instance. Callers must ensure the |
| 132 | + * bloom filters are appropriately sized to avoid saturating them. |
| 133 | + * |
| 134 | + * @param other The bloom filter to combine this bloom filter with. It is not mutated. |
| 135 | + * @throws IncompatibleMergeException if {@code isCompatible(other) == false} |
| 136 | + */ |
| 137 | + public abstract BloomFilter intersectInPlace(BloomFilter other) throws IncompatibleMergeException; |
| 138 | + |
| 139 | + /** |
| 140 | + * Returns {@code true} if the element <i>might</i> have been put in this Bloom filter, |
| 141 | + * {@code false} if this is <i>definitely</i> not the case. |
| 142 | + */ |
| 143 | + public abstract boolean mightContain(Object item); |
| 144 | + |
| 145 | + /** |
| 146 | + * A specialized variant of {@link #mightContain(Object)} that only tests {@code String} items. |
| 147 | + */ |
| 148 | + public abstract boolean mightContainString(String item); |
| 149 | + |
| 150 | + /** |
| 151 | + * A specialized variant of {@link #mightContain(Object)} that only tests {@code long} items. |
| 152 | + */ |
| 153 | + public abstract boolean mightContainLong(long item); |
| 154 | + |
| 155 | + /** |
| 156 | + * A specialized variant of {@link #mightContain(Object)} that only tests byte array items. |
| 157 | + */ |
| 158 | + public abstract boolean mightContainBinary(byte[] item); |
| 159 | + |
| 160 | + /** |
| 161 | + * Writes out this {@link BloomFilter} to an output stream in binary format. It is the caller's |
| 162 | + * responsibility to close the stream. |
| 163 | + */ |
| 164 | + public abstract void writeTo(OutputStream out) throws IOException; |
| 165 | + |
| 166 | + /** |
| 167 | + * @return the number of set bits in this {@link BloomFilter}. |
| 168 | + */ |
| 169 | + public long cardinality() { |
| 170 | + throw new UnsupportedOperationException("Not implemented"); |
| 171 | + } |
| 172 | + |
| 173 | + /** |
| 174 | + * Reads in a {@link BloomFilter} from an input stream. It is the caller's responsibility to close |
| 175 | + * the stream. |
| 176 | + */ |
| 177 | + public static BloomFilter readFrom(InputStream in) throws IOException { |
| 178 | + return BloomFilterImpl.readFrom(in); |
| 179 | + } |
| 180 | + |
| 181 | + /** |
| 182 | + * Computes the optimal k (number of hashes per item inserted in Bloom filter), given the |
| 183 | + * expected insertions and total number of bits in the Bloom filter. |
| 184 | + * |
| 185 | + * See http://en.wikipedia.org/wiki/File:Bloom_filter_fp_probability.svg for the formula. |
| 186 | + * |
| 187 | + * @param n expected insertions (must be positive) |
| 188 | + * @param m total number of bits in Bloom filter (must be positive) |
| 189 | + */ |
| 190 | + private static int optimalNumOfHashFunctions(long n, long m) { |
| 191 | + // (m / n) * log(2), but avoid truncation due to division! |
| 192 | + return Math.max(1, (int) Math.round((double) m / n * Math.log(2))); |
| 193 | + } |
| 194 | + |
| 195 | + /** |
| 196 | + * Computes m (total bits of Bloom filter) which is expected to achieve, for the specified |
| 197 | + * expected insertions, the required false positive probability. |
| 198 | + * |
| 199 | + * See http://en.wikipedia.org/wiki/Bloom_filter#Probability_of_false_positives for the formula. |
| 200 | + * |
| 201 | + * @param n expected insertions (must be positive) |
| 202 | + * @param p false positive rate (must be 0 < p < 1) |
| 203 | + */ |
| 204 | + private static long optimalNumOfBits(long n, double p) { |
| 205 | + return (long) (-n * Math.log(p) / (Math.log(2) * Math.log(2))); |
| 206 | + } |
| 207 | + |
| 208 | + static final double DEFAULT_FPP = 0.03; |
| 209 | + |
| 210 | + /** |
| 211 | + * Creates a {@link BloomFilter} with the expected number of insertions and a default expected |
| 212 | + * false positive probability of 3%. |
| 213 | + * |
| 214 | + * Note that overflowing a {@code BloomFilter} with significantly more elements than specified, |
| 215 | + * will result in its saturation, and a sharp deterioration of its false positive probability. |
| 216 | + */ |
| 217 | + public static BloomFilter create(long expectedNumItems) { |
| 218 | + return create(expectedNumItems, DEFAULT_FPP); |
| 219 | + } |
| 220 | + |
| 221 | + /** |
| 222 | + * Creates a {@link BloomFilter} with the expected number of insertions and expected false |
| 223 | + * positive probability. |
| 224 | + * |
| 225 | + * Note that overflowing a {@code BloomFilter} with significantly more elements than specified, |
| 226 | + * will result in its saturation, and a sharp deterioration of its false positive probability. |
| 227 | + */ |
| 228 | + public static BloomFilter create(long expectedNumItems, double fpp) { |
| 229 | + if (fpp <= 0D || fpp >= 1D) { |
| 230 | + throw new IllegalArgumentException( |
| 231 | + "False positive probability must be within range (0.0, 1.0)" |
| 232 | + ); |
| 233 | + } |
| 234 | + |
| 235 | + return create(expectedNumItems, optimalNumOfBits(expectedNumItems, fpp)); |
| 236 | + } |
| 237 | + |
| 238 | + /** |
| 239 | + * Creates a {@link BloomFilter} with given {@code expectedNumItems} and {@code numBits}, it will |
| 240 | + * pick an optimal {@code numHashFunctions} which can minimize {@code fpp} for the bloom filter. |
| 241 | + */ |
| 242 | + public static BloomFilter create(long expectedNumItems, long numBits) { |
| 243 | + if (expectedNumItems <= 0) { |
| 244 | + throw new IllegalArgumentException("Expected insertions must be positive"); |
| 245 | + } |
| 246 | + |
| 247 | + if (numBits <= 0) { |
| 248 | + throw new IllegalArgumentException("Number of bits must be positive"); |
| 249 | + } |
| 250 | + |
| 251 | + return new BloomFilterImpl(optimalNumOfHashFunctions(expectedNumItems, numBits), numBits); |
| 252 | + } |
| 253 | +} |
0 commit comments