Skip to content
This repository was archived by the owner on Sep 18, 2023. It is now read-only.

Commit fbcfc56

Browse files
committed
backport runtime filter
Signed-off-by: Yuan Zhou <[email protected]>
1 parent 86e1a91 commit fbcfc56

File tree

10 files changed

+8580
-0
lines changed

10 files changed

+8580
-0
lines changed
Lines changed: 253 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,253 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one or more
3+
* contributor license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright ownership.
5+
* The ASF licenses this file to You under the Apache License, Version 2.0
6+
* (the "License"); you may not use this file except in compliance with
7+
* the License. You may obtain a copy of the License at
8+
*
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*/
17+
18+
package org.apache.spark.util.sketch;
19+
20+
import java.io.IOException;
21+
import java.io.InputStream;
22+
import java.io.OutputStream;
23+
24+
/**
25+
* A Bloom filter is a space-efficient probabilistic data structure that offers an approximate
26+
* containment test with one-sided error: if it claims that an item is contained in it, this
27+
* might be in error, but if it claims that an item is <i>not</i> contained in it, then this is
28+
* definitely true. Currently supported data types include:
29+
* <ul>
30+
* <li>{@link Byte}</li>
31+
* <li>{@link Short}</li>
32+
* <li>{@link Integer}</li>
33+
* <li>{@link Long}</li>
34+
* <li>{@link String}</li>
35+
* </ul>
36+
* The false positive probability ({@code FPP}) of a Bloom filter is defined as the probability that
37+
* {@linkplain #mightContain(Object)} will erroneously return {@code true} for an object that has
38+
* not actually been put in the {@code BloomFilter}.
39+
*
40+
* The implementation is largely based on the {@code BloomFilter} class from Guava.
41+
*/
42+
public abstract class BloomFilter {
43+
44+
public enum Version {
45+
/**
46+
* {@code BloomFilter} binary format version 1. All values written in big-endian order:
47+
* <ul>
48+
* <li>Version number, always 1 (32 bit)</li>
49+
* <li>Number of hash functions (32 bit)</li>
50+
* <li>Total number of words of the underlying bit array (32 bit)</li>
51+
* <li>The words/longs (numWords * 64 bit)</li>
52+
* </ul>
53+
*/
54+
V1(1);
55+
56+
private final int versionNumber;
57+
58+
Version(int versionNumber) {
59+
this.versionNumber = versionNumber;
60+
}
61+
62+
int getVersionNumber() {
63+
return versionNumber;
64+
}
65+
}
66+
67+
/**
68+
* Returns the probability that {@linkplain #mightContain(Object)} erroneously return {@code true}
69+
* for an object that has not actually been put in the {@code BloomFilter}.
70+
*
71+
* Ideally, this number should be close to the {@code fpp} parameter passed in
72+
* {@linkplain #create(long, double)}, or smaller. If it is significantly higher, it is usually
73+
* the case that too many items (more than expected) have been put in the {@code BloomFilter},
74+
* degenerating it.
75+
*/
76+
public abstract double expectedFpp();
77+
78+
/**
79+
* Returns the number of bits in the underlying bit array.
80+
*/
81+
public abstract long bitSize();
82+
83+
/**
84+
* Puts an item into this {@code BloomFilter}. Ensures that subsequent invocations of
85+
* {@linkplain #mightContain(Object)} with the same item will always return {@code true}.
86+
*
87+
* @return true if the bloom filter's bits changed as a result of this operation. If the bits
88+
* changed, this is <i>definitely</i> the first time {@code object} has been added to the
89+
* filter. If the bits haven't changed, this <i>might</i> be the first time {@code object}
90+
* has been added to the filter. Note that {@code put(t)} always returns the
91+
* <i>opposite</i> result to what {@code mightContain(t)} would have returned at the time
92+
* it is called.
93+
*/
94+
public abstract boolean put(Object item);
95+
96+
/**
97+
* A specialized variant of {@link #put(Object)} that only supports {@code String} items.
98+
*/
99+
public abstract boolean putString(String item);
100+
101+
/**
102+
* A specialized variant of {@link #put(Object)} that only supports {@code long} items.
103+
*/
104+
public abstract boolean putLong(long item);
105+
106+
/**
107+
* A specialized variant of {@link #put(Object)} that only supports byte array items.
108+
*/
109+
public abstract boolean putBinary(byte[] item);
110+
111+
/**
112+
* Determines whether a given bloom filter is compatible with this bloom filter. For two
113+
* bloom filters to be compatible, they must have the same bit size.
114+
*
115+
* @param other The bloom filter to check for compatibility.
116+
*/
117+
public abstract boolean isCompatible(BloomFilter other);
118+
119+
/**
120+
* Combines this bloom filter with another bloom filter by performing a bitwise OR of the
121+
* underlying data. The mutations happen to <b>this</b> instance. Callers must ensure the
122+
* bloom filters are appropriately sized to avoid saturating them.
123+
*
124+
* @param other The bloom filter to combine this bloom filter with. It is not mutated.
125+
* @throws IncompatibleMergeException if {@code isCompatible(other) == false}
126+
*/
127+
public abstract BloomFilter mergeInPlace(BloomFilter other) throws IncompatibleMergeException;
128+
129+
/**
130+
* Combines this bloom filter with another bloom filter by performing a bitwise AND of the
131+
* underlying data. The mutations happen to <b>this</b> instance. Callers must ensure the
132+
* bloom filters are appropriately sized to avoid saturating them.
133+
*
134+
* @param other The bloom filter to combine this bloom filter with. It is not mutated.
135+
* @throws IncompatibleMergeException if {@code isCompatible(other) == false}
136+
*/
137+
public abstract BloomFilter intersectInPlace(BloomFilter other) throws IncompatibleMergeException;
138+
139+
/**
140+
* Returns {@code true} if the element <i>might</i> have been put in this Bloom filter,
141+
* {@code false} if this is <i>definitely</i> not the case.
142+
*/
143+
public abstract boolean mightContain(Object item);
144+
145+
/**
146+
* A specialized variant of {@link #mightContain(Object)} that only tests {@code String} items.
147+
*/
148+
public abstract boolean mightContainString(String item);
149+
150+
/**
151+
* A specialized variant of {@link #mightContain(Object)} that only tests {@code long} items.
152+
*/
153+
public abstract boolean mightContainLong(long item);
154+
155+
/**
156+
* A specialized variant of {@link #mightContain(Object)} that only tests byte array items.
157+
*/
158+
public abstract boolean mightContainBinary(byte[] item);
159+
160+
/**
161+
* Writes out this {@link BloomFilter} to an output stream in binary format. It is the caller's
162+
* responsibility to close the stream.
163+
*/
164+
public abstract void writeTo(OutputStream out) throws IOException;
165+
166+
/**
167+
* @return the number of set bits in this {@link BloomFilter}.
168+
*/
169+
public long cardinality() {
170+
throw new UnsupportedOperationException("Not implemented");
171+
}
172+
173+
/**
174+
* Reads in a {@link BloomFilter} from an input stream. It is the caller's responsibility to close
175+
* the stream.
176+
*/
177+
public static BloomFilter readFrom(InputStream in) throws IOException {
178+
return BloomFilterImpl.readFrom(in);
179+
}
180+
181+
/**
182+
* Computes the optimal k (number of hashes per item inserted in Bloom filter), given the
183+
* expected insertions and total number of bits in the Bloom filter.
184+
*
185+
* See http://en.wikipedia.org/wiki/File:Bloom_filter_fp_probability.svg for the formula.
186+
*
187+
* @param n expected insertions (must be positive)
188+
* @param m total number of bits in Bloom filter (must be positive)
189+
*/
190+
private static int optimalNumOfHashFunctions(long n, long m) {
191+
// (m / n) * log(2), but avoid truncation due to division!
192+
return Math.max(1, (int) Math.round((double) m / n * Math.log(2)));
193+
}
194+
195+
/**
196+
* Computes m (total bits of Bloom filter) which is expected to achieve, for the specified
197+
* expected insertions, the required false positive probability.
198+
*
199+
* See http://en.wikipedia.org/wiki/Bloom_filter#Probability_of_false_positives for the formula.
200+
*
201+
* @param n expected insertions (must be positive)
202+
* @param p false positive rate (must be 0 < p < 1)
203+
*/
204+
private static long optimalNumOfBits(long n, double p) {
205+
return (long) (-n * Math.log(p) / (Math.log(2) * Math.log(2)));
206+
}
207+
208+
static final double DEFAULT_FPP = 0.03;
209+
210+
/**
211+
* Creates a {@link BloomFilter} with the expected number of insertions and a default expected
212+
* false positive probability of 3%.
213+
*
214+
* Note that overflowing a {@code BloomFilter} with significantly more elements than specified,
215+
* will result in its saturation, and a sharp deterioration of its false positive probability.
216+
*/
217+
public static BloomFilter create(long expectedNumItems) {
218+
return create(expectedNumItems, DEFAULT_FPP);
219+
}
220+
221+
/**
222+
* Creates a {@link BloomFilter} with the expected number of insertions and expected false
223+
* positive probability.
224+
*
225+
* Note that overflowing a {@code BloomFilter} with significantly more elements than specified,
226+
* will result in its saturation, and a sharp deterioration of its false positive probability.
227+
*/
228+
public static BloomFilter create(long expectedNumItems, double fpp) {
229+
if (fpp <= 0D || fpp >= 1D) {
230+
throw new IllegalArgumentException(
231+
"False positive probability must be within range (0.0, 1.0)"
232+
);
233+
}
234+
235+
return create(expectedNumItems, optimalNumOfBits(expectedNumItems, fpp));
236+
}
237+
238+
/**
239+
* Creates a {@link BloomFilter} with given {@code expectedNumItems} and {@code numBits}, it will
240+
* pick an optimal {@code numHashFunctions} which can minimize {@code fpp} for the bloom filter.
241+
*/
242+
public static BloomFilter create(long expectedNumItems, long numBits) {
243+
if (expectedNumItems <= 0) {
244+
throw new IllegalArgumentException("Expected insertions must be positive");
245+
}
246+
247+
if (numBits <= 0) {
248+
throw new IllegalArgumentException("Number of bits must be positive");
249+
}
250+
251+
return new BloomFilterImpl(optimalNumOfHashFunctions(expectedNumItems, numBits), numBits);
252+
}
253+
}

0 commit comments

Comments
 (0)