@@ -89,6 +89,79 @@ import Data.BloomFilter.Hash
8989
9090import Prelude hiding (elem , notElem , read )
9191
92+ -- $overview
93+ --
94+ -- Each of the functions for creating Bloom filters accepts a 'BloomSize'. The
95+ -- size determines the number of bits that should be used for the filter. Note
96+ -- that a filter is fixed in size; it cannot be resized after creation.
97+ --
98+ -- The size can be specified by giving either a target false positive rate
99+ -- (FPR) or a number of bits per element, together with the expected number
99+ -- of elements in the filter.
100+ -- For example:
101+ --
102+ -- * @'sizeForFPR' 1e-3 10_000@ for a Bloom filter sized for 10,000 elements
103+ -- with a false positive rate of 1 in 1000
104+ --
105+ -- * @'sizeForBits' 10 10_000@ for a Bloom filter sized for 10,000 elements
106+ -- with 10 bits per element
107+ --
108+ -- Depending on the application it may be more important to target a fixed
109+ -- amount of memory to use, or target a specific FPR.
110+ --
111+ -- As a very rough guide for filter sizes, here are a range of FPRs and bits
112+ -- per element:
113+ --
114+ -- * FPR of 1e-1 requires approximately 4.8 bits per element
115+ -- * FPR of 1e-2 requires approximately 9.6 bits per element
116+ -- * FPR of 1e-3 requires approximately 14.4 bits per element
117+ -- * FPR of 1e-4 requires approximately 19.2 bits per element
118+ -- * FPR of 1e-5 requires approximately 24.0 bits per element
119+
120+
121+ -- $example
122+ --
123+ -- This example reads a dictionary file containing one word per line,
124+ -- constructs a Bloom filter with a 1% false positive rate, and
125+ -- spellchecks its standard input. Like the Unix @spell@ command, it
126+ -- prints each word that it does not recognize.
127+ --
128+ -- @
129+ -- import Data.Maybe (mapMaybe)
130+ -- import qualified Data.BloomFilter as B
131+ --
132+ -- main = do
133+ -- filt \<- B.fromList (B.policyForFPR 0.01) . words \<$> readFile "\/usr\/share\/dict\/words"
134+ -- let check word | B.elem word filt = Nothing
135+ -- | otherwise = Just word
136+ -- interact (unlines . mapMaybe check . lines)
137+ -- @
138+
139+
140+ -- $differences
141+ --
142+ -- This package is an entirely rewritten fork of the
143+ -- [bloomfilter](https://hackage.haskell.org/package/bloomfilter) package.
144+ --
145+ -- The main differences are:
146+ --
147+ -- * This package supports Bloom filters of arbitrary sizes
148+ -- (not limited to powers of two). Sizes over 2^32 are also supported.
149+ --
150+ -- * The 'Bloom' and 'MBloom' types are parametrised over a 'Hashable' type
151+ -- class, instead of having a @a -> ['Hash']@ typed field.
152+ -- This separation allows clean de\/serialization of Bloom filters in this
153+ -- package, as the hashing scheme is static.
154+ --
155+ -- * [@XXH3@ hash](https://xxhash.com/) is used instead of Jenkins'
156+ -- @lookup3@.
157+ --
158+ -- * Support for both classic and \"blocked\" Bloom filters. Blocked-structured
159+ -- Bloom filters arrange all the bits for each insert or lookup into a single
160+ -- cache line, which greatly reduces the number of slow uncached memory reads.
161+ -- The cost of this performance optimisation is a slightly worse
162+ -- trade-off between bits per element and the FPR. In practice, for typical
163+ -- FPRs of 1e-3 to 1e-4, this requires a couple of extra bits per element.
164+
92165-- | Create an immutable Bloom filter, using the given setup function
93166-- which executes in the 'ST' monad.
94167--
@@ -205,75 +278,3 @@ deserialise bloomsalt bloomsize fill = do
205278 mbloom <- stToPrim $ new bloomsalt bloomsize
206279 Internal. deserialise mbloom fill
207280 stToPrim $ unsafeFreeze mbloom
208-
209- -- $overview
210- --
211- -- Each of the functions for creating Bloom filters accepts a 'BloomSize'. The
212- -- size determines the number of bits that should be used for the filter. Note
213- -- that a filter is fixed in size; it cannot be resized after creation.
214- --
215- -- The size can be specified by asking for a target false positive rate (FPR)
216- -- or a number of bits per element, and the number of elements in the filter.
217- -- For example:
218- --
219- -- * @'sizeForFPR' 1e-3 10_000@ for a Bloom filter sized for 10,000 elements
220- -- with a false positive rate of 1 in 1000
221- --
222- -- * @'sizeForBits' 10 10_000@ for a Bloom filter sized for 10,000 elements
223- -- with 10 bits per element
224- --
225- -- Depending on the application it may be more important to target a fixed
226- -- amount of memory to use, or target a specific FPR.
227- --
228- -- As a very rough guide for filter sizes, here are a range of FPRs and bits
229- -- per element:
230- --
231- -- * FPR of 1e-1 requires approximately 4.8 bits per element
232- -- * FPR of 1e-2 requires approximately 9.6 bits per element
233- -- * FPR of 1e-3 requires approximately 14.4 bits per element
234- -- * FPR of 1e-4 requires approximately 19.2 bits per element
235- -- * FPR of 1e-5 requires approximately 24.0 bits per element
236- --
237-
238- -- $example
239- --
240- -- This example reads a dictionary file containing one word per line,
241- -- constructs a Bloom filter with a 1% false positive rate, and
242- -- spellchecks its standard input. Like the Unix @spell@ command, it
243- -- prints each word that it does not recognize.
244- --
245- -- @
246- -- import Data.Maybe (mapMaybe)
247- -- import qualified Data.BloomFilter as B
248- --
249- -- main = do
250- -- filt \<- B.fromList (B.policyForFPR 0.01) . words \<$> readFile "\/usr\/share\/dict\/words"
251- -- let check word | B.elem word filt = Nothing
252- -- | otherwise = Just word
253- -- interact (unlines . mapMaybe check . lines)
254- -- @
255-
256- -- $differences
257- --
258- -- This package is an entirely rewritten fork of
259- -- [bloomfilter](https://hackage.haskell.org/package/bloomfilter) package.
260- --
261- -- The main differences are
262- --
263- -- * This packages support bloomfilters of arbitrary sizes
264- -- (not limited to powers of two). Also sizes over 2^32 are supported.
265- --
266- -- * The 'Bloom' and 'MBloom' types are parametrised over a 'Hashable' type
267- -- class, instead of having a @a -> ['Hash']@ typed field.
268- -- This separation allows clean de\/serialization of Bloom filters in this
269- -- package, as the hashing scheme is static.
270- --
271- -- * [@XXH3@ hash](https://xxhash.com/) is used instead of Jenkins'
272- -- @lookup3@.
273- --
274- -- * Support for both classic and \"blocked\" Bloom filters. Blocked-structured
275- -- Bloom filters arrange all the bits for each insert or lookup into a single
276- -- cache line, which greatly reduces the number of slow uncached memory reads.
277- -- The trade-off for this performance optimisation is a slightly worse
278- -- trade-off between bits per element and the FPR. In practice for typical
279- -- FPRs of 1-e3 -- 1e-4, this requires a couple extra bits per element.
0 commit comments