Skip to content

Commit c3e8f8a

Browse files
authored
Merge pull request #631 from IntersectMBO/jdral/bloom-fpr
Update bloom filter FPR tests
2 parents 508d499 + e9a0549 commit c3e8f8a

File tree

4 files changed

+141
-77
lines changed

4 files changed

+141
-77
lines changed

lsm-tree.cabal

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -372,14 +372,14 @@ test-suite lsm-tree-test
372372
Test.Database.LSMTree.Internal.Merge
373373
Test.Database.LSMTree.Internal.MergingRun
374374
Test.Database.LSMTree.Internal.MergingTree
375-
Test.Database.LSMTree.Internal.Monkey
376375
Test.Database.LSMTree.Internal.PageAcc
377376
Test.Database.LSMTree.Internal.PageAcc1
378377
Test.Database.LSMTree.Internal.RawBytes
379378
Test.Database.LSMTree.Internal.RawOverflowPage
380379
Test.Database.LSMTree.Internal.RawPage
381380
Test.Database.LSMTree.Internal.Run
382381
Test.Database.LSMTree.Internal.RunAcc
382+
Test.Database.LSMTree.Internal.RunBloomFilterAlloc
383383
Test.Database.LSMTree.Internal.RunBuilder
384384
Test.Database.LSMTree.Internal.RunReader
385385
Test.Database.LSMTree.Internal.RunReaders

src/Database/LSMTree/Internal/RunAcc.hs

Lines changed: 49 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,6 @@
66
--
77
module Database.LSMTree.Internal.RunAcc (
88
RunAcc (..)
9-
, RunBloomFilterAlloc (..)
109
, new
1110
, unsafeFinalise
1211
-- * Adding key\/op pairs
@@ -29,7 +28,12 @@ module Database.LSMTree.Internal.RunAcc (
2928
, addLargeKeyOp
3029
, addLargeSerialisedKeyOp
3130
, PageAcc.entryWouldFitInPage
31+
-- * Bloom filter allocation
32+
, RunBloomFilterAlloc (..)
33+
-- ** Exposed for testing
34+
, newMBloom
3235
, numHashFunctions
36+
, falsePositiveRate
3337
) where
3438

3539
import Control.DeepSeq (NFData (..))
@@ -76,17 +80,6 @@ data RunAcc s = RunAcc {
7680
, entryCount :: !(PrimVar s Int)
7781
}
7882

79-
-- | See 'Database.LSMTree.Internal.BloomFilterAlloc'
80-
data RunBloomFilterAlloc =
81-
-- | Bits per element in a filter
82-
RunAllocFixed !Word64
83-
| RunAllocRequestFPR !Double
84-
deriving stock (Show, Eq)
85-
86-
instance NFData RunBloomFilterAlloc where
87-
rnf (RunAllocFixed a) = rnf a
88-
rnf (RunAllocRequestFPR a) = rnf a
89-
9083
-- | @'new' nentries@ starts an incremental run construction.
9184
--
9285
-- @nentries@ should be an upper bound on the expected number of entries in the
@@ -96,15 +89,8 @@ new ::
9689
-> RunBloomFilterAlloc
9790
-> IndexType
9891
-> ST s (RunAcc s)
99-
new (NumEntries nentries) alloc indexType = do
100-
mbloom <- case alloc of
101-
RunAllocFixed !bitsPerEntry ->
102-
let !nbits = fromIntegral bitsPerEntry * fromIntegral nentries
103-
in MBloom.new
104-
(fromIntegralChecked $ numHashFunctions nbits (fromIntegralChecked nentries))
105-
(fromIntegralChecked nbits)
106-
RunAllocRequestFPR !fpr ->
107-
Bloom.Easy.easyNew fpr nentries
92+
new nentries alloc indexType = do
93+
mbloom <- newMBloom nentries alloc
10894
mindex <- Index.newWithDefaults indexType
10995
mpageacc <- PageAcc.newPageAcc
11096
entryCount <- newPrimVar 0
@@ -333,6 +319,31 @@ selectPagesAndChunks mpagemchunkPre page chunks =
333319
Just (pagePre, Nothing) -> ([pagePre, page], chunks)
334320
Just (pagePre, Just chunkPre) -> ([pagePre, page], chunkPre:chunks)
335321

322+
{-------------------------------------------------------------------------------
323+
Bloom filter allocation
324+
-------------------------------------------------------------------------------}
325+
326+
-- | See 'Database.LSMTree.Internal.Config.BloomFilterAlloc'
327+
data RunBloomFilterAlloc =
328+
-- | Bits per element in a filter
329+
RunAllocFixed !Word64
330+
| RunAllocRequestFPR !Double
331+
deriving stock (Show, Eq)
332+
333+
instance NFData RunBloomFilterAlloc where
334+
rnf (RunAllocFixed a) = rnf a
335+
rnf (RunAllocRequestFPR a) = rnf a
336+
337+
newMBloom :: NumEntries -> RunBloomFilterAlloc -> ST s (MBloom s a)
338+
newMBloom (NumEntries nentries) = \case
339+
RunAllocFixed !bitsPerEntry ->
340+
let !nbits = fromIntegral bitsPerEntry * fromIntegral nentries
341+
in MBloom.new
342+
(fromIntegralChecked $ numHashFunctions nbits (fromIntegralChecked nentries))
343+
(fromIntegralChecked nbits)
344+
RunAllocRequestFPR !fpr ->
345+
Bloom.Easy.easyNew fpr nentries
346+
336347
-- | Computes the optimal number of hash functions that minimises the false
337348
-- positive rate for a bloom filter.
338349
--
@@ -345,3 +356,20 @@ numHashFunctions ::
345356
-> Integer
346357
numHashFunctions nbits nentries = truncate @Double $ max 1 $
347358
(fromIntegral nbits / fromIntegral nentries) * log 2
359+
360+
-- | False positive rate
361+
--
362+
-- Assumes that the bloom filter uses 'numHashFunctions' hash functions.
363+
--
364+
-- See Niv Dayan, Manos Athanassoulis, Stratos Idreos,
365+
-- /Optimal Bloom Filters and Adaptive Merging for LSM-Trees/,
366+
-- Equation 2.
367+
falsePositiveRate ::
368+
Floating a
369+
=> a -- ^ entries
370+
-> a -- ^ bits
371+
-> a
372+
falsePositiveRate entries bits = exp ((-(bits / entries)) * sq (log 2))
373+
374+
sq :: Num a => a -> a
375+
sq x = x * x

test/Main.hs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,14 +19,14 @@ import qualified Test.Database.LSMTree.Internal.Lookup
1919
import qualified Test.Database.LSMTree.Internal.Merge
2020
import qualified Test.Database.LSMTree.Internal.MergingRun
2121
import qualified Test.Database.LSMTree.Internal.MergingTree
22-
import qualified Test.Database.LSMTree.Internal.Monkey
2322
import qualified Test.Database.LSMTree.Internal.PageAcc
2423
import qualified Test.Database.LSMTree.Internal.PageAcc1
2524
import qualified Test.Database.LSMTree.Internal.RawBytes
2625
import qualified Test.Database.LSMTree.Internal.RawOverflowPage
2726
import qualified Test.Database.LSMTree.Internal.RawPage
2827
import qualified Test.Database.LSMTree.Internal.Run
2928
import qualified Test.Database.LSMTree.Internal.RunAcc
29+
import qualified Test.Database.LSMTree.Internal.RunBloomFilterAlloc
3030
import qualified Test.Database.LSMTree.Internal.RunBuilder
3131
import qualified Test.Database.LSMTree.Internal.RunReader
3232
import qualified Test.Database.LSMTree.Internal.RunReaders
@@ -66,14 +66,14 @@ main = do
6666
, Test.Database.LSMTree.Internal.Merge.tests
6767
, Test.Database.LSMTree.Internal.MergingRun.tests
6868
, Test.Database.LSMTree.Internal.MergingTree.tests
69-
, Test.Database.LSMTree.Internal.Monkey.tests
7069
, Test.Database.LSMTree.Internal.PageAcc.tests
7170
, Test.Database.LSMTree.Internal.PageAcc1.tests
7271
, Test.Database.LSMTree.Internal.RawBytes.tests
7372
, Test.Database.LSMTree.Internal.RawOverflowPage.tests
7473
, Test.Database.LSMTree.Internal.RawPage.tests
7574
, Test.Database.LSMTree.Internal.Run.tests
7675
, Test.Database.LSMTree.Internal.RunAcc.tests
76+
, Test.Database.LSMTree.Internal.RunBloomFilterAlloc.tests
7777
, Test.Database.LSMTree.Internal.RunBuilder.tests
7878
, Test.Database.LSMTree.Internal.RunReader.tests
7979
, Test.Database.LSMTree.Internal.RunReaders.tests

test/Test/Database/LSMTree/Internal/Monkey.hs renamed to test/Test/Database/LSMTree/Internal/RunBloomFilterAlloc.hs

Lines changed: 89 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -5,18 +5,18 @@
55
{-# LANGUAGE NumericUnderscores #-}
66
{-# LANGUAGE ScopedTypeVariables #-}
77
{-# LANGUAGE TypeApplications #-}
8-
{- HLINT ignore "Use camelCase" -}
98

10-
module Test.Database.LSMTree.Internal.Monkey (
9+
{-# OPTIONS_GHC -Wno-orphans #-}
10+
11+
module Test.Database.LSMTree.Internal.RunBloomFilterAlloc (
1112
-- * Main test tree
1213
tests
1314
-- * Bloom filter construction
1415
--
1516
-- A common interface to bloom filter construction, based on expected false
1617
-- positive rates.
1718
, BloomMaker
18-
, mkBloomST
19-
, mkBloomEasy
19+
, mkBloomFromAlloc
2020
-- * Verifying FPRs
2121
, measureApproximateFPR
2222
, measureExactFPR
@@ -26,7 +26,6 @@ import Control.Exception (assert)
2626
import Control.Monad.ST
2727
import Data.BloomFilter (Bloom)
2828
import qualified Data.BloomFilter as Bloom
29-
import qualified Data.BloomFilter.Easy as Bloom.Easy
3029
import Data.BloomFilter.Hash (Hashable)
3130
import qualified Data.BloomFilter.Mutable as MBloom
3231
import Data.Foldable (Foldable (..))
@@ -35,22 +34,26 @@ import Data.Set (Set)
3534
import qualified Data.Set as Set
3635
import Data.Word (Word64)
3736
import Database.LSMTree.Extras.Random
37+
import qualified Database.LSMTree.Internal.Entry as LSMT
38+
import Database.LSMTree.Internal.RunAcc (RunBloomFilterAlloc (..),
39+
falsePositiveRate, newMBloom)
3840
import System.Random
3941
import Test.QuickCheck
42+
import Test.QuickCheck.Gen
4043
import Test.Tasty (TestTree, testGroup)
41-
import Test.Tasty.QuickCheck (testProperty)
44+
import Test.Tasty.QuickCheck
45+
import Test.Util.Arbitrary (noTags,
46+
prop_arbitraryAndShrinkPreserveInvariant)
4247
import Text.Printf (printf)
4348

4449
tests :: TestTree
45-
tests = testGroup "Database.LSMTree.Internal.Monkey" [
46-
testGroup "No false negatives" [
47-
testProperty "mkBloomEasy" $ prop_noFalseNegatives (Proxy @Word64) mkBloomEasy
48-
, testProperty "mkBloomST" $ prop_noFalseNegatives (Proxy @Word64) mkBloomST
49-
]
50-
, testGroup "Verify FPR" [
51-
testProperty "mkBloomEasy" $ prop_verifyFPR (Proxy @Word64) mkBloomEasy
52-
, testProperty "mkBloomST" $ prop_verifyFPR (Proxy @Word64) mkBloomST
53-
]
50+
tests = testGroup "Database.LSMTree.Internal.RunBloomFilterAlloc" [
51+
testProperty "prop_noFalseNegatives" $ prop_noFalseNegatives (Proxy @Word64)
52+
, testProperty "prop_verifyFPR" $ prop_verifyFPR (Proxy @Word64)
53+
, testGroup "RunBloomFilterAlloc" $
54+
prop_arbitraryAndShrinkPreserveInvariant noTags allocInvariant
55+
, testGroup "NumEntries" $
56+
prop_arbitraryAndShrinkPreserveInvariant noTags numEntriesInvariant
5457
]
5558

5659
{-------------------------------------------------------------------------------
@@ -59,47 +62,81 @@ tests = testGroup "Database.LSMTree.Internal.Monkey" [
5962

6063
prop_noFalseNegatives :: forall a proxy. Hashable a
6164
=> proxy a
62-
-> (Double -> BloomMaker a)
63-
-> FPR -- ^ Requested FPR
65+
-> RunBloomFilterAlloc
6466
-> UniformWithoutReplacement a
6567
-> Property
66-
prop_noFalseNegatives _ mkBloom (FPR requestedFPR) (UniformWithoutReplacement xs) =
67-
let xsBloom = mkBloom requestedFPR xs
68+
prop_noFalseNegatives _ alloc (UniformWithoutReplacement xs) =
69+
let xsBloom = mkBloomFromAlloc alloc xs
6870
in property $ all (`Bloom.elem` xsBloom) xs
6971

7072
prop_verifyFPR ::
7173
(Ord a, Uniform a, Hashable a)
7274
=> proxy a
73-
-> (Double -> BloomMaker a)
74-
-> FPR -- ^ Requested FPR
75+
-> RunBloomFilterAlloc
7576
-> NumEntries -- ^ @numEntries@
7677
-> Seed -- ^ 'StdGen' seed
7778
-> Property
78-
prop_verifyFPR p mkBloom (FPR requestedFPR) (NumEntries numEntries) (Seed seed) =
79+
prop_verifyFPR p alloc (NumEntries numEntries) (Seed seed) =
7980
let stdgen = mkStdGen seed
80-
measuredFPR = measureApproximateFPR p (mkBloom requestedFPR) numEntries stdgen
81-
requestedFPR' = requestedFPR + 0.03 -- @requestedFPR@ with an error margin
82-
in counterexample (printf "expected %f <= %f" measuredFPR requestedFPR') $
83-
FPR measuredFPR <= FPR requestedFPR'
81+
measuredFPR = measureApproximateFPR p (mkBloomFromAlloc alloc) numEntries stdgen
82+
expectedFPR = case alloc of
83+
RunAllocFixed bits ->
84+
falsePositiveRate (fromIntegral numEntries)
85+
(fromIntegral bits * fromIntegral numEntries)
86+
RunAllocRequestFPR requestedFPR -> requestedFPR
87+
-- error margins
88+
lb = expectedFPR - 0.1
89+
ub = expectedFPR + 0.03
90+
in assert (fprInvariant True measuredFPR) $ -- measured FPR is in the range [0,1]
91+
assert (fprInvariant True expectedFPR) $ -- expected FPR is in the range [0,1]
92+
counterexample (printf "expected $f <= %f <= %f" lb measuredFPR ub) $
93+
lb <= measuredFPR .&&. measuredFPR <= ub
8494

8595
{-------------------------------------------------------------------------------
8696
Modifiers
8797
-------------------------------------------------------------------------------}
8898

8999
--
90-
-- FPR
100+
-- Alloc
91101
--
92102

93-
newtype FPR = FPR { getFPR :: Double }
94-
deriving stock (Show, Eq, Ord)
95-
deriving newtype (Num, Fractional, Floating)
103+
instance Arbitrary RunBloomFilterAlloc where
104+
arbitrary = oneof [
105+
RunAllocFixed <$> genFixed
106+
, RunAllocRequestFPR <$> genFPR
107+
]
108+
shrink (RunAllocFixed x) = RunAllocFixed <$> shrinkFixed x
109+
shrink (RunAllocRequestFPR x) = RunAllocRequestFPR <$> shrinkFPR x
110+
111+
allocInvariant :: RunBloomFilterAlloc -> Bool
112+
allocInvariant (RunAllocFixed x) = fixedInvariant x
113+
allocInvariant (RunAllocRequestFPR x) = fprInvariant False x
114+
115+
genFixed :: Gen Word64
116+
genFixed = choose (fixedLB, fixedUB)
117+
118+
shrinkFixed :: Word64 -> [Word64]
119+
shrinkFixed x = [ x' | x' <- shrink x, fixedInvariant x']
96120

97-
instance Arbitrary FPR where
98-
arbitrary = FPR <$> arbitrary `suchThat` fprInvariant
99-
shrink (FPR x) = [FPR x' | x' <- shrink x, fprInvariant x']
121+
fixedInvariant :: Word64 -> Bool
122+
fixedInvariant x = fixedLB <= x && x <= fixedUB
100123

101-
fprInvariant :: Double -> Bool
102-
fprInvariant x = x >= 0.01 && x <= 0.99
124+
fixedLB :: Word64
125+
fixedLB = 0
126+
127+
fixedUB :: Word64
128+
fixedUB = 20
129+
130+
genFPR :: Gen Double
131+
genFPR = genDouble `suchThat` fprInvariant False
132+
133+
shrinkFPR :: Double -> [Double]
134+
shrinkFPR x = [ x' | x' <- shrink x, fprInvariant False x']
135+
136+
fprInvariant :: Bool -> Double -> Bool
137+
fprInvariant incl x
138+
| incl = 0 <= x && x <= 1
139+
| otherwise = 0 < x && x < 1
103140

104141
--
105142
-- NumEntries
@@ -110,16 +147,21 @@ newtype NumEntries = NumEntries { getNumEntries :: Int }
110147

111148
instance Arbitrary NumEntries where
112149
arbitrary = NumEntries <$> chooseInt (numEntriesLB, numEntriesUB)
113-
shrink (NumEntries x) = [NumEntries x' | x' <- shrink x, numEntriesInvariant x']
150+
shrink (NumEntries x) = [
151+
x''
152+
| x' <- shrink x
153+
, let x'' = NumEntries x'
154+
, numEntriesInvariant x''
155+
]
114156

115157
numEntriesLB :: Int
116-
numEntriesLB = 10_000
158+
numEntriesLB = 50_000
117159

118160
numEntriesUB :: Int
119161
numEntriesUB = 100_000
120162

121-
numEntriesInvariant :: Int -> Bool
122-
numEntriesInvariant x = x >= numEntriesLB && x <= numEntriesUB
163+
numEntriesInvariant :: NumEntries -> Bool
164+
numEntriesInvariant (NumEntries x) = x >= numEntriesLB && x <= numEntriesUB
123165

124166
--
125167
-- Seed
@@ -245,18 +287,12 @@ instance Monoid Counts where
245287

246288
type BloomMaker a = [a] -> Bloom a
247289

248-
-- | Create a bloom filter through the 'MBloom' interface. Tunes the bloom
249-
-- filter using 'suggestSizing'.
250-
mkBloomST :: Hashable a => Double -> BloomMaker a
251-
mkBloomST requestedFPR xs = runST $ do
252-
b <- MBloom.new numHashFuncs numBits
253-
mapM_ (MBloom.insert b) xs
254-
Bloom.freeze b
290+
-- | Create a bloom filter through the 'newMBloom' interface. Tunes the bloom
291+
-- filter according to 'RunBloomFilterAlloc'.
292+
mkBloomFromAlloc :: Hashable a => RunBloomFilterAlloc -> BloomMaker a
293+
mkBloomFromAlloc alloc xs = runST $ do
294+
mb <- newMBloom n alloc
295+
mapM_ (MBloom.insert mb) xs
296+
Bloom.unsafeFreeze mb
255297
where
256-
numEntries = length xs
257-
(numBits, numHashFuncs) = Bloom.Easy.suggestSizing numEntries requestedFPR
258-
259-
-- | Create a bloom filter through the "Data.BloomFilter.Easy" interface. Tunes
260-
-- the bloom filter using 'suggestSizing'.
261-
mkBloomEasy :: Hashable a => Double -> BloomMaker a
262-
mkBloomEasy = Bloom.Easy.easyList
298+
n = LSMT.NumEntries $ length xs

0 commit comments

Comments
 (0)