Skip to content

Commit 94fb5d3

Browse files
committed
Update bloom filter FPR tests
These tests were not linked to the real implementation: they exercised bloom filter construction functions that the real implementation does not use. The updated tests exercise `RunAcc`'s bloom filter allocation directly.
1 parent 511f079 commit 94fb5d3

File tree

2 files changed

+105
-50
lines changed

2 files changed

+105
-50
lines changed

src/Database/LSMTree/Internal/RunAcc.hs

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,9 @@ module Database.LSMTree.Internal.RunAcc (
3131
-- * Bloom filter allocation
3232
, RunBloomFilterAlloc (..)
3333
-- ** Exposed for testing
34+
, newMBloom
3435
, numHashFunctions
36+
, falsePositiveRate
3537
) where
3638

3739
import Control.DeepSeq (NFData (..))
@@ -354,3 +356,20 @@ numHashFunctions ::
354356
-> Integer
355357
numHashFunctions nbits nentries = truncate @Double $ max 1 $
356358
(fromIntegral nbits / fromIntegral nentries) * log 2
359+
360+
-- | Theoretical false positive rate of a bloom filter that stores @entries@
-- keys in @bits@ bits.
--
-- The formula assumes that the filter uses 'numHashFunctions' hash functions.
--
-- See Niv Dayan, Manos Athanassoulis, Stratos Idreos,
-- /Optimal Bloom Filters and Adaptive Merging for LSM-Trees/,
-- Equation 2.
falsePositiveRate ::
     Floating a
  => a -- ^ entries
  -> a -- ^ bits
  -> a
falsePositiveRate entries bits = exp (negate bitsPerEntry * sq (log 2))
  where
    -- Average number of filter bits available per stored entry.
    bitsPerEntry = bits / entries
373+
374+
-- | Square a number by multiplying it with itself.
sq :: Num a => a -> a
sq v = v * v

test/Test/Database/LSMTree/Internal/Monkey.hs

Lines changed: 86 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
{-# LANGUAGE NumericUnderscores #-}
66
{-# LANGUAGE ScopedTypeVariables #-}
77
{-# LANGUAGE TypeApplications #-}
8+
{-# OPTIONS_GHC -Wno-orphans #-}
89
{- HLINT ignore "Use camelCase" -}
910

1011
module Test.Database.LSMTree.Internal.Monkey (
@@ -15,8 +16,7 @@ module Test.Database.LSMTree.Internal.Monkey (
1516
-- A common interface to bloom filter construction, based on expected false
1617
-- positive rates.
1718
, BloomMaker
18-
, mkBloomST
19-
, mkBloomEasy
19+
, mkBloomFromAlloc
2020
-- * Verifying FPRs
2121
, measureApproximateFPR
2222
, measureExactFPR
@@ -26,7 +26,6 @@ import Control.Exception (assert)
2626
import Control.Monad.ST
2727
import Data.BloomFilter (Bloom)
2828
import qualified Data.BloomFilter as Bloom
29-
import qualified Data.BloomFilter.Easy as Bloom.Easy
3029
import Data.BloomFilter.Hash (Hashable)
3130
import qualified Data.BloomFilter.Mutable as MBloom
3231
import Data.Foldable (Foldable (..))
@@ -35,22 +34,26 @@ import Data.Set (Set)
3534
import qualified Data.Set as Set
3635
import Data.Word (Word64)
3736
import Database.LSMTree.Extras.Random
37+
import qualified Database.LSMTree.Internal.Entry as LSMT
38+
import Database.LSMTree.Internal.RunAcc (RunBloomFilterAlloc (..),
39+
falsePositiveRate, newMBloom)
3840
import System.Random
3941
import Test.QuickCheck
42+
import Test.QuickCheck.Gen
4043
import Test.Tasty (TestTree, testGroup)
41-
import Test.Tasty.QuickCheck (testProperty)
44+
import Test.Tasty.QuickCheck
45+
import Test.Util.Arbitrary (noTags,
46+
prop_arbitraryAndShrinkPreserveInvariant)
4247
import Text.Printf (printf)
4348

4449
-- | All tests of this module: the two bloom filter properties instantiated at
-- 'Word64', followed by the checks produced by
-- 'prop_arbitraryAndShrinkPreserveInvariant' for the 'RunBloomFilterAlloc' and
-- 'NumEntries' generators.
tests :: TestTree
tests = testGroup "Database.LSMTree.Internal.Monkey"
    [ testProperty "prop_noFalseNegatives" (prop_noFalseNegatives word64)
    , testProperty "prop_verifyFPR" (prop_verifyFPR word64)
    , testGroup "RunBloomFilterAlloc"
        (prop_arbitraryAndShrinkPreserveInvariant noTags allocInvariant)
    , testGroup "NumEntries"
        (prop_arbitraryAndShrinkPreserveInvariant noTags numEntriesInvariant)
    ]
  where
    -- Element type used to instantiate the polymorphic properties.
    word64 = Proxy @Word64
5558

5659
{-------------------------------------------------------------------------------
@@ -59,47 +62,81 @@ tests = testGroup "Database.LSMTree.Internal.Monkey" [
5962

6063
-- | A bloom filter built by 'mkBloomFromAlloc' from a list of elements must
-- report every one of those elements as present.
prop_noFalseNegatives :: forall a proxy. Hashable a
  => proxy a
  -> RunBloomFilterAlloc
  -> UniformWithoutReplacement a
  -> Property
prop_noFalseNegatives _ alloc (UniformWithoutReplacement xs) =
    property (all isMember xs)
  where
    -- Filter populated with exactly the generated elements.
    filled     = mkBloomFromAlloc alloc xs
    isMember x = x `Bloom.elem` filled
6971

7072
-- | The measured false positive rate of a bloom filter built by
-- 'mkBloomFromAlloc' must lie within an error margin around the expected
-- false positive rate.
--
-- The expected rate depends on the allocation strategy:
--
-- * 'RunAllocFixed': computed with 'falsePositiveRate', where the fixed value
--   is multiplied by the number of entries to obtain the total number of bits.
--
-- * 'RunAllocRequestFPR': the requested rate itself.
prop_verifyFPR ::
     (Ord a, Uniform a, Hashable a)
  => proxy a
  -> RunBloomFilterAlloc
  -> NumEntries -- ^ @numEntries@
  -> Seed       -- ^ 'StdGen' seed
  -> Property
prop_verifyFPR p alloc (NumEntries numEntries) (Seed seed) =
  let stdgen      = mkStdGen seed
      measuredFPR = measureApproximateFPR p (mkBloomFromAlloc alloc) numEntries stdgen
      expectedFPR = case alloc of
        RunAllocFixed bits ->
          falsePositiveRate (fromIntegral numEntries)
                            (fromIntegral bits * fromIntegral numEntries)
        RunAllocRequestFPR requestedFPR -> requestedFPR
      -- error margins (asymmetric: more slack below than above)
      lb = expectedFPR - 0.1
      ub = expectedFPR + 0.03
  in  assert (fprInvariant True measuredFPR) $ -- measured FPR is in the range [0,1]
      assert (fprInvariant True expectedFPR) $ -- expected FPR is in the range [0,1]
      -- All three values need a @%f@ directive: with fewer directives than
      -- arguments, 'printf' raises an error when the message is rendered,
      -- which would mask the actual counterexample.
      counterexample (printf "expected %f <= %f <= %f" lb measuredFPR ub) $
      lb <= measuredFPR .&&. measuredFPR <= ub
8494

8595
{-------------------------------------------------------------------------------
8696
Modifiers
8797
-------------------------------------------------------------------------------}
8898

8999
--
90-
-- FPR
100+
-- Alloc
91101
--
92102

93-
newtype FPR = FPR { getFPR :: Double }
94-
deriving stock (Show, Eq, Ord)
95-
deriving newtype (Num, Fractional, Floating)
103+
-- | Orphan instance: 'RunBloomFilterAlloc' is imported from
-- "Database.LSMTree.Internal.RunAcc". Each constructor is generated and
-- shrunk with its own dedicated generator and shrinker.
instance Arbitrary RunBloomFilterAlloc where
  arbitrary = oneof
    [ fmap RunAllocFixed genFixed
    , fmap RunAllocRequestFPR genFPR
    ]
  shrink alloc = case alloc of
    RunAllocFixed bits     -> map RunAllocFixed (shrinkFixed bits)
    RunAllocRequestFPR fpr -> map RunAllocRequestFPR (shrinkFPR fpr)
110+
111+
-- | Invariant for generated allocation strategies: a fixed allocation must
-- satisfy 'fixedInvariant', a requested FPR must lie strictly between 0 and 1.
allocInvariant :: RunBloomFilterAlloc -> Bool
allocInvariant alloc = case alloc of
  RunAllocFixed bits     -> fixedInvariant bits
  RunAllocRequestFPR fpr -> fprInvariant False fpr
114+
115+
-- | Generate a fixed allocation value between 'fixedLB' and 'fixedUB'
-- (both bounds included).
genFixed :: Gen Word64
genFixed = choose bounds
  where
    bounds = (fixedLB, fixedUB)
117+
118+
-- | Shrink a fixed allocation value, keeping only candidates that still
-- satisfy 'fixedInvariant'.
shrinkFixed :: Word64 -> [Word64]
shrinkFixed x = filter fixedInvariant (shrink x)
120+
121+
-- | A fixed allocation value is valid when it lies in the inclusive range
-- from 'fixedLB' to 'fixedUB'.
fixedInvariant :: Word64 -> Bool
fixedInvariant x = x >= fixedLB && x <= fixedUB
96123

97-
instance Arbitrary FPR where
98-
arbitrary = FPR <$> arbitrary `suchThat` fprInvariant
99-
shrink (FPR x) = [FPR x' | x' <- shrink x, fprInvariant x']
124+
-- | Lower bound (inclusive) for generated fixed allocation values.
fixedLB :: Word64
fixedLB = 0
100126

101-
fprInvariant :: Double -> Bool
102-
fprInvariant x = x >= 0.01 && x <= 0.99
127+
-- | Upper bound (inclusive) for generated fixed allocation values.
fixedUB :: Word64
fixedUB = 20
129+
130+
-- | Generate a requested false positive rate: values drawn from 'genDouble'
-- are retried until one lies strictly between 0 and 1.
genFPR :: Gen Double
genFPR = suchThat genDouble (fprInvariant False)
132+
133+
-- | Shrink a requested false positive rate, keeping only candidates that lie
-- strictly between 0 and 1.
shrinkFPR :: Double -> [Double]
shrinkFPR x = filter (fprInvariant False) (shrink x)
135+
136+
-- | Check that a false positive rate is a valid probability.
--
-- The first argument selects whether the bounds are included: 'True' accepts
-- the closed interval @[0,1]@, 'False' only the open interval @(0,1)@.
fprInvariant :: Bool -> Double -> Bool
fprInvariant True  x = x >= 0 && x <= 1
fprInvariant False x = x > 0 && x < 1
103140

104141
--
105142
-- NumEntries
@@ -110,16 +147,21 @@ newtype NumEntries = NumEntries { getNumEntries :: Int }
110147

111148
-- | Generates entry counts between 'numEntriesLB' and 'numEntriesUB'; shrinking
-- only keeps candidates that still satisfy 'numEntriesInvariant'.
instance Arbitrary NumEntries where
  arbitrary = fmap NumEntries (chooseInt (numEntriesLB, numEntriesUB))
  shrink (NumEntries x) =
    filter numEntriesInvariant (map NumEntries (shrink x))
114156

115157
-- | Lower bound (inclusive) for generated numbers of entries.
numEntriesLB :: Int
numEntriesLB = 50_000
117159

118160
-- | Upper bound (inclusive) for generated numbers of entries.
numEntriesUB :: Int
numEntriesUB = 100_000
120162

121-
-- | A number of entries is valid when it lies in the inclusive range from
-- 'numEntriesLB' to 'numEntriesUB'.
numEntriesInvariant :: NumEntries -> Bool
numEntriesInvariant (NumEntries x) = numEntriesLB <= x && x <= numEntriesUB
123165

124166
--
125167
-- Seed
@@ -245,18 +287,12 @@ instance Monoid Counts where
245287

246288
-- | A function that builds an immutable bloom filter from a list of elements.
type BloomMaker a = [a] -> Bloom a
247289

248-
-- | Build a bloom filter from a list of elements.
--
-- The mutable filter is allocated by 'newMBloom' from the given
-- 'RunBloomFilterAlloc' and the length of the input list; every element is
-- then inserted and the filter is frozen.
mkBloomFromAlloc :: Hashable a => RunBloomFilterAlloc -> BloomMaker a
mkBloomFromAlloc alloc xs = runST $ do
    mb <- newMBloom (LSMT.NumEntries (length xs)) alloc
    mapM_ (MBloom.insert mb) xs
    -- The mutable filter is not used again after this point.
    Bloom.unsafeFreeze mb

0 commit comments

Comments
 (0)