Skip to content

Commit 460f257

Browse files
authored
Merge pull request #441 from IntersectMBO/jeltsch/growing-vector/tests
Add tests for growing vectors
2 parents 361f4dc + 5a041c8 commit 460f257

File tree

3 files changed

+225
-0
lines changed

3 files changed

+225
-0
lines changed

lsm-tree.cabal

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -366,6 +366,7 @@ test-suite lsm-tree-test
366366
Test.Database.LSMTree.Internal.Serialise
367367
Test.Database.LSMTree.Internal.Serialise.Class
368368
Test.Database.LSMTree.Internal.Vector
369+
Test.Database.LSMTree.Internal.Vector.Growing
369370
Test.Database.LSMTree.Model.Table
370371
Test.Database.LSMTree.Monoidal
371372
Test.Database.LSMTree.Normal.StateMachine

test/Main.hs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ import qualified Test.Database.LSMTree.Internal.RunReaders
2828
import qualified Test.Database.LSMTree.Internal.Serialise
2929
import qualified Test.Database.LSMTree.Internal.Serialise.Class
3030
import qualified Test.Database.LSMTree.Internal.Vector
31+
import qualified Test.Database.LSMTree.Internal.Vector.Growing
3132
import qualified Test.Database.LSMTree.Model.Table
3233
import qualified Test.Database.LSMTree.Monoidal
3334
import qualified Test.Database.LSMTree.Normal.StateMachine
@@ -63,6 +64,7 @@ main = defaultMain $ testGroup "lsm-tree"
6364
, Test.Database.LSMTree.Internal.Serialise.tests
6465
, Test.Database.LSMTree.Internal.Serialise.Class.tests
6566
, Test.Database.LSMTree.Internal.Vector.tests
67+
, Test.Database.LSMTree.Internal.Vector.Growing.tests
6668
, Test.Database.LSMTree.Model.Table.tests
6769
, Test.Database.LSMTree.Monoidal.tests
6870
, Test.Database.LSMTree.Normal.UnitTests.tests
Lines changed: 222 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,222 @@
1+
module Test.Database.LSMTree.Internal.Vector.Growing (tests) where
2+
3+
import Prelude hiding (head, length, tail)
4+
5+
import Control.Category ((>>>))
6+
import Control.Monad.ST.Strict (ST, runST)
7+
import qualified Data.List as List (length)
8+
import Data.Vector (Vector, toList)
9+
import Database.LSMTree.Internal.Vector.Growing (GrowingVector,
10+
append, freeze, new)
11+
import Test.QuickCheck (Arbitrary (arbitrary, shrink), Gen,
12+
NonNegative (NonNegative, getNonNegative),
13+
Positive (Positive), Property, Small (Small), Testable,
14+
chooseInt, coverTable, shrinkList, shrinkMap, shrinkMapBy,
15+
sized, tabulate, (===))
16+
import Test.Tasty (TestTree, testGroup)
17+
import Test.Tasty.QuickCheck (testProperty)
18+
19+
-- * Tests
20+
21+
tests :: TestTree
22+
tests = testGroup "Test.Database.LSMTree.Internal.Vector.Growing" $
23+
[
24+
testProperty "Final vector is correct"
25+
prop_finalVectorIsCorrect
26+
]
27+
28+
-- * Utilities
29+
30+
-- ** Segments
31+
32+
data Segment a = Replicate Int a deriving stock Show
33+
34+
segmentLength :: Segment a -> Int
35+
segmentLength (Replicate count _) = max 0 count
36+
37+
segmentToList :: Segment a -> [a]
38+
segmentToList (Replicate count val) = replicate count val
39+
40+
appendSegment :: GrowingVector s a -> Segment a -> ST s ()
41+
appendSegment vec (Replicate count val) = append vec count val
42+
43+
-- ** Vector construction
44+
45+
{-
46+
Constructs an ordinary vector by creating a growing vector, appending
47+
segments to it, and finally freezing it.
48+
-}
49+
finalVector :: Int -- Initial buffer size
50+
-> [Segment a] -- Segments
51+
-> Vector a -- Final vector
52+
finalVector initialBufferSize segments = runST $ do
53+
vec <- new initialBufferSize
54+
mapM_ (appendSegment vec) segments
55+
freeze vec
56+
57+
{-
58+
Supplies the final contents of a growing vector for constructing a property.
59+
60+
The resulting property additionally provides information about the
61+
occurrence of buffer size scaling exponents, where a buffer size scaling
62+
exponent is the binary logarithm of the ratio between the buffer size after
63+
and before appending a segment. If buffer enlargement for an append would
64+
happen in cycles where in each cycle the buffer size is doubled, then the
65+
buffer size scaling exponent would be equal to the number of such cycles.
66+
Therefore, a buffer size scaling exponent tells if the buffer is enlarged
67+
and, if yes, how much.
68+
-}
69+
withFinalVector
70+
:: Testable prop
71+
=> Int -- Initial buffer size
72+
-> [Segment a] -- Segments
73+
-> (Vector a -> prop) -- Property from final vector
74+
-> Property -- Resulting property
75+
withFinalVector initialBufferSize segments consumer
76+
= coverTable
77+
"Buffer size scaling exponents"
78+
[(show scalingExp, 10) | scalingExp <- [0 .. 3] :: [Int]]
79+
$
80+
tabulate
81+
"Buffer size scaling exponents"
82+
(show <$> bufferSizeScalingExponents availableBufferSizes 0 segments)
83+
$
84+
consumer (finalVector initialBufferSize segments)
85+
where
86+
87+
availableBufferSizes :: [Int]
88+
availableBufferSizes = iterate (* 2) initialBufferSize
89+
{-
90+
We do not need to account for overflow here, because the lengths of the
91+
vectors we construct are small compared to @maxBound :: Int@.
92+
-}
93+
94+
bufferSizeScalingExponents :: [Int] -> Int -> [Segment a] -> [Int]
95+
bufferSizeScalingExponents _ _ []
96+
= []
97+
bufferSizeScalingExponents availableSizes length (head : tail)
98+
= List.length insufficient :
99+
bufferSizeScalingExponents sufficient length' tail
100+
where
101+
102+
length' :: Int
103+
!length' = length + segmentLength head
104+
105+
insufficient, sufficient :: [Int]
106+
(insufficient, sufficient) = span (< length') availableSizes
107+
108+
-- * Properties to test
109+
110+
prop_finalVectorIsCorrect :: Positive (Small Int)
111+
-> Segments Integer
112+
-> Property
113+
prop_finalVectorIsCorrect (Positive (Small initialBufferSize))
114+
(Segments segments)
115+
= withFinalVector initialBufferSize segments $ \ vec ->
116+
toList vec === concatMap segmentToList segments
117+
118+
-- * Test case generation and shrinking
119+
120+
generateSegment :: Arbitrary a => Int -> Gen (Segment a)
121+
generateSegment maxCount = Replicate <$> chooseInt (0, maxCount) <*> arbitrary
122+
123+
shrinkSegment :: Arbitrary a => Segment a -> [Segment a]
124+
shrinkSegment (Replicate count val)
125+
= [Replicate count' val | count' <- shrinkNat count] ++
126+
[Replicate count val' | val' <- shrink val]
127+
where
128+
129+
shrinkNat :: Int -> [Int]
130+
shrinkNat = shrinkMap getNonNegative NonNegative
131+
132+
{-
133+
A list of segments to be appended to an initially empty vector. 'Segments a'
134+
is isomorphic to '[Segment a]' but has a special way of generating test
135+
cases, which avoids skewing with respect to buffer size scaling exponents.
136+
137+
The key issue of segment list generation is how to generate the length of an
138+
individual segment. It seems worthwhile to randomly pick a natural number
139+
smaller than or equal to some maximum length, using a uniform distribution.
140+
The question to be answered is how to choose the maximum lengths for the
141+
different segments.
142+
143+
A naïve approach is to use a common maximum length for all segments in a
144+
particular list, possibly computed from the size parameter. However, this
145+
results in most appends not enlarging the buffer, meaning that the buffer
146+
size scaling exponent is zero in most cases. This is because, as the buffer
147+
size increases, the number of elements needed to cause the next buffer
148+
enlargement increases too.
149+
150+
Note that it does not help much to choose the maximum segment length large
151+
compared to the initial buffer size. If this is done, then the buffer is
152+
enlarged very quickly, likely during the first append, and the distributions
153+
of buffer size scaling exponents up to this first enlargement may actually
154+
be very good. However, after the first enlargement, the buffer size is
155+
typically of the same order of magnitude as the maximum segment length, so
156+
that the buffer size scaling exponents of the following appends are likely
157+
to be zero or close to it, with further buffer enlargements making the
158+
situation only worse.
159+
160+
A better idea is to choose the maximum length of each segment but the first
161+
proportional to the length of the vector just before appending it (for the
162+
first segment, such a choice is not appropriate, because it would lead to a
163+
maximum length of zero). With this approach, the distribution of buffer size
164+
scaling exponents stays roughly the same as soon as the buffer has been
165+
enlarged for the first time, because then the ratio between the current
166+
buffer size and the current vector length is always in the interval \([1,
167+
2)\) and thus varies only slightly. Reasonable distributions of buffer size
168+
scaling exponents can be achieved by choosing the ratio between maximum
169+
segment lengths and corresponding current-vector lengths appropriately.
170+
171+
A downside of this approach is that it requires tracking the current vector
172+
length. This tracking can be avoided by considering only the /expected/
173+
current vector length and choosing the maximum segment length proportional
174+
to it. The expected length of a vector is the sum of the expected lengths of
175+
the segments constituting it, and the expected length of a segment is
176+
proportional to the maximum length used for generating it. Therefore, this
177+
modified solution boils down to choosing the maximum length of each segment
178+
but the first proportional to the sum of the maximum lengths of the segments
179+
preceding it.
180+
181+
Note that, with the approach just set out, the maximum lengths of the
182+
segments in a segment list almost form an exponential sequence, whose base
183+
is determined by the chosen ratio between maximum segment lengths and
184+
corresponding sums of maximum predecessor segment lengths. Therefore, a
185+
similarly good, but simpler, solution is to have the maximum segment lengths
186+
precisely form an exponential sequence. This is the approach that we use in
187+
our segment list generation algorithm.
188+
189+
The concrete exponential sequences that we employ start with the size
190+
parameter and use 3 as the base of the exponentiation. Since initial buffer
191+
sizes are generated using a uniform distribution with the size parameter as
192+
the maximum, the choice of the size parameter as the first segment’s maximum
193+
length leads to non-trivial distributions of buffer size scaling exponents
194+
for the appends up to the one that causes the first buffer enlargement. The
195+
choice of 3 as the exponentiation’s base leads to a good overall
196+
distribution of buffer size scaling exponents, as experiments have shown.
197+
198+
Because of the exponential growth of maximum segment lengths, vectors
199+
constructed during testing get very large for longer segment lists.
200+
Therefore, we do not pick the lengths of segment lists in the usual way but
201+
instead make all segment lists have a common, small length. Concretely, we
202+
use 9 as this common length, because this leads to vectors that are not
203+
trivial but still consume reasonable amounts of memory.
204+
-}
205+
newtype Segments a = Segments [Segment a] deriving stock Show
206+
207+
instance Arbitrary a => Arbitrary (Segments a) where
208+
209+
arbitrary = sized $ iterate (* scalingFactor) >>>
210+
take count >>>
211+
mapM generateSegment >>>
212+
fmap Segments
213+
where
214+
215+
scalingFactor :: Int
216+
scalingFactor = 3
217+
218+
count :: Int
219+
count = 9
220+
221+
shrink = shrinkMapBy Segments (\ (Segments segments) -> segments) $
222+
shrinkList shrinkSegment

0 commit comments

Comments
 (0)