|
| 1 | +module Test.Database.LSMTree.Internal.Vector.Growing (tests) where |
| 2 | + |
| 3 | +import Prelude hiding (head, length, tail) |
| 4 | + |
| 5 | +import Control.Category ((>>>)) |
| 6 | +import Control.Monad.ST.Strict (ST, runST) |
| 7 | +import qualified Data.List as List (length) |
| 8 | +import Data.Vector (Vector, toList) |
| 9 | +import Database.LSMTree.Internal.Vector.Growing (GrowingVector, |
| 10 | + append, freeze, new) |
| 11 | +import Test.QuickCheck (Arbitrary (arbitrary, shrink), Gen, |
| 12 | + NonNegative (NonNegative, getNonNegative), |
| 13 | + Positive (Positive), Property, Small (Small), Testable, |
| 14 | + chooseInt, coverTable, shrinkList, shrinkMap, shrinkMapBy, |
| 15 | + sized, tabulate, (===)) |
| 16 | +import Test.Tasty (TestTree, testGroup) |
| 17 | +import Test.Tasty.QuickCheck (testProperty) |
| 18 | + |
| 19 | +-- * Tests |
| 20 | + |
| 21 | +tests :: TestTree |
| 22 | +tests = testGroup "Test.Database.LSMTree.Internal.Vector.Growing" $ |
| 23 | + [ |
| 24 | + testProperty "Final vector is correct" |
| 25 | + prop_finalVectorIsCorrect |
| 26 | + ] |
| 27 | + |
| 28 | +-- * Utilities |
| 29 | + |
| 30 | +-- ** Segments |
| 31 | + |
| 32 | +data Segment a = Replicate Int a deriving stock Show |
| 33 | + |
| 34 | +segmentLength :: Segment a -> Int |
| 35 | +segmentLength (Replicate count _) = max 0 count |
| 36 | + |
| 37 | +segmentToList :: Segment a -> [a] |
| 38 | +segmentToList (Replicate count val) = replicate count val |
| 39 | + |
| 40 | +appendSegment :: GrowingVector s a -> Segment a -> ST s () |
| 41 | +appendSegment vec (Replicate count val) = append vec count val |
| 42 | + |
| 43 | +-- ** Vector construction |
| 44 | + |
| 45 | +{- |
| 46 | + Constructs an ordinary vector by creating a growing vector, appending |
| 47 | + segments to it, and finally freezing it. |
| 48 | +-} |
| 49 | +finalVector :: Int -- Initial buffer size |
| 50 | + -> [Segment a] -- Segments |
| 51 | + -> Vector a -- Final vector |
| 52 | +finalVector initialBufferSize segments = runST $ do |
| 53 | + vec <- new initialBufferSize |
| 54 | + mapM_ (appendSegment vec) segments |
| 55 | + freeze vec |
| 56 | + |
| 57 | +{- |
| 58 | + Supplies the final contents of a growing vector for constructing a property. |
| 59 | +
|
| 60 | + The resulting property additionally provides information about the |
| 61 | + occurrence of buffer size scaling exponents, where a buffer size scaling |
| 62 | + exponent is the binary logarithm of the ratio between the buffer size after |
| 63 | + and before appending a segment. If buffer enlargement for an append would |
| 64 | + happen in cycles where in each cycle the buffer size is doubled, then the |
| 65 | + buffer size scaling exponent would be equal to the number of such cycles. |
| 66 | + Therefore, a buffer size scaling exponent tells if the buffer is enlarged |
| 67 | + and, if yes, how much. |
| 68 | +-} |
| 69 | +withFinalVector |
| 70 | + :: Testable prop |
| 71 | + => Int -- Initial buffer size |
| 72 | + -> [Segment a] -- Segments |
| 73 | + -> (Vector a -> prop) -- Property from final vector |
| 74 | + -> Property -- Resulting property |
| 75 | +withFinalVector initialBufferSize segments consumer |
| 76 | + = coverTable |
| 77 | + "Buffer size scaling exponents" |
| 78 | + [(show scalingExp, 10) | scalingExp <- [0 .. 3] :: [Int]] |
| 79 | + $ |
| 80 | + tabulate |
| 81 | + "Buffer size scaling exponents" |
| 82 | + (show <$> bufferSizeScalingExponents availableBufferSizes 0 segments) |
| 83 | + $ |
| 84 | + consumer (finalVector initialBufferSize segments) |
| 85 | + where |
| 86 | + |
| 87 | + availableBufferSizes :: [Int] |
| 88 | + availableBufferSizes = iterate (* 2) initialBufferSize |
| 89 | + {- |
| 90 | + We do not need to account for overflow here, because the lengths of the |
| 91 | + vectors we construct are small compared to @maxBound :: Int@. |
| 92 | + -} |
| 93 | + |
| 94 | + bufferSizeScalingExponents :: [Int] -> Int -> [Segment a] -> [Int] |
| 95 | + bufferSizeScalingExponents _ _ [] |
| 96 | + = [] |
| 97 | + bufferSizeScalingExponents availableSizes length (head : tail) |
| 98 | + = List.length insufficient : |
| 99 | + bufferSizeScalingExponents sufficient length' tail |
| 100 | + where |
| 101 | + |
| 102 | + length' :: Int |
| 103 | + !length' = length + segmentLength head |
| 104 | + |
| 105 | + insufficient, sufficient :: [Int] |
| 106 | + (insufficient, sufficient) = span (< length') availableSizes |
| 107 | + |
| 108 | +-- * Properties to test |
| 109 | + |
| 110 | +prop_finalVectorIsCorrect :: Positive (Small Int) |
| 111 | + -> Segments Integer |
| 112 | + -> Property |
| 113 | +prop_finalVectorIsCorrect (Positive (Small initialBufferSize)) |
| 114 | + (Segments segments) |
| 115 | + = withFinalVector initialBufferSize segments $ \ vec -> |
| 116 | + toList vec === concatMap segmentToList segments |
| 117 | + |
| 118 | +-- * Test case generation and shrinking |
| 119 | + |
| 120 | +generateSegment :: Arbitrary a => Int -> Gen (Segment a) |
| 121 | +generateSegment maxCount = Replicate <$> chooseInt (0, maxCount) <*> arbitrary |
| 122 | + |
| 123 | +shrinkSegment :: Arbitrary a => Segment a -> [Segment a] |
| 124 | +shrinkSegment (Replicate count val) |
| 125 | + = [Replicate count' val | count' <- shrinkNat count] ++ |
| 126 | + [Replicate count val' | val' <- shrink val] |
| 127 | + where |
| 128 | + |
| 129 | + shrinkNat :: Int -> [Int] |
| 130 | + shrinkNat = shrinkMap getNonNegative NonNegative |
| 131 | + |
| 132 | +{- |
| 133 | + A list of segments to be appended to an initially empty vector. 'Segments a' |
| 134 | + is isomorphic to '[Segment a]' but has a special way of generating test |
| 135 | + cases, which avoids skewing with respect to buffer size scaling exponents. |
| 136 | +
|
| 137 | + The key issue of segment list generation is how to generate the length of an |
| 138 | + individual segment. It seems worthwhile to randomly pick a natural number |
| 139 | + smaller than or equal to some maximum length, using a uniform distribution. |
| 140 | + The question to be answered is how to choose the maximum lengths for the |
| 141 | + different segments. |
| 142 | +
|
| 143 | + A naïve approach is to use a common maximum length for all segments in a |
| 144 | + particular list, possibly computed from the size parameter. However, this |
| 145 | + results in most appends not enlarging the buffer, meaning that the buffer |
| 146 | + size scaling exponent is zero in most cases. This is because, as the buffer |
| 147 | + size increases, the number of elements needed to cause the next buffer |
| 148 | + enlargement increases too. |
| 149 | +
|
| 150 | + Note that it does not help much to choose the maximum segment length large |
| 151 | + compared to the initial buffer size. If this is done, then the buffer is |
| 152 | + enlarged very quickly, likely during the first append, and the distributions |
| 153 | + of buffer size scaling exponents up to this first enlargement may actually |
| 154 | + be very good. However, after the first enlargement, the buffer size is |
| 155 | + typically of the same order of magnitude as the maximum segment length, so |
| 156 | + that the buffer size scaling exponents of the following appends are likely |
| 157 | + to be zero or close to it, with further buffer enlargements making the |
| 158 | + situation only worse. |
| 159 | +
|
| 160 | + A better idea is to choose the maximum length of each segment but the first |
| 161 | + proportional to the length of the vector just before appending it (for the |
| 162 | + first segment, such a choice is not appropriate, because it would lead to a |
| 163 | + maximum length of zero). With this approach, the distribution of buffer size |
| 164 | + scaling exponents stays roughly the same as soon as the buffer has been |
| 165 | + enlarged for the first time, because then the ratio between the current |
| 166 | + buffer size and the current vector length is always in the interval \([1, |
| 167 | + 2)\) and thus varies only slightly. Reasonable distributions of buffer size |
| 168 | + scaling exponents can be achieved by choosing the ratio between maximum |
| 169 | + segment lengths and corresponding current-vector lengths appropriately. |
| 170 | +
|
| 171 | + A downside of this approach is that it requires tracking the current vector |
| 172 | + length. This tracking can be avoided by considering only the /expected/ |
| 173 | + current vector length and choosing the maximum segment length proportional |
| 174 | + to it. The expected length of a vector is the sum of the expected lengths of |
| 175 | + the segments constituting it, and the expected length of a segment is |
| 176 | + proportional to the maximum length used for generating it. Therefore, this |
| 177 | + modified solution boils down to choosing the maximum length of each segment |
| 178 | + but the first proportional to the sum of the maximum lengths of the segments |
| 179 | + preceding it. |
| 180 | +
|
| 181 | + Note that, with the approach just set out, the maximum lengths of the |
| 182 | + segments in a segment list almost form an exponential sequence, whose base |
| 183 | + is determined by the chosen ratio between maximum segment lengths and |
| 184 | + corresponding sums of maximum predecessor segment lengths. Therefore, a |
| 185 | + similarly good, but simpler, solution is to have the maximum segment lengths |
| 186 | + precisely form an exponential sequence. This is the approach that we use in |
| 187 | + our segment list generation algorithm. |
| 188 | +
|
| 189 | + The concrete exponential sequences that we employ start with the size |
| 190 | + parameter and use 3 as the base of the exponentiation. Since initial buffer |
| 191 | + sizes are generated using a uniform distribution with the size parameter as |
| 192 | + the maximum, the choice of the size parameter as the first segment’s maximum |
| 193 | + length leads to non-trivial distributions of buffer size scaling exponents |
| 194 | + for the appends up to the one that causes the first buffer enlargement. The |
| 195 | + choice of 3 as the exponentiation’s base leads to a good overall |
| 196 | + distribution of buffer size scaling exponents, as experiments have shown. |
| 197 | +
|
| 198 | + Because of the exponential growth of maximum segment lengths, vectors |
| 199 | + constructed during testing get very large for longer segment lists. |
| 200 | + Therefore, we do not pick the lengths of segment lists in the usual way but |
| 201 | + instead make all segment lists have a common, small length. Concretely, we |
| 202 | + use 9 as this common length, because this leads to vectors that are not |
| 203 | + trivial but still consume reasonable amounts of memory. |
| 204 | +-} |
| 205 | +newtype Segments a = Segments [Segment a] deriving stock Show |
| 206 | + |
| 207 | +instance Arbitrary a => Arbitrary (Segments a) where |
| 208 | + |
| 209 | + arbitrary = sized $ iterate (* scalingFactor) >>> |
| 210 | + take count >>> |
| 211 | + mapM generateSegment >>> |
| 212 | + fmap Segments |
| 213 | + where |
| 214 | + |
| 215 | + scalingFactor :: Int |
| 216 | + scalingFactor = 3 |
| 217 | + |
| 218 | + count :: Int |
| 219 | + count = 9 |
| 220 | + |
| 221 | + shrink = shrinkMapBy Segments (\ (Segments segments) -> segments) $ |
| 222 | + shrinkList shrinkSegment |
0 commit comments