Skip to content

Commit de47d16

Browse files
authored
Merge pull request #746 from IntersectMBO/jdral/lift-8-bytes-restriction
Lift the restriction of minimum 8-byte keys for the compact index
2 parents 89170eb + 58119ff commit de47d16

File tree

14 files changed

+78
-85
lines changed

14 files changed

+78
-85
lines changed

bench/micro/Bench/Database/LSMTree/Internal/Index.hs

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -11,15 +11,13 @@ import Criterion.Main (Benchmark, Benchmarkable, bench, bgroup, env,
1111
import Data.List (foldl')
1212
-- foldl' is included in the Prelude from base 4.20 onwards
1313
#endif
14-
import Database.LSMTree.Extras.Generators (getKeyForIndexCompact,
15-
mkPages, toAppends)
14+
import Database.LSMTree.Extras.Generators (mkPages, toAppends)
1615
-- also for @Arbitrary@ instantiation of @SerialisedKey@
1716
import Database.LSMTree.Extras.Index (Append, append)
1817
import Database.LSMTree.Internal.Index (Index,
1918
IndexType (Compact, Ordinary), newWithDefaults, search,
2019
unsafeEnd)
21-
import Database.LSMTree.Internal.Serialise
22-
(SerialisedKey (SerialisedKey))
20+
import Database.LSMTree.Internal.Serialise (SerialisedKey)
2321
import Test.QuickCheck (choose, vector)
2422
import Test.QuickCheck.Gen (Gen (MkGen))
2523
import Test.QuickCheck.Random (mkQCGen)
@@ -61,8 +59,7 @@ generated (MkGen exec) = exec (mkQCGen 411) 30
6159
keysForIndexCompact :: Int -- ^ Number of keys
6260
-> [SerialisedKey] -- ^ Constructed keys
6361
keysForIndexCompact = vector >>>
64-
generated >>>
65-
map (getKeyForIndexCompact >>> SerialisedKey)
62+
generated
6663

6764
{-|
6865
Constructs append operations whose serialised keys conform to the key size

doc/format-run.md

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -198,8 +198,9 @@ big-endian.
198198
The compact index type is designed to work with keys that are large
199199
cryptographic hashes, e.g. 32 bytes. In particular it requires:
200200

201-
* keys must be uniformly distributed
202-
* keys must be at least 8 bytes (64bits), but can otherwise be variable length
201+
* keys must be uniformly distributed;
202+
* keys can be of variable length;
203+
* keys less than 8 bytes (64bits) are padded with zeros (in LSB position).
203204

204205
For this important special case, we can do significantly better than storing a
205206
whole key per page: we can typically store just 8 bytes (64bits) per page. This

src-extras/Database/LSMTree/Extras/Generators.hs

Lines changed: 0 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -33,8 +33,6 @@ module Database.LSMTree.Extras.Generators (
3333
, genRawBytesSized
3434
, packRawBytesPinnedOrUnpinned
3535
, LargeRawBytes (..)
36-
, isKeyForIndexCompact
37-
, KeyForIndexCompact (..)
3836
, BiasedKey (..)
3937
-- * helpers
4038
, shrinkVec
@@ -510,28 +508,6 @@ instance Arbitrary LargeRawBytes where
510508

511509
deriving newtype instance SerialiseValue LargeRawBytes
512510

513-
-- Serialised keys for the compact index must be at least 8 bytes long.
514-
515-
genKeyForIndexCompact :: Gen RawBytes
516-
genKeyForIndexCompact =
517-
genRawBytesN =<< QC.sized (\s -> QC.chooseInt (8, s + 8))
518-
519-
isKeyForIndexCompact :: RawBytes -> Bool
520-
isKeyForIndexCompact rb = RB.size rb >= 8
521-
522-
newtype KeyForIndexCompact =
523-
KeyForIndexCompact { getKeyForIndexCompact :: RawBytes }
524-
deriving stock (Eq, Ord, Show)
525-
526-
instance Arbitrary KeyForIndexCompact where
527-
arbitrary =
528-
KeyForIndexCompact <$> genKeyForIndexCompact
529-
shrink (KeyForIndexCompact rawBytes) =
530-
[KeyForIndexCompact rawBytes' | rawBytes' <- shrink rawBytes,
531-
isKeyForIndexCompact rawBytes']
532-
533-
deriving newtype instance SerialiseKey KeyForIndexCompact
534-
535511
-- we try to make collisions and close keys more likely (very crudely)
536512
arbitraryBiasedKey :: (RawBytes -> k) -> Gen RawBytes -> Gen k
537513
arbitraryBiasedKey fromRB genUnbiased = fromRB <$> frequency

src/Database/LSMTree.hs

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -149,7 +149,6 @@ module Database.LSMTree (
149149
serialiseKeyIdentity,
150150
serialiseKeyIdentityUpToSlicing,
151151
serialiseKeyPreservesOrdering,
152-
serialiseKeyMinimalSize,
153152
serialiseValueIdentity,
154153
serialiseValueIdentityUpToSlicing,
155154
packSlice,
@@ -227,8 +226,7 @@ import Database.LSMTree.Internal.Config
227226
DiskCachePolicy (..), FencePointerIndexType (..),
228227
LevelNo (..), MergeBatchSize (..), MergePolicy (..),
229228
MergeSchedule (..), SizeRatio (..), TableConfig (..),
230-
WriteBufferAlloc (..), defaultTableConfig,
231-
serialiseKeyMinimalSize)
229+
WriteBufferAlloc (..), defaultTableConfig)
232230
import Database.LSMTree.Internal.Config.Override
233231
(TableConfigOverride (..), noTableConfigOverride)
234232
import Database.LSMTree.Internal.Entry (NumEntries (..))

src/Database/LSMTree/Internal/Config.hs

Lines changed: 4 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,6 @@ module Database.LSMTree.Internal.Config (
2020
-- * Fence pointer index
2121
, FencePointerIndexType (..)
2222
, indexTypeForRun
23-
, serialiseKeyMinimalSize
2423
-- * Disk cache policy
2524
, DiskCachePolicy (..)
2625
, diskCachePolicyForLevel
@@ -36,11 +35,9 @@ import Database.LSMTree.Internal.Index (IndexType)
3635
import qualified Database.LSMTree.Internal.Index as Index
3736
(IndexType (Compact, Ordinary))
3837
import qualified Database.LSMTree.Internal.MergingRun as MR
39-
import qualified Database.LSMTree.Internal.RawBytes as RB
4038
import Database.LSMTree.Internal.Run (RunDataCaching (..))
4139
import Database.LSMTree.Internal.RunAcc (RunBloomFilterAlloc (..))
4240
import Database.LSMTree.Internal.RunBuilder (RunParams (..))
43-
import Database.LSMTree.Internal.Serialise.Class (SerialiseKey (..))
4441

4542
newtype LevelNo = LevelNo Int
4643
deriving stock (Show, Eq, Ord)
@@ -321,12 +318,11 @@ data FencePointerIndexType =
321318
| {- |
322319
Compact indexes are designed for the case where the keys in the database are uniformly distributed, e.g., when the keys are hashes.
323320
324-
When using a compact index, the 'Database.LSMTree.Internal.Serialise.Class.serialiseKey' function must satisfy the following additional law:
321+
When using a compact index, some requirements apply to serialised keys:
325322
326-
[Minimal size]
327-
@'Database.LSMTree.Internal.RawBytes.size' ('Database.LSMTree.Internal.Serialise.Class.serialiseKey' x) >= 8@
328-
329-
Use 'serialiseKeyMinimalSize' to test this law.
323+
* keys must be uniformly distributed;
324+
* keys can be of variable length;
325+
* keys less than 8 bytes (64bits) are padded with zeros (in LSB position).
330326
-}
331327
CompactIndex
332328
deriving stock (Eq, Show)
@@ -339,10 +335,6 @@ indexTypeForRun :: FencePointerIndexType -> IndexType
339335
indexTypeForRun CompactIndex = Index.Compact
340336
indexTypeForRun OrdinaryIndex = Index.Ordinary
341337

342-
-- | Test the __Minimal size__ law for the 'CompactIndex' option.
343-
serialiseKeyMinimalSize :: SerialiseKey k => k -> Bool
344-
serialiseKeyMinimalSize x = RB.size (serialiseKey x) >= 8
345-
346338
{-------------------------------------------------------------------------------
347339
Disk cache policy
348340
-------------------------------------------------------------------------------}

src/Database/LSMTree/Internal/MergeSchedule.hs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -673,7 +673,7 @@ addRunToLevels tr conf@TableConfig{..} resolve hfs hbio root salt uc r0 reg leve
673673
traceWith tr $ AtLevel ln TraceAddLevel
674674
-- Make a new level
675675
let policyForLevel = mergePolicyForLevel confMergePolicy ln V.empty ul
676-
ir <- newMerge policyForLevel MR.MergeLastLevel ln rs
676+
ir <- newMerge policyForLevel (mergeTypeForLevel V.empty ul) ln rs
677677
pure $! V.singleton $ Level ir V.empty
678678
go !ln rs' (V.uncons -> Just (Level ir rs, ls)) = do
679679
r <- expectCompletedMerge ln ir
@@ -714,7 +714,7 @@ addRunToLevels tr conf@TableConfig{..} resolve hfs hbio root salt uc r0 reg leve
714714
-- Otherwise we start merging the incoming runs into the run.
715715
LevelLevelling -> do
716716
assert (V.null rs && V.null ls) $ pure ()
717-
ir' <- newMerge LevelLevelling MR.MergeLastLevel ln (rs' `V.snoc` r)
717+
ir' <- newMerge LevelLevelling (mergeTypeForLevel ls ul) ln (rs' `V.snoc` r)
718718
pure $! Level ir' V.empty `V.cons` V.empty
719719

720720
-- Releases the incoming run.

src/Database/LSMTree/Internal/RawBytes.hs

Lines changed: 22 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,7 @@ module Database.LSMTree.Internal.RawBytes (
5151
) where
5252

5353
import Control.DeepSeq (NFData)
54-
import Control.Exception (assert)
54+
import Data.Bits (Bits (..))
5555
import Data.BloomFilter.Hash (Hashable (..), hashByteArray)
5656
import qualified Data.ByteString as BS
5757
import qualified Data.ByteString.Builder as BB
@@ -71,6 +71,9 @@ import GHC.Stack
7171
import GHC.Word
7272
import Text.Printf (printf)
7373

74+
-- $setup
75+
-- >>> import Numeric
76+
7477
{- Note: [Export structure]
7578
~~~~~~~~~~~~~~~~~~~~~~~
7679
Since RawBytes are very similar to Primitive Vectors, the code is sectioned
@@ -172,14 +175,29 @@ drop = coerce VP.drop
172175
--
173176
-- The /top/ corresponds to the most significant bit (big-endian).
174177
--
175-
-- PRECONDITION: The byte-size of the raw bytes should be at least 8 bytes.
178+
-- If the number of bits is smaller than @64@, then any missing bits default to
179+
-- @0@s.
180+
--
181+
-- >>> showHex (topBits64 (pack [1,0,0,0,0,0,0,0])) ""
182+
-- "100000000000000"
183+
--
184+
-- >>> showHex (topBits64 (pack [1,0,0])) ""
185+
-- "100000000000000"
176186
--
177187
-- TODO: optimisation ideas: use unsafe shift/byteswap primops, look at GHC
178188
-- core, find other opportunities for using primops.
179189
--
180190
topBits64 :: RawBytes -> Word64
181-
topBits64 rb@(RawBytes (VP.Vector (I# off#) _size (ByteArray k#))) =
182-
assert (size rb >= 8) $ toWord64 (indexWord8ArrayAsWord64# k# off#)
191+
topBits64 rb@(RawBytes v@(VP.Vector (I# off#) _size (ByteArray k#)))
192+
| n >= 8
193+
= toWord64 (indexWord8ArrayAsWord64# k# off#)
194+
| otherwise
195+
= VP.foldl' f 0 v `unsafeShiftL` ((8 - n) * 8)
196+
where
197+
!n = size rb
198+
199+
f :: Word64 -> Word8 -> Word64
200+
f acc w = acc `unsafeShiftL` 8 + fromIntegral w
183201

184202
#if (MIN_VERSION_GLASGOW_HASKELL(9, 4, 0, 0))
185203
toWord64 :: Word64# -> Word64

src/Database/LSMTree/Simple.hs

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -134,7 +134,6 @@ module Database.LSMTree.Simple (
134134
serialiseKeyIdentity,
135135
serialiseKeyIdentityUpToSlicing,
136136
serialiseKeyPreservesOrdering,
137-
serialiseKeyMinimalSize,
138137
serialiseValueIdentity,
139138
serialiseValueIdentityUpToSlicing,
140139
packSlice,
@@ -182,9 +181,8 @@ import Database.LSMTree (BloomFilterAlloc, CursorClosedError (..),
182181
UnionCredits (..), UnionDebt (..), WriteBufferAlloc,
183182
isValidSnapshotName, noTableConfigOverride, packSlice,
184183
serialiseKeyIdentity, serialiseKeyIdentityUpToSlicing,
185-
serialiseKeyMinimalSize, serialiseKeyPreservesOrdering,
186-
serialiseValueIdentity, serialiseValueIdentityUpToSlicing,
187-
toSnapshotName)
184+
serialiseKeyPreservesOrdering, serialiseValueIdentity,
185+
serialiseValueIdentityUpToSlicing, toSnapshotName)
188186
import qualified Database.LSMTree as LSMT
189187
import qualified Database.LSMTree.Internal.Types as LSMT
190188
import qualified Database.LSMTree.Internal.Unsafe as Internal

test/Test/Database/LSMTree/Generators.hs

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -64,9 +64,6 @@ tests = testGroup "Test.Database.LSMTree.Generators" [
6464
prop_arbitraryAndShrinkPreserveInvariant
6565
(\(LargeRawBytes rb) -> labelRawBytes rb)
6666
(deepseqInvariant @LargeRawBytes)
67-
, testGroup "KeyForIndexCompact" $
68-
prop_arbitraryAndShrinkPreserveInvariant noTags $
69-
isKeyForIndexCompact . getKeyForIndexCompact
7067
, testGroup "BiasedKey" $
7168
prop_arbitraryAndShrinkPreserveInvariant
7269
(labelTestKOps @BiasedKey)

test/Test/Database/LSMTree/Internal/Index/Compact.hs

Lines changed: 4 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ import Data.Word
2828
import Database.LSMTree.Extras
2929
import Database.LSMTree.Extras.Generators (ChunkSize (..),
3030
LogicalPageSummaries, LogicalPageSummary (..), Pages (..),
31-
genRawBytes, isKeyForIndexCompact, labelPages, toAppends)
31+
genRawBytes, labelPages, toAppends)
3232
import Database.LSMTree.Extras.Index (Append (..), appendToCompact)
3333
import Database.LSMTree.Internal.BitMath
3434
import Database.LSMTree.Internal.Chunk as Chunk (toByteString)
@@ -54,9 +54,7 @@ import Text.Printf (printf)
5454

5555
tests :: TestTree
5656
tests = testGroup "Test.Database.LSMTree.Internal.Index.Compact" [
57-
testGroup "TestKey" $
58-
prop_arbitraryAndShrinkPreserveInvariant @TestKey noTags isTestKey
59-
, testProperty "prop_distribution @TestKey" $
57+
testProperty "prop_distribution @TestKey" $
6058
prop_distribution @TestKey
6159
, testProperty "prop_searchMinMaxKeysAfterConstruction" $
6260
prop_searchMinMaxKeysAfterConstruction @TestKey 100
@@ -173,15 +171,12 @@ instance Arbitrary TestKey where
173171
-- Shrink keys extensively: most failures will occur in small counterexamples,
174172
-- so we don't have to limit the number of shrinks as much.
175173
shrink (TestKey bytes) = [
176-
TestKey bytes'
174+
testkey'
177175
| let RawBytes vec = bytes
178176
, vec' <- VP.fromList <$> shrink (VP.toList vec)
179-
, let bytes' = RawBytes vec'
180-
, isKeyForIndexCompact bytes'
177+
, let testkey' = TestKey $ RawBytes vec'
181178
]
182179

183-
isTestKey :: TestKey -> Bool
184-
isTestKey (TestKey bytes) = isKeyForIndexCompact bytes
185180

186181
{-------------------------------------------------------------------------------
187182
Properties

0 commit comments

Comments
 (0)