Split IncomingRun out of MergeSchedule

dcoutts · dcoutts · commit 0204324d8d4d · 2025-03-13T16:26:52.000Z
Also update the credit docs for nominal credits.
diff --git a/lsm-tree.cabal b/lsm-tree.cabal
@@ -131,6 +131,7 @@ library
     Database.LSMTree.Internal.CRC32C
     Database.LSMTree.Internal.Cursor
     Database.LSMTree.Internal.Entry
+    Database.LSMTree.Internal.IncomingRun
     Database.LSMTree.Internal.Index
     Database.LSMTree.Internal.Index.Compact
     Database.LSMTree.Internal.Index.CompactAcc
diff --git a/src/Database/LSMTree/Internal/IncomingRun.hs b/src/Database/LSMTree/Internal/IncomingRun.hs
@@ -0,0 +1,378 @@
+{-# LANGUAGE CPP           #-}
+{-# LANGUAGE MagicHash     #-}
+{-# LANGUAGE UnboxedTuples #-}
+
+#if !(MIN_VERSION_GLASGOW_HASKELL(9,0,0,0))
+-- Fix for ghc 8.10.x with deriving newtype Prim
+{-# LANGUAGE DataKinds     #-}
+#endif
+
+module Database.LSMTree.Internal.IncomingRun (
+    IncomingRun (..)
+  , MergePolicyForLevel (..)
+  , duplicateIncomingRun
+  , releaseIncomingRun
+  , newIncomingSingleRun
+  , newIncomingMergingRun
+  , snapshotIncomingRun
+
+    -- * Credits and credit tracking
+    -- $credittracking
+  , NominalDebt (..)
+  , NominalCredits (..)
+  , nominalDebtAsCredits
+  , supplyCreditsIncomingRun
+  , immediatelyCompleteIncomingRun
+  ) where
+
+import           Control.ActionRegistry
+import           Control.Concurrent.Class.MonadMVar.Strict
+import           Control.DeepSeq (NFData (..))
+import           Control.Monad.Class.MonadST (MonadST)
+import           Control.Monad.Class.MonadSTM (MonadSTM (..))
+import           Control.Monad.Class.MonadThrow (MonadMask, MonadThrow (..))
+import           Control.Monad.Primitive
+import           Control.RefCount
+import           Data.Primitive (Prim)
+import           Data.Primitive.PrimVar
+import           Database.LSMTree.Internal.Assertions (assert)
+import           Database.LSMTree.Internal.Config
+import           Database.LSMTree.Internal.Entry (NumEntries (..))
+import           Database.LSMTree.Internal.MergingRun (MergeCredits (..),
+                     MergeDebt (..), MergingRun)
+import qualified Database.LSMTree.Internal.MergingRun as MR
+import           Database.LSMTree.Internal.Run (Run)
+
+import           GHC.Exts (Word (W#), quotRemWord2#, timesWord2#)
+
+{-------------------------------------------------------------------------------
+  Incoming runs
+-------------------------------------------------------------------------------}
+
+-- | An incoming run is either a single run, or a merge with its credit state.
+data IncomingRun m h =
+       Single  !(Ref (Run m h))
+     | Merging !MergePolicyForLevel
+               !NominalDebt -- total nominal debt of the merge
+               !(PrimVar (PrimState m) NominalCredits) -- credits so far
+               !(Ref (MergingRun MR.LevelMergeType m h))
+
+data MergePolicyForLevel = LevelTiering | LevelLevelling
+  deriving stock (Show, Eq)
+
+instance NFData MergePolicyForLevel where
+  -- Nullary constructors only: weak head normal form is already normal form.
+  rnf policy = policy `seq` ()
+
+{-# SPECIALISE duplicateIncomingRun :: ActionRegistry IO -> IncomingRun IO h -> IO (IncomingRun IO h) #-}
+-- | Duplicate an incoming run. The new reference is taken with 'dupRef' under
+-- 'withRollback' (paired with 'releaseRef'); the credit counter is copied.
+duplicateIncomingRun ::
+     (PrimMonad m, MonadMask m)
+  => ActionRegistry m -> IncomingRun m h -> m (IncomingRun m h)
+duplicateIncomingRun reg incoming = case incoming of
+    Single run ->
+      Single <$> withRollback reg (dupRef run) releaseRef
+    Merging mp debt cv mr -> do
+      cv' <- newPrimVar =<< readPrimVar cv
+      Merging mp debt cv' <$> withRollback reg (dupRef mr) releaseRef
+
+{-# SPECIALISE releaseIncomingRun :: IncomingRun IO h -> IO () #-}
+-- | Release the reference this incoming run holds on its run or merging run.
+releaseIncomingRun :: (PrimMonad m, MonadMask m) => IncomingRun m h -> m ()
+releaseIncomingRun incoming = case incoming of
+    Single run       -> releaseRef run
+    Merging _ _ _ mr -> releaseRef mr
+
+{-# INLINE newIncomingSingleRun #-}
+-- | Make an incoming run from a single run, via 'dupRef' on the given run.
+newIncomingSingleRun ::
+     (PrimMonad m, MonadThrow m)
+  => Ref (Run m h) -> m (IncomingRun m h)
+newIncomingSingleRun run = fmap Single (dupRef run)
+
+{-# INLINE newIncomingMergingRun #-}
+-- | Make an incoming run from a merging run, via 'dupRef' on the given merge.
+-- The nominal credits counter starts at zero.
+newIncomingMergingRun ::
+     (PrimMonad m, MonadThrow m)
+  => MergePolicyForLevel -> NominalDebt
+  -> Ref (MergingRun MR.LevelMergeType m h) -> m (IncomingRun m h)
+newIncomingMergingRun mergePolicy nominalDebt mr = do
+    creditsVar <- newPrimVar (NominalCredits 0)
+    fmap (Merging mergePolicy nominalDebt creditsVar) (dupRef mr)
+
+{-# SPECIALISE snapshotIncomingRun ::
+     IncomingRun IO h
+  -> IO (Either (Ref (Run IO h))
+                (MergePolicyForLevel, NominalDebt, NominalCredits,
+                 Ref (MergingRun MR.LevelMergeType IO h))) #-}
+-- | Read out the contents of an incoming run: the run itself for 'Single', or
+-- the fields of 'Merging' with the current value of the credit counter. The
+-- returned 'Ref's are the ones the incoming run holds; none are duplicated.
+snapshotIncomingRun ::
+     PrimMonad m
+  => IncomingRun m h
+  -> m (Either (Ref (Run m h))
+               (MergePolicyForLevel, NominalDebt, NominalCredits,
+                Ref (MergingRun MR.LevelMergeType m h)))
+snapshotIncomingRun incoming = case incoming of
+    Single run -> pure (Left run)
+    Merging policy debt creditsVar mr -> do
+      credits <- readPrimVar creditsVar
+      pure (Right (policy, debt, credits, mr))
+
+{-------------------------------------------------------------------------------
+  Credits
+-------------------------------------------------------------------------------}
+
+{- $credittracking
+
+With scheduled merges, each update (e.g., insert) on a table contributes to the
+progression of ongoing merges in the levels structure. This ensures that merges
+are finished in time before a new merge has to be started. The points in the
+evolution of the levels structure where new merges are started are known: a
+flush of a full write buffer will create a new run on the first level, and
+after sufficient flushes (e.g., 4) we will start at least one new merge on the
+second level. This may cascade down to lower levels depending on how full the
+levels are. As such, we have a well-defined measure to determine when merges
+should be finished: it only depends on the maximum size of the write buffer!
+
+The simplest solution to making sure merges are done in time is to step them to
+completion immediately when started. This does not, however, spread out work
+over time nicely. Instead, we schedule merge work based on how many updates are
+made on the table, taking care to ensure that the merge is finished /just/ in
+time before the next flush comes around, and not too early.
+
+The progression is tracked using nominal credits. Each individual update
+contributes a single credit to each level, since each level contains precisely
+one ongoing merge. Contributing a credit does not, however, translate directly
+to performing one /unit/ of merging work:
+
+* The amount of work to do for one credit is adjusted depending on the actual
+  size of the merge we are doing. Last-level merges, for example, can have
+  larger inputs, and therefore we have to do a little more work for each
+  credit. Or input runs involved in a merge can be less than maximal size for
+  the level, and so there may be less merging work to do. As such, we /scale/
+  'NominalCredits' to 'MergeCredits', and then supply the 'MergeCredits' to
+  the 'MergingRun'.
+
+* Supplying 'MergeCredits' to a 'MergingRun' does not necessarily directly
+  translate into performing merging work. Merge credits are accumulated until
+  they go over a threshold, after which a batch of merge work will be performed.
+  Configuring this threshold should allow a good balance between spreading out
+  I\/O and achieving good (concurrent) performance.
+
+Merging runs can be shared across tables, which means that multiple threads
+can contribute to the same merge concurrently. Incoming runs however are /not/
+shared between tables. As such the tracking of 'NominalCredits' does not need
+to use any concurrency precautions.
+-}
+
+-- | Total nominal debt to pay off to complete the merge in an incoming run.
+--
+-- This corresponds to the number (worst case, minimum number) of update
+-- operations inserted into the table, before we will expect the merge to
+-- complete. It is distinct from the 'MergeDebt' of the underlying merging run.
+newtype NominalDebt = NominalDebt Int
+  deriving stock Eq
+  deriving newtype (NFData)
+
+-- | Nominal credits that get supplied to a table's levels.
+--
+-- This corresponds to the number of update operations inserted into the table.
+newtype NominalCredits = NominalCredits Int
+  deriving stock Eq
+  deriving newtype (Prim, NFData) -- 'Prim': the total is kept in a 'PrimVar'
+
+nominalDebtAsCredits :: NominalDebt -> NominalCredits
+nominalDebtAsCredits (NominalDebt debt) = NominalCredits debt -- same number
+
+{-# SPECIALISE supplyCreditsIncomingRun ::
+     TableConfig
+  -> LevelNo
+  -> IncomingRun IO h
+  -> NominalCredits
+  -> IO () #-}
+-- | Supply nominal credits to the merge in an incoming run. The amount given
+-- is added to the credits supplied so far; it is not a new absolute total.
+--
+-- A 'Single' incoming run has no merge, so for it this does nothing.
+supplyCreditsIncomingRun ::
+     (MonadSTM m, MonadST m, MonadMVar m, MonadMask m)
+  => TableConfig
+  -> LevelNo
+  -> IncomingRun m h
+  -> NominalCredits
+  -> m ()
+supplyCreditsIncomingRun conf ln incoming deposit = case incoming of
+    Single _ -> pure ()
+    Merging _ debt creditsVar mr -> do
+      -- Deposit locally first; the total is capped at the nominal debt.
+      (_, creditsNow) <- depositNominalCredits debt creditsVar deposit
+      -- Convert the absolute nominal total into an absolute merge credit.
+      let !mergeDebt = MR.totalMergeDebt mr
+          !target    = scaleNominalToMergeCredit debt mergeDebt creditsNow
+          !thresh    = creditThresholdForLevel conf ln
+      (_, _) <- MR.supplyCreditsAbsolute mr thresh target
+      pure ()
+      -- TODO: currently every call contributes credits to the underlying
+      -- merge, but this need not be the case. We _could_ do threshold based
+      -- batching at the level of the IncomingRun. The IncomingRun does not
+      -- need to worry about concurrency, so does not pay the cost of atomic
+      -- operations on the counters. Once a batch has accumulated we could
+      -- supply it to the MergingRun (which must use atomic operations for its
+      -- counters). We could potentially simplify MergingRun by dispensing with
+      -- batching for the MergeCredits counters.
+
+-- | Credit threshold for a level; currently the 'confWriteBufferAlloc' entry
+-- count for all levels. TODO: should differ for each level, maybe co-prime?
+creditThresholdForLevel :: TableConfig -> LevelNo -> MR.CreditThreshold
+creditThresholdForLevel conf (LevelNo _level) =
+    MR.CreditThreshold (MR.UnspentCredits (MergeCredits bufferEntries))
+  where AllocNumEntries (NumEntries bufferEntries) = confWriteBufferAlloc conf
+
+-- | Deposit nominal credits in the local credits var, capping the new total
+-- at the total nominal debt. Returns the totals before and after the deposit.
+--
+-- A deposit /can/ take the credits past the debt; this is not ruled out by
+-- construction. The scenario is this: when a completed merge is underfull, we
+-- combine it with the incoming run, so we have one run fewer on the level than
+-- we'd normally have. This means that the level becomes full at a later time,
+-- so more time passes before we call 'MR.expectCompleted' on any levels
+-- further down the tree. This means we keep supplying nominal credits to
+-- levels further down past the point their nominal debt is paid off. So the
+-- solution here is just to drop any nominal credits that are in excess of the
+-- nominal debt.
+--
+-- This is /not/ itself thread safe. All 'TableContent' update operations are
+-- expected to be serialised by the caller. See concurrency comments for
+-- 'TableContent' for detail.
+depositNominalCredits ::
+     PrimMonad m
+  => NominalDebt
+  -> PrimVar (PrimState m) NominalCredits
+  -> NominalCredits
+  -> m (NominalCredits, NominalCredits)
+depositNominalCredits (NominalDebt debt) creditsVar
+                      (NominalCredits deposit) = do
+    NominalCredits old <- readPrimVar creditsVar
+    -- Clamp to the debt: credits in excess of it are dropped.
+    let !new = NominalCredits (min debt (old + deposit))
+    writePrimVar creditsVar new
+    pure (NominalCredits old, new)
+
+-- | Linearly scale a nominal credit (between 0 and the nominal debt) into an
+-- equivalent merge credit (between 0 and the total merge debt).
+--
+-- Crucially, @100% nominal credit ~~ 100% merge credit@, so when we pay off
+-- the nominal debt, we also exactly pay off the merge debt. That is:
+--
+-- > scaleNominalToMergeCredit nominalDebt mergeDebt nominalDebt == mergeDebt
+--
+-- (modulo some newtype conversions)
+--
+scaleNominalToMergeCredit ::
+     NominalDebt
+  -> MergeDebt
+  -> NominalCredits
+  -> MergeCredits
+scaleNominalToMergeCredit (NominalDebt             nominalDebt)
+                          (MergeDebt (MergeCredits mergeDebt))
+                          (NominalCredits          nominalCredits) =
+    -- The scaling involves an operation: (a * b) `div` c
+    -- but where potentially the variables a,b,c may be bigger than a 32bit
+    -- integer can hold. This would be the case for runs that have more than
+    -- 4 billion entries.
+    --
+    -- (This is assuming 64bit Int; the problem would be even worse for 32bit
+    -- systems. The solution here would also work for 32bit systems, allowing
+    -- up to 2^31 (about 2 billion) entries per run.)
+    --
+    -- To work correctly in this case we need higher range for the intermediate
+    -- result a*b which could be bigger than 64bits can hold. A correct
+    -- implementation can use Rational, but a fast implementation should use
+    -- only integer operations. This is relevant because this is on the fast
+    -- path for small insertions into the table that often do no merging work
+    -- and just update credit counters.
+
+    -- The fast implementation uses integer operations that produce a 128bit
+    -- intermediate result for the a*b result, and use a 128bit numerator in
+    -- the division operation (but 64bit denominator). These are known as
+    -- "widening multiplication" and "narrowing division". GHC has direct
+    -- support for these operations as primops: timesWord2# and quotRemWord2#,
+    -- but they are not exposed through any high level API shipped with GHC.
+
+    -- The specification using Rational is:
+    let mergeCredits_spec = floor $ toRational nominalCredits
+                                  * toRational mergeDebt
+                                  / toRational nominalDebt
+    -- Note that it doesn't matter if we use floor or ceiling here.
+    -- Rounding errors will not compound because we sum nominal credits and
+    -- convert the absolute nominal total to an absolute merge credit. We
+    -- don't convert each deposit and sum all the rounding errors.
+    -- When nominalCredits == nominalDebt then the result is exact anyway
+    -- (being mergeDebt) so the rounding mode makes no difference when we
+    -- get to the end of the merge. Using floor makes things simpler for
+    -- the fast integer implementation below, so we take that as the spec.
+
+        -- If the nominalCredits is between 0 and nominalDebt then it's
+        -- guaranteed that the mergeCredit is between 0 and mergeDebt.
+        -- The mergeDebt fits in an Int, therefore the result does too.
+        -- Therefore the undefined behaviour case of timesDivABC_fast is
+        -- avoided and the w2i cannot overflow.
+        mergeCredits_fast = w2i $ timesDivABC_fast (i2w nominalCredits)
+                                                   (i2w mergeDebt)
+                                                   (i2w nominalDebt)
+     in assert (0 < nominalDebt) $
+        assert (0 <= nominalCredits && nominalCredits <= nominalDebt) $
+        assert (mergeCredits_spec == mergeCredits_fast) $
+        MergeCredits mergeCredits_fast
+  where
+    {-# INLINE i2w #-}
+    {-# INLINE w2i #-}
+    i2w :: Int -> Word
+    w2i :: Word -> Int
+    i2w = fromIntegral
+    w2i = fromIntegral
+
+-- | Compute @(a * b) `div` c@ on 64bit unsigned integers without losing the
+-- high bits of the product: @a * b@ is formed as a 128bit value (widening
+-- multiplication) and then divided by the 64bit @c@ (narrowing division).
+--
+-- The result must fit in 64bits, which is the case whenever @a <= c@.
+--
+-- Note: the behaviour is undefined if the result will not fit in 64bits.
+-- It will probably result in immediate termination with SIGFPE.
+--
+timesDivABC_fast :: Word -> Word -> Word -> Word
+timesDivABC_fast (W# x) (W# y) (W# d) =
+    case timesWord2# x y of
+      (# hi, lo #) ->
+        case quotRemWord2# hi lo d of
+          (# q, _ #) -> W# q
+
+{-# SPECIALISE immediatelyCompleteIncomingRun ::
+     TableConfig
+  -> LevelNo
+  -> IncomingRun IO h
+  -> IO (Ref (Run IO h)) #-}
+-- | Complete the merge of an incoming run right away, by supplying all of the
+-- nominal credits still outstanding, and return a new reference to the run.
+--
+-- For a 'Single' incoming run this just duplicates the reference to the run.
+immediatelyCompleteIncomingRun ::
+     (MonadSTM m, MonadST m, MonadMVar m, MonadMask m)
+  => TableConfig
+  -> LevelNo
+  -> IncomingRun m h
+  -> m (Ref (Run m h))
+immediatelyCompleteIncomingRun _    _  (Single run) = dupRef run
+immediatelyCompleteIncomingRun conf ln
+    incoming@(Merging _ (NominalDebt debt) creditsVar mr) = do
+    NominalCredits supplied <- readPrimVar creditsVar
+    let !outstanding = NominalCredits (debt - supplied)
+    supplyCreditsIncomingRun conf ln incoming outstanding
+    -- This ensures the merge is really completed. The merge itself is not
+    -- released here; we only return a new reference to the resulting run.
+    MR.expectCompleted mr
diff --git a/src/Database/LSMTree/Internal/MergeSchedule.hs b/src/Database/LSMTree/Internal/MergeSchedule.hs