Skip to content

Commit df88019

Browse files
authored
LedgerDB: prune on garbage collection instead of on every change (#1513)
This is in preparation for #1424. This PR is intended to be reviewed commit-by-commit. Currently, we prune the LedgerDB (i.e. remove all but the last `k+1` states) every time we adopt a longer chain. This means that we cannot rely on the fact that other threads (like the `copyAndSnapshot` ChainDB background thread) actually observe all immutable ledger states, just as described in the caveats of our `Watcher` abstraction. However, a predictable ledger snapshotting rule (#1424) requires this property; otherwise, when the node is under high load and/or we are adopting multiple blocks in quick succession, the node might not be able to create a snapshot for its desired block. This PR changes this fact: Now, when adopting new blocks, the LedgerDB is *not* immediately pruned. Instead, a new dedicated background thread for ledger maintenance tasks (flushing/snapshotting/garbage collection) in the ChainDB will periodically (on every new immutable block) wake up and (in particular) garbage collect the LedgerDB based on a slot number. Also, this makes the semantics more consistent with the existing garbage collection of previously-applied blocks in the LedgerDB, and also with how the ChainDB works, where we also don't immediately delete blocks from the VolatileDB once they are buried beneath `k+1` blocks. See #1513 (comment) for benchmarks demonstrating that the peak memory usage does not increase while syncing (where we now briefly might hold more than `k+1` ledger states in memory).
2 parents c8c627e + dec284f commit df88019

File tree

13 files changed

+390
-302
lines changed

13 files changed

+390
-302
lines changed
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
### Breaking
2+
3+
- Changed pruning of immutable ledger states to happen on LedgerDB garbage
4+
collection instead of directly on every block adoption. This is purely an
5+
internal refactoring (with breaking API changes) supporting predictable ledger
6+
snapshotting.
7+
8+
- Avoid maintaining volatile ledger states during ledger replay, making it
9+
slightly more efficient.

ouroboros-consensus/src/ouroboros-consensus/Ouroboros/Consensus/Storage/ChainDB/Impl.hs

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -278,7 +278,9 @@ openDBInternal args launchBgTasks = runWithTempRegistry $ do
278278
let testing =
279279
Internal
280280
{ intCopyToImmutableDB = getEnv h (withFuse copyTestFuse . Background.copyToImmutableDB)
281-
, intGarbageCollect = getEnv1 h Background.garbageCollect
281+
, intGarbageCollect = \slot -> getEnv h $ \e -> do
282+
Background.garbageCollectBlocks e slot
283+
LedgerDB.garbageCollect (cdbLedgerDB e) slot
282284
, intTryTakeSnapshot = getEnv h $ \env' ->
283285
void $ LedgerDB.tryTakeSnapshot (cdbLedgerDB env') Nothing maxBound
284286
, intAddBlockRunner = getEnv h (Background.addBlockRunner addBlockTestFuse)

ouroboros-consensus/src/ouroboros-consensus/Ouroboros/Consensus/Storage/ChainDB/Impl/Background.hs

Lines changed: 129 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
{-# LANGUAGE BangPatterns #-}
22
{-# LANGUAGE DeriveAnyClass #-}
33
{-# LANGUAGE DeriveGeneric #-}
4+
{-# LANGUAGE DerivingStrategies #-}
45
{-# LANGUAGE FlexibleContexts #-}
56
{-# LANGUAGE LambdaCase #-}
67
{-# LANGUAGE NamedFieldPuns #-}
@@ -19,11 +20,10 @@ module Ouroboros.Consensus.Storage.ChainDB.Impl.Background
1920
launchBgTasks
2021

2122
-- * Copying blocks from the VolatileDB to the ImmutableDB
22-
, copyAndSnapshotRunner
2323
, copyToImmutableDB
2424

2525
-- * Executing garbage collection
26-
, garbageCollect
26+
, garbageCollectBlocks
2727

2828
-- * Scheduling garbage collections
2929
, GcParams (..)
@@ -76,6 +76,7 @@ import qualified Ouroboros.Consensus.Storage.VolatileDB as VolatileDB
7676
import Ouroboros.Consensus.Util
7777
import Ouroboros.Consensus.Util.Condense
7878
import Ouroboros.Consensus.Util.IOLike
79+
import Ouroboros.Consensus.Util.STM (Watcher (..), forkLinkedWatcher)
7980
import Ouroboros.Network.AnchoredFragment (AnchoredSeq (..))
8081
import qualified Ouroboros.Network.AnchoredFragment as AF
8182

@@ -99,17 +100,30 @@ launchBgTasks cdb@CDB{..} replayed = do
99100
!addBlockThread <-
100101
launch "ChainDB.addBlockRunner" $
101102
addBlockRunner cdbChainSelFuse cdb
103+
104+
ledgerDbTasksTrigger <- newLedgerDbTasksTrigger replayed
105+
!ledgerDbMaintenaceThread <-
106+
forkLinkedWatcher cdbRegistry "ChainDB.ledgerDbTaskWatcher" $
107+
ledgerDbTaskWatcher cdb ledgerDbTasksTrigger
108+
102109
gcSchedule <- newGcSchedule
103110
!gcThread <-
104-
launch "ChainDB.gcScheduleRunner" $
111+
launch "ChainDB.gcBlocksScheduleRunner" $
105112
gcScheduleRunner gcSchedule $
106-
garbageCollect cdb
107-
!copyAndSnapshotThread <-
108-
launch "ChainDB.copyAndSnapshotRunner" $
109-
copyAndSnapshotRunner cdb gcSchedule replayed cdbCopyFuse
113+
garbageCollectBlocks cdb
114+
115+
!copyToImmutableDBThread <-
116+
launch "ChainDB.copyToImmutableDBRunner" $
117+
copyToImmutableDBRunner cdb ledgerDbTasksTrigger gcSchedule cdbCopyFuse
118+
110119
atomically $
111120
writeTVar cdbKillBgThreads $
112-
sequence_ [addBlockThread, gcThread, copyAndSnapshotThread]
121+
sequence_
122+
[ addBlockThread
123+
, cancelThread ledgerDbMaintenaceThread
124+
, gcThread
125+
, copyToImmutableDBThread
126+
]
113127
where
114128
launch :: String -> m Void -> m (m ())
115129
launch = fmap cancelThread .: forkLinkedThread cdbRegistry
@@ -198,22 +212,18 @@ copyToImmutableDB CDB{..} = electric $ do
198212
_ -> error "header to remove not on the current chain"
199213

200214
{-------------------------------------------------------------------------------
201-
Snapshotting
215+
Copy to ImmutableDB
202216
-------------------------------------------------------------------------------}
203217

204-
-- | Copy blocks from the VolatileDB to ImmutableDB and take snapshots of the
205-
-- LedgerDB
218+
-- | Copy blocks from the VolatileDB to ImmutableDB and trigger further tasks in
219+
-- other threads.
206220
--
207221
-- We watch the chain for changes. Whenever the chain is longer than @k@, then
208222
-- the headers older than @k@ are copied from the VolatileDB to the ImmutableDB
209223
-- (using 'copyToImmutableDB'). Once that is complete,
210224
--
211-
-- * We periodically take a snapshot of the LedgerDB (depending on its config).
212-
-- When enough blocks (depending on its config) have been replayed during
213-
-- startup, a snapshot of the replayed LedgerDB will be written to disk at the
214-
-- start of this function. NOTE: After this initial snapshot we do not take a
215-
-- snapshot of the LedgerDB until the chain has changed again, irrespective of
216-
-- the LedgerDB policy.
225+
-- * Trigger LedgerDB maintenance tasks, namely flushing, taking snapshots and
226+
-- garbage collection.
217227
--
218228
-- * Schedule GC of the VolatileDB ('scheduleGC') for the 'SlotNo' of the most
219229
-- recent block that was copied.
@@ -228,32 +238,26 @@ copyToImmutableDB CDB{..} = electric $ do
228238
-- GC can happen, when we restart the node and schedule the /next/ GC, it will
229239
-- /imply/ any previously scheduled GC, since GC is driven by slot number
230240
-- ("garbage collect anything older than @x@").
231-
copyAndSnapshotRunner ::
241+
copyToImmutableDBRunner ::
232242
forall m blk.
233243
( IOLike m
234244
, LedgerSupportsProtocol blk
235245
) =>
236246
ChainDbEnv m blk ->
247+
LedgerDbTasksTrigger m ->
237248
GcSchedule m ->
238-
-- | Number of immutable blocks replayed on ledger DB startup
239-
Word64 ->
240249
Fuse m ->
241250
m Void
242-
copyAndSnapshotRunner cdb@CDB{..} gcSchedule replayed fuse = do
251+
copyToImmutableDBRunner cdb@CDB{..} ledgerDbTasksTrigger gcSchedule fuse = do
243252
-- this first flush will persist the differences that come from the initial
244253
-- chain selection.
245254
LedgerDB.tryFlush cdbLedgerDB
246-
loop =<< LedgerDB.tryTakeSnapshot cdbLedgerDB Nothing replayed
255+
forever copyAndTrigger
247256
where
248257
SecurityParam k = configSecurityParam cdbTopLevelConfig
249258

250-
loop :: LedgerDB.SnapCounters -> m Void
251-
loop counters = do
252-
let LedgerDB.SnapCounters
253-
{ prevSnapshotTime
254-
, ntBlocksSinceLastSnap
255-
} = counters
256-
259+
copyAndTrigger :: m ()
260+
copyAndTrigger = do
257261
-- Wait for the chain to grow larger than @k@
258262
numToWrite <- atomically $ do
259263
curChain <- icWithoutTime <$> readTVar cdbChain
@@ -264,14 +268,10 @@ copyAndSnapshotRunner cdb@CDB{..} gcSchedule replayed fuse = do
264268
--
265269
-- This is a synchronous operation: when it returns, the blocks have been
266270
-- copied to disk (though not flushed, necessarily).
267-
withFuse fuse (copyToImmutableDB cdb) >>= scheduleGC'
271+
gcSlotNo <- withFuse fuse (copyToImmutableDB cdb)
268272

269-
LedgerDB.tryFlush cdbLedgerDB
270-
271-
now <- getMonotonicTime
272-
let ntBlocksSinceLastSnap' = ntBlocksSinceLastSnap + numToWrite
273-
274-
loop =<< LedgerDB.tryTakeSnapshot cdbLedgerDB ((,now) <$> prevSnapshotTime) ntBlocksSinceLastSnap'
273+
triggerLedgerDbTasks ledgerDbTasksTrigger gcSlotNo numToWrite
274+
scheduleGC' gcSlotNo
275275

276276
scheduleGC' :: WithOrigin SlotNo -> m ()
277277
scheduleGC' Origin = return ()
@@ -285,16 +285,104 @@ copyAndSnapshotRunner cdb@CDB{..} gcSchedule replayed fuse = do
285285
}
286286
gcSchedule
287287

288+
{-------------------------------------------------------------------------------
289+
LedgerDB maintenance tasks
290+
-------------------------------------------------------------------------------}
291+
292+
-- | Trigger for the LedgerDB maintenance tasks: it fires whenever the tip slot
293+
-- of the ImmutableDB advances, i.e. once we finish copying blocks to it.
294+
newtype LedgerDbTasksTrigger m
295+
= LedgerDbTasksTrigger (StrictTVar m LedgerDbTaskState)
296+
297+
data LedgerDbTaskState = LedgerDbTaskState
298+
{ ldbtsImmTip :: !(WithOrigin SlotNo)
299+
, ldbtsPrevSnapshotTime :: !(Maybe Time)
300+
, ldbtsBlocksSinceLastSnapshot :: !Word64
301+
}
302+
deriving stock Generic
303+
deriving anyclass NoThunks
304+
305+
newLedgerDbTasksTrigger ::
306+
IOLike m =>
307+
-- | Number of blocks replayed.
308+
Word64 ->
309+
m (LedgerDbTasksTrigger m)
310+
newLedgerDbTasksTrigger replayed = LedgerDbTasksTrigger <$> newTVarIO st
311+
where
312+
st =
313+
LedgerDbTaskState
314+
{ ldbtsImmTip = Origin
315+
, ldbtsPrevSnapshotTime = Nothing
316+
, ldbtsBlocksSinceLastSnapshot = replayed
317+
}
318+
319+
triggerLedgerDbTasks ::
320+
forall m.
321+
IOLike m =>
322+
LedgerDbTasksTrigger m ->
323+
-- | New tip of the ImmutableDB.
324+
WithOrigin SlotNo ->
325+
-- | Number of blocks written to the ImmutableDB.
326+
Word64 ->
327+
m ()
328+
triggerLedgerDbTasks (LedgerDbTasksTrigger varSt) immTip numWritten =
329+
atomically $ modifyTVar varSt $ \st ->
330+
st
331+
{ ldbtsImmTip = immTip
332+
, ldbtsBlocksSinceLastSnapshot = ldbtsBlocksSinceLastSnapshot st + numWritten
333+
}
334+
335+
-- | Run LedgerDB maintenance tasks when 'LedgerDbTasksTrigger' changes.
336+
--
337+
-- * Flushing of differences.
338+
-- * Taking snapshots.
339+
-- * Garbage collection.
340+
ledgerDbTaskWatcher ::
341+
forall m blk.
342+
IOLike m =>
343+
ChainDbEnv m blk ->
344+
LedgerDbTasksTrigger m ->
345+
Watcher m LedgerDbTaskState (WithOrigin SlotNo)
346+
ledgerDbTaskWatcher CDB{..} (LedgerDbTasksTrigger varSt) =
347+
Watcher
348+
{ wFingerprint = ldbtsImmTip
349+
, wInitial = Nothing
350+
, wReader = readTVar varSt
351+
, wNotify =
352+
\LedgerDbTaskState
353+
{ ldbtsImmTip
354+
, ldbtsBlocksSinceLastSnapshot = blocksSinceLast
355+
, ldbtsPrevSnapshotTime = prevSnapTime
356+
} ->
357+
whenJust (withOriginToMaybe ldbtsImmTip) $ \slotNo -> do
358+
LedgerDB.tryFlush cdbLedgerDB
359+
360+
now <- getMonotonicTime
361+
LedgerDB.SnapCounters
362+
{ prevSnapshotTime
363+
, ntBlocksSinceLastSnap
364+
} <-
365+
LedgerDB.tryTakeSnapshot
366+
cdbLedgerDB
367+
((,now) <$> prevSnapTime)
368+
blocksSinceLast
369+
atomically $ modifyTVar varSt $ \st ->
370+
st
371+
{ ldbtsBlocksSinceLastSnapshot =
372+
ldbtsBlocksSinceLastSnapshot st - blocksSinceLast + ntBlocksSinceLastSnap
373+
, ldbtsPrevSnapshotTime = prevSnapshotTime
374+
}
375+
376+
LedgerDB.garbageCollect cdbLedgerDB slotNo
377+
}
378+
288379
{-------------------------------------------------------------------------------
289380
Executing garbage collection
290381
-------------------------------------------------------------------------------}
291382

292383
-- | Trigger a garbage collection for blocks older than the given 'SlotNo' on
293384
-- the VolatileDB.
294385
--
295-
-- Also removes the corresponding cached "previously applied points" from the
296-
-- LedgerDB.
297-
--
298386
-- This is thread-safe as the VolatileDB locks itself while performing a GC.
299387
--
300388
-- When calling this function it is __critical__ that the blocks that will be
@@ -304,11 +392,10 @@ copyAndSnapshotRunner cdb@CDB{..} gcSchedule replayed fuse = do
304392
--
305393
-- TODO will a long GC be a bottleneck? It will block any other calls to
306394
-- @putBlock@ and @getBlock@.
307-
garbageCollect :: forall m blk. IOLike m => ChainDbEnv m blk -> SlotNo -> m ()
308-
garbageCollect CDB{..} slotNo = do
395+
garbageCollectBlocks :: forall m blk. IOLike m => ChainDbEnv m blk -> SlotNo -> m ()
396+
garbageCollectBlocks CDB{..} slotNo = do
309397
VolatileDB.garbageCollect cdbVolatileDB slotNo
310398
atomically $ do
311-
LedgerDB.garbageCollect cdbLedgerDB slotNo
312399
modifyTVar cdbInvalid $ fmap $ Map.filter ((>= slotNo) . invalidBlockSlotNo)
313400
traceWith cdbTracer $ TraceGCEvent $ PerformedGC slotNo
314401

ouroboros-consensus/src/ouroboros-consensus/Ouroboros/Consensus/Storage/LedgerDB/API.hs

Lines changed: 26 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -251,9 +251,15 @@ data LedgerDB m l blk = LedgerDB
251251
-- back as many blocks as the passed @Word64@.
252252
, getPrevApplied :: STM m (Set (RealPoint blk))
253253
-- ^ Get the references to blocks that have previously been applied.
254-
, garbageCollect :: SlotNo -> STM m ()
255-
-- ^ Garbage collect references to old blocks that have been previously
256-
-- applied and committed.
254+
, garbageCollect :: SlotNo -> m ()
255+
-- ^ Garbage collect references to state that is older than the given
256+
-- slot.
257+
--
258+
-- Concretely, this affects:
259+
--
260+
-- * Ledger states (and potentially underlying handles for on-disk storage).
261+
--
262+
-- * The set of previously applied points.
257263
, tryTakeSnapshot ::
258264
l ~ ExtLedgerState blk =>
259265
Maybe (Time, Time) ->
@@ -298,7 +304,14 @@ data TestInternals m l blk = TestInternals
298304
{ wipeLedgerDB :: m ()
299305
, takeSnapshotNOW :: WhereToTakeSnapshot -> Maybe String -> m ()
300306
, push :: ExtLedgerState blk DiffMK -> m ()
307+
-- ^ Push a ledger state, and prune the 'LedgerDB' to its immutable tip.
308+
--
309+
-- This does not modify the set of previously applied points.
301310
, reapplyThenPushNOW :: blk -> m ()
311+
-- ^ Apply block to the tip ledger state (using reapplication), and prune the
312+
-- 'LedgerDB' to its immutable tip.
313+
--
314+
-- This does not modify the set of previously applied points.
302315
, truncateSnapshots :: m ()
303316
, closeLedgerDB :: m ()
304317
, getNumLedgerTablesHandles :: m Word64
@@ -456,11 +469,10 @@ data InitDB db m blk = InitDB
456469
-- ^ Closing the database, to be reopened again with a different snapshot or
457470
-- with the genesis state.
458471
, initReapplyBlock :: !(LedgerDbCfg (ExtLedgerState blk) -> blk -> db -> m db)
459-
-- ^ Reapply a block from the immutable DB when initializing the DB.
472+
-- ^ Reapply a block from the immutable DB when initializing the DB. Prune the
473+
-- LedgerDB such that there are no volatile states.
460474
, currentTip :: !(db -> LedgerState blk EmptyMK)
461475
-- ^ Getting the current tip for tracing the Ledger Events.
462-
, pruneDb :: !(db -> m db)
463-
-- ^ Prune the database so that no immutable states are considered volatile.
464476
, mkLedgerDb ::
465477
!(db -> m (LedgerDB m (ExtLedgerState blk) blk, TestInternals m (ExtLedgerState blk) blk))
466478
-- ^ Create a LedgerDB from the initialized data structures from previous
@@ -545,13 +557,7 @@ initialize
545557
Left err -> do
546558
closeDb initDb
547559
error $ "Invariant violation: invalid immutable chain " <> show err
548-
Right (db, replayed) -> do
549-
db' <- pruneDb dbIface db
550-
return
551-
( acc InitFromGenesis
552-
, db'
553-
, replayed
554-
)
560+
Right (db, replayed) -> return (acc InitFromGenesis, db, replayed)
555561
tryNewestFirst acc (s : ss) = do
556562
eInitDb <- initFromSnapshot s
557563
case eInitDb of
@@ -603,9 +609,7 @@ initialize
603609
Monad.when (diskSnapshotIsTemporary s) $ deleteSnapshot hasFS s
604610
closeDb initDb
605611
tryNewestFirst (acc . InitFailure s err) ss
606-
Right (db, replayed) -> do
607-
db' <- pruneDb dbIface db
608-
return (acc (InitFromSnapshot s pt), db', replayed)
612+
Right (db, replayed) -> return (acc (InitFromSnapshot s pt), db, replayed)
609613

610614
replayTracer' =
611615
decorateReplayTracerWithGoal
@@ -775,10 +779,10 @@ type LedgerSupportsLedgerDB blk =
775779
-------------------------------------------------------------------------------}
776780

777781
-- | Options for pruning the LedgerDB
778-
--
779-
-- Rather than using a plain `Word64` we use this to be able to distinguish that
780-
-- we are indeed using
781-
-- 1. @0@ in places where it is necessary
782-
-- 2. the security parameter as is, in other places
783-
data LedgerDbPrune = LedgerDbPruneAll | LedgerDbPruneKeeping SecurityParam
782+
data LedgerDbPrune
783+
= -- | Prune all states, keeping only the current tip.
784+
LedgerDbPruneAll
785+
| -- | Prune such that all (non-anchor) states are not older than the given
786+
-- slot.
787+
LedgerDbPruneBeforeSlot SlotNo
784788
deriving Show

0 commit comments

Comments
 (0)