Skip to content

Commit 8e16ce0

Browse files
authored
LedgerDB.V2: prevent race condition when snapshotting (#1555)
Closes #1551 The commits are intended to be reviewed individually. - The first commit fixes the race condition described in #1551 by using the `ldbOpenHandlesLock` to duplicate the handle when snapshotting. - The second commit reverts 7900088 added in #1516, enabled by the first commit. Without the first commit, this would cause test failures, see the commit description. - The third and fourth commits are purely refactorings to be able to reuse the pattern of "duplicating while holding a read lock". - The fifth commit uses the same pattern for `takeSnapshotNOW`, a testing function, see the commit description. - The last commit adds documentation for `ldbOpenHandlesLock`.
2 parents 9468967 + 4010598 commit 8e16ce0

File tree

4 files changed

+107
-86
lines changed

4 files changed

+107
-86
lines changed
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
### Patch
2+
3+
- LedgerDB.V2: prevent race condition when creating snapshots.

ouroboros-consensus/src/ouroboros-consensus/Ouroboros/Consensus/Storage/LedgerDB/V1.hs

Lines changed: 20 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ module Ouroboros.Consensus.Storage.LedgerDB.V1 (mkInitDb) where
2323
import Control.Arrow ((>>>))
2424
import Control.Monad
2525
import Control.Monad.Except
26+
import Control.Monad.Trans (lift)
2627
import Control.ResourceRegistry
2728
import Control.Tracer
2829
import Data.Bifunctor (first)
@@ -721,38 +722,30 @@ acquireAtTarget ::
721722
ResourceRegistry m ->
722723
Either Word64 (Target (Point blk)) ->
723724
ReadLocked m (Either GetForkerError (Resources m l))
724-
acquireAtTarget ldbEnv rr (Right VolatileTip) =
725-
readLocked $ do
726-
dblog <- readTVarIO (ldbChangelog ldbEnv)
727-
Right . (,dblog) <$> acquire ldbEnv rr dblog
728-
acquireAtTarget ldbEnv rr (Right ImmutableTip) =
729-
readLocked $ do
730-
dblog <- readTVarIO (ldbChangelog ldbEnv)
731-
Right . (,rollbackToAnchor dblog)
732-
<$> acquire ldbEnv rr dblog
733-
acquireAtTarget ldbEnv rr (Right (SpecificPoint pt)) =
734-
readLocked $ do
735-
dblog <- readTVarIO (ldbChangelog ldbEnv)
736-
let immTip = getTip $ anchor dblog
737-
case rollback pt dblog of
738-
Nothing
739-
| pointSlot pt < pointSlot immTip -> pure $ Left $ PointTooOld Nothing
740-
| otherwise -> pure $ Left PointNotOnChain
741-
Just dblog' -> Right . (,dblog') <$> acquire ldbEnv rr dblog'
742-
acquireAtTarget ldbEnv rr (Left n) = readLocked $ do
743-
dblog <- readTVarIO (ldbChangelog ldbEnv)
744-
case rollbackN n dblog of
745-
Nothing ->
746-
return $
747-
Left $
725+
acquireAtTarget ldbEnv rr target = readLocked $ runExceptT $ do
726+
dblog <- lift $ readTVarIO (ldbChangelog ldbEnv)
727+
-- Get the prefix of the dblog ending in the specified target.
728+
dblog' <- case target of
729+
Right VolatileTip -> pure dblog
730+
Right ImmutableTip -> pure $ rollbackToAnchor dblog
731+
Right (SpecificPoint pt) -> do
732+
let immTip = getTip $ anchor dblog
733+
case rollback pt dblog of
734+
Nothing
735+
| pointSlot pt < pointSlot immTip -> throwError $ PointTooOld Nothing
736+
| otherwise -> throwError PointNotOnChain
737+
Just dblog' -> pure dblog'
738+
Left n -> case rollbackN n dblog of
739+
Nothing ->
740+
throwError $
748741
PointTooOld $
749-
Just $
742+
Just
750743
ExceededRollback
751744
{ rollbackMaximum = maxRollback dblog
752745
, rollbackRequested = n
753746
}
754-
Just dblog' ->
755-
Right . (,dblog') <$> acquire ldbEnv rr dblog'
747+
Just dblog' -> pure dblog'
748+
lift $ (,dblog') <$> acquire ldbEnv rr dblog'
756749

757750
acquire ::
758751
(IOLike m, GetTip l) =>

ouroboros-consensus/src/ouroboros-consensus/Ouroboros/Consensus/Storage/LedgerDB/V2.hs

Lines changed: 83 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -24,13 +24,16 @@ import Control.RAWLock
2424
import qualified Control.RAWLock as RAWLock
2525
import Control.ResourceRegistry
2626
import Control.Tracer
27+
import Data.Foldable (traverse_)
2728
import qualified Data.Foldable as Foldable
2829
import Data.Functor.Contravariant ((>$<))
2930
import Data.Kind (Type)
3031
import Data.Map (Map)
3132
import qualified Data.Map.Strict as Map
3233
import Data.Set (Set)
3334
import qualified Data.Set as Set
35+
import Data.Traversable (for)
36+
import Data.Tuple (Solo (..))
3437
import Data.Void
3538
import Data.Word
3639
import GHC.Generics
@@ -193,19 +196,17 @@ mkInternals ::
193196
mkInternals bss h =
194197
TestInternals
195198
{ takeSnapshotNOW = \whereTo suff -> getEnv h $ \env -> do
196-
st <-
197-
( case whereTo of
199+
let selectWhereTo = case whereTo of
198200
TakeAtImmutableTip -> anchorHandle
199201
TakeAtVolatileTip -> currentHandle
200-
)
201-
<$> readTVarIO (ldbSeq env)
202-
Monad.void $
203-
takeSnapshot
204-
(configCodec . getExtLedgerCfg . ledgerDbCfg $ ldbCfg env)
205-
(LedgerDBSnapshotEvent >$< ldbTracer env)
206-
(ldbHasFS env)
207-
suff
208-
st
202+
withStateRef env (MkSolo . selectWhereTo) $ \(MkSolo st) ->
203+
Monad.void $
204+
takeSnapshot
205+
(configCodec . getExtLedgerCfg . ledgerDbCfg $ ldbCfg env)
206+
(LedgerDBSnapshotEvent >$< ldbTracer env)
207+
(ldbHasFS env)
208+
suff
209+
st
209210
, push = \st -> withRegistry $ \reg -> do
210211
eFrk <- newForkerAtTarget h reg VolatileTip
211212
case eFrk of
@@ -368,13 +369,13 @@ implTryTakeSnapshot ::
368369
implTryTakeSnapshot bss env mTime nrBlocks =
369370
if onDiskShouldTakeSnapshot (ldbSnapshotPolicy env) (uncurry (flip diffTime) <$> mTime) nrBlocks
370371
then do
371-
Monad.void
372-
. takeSnapshot
373-
(configCodec . getExtLedgerCfg . ledgerDbCfg $ ldbCfg env)
374-
(LedgerDBSnapshotEvent >$< ldbTracer env)
375-
(ldbHasFS env)
376-
. anchorHandle
377-
=<< readTVarIO (ldbSeq env)
372+
withStateRef env (MkSolo . anchorHandle) $ \(MkSolo st) ->
373+
Monad.void $
374+
takeSnapshot
375+
(configCodec . getExtLedgerCfg . ledgerDbCfg $ ldbCfg env)
376+
(LedgerDBSnapshotEvent >$< ldbTracer env)
377+
(ldbHasFS env)
378+
st
378379
Monad.void $
379380
trimSnapshots
380381
(LedgerDBSnapshotEvent >$< ldbTracer env)
@@ -457,6 +458,19 @@ data LedgerDBEnv m l blk = LedgerDBEnv
457458
, ldbResolveBlock :: !(ResolveBlock m blk)
458459
, ldbQueryBatchSize :: !QueryBatchSize
459460
, ldbOpenHandlesLock :: !(RAWLock m LDBLock)
461+
-- ^ While holding a read lock (at least), all handles in the 'ldbSeq' are
462+
-- guaranteed to be open. During this time, the handle can be duplicated and
463+
-- then be used independently, see 'getStateRef' and 'withStateRef'.
464+
--
465+
-- Therefore, closing any handles which were previously in 'ldbSeq' requires
466+
-- acquiring a write lock. Concretely, both of the following approaches are
467+
-- fine:
468+
--
469+
-- * Modify 'ldbSeq' without any locking, and then close the removed handles
470+
-- while holding a write lock. See e.g. 'closeForkerEnv'.
471+
--
472+
-- * Modify 'ldbSeq' while holding a write lock, and then close the removed
473+
-- handles without any locking.
460474
}
461475
deriving Generic
462476

@@ -546,8 +560,36 @@ getEnvSTM (LDBHandle varState) f =
546560
Acquiring consistent views
547561
-------------------------------------------------------------------------------}
548562

549-
-- | This function must hold the 'LDBLock' such that handles are not released
550-
-- before they are duplicated.
563+
-- | Get a 'StateRef' from the 'LedgerSeq' in the 'LedgerDBEnv', with the
564+
-- 'LedgerTablesHandle' having been duplicated (such that the original can be
565+
-- closed). The caller is responsible for closing the handle.
566+
--
567+
-- For more flexibility, an arbitrary 'Traversable' of the 'StateRef' can be
568+
-- returned; for the simple use case of getting a single 'StateRef', use @t ~
569+
-- 'Solo'@.
570+
getStateRef ::
571+
(IOLike m, Traversable t) =>
572+
LedgerDBEnv m l blk ->
573+
(LedgerSeq m l -> t (StateRef m l)) ->
574+
m (t (StateRef m l))
575+
getStateRef ldbEnv project =
576+
RAWLock.withReadAccess (ldbOpenHandlesLock ldbEnv) $ \LDBLock -> do
577+
tst <- project <$> readTVarIO (ldbSeq ldbEnv)
578+
for tst $ \st -> do
579+
tables' <- duplicate $ tables st
580+
pure st{tables = tables'}
581+
582+
-- | Like 'StateRef', but takes care of closing the handle when the given action
583+
-- returns or errors.
584+
withStateRef ::
585+
(IOLike m, Traversable t) =>
586+
LedgerDBEnv m l blk ->
587+
(LedgerSeq m l -> t (StateRef m l)) ->
588+
(t (StateRef m l) -> m a) ->
589+
m a
590+
withStateRef ldbEnv project =
591+
bracket (getStateRef ldbEnv project) (traverse_ (close . tables))
592+
551593
acquireAtTarget ::
552594
( HeaderHash l ~ HeaderHash blk
553595
, IOLike m
@@ -557,41 +599,28 @@ acquireAtTarget ::
557599
) =>
558600
LedgerDBEnv m l blk ->
559601
Either Word64 (Target (Point blk)) ->
560-
LDBLock ->
561602
m (Either GetForkerError (StateRef m l))
562-
acquireAtTarget ldbEnv (Right VolatileTip) _ = do
563-
l <- readTVarIO (ldbSeq ldbEnv)
564-
let StateRef st tbs = currentHandle l
565-
t <- duplicate tbs
566-
pure $ Right $ StateRef st t
567-
acquireAtTarget ldbEnv (Right ImmutableTip) _ = do
568-
l <- readTVarIO (ldbSeq ldbEnv)
569-
let StateRef st tbs = anchorHandle l
570-
t <- duplicate tbs
571-
pure $ Right $ StateRef st t
572-
acquireAtTarget ldbEnv (Right (SpecificPoint pt)) _ = do
573-
dblog <- readTVarIO (ldbSeq ldbEnv)
574-
let immTip = getTip $ anchor dblog
575-
case currentHandle <$> rollback pt dblog of
576-
Nothing
577-
| pointSlot pt < pointSlot immTip -> pure $ Left $ PointTooOld Nothing
578-
| otherwise -> pure $ Left PointNotOnChain
579-
Just (StateRef st tbs) ->
580-
Right . StateRef st <$> duplicate tbs
581-
acquireAtTarget ldbEnv (Left n) _ = do
582-
dblog <- readTVarIO (ldbSeq ldbEnv)
583-
case currentHandle <$> rollbackN n dblog of
584-
Nothing ->
585-
return $
586-
Left $
603+
acquireAtTarget ldbEnv target =
604+
getStateRef ldbEnv $ \l -> case target of
605+
Right VolatileTip -> pure $ currentHandle l
606+
Right ImmutableTip -> pure $ anchorHandle l
607+
Right (SpecificPoint pt) -> do
608+
let immTip = getTip $ anchor l
609+
case rollback pt l of
610+
Nothing
611+
| pointSlot pt < pointSlot immTip -> throwError $ PointTooOld Nothing
612+
| otherwise -> throwError PointNotOnChain
613+
Just t' -> pure $ currentHandle t'
614+
Left n -> case rollbackN n l of
615+
Nothing ->
616+
throwError $
587617
PointTooOld $
588-
Just $
618+
Just
589619
ExceededRollback
590-
{ rollbackMaximum = maxRollback dblog
620+
{ rollbackMaximum = maxRollback l
591621
, rollbackRequested = n
592622
}
593-
Just (StateRef st tbs) ->
594-
Right . StateRef st <$> duplicate tbs
623+
Just l' -> pure $ currentHandle l'
595624

596625
newForkerAtTarget ::
597626
( HeaderHash l ~ HeaderHash blk
@@ -605,8 +634,8 @@ newForkerAtTarget ::
605634
ResourceRegistry m ->
606635
Target (Point blk) ->
607636
m (Either GetForkerError (Forker m l blk))
608-
newForkerAtTarget h rr pt = getEnv h $ \ldbEnv@LedgerDBEnv{ldbOpenHandlesLock = lock} ->
609-
RAWLock.withReadAccess lock (acquireAtTarget ldbEnv (Right pt)) >>= traverse (newForker h ldbEnv rr)
637+
newForkerAtTarget h rr pt = getEnv h $ \ldbEnv ->
638+
acquireAtTarget ldbEnv (Right pt) >>= traverse (newForker h ldbEnv rr)
610639

611640
newForkerByRollback ::
612641
( HeaderHash l ~ HeaderHash blk
@@ -620,8 +649,8 @@ newForkerByRollback ::
620649
ResourceRegistry m ->
621650
Word64 ->
622651
m (Either GetForkerError (Forker m l blk))
623-
newForkerByRollback h rr n = getEnv h $ \ldbEnv@LedgerDBEnv{ldbOpenHandlesLock = lock} -> do
624-
RAWLock.withReadAccess lock (acquireAtTarget ldbEnv (Left n)) >>= traverse (newForker h ldbEnv rr)
652+
newForkerByRollback h rr n = getEnv h $ \ldbEnv ->
653+
acquireAtTarget ldbEnv (Left n) >>= traverse (newForker h ldbEnv rr)
625654

626655
-- | Close all open 'Forker's.
627656
closeAllForkers ::

ouroboros-consensus/src/ouroboros-consensus/Ouroboros/Consensus/Storage/LedgerDB/V2/InMemory.hs

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -96,11 +96,7 @@ newInMemoryLedgerTablesHandle tracer someFS@(SomeHasFS hasFS) l = do
9696
pure
9797
LedgerTablesHandle
9898
{ close = do
99-
-- Temporarily a no-op until
100-
-- https://github.com/IntersectMBO/ouroboros-consensus/issues/1551 has
101-
-- been fixed.
102-
103-
-- atomically $ writeTVar tv LedgerTablesHandleClosed
99+
atomically $ writeTVar tv LedgerTablesHandleClosed
104100
traceWith tracer V2.TraceLedgerTablesHandleClose
105101
, duplicate = do
106102
hs <- readTVarIO tv

0 commit comments

Comments
 (0)