Skip to content

Commit 7ec4475

Browse files
Merge pull request #5265 from IntersectMBO/karknu/max_recon
Karknu/max recon
2 parents e2e346a + 5ed6eda commit 7ec4475

File tree

19 files changed

+528
-330
lines changed

19 files changed

+528
-330
lines changed

docs/network-spec/miniprotocols.tex

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -872,7 +872,7 @@ \subsection{Timeouts per state}
872872
\header{state} & \header{timeout} \\\hline
873873
\StIdle & \texttt{3673}s \\
874874
\StCanAwait & \texttt{10}s \\
875-
\StMustReply & random between \texttt{135}s and \texttt{269}s \\
875+
\StMustReply & random between \texttt{601}s and \texttt{911}s \\
876876
\StIntersect & \texttt{10}s \\
877877
\end{tabular}
878878
\caption{timeouts per state}

ouroboros-network-api/src/Ouroboros/Network/PeerSelection/LedgerPeers/Type.hs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -221,7 +221,7 @@ newtype AccPoolStakeCoded = AccPoolStakeCoded AccPoolStake
221221
data IsBigLedgerPeer
222222
= IsBigLedgerPeer
223223
| IsNotBigLedgerPeer
224-
deriving Eq
224+
deriving (Eq, Show)
225225

226226
-- | Return ledger state information and ledger peers.
227227
--

ouroboros-network/CHANGELOG.md

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,13 @@
66

77
### Non-breaking changes
88

9+
* Limit the number of failures to 5 before a peer that isn't a localroot, bootstrap peer or public root peer is forgotten.
10+
* Decrease the time blockfetch waits for chainsync to exit in case of an error
11+
* Increase the timeout for chainsync in state StMustReply to between 601 and 911 seconds.
12+
* Ensure timeout to enter safe mode when enabling bootstrap peers is respected
13+
* Fix a bug in async demotion
14+
* Add hot connection duration traces and a SIGUSR1 debug handler
15+
916
## 0.21.4.0 -- 2025-10-05
1017

1118
### Non-breaking changes

ouroboros-network/sim-tests-lib/Test/Ouroboros/Network/Diffusion/Testnet/Cardano.hs

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1292,6 +1292,8 @@ prop_peer_selection_action_trace_coverage defaultBearerInfo diffScript =
12921292
"PeerMonitoringResult " ++ show wspt
12931293
peerSelectionActionsTraceMap (AcquireConnectionError e) =
12941294
"AcquireConnectionError " ++ show e
1295+
peerSelectionActionsTraceMap (PeerHotDuration _id _dt) =
1296+
"PeerHotDuration"
12951297

12961298
eventsSeenNames = map peerSelectionActionsTraceMap events
12971299

@@ -2378,7 +2380,7 @@ prop_diffusion_target_established_local ioSimTrace traceNumber =
23782380
(fromMaybe Set.empty)
23792381
. Signal.fromEvents
23802382
. Signal.selectEvents
2381-
(\case TracePromoteColdFailed _ _ peer _ _ ->
2383+
(\case TracePromoteColdFailed _ _ peer _ _ _ ->
23822384
Just (Set.singleton peer)
23832385
--TODO: what about TraceDemoteWarmDone ?
23842386
-- these are also not immediate candidates
@@ -3018,7 +3020,7 @@ prop_diffusion_async_demotions ioSimTrace traceNumber =
30183020
Just $ Stop failures
30193021
where
30203022
failures = Set.singleton peeraddr
3021-
TracePromoteColdFailed _ _ peeraddr _ _ ->
3023+
TracePromoteColdFailed _ _ peeraddr _ _ _ ->
30223024
Just $ Stop failures
30233025
where
30243026
failures = Set.singleton peeraddr
@@ -3030,7 +3032,7 @@ prop_diffusion_async_demotions ioSimTrace traceNumber =
30303032
Just $ Stop failures
30313033
where
30323034
failures = Set.singleton peeraddr
3033-
TracePromoteColdBigLedgerPeerFailed _ _ peeraddr _ _ ->
3035+
TracePromoteColdBigLedgerPeerFailed _ _ peeraddr _ _ _ ->
30343036
Just $ Stop failures
30353037
where
30363038
failures = Set.singleton peeraddr
@@ -3765,7 +3767,7 @@ prop_diffusion_peer_selection_actions_no_dodgy_traces ioSimTrace traceNumber =
37653767
$ evs'
37663768
numOfActiveColdErrors = length
37673769
. filter (\case
3768-
(PeerStatusChangeFailure HotToWarm{} ActiveCold)
3770+
(PeerStatusChangeFailure HotToWarm{} ActiveCold{})
37693771
-> True
37703772
_ -> False)
37713773
$ evs'
@@ -3790,7 +3792,7 @@ prop_diffusion_peer_selection_actions_no_dodgy_traces ioSimTrace traceNumber =
37903792
. map
37913793
(\case
37923794
ev@( WithTime _ (PeerStatusChangeFailure (HotToWarm _) TimeoutError)
3793-
, WithTime _ (PeerStatusChangeFailure (HotToWarm _) ActiveCold)
3795+
, WithTime _ (PeerStatusChangeFailure (HotToWarm _) ActiveCold{})
37943796
)
37953797
-> counterexample (show ev)
37963798
$ counterexample (unlines $ map show peerSelectionActionsEvents)
@@ -3860,7 +3862,8 @@ prop_diffusion_peer_selection_actions_no_dodgy_traces ioSimTrace traceNumber =
38603862
WithTime _ (PeerStatusChangeFailure type_ _) -> getConnId type_
38613863
WithTime _ (PeerMonitoringError connId _) -> Just connId
38623864
WithTime _ (PeerMonitoringResult connId _) -> Just connId
3863-
WithTime _ (AcquireConnectionError _) -> Nothing)
3865+
WithTime _ (AcquireConnectionError _) -> Nothing
3866+
WithTime _ (PeerHotDuration connId _) -> Just connId)
38643867
$ peerSelectionActionsEvents
38653868
)
38663869

@@ -4750,4 +4753,3 @@ showBucket size a | a < size
47504753
, show (a `div` size * size + size)
47514754
, ")"
47524755
]
4753-

ouroboros-network/sim-tests-lib/Test/Ouroboros/Network/PeerSelection.hs

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2514,7 +2514,7 @@ prop_governor_target_established_below (MaxTime maxTime) env =
25142514
(fromMaybe Set.empty)
25152515
. Signal.fromEvents
25162516
. Signal.selectEvents
2517-
(\case TracePromoteColdFailed _ _ peer _ _ ->
2517+
(\case TracePromoteColdFailed _ _ peer _ _ _ ->
25182518
--TODO: the environment does not yet cause this to happen
25192519
-- it requires synchronous failure in the establish action
25202520
Just $! Set.singleton peer
@@ -2623,7 +2623,7 @@ prop_governor_target_established_big_ledger_peers_below (MaxTime maxTime) env =
26232623
(fromMaybe Set.empty)
26242624
. Signal.fromEvents
26252625
. Signal.selectEvents
2626-
(\case TracePromoteColdBigLedgerPeerFailed _ _ peer _ _ ->
2626+
(\case TracePromoteColdBigLedgerPeerFailed _ _ peer _ _ _ ->
26272627
--TODO: the environment does not yet cause this to happen
26282628
-- it requires synchronous failure in the establish action
26292629
Just (Set.singleton peer)
@@ -3261,7 +3261,7 @@ prop_governor_target_established_local (MaxTime maxTime) env =
32613261
(fromMaybe Set.empty)
32623262
. Signal.fromEvents
32633263
. Signal.selectEvents
3264-
(\case TracePromoteColdFailed _ _ peer _ _ ->
3264+
(\case TracePromoteColdFailed _ _ peer _ _ _ ->
32653265
--TODO: the environment does not yet cause this to happen
32663266
-- it requires synchronous failure in the establish action
32673267
Just (Set.singleton peer)
@@ -4108,6 +4108,8 @@ _governorFindingPublicRoots targetNumberOfRootPeers readDomains readUseBootstrap
41084108
policyPeerShareBatchWaitTime = 0, -- seconds
41094109
policyPeerShareOverallTimeout = 0, -- seconds
41104110
policyPeerShareActivationDelay = 2, -- seconds
4111+
policyMaxConnectionRetries = 5,
4112+
policyClearFailCountDelay = 120, --seconds
41114113
policyErrorDelay = 0 -- seconds
41124114
}
41134115
pickTrivially :: Applicative m => Set SockAddr -> Int -> m (Set SockAddr)

ouroboros-network/sim-tests-lib/Test/Ouroboros/Network/PeerSelection/Cardano/MockEnvironment.hs

Lines changed: 60 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -723,6 +723,8 @@ mockPeerSelectionPolicy GovernorMockEnvironment {
723723
policyPeerShareBatchWaitTime = 3, -- seconds
724724
policyPeerShareOverallTimeout = 10, -- seconds
725725
policyPeerShareActivationDelay = 300, -- seconds
726+
policyMaxConnectionRetries = 5,
727+
policyClearFailCountDelay = 120, -- seconds
726728
policyErrorDelay = 10 -- seconds
727729
}
728730

@@ -751,64 +753,64 @@ tracerTracePeerSelection = contramap f tracerTestTraceEvent
751753
-- make the tracer strict
752754
f :: TracePeerSelection extraState extraFlags extraPeers PeerAddr
753755
-> TestTraceEvent extraState extraFlags extraPeers extraCounters
754-
f a@(TraceLocalRootPeersChanged !_ !_) = GovernorEvent a
755-
f a@(TraceTargetsChanged !_ !_) = GovernorEvent a
756-
f a@(TracePublicRootsRequest !_ !_) = GovernorEvent a
757-
f a@(TracePublicRootsResults !_ !_ !_) = GovernorEvent a
758-
f a@(TracePublicRootsFailure !_ !_ !_) = GovernorEvent a
759-
f a@(TraceForgetColdPeers !_ !_ !_) = GovernorEvent a
760-
f a@(TraceBigLedgerPeersRequest !_ !_) = GovernorEvent a
761-
f a@(TraceBigLedgerPeersResults !_ !_ !_) = GovernorEvent a
762-
f a@(TraceBigLedgerPeersFailure !_ !_ !_) = GovernorEvent a
763-
f a@(TraceForgetBigLedgerPeers !_ !_ !_) = GovernorEvent a
764-
f a@(TracePickInboundPeers !_ !_ !_ !_) = GovernorEvent a
765-
f a@(TracePeerShareRequests !_ !_ !_ !_ !_) = GovernorEvent a
766-
f a@(TracePeerShareResults !_) = GovernorEvent a
767-
f a@(TracePeerShareResultsFiltered !_) = GovernorEvent a
768-
f a@(TracePromoteColdPeers !_ !_ !_) = GovernorEvent a
769-
f a@(TracePromoteColdLocalPeers !_ !_) = GovernorEvent a
770-
f a@(TracePromoteColdFailed !_ !_ !_ !_ !_) = GovernorEvent a
771-
f a@(TracePromoteColdDone !_ !_ !_) = GovernorEvent a
772-
f a@(TracePromoteColdBigLedgerPeers !_ !_ !_) = GovernorEvent a
773-
f a@(TracePromoteColdBigLedgerPeerFailed !_ !_ !_ !_ !_) = GovernorEvent a
774-
f a@(TracePromoteColdBigLedgerPeerDone !_ !_ !_) = GovernorEvent a
775-
f a@(TracePromoteWarmPeers !_ !_ !_) = GovernorEvent a
776-
f a@(TracePromoteWarmLocalPeers !_ !_) = GovernorEvent a
777-
f a@(TracePromoteWarmFailed !_ !_ !_ !_) = GovernorEvent a
778-
f a@(TracePromoteWarmDone !_ !_ !_) = GovernorEvent a
779-
f a@(TracePromoteWarmAborted !_ !_ !_) = GovernorEvent a
780-
f a@(TracePromoteWarmBigLedgerPeers !_ !_ !_) = GovernorEvent a
781-
f a@(TracePromoteWarmBigLedgerPeerFailed !_ !_ !_ !_) = GovernorEvent a
782-
f a@(TracePromoteWarmBigLedgerPeerDone !_ !_ !_) = GovernorEvent a
783-
f a@(TracePromoteWarmBigLedgerPeerAborted !_ !_ !_) = GovernorEvent a
784-
f a@(TraceDemoteWarmPeers !_ !_ !_) = GovernorEvent a
785-
f a@(TraceDemoteWarmFailed !_ !_ !_ !_) = GovernorEvent a
786-
f a@(TraceDemoteWarmDone !_ !_ !_) = GovernorEvent a
787-
f a@(TraceDemoteWarmBigLedgerPeers !_ !_ !_) = GovernorEvent a
788-
f a@(TraceDemoteWarmBigLedgerPeerFailed !_ !_ !_ !_) = GovernorEvent a
789-
f a@(TraceDemoteWarmBigLedgerPeerDone !_ !_ !_) = GovernorEvent a
790-
f a@(TraceDemoteHotPeers !_ !_ !_) = GovernorEvent a
791-
f a@(TraceDemoteLocalHotPeers !_ !_) = GovernorEvent a
792-
f a@(TraceDemoteHotFailed !_ !_ !_ !_) = GovernorEvent a
793-
f a@(TraceDemoteHotDone !_ !_ !_) = GovernorEvent a
794-
f a@(TraceDemoteHotBigLedgerPeers !_ !_ !_) = GovernorEvent a
795-
f a@(TraceDemoteHotBigLedgerPeerFailed !_ !_ !_ !_) = GovernorEvent a
796-
f a@(TraceDemoteHotBigLedgerPeerDone !_ !_ !_) = GovernorEvent a
797-
f a@(TraceDemoteAsynchronous !_) = GovernorEvent a
798-
f a@(TraceDemoteLocalAsynchronous !_) = GovernorEvent a
799-
f a@(TraceDemoteBigLedgerPeersAsynchronous !_) = GovernorEvent a
800-
f a@TraceGovernorWakeup = GovernorEvent a
801-
f a@(TraceChurnWait !_) = GovernorEvent a
802-
f a@(TraceChurnMode !_) = GovernorEvent a
803-
f a@(TraceLedgerStateJudgementChanged !_) = GovernorEvent a
804-
f a@TraceOnlyBootstrapPeers = GovernorEvent a
805-
f a@TraceBootstrapPeersFlagChangedWhilstInSensitiveState = GovernorEvent a
806-
f a@(TraceUseBootstrapPeersChanged !_) = GovernorEvent a
807-
f a@(TraceOutboundGovernorCriticalFailure !_) = GovernorEvent a
808-
f a@(TraceDebugState !_ !_) = GovernorEvent a
809-
f a@(TraceChurnAction !_ !_ !_) = GovernorEvent a
810-
f a@(TraceChurnTimeout !_ !_ !_) = GovernorEvent a
811-
f a@(TraceVerifyPeerSnapshot !_) = GovernorEvent a
756+
f a@(TraceLocalRootPeersChanged !_ !_) = GovernorEvent a
757+
f a@(TraceTargetsChanged !_ !_) = GovernorEvent a
758+
f a@(TracePublicRootsRequest !_ !_) = GovernorEvent a
759+
f a@(TracePublicRootsResults !_ !_ !_) = GovernorEvent a
760+
f a@(TracePublicRootsFailure !_ !_ !_) = GovernorEvent a
761+
f a@(TraceForgetColdPeers !_ !_ !_) = GovernorEvent a
762+
f a@(TraceBigLedgerPeersRequest !_ !_) = GovernorEvent a
763+
f a@(TraceBigLedgerPeersResults !_ !_ !_) = GovernorEvent a
764+
f a@(TraceBigLedgerPeersFailure !_ !_ !_) = GovernorEvent a
765+
f a@(TraceForgetBigLedgerPeers !_ !_ !_) = GovernorEvent a
766+
f a@(TracePickInboundPeers !_ !_ !_ !_) = GovernorEvent a
767+
f a@(TracePeerShareRequests !_ !_ !_ !_ !_) = GovernorEvent a
768+
f a@(TracePeerShareResults !_) = GovernorEvent a
769+
f a@(TracePeerShareResultsFiltered !_) = GovernorEvent a
770+
f a@(TracePromoteColdPeers !_ !_ !_) = GovernorEvent a
771+
f a@(TracePromoteColdLocalPeers !_ !_) = GovernorEvent a
772+
f a@(TracePromoteColdFailed !_ !_ !_ !_ !_ !_) = GovernorEvent a
773+
f a@(TracePromoteColdDone !_ !_ !_) = GovernorEvent a
774+
f a@(TracePromoteColdBigLedgerPeers !_ !_ !_) = GovernorEvent a
775+
f a@(TracePromoteColdBigLedgerPeerFailed !_ !_ !_ !_ !_ !_) = GovernorEvent a
776+
f a@(TracePromoteColdBigLedgerPeerDone !_ !_ !_) = GovernorEvent a
777+
f a@(TracePromoteWarmPeers !_ !_ !_) = GovernorEvent a
778+
f a@(TracePromoteWarmLocalPeers !_ !_) = GovernorEvent a
779+
f a@(TracePromoteWarmFailed !_ !_ !_ !_) = GovernorEvent a
780+
f a@(TracePromoteWarmDone !_ !_ !_) = GovernorEvent a
781+
f a@(TracePromoteWarmAborted !_ !_ !_) = GovernorEvent a
782+
f a@(TracePromoteWarmBigLedgerPeers !_ !_ !_) = GovernorEvent a
783+
f a@(TracePromoteWarmBigLedgerPeerFailed !_ !_ !_ !_) = GovernorEvent a
784+
f a@(TracePromoteWarmBigLedgerPeerDone !_ !_ !_) = GovernorEvent a
785+
f a@(TracePromoteWarmBigLedgerPeerAborted !_ !_ !_) = GovernorEvent a
786+
f a@(TraceDemoteWarmPeers !_ !_ !_) = GovernorEvent a
787+
f a@(TraceDemoteWarmFailed !_ !_ !_ !_) = GovernorEvent a
788+
f a@(TraceDemoteWarmDone !_ !_ !_) = GovernorEvent a
789+
f a@(TraceDemoteWarmBigLedgerPeers !_ !_ !_) = GovernorEvent a
790+
f a@(TraceDemoteWarmBigLedgerPeerFailed !_ !_ !_ !_) = GovernorEvent a
791+
f a@(TraceDemoteWarmBigLedgerPeerDone !_ !_ !_) = GovernorEvent a
792+
f a@(TraceDemoteHotPeers !_ !_ !_) = GovernorEvent a
793+
f a@(TraceDemoteLocalHotPeers !_ !_) = GovernorEvent a
794+
f a@(TraceDemoteHotFailed !_ !_ !_ !_) = GovernorEvent a
795+
f a@(TraceDemoteHotDone !_ !_ !_) = GovernorEvent a
796+
f a@(TraceDemoteHotBigLedgerPeers !_ !_ !_) = GovernorEvent a
797+
f a@(TraceDemoteHotBigLedgerPeerFailed !_ !_ !_ !_) = GovernorEvent a
798+
f a@(TraceDemoteHotBigLedgerPeerDone !_ !_ !_) = GovernorEvent a
799+
f a@(TraceDemoteAsynchronous !_) = GovernorEvent a
800+
f a@(TraceDemoteLocalAsynchronous !_) = GovernorEvent a
801+
f a@(TraceDemoteBigLedgerPeersAsynchronous !_) = GovernorEvent a
802+
f a@TraceGovernorWakeup = GovernorEvent a
803+
f a@(TraceChurnWait !_) = GovernorEvent a
804+
f a@(TraceChurnMode !_) = GovernorEvent a
805+
f a@(TraceLedgerStateJudgementChanged !_) = GovernorEvent a
806+
f a@TraceOnlyBootstrapPeers = GovernorEvent a
807+
f a@TraceBootstrapPeersFlagChangedWhilstInSensitiveState = GovernorEvent a
808+
f a@(TraceUseBootstrapPeersChanged !_) = GovernorEvent a
809+
f a@(TraceOutboundGovernorCriticalFailure !_) = GovernorEvent a
810+
f a@(TraceDebugState !_ !_) = GovernorEvent a
811+
f a@(TraceChurnAction !_ !_ !_) = GovernorEvent a
812+
f a@(TraceChurnTimeout !_ !_ !_) = GovernorEvent a
813+
f a@(TraceVerifyPeerSnapshot !_) = GovernorEvent a
812814

813815
tracerDebugPeerSelection :: Tracer (IOSim s) (DebugPeerSelection Cardano.ExtraState PeerTrustable (Cardano.ExtraPeers PeerAddr) PeerAddr)
814816
tracerDebugPeerSelection = GovernorDebug `contramap` tracerTestTraceEvent

ouroboros-network/src/Ouroboros/Cardano/Network/Diffusion/Handlers.hs

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -8,9 +8,11 @@
88

99
module Ouroboros.Cardano.Network.Diffusion.Handlers where
1010

11+
import Control.Concurrent.Class.MonadSTM.Strict
12+
import Control.Monad.Class.MonadTime.SI
13+
1114
import Cardano.Network.PeerSelection.Bootstrap (UseBootstrapPeers)
1215
import Cardano.Network.Types (LedgerStateJudgement)
13-
import Control.Concurrent.Class.MonadSTM.Strict
1416
import Ouroboros.Cardano.Network.PeerSelection.Governor.PeerSelectionState qualified as Cardano
1517
import Ouroboros.Network.ConnectionManager.Types
1618
import Ouroboros.Network.Diffusion.P2P (TracersExtra (..))
@@ -19,7 +21,6 @@ import Ouroboros.Network.PeerSelection.LedgerPeers.Type (UseLedgerPeers)
1921
import Ouroboros.Network.PeerSelection.PeerMetric
2022
import Ouroboros.Network.PeerSelection.PeerSharing (PeerSharing)
2123
#ifdef POSIX
22-
import Control.Monad.Class.MonadTime.SI
2324
import Control.Tracer (traceWith)
2425
import Ouroboros.Network.ConnectionManager.Core (Trace (..))
2526
import Ouroboros.Network.PeerSelection.Governor.Types
@@ -39,6 +40,8 @@ sigUSR1Handler
3940
-> PeerSharing
4041
-> STM IO UseBootstrapPeers
4142
-> STM IO LedgerStateJudgement
43+
-> (peerconn -> STM IO (Maybe Time))
44+
-- ^ returns the time at which an active peer was promoted to a hot peer.
4245
-> ConnectionManager muxMode socket ntnAddr
4346
handle handleError IO
4447
-> StrictTVar IO (PeerSelectionState
@@ -49,7 +52,7 @@ sigUSR1Handler
4952
-> IO ()
5053
#ifdef POSIX
5154
sigUSR1Handler tracersExtra getUseLedgerPeers ownPeerSharing getBootstrapPeers
52-
getLedgerStateJudgement connectionManager dbgStateVar metrics = do
55+
getLedgerStateJudgement getPromotedHotTime connectionManager dbgStateVar metrics = do
5356
_ <- Signals.installHandler
5457
Signals.sigUSR1
5558
(Signals.Catch
@@ -66,7 +69,7 @@ sigUSR1Handler tracersExtra getUseLedgerPeers ownPeerSharing getBootstrapPeers
6669
useBootstrapPeers
6770
<*> readTVar dbgStateVar
6871

69-
let dbgState = makeDebugPeerSelectionState ps up bp lsj am
72+
dbgState <- makeDebugPeerSelectionState ps up bp lsj am getPromotedHotTime now
7073

7174
traceWith (dtConnectionManagerTracer tracersExtra)
7275
(TrState state)
@@ -77,5 +80,5 @@ sigUSR1Handler tracersExtra getUseLedgerPeers ownPeerSharing getBootstrapPeers
7780
Nothing
7881
return ()
7982
#else
80-
sigUSR1Handler _ _ _ _ _ _ _ _ = pure ()
83+
sigUSR1Handler _ _ _ _ _ _ _ _ _ = pure ()
8184
#endif

0 commit comments

Comments
 (0)