Skip to content

Commit a17249d

Browse files
committed
Low lease ttl (#2148)
When we detect a lease has expired, just re-request it, instead of trying to use it indefinitely. --- * [x] CHANGELOG updated * [x] Documentation update not needed * [x] Haddocks update not needed * [x] No new TODOs introduced
1 parent cba23ed commit a17249d

File tree

3 files changed

+47
-17
lines changed

3 files changed

+47
-17
lines changed

hydra-node/hydra-node.cabal

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -421,6 +421,7 @@ test-suite tests
421421
, text
422422
, time
423423
, tls
424+
, typed-process
424425
, websockets
425426

426427
build-tool-depends: hspec-discover:hspec-discover

hydra-node/src/Hydra/Network/Etcd.hs

Lines changed: 21 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -450,23 +450,27 @@ pollConnectivity tracer conn advertise NetworkCallback{onConnectivity} = do
450450
-- Write our alive key using lease
451451
writeAlive leaseId
452452
traceWith tracer CreatedLease{leaseId}
453-
withKeepAlive leaseId $ \keepAlive ->
454-
forever $ do
455-
-- Keep our lease alive
456-
ttlRemaining <- keepAlive
457-
when (ttlRemaining < 1) $
458-
traceWith tracer LowLeaseTTL{ttlRemaining}
459-
-- Determine alive peers
460-
alive <- getAlive
461-
let othersAlive = alive \\ [advertise]
462-
seenAlive <- atomically $ swapTVar seenAliveVar othersAlive
463-
forM_ (othersAlive \\ seenAlive) $ onConnectivity . PeerConnected
464-
forM_ (seenAlive \\ othersAlive) $ onConnectivity . PeerDisconnected
465-
-- Wait roughly ttl / 2
466-
threadDelay (ttlRemaining / 2)
453+
withKeepAlive leaseId (aliveLoop seenAliveVar)
467454
where
455+
aliveLoop seenAliveVar keepAlive = do
456+
-- Keep our lease alive
457+
ttlRemaining <- keepAlive
458+
if ttlRemaining <= 0
459+
then
460+
-- The keep alive did not work as no time to live remaining. Get a new lease instead
461+
traceWith tracer LowLeaseTTL{ttlRemaining}
462+
else do
463+
-- Determine alive peers
464+
alive <- getAlive
465+
let othersAlive = alive \\ [advertise]
466+
seenAlive <- atomically $ swapTVar seenAliveVar othersAlive
467+
forM_ (othersAlive \\ seenAlive) $ onConnectivity . PeerConnected
468+
forM_ (seenAlive \\ othersAlive) $ onConnectivity . PeerDisconnected
469+
threadDelay 1
470+
aliveLoop seenAliveVar keepAlive
471+
468472
onGrpcException seenAliveVar GrpcException{grpcError}
469-
| grpcError == GrpcUnavailable || grpcError == GrpcDeadlineExceeded = do
473+
| grpcError `elem` [GrpcUnavailable, GrpcDeadlineExceeded, GrpcCancelled] = do
470474
onConnectivity NetworkDisconnected
471475
atomically $ writeTVar seenAliveVar []
472476
threadDelay 1
@@ -483,7 +487,7 @@ pollConnectivity tracer conn advertise NetworkCallback{onConnectivity} = do
483487
void . action $ do
484488
send $ NextElem $ defMessage & #id .~ leaseId
485489
recv >>= \case
486-
NextElem res -> pure . fromIntegral $ res ^. #ttl
490+
NextElem res -> pure $ res ^. #ttl
487491
NoNextElem -> do
488492
traceWith tracer NoKeepAliveResponse
489493
pure 0
@@ -622,7 +626,7 @@ data EtcdLog
622626
| FailedToDecodeLog {log :: Text, reason :: Text}
623627
| FailedToDecodeValue {key :: Text, value :: Text, reason :: Text}
624628
| CreatedLease {leaseId :: Int64}
625-
| LowLeaseTTL {ttlRemaining :: DiffTime}
629+
| LowLeaseTTL {ttlRemaining :: Int64}
626630
| NoKeepAliveResponse
627631
| MatchingProtocolVersion {version :: ProtocolVersion}
628632
deriving stock (Eq, Show, Generic)

hydra-node/test/Hydra/NetworkSpec.hs

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ import Hydra.Network.Message (Message (..))
2828
import Hydra.Node.Network (NetworkConfiguration (..))
2929
import System.Directory (removeFile)
3030
import System.FilePath ((</>))
31+
import System.Process.Typed (readProcessStdout_, runProcess_, shell)
3132
import Test.Aeson.GenericSpecs (Settings (..), defaultSettings, roundtripAndGoldenADTSpecsWithSettings)
3233
import Test.Hydra.Node.Fixture (alice, aliceSk, bob, bobSk, carol, carolSk)
3334
import Test.Network.Ports (randomUnusedTCPPorts, withFreePort)
@@ -139,6 +140,30 @@ spec = do
139140
waitFor NetworkConnected
140141
waitFor $ PeerConnected carolConfig.advertise
141142

143+
it "handles expired lease" $ \tracer -> do
144+
withTempDir "test-etcd" $ \tmp -> do
145+
failAfter 5 $ do
146+
PeerConfig2{aliceConfig, bobConfig} <- setup2Peers tmp
147+
-- Record and assert connectivity events from alice's perspective
148+
(recordReceived, _, waitConnectivity) <- newRecordingCallback
149+
let
150+
waitFor :: HasCallStack => Connectivity -> IO ()
151+
waitFor = waitEq waitConnectivity 60
152+
withEtcdNetwork @Int tracer v1 aliceConfig recordReceived $ \_ -> do
153+
withEtcdNetwork @Int tracer v1 bobConfig noopCallback $ \_ -> do
154+
waitFor NetworkConnected
155+
waitFor $ PeerConnected bobConfig.advertise
156+
-- Expire all leases manually to simulate a keepAlive coming too
157+
-- late. Note that we do not distinguish which is which so
158+
-- alice's lease will also be killed, but does not matter here.
159+
let endpoints = "--endpoints=" <> show (listen aliceConfig)
160+
output <- readProcessStdout_ . shell $ "etcdctl lease list " <> endpoints
161+
let leases = drop 1 $ lines $ decodeUtf8 output
162+
forM_ leases $ \lease ->
163+
runProcess_ . shell $ "etcdctl lease revoke " <> endpoints <> " " <> toString lease
164+
-- Alice sees bob disconnected and connected again
165+
waitFor $ PeerConnected bobConfig.advertise
166+
142167
it "checks protocol version" $ \tracer -> do
143168
withTempDir "test-etcd" $ \tmp -> do
144169
failAfter 10 $ do

0 commit comments

Comments
 (0)