Fix for broadcast deadlock (#2171)

noonio · noonio · commit b01dbb0fa5fe · 2025-08-18T11:45:39.000+01:00
Continuing #2168 but now the tests and such will run. This work is by @jmagan but I had to make my own PR to satisfy our rules around PR execution 🥲.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -32,6 +32,11 @@ changes.
 - Handle failing lease keep alive in network component and avoid bursts in
   heartbeating.
 
+- Fix for blocking bug when broadcasting messages via etcd. See:
+  https://github.com/cardano-scaling/hydra/issues/2167. This is not a full fix
+  but is enough to resolve the problem until we can identify the central cause
+  of the issue.
+
 ## [0.22.3] - 2025-07-21
 
 * Change behavior of `Hydra.Network.Etcd` to fallback to earliest possible
diff --git a/hydra-node/src/Hydra/Network/Etcd.hs b/hydra-node/src/Hydra/Network/Etcd.hs
@@ -77,7 +77,6 @@ import Hydra.Network (
 import Hydra.Network.EtcdBinary (getEtcdBinary)
 import Network.GRPC.Client (
   Address (..),
-  CallParams (..),
   ConnParams (..),
   Connection,
   ReconnectPolicy (..),
@@ -87,7 +86,6 @@ import Network.GRPC.Client (
   TimeoutUnit (..),
   TimeoutValue (..),
   rpc,
-  rpcWith,
   withConnection,
  )
 import Network.GRPC.Client.StreamType.IO (biDiStreaming, nonStreaming)
@@ -120,7 +118,6 @@ import System.Process.Typed (
   unsafeProcessHandle,
   waitExitCode,
  )
-import UnliftIO (readTVarIO)
 
 -- | Concrete network component that broadcasts messages to an etcd cluster and
 -- listens for incoming messages.
@@ -140,50 +137,21 @@ withEtcdNetwork tracer protocolVersion config callback action = do
   withProcessInterrupt (etcdCmd etcdBinPath envVars) $ \p -> do
     race_ (waitExitCode p >>= \ec -> fail $ "Sub-process etcd exited with: " <> show ec) $ do
       race_ (traceStderr p callback) $ do
-        -- XXX: cleanup reconnecting through policy if other threads fail
-        doneVar <- newTVarIO False
         -- NOTE: The connection to the server is set up asynchronously; the
         -- first rpc call will block until the connection has been established.
-        withConnection (connParams doneVar) grpcServer $ \conn -> do
+        withConnection (connParams tracer Nothing) (grpcServer config) $ \conn -> do
           -- REVIEW: checkVersion blocks if used on main thread - why?
           withAsync (checkVersion tracer conn protocolVersion callback) $ \_ -> do
             race_ (pollConnectivity tracer conn advertise callback) $
               race_ (waitMessages tracer conn persistenceDir callback) $ do
                 queue <- newPersistentQueue (persistenceDir </> "pending-broadcast") 100
-                race_ (broadcastMessages tracer conn advertise queue) $ do
+                race_ (broadcastMessages tracer config advertise queue) $ do
                   action
                     Network
                       { broadcast = writePersistentQueue queue
                       }
-                  atomically (writeTVar doneVar True)
  where
-  connParams doneVar =
-    def
-      { connReconnectPolicy = reconnectPolicy doneVar
-      , -- NOTE: Not rate limit pings to our trusted, local etcd node. See
-        -- comment on 'http2OverridePingRateLimit'.
-        connHTTP2Settings = defaultHTTP2Settings{http2OverridePingRateLimit = Just maxBound}
-      }
-
-  reconnectPolicy doneVar = ReconnectAfter ReconnectToOriginal $ do
-    done <- readTVarIO doneVar
-    if done
-      then pure DontReconnect
-      else do
-        threadDelay 1
-        traceWith tracer Reconnecting
-        pure $ reconnectPolicy doneVar
-
   clientHost = Host{hostname = "127.0.0.1", port = getClientPort config}
-
-  grpcServer =
-    ServerInsecure $
-      Address
-        { addressHost = toString $ hostname clientHost
-        , addressPort = port clientHost
-        , addressAuthority = Nothing
-        }
-
   traceStderr p NetworkCallback{onConnectivity} =
     forever $ do
       bs <- BS.hGetLine (getStderr p)
@@ -243,6 +211,32 @@ withEtcdNetwork tracer protocolVersion config callback action = do
 
   NetworkConfiguration{persistenceDir, listen, advertise, peers, whichEtcd} = config
 
+connParams :: Tracer IO EtcdLog -> Maybe Timeout -> ConnParams
+connParams tracer to =
+  def
+    { connReconnectPolicy = reconnectPolicy
+    , -- NOTE: Do not rate limit pings to our trusted, local etcd node. See
+      -- comment on 'http2OverridePingRateLimit'.
+      connHTTP2Settings = defaultHTTP2Settings{http2OverridePingRateLimit = Just maxBound}
+    , connDefaultTimeout = to
+    }
+ where
+  reconnectPolicy = ReconnectAfter ReconnectToOriginal $ do
+    threadDelay 1
+    traceWith tracer Reconnecting
+    pure reconnectPolicy
+
+grpcServer :: NetworkConfiguration -> Server
+grpcServer config =
+  ServerInsecure $
+    Address
+      { addressHost = toString $ hostname clientHost
+      , addressPort = port clientHost
+      , addressAuthority = Nothing
+      }
+ where
+  clientHost = Host{hostname = "127.0.0.1", port = getClientPort config}
+
 -- | Get the client port corresponding to a listen address.
 --
 -- The client port used by the started etcd port is offset by the same amount as
@@ -314,15 +308,15 @@ checkVersion tracer conn ourVersion NetworkCallback{onConnectivity} = do
 broadcastMessages ::
   (ToCBOR msg, Eq msg) =>
   Tracer IO EtcdLog ->
-  Connection ->
+  NetworkConfiguration ->
   -- | Used to identify sender.
   Host ->
   PersistentQueue IO msg ->
   IO ()
-broadcastMessages tracer conn ourHost queue =
+broadcastMessages tracer config ourHost queue =
   withGrpcContext "broadcastMessages" . forever $ do
     msg <- peekPersistentQueue queue
-    (putMessage conn ourHost msg >> popPersistentQueue queue msg)
+    (putMessage tracer config ourHost msg >> popPersistentQueue queue msg)
       `catch` \case
         GrpcException{grpcError, grpcErrorMessage}
           | grpcError == GrpcUnavailable || grpcError == GrpcDeadlineExceeded -> do
@@ -333,19 +327,18 @@ broadcastMessages tracer conn ourHost queue =
 -- | Broadcast a message to the etcd cluster.
 putMessage ::
   ToCBOR msg =>
-  Connection ->
+  Tracer IO EtcdLog ->
+  NetworkConfiguration ->
   -- | Used to identify sender.
   Host ->
   msg ->
   IO ()
-putMessage conn ourHost msg =
-  void $ nonStreaming conn (rpcWith @(Protobuf KV "put") callParams) req
+putMessage tracer config ourHost msg = do
+  -- XXX: Here we open a new connection _for every message_! This is
+  -- effectively a work-around for https://github.com/cardano-scaling/hydra/issues/2167.
+  withConnection (connParams tracer (Just . Timeout Second $ TimeoutValue 3)) (grpcServer config) $ \conn -> do
+    void $ nonStreaming conn (rpc @(Protobuf KV "put")) req
  where
-  -- NOTE: Timeout puts after 3 seconds. This is not tested, but we saw the
-  -- 'pending-broadcast' queue fill up and suspect that 'put' requests in
-  -- 'broadcastMessages' were just not served and stay pending forever.
-  callParams = def{callTimeout = Just . Timeout Second $ TimeoutValue 3}
-
   req =
     defMessage
       & #key .~ key
diff --git a/hydra-node/test/Hydra/NetworkSpec.hs b/hydra-node/test/Hydra/NetworkSpec.hs
@@ -63,6 +63,29 @@ spec = do
                 broadcast n ("asdf" :: Text)
                 waitNext `shouldReturn` "asdf"
 
+      -- Note: This test is disabled as it takes too long; but it is
+      -- important to keep around. Successful completion of this test looks
+      -- like either a "mvcc database size exceeded" error; or no error at
+      -- all. Failure looks like complete blocking.
+      -- XXX: Maybe run this one nightly; when we start doing nightly tests.
+      xit "broadcasts 100KiB messages 1M times" $ \tracer ->
+        withTempDir "test-etcd" $ \tmp -> do
+          putStrLn $ "Folder " ++ show tmp
+          PeerConfig2{aliceConfig, bobConfig} <- setup2Peers tmp
+          (recordReceived, waitNext, _) <- newRecordingCallback
+          -- Create a 100KiB message (100 * 1024 characters)
+          let largeMessage = toText $ replicate (100 * 1024) 'a'
+          withEtcdNetwork @Text tracer v1 aliceConfig recordReceived $ \n1 -> do
+            withEtcdNetwork @Text tracer v1 bobConfig noopCallback $ \_ -> do
+              forM_ [1 :: Integer .. 1000000] $ \i -> do
+                let msgWithId = largeMessage <> " - Message #" <> show i
+                when (i `mod` 10000 == 0) $
+                  putStrLn $
+                    "Broadcasting 100KiB message #" <> show i <> " (size: " <> show (length (toString msgWithId)) <> " chars)"
+                broadcast n1 msgWithId
+                _ <- waitNext
+                threadDelay 0.02
+
       it "broadcasts messages to single connected peer" $ \tracer -> do
         withTempDir "test-etcd" $ \tmp -> do
           failAfter 5 $ do