Timeout and retry broadcast after 3 seconds (#2154)

noonio · noonio · commit cba23edb7f20 · 2025-08-18T11:45:39.000+01:00
Add a client-side grpc timeout when we `put` messages to the etcd cluster.
A `put` blocking forever without a timeout is the only explanation we could
find for the `pending-broadcast` queue filling up.

---

* [x] CHANGELOG updated
* [x] Documentation update not needed
* [x] Haddocks update not needed
* [x] No new TODOs introduced
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -25,6 +25,10 @@ changes.
 - Fix an internal persistent queue blocking after restart when it reached
   capacity.
 
+- Time out and retry broadcast of network messages after 3 seconds in case the
+  `etcd` grpc server is not responsive. This should avoid build-up on the
+  outbound persistent queue.
+
 - Handle failing lease keep alive in network component and avoid bursts in
   heartbeating.
 
diff --git a/hydra-node/src/Hydra/Network/Etcd.hs b/hydra-node/src/Hydra/Network/Etcd.hs
@@ -82,12 +82,17 @@ import Hydra.Network (
 import Hydra.Node.EmbedTH (embedExecutable)
 import Network.GRPC.Client (
   Address (..),
+  CallParams (..),
   ConnParams (..),
   Connection,
   ReconnectPolicy (..),
   ReconnectTo (ReconnectToOriginal),
   Server (..),
+  Timeout (..),
+  TimeoutUnit (..),
+  TimeoutValue (..),
   rpc,
+  rpcWith,
   withConnection,
  )
 import Network.GRPC.Client.StreamType.IO (biDiStreaming, nonStreaming)
@@ -323,8 +328,8 @@ checkVersion tracer conn ourVersion NetworkCallback{onConnectivity} = do
 
 -- | Broadcast messages from a queue to the etcd cluster.
 --
--- TODO: retrying on failure even needed?
--- Retries on failure to 'putMessage' in case we are on a minority cluster.
+-- Retries on failure to 'putMessage' in case we are on a minority cluster or
+-- when the grpc call times out.
 broadcastMessages ::
   (ToCBOR msg, Eq msg) =>
   Tracer IO EtcdLog ->
@@ -353,8 +358,13 @@ putMessage ::
   msg ->
   IO ()
 putMessage conn ourHost msg =
-  void $ nonStreaming conn (rpc @(Protobuf KV "put")) req
+  void $ nonStreaming conn (rpcWith @(Protobuf KV "put") callParams) req
  where
+  -- NOTE: Time out puts after 3 seconds. This is not tested, but we saw the
+  -- 'pending-broadcast' queue fill up and suspect that 'put' requests in
+  -- 'broadcastMessages' were just not served and stayed pending forever.
+  callParams = def{callTimeout = Just . Timeout Second $ TimeoutValue 3}
+
   req =
     defMessage
       & #key .~ key