@@ -77,7 +77,6 @@ import Hydra.Network (
77
77
import Hydra.Network.EtcdBinary (getEtcdBinary )
78
78
import Network.GRPC.Client (
79
79
Address (.. ),
80
- CallParams (.. ),
81
80
ConnParams (.. ),
82
81
Connection ,
83
82
ReconnectPolicy (.. ),
@@ -87,7 +86,6 @@ import Network.GRPC.Client (
87
86
TimeoutUnit (.. ),
88
87
TimeoutValue (.. ),
89
88
rpc ,
90
- rpcWith ,
91
89
withConnection ,
92
90
)
93
91
import Network.GRPC.Client.StreamType.IO (biDiStreaming , nonStreaming )
@@ -120,7 +118,6 @@ import System.Process.Typed (
120
118
unsafeProcessHandle ,
121
119
waitExitCode ,
122
120
)
123
- import UnliftIO (readTVarIO )
124
121
125
122
-- | Concrete network component that broadcasts messages to an etcd cluster and
126
123
-- listens for incoming messages.
@@ -140,50 +137,21 @@ withEtcdNetwork tracer protocolVersion config callback action = do
140
137
withProcessInterrupt (etcdCmd etcdBinPath envVars) $ \ p -> do
141
138
race_ (waitExitCode p >>= \ ec -> fail $ " Sub-process etcd exited with: " <> show ec) $ do
142
139
race_ (traceStderr p callback) $ do
143
- -- XXX: cleanup reconnecting through policy if other threads fail
144
- doneVar <- newTVarIO False
145
140
-- NOTE: The connection to the server is set up asynchronously; the
146
141
-- first rpc call will block until the connection has been established.
147
- withConnection (connParams doneVar) grpcServer $ \ conn -> do
142
+ withConnection (connParams tracer Nothing ) ( grpcServer config) $ \ conn -> do
148
143
-- REVIEW: checkVersion blocks if used on main thread - why?
149
144
withAsync (checkVersion tracer conn protocolVersion callback) $ \ _ -> do
150
145
race_ (pollConnectivity tracer conn advertise callback) $
151
146
race_ (waitMessages tracer conn persistenceDir callback) $ do
152
147
queue <- newPersistentQueue (persistenceDir </> " pending-broadcast" ) 100
153
- race_ (broadcastMessages tracer conn advertise queue) $ do
148
+ race_ (broadcastMessages tracer config advertise queue) $ do
154
149
action
155
150
Network
156
151
{ broadcast = writePersistentQueue queue
157
152
}
158
- atomically (writeTVar doneVar True )
159
153
where
160
- connParams doneVar =
161
- def
162
- { connReconnectPolicy = reconnectPolicy doneVar
163
- , -- NOTE: Not rate limit pings to our trusted, local etcd node. See
164
- -- comment on 'http2OverridePingRateLimit'.
165
- connHTTP2Settings = defaultHTTP2Settings{http2OverridePingRateLimit = Just maxBound }
166
- }
167
-
168
- reconnectPolicy doneVar = ReconnectAfter ReconnectToOriginal $ do
169
- done <- readTVarIO doneVar
170
- if done
171
- then pure DontReconnect
172
- else do
173
- threadDelay 1
174
- traceWith tracer Reconnecting
175
- pure $ reconnectPolicy doneVar
176
-
177
154
clientHost = Host {hostname = " 127.0.0.1" , port = getClientPort config}
178
-
179
- grpcServer =
180
- ServerInsecure $
181
- Address
182
- { addressHost = toString $ hostname clientHost
183
- , addressPort = port clientHost
184
- , addressAuthority = Nothing
185
- }
186
-
187
155
traceStderr p NetworkCallback {onConnectivity} =
188
156
forever $ do
189
157
bs <- BS. hGetLine (getStderr p)
@@ -243,6 +211,32 @@ withEtcdNetwork tracer protocolVersion config callback action = do
243
211
244
212
NetworkConfiguration {persistenceDir, listen, advertise, peers, whichEtcd} = config
245
213
214
+ connParams :: Tracer IO EtcdLog -> Maybe Timeout -> ConnParams
215
+ connParams tracer to =
216
+ def
217
+ { connReconnectPolicy = reconnectPolicy
218
+ , -- NOTE: Not rate limit pings to our trusted, local etcd node. See
219
+ -- comment on 'http2OverridePingRateLimit'.
220
+ connHTTP2Settings = defaultHTTP2Settings{http2OverridePingRateLimit = Just maxBound }
221
+ , connDefaultTimeout = to
222
+ }
223
+ where
224
+ reconnectPolicy = ReconnectAfter ReconnectToOriginal $ do
225
+ threadDelay 1
226
+ traceWith tracer Reconnecting
227
+ pure reconnectPolicy
228
+
229
+ grpcServer :: NetworkConfiguration -> Server
230
+ grpcServer config =
231
+ ServerInsecure $
232
+ Address
233
+ { addressHost = toString $ hostname clientHost
234
+ , addressPort = port clientHost
235
+ , addressAuthority = Nothing
236
+ }
237
+ where
238
+ clientHost = Host {hostname = " 127.0.0.1" , port = getClientPort config}
239
+
246
240
-- | Get the client port corresponding to a listen address.
247
241
--
248
242
-- The client port used by the started etcd port is offset by the same amount as
@@ -314,15 +308,15 @@ checkVersion tracer conn ourVersion NetworkCallback{onConnectivity} = do
314
308
broadcastMessages ::
315
309
(ToCBOR msg , Eq msg ) =>
316
310
Tracer IO EtcdLog ->
317
- Connection ->
311
+ NetworkConfiguration ->
318
312
-- | Used to identify sender.
319
313
Host ->
320
314
PersistentQueue IO msg ->
321
315
IO ()
322
- broadcastMessages tracer conn ourHost queue =
316
+ broadcastMessages tracer config ourHost queue =
323
317
withGrpcContext " broadcastMessages" . forever $ do
324
318
msg <- peekPersistentQueue queue
325
- (putMessage conn ourHost msg >> popPersistentQueue queue msg)
319
+ (putMessage tracer config ourHost msg >> popPersistentQueue queue msg)
326
320
`catch` \ case
327
321
GrpcException {grpcError, grpcErrorMessage}
328
322
| grpcError == GrpcUnavailable || grpcError == GrpcDeadlineExceeded -> do
@@ -333,19 +327,18 @@ broadcastMessages tracer conn ourHost queue =
333
327
-- | Broadcast a message to the etcd cluster.
334
328
putMessage ::
335
329
ToCBOR msg =>
336
- Connection ->
330
+ Tracer IO EtcdLog ->
331
+ NetworkConfiguration ->
337
332
-- | Used to identify sender.
338
333
Host ->
339
334
msg ->
340
335
IO ()
341
- putMessage conn ourHost msg =
342
- void $ nonStreaming conn (rpcWith @ (Protobuf KV " put" ) callParams) req
336
+ putMessage tracer config ourHost msg = do
337
+ -- XXX: Here we open a new connection _for every message_! This is
338
+ -- effectively a work-around for https://github.com/cardano-scaling/hydra/issues/2167.
339
+ withConnection (connParams tracer (Just . Timeout Second $ TimeoutValue 3 )) (grpcServer config) $ \ conn -> do
340
+ void $ nonStreaming conn (rpc @ (Protobuf KV " put" )) req
343
341
where
344
- -- NOTE: Timeout puts after 3 seconds. This is not tested, but we saw the
345
- -- 'pending-broadcast' queue fill up and suspect that 'put' requests in
346
- -- 'broadcastMessages' were just not served and stay pending forever.
347
- callParams = def{callTimeout = Just . Timeout Second $ TimeoutValue 3 }
348
-
349
342
req =
350
343
defMessage
351
344
& # key .~ key
0 commit comments