Skip to content

Commit e1f9505

Browse files
committed
Last revision reset (#2137)
> [!NOTE] > This is currently on top of the 0.22.2 tag and I could not find a 0.22 release branch to open the PR. We should probably not merge it like this. Instead I should rebase it onto master and we cherry-pick it back to do a 0.22.3 release. The last-known-revision kept by the Etcd network component may be incorrect if the etcd cluster compacted this revision while the node was offline or if the last-known-revision state file was removed. Both cases can be handled by detecting a failing watch request and at least using the compactRevision from the response. This is a somewhat exceptional situation and the node state may be inconsistent because of this. Hence we also log a warning when this happens. --- * [x] CHANGELOG updated * [ ] Documentation updated or not needed * [x] Haddocks updated * [x] No new TODOs introduced or explained hereafter
1 parent c16e945 commit e1f9505

File tree

4 files changed

+106
-40
lines changed

4 files changed

+106
-40
lines changed

CHANGELOG.md

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -61,15 +61,15 @@ when the number of persisted `StateChanged` events exceeds the configured `--per
6161
preserving sequential order and making it easier to identify which rotated log file was used to compute it.
6262

6363

64-
## [0.22.2] - 2025.06.30
64+
## [0.22.2] - 2025-06-30
6565

6666
* Fix wrong hydra-script-tx-ids in networks.json
6767

68-
## [0.22.1] - 2025.06.27
68+
## [0.22.1] - 2025-06-27
6969

7070
* Fix for bug where node got stalled at `ReplayingState` [#2089](https://github.com/cardano-scaling/hydra/issues/2089)
7171

72-
## [0.22.0] - 2025.06.17
72+
## [0.22.0] - 2025-06-17
7373

7474
- Tested with `cardano-node 10.1.4` and `cardano-cli 10.1.1.0`.
7575

hydra-node/src/Hydra/Network/Etcd.hs

Lines changed: 49 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -20,8 +20,7 @@
2020
-- only deliver messages that were not seen before. In case we are not connected
2121
-- to our 'etcd' instance or not enough peers (= on a minority cluster), we
2222
-- retry sending, but also store messages to broadcast in a 'PersistentQueue',
23-
-- which makes the node resilient against crashes while sending. TODO: Is this
24-
-- needed? performance limitation?
23+
-- which makes the node resilient against crashes while sending.
2524
--
2625
-- Connectivity and compatibility with other nodes on the cluster is tracked
2726
-- using the key-value service as well:
@@ -93,7 +92,6 @@ import Network.GRPC.Client (
9392
)
9493
import Network.GRPC.Client.StreamType.IO (biDiStreaming, nonStreaming)
9594
import Network.GRPC.Common (GrpcError (..), GrpcException (..), HTTP2Settings (..), NextElem (..), def, defaultHTTP2Settings)
96-
import Network.GRPC.Common.NextElem (whileNext_)
9795
import Network.GRPC.Common.Protobuf (Proto (..), Protobuf, defMessage, (.~))
9896
import Network.GRPC.Etcd (
9997
Compare'CompareResult (..),
@@ -102,6 +100,7 @@ import Network.GRPC.Etcd (
102100
Lease,
103101
Watch,
104102
)
103+
import Network.Socket (PortNumber)
105104
import System.Directory (createDirectoryIfMissing, listDirectory, removeFile)
106105
import System.Environment.Blank (getEnvironment)
107106
import System.FilePath ((</>))
@@ -175,7 +174,7 @@ withEtcdNetwork tracer protocolVersion config callback action = do
175174
traceWith tracer Reconnecting
176175
pure $ reconnectPolicy doneVar
177176

178-
clientHost = Host{hostname = "127.0.0.1", port = clientPort}
177+
clientHost = Host{hostname = "127.0.0.1", port = getClientPort config}
179178

180179
grpcServer =
181180
ServerInsecure $
@@ -185,11 +184,6 @@ withEtcdNetwork tracer protocolVersion config callback action = do
185184
, addressAuthority = Nothing
186185
}
187186

188-
-- NOTE: Offset client port by the same amount as configured 'port' is offset
189-
-- from the default '5001'. This will result in the default client port 2379
190-
-- be used by default still.
191-
clientPort = 2379 + port listen - 5001
192-
193187
traceStderr p NetworkCallback{onConnectivity} =
194188
forever $ do
195189
bs <- BS.hGetLine (getStderr p)
@@ -249,6 +243,14 @@ withEtcdNetwork tracer protocolVersion config callback action = do
249243

250244
NetworkConfiguration{persistenceDir, listen, advertise, peers, whichEtcd} = config
251245

246+
-- | Get the client port corresponding to a listen address.
247+
--
248+
-- The client port used by the started etcd port is offset by the same amount as
249+
-- the listen address is offset by the default port 5001. This will result in
250+
-- the default client port 2379 be used by default still.
251+
getClientPort :: NetworkConfiguration -> PortNumber
252+
getClientPort NetworkConfiguration{listen} = 2379 + port listen - 5001
253+
252254
-- | Check and write version on etcd cluster. This will retry until we are on a
253255
-- majority cluster and succeed. If the version does not match a corresponding
254256
-- 'Connectivity' message is sent via 'NetworkCallback'.
@@ -282,8 +284,7 @@ checkVersion tracer conn ourVersion NetworkCallback{onConnectivity} = do
282284
Right theirVersion ->
283285
unless (theirVersion == ourVersion) $
284286
onConnectivity VersionMismatch{ourVersion, theirVersion = Just theirVersion}
285-
else
286-
traceWith tracer $ MatchingProtocolVersion{version = ourVersion}
287+
else traceWith tracer $ MatchingProtocolVersion{version = ourVersion}
287288
where
288289
versionKey = "version"
289290

@@ -361,11 +362,13 @@ waitMessages ::
361362
NetworkCallback msg IO ->
362363
IO ()
363364
waitMessages tracer conn directory NetworkCallback{deliver} = do
364-
revision <- getLastKnownRevision directory
365365
withGrpcContext "waitMessages" . forever $ do
366366
-- NOTE: We have not observed the watch (subscription) fail even when peers
367367
-- leave and we end up on a minority cluster.
368368
biDiStreaming conn (rpc @(Protobuf Watch "watch")) $ \send recv -> do
369+
revision <- getLastKnownRevision directory
370+
let startRevision = fromIntegral (revision + 1)
371+
traceWith tracer WatchMessagesStartRevision{startRevision}
369372
-- NOTE: Request all keys starting with 'msg'. See also section KeyRanges
370373
-- in https://etcd.io/docs/v3.5/learning/api/#key-value-api
371374
let watchRequest =
@@ -374,34 +377,48 @@ waitMessages tracer conn directory NetworkCallback{deliver} = do
374377
& #rangeEnd .~ "msh" -- NOTE: g+1 to query prefixes
375378
& #startRevision .~ fromIntegral (revision + 1)
376379
send . NextElem $ defMessage & #createRequest .~ watchRequest
377-
whileNext_ recv process
380+
loop send recv
378381
-- Wait before re-trying
379382
threadDelay 1
380383
where
381-
process res = do
382-
let revision = fromIntegral $ res ^. #header . #revision
383-
putLastKnownRevision directory revision
384-
forM_ (res ^. #events) $ \event -> do
385-
let value = event ^. #kv . #value
386-
case decodeFull' value of
387-
Left err ->
388-
traceWith
389-
tracer
390-
FailedToDecodeValue
391-
{ key = decodeUtf8 $ event ^. #kv . #key
392-
, value = encodeBase16 value
393-
, reason = show err
394-
}
395-
Right msg -> deliver msg
384+
loop send recv =
385+
recv >>= \case
386+
NoNextElem -> pure ()
387+
NextElem res ->
388+
if res ^. #canceled
389+
then do
390+
let compactRevision = res ^. #compactRevision
391+
traceWith tracer WatchMessagesFallbackTo{compactRevision}
392+
putLastKnownRevision directory . fromIntegral $ (compactRevision - 1) `max` 0
393+
-- Gracefully close watch stream
394+
send NoNextElem
395+
else do
396+
let revision = res ^. #header . #revision
397+
putLastKnownRevision directory . fromIntegral $ revision `max` 0
398+
forM_ (res ^. #events) process
399+
loop send recv
400+
401+
process event = do
402+
let value = event ^. #kv . #value
403+
case decodeFull' value of
404+
Left err ->
405+
traceWith
406+
tracer
407+
FailedToDecodeValue
408+
{ key = decodeUtf8 $ event ^. #kv . #key
409+
, value = encodeBase16 value
410+
, reason = show err
411+
}
412+
Right msg -> deliver msg
396413

397414
getLastKnownRevision :: MonadIO m => FilePath -> m Natural
398415
getLastKnownRevision directory = do
399416
liftIO $
400417
try (decodeFileStrict' $ directory </> "last-known-revision") >>= \case
401418
Right rev -> do
402-
pure $ fromMaybe 1 rev
419+
pure $ fromMaybe 0 rev
403420
Left (e :: IOException)
404-
| isDoesNotExistError e -> pure 1
421+
| isDoesNotExistError e -> pure 0
405422
| otherwise -> do
406423
fail $ "Failed to load last known revision: " <> show e
407424

@@ -614,5 +631,7 @@ data EtcdLog
614631
| LowLeaseTTL {ttlRemaining :: Int64}
615632
| NoKeepAliveResponse
616633
| MatchingProtocolVersion {version :: ProtocolVersion}
634+
| WatchMessagesStartRevision {startRevision :: Int64}
635+
| WatchMessagesFallbackTo {compactRevision :: Int64}
617636
deriving stock (Eq, Show, Generic)
618637
deriving anyclass (ToJSON, FromJSON)

hydra-node/test/Hydra/NetworkSpec.hs

Lines changed: 53 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ import Hydra.Network (
2323
ProtocolVersion (..),
2424
WhichEtcd (..),
2525
)
26-
import Hydra.Network.Etcd (withEtcdNetwork)
26+
import Hydra.Network.Etcd (getClientPort, withEtcdNetwork)
2727
import Hydra.Network.Message (Message (..))
2828
import Hydra.Node.Network (NetworkConfiguration (..))
2929
import System.Directory (removeFile)
@@ -202,14 +202,60 @@ spec = do
202202
withEtcdNetwork @Int tracer v1 carolConfig recordCarol $ \_ -> do
203203
broadcast n1 1001
204204
waitCarol `shouldReturn` 1001
205-
-- We can reset the last known view (internal implementation detail)
205+
206+
it "handles compaction and lost local state" $ \tracer -> do
207+
withTempDir "test-etcd" $ \tmp -> do
208+
failAfter 20 $ do
209+
PeerConfig3{aliceConfig, bobConfig, carolConfig} <- setup3Peers tmp
210+
(recordBob, waitBob, _) <- newRecordingCallback
211+
(recordCarol, waitCarol, _) <- newRecordingCallback
212+
withEtcdNetwork @Int tracer v1 aliceConfig noopCallback $ \n1 ->
213+
withEtcdNetwork @Int tracer v1 bobConfig recordBob $ \_ -> do
214+
-- First we send 5 messages with carol online
215+
withEtcdNetwork @Int tracer v1 carolConfig recordCarol $ \_ -> do
216+
forM_ [1 .. 5] $ \msg -> do
217+
broadcast n1 msg
218+
waitBob `shouldReturn` msg
219+
waitCarol `shouldReturn` msg
220+
-- Carol stopped and we continue sending messages
221+
forM_ [5 .. 100] $ \msg -> do
222+
broadcast n1 msg
223+
waitBob `shouldReturn` msg
224+
-- Even while carol is down, the etcd component would
225+
-- "auto-compact" messages. By default down to 1000 messages
226+
-- after/every 5 minutes. This is interesting as it should
227+
-- result in carol never seeing some messages, but is hard to test
228+
-- (without waiting 5 minutes). Instead we issue a direct etcd
229+
-- command to compact everything before revision 50.
230+
runProcess_ . shell $
231+
"etcdctl compact 50 --endpoints=127.0.0.1:" <> show (getClientPort aliceConfig)
232+
-- When carol starts now we would expect it to start catching up
233+
-- from the earliest possible revision 50, while missing some
234+
-- messages.
235+
withEtcdNetwork @Int tracer v1 carolConfig recordCarol $ \_ -> do
236+
-- NOTE: Revision 50 may not correspond to message 50, so we
237+
-- only assert it's some message bigger than 25 and expect to
238+
-- see all further messages to 100.
239+
firstMsg <- waitCarol
240+
firstMsg `shouldSatisfy` (> 25)
241+
forM_ [firstMsg + 1 .. 100] $ \msg ->
242+
waitCarol `shouldReturn` msg
243+
-- Carol should be able to receive new messages just fine.
244+
forM_ [101 .. 105] $ \msg -> do
245+
broadcast n1 msg
246+
waitCarol `shouldReturn` msg
247+
-- Similarly, should carol lose its local state, we expect it to
248+
-- see everything from the last compacted revision 50. We can
249+
-- enforce this by removing the corresponding file (an internal
250+
-- implementation detail)
206251
removeFile (persistenceDir carolConfig </> "last-known-revision")
207252
withEtcdNetwork @Int tracer v1 carolConfig recordCarol $ \_ -> do
208-
-- NOTE: The etcd component would "auto-compact" messages down
209-
-- to 1000 messages after 5 minutes. This would result in
210-
-- starting at 1001 here, but is hard to test (without waiting
211-
-- 5 minutes).
212-
forM_ messages $ \msg ->
253+
-- NOTE: Revision 50 may not correspond to message 50, so we
254+
-- only assert it's some message bigger than 25 and expect to
255+
-- see all further messages to 105.
256+
firstMsg <- waitCarol
257+
firstMsg `shouldSatisfy` (> 25)
258+
forM_ [firstMsg + 1 .. 105] $ \msg -> do
213259
waitCarol `shouldReturn` msg
214260

215261
it "emits cluster id mismatch" $ \tracer -> do

nix/hydra/packages.nix

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -149,6 +149,7 @@
149149
buildInputs = [
150150
nativePkgs.hydra-node.components.tests.tests
151151
pkgs.check-jsonschema
152+
pkgs.etcd # For etcdctl command in tests
152153
];
153154
};
154155
hydra-cluster-tests = pkgs.mkShellNoCC {

0 commit comments

Comments
 (0)