Skip to content

Commit 8d5d3ec

Browse files
authored
Merge branch 'master' into vrom911/update-aws-head-instructions
2 parents 95f8568 + 6b3f9d7 commit 8d5d3ec

File tree

12 files changed

+258
-72
lines changed

12 files changed

+258
-72
lines changed

CHANGELOG.md

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,13 @@ changes.
2424

2525
- Tested with `cardano-node 10.4.1` and `cardano-cli 10.8.0.0`.
2626

27+
Fix rotation log id consistency after restart by changing the rotation check to trigger only
28+
when the number of persisted `StateChanged` events exceeds the configured `--persistence-rotate-after` threshold.
29+
* This also prevents immediate rotation on startup when the threshold is set to 1.
30+
* `Checkpoint` event ids now match the suffix of their preceding rotated log file and the last `StateChanged` event id within it,
31+
preserving sequential order and making it easier to identify which rotated log file was used to compute it.
32+
33+
2734
## [0.22.2] - 2025.06.30
2835

2936
* Fix wrong hydra-script-tx-ids in networks.json

docs/docs/dev/architecture/event-sourcing.md

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -45,9 +45,12 @@ Event log rotation was introduced to improve recovery times by reducing the numb
4545

4646
Only rotated log files are saved with an incrementing `logId` suffix in their names, while the main `state` log file remains unchanged to preserve backward compatibility. This `logId` suffix corresponds to the ID of the last event included in that file.
4747
Rotation can be enabled via the optional `--persistence-rotate-after` command-line argument, which specifies the number of events after which rotation should occur.
48-
> For example, with `--persistence-rotate-after 100`, you’ll get rotated files named: state-99, state-199, state-299, and so on, each containing 100 events. This is because event IDs start at 0.
48+
> For example, with `--persistence-rotate-after 100`, you’ll get rotated files named: state-100, state-200, state-300, and so on, each containing 101 events. This is because event IDs start at 0, so state-100 includes 101 state changed events (0–100) without a checkpoint. Subsequent rotated files include a checkpoint plus 100 new state changed events.
4949
50-
Note that, depending on the rotation configuration used, the current `state` file may already contain more events than the specified threshold, causing a rotation to occur immediately on startup before any new inputs are processed.
50+
Note that a checkpoint event id matches the last persisted event id from the previous rotated log file, preserving the sequential order of event ids across logs.
51+
This also makes it easier to identify which rotated log file was used to compute the checkpoint, as its event id matches the file name suffix.
52+
53+
Depending on the rotation configuration used, the current `state` file may already contain more events than the specified threshold, causing a rotation to occur immediately on startup before any new inputs are processed.
5154

5255
Upon rotation, a server output is produced to notify external agents when a checkpoint occurs, allowing them to perform archival or cleanup actions without interrupting the Hydra Head.
5356

docs/docs/known-issues.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ Known errors are:
3333
- `configuredPeers` - peers info coming from `hydra-node` arguments.
3434

3535
- `member ... has already been bootstrapped` - missing information in `<persistence-dir>/etcd`
36-
- need to bootstrap new cluster or manual workarounds, see also https://etcd.io/docs/v3.5/op-guide/failures/
36+
- restart your hydra-node with the `ETCD_INITIAL_CLUSTER_STATE` environment variable set to `existing` (`new` is the default), see also https://etcd.io/docs/v3.3/op-guide/configuration/
3737

3838
We should be able to work around these UX issues using [etcd discovery](https://etcd.io/docs/v3.5/op-guide/clustering/#etcd-discovery) eventually.
3939

docs/docs/tutorial/index.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -77,8 +77,8 @@ unzip -d bin hydra-aarch64-darwin-${hydra_version}.zip
7777

7878
cardano_node_version=10.4.1
7979
curl -L -O https://github.com/IntersectMBO/cardano-node/releases/download/${cardano_node_version}/cardano-node-${cardano_node_version}-macos.tar.gz
80-
tar xf cardano-node-${cardano_node_version}-macos.tar.gz --wildcards ./bin/cardano-node ./bin/cardano-cli './bin/*.dylib'
81-
tar xf cardano-node-${cardano_node_version}-macos.tar.gz ./share/preprod --strip-components=3
80+
tar xf cardano-node-${cardano_node_version}-macos.tar.gz ./bin/cardano-node ./bin/cardano-cli './bin/*.dylib'
81+
tar xf cardano-node-${cardano_node_version}-macos.tar.gz --strip-components=3 ./share/preprod/
8282

8383
curl --proto '=https' --tlsv1.2 -sSf https://raw.githubusercontent.com/input-output-hk/mithril/refs/heads/main/mithril-install.sh | sh -s -- -c mithril-client -d latest -p bin
8484

hydra-cluster/bench/Bench/EndToEnd.hs

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -264,15 +264,15 @@ withOSStats workDir tvar action =
264264
race
265265
( do
266266
-- Write the header
267-
atomically $ writeTVar tvar [" | Time | Used | Free | ", "|------|------|------|"]
267+
atomically $ writeTVar tvar [" | Time | Used | Free | ", "|------------------------------------|------|------|"]
268268
collectStats tvar out
269269
)
270270
action
271271
>>= \case
272272
Left _ -> failure "dool process failed unexpectedly"
273273
Right a -> pure a
274274
where
275-
process = (proc "dool" ["-cm", "-n", "-N", "lo", "--noheaders", "--noupdate", "5"]){cwd = Just workDir}
275+
process = (proc "dool" ["-m", "--noupdate"]){cwd = Just workDir}
276276

277277
collectStats _ Nothing = pure ()
278278
collectStats tvar' (Just hdl) =
@@ -281,9 +281,9 @@ withOSStats workDir tvar action =
281281

282282
processStat :: TVar IO [Text] -> String -> IO ()
283283
processStat tvar' stat = do
284-
let matches = getAllTextMatches (stat =~ ("[0-9.]+.|([A-Z])" :: String)) :: [String]
284+
let matches = getAllTextMatches (stat =~ ("[0-9.]+([A-Z])" :: String)) :: [String]
285285
case matches of
286-
(_ : _ : _ : _ : _ : memUsed : memFree : _) -> do
286+
(memUsed : memFree : _ : _) -> do
287287
now <- getCurrentTime
288288
let str =
289289
pack $

hydra-cluster/src/Hydra/Cluster/Options.hs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,14 +10,15 @@ import Hydra.Options (persistenceRotateAfterParser)
1010
import Hydra.Prelude
1111
import Options.Applicative (Parser, eitherReader, flag, flag', help, long, metavar, strOption)
1212
import Options.Applicative.Builder (option)
13+
import Test.QuickCheck (Positive)
1314

1415
data Options = Options
1516
{ knownNetwork :: Maybe KnownNetwork
1617
, stateDirectory :: Maybe FilePath
1718
, publishHydraScripts :: PublishOrReuse
1819
, useMithril :: UseMithril
1920
, scenario :: Scenario
20-
, persistenceRotateAfter :: Maybe Natural
21+
, persistenceRotateAfter :: Maybe (Positive Natural)
2122
}
2223
deriving stock (Show, Eq, Generic)
2324
deriving anyclass (ToJSON)

hydra-cluster/src/Hydra/Cluster/Scenarios.hs

Lines changed: 44 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ import Hydra.API.HTTPServer (
3939
DraftCommitTxResponse (..),
4040
TransactionSubmitted (..),
4141
)
42-
import Hydra.API.ServerOutput (HeadStatus (Idle))
42+
import Hydra.API.ServerOutput (HeadStatus (..))
4343
import Hydra.Cardano.Api (
4444
Coin (..),
4545
Era,
@@ -149,11 +149,12 @@ import Network.HTTP.Req (
149149
)
150150
import Network.HTTP.Simple (getResponseBody, httpJSON, setRequestBodyJSON)
151151
import Network.HTTP.Types (urlEncode)
152+
import System.Environment (setEnv, unsetEnv)
152153
import System.FilePath ((</>))
153154
import System.Process (callProcess)
154155
import Test.Hydra.Tx.Fixture (testNetworkId)
155156
import Test.Hydra.Tx.Gen (genDatum, genKeyPair, genTxOutWithReferenceScript)
156-
import Test.QuickCheck (choose, elements, generate)
157+
import Test.QuickCheck (Positive, choose, elements, generate)
157158

158159
data EndToEndLog
159160
= ClusterOptions {options :: Options}
@@ -500,7 +501,7 @@ singlePartyOpenAHead ::
500501
FilePath ->
501502
backend ->
502503
[TxId] ->
503-
Maybe Natural ->
504+
Maybe (Positive Natural) ->
504505
-- | Continuation called when the head is open
505506
(HydraClient -> SigningKey PaymentKey -> HeadId -> IO a) ->
506507
IO a
@@ -1743,6 +1744,46 @@ canSideLoadSnapshot tracer workDir backend hydraScriptsTxId = do
17431744
where
17441745
hydraTracer = contramap FromHydraNode tracer
17451746

1747+
canResumeOnMemberAlreadyBootstrapped :: ChainBackend backend => Tracer IO EndToEndLog -> FilePath -> backend -> [TxId] -> IO ()
1748+
canResumeOnMemberAlreadyBootstrapped tracer workDir backend hydraScriptsTxId = do
1749+
let clients = [Alice, Bob]
1750+
[(aliceCardanoVk, _aliceCardanoSk), (bobCardanoVk, _)] <- forM clients keysFor
1751+
seedFromFaucet_ backend aliceCardanoVk 100_000_000 (contramap FromFaucet tracer)
1752+
seedFromFaucet_ backend bobCardanoVk 100_000_000 (contramap FromFaucet tracer)
1753+
1754+
networkId <- Backend.queryNetworkId backend
1755+
let contestationPeriod = 1
1756+
aliceChainConfig <-
1757+
chainConfigFor Alice workDir backend hydraScriptsTxId [Bob] contestationPeriod
1758+
<&> setNetworkId networkId
1759+
bobChainConfig <-
1760+
chainConfigFor Bob workDir backend hydraScriptsTxId [Alice] contestationPeriod
1761+
<&> setNetworkId networkId
1762+
1763+
withHydraNode hydraTracer aliceChainConfig workDir 1 aliceSk [bobVk] [1, 2] $ \n1 -> do
1764+
waitMatch 20 n1 $ \v -> do
1765+
guard $ v ^? key "tag" == Just "Greetings"
1766+
guard $ v ^? key "headStatus" == Just (toJSON Idle)
1767+
withHydraNode hydraTracer bobChainConfig workDir 2 bobSk [aliceVk] [1, 2] $ \n2 -> do
1768+
waitMatch 20 n2 $ \v -> do
1769+
guard $ v ^? key "tag" == Just "Greetings"
1770+
guard $ v ^? key "headStatus" == Just (toJSON Idle)
1771+
1772+
threadDelay 5
1773+
1774+
callProcess "rm" ["-rf", workDir </> "state-2"]
1775+
1776+
withHydraNode hydraTracer bobChainConfig workDir 2 bobSk [aliceVk] [1, 2] (const $ pure ())
1777+
`shouldThrow` \(e :: SomeException) ->
1778+
"hydra-node" `isInfixOf` show e
1779+
&& "etcd" `isInfixOf` show e
1780+
1781+
setEnv "ETCD_INITIAL_CLUSTER_STATE" "existing"
1782+
withHydraNode hydraTracer bobChainConfig workDir 2 bobSk [aliceVk] [1, 2] (const $ pure ())
1783+
unsetEnv "ETCD_INITIAL_CLUSTER_STATE"
1784+
where
1785+
hydraTracer = contramap FromHydraNode tracer
1786+
17461787
-- | Three hydra nodes open a head and we assert that none of them sees errors if a party is duplicated.
17471788
threeNodesWithMirrorParty :: ChainBackend backend => Tracer IO EndToEndLog -> FilePath -> backend -> [TxId] -> IO ()
17481789
threeNodesWithMirrorParty tracer workDir backend hydraScriptsTxId = do

hydra-cluster/test/Test/EndToEndSpec.hs

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ import CardanoClient (
1212
)
1313
import CardanoNode (
1414
withBackend,
15+
withCardanoNodeDevnet,
1516
)
1617
import Control.Lens ((^..), (^?))
1718
import Control.Monad (foldM_)
@@ -52,6 +53,7 @@ import Hydra.Cluster.Scenarios (
5253
canCommit,
5354
canDecommit,
5455
canRecoverDeposit,
56+
canResumeOnMemberAlreadyBootstrapped,
5557
canSeePendingDeposits,
5658
canSideLoadSnapshot,
5759
canSubmitTransactionThroughAPI,
@@ -106,7 +108,7 @@ import System.FilePath ((</>))
106108
import Test.Hydra.Cluster.Utils (chainPointToSlot)
107109
import Test.Hydra.Tx.Fixture (testNetworkId)
108110
import Test.Hydra.Tx.Gen (genKeyPair, genUTxOFor)
109-
import Test.QuickCheck (generate)
111+
import Test.QuickCheck (Positive (..), generate)
110112
import Prelude qualified
111113

112114
allNodeIds :: [Int]
@@ -204,7 +206,7 @@ spec = around (showLogsOnFailure "EndToEndSpec") $ do
204206

205207
-- Measure restart after rotation
206208
options <- prepareHydraNode offlineConfig tmpDir 1 aliceSk [] [] id
207-
let options' = options{persistenceRotateAfter = Just 10}
209+
let options' = options{persistenceRotateAfter = Just (Positive 10)}
208210
t1 <- getCurrentTime
209211
diff2 <- withPreparedHydraNode (contramap FromHydraNode tracer) tmpDir 1 options' $ \_ -> do
210212
t2 <- getCurrentTime
@@ -656,6 +658,12 @@ spec = around (showLogsOnFailure "EndToEndSpec") $ do
656658
publishHydraScriptsAs backend Faucet
657659
>>= canSideLoadSnapshot tracer tmpDir backend
658660

661+
it "can resume when member has already been bootstrapped" $ \tracer -> do
662+
withClusterTempDir $ \tmpDir -> do
663+
withCardanoNodeDevnet (contramap FromCardanoNode tracer) tmpDir $ \_ backend ->
664+
publishHydraScriptsAs backend Faucet
665+
>>= canResumeOnMemberAlreadyBootstrapped tracer tmpDir backend
666+
659667
describe "two hydra heads scenario" $ do
660668
it "two heads on the same network do not conflict" $ \tracer ->
661669
failAfter 60 $

hydra-node/src/Hydra/Events/Rotation.hs

Lines changed: 17 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,9 @@ import Conduit (MonadUnliftIO, runConduit, runResourceT, (.|))
66
import Control.Concurrent.Class.MonadSTM (modifyTVar', newTVarIO, readTVarIO, writeTVar)
77
import Data.Conduit.Combinators qualified as C
88
import Hydra.Events (EventId, EventSink (..), EventSource (..), HasEventId (..))
9+
import Test.QuickCheck (Positive (..))
910

10-
newtype RotationConfig = RotateAfter Natural
11+
newtype RotationConfig = RotateAfter (Positive Natural)
1112

1213
type LogId = EventId
1314

@@ -52,37 +53,42 @@ newRotatedEventStore config s0 aggregator checkpointer eventStore = do
5253
rotate = const . const $ pure ()
5354
}
5455
where
55-
RotateAfter rotateAfterX = config
56+
RotateAfter (Positive rotateAfterX) = config
5657

5758
aggregateEvents (!n, !_evId, !acc) e = (n + 1, getEventId e, aggregator acc e)
5859

5960
shouldRotate numberOfEventsV = do
6061
currentNumberOfEvents <- readTVarIO numberOfEventsV
61-
pure $ currentNumberOfEvents >= rotateAfterX
62+
-- since rotateAfterX can be any positive number (including 1),
63+
-- we use (>) instead of (>=) to avoid triggering a rotation immediately after a checkpoint,
64+
-- which would lead to an infinite loop
65+
pure $ currentNumberOfEvents > rotateAfterX
6266

6367
rotatedPutEvent numberOfEventsV aggregateStateV event = do
6468
putEvent event
6569
atomically $ do
6670
-- aggregate new state
6771
modifyTVar' aggregateStateV (`aggregator` event)
6872
-- bump numberOfEvents
69-
numberOfEvents <- readTVar numberOfEventsV
70-
let numberOfEvents' = numberOfEvents + 1
71-
writeTVar numberOfEventsV numberOfEvents'
73+
modifyTVar' numberOfEventsV (+ 1)
7274
-- check rotation
7375
whenM (shouldRotate numberOfEventsV) $ do
7476
let eventId = getEventId event
7577
rotateEventLog numberOfEventsV aggregateStateV eventId
7678

7779
rotateEventLog numberOfEventsV aggregateStateV lastEventId = do
78-
-- build checkpoint event
80+
-- build the checkpoint event
7981
now <- getCurrentTime
8082
aggregateState <- readTVarIO aggregateStateV
81-
let checkpoint = checkpointer aggregateState (lastEventId + 1) now
82-
-- rotate with checkpoint
83+
-- the checkpoint has the same event id as the last event persisted
84+
let checkpoint = checkpointer aggregateState lastEventId now
85+
-- the rotated log file name suffix (logId) matches the last event persisted,
86+
-- while the checkpoint event is appended to the new (current) state log file
8387
rotate lastEventId checkpoint
84-
-- clear numberOfEvents + bump logId
88+
-- reset `numberOfEvents` to 1 because
89+
-- the checkpoint event was just appended during rotation
90+
-- and will be sourced from the event store on restart
8591
atomically $ do
86-
writeTVar numberOfEventsV 0
92+
writeTVar numberOfEventsV 1
8793

8894
EventStore{eventSource, eventSink = EventSink{putEvent}, rotate} = eventStore

hydra-node/src/Hydra/Options.hs

Lines changed: 24 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -79,7 +79,7 @@ import Options.Applicative (
7979
)
8080
import Options.Applicative.Builder (str)
8181
import Options.Applicative.Help (vsep)
82-
import Test.QuickCheck (elements, listOf, listOf1, oneof, vectorOf)
82+
import Test.QuickCheck (Positive (..), choose, elements, listOf, listOf1, oneof, vectorOf)
8383

8484
data Command
8585
= Run RunOptions
@@ -194,7 +194,7 @@ data RunOptions = RunOptions
194194
, hydraSigningKey :: FilePath
195195
, hydraVerificationKeys :: [FilePath]
196196
, persistenceDir :: FilePath
197-
, persistenceRotateAfter :: Maybe Natural
197+
, persistenceRotateAfter :: Maybe (Positive Natural)
198198
, chainConfig :: ChainConfig
199199
, ledgerConfig :: LedgerConfig
200200
, whichEtcd :: WhichEtcd
@@ -203,6 +203,13 @@ data RunOptions = RunOptions
203203
deriving stock (Eq, Show, Generic)
204204
deriving anyclass (ToJSON, FromJSON)
205205

206+
-- Orphan instances
207+
instance ToJSON a => ToJSON (Positive a) where
208+
toJSON (Positive a) = toJSON a
209+
210+
instance FromJSON a => FromJSON (Positive a) where
211+
parseJSON v = Positive <$> parseJSON v
212+
206213
-- Orphan instance
207214
instance Arbitrary IP where
208215
arbitrary = IPv4 . toIPv4w <$> arbitrary
@@ -223,7 +230,7 @@ instance Arbitrary RunOptions where
223230
hydraSigningKey <- genFilePath "sk"
224231
hydraVerificationKeys <- reasonablySized (listOf (genFilePath "vk"))
225232
persistenceDir <- genDirPath
226-
persistenceRotateAfter <- arbitrary
233+
persistenceRotateAfter <- oneof [pure Nothing, Just . Positive . fromInteger <$> choose (1, 100000)]
227234
chainConfig <- arbitrary
228235
ledgerConfig <- arbitrary
229236
whichEtcd <- arbitrary
@@ -852,15 +859,22 @@ persistenceDirParser =
852859
\Do not edit these files manually!"
853860
)
854861

855-
persistenceRotateAfterParser :: Parser Natural
862+
persistenceRotateAfterParser :: Parser (Positive Natural)
856863
persistenceRotateAfterParser =
857864
option
858-
auto
865+
(eitherReader validateRotateAfter)
859866
( long "persistence-rotate-after"
860867
<> metavar "NATURAL"
861868
<> help
862-
"The number of Hydra events to trigger rotation (default: no rotation)"
869+
"The number of Hydra events to trigger rotation (default: no rotation).\
870+
\Note it must be a positive number."
863871
)
872+
where
873+
validateRotateAfter :: String -> Either String (Positive Natural)
874+
validateRotateAfter arg =
875+
case readMaybe arg of
876+
Just n | n > 0 -> Right (Positive n)
877+
_ -> Left "--persistence-rotate-after must be a positive number"
864878

865879
hydraNodeCommand :: ParserInfo Command
866880
hydraNodeCommand =
@@ -992,7 +1006,7 @@ toArgs
9921006
<> concatMap toArgPeer peers
9931007
<> maybe [] (\port -> ["--monitoring-port", show port]) monitoringPort
9941008
<> ["--persistence-dir", persistenceDir]
995-
<> maybe [] (\rotateAfter -> ["--persistence-rotate-after", show rotateAfter]) persistenceRotateAfter
1009+
<> maybe [] (\rotateAfter -> ["--persistence-rotate-after", showPositive rotateAfter]) persistenceRotateAfter
9961010
<> argsChainConfig chainConfig
9971011
<> argsLedgerConfig
9981012
<> ["--api-transaction-timeout", show apiTransactionTimeout]
@@ -1063,6 +1077,9 @@ toArgs
10631077
{ cardanoLedgerProtocolParametersFile
10641078
} = ledgerConfig
10651079

1080+
showPositive :: Show a => Positive a -> String
1081+
showPositive (Positive x) = show x
1082+
10661083
toArgNodeSocket :: SocketPath -> [String]
10671084
toArgNodeSocket nodeSocket = ["--node-socket", unFile nodeSocket]
10681085

0 commit comments

Comments
 (0)