Merge branch 'master' into vrom911/update-aws-head-instructions

vrom911 · web-flow · commit 8d5d3ec28e6f · 2025-08-05T10:08:50.000+02:00
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -24,6 +24,13 @@ changes.
 
 - Tested with `cardano-node 10.4.1` and `cardano-cli 10.8.0.0`.
 
+- Fix rotation log id consistency after restart by changing the rotation check to trigger only
+  when the number of persisted `StateChanged` events exceeds the configured `--persistence-rotate-after` threshold.
+  * This also prevents immediate rotation on startup when the threshold is set to 1.
+  * `Checkpoint` event ids now match the suffix of their preceding rotated log file and the last `StateChanged` event id within it,
+  preserving sequential order and making it easier to identify which rotated log file was used to compute it.
+
+
 ## [0.22.2] - 2025.06.30
 
 * Fix wrong hydra-script-tx-ids in networks.json
diff --git a/docs/docs/dev/architecture/event-sourcing.md b/docs/docs/dev/architecture/event-sourcing.md
@@ -45,9 +45,12 @@ Event log rotation was introduced to improve recovery times by reducing the numb
 
 Only rotated log files are saved with an incrementing `logId` suffix in their names, while the main `state` log file remains unchanged to preserve backward compatibility. This `logId` suffix corresponds to the ID of the last event included in that file.
 Rotation can be enabled via the optional `--persistence-rotate-after` command-line argument, which specifies the number of events after which rotation should occur.
-> For example, with `--persistence-rotate-after 100`, you’ll get rotated files named: state-99, state-199, state-299, and so on, each containing 100 events. This is because event IDs start at 0.
+> For example, with `--persistence-rotate-after 100`, you’ll get rotated files named: state-100, state-200, state-300, and so on, each containing 101 events. This is because event IDs start at 0, so state-100 includes 101 state changed events (0–100) without a checkpoint. Subsequent rotated files include a checkpoint plus 100 new state changed events.
 
-Note that, depending on the rotation configuration used, the current `state` file may already contain more events than the specified threshold, causing a rotation to occur immediately on startup before any new inputs are processed.
+Note that a checkpoint event id matches the last persisted event id from the previous rotated log file, preserving the sequential order of event ids across logs.
+This also makes it easier to identify which rotated log file was used to compute the checkpoint, as its event id matches the file name suffix.
+
+Depending on the rotation configuration used, the current `state` file may already contain more events than the specified threshold, causing a rotation to occur immediately on startup before any new inputs are processed.
 
 Upon rotation, a server output is produced to notify external agents when a checkpoint occurs, allowing them to perform archival or cleanup actions without interrupting the Hydra Head.
 
diff --git a/docs/docs/known-issues.md b/docs/docs/known-issues.md
@@ -33,7 +33,7 @@ Known errors are:
       - `configuredPeers` - peers info coming from `hydra-node` arguments.
 
  - `member ... has already been bootstrapped` - missing information in `<persistence-dir>/etcd`
-   - need to bootstrap new cluster or manual workarounds, see also https://etcd.io/docs/v3.5/op-guide/failures/
+   - restart your hydra-node with the `ETCD_INITIAL_CLUSTER_STATE` environment variable set to `existing` (`new` is the default), see also https://etcd.io/docs/v3.3/op-guide/configuration/
 
 We should be able to work around these UX issues using [etcd discovery](https://etcd.io/docs/v3.5/op-guide/clustering/#etcd-discovery) eventually.
 
diff --git a/docs/docs/tutorial/index.md b/docs/docs/tutorial/index.md
@@ -77,8 +77,8 @@ unzip -d bin hydra-aarch64-darwin-${hydra_version}.zip
 
 cardano_node_version=10.4.1
 curl -L -O https://github.com/IntersectMBO/cardano-node/releases/download/${cardano_node_version}/cardano-node-${cardano_node_version}-macos.tar.gz
-tar xf cardano-node-${cardano_node_version}-macos.tar.gz --wildcards ./bin/cardano-node ./bin/cardano-cli './bin/*.dylib'
-tar xf cardano-node-${cardano_node_version}-macos.tar.gz ./share/preprod --strip-components=3
+tar xf cardano-node-${cardano_node_version}-macos.tar.gz ./bin/cardano-node ./bin/cardano-cli './bin/*.dylib'
+tar xf cardano-node-${cardano_node_version}-macos.tar.gz --strip-components=3 ./share/preprod/
 
 curl --proto '=https' --tlsv1.2 -sSf https://raw.githubusercontent.com/input-output-hk/mithril/refs/heads/main/mithril-install.sh | sh -s -- -c mithril-client -d latest -p bin
 
diff --git a/hydra-cluster/bench/Bench/EndToEnd.hs b/hydra-cluster/bench/Bench/EndToEnd.hs
@@ -264,15 +264,15 @@ withOSStats workDir tvar action =
         race
           ( do
               -- Write the header
-              atomically $ writeTVar tvar [" | Time | Used | Free | ", "|------|------|------|"]
+              atomically $ writeTVar tvar [" | Time | Used | Free | ", "|------------------------------------|------|------|"]
               collectStats tvar out
           )
           action
           >>= \case
             Left _ -> failure "dool process failed unexpectedly"
             Right a -> pure a
  where
-  process = (proc "dool" ["-cm", "-n", "-N", "lo", "--noheaders", "--noupdate", "5"]){cwd = Just workDir}
+  process = (proc "dool" ["-m", "--noupdate"]){cwd = Just workDir}
 
   collectStats _ Nothing = pure ()
   collectStats tvar' (Just hdl) =
@@ -281,9 +281,9 @@ withOSStats workDir tvar action =
 
   processStat :: TVar IO [Text] -> String -> IO ()
   processStat tvar' stat = do
-    let matches = getAllTextMatches (stat =~ ("[0-9.]+.|([A-Z])" :: String)) :: [String]
+    let matches = getAllTextMatches (stat =~ ("[0-9.]+([A-Z])" :: String)) :: [String]
     case matches of
-      (_ : _ : _ : _ : _ : memUsed : memFree : _) -> do
+      (memUsed : memFree : _ : _) -> do
         now <- getCurrentTime
         let str =
               pack $
diff --git a/hydra-cluster/src/Hydra/Cluster/Options.hs b/hydra-cluster/src/Hydra/Cluster/Options.hs
@@ -10,14 +10,15 @@ import Hydra.Options (persistenceRotateAfterParser)
 import Hydra.Prelude
 import Options.Applicative (Parser, eitherReader, flag, flag', help, long, metavar, strOption)
 import Options.Applicative.Builder (option)
+import Test.QuickCheck (Positive)
 
 data Options = Options
   { knownNetwork :: Maybe KnownNetwork
   , stateDirectory :: Maybe FilePath
   , publishHydraScripts :: PublishOrReuse
   , useMithril :: UseMithril
   , scenario :: Scenario
-  , persistenceRotateAfter :: Maybe Natural
+  , persistenceRotateAfter :: Maybe (Positive Natural)
   }
   deriving stock (Show, Eq, Generic)
   deriving anyclass (ToJSON)
diff --git a/hydra-cluster/src/Hydra/Cluster/Scenarios.hs b/hydra-cluster/src/Hydra/Cluster/Scenarios.hs
@@ -39,7 +39,7 @@ import Hydra.API.HTTPServer (
   DraftCommitTxResponse (..),
   TransactionSubmitted (..),
  )
-import Hydra.API.ServerOutput (HeadStatus (Idle))
+import Hydra.API.ServerOutput (HeadStatus (..))
 import Hydra.Cardano.Api (
   Coin (..),
   Era,
@@ -149,11 +149,12 @@ import Network.HTTP.Req (
  )
 import Network.HTTP.Simple (getResponseBody, httpJSON, setRequestBodyJSON)
 import Network.HTTP.Types (urlEncode)
+import System.Environment (setEnv, unsetEnv)
 import System.FilePath ((</>))
 import System.Process (callProcess)
 import Test.Hydra.Tx.Fixture (testNetworkId)
 import Test.Hydra.Tx.Gen (genDatum, genKeyPair, genTxOutWithReferenceScript)
-import Test.QuickCheck (choose, elements, generate)
+import Test.QuickCheck (Positive, choose, elements, generate)
 
 data EndToEndLog
   = ClusterOptions {options :: Options}
@@ -500,7 +501,7 @@ singlePartyOpenAHead ::
   FilePath ->
   backend ->
   [TxId] ->
-  Maybe Natural ->
+  Maybe (Positive Natural) ->
   -- | Continuation called when the head is open
   (HydraClient -> SigningKey PaymentKey -> HeadId -> IO a) ->
   IO a
@@ -1743,6 +1744,46 @@ canSideLoadSnapshot tracer workDir backend hydraScriptsTxId = do
  where
   hydraTracer = contramap FromHydraNode tracer
 
+canResumeOnMemberAlreadyBootstrapped :: ChainBackend backend => Tracer IO EndToEndLog -> FilePath -> backend -> [TxId] -> IO ()
+canResumeOnMemberAlreadyBootstrapped tracer workDir backend hydraScriptsTxId = do
+  let clients = [Alice, Bob]
+  [(aliceCardanoVk, _aliceCardanoSk), (bobCardanoVk, _)] <- forM clients keysFor
+  seedFromFaucet_ backend aliceCardanoVk 100_000_000 (contramap FromFaucet tracer)
+  seedFromFaucet_ backend bobCardanoVk 100_000_000 (contramap FromFaucet tracer)
+
+  networkId <- Backend.queryNetworkId backend
+  let contestationPeriod = 1
+  aliceChainConfig <-
+    chainConfigFor Alice workDir backend hydraScriptsTxId [Bob] contestationPeriod
+      <&> setNetworkId networkId
+  bobChainConfig <-
+    chainConfigFor Bob workDir backend hydraScriptsTxId [Alice] contestationPeriod
+      <&> setNetworkId networkId
+
+  withHydraNode hydraTracer aliceChainConfig workDir 1 aliceSk [bobVk] [1, 2] $ \n1 -> do
+    waitMatch 20 n1 $ \v -> do
+      guard $ v ^? key "tag" == Just "Greetings"
+      guard $ v ^? key "headStatus" == Just (toJSON Idle)
+    withHydraNode hydraTracer bobChainConfig workDir 2 bobSk [aliceVk] [1, 2] $ \n2 -> do
+      waitMatch 20 n2 $ \v -> do
+        guard $ v ^? key "tag" == Just "Greetings"
+        guard $ v ^? key "headStatus" == Just (toJSON Idle)
+
+      threadDelay 5
+
+    callProcess "rm" ["-rf", workDir </> "state-2"]
+
+    withHydraNode hydraTracer bobChainConfig workDir 2 bobSk [aliceVk] [1, 2] (const $ pure ())
+      `shouldThrow` \(e :: SomeException) ->
+        "hydra-node" `isInfixOf` show e
+          && "etcd" `isInfixOf` show e
+
+    setEnv "ETCD_INITIAL_CLUSTER_STATE" "existing"
+    withHydraNode hydraTracer bobChainConfig workDir 2 bobSk [aliceVk] [1, 2] (const $ pure ())
+    unsetEnv "ETCD_INITIAL_CLUSTER_STATE"
+ where
+  hydraTracer = contramap FromHydraNode tracer
+
 -- | Three hydra nodes open a head and we assert that none of them sees errors if a party is duplicated.
 threeNodesWithMirrorParty :: ChainBackend backend => Tracer IO EndToEndLog -> FilePath -> backend -> [TxId] -> IO ()
 threeNodesWithMirrorParty tracer workDir backend hydraScriptsTxId = do
diff --git a/hydra-cluster/test/Test/EndToEndSpec.hs b/hydra-cluster/test/Test/EndToEndSpec.hs
@@ -12,6 +12,7 @@ import CardanoClient (
  )
 import CardanoNode (
   withBackend,
+  withCardanoNodeDevnet,
  )
 import Control.Lens ((^..), (^?))
 import Control.Monad (foldM_)
@@ -52,6 +53,7 @@ import Hydra.Cluster.Scenarios (
   canCommit,
   canDecommit,
   canRecoverDeposit,
+  canResumeOnMemberAlreadyBootstrapped,
   canSeePendingDeposits,
   canSideLoadSnapshot,
   canSubmitTransactionThroughAPI,
@@ -106,7 +108,7 @@ import System.FilePath ((</>))
 import Test.Hydra.Cluster.Utils (chainPointToSlot)
 import Test.Hydra.Tx.Fixture (testNetworkId)
 import Test.Hydra.Tx.Gen (genKeyPair, genUTxOFor)
-import Test.QuickCheck (generate)
+import Test.QuickCheck (Positive (..), generate)
 import Prelude qualified
 
 allNodeIds :: [Int]
@@ -204,7 +206,7 @@ spec = around (showLogsOnFailure "EndToEndSpec") $ do
 
         -- Measure restart after rotation
         options <- prepareHydraNode offlineConfig tmpDir 1 aliceSk [] [] id
-        let options' = options{persistenceRotateAfter = Just 10}
+        let options' = options{persistenceRotateAfter = Just (Positive 10)}
         t1 <- getCurrentTime
         diff2 <- withPreparedHydraNode (contramap FromHydraNode tracer) tmpDir 1 options' $ \_ -> do
           t2 <- getCurrentTime
@@ -656,6 +658,12 @@ spec = around (showLogsOnFailure "EndToEndSpec") $ do
             publishHydraScriptsAs backend Faucet
               >>= canSideLoadSnapshot tracer tmpDir backend
 
+      it "can resume when member has already been bootstrapped" $ \tracer -> do
+        withClusterTempDir $ \tmpDir -> do
+          withCardanoNodeDevnet (contramap FromCardanoNode tracer) tmpDir $ \_ backend ->
+            publishHydraScriptsAs backend Faucet
+              >>= canResumeOnMemberAlreadyBootstrapped tracer tmpDir backend
+
     describe "two hydra heads scenario" $ do
       it "two heads on the same network do not conflict" $ \tracer ->
         failAfter 60 $
diff --git a/hydra-node/src/Hydra/Events/Rotation.hs b/hydra-node/src/Hydra/Events/Rotation.hs
@@ -6,8 +6,9 @@ import Conduit (MonadUnliftIO, runConduit, runResourceT, (.|))
 import Control.Concurrent.Class.MonadSTM (modifyTVar', newTVarIO, readTVarIO, writeTVar)
 import Data.Conduit.Combinators qualified as C
 import Hydra.Events (EventId, EventSink (..), EventSource (..), HasEventId (..))
+import Test.QuickCheck (Positive (..))
 
-newtype RotationConfig = RotateAfter Natural
+newtype RotationConfig = RotateAfter (Positive Natural)
 
 type LogId = EventId
 
@@ -52,37 +53,42 @@ newRotatedEventStore config s0 aggregator checkpointer eventStore = do
         rotate = const . const $ pure ()
       }
  where
-  RotateAfter rotateAfterX = config
+  RotateAfter (Positive rotateAfterX) = config
 
   aggregateEvents (!n, !_evId, !acc) e = (n + 1, getEventId e, aggregator acc e)
 
   shouldRotate numberOfEventsV = do
     currentNumberOfEvents <- readTVarIO numberOfEventsV
-    pure $ currentNumberOfEvents >= rotateAfterX
+    -- since rotateAfterX can be any positive number (including 1),
+    -- we use (>) instead of (>=) to avoid triggering a rotation immediately after a checkpoint,
+    -- which would lead to an infinite loop
+    pure $ currentNumberOfEvents > rotateAfterX
 
   rotatedPutEvent numberOfEventsV aggregateStateV event = do
     putEvent event
     atomically $ do
       -- aggregate new state
       modifyTVar' aggregateStateV (`aggregator` event)
       -- bump numberOfEvents
-      numberOfEvents <- readTVar numberOfEventsV
-      let numberOfEvents' = numberOfEvents + 1
-      writeTVar numberOfEventsV numberOfEvents'
+      modifyTVar' numberOfEventsV (+ 1)
     -- check rotation
     whenM (shouldRotate numberOfEventsV) $ do
       let eventId = getEventId event
       rotateEventLog numberOfEventsV aggregateStateV eventId
 
   rotateEventLog numberOfEventsV aggregateStateV lastEventId = do
-    -- build checkpoint event
+    -- build the checkpoint event
     now <- getCurrentTime
     aggregateState <- readTVarIO aggregateStateV
-    let checkpoint = checkpointer aggregateState (lastEventId + 1) now
-    -- rotate with checkpoint
+    -- the checkpoint has the same event id as the last event persisted
+    let checkpoint = checkpointer aggregateState lastEventId now
+    -- the rotated log file name suffix (logId) matches the last event persisted,
+    -- while the checkpoint event is appended to the new (current) state log file
     rotate lastEventId checkpoint
-    -- clear numberOfEvents + bump logId
+    -- reset `numberOfEvents` to 1 because
+    -- the checkpoint event was just appended during rotation
+    -- and will be sourced from the event store on restart
     atomically $ do
-      writeTVar numberOfEventsV 0
+      writeTVar numberOfEventsV 1
 
   EventStore{eventSource, eventSink = EventSink{putEvent}, rotate} = eventStore
diff --git a/hydra-node/src/Hydra/Options.hs b/hydra-node/src/Hydra/Options.hs
@@ -79,7 +79,7 @@ import Options.Applicative (
  )
 import Options.Applicative.Builder (str)
 import Options.Applicative.Help (vsep)
-import Test.QuickCheck (elements, listOf, listOf1, oneof, vectorOf)
+import Test.QuickCheck (Positive (..), choose, elements, listOf, listOf1, oneof, vectorOf)
 
 data Command
   = Run RunOptions
@@ -194,7 +194,7 @@ data RunOptions = RunOptions
   , hydraSigningKey :: FilePath
   , hydraVerificationKeys :: [FilePath]
   , persistenceDir :: FilePath
-  , persistenceRotateAfter :: Maybe Natural
+  , persistenceRotateAfter :: Maybe (Positive Natural)
   , chainConfig :: ChainConfig
   , ledgerConfig :: LedgerConfig
   , whichEtcd :: WhichEtcd
@@ -203,6 +203,13 @@ data RunOptions = RunOptions
   deriving stock (Eq, Show, Generic)
   deriving anyclass (ToJSON, FromJSON)
 
+-- Orphan instances
+instance ToJSON a => ToJSON (Positive a) where
+  toJSON (Positive a) = toJSON a
+
+instance FromJSON a => FromJSON (Positive a) where
+  parseJSON v = Positive <$> parseJSON v
+
 -- Orphan instance
 instance Arbitrary IP where
   arbitrary = IPv4 . toIPv4w <$> arbitrary
@@ -223,7 +230,7 @@ instance Arbitrary RunOptions where
     hydraSigningKey <- genFilePath "sk"
     hydraVerificationKeys <- reasonablySized (listOf (genFilePath "vk"))
     persistenceDir <- genDirPath
-    persistenceRotateAfter <- arbitrary
+    persistenceRotateAfter <- oneof [pure Nothing, Just . Positive . fromInteger <$> choose (1, 100000)]
     chainConfig <- arbitrary
     ledgerConfig <- arbitrary
     whichEtcd <- arbitrary
@@ -852,15 +859,22 @@ persistenceDirParser =
           \Do not edit these files manually!"
     )
 
-persistenceRotateAfterParser :: Parser Natural
+persistenceRotateAfterParser :: Parser (Positive Natural)
 persistenceRotateAfterParser =
   option
-    auto
+    (eitherReader validateRotateAfter)
     ( long "persistence-rotate-after"
         <> metavar "NATURAL"
         <> help
-          "The number of Hydra events to trigger rotation (default: no rotation)"
+          "The number of Hydra events to trigger rotation (default: no rotation). \
+          \Note it must be a positive number."
     )
+ where
+  validateRotateAfter :: String -> Either String (Positive Natural)
+  validateRotateAfter arg =
+    case readMaybe arg of
+      Just n | n > 0 -> Right (Positive n)
+      _ -> Left "--persistence-rotate-after must be a positive number"
 
 hydraNodeCommand :: ParserInfo Command
 hydraNodeCommand =
@@ -992,7 +1006,7 @@ toArgs
       <> concatMap toArgPeer peers
       <> maybe [] (\port -> ["--monitoring-port", show port]) monitoringPort
       <> ["--persistence-dir", persistenceDir]
-      <> maybe [] (\rotateAfter -> ["--persistence-rotate-after", show rotateAfter]) persistenceRotateAfter
+      <> maybe [] (\rotateAfter -> ["--persistence-rotate-after", showPositive rotateAfter]) persistenceRotateAfter
       <> argsChainConfig chainConfig
       <> argsLedgerConfig
       <> ["--api-transaction-timeout", show apiTransactionTimeout]
@@ -1063,6 +1077,9 @@ toArgs
       { cardanoLedgerProtocolParametersFile
       } = ledgerConfig
 
+    showPositive :: Show a => Positive a -> String
+    showPositive (Positive x) = show x
+
 toArgNodeSocket :: SocketPath -> [String]
 toArgNodeSocket nodeSocket = ["--node-socket", unFile nodeSocket]
 
diff --git a/hydra-node/test/Hydra/Events/RotationSpec.hs b/hydra-node/test/Hydra/Events/RotationSpec.hs
diff --git a/hydra-node/test/Hydra/OptionsSpec.hs b/hydra-node/test/Hydra/OptionsSpec.hs