fix rotation log id after restart (#2153)

ffakenz · web-flow · commit 6b3f9d79076e · 2025-08-01T22:23:26.000+02:00
<!-- Describe your change here -->

After rotation, we now reset the number of events to 1 (not 0),
because the checkpoint event is sourced on restart. This avoids
a mismatch between the rotation check on startup and during normal
operation.
That discrepancy was the cause of inconsistent rotation log ids after
restarts.

Also, we changed the rotation condition to use (>) instead of (>=),
preventing a follow-up rotation on startup when the configured
threshold is 1
(since checkpointing would immediately trigger a new rotation).

Lastly, a checkpoint event id now matches the last persisted event id
from its preceding rotated log file, preserving sequential order of
event ids across logs.

This also makes it easier to identify which rotated log file was used to
compute the checkpoint,
as its event id matches the file name suffix.

---

<!-- Consider each and tick it off one way or the other -->
* [X] CHANGELOG updated or not needed
* [x] Documentation updated or not needed
* [x] Haddocks updated or not needed
* [x] No new TODOs introduced or explained hereafter
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -24,6 +24,13 @@ changes.
 
 - Tested with `cardano-node 10.4.1` and `cardano-cli 10.8.0.0`.
 
+- Fix rotation log id consistency after restart by changing the rotation check to trigger only
+  when the number of persisted `StateChanged` events exceeds the configured `--persistence-rotate-after` threshold.
+  * This also prevents immediate rotation on startup when the threshold is set to 1.
+  * `Checkpoint` event ids now match the suffix of their preceding rotated log file and the last `StateChanged` event id within it,
+    preserving sequential order and making it easier to identify which rotated log file was used to compute each checkpoint.
+
+
 ## [0.22.2] - 2025.06.30
 
 * Fix wrong hydra-script-tx-ids in networks.json
diff --git a/docs/docs/dev/architecture/event-sourcing.md b/docs/docs/dev/architecture/event-sourcing.md
@@ -45,9 +45,12 @@ Event log rotation was introduced to improve recovery times by reducing the numb
 
 Only rotated log files are saved with an incrementing `logId` suffix in their names, while the main `state` log file remains unchanged to preserve backward compatibility. This `logId` suffix corresponds to the ID of the last event included in that file.
 Rotation can be enabled via the optional `--persistence-rotate-after` command-line argument, which specifies the number of events after which rotation should occur.
-> For example, with `--persistence-rotate-after 100`, you’ll get rotated files named: state-99, state-199, state-299, and so on, each containing 100 events. This is because event IDs start at 0.
+> For example, with `--persistence-rotate-after 100`, you’ll get rotated files named: state-100, state-200, state-300, and so on, each containing 101 events. This is because event IDs start at 0 and rotation only triggers once the number of events exceeds the threshold: state-100 holds 101 state changed events (ids 0–100) and no checkpoint, while each subsequent rotated file holds one checkpoint followed by 100 new state changed events.
 
-Note that, depending on the rotation configuration used, the current `state` file may already contain more events than the specified threshold, causing a rotation to occur immediately on startup before any new inputs are processed.
+Note that a checkpoint event id matches the last persisted event id from the previous rotated log file, preserving the sequential order of event ids across logs.
+This also makes it easier to identify which rotated log file was used to compute the checkpoint, as its event id matches the file name suffix.
+
+Depending on the rotation configuration used, the current `state` file may already contain more events than the specified threshold, causing a rotation to occur immediately on startup before any new inputs are processed.
 
 Upon rotation, a server output is produced to notify external agents when a checkpoint occurs, allowing them to perform archival or cleanup actions without interrupting the Hydra Head.
 
diff --git a/hydra-cluster/src/Hydra/Cluster/Options.hs b/hydra-cluster/src/Hydra/Cluster/Options.hs
@@ -10,14 +10,15 @@ import Hydra.Options (persistenceRotateAfterParser)
 import Hydra.Prelude
 import Options.Applicative (Parser, eitherReader, flag, flag', help, long, metavar, strOption)
 import Options.Applicative.Builder (option)
+import Test.QuickCheck (Positive)
 
 data Options = Options
   { knownNetwork :: Maybe KnownNetwork
   , stateDirectory :: Maybe FilePath
   , publishHydraScripts :: PublishOrReuse
   , useMithril :: UseMithril
   , scenario :: Scenario
-  , persistenceRotateAfter :: Maybe Natural
+  , persistenceRotateAfter :: Maybe (Positive Natural)
   }
   deriving stock (Show, Eq, Generic)
   deriving anyclass (ToJSON)
diff --git a/hydra-cluster/src/Hydra/Cluster/Scenarios.hs b/hydra-cluster/src/Hydra/Cluster/Scenarios.hs
@@ -154,7 +154,7 @@ import System.FilePath ((</>))
 import System.Process (callProcess)
 import Test.Hydra.Tx.Fixture (testNetworkId)
 import Test.Hydra.Tx.Gen (genDatum, genKeyPair, genTxOutWithReferenceScript)
-import Test.QuickCheck (choose, elements, generate)
+import Test.QuickCheck (Positive, choose, elements, generate)
 
 data EndToEndLog
   = ClusterOptions {options :: Options}
@@ -501,7 +501,7 @@ singlePartyOpenAHead ::
   FilePath ->
   backend ->
   [TxId] ->
-  Maybe Natural ->
+  Maybe (Positive Natural) ->
   -- | Continuation called when the head is open
   (HydraClient -> SigningKey PaymentKey -> HeadId -> IO a) ->
   IO a
diff --git a/hydra-cluster/test/Test/EndToEndSpec.hs b/hydra-cluster/test/Test/EndToEndSpec.hs
@@ -108,7 +108,7 @@ import System.FilePath ((</>))
 import Test.Hydra.Cluster.Utils (chainPointToSlot)
 import Test.Hydra.Tx.Fixture (testNetworkId)
 import Test.Hydra.Tx.Gen (genKeyPair, genUTxOFor)
-import Test.QuickCheck (generate)
+import Test.QuickCheck (Positive (..), generate)
 import Prelude qualified
 
 allNodeIds :: [Int]
@@ -206,7 +206,7 @@ spec = around (showLogsOnFailure "EndToEndSpec") $ do
 
         -- Measure restart after rotation
         options <- prepareHydraNode offlineConfig tmpDir 1 aliceSk [] [] id
-        let options' = options{persistenceRotateAfter = Just 10}
+        let options' = options{persistenceRotateAfter = Just (Positive 10)}
         t1 <- getCurrentTime
         diff2 <- withPreparedHydraNode (contramap FromHydraNode tracer) tmpDir 1 options' $ \_ -> do
           t2 <- getCurrentTime
diff --git a/hydra-node/src/Hydra/Events/Rotation.hs b/hydra-node/src/Hydra/Events/Rotation.hs
@@ -6,8 +6,9 @@ import Conduit (MonadUnliftIO, runConduit, runResourceT, (.|))
 import Control.Concurrent.Class.MonadSTM (modifyTVar', newTVarIO, readTVarIO, writeTVar)
 import Data.Conduit.Combinators qualified as C
 import Hydra.Events (EventId, EventSink (..), EventSource (..), HasEventId (..))
+import Test.QuickCheck (Positive (..))
 
-newtype RotationConfig = RotateAfter Natural
+newtype RotationConfig = RotateAfter (Positive Natural)
 
 type LogId = EventId
 
@@ -52,37 +53,42 @@ newRotatedEventStore config s0 aggregator checkpointer eventStore = do
         rotate = const . const $ pure ()
       }
  where
-  RotateAfter rotateAfterX = config
+  RotateAfter (Positive rotateAfterX) = config
 
   aggregateEvents (!n, !_evId, !acc) e = (n + 1, getEventId e, aggregator acc e)
 
   shouldRotate numberOfEventsV = do
     currentNumberOfEvents <- readTVarIO numberOfEventsV
-    pure $ currentNumberOfEvents >= rotateAfterX
+    -- since rotateAfterX can be any positive number (including 1),
+    -- we use (>) instead of (>=) to avoid triggering a rotation immediately after a checkpoint,
+    -- which would lead to an infinite loop
+    pure $ currentNumberOfEvents > rotateAfterX
 
   rotatedPutEvent numberOfEventsV aggregateStateV event = do
     putEvent event
     atomically $ do
       -- aggregate new state
       modifyTVar' aggregateStateV (`aggregator` event)
       -- bump numberOfEvents
-      numberOfEvents <- readTVar numberOfEventsV
-      let numberOfEvents' = numberOfEvents + 1
-      writeTVar numberOfEventsV numberOfEvents'
+      modifyTVar' numberOfEventsV (+ 1)
     -- check rotation
     whenM (shouldRotate numberOfEventsV) $ do
       let eventId = getEventId event
       rotateEventLog numberOfEventsV aggregateStateV eventId
 
   rotateEventLog numberOfEventsV aggregateStateV lastEventId = do
-    -- build checkpoint event
+    -- build the checkpoint event
     now <- getCurrentTime
     aggregateState <- readTVarIO aggregateStateV
-    let checkpoint = checkpointer aggregateState (lastEventId + 1) now
-    -- rotate with checkpoint
+    -- the checkpoint has the same event id as the last event persisted
+    let checkpoint = checkpointer aggregateState lastEventId now
+    -- the rotated log file name suffix (logId) matches the last event persisted,
+    -- while the checkpoint event is appended to the new (current) state log file
     rotate lastEventId checkpoint
-    -- clear numberOfEvents + bump logId
+    -- reset `numberOfEvents` to 1 because
+    -- the checkpoint event was just appended during rotation
+    -- and will be sourced from the event store on restart
     atomically $ do
-      writeTVar numberOfEventsV 0
+      writeTVar numberOfEventsV 1
 
   EventStore{eventSource, eventSink = EventSink{putEvent}, rotate} = eventStore
diff --git a/hydra-node/src/Hydra/Options.hs b/hydra-node/src/Hydra/Options.hs
@@ -79,7 +79,7 @@ import Options.Applicative (
  )
 import Options.Applicative.Builder (str)
 import Options.Applicative.Help (vsep)
-import Test.QuickCheck (elements, listOf, listOf1, oneof, vectorOf)
+import Test.QuickCheck (Positive (..), choose, elements, listOf, listOf1, oneof, vectorOf)
 
 data Command
   = Run RunOptions
@@ -194,7 +194,7 @@ data RunOptions = RunOptions
   , hydraSigningKey :: FilePath
   , hydraVerificationKeys :: [FilePath]
   , persistenceDir :: FilePath
-  , persistenceRotateAfter :: Maybe Natural
+  , persistenceRotateAfter :: Maybe (Positive Natural)
   , chainConfig :: ChainConfig
   , ledgerConfig :: LedgerConfig
   , whichEtcd :: WhichEtcd
@@ -203,6 +203,13 @@ data RunOptions = RunOptions
   deriving stock (Eq, Show, Generic)
   deriving anyclass (ToJSON, FromJSON)
 
+-- Orphan instances: 'Positive' is (de)serialised as its bare wrapped value; decoding rejects non-positive input.
+instance ToJSON a => ToJSON (Positive a) where
+  toJSON (Positive a) = toJSON a
+
+instance (FromJSON a, Num a, Ord a) => FromJSON (Positive a) where
+  parseJSON v = parseJSON v >>= \a -> if a > 0 then pure (Positive a) else fail "expected a positive number"
+
 -- Orphan instance
 instance Arbitrary IP where
   arbitrary = IPv4 . toIPv4w <$> arbitrary
@@ -223,7 +230,7 @@ instance Arbitrary RunOptions where
     hydraSigningKey <- genFilePath "sk"
     hydraVerificationKeys <- reasonablySized (listOf (genFilePath "vk"))
     persistenceDir <- genDirPath
-    persistenceRotateAfter <- arbitrary
+    persistenceRotateAfter <- oneof [pure Nothing, Just . Positive . fromInteger <$> choose (1, 100000)]
     chainConfig <- arbitrary
     ledgerConfig <- arbitrary
     whichEtcd <- arbitrary
@@ -852,15 +859,22 @@ persistenceDirParser =
           \Do not edit these files manually!"
     )
 
-persistenceRotateAfterParser :: Parser Natural
+persistenceRotateAfterParser :: Parser (Positive Natural)
 persistenceRotateAfterParser =
   option
-    auto
+    (eitherReader validateRotateAfter)
     ( long "persistence-rotate-after"
         <> metavar "NATURAL"
         <> help
-          "The number of Hydra events to trigger rotation (default: no rotation)"
+          "The number of Hydra events to trigger rotation (default: no rotation). \
+          \Note it must be a positive number."
     )
+ where -- parse and validate explicitly so that 0 is rejected with a clear message
+  validateRotateAfter :: String -> Either String (Positive Natural)
+  validateRotateAfter arg =
+    case readMaybe arg of
+      Just n | n > 0 -> Right (Positive n)
+      _ -> Left "--persistence-rotate-after must be a positive number"
 
 hydraNodeCommand :: ParserInfo Command
 hydraNodeCommand =
@@ -992,7 +1006,7 @@ toArgs
       <> concatMap toArgPeer peers
       <> maybe [] (\port -> ["--monitoring-port", show port]) monitoringPort
       <> ["--persistence-dir", persistenceDir]
-      <> maybe [] (\rotateAfter -> ["--persistence-rotate-after", show rotateAfter]) persistenceRotateAfter
+      <> maybe [] (\rotateAfter -> ["--persistence-rotate-after", showPositive rotateAfter]) persistenceRotateAfter
       <> argsChainConfig chainConfig
       <> argsLedgerConfig
       <> ["--api-transaction-timeout", show apiTransactionTimeout]
@@ -1063,6 +1077,9 @@ toArgs
       { cardanoLedgerProtocolParametersFile
       } = ledgerConfig
 
+    showPositive :: Show a => Positive a -> String
+    showPositive (Positive x) = show x
+
 toArgNodeSocket :: SocketPath -> [String]
 toArgNodeSocket nodeSocket = ["--node-socket", unFile nodeSocket]
 
diff --git a/hydra-node/test/Hydra/Events/RotationSpec.hs b/hydra-node/test/Hydra/Events/RotationSpec.hs
diff --git a/hydra-node/test/Hydra/OptionsSpec.hs b/hydra-node/test/Hydra/OptionsSpec.hs