Skip to content

Commit 986d049

Browse files
authored
Merge pull request #6396 from IntersectMBO/mkarg/restart-prometheussimple
PrometheusSimple backend robustness improvements
2 parents 1b26ad0 + d67d0e3 commit 986d049

File tree

5 files changed

+104
-28
lines changed

5 files changed

+104
-28
lines changed

cardano-node/src/Cardano/Node/Tracing/API.hs

Lines changed: 16 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
{-# LANGUAGE BangPatterns #-}
12
{-# LANGUAGE FlexibleContexts #-}
23
{-# LANGUAGE MonoLocalBinds #-}
34
{-# LANGUAGE PackageImports #-}
@@ -11,7 +12,7 @@ module Cardano.Node.Tracing.API
1112
) where
1213

1314
import Cardano.Logging hiding (traceWith)
14-
import Cardano.Logging.Prometheus.TCPServer (runPrometheusSimple)
15+
import Cardano.Logging.Prometheus.TCPServer
1516
import Cardano.Node.Configuration.NodeAddress (PortNumber)
1617
import Cardano.Node.Configuration.POM (NodeConfiguration (..))
1718
import Cardano.Node.Protocol.Types
@@ -36,11 +37,12 @@ import Ouroboros.Network.NodeToNode (RemoteAddress)
3637

3738
import Prelude
3839

40+
import Control.Concurrent.Async (link)
3941
import Control.DeepSeq (deepseq)
4042
import Control.Exception (SomeException (..))
41-
import Control.Monad (forM_)
4243
import "contra-tracer" Control.Tracer (traceWith)
4344
import "trace-dispatcher" Control.Tracer (nullTracer)
45+
import Data.Functor.Contravariant ((>$<))
4446
import qualified Data.Map.Strict as Map
4547
import Data.Maybe
4648
import Data.Time.Clock (getCurrentTime)
@@ -83,10 +85,7 @@ initTraceDispatcher nc p networkMagic nodeKernel noBlockForging = do
8385

8486
traceWith (nodeStateTracer tracers) NodeTracingOnlineConfiguring
8587

86-
mError <- kickoffPrometheusSimple
87-
forM_ mError $ \errMsg ->
88-
let errMsg' = "PrometheusSimple backend disabled due to initialisation error: " ++ errMsg
89-
in traceWith (nodeStateTracer tracers) (NodeTracingFailure errMsg')
88+
kickoffPrometheusSimple
9089

9190
startResourceTracer
9291
(resourcesTracer tracers)
@@ -105,16 +104,14 @@ initTraceDispatcher nc p networkMagic nodeKernel noBlockForging = do
105104
mkTracers
106105
:: TraceConfig
107106
-> IO ( IO ()
108-
, IO (Maybe String)
107+
, IO ()
109108
, Tracers RemoteAddress LocalAddress blk IO
110109
)
111110
mkTracers trConfig = mdo
112111
ekgStore <- EKG.newStore
113112
EKG.registerGcMetrics ekgStore
114113
ekgTrace <- ekgTracer trConfig ekgStore
115114

116-
let kickoffPrometheusSimple = maybe (pure Nothing) (runPrometheusSimple ekgStore) prometheusSimple
117-
118115
stdoutTrace <- standardTracer
119116

120117
-- We should initialize forwarding only if 'Forwarder' backend
@@ -155,6 +152,16 @@ initTraceDispatcher nc p networkMagic nodeKernel noBlockForging = do
155152
dpTracer
156153
trConfig
157154
p
155+
156+
let
157+
kickoffPrometheusSimple = case prometheusSimple of
158+
Nothing -> pure ()
159+
Just ps ->
160+
let
161+
!nsTr = nodeStateTracer tracers
162+
!tracePrometheus = NodePrometheusSimple >$< nsTr
163+
in runPrometheusSimple tracePrometheus ekgStore ps >>= link
164+
158165
pure (kickoffForwarder, kickoffPrometheusSimple, tracers)
159166

160167
where

cardano-node/src/Cardano/Node/Tracing/StateRep.hs

Lines changed: 26 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ module Cardano.Node.Tracing.StateRep
2222
import Cardano.Api (textShow)
2323

2424
import Cardano.Logging
25+
import Cardano.Logging.Prometheus.TCPServer (TracePrometheusSimple (..))
2526
import Cardano.Node.Handlers.Shutdown (ShutdownTrace)
2627
import Cardano.Node.Protocol.Types (SomeConsensusProtocol (..))
2728
import qualified Cardano.Node.Startup as Startup
@@ -48,6 +49,11 @@ deriving instance ToJSON ChunkNo
4849

4950
deriving instance NFData ChunkNo
5051

52+
deriving instance Generic TracePrometheusSimple
53+
deriving instance FromJSON TracePrometheusSimple
54+
deriving instance ToJSON TracePrometheusSimple
55+
deriving instance NFData TracePrometheusSimple
56+
5157
data OpeningDbs
5258
= StartedOpeningImmutableDB
5359
| OpenedImmutableDB (WithOrigin SlotNo) ChunkNo
@@ -100,6 +106,7 @@ data NodeState
100106
= NodeTracingOnlineConfiguring
101107
| NodeTracingFailure String
102108
| NodeTracingForwardingInterrupted HowToConnect String
109+
| NodePrometheusSimple TracePrometheusSimple
103110
| NodeOpeningDbs OpeningDbs
104111
| NodeReplays Replays
105112
| NodeInitChainSelection InitChainSelection
@@ -112,7 +119,7 @@ data NodeState
112119
deriving instance (NFData NodeState)
113120

114121
instance LogFormatting NodeState where
115-
forMachine _ = \case
122+
forMachine _dtal = \case
116123
NodeTracingOnlineConfiguring -> mconcat
117124
[ "kind" .= String "NodeTracingOnlineConfiguring" ]
118125
NodeOpeningDbs x -> mconcat
@@ -136,12 +143,16 @@ instance LogFormatting NodeState where
136143
, "conn" .= howToConnect
137144
, "message" .= toJSON x
138145
]
146+
NodePrometheusSimple promSimple ->
147+
forMachine _dtal promSimple
139148

140149
forHuman = \case
141150
NodeTracingFailure errMsg ->
142151
T.pack errMsg
143152
NodeTracingForwardingInterrupted howToConnect errMsg ->
144153
T.pack $ "trace forwarding connection with " <> show howToConnect <> " failed: " <> errMsg
154+
NodePrometheusSimple promSimple ->
155+
forHuman promSimple
145156
_
146157
-> ""
147158

@@ -152,6 +163,10 @@ instance MetaTrace NodeState where
152163
Namespace [] ["NodeTracingFailure"]
153164
namespaceFor NodeTracingForwardingInterrupted {} =
154165
Namespace [] ["NodeTracingForwardingInterrupted"]
166+
namespaceFor (NodePrometheusSimple TracePrometheusSimpleStart{}) =
167+
Namespace [] ["PrometheusSimple", "Start"]
168+
namespaceFor (NodePrometheusSimple TracePrometheusSimpleStop{}) =
169+
Namespace [] ["PrometheusSimple", "Stop"]
155170
namespaceFor NodeOpeningDbs {} =
156171
Namespace [] ["OpeningDbs"]
157172
namespaceFor NodeReplays {} =
@@ -173,6 +188,10 @@ instance MetaTrace NodeState where
173188
Just Error
174189
severityFor (Namespace _ ["NodeTracingForwardingInterrupted"]) _ =
175190
Just Warning
191+
severityFor (Namespace _ ["PrometheusSimple", "Start"]) _ =
192+
Just Info
193+
severityFor (Namespace _ ["PrometheusSimple", "Stop"]) _ =
194+
Just Warning
176195
severityFor (Namespace _ ["OpeningDbs"]) _ =
177196
Just Info
178197
severityFor (Namespace _ ["NodeReplays"]) _ =
@@ -210,12 +229,18 @@ instance MetaTrace NodeState where
210229
"Node startup"
211230
documentFor (Namespace _ ["NodeShutdown"]) = Just
212231
"Node shutting down"
232+
documentFor (Namespace _ ["PrometheusSimple", "Start"]) =
233+
Just "PrometheusSimple backend is starting"
234+
documentFor (Namespace _ ["PrometheusSimple", "Stop"]) =
235+
Just "PrometheusSimple backend stopped"
213236
documentFor _ns = Nothing
214237

215238
allNamespaces = [
216239
Namespace [] ["NodeTracingOnlineConfiguring"]
217240
, Namespace [] ["NodeTracingFailure"]
218241
, Namespace [] ["NodeTracingForwardingInterrupted"]
242+
, Namespace [] ["PrometheusSimple", "Start"]
243+
, Namespace [] ["PrometheusSimple", "Stop"]
219244
, Namespace [] ["OpeningDbs"]
220245
, Namespace [] ["NodeReplays"]
221246
, Namespace [] ["NodeInitChainSelection"]

trace-dispatcher/CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
## 2.11.1 -- Dez 2025
44

5+
* Increase `PrometheusSimple` robustness by restarting the backend upon crash, adding start/stop traces and more eagerly reaping of dangling sockets
56
* Removed `TraceConfig.tcPeerFrequency` and hence `TraceOptionPeerFrequency` from config representation
67
* Removed unused module `Cardano.Logging.Types.NodePeers`
78

trace-dispatcher/src/Cardano/Logging/Prometheus/NetworkRun.hs

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -44,12 +44,12 @@ data NetworkRunParams = NetworkRunParams
4444

4545
defaultRunParams :: String -> NetworkRunParams
4646
defaultRunParams name = NetworkRunParams
47-
{ runSocketTimeout = 30
47+
{ runSocketTimeout = 22
4848
, runSocketGraceful = 1000
4949
, runRecvMaxSize = 2048
5050
, runRateLimit = 3.0
5151
, runConnLimitGlobal = 12
52-
, runConnLimitPerHost = 3
52+
, runConnLimitPerHost = 4
5353
, runServerName = name
5454
}
5555

@@ -68,8 +68,8 @@ mkTCPServerRunner
6868
:: NetworkRunParams
6969
-> Maybe HostName
7070
-> PortNumber
71-
-> TimeoutServer a
72-
-> IO (IO a)
71+
-> TimeoutServer ()
72+
-> IO (IO ())
7373
mkTCPServerRunner runParams (fromMaybe "127.0.0.1" -> host) portNo server = do
7474
!sock <- openTCPServerSocket =<< resolve host portNo
7575
let
@@ -81,8 +81,8 @@ mkTCPServerRunner runParams (fromMaybe "127.0.0.1" -> host) portNo server = do
8181
runTCPServerWithSocket
8282
:: NetworkRunParams
8383
-> Socket
84-
-> TimeoutServer a
85-
-> IO a
84+
-> TimeoutServer ()
85+
-> IO ()
8686
runTCPServerWithSocket runParams@NetworkRunParams{..} sock server = do
8787
rateLimiter <- mkRateLimiter runServerName runRateLimit
8888
ConnLimiter{..} <- mkConnLimiter runConnLimitGlobal runConnLimitPerHost
@@ -91,13 +91,13 @@ runTCPServerWithSocket runParams@NetworkRunParams{..} sock server = do
9191
E.bracketOnError (accept sock) (close . fst) $ \(conn, peer) -> do
9292
noLimitHit <- canServeThisPeer peer
9393
if noLimitHit
94-
then void $ forkFinally (server' mgr conn) (const $ gclose conn >> releasePeer peer)
94+
then void $ forkFinally (runServer mgr conn) (const $ gclose conn >> releasePeer peer)
9595
else close conn
9696
where
9797
gclose = if runSocketGraceful > 0 then flip gracefulClose runSocketGraceful else close
98-
server' mgr conn = do
98+
runServer mgr conn = do
9999
threadLabelMe $ runServerName ++ " timeout server"
100-
T.withHandle mgr (return ()) $ \timeoutHandle ->
100+
T.withHandleKillThread mgr (return ()) $ \timeoutHandle ->
101101
server runParams (T.tickle timeoutHandle) conn
102102

103103
resolve :: HostName -> PortNumber -> IO AddrInfo

trace-dispatcher/src/Cardano/Logging/Prometheus/TCPServer.hs

Lines changed: 52 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,20 +1,35 @@
1+
{-# LANGUAGE PackageImports #-}
2+
3+
{-# OPTIONS_GHC -Wno-partial-fields #-}
4+
15
-- | Run a simple Prometheus TCP server, responding *only* to the '/metrics' URL with current Node metrics
2-
module Cardano.Logging.Prometheus.TCPServer (runPrometheusSimple) where
6+
module Cardano.Logging.Prometheus.TCPServer
7+
( runPrometheusSimple
8+
, runPrometheusSimpleSilent
9+
10+
, TracePrometheusSimple (..)
11+
) where
312

413
import Cardano.Logging.Prometheus.Exposition (renderExpositionFromSample)
514
import Cardano.Logging.Prometheus.NetworkRun
15+
import Cardano.Logging.Types
16+
import Cardano.Logging.Utils (runInLoop, showT)
617

7-
import Control.Concurrent.Async (async, link)
18+
import Control.Concurrent.Async (Async, async)
819
import qualified Control.Exception as E
9-
import Control.Monad (when)
20+
import Control.Monad (join, when)
21+
import "contra-tracer" Control.Tracer
22+
import Data.Aeson.Types as AE (Value (String), (.=))
1023
import Data.ByteString (ByteString)
1124
import Data.ByteString.Builder
1225
import qualified Data.ByteString.Char8 as BC
1326
import Data.Int (Int64)
1427
import Data.List (find, intersperse)
28+
import Data.Text as TS (pack)
1529
import Data.Text.Lazy (Text)
1630
import qualified Data.Text.Lazy as T
1731
import qualified Data.Text.Lazy.Encoding as T (encodeUtf8Builder)
32+
import Data.Word (Word16)
1833
import Network.HTTP.Date (epochTimeToHTTPDate, formatHTTPDate)
1934
import Network.Socket (HostName, PortNumber)
2035
import qualified Network.Socket.ByteString as Strict (recv)
@@ -24,13 +39,41 @@ import System.Posix.Types (EpochTime)
2439
import System.PosixCompat.Time (epochTime)
2540

2641

27-
-- Will provide a 'Just errormessage' iff creating the Prometheus server failed
28-
runPrometheusSimple :: EKG.Store -> (Bool, Maybe HostName, PortNumber) -> IO (Maybe String)
29-
runPrometheusSimple ekgStore (noSuffixes, mHost, portNo) =
30-
E.try createRunner >>= \case
31-
Left (E.SomeException e) -> pure (Just $ E.displayException e)
32-
Right runner -> async runner >>= link >> pure Nothing
42+
data TracePrometheusSimple =
43+
TracePrometheusSimpleStart { port :: Word16 }
44+
| TracePrometheusSimpleStop { message :: String }
45+
deriving Show
46+
47+
instance LogFormatting TracePrometheusSimple where
48+
forMachine _ = \case
49+
TracePrometheusSimpleStart portNo -> mconcat
50+
[ "kind" .= AE.String "PrometheusSimpleStart"
51+
, "port" .= portNo
52+
]
53+
TracePrometheusSimpleStop message -> mconcat
54+
[ "kind" .= AE.String "TracePrometheusSimpleStop"
55+
, "message" .= message
56+
]
57+
58+
forHuman = \case
59+
TracePrometheusSimpleStart portNo -> "PrometheusSimple backend starting on port " <> showT portNo
60+
TracePrometheusSimpleStop message -> "PrometheusSimple backend stop: " <> TS.pack message
61+
62+
63+
-- Same as below, but will not trace anything
64+
runPrometheusSimpleSilent :: EKG.Store -> (Bool, Maybe HostName, PortNumber) -> IO (Async ())
65+
runPrometheusSimpleSilent = runPrometheusSimple nullTracer
66+
67+
-- Will retry / restart Prometheus server when an exception occurs, in increasing intervals
68+
runPrometheusSimple :: Tracer IO TracePrometheusSimple -> EKG.Store -> (Bool, Maybe HostName, PortNumber) -> IO (Async ())
69+
runPrometheusSimple tr ekgStore (noSuffixes, mHost, portNo) =
70+
async $ runInLoop fromScratchThrowing traceInterruption 1 60
3371
where
72+
traceInterruption (E.SomeException e) =
73+
traceWith tr $ TracePrometheusSimpleStop (E.displayException e)
74+
75+
fromScratchThrowing = traceWith tr (TracePrometheusSimpleStart $ fromIntegral portNo) >> join createRunner
76+
3477
getCurrentExposition = renderExpositionFromSample noSuffixes <$> sampleAll ekgStore
3578
createRunner = mkTCPServerRunner (defaultRunParams "PrometheusSimple") mHost portNo (serveAccepted getCurrentExposition)
3679

0 commit comments

Comments
 (0)