Skip to content
Open
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions internal/evm/client.go
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,11 @@ func NewClient(ctx context.Context, wsURL, rpcURL string, logger zerolog.Logger)
// }
//}

// GetRPCURL reports the RPC URL stored in the client's rpcURL field.
func (c *Client) GetRPCURL() string {
	endpoint := c.rpcURL
	return endpoint
}

// HealthCheckRequest performs a lightweight JSON-RPC health check and returns the RTT duration
func (c *Client) HealthCheckRequest(ctx context.Context) (time.Duration, error) {
// Create the JSON-RPC request for eth_blockNumber
Expand Down
7 changes: 4 additions & 3 deletions internal/jsonrpc/monitor.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,9 +24,6 @@ func Monitor(
Int("scrape_interval_seconds", scrapeInterval).
Msg("starting JSON-RPC health monitoring")

// Initialize SLO threshold gauges once at startup
m.InitializeJsonRpcSloThresholds(chainID)

ticker := time.NewTicker(time.Duration(scrapeInterval) * time.Second)
defer ticker.Stop()

Expand Down Expand Up @@ -57,9 +54,13 @@ func performHealthCheck(
) error {
duration, err := evmClient.HealthCheckRequest(ctx)
if err != nil {
// Record endpoint as unreachable
m.RecordEndpointAvailability(chainID, evmClient.GetRPCURL(), false)
return err
}

// Record endpoint as reachable
m.RecordEndpointAvailability(chainID, evmClient.GetRPCURL(), true)
m.RecordJsonRpcRequestDuration(chainID, duration)
Comment on lines 57 to 70

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

The logic for recording endpoint availability is present in both the success and failure paths of the health check, leading to some code repetition. You can make this more concise and adhere to the DRY (Don't Repeat Yourself) principle by recording the availability based on the error status before checking the error, and then returning if an error occurred.

Suggested change
if err != nil {
// Record endpoint as unreachable
m.RecordEndpointAvailability(chainID, evmClient.GetRPCURL(), false)
return err
}
// Record endpoint as reachable
m.RecordEndpointAvailability(chainID, evmClient.GetRPCURL(), true)
m.RecordJsonRpcRequestDuration(chainID, duration)
m.RecordEndpointAvailability(chainID, evmClient.GetRPCURL(), err == nil)
if err != nil {
return err
}
m.RecordJsonRpcRequestDuration(chainID, duration)


logger.Info().
Expand Down
50 changes: 36 additions & 14 deletions internal/metrics/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,10 +30,12 @@ type Metrics struct {
SubmissionDaHeight *prometheus.GaugeVec
// BlockTime tracks the time between consecutive blocks over a rolling window.
BlockTime *prometheus.SummaryVec
// JsonRpcRequestDuration tracks the duration of JSON-RPC requests to the EVM node.
// JsonRpcRequestDuration tracks the duration of JSON-RPC requests to the EVM node (histogram for detailed buckets).
JsonRpcRequestDuration *prometheus.HistogramVec
// JsonRpcRequestSloSeconds exports constant SLO thresholds for JSON-RPC requests.
JsonRpcRequestSloSeconds *prometheus.GaugeVec
// JsonRpcRequestDurationSummary tracks JSON-RPC request duration with percentiles over a rolling window.
JsonRpcRequestDurationSummary *prometheus.SummaryVec
// EndpointAvailability tracks whether an endpoint is reachable (1 = reachable, 0 = unreachable).
EndpointAvailability *prometheus.GaugeVec

// internal tracking to ensure we only record increasing DA heights
latestHeaderDaHeight uint64
Expand Down Expand Up @@ -156,13 +158,29 @@ func NewWithRegistry(namespace string, registerer prometheus.Registerer) *Metric
},
[]string{"chain_id"},
),
JsonRpcRequestSloSeconds: factory.NewGaugeVec(
JsonRpcRequestDurationSummary: factory.NewSummaryVec(
prometheus.SummaryOpts{
Namespace: namespace,
Name: "jsonrpc_request_duration_summary_seconds",
Help: "JSON-RPC request duration percentiles over rolling window",
Objectives: map[float64]float64{
0.5: 0.05, // p50 (median) ±5%
0.9: 0.01, // p90 ±1%
0.95: 0.01, // p95 ±1%
0.99: 0.001, // p99 ±0.1%
},
MaxAge: 2 * time.Minute, // rolling window of 2 minutes
AgeBuckets: 5,
},
[]string{"chain_id"},
),
EndpointAvailability: factory.NewGaugeVec(
prometheus.GaugeOpts{
Namespace: namespace,
Name: "jsonrpc_request_slo_seconds",
Help: "SLO thresholds for JSON-RPC request duration",
Name: "endpoint_availability",
Help: "endpoint availability status (1 = reachable, 0 = unreachable)",
},
[]string{"chain_id", "percentile"},
[]string{"chain_id", "endpoint"},
),
ranges: make(map[string][]*blockRange),
lastBlockArrivalTime: make(map[string]time.Time),
Expand Down Expand Up @@ -421,13 +439,17 @@ func (m *Metrics) RecordBlockTime(chainID string, arrivalTime time.Time) {

// RecordJsonRpcRequestDuration records the duration of a JSON-RPC request
func (m *Metrics) RecordJsonRpcRequestDuration(chainID string, duration time.Duration) {
m.JsonRpcRequestDuration.WithLabelValues(chainID).Observe(duration.Seconds())
seconds := duration.Seconds()
m.JsonRpcRequestDuration.WithLabelValues(chainID).Observe(seconds)
m.JsonRpcRequestDurationSummary.WithLabelValues(chainID).Observe(seconds)
}

// InitializeJsonRpcSloThresholds initializes the constant SLO threshold gauges
func (m *Metrics) InitializeJsonRpcSloThresholds(chainID string) {
m.JsonRpcRequestSloSeconds.WithLabelValues(chainID, "p50").Set(0.2)
m.JsonRpcRequestSloSeconds.WithLabelValues(chainID, "p90").Set(0.35)
m.JsonRpcRequestSloSeconds.WithLabelValues(chainID, "p95").Set(0.4)
m.JsonRpcRequestSloSeconds.WithLabelValues(chainID, "p99").Set(0.5)
// RecordEndpointAvailability publishes the reachability of an endpoint on the
// EndpointAvailability gauge for the given chainID/endpoint label pair.
// The gauge is set to 1 when available is true and to 0 otherwise.
//
// NOTE(review): endpoint is used verbatim as a label value; if callers pass a
// URL that embeds credentials it would be exposed through metrics — confirm.
func (m *Metrics) RecordEndpointAvailability(chainID, endpoint string, available bool) {
	gauge := m.EndpointAvailability.WithLabelValues(chainID, endpoint)
	if available {
		gauge.Set(1.0)
		return
	}
	gauge.Set(0.0)
}
2 changes: 2 additions & 0 deletions mise.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
[tools]
go = "latest"

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

Using go = "latest" can lead to non-reproducible builds. When a new version of Go is released, your build environment could change unexpectedly, potentially causing build failures or introducing subtle bugs. It is a best practice to pin dependencies and tool versions to ensure consistent builds across all environments. The go.mod file specifies version 1.24.6, so it would be best to align with that here.

Suggested change
go = "latest"
go = "1.24.6"