Skip to content

Commit f843122

Browse files
authored
feat: refactor endpoint errors metric (#14)
1 parent db1142f commit f843122

File tree

4 files changed

+89
-0
lines changed

4 files changed

+89
-0
lines changed

pkg/exporters/drift/drift.go

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ import (
44
"context"
55
"fmt"
66
"github.com/01builders/ev-metrics/pkg/metrics"
7+
"github.com/01builders/ev-metrics/pkg/utils"
78
"github.com/ethereum/go-ethereum/ethclient"
89
"github.com/rs/zerolog"
910
"time"
@@ -53,9 +54,12 @@ func (e exporter) ExportMetrics(ctx context.Context, m *metrics.Metrics) error {
5354
refHeight, err := getBlockHeight(ctx, e.referenceNode)
5455
if err != nil {
5556
e.logger.Error().Err(err).Str("endpoint", e.referenceNode).Msg("failed to get reference node block height")
57+
m.RecordEndpointAvailability(e.chainID, e.referenceNode, false)
58+
m.RecordEndpointError(e.chainID, e.referenceNode, utils.CategorizeError(err))
5659
continue
5760
}
5861

62+
m.RecordEndpointAvailability(e.chainID, e.referenceNode, true)
5963
m.RecordReferenceBlockHeight(e.chainID, e.referenceNode, refHeight)
6064
e.logger.Info().Uint64("height", refHeight).Str("endpoint", e.referenceNode).Msg("recorded reference node height")
6165

@@ -64,9 +68,12 @@ func (e exporter) ExportMetrics(ctx context.Context, m *metrics.Metrics) error {
6468
currentHeight, err := getBlockHeight(ctx, fullNode)
6569
if err != nil {
6670
e.logger.Error().Err(err).Str("endpoint", fullNode).Msg("failed to get full node block height")
71+
m.RecordEndpointAvailability(e.chainID, fullNode, false)
72+
m.RecordEndpointError(e.chainID, fullNode, utils.CategorizeError(err))
6773
continue
6874
}
6975

76+
m.RecordEndpointAvailability(e.chainID, fullNode, true)
7077
m.RecordCurrentBlockHeight(e.chainID, fullNode, currentHeight)
7178
m.RecordBlockHeightDrift(e.chainID, fullNode, refHeight, currentHeight)
7279

pkg/exporters/jsonrpc/json_rpc.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ import (
44
"context"
55
"github.com/01builders/ev-metrics/internal/clients/evm"
66
"github.com/01builders/ev-metrics/pkg/metrics"
7+
"github.com/01builders/ev-metrics/pkg/utils"
78
"time"
89

910
"github.com/rs/zerolog"
@@ -67,9 +68,12 @@ func performHealthCheck(
6768
) error {
6869
duration, err := evmClient.HealthCheckRequest(ctx)
6970
if err != nil {
71+
m.RecordEndpointAvailability(chainID, "jsonrpc", false)
72+
m.RecordEndpointError(chainID, "jsonrpc", utils.CategorizeError(err))
7073
return err
7174
}
7275

76+
m.RecordEndpointAvailability(chainID, "jsonrpc", true)
7377
m.RecordJsonRpcRequestDuration(chainID, duration)
7478

7579
logger.Info().

pkg/metrics/metrics.go

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,10 @@ type Metrics struct {
3434
JsonRpcRequestDuration *prometheus.HistogramVec
3535
// JsonRpcRequestSloSeconds exports constant SLO thresholds for JSON-RPC requests.
3636
JsonRpcRequestSloSeconds *prometheus.GaugeVec
37+
// EndpointAvailability tracks whether an endpoint is reachable (1.0 = available, 0.0 = unavailable).
38+
EndpointAvailability *prometheus.GaugeVec
39+
// EndpointErrors tracks endpoint connection errors by type.
40+
EndpointErrors *prometheus.CounterVec
3741

3842
// internal tracking to ensure we only record increasing DA heights
3943
latestHeaderDaHeight uint64
@@ -164,6 +168,22 @@ func NewWithRegistry(namespace string, registerer prometheus.Registerer) *Metric
164168
},
165169
[]string{"chain_id", "percentile"},
166170
),
171+
EndpointAvailability: factory.NewGaugeVec(
172+
prometheus.GaugeOpts{
173+
Namespace: namespace,
174+
Name: "endpoint_availability",
175+
Help: "endpoint availability status (1.0 = available, 0.0 = unavailable)",
176+
},
177+
[]string{"chain_id", "endpoint"},
178+
),
179+
EndpointErrors: factory.NewCounterVec(
180+
prometheus.CounterOpts{
181+
Namespace: namespace,
182+
Name: "endpoint_errors_total",
183+
Help: "total number of endpoint connection errors by type",
184+
},
185+
[]string{"chain_id", "endpoint", "error_type"},
186+
),
167187
ranges: make(map[string][]*blockRange),
168188
lastBlockArrivalTime: make(map[string]time.Time),
169189
}
@@ -431,3 +451,18 @@ func (m *Metrics) InitializeJsonRpcSloThresholds(chainID string) {
431451
m.JsonRpcRequestSloSeconds.WithLabelValues(chainID, "p95").Set(0.4)
432452
m.JsonRpcRequestSloSeconds.WithLabelValues(chainID, "p99").Set(0.5)
433453
}
454+
455+
// RecordEndpointAvailability records whether an endpoint is reachable
456+
// available should be true if endpoint is reachable, false otherwise
457+
func (m *Metrics) RecordEndpointAvailability(chainID, endpoint string, available bool) {
458+
value := 0.0
459+
if available {
460+
value = 1.0
461+
}
462+
m.EndpointAvailability.WithLabelValues(chainID, endpoint).Set(value)
463+
}
464+
465+
// RecordEndpointError records an endpoint connection error with its type
466+
func (m *Metrics) RecordEndpointError(chainID, endpoint, errorType string) {
467+
m.EndpointErrors.WithLabelValues(chainID, endpoint, errorType).Inc()
468+
}

pkg/utils/errors.go

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
package utils
2+
3+
import (
	"context"
	"errors"
	"strings"
)
4+
5+
// CategorizeError maps err onto a small, fixed set of category names intended
// for use as a metrics label value.
//
// It returns "none" for a nil error and "unknown" when no case matches. Cases
// are evaluated top to bottom and the first match wins, so a message such as
// "failed to connect: connection refused" is reported as "connection_refused".
// Text matching is case-sensitive.
//
// Context cancellation and deadline expiry are recognised through the error
// chain (errors.Is) in addition to the message text, so they are still
// categorised correctly when a wrapping error does not repeat the sentinel's
// text in its own message.
func CategorizeError(err error) string {
	if err == nil {
		return "none"
	}

	msg := err.Error()

	switch {
	case strings.Contains(msg, "connection refused"):
		return "connection_refused"
	case strings.Contains(msg, "timeout"):
		return "timeout"
	case strings.Contains(msg, "no such host"):
		return "dns_error"
	case errors.Is(err, context.Canceled), strings.Contains(msg, "context canceled"):
		return "context_canceled"
	case errors.Is(err, context.DeadlineExceeded), strings.Contains(msg, "context deadline exceeded"):
		return "context_deadline"
	case strings.Contains(msg, "failed to connect"):
		return "connection_failed"
	case strings.Contains(msg, "failed to get block number"):
		return "rpc_error"
	case strings.Contains(msg, "unexpected status code"):
		return "http_error"
	case strings.Contains(msg, "failed to send request"):
		return "request_failed"
	case strings.Contains(msg, "failed to read response"):
		return "response_read_failed"
	case strings.Contains(msg, "failed to marshal"):
		return "marshal_error"
	case strings.Contains(msg, "failed to create request"):
		return "request_creation_failed"
	default:
		return "unknown"
	}
}

0 commit comments

Comments
 (0)