Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
59 changes: 59 additions & 0 deletions internal/drift/drift.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@ package drift
import (
"context"
"fmt"
"strings"

"github.com/01builders/ev-metrics/internal/metrics"
"github.com/ethereum/go-ethereum/ethclient"
"github.com/rs/zerolog"
Expand Down Expand Up @@ -31,22 +33,32 @@ func Monitor(ctx context.Context, m *metrics.Metrics, chainID string, referenceN
refHeight, err := getBlockHeight(ctx, referenceNode)
if err != nil {
logger.Error().Err(err).Str("endpoint", referenceNode).Msg("failed to get reference node block height")
// Record error for reference node
errorType := classifyError(err)
m.RecordEndpointError(chainID, referenceNode, errorType)
m.RecordEndpointAvailability(chainID, referenceNode, false)
continue
}

m.RecordReferenceBlockHeight(chainID, referenceNode, refHeight)
m.RecordEndpointAvailability(chainID, referenceNode, true)
logger.Info().Uint64("height", refHeight).Str("endpoint", referenceNode).Msg("recorded reference node height")

// get each full node height and calculate drift
for _, fullNode := range fullNodes {
currentHeight, err := getBlockHeight(ctx, fullNode)
if err != nil {
logger.Error().Err(err).Str("endpoint", fullNode).Msg("failed to get full node block height")
// Record error for full node
errorType := classifyError(err)
m.RecordEndpointError(chainID, fullNode, errorType)
m.RecordEndpointAvailability(chainID, fullNode, false)
continue
}

m.RecordCurrentBlockHeight(chainID, fullNode, currentHeight)
m.RecordBlockHeightDrift(chainID, fullNode, refHeight, currentHeight)
m.RecordEndpointAvailability(chainID, fullNode, true)

drift := int64(refHeight) - int64(currentHeight)
logger.Info().
Expand Down Expand Up @@ -75,3 +87,50 @@ func getBlockHeight(ctx context.Context, rpcURL string) (uint64, error) {

return height, nil
}

// classifyError categorizes errors into specific types for metrics
func classifyError(err error) string {
if err == nil {
return "none"
}

errStr := strings.ToLower(err.Error())

// Check for connection refused
if strings.Contains(errStr, "connection refused") {
return "connection_refused"
}

// Check for timeout errors
if strings.Contains(errStr, "timeout") || strings.Contains(errStr, "deadline exceeded") {
return "timeout"
}

// Check for DNS/host resolution errors
if strings.Contains(errStr, "no such host") || strings.Contains(errStr, "dns") {
return "dns_error"
}

// Check for network unreachable
if strings.Contains(errStr, "network is unreachable") || strings.Contains(errStr, "no route to host") {
return "network_unreachable"
}

// Check for HTTP status errors
if strings.Contains(errStr, "status code") {
return "http_error"
}

// Check for EOF errors
if strings.Contains(errStr, "eof") {
return "eof"
}

// Check for TLS/certificate errors
if strings.Contains(errStr, "tls") || strings.Contains(errStr, "certificate") {
return "tls_error"
}

// Default to unknown error type
return "unknown"
}
5 changes: 5 additions & 0 deletions internal/evm/client.go
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,11 @@ func NewClient(ctx context.Context, wsURL, rpcURL string, logger zerolog.Logger)
// }
//}

// GetRPCURL returns the RPC URL of the client
func (c *Client) GetRPCURL() string {
return c.rpcURL
}

// HealthCheckRequest performs a lightweight JSON-RPC health check and returns the RTT duration
func (c *Client) HealthCheckRequest(ctx context.Context) (time.Duration, error) {
// Create the JSON-RPC request for eth_blockNumber
Expand Down
60 changes: 57 additions & 3 deletions internal/jsonrpc/monitor.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ package jsonrpc

import (
"context"
"strings"
"time"

"github.com/01builders/ev-metrics/internal/evm"
Expand All @@ -24,9 +25,6 @@ func Monitor(
Int("scrape_interval_seconds", scrapeInterval).
Msg("starting JSON-RPC health monitoring")

// Initialize SLO threshold gauges once at startup
m.InitializeJsonRpcSloThresholds(chainID)

ticker := time.NewTicker(time.Duration(scrapeInterval) * time.Second)
defer ticker.Stop()

Expand Down Expand Up @@ -57,9 +55,18 @@ func performHealthCheck(
) error {
duration, err := evmClient.HealthCheckRequest(ctx)
if err != nil {
// Record endpoint as unreachable
m.RecordEndpointAvailability(chainID, evmClient.GetRPCURL(), false)

// Classify and record the error type
errorType := classifyError(err)
m.RecordEndpointError(chainID, evmClient.GetRPCURL(), errorType)

return err
}

// Record endpoint as reachable
m.RecordEndpointAvailability(chainID, evmClient.GetRPCURL(), true)
m.RecordJsonRpcRequestDuration(chainID, duration)

logger.Info().
Expand All @@ -69,3 +76,50 @@ func performHealthCheck(

return nil
}

// classifyError categorizes errors into specific types for metrics
func classifyError(err error) string {
if err == nil {
return "none"
}

errStr := strings.ToLower(err.Error())

// Check for connection refused
if strings.Contains(errStr, "connection refused") {
return "connection_refused"
}

// Check for timeout errors
if strings.Contains(errStr, "timeout") || strings.Contains(errStr, "deadline exceeded") {
return "timeout"
}

// Check for DNS/host resolution errors
if strings.Contains(errStr, "no such host") || strings.Contains(errStr, "dns") {
return "dns_error"
}

// Check for network unreachable
if strings.Contains(errStr, "network is unreachable") || strings.Contains(errStr, "no route to host") {
return "network_unreachable"
}

// Check for HTTP status errors
if strings.Contains(errStr, "status code") {
return "http_error"
}

// Check for EOF errors
if strings.Contains(errStr, "eof") {
return "eof"
}

// Check for TLS/certificate errors
if strings.Contains(errStr, "tls") || strings.Contains(errStr, "certificate") {
return "tls_error"
}

// Default to unknown error type
return "unknown"
}
Loading