Skip to content

Commit 18c3cf2

Browse files
rpc health check (#88)
Dialing just establishes a TCP connection to the server, while an RPC call actually verifies the server is a functioning node by making a real request. This ticket tracks adding a quick RPC call to client creation to ensure the client we return to the user is usable. --------- Co-authored-by: Giorgio Gambino <[email protected]>
1 parent c5c5038 commit 18c3cf2

File tree

3 files changed

+102
-4
lines changed

3 files changed

+102
-4
lines changed

.changeset/poor-dryers-shine.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
---
2+
"chainlink-deployments-framework": minor
3+
---
4+
5+
Add an RPC client health check after a successful dial

deployment/multiclient.go

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,9 @@ const (
3030
RPCDefaultDialRetryAttempts = 1
3131
RPCDefaultDialRetryDelay = 1000 * time.Millisecond
3232
RPCDefaultDialTimeout = 10 * time.Second
33+
34+
// Default timeout for health checks
35+
RPCDefaultHealthCheckTimeout = 2 * time.Second
3336
)
3437

3538
type RetryConfig struct {
@@ -63,6 +66,20 @@ type MultiClient struct {
6366
chainName string
6467
}
6568

69+
// rpcHealthCheck performs a basic health check on the RPC client by calling eth_blockNumber
70+
func (mc *MultiClient) rpcHealthCheck(ctx context.Context, client *ethclient.Client) error {
71+
timeoutCtx, cancel := context.WithTimeout(ctx, RPCDefaultHealthCheckTimeout)
72+
defer cancel()
73+
74+
// Try to get the latest block number
75+
_, err := client.BlockNumber(timeoutCtx)
76+
if err != nil {
77+
return fmt.Errorf("health check failed: %w", err)
78+
}
79+
80+
return nil
81+
}
82+
6683
func NewMultiClient(lggr logger.Logger, rpcsCfg RPCConfig, opts ...func(client *MultiClient)) (*MultiClient, error) {
6784
if len(rpcsCfg.RPCs) == 0 {
6885
return nil, errors.New("no RPCs provided, need at least one")
@@ -85,6 +102,13 @@ func NewMultiClient(lggr logger.Logger, rpcsCfg RPCConfig, opts ...func(client *
85102
client, err := mc.dialWithRetry(rpc, lggr)
86103
if err != nil {
87104
lggr.Warnf("failed to dial client %d for RPC '%s' trying with the next one: %v", i, rpc.Name, err)
105+
106+
continue
107+
}
108+
if err := mc.rpcHealthCheck(context.Background(), client); err != nil {
109+
lggr.Warnf("health check failed for client %d for RPC '%s' trying with the next one: %v", i, rpc.Name, err)
110+
client.Close()
111+
88112
continue
89113
}
90114
clients = append(clients, client)

deployment/multiclient_test.go

Lines changed: 73 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@ package deployment
33
import (
44
"context"
55
"errors"
6+
"net/http"
7+
"net/http/httptest"
68
"testing"
79
"time"
810

@@ -13,14 +15,44 @@ import (
1315
"github.com/smartcontractkit/chainlink-common/pkg/logger"
1416
)
1517

18+
// newMockRPCServer starts an httptest server that answers every request,
// regardless of its method or body, with the same successful JSON-RPC payload
// (id 1, result "0x1"), i.e. a valid eth_blockNumber response.
//
// The caller is responsible for closing the returned server.
func newMockRPCServer(t *testing.T) *httptest.Server {
	t.Helper()

	const okBody = `{"jsonrpc":"2.0","id":1,"result":"0x1"}`

	return httptest.NewServer(http.HandlerFunc(func(rw http.ResponseWriter, _ *http.Request) {
		rw.Header().Set("Content-Type", "application/json")
		// The write result is deliberately ignored.
		_, _ = rw.Write([]byte(okBody))
	}))
}
30+
31+
// newBadRPCServer starts an httptest server that answers every request,
// regardless of its method or body, with the same JSON-RPC error payload
// (id 1, code -32000, message "internal error").
//
// The caller is responsible for closing the returned server.
func newBadRPCServer(t *testing.T) *httptest.Server {
	t.Helper()

	const errBody = `{"jsonrpc":"2.0","id":1,"error":{"code":-32000,"message":"internal error"}}`

	return httptest.NewServer(http.HandlerFunc(func(rw http.ResponseWriter, _ *http.Request) {
		rw.Header().Set("Content-Type", "application/json")
		// The write result is deliberately ignored.
		_, _ = rw.Write([]byte(errBody))
	}))
}
43+
1644
// TODO(giogam): This test is incomplete; it should be completed with support for WebSocket URLs.
1745
func TestMultiClient(t *testing.T) {
1846
t.Parallel()
47+
48+
mockSrv := newMockRPCServer(t)
49+
defer mockSrv.Close()
50+
1951
var (
2052
lggr = logger.Test(t)
2153
chainSelector uint64 = 16015286601757825753 // "ethereum-testnet-sepolia"
22-
wsURL = "ws://example.com"
23-
httpURL = "http://example.com"
54+
wsURL = "" // WS unused in this test
55+
httpURL = mockSrv.URL // use mock server for health-check
2456
)
2557

2658
// Expect defaults to be set if not provided.
@@ -43,13 +75,50 @@ func TestMultiClient(t *testing.T) {
4375

4476
// Expect second client to be set as backup.
4577
mc, err = NewMultiClient(lggr, RPCConfig{ChainSelector: chainSelector, RPCs: []RPC{
46-
{Name: "test-rpc", WSURL: wsURL, HTTPURL: httpURL, PreferredURLScheme: URLSchemePreferenceHTTP},
47-
{Name: "test-rpc", WSURL: wsURL, HTTPURL: httpURL, PreferredURLScheme: URLSchemePreferenceHTTP},
78+
{Name: "test-rpc", WSURL: wsURL, HTTPURL: httpURL, PreferredURLScheme: URLSchemePreferenceHTTP}, //preferred
79+
{Name: "test-rpc", WSURL: wsURL, HTTPURL: httpURL, PreferredURLScheme: URLSchemePreferenceHTTP}, //backup
4880
}})
4981
require.NoError(t, err)
5082
require.Len(t, mc.Backups, 1)
5183
}
5284

85+
// Verifies that a bad eth_blockNumber response causes MultiClient to skip the
86+
// first RPC and succeed with the next one.
87+
func TestMultiClient_healthCheckSkipsBadRPC(t *testing.T) {
88+
t.Parallel()
89+
90+
badSrv := newBadRPCServer(t)
91+
defer badSrv.Close()
92+
93+
goodSrv := newMockRPCServer(t)
94+
defer goodSrv.Close()
95+
96+
var (
97+
lggr = logger.Test(t)
98+
chainSelector uint64 = 16015286601757825753
99+
)
100+
101+
mc, err := NewMultiClient(lggr, RPCConfig{ChainSelector: chainSelector, RPCs: []RPC{
102+
// first RPC -> health-check fails
103+
{Name: "bad-rpc", WSURL: "", HTTPURL: badSrv.URL, PreferredURLScheme: URLSchemePreferenceHTTP},
104+
// second RPC -> health-check passes
105+
{Name: "good-rpc", WSURL: "", HTTPURL: goodSrv.URL, PreferredURLScheme: URLSchemePreferenceHTTP},
106+
}})
107+
require.NoError(t, err)
108+
109+
// Only the good RPC should remain (primary) and there should be no backups.
110+
require.NotNil(t, mc.Client)
111+
require.Empty(t, mc.Backups)
112+
113+
// Sanity-check: calling BlockNumber on the surviving client should succeed.
114+
ctx, cancel := context.WithTimeout(context.Background(), 1*time.Second)
115+
defer cancel()
116+
117+
blockNum, err := mc.BlockNumber(ctx)
118+
require.NoError(t, err)
119+
assert.Equal(t, uint64(1), blockNum)
120+
}
121+
53122
func TestMultiClient_dialWithRetry(t *testing.T) {
54123
t.Parallel()
55124

0 commit comments

Comments
 (0)