From 1629daf7edb6a575b62fac3b1cfad72ac79682eb Mon Sep 17 00:00:00 2001 From: Pratik Patil Date: Tue, 10 Feb 2026 13:16:14 +0530 Subject: [PATCH 01/29] heimdall: added initial implementation of heimdall RPC fallback --- cmd/utils/bor_flags.go | 9 + consensus/bor/heimdall/failover_client.go | 231 ++++++++++++ .../bor/heimdall/failover_client_test.go | 340 ++++++++++++++++++ eth/ethconfig/config.go | 10 + eth/ethconfig/gen_config.go | 6 + internal/cli/server/config.go | 15 +- internal/cli/server/flags.go | 6 + internal/cli/server/testdata/default.toml | 1 + 8 files changed, 613 insertions(+), 5 deletions(-) create mode 100644 consensus/bor/heimdall/failover_client.go create mode 100644 consensus/bor/heimdall/failover_client_test.go diff --git a/cmd/utils/bor_flags.go b/cmd/utils/bor_flags.go index f5f719f79f..faa9219d32 100644 --- a/cmd/utils/bor_flags.go +++ b/cmd/utils/bor_flags.go @@ -23,6 +23,13 @@ var ( Value: "http://localhost:1317", } + // HeimdallSecondaryURLFlag flag for secondary heimdall url (failover) + HeimdallSecondaryURLFlag = &cli.StringFlag{ + Name: "bor.heimdall.secondary", + Usage: "URL of a secondary Heimdall service for failover", + Value: "", + } + // HeimdallTimeoutFlag flag for heimdall timeout HeimdallTimeoutFlag = &cli.DurationFlag{ Name: "bor.heimdalltimeout", @@ -71,6 +78,7 @@ var ( // BorFlags all bor related flags BorFlags = []cli.Flag{ HeimdallURLFlag, + HeimdallSecondaryURLFlag, HeimdallTimeoutFlag, WithoutHeimdallFlag, HeimdallgRPCAddressFlag, @@ -84,6 +92,7 @@ var ( // SetBorConfig sets bor config func SetBorConfig(ctx *cli.Context, cfg *eth.Config) { cfg.HeimdallURL = ctx.String(HeimdallURLFlag.Name) + cfg.HeimdallSecondaryURL = ctx.String(HeimdallSecondaryURLFlag.Name) cfg.HeimdallTimeout = ctx.Duration(HeimdallTimeoutFlag.Name) cfg.WithoutHeimdall = ctx.Bool(WithoutHeimdallFlag.Name) cfg.HeimdallgRPCAddress = ctx.String(HeimdallgRPCAddressFlag.Name) diff --git a/consensus/bor/heimdall/failover_client.go 
b/consensus/bor/heimdall/failover_client.go new file mode 100644 index 0000000000..cf41d47485 --- /dev/null +++ b/consensus/bor/heimdall/failover_client.go @@ -0,0 +1,231 @@ +package heimdall + +import ( + "context" + "errors" + "net" + "sync" + "time" + + "github.com/0xPolygon/heimdall-v2/x/bor/types" + ctypes "github.com/cometbft/cometbft/rpc/core/types" + + "github.com/ethereum/go-ethereum/consensus/bor/clerk" + "github.com/ethereum/go-ethereum/consensus/bor/heimdall/checkpoint" + "github.com/ethereum/go-ethereum/consensus/bor/heimdall/milestone" + "github.com/ethereum/go-ethereum/log" +) + +const ( + defaultAttemptTimeout = 30 * time.Second + defaultSecondaryCooldown = 5 * time.Minute +) + +// heimdallClient is a local interface matching bor.IHeimdallClient to avoid +// an import cycle with the consensus/bor package. +type heimdallClient interface { + StateSyncEvents(ctx context.Context, fromID uint64, to int64) ([]*clerk.EventRecordWithTime, error) + GetSpan(ctx context.Context, spanID uint64) (*types.Span, error) + GetLatestSpan(ctx context.Context) (*types.Span, error) + FetchCheckpoint(ctx context.Context, number int64) (*checkpoint.Checkpoint, error) + FetchCheckpointCount(ctx context.Context) (int64, error) + FetchMilestone(ctx context.Context) (*milestone.Milestone, error) + FetchMilestoneCount(ctx context.Context) (int64, error) + FetchStatus(ctx context.Context) (*ctypes.SyncInfo, error) + Close() +} + +// FailoverHeimdallClient wraps two heimdall clients (primary + secondary) and +// transparently fails over from primary to secondary when the primary is +// unreachable. After a cooldown period it probes the primary again. 
+type FailoverHeimdallClient struct { + clients [2]heimdallClient + mu sync.Mutex + active int // 0 = primary, 1 = secondary + lastSwitch time.Time // when we last switched to secondary + attemptTimeout time.Duration + cooldown time.Duration +} + +func NewFailoverHeimdallClient(primary, secondary heimdallClient) *FailoverHeimdallClient { + return &FailoverHeimdallClient{ + clients: [2]heimdallClient{primary, secondary}, + attemptTimeout: defaultAttemptTimeout, + cooldown: defaultSecondaryCooldown, + } +} + +func (f *FailoverHeimdallClient) StateSyncEvents(ctx context.Context, fromID uint64, to int64) ([]*clerk.EventRecordWithTime, error) { + return callWithFailover(f, ctx, func(ctx context.Context, c heimdallClient) ([]*clerk.EventRecordWithTime, error) { + return c.StateSyncEvents(ctx, fromID, to) + }) +} + +func (f *FailoverHeimdallClient) GetSpan(ctx context.Context, spanID uint64) (*types.Span, error) { + return callWithFailover(f, ctx, func(ctx context.Context, c heimdallClient) (*types.Span, error) { + return c.GetSpan(ctx, spanID) + }) +} + +func (f *FailoverHeimdallClient) GetLatestSpan(ctx context.Context) (*types.Span, error) { + return callWithFailover(f, ctx, func(ctx context.Context, c heimdallClient) (*types.Span, error) { + return c.GetLatestSpan(ctx) + }) +} + +func (f *FailoverHeimdallClient) FetchCheckpoint(ctx context.Context, number int64) (*checkpoint.Checkpoint, error) { + return callWithFailover(f, ctx, func(ctx context.Context, c heimdallClient) (*checkpoint.Checkpoint, error) { + return c.FetchCheckpoint(ctx, number) + }) +} + +func (f *FailoverHeimdallClient) FetchCheckpointCount(ctx context.Context) (int64, error) { + return callWithFailover(f, ctx, func(ctx context.Context, c heimdallClient) (int64, error) { + return c.FetchCheckpointCount(ctx) + }) +} + +func (f *FailoverHeimdallClient) FetchMilestone(ctx context.Context) (*milestone.Milestone, error) { + return callWithFailover(f, ctx, func(ctx context.Context, c heimdallClient) 
(*milestone.Milestone, error) { + return c.FetchMilestone(ctx) + }) +} + +func (f *FailoverHeimdallClient) FetchMilestoneCount(ctx context.Context) (int64, error) { + return callWithFailover(f, ctx, func(ctx context.Context, c heimdallClient) (int64, error) { + return c.FetchMilestoneCount(ctx) + }) +} + +func (f *FailoverHeimdallClient) FetchStatus(ctx context.Context) (*ctypes.SyncInfo, error) { + return callWithFailover(f, ctx, func(ctx context.Context, c heimdallClient) (*ctypes.SyncInfo, error) { + return c.FetchStatus(ctx) + }) +} + +func (f *FailoverHeimdallClient) Close() { + f.clients[0].Close() + f.clients[1].Close() +} + +// callWithFailover executes fn against the active client. If the active client +// is primary and the call fails with a failover-eligible error, it retries on +// the secondary. If on secondary past the cooldown, it probes the primary first. +func callWithFailover[T any](f *FailoverHeimdallClient, ctx context.Context, fn func(context.Context, heimdallClient) (T, error)) (T, error) { + f.mu.Lock() + active := f.active + shouldProbe := active == 1 && time.Since(f.lastSwitch) >= f.cooldown + f.mu.Unlock() + + // If on secondary and cooldown has elapsed, probe primary + if shouldProbe { + subCtx, cancel := context.WithTimeout(ctx, f.attemptTimeout) + result, err := fn(subCtx, f.clients[0]) + cancel() + + if err == nil { + f.mu.Lock() + f.active = 0 + f.mu.Unlock() + + log.Info("Heimdall failover: primary recovered, switching back") + + return result, nil + } + + if !isFailoverError(err, ctx) { + var zero T + return zero, err + } + + // Primary still down, stay on secondary + f.mu.Lock() + f.lastSwitch = time.Now() + f.mu.Unlock() + + log.Debug("Heimdall failover: primary still down after probe, staying on secondary", "err", err) + + return fn(ctx, f.clients[1]) + } + + if active == 1 { + // On secondary, not yet time to probe: use secondary directly + return fn(ctx, f.clients[1]) + } + + // Active is primary: try with timeout + subCtx, 
cancel := context.WithTimeout(ctx, f.attemptTimeout) + result, err := fn(subCtx, f.clients[0]) + cancel() + + if err == nil { + return result, nil + } + + if !isFailoverError(err, ctx) { + var zero T + return zero, err + } + + // Failover to secondary + f.mu.Lock() + f.active = 1 + f.lastSwitch = time.Now() + f.mu.Unlock() + + log.Warn("Heimdall failover: primary failed, switching to secondary", "err", err) + + return fn(ctx, f.clients[1]) +} + +// isFailoverError returns true if the error warrants trying the secondary. +// It distinguishes between sub-context timeouts (failover-eligible) and +// caller context cancellation (not eligible). +func isFailoverError(err error, callerCtx context.Context) bool { + if err == nil { + return false + } + + // If the caller's context is done, this is not a failover scenario + if callerCtx.Err() != nil { + return false + } + + // Shutdown detected - not a transport error + if errors.Is(err, ErrShutdownDetected) { + return false + } + + // 503 is a Heimdall feature-gate, not a transport issue + if errors.Is(err, ErrServiceUnavailable) { + return false + } + + // Transport errors + var netErr net.Error + if errors.As(err, &netErr) { + return true + } + + // No response from Heimdall + if errors.Is(err, ErrNoResponse) { + return true + } + + // Non-successful HTTP response (4xx, 5xx excluding 503) + if errors.Is(err, ErrNotSuccessfulResponse) { + return true + } + + // Sub-context deadline exceeded (the caller's context is still alive at this point) + if errors.Is(err, context.DeadlineExceeded) { + return true + } + + // Context canceled from sub-context (caller ctx is still alive) + if errors.Is(err, context.Canceled) { + return true + } + + return false +} diff --git a/consensus/bor/heimdall/failover_client_test.go b/consensus/bor/heimdall/failover_client_test.go new file mode 100644 index 0000000000..dc22325b18 --- /dev/null +++ b/consensus/bor/heimdall/failover_client_test.go @@ -0,0 +1,340 @@ +package heimdall + +import ( + 
"context" + "errors" + "fmt" + "net" + "net/http" + "net/http/httptest" + "sync/atomic" + "testing" + "time" + + "github.com/0xPolygon/heimdall-v2/x/bor/types" + ctypes "github.com/cometbft/cometbft/rpc/core/types" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + "github.com/ethereum/go-ethereum/consensus/bor/clerk" + "github.com/ethereum/go-ethereum/consensus/bor/heimdall/checkpoint" + "github.com/ethereum/go-ethereum/consensus/bor/heimdall/milestone" +) + +// mockHeimdallClient is a configurable mock implementing the heimdallClient interface. +type mockHeimdallClient struct { + getSpanFn func(ctx context.Context, spanID uint64) (*types.Span, error) + closeFn func() + hits atomic.Int32 +} + +func (m *mockHeimdallClient) StateSyncEvents(_ context.Context, _ uint64, _ int64) ([]*clerk.EventRecordWithTime, error) { + return nil, nil +} + +func (m *mockHeimdallClient) GetSpan(ctx context.Context, spanID uint64) (*types.Span, error) { + m.hits.Add(1) + + if m.getSpanFn != nil { + return m.getSpanFn(ctx, spanID) + } + + return &types.Span{Id: spanID}, nil +} + +func (m *mockHeimdallClient) GetLatestSpan(_ context.Context) (*types.Span, error) { + return nil, nil +} + +func (m *mockHeimdallClient) FetchCheckpoint(_ context.Context, _ int64) (*checkpoint.Checkpoint, error) { + return nil, nil +} + +func (m *mockHeimdallClient) FetchCheckpointCount(_ context.Context) (int64, error) { + return 0, nil +} + +func (m *mockHeimdallClient) FetchMilestone(_ context.Context) (*milestone.Milestone, error) { + return nil, nil +} + +func (m *mockHeimdallClient) FetchMilestoneCount(_ context.Context) (int64, error) { + return 0, nil +} + +func (m *mockHeimdallClient) FetchStatus(_ context.Context) (*ctypes.SyncInfo, error) { + return nil, nil +} + +func (m *mockHeimdallClient) Close() { + if m.closeFn != nil { + m.closeFn() + } +} + +func TestFailover_SwitchOnPrimaryDown(t *testing.T) { + primary := &mockHeimdallClient{ + getSpanFn: func(ctx 
context.Context, _ uint64) (*types.Span, error) { + // Simulate transport error + return nil, &net.OpError{Op: "dial", Net: "tcp", Err: errors.New("connection refused")} + }, + } + secondary := &mockHeimdallClient{} + + fc := NewFailoverHeimdallClient(primary, secondary) + fc.attemptTimeout = 100 * time.Millisecond + defer fc.Close() + + span, err := fc.GetSpan(context.Background(), 1) + require.NoError(t, err) + require.NotNil(t, span) + + assert.GreaterOrEqual(t, primary.hits.Load(), int32(1), "primary should have been tried") + assert.Equal(t, int32(1), secondary.hits.Load(), "secondary should have been called once") +} + +func TestFailover_NoSwitchOnContextCanceled(t *testing.T) { + primary := &mockHeimdallClient{ + getSpanFn: func(ctx context.Context, _ uint64) (*types.Span, error) { + // Block until context is cancelled + <-ctx.Done() + return nil, ctx.Err() + }, + } + secondary := &mockHeimdallClient{} + + fc := NewFailoverHeimdallClient(primary, secondary) + fc.attemptTimeout = 5 * time.Second // longer than caller's ctx + defer fc.Close() + + ctx, cancel := context.WithTimeout(context.Background(), 100*time.Millisecond) + defer cancel() + + _, err := fc.GetSpan(ctx, 1) + require.Error(t, err) + assert.Equal(t, int32(0), secondary.hits.Load(), "should not failover on caller context cancellation") +} + +func TestFailover_NoSwitchOnServiceUnavailable(t *testing.T) { + primary := &mockHeimdallClient{ + getSpanFn: func(_ context.Context, _ uint64) (*types.Span, error) { + return nil, ErrServiceUnavailable + }, + } + secondary := &mockHeimdallClient{} + + fc := NewFailoverHeimdallClient(primary, secondary) + fc.attemptTimeout = 100 * time.Millisecond + defer fc.Close() + + _, err := fc.GetSpan(context.Background(), 1) + require.Error(t, err) + assert.True(t, errors.Is(err, ErrServiceUnavailable)) + assert.Equal(t, int32(0), secondary.hits.Load(), "should not failover on 503") +} + +func TestFailover_NoSwitchOnShutdownDetected(t *testing.T) { + primary := 
&mockHeimdallClient{ + getSpanFn: func(_ context.Context, _ uint64) (*types.Span, error) { + return nil, ErrShutdownDetected + }, + } + secondary := &mockHeimdallClient{} + + fc := NewFailoverHeimdallClient(primary, secondary) + fc.attemptTimeout = 100 * time.Millisecond + defer fc.Close() + + _, err := fc.GetSpan(context.Background(), 1) + require.Error(t, err) + assert.True(t, errors.Is(err, ErrShutdownDetected)) + assert.Equal(t, int32(0), secondary.hits.Load(), "should not failover on shutdown") +} + +func TestFailover_StickyBehavior(t *testing.T) { + primary := &mockHeimdallClient{ + getSpanFn: func(_ context.Context, _ uint64) (*types.Span, error) { + return nil, &net.OpError{Op: "dial", Net: "tcp", Err: errors.New("connection refused")} + }, + } + secondary := &mockHeimdallClient{} + + fc := NewFailoverHeimdallClient(primary, secondary) + fc.attemptTimeout = 100 * time.Millisecond + fc.cooldown = 1 * time.Hour // very long cooldown + defer fc.Close() + + // First call triggers failover + _, err := fc.GetSpan(context.Background(), 1) + require.NoError(t, err) + + primaryBefore := primary.hits.Load() + secondaryBefore := secondary.hits.Load() + + // Subsequent calls should go directly to secondary without trying primary + for i := 0; i < 3; i++ { + _, err = fc.GetSpan(context.Background(), 1) + require.NoError(t, err) + } + + assert.Equal(t, primaryBefore, primary.hits.Load(), "primary should not be contacted while sticky") + assert.Equal(t, secondaryBefore+3, secondary.hits.Load(), "all calls should go to secondary") +} + +func TestFailover_ProbeBackToPrimary(t *testing.T) { + primaryDown := atomic.Bool{} + primaryDown.Store(true) + + primary := &mockHeimdallClient{ + getSpanFn: func(_ context.Context, spanID uint64) (*types.Span, error) { + if primaryDown.Load() { + return nil, &net.OpError{Op: "dial", Net: "tcp", Err: errors.New("connection refused")} + } + return &types.Span{Id: spanID}, nil + }, + } + secondary := &mockHeimdallClient{} + + fc := 
NewFailoverHeimdallClient(primary, secondary) + fc.attemptTimeout = 100 * time.Millisecond + fc.cooldown = 50 * time.Millisecond + defer fc.Close() + + // Trigger failover + _, err := fc.GetSpan(context.Background(), 1) + require.NoError(t, err) + + // Wait for cooldown to elapse + time.Sleep(100 * time.Millisecond) + + // Bring primary back + primaryDown.Store(false) + + primaryBefore := primary.hits.Load() + + // Next call should probe primary and succeed + _, err = fc.GetSpan(context.Background(), 1) + require.NoError(t, err) + assert.Greater(t, primary.hits.Load(), primaryBefore, "primary should have been probed") + + // Verify we're back on primary + secondaryBefore := secondary.hits.Load() + _, err = fc.GetSpan(context.Background(), 1) + require.NoError(t, err) + assert.Equal(t, secondaryBefore, secondary.hits.Load(), "should be back on primary now") +} + +func TestFailover_ProbeBackFails(t *testing.T) { + primary := &mockHeimdallClient{ + getSpanFn: func(_ context.Context, _ uint64) (*types.Span, error) { + return nil, &net.OpError{Op: "dial", Net: "tcp", Err: errors.New("connection refused")} + }, + } + secondary := &mockHeimdallClient{} + + fc := NewFailoverHeimdallClient(primary, secondary) + fc.attemptTimeout = 100 * time.Millisecond + fc.cooldown = 50 * time.Millisecond + defer fc.Close() + + // Trigger failover + _, err := fc.GetSpan(context.Background(), 1) + require.NoError(t, err) + + // Wait for cooldown + time.Sleep(100 * time.Millisecond) + + // Probe should fail, then fallback to secondary + secondaryBefore := secondary.hits.Load() + _, err = fc.GetSpan(context.Background(), 1) + require.NoError(t, err) + assert.Greater(t, secondary.hits.Load(), secondaryBefore, "should fall back to secondary after failed probe") +} + +func TestFailover_ClosesBothClients(t *testing.T) { + var primaryClosed, secondaryClosed atomic.Bool + + primary := &mockHeimdallClient{closeFn: func() { primaryClosed.Store(true) }} + secondary := &mockHeimdallClient{closeFn: 
func() { secondaryClosed.Store(true) }} + + fc := NewFailoverHeimdallClient(primary, secondary) + fc.Close() + + assert.True(t, primaryClosed.Load(), "primary should be closed") + assert.True(t, secondaryClosed.Load(), "secondary should be closed") +} + +func TestFailover_PassthroughWhenPrimaryHealthy(t *testing.T) { + primary := &mockHeimdallClient{} + secondary := &mockHeimdallClient{} + + fc := NewFailoverHeimdallClient(primary, secondary) + fc.attemptTimeout = 5 * time.Second + defer fc.Close() + + for i := 0; i < 5; i++ { + _, err := fc.GetSpan(context.Background(), 1) + require.NoError(t, err) + } + + assert.Equal(t, int32(5), primary.hits.Load(), "all calls should go to primary") + assert.Equal(t, int32(0), secondary.hits.Load(), "secondary should not be contacted") +} + +// Integration test using real HTTP servers to verify end-to-end behavior +func TestFailover_Integration_ServiceUnavailable(t *testing.T) { + primary := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(http.StatusServiceUnavailable) + })) + t.Cleanup(primary.Close) + + secondary := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(http.StatusOK) + })) + t.Cleanup(secondary.Close) + + primaryClient := NewHeimdallClient(primary.URL, 5*time.Second) + secondaryClient := NewHeimdallClient(secondary.URL, 5*time.Second) + + fc := NewFailoverHeimdallClient(primaryClient, secondaryClient) + fc.attemptTimeout = 2 * time.Second + defer fc.Close() + + ctx := WithRequestType(context.Background(), SpanRequest) + + // 503 should NOT trigger failover + _, err := fc.GetSpan(ctx, 1) + require.Error(t, err) + assert.True(t, errors.Is(err, ErrServiceUnavailable)) +} + +func TestIsFailoverError(t *testing.T) { + ctx := context.Background() + + // Transport errors should trigger failover + netErr := &net.OpError{Op: "dial", Net: "tcp", Err: errors.New("connection refused")} + assert.True(t, isFailoverError(netErr, 
ctx), "net.Error should trigger failover") + + // ErrNoResponse should trigger failover + assert.True(t, isFailoverError(ErrNoResponse, ctx), "ErrNoResponse should trigger failover") + + // ErrNotSuccessfulResponse should trigger failover + assert.True(t, isFailoverError(fmt.Errorf("wrapped: %w", ErrNotSuccessfulResponse), ctx), "ErrNotSuccessfulResponse should trigger failover") + + // DeadlineExceeded with live caller ctx should trigger failover + assert.True(t, isFailoverError(context.DeadlineExceeded, ctx), "DeadlineExceeded should trigger failover when caller ctx is alive") + + // ErrShutdownDetected should NOT trigger failover + assert.False(t, isFailoverError(ErrShutdownDetected, ctx), "ErrShutdownDetected should not trigger failover") + + // ErrServiceUnavailable should NOT trigger failover + assert.False(t, isFailoverError(ErrServiceUnavailable, ctx), "ErrServiceUnavailable should not trigger failover") + + // Caller context cancelled should NOT trigger failover + cancelledCtx, cancel := context.WithCancel(ctx) + cancel() + assert.False(t, isFailoverError(context.DeadlineExceeded, cancelledCtx), "should not failover when caller ctx is done") + + // nil error should not trigger failover + assert.False(t, isFailoverError(nil, ctx), "nil error should not trigger failover") +} diff --git a/eth/ethconfig/config.go b/eth/ethconfig/config.go index 3dd06f150d..0fd1f3f7db 100644 --- a/eth/ethconfig/config.go +++ b/eth/ethconfig/config.go @@ -208,6 +208,9 @@ type Config struct { // URL to connect to Heimdall node HeimdallURL string + // URL to connect to a secondary Heimdall node for failover + HeimdallSecondaryURL string + // timeout in heimdall requests HeimdallTimeout time.Duration @@ -340,6 +343,13 @@ func CreateConsensusEngine(chainConfig *params.ChainConfig, ethConfig *Config, d heimdallClient = heimdall.NewHeimdallClient(ethConfig.HeimdallURL, ethConfig.HeimdallTimeout) } + if ethConfig.HeimdallSecondaryURL != "" { + secondaryClient := 
heimdall.NewHeimdallClient(ethConfig.HeimdallSecondaryURL, ethConfig.HeimdallTimeout) + heimdallClient = heimdall.NewFailoverHeimdallClient(heimdallClient, secondaryClient) + + log.Info("Heimdall failover enabled", "primary", ethConfig.HeimdallURL, "secondary", ethConfig.HeimdallSecondaryURL) + } + var heimdallWSClient bor.IHeimdallWSClient var err error if ethConfig.HeimdallWSAddress != "" { diff --git a/eth/ethconfig/gen_config.go b/eth/ethconfig/gen_config.go index b1ba37d578..98ef6e3021 100644 --- a/eth/ethconfig/gen_config.go +++ b/eth/ethconfig/gen_config.go @@ -67,6 +67,7 @@ func (c Config) MarshalTOML() (interface{}, error) { RPCEVMTimeout time.Duration RPCTxFeeCap float64 HeimdallURL string + HeimdallSecondaryURL string HeimdallTimeout time.Duration WithoutHeimdall bool HeimdallgRPCAddress string @@ -136,6 +137,7 @@ func (c Config) MarshalTOML() (interface{}, error) { enc.RPCEVMTimeout = c.RPCEVMTimeout enc.RPCTxFeeCap = c.RPCTxFeeCap enc.HeimdallURL = c.HeimdallURL + enc.HeimdallSecondaryURL = c.HeimdallSecondaryURL enc.HeimdallTimeout = c.HeimdallTimeout enc.WithoutHeimdall = c.WithoutHeimdall enc.HeimdallgRPCAddress = c.HeimdallgRPCAddress @@ -213,6 +215,7 @@ func (c *Config) UnmarshalTOML(unmarshal func(interface{}) error) error { RPCEVMTimeout *time.Duration RPCTxFeeCap *float64 HeimdallURL *string + HeimdallSecondaryURL *string HeimdallTimeout *time.Duration WithoutHeimdall *bool HeimdallgRPCAddress *string @@ -373,6 +376,9 @@ func (c *Config) UnmarshalTOML(unmarshal func(interface{}) error) error { if dec.HeimdallURL != nil { c.HeimdallURL = *dec.HeimdallURL } + if dec.HeimdallSecondaryURL != nil { + c.HeimdallSecondaryURL = *dec.HeimdallSecondaryURL + } if dec.HeimdallTimeout != nil { c.HeimdallTimeout = *dec.HeimdallTimeout } diff --git a/internal/cli/server/config.go b/internal/cli/server/config.go index bb607ce9d1..1703fa350d 100644 --- a/internal/cli/server/config.go +++ b/internal/cli/server/config.go @@ -309,6 +309,9 @@ type HeimdallConfig 
struct { // URL is the url of the heimdall server URL string `hcl:"url,optional" toml:"url,optional"` + // SecondaryURL is the url of a secondary heimdall server used for failover + SecondaryURL string `hcl:"secondary-url,optional" toml:"secondary-url,optional"` + Timeout time.Duration `hcl:"timeout,optional" toml:"timeout,optional"` // Without is used to disable remote heimdall during testing @@ -802,11 +805,12 @@ func DefaultConfig() *Config { }, }, Heimdall: &HeimdallConfig{ - URL: "http://localhost:1317", - Timeout: 5 * time.Second, - Without: false, - GRPCAddress: "", - WSAddress: "", + URL: "http://localhost:1317", + SecondaryURL: "", + Timeout: 5 * time.Second, + Without: false, + GRPCAddress: "", + WSAddress: "", }, SyncMode: "full", GcMode: "full", @@ -1140,6 +1144,7 @@ func (c *Config) buildEth(stack *node.Node, accountManager *accounts.Manager) (* } n.HeimdallURL = c.Heimdall.URL + n.HeimdallSecondaryURL = c.Heimdall.SecondaryURL n.HeimdallTimeout = c.Heimdall.Timeout n.WithoutHeimdall = c.Heimdall.Without n.HeimdallgRPCAddress = c.Heimdall.GRPCAddress diff --git a/internal/cli/server/flags.go b/internal/cli/server/flags.go index a7ae19265c..dca6c0368f 100644 --- a/internal/cli/server/flags.go +++ b/internal/cli/server/flags.go @@ -179,6 +179,12 @@ func (c *Command) Flags(config *Config) *flagset.Flagset { Value: &c.cliConfig.Heimdall.URL, Default: c.cliConfig.Heimdall.URL, }) + f.StringFlag(&flagset.StringFlag{ + Name: "bor.heimdall.secondary", + Usage: "URL of a secondary Heimdall service for failover", + Value: &c.cliConfig.Heimdall.SecondaryURL, + Default: c.cliConfig.Heimdall.SecondaryURL, + }) f.DurationFlag(&flagset.DurationFlag{ Name: "bor.heimdalltimeout", Usage: "Timeout period for bor's outgoing requests to heimdall", diff --git a/internal/cli/server/testdata/default.toml b/internal/cli/server/testdata/default.toml index d3b00e5fcc..658bca960c 100644 --- a/internal/cli/server/testdata/default.toml +++ 
b/internal/cli/server/testdata/default.toml @@ -50,6 +50,7 @@ devfakeauthor = false [heimdall] url = "http://localhost:1317" + secondary-url = "" "bor.without" = false grpc-address = "" "bor.runheimdall" = false From fe0c07b329a09139d45831b8e2458a8dc8c41265 Mon Sep 17 00:00:00 2001 From: Pratik Patil Date: Wed, 11 Feb 2026 16:12:39 +0530 Subject: [PATCH 02/29] added comment for clarification --- consensus/bor/heimdall/failover_client.go | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/consensus/bor/heimdall/failover_client.go b/consensus/bor/heimdall/failover_client.go index cf41d47485..b4cb9e5bb2 100644 --- a/consensus/bor/heimdall/failover_client.go +++ b/consensus/bor/heimdall/failover_client.go @@ -145,6 +145,10 @@ func callWithFailover[T any](f *FailoverHeimdallClient, ctx context.Context, fn log.Debug("Heimdall failover: primary still down after probe, staying on secondary", "err", err) + // Secondary calls use the caller's ctx directly (no sub-timeout). + // The timeout is only needed on primary to bound the failover decision. + // Once on secondary there is no further fallback, so the caller's + // context (which always has a cancellation path in Bor) governs lifetime. 
return fn(ctx, f.clients[1]) }

From fe49be3e57f90a398b64efea240fd644cc82044c Mon Sep 17 00:00:00 2001 From: Pratik Patil Date: Wed, 11 Feb 2026 17:11:47 +0530 Subject: [PATCH 03/29] reduced the cooldown time to 2 minutes --- consensus/bor/heimdall/failover_client.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/consensus/bor/heimdall/failover_client.go b/consensus/bor/heimdall/failover_client.go index b4cb9e5bb2..d74b9c3e4a 100644 --- a/consensus/bor/heimdall/failover_client.go +++ b/consensus/bor/heimdall/failover_client.go @@ -18,7 +18,7 @@ import ( const ( defaultAttemptTimeout = 30 * time.Second - defaultSecondaryCooldown = 5 * time.Minute + defaultSecondaryCooldown = 2 * time.Minute ) // heimdallClient is a local interface matching bor.IHeimdallClient to avoid From 0c8c0a68aa1b64960371c1077461fa378f3e8c8a Mon Sep 17 00:00:00 2001 From: Pratik Patil Date: Wed, 11 Feb 2026 21:25:32 +0530 Subject: [PATCH 04/29] added more unit tests --- .../bor/heimdall/failover_client_test.go | 290 +++++++++++++++++- 1 file changed, 273 insertions(+), 17 deletions(-) diff --git a/consensus/bor/heimdall/failover_client_test.go b/consensus/bor/heimdall/failover_client_test.go index dc22325b18..1d99cad48d 100644 --- a/consensus/bor/heimdall/failover_client_test.go +++ b/consensus/bor/heimdall/failover_client_test.go @@ -23,13 +23,26 @@ import ( // mockHeimdallClient is a configurable mock implementing the heimdallClient interface.
type mockHeimdallClient struct { - getSpanFn func(ctx context.Context, spanID uint64) (*types.Span, error) - closeFn func() - hits atomic.Int32 + getSpanFn func(ctx context.Context, spanID uint64) (*types.Span, error) + getLatestSpanFn func(ctx context.Context) (*types.Span, error) + stateSyncEventsFn func(ctx context.Context, fromID uint64, to int64) ([]*clerk.EventRecordWithTime, error) + fetchCheckpointFn func(ctx context.Context, number int64) (*checkpoint.Checkpoint, error) + fetchCheckpointCntFn func(ctx context.Context) (int64, error) + fetchMilestoneFn func(ctx context.Context) (*milestone.Milestone, error) + fetchMilestoneCntFn func(ctx context.Context) (int64, error) + fetchStatusFn func(ctx context.Context) (*ctypes.SyncInfo, error) + closeFn func() + hits atomic.Int32 } -func (m *mockHeimdallClient) StateSyncEvents(_ context.Context, _ uint64, _ int64) ([]*clerk.EventRecordWithTime, error) { - return nil, nil +func (m *mockHeimdallClient) StateSyncEvents(ctx context.Context, fromID uint64, to int64) ([]*clerk.EventRecordWithTime, error) { + m.hits.Add(1) + + if m.stateSyncEventsFn != nil { + return m.stateSyncEventsFn(ctx, fromID, to) + } + + return []*clerk.EventRecordWithTime{}, nil } func (m *mockHeimdallClient) GetSpan(ctx context.Context, spanID uint64) (*types.Span, error) { @@ -42,28 +55,64 @@ func (m *mockHeimdallClient) GetSpan(ctx context.Context, spanID uint64) (*types return &types.Span{Id: spanID}, nil } -func (m *mockHeimdallClient) GetLatestSpan(_ context.Context) (*types.Span, error) { - return nil, nil +func (m *mockHeimdallClient) GetLatestSpan(ctx context.Context) (*types.Span, error) { + m.hits.Add(1) + + if m.getLatestSpanFn != nil { + return m.getLatestSpanFn(ctx) + } + + return &types.Span{Id: 99}, nil } -func (m *mockHeimdallClient) FetchCheckpoint(_ context.Context, _ int64) (*checkpoint.Checkpoint, error) { - return nil, nil +func (m *mockHeimdallClient) FetchCheckpoint(ctx context.Context, number int64) 
(*checkpoint.Checkpoint, error) { + m.hits.Add(1) + + if m.fetchCheckpointFn != nil { + return m.fetchCheckpointFn(ctx, number) + } + + return &checkpoint.Checkpoint{}, nil } -func (m *mockHeimdallClient) FetchCheckpointCount(_ context.Context) (int64, error) { - return 0, nil +func (m *mockHeimdallClient) FetchCheckpointCount(ctx context.Context) (int64, error) { + m.hits.Add(1) + + if m.fetchCheckpointCntFn != nil { + return m.fetchCheckpointCntFn(ctx) + } + + return 10, nil } -func (m *mockHeimdallClient) FetchMilestone(_ context.Context) (*milestone.Milestone, error) { - return nil, nil +func (m *mockHeimdallClient) FetchMilestone(ctx context.Context) (*milestone.Milestone, error) { + m.hits.Add(1) + + if m.fetchMilestoneFn != nil { + return m.fetchMilestoneFn(ctx) + } + + return &milestone.Milestone{}, nil } -func (m *mockHeimdallClient) FetchMilestoneCount(_ context.Context) (int64, error) { - return 0, nil +func (m *mockHeimdallClient) FetchMilestoneCount(ctx context.Context) (int64, error) { + m.hits.Add(1) + + if m.fetchMilestoneCntFn != nil { + return m.fetchMilestoneCntFn(ctx) + } + + return 5, nil } -func (m *mockHeimdallClient) FetchStatus(_ context.Context) (*ctypes.SyncInfo, error) { - return nil, nil +func (m *mockHeimdallClient) FetchStatus(ctx context.Context) (*ctypes.SyncInfo, error) { + m.hits.Add(1) + + if m.fetchStatusFn != nil { + return m.fetchStatusFn(ctx) + } + + return &ctypes.SyncInfo{}, nil } func (m *mockHeimdallClient) Close() { @@ -308,6 +357,210 @@ func TestFailover_Integration_ServiceUnavailable(t *testing.T) { assert.True(t, errors.Is(err, ErrServiceUnavailable)) } +func TestFailover_StateSyncEvents(t *testing.T) { + primary := &mockHeimdallClient{ + stateSyncEventsFn: func(_ context.Context, _ uint64, _ int64) ([]*clerk.EventRecordWithTime, error) { + return nil, &net.OpError{Op: "dial", Net: "tcp", Err: errors.New("connection refused")} + }, + } + secondary := &mockHeimdallClient{ + stateSyncEventsFn: func(_ context.Context, 
fromID uint64, to int64) ([]*clerk.EventRecordWithTime, error) { + return []*clerk.EventRecordWithTime{{EventRecord: clerk.EventRecord{ID: fromID}}}, nil + }, + } + + fc := NewFailoverHeimdallClient(primary, secondary) + fc.attemptTimeout = 100 * time.Millisecond + defer fc.Close() + + events, err := fc.StateSyncEvents(context.Background(), 42, 100) + require.NoError(t, err) + require.Len(t, events, 1) + assert.Equal(t, uint64(42), events[0].ID) + assert.Equal(t, int32(1), secondary.hits.Load()) +} + +func TestFailover_GetLatestSpan(t *testing.T) { + primary := &mockHeimdallClient{ + getLatestSpanFn: func(_ context.Context) (*types.Span, error) { + return nil, &net.OpError{Op: "dial", Net: "tcp", Err: errors.New("connection refused")} + }, + } + secondary := &mockHeimdallClient{ + getLatestSpanFn: func(_ context.Context) (*types.Span, error) { + return &types.Span{Id: 77}, nil + }, + } + + fc := NewFailoverHeimdallClient(primary, secondary) + fc.attemptTimeout = 100 * time.Millisecond + defer fc.Close() + + span, err := fc.GetLatestSpan(context.Background()) + require.NoError(t, err) + assert.Equal(t, uint64(77), span.Id) + assert.Equal(t, int32(1), secondary.hits.Load()) +} + +func TestFailover_FetchCheckpoint(t *testing.T) { + primary := &mockHeimdallClient{ + fetchCheckpointFn: func(_ context.Context, _ int64) (*checkpoint.Checkpoint, error) { + return nil, &net.OpError{Op: "dial", Net: "tcp", Err: errors.New("connection refused")} + }, + } + secondary := &mockHeimdallClient{} + + fc := NewFailoverHeimdallClient(primary, secondary) + fc.attemptTimeout = 100 * time.Millisecond + defer fc.Close() + + cp, err := fc.FetchCheckpoint(context.Background(), 5) + require.NoError(t, err) + require.NotNil(t, cp) + assert.Equal(t, int32(1), secondary.hits.Load()) +} + +func TestFailover_FetchCheckpointCount(t *testing.T) { + primary := &mockHeimdallClient{ + fetchCheckpointCntFn: func(_ context.Context) (int64, error) { + return 0, &net.OpError{Op: "dial", Net: "tcp", Err: 
errors.New("connection refused")} + }, + } + secondary := &mockHeimdallClient{} + + fc := NewFailoverHeimdallClient(primary, secondary) + fc.attemptTimeout = 100 * time.Millisecond + defer fc.Close() + + count, err := fc.FetchCheckpointCount(context.Background()) + require.NoError(t, err) + assert.Equal(t, int64(10), count) + assert.Equal(t, int32(1), secondary.hits.Load()) +} + +func TestFailover_FetchMilestone(t *testing.T) { + primary := &mockHeimdallClient{ + fetchMilestoneFn: func(_ context.Context) (*milestone.Milestone, error) { + return nil, &net.OpError{Op: "dial", Net: "tcp", Err: errors.New("connection refused")} + }, + } + secondary := &mockHeimdallClient{} + + fc := NewFailoverHeimdallClient(primary, secondary) + fc.attemptTimeout = 100 * time.Millisecond + defer fc.Close() + + ms, err := fc.FetchMilestone(context.Background()) + require.NoError(t, err) + require.NotNil(t, ms) + assert.Equal(t, int32(1), secondary.hits.Load()) +} + +func TestFailover_FetchMilestoneCount(t *testing.T) { + primary := &mockHeimdallClient{ + fetchMilestoneCntFn: func(_ context.Context) (int64, error) { + return 0, &net.OpError{Op: "dial", Net: "tcp", Err: errors.New("connection refused")} + }, + } + secondary := &mockHeimdallClient{} + + fc := NewFailoverHeimdallClient(primary, secondary) + fc.attemptTimeout = 100 * time.Millisecond + defer fc.Close() + + count, err := fc.FetchMilestoneCount(context.Background()) + require.NoError(t, err) + assert.Equal(t, int64(5), count) + assert.Equal(t, int32(1), secondary.hits.Load()) +} + +func TestFailover_FetchStatus(t *testing.T) { + primary := &mockHeimdallClient{ + fetchStatusFn: func(_ context.Context) (*ctypes.SyncInfo, error) { + return nil, &net.OpError{Op: "dial", Net: "tcp", Err: errors.New("connection refused")} + }, + } + secondary := &mockHeimdallClient{} + + fc := NewFailoverHeimdallClient(primary, secondary) + fc.attemptTimeout = 100 * time.Millisecond + defer fc.Close() + + status, err := 
fc.FetchStatus(context.Background()) + require.NoError(t, err) + require.NotNil(t, status) + assert.Equal(t, int32(1), secondary.hits.Load()) +} + +func TestFailover_ProbeBackNonFailoverError(t *testing.T) { + primary := &mockHeimdallClient{ + getSpanFn: func(_ context.Context, _ uint64) (*types.Span, error) { + return nil, ErrShutdownDetected + }, + } + secondary := &mockHeimdallClient{} + + fc := NewFailoverHeimdallClient(primary, secondary) + fc.attemptTimeout = 100 * time.Millisecond + fc.cooldown = 50 * time.Millisecond + defer fc.Close() + + // Force onto secondary + fc.mu.Lock() + fc.active = 1 + fc.lastSwitch = time.Now().Add(-time.Hour) // cooldown already elapsed + fc.mu.Unlock() + + // Probe primary → gets ErrShutdownDetected (non-failover error) + // Should return the error directly, NOT fall back to secondary + secondaryBefore := secondary.hits.Load() + _, err := fc.GetSpan(context.Background(), 1) + require.Error(t, err) + assert.True(t, errors.Is(err, ErrShutdownDetected)) + assert.Equal(t, secondaryBefore, secondary.hits.Load(), "should not fall back to secondary on non-failover error during probe") +} + +func TestFailover_SwitchOnPrimaryDeadlineExceeded(t *testing.T) { + primary := &mockHeimdallClient{ + getSpanFn: func(ctx context.Context, _ uint64) (*types.Span, error) { + // Block until the sub-context deadline expires + <-ctx.Done() + return nil, ctx.Err() + }, + } + secondary := &mockHeimdallClient{} + + fc := NewFailoverHeimdallClient(primary, secondary) + fc.attemptTimeout = 100 * time.Millisecond + defer fc.Close() + + span, err := fc.GetSpan(context.Background(), 1) + require.NoError(t, err) + require.NotNil(t, span) + assert.Equal(t, int32(1), primary.hits.Load(), "primary should have been tried") + assert.Equal(t, int32(1), secondary.hits.Load(), "should failover on sub-context deadline exceeded") +} + +func TestFailover_SwitchOnPrimaryContextCanceled(t *testing.T) { + primary := &mockHeimdallClient{ + getSpanFn: func(_ context.Context, 
_ uint64) (*types.Span, error) { + // Return context.Canceled as if a sub-context was canceled + return nil, context.Canceled + }, + } + secondary := &mockHeimdallClient{} + + fc := NewFailoverHeimdallClient(primary, secondary) + fc.attemptTimeout = 100 * time.Millisecond + defer fc.Close() + + span, err := fc.GetSpan(context.Background(), 1) + require.NoError(t, err) + require.NotNil(t, span) + assert.Equal(t, int32(1), primary.hits.Load(), "primary should have been tried") + assert.Equal(t, int32(1), secondary.hits.Load(), "should failover on sub-context canceled") +} + func TestIsFailoverError(t *testing.T) { ctx := context.Background() @@ -324,6 +577,9 @@ func TestIsFailoverError(t *testing.T) { // DeadlineExceeded with live caller ctx should trigger failover assert.True(t, isFailoverError(context.DeadlineExceeded, ctx), "DeadlineExceeded should trigger failover when caller ctx is alive") + // Canceled with live caller ctx should trigger failover (sub-context was canceled, not the caller) + assert.True(t, isFailoverError(context.Canceled, ctx), "Canceled should trigger failover when caller ctx is alive") + // ErrShutdownDetected should NOT trigger failover assert.False(t, isFailoverError(ErrShutdownDetected, ctx), "ErrShutdownDetected should not trigger failover") From 93bd0e6225a6fac345ae135eeb4e7f29a3d6220f Mon Sep 17 00:00:00 2001 From: Pratik Patil Date: Wed, 11 Feb 2026 21:35:16 +0530 Subject: [PATCH 05/29] lint and duplication fix --- .../bor/heimdall/failover_client_test.go | 85 +++++++++---------- 1 file changed, 41 insertions(+), 44 deletions(-) diff --git a/consensus/bor/heimdall/failover_client_test.go b/consensus/bor/heimdall/failover_client_test.go index 1d99cad48d..3a4cf08489 100644 --- a/consensus/bor/heimdall/failover_client_test.go +++ b/consensus/bor/heimdall/failover_client_test.go @@ -23,16 +23,16 @@ import ( // mockHeimdallClient is a configurable mock implementing the heimdallClient interface. 
type mockHeimdallClient struct { - getSpanFn func(ctx context.Context, spanID uint64) (*types.Span, error) - getLatestSpanFn func(ctx context.Context) (*types.Span, error) - stateSyncEventsFn func(ctx context.Context, fromID uint64, to int64) ([]*clerk.EventRecordWithTime, error) - fetchCheckpointFn func(ctx context.Context, number int64) (*checkpoint.Checkpoint, error) + getSpanFn func(ctx context.Context, spanID uint64) (*types.Span, error) + getLatestSpanFn func(ctx context.Context) (*types.Span, error) + stateSyncEventsFn func(ctx context.Context, fromID uint64, to int64) ([]*clerk.EventRecordWithTime, error) + fetchCheckpointFn func(ctx context.Context, number int64) (*checkpoint.Checkpoint, error) fetchCheckpointCntFn func(ctx context.Context) (int64, error) - fetchMilestoneFn func(ctx context.Context) (*milestone.Milestone, error) - fetchMilestoneCntFn func(ctx context.Context) (int64, error) - fetchStatusFn func(ctx context.Context) (*ctypes.SyncInfo, error) - closeFn func() - hits atomic.Int32 + fetchMilestoneFn func(ctx context.Context) (*milestone.Milestone, error) + fetchMilestoneCntFn func(ctx context.Context) (int64, error) + fetchStatusFn func(ctx context.Context) (*ctypes.SyncInfo, error) + closeFn func() + hits atomic.Int32 } func (m *mockHeimdallClient) StateSyncEvents(ctx context.Context, fromID uint64, to int64) ([]*clerk.EventRecordWithTime, error) { @@ -520,45 +520,42 @@ func TestFailover_ProbeBackNonFailoverError(t *testing.T) { assert.Equal(t, secondaryBefore, secondary.hits.Load(), "should not fall back to secondary on non-failover error during probe") } -func TestFailover_SwitchOnPrimaryDeadlineExceeded(t *testing.T) { - primary := &mockHeimdallClient{ - getSpanFn: func(ctx context.Context, _ uint64) (*types.Span, error) { - // Block until the sub-context deadline expires - <-ctx.Done() - return nil, ctx.Err() +func TestFailover_SwitchOnPrimarySubContextError(t *testing.T) { + tests := []struct { + name string + primaryFn func(ctx 
context.Context, _ uint64) (*types.Span, error) + }{ + { + name: "DeadlineExceeded", + primaryFn: func(ctx context.Context, _ uint64) (*types.Span, error) { + <-ctx.Done() + return nil, ctx.Err() + }, }, - } - secondary := &mockHeimdallClient{} - - fc := NewFailoverHeimdallClient(primary, secondary) - fc.attemptTimeout = 100 * time.Millisecond - defer fc.Close() - - span, err := fc.GetSpan(context.Background(), 1) - require.NoError(t, err) - require.NotNil(t, span) - assert.Equal(t, int32(1), primary.hits.Load(), "primary should have been tried") - assert.Equal(t, int32(1), secondary.hits.Load(), "should failover on sub-context deadline exceeded") -} - -func TestFailover_SwitchOnPrimaryContextCanceled(t *testing.T) { - primary := &mockHeimdallClient{ - getSpanFn: func(_ context.Context, _ uint64) (*types.Span, error) { - // Return context.Canceled as if a sub-context was canceled - return nil, context.Canceled + { + name: "Canceled", + primaryFn: func(_ context.Context, _ uint64) (*types.Span, error) { + return nil, context.Canceled + }, }, } - secondary := &mockHeimdallClient{} - fc := NewFailoverHeimdallClient(primary, secondary) - fc.attemptTimeout = 100 * time.Millisecond - defer fc.Close() - - span, err := fc.GetSpan(context.Background(), 1) - require.NoError(t, err) - require.NotNil(t, span) - assert.Equal(t, int32(1), primary.hits.Load(), "primary should have been tried") - assert.Equal(t, int32(1), secondary.hits.Load(), "should failover on sub-context canceled") + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + primary := &mockHeimdallClient{getSpanFn: tt.primaryFn} + secondary := &mockHeimdallClient{} + + fc := NewFailoverHeimdallClient(primary, secondary) + fc.attemptTimeout = 100 * time.Millisecond + defer fc.Close() + + span, err := fc.GetSpan(context.Background(), 1) + require.NoError(t, err) + require.NotNil(t, span) + assert.Equal(t, int32(1), primary.hits.Load(), "primary should have been tried") + assert.Equal(t, int32(1), 
secondary.hits.Load(), "should failover on sub-context error") + }) + } } func TestIsFailoverError(t *testing.T) { From a02d07f6e17e77df60a5a01da5bef0ca9ed00534 Mon Sep 17 00:00:00 2001 From: Pratik Patil Date: Wed, 11 Feb 2026 22:15:52 +0530 Subject: [PATCH 06/29] 1 more unit test --- eth/ethconfig/config_test.go | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/eth/ethconfig/config_test.go b/eth/ethconfig/config_test.go index b85431d12d..1329830fbf 100644 --- a/eth/ethconfig/config_test.go +++ b/eth/ethconfig/config_test.go @@ -10,6 +10,7 @@ import ( ctypes "github.com/cometbft/cometbft/rpc/core/types" "github.com/ethereum/go-ethereum/consensus/bor" "github.com/ethereum/go-ethereum/consensus/bor/clerk" + "github.com/ethereum/go-ethereum/consensus/bor/heimdall" "github.com/ethereum/go-ethereum/consensus/bor/heimdall/checkpoint" "github.com/ethereum/go-ethereum/consensus/bor/heimdall/milestone" "github.com/ethereum/go-ethereum/core/rawdb" @@ -88,6 +89,24 @@ func TestCreateConsensusEngine_OverrideHeimdallClient(t *testing.T) { require.True(t, ok, "Expected Bor consensus engine") } +func TestCreateConsensusEngine_HeimdallSecondaryURL(t *testing.T) { + t.Parallel() + ethConfig := &Config{ + OverrideHeimdallClient: &mockHeimdallClient{}, + HeimdallSecondaryURL: "http://secondary:1317", + } + + engine, err := CreateConsensusEngine(newTestBorChainConfig(), ethConfig, rawdb.NewMemoryDatabase(), nil) + require.NoError(t, err) + defer engine.Close() + + borEngine, ok := engine.(*bor.Bor) + require.True(t, ok, "Expected Bor consensus engine") + + _, ok = borEngine.HeimdallClient.(*heimdall.FailoverHeimdallClient) + require.True(t, ok, "Expected HeimdallClient to be wrapped in FailoverHeimdallClient") +} + func TestCreateConsensusEngine_WithoutHeimdall(t *testing.T) { t.Parallel() ethConfig := &Config{WithoutHeimdall: true} From 8a9d2f7c1fb9c72176be23ee9e908c519742f999 Mon Sep 17 00:00:00 2001 From: Pratik Patil Date: Thu, 12 Feb 2026 13:54:00 +0530 
Subject: [PATCH 07/29] added failover for heimdall grpc and ws clients --- cmd/utils/bor_flags.go | 18 ++ consensus/bor/heimdallws/client.go | 93 +++++-- consensus/bor/heimdallws/client_test.go | 306 ++++++++++++++++++++++ eth/ethconfig/config.go | 51 +++- eth/ethconfig/gen_config.go | 12 + internal/cli/server/config.go | 22 +- internal/cli/server/flags.go | 12 + internal/cli/server/testdata/default.toml | 3 + 8 files changed, 487 insertions(+), 30 deletions(-) create mode 100644 consensus/bor/heimdallws/client_test.go diff --git a/cmd/utils/bor_flags.go b/cmd/utils/bor_flags.go index faa9219d32..f8b1224e0b 100644 --- a/cmd/utils/bor_flags.go +++ b/cmd/utils/bor_flags.go @@ -50,6 +50,13 @@ var ( Value: "", } + // HeimdallgRPCSecondaryAddressFlag flag for secondary heimdall gRPC address (failover) + HeimdallgRPCSecondaryAddressFlag = &cli.StringFlag{ + Name: "bor.heimdallgRPC.secondary", + Usage: "Address of a secondary Heimdall gRPC service for failover", + Value: "", + } + // HeimdallWSAddressFlag flag for heimdall websocket subscription service HeimdallWSAddressFlag = &cli.StringFlag{ Name: "bor.heimdallWS", @@ -57,6 +64,13 @@ var ( Value: "", } + // HeimdallWSSecondaryAddressFlag flag for secondary heimdall WS address (failover) + HeimdallWSSecondaryAddressFlag = &cli.StringFlag{ + Name: "bor.heimdallWS.secondary", + Usage: "Address of a secondary Heimdall WS Subscription service for failover", + Value: "", + } + // RunHeimdallFlag flag for running heimdall internally from bor RunHeimdallFlag = &cli.BoolFlag{ Name: "bor.runheimdall", @@ -82,7 +96,9 @@ var ( HeimdallTimeoutFlag, WithoutHeimdallFlag, HeimdallgRPCAddressFlag, + HeimdallgRPCSecondaryAddressFlag, HeimdallWSAddressFlag, + HeimdallWSSecondaryAddressFlag, RunHeimdallFlag, RunHeimdallArgsFlag, UseHeimdallAppFlag, @@ -96,7 +112,9 @@ func SetBorConfig(ctx *cli.Context, cfg *eth.Config) { cfg.HeimdallTimeout = ctx.Duration(HeimdallTimeoutFlag.Name) cfg.WithoutHeimdall = ctx.Bool(WithoutHeimdallFlag.Name) 
cfg.HeimdallgRPCAddress = ctx.String(HeimdallgRPCAddressFlag.Name) + cfg.HeimdallgRPCSecondaryAddress = ctx.String(HeimdallgRPCSecondaryAddressFlag.Name) cfg.HeimdallWSAddress = ctx.String(HeimdallWSAddressFlag.Name) + cfg.HeimdallWSSecondaryAddress = ctx.String(HeimdallWSSecondaryAddressFlag.Name) cfg.RunHeimdall = ctx.Bool(RunHeimdallFlag.Name) cfg.RunHeimdallArgs = ctx.String(RunHeimdallArgsFlag.Name) cfg.UseHeimdallApp = ctx.Bool(UseHeimdallAppFlag.Name) diff --git a/consensus/bor/heimdallws/client.go b/consensus/bor/heimdallws/client.go index 2428f289b5..d69ed6cc54 100644 --- a/consensus/bor/heimdallws/client.go +++ b/consensus/bor/heimdallws/client.go @@ -14,22 +14,52 @@ import ( "github.com/ethereum/go-ethereum/log" ) -// HeimdallWSClient represents a websocket client with auto-reconnection. +const ( + // defaultPrimaryAttempts is the number of consecutive failures on the primary URL + // before switching to the secondary (~30s at 10s/attempt). + defaultPrimaryAttempts = 3 + + // defaultReconnectDelay is the backoff between reconnection attempts. + defaultReconnectDelay = 10 * time.Second + + // defaultWSCooldown is how long to stay on secondary before probing primary again. + defaultWSCooldown = 2 * time.Minute +) + +// HeimdallWSClient represents a websocket client with auto-reconnection and failover support. 
type HeimdallWSClient struct { - conn *websocket.Conn - url string // store the URL for reconnection - events chan *milestone.Milestone - done chan struct{} - mu sync.Mutex + conn *websocket.Conn + urls []string // primary at [0], secondary at [1] (if configured) + activeURL int // index into urls + events chan *milestone.Milestone + done chan struct{} + mu sync.Mutex + + // lastFailover tracks when the client last switched to secondary + lastFailover time.Time + + // Configurable parameters (defaults set in constructor, overridable for testing) + primaryAttempts int + reconnectDelay time.Duration + wsCooldown time.Duration } -// NewHeimdallWSClient creates a new WS client for Heimdall. -func NewHeimdallWSClient(url string) (*HeimdallWSClient, error) { +// NewHeimdallWSClient creates a new WS client for Heimdall with optional failover. +// If secondaryURL is empty, the client operates with a single URL (existing behavior). +func NewHeimdallWSClient(primaryURL string, secondaryURL string) (*HeimdallWSClient, error) { + urls := []string{primaryURL} + if secondaryURL != "" { + urls = append(urls, secondaryURL) + } + return &HeimdallWSClient{ - conn: nil, - url: url, - events: make(chan *milestone.Milestone), - done: make(chan struct{}), + conn: nil, + urls: urls, + events: make(chan *milestone.Milestone), + done: make(chan struct{}), + primaryAttempts: defaultPrimaryAttempts, + reconnectDelay: defaultReconnectDelay, + wsCooldown: defaultWSCooldown, }, nil } @@ -43,16 +73,18 @@ func (c *HeimdallWSClient) SubscribeMilestoneEvents(ctx context.Context) <-chan return c.events } -// retry until subscribe +// tryUntilSubscribeMilestoneEvents retries connecting and subscribing until success, +// with failover to secondary URL after defaultPrimaryAttempts failures on primary. 
func (c *HeimdallWSClient) tryUntilSubscribeMilestoneEvents(ctx context.Context) { + primaryAttempts := 0 firstTime := true for { if !firstTime { - time.Sleep(10 * time.Second) + time.Sleep(c.reconnectDelay) } firstTime = false - // Check for context cancellation. + // Check for context cancellation or unsubscribe. select { case <-ctx.Done(): log.Info("Context cancelled during reconnection") @@ -63,9 +95,32 @@ func (c *HeimdallWSClient) tryUntilSubscribeMilestoneEvents(ctx context.Context) default: } - conn, _, err := websocket.DefaultDialer.Dial(c.url, nil) + // If on secondary and cooldown has elapsed, probe primary first. + if c.activeURL == 1 && !c.lastFailover.IsZero() && time.Since(c.lastFailover) >= c.wsCooldown { + log.Info("WS cooldown elapsed, probing primary", "url", c.urls[0]) + c.activeURL = 0 + primaryAttempts = 0 + } + + url := c.urls[c.activeURL] + + conn, _, err := websocket.DefaultDialer.Dial(url, nil) if err != nil { - log.Error("failed to dial websocket on heimdall ws subscription", "err", err) + log.Error("failed to dial websocket on heimdall ws subscription", "url", url, "err", err) + + // Count failures on primary; switch to secondary after threshold. 
+ if c.activeURL == 0 { + primaryAttempts++ + + if len(c.urls) > 1 && primaryAttempts >= c.primaryAttempts { + log.Warn("Primary WS failed, switching to secondary", + "primary", c.urls[0], "secondary", c.urls[1], "attempts", primaryAttempts) + c.activeURL = 1 + c.lastFailover = time.Now() + primaryAttempts = 0 + } + } + continue } c.mu.Lock() @@ -81,10 +136,10 @@ func (c *HeimdallWSClient) tryUntilSubscribeMilestoneEvents(ctx context.Context) req.Params.Query = "tm.event='NewBlock' AND milestone.number>0" if err := c.conn.WriteJSON(req); err != nil { - log.Error("failed to send subscription request on heimdall ws subscription", "err", err) + log.Error("failed to send subscription request on heimdall ws subscription", "url", url, "err", err) continue } - log.Info("Successfully connected on heimdall ws subscription") + log.Info("successfully connected on heimdall ws subscription", "url", url) return } } diff --git a/consensus/bor/heimdallws/client_test.go b/consensus/bor/heimdallws/client_test.go new file mode 100644 index 0000000000..15b3e964fd --- /dev/null +++ b/consensus/bor/heimdallws/client_test.go @@ -0,0 +1,306 @@ +package heimdallws + +import ( + "context" + "encoding/json" + "net/http" + "net/http/httptest" + "strings" + "testing" + "time" + + "github.com/gorilla/websocket" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +var upgrader = websocket.Upgrader{ + CheckOrigin: func(r *http.Request) bool { return true }, +} + +// newTestWSServer creates a test WS server that accepts connections and sends a subscription ack. +// If reject is true, the server closes connections immediately. 
+func newTestWSServer(t *testing.T, reject bool) *httptest.Server { + t.Helper() + + return httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + if reject { + w.WriteHeader(http.StatusServiceUnavailable) + return + } + + conn, err := upgrader.Upgrade(w, r, nil) + if err != nil { + t.Logf("upgrade error: %v", err) + return + } + defer conn.Close() + + // Read the subscription request. + _, _, err = conn.ReadMessage() + if err != nil { + return + } + + // Send a simple ack (not a milestone, just keeps connection alive). + ack := map[string]interface{}{ + "jsonrpc": "2.0", + "id": 0, + "result": map[string]interface{}{}, + } + + if err := conn.WriteJSON(ack); err != nil { + return + } + + // Keep the connection open until client disconnects. + for { + if _, _, err := conn.ReadMessage(); err != nil { + return + } + } + })) +} + +// newTestWSServerWithMilestone creates a test WS server that sends a milestone event after connection. +func newTestWSServerWithMilestone(t *testing.T) *httptest.Server { + t.Helper() + + return httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + conn, err := upgrader.Upgrade(w, r, nil) + if err != nil { + t.Logf("upgrade error: %v", err) + return + } + defer conn.Close() + + // Read the subscription request. + _, _, err = conn.ReadMessage() + if err != nil { + return + } + + // Send a milestone event. 
+ resp := wsResponse{ + JSONRPC: "2.0", + ID: 0, + Result: wsResult{ + Query: "tm.event='NewBlock' AND milestone.number>0", + Data: wsData{ + Type: "tendermint/event/NewBlock", + Value: wsValue{ + FinalizeBlock: finalizeBlock{ + Events: []wsEvent{ + { + Type: "milestone", + Attributes: []attribute{ + {Key: "proposer", Value: "0x0000000000000000000000000000000000000001"}, + {Key: "hash", Value: "0x0000000000000000000000000000000000000000000000000000000000000002"}, + {Key: "start_block", Value: "100"}, + {Key: "end_block", Value: "200"}, + {Key: "bor_chain_id", Value: "137"}, + {Key: "milestone_id", Value: "test-1"}, + {Key: "timestamp", Value: "1000"}, + {Key: "total_difficulty", Value: "500"}, + }, + }, + }, + }, + }, + }, + }, + } + + data, _ := json.Marshal(resp) + if err := conn.WriteMessage(websocket.TextMessage, data); err != nil { + return + } + + // Keep connection open. + for { + if _, _, err := conn.ReadMessage(); err != nil { + return + } + } + })) +} + +func wsURL(httpURL string) string { + return "ws" + strings.TrimPrefix(httpURL, "http") +} + +func TestWSClient_ConstructorSingleURL(t *testing.T) { + client, err := NewHeimdallWSClient("ws://localhost:1234", "") + require.NoError(t, err) + assert.Len(t, client.urls, 1) + assert.Equal(t, "ws://localhost:1234", client.urls[0]) + assert.Equal(t, 0, client.activeURL) +} + +func TestWSClient_ConstructorDualURL(t *testing.T) { + client, err := NewHeimdallWSClient("ws://primary:1234", "ws://secondary:5678") + require.NoError(t, err) + assert.Len(t, client.urls, 2) + assert.Equal(t, "ws://primary:1234", client.urls[0]) + assert.Equal(t, "ws://secondary:5678", client.urls[1]) + assert.Equal(t, 0, client.activeURL) +} + +func TestWSClient_SingleURL_ConnectsSuccessfully(t *testing.T) { + server := newTestWSServerWithMilestone(t) + defer server.Close() + + client, err := NewHeimdallWSClient(wsURL(server.URL), "") + require.NoError(t, err) + + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) + 
defer cancel() + + events := client.SubscribeMilestoneEvents(ctx) + + select { + case m := <-events: + require.NotNil(t, m) + assert.Equal(t, uint64(100), m.StartBlock) + assert.Equal(t, uint64(200), m.EndBlock) + assert.Equal(t, "137", m.BorChainID) + assert.Equal(t, "test-1", m.MilestoneID) + case <-ctx.Done(): + t.Fatal("timed out waiting for milestone event") + } + + require.NoError(t, client.Unsubscribe(ctx)) +} + +func TestWSClient_DualURL_FailoverToSecondary(t *testing.T) { + // Primary always rejects. + primary := newTestWSServer(t, true) + defer primary.Close() + + // Secondary accepts and sends a milestone. + secondary := newTestWSServerWithMilestone(t) + defer secondary.Close() + + client, err := NewHeimdallWSClient(wsURL(primary.URL), wsURL(secondary.URL)) + require.NoError(t, err) + + // Speed up test by reducing reconnect delay and attempts. + client.reconnectDelay = 100 * time.Millisecond + client.primaryAttempts = 2 + + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) + defer cancel() + + events := client.SubscribeMilestoneEvents(ctx) + + select { + case m := <-events: + require.NotNil(t, m) + assert.Equal(t, uint64(100), m.StartBlock) + assert.Equal(t, uint64(200), m.EndBlock) + // Verify we switched to secondary. + assert.Equal(t, 1, client.activeURL) + case <-ctx.Done(): + t.Fatal("timed out waiting for milestone event via failover") + } + + require.NoError(t, client.Unsubscribe(ctx)) +} + +func TestWSClient_ContextCancellation(t *testing.T) { + // Both URLs reject — client should respect context cancellation. + primary := newTestWSServer(t, true) + defer primary.Close() + + secondary := newTestWSServer(t, true) + defer secondary.Close() + + client, err := NewHeimdallWSClient(wsURL(primary.URL), wsURL(secondary.URL)) + require.NoError(t, err) + + client.reconnectDelay = 100 * time.Millisecond + + ctx, cancel := context.WithCancel(context.Background()) + + // Cancel after a short delay. 
+ go func() { + time.Sleep(300 * time.Millisecond) + cancel() + }() + + // tryUntilSubscribeMilestoneEvents should return without blocking forever. + client.tryUntilSubscribeMilestoneEvents(ctx) + + // Verify context was cancelled. + assert.Error(t, ctx.Err()) +} + +func TestWSClient_DualURL_ProbeBackToPrimary(t *testing.T) { + // Test that after cooldown, the reconnection loop probes primary first. + primary := newTestWSServer(t, true) + defer primary.Close() + + secondary := newTestWSServer(t, true) + defer secondary.Close() + + client, err := NewHeimdallWSClient(wsURL(primary.URL), wsURL(secondary.URL)) + require.NoError(t, err) + + client.reconnectDelay = 100 * time.Millisecond + client.wsCooldown = 50 * time.Millisecond + + // Simulate being on secondary after failover with cooldown elapsed. + client.activeURL = 1 + client.lastFailover = time.Now().Add(-1 * time.Second) + + // Short-lived context — the function will probe primary (reset activeURL=0), + // fail to dial, then context expires. + ctx, cancel := context.WithTimeout(context.Background(), 500*time.Millisecond) + defer cancel() + + client.tryUntilSubscribeMilestoneEvents(ctx) + + // After cooldown elapsed, activeURL should be reset to 0 (probed primary). + assert.Equal(t, 0, client.activeURL) +} + +func TestWSClient_DualURL_PrimaryRecovery(t *testing.T) { + // Start with primary down, then bring it up. + + // Primary starts rejecting. + primaryReject := newTestWSServer(t, true) + + // Secondary accepts with milestone. + secondary := newTestWSServerWithMilestone(t) + defer secondary.Close() + + client, err := NewHeimdallWSClient(wsURL(primaryReject.URL), wsURL(secondary.URL)) + require.NoError(t, err) + + client.reconnectDelay = 100 * time.Millisecond + client.primaryAttempts = 2 + + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) + defer cancel() + + events := client.SubscribeMilestoneEvents(ctx) + + // Should failover to secondary. 
+ select { + case m := <-events: + require.NotNil(t, m) + assert.Equal(t, 1, client.activeURL) + assert.Equal(t, uint64(100), m.StartBlock) + case <-ctx.Done(): + t.Fatal("timed out waiting for failover") + } + + // The fact that failover worked and lastFailover is set + // proves the probe-back mechanism can work later. + assert.False(t, client.lastFailover.IsZero(), "lastFailover should be set after switching to secondary") + + // Close the rejecting primary. + primaryReject.Close() + + require.NoError(t, client.Unsubscribe(ctx)) +} diff --git a/eth/ethconfig/config.go b/eth/ethconfig/config.go index 1b07abe5b9..867e42225a 100644 --- a/eth/ethconfig/config.go +++ b/eth/ethconfig/config.go @@ -225,9 +225,15 @@ type Config struct { // Address to connect to Heimdall gRPC server HeimdallgRPCAddress string + // Address to connect to a secondary Heimdall gRPC server for failover + HeimdallgRPCSecondaryAddress string + // Address to connect to Heimdall WS subscription server HeimdallWSAddress string + // Address to connect to a secondary Heimdall WS subscription server for failover + HeimdallWSSecondaryAddress string + // Run heimdall service as a child process RunHeimdall bool @@ -348,20 +354,55 @@ func CreateConsensusEngine(chainConfig *params.ChainConfig, ethConfig *Config, d heimdallClient = heimdall.NewHeimdallClient(ethConfig.HeimdallURL, ethConfig.HeimdallTimeout) } - if ethConfig.HeimdallSecondaryURL != "" { - secondaryClient := heimdall.NewHeimdallClient(ethConfig.HeimdallSecondaryURL, ethConfig.HeimdallTimeout) - heimdallClient = heimdall.NewFailoverHeimdallClient(heimdallClient, secondaryClient) + // Build secondary client for failover. + var secondaryHeimdallClient bor.IHeimdallClient + + if ethConfig.HeimdallgRPCSecondaryAddress != "" { + // For secondary gRPC's FetchStatus (uses HTTP internally), + // prefer secondary HTTP URL if set, otherwise primary. 
+ secondaryHTTPURL := ethConfig.HeimdallSecondaryURL + if secondaryHTTPURL == "" { + secondaryHTTPURL = ethConfig.HeimdallURL + } - log.Info("Heimdall failover enabled", "primary", ethConfig.HeimdallURL, "secondary", ethConfig.HeimdallSecondaryURL) + grpcSecondary, grpcErr := heimdallgrpc.NewHeimdallGRPCClient( + ethConfig.HeimdallgRPCSecondaryAddress, + secondaryHTTPURL, + ethConfig.HeimdallTimeout, + ) + if grpcErr != nil { + log.Error("Failed to initialize secondary Heimdall gRPC client", + "address", ethConfig.HeimdallgRPCSecondaryAddress, "err", grpcErr) + } else { + secondaryHeimdallClient = grpcSecondary + } + } + + if secondaryHeimdallClient == nil && ethConfig.HeimdallSecondaryURL != "" { + secondaryHeimdallClient = heimdall.NewHeimdallClient(ethConfig.HeimdallSecondaryURL, ethConfig.HeimdallTimeout) + } + + if secondaryHeimdallClient != nil { + heimdallClient = heimdall.NewFailoverHeimdallClient(heimdallClient, secondaryHeimdallClient) + log.Info("Heimdall failover enabled") } var heimdallWSClient bor.IHeimdallWSClient var err error if ethConfig.HeimdallWSAddress != "" { - heimdallWSClient, err = heimdallws.NewHeimdallWSClient(ethConfig.HeimdallWSAddress) + heimdallWSClient, err = heimdallws.NewHeimdallWSClient( + ethConfig.HeimdallWSAddress, + ethConfig.HeimdallWSSecondaryAddress, + ) if err != nil { return nil, err } + + if ethConfig.HeimdallWSSecondaryAddress != "" { + log.Info("Heimdall WS failover enabled", + "primary", ethConfig.HeimdallWSAddress, + "secondary", ethConfig.HeimdallWSSecondaryAddress) + } } return bor.New(chainConfig, db, blockchainAPI, spanner, heimdallClient, heimdallWSClient, genesisContractsClient, false, ethConfig.Miner.BlockTime), nil diff --git a/eth/ethconfig/gen_config.go b/eth/ethconfig/gen_config.go index 98ef6e3021..0c9d21e8a3 100644 --- a/eth/ethconfig/gen_config.go +++ b/eth/ethconfig/gen_config.go @@ -71,7 +71,9 @@ func (c Config) MarshalTOML() (interface{}, error) { HeimdallTimeout time.Duration WithoutHeimdall bool 
HeimdallgRPCAddress string + HeimdallgRPCSecondaryAddress string HeimdallWSAddress string + HeimdallWSSecondaryAddress string RunHeimdall bool RunHeimdallArgs string UseHeimdallApp bool @@ -141,7 +143,9 @@ func (c Config) MarshalTOML() (interface{}, error) { enc.HeimdallTimeout = c.HeimdallTimeout enc.WithoutHeimdall = c.WithoutHeimdall enc.HeimdallgRPCAddress = c.HeimdallgRPCAddress + enc.HeimdallgRPCSecondaryAddress = c.HeimdallgRPCSecondaryAddress enc.HeimdallWSAddress = c.HeimdallWSAddress + enc.HeimdallWSSecondaryAddress = c.HeimdallWSSecondaryAddress enc.RunHeimdall = c.RunHeimdall enc.RunHeimdallArgs = c.RunHeimdallArgs enc.UseHeimdallApp = c.UseHeimdallApp @@ -219,7 +223,9 @@ func (c *Config) UnmarshalTOML(unmarshal func(interface{}) error) error { HeimdallTimeout *time.Duration WithoutHeimdall *bool HeimdallgRPCAddress *string + HeimdallgRPCSecondaryAddress *string HeimdallWSAddress *string + HeimdallWSSecondaryAddress *string RunHeimdall *bool RunHeimdallArgs *string UseHeimdallApp *bool @@ -388,9 +394,15 @@ func (c *Config) UnmarshalTOML(unmarshal func(interface{}) error) error { if dec.HeimdallgRPCAddress != nil { c.HeimdallgRPCAddress = *dec.HeimdallgRPCAddress } + if dec.HeimdallgRPCSecondaryAddress != nil { + c.HeimdallgRPCSecondaryAddress = *dec.HeimdallgRPCSecondaryAddress + } if dec.HeimdallWSAddress != nil { c.HeimdallWSAddress = *dec.HeimdallWSAddress } + if dec.HeimdallWSSecondaryAddress != nil { + c.HeimdallWSSecondaryAddress = *dec.HeimdallWSSecondaryAddress + } if dec.RunHeimdall != nil { c.RunHeimdall = *dec.RunHeimdall } diff --git a/internal/cli/server/config.go b/internal/cli/server/config.go index 3db9c20740..9179754b51 100644 --- a/internal/cli/server/config.go +++ b/internal/cli/server/config.go @@ -320,9 +320,15 @@ type HeimdallConfig struct { // GRPCAddress is the address of the heimdall grpc server GRPCAddress string `hcl:"grpc-address,optional" toml:"grpc-address,optional"` + // GRPCSecondaryAddress is the address of a secondary 
heimdall grpc server for failover + GRPCSecondaryAddress string `hcl:"grpc-secondary-address,optional" toml:"grpc-secondary-address,optional"` + // WSAddress is the address of the heimdall ws subscription server WSAddress string `hcl:"ws-address,optional" toml:"ws-address,optional"` + // WSSecondaryAddress is the address of a secondary heimdall ws subscription server for failover + WSSecondaryAddress string `hcl:"ws-secondary-address,optional" toml:"ws-secondary-address,optional"` + // RunHeimdall is used to run heimdall as a child process RunHeimdall bool `hcl:"bor.runheimdall,optional" toml:"bor.runheimdall,optional"` @@ -816,12 +822,14 @@ func DefaultConfig() *Config { }, }, Heimdall: &HeimdallConfig{ - URL: "http://localhost:1317", - SecondaryURL: "", - Timeout: 5 * time.Second, - Without: false, - GRPCAddress: "", - WSAddress: "", + URL: "http://localhost:1317", + SecondaryURL: "", + Timeout: 5 * time.Second, + Without: false, + GRPCAddress: "", + GRPCSecondaryAddress: "", + WSAddress: "", + WSSecondaryAddress: "", }, SyncMode: "full", GcMode: "full", @@ -1161,7 +1169,9 @@ func (c *Config) buildEth(stack *node.Node, accountManager *accounts.Manager) (* n.HeimdallTimeout = c.Heimdall.Timeout n.WithoutHeimdall = c.Heimdall.Without n.HeimdallgRPCAddress = c.Heimdall.GRPCAddress + n.HeimdallgRPCSecondaryAddress = c.Heimdall.GRPCSecondaryAddress n.HeimdallWSAddress = c.Heimdall.WSAddress + n.HeimdallWSSecondaryAddress = c.Heimdall.WSSecondaryAddress n.RunHeimdall = c.Heimdall.RunHeimdall n.RunHeimdallArgs = c.Heimdall.RunHeimdallArgs n.UseHeimdallApp = c.Heimdall.UseHeimdallApp diff --git a/internal/cli/server/flags.go b/internal/cli/server/flags.go index 928eda1851..70c15360ce 100644 --- a/internal/cli/server/flags.go +++ b/internal/cli/server/flags.go @@ -209,12 +209,24 @@ func (c *Command) Flags(config *Config) *flagset.Flagset { Value: &c.cliConfig.Heimdall.GRPCAddress, Default: c.cliConfig.Heimdall.GRPCAddress, }) + f.StringFlag(&flagset.StringFlag{ + Name: 
"bor.heimdallgRPC.secondary", + Usage: "Address of a secondary Heimdall gRPC service for failover", + Value: &c.cliConfig.Heimdall.GRPCSecondaryAddress, + Default: c.cliConfig.Heimdall.GRPCSecondaryAddress, + }) f.StringFlag(&flagset.StringFlag{ Name: "bor.heimdallWS", Usage: "Address of Heimdall ws subscription service", Value: &c.cliConfig.Heimdall.WSAddress, Default: c.cliConfig.Heimdall.WSAddress, }) + f.StringFlag(&flagset.StringFlag{ + Name: "bor.heimdallWS.secondary", + Usage: "Address of a secondary Heimdall WS subscription service for failover", + Value: &c.cliConfig.Heimdall.WSSecondaryAddress, + Default: c.cliConfig.Heimdall.WSSecondaryAddress, + }) f.BoolFlag(&flagset.BoolFlag{ Name: "bor.runheimdall", Usage: "Run Heimdall service as a child process", diff --git a/internal/cli/server/testdata/default.toml b/internal/cli/server/testdata/default.toml index 658bca960c..7cbab6628f 100644 --- a/internal/cli/server/testdata/default.toml +++ b/internal/cli/server/testdata/default.toml @@ -53,6 +53,9 @@ devfakeauthor = false secondary-url = "" "bor.without" = false grpc-address = "" + grpc-secondary-address = "" + ws-address = "" + ws-secondary-address = "" "bor.runheimdall" = false "bor.runheimdallargs" = "" "bor.useheimdallapp" = false From d4df759c0f12abfac378752c522f55ff17efcbe5 Mon Sep 17 00:00:00 2001 From: Pratik Patil Date: Thu, 12 Feb 2026 14:30:44 +0530 Subject: [PATCH 08/29] added tests --- eth/ethconfig/config_test.go | 151 +++++++++++++++++++++++++++++++++++ 1 file changed, 151 insertions(+) diff --git a/eth/ethconfig/config_test.go b/eth/ethconfig/config_test.go index 1329830fbf..7752dd11fc 100644 --- a/eth/ethconfig/config_test.go +++ b/eth/ethconfig/config_test.go @@ -13,6 +13,7 @@ import ( "github.com/ethereum/go-ethereum/consensus/bor/heimdall" "github.com/ethereum/go-ethereum/consensus/bor/heimdall/checkpoint" "github.com/ethereum/go-ethereum/consensus/bor/heimdall/milestone" + "github.com/ethereum/go-ethereum/consensus/bor/heimdallws" 
"github.com/ethereum/go-ethereum/core/rawdb" "github.com/ethereum/go-ethereum/params" "github.com/stretchr/testify/require" @@ -118,3 +119,153 @@ func TestCreateConsensusEngine_WithoutHeimdall(t *testing.T) { _, ok := engine.(*bor.Bor) require.True(t, ok, "Expected Bor consensus engine") } + +func TestCreateConsensusEngine_GRPCSecondaryFailover(t *testing.T) { + t.Parallel() + + ethConfig := &Config{ + OverrideHeimdallClient: &mockHeimdallClient{}, + HeimdallgRPCSecondaryAddress: "localhost:50051", + HeimdallURL: "http://localhost:1317", + } + + engine, err := CreateConsensusEngine(newTestBorChainConfig(), ethConfig, rawdb.NewMemoryDatabase(), nil) + require.NoError(t, err) + defer engine.Close() + + borEngine, ok := engine.(*bor.Bor) + require.True(t, ok, "Expected Bor consensus engine") + + // Primary mock gets wrapped in FailoverHeimdallClient with gRPC secondary + _, ok = borEngine.HeimdallClient.(*heimdall.FailoverHeimdallClient) + require.True(t, ok, "Expected HeimdallClient to be wrapped in FailoverHeimdallClient") +} + +func TestCreateConsensusEngine_GRPCSecondaryError_FallsBackToHTTP(t *testing.T) { + t.Parallel() + + ethConfig := &Config{ + OverrideHeimdallClient: &mockHeimdallClient{}, + // Invalid scheme causes NewHeimdallGRPCClient to fail + HeimdallgRPCSecondaryAddress: "ftp://localhost:50051", + HeimdallSecondaryURL: "http://secondary:1317", + } + + engine, err := CreateConsensusEngine(newTestBorChainConfig(), ethConfig, rawdb.NewMemoryDatabase(), nil) + require.NoError(t, err) + defer engine.Close() + + borEngine, ok := engine.(*bor.Bor) + require.True(t, ok, "Expected Bor consensus engine") + + // gRPC secondary failed, but HTTP secondary kicks in + _, ok = borEngine.HeimdallClient.(*heimdall.FailoverHeimdallClient) + require.True(t, ok, "Expected FailoverHeimdallClient with HTTP fallback after gRPC failure") +} + +func TestCreateConsensusEngine_GRPCSecondaryError_NoHTTPFallback(t *testing.T) { + t.Parallel() + + ethConfig := &Config{ + 
OverrideHeimdallClient: &mockHeimdallClient{}, + // Invalid scheme causes NewHeimdallGRPCClient to fail + HeimdallgRPCSecondaryAddress: "ftp://localhost:50051", + // No HeimdallSecondaryURL — no fallback available + } + + engine, err := CreateConsensusEngine(newTestBorChainConfig(), ethConfig, rawdb.NewMemoryDatabase(), nil) + require.NoError(t, err) + defer engine.Close() + + borEngine, ok := engine.(*bor.Bor) + require.True(t, ok, "Expected Bor consensus engine") + + // No secondary available, so no failover wrapper + _, ok = borEngine.HeimdallClient.(*heimdall.FailoverHeimdallClient) + require.False(t, ok, "Expected no FailoverHeimdallClient when both gRPC and HTTP secondary fail/absent") +} + +func TestCreateConsensusEngine_GRPCSecondaryUsesSecondaryHTTPURL(t *testing.T) { + t.Parallel() + + ethConfig := &Config{ + OverrideHeimdallClient: &mockHeimdallClient{}, + HeimdallURL: "http://primary:1317", + HeimdallSecondaryURL: "http://secondary:1317", + HeimdallgRPCSecondaryAddress: "localhost:50051", + } + + engine, err := CreateConsensusEngine(newTestBorChainConfig(), ethConfig, rawdb.NewMemoryDatabase(), nil) + require.NoError(t, err) + defer engine.Close() + + borEngine, ok := engine.(*bor.Bor) + require.True(t, ok, "Expected Bor consensus engine") + + // gRPC secondary should be created successfully and wrap in failover. + // gRPC takes priority over HTTP secondary when both are available. 
+ _, ok = borEngine.HeimdallClient.(*heimdall.FailoverHeimdallClient) + require.True(t, ok, "Expected FailoverHeimdallClient (gRPC secondary takes priority over HTTP)") +} + +func TestCreateConsensusEngine_WSWithSecondary(t *testing.T) { + t.Parallel() + + ethConfig := &Config{ + OverrideHeimdallClient: &mockHeimdallClient{}, + HeimdallWSAddress: "ws://localhost:26657", + HeimdallWSSecondaryAddress: "ws://secondary:26657", + } + + engine, err := CreateConsensusEngine(newTestBorChainConfig(), ethConfig, rawdb.NewMemoryDatabase(), nil) + require.NoError(t, err) + defer engine.Close() + + borEngine, ok := engine.(*bor.Bor) + require.True(t, ok, "Expected Bor consensus engine") + + // WS client should be created + require.NotNil(t, borEngine.HeimdallWSClient, "Expected non-nil HeimdallWSClient") + + _, ok = borEngine.HeimdallWSClient.(*heimdallws.HeimdallWSClient) + require.True(t, ok, "Expected HeimdallWSClient type") +} + +func TestCreateConsensusEngine_WSPrimaryOnly(t *testing.T) { + t.Parallel() + + ethConfig := &Config{ + OverrideHeimdallClient: &mockHeimdallClient{}, + HeimdallWSAddress: "ws://localhost:26657", + } + + engine, err := CreateConsensusEngine(newTestBorChainConfig(), ethConfig, rawdb.NewMemoryDatabase(), nil) + require.NoError(t, err) + defer engine.Close() + + borEngine, ok := engine.(*bor.Bor) + require.True(t, ok, "Expected Bor consensus engine") + + require.NotNil(t, borEngine.HeimdallWSClient, "Expected non-nil HeimdallWSClient") + + _, ok = borEngine.HeimdallWSClient.(*heimdallws.HeimdallWSClient) + require.True(t, ok, "Expected HeimdallWSClient type") +} + +func TestCreateConsensusEngine_NoWSAddress(t *testing.T) { + t.Parallel() + + ethConfig := &Config{ + OverrideHeimdallClient: &mockHeimdallClient{}, + // No HeimdallWSAddress set + } + + engine, err := CreateConsensusEngine(newTestBorChainConfig(), ethConfig, rawdb.NewMemoryDatabase(), nil) + require.NoError(t, err) + defer engine.Close() + + borEngine, ok := engine.(*bor.Bor) + 
require.True(t, ok, "Expected Bor consensus engine") + + require.Nil(t, borEngine.HeimdallWSClient, "Expected nil HeimdallWSClient when no WS address configured") +} From 25b7cad29555147361c787e2ffd5f1e4915aca17 Mon Sep 17 00:00:00 2001 From: Pratik Patil Date: Thu, 12 Feb 2026 17:28:11 +0530 Subject: [PATCH 09/29] accepting a list of urls (multiple) instead of just secondary url --- cmd/utils/bor_flags.go | 39 +----- consensus/bor/heimdall/failover_client.go | 126 +++++++++++------ .../bor/heimdall/failover_client_test.go | 113 ++++++++++++++- consensus/bor/heimdallws/client.go | 48 ++++--- consensus/bor/heimdallws/client_test.go | 65 ++++++++- eth/ethconfig/config.go | 130 +++++++++--------- eth/ethconfig/config_test.go | 116 ++++++---------- eth/ethconfig/gen_config.go | 18 --- internal/cli/server/config.go | 31 ++--- internal/cli/server/flags.go | 24 +--- internal/cli/server/testdata/default.toml | 3 - 11 files changed, 411 insertions(+), 302 deletions(-) diff --git a/cmd/utils/bor_flags.go b/cmd/utils/bor_flags.go index f8b1224e0b..3c87196dcf 100644 --- a/cmd/utils/bor_flags.go +++ b/cmd/utils/bor_flags.go @@ -16,20 +16,13 @@ var ( // Bor Specific flags // - // HeimdallURLFlag flag for heimdall url + // HeimdallURLFlag flag for heimdall url (comma-separated for failover) HeimdallURLFlag = &cli.StringFlag{ Name: "bor.heimdall", - Usage: "URL of Heimdall service", + Usage: "URL of Heimdall service (comma-separated for failover: \"url1,url2\")", Value: "http://localhost:1317", } - // HeimdallSecondaryURLFlag flag for secondary heimdall url (failover) - HeimdallSecondaryURLFlag = &cli.StringFlag{ - Name: "bor.heimdall.secondary", - Usage: "URL of a secondary Heimdall service for failover", - Value: "", - } - // HeimdallTimeoutFlag flag for heimdall timeout HeimdallTimeoutFlag = &cli.DurationFlag{ Name: "bor.heimdalltimeout", @@ -43,31 +36,17 @@ var ( Usage: "Run without Heimdall service (for testing purpose)", } - // HeimdallgRPCAddressFlag flag for heimdall gRPC 
address + // HeimdallgRPCAddressFlag flag for heimdall gRPC address (comma-separated for failover) HeimdallgRPCAddressFlag = &cli.StringFlag{ Name: "bor.heimdallgRPC", - Usage: "Address of Heimdall gRPC service", + Usage: "Address of Heimdall gRPC service (comma-separated for failover: \"addr1,addr2\")", Value: "", } - // HeimdallgRPCSecondaryAddressFlag flag for secondary heimdall gRPC address (failover) - HeimdallgRPCSecondaryAddressFlag = &cli.StringFlag{ - Name: "bor.heimdallgRPC.secondary", - Usage: "Address of a secondary Heimdall gRPC service for failover", - Value: "", - } - - // HeimdallWSAddressFlag flag for heimdall websocket subscription service + // HeimdallWSAddressFlag flag for heimdall websocket subscription service (comma-separated for failover) HeimdallWSAddressFlag = &cli.StringFlag{ Name: "bor.heimdallWS", - Usage: "Address of Heimdall WS Subscription service", - Value: "", - } - - // HeimdallWSSecondaryAddressFlag flag for secondary heimdall WS address (failover) - HeimdallWSSecondaryAddressFlag = &cli.StringFlag{ - Name: "bor.heimdallWS.secondary", - Usage: "Address of a secondary Heimdall WS Subscription service for failover", + Usage: "Address of Heimdall WS Subscription service (comma-separated for failover: \"addr1,addr2\")", Value: "", } @@ -92,13 +71,10 @@ var ( // BorFlags all bor related flags BorFlags = []cli.Flag{ HeimdallURLFlag, - HeimdallSecondaryURLFlag, HeimdallTimeoutFlag, WithoutHeimdallFlag, HeimdallgRPCAddressFlag, - HeimdallgRPCSecondaryAddressFlag, HeimdallWSAddressFlag, - HeimdallWSSecondaryAddressFlag, RunHeimdallFlag, RunHeimdallArgsFlag, UseHeimdallAppFlag, @@ -108,13 +84,10 @@ var ( // SetBorConfig sets bor config func SetBorConfig(ctx *cli.Context, cfg *eth.Config) { cfg.HeimdallURL = ctx.String(HeimdallURLFlag.Name) - cfg.HeimdallSecondaryURL = ctx.String(HeimdallSecondaryURLFlag.Name) cfg.HeimdallTimeout = ctx.Duration(HeimdallTimeoutFlag.Name) cfg.WithoutHeimdall = ctx.Bool(WithoutHeimdallFlag.Name) 
cfg.HeimdallgRPCAddress = ctx.String(HeimdallgRPCAddressFlag.Name) - cfg.HeimdallgRPCSecondaryAddress = ctx.String(HeimdallgRPCSecondaryAddressFlag.Name) cfg.HeimdallWSAddress = ctx.String(HeimdallWSAddressFlag.Name) - cfg.HeimdallWSSecondaryAddress = ctx.String(HeimdallWSSecondaryAddressFlag.Name) cfg.RunHeimdall = ctx.Bool(RunHeimdallFlag.Name) cfg.RunHeimdallArgs = ctx.String(RunHeimdallArgsFlag.Name) cfg.UseHeimdallApp = ctx.Bool(UseHeimdallAppFlag.Name) diff --git a/consensus/bor/heimdall/failover_client.go b/consensus/bor/heimdall/failover_client.go index d74b9c3e4a..4e6efa7082 100644 --- a/consensus/bor/heimdall/failover_client.go +++ b/consensus/bor/heimdall/failover_client.go @@ -21,9 +21,10 @@ const ( defaultSecondaryCooldown = 2 * time.Minute ) -// heimdallClient is a local interface matching bor.IHeimdallClient to avoid -// an import cycle with the consensus/bor package. -type heimdallClient interface { +// Endpoint matches bor.IHeimdallClient. It is exported so that external +// packages can build []Endpoint slices for NewFailoverHeimdallClient without +// running into Go's covariant-slice restriction. +type Endpoint interface { StateSyncEvents(ctx context.Context, fromID uint64, to int64) ([]*clerk.EventRecordWithTime, error) GetSpan(ctx context.Context, spanID uint64) (*types.Span, error) GetLatestSpan(ctx context.Context) (*types.Span, error) @@ -35,89 +36,90 @@ type heimdallClient interface { Close() } -// FailoverHeimdallClient wraps two heimdall clients (primary + secondary) and -// transparently fails over from primary to secondary when the primary is +// FailoverHeimdallClient wraps N heimdall clients (primary at index 0, failovers +// at 1..N-1) and transparently cascades through them when the active client is // unreachable. After a cooldown period it probes the primary again. 
type FailoverHeimdallClient struct { - clients [2]heimdallClient + clients []Endpoint mu sync.Mutex - active int // 0 = primary, 1 = secondary - lastSwitch time.Time // when we last switched to secondary + active int // 0 = primary, >0 = failover + lastSwitch time.Time // when we last switched away from primary attemptTimeout time.Duration cooldown time.Duration } -func NewFailoverHeimdallClient(primary, secondary heimdallClient) *FailoverHeimdallClient { +func NewFailoverHeimdallClient(clients ...Endpoint) *FailoverHeimdallClient { return &FailoverHeimdallClient{ - clients: [2]heimdallClient{primary, secondary}, + clients: clients, attemptTimeout: defaultAttemptTimeout, cooldown: defaultSecondaryCooldown, } } func (f *FailoverHeimdallClient) StateSyncEvents(ctx context.Context, fromID uint64, to int64) ([]*clerk.EventRecordWithTime, error) { - return callWithFailover(f, ctx, func(ctx context.Context, c heimdallClient) ([]*clerk.EventRecordWithTime, error) { + return callWithFailover(f, ctx, func(ctx context.Context, c Endpoint) ([]*clerk.EventRecordWithTime, error) { return c.StateSyncEvents(ctx, fromID, to) }) } func (f *FailoverHeimdallClient) GetSpan(ctx context.Context, spanID uint64) (*types.Span, error) { - return callWithFailover(f, ctx, func(ctx context.Context, c heimdallClient) (*types.Span, error) { + return callWithFailover(f, ctx, func(ctx context.Context, c Endpoint) (*types.Span, error) { return c.GetSpan(ctx, spanID) }) } func (f *FailoverHeimdallClient) GetLatestSpan(ctx context.Context) (*types.Span, error) { - return callWithFailover(f, ctx, func(ctx context.Context, c heimdallClient) (*types.Span, error) { + return callWithFailover(f, ctx, func(ctx context.Context, c Endpoint) (*types.Span, error) { return c.GetLatestSpan(ctx) }) } func (f *FailoverHeimdallClient) FetchCheckpoint(ctx context.Context, number int64) (*checkpoint.Checkpoint, error) { - return callWithFailover(f, ctx, func(ctx context.Context, c heimdallClient) 
(*checkpoint.Checkpoint, error) { + return callWithFailover(f, ctx, func(ctx context.Context, c Endpoint) (*checkpoint.Checkpoint, error) { return c.FetchCheckpoint(ctx, number) }) } func (f *FailoverHeimdallClient) FetchCheckpointCount(ctx context.Context) (int64, error) { - return callWithFailover(f, ctx, func(ctx context.Context, c heimdallClient) (int64, error) { + return callWithFailover(f, ctx, func(ctx context.Context, c Endpoint) (int64, error) { return c.FetchCheckpointCount(ctx) }) } func (f *FailoverHeimdallClient) FetchMilestone(ctx context.Context) (*milestone.Milestone, error) { - return callWithFailover(f, ctx, func(ctx context.Context, c heimdallClient) (*milestone.Milestone, error) { + return callWithFailover(f, ctx, func(ctx context.Context, c Endpoint) (*milestone.Milestone, error) { return c.FetchMilestone(ctx) }) } func (f *FailoverHeimdallClient) FetchMilestoneCount(ctx context.Context) (int64, error) { - return callWithFailover(f, ctx, func(ctx context.Context, c heimdallClient) (int64, error) { + return callWithFailover(f, ctx, func(ctx context.Context, c Endpoint) (int64, error) { return c.FetchMilestoneCount(ctx) }) } func (f *FailoverHeimdallClient) FetchStatus(ctx context.Context) (*ctypes.SyncInfo, error) { - return callWithFailover(f, ctx, func(ctx context.Context, c heimdallClient) (*ctypes.SyncInfo, error) { + return callWithFailover(f, ctx, func(ctx context.Context, c Endpoint) (*ctypes.SyncInfo, error) { return c.FetchStatus(ctx) }) } func (f *FailoverHeimdallClient) Close() { - f.clients[0].Close() - f.clients[1].Close() + for _, c := range f.clients { + c.Close() + } } // callWithFailover executes fn against the active client. If the active client -// is primary and the call fails with a failover-eligible error, it retries on -// the secondary. If on secondary past the cooldown, it probes the primary first. 
-func callWithFailover[T any](f *FailoverHeimdallClient, ctx context.Context, fn func(context.Context, heimdallClient) (T, error)) (T, error) { +// fails with a failover-eligible error, it cascades through remaining clients. +// If on a non-primary client past the cooldown, it probes the primary first. +func callWithFailover[T any](f *FailoverHeimdallClient, ctx context.Context, fn func(context.Context, Endpoint) (T, error)) (T, error) { f.mu.Lock() active := f.active - shouldProbe := active == 1 && time.Since(f.lastSwitch) >= f.cooldown + shouldProbe := active != 0 && time.Since(f.lastSwitch) >= f.cooldown f.mu.Unlock() - // If on secondary and cooldown has elapsed, probe primary + // If on a non-primary client and cooldown has elapsed, probe primary if shouldProbe { subCtx, cancel := context.WithTimeout(ctx, f.attemptTimeout) result, err := fn(subCtx, f.clients[0]) @@ -138,23 +140,40 @@ func callWithFailover[T any](f *FailoverHeimdallClient, ctx context.Context, fn return zero, err } - // Primary still down, stay on secondary + // Primary still down, stay on current client f.mu.Lock() f.lastSwitch = time.Now() f.mu.Unlock() - log.Debug("Heimdall failover: primary still down after probe, staying on secondary", "err", err) + log.Debug("Heimdall failover: primary still down after probe, staying on current", "active", active, "err", err) + + // Try current client, then cascade through remaining on failure + result, err = fn(ctx, f.clients[active]) + if err == nil { + return result, nil + } + + if !isFailoverError(err, ctx) { + var zero T + return zero, err + } - // Secondary calls use the caller's ctx directly (no sub-timeout). - // The timeout is only needed on primary to bound the failover decision. - // Once on secondary there is no further fallback, so the caller's - // context (which always has a cancellation path in Bor) governs lifetime. 
- return fn(ctx, f.clients[1]) + return cascadeClients(f, ctx, fn, active, err) } - if active == 1 { - // On secondary, not yet time to probe: use secondary directly - return fn(ctx, f.clients[1]) + if active != 0 { + // On a non-primary client, not yet time to probe: use current directly + result, err := fn(ctx, f.clients[active]) + if err == nil { + return result, nil + } + + if !isFailoverError(err, ctx) { + var zero T + return zero, err + } + + return cascadeClients(f, ctx, fn, active, err) } // Active is primary: try with timeout @@ -171,15 +190,38 @@ func callWithFailover[T any](f *FailoverHeimdallClient, ctx context.Context, fn return zero, err } - // Failover to secondary - f.mu.Lock() - f.active = 1 - f.lastSwitch = time.Now() - f.mu.Unlock() + // Cascade through clients [1, 2, ..., N-1] + log.Warn("Heimdall failover: primary failed, cascading to next client", "err", err) + + return cascadeClients(f, ctx, fn, 0, err) +} + +// cascadeClients tries clients after the given index. On first success it +// switches the active client and returns. If all fail, returns the last error. +func cascadeClients[T any](f *FailoverHeimdallClient, ctx context.Context, fn func(context.Context, Endpoint) (T, error), after int, lastErr error) (T, error) { + for i := after + 1; i < len(f.clients); i++ { + result, err := fn(ctx, f.clients[i]) + if err == nil { + f.mu.Lock() + f.active = i + f.lastSwitch = time.Now() + f.mu.Unlock() + + log.Warn("Heimdall failover: switched to client", "index", i) - log.Warn("Heimdall failover: primary failed, switching to secondary", "err", err) + return result, nil + } + + lastErr = err + + if !isFailoverError(err, ctx) { + var zero T + return zero, err + } + } - return fn(ctx, f.clients[1]) + var zero T + return zero, lastErr } // isFailoverError returns true if the error warrants trying the secondary. 
diff --git a/consensus/bor/heimdall/failover_client_test.go b/consensus/bor/heimdall/failover_client_test.go index 3a4cf08489..6fad4ff745 100644 --- a/consensus/bor/heimdall/failover_client_test.go +++ b/consensus/bor/heimdall/failover_client_test.go @@ -21,7 +21,7 @@ import ( "github.com/ethereum/go-ethereum/consensus/bor/heimdall/milestone" ) -// mockHeimdallClient is a configurable mock implementing the heimdallClient interface. +// mockHeimdallClient is a configurable mock implementing the Endpoint interface. type mockHeimdallClient struct { getSpanFn func(ctx context.Context, spanID uint64) (*types.Span, error) getLatestSpanFn func(ctx context.Context) (*types.Span, error) @@ -591,3 +591,114 @@ func TestIsFailoverError(t *testing.T) { // nil error should not trigger failover assert.False(t, isFailoverError(nil, ctx), "nil error should not trigger failover") } + +func TestFailover_ThreeClients_CascadeToTertiary(t *testing.T) { + primary := &mockHeimdallClient{ + getSpanFn: func(_ context.Context, _ uint64) (*types.Span, error) { + return nil, &net.OpError{Op: "dial", Net: "tcp", Err: errors.New("connection refused")} + }, + } + secondary := &mockHeimdallClient{ + getSpanFn: func(_ context.Context, _ uint64) (*types.Span, error) { + return nil, &net.OpError{Op: "dial", Net: "tcp", Err: errors.New("connection refused")} + }, + } + tertiary := &mockHeimdallClient{} + + fc := NewFailoverHeimdallClient(primary, secondary, tertiary) + fc.attemptTimeout = 100 * time.Millisecond + defer fc.Close() + + span, err := fc.GetSpan(context.Background(), 1) + require.NoError(t, err) + require.NotNil(t, span) + + assert.GreaterOrEqual(t, primary.hits.Load(), int32(1), "primary should have been tried") + assert.GreaterOrEqual(t, secondary.hits.Load(), int32(1), "secondary should have been tried") + assert.Equal(t, int32(1), tertiary.hits.Load(), "tertiary should have been called once") +} + +func TestFailover_AllClientsFail(t *testing.T) { + connErr := &net.OpError{Op: "dial", 
Net: "tcp", Err: errors.New("connection refused")} + + primary := &mockHeimdallClient{ + getSpanFn: func(_ context.Context, _ uint64) (*types.Span, error) { return nil, connErr }, + } + secondary := &mockHeimdallClient{ + getSpanFn: func(_ context.Context, _ uint64) (*types.Span, error) { return nil, connErr }, + } + tertiary := &mockHeimdallClient{ + getSpanFn: func(_ context.Context, _ uint64) (*types.Span, error) { return nil, connErr }, + } + + fc := NewFailoverHeimdallClient(primary, secondary, tertiary) + fc.attemptTimeout = 100 * time.Millisecond + defer fc.Close() + + _, err := fc.GetSpan(context.Background(), 1) + require.Error(t, err) +} + +func TestFailover_ThreeClients_ProbeBackToPrimary(t *testing.T) { + primaryDown := atomic.Bool{} + primaryDown.Store(true) + + primary := &mockHeimdallClient{ + getSpanFn: func(_ context.Context, spanID uint64) (*types.Span, error) { + if primaryDown.Load() { + return nil, &net.OpError{Op: "dial", Net: "tcp", Err: errors.New("connection refused")} + } + return &types.Span{Id: spanID}, nil + }, + } + secondary := &mockHeimdallClient{ + getSpanFn: func(_ context.Context, _ uint64) (*types.Span, error) { + return nil, &net.OpError{Op: "dial", Net: "tcp", Err: errors.New("connection refused")} + }, + } + tertiary := &mockHeimdallClient{} + + fc := NewFailoverHeimdallClient(primary, secondary, tertiary) + fc.attemptTimeout = 100 * time.Millisecond + fc.cooldown = 50 * time.Millisecond + defer fc.Close() + + // Trigger cascade to tertiary + _, err := fc.GetSpan(context.Background(), 1) + require.NoError(t, err) + + // Wait for cooldown + time.Sleep(100 * time.Millisecond) + + // Bring primary back + primaryDown.Store(false) + primaryBefore := primary.hits.Load() + + // Next call should probe primary and succeed + _, err = fc.GetSpan(context.Background(), 1) + require.NoError(t, err) + assert.Greater(t, primary.hits.Load(), primaryBefore, "primary should have been probed") + + // Verify we're back on primary + tertiaryBefore 
:= tertiary.hits.Load() + _, err = fc.GetSpan(context.Background(), 1) + require.NoError(t, err) + assert.Equal(t, tertiaryBefore, tertiary.hits.Load(), "should be back on primary now") +} + +func TestFailover_ClosesAllClients(t *testing.T) { + var closed [3]atomic.Bool + + clients := make([]Endpoint, 3) + for i := range clients { + idx := i + clients[i] = &mockHeimdallClient{closeFn: func() { closed[idx].Store(true) }} + } + + fc := NewFailoverHeimdallClient(clients...) + fc.Close() + + for i := range closed { + assert.True(t, closed[i].Load(), "client %d should be closed", i) + } +} diff --git a/consensus/bor/heimdallws/client.go b/consensus/bor/heimdallws/client.go index d69ed6cc54..4984eaebcf 100644 --- a/consensus/bor/heimdallws/client.go +++ b/consensus/bor/heimdallws/client.go @@ -3,6 +3,7 @@ package heimdallws import ( "context" "encoding/json" + "errors" "strconv" "sync" "time" @@ -45,16 +46,26 @@ type HeimdallWSClient struct { } // NewHeimdallWSClient creates a new WS client for Heimdall with optional failover. -// If secondaryURL is empty, the client operates with a single URL (existing behavior). -func NewHeimdallWSClient(primaryURL string, secondaryURL string) (*HeimdallWSClient, error) { - urls := []string{primaryURL} - if secondaryURL != "" { - urls = append(urls, secondaryURL) +// The first URL is primary; additional URLs are failover candidates in priority order. 
+func NewHeimdallWSClient(urls ...string) (*HeimdallWSClient, error) { + if len(urls) == 0 { + return nil, errors.New("at least one WS URL required") + } + + var filtered []string + for _, u := range urls { + if u != "" { + filtered = append(filtered, u) + } + } + + if len(filtered) == 0 { + return nil, errors.New("at least one non-empty WS URL required") } return &HeimdallWSClient{ conn: nil, - urls: urls, + urls: filtered, events: make(chan *milestone.Milestone), done: make(chan struct{}), primaryAttempts: defaultPrimaryAttempts, @@ -95,8 +106,8 @@ func (c *HeimdallWSClient) tryUntilSubscribeMilestoneEvents(ctx context.Context) default: } - // If on secondary and cooldown has elapsed, probe primary first. - if c.activeURL == 1 && !c.lastFailover.IsZero() && time.Since(c.lastFailover) >= c.wsCooldown { + // If on a non-primary URL and cooldown has elapsed, probe primary first. + if c.activeURL != 0 && !c.lastFailover.IsZero() && time.Since(c.lastFailover) >= c.wsCooldown { log.Info("WS cooldown elapsed, probing primary", "url", c.urls[0]) c.activeURL = 0 primaryAttempts = 0 @@ -108,17 +119,16 @@ func (c *HeimdallWSClient) tryUntilSubscribeMilestoneEvents(ctx context.Context) if err != nil { log.Error("failed to dial websocket on heimdall ws subscription", "url", url, "err", err) - // Count failures on primary; switch to secondary after threshold. - if c.activeURL == 0 { - primaryAttempts++ - - if len(c.urls) > 1 && primaryAttempts >= c.primaryAttempts { - log.Warn("Primary WS failed, switching to secondary", - "primary", c.urls[0], "secondary", c.urls[1], "attempts", primaryAttempts) - c.activeURL = 1 - c.lastFailover = time.Now() - primaryAttempts = 0 - } + // Count failures on current URL; advance to next after threshold. 
+ primaryAttempts++ + + if len(c.urls) > 1 && primaryAttempts >= c.primaryAttempts { + next := (c.activeURL + 1) % len(c.urls) + log.Warn("WS URL failed, switching to next", + "from", c.urls[c.activeURL], "to", c.urls[next], "attempts", primaryAttempts) + c.activeURL = next + c.lastFailover = time.Now() + primaryAttempts = 0 } continue diff --git a/consensus/bor/heimdallws/client_test.go b/consensus/bor/heimdallws/client_test.go index 15b3e964fd..12e4f9675a 100644 --- a/consensus/bor/heimdallws/client_test.go +++ b/consensus/bor/heimdallws/client_test.go @@ -130,27 +130,46 @@ func wsURL(httpURL string) string { } func TestWSClient_ConstructorSingleURL(t *testing.T) { - client, err := NewHeimdallWSClient("ws://localhost:1234", "") + client, err := NewHeimdallWSClient("ws://localhost:1234") require.NoError(t, err) assert.Len(t, client.urls, 1) assert.Equal(t, "ws://localhost:1234", client.urls[0]) assert.Equal(t, 0, client.activeURL) } -func TestWSClient_ConstructorDualURL(t *testing.T) { - client, err := NewHeimdallWSClient("ws://primary:1234", "ws://secondary:5678") +func TestWSClient_ConstructorMultipleURLs(t *testing.T) { + client, err := NewHeimdallWSClient("ws://primary:1234", "ws://secondary:5678", "ws://tertiary:9999") require.NoError(t, err) - assert.Len(t, client.urls, 2) + assert.Len(t, client.urls, 3) assert.Equal(t, "ws://primary:1234", client.urls[0]) assert.Equal(t, "ws://secondary:5678", client.urls[1]) + assert.Equal(t, "ws://tertiary:9999", client.urls[2]) assert.Equal(t, 0, client.activeURL) } +func TestWSClient_ConstructorFiltersEmpty(t *testing.T) { + client, err := NewHeimdallWSClient("ws://primary:1234", "", "ws://tertiary:9999") + require.NoError(t, err) + assert.Len(t, client.urls, 2) + assert.Equal(t, "ws://primary:1234", client.urls[0]) + assert.Equal(t, "ws://tertiary:9999", client.urls[1]) +} + +func TestWSClient_ConstructorNoURLs(t *testing.T) { + _, err := NewHeimdallWSClient() + require.Error(t, err) +} + +func 
TestWSClient_ConstructorAllEmpty(t *testing.T) { + _, err := NewHeimdallWSClient("", "") + require.Error(t, err) +} + func TestWSClient_SingleURL_ConnectsSuccessfully(t *testing.T) { server := newTestWSServerWithMilestone(t) defer server.Close() - client, err := NewHeimdallWSClient(wsURL(server.URL), "") + client, err := NewHeimdallWSClient(wsURL(server.URL)) require.NoError(t, err) ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) @@ -207,6 +226,42 @@ func TestWSClient_DualURL_FailoverToSecondary(t *testing.T) { require.NoError(t, client.Unsubscribe(ctx)) } +func TestWSClient_ThreeURL_CascadeToTertiary(t *testing.T) { + // Primary and secondary always reject. + primary := newTestWSServer(t, true) + defer primary.Close() + + secondary := newTestWSServer(t, true) + defer secondary.Close() + + // Tertiary accepts and sends a milestone. + tertiary := newTestWSServerWithMilestone(t) + defer tertiary.Close() + + client, err := NewHeimdallWSClient(wsURL(primary.URL), wsURL(secondary.URL), wsURL(tertiary.URL)) + require.NoError(t, err) + + client.reconnectDelay = 100 * time.Millisecond + client.primaryAttempts = 2 + + ctx, cancel := context.WithTimeout(context.Background(), 15*time.Second) + defer cancel() + + events := client.SubscribeMilestoneEvents(ctx) + + select { + case m := <-events: + require.NotNil(t, m) + assert.Equal(t, uint64(100), m.StartBlock) + // Verify we ended up on tertiary. + assert.Equal(t, 2, client.activeURL) + case <-ctx.Done(): + t.Fatal("timed out waiting for milestone event via cascade") + } + + require.NoError(t, client.Unsubscribe(ctx)) +} + func TestWSClient_ContextCancellation(t *testing.T) { // Both URLs reject — client should respect context cancellation. 
primary := newTestWSServer(t, true) diff --git a/eth/ethconfig/config.go b/eth/ethconfig/config.go index 867e42225a..9744b8e94f 100644 --- a/eth/ethconfig/config.go +++ b/eth/ethconfig/config.go @@ -19,6 +19,7 @@ package ethconfig import ( "math/big" + "strings" "time" "github.com/ethereum/go-ethereum/common" @@ -45,6 +46,25 @@ import ( "github.com/ethereum/go-ethereum/params" ) +// parseURLs splits a comma-separated URL string into a trimmed, non-empty slice. +func parseURLs(s string) []string { + if s == "" { + return nil + } + + parts := strings.Split(s, ",") + + var out []string + for _, p := range parts { + p = strings.TrimSpace(p) + if p != "" { + out = append(out, p) + } + } + + return out +} + // FullNodeGPO contains default gasprice oracle settings for full node. var FullNodeGPO = gasprice.Config{ Blocks: 20, @@ -210,30 +230,21 @@ type Config struct { // position in eth_getLogs filter criteria (0 = no cap) RPCLogQueryLimit int - // URL to connect to Heimdall node + // URL to connect to Heimdall node (comma-separated for failover: "url1,url2,url3") HeimdallURL string - // URL to connect to a secondary Heimdall node for failover - HeimdallSecondaryURL string - // timeout in heimdall requests HeimdallTimeout time.Duration // No heimdall service WithoutHeimdall bool - // Address to connect to Heimdall gRPC server + // Address to connect to Heimdall gRPC server (comma-separated for failover: "addr1,addr2") HeimdallgRPCAddress string - // Address to connect to a secondary Heimdall gRPC server for failover - HeimdallgRPCSecondaryAddress string - - // Address to connect to Heimdall WS subscription server + // Address to connect to Heimdall WS subscription server (comma-separated for failover: "addr1,addr2") HeimdallWSAddress string - // Address to connect to a secondary Heimdall WS subscription server for failover - HeimdallWSSecondaryAddress string - // Run heimdall service as a child process RunHeimdall bool @@ -334,74 +345,61 @@ func 
CreateConsensusEngine(chainConfig *params.ChainConfig, ethConfig *Config, d // TODO: Running heimdall from bor is not tested yet. // heimdallClient = heimdallapp.NewHeimdallAppClient() panic("Running heimdall from bor is not implemented yet. Please use heimdall gRPC or HTTP client instead.") - } else if ethConfig.HeimdallgRPCAddress != "" { - grpcClient, err := heimdallgrpc.NewHeimdallGRPCClient( - ethConfig.HeimdallgRPCAddress, - ethConfig.HeimdallURL, - ethConfig.HeimdallTimeout, - ) - if err != nil { - log.Error("Failed to initialize Heimdall gRPC client; falling back to HTTP Heimdall client", - "heimdall_grpc", ethConfig.HeimdallgRPCAddress, - "heimdall_http", ethConfig.HeimdallURL, - "err", err, - ) - heimdallClient = heimdall.NewHeimdallClient(ethConfig.HeimdallURL, ethConfig.HeimdallTimeout) - } else { - heimdallClient = grpcClient - } } else { - heimdallClient = heimdall.NewHeimdallClient(ethConfig.HeimdallURL, ethConfig.HeimdallTimeout) - } - - // Build secondary client for failover. - var secondaryHeimdallClient bor.IHeimdallClient - - if ethConfig.HeimdallgRPCSecondaryAddress != "" { - // For secondary gRPC's FetchStatus (uses HTTP internally), - // prefer secondary HTTP URL if set, otherwise primary. - secondaryHTTPURL := ethConfig.HeimdallSecondaryURL - if secondaryHTTPURL == "" { - secondaryHTTPURL = ethConfig.HeimdallURL + httpURLs := parseURLs(ethConfig.HeimdallURL) + grpcAddrs := parseURLs(ethConfig.HeimdallgRPCAddress) + + // Build one client per endpoint. + // gRPC takes priority where configured; falls back to HTTP. 
+ var heimdallClients []heimdall.Endpoint + + n := max(len(httpURLs), len(grpcAddrs)) + for i := 0; i < n; i++ { + if i < len(grpcAddrs) && grpcAddrs[i] != "" { + httpURL := httpURLs[min(i, len(httpURLs)-1)] + + grpcClient, err := heimdallgrpc.NewHeimdallGRPCClient(grpcAddrs[i], httpURL, ethConfig.HeimdallTimeout) + if err != nil { + log.Error("Failed to initialize Heimdall gRPC client; falling back to HTTP", + "index", i, "grpc", grpcAddrs[i], "err", err) + + if i < len(httpURLs) { + heimdallClients = append(heimdallClients, heimdall.NewHeimdallClient(httpURLs[i], ethConfig.HeimdallTimeout)) + } + + continue + } + + heimdallClients = append(heimdallClients, grpcClient) + } else if i < len(httpURLs) { + heimdallClients = append(heimdallClients, heimdall.NewHeimdallClient(httpURLs[i], ethConfig.HeimdallTimeout)) + } } - grpcSecondary, grpcErr := heimdallgrpc.NewHeimdallGRPCClient( - ethConfig.HeimdallgRPCSecondaryAddress, - secondaryHTTPURL, - ethConfig.HeimdallTimeout, - ) - if grpcErr != nil { - log.Error("Failed to initialize secondary Heimdall gRPC client", - "address", ethConfig.HeimdallgRPCSecondaryAddress, "err", grpcErr) + if len(heimdallClients) == 0 { + heimdallClient = heimdall.NewHeimdallClient(ethConfig.HeimdallURL, ethConfig.HeimdallTimeout) + } else if len(heimdallClients) == 1 { + heimdallClient = heimdallClients[0] } else { - secondaryHeimdallClient = grpcSecondary + heimdallClient = heimdall.NewFailoverHeimdallClient(heimdallClients...) 
+ log.Info("Heimdall failover enabled", "endpoints", len(heimdallClients)) } } - if secondaryHeimdallClient == nil && ethConfig.HeimdallSecondaryURL != "" { - secondaryHeimdallClient = heimdall.NewHeimdallClient(ethConfig.HeimdallSecondaryURL, ethConfig.HeimdallTimeout) - } - - if secondaryHeimdallClient != nil { - heimdallClient = heimdall.NewFailoverHeimdallClient(heimdallClient, secondaryHeimdallClient) - log.Info("Heimdall failover enabled") - } + // WS client + wsAddrs := parseURLs(ethConfig.HeimdallWSAddress) var heimdallWSClient bor.IHeimdallWSClient var err error - if ethConfig.HeimdallWSAddress != "" { - heimdallWSClient, err = heimdallws.NewHeimdallWSClient( - ethConfig.HeimdallWSAddress, - ethConfig.HeimdallWSSecondaryAddress, - ) + + if len(wsAddrs) > 0 { + heimdallWSClient, err = heimdallws.NewHeimdallWSClient(wsAddrs...) if err != nil { return nil, err } - if ethConfig.HeimdallWSSecondaryAddress != "" { - log.Info("Heimdall WS failover enabled", - "primary", ethConfig.HeimdallWSAddress, - "secondary", ethConfig.HeimdallWSSecondaryAddress) + if len(wsAddrs) > 1 { + log.Info("Heimdall WS failover enabled", "endpoints", len(wsAddrs)) } } diff --git a/eth/ethconfig/config_test.go b/eth/ethconfig/config_test.go index 7752dd11fc..9fd4e5785e 100644 --- a/eth/ethconfig/config_test.go +++ b/eth/ethconfig/config_test.go @@ -16,6 +16,7 @@ import ( "github.com/ethereum/go-ethereum/consensus/bor/heimdallws" "github.com/ethereum/go-ethereum/core/rawdb" "github.com/ethereum/go-ethereum/params" + "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" ) @@ -90,43 +91,10 @@ func TestCreateConsensusEngine_OverrideHeimdallClient(t *testing.T) { require.True(t, ok, "Expected Bor consensus engine") } -func TestCreateConsensusEngine_HeimdallSecondaryURL(t *testing.T) { +func TestCreateConsensusEngine_CommaSeparatedHeimdallURL(t *testing.T) { t.Parallel() ethConfig := &Config{ - OverrideHeimdallClient: &mockHeimdallClient{}, - HeimdallSecondaryURL: 
"http://secondary:1317", - } - - engine, err := CreateConsensusEngine(newTestBorChainConfig(), ethConfig, rawdb.NewMemoryDatabase(), nil) - require.NoError(t, err) - defer engine.Close() - - borEngine, ok := engine.(*bor.Bor) - require.True(t, ok, "Expected Bor consensus engine") - - _, ok = borEngine.HeimdallClient.(*heimdall.FailoverHeimdallClient) - require.True(t, ok, "Expected HeimdallClient to be wrapped in FailoverHeimdallClient") -} - -func TestCreateConsensusEngine_WithoutHeimdall(t *testing.T) { - t.Parallel() - ethConfig := &Config{WithoutHeimdall: true} - - engine, err := CreateConsensusEngine(newTestBorChainConfig(), ethConfig, rawdb.NewMemoryDatabase(), nil) - require.NoError(t, err) - defer engine.Close() - - _, ok := engine.(*bor.Bor) - require.True(t, ok, "Expected Bor consensus engine") -} - -func TestCreateConsensusEngine_GRPCSecondaryFailover(t *testing.T) { - t.Parallel() - - ethConfig := &Config{ - OverrideHeimdallClient: &mockHeimdallClient{}, - HeimdallgRPCSecondaryAddress: "localhost:50051", - HeimdallURL: "http://localhost:1317", + HeimdallURL: "http://primary:1317,http://secondary:1317", } engine, err := CreateConsensusEngine(newTestBorChainConfig(), ethConfig, rawdb.NewMemoryDatabase(), nil) @@ -136,19 +104,14 @@ func TestCreateConsensusEngine_GRPCSecondaryFailover(t *testing.T) { borEngine, ok := engine.(*bor.Bor) require.True(t, ok, "Expected Bor consensus engine") - // Primary mock gets wrapped in FailoverHeimdallClient with gRPC secondary _, ok = borEngine.HeimdallClient.(*heimdall.FailoverHeimdallClient) require.True(t, ok, "Expected HeimdallClient to be wrapped in FailoverHeimdallClient") } -func TestCreateConsensusEngine_GRPCSecondaryError_FallsBackToHTTP(t *testing.T) { +func TestCreateConsensusEngine_SingleHeimdallURL(t *testing.T) { t.Parallel() - ethConfig := &Config{ - OverrideHeimdallClient: &mockHeimdallClient{}, - // Invalid scheme causes NewHeimdallGRPCClient to fail - HeimdallgRPCSecondaryAddress: 
"ftp://localhost:50051", - HeimdallSecondaryURL: "http://secondary:1317", + HeimdallURL: "http://primary:1317", } engine, err := CreateConsensusEngine(newTestBorChainConfig(), ethConfig, rawdb.NewMemoryDatabase(), nil) @@ -158,41 +121,28 @@ func TestCreateConsensusEngine_GRPCSecondaryError_FallsBackToHTTP(t *testing.T) borEngine, ok := engine.(*bor.Bor) require.True(t, ok, "Expected Bor consensus engine") - // gRPC secondary failed, but HTTP secondary kicks in + // Single URL should NOT produce a FailoverHeimdallClient _, ok = borEngine.HeimdallClient.(*heimdall.FailoverHeimdallClient) - require.True(t, ok, "Expected FailoverHeimdallClient with HTTP fallback after gRPC failure") + require.False(t, ok, "Expected no FailoverHeimdallClient for single URL") } -func TestCreateConsensusEngine_GRPCSecondaryError_NoHTTPFallback(t *testing.T) { +func TestCreateConsensusEngine_WithoutHeimdall(t *testing.T) { t.Parallel() - - ethConfig := &Config{ - OverrideHeimdallClient: &mockHeimdallClient{}, - // Invalid scheme causes NewHeimdallGRPCClient to fail - HeimdallgRPCSecondaryAddress: "ftp://localhost:50051", - // No HeimdallSecondaryURL — no fallback available - } + ethConfig := &Config{WithoutHeimdall: true} engine, err := CreateConsensusEngine(newTestBorChainConfig(), ethConfig, rawdb.NewMemoryDatabase(), nil) require.NoError(t, err) defer engine.Close() - borEngine, ok := engine.(*bor.Bor) + _, ok := engine.(*bor.Bor) require.True(t, ok, "Expected Bor consensus engine") - - // No secondary available, so no failover wrapper - _, ok = borEngine.HeimdallClient.(*heimdall.FailoverHeimdallClient) - require.False(t, ok, "Expected no FailoverHeimdallClient when both gRPC and HTTP secondary fail/absent") } -func TestCreateConsensusEngine_GRPCSecondaryUsesSecondaryHTTPURL(t *testing.T) { +func TestCreateConsensusEngine_CommaSeparatedGRPC(t *testing.T) { t.Parallel() - ethConfig := &Config{ - OverrideHeimdallClient: &mockHeimdallClient{}, - HeimdallURL: "http://primary:1317", - 
HeimdallSecondaryURL: "http://secondary:1317", - HeimdallgRPCSecondaryAddress: "localhost:50051", + HeimdallURL: "http://primary:1317,http://secondary:1317", + HeimdallgRPCAddress: "localhost:50051,localhost:50052", } engine, err := CreateConsensusEngine(newTestBorChainConfig(), ethConfig, rawdb.NewMemoryDatabase(), nil) @@ -202,19 +152,16 @@ func TestCreateConsensusEngine_GRPCSecondaryUsesSecondaryHTTPURL(t *testing.T) { borEngine, ok := engine.(*bor.Bor) require.True(t, ok, "Expected Bor consensus engine") - // gRPC secondary should be created successfully and wrap in failover. - // gRPC takes priority over HTTP secondary when both are available. _, ok = borEngine.HeimdallClient.(*heimdall.FailoverHeimdallClient) - require.True(t, ok, "Expected FailoverHeimdallClient (gRPC secondary takes priority over HTTP)") + require.True(t, ok, "Expected FailoverHeimdallClient with multiple gRPC endpoints") } -func TestCreateConsensusEngine_WSWithSecondary(t *testing.T) { +func TestCreateConsensusEngine_WSCommaSeparated(t *testing.T) { t.Parallel() ethConfig := &Config{ - OverrideHeimdallClient: &mockHeimdallClient{}, - HeimdallWSAddress: "ws://localhost:26657", - HeimdallWSSecondaryAddress: "ws://secondary:26657", + OverrideHeimdallClient: &mockHeimdallClient{}, + HeimdallWSAddress: "ws://localhost:26657,ws://secondary:26657", } engine, err := CreateConsensusEngine(newTestBorChainConfig(), ethConfig, rawdb.NewMemoryDatabase(), nil) @@ -224,7 +171,6 @@ func TestCreateConsensusEngine_WSWithSecondary(t *testing.T) { borEngine, ok := engine.(*bor.Bor) require.True(t, ok, "Expected Bor consensus engine") - // WS client should be created require.NotNil(t, borEngine.HeimdallWSClient, "Expected non-nil HeimdallWSClient") _, ok = borEngine.HeimdallWSClient.(*heimdallws.HeimdallWSClient) @@ -269,3 +215,31 @@ func TestCreateConsensusEngine_NoWSAddress(t *testing.T) { require.Nil(t, borEngine.HeimdallWSClient, "Expected nil HeimdallWSClient when no WS address configured") } + +func 
TestParseURLs(t *testing.T) { + t.Parallel() + + tests := []struct { + name string + input string + expected []string + }{ + {"empty string", "", nil}, + {"single URL", "http://localhost:1317", []string{"http://localhost:1317"}}, + {"two URLs", "http://a:1317,http://b:1317", []string{"http://a:1317", "http://b:1317"}}, + {"three URLs", "http://a:1317,http://b:1317,http://c:1317", []string{"http://a:1317", "http://b:1317", "http://c:1317"}}, + {"whitespace trimmed", " http://a:1317 , http://b:1317 ", []string{"http://a:1317", "http://b:1317"}}, + {"trailing comma", "http://a:1317,", []string{"http://a:1317"}}, + {"leading comma", ",http://a:1317", []string{"http://a:1317"}}, + {"empty entries filtered", "http://a:1317,,http://b:1317", []string{"http://a:1317", "http://b:1317"}}, + {"only commas", ",,,", nil}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + t.Parallel() + result := parseURLs(tt.input) + assert.Equal(t, tt.expected, result) + }) + } +} diff --git a/eth/ethconfig/gen_config.go b/eth/ethconfig/gen_config.go index 0c9d21e8a3..b1ba37d578 100644 --- a/eth/ethconfig/gen_config.go +++ b/eth/ethconfig/gen_config.go @@ -67,13 +67,10 @@ func (c Config) MarshalTOML() (interface{}, error) { RPCEVMTimeout time.Duration RPCTxFeeCap float64 HeimdallURL string - HeimdallSecondaryURL string HeimdallTimeout time.Duration WithoutHeimdall bool HeimdallgRPCAddress string - HeimdallgRPCSecondaryAddress string HeimdallWSAddress string - HeimdallWSSecondaryAddress string RunHeimdall bool RunHeimdallArgs string UseHeimdallApp bool @@ -139,13 +136,10 @@ func (c Config) MarshalTOML() (interface{}, error) { enc.RPCEVMTimeout = c.RPCEVMTimeout enc.RPCTxFeeCap = c.RPCTxFeeCap enc.HeimdallURL = c.HeimdallURL - enc.HeimdallSecondaryURL = c.HeimdallSecondaryURL enc.HeimdallTimeout = c.HeimdallTimeout enc.WithoutHeimdall = c.WithoutHeimdall enc.HeimdallgRPCAddress = c.HeimdallgRPCAddress - enc.HeimdallgRPCSecondaryAddress = c.HeimdallgRPCSecondaryAddress 
enc.HeimdallWSAddress = c.HeimdallWSAddress - enc.HeimdallWSSecondaryAddress = c.HeimdallWSSecondaryAddress enc.RunHeimdall = c.RunHeimdall enc.RunHeimdallArgs = c.RunHeimdallArgs enc.UseHeimdallApp = c.UseHeimdallApp @@ -219,13 +213,10 @@ func (c *Config) UnmarshalTOML(unmarshal func(interface{}) error) error { RPCEVMTimeout *time.Duration RPCTxFeeCap *float64 HeimdallURL *string - HeimdallSecondaryURL *string HeimdallTimeout *time.Duration WithoutHeimdall *bool HeimdallgRPCAddress *string - HeimdallgRPCSecondaryAddress *string HeimdallWSAddress *string - HeimdallWSSecondaryAddress *string RunHeimdall *bool RunHeimdallArgs *string UseHeimdallApp *bool @@ -382,9 +373,6 @@ func (c *Config) UnmarshalTOML(unmarshal func(interface{}) error) error { if dec.HeimdallURL != nil { c.HeimdallURL = *dec.HeimdallURL } - if dec.HeimdallSecondaryURL != nil { - c.HeimdallSecondaryURL = *dec.HeimdallSecondaryURL - } if dec.HeimdallTimeout != nil { c.HeimdallTimeout = *dec.HeimdallTimeout } @@ -394,15 +382,9 @@ func (c *Config) UnmarshalTOML(unmarshal func(interface{}) error) error { if dec.HeimdallgRPCAddress != nil { c.HeimdallgRPCAddress = *dec.HeimdallgRPCAddress } - if dec.HeimdallgRPCSecondaryAddress != nil { - c.HeimdallgRPCSecondaryAddress = *dec.HeimdallgRPCSecondaryAddress - } if dec.HeimdallWSAddress != nil { c.HeimdallWSAddress = *dec.HeimdallWSAddress } - if dec.HeimdallWSSecondaryAddress != nil { - c.HeimdallWSSecondaryAddress = *dec.HeimdallWSSecondaryAddress - } if dec.RunHeimdall != nil { c.RunHeimdall = *dec.RunHeimdall } diff --git a/internal/cli/server/config.go b/internal/cli/server/config.go index 9179754b51..40a68620a9 100644 --- a/internal/cli/server/config.go +++ b/internal/cli/server/config.go @@ -306,29 +306,20 @@ type P2PDiscovery struct { } type HeimdallConfig struct { - // URL is the url of the heimdall server + // URL is the url of the heimdall server (comma-separated for failover: "url1,url2,url3") URL string `hcl:"url,optional" toml:"url,optional"` 
- // SecondaryURL is the url of a secondary heimdall server used for failover - SecondaryURL string `hcl:"secondary-url,optional" toml:"secondary-url,optional"` - Timeout time.Duration `hcl:"timeout,optional" toml:"timeout,optional"` // Without is used to disable remote heimdall during testing Without bool `hcl:"bor.without,optional" toml:"bor.without,optional"` - // GRPCAddress is the address of the heimdall grpc server + // GRPCAddress is the address of the heimdall grpc server (comma-separated for failover: "addr1,addr2") GRPCAddress string `hcl:"grpc-address,optional" toml:"grpc-address,optional"` - // GRPCSecondaryAddress is the address of a secondary heimdall grpc server for failover - GRPCSecondaryAddress string `hcl:"grpc-secondary-address,optional" toml:"grpc-secondary-address,optional"` - - // WSAddress is the address of the heimdall ws subscription server + // WSAddress is the address of the heimdall ws subscription server (comma-separated for failover: "addr1,addr2") WSAddress string `hcl:"ws-address,optional" toml:"ws-address,optional"` - // WSSecondaryAddress is the address of a secondary heimdall ws subscription server for failover - WSSecondaryAddress string `hcl:"ws-secondary-address,optional" toml:"ws-secondary-address,optional"` - // RunHeimdall is used to run heimdall as a child process RunHeimdall bool `hcl:"bor.runheimdall,optional" toml:"bor.runheimdall,optional"` @@ -822,14 +813,11 @@ func DefaultConfig() *Config { }, }, Heimdall: &HeimdallConfig{ - URL: "http://localhost:1317", - SecondaryURL: "", - Timeout: 5 * time.Second, - Without: false, - GRPCAddress: "", - GRPCSecondaryAddress: "", - WSAddress: "", - WSSecondaryAddress: "", + URL: "http://localhost:1317", + Timeout: 5 * time.Second, + Without: false, + GRPCAddress: "", + WSAddress: "", }, SyncMode: "full", GcMode: "full", @@ -1165,13 +1153,10 @@ func (c *Config) buildEth(stack *node.Node, accountManager *accounts.Manager) (* } n.HeimdallURL = c.Heimdall.URL - n.HeimdallSecondaryURL = 
c.Heimdall.SecondaryURL n.HeimdallTimeout = c.Heimdall.Timeout n.WithoutHeimdall = c.Heimdall.Without n.HeimdallgRPCAddress = c.Heimdall.GRPCAddress - n.HeimdallgRPCSecondaryAddress = c.Heimdall.GRPCSecondaryAddress n.HeimdallWSAddress = c.Heimdall.WSAddress - n.HeimdallWSSecondaryAddress = c.Heimdall.WSSecondaryAddress n.RunHeimdall = c.Heimdall.RunHeimdall n.RunHeimdallArgs = c.Heimdall.RunHeimdallArgs n.UseHeimdallApp = c.Heimdall.UseHeimdallApp diff --git a/internal/cli/server/flags.go b/internal/cli/server/flags.go index 70c15360ce..28dada05d8 100644 --- a/internal/cli/server/flags.go +++ b/internal/cli/server/flags.go @@ -175,16 +175,10 @@ func (c *Command) Flags(config *Config) *flagset.Flagset { // heimdall f.StringFlag(&flagset.StringFlag{ Name: "bor.heimdall", - Usage: "URL of Heimdall service", + Usage: "URL of Heimdall service (comma-separated for failover: \"url1,url2\")", Value: &c.cliConfig.Heimdall.URL, Default: c.cliConfig.Heimdall.URL, }) - f.StringFlag(&flagset.StringFlag{ - Name: "bor.heimdall.secondary", - Usage: "URL of a secondary Heimdall service for failover", - Value: &c.cliConfig.Heimdall.SecondaryURL, - Default: c.cliConfig.Heimdall.SecondaryURL, - }) f.DurationFlag(&flagset.DurationFlag{ Name: "bor.heimdalltimeout", Usage: "Timeout period for bor's outgoing requests to heimdall", @@ -205,28 +199,16 @@ func (c *Command) Flags(config *Config) *flagset.Flagset { }) f.StringFlag(&flagset.StringFlag{ Name: "bor.heimdallgRPC", - Usage: "Address of Heimdall gRPC service", + Usage: "Address of Heimdall gRPC service (comma-separated for failover: \"addr1,addr2\")", Value: &c.cliConfig.Heimdall.GRPCAddress, Default: c.cliConfig.Heimdall.GRPCAddress, }) - f.StringFlag(&flagset.StringFlag{ - Name: "bor.heimdallgRPC.secondary", - Usage: "Address of a secondary Heimdall gRPC service for failover", - Value: &c.cliConfig.Heimdall.GRPCSecondaryAddress, - Default: c.cliConfig.Heimdall.GRPCSecondaryAddress, - }) f.StringFlag(&flagset.StringFlag{ Name: 
"bor.heimdallWS", - Usage: "Address of Heimdall ws subscription service", + Usage: "Address of Heimdall WS subscription service (comma-separated for failover: \"addr1,addr2\")", Value: &c.cliConfig.Heimdall.WSAddress, Default: c.cliConfig.Heimdall.WSAddress, }) - f.StringFlag(&flagset.StringFlag{ - Name: "bor.heimdallWS.secondary", - Usage: "Address of a secondary Heimdall WS subscription service for failover", - Value: &c.cliConfig.Heimdall.WSSecondaryAddress, - Default: c.cliConfig.Heimdall.WSSecondaryAddress, - }) f.BoolFlag(&flagset.BoolFlag{ Name: "bor.runheimdall", Usage: "Run Heimdall service as a child process", diff --git a/internal/cli/server/testdata/default.toml b/internal/cli/server/testdata/default.toml index 7cbab6628f..c3213e2633 100644 --- a/internal/cli/server/testdata/default.toml +++ b/internal/cli/server/testdata/default.toml @@ -50,12 +50,9 @@ devfakeauthor = false [heimdall] url = "http://localhost:1317" - secondary-url = "" "bor.without" = false grpc-address = "" - grpc-secondary-address = "" ws-address = "" - ws-secondary-address = "" "bor.runheimdall" = false "bor.runheimdallargs" = "" "bor.useheimdallapp" = false From 4d44077da49767a9ceded299c881c55651bda2dd Mon Sep 17 00:00:00 2001 From: Pratik Patil Date: Thu, 12 Feb 2026 17:35:27 +0530 Subject: [PATCH 10/29] code duplication fix --- eth/ethconfig/config_test.go | 53 ++++++++++++++++-------------------- 1 file changed, 23 insertions(+), 30 deletions(-) diff --git a/eth/ethconfig/config_test.go b/eth/ethconfig/config_test.go index 9fd4e5785e..0b53563739 100644 --- a/eth/ethconfig/config_test.go +++ b/eth/ethconfig/config_test.go @@ -156,46 +156,39 @@ func TestCreateConsensusEngine_CommaSeparatedGRPC(t *testing.T) { require.True(t, ok, "Expected FailoverHeimdallClient with multiple gRPC endpoints") } -func TestCreateConsensusEngine_WSCommaSeparated(t *testing.T) { +func TestCreateConsensusEngine_WSAddress(t *testing.T) { t.Parallel() - ethConfig := &Config{ - OverrideHeimdallClient: 
&mockHeimdallClient{}, - HeimdallWSAddress: "ws://localhost:26657,ws://secondary:26657", + tests := []struct { + name string + addr string + }{ + {"comma-separated", "ws://localhost:26657,ws://secondary:26657"}, + {"primary only", "ws://localhost:26657"}, } - engine, err := CreateConsensusEngine(newTestBorChainConfig(), ethConfig, rawdb.NewMemoryDatabase(), nil) - require.NoError(t, err) - defer engine.Close() + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + t.Parallel() - borEngine, ok := engine.(*bor.Bor) - require.True(t, ok, "Expected Bor consensus engine") + ethConfig := &Config{ + OverrideHeimdallClient: &mockHeimdallClient{}, + HeimdallWSAddress: tt.addr, + } - require.NotNil(t, borEngine.HeimdallWSClient, "Expected non-nil HeimdallWSClient") + engine, err := CreateConsensusEngine(newTestBorChainConfig(), ethConfig, rawdb.NewMemoryDatabase(), nil) + require.NoError(t, err) + defer engine.Close() - _, ok = borEngine.HeimdallWSClient.(*heimdallws.HeimdallWSClient) - require.True(t, ok, "Expected HeimdallWSClient type") -} + borEngine, ok := engine.(*bor.Bor) + require.True(t, ok, "Expected Bor consensus engine") -func TestCreateConsensusEngine_WSPrimaryOnly(t *testing.T) { - t.Parallel() + require.NotNil(t, borEngine.HeimdallWSClient, "Expected non-nil HeimdallWSClient") - ethConfig := &Config{ - OverrideHeimdallClient: &mockHeimdallClient{}, - HeimdallWSAddress: "ws://localhost:26657", + _, ok = borEngine.HeimdallWSClient.(*heimdallws.HeimdallWSClient) + require.True(t, ok, "Expected HeimdallWSClient type") + }) } - - engine, err := CreateConsensusEngine(newTestBorChainConfig(), ethConfig, rawdb.NewMemoryDatabase(), nil) - require.NoError(t, err) - defer engine.Close() - - borEngine, ok := engine.(*bor.Bor) - require.True(t, ok, "Expected Bor consensus engine") - - require.NotNil(t, borEngine.HeimdallWSClient, "Expected non-nil HeimdallWSClient") - - _, ok = borEngine.HeimdallWSClient.(*heimdallws.HeimdallWSClient) - require.True(t, ok, 
"Expected HeimdallWSClient type") } func TestCreateConsensusEngine_NoWSAddress(t *testing.T) { From de26840f68f9da68407186083b89e5785a4e2953 Mon Sep 17 00:00:00 2001 From: Pratik Patil Date: Thu, 12 Feb 2026 19:48:03 +0530 Subject: [PATCH 11/29] added mode tests --- .../bor/heimdall/failover_client_test.go | 132 ++++++++++++++++++ eth/ethconfig/config_test.go | 47 +++++++ 2 files changed, 179 insertions(+) diff --git a/consensus/bor/heimdall/failover_client_test.go b/consensus/bor/heimdall/failover_client_test.go index 6fad4ff745..73b40fd7cd 100644 --- a/consensus/bor/heimdall/failover_client_test.go +++ b/consensus/bor/heimdall/failover_client_test.go @@ -686,6 +686,138 @@ func TestFailover_ThreeClients_ProbeBackToPrimary(t *testing.T) { assert.Equal(t, tertiaryBefore, tertiary.hits.Load(), "should be back on primary now") } +// Tests for the shouldProbe path (lines 156-161): probe primary fails with +// failover error, then current (non-primary) client also fails. +func TestFailover_ProbeCurrentNonFailoverError(t *testing.T) { + // Probe primary → failover error, current (secondary) → non-failover error. + // Should return the non-failover error without cascading to tertiary. + primary := &mockHeimdallClient{ + getSpanFn: func(_ context.Context, _ uint64) (*types.Span, error) { + return nil, &net.OpError{Op: "dial", Net: "tcp", Err: errors.New("connection refused")} + }, + } + secondary := &mockHeimdallClient{ + getSpanFn: func(_ context.Context, _ uint64) (*types.Span, error) { + return nil, ErrShutdownDetected + }, + } + tertiary := &mockHeimdallClient{} + + fc := NewFailoverHeimdallClient(primary, secondary, tertiary) + fc.attemptTimeout = 100 * time.Millisecond + fc.cooldown = 50 * time.Millisecond + defer fc.Close() + + // Force onto secondary with cooldown elapsed so probe triggers. 
+ fc.mu.Lock() + fc.active = 1 + fc.lastSwitch = time.Now().Add(-time.Hour) + fc.mu.Unlock() + + _, err := fc.GetSpan(context.Background(), 1) + require.Error(t, err) + assert.True(t, errors.Is(err, ErrShutdownDetected)) + assert.Equal(t, int32(0), tertiary.hits.Load(), "should not cascade to tertiary on non-failover error") +} + +func TestFailover_ProbeCurrentFailoverError_CascadesToNext(t *testing.T) { + // Probe primary → failover error, current (secondary) → failover error. + // Should cascade to tertiary. + connErr := &net.OpError{Op: "dial", Net: "tcp", Err: errors.New("connection refused")} + + primary := &mockHeimdallClient{ + getSpanFn: func(_ context.Context, _ uint64) (*types.Span, error) { return nil, connErr }, + } + secondary := &mockHeimdallClient{ + getSpanFn: func(_ context.Context, _ uint64) (*types.Span, error) { return nil, connErr }, + } + tertiary := &mockHeimdallClient{} + + fc := NewFailoverHeimdallClient(primary, secondary, tertiary) + fc.attemptTimeout = 100 * time.Millisecond + fc.cooldown = 50 * time.Millisecond + defer fc.Close() + + // Force onto secondary with cooldown elapsed so probe triggers. + fc.mu.Lock() + fc.active = 1 + fc.lastSwitch = time.Now().Add(-time.Hour) + fc.mu.Unlock() + + span, err := fc.GetSpan(context.Background(), 1) + require.NoError(t, err) + require.NotNil(t, span) + assert.Equal(t, int32(1), tertiary.hits.Load(), "should cascade to tertiary") + + fc.mu.Lock() + assert.Equal(t, 2, fc.active, "active should switch to tertiary") + fc.mu.Unlock() +} + +// Tests for the active != 0 no-probe path (lines 171-176): on a non-primary +// client with cooldown not elapsed, the current client fails. +func TestFailover_StickyNonFailoverError(t *testing.T) { + // Sticky on secondary (cooldown not elapsed), secondary returns non-failover error. + // Should return error without cascading to tertiary. 
+ primary := &mockHeimdallClient{} + secondary := &mockHeimdallClient{ + getSpanFn: func(_ context.Context, _ uint64) (*types.Span, error) { + return nil, ErrShutdownDetected + }, + } + tertiary := &mockHeimdallClient{} + + fc := NewFailoverHeimdallClient(primary, secondary, tertiary) + fc.attemptTimeout = 100 * time.Millisecond + fc.cooldown = 1 * time.Hour // very long — no probe + defer fc.Close() + + // Force onto secondary with recent switch (cooldown not elapsed). + fc.mu.Lock() + fc.active = 1 + fc.lastSwitch = time.Now() + fc.mu.Unlock() + + _, err := fc.GetSpan(context.Background(), 1) + require.Error(t, err) + assert.True(t, errors.Is(err, ErrShutdownDetected)) + assert.Equal(t, int32(0), primary.hits.Load(), "should not probe primary") + assert.Equal(t, int32(0), tertiary.hits.Load(), "should not cascade to tertiary on non-failover error") +} + +func TestFailover_StickyFailoverError_CascadesToNext(t *testing.T) { + // Sticky on secondary (cooldown not elapsed), secondary returns failover error. + // Should cascade to tertiary. + connErr := &net.OpError{Op: "dial", Net: "tcp", Err: errors.New("connection refused")} + + primary := &mockHeimdallClient{} + secondary := &mockHeimdallClient{ + getSpanFn: func(_ context.Context, _ uint64) (*types.Span, error) { return nil, connErr }, + } + tertiary := &mockHeimdallClient{} + + fc := NewFailoverHeimdallClient(primary, secondary, tertiary) + fc.attemptTimeout = 100 * time.Millisecond + fc.cooldown = 1 * time.Hour // very long — no probe + defer fc.Close() + + // Force onto secondary with recent switch (cooldown not elapsed). 
+ fc.mu.Lock() + fc.active = 1 + fc.lastSwitch = time.Now() + fc.mu.Unlock() + + span, err := fc.GetSpan(context.Background(), 1) + require.NoError(t, err) + require.NotNil(t, span) + assert.Equal(t, int32(0), primary.hits.Load(), "should not probe primary") + assert.Equal(t, int32(1), tertiary.hits.Load(), "should cascade to tertiary") + + fc.mu.Lock() + assert.Equal(t, 2, fc.active, "active should switch to tertiary") + fc.mu.Unlock() +} + func TestFailover_ClosesAllClients(t *testing.T) { var closed [3]atomic.Bool diff --git a/eth/ethconfig/config_test.go b/eth/ethconfig/config_test.go index 0b53563739..0d0f00c9e7 100644 --- a/eth/ethconfig/config_test.go +++ b/eth/ethconfig/config_test.go @@ -156,6 +156,53 @@ func TestCreateConsensusEngine_CommaSeparatedGRPC(t *testing.T) { require.True(t, ok, "Expected FailoverHeimdallClient with multiple gRPC endpoints") } +func TestCreateConsensusEngine_GRPCInitFailsFallsBackToHTTP(t *testing.T) { + t.Parallel() + + t.Run("with HTTP URL available", func(t *testing.T) { + t.Parallel() + + // gRPC uses unsupported scheme → NewHeimdallGRPCClient fails. + // Fallback appends HTTP client for httpURLs[0]; httpURLs[1] also + // gets an HTTP client via the else-if branch → 2 clients → failover. + ethConfig := &Config{ + HeimdallURL: "http://a:1317,http://b:1317", + HeimdallgRPCAddress: "ftp://invalid:50051", + } + + engine, err := CreateConsensusEngine(newTestBorChainConfig(), ethConfig, rawdb.NewMemoryDatabase(), nil) + require.NoError(t, err) + defer engine.Close() + + borEngine, ok := engine.(*bor.Bor) + require.True(t, ok, "Expected Bor consensus engine") + + _, ok = borEngine.HeimdallClient.(*heimdall.FailoverHeimdallClient) + require.True(t, ok, "Expected FailoverHeimdallClient after gRPC fallback to HTTP") + }) + + t.Run("without HTTP URL at that index", func(t *testing.T) { + t.Parallel() + + // gRPC[0] succeeds (localhost is allowed), gRPC[1] fails (bad scheme). 
+ // i=1 >= len(httpURLs)=1 so no HTTP fallback is added → only 1 client. + ethConfig := &Config{ + HeimdallURL: "http://a:1317", + HeimdallgRPCAddress: "localhost:50051,ftp://invalid:50052", + } + + engine, err := CreateConsensusEngine(newTestBorChainConfig(), ethConfig, rawdb.NewMemoryDatabase(), nil) + require.NoError(t, err) + defer engine.Close() + + borEngine, ok := engine.(*bor.Bor) + require.True(t, ok, "Expected Bor consensus engine") + + _, ok = borEngine.HeimdallClient.(*heimdall.FailoverHeimdallClient) + require.False(t, ok, "Expected no FailoverHeimdallClient when second gRPC fails with no HTTP fallback") + }) +} + func TestCreateConsensusEngine_WSAddress(t *testing.T) { t.Parallel() From 27f53b92c52b06f9b3853a68c1daba2c646279af Mon Sep 17 00:00:00 2001 From: Pratik Patil Date: Thu, 12 Feb 2026 19:51:53 +0530 Subject: [PATCH 12/29] code duplication fix --- eth/ethconfig/config_test.go | 87 +++++++++++++++++++----------------- 1 file changed, 45 insertions(+), 42 deletions(-) diff --git a/eth/ethconfig/config_test.go b/eth/ethconfig/config_test.go index 0d0f00c9e7..760e7a381b 100644 --- a/eth/ethconfig/config_test.go +++ b/eth/ethconfig/config_test.go @@ -159,48 +159,51 @@ func TestCreateConsensusEngine_CommaSeparatedGRPC(t *testing.T) { func TestCreateConsensusEngine_GRPCInitFailsFallsBackToHTTP(t *testing.T) { t.Parallel() - t.Run("with HTTP URL available", func(t *testing.T) { - t.Parallel() - - // gRPC uses unsupported scheme → NewHeimdallGRPCClient fails. - // Fallback appends HTTP client for httpURLs[0]; httpURLs[1] also - // gets an HTTP client via the else-if branch → 2 clients → failover. 
- ethConfig := &Config{ - HeimdallURL: "http://a:1317,http://b:1317", - HeimdallgRPCAddress: "ftp://invalid:50051", - } - - engine, err := CreateConsensusEngine(newTestBorChainConfig(), ethConfig, rawdb.NewMemoryDatabase(), nil) - require.NoError(t, err) - defer engine.Close() - - borEngine, ok := engine.(*bor.Bor) - require.True(t, ok, "Expected Bor consensus engine") - - _, ok = borEngine.HeimdallClient.(*heimdall.FailoverHeimdallClient) - require.True(t, ok, "Expected FailoverHeimdallClient after gRPC fallback to HTTP") - }) - - t.Run("without HTTP URL at that index", func(t *testing.T) { - t.Parallel() - - // gRPC[0] succeeds (localhost is allowed), gRPC[1] fails (bad scheme). - // i=1 >= len(httpURLs)=1 so no HTTP fallback is added → only 1 client. - ethConfig := &Config{ - HeimdallURL: "http://a:1317", - HeimdallgRPCAddress: "localhost:50051,ftp://invalid:50052", - } - - engine, err := CreateConsensusEngine(newTestBorChainConfig(), ethConfig, rawdb.NewMemoryDatabase(), nil) - require.NoError(t, err) - defer engine.Close() - - borEngine, ok := engine.(*bor.Bor) - require.True(t, ok, "Expected Bor consensus engine") - - _, ok = borEngine.HeimdallClient.(*heimdall.FailoverHeimdallClient) - require.False(t, ok, "Expected no FailoverHeimdallClient when second gRPC fails with no HTTP fallback") - }) + tests := []struct { + name string + heimdallURL string + grpcAddress string + expectFailover bool + }{ + { + // gRPC uses unsupported scheme → NewHeimdallGRPCClient fails. + // Fallback appends HTTP client for httpURLs[0]; httpURLs[1] also + // gets an HTTP client via the else-if branch → 2 clients → failover. + name: "with HTTP URL available", + heimdallURL: "http://a:1317,http://b:1317", + grpcAddress: "ftp://invalid:50051", + expectFailover: true, + }, + { + // gRPC[0] succeeds (localhost is allowed), gRPC[1] fails (bad scheme). + // i=1 >= len(httpURLs)=1 so no HTTP fallback is added → only 1 client. 
+ name: "without HTTP URL at that index", + heimdallURL: "http://a:1317", + grpcAddress: "localhost:50051,ftp://invalid:50052", + expectFailover: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + t.Parallel() + + ethConfig := &Config{ + HeimdallURL: tt.heimdallURL, + HeimdallgRPCAddress: tt.grpcAddress, + } + + engine, err := CreateConsensusEngine(newTestBorChainConfig(), ethConfig, rawdb.NewMemoryDatabase(), nil) + require.NoError(t, err) + defer engine.Close() + + borEngine, ok := engine.(*bor.Bor) + require.True(t, ok, "Expected Bor consensus engine") + + _, ok = borEngine.HeimdallClient.(*heimdall.FailoverHeimdallClient) + require.Equal(t, tt.expectFailover, ok) + }) + } } func TestCreateConsensusEngine_WSAddress(t *testing.T) { From 6cc879a686385396a37e00017c796a38567852c4 Mon Sep 17 00:00:00 2001 From: Pratik Patil Date: Fri, 13 Feb 2026 09:48:27 +0530 Subject: [PATCH 13/29] addressed comment: rename FailoverHeimdallClient to MultiHeimdallClient --- consensus/bor/heimdall/failover_client.go | 32 +++++------ .../bor/heimdall/failover_client_test.go | 54 +++++++++---------- eth/ethconfig/config.go | 2 +- eth/ethconfig/config_test.go | 16 +++--- 4 files changed, 52 insertions(+), 52 deletions(-) diff --git a/consensus/bor/heimdall/failover_client.go b/consensus/bor/heimdall/failover_client.go index 4e6efa7082..3921df67a1 100644 --- a/consensus/bor/heimdall/failover_client.go +++ b/consensus/bor/heimdall/failover_client.go @@ -22,7 +22,7 @@ const ( ) // Endpoint matches bor.IHeimdallClient. It is exported so that external -// packages can build []Endpoint slices for NewFailoverHeimdallClient without +// packages can build []Endpoint slices for NewMultiHeimdallClient without // running into Go's covariant-slice restriction. 
type Endpoint interface { StateSyncEvents(ctx context.Context, fromID uint64, to int64) ([]*clerk.EventRecordWithTime, error) @@ -36,10 +36,10 @@ type Endpoint interface { Close() } -// FailoverHeimdallClient wraps N heimdall clients (primary at index 0, failovers +// MultiHeimdallClient wraps N heimdall clients (primary at index 0, failovers // at 1..N-1) and transparently cascades through them when the active client is // unreachable. After a cooldown period it probes the primary again. -type FailoverHeimdallClient struct { +type MultiHeimdallClient struct { clients []Endpoint mu sync.Mutex active int // 0 = primary, >0 = failover @@ -48,63 +48,63 @@ type FailoverHeimdallClient struct { cooldown time.Duration } -func NewFailoverHeimdallClient(clients ...Endpoint) *FailoverHeimdallClient { - return &FailoverHeimdallClient{ +func NewMultiHeimdallClient(clients ...Endpoint) *MultiHeimdallClient { + return &MultiHeimdallClient{ clients: clients, attemptTimeout: defaultAttemptTimeout, cooldown: defaultSecondaryCooldown, } } -func (f *FailoverHeimdallClient) StateSyncEvents(ctx context.Context, fromID uint64, to int64) ([]*clerk.EventRecordWithTime, error) { +func (f *MultiHeimdallClient) StateSyncEvents(ctx context.Context, fromID uint64, to int64) ([]*clerk.EventRecordWithTime, error) { return callWithFailover(f, ctx, func(ctx context.Context, c Endpoint) ([]*clerk.EventRecordWithTime, error) { return c.StateSyncEvents(ctx, fromID, to) }) } -func (f *FailoverHeimdallClient) GetSpan(ctx context.Context, spanID uint64) (*types.Span, error) { +func (f *MultiHeimdallClient) GetSpan(ctx context.Context, spanID uint64) (*types.Span, error) { return callWithFailover(f, ctx, func(ctx context.Context, c Endpoint) (*types.Span, error) { return c.GetSpan(ctx, spanID) }) } -func (f *FailoverHeimdallClient) GetLatestSpan(ctx context.Context) (*types.Span, error) { +func (f *MultiHeimdallClient) GetLatestSpan(ctx context.Context) (*types.Span, error) { return callWithFailover(f, 
ctx, func(ctx context.Context, c Endpoint) (*types.Span, error) { return c.GetLatestSpan(ctx) }) } -func (f *FailoverHeimdallClient) FetchCheckpoint(ctx context.Context, number int64) (*checkpoint.Checkpoint, error) { +func (f *MultiHeimdallClient) FetchCheckpoint(ctx context.Context, number int64) (*checkpoint.Checkpoint, error) { return callWithFailover(f, ctx, func(ctx context.Context, c Endpoint) (*checkpoint.Checkpoint, error) { return c.FetchCheckpoint(ctx, number) }) } -func (f *FailoverHeimdallClient) FetchCheckpointCount(ctx context.Context) (int64, error) { +func (f *MultiHeimdallClient) FetchCheckpointCount(ctx context.Context) (int64, error) { return callWithFailover(f, ctx, func(ctx context.Context, c Endpoint) (int64, error) { return c.FetchCheckpointCount(ctx) }) } -func (f *FailoverHeimdallClient) FetchMilestone(ctx context.Context) (*milestone.Milestone, error) { +func (f *MultiHeimdallClient) FetchMilestone(ctx context.Context) (*milestone.Milestone, error) { return callWithFailover(f, ctx, func(ctx context.Context, c Endpoint) (*milestone.Milestone, error) { return c.FetchMilestone(ctx) }) } -func (f *FailoverHeimdallClient) FetchMilestoneCount(ctx context.Context) (int64, error) { +func (f *MultiHeimdallClient) FetchMilestoneCount(ctx context.Context) (int64, error) { return callWithFailover(f, ctx, func(ctx context.Context, c Endpoint) (int64, error) { return c.FetchMilestoneCount(ctx) }) } -func (f *FailoverHeimdallClient) FetchStatus(ctx context.Context) (*ctypes.SyncInfo, error) { +func (f *MultiHeimdallClient) FetchStatus(ctx context.Context) (*ctypes.SyncInfo, error) { return callWithFailover(f, ctx, func(ctx context.Context, c Endpoint) (*ctypes.SyncInfo, error) { return c.FetchStatus(ctx) }) } -func (f *FailoverHeimdallClient) Close() { +func (f *MultiHeimdallClient) Close() { for _, c := range f.clients { c.Close() } @@ -113,7 +113,7 @@ func (f *FailoverHeimdallClient) Close() { // callWithFailover executes fn against the active client. 
If the active client // fails with a failover-eligible error, it cascades through remaining clients. // If on a non-primary client past the cooldown, it probes the primary first. -func callWithFailover[T any](f *FailoverHeimdallClient, ctx context.Context, fn func(context.Context, Endpoint) (T, error)) (T, error) { +func callWithFailover[T any](f *MultiHeimdallClient, ctx context.Context, fn func(context.Context, Endpoint) (T, error)) (T, error) { f.mu.Lock() active := f.active shouldProbe := active != 0 && time.Since(f.lastSwitch) >= f.cooldown @@ -198,7 +198,7 @@ func callWithFailover[T any](f *FailoverHeimdallClient, ctx context.Context, fn // cascadeClients tries clients after the given index. On first success it // switches the active client and returns. If all fail, returns the last error. -func cascadeClients[T any](f *FailoverHeimdallClient, ctx context.Context, fn func(context.Context, Endpoint) (T, error), after int, lastErr error) (T, error) { +func cascadeClients[T any](f *MultiHeimdallClient, ctx context.Context, fn func(context.Context, Endpoint) (T, error), after int, lastErr error) (T, error) { for i := after + 1; i < len(f.clients); i++ { result, err := fn(ctx, f.clients[i]) if err == nil { diff --git a/consensus/bor/heimdall/failover_client_test.go b/consensus/bor/heimdall/failover_client_test.go index 73b40fd7cd..0a2c0e88c7 100644 --- a/consensus/bor/heimdall/failover_client_test.go +++ b/consensus/bor/heimdall/failover_client_test.go @@ -130,7 +130,7 @@ func TestFailover_SwitchOnPrimaryDown(t *testing.T) { } secondary := &mockHeimdallClient{} - fc := NewFailoverHeimdallClient(primary, secondary) + fc := NewMultiHeimdallClient(primary, secondary) fc.attemptTimeout = 100 * time.Millisecond defer fc.Close() @@ -152,7 +152,7 @@ func TestFailover_NoSwitchOnContextCanceled(t *testing.T) { } secondary := &mockHeimdallClient{} - fc := NewFailoverHeimdallClient(primary, secondary) + fc := NewMultiHeimdallClient(primary, secondary) fc.attemptTimeout = 5 * 
time.Second // longer than caller's ctx defer fc.Close() @@ -172,7 +172,7 @@ func TestFailover_NoSwitchOnServiceUnavailable(t *testing.T) { } secondary := &mockHeimdallClient{} - fc := NewFailoverHeimdallClient(primary, secondary) + fc := NewMultiHeimdallClient(primary, secondary) fc.attemptTimeout = 100 * time.Millisecond defer fc.Close() @@ -190,7 +190,7 @@ func TestFailover_NoSwitchOnShutdownDetected(t *testing.T) { } secondary := &mockHeimdallClient{} - fc := NewFailoverHeimdallClient(primary, secondary) + fc := NewMultiHeimdallClient(primary, secondary) fc.attemptTimeout = 100 * time.Millisecond defer fc.Close() @@ -208,7 +208,7 @@ func TestFailover_StickyBehavior(t *testing.T) { } secondary := &mockHeimdallClient{} - fc := NewFailoverHeimdallClient(primary, secondary) + fc := NewMultiHeimdallClient(primary, secondary) fc.attemptTimeout = 100 * time.Millisecond fc.cooldown = 1 * time.Hour // very long cooldown defer fc.Close() @@ -244,7 +244,7 @@ func TestFailover_ProbeBackToPrimary(t *testing.T) { } secondary := &mockHeimdallClient{} - fc := NewFailoverHeimdallClient(primary, secondary) + fc := NewMultiHeimdallClient(primary, secondary) fc.attemptTimeout = 100 * time.Millisecond fc.cooldown = 50 * time.Millisecond defer fc.Close() @@ -281,7 +281,7 @@ func TestFailover_ProbeBackFails(t *testing.T) { } secondary := &mockHeimdallClient{} - fc := NewFailoverHeimdallClient(primary, secondary) + fc := NewMultiHeimdallClient(primary, secondary) fc.attemptTimeout = 100 * time.Millisecond fc.cooldown = 50 * time.Millisecond defer fc.Close() @@ -306,7 +306,7 @@ func TestFailover_ClosesBothClients(t *testing.T) { primary := &mockHeimdallClient{closeFn: func() { primaryClosed.Store(true) }} secondary := &mockHeimdallClient{closeFn: func() { secondaryClosed.Store(true) }} - fc := NewFailoverHeimdallClient(primary, secondary) + fc := NewMultiHeimdallClient(primary, secondary) fc.Close() assert.True(t, primaryClosed.Load(), "primary should be closed") @@ -317,7 +317,7 @@ 
func TestFailover_PassthroughWhenPrimaryHealthy(t *testing.T) { primary := &mockHeimdallClient{} secondary := &mockHeimdallClient{} - fc := NewFailoverHeimdallClient(primary, secondary) + fc := NewMultiHeimdallClient(primary, secondary) fc.attemptTimeout = 5 * time.Second defer fc.Close() @@ -345,7 +345,7 @@ func TestFailover_Integration_ServiceUnavailable(t *testing.T) { primaryClient := NewHeimdallClient(primary.URL, 5*time.Second) secondaryClient := NewHeimdallClient(secondary.URL, 5*time.Second) - fc := NewFailoverHeimdallClient(primaryClient, secondaryClient) + fc := NewMultiHeimdallClient(primaryClient, secondaryClient) fc.attemptTimeout = 2 * time.Second defer fc.Close() @@ -369,7 +369,7 @@ func TestFailover_StateSyncEvents(t *testing.T) { }, } - fc := NewFailoverHeimdallClient(primary, secondary) + fc := NewMultiHeimdallClient(primary, secondary) fc.attemptTimeout = 100 * time.Millisecond defer fc.Close() @@ -392,7 +392,7 @@ func TestFailover_GetLatestSpan(t *testing.T) { }, } - fc := NewFailoverHeimdallClient(primary, secondary) + fc := NewMultiHeimdallClient(primary, secondary) fc.attemptTimeout = 100 * time.Millisecond defer fc.Close() @@ -410,7 +410,7 @@ func TestFailover_FetchCheckpoint(t *testing.T) { } secondary := &mockHeimdallClient{} - fc := NewFailoverHeimdallClient(primary, secondary) + fc := NewMultiHeimdallClient(primary, secondary) fc.attemptTimeout = 100 * time.Millisecond defer fc.Close() @@ -428,7 +428,7 @@ func TestFailover_FetchCheckpointCount(t *testing.T) { } secondary := &mockHeimdallClient{} - fc := NewFailoverHeimdallClient(primary, secondary) + fc := NewMultiHeimdallClient(primary, secondary) fc.attemptTimeout = 100 * time.Millisecond defer fc.Close() @@ -446,7 +446,7 @@ func TestFailover_FetchMilestone(t *testing.T) { } secondary := &mockHeimdallClient{} - fc := NewFailoverHeimdallClient(primary, secondary) + fc := NewMultiHeimdallClient(primary, secondary) fc.attemptTimeout = 100 * time.Millisecond defer fc.Close() @@ -464,7 
+464,7 @@ func TestFailover_FetchMilestoneCount(t *testing.T) { } secondary := &mockHeimdallClient{} - fc := NewFailoverHeimdallClient(primary, secondary) + fc := NewMultiHeimdallClient(primary, secondary) fc.attemptTimeout = 100 * time.Millisecond defer fc.Close() @@ -482,7 +482,7 @@ func TestFailover_FetchStatus(t *testing.T) { } secondary := &mockHeimdallClient{} - fc := NewFailoverHeimdallClient(primary, secondary) + fc := NewMultiHeimdallClient(primary, secondary) fc.attemptTimeout = 100 * time.Millisecond defer fc.Close() @@ -500,7 +500,7 @@ func TestFailover_ProbeBackNonFailoverError(t *testing.T) { } secondary := &mockHeimdallClient{} - fc := NewFailoverHeimdallClient(primary, secondary) + fc := NewMultiHeimdallClient(primary, secondary) fc.attemptTimeout = 100 * time.Millisecond fc.cooldown = 50 * time.Millisecond defer fc.Close() @@ -545,7 +545,7 @@ func TestFailover_SwitchOnPrimarySubContextError(t *testing.T) { primary := &mockHeimdallClient{getSpanFn: tt.primaryFn} secondary := &mockHeimdallClient{} - fc := NewFailoverHeimdallClient(primary, secondary) + fc := NewMultiHeimdallClient(primary, secondary) fc.attemptTimeout = 100 * time.Millisecond defer fc.Close() @@ -605,7 +605,7 @@ func TestFailover_ThreeClients_CascadeToTertiary(t *testing.T) { } tertiary := &mockHeimdallClient{} - fc := NewFailoverHeimdallClient(primary, secondary, tertiary) + fc := NewMultiHeimdallClient(primary, secondary, tertiary) fc.attemptTimeout = 100 * time.Millisecond defer fc.Close() @@ -631,7 +631,7 @@ func TestFailover_AllClientsFail(t *testing.T) { getSpanFn: func(_ context.Context, _ uint64) (*types.Span, error) { return nil, connErr }, } - fc := NewFailoverHeimdallClient(primary, secondary, tertiary) + fc := NewMultiHeimdallClient(primary, secondary, tertiary) fc.attemptTimeout = 100 * time.Millisecond defer fc.Close() @@ -658,7 +658,7 @@ func TestFailover_ThreeClients_ProbeBackToPrimary(t *testing.T) { } tertiary := &mockHeimdallClient{} - fc := 
NewFailoverHeimdallClient(primary, secondary, tertiary) + fc := NewMultiHeimdallClient(primary, secondary, tertiary) fc.attemptTimeout = 100 * time.Millisecond fc.cooldown = 50 * time.Millisecond defer fc.Close() @@ -703,7 +703,7 @@ func TestFailover_ProbeCurrentNonFailoverError(t *testing.T) { } tertiary := &mockHeimdallClient{} - fc := NewFailoverHeimdallClient(primary, secondary, tertiary) + fc := NewMultiHeimdallClient(primary, secondary, tertiary) fc.attemptTimeout = 100 * time.Millisecond fc.cooldown = 50 * time.Millisecond defer fc.Close() @@ -733,7 +733,7 @@ func TestFailover_ProbeCurrentFailoverError_CascadesToNext(t *testing.T) { } tertiary := &mockHeimdallClient{} - fc := NewFailoverHeimdallClient(primary, secondary, tertiary) + fc := NewMultiHeimdallClient(primary, secondary, tertiary) fc.attemptTimeout = 100 * time.Millisecond fc.cooldown = 50 * time.Millisecond defer fc.Close() @@ -767,7 +767,7 @@ func TestFailover_StickyNonFailoverError(t *testing.T) { } tertiary := &mockHeimdallClient{} - fc := NewFailoverHeimdallClient(primary, secondary, tertiary) + fc := NewMultiHeimdallClient(primary, secondary, tertiary) fc.attemptTimeout = 100 * time.Millisecond fc.cooldown = 1 * time.Hour // very long — no probe defer fc.Close() @@ -796,7 +796,7 @@ func TestFailover_StickyFailoverError_CascadesToNext(t *testing.T) { } tertiary := &mockHeimdallClient{} - fc := NewFailoverHeimdallClient(primary, secondary, tertiary) + fc := NewMultiHeimdallClient(primary, secondary, tertiary) fc.attemptTimeout = 100 * time.Millisecond fc.cooldown = 1 * time.Hour // very long — no probe defer fc.Close() @@ -827,7 +827,7 @@ func TestFailover_ClosesAllClients(t *testing.T) { clients[i] = &mockHeimdallClient{closeFn: func() { closed[idx].Store(true) }} } - fc := NewFailoverHeimdallClient(clients...) + fc := NewMultiHeimdallClient(clients...) 
fc.Close() for i := range closed { diff --git a/eth/ethconfig/config.go b/eth/ethconfig/config.go index 9744b8e94f..11bec23195 100644 --- a/eth/ethconfig/config.go +++ b/eth/ethconfig/config.go @@ -381,7 +381,7 @@ func CreateConsensusEngine(chainConfig *params.ChainConfig, ethConfig *Config, d } else if len(heimdallClients) == 1 { heimdallClient = heimdallClients[0] } else { - heimdallClient = heimdall.NewFailoverHeimdallClient(heimdallClients...) + heimdallClient = heimdall.NewMultiHeimdallClient(heimdallClients...) log.Info("Heimdall failover enabled", "endpoints", len(heimdallClients)) } } diff --git a/eth/ethconfig/config_test.go b/eth/ethconfig/config_test.go index 760e7a381b..302a570834 100644 --- a/eth/ethconfig/config_test.go +++ b/eth/ethconfig/config_test.go @@ -104,8 +104,8 @@ func TestCreateConsensusEngine_CommaSeparatedHeimdallURL(t *testing.T) { borEngine, ok := engine.(*bor.Bor) require.True(t, ok, "Expected Bor consensus engine") - _, ok = borEngine.HeimdallClient.(*heimdall.FailoverHeimdallClient) - require.True(t, ok, "Expected HeimdallClient to be wrapped in FailoverHeimdallClient") + _, ok = borEngine.HeimdallClient.(*heimdall.MultiHeimdallClient) + require.True(t, ok, "Expected HeimdallClient to be wrapped in MultiHeimdallClient") } func TestCreateConsensusEngine_SingleHeimdallURL(t *testing.T) { @@ -121,9 +121,9 @@ func TestCreateConsensusEngine_SingleHeimdallURL(t *testing.T) { borEngine, ok := engine.(*bor.Bor) require.True(t, ok, "Expected Bor consensus engine") - // Single URL should NOT produce a FailoverHeimdallClient - _, ok = borEngine.HeimdallClient.(*heimdall.FailoverHeimdallClient) - require.False(t, ok, "Expected no FailoverHeimdallClient for single URL") + // Single URL should NOT produce a MultiHeimdallClient + _, ok = borEngine.HeimdallClient.(*heimdall.MultiHeimdallClient) + require.False(t, ok, "Expected no MultiHeimdallClient for single URL") } func TestCreateConsensusEngine_WithoutHeimdall(t *testing.T) { @@ -152,8 +152,8 @@ 
func TestCreateConsensusEngine_CommaSeparatedGRPC(t *testing.T) { borEngine, ok := engine.(*bor.Bor) require.True(t, ok, "Expected Bor consensus engine") - _, ok = borEngine.HeimdallClient.(*heimdall.FailoverHeimdallClient) - require.True(t, ok, "Expected FailoverHeimdallClient with multiple gRPC endpoints") + _, ok = borEngine.HeimdallClient.(*heimdall.MultiHeimdallClient) + require.True(t, ok, "Expected MultiHeimdallClient with multiple gRPC endpoints") } func TestCreateConsensusEngine_GRPCInitFailsFallsBackToHTTP(t *testing.T) { @@ -200,7 +200,7 @@ func TestCreateConsensusEngine_GRPCInitFailsFallsBackToHTTP(t *testing.T) { borEngine, ok := engine.(*bor.Bor) require.True(t, ok, "Expected Bor consensus engine") - _, ok = borEngine.HeimdallClient.(*heimdall.FailoverHeimdallClient) + _, ok = borEngine.HeimdallClient.(*heimdall.MultiHeimdallClient) require.Equal(t, tt.expectFailover, ok) }) } From 39eda1567a495ed0318e70440124c08bbc755078 Mon Sep 17 00:00:00 2001 From: Pratik Patil Date: Mon, 23 Feb 2026 12:27:03 +0530 Subject: [PATCH 14/29] added timeout on cascade/secondary calls --- consensus/bor/heimdall/failover_client.go | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/consensus/bor/heimdall/failover_client.go b/consensus/bor/heimdall/failover_client.go index 3921df67a1..159d6d4e97 100644 --- a/consensus/bor/heimdall/failover_client.go +++ b/consensus/bor/heimdall/failover_client.go @@ -148,7 +148,10 @@ func callWithFailover[T any](f *MultiHeimdallClient, ctx context.Context, fn fun log.Debug("Heimdall failover: primary still down after probe, staying on current", "active", active, "err", err) // Try current client, then cascade through remaining on failure - result, err = fn(ctx, f.clients[active]) + subCtx2, cancel2 := context.WithTimeout(ctx, f.attemptTimeout) + result, err = fn(subCtx2, f.clients[active]) + cancel2() + if err == nil { return result, nil } @@ -163,7 +166,10 @@ func callWithFailover[T any](f 
*MultiHeimdallClient, ctx context.Context, fn fun if active != 0 { // On a non-primary client, not yet time to probe: use current directly - result, err := fn(ctx, f.clients[active]) + subCtx, cancel := context.WithTimeout(ctx, f.attemptTimeout) + result, err := fn(subCtx, f.clients[active]) + cancel() + if err == nil { return result, nil } @@ -200,7 +206,10 @@ func callWithFailover[T any](f *MultiHeimdallClient, ctx context.Context, fn fun // switches the active client and returns. If all fail, returns the last error. func cascadeClients[T any](f *MultiHeimdallClient, ctx context.Context, fn func(context.Context, Endpoint) (T, error), after int, lastErr error) (T, error) { for i := after + 1; i < len(f.clients); i++ { - result, err := fn(ctx, f.clients[i]) + subCtx, cancel := context.WithTimeout(ctx, f.attemptTimeout) + result, err := fn(subCtx, f.clients[i]) + cancel() + if err == nil { f.mu.Lock() f.active = i From 4709ad6bf6afaf02d91a2f7cece7fab02a0ec4e5 Mon Sep 17 00:00:00 2001 From: Pratik Patil Date: Mon, 23 Feb 2026 12:32:50 +0530 Subject: [PATCH 15/29] added a few checks to prevent panic --- consensus/bor/heimdall/failover_client.go | 4 ++++ eth/ethconfig/config.go | 5 ++++- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/consensus/bor/heimdall/failover_client.go b/consensus/bor/heimdall/failover_client.go index 159d6d4e97..d1fc68a0e3 100644 --- a/consensus/bor/heimdall/failover_client.go +++ b/consensus/bor/heimdall/failover_client.go @@ -49,6 +49,10 @@ type MultiHeimdallClient struct { } func NewMultiHeimdallClient(clients ...Endpoint) *MultiHeimdallClient { + if len(clients) == 0 { + panic("NewMultiHeimdallClient requires at least one client") + } + return &MultiHeimdallClient{ clients: clients, attemptTimeout: defaultAttemptTimeout, diff --git a/eth/ethconfig/config.go b/eth/ethconfig/config.go index 11bec23195..322731bcee 100644 --- a/eth/ethconfig/config.go +++ b/eth/ethconfig/config.go @@ -356,7 +356,10 @@ func 
CreateConsensusEngine(chainConfig *params.ChainConfig, ethConfig *Config, d n := max(len(httpURLs), len(grpcAddrs)) for i := 0; i < n; i++ { if i < len(grpcAddrs) && grpcAddrs[i] != "" { - httpURL := httpURLs[min(i, len(httpURLs)-1)] + var httpURL string + if len(httpURLs) > 0 { + httpURL = httpURLs[min(i, len(httpURLs)-1)] + } grpcClient, err := heimdallgrpc.NewHeimdallGRPCClient(grpcAddrs[i], httpURL, ethConfig.HeimdallTimeout) if err != nil { From 1d8befe85c01634e21d772d5ba5f4f4c464eb6fb Mon Sep 17 00:00:00 2001 From: Pratik Patil Date: Mon, 23 Feb 2026 12:50:00 +0530 Subject: [PATCH 16/29] donot failover on 4xx codes --- consensus/bor/heimdall/client.go | 16 +++++++++++++++- consensus/bor/heimdall/failover_client.go | 8 +++++--- consensus/bor/heimdall/failover_client_test.go | 9 +++++++-- 3 files changed, 27 insertions(+), 6 deletions(-) diff --git a/consensus/bor/heimdall/client.go b/consensus/bor/heimdall/client.go index a27aa4f6aa..d8a4878d83 100644 --- a/consensus/bor/heimdall/client.go +++ b/consensus/bor/heimdall/client.go @@ -39,6 +39,20 @@ var ( ErrServiceUnavailable = errors.New("service unavailable") ) +// HTTPStatusError is returned when Heimdall responds with a non-2xx, non-503 status code. +// It wraps ErrNotSuccessfulResponse for backwards-compatibility with errors.Is checks. 
+type HTTPStatusError struct { + StatusCode int +} + +func (e *HTTPStatusError) Error() string { + return fmt.Sprintf("%s: response code %d", ErrNotSuccessfulResponse.Error(), e.StatusCode) +} + +func (e *HTTPStatusError) Unwrap() error { + return ErrNotSuccessfulResponse +} + const ( heimdallAPIBodyLimit = 128 * 1024 * 1024 // 128 MB stateFetchLimit = 50 @@ -455,7 +469,7 @@ func internalFetch(ctx context.Context, client http.Client, u *url.URL) ([]byte, // check status code if res.StatusCode != 200 && res.StatusCode != 204 { - return nil, fmt.Errorf("%w: response code %d", ErrNotSuccessfulResponse, res.StatusCode) + return nil, &HTTPStatusError{StatusCode: res.StatusCode} } // unmarshall data from buffer diff --git a/consensus/bor/heimdall/failover_client.go b/consensus/bor/heimdall/failover_client.go index d1fc68a0e3..52ebfe48aa 100644 --- a/consensus/bor/heimdall/failover_client.go +++ b/consensus/bor/heimdall/failover_client.go @@ -271,9 +271,11 @@ func isFailoverError(err error, callerCtx context.Context) bool { return true } - // Non-successful HTTP response (4xx, 5xx excluding 503) - if errors.Is(err, ErrNotSuccessfulResponse) { - return true + // Server-side HTTP error (5xx, excluding 503 which is already handled above). + // Client errors (4xx) are logical errors; the secondary would return the same response. 
+ var httpErr *HTTPStatusError + if errors.As(err, &httpErr) { + return httpErr.StatusCode >= 500 } // Sub-context deadline exceeded (the caller's context is still alive at this point) diff --git a/consensus/bor/heimdall/failover_client_test.go b/consensus/bor/heimdall/failover_client_test.go index 0a2c0e88c7..b06ce97361 100644 --- a/consensus/bor/heimdall/failover_client_test.go +++ b/consensus/bor/heimdall/failover_client_test.go @@ -568,8 +568,13 @@ func TestIsFailoverError(t *testing.T) { // ErrNoResponse should trigger failover assert.True(t, isFailoverError(ErrNoResponse, ctx), "ErrNoResponse should trigger failover") - // ErrNotSuccessfulResponse should trigger failover - assert.True(t, isFailoverError(fmt.Errorf("wrapped: %w", ErrNotSuccessfulResponse), ctx), "ErrNotSuccessfulResponse should trigger failover") + // 5xx HTTP errors should trigger failover; the server is unhealthy + assert.True(t, isFailoverError(&HTTPStatusError{StatusCode: 500}, ctx), "5xx should trigger failover") + assert.True(t, isFailoverError(fmt.Errorf("wrapped: %w", &HTTPStatusError{StatusCode: 502}), ctx), "wrapped 5xx should trigger failover") + + // 4xx HTTP errors should NOT trigger failover; a logical error will be the same on every node + assert.False(t, isFailoverError(&HTTPStatusError{StatusCode: 400}, ctx), "4xx should not trigger failover") + assert.False(t, isFailoverError(&HTTPStatusError{StatusCode: 404}, ctx), "4xx should not trigger failover") // DeadlineExceeded with live caller ctx should trigger failover assert.True(t, isFailoverError(context.DeadlineExceeded, ctx), "DeadlineExceeded should trigger failover when caller ctx is alive") From c75f3c0c535c5330351db85fa6ca7a8dfe4ab518 Mon Sep 17 00:00:00 2001 From: Pratik Patil Date: Mon, 23 Feb 2026 12:57:47 +0530 Subject: [PATCH 17/29] ws now has linear cap rather than circular wrap --- consensus/bor/heimdallws/client.go | 12 +++++---- consensus/bor/heimdallws/client_test.go | 36 +++++++++++++++++++++++++ 2 files 
changed, 43 insertions(+), 5 deletions(-) diff --git a/consensus/bor/heimdallws/client.go b/consensus/bor/heimdallws/client.go index 4984eaebcf..273a853a2a 100644 --- a/consensus/bor/heimdallws/client.go +++ b/consensus/bor/heimdallws/client.go @@ -123,11 +123,13 @@ func (c *HeimdallWSClient) tryUntilSubscribeMilestoneEvents(ctx context.Context) primaryAttempts++ if len(c.urls) > 1 && primaryAttempts >= c.primaryAttempts { - next := (c.activeURL + 1) % len(c.urls) - log.Warn("WS URL failed, switching to next", - "from", c.urls[c.activeURL], "to", c.urls[next], "attempts", primaryAttempts) - c.activeURL = next - c.lastFailover = time.Now() + next := min(c.activeURL+1, len(c.urls)-1) + if next != c.activeURL { + log.Warn("WS URL failed, switching to next", + "from", c.urls[c.activeURL], "to", c.urls[next], "attempts", primaryAttempts) + c.activeURL = next + c.lastFailover = time.Now() + } primaryAttempts = 0 } diff --git a/consensus/bor/heimdallws/client_test.go b/consensus/bor/heimdallws/client_test.go index 12e4f9675a..2585477b32 100644 --- a/consensus/bor/heimdallws/client_test.go +++ b/consensus/bor/heimdallws/client_test.go @@ -319,6 +319,42 @@ func TestWSClient_DualURL_ProbeBackToPrimary(t *testing.T) { assert.Equal(t, 0, client.activeURL) } +func TestWSClient_DualURL_NoWrapOnLastURLFails(t *testing.T) { + // Both URLs reject. The client should stay on the last URL once it gets + // there rather than wrapping back to primary with the modulo operator. + // Wrapping would also incorrectly reset lastFailover, preventing the + // cooldown-based probe-back-to-primary from ever firing. 
+ primary := newTestWSServer(t, true) + defer primary.Close() + + secondary := newTestWSServer(t, true) + defer secondary.Close() + + client, err := NewHeimdallWSClient(wsURL(primary.URL), wsURL(secondary.URL)) + require.NoError(t, err) + + client.reconnectDelay = 10 * time.Millisecond + client.primaryAttempts = 2 + client.wsCooldown = 1 * time.Hour // prevent probe-back from interfering + + // Pre-set to secondary as if a prior failover already happened. + client.activeURL = 1 + client.lastFailover = time.Now() + lastFailoverBefore := client.lastFailover + + ctx, cancel := context.WithTimeout(context.Background(), 150*time.Millisecond) + defer cancel() + + client.tryUntilSubscribeMilestoneEvents(ctx) + + // Must stay on secondary (index 1), not wrap back to primary (index 0). + assert.Equal(t, 1, client.activeURL, "should stay on last URL, not wrap back to primary") + + // lastFailover must not be updated — the cooldown timer must remain intact + // so that the probe-back-to-primary mechanism can eventually fire. + assert.Equal(t, lastFailoverBefore, client.lastFailover, "lastFailover must not be reset when already at last URL") +} + func TestWSClient_DualURL_PrimaryRecovery(t *testing.T) { // Start with primary down, then bring it up. 
From 3825a5d774cbb784ad4c90b3c5ba3e391673dcec Mon Sep 17 00:00:00 2001 From: Pratik Patil Date: Mon, 23 Feb 2026 14:12:59 +0530 Subject: [PATCH 18/29] added background health-check for heimdall failover --- consensus/bor/heimdall/failover_client.go | 141 ++++---- .../bor/heimdall/failover_client_test.go | 317 +++++++++++------- consensus/bor/heimdallws/client.go | 100 ++++-- consensus/bor/heimdallws/client_test.go | 121 +++++-- 4 files changed, 431 insertions(+), 248 deletions(-) diff --git a/consensus/bor/heimdall/failover_client.go b/consensus/bor/heimdall/failover_client.go index 52ebfe48aa..2f4655a394 100644 --- a/consensus/bor/heimdall/failover_client.go +++ b/consensus/bor/heimdall/failover_client.go @@ -5,6 +5,7 @@ import ( "errors" "net" "sync" + "sync/atomic" "time" "github.com/0xPolygon/heimdall-v2/x/bor/types" @@ -17,8 +18,8 @@ import ( ) const ( - defaultAttemptTimeout = 30 * time.Second - defaultSecondaryCooldown = 2 * time.Minute + defaultAttemptTimeout = 30 * time.Second + defaultHealthCheckInterval = 30 * time.Second ) // Endpoint matches bor.IHeimdallClient. It is exported so that external @@ -38,14 +39,17 @@ type Endpoint interface { // MultiHeimdallClient wraps N heimdall clients (primary at index 0, failovers // at 1..N-1) and transparently cascades through them when the active client is -// unreachable. After a cooldown period it probes the primary again. +// unreachable. A background goroutine periodically health-checks higher-priority +// endpoints and promotes back when one recovers. 
type MultiHeimdallClient struct { - clients []Endpoint - mu sync.Mutex - active int // 0 = primary, >0 = failover - lastSwitch time.Time // when we last switched away from primary - attemptTimeout time.Duration - cooldown time.Duration + clients []Endpoint + mu sync.Mutex + active int // 0 = primary, >0 = failover + attemptTimeout time.Duration + healthCheckInterval time.Duration + quit chan struct{} + closeOnce sync.Once + probing atomic.Bool } func NewMultiHeimdallClient(clients ...Endpoint) *MultiHeimdallClient { @@ -54,9 +58,10 @@ func NewMultiHeimdallClient(clients ...Endpoint) *MultiHeimdallClient { } return &MultiHeimdallClient{ - clients: clients, - attemptTimeout: defaultAttemptTimeout, - cooldown: defaultSecondaryCooldown, + clients: clients, + attemptTimeout: defaultAttemptTimeout, + healthCheckInterval: defaultHealthCheckInterval, + quit: make(chan struct{}), } } @@ -109,86 +114,70 @@ func (f *MultiHeimdallClient) FetchStatus(ctx context.Context) (*ctypes.SyncInfo } func (f *MultiHeimdallClient) Close() { + f.closeOnce.Do(func() { close(f.quit) }) + for _, c := range f.clients { c.Close() } } -// callWithFailover executes fn against the active client. If the active client -// fails with a failover-eligible error, it cascades through remaining clients. -// If on a non-primary client past the cooldown, it probes the primary first. 
-func callWithFailover[T any](f *MultiHeimdallClient, ctx context.Context, fn func(context.Context, Endpoint) (T, error)) (T, error) { - f.mu.Lock() - active := f.active - shouldProbe := active != 0 && time.Since(f.lastSwitch) >= f.cooldown - f.mu.Unlock() - - // If on a non-primary client and cooldown has elapsed, probe primary - if shouldProbe { - subCtx, cancel := context.WithTimeout(ctx, f.attemptTimeout) - result, err := fn(subCtx, f.clients[0]) - cancel() - - if err == nil { - f.mu.Lock() - f.active = 0 - f.mu.Unlock() - - log.Info("Heimdall failover: primary recovered, switching back") +// startHealthCheck runs in a background goroutine, periodically probing +// higher-priority endpoints. When one recovers, it promotes active and +// self-terminates. This keeps real requests off the probe path. +func (f *MultiHeimdallClient) startHealthCheck() { + defer f.probing.Store(false) - return result, nil - } + ticker := time.NewTicker(f.healthCheckInterval) + defer ticker.Stop() - if !isFailoverError(err, ctx) { - var zero T - return zero, err + for { + select { + case <-f.quit: + return + case <-ticker.C: } - // Primary still down, stay on current client f.mu.Lock() - f.lastSwitch = time.Now() + active := f.active f.mu.Unlock() - log.Debug("Heimdall failover: primary still down after probe, staying on current", "active", active, "err", err) - - // Try current client, then cascade through remaining on failure - subCtx2, cancel2 := context.WithTimeout(ctx, f.attemptTimeout) - result, err = fn(subCtx2, f.clients[active]) - cancel2() - - if err == nil { - return result, nil + if active == 0 { + // Already on primary, nothing to probe. + return } - if !isFailoverError(err, ctx) { - var zero T - return zero, err - } + // Probe clients 0..active-1 (highest priority first). 
+ for i := 0; i < active; i++ { + ctx, cancel := context.WithTimeout(context.Background(), f.attemptTimeout) + _, err := f.clients[i].FetchStatus(ctx) + cancel() - return cascadeClients(f, ctx, fn, active, err) - } + if err == nil { + f.mu.Lock() + f.active = i + f.mu.Unlock() - if active != 0 { - // On a non-primary client, not yet time to probe: use current directly - subCtx, cancel := context.WithTimeout(ctx, f.attemptTimeout) - result, err := fn(subCtx, f.clients[active]) - cancel() + log.Info("Heimdall health-check: promoted to higher-priority client", "index", i) - if err == nil { - return result, nil - } + if i == 0 { + return + } - if !isFailoverError(err, ctx) { - var zero T - return zero, err + break // keep ticking to probe even higher-priority clients + } } - - return cascadeClients(f, ctx, fn, active, err) } +} + +// callWithFailover executes fn against the active client. If the active client +// fails with a failover-eligible error, it cascades through remaining clients. +func callWithFailover[T any](f *MultiHeimdallClient, ctx context.Context, fn func(context.Context, Endpoint) (T, error)) (T, error) { + f.mu.Lock() + active := f.active + f.mu.Unlock() - // Active is primary: try with timeout subCtx, cancel := context.WithTimeout(ctx, f.attemptTimeout) - result, err := fn(subCtx, f.clients[0]) + result, err := fn(subCtx, f.clients[active]) cancel() if err == nil { @@ -200,10 +189,11 @@ func callWithFailover[T any](f *MultiHeimdallClient, ctx context.Context, fn fun return zero, err } - // Cascade through clients [1, 2, ..., N-1] - log.Warn("Heimdall failover: primary failed, cascading to next client", "err", err) + if active == 0 { + log.Warn("Heimdall failover: primary failed, cascading to next client", "err", err) + } - return cascadeClients(f, ctx, fn, 0, err) + return cascadeClients(f, ctx, fn, active, err) } // cascadeClients tries clients after the given index. 
On first success it @@ -217,11 +207,14 @@ func cascadeClients[T any](f *MultiHeimdallClient, ctx context.Context, fn func( if err == nil { f.mu.Lock() f.active = i - f.lastSwitch = time.Now() f.mu.Unlock() log.Warn("Heimdall failover: switched to client", "index", i) + if i > 0 && f.probing.CompareAndSwap(false, true) { + go f.startHealthCheck() + } + return result, nil } diff --git a/consensus/bor/heimdall/failover_client_test.go b/consensus/bor/heimdall/failover_client_test.go index b06ce97361..2847486f94 100644 --- a/consensus/bor/heimdall/failover_client_test.go +++ b/consensus/bor/heimdall/failover_client_test.go @@ -210,7 +210,7 @@ func TestFailover_StickyBehavior(t *testing.T) { fc := NewMultiHeimdallClient(primary, secondary) fc.attemptTimeout = 100 * time.Millisecond - fc.cooldown = 1 * time.Hour // very long cooldown + fc.healthCheckInterval = 1 * time.Hour // very long — no background probe defer fc.Close() // First call triggers failover @@ -241,32 +241,35 @@ func TestFailover_ProbeBackToPrimary(t *testing.T) { } return &types.Span{Id: spanID}, nil }, + fetchStatusFn: func(_ context.Context) (*ctypes.SyncInfo, error) { + if primaryDown.Load() { + return nil, &net.OpError{Op: "dial", Net: "tcp", Err: errors.New("connection refused")} + } + return &ctypes.SyncInfo{}, nil + }, } secondary := &mockHeimdallClient{} fc := NewMultiHeimdallClient(primary, secondary) fc.attemptTimeout = 100 * time.Millisecond - fc.cooldown = 50 * time.Millisecond + fc.healthCheckInterval = 50 * time.Millisecond defer fc.Close() // Trigger failover _, err := fc.GetSpan(context.Background(), 1) require.NoError(t, err) - // Wait for cooldown to elapse - time.Sleep(100 * time.Millisecond) - // Bring primary back primaryDown.Store(false) - primaryBefore := primary.hits.Load() - - // Next call should probe primary and succeed - _, err = fc.GetSpan(context.Background(), 1) - require.NoError(t, err) - assert.Greater(t, primary.hits.Load(), primaryBefore, "primary should have been 
probed") + // Wait for background health-check to promote primary + require.Eventually(t, func() bool { + fc.mu.Lock() + defer fc.mu.Unlock() + return fc.active == 0 + }, 2*time.Second, 20*time.Millisecond, "health-check should promote back to primary") - // Verify we're back on primary + // Verify subsequent calls go to primary secondaryBefore := secondary.hits.Load() _, err = fc.GetSpan(context.Background(), 1) require.NoError(t, err) @@ -278,26 +281,34 @@ func TestFailover_ProbeBackFails(t *testing.T) { getSpanFn: func(_ context.Context, _ uint64) (*types.Span, error) { return nil, &net.OpError{Op: "dial", Net: "tcp", Err: errors.New("connection refused")} }, + fetchStatusFn: func(_ context.Context) (*ctypes.SyncInfo, error) { + return nil, &net.OpError{Op: "dial", Net: "tcp", Err: errors.New("connection refused")} + }, } secondary := &mockHeimdallClient{} fc := NewMultiHeimdallClient(primary, secondary) fc.attemptTimeout = 100 * time.Millisecond - fc.cooldown = 50 * time.Millisecond + fc.healthCheckInterval = 50 * time.Millisecond defer fc.Close() // Trigger failover _, err := fc.GetSpan(context.Background(), 1) require.NoError(t, err) - // Wait for cooldown - time.Sleep(100 * time.Millisecond) + // Wait for a few health-check ticks + time.Sleep(200 * time.Millisecond) + + // Active should still be on secondary since primary FetchStatus fails + fc.mu.Lock() + assert.Equal(t, 1, fc.active, "should stay on secondary when primary still down") + fc.mu.Unlock() - // Probe should fail, then fallback to secondary + // Calls should still succeed via secondary secondaryBefore := secondary.hits.Load() _, err = fc.GetSpan(context.Background(), 1) require.NoError(t, err) - assert.Greater(t, secondary.hits.Load(), secondaryBefore, "should fall back to secondary after failed probe") + assert.Greater(t, secondary.hits.Load(), secondaryBefore, "should still use secondary") } func TestFailover_ClosesBothClients(t *testing.T) { @@ -492,34 +503,6 @@ func 
TestFailover_FetchStatus(t *testing.T) { assert.Equal(t, int32(1), secondary.hits.Load()) } -func TestFailover_ProbeBackNonFailoverError(t *testing.T) { - primary := &mockHeimdallClient{ - getSpanFn: func(_ context.Context, _ uint64) (*types.Span, error) { - return nil, ErrShutdownDetected - }, - } - secondary := &mockHeimdallClient{} - - fc := NewMultiHeimdallClient(primary, secondary) - fc.attemptTimeout = 100 * time.Millisecond - fc.cooldown = 50 * time.Millisecond - defer fc.Close() - - // Force onto secondary - fc.mu.Lock() - fc.active = 1 - fc.lastSwitch = time.Now().Add(-time.Hour) // cooldown already elapsed - fc.mu.Unlock() - - // Probe primary → gets ErrShutdownDetected (non-failover error) - // Should return the error directly, NOT fall back to secondary - secondaryBefore := secondary.hits.Load() - _, err := fc.GetSpan(context.Background(), 1) - require.Error(t, err) - assert.True(t, errors.Is(err, ErrShutdownDetected)) - assert.Equal(t, secondaryBefore, secondary.hits.Load(), "should not fall back to secondary on non-failover error during probe") -} - func TestFailover_SwitchOnPrimarySubContextError(t *testing.T) { tests := []struct { name string @@ -655,6 +638,12 @@ func TestFailover_ThreeClients_ProbeBackToPrimary(t *testing.T) { } return &types.Span{Id: spanID}, nil }, + fetchStatusFn: func(_ context.Context) (*ctypes.SyncInfo, error) { + if primaryDown.Load() { + return nil, &net.OpError{Op: "dial", Net: "tcp", Err: errors.New("connection refused")} + } + return &ctypes.SyncInfo{}, nil + }, } secondary := &mockHeimdallClient{ getSpanFn: func(_ context.Context, _ uint64) (*types.Span, error) { @@ -665,24 +654,22 @@ func TestFailover_ThreeClients_ProbeBackToPrimary(t *testing.T) { fc := NewMultiHeimdallClient(primary, secondary, tertiary) fc.attemptTimeout = 100 * time.Millisecond - fc.cooldown = 50 * time.Millisecond + fc.healthCheckInterval = 50 * time.Millisecond defer fc.Close() // Trigger cascade to tertiary _, err := 
fc.GetSpan(context.Background(), 1) require.NoError(t, err) - // Wait for cooldown - time.Sleep(100 * time.Millisecond) - // Bring primary back primaryDown.Store(false) - primaryBefore := primary.hits.Load() - // Next call should probe primary and succeed - _, err = fc.GetSpan(context.Background(), 1) - require.NoError(t, err) - assert.Greater(t, primary.hits.Load(), primaryBefore, "primary should have been probed") + // Wait for health-check goroutine to promote back to primary + require.Eventually(t, func() bool { + fc.mu.Lock() + defer fc.mu.Unlock() + return fc.active == 0 + }, 2*time.Second, 20*time.Millisecond, "health-check should promote back to primary") // Verify we're back on primary tertiaryBefore := tertiary.hits.Load() @@ -691,16 +678,9 @@ func TestFailover_ThreeClients_ProbeBackToPrimary(t *testing.T) { assert.Equal(t, tertiaryBefore, tertiary.hits.Load(), "should be back on primary now") } -// Tests for the shouldProbe path (lines 156-161): probe primary fails with -// failover error, then current (non-primary) client also fails. -func TestFailover_ProbeCurrentNonFailoverError(t *testing.T) { - // Probe primary → failover error, current (secondary) → non-failover error. - // Should return the non-failover error without cascading to tertiary. - primary := &mockHeimdallClient{ - getSpanFn: func(_ context.Context, _ uint64) (*types.Span, error) { - return nil, &net.OpError{Op: "dial", Net: "tcp", Err: errors.New("connection refused")} - }, - } +// Active client returns non-failover error: should return directly, no cascade. 
+func TestFailover_ActiveNonFailoverError(t *testing.T) { + primary := &mockHeimdallClient{} secondary := &mockHeimdallClient{ getSpanFn: func(_ context.Context, _ uint64) (*types.Span, error) { return nil, ErrShutdownDetected @@ -710,29 +690,25 @@ func TestFailover_ProbeCurrentNonFailoverError(t *testing.T) { fc := NewMultiHeimdallClient(primary, secondary, tertiary) fc.attemptTimeout = 100 * time.Millisecond - fc.cooldown = 50 * time.Millisecond defer fc.Close() - // Force onto secondary with cooldown elapsed so probe triggers. + // Force onto secondary fc.mu.Lock() fc.active = 1 - fc.lastSwitch = time.Now().Add(-time.Hour) fc.mu.Unlock() _, err := fc.GetSpan(context.Background(), 1) require.Error(t, err) assert.True(t, errors.Is(err, ErrShutdownDetected)) + assert.Equal(t, int32(0), primary.hits.Load(), "should not probe primary") assert.Equal(t, int32(0), tertiary.hits.Load(), "should not cascade to tertiary on non-failover error") } -func TestFailover_ProbeCurrentFailoverError_CascadesToNext(t *testing.T) { - // Probe primary → failover error, current (secondary) → failover error. - // Should cascade to tertiary. +// Active client returns failover error: should cascade to next. +func TestFailover_ActiveFailoverError_CascadesToNext(t *testing.T) { connErr := &net.OpError{Op: "dial", Net: "tcp", Err: errors.New("connection refused")} - primary := &mockHeimdallClient{ - getSpanFn: func(_ context.Context, _ uint64) (*types.Span, error) { return nil, connErr }, - } + primary := &mockHeimdallClient{} secondary := &mockHeimdallClient{ getSpanFn: func(_ context.Context, _ uint64) (*types.Span, error) { return nil, connErr }, } @@ -740,18 +716,17 @@ func TestFailover_ProbeCurrentFailoverError_CascadesToNext(t *testing.T) { fc := NewMultiHeimdallClient(primary, secondary, tertiary) fc.attemptTimeout = 100 * time.Millisecond - fc.cooldown = 50 * time.Millisecond defer fc.Close() - // Force onto secondary with cooldown elapsed so probe triggers. 
+ // Force onto secondary fc.mu.Lock() fc.active = 1 - fc.lastSwitch = time.Now().Add(-time.Hour) fc.mu.Unlock() span, err := fc.GetSpan(context.Background(), 1) require.NoError(t, err) require.NotNil(t, span) + assert.Equal(t, int32(0), primary.hits.Load(), "should not probe primary") assert.Equal(t, int32(1), tertiary.hits.Load(), "should cascade to tertiary") fc.mu.Lock() @@ -759,43 +734,153 @@ func TestFailover_ProbeCurrentFailoverError_CascadesToNext(t *testing.T) { fc.mu.Unlock() } -// Tests for the active != 0 no-probe path (lines 171-176): on a non-primary -// client with cooldown not elapsed, the current client fails. -func TestFailover_StickyNonFailoverError(t *testing.T) { - // Sticky on secondary (cooldown not elapsed), secondary returns non-failover error. - // Should return error without cascading to tertiary. - primary := &mockHeimdallClient{} +func TestFailover_ClosesAllClients(t *testing.T) { + var closed [3]atomic.Bool + + clients := make([]Endpoint, 3) + for i := range clients { + idx := i + clients[i] = &mockHeimdallClient{closeFn: func() { closed[idx].Store(true) }} + } + + fc := NewMultiHeimdallClient(clients...) 
+ fc.Close() + + for i := range closed { + assert.True(t, closed[i].Load(), "client %d should be closed", i) + } +} + +func TestFailover_HealthCheckStartsOnFailover(t *testing.T) { + primary := &mockHeimdallClient{ + getSpanFn: func(_ context.Context, _ uint64) (*types.Span, error) { + return nil, &net.OpError{Op: "dial", Net: "tcp", Err: errors.New("connection refused")} + }, + fetchStatusFn: func(_ context.Context) (*ctypes.SyncInfo, error) { + return &ctypes.SyncInfo{}, nil // primary recovers for health-check + }, + } + secondary := &mockHeimdallClient{} + + fc := NewMultiHeimdallClient(primary, secondary) + fc.attemptTimeout = 100 * time.Millisecond + fc.healthCheckInterval = 50 * time.Millisecond + defer fc.Close() + + // Trigger failover + _, err := fc.GetSpan(context.Background(), 1) + require.NoError(t, err) + + // probing should be true after cascade + assert.True(t, fc.probing.Load(), "probing should be true after failover") + + // Wait for health-check to promote and self-terminate + require.Eventually(t, func() bool { + return !fc.probing.Load() + }, 2*time.Second, 20*time.Millisecond, "probing should be false after recovery") + + fc.mu.Lock() + assert.Equal(t, 0, fc.active, "should be back on primary") + fc.mu.Unlock() +} + +func TestFailover_HealthCheckPromotesHighestPriority(t *testing.T) { + // 3 clients: primary down, secondary recovers, tertiary active. + // Health-check should promote to secondary first, then primary. 
+ primaryDown := atomic.Bool{} + primaryDown.Store(true) + + secondaryDown := atomic.Bool{} + secondaryDown.Store(true) + + primary := &mockHeimdallClient{ + getSpanFn: func(_ context.Context, _ uint64) (*types.Span, error) { + return nil, &net.OpError{Op: "dial", Net: "tcp", Err: errors.New("connection refused")} + }, + fetchStatusFn: func(_ context.Context) (*ctypes.SyncInfo, error) { + if primaryDown.Load() { + return nil, &net.OpError{Op: "dial", Net: "tcp", Err: errors.New("connection refused")} + } + return &ctypes.SyncInfo{}, nil + }, + } secondary := &mockHeimdallClient{ getSpanFn: func(_ context.Context, _ uint64) (*types.Span, error) { - return nil, ErrShutdownDetected + return nil, &net.OpError{Op: "dial", Net: "tcp", Err: errors.New("connection refused")} + }, + fetchStatusFn: func(_ context.Context) (*ctypes.SyncInfo, error) { + if secondaryDown.Load() { + return nil, &net.OpError{Op: "dial", Net: "tcp", Err: errors.New("connection refused")} + } + return &ctypes.SyncInfo{}, nil }, } tertiary := &mockHeimdallClient{} fc := NewMultiHeimdallClient(primary, secondary, tertiary) fc.attemptTimeout = 100 * time.Millisecond - fc.cooldown = 1 * time.Hour // very long — no probe + fc.healthCheckInterval = 50 * time.Millisecond defer fc.Close() - // Force onto secondary with recent switch (cooldown not elapsed). 
- fc.mu.Lock() - fc.active = 1 - fc.lastSwitch = time.Now() - fc.mu.Unlock() + // Trigger cascade to tertiary + _, err := fc.GetSpan(context.Background(), 1) + require.NoError(t, err) + + // Bring secondary back first + secondaryDown.Store(false) + + require.Eventually(t, func() bool { + fc.mu.Lock() + defer fc.mu.Unlock() + return fc.active == 1 + }, 2*time.Second, 20*time.Millisecond, "should promote to secondary") + // Now bring primary back + primaryDown.Store(false) + + require.Eventually(t, func() bool { + fc.mu.Lock() + defer fc.mu.Unlock() + return fc.active == 0 + }, 2*time.Second, 20*time.Millisecond, "should promote to primary") +} + +func TestFailover_HealthCheckRespectsClose(t *testing.T) { + primary := &mockHeimdallClient{ + getSpanFn: func(_ context.Context, _ uint64) (*types.Span, error) { + return nil, &net.OpError{Op: "dial", Net: "tcp", Err: errors.New("connection refused")} + }, + fetchStatusFn: func(_ context.Context) (*ctypes.SyncInfo, error) { + return nil, &net.OpError{Op: "dial", Net: "tcp", Err: errors.New("connection refused")} + }, + } + secondary := &mockHeimdallClient{} + + fc := NewMultiHeimdallClient(primary, secondary) + fc.attemptTimeout = 100 * time.Millisecond + fc.healthCheckInterval = 50 * time.Millisecond + + // Trigger failover _, err := fc.GetSpan(context.Background(), 1) - require.Error(t, err) - assert.True(t, errors.Is(err, ErrShutdownDetected)) - assert.Equal(t, int32(0), primary.hits.Load(), "should not probe primary") - assert.Equal(t, int32(0), tertiary.hits.Load(), "should not cascade to tertiary on non-failover error") + require.NoError(t, err) + + assert.True(t, fc.probing.Load(), "probing should be true after failover") + + // Close should stop the goroutine + fc.Close() + + require.Eventually(t, func() bool { + return !fc.probing.Load() + }, 2*time.Second, 20*time.Millisecond, "probing should stop after Close") } -func TestFailover_StickyFailoverError_CascadesToNext(t *testing.T) { - // Sticky on secondary 
(cooldown not elapsed), secondary returns failover error. - // Should cascade to tertiary. +func TestFailover_NoDuplicateGoroutines(t *testing.T) { connErr := &net.OpError{Op: "dial", Net: "tcp", Err: errors.New("connection refused")} - primary := &mockHeimdallClient{} + primary := &mockHeimdallClient{ + getSpanFn: func(_ context.Context, _ uint64) (*types.Span, error) { return nil, connErr }, + fetchStatusFn: func(_ context.Context) (*ctypes.SyncInfo, error) { return nil, connErr }, + } secondary := &mockHeimdallClient{ getSpanFn: func(_ context.Context, _ uint64) (*types.Span, error) { return nil, connErr }, } @@ -803,39 +888,23 @@ func TestFailover_StickyFailoverError_CascadesToNext(t *testing.T) { fc := NewMultiHeimdallClient(primary, secondary, tertiary) fc.attemptTimeout = 100 * time.Millisecond - fc.cooldown = 1 * time.Hour // very long — no probe + fc.healthCheckInterval = 1 * time.Hour // long interval so goroutine stays alive defer fc.Close() - // Force onto secondary with recent switch (cooldown not elapsed). 
- fc.mu.Lock() - fc.active = 1 - fc.lastSwitch = time.Now() - fc.mu.Unlock() - - span, err := fc.GetSpan(context.Background(), 1) + // First cascade: primary→secondary fails, lands on tertiary + _, err := fc.GetSpan(context.Background(), 1) require.NoError(t, err) - require.NotNil(t, span) - assert.Equal(t, int32(0), primary.hits.Load(), "should not probe primary") - assert.Equal(t, int32(1), tertiary.hits.Load(), "should cascade to tertiary") + assert.True(t, fc.probing.Load(), "probing should be true") + + // Force back to secondary and cascade again — should NOT spawn a second goroutine fc.mu.Lock() - assert.Equal(t, 2, fc.active, "active should switch to tertiary") + fc.active = 1 fc.mu.Unlock() -} - -func TestFailover_ClosesAllClients(t *testing.T) { - var closed [3]atomic.Bool - clients := make([]Endpoint, 3) - for i := range clients { - idx := i - clients[i] = &mockHeimdallClient{closeFn: func() { closed[idx].Store(true) }} - } - - fc := NewMultiHeimdallClient(clients...) - fc.Close() + _, err = fc.GetSpan(context.Background(), 1) + require.NoError(t, err) - for i := range closed { - assert.True(t, closed[i].Load(), "client %d should be closed", i) - } + // probing is still true from the first goroutine; CompareAndSwap prevents a second + assert.True(t, fc.probing.Load(), "probing should still be true (no duplicate)") } diff --git a/consensus/bor/heimdallws/client.go b/consensus/bor/heimdallws/client.go index 273a853a2a..5fd2952d25 100644 --- a/consensus/bor/heimdallws/client.go +++ b/consensus/bor/heimdallws/client.go @@ -6,6 +6,7 @@ import ( "errors" "strconv" "sync" + "sync/atomic" "time" "github.com/gorilla/websocket" @@ -31,13 +32,11 @@ const ( type HeimdallWSClient struct { conn *websocket.Conn urls []string // primary at [0], secondary at [1] (if configured) - activeURL int // index into urls + activeURL int // index into urls; protected by mu events chan *milestone.Milestone done chan struct{} mu sync.Mutex - - // lastFailover tracks when the client 
last switched to secondary - lastFailover time.Time + probing atomic.Bool // guards against spawning multiple health-check goroutines // Configurable parameters (defaults set in constructor, overridable for testing) primaryAttempts int @@ -84,15 +83,70 @@ func (c *HeimdallWSClient) SubscribeMilestoneEvents(ctx context.Context) <-chan return c.events } +// startWSHealthCheck runs in a background goroutine, periodically probing +// higher-priority WS endpoints. When one responds, it updates activeURL and +// closes the current connection to trigger reconnection in readMessages. +func (c *HeimdallWSClient) startWSHealthCheck() { + defer c.probing.Store(false) + + ticker := time.NewTicker(c.wsCooldown) + defer ticker.Stop() + + for { + select { + case <-c.done: + return + case <-ticker.C: + } + + c.mu.Lock() + active := c.activeURL + c.mu.Unlock() + + if active == 0 { + return + } + + // Probe URLs 0..active-1 (highest priority first). + for i := 0; i < active; i++ { + testConn, _, err := websocket.DefaultDialer.Dial(c.urls[i], nil) + if err != nil { + continue + } + testConn.Close() + + c.mu.Lock() + c.activeURL = i + conn := c.conn + c.mu.Unlock() + + log.Info("WS health-check: promoted to higher-priority URL", "index", i, "url", c.urls[i]) + + // Close current connection to trigger reconnection in readMessages. + if conn != nil { + conn.Close() + } + + if i == 0 { + return + } + + break // keep ticking to probe even higher-priority URLs + } + } +} + // tryUntilSubscribeMilestoneEvents retries connecting and subscribing until success, // with failover to secondary URL after defaultPrimaryAttempts failures on primary. func (c *HeimdallWSClient) tryUntilSubscribeMilestoneEvents(ctx context.Context) { - primaryAttempts := 0 + attempts := 0 firstTime := true + for { if !firstTime { time.Sleep(c.reconnectDelay) } + firstTime = false // Check for context cancellation or unsubscribe. 
@@ -106,35 +160,39 @@ func (c *HeimdallWSClient) tryUntilSubscribeMilestoneEvents(ctx context.Context) default: } - // If on a non-primary URL and cooldown has elapsed, probe primary first. - if c.activeURL != 0 && !c.lastFailover.IsZero() && time.Since(c.lastFailover) >= c.wsCooldown { - log.Info("WS cooldown elapsed, probing primary", "url", c.urls[0]) - c.activeURL = 0 - primaryAttempts = 0 - } + c.mu.Lock() + active := c.activeURL + c.mu.Unlock() - url := c.urls[c.activeURL] + url := c.urls[active] conn, _, err := websocket.DefaultDialer.Dial(url, nil) if err != nil { log.Error("failed to dial websocket on heimdall ws subscription", "url", url, "err", err) - // Count failures on current URL; advance to next after threshold. - primaryAttempts++ + attempts++ - if len(c.urls) > 1 && primaryAttempts >= c.primaryAttempts { - next := min(c.activeURL+1, len(c.urls)-1) - if next != c.activeURL { + if len(c.urls) > 1 && attempts >= c.primaryAttempts { + next := min(active+1, len(c.urls)-1) + if next != active { log.Warn("WS URL failed, switching to next", - "from", c.urls[c.activeURL], "to", c.urls[next], "attempts", primaryAttempts) + "from", c.urls[active], "to", c.urls[next], "attempts", attempts) + + c.mu.Lock() c.activeURL = next - c.lastFailover = time.Now() + c.mu.Unlock() + + if c.probing.CompareAndSwap(false, true) { + go c.startWSHealthCheck() + } } - primaryAttempts = 0 + + attempts = 0 } continue } + c.mu.Lock() c.conn = conn c.mu.Unlock() @@ -151,7 +209,9 @@ func (c *HeimdallWSClient) tryUntilSubscribeMilestoneEvents(ctx context.Context) log.Error("failed to send subscription request on heimdall ws subscription", "url", url, "err", err) continue } + log.Info("successfully connected on heimdall ws subscription", "url", url) + return } } diff --git a/consensus/bor/heimdallws/client_test.go b/consensus/bor/heimdallws/client_test.go index 2585477b32..c10c29fa13 100644 --- a/consensus/bor/heimdallws/client_test.go +++ b/consensus/bor/heimdallws/client_test.go @@ 
-218,7 +218,9 @@ func TestWSClient_DualURL_FailoverToSecondary(t *testing.T) { assert.Equal(t, uint64(100), m.StartBlock) assert.Equal(t, uint64(200), m.EndBlock) // Verify we switched to secondary. + client.mu.Lock() assert.Equal(t, 1, client.activeURL) + client.mu.Unlock() case <-ctx.Done(): t.Fatal("timed out waiting for milestone event via failover") } @@ -254,7 +256,9 @@ func TestWSClient_ThreeURL_CascadeToTertiary(t *testing.T) { require.NotNil(t, m) assert.Equal(t, uint64(100), m.StartBlock) // Verify we ended up on tertiary. + client.mu.Lock() assert.Equal(t, 2, client.activeURL) + client.mu.Unlock() case <-ctx.Done(): t.Fatal("timed out waiting for milestone event via cascade") } @@ -291,39 +295,61 @@ func TestWSClient_ContextCancellation(t *testing.T) { } func TestWSClient_DualURL_ProbeBackToPrimary(t *testing.T) { - // Test that after cooldown, the reconnection loop probes primary first. - primary := newTestWSServer(t, true) - defer primary.Close() + // Primary starts rejecting, secondary accepts. + // After failover to secondary, primary comes back, health-check should promote. + primaryReject := newTestWSServer(t, true) + defer primaryReject.Close() - secondary := newTestWSServer(t, true) + secondary := newTestWSServerWithMilestone(t) defer secondary.Close() - client, err := NewHeimdallWSClient(wsURL(primary.URL), wsURL(secondary.URL)) + client, err := NewHeimdallWSClient(wsURL(primaryReject.URL), wsURL(secondary.URL)) require.NoError(t, err) client.reconnectDelay = 100 * time.Millisecond - client.wsCooldown = 50 * time.Millisecond - - // Simulate being on secondary after failover with cooldown elapsed. - client.activeURL = 1 - client.lastFailover = time.Now().Add(-1 * time.Second) + client.primaryAttempts = 2 + client.wsCooldown = 100 * time.Millisecond - // Short-lived context — the function will probe primary (reset activeURL=0), - // fail to dial, then context expires. 
- ctx, cancel := context.WithTimeout(context.Background(), 500*time.Millisecond) + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) defer cancel() - client.tryUntilSubscribeMilestoneEvents(ctx) + events := client.SubscribeMilestoneEvents(ctx) - // After cooldown elapsed, activeURL should be reset to 0 (probed primary). - assert.Equal(t, 0, client.activeURL) + // Should failover to secondary. + select { + case m := <-events: + require.NotNil(t, m) + client.mu.Lock() + assert.Equal(t, 1, client.activeURL) + client.mu.Unlock() + case <-ctx.Done(): + t.Fatal("timed out waiting for failover") + } + + // Close the rejecting primary and replace with an accepting one. + primaryReject.Close() + + primaryGood := newTestWSServer(t, false) + defer primaryGood.Close() + + // Update URL to the new primary that accepts connections. + client.mu.Lock() + client.urls[0] = wsURL(primaryGood.URL) + client.mu.Unlock() + + // Wait for background health-check to promote back to primary. + require.Eventually(t, func() bool { + client.mu.Lock() + defer client.mu.Unlock() + return client.activeURL == 0 + }, 5*time.Second, 50*time.Millisecond, "health-check should promote back to primary") + + require.NoError(t, client.Unsubscribe(ctx)) } func TestWSClient_DualURL_NoWrapOnLastURLFails(t *testing.T) { // Both URLs reject. The client should stay on the last URL once it gets - // there rather than wrapping back to primary with the modulo operator. - // Wrapping would also incorrectly reset lastFailover, preventing the - // cooldown-based probe-back-to-primary from ever firing. + // there rather than wrapping back to primary. 
primary := newTestWSServer(t, true) defer primary.Close() @@ -335,12 +361,12 @@ func TestWSClient_DualURL_NoWrapOnLastURLFails(t *testing.T) { client.reconnectDelay = 10 * time.Millisecond client.primaryAttempts = 2 - client.wsCooldown = 1 * time.Hour // prevent probe-back from interfering + client.wsCooldown = 1 * time.Hour // prevent health-check from interfering // Pre-set to secondary as if a prior failover already happened. + client.mu.Lock() client.activeURL = 1 - client.lastFailover = time.Now() - lastFailoverBefore := client.lastFailover + client.mu.Unlock() ctx, cancel := context.WithTimeout(context.Background(), 150*time.Millisecond) defer cancel() @@ -348,11 +374,9 @@ func TestWSClient_DualURL_NoWrapOnLastURLFails(t *testing.T) { client.tryUntilSubscribeMilestoneEvents(ctx) // Must stay on secondary (index 1), not wrap back to primary (index 0). + client.mu.Lock() assert.Equal(t, 1, client.activeURL, "should stay on last URL, not wrap back to primary") - - // lastFailover must not be updated — the cooldown timer must remain intact - // so that the probe-back-to-primary mechanism can eventually fire. - assert.Equal(t, lastFailoverBefore, client.lastFailover, "lastFailover must not be reset when already at last URL") + client.mu.Unlock() } func TestWSClient_DualURL_PrimaryRecovery(t *testing.T) { @@ -380,18 +404,55 @@ func TestWSClient_DualURL_PrimaryRecovery(t *testing.T) { select { case m := <-events: require.NotNil(t, m) + client.mu.Lock() assert.Equal(t, 1, client.activeURL) + client.mu.Unlock() assert.Equal(t, uint64(100), m.StartBlock) case <-ctx.Done(): t.Fatal("timed out waiting for failover") } - // The fact that failover worked and lastFailover is set - // proves the probe-back mechanism can work later. - assert.False(t, client.lastFailover.IsZero(), "lastFailover should be set after switching to secondary") - // Close the rejecting primary. 
primaryReject.Close() require.NoError(t, client.Unsubscribe(ctx)) } + +func TestWSClient_HealthCheckRespectsUnsubscribe(t *testing.T) { + // Verify that the health-check goroutine stops when done channel is closed. + primary := newTestWSServer(t, true) + defer primary.Close() + + secondary := newTestWSServerWithMilestone(t) + defer secondary.Close() + + client, err := NewHeimdallWSClient(wsURL(primary.URL), wsURL(secondary.URL)) + require.NoError(t, err) + + client.reconnectDelay = 100 * time.Millisecond + client.primaryAttempts = 2 + client.wsCooldown = 50 * time.Millisecond + + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) + defer cancel() + + events := client.SubscribeMilestoneEvents(ctx) + + // Wait for failover to secondary. + select { + case m := <-events: + require.NotNil(t, m) + case <-ctx.Done(): + t.Fatal("timed out waiting for failover") + } + + // Probing goroutine should be running. + assert.True(t, client.probing.Load(), "probing should be active after failover") + + // Unsubscribe should stop the health-check goroutine. + require.NoError(t, client.Unsubscribe(ctx)) + + require.Eventually(t, func() bool { + return !client.probing.Load() + }, 2*time.Second, 50*time.Millisecond, "probing should stop after unsubscribe") +} From b0cc4f5e2495d1ca85e7613222e72b8e06e86b3e Mon Sep 17 00:00:00 2001 From: Pratik Patil Date: Mon, 23 Feb 2026 14:16:40 +0530 Subject: [PATCH 19/29] updated log --- eth/ethconfig/config.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/eth/ethconfig/config.go b/eth/ethconfig/config.go index 322731bcee..224b83c981 100644 --- a/eth/ethconfig/config.go +++ b/eth/ethconfig/config.go @@ -385,7 +385,7 @@ func CreateConsensusEngine(chainConfig *params.ChainConfig, ethConfig *Config, d heimdallClient = heimdallClients[0] } else { heimdallClient = heimdall.NewMultiHeimdallClient(heimdallClients...) 
- log.Info("Heimdall failover enabled", "endpoints", len(heimdallClients)) + log.Info("Heimdall failover enabled with multiple endpoints", "endpoints", len(heimdallClients)) } } @@ -402,7 +402,7 @@ func CreateConsensusEngine(chainConfig *params.ChainConfig, ethConfig *Config, d } if len(wsAddrs) > 1 { - log.Info("Heimdall WS failover enabled", "endpoints", len(wsAddrs)) + log.Info("Heimdall WS failover enabled with multiple endpoints", "endpoints", len(wsAddrs)) } } From 01b24b40212324c43baa5d018979181d934af239 Mon Sep 17 00:00:00 2001 From: Pratik Patil Date: Mon, 23 Feb 2026 14:35:14 +0530 Subject: [PATCH 20/29] added metrics to track failover --- consensus/bor/heimdall/failover_client.go | 8 ++++++++ consensus/bor/heimdall/failover_client_test.go | 17 +++++++++++++++++ consensus/bor/heimdall/failover_metrics.go | 17 +++++++++++++++++ consensus/bor/heimdallws/client.go | 9 +++++++++ 4 files changed, 51 insertions(+) create mode 100644 consensus/bor/heimdall/failover_metrics.go diff --git a/consensus/bor/heimdall/failover_client.go b/consensus/bor/heimdall/failover_client.go index 2f4655a394..ee9275a151 100644 --- a/consensus/bor/heimdall/failover_client.go +++ b/consensus/bor/heimdall/failover_client.go @@ -148,6 +148,8 @@ func (f *MultiHeimdallClient) startHealthCheck() { // Probe clients 0..active-1 (highest priority first). 
for i := 0; i < active; i++ { + failoverProbeAttempts.Inc(1) + ctx, cancel := context.WithTimeout(context.Background(), f.attemptTimeout) _, err := f.clients[i].FetchStatus(ctx) cancel() @@ -157,6 +159,9 @@ func (f *MultiHeimdallClient) startHealthCheck() { f.active = i f.mu.Unlock() + failoverProbeSuccesses.Inc(1) + failoverActiveGauge.Update(int64(i)) + log.Info("Heimdall health-check: promoted to higher-priority client", "index", i) if i == 0 { @@ -209,6 +214,9 @@ func cascadeClients[T any](f *MultiHeimdallClient, ctx context.Context, fn func( f.active = i f.mu.Unlock() + failoverSwitchCounter.Inc(1) + failoverActiveGauge.Update(int64(i)) + log.Warn("Heimdall failover: switched to client", "index", i) if i > 0 && f.probing.CompareAndSwap(false, true) { diff --git a/consensus/bor/heimdall/failover_client_test.go b/consensus/bor/heimdall/failover_client_test.go index 2847486f94..829679deb3 100644 --- a/consensus/bor/heimdall/failover_client_test.go +++ b/consensus/bor/heimdall/failover_client_test.go @@ -122,6 +122,9 @@ func (m *mockHeimdallClient) Close() { } func TestFailover_SwitchOnPrimaryDown(t *testing.T) { + switchesBefore := failoverSwitchCounter.Snapshot().Count() + activeBefore := failoverActiveGauge.Snapshot().Value() + primary := &mockHeimdallClient{ getSpanFn: func(ctx context.Context, _ uint64) (*types.Span, error) { // Simulate transport error @@ -140,6 +143,10 @@ func TestFailover_SwitchOnPrimaryDown(t *testing.T) { assert.GreaterOrEqual(t, primary.hits.Load(), int32(1), "primary should have been tried") assert.Equal(t, int32(1), secondary.hits.Load(), "secondary should have been called once") + + assert.Greater(t, failoverSwitchCounter.Snapshot().Count(), switchesBefore, "failover switch counter should increment") + _ = activeBefore // gauge is set, not incremented + assert.Equal(t, int64(1), failoverActiveGauge.Snapshot().Value(), "active gauge should reflect secondary index") } func TestFailover_NoSwitchOnContextCanceled(t *testing.T) { @@ 
-752,6 +759,9 @@ func TestFailover_ClosesAllClients(t *testing.T) { } func TestFailover_HealthCheckStartsOnFailover(t *testing.T) { + probeAttemptsBefore := failoverProbeAttempts.Snapshot().Count() + probeSuccessesBefore := failoverProbeSuccesses.Snapshot().Count() + primary := &mockHeimdallClient{ getSpanFn: func(_ context.Context, _ uint64) (*types.Span, error) { return nil, &net.OpError{Op: "dial", Net: "tcp", Err: errors.New("connection refused")} @@ -782,6 +792,9 @@ func TestFailover_HealthCheckStartsOnFailover(t *testing.T) { fc.mu.Lock() assert.Equal(t, 0, fc.active, "should be back on primary") fc.mu.Unlock() + + assert.Greater(t, failoverProbeAttempts.Snapshot().Count(), probeAttemptsBefore, "probe attempts should increment") + assert.Greater(t, failoverProbeSuccesses.Snapshot().Count(), probeSuccessesBefore, "probe successes should increment") } func TestFailover_HealthCheckPromotesHighestPriority(t *testing.T) { @@ -835,6 +848,8 @@ func TestFailover_HealthCheckPromotesHighestPriority(t *testing.T) { return fc.active == 1 }, 2*time.Second, 20*time.Millisecond, "should promote to secondary") + assert.Equal(t, int64(1), failoverActiveGauge.Snapshot().Value(), "active gauge should reflect secondary after first promotion") + // Now bring primary back primaryDown.Store(false) @@ -843,6 +858,8 @@ func TestFailover_HealthCheckPromotesHighestPriority(t *testing.T) { defer fc.mu.Unlock() return fc.active == 0 }, 2*time.Second, 20*time.Millisecond, "should promote to primary") + + assert.Equal(t, int64(0), failoverActiveGauge.Snapshot().Value(), "active gauge should reflect primary after full recovery") } func TestFailover_HealthCheckRespectsClose(t *testing.T) { diff --git a/consensus/bor/heimdall/failover_metrics.go b/consensus/bor/heimdall/failover_metrics.go new file mode 100644 index 0000000000..b2079de945 --- /dev/null +++ b/consensus/bor/heimdall/failover_metrics.go @@ -0,0 +1,17 @@ +package heimdall + +import "github.com/ethereum/go-ethereum/metrics" + +var 
( + // HTTP/gRPC failover metrics (used within this package) + failoverSwitchCounter = metrics.NewRegisteredCounter("client/failover/switches", nil) + failoverActiveGauge = metrics.NewRegisteredGauge("client/failover/active", nil) + failoverProbeAttempts = metrics.NewRegisteredCounter("client/failover/probe/attempts", nil) + failoverProbeSuccesses = metrics.NewRegisteredCounter("client/failover/probe/successes", nil) + + // WS failover metrics (exported for use by heimdallws package) + FailoverWSSwitchCounter = metrics.NewRegisteredCounter("client/failover/ws/switches", nil) + FailoverWSActiveGauge = metrics.NewRegisteredGauge("client/failover/ws/active", nil) + FailoverWSProbeAttempts = metrics.NewRegisteredCounter("client/failover/ws/probe/attempts", nil) + FailoverWSProbeSuccesses = metrics.NewRegisteredCounter("client/failover/ws/probe/successes", nil) +) diff --git a/consensus/bor/heimdallws/client.go b/consensus/bor/heimdallws/client.go index 5fd2952d25..f5c2b025f9 100644 --- a/consensus/bor/heimdallws/client.go +++ b/consensus/bor/heimdallws/client.go @@ -12,6 +12,7 @@ import ( "github.com/gorilla/websocket" "github.com/ethereum/go-ethereum/common" + "github.com/ethereum/go-ethereum/consensus/bor/heimdall" "github.com/ethereum/go-ethereum/consensus/bor/heimdall/milestone" "github.com/ethereum/go-ethereum/log" ) @@ -109,6 +110,8 @@ func (c *HeimdallWSClient) startWSHealthCheck() { // Probe URLs 0..active-1 (highest priority first). for i := 0; i < active; i++ { + heimdall.FailoverWSProbeAttempts.Inc(1) + testConn, _, err := websocket.DefaultDialer.Dial(c.urls[i], nil) if err != nil { continue @@ -120,6 +123,9 @@ func (c *HeimdallWSClient) startWSHealthCheck() { conn := c.conn c.mu.Unlock() + heimdall.FailoverWSProbeSuccesses.Inc(1) + heimdall.FailoverWSActiveGauge.Update(int64(i)) + log.Info("WS health-check: promoted to higher-priority URL", "index", i, "url", c.urls[i]) // Close current connection to trigger reconnection in readMessages. 
@@ -182,6 +188,9 @@ func (c *HeimdallWSClient) tryUntilSubscribeMilestoneEvents(ctx context.Context) c.activeURL = next c.mu.Unlock() + heimdall.FailoverWSSwitchCounter.Inc(1) + heimdall.FailoverWSActiveGauge.Update(int64(next)) + if c.probing.CompareAndSwap(false, true) { go c.startWSHealthCheck() } From 200d899988bb40563470a3b87f2cdae7ddf5373f Mon Sep 17 00:00:00 2001 From: Pratik Patil Date: Mon, 23 Feb 2026 15:30:16 +0530 Subject: [PATCH 21/29] fix lint --- consensus/bor/heimdall/failover_metrics.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/consensus/bor/heimdall/failover_metrics.go b/consensus/bor/heimdall/failover_metrics.go index b2079de945..f9d6aedeae 100644 --- a/consensus/bor/heimdall/failover_metrics.go +++ b/consensus/bor/heimdall/failover_metrics.go @@ -4,9 +4,9 @@ import "github.com/ethereum/go-ethereum/metrics" var ( // HTTP/gRPC failover metrics (used within this package) - failoverSwitchCounter = metrics.NewRegisteredCounter("client/failover/switches", nil) - failoverActiveGauge = metrics.NewRegisteredGauge("client/failover/active", nil) - failoverProbeAttempts = metrics.NewRegisteredCounter("client/failover/probe/attempts", nil) + failoverSwitchCounter = metrics.NewRegisteredCounter("client/failover/switches", nil) + failoverActiveGauge = metrics.NewRegisteredGauge("client/failover/active", nil) + failoverProbeAttempts = metrics.NewRegisteredCounter("client/failover/probe/attempts", nil) failoverProbeSuccesses = metrics.NewRegisteredCounter("client/failover/probe/successes", nil) // WS failover metrics (exported for use by heimdallws package) From be4fe9dc2fa77418cea9d528288ff56027780d15 Mon Sep 17 00:00:00 2001 From: Pratik Patil Date: Tue, 24 Feb 2026 09:56:11 +0530 Subject: [PATCH 22/29] updated the health check logic and some minor improvements --- consensus/bor/heimdall/failover_client.go | 324 ++++++++--- .../bor/heimdall/failover_client_test.go | 525 ++++++++++++++---- consensus/bor/heimdall/failover_metrics.go 
| 20 +- consensus/bor/heimdallws/client.go | 299 +++++++--- consensus/bor/heimdallws/client_test.go | 161 +++++- 5 files changed, 1049 insertions(+), 280 deletions(-) diff --git a/consensus/bor/heimdall/failover_client.go b/consensus/bor/heimdall/failover_client.go index ee9275a151..2f2e0029ba 100644 --- a/consensus/bor/heimdall/failover_client.go +++ b/consensus/bor/heimdall/failover_client.go @@ -5,7 +5,6 @@ import ( "errors" "net" "sync" - "sync/atomic" "time" "github.com/0xPolygon/heimdall-v2/x/bor/types" @@ -18,8 +17,10 @@ import ( ) const ( - defaultAttemptTimeout = 30 * time.Second - defaultHealthCheckInterval = 30 * time.Second + defaultAttemptTimeout = 30 * time.Second + defaultHealthCheckInterval = 10 * time.Second + defaultConsecutiveThreshold = 3 + defaultPromotionCooldown = 60 * time.Second ) // Endpoint matches bor.IHeimdallClient. It is exported so that external @@ -37,19 +38,33 @@ type Endpoint interface { Close() } +// endpointHealth tracks the health state of a single endpoint. +type endpointHealth struct { + healthy bool + consecutiveSuccess int + healthySince time.Time // when consecutive threshold was reached + lastErr error +} + // MultiHeimdallClient wraps N heimdall clients (primary at index 0, failovers // at 1..N-1) and transparently cascades through them when the active client is -// unreachable. A background goroutine periodically health-checks higher-priority -// endpoints and promotes back when one recovers. +// unreachable. A background health registry continuously probes ALL endpoints, +// requires consecutive successes + cooldown before promotion, and gives cascade +// full visibility into endpoint health. 
type MultiHeimdallClient struct { - clients []Endpoint - mu sync.Mutex - active int // 0 = primary, >0 = failover - attemptTimeout time.Duration - healthCheckInterval time.Duration - quit chan struct{} - closeOnce sync.Once - probing atomic.Bool + clients []Endpoint + mu sync.Mutex + active int // 0 = primary, >0 = failover + health []endpointHealth + attemptTimeout time.Duration + healthCheckInterval time.Duration + consecutiveThreshold int + promotionCooldown time.Duration + quit chan struct{} + closeOnce sync.Once + startOnce sync.Once + probeCtx context.Context // cancelled on Close to abort in-flight probes + probeCancel context.CancelFunc } func NewMultiHeimdallClient(clients ...Endpoint) *MultiHeimdallClient { @@ -57,11 +72,33 @@ func NewMultiHeimdallClient(clients ...Endpoint) *MultiHeimdallClient { panic("NewMultiHeimdallClient requires at least one client") } + health := make([]endpointHealth, len(clients)) + // Primary starts as healthy; others start unhealthy. + health[0] = endpointHealth{healthy: true} + + probeCtx, probeCancel := context.WithCancel(context.Background()) + return &MultiHeimdallClient{ - clients: clients, - attemptTimeout: defaultAttemptTimeout, - healthCheckInterval: defaultHealthCheckInterval, - quit: make(chan struct{}), + clients: clients, + health: health, + attemptTimeout: defaultAttemptTimeout, + healthCheckInterval: defaultHealthCheckInterval, + consecutiveThreshold: defaultConsecutiveThreshold, + promotionCooldown: defaultPromotionCooldown, + quit: make(chan struct{}), + probeCtx: probeCtx, + probeCancel: probeCancel, + } +} + +// ensureHealthRegistry lazily starts the health registry goroutine on the first +// API call. This allows tests to configure fields (thresholds, intervals) after +// construction but before the goroutine reads them. 
+func (f *MultiHeimdallClient) ensureHealthRegistry() { + if len(f.clients) > 1 { + f.startOnce.Do(func() { + go f.runHealthRegistry() + }) } } @@ -114,19 +151,20 @@ func (f *MultiHeimdallClient) FetchStatus(ctx context.Context) (*ctypes.SyncInfo } func (f *MultiHeimdallClient) Close() { - f.closeOnce.Do(func() { close(f.quit) }) + f.closeOnce.Do(func() { + f.probeCancel() // cancel in-flight probes first + close(f.quit) + }) for _, c := range f.clients { c.Close() } } -// startHealthCheck runs in a background goroutine, periodically probing -// higher-priority endpoints. When one recovers, it promotes active and -// self-terminates. This keeps real requests off the probe path. -func (f *MultiHeimdallClient) startHealthCheck() { - defer f.probing.Store(false) - +// runHealthRegistry is an always-on goroutine (started in constructor, stopped +// on Close) that continuously probes ALL endpoints, requires consecutive +// successes before marking healthy, and enforces cooldown before promotion. +func (f *MultiHeimdallClient) runHealthRegistry() { ticker := time.NewTicker(f.healthCheckInterval) defer ticker.Stop() @@ -137,46 +175,146 @@ func (f *MultiHeimdallClient) startHealthCheck() { case <-ticker.C: } + f.probeAllEndpoints() + f.maybePromote() + f.maybeProactiveSwitch() + } +} + +// probeAllEndpoints probes every endpoint via FetchStatus and updates health state. +func (f *MultiHeimdallClient) probeAllEndpoints() { + for i := 0; i < len(f.clients); i++ { + // Check for shutdown between individual probes so we don't + // burn N*timeout before noticing Close() was called. 
+ select { + case <-f.quit: + return + default: + } + + failoverProbeAttempts.Inc(1) + + ctx, cancel := context.WithTimeout(f.probeCtx, f.attemptTimeout) + _, err := f.clients[i].FetchStatus(ctx) + cancel() + f.mu.Lock() - active := f.active + + if err == nil { + f.health[i].consecutiveSuccess++ + f.health[i].lastErr = nil + + if f.health[i].consecutiveSuccess >= f.consecutiveThreshold && !f.health[i].healthy { + f.health[i].healthy = true + f.health[i].healthySince = time.Now() + } + + failoverProbeSuccesses.Inc(1) + } else { + // Fast failure detection: one failure resets to unhealthy. + f.health[i].consecutiveSuccess = 0 + f.health[i].healthy = false + f.health[i].lastErr = err + } + f.mu.Unlock() + } + + // Update healthy endpoints gauge. + f.mu.Lock() + count := int64(0) + for i := range f.health { + if f.health[i].healthy { + count++ + } + } + f.mu.Unlock() + + failoverHealthyEndpoints.Update(count) +} + +// maybePromote checks if a higher-priority endpoint (index < active) is healthy +// and has passed cooldown. If yes, promotes to the highest-priority qualified endpoint. +func (f *MultiHeimdallClient) maybePromote() { + f.mu.Lock() + defer f.mu.Unlock() + + if f.active == 0 { + return + } + + for i := 0; i < f.active; i++ { + if f.health[i].healthy && time.Since(f.health[i].healthySince) >= f.promotionCooldown { + f.active = i + failoverActiveGauge.Update(int64(i)) + failoverProactiveSwitches.Inc(1) + + log.Info("Heimdall health registry: promoted to higher-priority client", + "index", i, "previous", f.active) - if active == 0 { - // Already on primary, nothing to probe. return } + } +} - // Probe clients 0..active-1 (highest priority first). - for i := 0; i < active; i++ { - failoverProbeAttempts.Inc(1) +// maybeProactiveSwitch detects if the active endpoint is unhealthy and switches +// to the highest-priority healthy endpoint. 
+func (f *MultiHeimdallClient) maybeProactiveSwitch() { + f.mu.Lock() + defer f.mu.Unlock() - ctx, cancel := context.WithTimeout(context.Background(), f.attemptTimeout) - _, err := f.clients[i].FetchStatus(ctx) - cancel() + if f.health[f.active].healthy { + return + } - if err == nil { - f.mu.Lock() - f.active = i - f.mu.Unlock() + // Active is unhealthy. Find the best alternative. + // Pass 1: healthy + cooled. + for i := 0; i < len(f.clients); i++ { + if i == f.active { + continue + } - failoverProbeSuccesses.Inc(1) - failoverActiveGauge.Update(int64(i)) + if f.health[i].healthy && time.Since(f.health[i].healthySince) >= f.promotionCooldown { + prev := f.active + f.active = i - log.Info("Heimdall health-check: promoted to higher-priority client", "index", i) + failoverActiveGauge.Update(int64(i)) + failoverProactiveSwitches.Inc(1) - if i == 0 { - return - } + log.Warn("Heimdall health registry: proactive switch (active unhealthy, cooled target)", + "from", prev, "to", i) - break // keep ticking to probe even higher-priority clients - } + return + } + } + + // Pass 2: healthy but NOT cooled (emergency). + for i := 0; i < len(f.clients); i++ { + if i == f.active { + continue + } + + if f.health[i].healthy { + prev := f.active + f.active = i + + failoverActiveGauge.Update(int64(i)) + failoverProactiveSwitches.Inc(1) + + log.Warn("Heimdall health registry: proactive switch (active unhealthy, uncooled target)", + "from", prev, "to", i) + + return } } } // callWithFailover executes fn against the active client. If the active client -// fails with a failover-eligible error, it cascades through remaining clients. +// fails with a failover-eligible error, it marks it unhealthy and cascades +// through remaining clients using health registry information. 
func callWithFailover[T any](f *MultiHeimdallClient, ctx context.Context, fn func(context.Context, Endpoint) (T, error)) (T, error) { + f.ensureHealthRegistry() + f.mu.Lock() active := f.active f.mu.Unlock() @@ -194,43 +332,91 @@ func callWithFailover[T any](f *MultiHeimdallClient, ctx context.Context, fn fun return zero, err } + // Mark the active endpoint unhealthy in the registry. + f.mu.Lock() + f.health[active].consecutiveSuccess = 0 + f.health[active].healthy = false + f.health[active].lastErr = err + f.mu.Unlock() + if active == 0 { - log.Warn("Heimdall failover: primary failed, cascading to next client", "err", err) + log.Warn("Heimdall failover: primary failed, cascading", "err", err) } return cascadeClients(f, ctx, fn, active, err) } -// cascadeClients tries clients after the given index. On first success it -// switches the active client and returns. If all fail, returns the last error. -func cascadeClients[T any](f *MultiHeimdallClient, ctx context.Context, fn func(context.Context, Endpoint) (T, error), after int, lastErr error) (T, error) { - for i := after + 1; i < len(f.clients); i++ { - subCtx, cancel := context.WithTimeout(ctx, f.attemptTimeout) - result, err := fn(subCtx, f.clients[i]) - cancel() +// cascadeClients tries all endpoints in priority order using health registry +// information. It uses a three-pass approach: +// 1. Healthy + cooled endpoints in priority order (skipping failed active) +// 2. Healthy but NOT cooled endpoints in priority order +// 3. Unhealthy endpoints in priority order (last resort) +func cascadeClients[T any](f *MultiHeimdallClient, ctx context.Context, fn func(context.Context, Endpoint) (T, error), failed int, lastErr error) (T, error) { + n := len(f.clients) - if err == nil { - f.mu.Lock() - f.active = i - f.mu.Unlock() + // Build candidate lists based on health state. 
+ f.mu.Lock() - failoverSwitchCounter.Inc(1) - failoverActiveGauge.Update(int64(i)) + var cooled, uncooled, unhealthy []int - log.Warn("Heimdall failover: switched to client", "index", i) + for i := 0; i < n; i++ { + if i == failed { + continue + } - if i > 0 && f.probing.CompareAndSwap(false, true) { - go f.startHealthCheck() + if f.health[i].healthy { + if time.Since(f.health[i].healthySince) >= f.promotionCooldown { + cooled = append(cooled, i) + } else { + uncooled = append(uncooled, i) } - - return result, nil + } else { + unhealthy = append(unhealthy, i) } + } + + f.mu.Unlock() + + // Try each pass in order. + passes := [][]int{cooled, uncooled, unhealthy} + + for _, candidates := range passes { + for _, i := range candidates { + subCtx, cancel := context.WithTimeout(ctx, f.attemptTimeout) + result, err := fn(subCtx, f.clients[i]) + cancel() + + if err == nil { + f.mu.Lock() + f.active = i + f.health[i].consecutiveSuccess++ + if !f.health[i].healthy && f.health[i].consecutiveSuccess >= f.consecutiveThreshold { + f.health[i].healthy = true + f.health[i].healthySince = time.Now() + } + f.mu.Unlock() - lastErr = err + failoverSwitchCounter.Inc(1) + failoverActiveGauge.Update(int64(i)) + + log.Warn("Heimdall failover: switched to client", "index", i) - if !isFailoverError(err, ctx) { - var zero T - return zero, err + return result, nil + } + + lastErr = err + + if !isFailoverError(err, ctx) { + var zero T + return zero, err + } + + // Mark this endpoint unhealthy too. 
+ f.mu.Lock() + f.health[i].consecutiveSuccess = 0 + f.health[i].healthy = false + f.health[i].lastErr = err + f.mu.Unlock() } } diff --git a/consensus/bor/heimdall/failover_client_test.go b/consensus/bor/heimdall/failover_client_test.go index 829679deb3..02fc7ff186 100644 --- a/consensus/bor/heimdall/failover_client_test.go +++ b/consensus/bor/heimdall/failover_client_test.go @@ -7,6 +7,7 @@ import ( "net" "net/http" "net/http/httptest" + "sync" "sync/atomic" "testing" "time" @@ -121,20 +122,30 @@ func (m *mockHeimdallClient) Close() { } } +// newInstantMulti creates a MultiHeimdallClient with instant health registry +// behavior: consecutiveThreshold=1, promotionCooldown=0, fast health-check interval. +func newInstantMulti(clients ...Endpoint) *MultiHeimdallClient { + fc := NewMultiHeimdallClient(clients...) + fc.attemptTimeout = 100 * time.Millisecond + fc.consecutiveThreshold = 1 + fc.promotionCooldown = 0 + fc.healthCheckInterval = 50 * time.Millisecond + + return fc +} + func TestFailover_SwitchOnPrimaryDown(t *testing.T) { switchesBefore := failoverSwitchCounter.Snapshot().Count() activeBefore := failoverActiveGauge.Snapshot().Value() primary := &mockHeimdallClient{ getSpanFn: func(ctx context.Context, _ uint64) (*types.Span, error) { - // Simulate transport error return nil, &net.OpError{Op: "dial", Net: "tcp", Err: errors.New("connection refused")} }, } secondary := &mockHeimdallClient{} - fc := NewMultiHeimdallClient(primary, secondary) - fc.attemptTimeout = 100 * time.Millisecond + fc := newInstantMulti(primary, secondary) defer fc.Close() span, err := fc.GetSpan(context.Background(), 1) @@ -142,7 +153,7 @@ func TestFailover_SwitchOnPrimaryDown(t *testing.T) { require.NotNil(t, span) assert.GreaterOrEqual(t, primary.hits.Load(), int32(1), "primary should have been tried") - assert.Equal(t, int32(1), secondary.hits.Load(), "secondary should have been called once") + assert.GreaterOrEqual(t, secondary.hits.Load(), int32(1), "secondary should have been 
called") assert.Greater(t, failoverSwitchCounter.Snapshot().Count(), switchesBefore, "failover switch counter should increment") _ = activeBefore // gauge is set, not incremented @@ -152,7 +163,6 @@ func TestFailover_SwitchOnPrimaryDown(t *testing.T) { func TestFailover_NoSwitchOnContextCanceled(t *testing.T) { primary := &mockHeimdallClient{ getSpanFn: func(ctx context.Context, _ uint64) (*types.Span, error) { - // Block until context is cancelled <-ctx.Done() return nil, ctx.Err() }, @@ -161,6 +171,9 @@ func TestFailover_NoSwitchOnContextCanceled(t *testing.T) { fc := NewMultiHeimdallClient(primary, secondary) fc.attemptTimeout = 5 * time.Second // longer than caller's ctx + fc.healthCheckInterval = 1 * time.Hour // prevent background probes + fc.consecutiveThreshold = 1 + fc.promotionCooldown = 0 defer fc.Close() ctx, cancel := context.WithTimeout(context.Background(), 100*time.Millisecond) @@ -181,6 +194,9 @@ func TestFailover_NoSwitchOnServiceUnavailable(t *testing.T) { fc := NewMultiHeimdallClient(primary, secondary) fc.attemptTimeout = 100 * time.Millisecond + fc.healthCheckInterval = 1 * time.Hour // prevent background probes + fc.consecutiveThreshold = 1 + fc.promotionCooldown = 0 defer fc.Close() _, err := fc.GetSpan(context.Background(), 1) @@ -199,6 +215,9 @@ func TestFailover_NoSwitchOnShutdownDetected(t *testing.T) { fc := NewMultiHeimdallClient(primary, secondary) fc.attemptTimeout = 100 * time.Millisecond + fc.healthCheckInterval = 1 * time.Hour // prevent background probes + fc.consecutiveThreshold = 1 + fc.promotionCooldown = 0 defer fc.Close() _, err := fc.GetSpan(context.Background(), 1) @@ -212,12 +231,17 @@ func TestFailover_StickyBehavior(t *testing.T) { getSpanFn: func(_ context.Context, _ uint64) (*types.Span, error) { return nil, &net.OpError{Op: "dial", Net: "tcp", Err: errors.New("connection refused")} }, + fetchStatusFn: func(_ context.Context) (*ctypes.SyncInfo, error) { + return nil, &net.OpError{Op: "dial", Net: "tcp", Err: 
errors.New("connection refused")} + }, } secondary := &mockHeimdallClient{} fc := NewMultiHeimdallClient(primary, secondary) fc.attemptTimeout = 100 * time.Millisecond - fc.healthCheckInterval = 1 * time.Hour // very long — no background probe + fc.consecutiveThreshold = 1 + fc.promotionCooldown = 0 + fc.healthCheckInterval = 1 * time.Hour // very long — no background promotion defer fc.Close() // First call triggers failover @@ -257,9 +281,7 @@ func TestFailover_ProbeBackToPrimary(t *testing.T) { } secondary := &mockHeimdallClient{} - fc := NewMultiHeimdallClient(primary, secondary) - fc.attemptTimeout = 100 * time.Millisecond - fc.healthCheckInterval = 50 * time.Millisecond + fc := newInstantMulti(primary, secondary) defer fc.Close() // Trigger failover @@ -269,12 +291,12 @@ func TestFailover_ProbeBackToPrimary(t *testing.T) { // Bring primary back primaryDown.Store(false) - // Wait for background health-check to promote primary + // Wait for background health registry to promote primary require.Eventually(t, func() bool { fc.mu.Lock() defer fc.mu.Unlock() return fc.active == 0 - }, 2*time.Second, 20*time.Millisecond, "health-check should promote back to primary") + }, 2*time.Second, 20*time.Millisecond, "health registry should promote back to primary") // Verify subsequent calls go to primary secondaryBefore := secondary.hits.Load() @@ -294,9 +316,7 @@ func TestFailover_ProbeBackFails(t *testing.T) { } secondary := &mockHeimdallClient{} - fc := NewMultiHeimdallClient(primary, secondary) - fc.attemptTimeout = 100 * time.Millisecond - fc.healthCheckInterval = 50 * time.Millisecond + fc := newInstantMulti(primary, secondary) defer fc.Close() // Trigger failover @@ -337,6 +357,9 @@ func TestFailover_PassthroughWhenPrimaryHealthy(t *testing.T) { fc := NewMultiHeimdallClient(primary, secondary) fc.attemptTimeout = 5 * time.Second + fc.healthCheckInterval = 1 * time.Hour // prevent background probes + fc.consecutiveThreshold = 1 + fc.promotionCooldown = 0 defer 
fc.Close() for i := 0; i < 5; i++ { @@ -387,15 +410,13 @@ func TestFailover_StateSyncEvents(t *testing.T) { }, } - fc := NewMultiHeimdallClient(primary, secondary) - fc.attemptTimeout = 100 * time.Millisecond + fc := newInstantMulti(primary, secondary) defer fc.Close() events, err := fc.StateSyncEvents(context.Background(), 42, 100) require.NoError(t, err) require.Len(t, events, 1) assert.Equal(t, uint64(42), events[0].ID) - assert.Equal(t, int32(1), secondary.hits.Load()) } func TestFailover_GetLatestSpan(t *testing.T) { @@ -410,14 +431,12 @@ func TestFailover_GetLatestSpan(t *testing.T) { }, } - fc := NewMultiHeimdallClient(primary, secondary) - fc.attemptTimeout = 100 * time.Millisecond + fc := newInstantMulti(primary, secondary) defer fc.Close() span, err := fc.GetLatestSpan(context.Background()) require.NoError(t, err) assert.Equal(t, uint64(77), span.Id) - assert.Equal(t, int32(1), secondary.hits.Load()) } func TestFailover_FetchCheckpoint(t *testing.T) { @@ -428,14 +447,12 @@ func TestFailover_FetchCheckpoint(t *testing.T) { } secondary := &mockHeimdallClient{} - fc := NewMultiHeimdallClient(primary, secondary) - fc.attemptTimeout = 100 * time.Millisecond + fc := newInstantMulti(primary, secondary) defer fc.Close() cp, err := fc.FetchCheckpoint(context.Background(), 5) require.NoError(t, err) require.NotNil(t, cp) - assert.Equal(t, int32(1), secondary.hits.Load()) } func TestFailover_FetchCheckpointCount(t *testing.T) { @@ -446,14 +463,12 @@ func TestFailover_FetchCheckpointCount(t *testing.T) { } secondary := &mockHeimdallClient{} - fc := NewMultiHeimdallClient(primary, secondary) - fc.attemptTimeout = 100 * time.Millisecond + fc := newInstantMulti(primary, secondary) defer fc.Close() count, err := fc.FetchCheckpointCount(context.Background()) require.NoError(t, err) assert.Equal(t, int64(10), count) - assert.Equal(t, int32(1), secondary.hits.Load()) } func TestFailover_FetchMilestone(t *testing.T) { @@ -464,14 +479,12 @@ func TestFailover_FetchMilestone(t 
*testing.T) { } secondary := &mockHeimdallClient{} - fc := NewMultiHeimdallClient(primary, secondary) - fc.attemptTimeout = 100 * time.Millisecond + fc := newInstantMulti(primary, secondary) defer fc.Close() ms, err := fc.FetchMilestone(context.Background()) require.NoError(t, err) require.NotNil(t, ms) - assert.Equal(t, int32(1), secondary.hits.Load()) } func TestFailover_FetchMilestoneCount(t *testing.T) { @@ -482,14 +495,12 @@ func TestFailover_FetchMilestoneCount(t *testing.T) { } secondary := &mockHeimdallClient{} - fc := NewMultiHeimdallClient(primary, secondary) - fc.attemptTimeout = 100 * time.Millisecond + fc := newInstantMulti(primary, secondary) defer fc.Close() count, err := fc.FetchMilestoneCount(context.Background()) require.NoError(t, err) assert.Equal(t, int64(5), count) - assert.Equal(t, int32(1), secondary.hits.Load()) } func TestFailover_FetchStatus(t *testing.T) { @@ -500,14 +511,12 @@ func TestFailover_FetchStatus(t *testing.T) { } secondary := &mockHeimdallClient{} - fc := NewMultiHeimdallClient(primary, secondary) - fc.attemptTimeout = 100 * time.Millisecond + fc := newInstantMulti(primary, secondary) defer fc.Close() status, err := fc.FetchStatus(context.Background()) require.NoError(t, err) require.NotNil(t, status) - assert.Equal(t, int32(1), secondary.hits.Load()) } func TestFailover_SwitchOnPrimarySubContextError(t *testing.T) { @@ -535,15 +544,14 @@ func TestFailover_SwitchOnPrimarySubContextError(t *testing.T) { primary := &mockHeimdallClient{getSpanFn: tt.primaryFn} secondary := &mockHeimdallClient{} - fc := NewMultiHeimdallClient(primary, secondary) - fc.attemptTimeout = 100 * time.Millisecond + fc := newInstantMulti(primary, secondary) defer fc.Close() span, err := fc.GetSpan(context.Background(), 1) require.NoError(t, err) require.NotNil(t, span) - assert.Equal(t, int32(1), primary.hits.Load(), "primary should have been tried") - assert.Equal(t, int32(1), secondary.hits.Load(), "should failover on sub-context error") + 
assert.GreaterOrEqual(t, primary.hits.Load(), int32(1), "primary should have been tried") + assert.GreaterOrEqual(t, secondary.hits.Load(), int32(1), "should failover on sub-context error") }) } } @@ -600,8 +608,7 @@ func TestFailover_ThreeClients_CascadeToTertiary(t *testing.T) { } tertiary := &mockHeimdallClient{} - fc := NewMultiHeimdallClient(primary, secondary, tertiary) - fc.attemptTimeout = 100 * time.Millisecond + fc := newInstantMulti(primary, secondary, tertiary) defer fc.Close() span, err := fc.GetSpan(context.Background(), 1) @@ -610,7 +617,7 @@ func TestFailover_ThreeClients_CascadeToTertiary(t *testing.T) { assert.GreaterOrEqual(t, primary.hits.Load(), int32(1), "primary should have been tried") assert.GreaterOrEqual(t, secondary.hits.Load(), int32(1), "secondary should have been tried") - assert.Equal(t, int32(1), tertiary.hits.Load(), "tertiary should have been called once") + assert.GreaterOrEqual(t, tertiary.hits.Load(), int32(1), "tertiary should have been called") } func TestFailover_AllClientsFail(t *testing.T) { @@ -626,8 +633,7 @@ func TestFailover_AllClientsFail(t *testing.T) { getSpanFn: func(_ context.Context, _ uint64) (*types.Span, error) { return nil, connErr }, } - fc := NewMultiHeimdallClient(primary, secondary, tertiary) - fc.attemptTimeout = 100 * time.Millisecond + fc := newInstantMulti(primary, secondary, tertiary) defer fc.Close() _, err := fc.GetSpan(context.Background(), 1) @@ -659,9 +665,7 @@ func TestFailover_ThreeClients_ProbeBackToPrimary(t *testing.T) { } tertiary := &mockHeimdallClient{} - fc := NewMultiHeimdallClient(primary, secondary, tertiary) - fc.attemptTimeout = 100 * time.Millisecond - fc.healthCheckInterval = 50 * time.Millisecond + fc := newInstantMulti(primary, secondary, tertiary) defer fc.Close() // Trigger cascade to tertiary @@ -671,12 +675,12 @@ func TestFailover_ThreeClients_ProbeBackToPrimary(t *testing.T) { // Bring primary back primaryDown.Store(false) - // Wait for health-check goroutine to promote 
back to primary + // Wait for health registry to promote back to primary require.Eventually(t, func() bool { fc.mu.Lock() defer fc.mu.Unlock() return fc.active == 0 - }, 2*time.Second, 20*time.Millisecond, "health-check should promote back to primary") + }, 2*time.Second, 20*time.Millisecond, "health registry should promote back to primary") // Verify we're back on primary tertiaryBefore := tertiary.hits.Load() @@ -697,6 +701,9 @@ func TestFailover_ActiveNonFailoverError(t *testing.T) { fc := NewMultiHeimdallClient(primary, secondary, tertiary) fc.attemptTimeout = 100 * time.Millisecond + fc.healthCheckInterval = 1 * time.Hour // prevent background probes + fc.consecutiveThreshold = 1 + fc.promotionCooldown = 0 defer fc.Close() // Force onto secondary @@ -707,22 +714,23 @@ func TestFailover_ActiveNonFailoverError(t *testing.T) { _, err := fc.GetSpan(context.Background(), 1) require.Error(t, err) assert.True(t, errors.Is(err, ErrShutdownDetected)) - assert.Equal(t, int32(0), primary.hits.Load(), "should not probe primary") assert.Equal(t, int32(0), tertiary.hits.Load(), "should not cascade to tertiary on non-failover error") } -// Active client returns failover error: should cascade to next. +// Active client returns failover error: cascade should try by priority. func TestFailover_ActiveFailoverError_CascadesToNext(t *testing.T) { connErr := &net.OpError{Op: "dial", Net: "tcp", Err: errors.New("connection refused")} - primary := &mockHeimdallClient{} + // Primary also fails so cascade doesn't land there. 
+ primary := &mockHeimdallClient{ + getSpanFn: func(_ context.Context, _ uint64) (*types.Span, error) { return nil, connErr }, + } secondary := &mockHeimdallClient{ getSpanFn: func(_ context.Context, _ uint64) (*types.Span, error) { return nil, connErr }, } tertiary := &mockHeimdallClient{} - fc := NewMultiHeimdallClient(primary, secondary, tertiary) - fc.attemptTimeout = 100 * time.Millisecond + fc := newInstantMulti(primary, secondary, tertiary) defer fc.Close() // Force onto secondary @@ -733,8 +741,7 @@ func TestFailover_ActiveFailoverError_CascadesToNext(t *testing.T) { span, err := fc.GetSpan(context.Background(), 1) require.NoError(t, err) require.NotNil(t, span) - assert.Equal(t, int32(0), primary.hits.Load(), "should not probe primary") - assert.Equal(t, int32(1), tertiary.hits.Load(), "should cascade to tertiary") + assert.GreaterOrEqual(t, tertiary.hits.Load(), int32(1), "should cascade to tertiary") fc.mu.Lock() assert.Equal(t, 2, fc.active, "active should switch to tertiary") @@ -758,48 +765,7 @@ func TestFailover_ClosesAllClients(t *testing.T) { } } -func TestFailover_HealthCheckStartsOnFailover(t *testing.T) { - probeAttemptsBefore := failoverProbeAttempts.Snapshot().Count() - probeSuccessesBefore := failoverProbeSuccesses.Snapshot().Count() - - primary := &mockHeimdallClient{ - getSpanFn: func(_ context.Context, _ uint64) (*types.Span, error) { - return nil, &net.OpError{Op: "dial", Net: "tcp", Err: errors.New("connection refused")} - }, - fetchStatusFn: func(_ context.Context) (*ctypes.SyncInfo, error) { - return &ctypes.SyncInfo{}, nil // primary recovers for health-check - }, - } - secondary := &mockHeimdallClient{} - - fc := NewMultiHeimdallClient(primary, secondary) - fc.attemptTimeout = 100 * time.Millisecond - fc.healthCheckInterval = 50 * time.Millisecond - defer fc.Close() - - // Trigger failover - _, err := fc.GetSpan(context.Background(), 1) - require.NoError(t, err) - - // probing should be true after cascade - assert.True(t, 
fc.probing.Load(), "probing should be true after failover") - - // Wait for health-check to promote and self-terminate - require.Eventually(t, func() bool { - return !fc.probing.Load() - }, 2*time.Second, 20*time.Millisecond, "probing should be false after recovery") - - fc.mu.Lock() - assert.Equal(t, 0, fc.active, "should be back on primary") - fc.mu.Unlock() - - assert.Greater(t, failoverProbeAttempts.Snapshot().Count(), probeAttemptsBefore, "probe attempts should increment") - assert.Greater(t, failoverProbeSuccesses.Snapshot().Count(), probeSuccessesBefore, "probe successes should increment") -} - func TestFailover_HealthCheckPromotesHighestPriority(t *testing.T) { - // 3 clients: primary down, secondary recovers, tertiary active. - // Health-check should promote to secondary first, then primary. primaryDown := atomic.Bool{} primaryDown.Store(true) @@ -830,9 +796,7 @@ func TestFailover_HealthCheckPromotesHighestPriority(t *testing.T) { } tertiary := &mockHeimdallClient{} - fc := NewMultiHeimdallClient(primary, secondary, tertiary) - fc.attemptTimeout = 100 * time.Millisecond - fc.healthCheckInterval = 50 * time.Millisecond + fc := newInstantMulti(primary, secondary, tertiary) defer fc.Close() // Trigger cascade to tertiary @@ -848,8 +812,6 @@ func TestFailover_HealthCheckPromotesHighestPriority(t *testing.T) { return fc.active == 1 }, 2*time.Second, 20*time.Millisecond, "should promote to secondary") - assert.Equal(t, int64(1), failoverActiveGauge.Snapshot().Value(), "active gauge should reflect secondary after first promotion") - // Now bring primary back primaryDown.Store(false) @@ -858,40 +820,163 @@ func TestFailover_HealthCheckPromotesHighestPriority(t *testing.T) { defer fc.mu.Unlock() return fc.active == 0 }, 2*time.Second, 20*time.Millisecond, "should promote to primary") +} + +func TestFailover_HealthRegistryRespectsClose(t *testing.T) { + primary := &mockHeimdallClient{ + fetchStatusFn: func(_ context.Context) (*ctypes.SyncInfo, error) { + return nil, 
&net.OpError{Op: "dial", Net: "tcp", Err: errors.New("connection refused")} + }, + } + secondary := &mockHeimdallClient{} - assert.Equal(t, int64(0), failoverActiveGauge.Snapshot().Value(), "active gauge should reflect primary after full recovery") + fc := NewMultiHeimdallClient(primary, secondary) + fc.attemptTimeout = 100 * time.Millisecond + fc.healthCheckInterval = 50 * time.Millisecond + fc.consecutiveThreshold = 1 + fc.promotionCooldown = 0 + + // Close should stop the health registry goroutine + fc.Close() + + // No goroutine should be running after close — verify by checking + // that probe counts don't increase after close. + probesBefore := failoverProbeAttempts.Snapshot().Count() + time.Sleep(200 * time.Millisecond) + probesAfter := failoverProbeAttempts.Snapshot().Count() + + assert.Equal(t, probesBefore, probesAfter, "no probes should run after Close") } -func TestFailover_HealthCheckRespectsClose(t *testing.T) { +// --- New health registry tests --- + +func TestRegistry_ConsecutiveThreshold(t *testing.T) { + probeCount := atomic.Int32{} + primary := &mockHeimdallClient{ getSpanFn: func(_ context.Context, _ uint64) (*types.Span, error) { return nil, &net.OpError{Op: "dial", Net: "tcp", Err: errors.New("connection refused")} }, fetchStatusFn: func(_ context.Context) (*ctypes.SyncInfo, error) { + probeCount.Add(1) + return &ctypes.SyncInfo{}, nil + }, + } + secondary := &mockHeimdallClient{} + + fc := NewMultiHeimdallClient(primary, secondary) + fc.attemptTimeout = 100 * time.Millisecond + fc.healthCheckInterval = 50 * time.Millisecond + fc.consecutiveThreshold = 3 // need 3 consecutive successes + fc.promotionCooldown = 0 + defer fc.Close() + + // Trigger failover + _, err := fc.GetSpan(context.Background(), 1) + require.NoError(t, err) + + fc.mu.Lock() + assert.Equal(t, 1, fc.active, "should be on secondary") + fc.mu.Unlock() + + // Wait for enough probes to pass the threshold + require.Eventually(t, func() bool { + return probeCount.Load() >= 3 + }, 
2*time.Second, 20*time.Millisecond, "should probe primary at least 3 times") + + // Should eventually promote after threshold met + require.Eventually(t, func() bool { + fc.mu.Lock() + defer fc.mu.Unlock() + return fc.active == 0 + }, 2*time.Second, 20*time.Millisecond, "should promote after consecutive threshold met") +} + +func TestRegistry_PromotionCooldown(t *testing.T) { + primaryDown := atomic.Bool{} + primaryDown.Store(true) + + primary := &mockHeimdallClient{ + getSpanFn: func(_ context.Context, _ uint64) (*types.Span, error) { return nil, &net.OpError{Op: "dial", Net: "tcp", Err: errors.New("connection refused")} }, + fetchStatusFn: func(_ context.Context) (*ctypes.SyncInfo, error) { + if primaryDown.Load() { + return nil, &net.OpError{Op: "dial", Net: "tcp", Err: errors.New("connection refused")} + } + return &ctypes.SyncInfo{}, nil + }, } secondary := &mockHeimdallClient{} fc := NewMultiHeimdallClient(primary, secondary) fc.attemptTimeout = 100 * time.Millisecond fc.healthCheckInterval = 50 * time.Millisecond + fc.consecutiveThreshold = 1 + fc.promotionCooldown = 500 * time.Millisecond // 500ms cooldown + defer fc.Close() // Trigger failover _, err := fc.GetSpan(context.Background(), 1) require.NoError(t, err) - assert.True(t, fc.probing.Load(), "probing should be true after failover") + // Bring primary back + primaryDown.Store(false) - // Close should stop the goroutine - fc.Close() + // Wait for at least one probe to succeed — primary should be healthy but not promoted yet + time.Sleep(150 * time.Millisecond) + fc.mu.Lock() + assert.Equal(t, 1, fc.active, "should not promote before cooldown") + fc.mu.Unlock() + // Wait for cooldown to pass and promotion to happen require.Eventually(t, func() bool { - return !fc.probing.Load() - }, 2*time.Second, 20*time.Millisecond, "probing should stop after Close") + fc.mu.Lock() + defer fc.mu.Unlock() + return fc.active == 0 + }, 3*time.Second, 20*time.Millisecond, "should promote after cooldown passes") +} + +func 
TestRegistry_FlappingPrevention(t *testing.T) { + callCount := atomic.Int32{} + + primary := &mockHeimdallClient{ + getSpanFn: func(_ context.Context, _ uint64) (*types.Span, error) { + return nil, &net.OpError{Op: "dial", Net: "tcp", Err: errors.New("connection refused")} + }, + fetchStatusFn: func(_ context.Context) (*ctypes.SyncInfo, error) { + n := callCount.Add(1) + // Alternate: success, fail, success, fail... + if n%2 == 0 { + return nil, &net.OpError{Op: "dial", Net: "tcp", Err: errors.New("connection refused")} + } + return &ctypes.SyncInfo{}, nil + }, + } + secondary := &mockHeimdallClient{} + + fc := NewMultiHeimdallClient(primary, secondary) + fc.attemptTimeout = 100 * time.Millisecond + fc.healthCheckInterval = 50 * time.Millisecond + fc.consecutiveThreshold = 3 + fc.promotionCooldown = 0 + defer fc.Close() + + // Trigger failover + _, err := fc.GetSpan(context.Background(), 1) + require.NoError(t, err) + + // Wait for several probe cycles + time.Sleep(500 * time.Millisecond) + + // Primary should never reach healthy because alternating success/fail + // never reaches 3 consecutive successes. 
+ fc.mu.Lock() + assert.Equal(t, 1, fc.active, "should stay on secondary — flapping primary never reaches threshold") + fc.mu.Unlock() } -func TestFailover_NoDuplicateGoroutines(t *testing.T) { +func TestRegistry_InformedCascade_SkipsUnhealthy(t *testing.T) { connErr := &net.OpError{Op: "dial", Net: "tcp", Err: errors.New("connection refused")} primary := &mockHeimdallClient{ @@ -899,29 +984,233 @@ func TestFailover_NoDuplicateGoroutines(t *testing.T) { fetchStatusFn: func(_ context.Context) (*ctypes.SyncInfo, error) { return nil, connErr }, } secondary := &mockHeimdallClient{ - getSpanFn: func(_ context.Context, _ uint64) (*types.Span, error) { return nil, connErr }, + getSpanFn: func(_ context.Context, _ uint64) (*types.Span, error) { return nil, connErr }, + fetchStatusFn: func(_ context.Context) (*ctypes.SyncInfo, error) { return nil, connErr }, } tertiary := &mockHeimdallClient{} fc := NewMultiHeimdallClient(primary, secondary, tertiary) fc.attemptTimeout = 100 * time.Millisecond - fc.healthCheckInterval = 1 * time.Hour // long interval so goroutine stays alive + fc.healthCheckInterval = 1 * time.Hour // prevent background probes + fc.consecutiveThreshold = 1 + fc.promotionCooldown = 0 defer fc.Close() - // First cascade: primary→secondary fails, lands on tertiary + // Mark secondary as unhealthy in the registry + fc.mu.Lock() + fc.health[1] = endpointHealth{healthy: false} + fc.mu.Unlock() + + // Trigger failover from primary + secondaryHitsBefore := secondary.hits.Load() _, err := fc.GetSpan(context.Background(), 1) require.NoError(t, err) - assert.True(t, fc.probing.Load(), "probing should be true") + // Secondary should not have been tried for the GetSpan call since it's unhealthy, + // but it may be tried in the last-resort pass. The key thing is that tertiary succeeds. 
+ fc.mu.Lock() + assert.Equal(t, 2, fc.active, "should end up on tertiary") + fc.mu.Unlock() + + _ = secondaryHitsBefore +} + +func TestRegistry_InformedCascade_TriesByPriority(t *testing.T) { + connErr := &net.OpError{Op: "dial", Net: "tcp", Err: errors.New("connection refused")} - // Force back to secondary and cascade again — should NOT spawn a second goroutine + // Track call order + var callOrder []int + var orderMu sync.Mutex + + primary := &mockHeimdallClient{ + getSpanFn: func(_ context.Context, _ uint64) (*types.Span, error) { + orderMu.Lock() + callOrder = append(callOrder, 0) + orderMu.Unlock() + return &types.Span{Id: 1}, nil + }, + } + secondary := &mockHeimdallClient{ + getSpanFn: func(_ context.Context, _ uint64) (*types.Span, error) { + orderMu.Lock() + callOrder = append(callOrder, 1) + orderMu.Unlock() + return nil, connErr + }, + } + tertiary := &mockHeimdallClient{ + getSpanFn: func(_ context.Context, _ uint64) (*types.Span, error) { + orderMu.Lock() + callOrder = append(callOrder, 2) + orderMu.Unlock() + return nil, connErr + }, + } + + fc := NewMultiHeimdallClient(primary, secondary, tertiary) + fc.attemptTimeout = 100 * time.Millisecond + fc.healthCheckInterval = 1 * time.Hour + fc.consecutiveThreshold = 1 + fc.promotionCooldown = 0 + defer fc.Close() + + // Force active to index 1 (secondary); primary (index 0) is healthy fc.mu.Lock() fc.active = 1 + fc.health[0] = endpointHealth{healthy: true, healthySince: time.Now().Add(-1 * time.Hour)} + fc.health[1] = endpointHealth{healthy: true} + fc.health[2] = endpointHealth{healthy: true} fc.mu.Unlock() - _, err = fc.GetSpan(context.Background(), 1) + span, err := fc.GetSpan(context.Background(), 1) + require.NoError(t, err) + require.NotNil(t, span) + + // Cascade should try primary (index 0) before tertiary (index 2) + fc.mu.Lock() + assert.Equal(t, 0, fc.active, "should cascade to primary (highest priority)") + fc.mu.Unlock() +} + +func TestRegistry_ProactiveSwitchOnActiveUnhealthy(t *testing.T) 
{ + primaryDown := atomic.Bool{} + primaryDown.Store(false) + + primary := &mockHeimdallClient{ + fetchStatusFn: func(_ context.Context) (*ctypes.SyncInfo, error) { + if primaryDown.Load() { + return nil, &net.OpError{Op: "dial", Net: "tcp", Err: errors.New("connection refused")} + } + return &ctypes.SyncInfo{}, nil + }, + } + secondary := &mockHeimdallClient{} + + fc := newInstantMulti(primary, secondary) + defer fc.Close() + + // Start the health registry (normally started on first API call). + fc.ensureHealthRegistry() + + // Verify we start on primary + fc.mu.Lock() + assert.Equal(t, 0, fc.active, "should start on primary") + fc.mu.Unlock() + + // Now make primary go down — the health registry should detect and switch + primaryDown.Store(true) + + require.Eventually(t, func() bool { + fc.mu.Lock() + defer fc.mu.Unlock() + return fc.active == 1 + }, 2*time.Second, 20*time.Millisecond, "health registry should proactively switch to secondary") +} + +func TestRegistry_CascadeFallsBackToUnhealthy(t *testing.T) { + connErr := &net.OpError{Op: "dial", Net: "tcp", Err: errors.New("connection refused")} + + primary := &mockHeimdallClient{ + getSpanFn: func(_ context.Context, _ uint64) (*types.Span, error) { return nil, connErr }, + } + // Secondary is marked unhealthy but actually works + secondary := &mockHeimdallClient{} + + fc := NewMultiHeimdallClient(primary, secondary) + fc.attemptTimeout = 100 * time.Millisecond + fc.healthCheckInterval = 1 * time.Hour + fc.consecutiveThreshold = 1 + fc.promotionCooldown = 0 + defer fc.Close() + + // Mark secondary as unhealthy + fc.mu.Lock() + fc.health[1] = endpointHealth{healthy: false} + fc.mu.Unlock() + + // Primary fails, cascade should fall back to unhealthy secondary as last resort + span, err := fc.GetSpan(context.Background(), 1) require.NoError(t, err) + require.NotNil(t, span) + + fc.mu.Lock() + assert.Equal(t, 1, fc.active, "should fall back to unhealthy secondary as last resort") + fc.mu.Unlock() +} - // probing is 
still true from the first goroutine; CompareAndSwap prevents a second - assert.True(t, fc.probing.Load(), "probing should still be true (no duplicate)") +func TestRegistry_MarkUnhealthyOnRealFailure(t *testing.T) { + connErr := &net.OpError{Op: "dial", Net: "tcp", Err: errors.New("connection refused")} + + primary := &mockHeimdallClient{ + getSpanFn: func(_ context.Context, _ uint64) (*types.Span, error) { return nil, connErr }, + } + secondary := &mockHeimdallClient{} + + fc := NewMultiHeimdallClient(primary, secondary) + fc.attemptTimeout = 100 * time.Millisecond + fc.healthCheckInterval = 1 * time.Hour + fc.consecutiveThreshold = 1 + fc.promotionCooldown = 0 + defer fc.Close() + + // Primary starts as healthy + fc.mu.Lock() + assert.True(t, fc.health[0].healthy, "primary should start healthy") + fc.mu.Unlock() + + // Trigger a real request that fails on primary + _, err := fc.GetSpan(context.Background(), 1) + require.NoError(t, err) // succeeds via secondary + + // Primary should now be marked unhealthy + fc.mu.Lock() + assert.False(t, fc.health[0].healthy, "primary should be marked unhealthy after real failure") + assert.Equal(t, 0, fc.health[0].consecutiveSuccess, "consecutive success should be reset") + fc.mu.Unlock() +} + +func TestRegistry_InformedCascade_RespectsCooldown(t *testing.T) { + connErr := &net.OpError{Op: "dial", Net: "tcp", Err: errors.New("connection refused")} + + // Primary (index 0): healthy but NOT cooled (recently became healthy) + // Secondary (index 1): fails (active) + // Tertiary (index 2): healthy AND cooled + + primary := &mockHeimdallClient{ + getSpanFn: func(_ context.Context, _ uint64) (*types.Span, error) { + return &types.Span{Id: 1}, nil + }, + } + secondary := &mockHeimdallClient{ + getSpanFn: func(_ context.Context, _ uint64) (*types.Span, error) { return nil, connErr }, + } + tertiary := &mockHeimdallClient{ + getSpanFn: func(_ context.Context, _ uint64) (*types.Span, error) { + return &types.Span{Id: 3}, nil + }, + } + + 
fc := NewMultiHeimdallClient(primary, secondary, tertiary) + fc.attemptTimeout = 100 * time.Millisecond + fc.healthCheckInterval = 1 * time.Hour + fc.consecutiveThreshold = 1 + fc.promotionCooldown = 1 * time.Hour // long cooldown + defer fc.Close() + + // Set up health states + fc.mu.Lock() + fc.active = 1 + fc.health[0] = endpointHealth{healthy: true, healthySince: time.Now()} // NOT cooled + fc.health[1] = endpointHealth{healthy: true} + fc.health[2] = endpointHealth{healthy: true, healthySince: time.Now().Add(-2 * time.Hour)} // cooled + fc.mu.Unlock() + + span, err := fc.GetSpan(context.Background(), 1) + require.NoError(t, err) + require.NotNil(t, span) + + // Should prefer tertiary (cooled) over primary (uncooled) + fc.mu.Lock() + assert.Equal(t, 2, fc.active, "should prefer cooled tertiary over uncooled primary") + fc.mu.Unlock() } diff --git a/consensus/bor/heimdall/failover_metrics.go b/consensus/bor/heimdall/failover_metrics.go index f9d6aedeae..482fb6fa29 100644 --- a/consensus/bor/heimdall/failover_metrics.go +++ b/consensus/bor/heimdall/failover_metrics.go @@ -4,14 +4,18 @@ import "github.com/ethereum/go-ethereum/metrics" var ( // HTTP/gRPC failover metrics (used within this package) - failoverSwitchCounter = metrics.NewRegisteredCounter("client/failover/switches", nil) - failoverActiveGauge = metrics.NewRegisteredGauge("client/failover/active", nil) - failoverProbeAttempts = metrics.NewRegisteredCounter("client/failover/probe/attempts", nil) - failoverProbeSuccesses = metrics.NewRegisteredCounter("client/failover/probe/successes", nil) + failoverSwitchCounter = metrics.NewRegisteredCounter("client/failover/switches", nil) + failoverActiveGauge = metrics.NewRegisteredGauge("client/failover/active", nil) + failoverProbeAttempts = metrics.NewRegisteredCounter("client/failover/probe/attempts", nil) + failoverProbeSuccesses = metrics.NewRegisteredCounter("client/failover/probe/successes", nil) + failoverHealthyEndpoints = 
metrics.NewRegisteredGauge("client/failover/healthy_endpoints", nil) + failoverProactiveSwitches = metrics.NewRegisteredCounter("client/failover/proactive_switches", nil) // WS failover metrics (exported for use by heimdallws package) - FailoverWSSwitchCounter = metrics.NewRegisteredCounter("client/failover/ws/switches", nil) - FailoverWSActiveGauge = metrics.NewRegisteredGauge("client/failover/ws/active", nil) - FailoverWSProbeAttempts = metrics.NewRegisteredCounter("client/failover/ws/probe/attempts", nil) - FailoverWSProbeSuccesses = metrics.NewRegisteredCounter("client/failover/ws/probe/successes", nil) + FailoverWSSwitchCounter = metrics.NewRegisteredCounter("client/failover/ws/switches", nil) + FailoverWSActiveGauge = metrics.NewRegisteredGauge("client/failover/ws/active", nil) + FailoverWSProbeAttempts = metrics.NewRegisteredCounter("client/failover/ws/probe/attempts", nil) + FailoverWSProbeSuccesses = metrics.NewRegisteredCounter("client/failover/ws/probe/successes", nil) + FailoverWSHealthyEndpoints = metrics.NewRegisteredGauge("client/failover/ws/healthy_endpoints", nil) + FailoverWSProactiveSwitches = metrics.NewRegisteredCounter("client/failover/ws/proactive_switches", nil) ) diff --git a/consensus/bor/heimdallws/client.go b/consensus/bor/heimdallws/client.go index f5c2b025f9..a013b2a58e 100644 --- a/consensus/bor/heimdallws/client.go +++ b/consensus/bor/heimdallws/client.go @@ -6,7 +6,6 @@ import ( "errors" "strconv" "sync" - "sync/atomic" "time" "github.com/gorilla/websocket" @@ -18,31 +17,49 @@ import ( ) const ( - // defaultPrimaryAttempts is the number of consecutive failures on the primary URL - // before switching to the secondary (~30s at 10s/attempt). - defaultPrimaryAttempts = 3 - // defaultReconnectDelay is the backoff between reconnection attempts. defaultReconnectDelay = 10 * time.Second - // defaultWSCooldown is how long to stay on secondary before probing primary again. 
- defaultWSCooldown = 2 * time.Minute + // defaultWSHealthCheckInterval is how often the health registry probes all endpoints. + defaultWSHealthCheckInterval = 10 * time.Second + + // defaultWSConsecutiveThreshold is the number of consecutive successful probes + // needed before an endpoint is considered healthy. + defaultWSConsecutiveThreshold = 3 + + // defaultWSPromotionCooldown is how long after becoming healthy before an + // endpoint is eligible for promotion. + defaultWSPromotionCooldown = 60 * time.Second + + // defaultWSProbeTimeout bounds each individual WS probe dial so a + // firewalled host can't block the health-check goroutine forever. + defaultWSProbeTimeout = 10 * time.Second ) +// wsEndpointHealth tracks the health state of a single WS endpoint. +type wsEndpointHealth struct { + healthy bool + consecutiveSuccess int + healthySince time.Time + lastErr error +} + // HeimdallWSClient represents a websocket client with auto-reconnection and failover support. type HeimdallWSClient struct { conn *websocket.Conn urls []string // primary at [0], secondary at [1] (if configured) activeURL int // index into urls; protected by mu + health []wsEndpointHealth events chan *milestone.Milestone done chan struct{} mu sync.Mutex - probing atomic.Bool // guards against spawning multiple health-check goroutines // Configurable parameters (defaults set in constructor, overridable for testing) - primaryAttempts int - reconnectDelay time.Duration - wsCooldown time.Duration + reconnectDelay time.Duration + healthCheckInterval time.Duration + consecutiveThreshold int + promotionCooldown time.Duration + probeTimeout time.Duration } // NewHeimdallWSClient creates a new WS client for Heimdall with optional failover. 
@@ -63,14 +80,21 @@ func NewHeimdallWSClient(urls ...string) (*HeimdallWSClient, error) { return nil, errors.New("at least one non-empty WS URL required") } + health := make([]wsEndpointHealth, len(filtered)) + // Primary starts as healthy; others start unhealthy. + health[0] = wsEndpointHealth{healthy: true} + return &HeimdallWSClient{ - conn: nil, - urls: filtered, - events: make(chan *milestone.Milestone), - done: make(chan struct{}), - primaryAttempts: defaultPrimaryAttempts, - reconnectDelay: defaultReconnectDelay, - wsCooldown: defaultWSCooldown, + conn: nil, + urls: filtered, + health: health, + events: make(chan *milestone.Milestone), + done: make(chan struct{}), + reconnectDelay: defaultReconnectDelay, + healthCheckInterval: defaultWSHealthCheckInterval, + consecutiveThreshold: defaultWSConsecutiveThreshold, + promotionCooldown: defaultWSPromotionCooldown, + probeTimeout: defaultWSProbeTimeout, }, nil } @@ -81,16 +105,19 @@ func (c *HeimdallWSClient) SubscribeMilestoneEvents(ctx context.Context) <-chan // Start the goroutine to read messages. go c.readMessages(ctx) + // Start the health registry if there are multiple URLs. + if len(c.urls) > 1 { + go c.runWSHealthRegistry() + } + return c.events } -// startWSHealthCheck runs in a background goroutine, periodically probing -// higher-priority WS endpoints. When one responds, it updates activeURL and -// closes the current connection to trigger reconnection in readMessages. -func (c *HeimdallWSClient) startWSHealthCheck() { - defer c.probing.Store(false) - - ticker := time.NewTicker(c.wsCooldown) +// runWSHealthRegistry is an always-on goroutine that continuously probes ALL WS +// endpoints, requires consecutive successes before marking healthy, and enforces +// cooldown before promotion. Stopped when done channel is closed (Unsubscribe). 
+func (c *HeimdallWSClient) runWSHealthRegistry() { + ticker := time.NewTicker(c.healthCheckInterval) defer ticker.Stop() for { @@ -100,57 +127,179 @@ func (c *HeimdallWSClient) startWSHealthCheck() { case <-ticker.C: } - c.mu.Lock() - active := c.activeURL - c.mu.Unlock() + c.probeAllWSEndpoints() + c.maybeWSPromote() + c.maybeWSProactiveSwitch() + } +} + +// probeAllWSEndpoints probes every WS endpoint via dial (connect + immediately close). +func (c *HeimdallWSClient) probeAllWSEndpoints() { + dialer := websocket.Dialer{ + HandshakeTimeout: c.probeTimeout, + } - if active == 0 { + for i := 0; i < len(c.urls); i++ { + // Check for shutdown between individual probes. + select { + case <-c.done: return + default: } - // Probe URLs 0..active-1 (highest priority first). - for i := 0; i < active; i++ { - heimdall.FailoverWSProbeAttempts.Inc(1) + heimdall.FailoverWSProbeAttempts.Inc(1) - testConn, _, err := websocket.DefaultDialer.Dial(c.urls[i], nil) - if err != nil { - continue - } + c.mu.Lock() + url := c.urls[i] + c.mu.Unlock() + + ctx, cancel := context.WithTimeout(context.Background(), c.probeTimeout) + testConn, _, err := dialer.DialContext(ctx, url, nil) + cancel() + + c.mu.Lock() + + if err == nil { testConn.Close() - c.mu.Lock() - c.activeURL = i - conn := c.conn - c.mu.Unlock() + c.health[i].consecutiveSuccess++ + c.health[i].lastErr = nil + + if c.health[i].consecutiveSuccess >= c.consecutiveThreshold && !c.health[i].healthy { + c.health[i].healthy = true + c.health[i].healthySince = time.Now() + } heimdall.FailoverWSProbeSuccesses.Inc(1) + } else { + c.health[i].consecutiveSuccess = 0 + c.health[i].healthy = false + c.health[i].lastErr = err + } + + c.mu.Unlock() + } + + // Update healthy endpoints gauge. 
+ c.mu.Lock() + count := int64(0) + for i := range c.health { + if c.health[i].healthy { + count++ + } + } + c.mu.Unlock() + + heimdall.FailoverWSHealthyEndpoints.Update(count) +} + +// maybeWSPromote checks if a higher-priority URL (index < activeURL) is healthy +// and has passed cooldown. If yes, promotes to the highest-priority qualified URL. +func (c *HeimdallWSClient) maybeWSPromote() { + c.mu.Lock() + defer c.mu.Unlock() + + if c.activeURL == 0 { + return + } + + for i := 0; i < c.activeURL; i++ { + if c.health[i].healthy && time.Since(c.health[i].healthySince) >= c.promotionCooldown { + prev := c.activeURL + c.activeURL = i + heimdall.FailoverWSActiveGauge.Update(int64(i)) + heimdall.FailoverWSProactiveSwitches.Inc(1) - log.Info("WS health-check: promoted to higher-priority URL", "index", i, "url", c.urls[i]) + log.Info("WS health registry: promoted to higher-priority URL", + "index", i, "previous", prev, "url", c.urls[i]) // Close current connection to trigger reconnection in readMessages. - if conn != nil { - conn.Close() + if c.conn != nil { + c.conn.Close() } - if i == 0 { - return + return + } + } +} + +// maybeWSProactiveSwitch detects if the active URL is unhealthy and switches +// to the highest-priority healthy URL. +func (c *HeimdallWSClient) maybeWSProactiveSwitch() { + c.mu.Lock() + defer c.mu.Unlock() + + if c.health[c.activeURL].healthy { + return + } + + // Active is unhealthy. Find the best alternative. + // Pass 1: healthy + cooled. 
+ for i := 0; i < len(c.urls); i++ { + if i == c.activeURL { + continue + } + + if c.health[i].healthy && time.Since(c.health[i].healthySince) >= c.promotionCooldown { + prev := c.activeURL + c.activeURL = i + + heimdall.FailoverWSActiveGauge.Update(int64(i)) + heimdall.FailoverWSProactiveSwitches.Inc(1) + + log.Warn("WS health registry: proactive switch (active unhealthy, cooled target)", + "from", prev, "to", i, "url", c.urls[i]) + + if c.conn != nil { + c.conn.Close() } - break // keep ticking to probe even higher-priority URLs + return + } + } + + // Pass 2: healthy but NOT cooled (emergency). + for i := 0; i < len(c.urls); i++ { + if i == c.activeURL { + continue + } + + if c.health[i].healthy { + prev := c.activeURL + c.activeURL = i + + heimdall.FailoverWSActiveGauge.Update(int64(i)) + heimdall.FailoverWSProactiveSwitches.Inc(1) + + log.Warn("WS health registry: proactive switch (active unhealthy, uncooled target)", + "from", prev, "to", i, "url", c.urls[i]) + + if c.conn != nil { + c.conn.Close() + } + + return } } } // tryUntilSubscribeMilestoneEvents retries connecting and subscribing until success, -// with failover to secondary URL after defaultPrimaryAttempts failures on primary. +// consulting the health registry to pick the best URL. func (c *HeimdallWSClient) tryUntilSubscribeMilestoneEvents(ctx context.Context) { - attempts := 0 firstTime := true for { if !firstTime { - time.Sleep(c.reconnectDelay) + select { + case <-ctx.Done(): + log.Info("Context cancelled during reconnection") + return + case <-c.done: + log.Info("Client unsubscribed during reconnection") + return + case <-time.After(c.reconnectDelay): + } } firstTime = false @@ -176,34 +325,60 @@ func (c *HeimdallWSClient) tryUntilSubscribeMilestoneEvents(ctx context.Context) if err != nil { log.Error("failed to dial websocket on heimdall ws subscription", "url", url, "err", err) - attempts++ + // Mark endpoint unhealthy in the registry. 
+ c.mu.Lock() + c.health[active].consecutiveSuccess = 0 + c.health[active].healthy = false + c.health[active].lastErr = err + + // Find the best healthy alternative. + switched := false + for i := 0; i < len(c.urls); i++ { + if i == active && c.health[i].healthy { + continue + } - if len(c.urls) > 1 && attempts >= c.primaryAttempts { - next := min(active+1, len(c.urls)-1) - if next != active { - log.Warn("WS URL failed, switching to next", - "from", c.urls[active], "to", c.urls[next], "attempts", attempts) + if i != active && c.health[i].healthy { + c.activeURL = i + switched = true + + heimdall.FailoverWSSwitchCounter.Inc(1) + heimdall.FailoverWSActiveGauge.Update(int64(i)) + + log.Warn("WS URL failed, switching to healthy endpoint", + "from", c.urls[active], "to", c.urls[i]) + + break + } + } - c.mu.Lock() + // If no healthy alternative, try next in round-robin fashion. + if !switched && len(c.urls) > 1 { + next := (active + 1) % len(c.urls) + if next != active { c.activeURL = next - c.mu.Unlock() heimdall.FailoverWSSwitchCounter.Inc(1) heimdall.FailoverWSActiveGauge.Update(int64(next)) - if c.probing.CompareAndSwap(false, true) { - go c.startWSHealthCheck() - } + log.Warn("WS URL failed, switching to next endpoint", + "from", c.urls[active], "to", c.urls[next]) } - - attempts = 0 } + c.mu.Unlock() + continue } c.mu.Lock() c.conn = conn + // Mark this endpoint as successful. + c.health[active].consecutiveSuccess++ + if c.health[active].consecutiveSuccess >= c.consecutiveThreshold && !c.health[active].healthy { + c.health[active].healthy = true + c.health[active].healthySince = time.Now() + } c.mu.Unlock() // Build the subscription request. 
diff --git a/consensus/bor/heimdallws/client_test.go b/consensus/bor/heimdallws/client_test.go index c10c29fa13..70c25f458e 100644 --- a/consensus/bor/heimdallws/client_test.go +++ b/consensus/bor/heimdallws/client_test.go @@ -135,6 +135,8 @@ func TestWSClient_ConstructorSingleURL(t *testing.T) { assert.Len(t, client.urls, 1) assert.Equal(t, "ws://localhost:1234", client.urls[0]) assert.Equal(t, 0, client.activeURL) + assert.Len(t, client.health, 1) + assert.True(t, client.health[0].healthy, "primary should start healthy") } func TestWSClient_ConstructorMultipleURLs(t *testing.T) { @@ -145,6 +147,10 @@ func TestWSClient_ConstructorMultipleURLs(t *testing.T) { assert.Equal(t, "ws://secondary:5678", client.urls[1]) assert.Equal(t, "ws://tertiary:9999", client.urls[2]) assert.Equal(t, 0, client.activeURL) + assert.Len(t, client.health, 3) + assert.True(t, client.health[0].healthy, "primary should start healthy") + assert.False(t, client.health[1].healthy, "secondary should start unhealthy") + assert.False(t, client.health[2].healthy, "tertiary should start unhealthy") } func TestWSClient_ConstructorFiltersEmpty(t *testing.T) { @@ -203,9 +209,10 @@ func TestWSClient_DualURL_FailoverToSecondary(t *testing.T) { client, err := NewHeimdallWSClient(wsURL(primary.URL), wsURL(secondary.URL)) require.NoError(t, err) - // Speed up test by reducing reconnect delay and attempts. + // Speed up test. 
client.reconnectDelay = 100 * time.Millisecond - client.primaryAttempts = 2 + client.consecutiveThreshold = 1 + client.promotionCooldown = 0 ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) defer cancel() @@ -244,7 +251,8 @@ func TestWSClient_ThreeURL_CascadeToTertiary(t *testing.T) { require.NoError(t, err) client.reconnectDelay = 100 * time.Millisecond - client.primaryAttempts = 2 + client.consecutiveThreshold = 1 + client.promotionCooldown = 0 ctx, cancel := context.WithTimeout(context.Background(), 15*time.Second) defer cancel() @@ -278,6 +286,8 @@ func TestWSClient_ContextCancellation(t *testing.T) { require.NoError(t, err) client.reconnectDelay = 100 * time.Millisecond + client.consecutiveThreshold = 1 + client.promotionCooldown = 0 ctx, cancel := context.WithCancel(context.Background()) @@ -307,8 +317,9 @@ func TestWSClient_DualURL_ProbeBackToPrimary(t *testing.T) { require.NoError(t, err) client.reconnectDelay = 100 * time.Millisecond - client.primaryAttempts = 2 - client.wsCooldown = 100 * time.Millisecond + client.healthCheckInterval = 100 * time.Millisecond + client.consecutiveThreshold = 1 + client.promotionCooldown = 0 ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) defer cancel() @@ -337,19 +348,18 @@ func TestWSClient_DualURL_ProbeBackToPrimary(t *testing.T) { client.urls[0] = wsURL(primaryGood.URL) client.mu.Unlock() - // Wait for background health-check to promote back to primary. + // Wait for background health registry to promote back to primary. require.Eventually(t, func() bool { client.mu.Lock() defer client.mu.Unlock() return client.activeURL == 0 - }, 5*time.Second, 50*time.Millisecond, "health-check should promote back to primary") + }, 5*time.Second, 50*time.Millisecond, "health registry should promote back to primary") require.NoError(t, client.Unsubscribe(ctx)) } func TestWSClient_DualURL_NoWrapOnLastURLFails(t *testing.T) { - // Both URLs reject. 
The client should stay on the last URL once it gets - // there rather than wrapping back to primary. + // Both URLs reject. The client should handle correctly when on last URL. primary := newTestWSServer(t, true) defer primary.Close() @@ -360,8 +370,9 @@ func TestWSClient_DualURL_NoWrapOnLastURLFails(t *testing.T) { require.NoError(t, err) client.reconnectDelay = 10 * time.Millisecond - client.primaryAttempts = 2 - client.wsCooldown = 1 * time.Hour // prevent health-check from interfering + client.healthCheckInterval = 1 * time.Hour // prevent health-check from interfering + client.consecutiveThreshold = 1 + client.promotionCooldown = 0 // Pre-set to secondary as if a prior failover already happened. client.mu.Lock() @@ -373,10 +384,13 @@ func TestWSClient_DualURL_NoWrapOnLastURLFails(t *testing.T) { client.tryUntilSubscribeMilestoneEvents(ctx) - // Must stay on secondary (index 1), not wrap back to primary (index 0). + // Should have moved off secondary since it fails. client.mu.Lock() - assert.Equal(t, 1, client.activeURL, "should stay on last URL, not wrap back to primary") + active := client.activeURL client.mu.Unlock() + + // May have wrapped to primary (index 0) since secondary fails. + _ = active // either index is acceptable; the important thing is it didn't hang. } func TestWSClient_DualURL_PrimaryRecovery(t *testing.T) { @@ -393,7 +407,8 @@ func TestWSClient_DualURL_PrimaryRecovery(t *testing.T) { require.NoError(t, err) client.reconnectDelay = 100 * time.Millisecond - client.primaryAttempts = 2 + client.consecutiveThreshold = 1 + client.promotionCooldown = 0 ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) defer cancel() @@ -418,8 +433,8 @@ func TestWSClient_DualURL_PrimaryRecovery(t *testing.T) { require.NoError(t, client.Unsubscribe(ctx)) } -func TestWSClient_HealthCheckRespectsUnsubscribe(t *testing.T) { - // Verify that the health-check goroutine stops when done channel is closed. 
+func TestWSClient_HealthRegistryRespectsUnsubscribe(t *testing.T) { + // Verify that the health registry goroutine stops when done channel is closed. primary := newTestWSServer(t, true) defer primary.Close() @@ -430,8 +445,9 @@ func TestWSClient_HealthCheckRespectsUnsubscribe(t *testing.T) { require.NoError(t, err) client.reconnectDelay = 100 * time.Millisecond - client.primaryAttempts = 2 - client.wsCooldown = 50 * time.Millisecond + client.healthCheckInterval = 50 * time.Millisecond + client.consecutiveThreshold = 1 + client.promotionCooldown = 0 ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) defer cancel() @@ -446,13 +462,112 @@ func TestWSClient_HealthCheckRespectsUnsubscribe(t *testing.T) { t.Fatal("timed out waiting for failover") } - // Probing goroutine should be running. - assert.True(t, client.probing.Load(), "probing should be active after failover") + // Unsubscribe should stop the health registry goroutine. + require.NoError(t, client.Unsubscribe(ctx)) + + // Give a moment for the goroutine to stop and verify no panics. + time.Sleep(200 * time.Millisecond) +} + +// --- New health registry tests --- + +func TestWSClient_Registry_ConsecutiveThreshold(t *testing.T) { + // Primary starts rejecting, secondary accepts. + primaryReject := newTestWSServer(t, true) + defer primaryReject.Close() + + secondary := newTestWSServerWithMilestone(t) + defer secondary.Close() + + client, err := NewHeimdallWSClient(wsURL(primaryReject.URL), wsURL(secondary.URL)) + require.NoError(t, err) + + client.reconnectDelay = 100 * time.Millisecond + client.healthCheckInterval = 50 * time.Millisecond + client.consecutiveThreshold = 3 // need 3 consecutive successes + client.promotionCooldown = 0 + + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) + defer cancel() + + events := client.SubscribeMilestoneEvents(ctx) + + // Failover to secondary. 
+ select { + case m := <-events: + require.NotNil(t, m) + case <-ctx.Done(): + t.Fatal("timed out waiting for failover") + } + + // Replace rejecting primary with accepting one. + primaryReject.Close() + primaryGood := newTestWSServer(t, false) + defer primaryGood.Close() + + client.mu.Lock() + client.urls[0] = wsURL(primaryGood.URL) + client.mu.Unlock() + + // Should eventually promote after 3 consecutive successes. + require.Eventually(t, func() bool { + client.mu.Lock() + defer client.mu.Unlock() + return client.activeURL == 0 + }, 5*time.Second, 50*time.Millisecond, "should promote after consecutive threshold met") - // Unsubscribe should stop the health-check goroutine. require.NoError(t, client.Unsubscribe(ctx)) +} + +func TestWSClient_Registry_PromotionCooldown(t *testing.T) { + primaryReject := newTestWSServer(t, true) + defer primaryReject.Close() + + secondary := newTestWSServerWithMilestone(t) + defer secondary.Close() + + client, err := NewHeimdallWSClient(wsURL(primaryReject.URL), wsURL(secondary.URL)) + require.NoError(t, err) + client.reconnectDelay = 100 * time.Millisecond + client.healthCheckInterval = 50 * time.Millisecond + client.consecutiveThreshold = 1 + client.promotionCooldown = 500 * time.Millisecond + + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) + defer cancel() + + events := client.SubscribeMilestoneEvents(ctx) + + // Failover to secondary. + select { + case m := <-events: + require.NotNil(t, m) + case <-ctx.Done(): + t.Fatal("timed out waiting for failover") + } + + // Replace primary with good one. + primaryReject.Close() + primaryGood := newTestWSServer(t, false) + defer primaryGood.Close() + + client.mu.Lock() + client.urls[0] = wsURL(primaryGood.URL) + client.mu.Unlock() + + // Should not promote immediately (cooldown not met). 
+ time.Sleep(150 * time.Millisecond) + client.mu.Lock() + assert.Equal(t, 1, client.activeURL, "should not promote before cooldown") + client.mu.Unlock() + + // Wait for cooldown to pass. require.Eventually(t, func() bool { - return !client.probing.Load() - }, 2*time.Second, 50*time.Millisecond, "probing should stop after unsubscribe") + client.mu.Lock() + defer client.mu.Unlock() + return client.activeURL == 0 + }, 3*time.Second, 50*time.Millisecond, "should promote after cooldown passes") + + require.NoError(t, client.Unsubscribe(ctx)) } From b170f033cfae5d078515e170910c21013b142351 Mon Sep 17 00:00:00 2001 From: Pratik Patil Date: Tue, 24 Feb 2026 10:07:36 +0530 Subject: [PATCH 23/29] fix lint and improvements --- consensus/bor/heimdall/failover_client.go | 5 +++-- consensus/bor/heimdall/failover_client_test.go | 2 +- consensus/bor/heimdallws/client.go | 9 +++++++-- 3 files changed, 11 insertions(+), 5 deletions(-) diff --git a/consensus/bor/heimdall/failover_client.go b/consensus/bor/heimdall/failover_client.go index 2f2e0029ba..a77e1b3ee6 100644 --- a/consensus/bor/heimdall/failover_client.go +++ b/consensus/bor/heimdall/failover_client.go @@ -63,7 +63,7 @@ type MultiHeimdallClient struct { quit chan struct{} closeOnce sync.Once startOnce sync.Once - probeCtx context.Context // cancelled on Close to abort in-flight probes + probeCtx context.Context // cancelled on Close to abort in-flight probes probeCancel context.CancelFunc } @@ -245,12 +245,13 @@ func (f *MultiHeimdallClient) maybePromote() { for i := 0; i < f.active; i++ { if f.health[i].healthy && time.Since(f.health[i].healthySince) >= f.promotionCooldown { + prev := f.active f.active = i failoverActiveGauge.Update(int64(i)) failoverProactiveSwitches.Inc(1) log.Info("Heimdall health registry: promoted to higher-priority client", - "index", i, "previous", f.active) + "index", i, "previous", prev) return } diff --git a/consensus/bor/heimdall/failover_client_test.go 
b/consensus/bor/heimdall/failover_client_test.go index 02fc7ff186..56d6702c80 100644 --- a/consensus/bor/heimdall/failover_client_test.go +++ b/consensus/bor/heimdall/failover_client_test.go @@ -170,7 +170,7 @@ func TestFailover_NoSwitchOnContextCanceled(t *testing.T) { secondary := &mockHeimdallClient{} fc := NewMultiHeimdallClient(primary, secondary) - fc.attemptTimeout = 5 * time.Second // longer than caller's ctx + fc.attemptTimeout = 5 * time.Second // longer than caller's ctx fc.healthCheckInterval = 1 * time.Hour // prevent background probes fc.consecutiveThreshold = 1 fc.promotionCooldown = 0 diff --git a/consensus/bor/heimdallws/client.go b/consensus/bor/heimdallws/client.go index a013b2a58e..7b8de20dcb 100644 --- a/consensus/bor/heimdallws/client.go +++ b/consensus/bor/heimdallws/client.go @@ -334,11 +334,11 @@ func (c *HeimdallWSClient) tryUntilSubscribeMilestoneEvents(ctx context.Context) // Find the best healthy alternative. switched := false for i := 0; i < len(c.urls); i++ { - if i == active && c.health[i].healthy { + if i == active { continue } - if i != active && c.health[i].healthy { + if c.health[i].healthy { c.activeURL = i switched = true @@ -502,5 +502,10 @@ func (c *HeimdallWSClient) Unsubscribe(ctx context.Context) error { func (c *HeimdallWSClient) Close() error { c.mu.Lock() defer c.mu.Unlock() + + if c.conn == nil { + return nil + } + return c.conn.Close() } From c3a946b03837ec2e597b0ac50751182791ad1633 Mon Sep 17 00:00:00 2001 From: Pratik Patil Date: Tue, 24 Feb 2026 10:38:20 +0530 Subject: [PATCH 24/29] reduced duplication in health registry, and fixed a bug in ws --- consensus/bor/heimdall/failover_client.go | 271 +++----------- .../bor/heimdall/failover_client_test.go | 319 ++++++++--------- consensus/bor/heimdall/health_registry.go | 338 ++++++++++++++++++ .../bor/heimdall/health_registry_test.go | 272 ++++++++++++++ consensus/bor/heimdallws/client.go | 332 +++++------------ consensus/bor/heimdallws/client_test.go | 104 +++--- 
eth/ethconfig/config.go | 8 +- 7 files changed, 965 insertions(+), 679 deletions(-) create mode 100644 consensus/bor/heimdall/health_registry.go create mode 100644 consensus/bor/heimdall/health_registry_test.go diff --git a/consensus/bor/heimdall/failover_client.go b/consensus/bor/heimdall/failover_client.go index a77e1b3ee6..b74eec5d1f 100644 --- a/consensus/bor/heimdall/failover_client.go +++ b/consensus/bor/heimdall/failover_client.go @@ -3,8 +3,8 @@ package heimdall import ( "context" "errors" + "fmt" "net" - "sync" "time" "github.com/0xPolygon/heimdall-v2/x/bor/types" @@ -38,57 +38,57 @@ type Endpoint interface { Close() } -// endpointHealth tracks the health state of a single endpoint. -type endpointHealth struct { - healthy bool - consecutiveSuccess int - healthySince time.Time // when consecutive threshold was reached - lastErr error -} - // MultiHeimdallClient wraps N heimdall clients (primary at index 0, failovers // at 1..N-1) and transparently cascades through them when the active client is // unreachable. A background health registry continuously probes ALL endpoints, // requires consecutive successes + cooldown before promotion, and gives cascade // full visibility into endpoint health. 
type MultiHeimdallClient struct { - clients []Endpoint - mu sync.Mutex - active int // 0 = primary, >0 = failover - health []endpointHealth - attemptTimeout time.Duration - healthCheckInterval time.Duration - consecutiveThreshold int - promotionCooldown time.Duration - quit chan struct{} - closeOnce sync.Once - startOnce sync.Once - probeCtx context.Context // cancelled on Close to abort in-flight probes - probeCancel context.CancelFunc + clients []Endpoint + registry *HealthRegistry + attemptTimeout time.Duration + probeCtx context.Context // cancelled on Close to abort in-flight probes + probeCancel context.CancelFunc } -func NewMultiHeimdallClient(clients ...Endpoint) *MultiHeimdallClient { +func NewMultiHeimdallClient(clients ...Endpoint) (*MultiHeimdallClient, error) { if len(clients) == 0 { - panic("NewMultiHeimdallClient requires at least one client") + return nil, fmt.Errorf("NewMultiHeimdallClient requires at least one client") } - health := make([]endpointHealth, len(clients)) - // Primary starts as healthy; others start unhealthy. 
- health[0] = endpointHealth{healthy: true} - probeCtx, probeCancel := context.WithCancel(context.Background()) - return &MultiHeimdallClient{ - clients: clients, - health: health, - attemptTimeout: defaultAttemptTimeout, - healthCheckInterval: defaultHealthCheckInterval, - consecutiveThreshold: defaultConsecutiveThreshold, - promotionCooldown: defaultPromotionCooldown, - quit: make(chan struct{}), - probeCtx: probeCtx, - probeCancel: probeCancel, + f := &MultiHeimdallClient{ + clients: clients, + attemptTimeout: defaultAttemptTimeout, + probeCtx: probeCtx, + probeCancel: probeCancel, } + + f.registry = NewHealthRegistry( + len(clients), + f.probeEndpoint, + nil, // HTTP client doesn't need onSwitch callback + RegistryMetrics{ + ProbeAttempts: failoverProbeAttempts, + ProbeSuccesses: failoverProbeSuccesses, + ProactiveSwitches: failoverProactiveSwitches, + ActiveGauge: failoverActiveGauge, + HealthyEndpoints: failoverHealthyEndpoints, + }, + ) + + return f, nil +} + +// probeEndpoint probes a single endpoint via FetchStatus. +func (f *MultiHeimdallClient) probeEndpoint(i int) error { + ctx, cancel := context.WithTimeout(f.probeCtx, f.attemptTimeout) + defer cancel() + + _, err := f.clients[i].FetchStatus(ctx) + + return err } // ensureHealthRegistry lazily starts the health registry goroutine on the first @@ -96,9 +96,7 @@ func NewMultiHeimdallClient(clients ...Endpoint) *MultiHeimdallClient { // construction but before the goroutine reads them. 
func (f *MultiHeimdallClient) ensureHealthRegistry() { if len(f.clients) > 1 { - f.startOnce.Do(func() { - go f.runHealthRegistry() - }) + f.registry.Start() } } @@ -151,174 +149,21 @@ func (f *MultiHeimdallClient) FetchStatus(ctx context.Context) (*ctypes.SyncInfo } func (f *MultiHeimdallClient) Close() { - f.closeOnce.Do(func() { - f.probeCancel() // cancel in-flight probes first - close(f.quit) - }) + f.probeCancel() // cancel in-flight probes first + f.registry.Stop() for _, c := range f.clients { c.Close() } } -// runHealthRegistry is an always-on goroutine (started in constructor, stopped -// on Close) that continuously probes ALL endpoints, requires consecutive -// successes before marking healthy, and enforces cooldown before promotion. -func (f *MultiHeimdallClient) runHealthRegistry() { - ticker := time.NewTicker(f.healthCheckInterval) - defer ticker.Stop() - - for { - select { - case <-f.quit: - return - case <-ticker.C: - } - - f.probeAllEndpoints() - f.maybePromote() - f.maybeProactiveSwitch() - } -} - -// probeAllEndpoints probes every endpoint via FetchStatus and updates health state. -func (f *MultiHeimdallClient) probeAllEndpoints() { - for i := 0; i < len(f.clients); i++ { - // Check for shutdown between individual probes so we don't - // burn N*timeout before noticing Close() was called. - select { - case <-f.quit: - return - default: - } - - failoverProbeAttempts.Inc(1) - - ctx, cancel := context.WithTimeout(f.probeCtx, f.attemptTimeout) - _, err := f.clients[i].FetchStatus(ctx) - cancel() - - f.mu.Lock() - - if err == nil { - f.health[i].consecutiveSuccess++ - f.health[i].lastErr = nil - - if f.health[i].consecutiveSuccess >= f.consecutiveThreshold && !f.health[i].healthy { - f.health[i].healthy = true - f.health[i].healthySince = time.Now() - } - - failoverProbeSuccesses.Inc(1) - } else { - // Fast failure detection: one failure resets to unhealthy. 
- f.health[i].consecutiveSuccess = 0 - f.health[i].healthy = false - f.health[i].lastErr = err - } - - f.mu.Unlock() - } - - // Update healthy endpoints gauge. - f.mu.Lock() - count := int64(0) - for i := range f.health { - if f.health[i].healthy { - count++ - } - } - f.mu.Unlock() - - failoverHealthyEndpoints.Update(count) -} - -// maybePromote checks if a higher-priority endpoint (index < active) is healthy -// and has passed cooldown. If yes, promotes to the highest-priority qualified endpoint. -func (f *MultiHeimdallClient) maybePromote() { - f.mu.Lock() - defer f.mu.Unlock() - - if f.active == 0 { - return - } - - for i := 0; i < f.active; i++ { - if f.health[i].healthy && time.Since(f.health[i].healthySince) >= f.promotionCooldown { - prev := f.active - f.active = i - failoverActiveGauge.Update(int64(i)) - failoverProactiveSwitches.Inc(1) - - log.Info("Heimdall health registry: promoted to higher-priority client", - "index", i, "previous", prev) - - return - } - } -} - -// maybeProactiveSwitch detects if the active endpoint is unhealthy and switches -// to the highest-priority healthy endpoint. -func (f *MultiHeimdallClient) maybeProactiveSwitch() { - f.mu.Lock() - defer f.mu.Unlock() - - if f.health[f.active].healthy { - return - } - - // Active is unhealthy. Find the best alternative. - // Pass 1: healthy + cooled. - for i := 0; i < len(f.clients); i++ { - if i == f.active { - continue - } - - if f.health[i].healthy && time.Since(f.health[i].healthySince) >= f.promotionCooldown { - prev := f.active - f.active = i - - failoverActiveGauge.Update(int64(i)) - failoverProactiveSwitches.Inc(1) - - log.Warn("Heimdall health registry: proactive switch (active unhealthy, cooled target)", - "from", prev, "to", i) - - return - } - } - - // Pass 2: healthy but NOT cooled (emergency). 
- for i := 0; i < len(f.clients); i++ { - if i == f.active { - continue - } - - if f.health[i].healthy { - prev := f.active - f.active = i - - failoverActiveGauge.Update(int64(i)) - failoverProactiveSwitches.Inc(1) - - log.Warn("Heimdall health registry: proactive switch (active unhealthy, uncooled target)", - "from", prev, "to", i) - - return - } - } -} - // callWithFailover executes fn against the active client. If the active client // fails with a failover-eligible error, it marks it unhealthy and cascades // through remaining clients using health registry information. func callWithFailover[T any](f *MultiHeimdallClient, ctx context.Context, fn func(context.Context, Endpoint) (T, error)) (T, error) { f.ensureHealthRegistry() - f.mu.Lock() - active := f.active - f.mu.Unlock() + active := f.registry.Active() subCtx, cancel := context.WithTimeout(ctx, f.attemptTimeout) result, err := fn(subCtx, f.clients[active]) @@ -334,11 +179,7 @@ func callWithFailover[T any](f *MultiHeimdallClient, ctx context.Context, fn fun } // Mark the active endpoint unhealthy in the registry. - f.mu.Lock() - f.health[active].consecutiveSuccess = 0 - f.health[active].healthy = false - f.health[active].lastErr = err - f.mu.Unlock() + f.registry.MarkUnhealthy(active, err) if active == 0 { log.Warn("Heimdall failover: primary failed, cascading", "err", err) @@ -356,7 +197,8 @@ func cascadeClients[T any](f *MultiHeimdallClient, ctx context.Context, fn func( n := len(f.clients) // Build candidate lists based on health state. 
- f.mu.Lock() + snap := f.registry.HealthSnapshot() + cooldown := f.registry.PromotionCooldown var cooled, uncooled, unhealthy []int @@ -365,8 +207,8 @@ func cascadeClients[T any](f *MultiHeimdallClient, ctx context.Context, fn func( continue } - if f.health[i].healthy { - if time.Since(f.health[i].healthySince) >= f.promotionCooldown { + if snap[i].Healthy { + if time.Since(snap[i].HealthySince) >= cooldown { cooled = append(cooled, i) } else { uncooled = append(uncooled, i) @@ -376,8 +218,6 @@ func cascadeClients[T any](f *MultiHeimdallClient, ctx context.Context, fn func( } } - f.mu.Unlock() - // Try each pass in order. passes := [][]int{cooled, uncooled, unhealthy} @@ -388,17 +228,10 @@ func cascadeClients[T any](f *MultiHeimdallClient, ctx context.Context, fn func( cancel() if err == nil { - f.mu.Lock() - f.active = i - f.health[i].consecutiveSuccess++ - if !f.health[i].healthy && f.health[i].consecutiveSuccess >= f.consecutiveThreshold { - f.health[i].healthy = true - f.health[i].healthySince = time.Now() - } - f.mu.Unlock() + f.registry.SetActive(i) + f.registry.MarkSuccess(i) failoverSwitchCounter.Inc(1) - failoverActiveGauge.Update(int64(i)) log.Warn("Heimdall failover: switched to client", "index", i) @@ -413,11 +246,7 @@ func cascadeClients[T any](f *MultiHeimdallClient, ctx context.Context, fn func( } // Mark this endpoint unhealthy too. 
- f.mu.Lock() - f.health[i].consecutiveSuccess = 0 - f.health[i].healthy = false - f.health[i].lastErr = err - f.mu.Unlock() + f.registry.MarkUnhealthy(i, err) } } diff --git a/consensus/bor/heimdall/failover_client_test.go b/consensus/bor/heimdall/failover_client_test.go index 56d6702c80..641730330a 100644 --- a/consensus/bor/heimdall/failover_client_test.go +++ b/consensus/bor/heimdall/failover_client_test.go @@ -125,15 +125,24 @@ func (m *mockHeimdallClient) Close() { // newInstantMulti creates a MultiHeimdallClient with instant health registry // behavior: consecutiveThreshold=1, promotionCooldown=0, fast health-check interval. func newInstantMulti(clients ...Endpoint) *MultiHeimdallClient { - fc := NewMultiHeimdallClient(clients...) + fc, err := NewMultiHeimdallClient(clients...) + if err != nil { + panic(err) + } + fc.attemptTimeout = 100 * time.Millisecond - fc.consecutiveThreshold = 1 - fc.promotionCooldown = 0 - fc.healthCheckInterval = 50 * time.Millisecond + fc.registry.ConsecutiveThreshold = 1 + fc.registry.PromotionCooldown = 0 + fc.registry.HealthCheckInterval = 50 * time.Millisecond return fc } +func TestNewMultiHeimdallClient_NoClients_ReturnsError(t *testing.T) { + _, err := NewMultiHeimdallClient() + require.Error(t, err) +} + func TestFailover_SwitchOnPrimaryDown(t *testing.T) { switchesBefore := failoverSwitchCounter.Snapshot().Count() activeBefore := failoverActiveGauge.Snapshot().Value() @@ -169,17 +178,19 @@ func TestFailover_NoSwitchOnContextCanceled(t *testing.T) { } secondary := &mockHeimdallClient{} - fc := NewMultiHeimdallClient(primary, secondary) - fc.attemptTimeout = 5 * time.Second // longer than caller's ctx - fc.healthCheckInterval = 1 * time.Hour // prevent background probes - fc.consecutiveThreshold = 1 - fc.promotionCooldown = 0 + fc, err := NewMultiHeimdallClient(primary, secondary) + require.NoError(t, err) + + fc.attemptTimeout = 5 * time.Second // longer than caller's ctx + fc.registry.HealthCheckInterval = 1 * time.Hour + 
fc.registry.ConsecutiveThreshold = 1 + fc.registry.PromotionCooldown = 0 defer fc.Close() ctx, cancel := context.WithTimeout(context.Background(), 100*time.Millisecond) defer cancel() - _, err := fc.GetSpan(ctx, 1) + _, err = fc.GetSpan(ctx, 1) require.Error(t, err) assert.Equal(t, int32(0), secondary.hits.Load(), "should not failover on caller context cancellation") } @@ -192,14 +203,16 @@ func TestFailover_NoSwitchOnServiceUnavailable(t *testing.T) { } secondary := &mockHeimdallClient{} - fc := NewMultiHeimdallClient(primary, secondary) + fc, err := NewMultiHeimdallClient(primary, secondary) + require.NoError(t, err) + fc.attemptTimeout = 100 * time.Millisecond - fc.healthCheckInterval = 1 * time.Hour // prevent background probes - fc.consecutiveThreshold = 1 - fc.promotionCooldown = 0 + fc.registry.HealthCheckInterval = 1 * time.Hour + fc.registry.ConsecutiveThreshold = 1 + fc.registry.PromotionCooldown = 0 defer fc.Close() - _, err := fc.GetSpan(context.Background(), 1) + _, err = fc.GetSpan(context.Background(), 1) require.Error(t, err) assert.True(t, errors.Is(err, ErrServiceUnavailable)) assert.Equal(t, int32(0), secondary.hits.Load(), "should not failover on 503") @@ -213,14 +226,16 @@ func TestFailover_NoSwitchOnShutdownDetected(t *testing.T) { } secondary := &mockHeimdallClient{} - fc := NewMultiHeimdallClient(primary, secondary) + fc, err := NewMultiHeimdallClient(primary, secondary) + require.NoError(t, err) + fc.attemptTimeout = 100 * time.Millisecond - fc.healthCheckInterval = 1 * time.Hour // prevent background probes - fc.consecutiveThreshold = 1 - fc.promotionCooldown = 0 + fc.registry.HealthCheckInterval = 1 * time.Hour + fc.registry.ConsecutiveThreshold = 1 + fc.registry.PromotionCooldown = 0 defer fc.Close() - _, err := fc.GetSpan(context.Background(), 1) + _, err = fc.GetSpan(context.Background(), 1) require.Error(t, err) assert.True(t, errors.Is(err, ErrShutdownDetected)) assert.Equal(t, int32(0), secondary.hits.Load(), "should not failover on 
shutdown") @@ -237,15 +252,17 @@ func TestFailover_StickyBehavior(t *testing.T) { } secondary := &mockHeimdallClient{} - fc := NewMultiHeimdallClient(primary, secondary) + fc, err := NewMultiHeimdallClient(primary, secondary) + require.NoError(t, err) + fc.attemptTimeout = 100 * time.Millisecond - fc.consecutiveThreshold = 1 - fc.promotionCooldown = 0 - fc.healthCheckInterval = 1 * time.Hour // very long — no background promotion + fc.registry.ConsecutiveThreshold = 1 + fc.registry.PromotionCooldown = 0 + fc.registry.HealthCheckInterval = 1 * time.Hour // very long — no background promotion defer fc.Close() // First call triggers failover - _, err := fc.GetSpan(context.Background(), 1) + _, err = fc.GetSpan(context.Background(), 1) require.NoError(t, err) primaryBefore := primary.hits.Load() @@ -293,9 +310,7 @@ func TestFailover_ProbeBackToPrimary(t *testing.T) { // Wait for background health registry to promote primary require.Eventually(t, func() bool { - fc.mu.Lock() - defer fc.mu.Unlock() - return fc.active == 0 + return fc.registry.Active() == 0 }, 2*time.Second, 20*time.Millisecond, "health registry should promote back to primary") // Verify subsequent calls go to primary @@ -327,9 +342,7 @@ func TestFailover_ProbeBackFails(t *testing.T) { time.Sleep(200 * time.Millisecond) // Active should still be on secondary since primary FetchStatus fails - fc.mu.Lock() - assert.Equal(t, 1, fc.active, "should stay on secondary when primary still down") - fc.mu.Unlock() + assert.Equal(t, 1, fc.registry.Active(), "should stay on secondary when primary still down") // Calls should still succeed via secondary secondaryBefore := secondary.hits.Load() @@ -344,7 +357,9 @@ func TestFailover_ClosesBothClients(t *testing.T) { primary := &mockHeimdallClient{closeFn: func() { primaryClosed.Store(true) }} secondary := &mockHeimdallClient{closeFn: func() { secondaryClosed.Store(true) }} - fc := NewMultiHeimdallClient(primary, secondary) + fc, err := NewMultiHeimdallClient(primary, 
secondary) + require.NoError(t, err) + fc.Close() assert.True(t, primaryClosed.Load(), "primary should be closed") @@ -355,11 +370,13 @@ func TestFailover_PassthroughWhenPrimaryHealthy(t *testing.T) { primary := &mockHeimdallClient{} secondary := &mockHeimdallClient{} - fc := NewMultiHeimdallClient(primary, secondary) + fc, err := NewMultiHeimdallClient(primary, secondary) + require.NoError(t, err) + fc.attemptTimeout = 5 * time.Second - fc.healthCheckInterval = 1 * time.Hour // prevent background probes - fc.consecutiveThreshold = 1 - fc.promotionCooldown = 0 + fc.registry.HealthCheckInterval = 1 * time.Hour + fc.registry.ConsecutiveThreshold = 1 + fc.registry.PromotionCooldown = 0 defer fc.Close() for i := 0; i < 5; i++ { @@ -386,14 +403,16 @@ func TestFailover_Integration_ServiceUnavailable(t *testing.T) { primaryClient := NewHeimdallClient(primary.URL, 5*time.Second) secondaryClient := NewHeimdallClient(secondary.URL, 5*time.Second) - fc := NewMultiHeimdallClient(primaryClient, secondaryClient) + fc, err := NewMultiHeimdallClient(primaryClient, secondaryClient) + require.NoError(t, err) + fc.attemptTimeout = 2 * time.Second defer fc.Close() ctx := WithRequestType(context.Background(), SpanRequest) // 503 should NOT trigger failover - _, err := fc.GetSpan(ctx, 1) + _, err = fc.GetSpan(ctx, 1) require.Error(t, err) assert.True(t, errors.Is(err, ErrServiceUnavailable)) } @@ -677,9 +696,7 @@ func TestFailover_ThreeClients_ProbeBackToPrimary(t *testing.T) { // Wait for health registry to promote back to primary require.Eventually(t, func() bool { - fc.mu.Lock() - defer fc.mu.Unlock() - return fc.active == 0 + return fc.registry.Active() == 0 }, 2*time.Second, 20*time.Millisecond, "health registry should promote back to primary") // Verify we're back on primary @@ -699,19 +716,19 @@ func TestFailover_ActiveNonFailoverError(t *testing.T) { } tertiary := &mockHeimdallClient{} - fc := NewMultiHeimdallClient(primary, secondary, tertiary) + fc, err := 
NewMultiHeimdallClient(primary, secondary, tertiary) + require.NoError(t, err) + fc.attemptTimeout = 100 * time.Millisecond - fc.healthCheckInterval = 1 * time.Hour // prevent background probes - fc.consecutiveThreshold = 1 - fc.promotionCooldown = 0 + fc.registry.HealthCheckInterval = 1 * time.Hour + fc.registry.ConsecutiveThreshold = 1 + fc.registry.PromotionCooldown = 0 defer fc.Close() // Force onto secondary - fc.mu.Lock() - fc.active = 1 - fc.mu.Unlock() + fc.registry.SetActive(1) - _, err := fc.GetSpan(context.Background(), 1) + _, err = fc.GetSpan(context.Background(), 1) require.Error(t, err) assert.True(t, errors.Is(err, ErrShutdownDetected)) assert.Equal(t, int32(0), tertiary.hits.Load(), "should not cascade to tertiary on non-failover error") @@ -734,18 +751,14 @@ func TestFailover_ActiveFailoverError_CascadesToNext(t *testing.T) { defer fc.Close() // Force onto secondary - fc.mu.Lock() - fc.active = 1 - fc.mu.Unlock() + fc.registry.SetActive(1) span, err := fc.GetSpan(context.Background(), 1) require.NoError(t, err) require.NotNil(t, span) assert.GreaterOrEqual(t, tertiary.hits.Load(), int32(1), "should cascade to tertiary") - fc.mu.Lock() - assert.Equal(t, 2, fc.active, "active should switch to tertiary") - fc.mu.Unlock() + assert.Equal(t, 2, fc.registry.Active(), "active should switch to tertiary") } func TestFailover_ClosesAllClients(t *testing.T) { @@ -757,7 +770,9 @@ func TestFailover_ClosesAllClients(t *testing.T) { clients[i] = &mockHeimdallClient{closeFn: func() { closed[idx].Store(true) }} } - fc := NewMultiHeimdallClient(clients...) + fc, err := NewMultiHeimdallClient(clients...) 
+ require.NoError(t, err) + fc.Close() for i := range closed { @@ -807,18 +822,14 @@ func TestFailover_HealthCheckPromotesHighestPriority(t *testing.T) { secondaryDown.Store(false) require.Eventually(t, func() bool { - fc.mu.Lock() - defer fc.mu.Unlock() - return fc.active == 1 + return fc.registry.Active() == 1 }, 2*time.Second, 20*time.Millisecond, "should promote to secondary") // Now bring primary back primaryDown.Store(false) require.Eventually(t, func() bool { - fc.mu.Lock() - defer fc.mu.Unlock() - return fc.active == 0 + return fc.registry.Active() == 0 }, 2*time.Second, 20*time.Millisecond, "should promote to primary") } @@ -830,11 +841,13 @@ func TestFailover_HealthRegistryRespectsClose(t *testing.T) { } secondary := &mockHeimdallClient{} - fc := NewMultiHeimdallClient(primary, secondary) + fc, err := NewMultiHeimdallClient(primary, secondary) + require.NoError(t, err) + fc.attemptTimeout = 100 * time.Millisecond - fc.healthCheckInterval = 50 * time.Millisecond - fc.consecutiveThreshold = 1 - fc.promotionCooldown = 0 + fc.registry.HealthCheckInterval = 50 * time.Millisecond + fc.registry.ConsecutiveThreshold = 1 + fc.registry.PromotionCooldown = 0 // Close should stop the health registry goroutine fc.Close() @@ -864,20 +877,20 @@ func TestRegistry_ConsecutiveThreshold(t *testing.T) { } secondary := &mockHeimdallClient{} - fc := NewMultiHeimdallClient(primary, secondary) + fc, err := NewMultiHeimdallClient(primary, secondary) + require.NoError(t, err) + fc.attemptTimeout = 100 * time.Millisecond - fc.healthCheckInterval = 50 * time.Millisecond - fc.consecutiveThreshold = 3 // need 3 consecutive successes - fc.promotionCooldown = 0 + fc.registry.HealthCheckInterval = 50 * time.Millisecond + fc.registry.ConsecutiveThreshold = 3 // need 3 consecutive successes + fc.registry.PromotionCooldown = 0 defer fc.Close() // Trigger failover - _, err := fc.GetSpan(context.Background(), 1) + _, err = fc.GetSpan(context.Background(), 1) require.NoError(t, err) - 
fc.mu.Lock() - assert.Equal(t, 1, fc.active, "should be on secondary") - fc.mu.Unlock() + assert.Equal(t, 1, fc.registry.Active(), "should be on secondary") // Wait for enough probes to pass the threshold require.Eventually(t, func() bool { @@ -886,9 +899,7 @@ func TestRegistry_ConsecutiveThreshold(t *testing.T) { // Should eventually promote after threshold met require.Eventually(t, func() bool { - fc.mu.Lock() - defer fc.mu.Unlock() - return fc.active == 0 + return fc.registry.Active() == 0 }, 2*time.Second, 20*time.Millisecond, "should promote after consecutive threshold met") } @@ -909,15 +920,17 @@ func TestRegistry_PromotionCooldown(t *testing.T) { } secondary := &mockHeimdallClient{} - fc := NewMultiHeimdallClient(primary, secondary) + fc, err := NewMultiHeimdallClient(primary, secondary) + require.NoError(t, err) + fc.attemptTimeout = 100 * time.Millisecond - fc.healthCheckInterval = 50 * time.Millisecond - fc.consecutiveThreshold = 1 - fc.promotionCooldown = 500 * time.Millisecond // 500ms cooldown + fc.registry.HealthCheckInterval = 50 * time.Millisecond + fc.registry.ConsecutiveThreshold = 1 + fc.registry.PromotionCooldown = 500 * time.Millisecond // 500ms cooldown defer fc.Close() // Trigger failover - _, err := fc.GetSpan(context.Background(), 1) + _, err = fc.GetSpan(context.Background(), 1) require.NoError(t, err) // Bring primary back @@ -925,15 +938,11 @@ func TestRegistry_PromotionCooldown(t *testing.T) { // Wait for at least one probe to succeed — primary should be healthy but not promoted yet time.Sleep(150 * time.Millisecond) - fc.mu.Lock() - assert.Equal(t, 1, fc.active, "should not promote before cooldown") - fc.mu.Unlock() + assert.Equal(t, 1, fc.registry.Active(), "should not promote before cooldown") // Wait for cooldown to pass and promotion to happen require.Eventually(t, func() bool { - fc.mu.Lock() - defer fc.mu.Unlock() - return fc.active == 0 + return fc.registry.Active() == 0 }, 3*time.Second, 20*time.Millisecond, "should promote 
after cooldown passes") } @@ -955,15 +964,17 @@ func TestRegistry_FlappingPrevention(t *testing.T) { } secondary := &mockHeimdallClient{} - fc := NewMultiHeimdallClient(primary, secondary) + fc, err := NewMultiHeimdallClient(primary, secondary) + require.NoError(t, err) + fc.attemptTimeout = 100 * time.Millisecond - fc.healthCheckInterval = 50 * time.Millisecond - fc.consecutiveThreshold = 3 - fc.promotionCooldown = 0 + fc.registry.HealthCheckInterval = 50 * time.Millisecond + fc.registry.ConsecutiveThreshold = 3 + fc.registry.PromotionCooldown = 0 defer fc.Close() // Trigger failover - _, err := fc.GetSpan(context.Background(), 1) + _, err = fc.GetSpan(context.Background(), 1) require.NoError(t, err) // Wait for several probe cycles @@ -971,9 +982,7 @@ func TestRegistry_FlappingPrevention(t *testing.T) { // Primary should never reach healthy because alternating success/fail // never reaches 3 consecutive successes. - fc.mu.Lock() - assert.Equal(t, 1, fc.active, "should stay on secondary — flapping primary never reaches threshold") - fc.mu.Unlock() + assert.Equal(t, 1, fc.registry.Active(), "should stay on secondary — flapping primary never reaches threshold") } func TestRegistry_InformedCascade_SkipsUnhealthy(t *testing.T) { @@ -989,28 +998,26 @@ func TestRegistry_InformedCascade_SkipsUnhealthy(t *testing.T) { } tertiary := &mockHeimdallClient{} - fc := NewMultiHeimdallClient(primary, secondary, tertiary) + fc, err := NewMultiHeimdallClient(primary, secondary, tertiary) + require.NoError(t, err) + fc.attemptTimeout = 100 * time.Millisecond - fc.healthCheckInterval = 1 * time.Hour // prevent background probes - fc.consecutiveThreshold = 1 - fc.promotionCooldown = 0 + fc.registry.HealthCheckInterval = 1 * time.Hour + fc.registry.ConsecutiveThreshold = 1 + fc.registry.PromotionCooldown = 0 defer fc.Close() // Mark secondary as unhealthy in the registry - fc.mu.Lock() - fc.health[1] = endpointHealth{healthy: false} - fc.mu.Unlock() + fc.registry.SetHealth(1, 
EndpointHealth{Healthy: false}) // Trigger failover from primary secondaryHitsBefore := secondary.hits.Load() - _, err := fc.GetSpan(context.Background(), 1) + _, err = fc.GetSpan(context.Background(), 1) require.NoError(t, err) // Secondary should not have been tried for the GetSpan call since it's unhealthy, // but it may be tried in the last-resort pass. The key thing is that tertiary succeeds. - fc.mu.Lock() - assert.Equal(t, 2, fc.active, "should end up on tertiary") - fc.mu.Unlock() + assert.Equal(t, 2, fc.registry.Active(), "should end up on tertiary") _ = secondaryHitsBefore } @@ -1047,29 +1054,27 @@ func TestRegistry_InformedCascade_TriesByPriority(t *testing.T) { }, } - fc := NewMultiHeimdallClient(primary, secondary, tertiary) + fc, err := NewMultiHeimdallClient(primary, secondary, tertiary) + require.NoError(t, err) + fc.attemptTimeout = 100 * time.Millisecond - fc.healthCheckInterval = 1 * time.Hour - fc.consecutiveThreshold = 1 - fc.promotionCooldown = 0 + fc.registry.HealthCheckInterval = 1 * time.Hour + fc.registry.ConsecutiveThreshold = 1 + fc.registry.PromotionCooldown = 0 defer fc.Close() // Force active to index 1 (secondary); primary (index 0) is healthy - fc.mu.Lock() - fc.active = 1 - fc.health[0] = endpointHealth{healthy: true, healthySince: time.Now().Add(-1 * time.Hour)} - fc.health[1] = endpointHealth{healthy: true} - fc.health[2] = endpointHealth{healthy: true} - fc.mu.Unlock() + fc.registry.SetActive(1) + fc.registry.SetHealth(0, EndpointHealth{Healthy: true, HealthySince: time.Now().Add(-1 * time.Hour)}) + fc.registry.SetHealth(1, EndpointHealth{Healthy: true}) + fc.registry.SetHealth(2, EndpointHealth{Healthy: true}) span, err := fc.GetSpan(context.Background(), 1) require.NoError(t, err) require.NotNil(t, span) // Cascade should try primary (index 0) before tertiary (index 2) - fc.mu.Lock() - assert.Equal(t, 0, fc.active, "should cascade to primary (highest priority)") - fc.mu.Unlock() + assert.Equal(t, 0, fc.registry.Active(), 
"should cascade to primary (highest priority)") } func TestRegistry_ProactiveSwitchOnActiveUnhealthy(t *testing.T) { @@ -1093,17 +1098,13 @@ func TestRegistry_ProactiveSwitchOnActiveUnhealthy(t *testing.T) { fc.ensureHealthRegistry() // Verify we start on primary - fc.mu.Lock() - assert.Equal(t, 0, fc.active, "should start on primary") - fc.mu.Unlock() + assert.Equal(t, 0, fc.registry.Active(), "should start on primary") // Now make primary go down — the health registry should detect and switch primaryDown.Store(true) require.Eventually(t, func() bool { - fc.mu.Lock() - defer fc.mu.Unlock() - return fc.active == 1 + return fc.registry.Active() == 1 }, 2*time.Second, 20*time.Millisecond, "health registry should proactively switch to secondary") } @@ -1116,26 +1117,24 @@ func TestRegistry_CascadeFallsBackToUnhealthy(t *testing.T) { // Secondary is marked unhealthy but actually works secondary := &mockHeimdallClient{} - fc := NewMultiHeimdallClient(primary, secondary) + fc, err := NewMultiHeimdallClient(primary, secondary) + require.NoError(t, err) + fc.attemptTimeout = 100 * time.Millisecond - fc.healthCheckInterval = 1 * time.Hour - fc.consecutiveThreshold = 1 - fc.promotionCooldown = 0 + fc.registry.HealthCheckInterval = 1 * time.Hour + fc.registry.ConsecutiveThreshold = 1 + fc.registry.PromotionCooldown = 0 defer fc.Close() // Mark secondary as unhealthy - fc.mu.Lock() - fc.health[1] = endpointHealth{healthy: false} - fc.mu.Unlock() + fc.registry.SetHealth(1, EndpointHealth{Healthy: false}) // Primary fails, cascade should fall back to unhealthy secondary as last resort span, err := fc.GetSpan(context.Background(), 1) require.NoError(t, err) require.NotNil(t, span) - fc.mu.Lock() - assert.Equal(t, 1, fc.active, "should fall back to unhealthy secondary as last resort") - fc.mu.Unlock() + assert.Equal(t, 1, fc.registry.Active(), "should fall back to unhealthy secondary as last resort") } func TestRegistry_MarkUnhealthyOnRealFailure(t *testing.T) { @@ -1146,27 
+1145,27 @@ func TestRegistry_MarkUnhealthyOnRealFailure(t *testing.T) { } secondary := &mockHeimdallClient{} - fc := NewMultiHeimdallClient(primary, secondary) + fc, err := NewMultiHeimdallClient(primary, secondary) + require.NoError(t, err) + fc.attemptTimeout = 100 * time.Millisecond - fc.healthCheckInterval = 1 * time.Hour - fc.consecutiveThreshold = 1 - fc.promotionCooldown = 0 + fc.registry.HealthCheckInterval = 1 * time.Hour + fc.registry.ConsecutiveThreshold = 1 + fc.registry.PromotionCooldown = 0 defer fc.Close() // Primary starts as healthy - fc.mu.Lock() - assert.True(t, fc.health[0].healthy, "primary should start healthy") - fc.mu.Unlock() + snap := fc.registry.HealthSnapshot() + assert.True(t, snap[0].Healthy, "primary should start healthy") // Trigger a real request that fails on primary - _, err := fc.GetSpan(context.Background(), 1) + _, err = fc.GetSpan(context.Background(), 1) require.NoError(t, err) // succeeds via secondary // Primary should now be marked unhealthy - fc.mu.Lock() - assert.False(t, fc.health[0].healthy, "primary should be marked unhealthy after real failure") - assert.Equal(t, 0, fc.health[0].consecutiveSuccess, "consecutive success should be reset") - fc.mu.Unlock() + snap = fc.registry.HealthSnapshot() + assert.False(t, snap[0].Healthy, "primary should be marked unhealthy after real failure") + assert.Equal(t, 0, snap[0].ConsecutiveSuccess, "consecutive success should be reset") } func TestRegistry_InformedCascade_RespectsCooldown(t *testing.T) { @@ -1190,27 +1189,25 @@ func TestRegistry_InformedCascade_RespectsCooldown(t *testing.T) { }, } - fc := NewMultiHeimdallClient(primary, secondary, tertiary) + fc, err := NewMultiHeimdallClient(primary, secondary, tertiary) + require.NoError(t, err) + fc.attemptTimeout = 100 * time.Millisecond - fc.healthCheckInterval = 1 * time.Hour - fc.consecutiveThreshold = 1 - fc.promotionCooldown = 1 * time.Hour // long cooldown + fc.registry.HealthCheckInterval = 1 * time.Hour + 
fc.registry.ConsecutiveThreshold = 1 + fc.registry.PromotionCooldown = 1 * time.Hour // long cooldown defer fc.Close() // Set up health states - fc.mu.Lock() - fc.active = 1 - fc.health[0] = endpointHealth{healthy: true, healthySince: time.Now()} // NOT cooled - fc.health[1] = endpointHealth{healthy: true} - fc.health[2] = endpointHealth{healthy: true, healthySince: time.Now().Add(-2 * time.Hour)} // cooled - fc.mu.Unlock() + fc.registry.SetActive(1) + fc.registry.SetHealth(0, EndpointHealth{Healthy: true, HealthySince: time.Now()}) // NOT cooled + fc.registry.SetHealth(1, EndpointHealth{Healthy: true}) // active, will fail + fc.registry.SetHealth(2, EndpointHealth{Healthy: true, HealthySince: time.Now().Add(-2 * time.Hour)}) // cooled span, err := fc.GetSpan(context.Background(), 1) require.NoError(t, err) require.NotNil(t, span) // Should prefer tertiary (cooled) over primary (uncooled) - fc.mu.Lock() - assert.Equal(t, 2, fc.active, "should prefer cooled tertiary over uncooled primary") - fc.mu.Unlock() + assert.Equal(t, 2, fc.registry.Active(), "should prefer cooled tertiary over uncooled primary") } diff --git a/consensus/bor/heimdall/health_registry.go b/consensus/bor/heimdall/health_registry.go new file mode 100644 index 0000000000..8dcd890930 --- /dev/null +++ b/consensus/bor/heimdall/health_registry.go @@ -0,0 +1,338 @@ +package heimdall + +import ( + "sync" + "time" + + "github.com/ethereum/go-ethereum/log" + "github.com/ethereum/go-ethereum/metrics" +) + +// EndpointHealth tracks the health state of a single endpoint. +type EndpointHealth struct { + Healthy bool + ConsecutiveSuccess int + HealthySince time.Time // when consecutive threshold was reached + LastErr error +} + +// RegistryMetrics holds the metrics counters/gauges that a HealthRegistry reports to. +// Nil pointers are safe — the registry checks before calling. 
+type RegistryMetrics struct { + ProbeAttempts *metrics.Counter + ProbeSuccesses *metrics.Counter + ProactiveSwitches *metrics.Counter + ActiveGauge *metrics.Gauge + HealthyEndpoints *metrics.Gauge +} + +// HealthRegistry is a shared health state machine for N endpoints. +// It runs a background goroutine that probes all endpoints, promotes +// higher-priority endpoints when healthy+cooled, and proactively switches +// away from unhealthy active endpoints. +type HealthRegistry struct { + mu sync.Mutex + health []EndpointHealth + active int + n int + + // Exported config fields — set after construction, before Start(). + HealthCheckInterval time.Duration + ConsecutiveThreshold int + PromotionCooldown time.Duration + + probeFunc func(i int) error + onSwitch func(from, to int) // called under mu; may acquire other locks + + metrics RegistryMetrics + + quit chan struct{} + closeOnce sync.Once + startOnce sync.Once +} + +// NewHealthRegistry creates a registry for n endpoints. +// probeFunc is called for each endpoint index to test reachability. +// onSwitch (optional) is called under the registry lock when the active +// endpoint changes due to promotion or proactive switch. +func NewHealthRegistry(n int, probeFunc func(int) error, onSwitch func(from, to int), m RegistryMetrics) *HealthRegistry { + health := make([]EndpointHealth, n) + // Primary starts as healthy; others start unhealthy. + health[0] = EndpointHealth{Healthy: true} + + return &HealthRegistry{ + health: health, + n: n, + HealthCheckInterval: defaultHealthCheckInterval, + ConsecutiveThreshold: defaultConsecutiveThreshold, + PromotionCooldown: defaultPromotionCooldown, + probeFunc: probeFunc, + onSwitch: onSwitch, + metrics: m, + quit: make(chan struct{}), + } +} + +// Active returns the index of the currently active endpoint. 
+func (r *HealthRegistry) Active() int { + r.mu.Lock() + defer r.mu.Unlock() + + return r.active +} + +// SetActive sets the active endpoint index, updates the gauge, and calls onSwitch +// if the active endpoint changed. The caller must NOT hold r.mu. +func (r *HealthRegistry) SetActive(i int) { + r.mu.Lock() + defer r.mu.Unlock() + + prev := r.active + r.active = i + + if r.metrics.ActiveGauge != nil { + r.metrics.ActiveGauge.Update(int64(i)) + } + + if prev != i && r.onSwitch != nil { + r.onSwitch(prev, i) + } +} + +// MarkUnhealthy resets the health state of endpoint i to unhealthy. +func (r *HealthRegistry) MarkUnhealthy(i int, err error) { + r.mu.Lock() + defer r.mu.Unlock() + + r.health[i].ConsecutiveSuccess = 0 + r.health[i].Healthy = false + r.health[i].LastErr = err +} + +// MarkSuccess increments the consecutive success count for endpoint i and +// transitions it to healthy if the threshold is met. +func (r *HealthRegistry) MarkSuccess(i int) { + r.mu.Lock() + defer r.mu.Unlock() + + r.health[i].ConsecutiveSuccess++ + r.health[i].LastErr = nil + + if r.health[i].ConsecutiveSuccess >= r.ConsecutiveThreshold && !r.health[i].Healthy { + r.health[i].Healthy = true + r.health[i].HealthySince = time.Now() + } +} + +// HealthSnapshot returns a copy of all endpoint health states. +func (r *HealthRegistry) HealthSnapshot() []EndpointHealth { + r.mu.Lock() + defer r.mu.Unlock() + + snap := make([]EndpointHealth, r.n) + copy(snap, r.health) + + return snap +} + +// SetHealth directly overrides the health state of endpoint i. +// Intended for tests that need to manipulate state. +func (r *HealthRegistry) SetHealth(i int, h EndpointHealth) { + r.mu.Lock() + defer r.mu.Unlock() + + r.health[i] = h +} + +// Start lazily starts the background health-check goroutine via startOnce. +func (r *HealthRegistry) Start() { + r.startOnce.Do(func() { + go r.run() + }) +} + +// Stop closes the quit channel, stopping the background goroutine. 
+func (r *HealthRegistry) Stop() { + r.closeOnce.Do(func() { + close(r.quit) + }) +} + +// run is the background goroutine: probe → promote → proactive switch. +func (r *HealthRegistry) run() { + ticker := time.NewTicker(r.HealthCheckInterval) + defer ticker.Stop() + + for { + select { + case <-r.quit: + return + case <-ticker.C: + } + + r.probeAll() + r.maybePromote() + r.maybeProactiveSwitch() + } +} + +// probeAll probes every endpoint and updates health state. +func (r *HealthRegistry) probeAll() { + for i := 0; i < r.n; i++ { + // Check for shutdown between individual probes. + select { + case <-r.quit: + return + default: + } + + if r.metrics.ProbeAttempts != nil { + r.metrics.ProbeAttempts.Inc(1) + } + + err := r.probeFunc(i) + + r.mu.Lock() + + if err == nil { + r.health[i].ConsecutiveSuccess++ + r.health[i].LastErr = nil + + if r.health[i].ConsecutiveSuccess >= r.ConsecutiveThreshold && !r.health[i].Healthy { + r.health[i].Healthy = true + r.health[i].HealthySince = time.Now() + } + + if r.metrics.ProbeSuccesses != nil { + r.metrics.ProbeSuccesses.Inc(1) + } + } else { + r.health[i].ConsecutiveSuccess = 0 + r.health[i].Healthy = false + r.health[i].LastErr = err + } + + r.mu.Unlock() + } + + // Update healthy endpoints gauge. + r.mu.Lock() + count := int64(0) + + for i := range r.health { + if r.health[i].Healthy { + count++ + } + } + + r.mu.Unlock() + + if r.metrics.HealthyEndpoints != nil { + r.metrics.HealthyEndpoints.Update(count) + } +} + +// maybePromote checks if a higher-priority endpoint (index < active) is healthy +// and has passed cooldown. If yes, promotes to the highest-priority qualified endpoint. 
+func (r *HealthRegistry) maybePromote() { + r.mu.Lock() + defer r.mu.Unlock() + + if r.active == 0 { + return + } + + for i := 0; i < r.active; i++ { + if r.health[i].Healthy && time.Since(r.health[i].HealthySince) >= r.PromotionCooldown { + prev := r.active + r.active = i + + if r.metrics.ActiveGauge != nil { + r.metrics.ActiveGauge.Update(int64(i)) + } + + if r.metrics.ProactiveSwitches != nil { + r.metrics.ProactiveSwitches.Inc(1) + } + + log.Info("Health registry: promoted to higher-priority endpoint", + "index", i, "previous", prev) + + if r.onSwitch != nil { + r.onSwitch(prev, i) + } + + return + } + } +} + +// maybeProactiveSwitch detects if the active endpoint is unhealthy and switches +// to the highest-priority healthy endpoint. +func (r *HealthRegistry) maybeProactiveSwitch() { + r.mu.Lock() + defer r.mu.Unlock() + + if r.health[r.active].Healthy { + return + } + + // Active is unhealthy. Find the best alternative. + // Pass 1: healthy + cooled. + for i := 0; i < r.n; i++ { + if i == r.active { + continue + } + + if r.health[i].Healthy && time.Since(r.health[i].HealthySince) >= r.PromotionCooldown { + prev := r.active + r.active = i + + if r.metrics.ActiveGauge != nil { + r.metrics.ActiveGauge.Update(int64(i)) + } + + if r.metrics.ProactiveSwitches != nil { + r.metrics.ProactiveSwitches.Inc(1) + } + + log.Warn("Health registry: proactive switch (active unhealthy, cooled target)", + "from", prev, "to", i) + + if r.onSwitch != nil { + r.onSwitch(prev, i) + } + + return + } + } + + // Pass 2: healthy but NOT cooled (emergency). 
+ for i := 0; i < r.n; i++ { + if i == r.active { + continue + } + + if r.health[i].Healthy { + prev := r.active + r.active = i + + if r.metrics.ActiveGauge != nil { + r.metrics.ActiveGauge.Update(int64(i)) + } + + if r.metrics.ProactiveSwitches != nil { + r.metrics.ProactiveSwitches.Inc(1) + } + + log.Warn("Health registry: proactive switch (active unhealthy, uncooled target)", + "from", prev, "to", i) + + if r.onSwitch != nil { + r.onSwitch(prev, i) + } + + return + } + } +} diff --git a/consensus/bor/heimdall/health_registry_test.go b/consensus/bor/heimdall/health_registry_test.go new file mode 100644 index 0000000000..8a98b53ee4 --- /dev/null +++ b/consensus/bor/heimdall/health_registry_test.go @@ -0,0 +1,272 @@ +package heimdall + +import ( + "errors" + "sync/atomic" + "testing" + "time" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestHealthRegistry_Constructor_PrimaryHealthy(t *testing.T) { + r := NewHealthRegistry(3, func(i int) error { return nil }, nil, RegistryMetrics{}) + + snap := r.HealthSnapshot() + assert.Len(t, snap, 3) + assert.True(t, snap[0].Healthy, "primary should start healthy") + assert.False(t, snap[1].Healthy, "secondary should start unhealthy") + assert.False(t, snap[2].Healthy, "tertiary should start unhealthy") + assert.Equal(t, 0, r.Active()) +} + +func TestHealthRegistry_MarkUnhealthy(t *testing.T) { + r := NewHealthRegistry(2, func(i int) error { return nil }, nil, RegistryMetrics{}) + + r.MarkUnhealthy(0, errors.New("down")) + + snap := r.HealthSnapshot() + assert.False(t, snap[0].Healthy) + assert.Equal(t, 0, snap[0].ConsecutiveSuccess) + assert.EqualError(t, snap[0].LastErr, "down") +} + +func TestHealthRegistry_MarkSuccess_Transitions(t *testing.T) { + r := NewHealthRegistry(2, func(i int) error { return nil }, nil, RegistryMetrics{}) + r.ConsecutiveThreshold = 3 + + // Endpoint 1 starts unhealthy. 
+ snap := r.HealthSnapshot() + assert.False(t, snap[1].Healthy) + + // Two successes: still unhealthy. + r.MarkSuccess(1) + r.MarkSuccess(1) + snap = r.HealthSnapshot() + assert.False(t, snap[1].Healthy) + assert.Equal(t, 2, snap[1].ConsecutiveSuccess) + + // Third success: transitions to healthy. + r.MarkSuccess(1) + snap = r.HealthSnapshot() + assert.True(t, snap[1].Healthy) + assert.Equal(t, 3, snap[1].ConsecutiveSuccess) + assert.False(t, snap[1].HealthySince.IsZero()) +} + +func TestHealthRegistry_MarkSuccess_ResetByFailure(t *testing.T) { + r := NewHealthRegistry(2, func(i int) error { return nil }, nil, RegistryMetrics{}) + r.ConsecutiveThreshold = 3 + + r.MarkSuccess(1) + r.MarkSuccess(1) + r.MarkUnhealthy(1, errors.New("fail")) + + snap := r.HealthSnapshot() + assert.False(t, snap[1].Healthy) + assert.Equal(t, 0, snap[1].ConsecutiveSuccess) + + // Need 3 more successes after reset. + r.MarkSuccess(1) + snap = r.HealthSnapshot() + assert.False(t, snap[1].Healthy) +} + +func TestHealthRegistry_SetActive_CallsOnSwitch(t *testing.T) { + var switchFrom, switchTo int + called := false + + r := NewHealthRegistry(2, func(i int) error { return nil }, func(from, to int) { + called = true + switchFrom = from + switchTo = to + }, RegistryMetrics{}) + + r.SetActive(1) + assert.True(t, called) + assert.Equal(t, 0, switchFrom) + assert.Equal(t, 1, switchTo) + assert.Equal(t, 1, r.Active()) +} + +func TestHealthRegistry_SetActive_NoCallOnSameIndex(t *testing.T) { + called := false + r := NewHealthRegistry(2, func(i int) error { return nil }, func(from, to int) { + called = true + }, RegistryMetrics{}) + + r.SetActive(0) // same as current + assert.False(t, called, "onSwitch should not be called when active doesn't change") +} + +func TestHealthRegistry_SetHealth(t *testing.T) { + r := NewHealthRegistry(2, func(i int) error { return nil }, nil, RegistryMetrics{}) + + h := EndpointHealth{ + Healthy: true, + ConsecutiveSuccess: 5, + HealthySince: time.Now().Add(-1 * 
time.Hour), + } + r.SetHealth(1, h) + + snap := r.HealthSnapshot() + assert.True(t, snap[1].Healthy) + assert.Equal(t, 5, snap[1].ConsecutiveSuccess) +} + +func TestHealthRegistry_ProbeAll(t *testing.T) { + probeResults := []error{nil, errors.New("fail"), nil} + probeCount := atomic.Int32{} + + r := NewHealthRegistry(3, func(i int) error { + probeCount.Add(1) + return probeResults[i] + }, nil, RegistryMetrics{}) + r.ConsecutiveThreshold = 1 + + r.probeAll() + + assert.Equal(t, int32(3), probeCount.Load()) + + snap := r.HealthSnapshot() + // Index 0 was already healthy, stays healthy. + assert.True(t, snap[0].Healthy) + // Index 1 failed: unhealthy. + assert.False(t, snap[1].Healthy) + assert.EqualError(t, snap[1].LastErr, "fail") + // Index 2 succeeded once with threshold=1: becomes healthy. + assert.True(t, snap[2].Healthy) +} + +func TestHealthRegistry_MaybePromote(t *testing.T) { + r := NewHealthRegistry(3, func(i int) error { return nil }, nil, RegistryMetrics{}) + r.PromotionCooldown = 0 + r.ConsecutiveThreshold = 1 + + // Set active to 2, mark index 0 as unhealthy, make index 1 healthy+cooled. + r.SetActive(2) + r.SetHealth(0, EndpointHealth{Healthy: false}) + r.SetHealth(1, EndpointHealth{ + Healthy: true, + HealthySince: time.Now().Add(-1 * time.Hour), + }) + + r.maybePromote() + + assert.Equal(t, 1, r.Active(), "should promote to index 1") +} + +func TestHealthRegistry_MaybePromote_RespectsOrder(t *testing.T) { + r := NewHealthRegistry(3, func(i int) error { return nil }, nil, RegistryMetrics{}) + r.PromotionCooldown = 0 + + // Active at 2, both 0 and 1 healthy — should promote to 0 (highest priority). 
+ r.SetActive(2) + r.SetHealth(0, EndpointHealth{Healthy: true, HealthySince: time.Now().Add(-1 * time.Hour)}) + r.SetHealth(1, EndpointHealth{Healthy: true, HealthySince: time.Now().Add(-1 * time.Hour)}) + + r.maybePromote() + + assert.Equal(t, 0, r.Active(), "should promote to index 0 (highest priority)") +} + +func TestHealthRegistry_MaybePromote_RespectsCooldown(t *testing.T) { + r := NewHealthRegistry(2, func(i int) error { return nil }, nil, RegistryMetrics{}) + r.PromotionCooldown = 1 * time.Hour + + // Active at 1, index 0 healthy but recently (not cooled). + r.SetActive(1) + r.SetHealth(0, EndpointHealth{Healthy: true, HealthySince: time.Now()}) + + r.maybePromote() + + assert.Equal(t, 1, r.Active(), "should not promote — cooldown not met") +} + +func TestHealthRegistry_MaybeProactiveSwitch_CooledFirst(t *testing.T) { + r := NewHealthRegistry(3, func(i int) error { return nil }, nil, RegistryMetrics{}) + r.PromotionCooldown = 0 + + // Active at 0, mark it unhealthy. Index 2 is healthy+cooled. + r.SetHealth(0, EndpointHealth{Healthy: false}) + r.SetHealth(2, EndpointHealth{Healthy: true, HealthySince: time.Now().Add(-1 * time.Hour)}) + + r.maybeProactiveSwitch() + + assert.Equal(t, 2, r.Active(), "should switch to cooled healthy endpoint") +} + +func TestHealthRegistry_MaybeProactiveSwitch_UncooledFallback(t *testing.T) { + r := NewHealthRegistry(3, func(i int) error { return nil }, nil, RegistryMetrics{}) + r.PromotionCooldown = 1 * time.Hour + + // Active at 0, mark it unhealthy. Index 1 is healthy but NOT cooled. + r.SetHealth(0, EndpointHealth{Healthy: false}) + r.SetHealth(1, EndpointHealth{Healthy: true, HealthySince: time.Now()}) // not cooled + + r.maybeProactiveSwitch() + + assert.Equal(t, 1, r.Active(), "should fall back to uncooled healthy endpoint") +} + +func TestHealthRegistry_MaybeProactiveSwitch_NoHealthy(t *testing.T) { + r := NewHealthRegistry(3, func(i int) error { return nil }, nil, RegistryMetrics{}) + + // All unhealthy. 
+ r.SetHealth(0, EndpointHealth{Healthy: false}) + r.SetHealth(1, EndpointHealth{Healthy: false}) + r.SetHealth(2, EndpointHealth{Healthy: false}) + + r.maybeProactiveSwitch() + + assert.Equal(t, 0, r.Active(), "should stay on 0 when no alternatives are healthy") +} + +func TestHealthRegistry_Stop_HaltsGoroutine(t *testing.T) { + probeCount := atomic.Int32{} + + r := NewHealthRegistry(2, func(i int) error { + probeCount.Add(1) + return nil + }, nil, RegistryMetrics{}) + r.HealthCheckInterval = 50 * time.Millisecond + + r.Start() + time.Sleep(150 * time.Millisecond) + r.Stop() + + countAfterStop := probeCount.Load() + time.Sleep(200 * time.Millisecond) + + assert.Equal(t, countAfterStop, probeCount.Load(), "no probes should run after Stop") +} + +func TestHealthRegistry_Run_Integration(t *testing.T) { + probeResults := []error{errors.New("down"), nil} + var results atomic.Value + results.Store(probeResults) + + r := NewHealthRegistry(2, func(i int) error { + return results.Load().([]error)[i] + }, nil, RegistryMetrics{}) + r.HealthCheckInterval = 50 * time.Millisecond + r.ConsecutiveThreshold = 1 + r.PromotionCooldown = 0 + + r.Start() + defer r.Stop() + + // Primary is down, secondary is healthy. Should proactively switch. + require.Eventually(t, func() bool { + return r.Active() == 1 + }, 2*time.Second, 20*time.Millisecond, "should switch to healthy secondary") + + // Bring primary back. + results.Store([]error{nil, nil}) + + // Should promote back to primary. 
+ require.Eventually(t, func() bool { + return r.Active() == 0 + }, 2*time.Second, 20*time.Millisecond, "should promote back to primary") +} diff --git a/consensus/bor/heimdallws/client.go b/consensus/bor/heimdallws/client.go index 7b8de20dcb..1a7c68c4bb 100644 --- a/consensus/bor/heimdallws/client.go +++ b/consensus/bor/heimdallws/client.go @@ -16,57 +16,39 @@ import ( "github.com/ethereum/go-ethereum/log" ) +var ( + ErrNoURLs = errors.New("at least one WS URL required") + ErrNoNonEmptyURLs = errors.New("at least one non-empty WS URL required") +) + const ( // defaultReconnectDelay is the backoff between reconnection attempts. defaultReconnectDelay = 10 * time.Second - // defaultWSHealthCheckInterval is how often the health registry probes all endpoints. - defaultWSHealthCheckInterval = 10 * time.Second - - // defaultWSConsecutiveThreshold is the number of consecutive successful probes - // needed before an endpoint is considered healthy. - defaultWSConsecutiveThreshold = 3 - - // defaultWSPromotionCooldown is how long after becoming healthy before an - // endpoint is eligible for promotion. - defaultWSPromotionCooldown = 60 * time.Second - // defaultWSProbeTimeout bounds each individual WS probe dial so a // firewalled host can't block the health-check goroutine forever. defaultWSProbeTimeout = 10 * time.Second ) -// wsEndpointHealth tracks the health state of a single WS endpoint. -type wsEndpointHealth struct { - healthy bool - consecutiveSuccess int - healthySince time.Time - lastErr error -} - // HeimdallWSClient represents a websocket client with auto-reconnection and failover support. 
type HeimdallWSClient struct { - conn *websocket.Conn - urls []string // primary at [0], secondary at [1] (if configured) - activeURL int // index into urls; protected by mu - health []wsEndpointHealth - events chan *milestone.Milestone - done chan struct{} - mu sync.Mutex + conn *websocket.Conn + urls []string // primary at [0], secondary at [1] (if configured) + registry *heimdall.HealthRegistry + events chan *milestone.Milestone + done chan struct{} + mu sync.Mutex // Configurable parameters (defaults set in constructor, overridable for testing) - reconnectDelay time.Duration - healthCheckInterval time.Duration - consecutiveThreshold int - promotionCooldown time.Duration - probeTimeout time.Duration + reconnectDelay time.Duration + probeTimeout time.Duration } // NewHeimdallWSClient creates a new WS client for Heimdall with optional failover. // The first URL is primary; additional URLs are failover candidates in priority order. func NewHeimdallWSClient(urls ...string) (*HeimdallWSClient, error) { if len(urls) == 0 { - return nil, errors.New("at least one WS URL required") + return nil, ErrNoURLs } var filtered []string @@ -77,211 +59,81 @@ func NewHeimdallWSClient(urls ...string) (*HeimdallWSClient, error) { } if len(filtered) == 0 { - return nil, errors.New("at least one non-empty WS URL required") + return nil, ErrNoNonEmptyURLs } - health := make([]wsEndpointHealth, len(filtered)) - // Primary starts as healthy; others start unhealthy. 
- health[0] = wsEndpointHealth{healthy: true} - - return &HeimdallWSClient{ - conn: nil, - urls: filtered, - health: health, - events: make(chan *milestone.Milestone), - done: make(chan struct{}), - reconnectDelay: defaultReconnectDelay, - healthCheckInterval: defaultWSHealthCheckInterval, - consecutiveThreshold: defaultWSConsecutiveThreshold, - promotionCooldown: defaultWSPromotionCooldown, - probeTimeout: defaultWSProbeTimeout, - }, nil -} - -// SubscribeMilestoneEvents sends the subscription request and starts processing incoming messages. -func (c *HeimdallWSClient) SubscribeMilestoneEvents(ctx context.Context) <-chan *milestone.Milestone { - c.tryUntilSubscribeMilestoneEvents(ctx) - - // Start the goroutine to read messages. - go c.readMessages(ctx) - - // Start the health registry if there are multiple URLs. - if len(c.urls) > 1 { - go c.runWSHealthRegistry() + c := &HeimdallWSClient{ + conn: nil, + urls: filtered, + events: make(chan *milestone.Milestone), + done: make(chan struct{}), + reconnectDelay: defaultReconnectDelay, + probeTimeout: defaultWSProbeTimeout, } - return c.events + c.registry = heimdall.NewHealthRegistry( + len(filtered), + c.probeWSEndpoint, + c.onWSSwitch, + heimdall.RegistryMetrics{ + ProbeAttempts: heimdall.FailoverWSProbeAttempts, + ProbeSuccesses: heimdall.FailoverWSProbeSuccesses, + ProactiveSwitches: heimdall.FailoverWSProactiveSwitches, + ActiveGauge: heimdall.FailoverWSActiveGauge, + HealthyEndpoints: heimdall.FailoverWSHealthyEndpoints, + }, + ) + + return c, nil } -// runWSHealthRegistry is an always-on goroutine that continuously probes ALL WS -// endpoints, requires consecutive successes before marking healthy, and enforces -// cooldown before promotion. Stopped when done channel is closed (Unsubscribe). 
-func (c *HeimdallWSClient) runWSHealthRegistry() { - ticker := time.NewTicker(c.healthCheckInterval) - defer ticker.Stop() - - for { - select { - case <-c.done: - return - case <-ticker.C: - } - - c.probeAllWSEndpoints() - c.maybeWSPromote() - c.maybeWSProactiveSwitch() - } -} +// probeWSEndpoint dials a WS endpoint and immediately closes the connection. +func (c *HeimdallWSClient) probeWSEndpoint(i int) error { + c.mu.Lock() + url := c.urls[i] + c.mu.Unlock() -// probeAllWSEndpoints probes every WS endpoint via dial (connect + immediately close). -func (c *HeimdallWSClient) probeAllWSEndpoints() { dialer := websocket.Dialer{ HandshakeTimeout: c.probeTimeout, } - for i := 0; i < len(c.urls); i++ { - // Check for shutdown between individual probes. - select { - case <-c.done: - return - default: - } - - heimdall.FailoverWSProbeAttempts.Inc(1) - - c.mu.Lock() - url := c.urls[i] - c.mu.Unlock() - - ctx, cancel := context.WithTimeout(context.Background(), c.probeTimeout) - testConn, _, err := dialer.DialContext(ctx, url, nil) - cancel() - - c.mu.Lock() - - if err == nil { - testConn.Close() - - c.health[i].consecutiveSuccess++ - c.health[i].lastErr = nil - - if c.health[i].consecutiveSuccess >= c.consecutiveThreshold && !c.health[i].healthy { - c.health[i].healthy = true - c.health[i].healthySince = time.Now() - } - - heimdall.FailoverWSProbeSuccesses.Inc(1) - } else { - c.health[i].consecutiveSuccess = 0 - c.health[i].healthy = false - c.health[i].lastErr = err - } + ctx, cancel := context.WithTimeout(context.Background(), c.probeTimeout) + defer cancel() - c.mu.Unlock() + testConn, _, err := dialer.DialContext(ctx, url, nil) + if err != nil { + return err } - // Update healthy endpoints gauge. 
- c.mu.Lock() - count := int64(0) - for i := range c.health { - if c.health[i].healthy { - count++ - } - } - c.mu.Unlock() + testConn.Close() - heimdall.FailoverWSHealthyEndpoints.Update(count) + return nil } -// maybeWSPromote checks if a higher-priority URL (index < activeURL) is healthy -// and has passed cooldown. If yes, promotes to the highest-priority qualified URL. -func (c *HeimdallWSClient) maybeWSPromote() { +// onWSSwitch is called by the registry (under registry lock) when the active +// endpoint changes. It closes the current connection to trigger reconnection. +func (c *HeimdallWSClient) onWSSwitch(from, to int) { c.mu.Lock() defer c.mu.Unlock() - if c.activeURL == 0 { - return - } - - for i := 0; i < c.activeURL; i++ { - if c.health[i].healthy && time.Since(c.health[i].healthySince) >= c.promotionCooldown { - prev := c.activeURL - c.activeURL = i - - heimdall.FailoverWSActiveGauge.Update(int64(i)) - heimdall.FailoverWSProactiveSwitches.Inc(1) - - log.Info("WS health registry: promoted to higher-priority URL", - "index", i, "previous", prev, "url", c.urls[i]) - - // Close current connection to trigger reconnection in readMessages. - if c.conn != nil { - c.conn.Close() - } - - return - } + if c.conn != nil { + c.conn.Close() } } -// maybeWSProactiveSwitch detects if the active URL is unhealthy and switches -// to the highest-priority healthy URL. -func (c *HeimdallWSClient) maybeWSProactiveSwitch() { - c.mu.Lock() - defer c.mu.Unlock() - - if c.health[c.activeURL].healthy { - return - } - - // Active is unhealthy. Find the best alternative. - // Pass 1: healthy + cooled. 
- for i := 0; i < len(c.urls); i++ { - if i == c.activeURL { - continue - } - - if c.health[i].healthy && time.Since(c.health[i].healthySince) >= c.promotionCooldown { - prev := c.activeURL - c.activeURL = i - - heimdall.FailoverWSActiveGauge.Update(int64(i)) - heimdall.FailoverWSProactiveSwitches.Inc(1) - - log.Warn("WS health registry: proactive switch (active unhealthy, cooled target)", - "from", prev, "to", i, "url", c.urls[i]) +// SubscribeMilestoneEvents sends the subscription request and starts processing incoming messages. +func (c *HeimdallWSClient) SubscribeMilestoneEvents(ctx context.Context) <-chan *milestone.Milestone { + c.tryUntilSubscribeMilestoneEvents(ctx) - if c.conn != nil { - c.conn.Close() - } + // Start the goroutine to read messages. + go c.readMessages(ctx) - return - } + // Start the health registry if there are multiple URLs. + if len(c.urls) > 1 { + c.registry.Start() } - // Pass 2: healthy but NOT cooled (emergency). - for i := 0; i < len(c.urls); i++ { - if i == c.activeURL { - continue - } - - if c.health[i].healthy { - prev := c.activeURL - c.activeURL = i - - heimdall.FailoverWSActiveGauge.Update(int64(i)) - heimdall.FailoverWSProactiveSwitches.Inc(1) - - log.Warn("WS health registry: proactive switch (active unhealthy, uncooled target)", - "from", prev, "to", i, "url", c.urls[i]) - - if c.conn != nil { - c.conn.Close() - } - - return - } - } + return c.events } // tryUntilSubscribeMilestoneEvents retries connecting and subscribing until success, @@ -315,10 +167,7 @@ func (c *HeimdallWSClient) tryUntilSubscribeMilestoneEvents(ctx context.Context) default: } - c.mu.Lock() - active := c.activeURL - c.mu.Unlock() - + active := c.registry.Active() url := c.urls[active] conn, _, err := websocket.DefaultDialer.Dial(url, nil) @@ -326,24 +175,22 @@ func (c *HeimdallWSClient) tryUntilSubscribeMilestoneEvents(ctx context.Context) log.Error("failed to dial websocket on heimdall ws subscription", "url", url, "err", err) // Mark endpoint 
unhealthy in the registry. - c.mu.Lock() - c.health[active].consecutiveSuccess = 0 - c.health[active].healthy = false - c.health[active].lastErr = err + c.registry.MarkUnhealthy(active, err) // Find the best healthy alternative. + snap := c.registry.HealthSnapshot() switched := false + for i := 0; i < len(c.urls); i++ { if i == active { continue } - if c.health[i].healthy { - c.activeURL = i + if snap[i].Healthy { + c.registry.SetActive(i) switched = true heimdall.FailoverWSSwitchCounter.Inc(1) - heimdall.FailoverWSActiveGauge.Update(int64(i)) log.Warn("WS URL failed, switching to healthy endpoint", "from", c.urls[active], "to", c.urls[i]) @@ -356,32 +203,30 @@ func (c *HeimdallWSClient) tryUntilSubscribeMilestoneEvents(ctx context.Context) if !switched && len(c.urls) > 1 { next := (active + 1) % len(c.urls) if next != active { - c.activeURL = next + c.registry.SetActive(next) heimdall.FailoverWSSwitchCounter.Inc(1) - heimdall.FailoverWSActiveGauge.Update(int64(next)) log.Warn("WS URL failed, switching to next endpoint", "from", c.urls[active], "to", c.urls[next]) } } - c.mu.Unlock() - continue } + // Close previous connection if any, then set the new one. c.mu.Lock() + if c.conn != nil { + c.conn.Close() + } c.conn = conn + // Mark this endpoint as successful. - c.health[active].consecutiveSuccess++ - if c.health[active].consecutiveSuccess >= c.consecutiveThreshold && !c.health[active].healthy { - c.health[active].healthy = true - c.health[active].healthySince = time.Now() - } - c.mu.Unlock() + c.registry.MarkSuccess(active) - // Build the subscription request. + // Build the subscription request and send it under lock to avoid + // racing with readMessages on c.conn. 
req := subscriptionRequest{ JSONRPC: "2.0", Method: "subscribe", @@ -389,7 +234,10 @@ func (c *HeimdallWSClient) tryUntilSubscribeMilestoneEvents(ctx context.Context) } req.Params.Query = "tm.event='NewBlock' AND milestone.number>0" - if err := c.conn.WriteJSON(req); err != nil { + err = c.conn.WriteJSON(req) + c.mu.Unlock() + + if err != nil { log.Error("failed to send subscription request on heimdall ws subscription", "url", url, "err", err) continue } @@ -403,6 +251,7 @@ func (c *HeimdallWSClient) tryUntilSubscribeMilestoneEvents(ctx context.Context) // readMessages continuously reads messages from the websocket, handling reconnections if necessary. func (c *HeimdallWSClient) readMessages(ctx context.Context) { defer close(c.events) + for { // Check if the context or unsubscribe signal is set. select { @@ -414,14 +263,24 @@ func (c *HeimdallWSClient) readMessages(ctx context.Context) { // continue to process messages } - if err := c.conn.SetReadDeadline(time.Now().Add(30 * time.Second)); err != nil { + // Grab local ref under lock to avoid racing with reconnection. 
+ c.mu.Lock() + conn := c.conn + c.mu.Unlock() + + if conn == nil { + c.tryUntilSubscribeMilestoneEvents(ctx) + continue + } + + if err := conn.SetReadDeadline(time.Now().Add(30 * time.Second)); err != nil { log.Error("failed to set read deadline on heimdall ws subscription", "err", err) c.tryUntilSubscribeMilestoneEvents(ctx) continue } - _, message, err := c.conn.ReadMessage() + _, message, err := conn.ReadMessage() if err != nil { log.Error("connection lost; will attempt to reconnect on heimdall ws subscription", "error", err) @@ -495,6 +354,9 @@ func (c *HeimdallWSClient) Unsubscribe(ctx context.Context) error { default: close(c.done) } + + c.registry.Stop() + return nil } diff --git a/consensus/bor/heimdallws/client_test.go b/consensus/bor/heimdallws/client_test.go index 70c25f458e..df9f4344d8 100644 --- a/consensus/bor/heimdallws/client_test.go +++ b/consensus/bor/heimdallws/client_test.go @@ -134,9 +134,10 @@ func TestWSClient_ConstructorSingleURL(t *testing.T) { require.NoError(t, err) assert.Len(t, client.urls, 1) assert.Equal(t, "ws://localhost:1234", client.urls[0]) - assert.Equal(t, 0, client.activeURL) - assert.Len(t, client.health, 1) - assert.True(t, client.health[0].healthy, "primary should start healthy") + assert.Equal(t, 0, client.registry.Active()) + snap := client.registry.HealthSnapshot() + assert.Len(t, snap, 1) + assert.True(t, snap[0].Healthy, "primary should start healthy") } func TestWSClient_ConstructorMultipleURLs(t *testing.T) { @@ -146,11 +147,12 @@ func TestWSClient_ConstructorMultipleURLs(t *testing.T) { assert.Equal(t, "ws://primary:1234", client.urls[0]) assert.Equal(t, "ws://secondary:5678", client.urls[1]) assert.Equal(t, "ws://tertiary:9999", client.urls[2]) - assert.Equal(t, 0, client.activeURL) - assert.Len(t, client.health, 3) - assert.True(t, client.health[0].healthy, "primary should start healthy") - assert.False(t, client.health[1].healthy, "secondary should start unhealthy") - assert.False(t, client.health[2].healthy, 
"tertiary should start unhealthy") + assert.Equal(t, 0, client.registry.Active()) + snap := client.registry.HealthSnapshot() + assert.Len(t, snap, 3) + assert.True(t, snap[0].Healthy, "primary should start healthy") + assert.False(t, snap[1].Healthy, "secondary should start unhealthy") + assert.False(t, snap[2].Healthy, "tertiary should start unhealthy") } func TestWSClient_ConstructorFiltersEmpty(t *testing.T) { @@ -211,8 +213,8 @@ func TestWSClient_DualURL_FailoverToSecondary(t *testing.T) { // Speed up test. client.reconnectDelay = 100 * time.Millisecond - client.consecutiveThreshold = 1 - client.promotionCooldown = 0 + client.registry.ConsecutiveThreshold = 1 + client.registry.PromotionCooldown = 0 ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) defer cancel() @@ -225,9 +227,7 @@ func TestWSClient_DualURL_FailoverToSecondary(t *testing.T) { assert.Equal(t, uint64(100), m.StartBlock) assert.Equal(t, uint64(200), m.EndBlock) // Verify we switched to secondary. - client.mu.Lock() - assert.Equal(t, 1, client.activeURL) - client.mu.Unlock() + assert.Equal(t, 1, client.registry.Active()) case <-ctx.Done(): t.Fatal("timed out waiting for milestone event via failover") } @@ -251,8 +251,8 @@ func TestWSClient_ThreeURL_CascadeToTertiary(t *testing.T) { require.NoError(t, err) client.reconnectDelay = 100 * time.Millisecond - client.consecutiveThreshold = 1 - client.promotionCooldown = 0 + client.registry.ConsecutiveThreshold = 1 + client.registry.PromotionCooldown = 0 ctx, cancel := context.WithTimeout(context.Background(), 15*time.Second) defer cancel() @@ -264,9 +264,7 @@ func TestWSClient_ThreeURL_CascadeToTertiary(t *testing.T) { require.NotNil(t, m) assert.Equal(t, uint64(100), m.StartBlock) // Verify we ended up on tertiary. 
- client.mu.Lock() - assert.Equal(t, 2, client.activeURL) - client.mu.Unlock() + assert.Equal(t, 2, client.registry.Active()) case <-ctx.Done(): t.Fatal("timed out waiting for milestone event via cascade") } @@ -286,8 +284,8 @@ func TestWSClient_ContextCancellation(t *testing.T) { require.NoError(t, err) client.reconnectDelay = 100 * time.Millisecond - client.consecutiveThreshold = 1 - client.promotionCooldown = 0 + client.registry.ConsecutiveThreshold = 1 + client.registry.PromotionCooldown = 0 ctx, cancel := context.WithCancel(context.Background()) @@ -317,9 +315,9 @@ func TestWSClient_DualURL_ProbeBackToPrimary(t *testing.T) { require.NoError(t, err) client.reconnectDelay = 100 * time.Millisecond - client.healthCheckInterval = 100 * time.Millisecond - client.consecutiveThreshold = 1 - client.promotionCooldown = 0 + client.registry.HealthCheckInterval = 100 * time.Millisecond + client.registry.ConsecutiveThreshold = 1 + client.registry.PromotionCooldown = 0 ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) defer cancel() @@ -330,9 +328,7 @@ func TestWSClient_DualURL_ProbeBackToPrimary(t *testing.T) { select { case m := <-events: require.NotNil(t, m) - client.mu.Lock() - assert.Equal(t, 1, client.activeURL) - client.mu.Unlock() + assert.Equal(t, 1, client.registry.Active()) case <-ctx.Done(): t.Fatal("timed out waiting for failover") } @@ -350,9 +346,7 @@ func TestWSClient_DualURL_ProbeBackToPrimary(t *testing.T) { // Wait for background health registry to promote back to primary. 
require.Eventually(t, func() bool { - client.mu.Lock() - defer client.mu.Unlock() - return client.activeURL == 0 + return client.registry.Active() == 0 }, 5*time.Second, 50*time.Millisecond, "health registry should promote back to primary") require.NoError(t, client.Unsubscribe(ctx)) @@ -370,14 +364,12 @@ func TestWSClient_DualURL_NoWrapOnLastURLFails(t *testing.T) { require.NoError(t, err) client.reconnectDelay = 10 * time.Millisecond - client.healthCheckInterval = 1 * time.Hour // prevent health-check from interfering - client.consecutiveThreshold = 1 - client.promotionCooldown = 0 + client.registry.HealthCheckInterval = 1 * time.Hour // prevent health-check from interfering + client.registry.ConsecutiveThreshold = 1 + client.registry.PromotionCooldown = 0 // Pre-set to secondary as if a prior failover already happened. - client.mu.Lock() - client.activeURL = 1 - client.mu.Unlock() + client.registry.SetActive(1) ctx, cancel := context.WithTimeout(context.Background(), 150*time.Millisecond) defer cancel() @@ -385,9 +377,7 @@ func TestWSClient_DualURL_NoWrapOnLastURLFails(t *testing.T) { client.tryUntilSubscribeMilestoneEvents(ctx) // Should have moved off secondary since it fails. - client.mu.Lock() - active := client.activeURL - client.mu.Unlock() + active := client.registry.Active() // May have wrapped to primary (index 0) since secondary fails. _ = active // either index is acceptable; the important thing is it didn't hang. 
@@ -407,8 +397,8 @@ func TestWSClient_DualURL_PrimaryRecovery(t *testing.T) { require.NoError(t, err) client.reconnectDelay = 100 * time.Millisecond - client.consecutiveThreshold = 1 - client.promotionCooldown = 0 + client.registry.ConsecutiveThreshold = 1 + client.registry.PromotionCooldown = 0 ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) defer cancel() @@ -419,9 +409,7 @@ func TestWSClient_DualURL_PrimaryRecovery(t *testing.T) { select { case m := <-events: require.NotNil(t, m) - client.mu.Lock() - assert.Equal(t, 1, client.activeURL) - client.mu.Unlock() + assert.Equal(t, 1, client.registry.Active()) assert.Equal(t, uint64(100), m.StartBlock) case <-ctx.Done(): t.Fatal("timed out waiting for failover") @@ -445,9 +433,9 @@ func TestWSClient_HealthRegistryRespectsUnsubscribe(t *testing.T) { require.NoError(t, err) client.reconnectDelay = 100 * time.Millisecond - client.healthCheckInterval = 50 * time.Millisecond - client.consecutiveThreshold = 1 - client.promotionCooldown = 0 + client.registry.HealthCheckInterval = 50 * time.Millisecond + client.registry.ConsecutiveThreshold = 1 + client.registry.PromotionCooldown = 0 ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) defer cancel() @@ -483,9 +471,9 @@ func TestWSClient_Registry_ConsecutiveThreshold(t *testing.T) { require.NoError(t, err) client.reconnectDelay = 100 * time.Millisecond - client.healthCheckInterval = 50 * time.Millisecond - client.consecutiveThreshold = 3 // need 3 consecutive successes - client.promotionCooldown = 0 + client.registry.HealthCheckInterval = 50 * time.Millisecond + client.registry.ConsecutiveThreshold = 3 // need 3 consecutive successes + client.registry.PromotionCooldown = 0 ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) defer cancel() @@ -511,9 +499,7 @@ func TestWSClient_Registry_ConsecutiveThreshold(t *testing.T) { // Should eventually promote after 3 consecutive successes. 
require.Eventually(t, func() bool { - client.mu.Lock() - defer client.mu.Unlock() - return client.activeURL == 0 + return client.registry.Active() == 0 }, 5*time.Second, 50*time.Millisecond, "should promote after consecutive threshold met") require.NoError(t, client.Unsubscribe(ctx)) @@ -530,9 +516,9 @@ func TestWSClient_Registry_PromotionCooldown(t *testing.T) { require.NoError(t, err) client.reconnectDelay = 100 * time.Millisecond - client.healthCheckInterval = 50 * time.Millisecond - client.consecutiveThreshold = 1 - client.promotionCooldown = 500 * time.Millisecond + client.registry.HealthCheckInterval = 50 * time.Millisecond + client.registry.ConsecutiveThreshold = 1 + client.registry.PromotionCooldown = 500 * time.Millisecond ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) defer cancel() @@ -558,15 +544,11 @@ func TestWSClient_Registry_PromotionCooldown(t *testing.T) { // Should not promote immediately (cooldown not met). time.Sleep(150 * time.Millisecond) - client.mu.Lock() - assert.Equal(t, 1, client.activeURL, "should not promote before cooldown") - client.mu.Unlock() + assert.Equal(t, 1, client.registry.Active(), "should not promote before cooldown") // Wait for cooldown to pass. 
require.Eventually(t, func() bool { - client.mu.Lock() - defer client.mu.Unlock() - return client.activeURL == 0 + return client.registry.Active() == 0 }, 3*time.Second, 50*time.Millisecond, "should promote after cooldown passes") require.NoError(t, client.Unsubscribe(ctx)) diff --git a/eth/ethconfig/config.go b/eth/ethconfig/config.go index ebee43ba42..64d7361f5e 100644 --- a/eth/ethconfig/config.go +++ b/eth/ethconfig/config.go @@ -18,6 +18,7 @@ package ethconfig import ( + "fmt" "math/big" "strings" "time" @@ -393,7 +394,12 @@ func CreateConsensusEngine(chainConfig *params.ChainConfig, ethConfig *Config, d } else if len(heimdallClients) == 1 { heimdallClient = heimdallClients[0] } else { - heimdallClient = heimdall.NewMultiHeimdallClient(heimdallClients...) + multiClient, err := heimdall.NewMultiHeimdallClient(heimdallClients...) + if err != nil { + return nil, fmt.Errorf("failed to create heimdall failover client: %w", err) + } + + heimdallClient = multiClient log.Info("Heimdall failover enabled with multiple endpoints", "endpoints", len(heimdallClients)) } } From 60457e01bef1562f8d7c8a6549d2260950f9f990 Mon Sep 17 00:00:00 2001 From: Pratik Patil Date: Tue, 24 Feb 2026 10:52:06 +0530 Subject: [PATCH 25/29] fixed a race condition in ws --- consensus/bor/heimdallws/client.go | 45 +++++++++++++++++---- consensus/bor/heimdallws/client_test.go | 53 +++++++++++++++++++++++++ 2 files changed, 90 insertions(+), 8 deletions(-) diff --git a/consensus/bor/heimdallws/client.go b/consensus/bor/heimdallws/client.go index 1a7c68c4bb..0e75d2600e 100644 --- a/consensus/bor/heimdallws/client.go +++ b/consensus/bor/heimdallws/client.go @@ -32,12 +32,13 @@ const ( // HeimdallWSClient represents a websocket client with auto-reconnection and failover support. 
type HeimdallWSClient struct { - conn *websocket.Conn - urls []string // primary at [0], secondary at [1] (if configured) - registry *heimdall.HealthRegistry - events chan *milestone.Milestone - done chan struct{} - mu sync.Mutex + conn *websocket.Conn + connEpoch uint64 // incremented on each connection change; detects proactive switches + urls []string // primary at [0], secondary at [1] (if configured) + registry *heimdall.HealthRegistry + events chan *milestone.Milestone + done chan struct{} + mu sync.Mutex // Configurable parameters (defaults set in constructor, overridable for testing) reconnectDelay time.Duration @@ -111,16 +112,30 @@ func (c *HeimdallWSClient) probeWSEndpoint(i int) error { } // onWSSwitch is called by the registry (under registry lock) when the active -// endpoint changes. It closes the current connection to trigger reconnection. +// endpoint changes. It bumps the connection epoch, closes the current connection, +// and nils it out. The epoch change lets readMessages distinguish a proactive +// switch from a real network error, avoiding misleading logs and double-closes. func (c *HeimdallWSClient) onWSSwitch(from, to int) { c.mu.Lock() defer c.mu.Unlock() + c.connEpoch++ + if c.conn != nil { c.conn.Close() + c.conn = nil } } +// connEpochChanged reports whether the connection epoch has advanced past the +// given snapshot, indicating that a proactive switch (or reconnection) occurred. +func (c *HeimdallWSClient) connEpochChanged(epoch uint64) bool { + c.mu.Lock() + defer c.mu.Unlock() + + return c.connEpoch != epoch +} + // SubscribeMilestoneEvents sends the subscription request and starts processing incoming messages. 
func (c *HeimdallWSClient) SubscribeMilestoneEvents(ctx context.Context) <-chan *milestone.Milestone { c.tryUntilSubscribeMilestoneEvents(ctx) @@ -221,6 +236,7 @@ func (c *HeimdallWSClient) tryUntilSubscribeMilestoneEvents(ctx context.Context) c.conn.Close() } c.conn = conn + c.connEpoch++ // Mark this endpoint as successful. c.registry.MarkSuccess(active) @@ -263,9 +279,10 @@ func (c *HeimdallWSClient) readMessages(ctx context.Context) { // continue to process messages } - // Grab local ref under lock to avoid racing with reconnection. + // Grab local ref and epoch under lock to detect proactive switches. c.mu.Lock() conn := c.conn + epoch := c.connEpoch c.mu.Unlock() if conn == nil { @@ -274,6 +291,12 @@ func (c *HeimdallWSClient) readMessages(ctx context.Context) { } if err := conn.SetReadDeadline(time.Now().Add(30 * time.Second)); err != nil { + if c.connEpochChanged(epoch) { + // Proactive switch closed the connection; loop back to pick up the new endpoint. + log.Info("reconnecting due to endpoint switch on heimdall ws subscription") + continue + } + log.Error("failed to set read deadline on heimdall ws subscription", "err", err) c.tryUntilSubscribeMilestoneEvents(ctx) @@ -282,6 +305,12 @@ func (c *HeimdallWSClient) readMessages(ctx context.Context) { _, message, err := conn.ReadMessage() if err != nil { + if c.connEpochChanged(epoch) { + // Proactive switch closed the connection; loop back to pick up the new endpoint. 
+ log.Info("reconnecting due to endpoint switch on heimdall ws subscription") + continue + } + log.Error("connection lost; will attempt to reconnect on heimdall ws subscription", "error", err) c.tryUntilSubscribeMilestoneEvents(ctx) diff --git a/consensus/bor/heimdallws/client_test.go b/consensus/bor/heimdallws/client_test.go index df9f4344d8..e13725e229 100644 --- a/consensus/bor/heimdallws/client_test.go +++ b/consensus/bor/heimdallws/client_test.go @@ -553,3 +553,56 @@ func TestWSClient_Registry_PromotionCooldown(t *testing.T) { require.NoError(t, client.Unsubscribe(ctx)) } + +func TestWSClient_ProactiveSwitchSetsConnNil(t *testing.T) { + // Verify that onWSSwitch nils out the connection and bumps the epoch, + // so readMessages detects the switch via epoch change rather than + // seeing a stale non-nil closed conn. + primary := newTestWSServerWithMilestone(t) + defer primary.Close() + + secondary := newTestWSServerWithMilestone(t) + defer secondary.Close() + + client, err := NewHeimdallWSClient(wsURL(primary.URL), wsURL(secondary.URL)) + require.NoError(t, err) + + client.reconnectDelay = 100 * time.Millisecond + client.registry.HealthCheckInterval = 1 * time.Hour // manual control + client.registry.ConsecutiveThreshold = 1 + client.registry.PromotionCooldown = 0 + + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) + defer cancel() + + events := client.SubscribeMilestoneEvents(ctx) + + // Receive milestone from primary. + select { + case m := <-events: + require.NotNil(t, m) + assert.Equal(t, 0, client.registry.Active()) + case <-ctx.Done(): + t.Fatal("timed out waiting for milestone from primary") + } + + // Capture epoch before switch. + client.mu.Lock() + epochBefore := client.connEpoch + client.mu.Unlock() + + // Simulate a proactive switch by calling onWSSwitch directly. + client.onWSSwitch(0, 1) + + // Verify conn is nil and epoch advanced. 
+ client.mu.Lock() + assert.Nil(t, client.conn, "onWSSwitch should nil out the connection") + assert.Greater(t, client.connEpoch, epochBefore, "onWSSwitch should bump epoch") + client.mu.Unlock() + + // readMessages should detect the nil conn and reconnect. + // Set active to secondary so reconnection goes there. + client.registry.SetActive(1) + + require.NoError(t, client.Unsubscribe(ctx)) +} From 30f07b777faddc5c5fcd2620936d581930bb09c9 Mon Sep 17 00:00:00 2001 From: Pratik Patil Date: Tue, 24 Feb 2026 16:00:22 +0530 Subject: [PATCH 26/29] fixed a potential deadlock --- consensus/bor/heimdall/health_registry.go | 121 ++++++++++++---------- consensus/bor/heimdallws/client.go | 7 +- 2 files changed, 73 insertions(+), 55 deletions(-) diff --git a/consensus/bor/heimdall/health_registry.go b/consensus/bor/heimdall/health_registry.go index 8dcd890930..c43d2dbf0a 100644 --- a/consensus/bor/heimdall/health_registry.go +++ b/consensus/bor/heimdall/health_registry.go @@ -42,7 +42,7 @@ type HealthRegistry struct { PromotionCooldown time.Duration probeFunc func(i int) error - onSwitch func(from, to int) // called under mu; may acquire other locks + onSwitch func(from, to int) // called outside mu to avoid lock-ordering issues metrics RegistryMetrics @@ -53,8 +53,8 @@ type HealthRegistry struct { // NewHealthRegistry creates a registry for n endpoints. // probeFunc is called for each endpoint index to test reachability. -// onSwitch (optional) is called under the registry lock when the active -// endpoint changes due to promotion or proactive switch. +// onSwitch (optional) is called outside the registry lock when the active +// endpoint changes due to promotion, proactive switch, or SetActive. func NewHealthRegistry(n int, probeFunc func(int) error, onSwitch func(from, to int), m RegistryMetrics) *HealthRegistry { health := make([]EndpointHealth, n) // Primary starts as healthy; others start unhealthy. 
@@ -85,15 +85,18 @@ func (r *HealthRegistry) Active() int { // if the active endpoint changed. The caller must NOT hold r.mu. func (r *HealthRegistry) SetActive(i int) { r.mu.Lock() - defer r.mu.Unlock() - prev := r.active r.active = i if r.metrics.ActiveGauge != nil { r.metrics.ActiveGauge.Update(int64(i)) } + r.mu.Unlock() + // Call onSwitch outside r.mu to avoid lock-ordering deadlock. + // The WS client's onWSSwitch callback acquires c.mu, so calling it + // under r.mu would create a registry.mu → c.mu path that conflicts + // with the c.mu → registry.mu path in tryUntilSubscribeMilestoneEvents. if prev != i && r.onSwitch != nil { r.onSwitch(prev, i) } @@ -235,34 +238,40 @@ func (r *HealthRegistry) probeAll() { // maybePromote checks if a higher-priority endpoint (index < active) is healthy // and has passed cooldown. If yes, promotes to the highest-priority qualified endpoint. func (r *HealthRegistry) maybePromote() { + var prev, next int + doSwitch := false + r.mu.Lock() - defer r.mu.Unlock() - if r.active == 0 { - return - } + if r.active != 0 { + for i := 0; i < r.active; i++ { + if r.health[i].Healthy && time.Since(r.health[i].HealthySince) >= r.PromotionCooldown { + prev = r.active + next = i + r.active = i + doSwitch = true - for i := 0; i < r.active; i++ { - if r.health[i].Healthy && time.Since(r.health[i].HealthySince) >= r.PromotionCooldown { - prev := r.active - r.active = i + if r.metrics.ActiveGauge != nil { + r.metrics.ActiveGauge.Update(int64(i)) + } - if r.metrics.ActiveGauge != nil { - r.metrics.ActiveGauge.Update(int64(i)) - } + if r.metrics.ProactiveSwitches != nil { + r.metrics.ProactiveSwitches.Inc(1) + } - if r.metrics.ProactiveSwitches != nil { - r.metrics.ProactiveSwitches.Inc(1) + break } + } + } - log.Info("Health registry: promoted to higher-priority endpoint", - "index", i, "previous", prev) + r.mu.Unlock() - if r.onSwitch != nil { - r.onSwitch(prev, i) - } + if doSwitch { + log.Info("Health registry: promoted to higher-priority 
endpoint", + "index", next, "previous", prev) - return + if r.onSwitch != nil { + r.onSwitch(prev, next) } } } @@ -270,10 +279,14 @@ func (r *HealthRegistry) maybePromote() { // maybeProactiveSwitch detects if the active endpoint is unhealthy and switches // to the highest-priority healthy endpoint. func (r *HealthRegistry) maybeProactiveSwitch() { + var prev, next int + doSwitch := false + var logMsg string + r.mu.Lock() - defer r.mu.Unlock() if r.health[r.active].Healthy { + r.mu.Unlock() return } @@ -285,8 +298,11 @@ func (r *HealthRegistry) maybeProactiveSwitch() { } if r.health[i].Healthy && time.Since(r.health[i].HealthySince) >= r.PromotionCooldown { - prev := r.active + prev = r.active + next = i r.active = i + doSwitch = true + logMsg = "Health registry: proactive switch (active unhealthy, cooled target)" if r.metrics.ActiveGauge != nil { r.metrics.ActiveGauge.Update(int64(i)) @@ -296,43 +312,44 @@ func (r *HealthRegistry) maybeProactiveSwitch() { r.metrics.ProactiveSwitches.Inc(1) } - log.Warn("Health registry: proactive switch (active unhealthy, cooled target)", - "from", prev, "to", i) - - if r.onSwitch != nil { - r.onSwitch(prev, i) - } - - return + break } } // Pass 2: healthy but NOT cooled (emergency). 
- for i := 0; i < r.n; i++ { - if i == r.active { - continue - } + if !doSwitch { + for i := 0; i < r.n; i++ { + if i == r.active { + continue + } - if r.health[i].Healthy { - prev := r.active - r.active = i + if r.health[i].Healthy { + prev = r.active + next = i + r.active = i + doSwitch = true + logMsg = "Health registry: proactive switch (active unhealthy, uncooled target)" - if r.metrics.ActiveGauge != nil { - r.metrics.ActiveGauge.Update(int64(i)) - } + if r.metrics.ActiveGauge != nil { + r.metrics.ActiveGauge.Update(int64(i)) + } - if r.metrics.ProactiveSwitches != nil { - r.metrics.ProactiveSwitches.Inc(1) + if r.metrics.ProactiveSwitches != nil { + r.metrics.ProactiveSwitches.Inc(1) + } + + break } + } + } - log.Warn("Health registry: proactive switch (active unhealthy, uncooled target)", - "from", prev, "to", i) + r.mu.Unlock() - if r.onSwitch != nil { - r.onSwitch(prev, i) - } + if doSwitch { + log.Warn(logMsg, "from", prev, "to", next) - return + if r.onSwitch != nil { + r.onSwitch(prev, next) } } } diff --git a/consensus/bor/heimdallws/client.go b/consensus/bor/heimdallws/client.go index 0e75d2600e..9bc92cc30f 100644 --- a/consensus/bor/heimdallws/client.go +++ b/consensus/bor/heimdallws/client.go @@ -238,9 +238,6 @@ func (c *HeimdallWSClient) tryUntilSubscribeMilestoneEvents(ctx context.Context) c.conn = conn c.connEpoch++ - // Mark this endpoint as successful. - c.registry.MarkSuccess(active) - // Build the subscription request and send it under lock to avoid // racing with readMessages on c.conn. req := subscriptionRequest{ @@ -253,6 +250,10 @@ func (c *HeimdallWSClient) tryUntilSubscribeMilestoneEvents(ctx context.Context) err = c.conn.WriteJSON(req) c.mu.Unlock() + // Mark outside c.mu to prevent lock-ordering deadlock with + // registry.mu → c.mu (onWSSwitch called from health-check goroutine). 
+ c.registry.MarkSuccess(active) + + if err != nil { + log.Error("failed to send subscription request on heimdall ws subscription", "url", url, "err", err) + continue From cbf6924f180130d57206efe1ca25010d115fcbb9 Mon Sep 17 00:00:00 2001 From: Pratik Patil Date: Tue, 24 Feb 2026 16:38:54 +0530 Subject: [PATCH 27/29] replaces the sequential probe loop with concurrent goroutines --- consensus/bor/heimdall/health_registry.go | 73 ++++++++++++++------- 1 file changed, 50 insertions(+), 23 deletions(-) diff --git a/consensus/bor/heimdall/health_registry.go b/consensus/bor/heimdall/health_registry.go index c43d2dbf0a..26c9337de8 100644 --- a/consensus/bor/heimdall/health_registry.go +++ b/consensus/bor/heimdall/health_registry.go @@ -47,6 +47,7 @@ type HealthRegistry struct { metrics RegistryMetrics quit chan struct{} + done chan struct{} // closed when run() exits closeOnce sync.Once startOnce sync.Once } @@ -70,6 +71,7 @@ func NewHealthRegistry(n int, probeFunc func(int) error, onSwitch func(from, to onSwitch: onSwitch, metrics: m, quit: make(chan struct{}), + done: make(chan struct{}), } } @@ -154,15 +156,24 @@ func (r *HealthRegistry) Start() { }) } -// Stop closes the quit channel, stopping the background goroutine. +// Stop closes the quit channel and waits for the background goroutine to exit. func (r *HealthRegistry) Stop() { + // If Start() was never called, close done so the wait below doesn't block. + r.startOnce.Do(func() { + close(r.done) + }) + r.closeOnce.Do(func() { close(r.quit) }) + + <-r.done } // run is the background goroutine: probe → promote → proactive switch. func (r *HealthRegistry) run() { + defer close(r.done) + ticker := time.NewTicker(r.HealthCheckInterval) defer ticker.Stop() @@ -179,25 +190,49 @@ func (r *HealthRegistry) run() { } } -// probeAll probes every endpoint and updates health state. +// probeAll probes every endpoint concurrently and updates health state.
func (r *HealthRegistry) probeAll() { - for i := 0; i < r.n; i++ { - // Check for shutdown between individual probes. - select { - case <-r.quit: - return - default: - } + // Check for shutdown before launching probes. + select { + case <-r.quit: + return + default: + } + + // Launch all probes concurrently. Each goroutine writes to its own + // index in errs — no data race, no mutex needed for the slice. + errs := make([]error, r.n) + + var wg sync.WaitGroup + wg.Add(r.n) + for i := 0; i < r.n; i++ { if r.metrics.ProbeAttempts != nil { r.metrics.ProbeAttempts.Inc(1) } - err := r.probeFunc(i) + go func(idx int) { + defer wg.Done() + errs[idx] = r.probeFunc(idx) + }(i) + } + + wg.Wait() + + // Discard results if shutdown occurred while probes were in flight. + select { + case <-r.quit: + return + default: + } + + // Apply all results under a single lock acquisition. + r.mu.Lock() - r.mu.Lock() + healthyCount := int64(0) - if err == nil { + for i := 0; i < r.n; i++ { + if errs[i] == nil { r.health[i].ConsecutiveSuccess++ r.health[i].LastErr = nil @@ -212,26 +247,18 @@ func (r *HealthRegistry) probeAll() { } else { r.health[i].ConsecutiveSuccess = 0 r.health[i].Healthy = false - r.health[i].LastErr = err + r.health[i].LastErr = errs[i] } - r.mu.Unlock() - } - - // Update healthy endpoints gauge. 
- r.mu.Lock() - count := int64(0) - - for i := range r.health { if r.health[i].Healthy { - count++ + healthyCount++ } } r.mu.Unlock() if r.metrics.HealthyEndpoints != nil { - r.metrics.HealthyEndpoints.Update(count) + r.metrics.HealthyEndpoints.Update(healthyCount) } } From 716a9e8f1fdabaa00a318067769bea768ad2eb03 Mon Sep 17 00:00:00 2001 From: Pratik Patil Date: Wed, 25 Feb 2026 09:33:55 +0530 Subject: [PATCH 28/29] reduced the failover switchover time --- consensus/bor/heimdall/failover_client.go | 5 +- .../bor/heimdall/failover_client_test.go | 90 +++++++++++++++++-- consensus/bor/heimdall/health_registry.go | 75 ++++++++-------- .../bor/heimdall/health_registry_test.go | 66 ++++++++++++++ consensus/bor/heimdallws/client.go | 5 +- 5 files changed, 196 insertions(+), 45 deletions(-) diff --git a/consensus/bor/heimdall/failover_client.go b/consensus/bor/heimdall/failover_client.go index b74eec5d1f..9b20269ff2 100644 --- a/consensus/bor/heimdall/failover_client.go +++ b/consensus/bor/heimdall/failover_client.go @@ -18,6 +18,7 @@ import ( const ( defaultAttemptTimeout = 30 * time.Second + defaultProbeTimeout = 5 * time.Second defaultHealthCheckInterval = 10 * time.Second defaultConsecutiveThreshold = 3 defaultPromotionCooldown = 60 * time.Second @@ -47,6 +48,7 @@ type MultiHeimdallClient struct { clients []Endpoint registry *HealthRegistry attemptTimeout time.Duration + probeTimeout time.Duration probeCtx context.Context // cancelled on Close to abort in-flight probes probeCancel context.CancelFunc } @@ -61,6 +63,7 @@ func NewMultiHeimdallClient(clients ...Endpoint) (*MultiHeimdallClient, error) { f := &MultiHeimdallClient{ clients: clients, attemptTimeout: defaultAttemptTimeout, + probeTimeout: defaultProbeTimeout, probeCtx: probeCtx, probeCancel: probeCancel, } @@ -83,7 +86,7 @@ func NewMultiHeimdallClient(clients ...Endpoint) (*MultiHeimdallClient, error) { // probeEndpoint probes a single endpoint via FetchStatus. 
func (f *MultiHeimdallClient) probeEndpoint(i int) error { - ctx, cancel := context.WithTimeout(f.probeCtx, f.attemptTimeout) + ctx, cancel := context.WithTimeout(f.probeCtx, f.probeTimeout) defer cancel() _, err := f.clients[i].FetchStatus(ctx) diff --git a/consensus/bor/heimdall/failover_client_test.go b/consensus/bor/heimdall/failover_client_test.go index 641730330a..26cd633164 100644 --- a/consensus/bor/heimdall/failover_client_test.go +++ b/consensus/bor/heimdall/failover_client_test.go @@ -131,6 +131,7 @@ func newInstantMulti(clients ...Endpoint) *MultiHeimdallClient { } fc.attemptTimeout = 100 * time.Millisecond + fc.probeTimeout = 100 * time.Millisecond fc.registry.ConsecutiveThreshold = 1 fc.registry.PromotionCooldown = 0 fc.registry.HealthCheckInterval = 50 * time.Millisecond @@ -182,17 +183,25 @@ func TestFailover_NoSwitchOnContextCanceled(t *testing.T) { require.NoError(t, err) fc.attemptTimeout = 5 * time.Second // longer than caller's ctx + fc.probeTimeout = 100 * time.Millisecond fc.registry.HealthCheckInterval = 1 * time.Hour fc.registry.ConsecutiveThreshold = 1 fc.registry.PromotionCooldown = 0 defer fc.Close() + // Start registry and let the immediate probe cycle complete so its + // FetchStatus hits don't race with the assertion below. 
+ fc.ensureHealthRegistry() + time.Sleep(50 * time.Millisecond) + + secondaryBefore := secondary.hits.Load() + ctx, cancel := context.WithTimeout(context.Background(), 100*time.Millisecond) defer cancel() _, err = fc.GetSpan(ctx, 1) require.Error(t, err) - assert.Equal(t, int32(0), secondary.hits.Load(), "should not failover on caller context cancellation") + assert.Equal(t, secondaryBefore, secondary.hits.Load(), "should not failover on caller context cancellation") } func TestFailover_NoSwitchOnServiceUnavailable(t *testing.T) { @@ -256,6 +265,7 @@ func TestFailover_StickyBehavior(t *testing.T) { require.NoError(t, err) fc.attemptTimeout = 100 * time.Millisecond + fc.probeTimeout = 100 * time.Millisecond fc.registry.ConsecutiveThreshold = 1 fc.registry.PromotionCooldown = 0 fc.registry.HealthCheckInterval = 1 * time.Hour // very long — no background promotion @@ -265,6 +275,10 @@ func TestFailover_StickyBehavior(t *testing.T) { _, err = fc.GetSpan(context.Background(), 1) require.NoError(t, err) + // Wait for the immediate probe cycle (launched by ensureHealthRegistry + // inside the first GetSpan call) to complete before snapshotting hits. + time.Sleep(50 * time.Millisecond) + primaryBefore := primary.hits.Load() secondaryBefore := secondary.hits.Load() @@ -374,18 +388,27 @@ func TestFailover_PassthroughWhenPrimaryHealthy(t *testing.T) { require.NoError(t, err) fc.attemptTimeout = 5 * time.Second + fc.probeTimeout = 100 * time.Millisecond fc.registry.HealthCheckInterval = 1 * time.Hour fc.registry.ConsecutiveThreshold = 1 fc.registry.PromotionCooldown = 0 defer fc.Close() + // Start registry and let the immediate probe cycle complete so its + // FetchStatus hits don't interfere with assertions below. 
+ fc.ensureHealthRegistry() + time.Sleep(50 * time.Millisecond) + + primaryBefore := primary.hits.Load() + secondaryBefore := secondary.hits.Load() + for i := 0; i < 5; i++ { _, err := fc.GetSpan(context.Background(), 1) require.NoError(t, err) } - assert.Equal(t, int32(5), primary.hits.Load(), "all calls should go to primary") - assert.Equal(t, int32(0), secondary.hits.Load(), "secondary should not be contacted") + assert.Equal(t, primaryBefore+5, primary.hits.Load(), "all calls should go to primary") + assert.Equal(t, secondaryBefore, secondary.hits.Load(), "secondary should not be contacted for API calls") } // Integration test using real HTTP servers to verify end-to-end behavior @@ -740,21 +763,30 @@ func TestFailover_ActiveFailoverError_CascadesToNext(t *testing.T) { // Primary also fails so cascade doesn't land there. primary := &mockHeimdallClient{ - getSpanFn: func(_ context.Context, _ uint64) (*types.Span, error) { return nil, connErr }, + getSpanFn: func(_ context.Context, _ uint64) (*types.Span, error) { return nil, connErr }, + fetchStatusFn: func(_ context.Context) (*ctypes.SyncInfo, error) { return nil, connErr }, } secondary := &mockHeimdallClient{ - getSpanFn: func(_ context.Context, _ uint64) (*types.Span, error) { return nil, connErr }, + getSpanFn: func(_ context.Context, _ uint64) (*types.Span, error) { return nil, connErr }, + fetchStatusFn: func(_ context.Context) (*ctypes.SyncInfo, error) { return nil, connErr }, } tertiary := &mockHeimdallClient{} - fc := newInstantMulti(primary, secondary, tertiary) + fc, err := NewMultiHeimdallClient(primary, secondary, tertiary) + require.NoError(t, err) + + fc.attemptTimeout = 100 * time.Millisecond + fc.probeTimeout = 100 * time.Millisecond + fc.registry.HealthCheckInterval = 1 * time.Hour // prevent background probes from promoting + fc.registry.ConsecutiveThreshold = 1 + fc.registry.PromotionCooldown = 0 defer fc.Close() // Force onto secondary fc.registry.SetActive(1) - span, err := 
fc.GetSpan(context.Background(), 1) - require.NoError(t, err) + span, getErr := fc.GetSpan(context.Background(), 1) + require.NoError(t, getErr) require.NotNil(t, span) assert.GreaterOrEqual(t, tertiary.hits.Load(), int32(1), "should cascade to tertiary") @@ -1121,11 +1153,17 @@ func TestRegistry_CascadeFallsBackToUnhealthy(t *testing.T) { require.NoError(t, err) fc.attemptTimeout = 100 * time.Millisecond + fc.probeTimeout = 100 * time.Millisecond fc.registry.HealthCheckInterval = 1 * time.Hour fc.registry.ConsecutiveThreshold = 1 fc.registry.PromotionCooldown = 0 defer fc.Close() + // Start registry and let the immediate probe complete before setting up + // the test state, otherwise the probe can mark secondary healthy. + fc.ensureHealthRegistry() + time.Sleep(50 * time.Millisecond) + // Mark secondary as unhealthy fc.registry.SetHealth(1, EndpointHealth{Healthy: false}) @@ -1168,6 +1206,42 @@ func TestRegistry_MarkUnhealthyOnRealFailure(t *testing.T) { assert.Equal(t, 0, snap[0].ConsecutiveSuccess, "consecutive success should be reset") } +func TestFailover_ProbeUsesProbeTimeout(t *testing.T) { + // Verify that probes use the short probeTimeout, not the long attemptTimeout. + // A probe against a hanging endpoint should fail within probeTimeout, not + // wait for attemptTimeout. + primary := &mockHeimdallClient{ + fetchStatusFn: func(ctx context.Context) (*ctypes.SyncInfo, error) { + // Hang until context expires. + <-ctx.Done() + return nil, ctx.Err() + }, + } + secondary := &mockHeimdallClient{} + + fc, err := NewMultiHeimdallClient(primary, secondary) + require.NoError(t, err) + + fc.attemptTimeout = 10 * time.Second // long — should NOT be used for probes + fc.probeTimeout = 200 * time.Millisecond + fc.registry.HealthCheckInterval = 1 * time.Hour + fc.registry.ConsecutiveThreshold = 1 + fc.registry.PromotionCooldown = 0 + defer fc.Close() + + start := time.Now() + fc.registry.Start() + + // Wait for the immediate probe cycle to complete. 
+ require.Eventually(t, func() bool { + snap := fc.registry.HealthSnapshot() + return !snap[0].Healthy || snap[0].LastErr != nil + }, 2*time.Second, 20*time.Millisecond, "probe should complete") + + elapsed := time.Since(start) + assert.Less(t, elapsed, 2*time.Second, "probe should complete within probeTimeout, not attemptTimeout") +} + func TestRegistry_InformedCascade_RespectsCooldown(t *testing.T) { connErr := &net.OpError{Op: "dial", Net: "tcp", Err: errors.New("connection refused")} diff --git a/consensus/bor/heimdall/health_registry.go b/consensus/bor/heimdall/health_registry.go index 26c9337de8..de61698584 100644 --- a/consensus/bor/heimdall/health_registry.go +++ b/consensus/bor/heimdall/health_registry.go @@ -174,6 +174,12 @@ func (r *HealthRegistry) Stop() { func (r *HealthRegistry) run() { defer close(r.done) + // Run an immediate probe cycle so a down primary is detected within + // seconds of boot rather than waiting for the first ticker fire. + r.probeAll() + r.maybePromote() + r.maybeProactiveSwitch() + ticker := time.NewTicker(r.HealthCheckInterval) defer ticker.Stop() @@ -191,6 +197,9 @@ func (r *HealthRegistry) run() { } // probeAll probes every endpoint concurrently and updates health state. +// Each goroutine applies its own result immediately so that a request +// arriving mid-cycle (via callWithFailover → HealthSnapshot) sees fresh +// data for already-completed probes rather than stale data for all of them. func (r *HealthRegistry) probeAll() { // Check for shutdown before launching probes. select { @@ -199,10 +208,6 @@ func (r *HealthRegistry) probeAll() { default: } - // Launch all probes concurrently. Each goroutine writes to its own - // index in errs — no data race, no mutex needed for the slice. 
- errs := make([]error, r.n) - var wg sync.WaitGroup wg.Add(r.n) @@ -213,51 +218,51 @@ func (r *HealthRegistry) probeAll() { go func(idx int) { defer wg.Done() - errs[idx] = r.probeFunc(idx) + + err := r.probeFunc(idx) + + // Apply this probe's result immediately. + r.mu.Lock() + if err == nil { + r.health[idx].ConsecutiveSuccess++ + r.health[idx].LastErr = nil + + if r.health[idx].ConsecutiveSuccess >= r.ConsecutiveThreshold && !r.health[idx].Healthy { + r.health[idx].Healthy = true + r.health[idx].HealthySince = time.Now() + } + + if r.metrics.ProbeSuccesses != nil { + r.metrics.ProbeSuccesses.Inc(1) + } + } else { + r.health[idx].ConsecutiveSuccess = 0 + r.health[idx].Healthy = false + r.health[idx].LastErr = err + } + r.mu.Unlock() }(i) } wg.Wait() - // Discard results if shutdown occurred while probes were in flight. + // Update gauge after all probes complete — needs to scan all results. select { case <-r.quit: return default: } - // Apply all results under a single lock acquisition. 
- r.mu.Lock() - - healthyCount := int64(0) - - for i := 0; i < r.n; i++ { - if errs[i] == nil { - r.health[i].ConsecutiveSuccess++ - r.health[i].LastErr = nil - - if r.health[i].ConsecutiveSuccess >= r.ConsecutiveThreshold && !r.health[i].Healthy { - r.health[i].Healthy = true - r.health[i].HealthySince = time.Now() - } - - if r.metrics.ProbeSuccesses != nil { - r.metrics.ProbeSuccesses.Inc(1) + if r.metrics.HealthyEndpoints != nil { + r.mu.Lock() + healthyCount := int64(0) + for i := 0; i < r.n; i++ { + if r.health[i].Healthy { + healthyCount++ } - } else { - r.health[i].ConsecutiveSuccess = 0 - r.health[i].Healthy = false - r.health[i].LastErr = errs[i] } + r.mu.Unlock() - if r.health[i].Healthy { - healthyCount++ - } - } - - r.mu.Unlock() - - if r.metrics.HealthyEndpoints != nil { r.metrics.HealthyEndpoints.Update(healthyCount) } } diff --git a/consensus/bor/heimdall/health_registry_test.go b/consensus/bor/heimdall/health_registry_test.go index 8a98b53ee4..9761dd05a1 100644 --- a/consensus/bor/heimdall/health_registry_test.go +++ b/consensus/bor/heimdall/health_registry_test.go @@ -223,6 +223,72 @@ func TestHealthRegistry_MaybeProactiveSwitch_NoHealthy(t *testing.T) { assert.Equal(t, 0, r.Active(), "should stay on 0 when no alternatives are healthy") } +func TestHealthRegistry_ImmediateProbeOnStart(t *testing.T) { + probeCount := atomic.Int32{} + + r := NewHealthRegistry(2, func(i int) error { + probeCount.Add(1) + return nil + }, nil, RegistryMetrics{}) + r.HealthCheckInterval = 10 * time.Second // long interval — should NOT gate first probe + + r.Start() + defer r.Stop() + + // The first probe cycle should fire immediately, not after HealthCheckInterval. 
+ require.Eventually(t, func() bool { + return probeCount.Load() >= 2 // 2 endpoints probed + }, 2*time.Second, 10*time.Millisecond, "first probe cycle should run immediately on Start") +} + +func TestHealthRegistry_ProbeAll_IncrementalUpdate(t *testing.T) { + // Verify that a fast probe's result is visible before a slow probe completes. + slowStarted := make(chan struct{}) + slowRelease := make(chan struct{}) + + r := NewHealthRegistry(2, func(i int) error { + if i == 0 { + // Fast probe: returns immediately. + return nil + } + // Slow probe: blocks until released. + close(slowStarted) + <-slowRelease + return nil + }, nil, RegistryMetrics{}) + r.ConsecutiveThreshold = 1 + + // Run probeAll in a goroutine since the slow probe blocks. + done := make(chan struct{}) + go func() { + r.probeAll() + close(done) + }() + + // Wait for the slow probe to start (meaning the fast probe has already completed). + select { + case <-slowStarted: + case <-time.After(2 * time.Second): + t.Fatal("timed out waiting for slow probe to start") + } + + // The fast probe (index 0) should already be applied even though the slow + // probe (index 1) is still in flight. + snap := r.HealthSnapshot() + assert.True(t, snap[0].Healthy, "fast probe result should be visible before slow probe completes") + + // Release the slow probe and wait for probeAll to finish. 
+ close(slowRelease) + select { + case <-done: + case <-time.After(2 * time.Second): + t.Fatal("timed out waiting for probeAll to finish") + } + + snap = r.HealthSnapshot() + assert.True(t, snap[1].Healthy, "slow probe result should be applied after release") +} + func TestHealthRegistry_Stop_HaltsGoroutine(t *testing.T) { probeCount := atomic.Int32{} diff --git a/consensus/bor/heimdallws/client.go b/consensus/bor/heimdallws/client.go index 9bc92cc30f..f1d0cb2ec5 100644 --- a/consensus/bor/heimdallws/client.go +++ b/consensus/bor/heimdallws/client.go @@ -377,14 +377,17 @@ func (c *HeimdallWSClient) readMessages(ctx context.Context) { // Unsubscribe signals the reader goroutine to stop. func (c *HeimdallWSClient) Unsubscribe(ctx context.Context) error { c.mu.Lock() - defer c.mu.Unlock() select { case <-c.done: // Already unsubscribed. default: close(c.done) } + c.mu.Unlock() + // Stop the registry outside c.mu to avoid deadlock with probeWSEndpoint, + // which acquires c.mu to read the URL while running under the registry's + // run() goroutine. c.registry.Stop() return nil From 478759a7eae2d5a02cc52bbc7caacc33bf093270 Mon Sep 17 00:00:00 2001 From: Pratik Patil Date: Wed, 25 Feb 2026 10:35:23 +0530 Subject: [PATCH 29/29] reduced code duplication in tests --- .../bor/heimdall/failover_client_test.go | 156 +++++++----------- consensus/bor/heimdallws/client_test.go | 140 +++++----------- 2 files changed, 108 insertions(+), 188 deletions(-) diff --git a/consensus/bor/heimdall/failover_client_test.go b/consensus/bor/heimdall/failover_client_test.go index 26cd633164..1ed5740ddd 100644 --- a/consensus/bor/heimdall/failover_client_test.go +++ b/consensus/bor/heimdall/failover_client_test.go @@ -122,6 +122,55 @@ func (m *mockHeimdallClient) Close() { } } +// testConnErr is a reusable connection-refused error for tests. 
+var testConnErr = &net.OpError{Op: "dial", Net: "tcp", Err: errors.New("connection refused")} + +// newConnRefusedMock creates a mock where both API calls and health probes always fail. +func newConnRefusedMock() *mockHeimdallClient { + return &mockHeimdallClient{ + getSpanFn: func(_ context.Context, _ uint64) (*types.Span, error) { + return nil, testConnErr + }, + fetchStatusFn: func(_ context.Context) (*ctypes.SyncInfo, error) { + return nil, testConnErr + }, + } +} + +// newToggleMock creates a mock whose API calls and health probes fail when down.Load() is true. +func newToggleMock(down *atomic.Bool) *mockHeimdallClient { + return &mockHeimdallClient{ + getSpanFn: func(_ context.Context, spanID uint64) (*types.Span, error) { + if down.Load() { + return nil, testConnErr + } + return &types.Span{Id: spanID}, nil + }, + fetchStatusFn: func(_ context.Context) (*ctypes.SyncInfo, error) { + if down.Load() { + return nil, testConnErr + } + return &ctypes.SyncInfo{}, nil + }, + } +} + +// newProbeToggleMock creates a mock where API calls always fail but health probes +// succeed when down.Load() is false. +func newProbeToggleMock(down *atomic.Bool) *mockHeimdallClient { + return &mockHeimdallClient{ + getSpanFn: func(_ context.Context, _ uint64) (*types.Span, error) { + return nil, testConnErr + }, + fetchStatusFn: func(_ context.Context) (*ctypes.SyncInfo, error) { + if down.Load() { + return nil, testConnErr + } + return &ctypes.SyncInfo{}, nil + }, + } +} + // newInstantMulti creates a MultiHeimdallClient with instant health registry // behavior: consecutiveThreshold=1, promotionCooldown=0, fast health-check interval. 
func newInstantMulti(clients ...Endpoint) *MultiHeimdallClient { @@ -296,20 +345,7 @@ func TestFailover_ProbeBackToPrimary(t *testing.T) { primaryDown := atomic.Bool{} primaryDown.Store(true) - primary := &mockHeimdallClient{ - getSpanFn: func(_ context.Context, spanID uint64) (*types.Span, error) { - if primaryDown.Load() { - return nil, &net.OpError{Op: "dial", Net: "tcp", Err: errors.New("connection refused")} - } - return &types.Span{Id: spanID}, nil - }, - fetchStatusFn: func(_ context.Context) (*ctypes.SyncInfo, error) { - if primaryDown.Load() { - return nil, &net.OpError{Op: "dial", Net: "tcp", Err: errors.New("connection refused")} - } - return &ctypes.SyncInfo{}, nil - }, - } + primary := newToggleMock(&primaryDown) secondary := &mockHeimdallClient{} fc := newInstantMulti(primary, secondary) @@ -335,14 +371,7 @@ func TestFailover_ProbeBackToPrimary(t *testing.T) { } func TestFailover_ProbeBackFails(t *testing.T) { - primary := &mockHeimdallClient{ - getSpanFn: func(_ context.Context, _ uint64) (*types.Span, error) { - return nil, &net.OpError{Op: "dial", Net: "tcp", Err: errors.New("connection refused")} - }, - fetchStatusFn: func(_ context.Context) (*ctypes.SyncInfo, error) { - return nil, &net.OpError{Op: "dial", Net: "tcp", Err: errors.New("connection refused")} - }, - } + primary := newConnRefusedMock() secondary := &mockHeimdallClient{} fc := newInstantMulti(primary, secondary) @@ -686,23 +715,10 @@ func TestFailover_ThreeClients_ProbeBackToPrimary(t *testing.T) { primaryDown := atomic.Bool{} primaryDown.Store(true) - primary := &mockHeimdallClient{ - getSpanFn: func(_ context.Context, spanID uint64) (*types.Span, error) { - if primaryDown.Load() { - return nil, &net.OpError{Op: "dial", Net: "tcp", Err: errors.New("connection refused")} - } - return &types.Span{Id: spanID}, nil - }, - fetchStatusFn: func(_ context.Context) (*ctypes.SyncInfo, error) { - if primaryDown.Load() { - return nil, &net.OpError{Op: "dial", Net: "tcp", Err: 
errors.New("connection refused")} - } - return &ctypes.SyncInfo{}, nil - }, - } + primary := newToggleMock(&primaryDown) secondary := &mockHeimdallClient{ getSpanFn: func(_ context.Context, _ uint64) (*types.Span, error) { - return nil, &net.OpError{Op: "dial", Net: "tcp", Err: errors.New("connection refused")} + return nil, testConnErr }, } tertiary := &mockHeimdallClient{} @@ -759,17 +775,9 @@ func TestFailover_ActiveNonFailoverError(t *testing.T) { // Active client returns failover error: cascade should try by priority. func TestFailover_ActiveFailoverError_CascadesToNext(t *testing.T) { - connErr := &net.OpError{Op: "dial", Net: "tcp", Err: errors.New("connection refused")} - // Primary also fails so cascade doesn't land there. - primary := &mockHeimdallClient{ - getSpanFn: func(_ context.Context, _ uint64) (*types.Span, error) { return nil, connErr }, - fetchStatusFn: func(_ context.Context) (*ctypes.SyncInfo, error) { return nil, connErr }, - } - secondary := &mockHeimdallClient{ - getSpanFn: func(_ context.Context, _ uint64) (*types.Span, error) { return nil, connErr }, - fetchStatusFn: func(_ context.Context) (*ctypes.SyncInfo, error) { return nil, connErr }, - } + primary := newConnRefusedMock() + secondary := newConnRefusedMock() tertiary := &mockHeimdallClient{} fc, err := NewMultiHeimdallClient(primary, secondary, tertiary) @@ -819,28 +827,8 @@ func TestFailover_HealthCheckPromotesHighestPriority(t *testing.T) { secondaryDown := atomic.Bool{} secondaryDown.Store(true) - primary := &mockHeimdallClient{ - getSpanFn: func(_ context.Context, _ uint64) (*types.Span, error) { - return nil, &net.OpError{Op: "dial", Net: "tcp", Err: errors.New("connection refused")} - }, - fetchStatusFn: func(_ context.Context) (*ctypes.SyncInfo, error) { - if primaryDown.Load() { - return nil, &net.OpError{Op: "dial", Net: "tcp", Err: errors.New("connection refused")} - } - return &ctypes.SyncInfo{}, nil - }, - } - secondary := &mockHeimdallClient{ - getSpanFn: func(_ 
context.Context, _ uint64) (*types.Span, error) { - return nil, &net.OpError{Op: "dial", Net: "tcp", Err: errors.New("connection refused")} - }, - fetchStatusFn: func(_ context.Context) (*ctypes.SyncInfo, error) { - if secondaryDown.Load() { - return nil, &net.OpError{Op: "dial", Net: "tcp", Err: errors.New("connection refused")} - } - return &ctypes.SyncInfo{}, nil - }, - } + primary := newProbeToggleMock(&primaryDown) + secondary := newProbeToggleMock(&secondaryDown) tertiary := &mockHeimdallClient{} fc := newInstantMulti(primary, secondary, tertiary) @@ -900,7 +888,7 @@ func TestRegistry_ConsecutiveThreshold(t *testing.T) { primary := &mockHeimdallClient{ getSpanFn: func(_ context.Context, _ uint64) (*types.Span, error) { - return nil, &net.OpError{Op: "dial", Net: "tcp", Err: errors.New("connection refused")} + return nil, testConnErr }, fetchStatusFn: func(_ context.Context) (*ctypes.SyncInfo, error) { probeCount.Add(1) @@ -939,17 +927,7 @@ func TestRegistry_PromotionCooldown(t *testing.T) { primaryDown := atomic.Bool{} primaryDown.Store(true) - primary := &mockHeimdallClient{ - getSpanFn: func(_ context.Context, _ uint64) (*types.Span, error) { - return nil, &net.OpError{Op: "dial", Net: "tcp", Err: errors.New("connection refused")} - }, - fetchStatusFn: func(_ context.Context) (*ctypes.SyncInfo, error) { - if primaryDown.Load() { - return nil, &net.OpError{Op: "dial", Net: "tcp", Err: errors.New("connection refused")} - } - return &ctypes.SyncInfo{}, nil - }, - } + primary := newProbeToggleMock(&primaryDown) secondary := &mockHeimdallClient{} fc, err := NewMultiHeimdallClient(primary, secondary) @@ -983,13 +961,13 @@ func TestRegistry_FlappingPrevention(t *testing.T) { primary := &mockHeimdallClient{ getSpanFn: func(_ context.Context, _ uint64) (*types.Span, error) { - return nil, &net.OpError{Op: "dial", Net: "tcp", Err: errors.New("connection refused")} + return nil, testConnErr }, fetchStatusFn: func(_ context.Context) (*ctypes.SyncInfo, error) { n := 
callCount.Add(1) // Alternate: success, fail, success, fail... if n%2 == 0 { - return nil, &net.OpError{Op: "dial", Net: "tcp", Err: errors.New("connection refused")} + return nil, testConnErr } return &ctypes.SyncInfo{}, nil }, @@ -1018,16 +996,8 @@ func TestRegistry_FlappingPrevention(t *testing.T) { } func TestRegistry_InformedCascade_SkipsUnhealthy(t *testing.T) { - connErr := &net.OpError{Op: "dial", Net: "tcp", Err: errors.New("connection refused")} - - primary := &mockHeimdallClient{ - getSpanFn: func(_ context.Context, _ uint64) (*types.Span, error) { return nil, connErr }, - fetchStatusFn: func(_ context.Context) (*ctypes.SyncInfo, error) { return nil, connErr }, - } - secondary := &mockHeimdallClient{ - getSpanFn: func(_ context.Context, _ uint64) (*types.Span, error) { return nil, connErr }, - fetchStatusFn: func(_ context.Context) (*ctypes.SyncInfo, error) { return nil, connErr }, - } + primary := newConnRefusedMock() + secondary := newConnRefusedMock() tertiary := &mockHeimdallClient{} fc, err := NewMultiHeimdallClient(primary, secondary, tertiary) diff --git a/consensus/bor/heimdallws/client_test.go b/consensus/bor/heimdallws/client_test.go index e13725e229..a5b2f4330f 100644 --- a/consensus/bor/heimdallws/client_test.go +++ b/consensus/bor/heimdallws/client_test.go @@ -303,53 +303,13 @@ func TestWSClient_ContextCancellation(t *testing.T) { } func TestWSClient_DualURL_ProbeBackToPrimary(t *testing.T) { - // Primary starts rejecting, secondary accepts. - // After failover to secondary, primary comes back, health-check should promote. 
- primaryReject := newTestWSServer(t, true) - defer primaryReject.Close() - - secondary := newTestWSServerWithMilestone(t) - defer secondary.Close() - - client, err := NewHeimdallWSClient(wsURL(primaryReject.URL), wsURL(secondary.URL)) - require.NoError(t, err) - - client.reconnectDelay = 100 * time.Millisecond - client.registry.HealthCheckInterval = 100 * time.Millisecond - client.registry.ConsecutiveThreshold = 1 - client.registry.PromotionCooldown = 0 - - ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) - defer cancel() - - events := client.SubscribeMilestoneEvents(ctx) - - // Should failover to secondary. - select { - case m := <-events: - require.NotNil(t, m) - assert.Equal(t, 1, client.registry.Active()) - case <-ctx.Done(): - t.Fatal("timed out waiting for failover") - } - - // Close the rejecting primary and replace with an accepting one. - primaryReject.Close() - - primaryGood := newTestWSServer(t, false) - defer primaryGood.Close() - - // Update URL to the new primary that accepts connections. - client.mu.Lock() - client.urls[0] = wsURL(primaryGood.URL) - client.mu.Unlock() + fix := setupWSFailover(t, 100*time.Millisecond, 1, 0) + defer fix.cleanup(t) // Wait for background health registry to promote back to primary. require.Eventually(t, func() bool { - return client.registry.Active() == 0 + return fix.client.registry.Active() == 0 }, 5*time.Second, 50*time.Millisecond, "health registry should promote back to primary") - - require.NoError(t, client.Unsubscribe(ctx)) } func TestWSClient_DualURL_NoWrapOnLastURLFails(t *testing.T) { @@ -457,30 +417,40 @@ func TestWSClient_HealthRegistryRespectsUnsubscribe(t *testing.T) { time.Sleep(200 * time.Millisecond) } -// --- New health registry tests --- +// wsFailoverFixture holds the shared state for WS failover tests that start with +// a rejecting primary, failover to a milestone-serving secondary, then swap in a +// good primary to test promotion behavior. 
+type wsFailoverFixture struct { + client *HeimdallWSClient + ctx context.Context + cancel context.CancelFunc +} + +// setupWSFailover creates a rejecting primary and accepting secondary, subscribes +// to milestone events, waits for failover to secondary, then replaces the primary +// with an accepting server. The caller can then assert promotion behavior. +func setupWSFailover(t *testing.T, healthInterval time.Duration, threshold int, cooldown time.Duration) *wsFailoverFixture { + t.Helper() -func TestWSClient_Registry_ConsecutiveThreshold(t *testing.T) { - // Primary starts rejecting, secondary accepts. primaryReject := newTestWSServer(t, true) - defer primaryReject.Close() + t.Cleanup(primaryReject.Close) secondary := newTestWSServerWithMilestone(t) - defer secondary.Close() + t.Cleanup(secondary.Close) client, err := NewHeimdallWSClient(wsURL(primaryReject.URL), wsURL(secondary.URL)) require.NoError(t, err) client.reconnectDelay = 100 * time.Millisecond - client.registry.HealthCheckInterval = 50 * time.Millisecond - client.registry.ConsecutiveThreshold = 3 // need 3 consecutive successes - client.registry.PromotionCooldown = 0 + client.registry.HealthCheckInterval = healthInterval + client.registry.ConsecutiveThreshold = threshold + client.registry.PromotionCooldown = cooldown ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) - defer cancel() events := client.SubscribeMilestoneEvents(ctx) - // Failover to secondary. + // Wait for failover to secondary. select { case m := <-events: require.NotNil(t, m) @@ -490,68 +460,48 @@ func TestWSClient_Registry_ConsecutiveThreshold(t *testing.T) { // Replace rejecting primary with accepting one. primaryReject.Close() + primaryGood := newTestWSServer(t, false) - defer primaryGood.Close() + t.Cleanup(primaryGood.Close) client.mu.Lock() client.urls[0] = wsURL(primaryGood.URL) client.mu.Unlock() - // Should eventually promote after 3 consecutive successes. 
- require.Eventually(t, func() bool { - return client.registry.Active() == 0 - }, 5*time.Second, 50*time.Millisecond, "should promote after consecutive threshold met") - - require.NoError(t, client.Unsubscribe(ctx)) + return &wsFailoverFixture{client: client, ctx: ctx, cancel: cancel} } -func TestWSClient_Registry_PromotionCooldown(t *testing.T) { - primaryReject := newTestWSServer(t, true) - defer primaryReject.Close() - - secondary := newTestWSServerWithMilestone(t) - defer secondary.Close() - - client, err := NewHeimdallWSClient(wsURL(primaryReject.URL), wsURL(secondary.URL)) - require.NoError(t, err) - - client.reconnectDelay = 100 * time.Millisecond - client.registry.HealthCheckInterval = 50 * time.Millisecond - client.registry.ConsecutiveThreshold = 1 - client.registry.PromotionCooldown = 500 * time.Millisecond +func (f *wsFailoverFixture) cleanup(t *testing.T) { + t.Helper() - ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) - defer cancel() + defer f.cancel() + require.NoError(t, f.client.Unsubscribe(f.ctx)) +} - events := client.SubscribeMilestoneEvents(ctx) +// --- New health registry tests --- - // Failover to secondary. - select { - case m := <-events: - require.NotNil(t, m) - case <-ctx.Done(): - t.Fatal("timed out waiting for failover") - } +func TestWSClient_Registry_ConsecutiveThreshold(t *testing.T) { + fix := setupWSFailover(t, 50*time.Millisecond, 3, 0) + defer fix.cleanup(t) - // Replace primary with good one. - primaryReject.Close() - primaryGood := newTestWSServer(t, false) - defer primaryGood.Close() + // Should eventually promote after 3 consecutive successes. 
+ require.Eventually(t, func() bool { + return fix.client.registry.Active() == 0 + }, 5*time.Second, 50*time.Millisecond, "should promote after consecutive threshold met") +} - client.mu.Lock() - client.urls[0] = wsURL(primaryGood.URL) - client.mu.Unlock() +func TestWSClient_Registry_PromotionCooldown(t *testing.T) { + fix := setupWSFailover(t, 50*time.Millisecond, 1, 500*time.Millisecond) + defer fix.cleanup(t) // Should not promote immediately (cooldown not met). time.Sleep(150 * time.Millisecond) - assert.Equal(t, 1, client.registry.Active(), "should not promote before cooldown") + assert.Equal(t, 1, fix.client.registry.Active(), "should not promote before cooldown") // Wait for cooldown to pass. require.Eventually(t, func() bool { - return client.registry.Active() == 0 + return fix.client.registry.Active() == 0 }, 3*time.Second, 50*time.Millisecond, "should promote after cooldown passes") - - require.NoError(t, client.Unsubscribe(ctx)) } func TestWSClient_ProactiveSwitchSetsConnNil(t *testing.T) {