Skip to content
Open
Show file tree
Hide file tree
Changes from 13 commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
1629daf
heimdall: added initial implementation of heimdall RPC fallback
pratikspatil024 Feb 10, 2026
fe0c07b
added comment for clarification
pratikspatil024 Feb 11, 2026
fe49be3
reduced the cooldown time to 2 minutes
pratikspatil024 Feb 11, 2026
0c8c0a6
added more unit tests
pratikspatil024 Feb 11, 2026
93bd0e6
lint and duplication fix
pratikspatil024 Feb 11, 2026
a02d07f
1 more unit test
pratikspatil024 Feb 11, 2026
b66fe79
Merge branch 'develop' of github.com:0xPolygon/bor into psp-pos-3181
pratikspatil024 Feb 11, 2026
8a9d2f7
added failover for heimdall grpc and ws clients
pratikspatil024 Feb 12, 2026
d4df759
added tests
pratikspatil024 Feb 12, 2026
25b7cad
accepting a list of urls (multiple) instead of just secondary url
pratikspatil024 Feb 12, 2026
4d44077
code duplication fix
pratikspatil024 Feb 12, 2026
de26840
added more tests
pratikspatil024 Feb 12, 2026
27f53b9
code duplication fix
pratikspatil024 Feb 12, 2026
6cc879a
addressed comment: rename FailoverHeimdallClient to MultiHeimdallClient
pratikspatil024 Feb 13, 2026
39eda15
added timeout on cascade/secondary calls
pratikspatil024 Feb 23, 2026
4709ad6
added a few checks to prevent panic
pratikspatil024 Feb 23, 2026
1d8befe
do not failover on 4xx codes
pratikspatil024 Feb 23, 2026
c75f3c0
ws now has linear cap rather than circular wrap
pratikspatil024 Feb 23, 2026
3825a5d
added background health-check for heimdall failover
pratikspatil024 Feb 23, 2026
b0cc4f5
updated log
pratikspatil024 Feb 23, 2026
01b24b4
added metrics to track failover
pratikspatil024 Feb 23, 2026
673cc38
Merge branch 'develop' of github.com:0xPolygon/bor into psp-pos-3181
pratikspatil024 Feb 23, 2026
200d899
fix lint
pratikspatil024 Feb 23, 2026
be4fe9d
updated the health check logic and some minor improvements
pratikspatil024 Feb 24, 2026
b170f03
fix lint and improvements
pratikspatil024 Feb 24, 2026
c3a946b
reduced duplication in health registry, and fixed a bug in ws
pratikspatil024 Feb 24, 2026
60457e0
fixed a race condition in ws
pratikspatil024 Feb 24, 2026
30f07b7
fixed a potential deadlock
pratikspatil024 Feb 24, 2026
cbf6924
replaced the sequential probe loop with concurrent goroutines
pratikspatil024 Feb 24, 2026
716a9e8
reduced the failover switchover time
pratikspatil024 Feb 25, 2026
c2c65ae
Merge branch 'develop' of github.com:0xPolygon/bor into psp-pos-3181
pratikspatil024 Feb 25, 2026
478759a
reduced code duplication in tests
pratikspatil024 Feb 25, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 6 additions & 6 deletions cmd/utils/bor_flags.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,10 +16,10 @@ var (
// Bor Specific flags
//

// HeimdallURLFlag flag for heimdall url
// HeimdallURLFlag flag for heimdall url (comma-separated for failover)
HeimdallURLFlag = &cli.StringFlag{
Name: "bor.heimdall",
Usage: "URL of Heimdall service",
Usage: "URL of Heimdall service (comma-separated for failover: \"url1,url2\")",
Value: "http://localhost:1317",
}

Expand All @@ -36,17 +36,17 @@ var (
Usage: "Run without Heimdall service (for testing purpose)",
}

// HeimdallgRPCAddressFlag flag for heimdall gRPC address
// HeimdallgRPCAddressFlag flag for heimdall gRPC address (comma-separated for failover)
HeimdallgRPCAddressFlag = &cli.StringFlag{
Name: "bor.heimdallgRPC",
Usage: "Address of Heimdall gRPC service",
Usage: "Address of Heimdall gRPC service (comma-separated for failover: \"addr1,addr2\")",
Value: "",
}

// HeimdallWSAddressFlag flag for heimdall websocket subscription service
// HeimdallWSAddressFlag flag for heimdall websocket subscription service (comma-separated for failover)
HeimdallWSAddressFlag = &cli.StringFlag{
Name: "bor.heimdallWS",
Usage: "Address of Heimdall WS Subscription service",
Usage: "Address of Heimdall WS Subscription service (comma-separated for failover: \"addr1,addr2\")",
Value: "",
}

Expand Down
277 changes: 277 additions & 0 deletions consensus/bor/heimdall/failover_client.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,277 @@
package heimdall

import (
"context"
"errors"
"net"
"sync"
"time"

"github.com/0xPolygon/heimdall-v2/x/bor/types"
ctypes "github.com/cometbft/cometbft/rpc/core/types"

"github.com/ethereum/go-ethereum/consensus/bor/clerk"
"github.com/ethereum/go-ethereum/consensus/bor/heimdall/checkpoint"
"github.com/ethereum/go-ethereum/consensus/bor/heimdall/milestone"
"github.com/ethereum/go-ethereum/log"
)

const (
	// defaultAttemptTimeout bounds a single call against one heimdall endpoint.
	defaultAttemptTimeout = 30 * time.Second
	// defaultSecondaryCooldown is how long to stay on a non-primary client
	// before probing the primary again.
	defaultSecondaryCooldown = 2 * time.Minute
)

// Endpoint matches bor.IHeimdallClient. It is exported so that external
// packages can build []Endpoint slices for NewFailoverHeimdallClient without
// running into Go's covariant-slice restriction.
type Endpoint interface {
	// StateSyncEvents returns clerk events starting at fromID up to block `to`.
	StateSyncEvents(ctx context.Context, fromID uint64, to int64) ([]*clerk.EventRecordWithTime, error)
	// GetSpan fetches the span with the given ID.
	GetSpan(ctx context.Context, spanID uint64) (*types.Span, error)
	// GetLatestSpan fetches the most recent span.
	GetLatestSpan(ctx context.Context) (*types.Span, error)
	// FetchCheckpoint fetches checkpoint `number` (semantics of negative values
	// are defined by the underlying client implementation).
	FetchCheckpoint(ctx context.Context, number int64) (*checkpoint.Checkpoint, error)
	// FetchCheckpointCount returns the total number of checkpoints.
	FetchCheckpointCount(ctx context.Context) (int64, error)
	// FetchMilestone fetches the latest milestone.
	FetchMilestone(ctx context.Context) (*milestone.Milestone, error)
	// FetchMilestoneCount returns the total number of milestones.
	FetchMilestoneCount(ctx context.Context) (int64, error)
	// FetchStatus returns the node's sync status.
	FetchStatus(ctx context.Context) (*ctypes.SyncInfo, error)
	// Close releases any resources held by the client.
	Close()
}

// FailoverHeimdallClient wraps N heimdall clients (primary at index 0, failovers
// at 1..N-1) and transparently cascades through them when the active client is
// unreachable. After a cooldown period it probes the primary again.
type FailoverHeimdallClient struct {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does MultiHeimdallClient sound better? Since this structure does not only contain the failover client, but also the primary client.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

updated here, thanks!

clients []Endpoint
mu sync.Mutex
active int // 0 = primary, >0 = failover
lastSwitch time.Time // when we last switched away from primary
attemptTimeout time.Duration
cooldown time.Duration
}

// NewFailoverHeimdallClient builds a client that treats clients[0] as the
// primary endpoint and the remaining entries as ordered failovers, using the
// default per-attempt timeout and primary-probe cooldown. At least one client
// should be supplied; calls made with an empty client set cannot succeed.
func NewFailoverHeimdallClient(clients ...Endpoint) *FailoverHeimdallClient {
	c := &FailoverHeimdallClient{
		clients:        clients,
		attemptTimeout: defaultAttemptTimeout,
		cooldown:       defaultSecondaryCooldown,
	}

	return c
}

// StateSyncEvents proxies to the active client, cascading to failovers on
// transport errors.
func (f *FailoverHeimdallClient) StateSyncEvents(ctx context.Context, fromID uint64, to int64) ([]*clerk.EventRecordWithTime, error) {
	call := func(ctx context.Context, c Endpoint) ([]*clerk.EventRecordWithTime, error) {
		return c.StateSyncEvents(ctx, fromID, to)
	}

	return callWithFailover(f, ctx, call)
}

// GetSpan proxies to the active client, cascading to failovers on transport
// errors.
func (f *FailoverHeimdallClient) GetSpan(ctx context.Context, spanID uint64) (*types.Span, error) {
	call := func(ctx context.Context, c Endpoint) (*types.Span, error) {
		return c.GetSpan(ctx, spanID)
	}

	return callWithFailover(f, ctx, call)
}

// GetLatestSpan proxies to the active client, cascading to failovers on
// transport errors.
func (f *FailoverHeimdallClient) GetLatestSpan(ctx context.Context) (*types.Span, error) {
	call := func(ctx context.Context, c Endpoint) (*types.Span, error) {
		return c.GetLatestSpan(ctx)
	}

	return callWithFailover(f, ctx, call)
}

// FetchCheckpoint proxies to the active client, cascading to failovers on
// transport errors.
func (f *FailoverHeimdallClient) FetchCheckpoint(ctx context.Context, number int64) (*checkpoint.Checkpoint, error) {
	call := func(ctx context.Context, c Endpoint) (*checkpoint.Checkpoint, error) {
		return c.FetchCheckpoint(ctx, number)
	}

	return callWithFailover(f, ctx, call)
}

// FetchCheckpointCount proxies to the active client, cascading to failovers
// on transport errors.
func (f *FailoverHeimdallClient) FetchCheckpointCount(ctx context.Context) (int64, error) {
	call := func(ctx context.Context, c Endpoint) (int64, error) {
		return c.FetchCheckpointCount(ctx)
	}

	return callWithFailover(f, ctx, call)
}

// FetchMilestone proxies to the active client, cascading to failovers on
// transport errors.
func (f *FailoverHeimdallClient) FetchMilestone(ctx context.Context) (*milestone.Milestone, error) {
	call := func(ctx context.Context, c Endpoint) (*milestone.Milestone, error) {
		return c.FetchMilestone(ctx)
	}

	return callWithFailover(f, ctx, call)
}

// FetchMilestoneCount proxies to the active client, cascading to failovers
// on transport errors.
func (f *FailoverHeimdallClient) FetchMilestoneCount(ctx context.Context) (int64, error) {
	call := func(ctx context.Context, c Endpoint) (int64, error) {
		return c.FetchMilestoneCount(ctx)
	}

	return callWithFailover(f, ctx, call)
}

// FetchStatus proxies to the active client, cascading to failovers on
// transport errors.
func (f *FailoverHeimdallClient) FetchStatus(ctx context.Context) (*ctypes.SyncInfo, error) {
	call := func(ctx context.Context, c Endpoint) (*ctypes.SyncInfo, error) {
		return c.FetchStatus(ctx)
	}

	return callWithFailover(f, ctx, call)
}

// Close shuts down every wrapped client, not only the active one.
func (f *FailoverHeimdallClient) Close() {
	for i := range f.clients {
		f.clients[i].Close()
	}
}

// callWithFailover executes fn against the active client. If the active client
// fails with a failover-eligible error, it cascades through remaining clients.
// If on a non-primary client past the cooldown, it probes the primary first.
func callWithFailover[T any](f *FailoverHeimdallClient, ctx context.Context, fn func(context.Context, Endpoint) (T, error)) (T, error) {
f.mu.Lock()
active := f.active
shouldProbe := active != 0 && time.Since(f.lastSwitch) >= f.cooldown
f.mu.Unlock()

// If on a non-primary client and cooldown has elapsed, probe primary
if shouldProbe {
subCtx, cancel := context.WithTimeout(ctx, f.attemptTimeout)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This may be risky if primary is still not up after the cooldown, resulting a lag. Instead of querying on receiving a new request, we can probe primary in a background goroutine, and when it is back up, update the active accordingly.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

added a background health check here. Thanks!

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

did some updates here

result, err := fn(subCtx, f.clients[0])
cancel()

if err == nil {
f.mu.Lock()
f.active = 0
f.mu.Unlock()

log.Info("Heimdall failover: primary recovered, switching back")

return result, nil
}

if !isFailoverError(err, ctx) {
var zero T
return zero, err
}

// Primary still down, stay on current client
f.mu.Lock()
f.lastSwitch = time.Now()
f.mu.Unlock()

log.Debug("Heimdall failover: primary still down after probe, staying on current", "active", active, "err", err)

// Try current client, then cascade through remaining on failure
result, err = fn(ctx, f.clients[active])
if err == nil {
return result, nil
}

if !isFailoverError(err, ctx) {
var zero T
return zero, err
}

return cascadeClients(f, ctx, fn, active, err)
}

if active != 0 {
// On a non-primary client, not yet time to probe: use current directly
result, err := fn(ctx, f.clients[active])
if err == nil {
return result, nil
}

if !isFailoverError(err, ctx) {
var zero T
return zero, err
}

return cascadeClients(f, ctx, fn, active, err)
}

// Active is primary: try with timeout
subCtx, cancel := context.WithTimeout(ctx, f.attemptTimeout)
result, err := fn(subCtx, f.clients[0])
cancel()

if err == nil {
return result, nil
}

if !isFailoverError(err, ctx) {
var zero T
return zero, err
}

// Cascade through clients [1, 2, ..., N-1]
log.Warn("Heimdall failover: primary failed, cascading to next client", "err", err)

return cascadeClients(f, ctx, fn, 0, err)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same as above - by the time when we discover a failure in backup client, it might be too late to switch over to the next one if the request times out, causing a span rotation. We can always probe the active client in a background goroutine, which will automatically update the active if something goes wrong with the current one.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

added a background health check here. Thanks!

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

did some updates here

}

// cascadeClients tries clients after the given index. On first success it
// switches the active client and returns. If all fail, returns the last error.
//
// Fix: each attempt is now bounded by f.attemptTimeout (previously cascade
// calls ran on the raw caller context, so one hung backup could block the
// caller and prevent trying the remaining clients in time).
func cascadeClients[T any](f *FailoverHeimdallClient, ctx context.Context, fn func(context.Context, Endpoint) (T, error), after int, lastErr error) (T, error) {
	var zero T

	for i := after + 1; i < len(f.clients); i++ {
		subCtx, cancel := context.WithTimeout(ctx, f.attemptTimeout)
		result, err := fn(subCtx, f.clients[i])
		cancel()

		if err == nil {
			f.mu.Lock()
			f.active = i
			f.lastSwitch = time.Now()
			f.mu.Unlock()

			log.Warn("Heimdall failover: switched to client", "index", i)

			return result, nil
		}

		lastErr = err

		// A non-transport error (e.g. caller cancellation, 4xx) ends the
		// cascade immediately.
		if !isFailoverError(err, ctx) {
			return zero, err
		}
	}

	return zero, lastErr
}

// isFailoverError reports whether err warrants trying the next client.
// It distinguishes sub-context timeouts (failover-eligible) from caller
// context cancellation (not eligible).
func isFailoverError(err error, callerCtx context.Context) bool {
	switch {
	case err == nil:
		return false
	case callerCtx.Err() != nil:
		// The caller itself gave up; this is not a failover scenario.
		return false
	case errors.Is(err, ErrShutdownDetected):
		// Shutdown detected - not a transport error.
		return false
	case errors.Is(err, ErrServiceUnavailable):
		// 503 is a Heimdall feature-gate, not a transport issue.
		return false
	}

	// Transport errors are always failover-eligible.
	var netErr net.Error
	if errors.As(err, &netErr) {
		return true
	}

	switch {
	case errors.Is(err, ErrNoResponse):
		// No response from Heimdall.
		return true
	case errors.Is(err, ErrNotSuccessfulResponse):
		// Non-successful HTTP response (4xx, 5xx excluding 503).
		return true
	case errors.Is(err, context.DeadlineExceeded), errors.Is(err, context.Canceled):
		// Expiry/cancellation of the per-attempt sub-context; the caller's
		// context is known to be alive at this point.
		return true
	}

	return false
}
Loading
Loading