
Commit f9bf450

Authored by charlesma4, jelias2, and claude
feat(proxyd): add healthcheck and drain functionality from Uniswap fork (#534)
* feat(proxyd): add healthcheck and drain functionality from Uniswap fork
* Various fixes based on PR comments
* feat(proxyd): prometheus metrics for probe worker
* lint
* enforce nonnegative probe cfg vals
* fix(proxyd): Use client TLS config for probe worker
* remove insecureSkipVerify config var
* Update proxyd/backend.go
  Co-authored-by: Jacob Elias <19310318+jelias2@users.noreply.github.com>
* fixes for comments
* configurable graceful shutdown duration
* Cant repro test failures locally, trying adding 0s shutdown duration to all tests
* fix test
* fix: change graceful shutdown default to 0s and improve drain behavior (#2)

  Changes:
  - Set graceful_shutdown_seconds default to 0 (no delay) instead of 10s
  - Remove graceful_shutdown_seconds = 0 from all test configs (no longer needed)
  - Fix transport resource leak in ProbeWorker by closing connections on Stop()
  - Make Drain() actually reject new RPC and WebSocket requests during drain period

  Rationale: The healthcheck and drain functionality should be disabled by default. The previous 10-second default forced unnecessary changes to all existing test configurations. Now configs only need to set graceful_shutdown_seconds if they want to enable the drain delay.

  Co-authored-by: Claude Sonnet 4.5 <noreply@anthropic.com>

---------

Co-authored-by: Jacob Elias <19310318+jelias2@users.noreply.github.com>
Co-authored-by: Claude Sonnet 4.5 <noreply@anthropic.com>
1 parent 81a603d commit f9bf450

34 files changed: +858 −26 lines
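For context, a minimal sketch of the drain-then-shutdown sequencing the commit message describes. The server type and its Drain/Shutdown methods below are hypothetical stand-ins, not proxyd's actual API; only graceful_shutdown_seconds and Drain() are named in the commit message itself.

package main

import (
	"fmt"
	"time"
)

// drainableServer is a hypothetical stand-in; the real proxyd server types and
// method names are not shown in this diff. It only illustrates the ordering the
// commit message describes: stop accepting new RPC/WebSocket work, wait out the
// configured drain period, then shut down.
type drainableServer struct{ draining bool }

func (s *drainableServer) Drain()    { s.draining = true } // new requests rejected from here on
func (s *drainableServer) Shutdown() { fmt.Println("server stopped") }

func main() {
	// graceful_shutdown_seconds now defaults to 0 (no delay); set it > 0 to
	// give load balancers time to observe the drain before the process exits.
	gracefulShutdownSeconds := 0

	srv := &drainableServer{}
	srv.Drain()
	time.Sleep(time.Duration(gracefulShutdownSeconds) * time.Second)
	srv.Shutdown()
}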

proxyd/backend.go

Lines changed: 51 additions & 0 deletions
@@ -16,6 +16,7 @@ import (
 	"strconv"
 	"strings"
 	"sync"
+	"sync/atomic"
 	"time"
 
 	sw "github.com/ethereum-optimism/infra/proxyd/pkg/avg-sliding-window"
@@ -322,6 +323,11 @@ type Backend struct {
 	maxLatencyThreshold   time.Duration
 	maxErrorRateThreshold float64
 
+	probeSpec    *ProbeSpec
+	probeURL     string
+	ProbeWorker  *ProbeWorker
+	healthyProbe atomic.Bool
+
 	latencySlidingWindow            *sw.AvgSlidingWindow
 	networkRequestsSlidingWindow    *sw.AvgSlidingWindow
 	intermittentErrorsSlidingWindow *sw.AvgSlidingWindow
@@ -474,6 +480,32 @@ func WithIntermittentNetworkErrorSlidingWindow(sw *sw.AvgSlidingWindow) BackendOpt {
 	}
 }
 
+func WithProbe(probeURL string, probeFailureThreshold int, probeSuccessThreshold int, probePeriodSeconds int, probeTimeoutSeconds int) BackendOpt {
+	return func(b *Backend) {
+		b.probeURL = probeURL
+		probeSpec := ProbeSpec{
+			// default values
+			FailureThreshold: 1,
+			SuccessThreshold: 2,
+			Period:           4 * time.Second,
+			Timeout:          1 * time.Second,
+		}
+		if probeFailureThreshold > 0 {
+			probeSpec.FailureThreshold = probeFailureThreshold
+		}
+		if probeSuccessThreshold > 0 {
+			probeSpec.SuccessThreshold = probeSuccessThreshold
+		}
+		if probePeriodSeconds > 0 {
+			probeSpec.Period = time.Duration(probePeriodSeconds) * time.Second
+		}
+		if probeTimeoutSeconds > 0 {
+			probeSpec.Timeout = time.Duration(probeTimeoutSeconds) * time.Second
+		}
+		b.probeSpec = &probeSpec
+	}
+}
+
 type indexedReqRes struct {
 	index int
 	req   *RPCReq
@@ -527,7 +559,9 @@ func NewBackend(
 		networkRequestsSlidingWindow:    sw.NewSlidingWindow(),
 		intermittentErrorsSlidingWindow: sw.NewSlidingWindow(),
 		allowedStatusCodes:              []int{400, 413}, // Alchemy returns a 400 on bad JSONs, and Quicknode returns a 413 on too large requests
+
 	}
+	backend.healthyProbe.Store(true)
 
 	backend.Override(opts...)
 
@@ -890,6 +924,9 @@ func (b *Backend) doForward(ctx context.Context, rpcReqs []*RPCReq, isBatch bool
 
 // IsHealthy checks if the backend is able to serve traffic, based on dynamic parameters
 func (b *Backend) IsHealthy() bool {
+	if !b.IsProbeHealthy() {
+		return false
+	}
 	errorRate := b.ErrorRate()
 	avgLatency := time.Duration(b.latencySlidingWindow.Avg())
 	if errorRate >= b.maxErrorRateThreshold {
@@ -901,6 +938,20 @@ func (b *Backend) IsHealthy() bool {
 	return true
 }
 
+func (b *Backend) IsProbeHealthy() bool {
+	if b.probeSpec == nil {
+		return true
+	}
+	return b.healthyProbe.Load()
+}
+
+func (b *Backend) SetProbeHealth(healthy bool) {
+	if b.probeSpec == nil {
+		return
+	}
+	b.healthyProbe.Store(healthy)
+}
+
 // ErrorRate returns the instant error rate of the backend
 func (b *Backend) ErrorRate() (errorRate float64) {
 	// we only really start counting the error rate after a minimum of 10 requests
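WithProbe treats any non-positive argument as "use the default", so callers only have to set the knobs they care about. A minimal sketch (not part of the commit) of that fallback behavior; it is assumed to sit inside package proxyd, since probeSpec is unexported, and the probe URL is a placeholder:

// Sketch only: shows how zero arguments to WithProbe fall back to the built-in defaults.
func ExampleWithProbe() {
	b := &Backend{}
	WithProbe("http://backend.internal/healthz", 3, 0, 0, 0)(b) // placeholder URL
	fmt.Println(b.probeSpec.FailureThreshold) // 3  (explicitly set)
	fmt.Println(b.probeSpec.SuccessThreshold) // 2  (default)
	fmt.Println(b.probeSpec.Period)           // 4s (default)
	fmt.Println(b.probeSpec.Timeout)          // 1s (default)
}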

proxyd/backend_probe.go

Lines changed: 210 additions & 0 deletions
@@ -0,0 +1,210 @@ (new file)

// backend_probe.go implements HTTP health check probing for backend servers.
//
// The probe system is inspired by Kubernetes liveness/readiness probes and provides:
//   - Periodic HTTP health checks against a configurable endpoint
//   - Configurable success/failure thresholds to prevent flapping
//   - Async operation via a background goroutine per backend
//
// # Usage
//
// When a backend is configured with a probe_url, a ProbeWorker runs in the background,
// periodically checking the endpoint. The backend is only marked unhealthy after
// FailureThreshold consecutive failures, and only marked healthy after SuccessThreshold
// consecutive successes. This threshold behavior prevents health status from flapping
// due to transient network issues.
//
// # Configuration
//
// ProbeSpec controls the probe behavior:
//   - FailureThreshold: consecutive failures before marking unhealthy (default: 1)
//   - SuccessThreshold: consecutive successes before marking healthy (default: 2)
//   - Period: interval between probes (default: 4s)
//   - Timeout: HTTP request timeout per probe (default: 4s)
//
// # HTTP Probe Behavior
//
// The probe sends a GET request to the configured URL. Success is determined by:
//   - 2xx status codes: success
//   - 3xx (redirects), 4xx, 5xx, or connection errors: failure
//
// The probe uses a custom dialer with SO_LINGER set (borrowed from Kubernetes) to ensure
// clean connection teardown, and disables keep-alives to get fresh connection state each probe.
package proxyd

import (
	"crypto/tls"
	"fmt"
	"io"
	"math/rand"
	"net"
	"net/http"
	"net/url"
	"syscall"
	"time"
)

type ProbeSpec struct {
	FailureThreshold int
	SuccessThreshold int
	Period           time.Duration
	Timeout          time.Duration
}

// borrowed from https://github.com/kubernetes/kubernetes/blob/b53b9fb5573323484af9a19cf3f5bfe80760abba/pkg/probe/dialer_others.go#L37
// probeDialer is a dialer that sets the SO_LINGER option to 1 second.
func probeDialer() *net.Dialer {
	dialer := &net.Dialer{
		Control: func(network, address string, c syscall.RawConn) error {
			return c.Control(func(fd uintptr) {
				_ = syscall.SetsockoptLinger(int(fd), syscall.SOL_SOCKET, syscall.SO_LINGER, &syscall.Linger{Onoff: 1, Linger: 1})
			})
		},
	}
	return dialer
}

var defaultTransport = http.DefaultTransport.(*http.Transport)

func doHTTPProbe(req *http.Request, client *http.Client) (bool, string) {
	res, err := client.Do(req)
	if err != nil {
		// Convert errors into failures to catch timeouts.
		return false, err.Error()
	}
	defer res.Body.Close()
	if _, err = io.ReadAll(res.Body); err != nil {
		return false, err.Error()
	}
	if res.StatusCode >= http.StatusOK && res.StatusCode < http.StatusBadRequest {
		if res.StatusCode >= http.StatusMultipleChoices { // Redirect
			return false, fmt.Sprintf("HTTP Probe result is a redirect: %s", res.Status)
		}
		return true, ""
	}
	return false, fmt.Sprintf("HTTP probe failed with statuscode: %d", res.StatusCode)
}

type ProbeWorker struct {
	stopCh        chan struct{}
	spec          ProbeSpec
	transport     *http.Transport
	req           *http.Request
	resultHandler func(bool, string)
	lastResult    bool
	resultRun     int
	backendName   string
}

func NewProbeWorker(
	backendName string,
	probeUrl string,
	probeSpec ProbeSpec,
	resultHandler func(bool, string),
	tlsConfig *tls.Config,
) (*ProbeWorker, error) {

	u, err := url.Parse(probeUrl)
	if err != nil {
		return nil, err
	}
	req, err := http.NewRequest("GET", u.String(), nil)
	if err != nil {
		return nil, err
	}

	req.Header = http.Header{
		"User-Agent": {"proxyd-probe"},
		"Accept":     {"*/*"},
	}

	// Use provided TLS config or fall back to default (secure) configuration
	if tlsConfig == nil {
		tlsConfig = &tls.Config{}
	}

	transport := &http.Transport{
		TLSClientConfig:     tlsConfig,
		TLSHandshakeTimeout: defaultTransport.TLSHandshakeTimeout,
		DisableKeepAlives:   true,
		DisableCompression:  true,
		DialContext:         probeDialer().DialContext,
		IdleConnTimeout:     defaultTransport.IdleConnTimeout,
	}

	return &ProbeWorker{
		stopCh:        make(chan struct{}, 1), // Buffer so stop() can be non-blocking.
		spec:          probeSpec,
		resultHandler: resultHandler,
		transport:     transport,
		req:           req,
		backendName:   backendName,
	}, nil
}

func (w *ProbeWorker) run() {
	probeTickerPeriod := w.spec.Period

	// first wait period is random to avoid simultaneous probes
	time.Sleep(time.Duration(rand.Float64() * float64(probeTickerPeriod)))

	probeTicker := time.NewTicker(probeTickerPeriod)

	defer func() {
		// Clean up.
		probeTicker.Stop()
	}()

probeLoop:
	for {
		w.doProbe()
		// Wait for next probe tick.
		select {
		case <-w.stopCh:
			break probeLoop
		case <-probeTicker.C:
			// continue
		}
	}
}

func (w *ProbeWorker) Stop() {
	select {
	case w.stopCh <- struct{}{}:
	default: // Non-blocking.
	}
}

func (w *ProbeWorker) Start() {
	go w.run()
}

func (w *ProbeWorker) doProbe() {
	client := &http.Client{
		Timeout:   w.spec.Timeout,
		Transport: w.transport,
	}

	start := time.Now()
	result, message := doHTTPProbe(w.req, client)
	duration := time.Since(start)

	RecordBackendProbeDuration(w.backendName, duration)
	RecordBackendProbeCheck(w.backendName, result)

	if w.lastResult == result {
		w.resultRun++
	} else {
		w.lastResult = result
		w.resultRun = 1
	}

	if (!result && w.resultRun < int(w.spec.FailureThreshold)) ||
		(result && w.resultRun < int(w.spec.SuccessThreshold)) {
		// Success or failure is below threshold - leave the probe state unchanged.
		return
	}

	RecordBackendProbeHealthy(w.backendName, result)
	w.resultHandler(result, message)
}
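To tie the two files together: a sketch of how a ProbeWorker might be wired to a Backend's probe health, using only symbols added in this commit. The backend label, the URL, and the startProbe helper itself are placeholders; proxyd's actual wiring is not shown in this diff.

// Sketch only; assumes package proxyd. The result handler runs only after the
// configured success/failure threshold is crossed, so transient blips do not
// flip the backend's health.
func startProbe(backend *Backend) (*ProbeWorker, error) {
	spec := ProbeSpec{
		FailureThreshold: 1,
		SuccessThreshold: 2,
		Period:           4 * time.Second,
		Timeout:          1 * time.Second,
	}
	worker, err := NewProbeWorker(
		"example-backend",                 // label used in the probe metrics
		"http://backend.internal/healthz", // placeholder probe URL
		spec,
		func(healthy bool, msg string) {
			// No-op unless the backend was built with WithProbe; otherwise this
			// flips the atomic flag read by IsHealthy via IsProbeHealthy.
			backend.SetProbeHealth(healthy)
		},
		nil, // nil TLS config falls back to a default (secure) configuration
	)
	if err != nil {
		return nil, err
	}
	worker.Start() // probes on a jittered ticker in a background goroutine
	return worker, nil
}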
