feat: improve breaker by combining error rate and latency

kevwan · kevwan · commit 63693d68690a · 2026-01-09T23:54:59.000+08:00
Signed-off-by: kevin <wanjunfeng@gmail.com>
diff --git a/core/breaker/breaker.go b/core/breaker/breaker.go
@@ -92,7 +92,8 @@ type (
 	}
 
 	circuitBreaker struct {
-		name string
+		name    string
+		timeout time.Duration
 		throttle
 	}
 
@@ -117,7 +118,7 @@ func NewBreaker(opts ...Option) Breaker {
 	if len(b.name) == 0 {
 		b.name = stringx.Rand()
 	}
-	b.throttle = newLoggedThrottle(b.name, newGoogleBreaker())
+	b.throttle = newLoggedThrottle(b.name, newGoogleBreaker(b.timeout))
 
 	return &b
 }
@@ -202,6 +203,12 @@ func WithName(name string) Option {
 	}
 }
 
+func WithTimeout(timeout time.Duration) Option {
+	return func(b *circuitBreaker) {
+		b.timeout = timeout
+	}
+}
+
 func defaultAcceptable(err error) bool {
 	return err == nil
 }
diff --git a/core/breaker/breaker_test.go b/core/breaker/breaker_test.go
@@ -388,6 +388,21 @@ func BenchmarkGoogleBreaker(b *testing.B) {
 	}
 }
 
+func TestWithTimeout(t *testing.T) {
+	timeout := 100 * time.Millisecond
+	b := NewBreaker(WithTimeout(timeout))
+	cb, ok := b.(*circuitBreaker)
+	assert.True(t, ok)
+	assert.Equal(t, timeout, cb.timeout)
+}
+
+func TestNopPromise(t *testing.T) {
+	var p nopPromise
+	// These methods should not panic and do nothing
+	p.Accept()
+	p.Reject("test reason")
+}
+
 type mockedPromise struct{}
 
 func (m *mockedPromise) Accept() {
diff --git a/core/breaker/googlebreaker.go b/core/breaker/googlebreaker.go
@@ -1,6 +1,8 @@
 package breaker
 
 import (
+	"math"
+	"sync/atomic"
 	"time"
 
 	"github.com/zeromicro/go-zero/core/collection"
@@ -11,22 +13,31 @@ import (
 
 const (
 	// 250ms for bucket duration
-	window            = time.Second * 10
-	buckets           = 40
-	forcePassDuration = time.Second
-	k                 = 1.5
-	minK              = 1.1
-	protection        = 5
+	window                      = time.Second * 10
+	buckets                     = 40
+	forcePassDuration           = time.Second
+	k                           = 1.5
+	minK                        = 1.1
+	protection                  = 5
+	latencyActivationMultiplier = 3
+	latencyCeilingRatio         = 0.95
+	latencyBaselineDecayBeta    = 0.25
+	latencyBaselineRiseBeta     = 0.01
+	latencyCurrentBeta          = 0.25
+	latencyMaxDropRatio         = 0.3
 )
 
 // googleBreaker is a netflixBreaker pattern from google.
 // see Client-Side Throttling section in https://landing.google.com/sre/sre-book/chapters/handling-overload/
 type (
 	googleBreaker struct {
-		k        float64
-		stat     *collection.RollingWindow[int64, *bucket]
-		proba    *mathx.Proba
-		lastPass *syncx.AtomicDuration
+		k                float64
+		stat             *collection.RollingWindow[int64, *bucket]
+		proba            *mathx.Proba
+		lastPass         *syncx.AtomicDuration
+		timeoutUs        int64
+		noLoadLatencyUs  int64
+		currentLatencyUs int64
 	}
 
 	windowResult struct {
@@ -37,27 +48,25 @@ type (
 	}
 )
 
-func newGoogleBreaker() *googleBreaker {
+func newGoogleBreaker(timeout time.Duration) *googleBreaker {
 	bucketDuration := time.Duration(int64(window) / int64(buckets))
-	st := collection.NewRollingWindow[int64, *bucket](func() *bucket {
+	st := collection.NewRollingWindow(func() *bucket {
 		return new(bucket)
 	}, buckets, bucketDuration)
 	return &googleBreaker{
-		stat:     st,
-		k:        k,
-		proba:    mathx.NewProba(),
-		lastPass: syncx.NewAtomicDuration(),
+		stat:      st,
+		k:         k,
+		proba:     mathx.NewProba(),
+		lastPass:  syncx.NewAtomicDuration(),
+		timeoutUs: timeout.Microseconds(),
 	}
 }
 
 func (b *googleBreaker) accept() error {
-	var w float64
 	history := b.history()
-	w = b.k - (b.k-minK)*float64(history.failingBuckets)/buckets
-	weightedAccepts := mathx.AtLeast(w, minK) * float64(history.accepts)
-	// https://landing.google.com/sre/sre-book/chapters/handling-overload/#eq2101
-	// for better performance, no need to care about the negative ratio
-	dropRatio := (float64(history.total-protection) - weightedAccepts) / float64(history.total+1)
+	errorRatio := b.calcK(history)
+	latencyRatio := b.calcLatencyRatio()
+	dropRatio := math.Max(errorRatio, latencyRatio)
 	if dropRatio <= 0 {
 		return nil
 	}
@@ -86,10 +95,40 @@ func (b *googleBreaker) allow() (internalPromise, error) {
 	}
 
 	return googlePromise{
-		b: b,
+		b:     b,
+		start: timex.Now(),
 	}, nil
 }
 
+func (b *googleBreaker) calcK(history windowResult) float64 {
+	w := b.k - (b.k-minK)*float64(history.failingBuckets)/buckets
+	weightedAccepts := mathx.AtLeast(w, minK) * float64(history.accepts)
+	// https://landing.google.com/sre/sre-book/chapters/handling-overload/#eq2101
+	// for better performance, no need to care about the negative ratio
+	return (float64(history.total-protection) - weightedAccepts) / float64(history.total+1)
+}
+
+func (b *googleBreaker) calcLatencyRatio() float64 {
+	if b.timeoutUs <= 0 {
+		return 0
+	}
+
+	noLoadLatency := atomic.LoadInt64(&b.noLoadLatencyUs)
+	currentLatencyUs := atomic.LoadInt64(&b.currentLatencyUs)
+	if noLoadLatency <= 0 || currentLatencyUs <= 0 {
+		return 0
+	}
+
+	threshold := noLoadLatency * latencyActivationMultiplier
+	ceiling := int64(float64(b.timeoutUs) * latencyCeilingRatio)
+	if currentLatencyUs < threshold || ceiling <= threshold {
+		return 0
+	}
+
+	ratio := float64(currentLatencyUs-threshold) / float64(ceiling-threshold)
+	return math.Min(1.0, math.Max(0.0, ratio)) * latencyMaxDropRatio
+}
+
 func (b *googleBreaker) doReq(req func() error, fallback Fallback, acceptable Acceptable) error {
 	if err := b.accept(); err != nil {
 		b.markDrop()
@@ -101,10 +140,11 @@ func (b *googleBreaker) doReq(req func() error, fallback Fallback, acceptable Ac
 	}
 
 	var succ bool
+	start := timex.Now()
 	defer func() {
 		// if req() panic, success is false, mark as failure
 		if succ {
-			b.markSuccess()
+			b.markSuccess(timex.Since(start).Microseconds())
 		} else {
 			b.markFailure()
 		}
@@ -118,18 +158,6 @@ func (b *googleBreaker) doReq(req func() error, fallback Fallback, acceptable Ac
 	return err
 }
 
-func (b *googleBreaker) markDrop() {
-	b.stat.Add(drop)
-}
-
-func (b *googleBreaker) markFailure() {
-	b.stat.Add(fail)
-}
-
-func (b *googleBreaker) markSuccess() {
-	b.stat.Add(success)
-}
-
 func (b *googleBreaker) history() windowResult {
 	var result windowResult
 
@@ -151,12 +179,70 @@ func (b *googleBreaker) history() windowResult {
 	return result
 }
 
+func (b *googleBreaker) markDrop() {
+	b.stat.Add(drop)
+}
+
+func (b *googleBreaker) markFailure() {
+	b.stat.Add(fail)
+}
+
+func (b *googleBreaker) markSuccess(latencyUs int64) {
+	b.stat.Add(success)
+	if b.timeoutUs > 0 {
+		b.updateLatency(latencyUs)
+	}
+}
+
+func (b *googleBreaker) updateBaselineLatency(latencyUs int64) {
+	noLoadLatency := atomic.LoadInt64(&b.noLoadLatencyUs)
+	if noLoadLatency <= 0 {
+		atomic.StoreInt64(&b.noLoadLatencyUs, latencyUs)
+		return
+	}
+
+	var beta float64
+	if latencyUs < noLoadLatency {
+		// Fast decay when latency decreases
+		beta = latencyBaselineDecayBeta
+	} else {
+		// Slow rise when latency increases
+		beta = latencyBaselineRiseBeta
+	}
+
+	newBaseline := int64(beta*float64(latencyUs) + (1-beta)*float64(noLoadLatency))
+	atomic.StoreInt64(&b.noLoadLatencyUs, newBaseline)
+}
+
+func (b *googleBreaker) updateCurrentLatency(latencyUs int64) {
+	currentLatency := atomic.LoadInt64(&b.currentLatencyUs)
+	if currentLatency <= 0 {
+		atomic.StoreInt64(&b.currentLatencyUs, latencyUs)
+		return
+	}
+
+	// Fast EMA to update current latency
+	newCurrent := int64(latencyCurrentBeta*float64(latencyUs) + (1-latencyCurrentBeta)*float64(currentLatency))
+	atomic.StoreInt64(&b.currentLatencyUs, newCurrent)
+}
+
+func (b *googleBreaker) updateLatency(latencyUs int64) {
+	if latencyUs <= 0 || b.timeoutUs <= 0 {
+		return
+	}
+
+	b.updateBaselineLatency(latencyUs)
+	b.updateCurrentLatency(latencyUs)
+}
+
 type googlePromise struct {
-	b *googleBreaker
+	b     *googleBreaker
+	start time.Duration
 }
 
 func (p googlePromise) Accept() {
-	p.b.markSuccess()
+	latencyUs := timex.Since(p.start).Microseconds()
+	p.b.markSuccess(latencyUs)
 }
 
 func (p googlePromise) Reject() {
diff --git a/core/breaker/googlebreaker_test.go b/core/breaker/googlebreaker_test.go

Original file line number	Diff line number	Diff line change
`@@ -92,7 +92,8 @@ type (`
`92`	`92`	`}`
`93`	`93`
`94`	`94`	`circuitBreaker struct {`
`95`		`- name string`
	`95`	`+ name string`
	`96`	`+ timeout time.Duration`
`96`	`97`	`throttle`
`97`	`98`	`}`
`98`	`99`
`@@ -117,7 +118,7 @@ func NewBreaker(opts ...Option) Breaker {`
`117`	`118`	`if len(b.name) == 0 {`
`118`	`119`	`b.name = stringx.Rand()`
`119`	`120`	`}`
`120`		`- b.throttle = newLoggedThrottle(b.name, newGoogleBreaker())`
	`121`	`+ b.throttle = newLoggedThrottle(b.name, newGoogleBreaker(b.timeout))`
`121`	`122`
`122`	`123`	`return &b`
`123`	`124`	`}`
`@@ -202,6 +203,12 @@ func WithName(name string) Option {`
`202`	`203`	`}`
`203`	`204`	`}`
`204`	`205`
	`206`	`+func WithTimeout(timeout time.Duration) Option {`
	`207`	`+ return func(b *circuitBreaker) {`
	`208`	`+ b.timeout = timeout`
	`209`	`+ }`
	`210`	`+}`
	`211`	`+`
`205`	`212`	`func defaultAcceptable(err error) bool {`
`206`	`213`	`return err == nil`
`207`	`214`	`}`