Skip to content

Commit bb217b9

Browse files
authored
Merge pull request #62 from keep-network/resubscriptions-dance
Subscription dance: backoff, logging, and test coverage

This PR is the first in a series of PRs refactoring the Ethereum event subscription mechanism that I plan to open in the next few days. You can see it in action here: keep-network/keep-ecdsa#663.

So far, resubscription logging was implemented in the contract binding templates. There were three problems with that mechanism:

1. No resubscribe backoff. Each failed subscription was retried 5 seconds after it failed. Given the many individual subscriptions the ECDSA client opens, a massive retry with no backoff could be interpreted as misbehavior by a third-party Ethereum provider, and further attempts could be completely blocked. Some operators experienced this with Alchemy.
2. All the logging was on the warning level. Combined with the lack of backoff mentioned in the previous point, this could produce a deadly mixture: the keep client could be trying to reconnect for a long time, the Ethereum client could be rejecting those attempts, interpreting them as DoS/misbehavior, and the operator would receive only warnings and not a single error.
3. Lack of test coverage for the resubscription mechanism. All the subscription code was placed in templates used to generate Go contract bindings and was very hard - if possible at all - to test.

Here we address all those problems. Instead of implementing resubscriptions on our side, we wrap the code from `github.com/ethereum/go-ethereum/event` with some additional logging logic. This code is used from contract templates to keep the subscription alive. We lean on `go-ethereum` to do those resubscriptions right, and we cover our added logic with unit tests on our side.

The code from `go-ethereum` implements backoff, doubling the delay until it reaches the maximum backoff time, which is set in our bindings to 2 minutes. The new code in `ethutil`, wrapping `go-ethereum`'s resubscriber, allows logging an error if the subscription is dropped too often, which may indicate problems with the Ethereum client. The threshold is set to 15 minutes.
2 parents c7eca9a + c6adefd commit bb217b9

File tree

6 files changed

+460
-194
lines changed

6 files changed

+460
-194
lines changed
Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
package ethutil
2+
3+
import (
4+
"context"
5+
"time"
6+
7+
"github.com/ethereum/go-ethereum/event"
8+
)
9+
10+
// WithResubscription wraps the subscribe function to call it repeatedly
11+
// to keep a subscription alive. When a subscription is established, it is
12+
// monitored and in the case of a failure, resubscribe is attempted by
13+
// calling the subscribe function again.
14+
//
15+
// The mechanism applies backoff between resubscription attempts.
16+
// The time between calls is adapted based on the error rate, but will never
17+
// exceed backoffMax.
18+
//
19+
// The mechanism monitors the time elapsed between resubscription attempts and
20+
// if it is shorter than the specificed alertThreshold, it calls
21+
// thresholdViolatedFn passing the time elapsed between resubscription attempts.
22+
// This function alarms about potential problems with the stability of the
23+
// subscription.
24+
//
25+
// In case of an error returned by the wrapped subscription function,
26+
// subscriptionFailedFn is called with the underlying error.
27+
//
28+
// thresholdViolatedFn and subscriptionFailedFn calls are executed in a separate
29+
// goroutine and thus are non-blocking.
30+
func WithResubscription(
31+
backoffMax time.Duration,
32+
subscribeFn event.ResubscribeFunc,
33+
alertThreshold time.Duration,
34+
thresholdViolatedFn func(time.Duration),
35+
subscriptionFailedFn func(error),
36+
) event.Subscription {
37+
lastAttempt := time.Time{}
38+
wrappedResubscribeFn := func(ctx context.Context) (event.Subscription, error) {
39+
now := time.Now()
40+
elapsed := now.Sub(lastAttempt)
41+
if elapsed < alertThreshold {
42+
go thresholdViolatedFn(elapsed)
43+
}
44+
45+
lastAttempt = now
46+
47+
sub, err := subscribeFn(ctx)
48+
if err != nil {
49+
go subscriptionFailedFn(err)
50+
}
51+
return sub, err
52+
}
53+
54+
return event.Resubscribe(backoffMax, wrappedResubscribeFn)
55+
}
Lines changed: 289 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,289 @@
1+
package ethutil
2+
3+
import (
4+
"context"
5+
"fmt"
6+
"testing"
7+
"time"
8+
9+
"github.com/ethereum/go-ethereum/event"
10+
)
11+
12+
func TestEmitOriginalError(t *testing.T) {
13+
backoffMax := 100 * time.Millisecond
14+
alertThreshold := 100 * time.Millisecond
15+
16+
failedOnce := false
17+
expectedFailMessage := "wherever I go, he goes"
18+
subscribeFn := func(ctx context.Context) (event.Subscription, error) {
19+
if !failedOnce {
20+
failedOnce = true
21+
return nil, fmt.Errorf(expectedFailMessage)
22+
}
23+
delegate := event.NewSubscription(func(unsubscribed <-chan struct{}) error {
24+
return nil
25+
})
26+
return delegate, nil
27+
}
28+
29+
// Using buffered channels to do not block writes.
30+
// There should never be a need to write more to those channels if the code
31+
// under the test works as expected.
32+
thresholdViolated := make(chan time.Duration, 10)
33+
subscriptionFailed := make(chan error, 10)
34+
subscription := WithResubscription(
35+
backoffMax,
36+
subscribeFn,
37+
alertThreshold,
38+
func(elapsed time.Duration) { thresholdViolated <- elapsed },
39+
func(err error) { subscriptionFailed <- err },
40+
)
41+
<-subscription.Err()
42+
43+
// Subscription failed one time so there should be one error in the channel.
44+
subscriptionFailCount := len(subscriptionFailed)
45+
if subscriptionFailCount != 1 {
46+
t.Fatalf(
47+
"subscription failure reported [%v] times, expected [1]",
48+
subscriptionFailCount,
49+
)
50+
}
51+
52+
// That failure should refer the original error.
53+
err := <-subscriptionFailed
54+
if err.Error() != expectedFailMessage {
55+
t.Errorf(
56+
"unexpected subscription error message\nexpected: [%v]\nactual: [%v]",
57+
expectedFailMessage,
58+
err.Error(),
59+
)
60+
}
61+
}
62+
63+
func TestResubscribeAboveThreshold(t *testing.T) {
64+
backoffMax := 100 * time.Millisecond
65+
alertThreshold := 100 * time.Millisecond
66+
67+
plannedSubscriptionFailures := 3
68+
elapsedBetweenFailures := 150 * time.Millisecond
69+
70+
resubscribeFnCalls := 0
71+
subscribeFn := func(ctx context.Context) (event.Subscription, error) {
72+
resubscribeFnCalls++
73+
time.Sleep(elapsedBetweenFailures) // 150ms > 100ms, above alert threshold
74+
if resubscribeFnCalls <= plannedSubscriptionFailures {
75+
return nil, fmt.Errorf("this is the way")
76+
}
77+
delegate := event.NewSubscription(func(unsubscribed <-chan struct{}) error {
78+
return nil
79+
})
80+
return delegate, nil
81+
}
82+
83+
// Using buffered channels to do not block writes.
84+
// There should never be a need to write more to those channels if the code
85+
// under the test works as expected.
86+
thresholdViolated := make(chan time.Duration, 10)
87+
subscriptionFailed := make(chan error, 10)
88+
subscription := WithResubscription(
89+
backoffMax,
90+
subscribeFn,
91+
alertThreshold,
92+
func(elapsed time.Duration) { thresholdViolated <- elapsed },
93+
func(err error) { subscriptionFailed <- err },
94+
)
95+
<-subscription.Err()
96+
97+
// Nothing expected in thresholdViolated channel.
98+
// Alert threshold is set to 100ms and there were no resubscription attempts
99+
// in a time shorter than 150ms one after another.
100+
violationCount := len(thresholdViolated)
101+
if violationCount != 0 {
102+
t.Errorf(
103+
"threshold violation reported [%v] times, expected none",
104+
violationCount,
105+
)
106+
}
107+
108+
// Subscription failed plannedSubscriptionFailures times and resubscription
109+
// function should be called plannedSubscriptionFailures + 1 times. One time
110+
// for each failure and one time at the end - that subscription was
111+
// successful and had not to be retried.
112+
expectedResubscriptionCalls := plannedSubscriptionFailures + 1
113+
if resubscribeFnCalls != expectedResubscriptionCalls {
114+
t.Errorf(
115+
"resubscription called [%v] times, expected [%v]",
116+
resubscribeFnCalls,
117+
expectedResubscriptionCalls,
118+
)
119+
}
120+
121+
// Expect all subscription failures to be reported.
122+
subscriptionFailCount := len(subscriptionFailed)
123+
if subscriptionFailCount != plannedSubscriptionFailures {
124+
t.Errorf(
125+
"subscription failure reported [%v] times, expected [%v]",
126+
subscriptionFailCount,
127+
plannedSubscriptionFailures,
128+
)
129+
}
130+
}
131+
132+
func TestResubscribeBelowThreshold(t *testing.T) {
133+
backoffMax := 50 * time.Millisecond
134+
alertThreshold := 100 * time.Millisecond
135+
136+
plannedSubscriptionFailures := 5
137+
elapsedBetweenFailures := 50 * time.Millisecond
138+
139+
resubscribeFnCalls := 0
140+
subscribeFn := func(ctx context.Context) (event.Subscription, error) {
141+
resubscribeFnCalls++
142+
time.Sleep(elapsedBetweenFailures) // 50ms < 100ms, below alert threshold
143+
if resubscribeFnCalls <= plannedSubscriptionFailures {
144+
return nil, fmt.Errorf("i have spoken")
145+
}
146+
delegate := event.NewSubscription(func(unsubscribed <-chan struct{}) error {
147+
return nil
148+
})
149+
return delegate, nil
150+
}
151+
152+
// Using buffered channels to do not block writes.
153+
// There should never be a need to write more to those channels if the code
154+
// under the test works as expected.
155+
thresholdViolated := make(chan time.Duration, 10)
156+
subscriptionFailed := make(chan error, 10)
157+
subscription := WithResubscription(
158+
backoffMax,
159+
subscribeFn,
160+
alertThreshold,
161+
func(elapsed time.Duration) { thresholdViolated <- elapsed },
162+
func(err error) { subscriptionFailed <- err },
163+
)
164+
<-subscription.Err()
165+
166+
// Threshold violaton should be reported for each subscription failure if
167+
// the time elapsed since the previous resubscription was shorter than the
168+
// threshold.
169+
// In this test, alert threshold is set to 100ms and delays between failures
170+
// are just 50ms. Thus, we expect the same number of threshold violations as
171+
// resubscription attempts.
172+
violationCount := len(thresholdViolated)
173+
if violationCount != plannedSubscriptionFailures {
174+
t.Errorf(
175+
"threshold violation reported [%v] times, expected [%v]",
176+
violationCount,
177+
plannedSubscriptionFailures,
178+
)
179+
}
180+
181+
// All violations reported should have correct values - all of them should
182+
// be longer than the time elapsed between failures and shorter than the
183+
// alert threshold. It is not possible to assert on a precise value.
184+
for i := 0; i < violationCount; i++ {
185+
violation := <-thresholdViolated
186+
if violation < elapsedBetweenFailures {
187+
t.Errorf(
188+
"violation reported should be longer than the time elapsed "+
189+
"between failures; is: [%v] and should be longer than [%v]",
190+
violation,
191+
elapsedBetweenFailures,
192+
)
193+
}
194+
if violation > alertThreshold {
195+
t.Errorf(
196+
"violation reported should be shorter than the alert threshold; "+
197+
"; is: [%v] and should be shorter than [%v]",
198+
violation,
199+
alertThreshold,
200+
)
201+
}
202+
}
203+
204+
// Subscription failed plannedSubscriptionFailures times and resubscription
205+
// function should be called plannedSubscriptionFailures + 1 times. One time
206+
// for each failure and one time at the end - that subscription was
207+
// successful and had not to be retried.
208+
expectedResubscriptionCalls := plannedSubscriptionFailures + 1
209+
if resubscribeFnCalls != expectedResubscriptionCalls {
210+
t.Errorf(
211+
"resubscription called [%v] times, expected [%v]",
212+
resubscribeFnCalls,
213+
expectedResubscriptionCalls,
214+
)
215+
}
216+
217+
// Expect all subscription failures to be reported.
218+
subscriptionFailCount := len(subscriptionFailed)
219+
if subscriptionFailCount != plannedSubscriptionFailures {
220+
t.Errorf(
221+
"subscription failure reported [%v] times, expected [%v]",
222+
subscriptionFailCount,
223+
plannedSubscriptionFailures,
224+
)
225+
}
226+
}
227+
228+
func TestDoNotBlockOnChannelWrites(t *testing.T) {
229+
backoffMax := 50 * time.Millisecond
230+
alertThreshold := 100 * time.Millisecond
231+
232+
plannedSubscriptionFailures := 5
233+
elapsedBetweenFailures := 10 * time.Millisecond
234+
235+
resubscribeFnCalls := 0
236+
subscribeFn := func(ctx context.Context) (event.Subscription, error) {
237+
resubscribeFnCalls++
238+
time.Sleep(elapsedBetweenFailures) // 10ms < 100ms, below alert threshold
239+
if resubscribeFnCalls <= plannedSubscriptionFailures {
240+
return nil, fmt.Errorf("Groku?")
241+
}
242+
delegate := event.NewSubscription(func(unsubscribed <-chan struct{}) error {
243+
return nil
244+
})
245+
return delegate, nil
246+
}
247+
248+
// Non-buffered channels with no receivers, will block on write
249+
thresholdViolated := make(chan time.Duration)
250+
subscriptionFailed := make(chan error)
251+
252+
ctx, cancel := context.WithCancel(context.Background())
253+
defer cancel()
254+
255+
subscription := WithResubscription(
256+
backoffMax,
257+
subscribeFn,
258+
alertThreshold,
259+
func(elapsed time.Duration) {
260+
select {
261+
case thresholdViolated <- elapsed:
262+
case <-ctx.Done():
263+
return
264+
}
265+
},
266+
func(err error) {
267+
select {
268+
case subscriptionFailed <- err:
269+
case <-ctx.Done():
270+
return
271+
}
272+
},
273+
)
274+
<-subscription.Err()
275+
276+
// Subscription failed plannedSubscriptionFailures times and resubscription
277+
// function should be called plannedSubscriptionFailures + 1 times. One time
278+
// for each failure and one time at the end - that subscription was
279+
// successful and had not to be retried. No resubscription attempt should be
280+
// blocked by the lack of channel receivers on non-buffered channels.
281+
expectedResubscriptionCalls := plannedSubscriptionFailures + 1
282+
if resubscribeFnCalls != expectedResubscriptionCalls {
283+
t.Errorf(
284+
"resubscription called [%v] times, expected [%v]",
285+
resubscribeFnCalls,
286+
expectedResubscriptionCalls,
287+
)
288+
}
289+
}

tools/generators/ethereum/contract.go.tmpl

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,21 @@ import (
2525
// included or excluded from logging at startup by name.
2626
var {{.ShortVar}}Logger = log.Logger("keep-contract-{{.Class}}")
2727

28+
const (
29+
// Upper bound for the backoff time between event resubscription attempts.
30+
{{.ShortVar}}SubscriptionBackoffMax = 2 * time.Minute
31+
32+
// Threshold below which event resubscription emits an error to the logs.
33+
// A WS connection can be dropped at any moment and event resubscription will
34+
// follow. However, if the WS connection for event subscription is getting
35+
// dropped too often, it may indicate that something is wrong with the Ethereum
36+
// client. This constant defines the minimum lifetime of an event
37+
// subscription required before the subscription failure happens and
38+
// resubscription follows so that the resubscription does not emit an error
39+
// to the logs alerting about potential problems with the Ethereum client.
40+
{{.ShortVar}}SubscriptionAlertThreshold = 15 * time.Minute
41+
)
42+
2843
type {{.Class}} struct {
2944
contract *abi.{{.AbiClass}}
3045
contractAddress common.Address

0 commit comments

Comments
 (0)