Skip to content

Commit 29b3b06

Browse files
authored
Reduce nonce redemption flake by giving WFE time to mark SubConns READY (#8442)
In noncebalancer, add documentation and change names to make the roles played by each type clearer. Unexport the pickerBuilder and picker types, since they aren't directly referenced anywhere outside of the package's init function. In the WFE, move the nonceWellFormed error message upwards into validNonce, alongside the other errors returned by that function. Change that same error message to say "malformed" rather than "invalid", to differentiate it from redemption failures and to match the corresponding metric label. Replace the JWSInvalidNonce metric label with two more-specific metric labels JWSNoBackendNonce and JWSExpiredNonce, for better insight into whether nonce redemption failures are due to backends shutting down or due to backends expiring old nonces. Finally, in the python integration tests, increase how long we wait between retries from 10ms to (up to) 600ms. This gives the WFE's NonceRedeemer gRPC client enough time to move its SubConns from the CONNECTING state to the READY state, and in practice seems to eliminate flaky nonce redemption errors in CI. Fixes #8385
1 parent d1422d2 commit 29b3b06

File tree

5 files changed

+63
-71
lines changed

5 files changed

+63
-71
lines changed

grpc/noncebalancer/noncebalancer.go

Lines changed: 12 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -31,49 +31,44 @@ const (
3131
// balancer.
3232
//
3333
// In any case, when the WFE receives this error it will return a badNonce error
34-
// to the ACME client.
34+
// to the ACME client. Note that the WFE uses exact pointer comparison to
35+
// detect that the status it receives is this exact status object, so don't
36+
// wrap this with fmt.Errorf when returning it.
3537
var ErrNoBackendsMatchPrefix = status.New(codes.Unavailable, "no backends match the nonce prefix")
3638
var errMissingPrefixCtxKey = errors.New("nonce.PrefixCtxKey value required in RPC context")
3739
var errMissingHMACKeyCtxKey = errors.New("nonce.HMACKeyCtxKey value required in RPC context")
3840
var errInvalidPrefixCtxKeyType = errors.New("nonce.PrefixCtxKey value in RPC context must be a string")
3941
var errInvalidHMACKeyCtxKeyType = errors.New("nonce.HMACKeyCtxKey value in RPC context must be a byte slice")
4042

41-
// Balancer implements the base.PickerBuilder interface. It's used to create new
42-
// balancer.Picker instances. It should only be used by nonce-service clients.
43-
type Balancer struct{}
44-
45-
// Compile-time assertion that *Balancer implements the base.PickerBuilder
46-
// interface.
47-
var _ base.PickerBuilder = (*Balancer)(nil)
43+
// pickerBuilder implements the base.PickerBuilder interface. It's used to
44+
// create new Picker instances. It should only be used by nonce-service clients.
45+
type pickerBuilder struct{}
4846

4947
// Build implements the base.PickerBuilder interface. It is called by the gRPC
5048
// runtime when the balancer is first initialized and when the set of backend
5149
// (SubConn) addresses changes.
52-
func (b *Balancer) Build(buildInfo base.PickerBuildInfo) balancer.Picker {
50+
func (b *pickerBuilder) Build(buildInfo base.PickerBuildInfo) balancer.Picker {
5351
if len(buildInfo.ReadySCs) == 0 {
5452
// The Picker must be rebuilt if there are no backends available.
5553
return base.NewErrPicker(balancer.ErrNoSubConnAvailable)
5654
}
57-
return &Picker{
55+
return &picker{
5856
backends: buildInfo.ReadySCs,
5957
}
6058
}
6159

62-
// Picker implements the balancer.Picker interface. It picks a backend (SubConn)
60+
// picker implements the balancer.Picker interface. It picks a backend (SubConn)
6361
// based on the nonce prefix contained in each request's Context.
64-
type Picker struct {
62+
type picker struct {
6563
backends map[balancer.SubConn]base.SubConnInfo
6664
prefixToBackend map[string]balancer.SubConn
6765
prefixToBackendOnce sync.Once
6866
}
6967

70-
// Compile-time assertion that *Picker implements the balancer.Picker interface.
71-
var _ balancer.Picker = (*Picker)(nil)
72-
7368
// Pick implements the balancer.Picker interface. It is called by the gRPC
7469
// runtime for each RPC message. It is responsible for picking a backend
7570
// (SubConn) based on the context of each RPC message.
76-
func (p *Picker) Pick(info balancer.PickInfo) (balancer.PickResult, error) {
71+
func (p *picker) Pick(info balancer.PickInfo) (balancer.PickResult, error) {
7772
if len(p.backends) == 0 {
7873
// This should never happen, the Picker should only be built when there
7974
// are backends available.
@@ -124,6 +119,6 @@ func (p *Picker) Pick(info balancer.PickInfo) (balancer.PickResult, error) {
124119

125120
func init() {
126121
balancer.Register(
127-
base.NewBalancerBuilder(Name, &Balancer{}, base.Config{}),
122+
base.NewBalancerBuilder(Name, &pickerBuilder{}, base.Config{}),
128123
)
129124
}

grpc/noncebalancer/noncebalancer_test.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -95,7 +95,7 @@ func TestPickerNoSubConnsAvailable(t *testing.T) {
9595
test.AssertNil(t, gotPick.SubConn, "subConn should be nil")
9696
}
9797

98-
func setupTest(noSubConns bool) (*Balancer, balancer.Picker, []*subConn) {
98+
func setupTest(noSubConns bool) (*pickerBuilder, balancer.Picker, []*subConn) {
9999
var subConns []*subConn
100100
bi := base.PickerBuildInfo{
101101
ReadySCs: make(map[balancer.SubConn]base.SubConnInfo),
@@ -110,7 +110,7 @@ func setupTest(noSubConns bool) (*Balancer, balancer.Picker, []*subConn) {
110110
subConns = append(subConns, sc)
111111
}
112112

113-
b := &Balancer{}
113+
b := &pickerBuilder{}
114114
p := b.Build(bi)
115115
return b, p, subConns
116116
}

test/chisel2.py

Lines changed: 8 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -142,16 +142,16 @@ def auth_and_issue(domains, chall_type="dns-01", email=None, cert_output=None, c
142142
else:
143143
raise Exception("invalid challenge type %s" % chall_type)
144144

145-
# Retry up to twice upon badNonce errors
146-
for n in range(2):
145+
# Make up to three attempts, retrying on badNonce errors
146+
for n in range(3):
147+
time.sleep(0.2 * n) # No sleep before the first attempt, then backoff
147148
try:
148149
order = client.poll_and_finalize(order)
149150
if cert_output is not None:
150151
with open(cert_output, "w") as f:
151152
f.write(order.fullchain_pem)
152153
except messages.Error as e:
153154
if e.typ == "urn:ietf:params:acme:error:badNonce":
154-
time.sleep(0.01)
155155
continue
156156
else:
157157
break
@@ -242,15 +242,8 @@ def expect_problem(problem_type, func):
242242
if len(domains) == 0:
243243
print(__doc__)
244244
sys.exit(0)
245-
# Retry up to twice upon badNonce errors
246-
for n in range(2):
247-
try:
248-
auth_and_issue(domains)
249-
except messages.Error as e:
250-
if e.typ == "urn:ietf:params:acme:error:badNonce":
251-
time.sleep(0.01)
252-
continue
253-
print(e)
254-
sys.exit(1)
255-
else:
256-
break
245+
try:
246+
auth_and_issue(domains)
247+
except messages.Error as e:
248+
print(e)
249+
sys.exit(1)

wfe2/verify.go

Lines changed: 23 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -182,27 +182,26 @@ func (wfe *WebFrontEndImpl) validPOSTRequest(request *http.Request) error {
182182
return nil
183183
}
184184

185-
// nonceWellFormed checks a JWS' Nonce header to ensure it is well-formed,
186-
// otherwise a bad nonce error is returned. This avoids unnecessary RPCs to
187-
// the nonce redemption service.
188-
func nonceWellFormed(nonceHeader string, prefixLen int) error {
189-
errBadNonce := berrors.BadNonceError("JWS has an invalid anti-replay nonce: %q", nonceHeader)
185+
// nonceWellFormed checks whether a JWS' Nonce header is well-formed, returning
186+
// false if it is not. This avoids unnecessary RPCs to the nonce redemption
187+
// service.
188+
func nonceWellFormed(nonceHeader string, prefixLen int) bool {
190189
if len(nonceHeader) <= prefixLen {
191190
// Nonce header was an unexpected length because there is either:
192191
// 1) no nonce, or
193192
// 2) no nonce material after the prefix.
194-
return errBadNonce
193+
return false
195194
}
196195
body, err := base64.RawURLEncoding.DecodeString(nonceHeader[prefixLen:])
197196
if err != nil {
198197
// Nonce was not valid base64url.
199-
return errBadNonce
198+
return false
200199
}
201200
if len(body) != nonce.NonceLen {
202201
// Nonce was an unexpected length.
203-
return errBadNonce
202+
return false
204203
}
205-
return nil
204+
return true
206205
}
207206

208207
// validNonce checks a JWS' Nonce header to ensure it is one that the
@@ -215,10 +214,9 @@ func (wfe *WebFrontEndImpl) validNonce(ctx context.Context, header jose.Header)
215214
return berrors.BadNonceError("JWS has no anti-replay nonce")
216215
}
217216

218-
err := nonceWellFormed(header.Nonce, nonce.PrefixLen)
219-
if err != nil {
217+
if !nonceWellFormed(header.Nonce, nonce.PrefixLen) {
220218
wfe.stats.joseErrorCount.With(prometheus.Labels{"type": "JWSMalformedNonce"}).Inc()
221-
return err
219+
return berrors.BadNonceError("JWS has a malformed anti-replay nonce: %q", header.Nonce)
222220
}
223221

224222
// Populate the context with the nonce prefix and HMAC key. These are
@@ -230,22 +228,25 @@ func (wfe *WebFrontEndImpl) validNonce(ctx context.Context, header jose.Header)
230228
resp, err := wfe.rnc.Redeem(ctx, &noncepb.NonceMessage{Nonce: header.Nonce})
231229
if err != nil {
232230
rpcStatus, ok := status.FromError(err)
233-
if !ok || rpcStatus != nb.ErrNoBackendsMatchPrefix {
234-
return fmt.Errorf("failed to redeem nonce: %w", err)
231+
if ok && rpcStatus == nb.ErrNoBackendsMatchPrefix {
232+
// Getting our sentinel ErrNoBackendsMatchPrefix status.Status means that
233+
// the nonce backend which issued this nonce is presently unreachable or
234+
// unrecognized by this WFE. As this is a transient failure, the client
235+
// should retry their request with a fresh nonce.
236+
wfe.stats.nonceNoMatchingBackendCount.Inc()
237+
wfe.stats.joseErrorCount.With(prometheus.Labels{"type": "JWSNoBackendNonce"}).Inc()
238+
return berrors.BadNonceError("JWS has an invalid anti-replay nonce: %q", header.Nonce)
235239
}
236240

237-
// ErrNoBackendsMatchPrefix suggests that the nonce backend, which
238-
// issued this nonce, is presently unreachable or unrecognized by
239-
// this WFE. As this is a transient failure, the client should retry
240-
// their request with a fresh nonce.
241-
resp = &noncepb.ValidMessage{Valid: false}
242-
wfe.stats.nonceNoMatchingBackendCount.Inc()
241+
// We don't recognize this error, so just pass it upwards.
242+
return fmt.Errorf("failed to redeem nonce: %w", err)
243243
}
244244

245245
if !resp.Valid {
246-
wfe.stats.joseErrorCount.With(prometheus.Labels{"type": "JWSInvalidNonce"}).Inc()
247-
return berrors.BadNonceError("JWS has an invalid anti-replay nonce: %q", header.Nonce)
246+
wfe.stats.joseErrorCount.With(prometheus.Labels{"type": "JWSExpiredNonce"}).Inc()
247+
return berrors.BadNonceError("JWS has an expired anti-replay nonce: %q", header.Nonce)
248248
}
249+
249250
return nil
250251
}
251252

wfe2/verify_test.go

Lines changed: 18 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,9 @@ import (
1414
"strings"
1515
"testing"
1616

17+
"github.com/go-jose/go-jose/v4"
1718
"github.com/prometheus/client_golang/prometheus"
19+
"google.golang.org/grpc"
1820

1921
"github.com/letsencrypt/boulder/core"
2022
corepb "github.com/letsencrypt/boulder/core/proto"
@@ -26,9 +28,6 @@ import (
2628
sapb "github.com/letsencrypt/boulder/sa/proto"
2729
"github.com/letsencrypt/boulder/test"
2830
"github.com/letsencrypt/boulder/web"
29-
30-
"github.com/go-jose/go-jose/v4"
31-
"google.golang.org/grpc"
3231
)
3332

3433
// sigAlgForKey uses `signatureAlgorithmForKey` but fails immediately using the
@@ -204,8 +203,9 @@ func (rs requestSigner) missingNonce() *jose.JSONWebSignature {
204203
return jws
205204
}
206205

207-
// invalidNonce returns an otherwise well-signed request with an invalid nonce.
208-
func (rs requestSigner) invalidNonce() *jose.JSONWebSignature {
206+
// expiredNonce returns an otherwise well-signed request with a nonce that the
207+
// nonce service doesn't remember giving out (i.e. is expired).
208+
func (rs requestSigner) expiredNonce() *jose.JSONWebSignature {
209209
privateKey := loadKey(rs.t, []byte(test1KeyPrivatePEM))
210210
jwk := &jose.JSONWebKey{
211211
Key: privateKey,
@@ -710,23 +710,26 @@ func TestValidNonce(t *testing.T) {
710710
Name: "Malformed nonce in JWS",
711711
JWS: signer.malformedNonce(),
712712
WantErrType: berrors.BadNonce,
713-
WantErrDetail: "JWS has an invalid anti-replay nonce: \"im-a-nonce\"",
713+
WantErrDetail: "JWS has a malformed anti-replay nonce: \"im-a-nonce\"",
714714
WantStatType: "JWSMalformedNonce",
715715
},
716716
{
717717
Name: "Canned nonce shorter than prefixLength in JWS",
718718
JWS: signer.shortNonce(),
719719
WantErrType: berrors.BadNonce,
720-
WantErrDetail: "JWS has an invalid anti-replay nonce: \"woww\"",
720+
WantErrDetail: "JWS has a malformed anti-replay nonce: \"woww\"",
721721
WantStatType: "JWSMalformedNonce",
722722
},
723723
{
724-
Name: "Invalid nonce in JWS (test/config-next)",
725-
JWS: signer.invalidNonce(),
724+
Name: "Expired nonce in JWS",
725+
JWS: signer.expiredNonce(),
726726
WantErrType: berrors.BadNonce,
727-
WantErrDetail: "JWS has an invalid anti-replay nonce: \"mlolmlol3ov77I5Ui-cdaY_k8IcjK58FvbG0y_BCRrx5rGQ8rjA\"",
728-
WantStatType: "JWSInvalidNonce",
727+
WantErrDetail: "JWS has an expired anti-replay nonce: \"mlolmlol3ov77I5Ui-cdaY_k8IcjK58FvbG0y_BCRrx5rGQ8rjA\"",
728+
WantStatType: "JWSExpiredNonce",
729729
},
730+
// We don't have a test case for "invalid" (i.e. no backend matching the
731+
// prefix) because the unit tests don't use the noncebalancer that does
732+
// that routing.
730733
{
731734
Name: "Valid nonce in JWS",
732735
JWS: goodJWS,
@@ -1349,12 +1352,12 @@ func TestValidJWSForKey(t *testing.T) {
13491352
WantStatType: "JWSAlgorithmCheckFailed",
13501353
},
13511354
{
1352-
Name: "JWS with an invalid nonce (test/config-next)",
1353-
JWS: bJSONWebSignature{signer.invalidNonce()},
1355+
Name: "JWS with an expired nonce",
1356+
JWS: bJSONWebSignature{signer.expiredNonce()},
13541357
JWK: goodJWK,
13551358
WantErrType: berrors.BadNonce,
1356-
WantErrDetail: "JWS has an invalid anti-replay nonce: \"mlolmlol3ov77I5Ui-cdaY_k8IcjK58FvbG0y_BCRrx5rGQ8rjA\"",
1357-
WantStatType: "JWSInvalidNonce",
1359+
WantErrDetail: "JWS has an expired anti-replay nonce: \"mlolmlol3ov77I5Ui-cdaY_k8IcjK58FvbG0y_BCRrx5rGQ8rjA\"",
1360+
WantStatType: "JWSExpiredNonce",
13581361
},
13591362
{
13601363
Name: "JWS with broken signature",

0 commit comments

Comments
 (0)