Skip to content

Commit 9392b44

Browse files
authored
Add visibility into nonce redemption failure causes (#8448)
Add metrics to the nonce server which give visibility into how many nonces it is currently tracking, and what distribution of nonce ages it actually sees. This should help us better understand the in-memory behavior of the nonce server, so we can better debug issues related to invalid nonce errors. Also add much more detailed error messages throughout the nonce redemption pipeline. Have the nonce server's `valid()` helper method return individual error messages based on why the nonce was invalid. Have the nonce service itself return those error messages to the WFE, rather than simply returning "Valid: false". Finally, have the WFE suppress those detailed error messages when displaying BadNonce errors to the end-user, so they end up in our logs but not in user-facing API responses. Finally, due to the refactoring of the nonce service's `.valid()` helper above, simplify some unnecessary layers of abstraction within the nonce package. Remove the distinction between the "NonceService" (which contained all nonce logic) and the "NonceServer" (which implemented the gRPC interface). Make the non-gRPC methods unexported, since they're really just helper methods. Simplify the inmem nonce service to look exactly like the other inmem services, forwarding calls to the real gRPC implementation rather than to the underlying NonceService.
1 parent 1d86023 commit 9392b44

File tree

10 files changed

+252
-209
lines changed

10 files changed

+252
-209
lines changed

cmd/nonce-service/main.go

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -93,9 +93,8 @@ func main() {
9393
tlsConfig, err := c.NonceService.TLS.Load(scope)
9494
cmd.FailOnError(err, "tlsConfig config")
9595

96-
nonceServer := nonce.NewServer(ns)
9796
start, err := bgrpc.NewServer(c.NonceService.GRPC, logger).Add(
98-
&noncepb.NonceService_ServiceDesc, nonceServer).Build(tlsConfig, scope, clock.New())
97+
&noncepb.NonceService_ServiceDesc, ns).Build(tlsConfig, scope, clock.New())
9998
cmd.FailOnError(err, "Unable to setup nonce service gRPC server")
10099

101100
logger.Info(fmt.Sprintf("Nonce server listening on %s with prefix %q", c.NonceService.GRPC.Address, noncePrefix))

email/exporter_test.go

Lines changed: 6 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -27,11 +27,10 @@ type mockPardotClientImpl struct {
2727
// newMockPardotClientImpl returns a MockPardotClientImpl, implementing the
2828
// PardotClient interface. Both refer to the same instance, with the interface
2929
// for mock interaction and the struct for state inspection and modification.
30-
func newMockPardotClientImpl() (PardotClient, *mockPardotClientImpl) {
31-
mockImpl := &mockPardotClientImpl{
30+
func newMockPardotClientImpl() *mockPardotClientImpl {
31+
return &mockPardotClientImpl{
3232
CreatedContacts: []string{},
3333
}
34-
return mockImpl, mockImpl
3534
}
3635

3736
// SendContact adds an email to CreatedContacts.
@@ -55,10 +54,10 @@ func (m *mockPardotClientImpl) getCreatedContacts() []string {
5554
// ExporterImpl queue and cleanup() to drain and shutdown. If start() is called,
5655
// cleanup() must be called.
5756
func setup() (*ExporterImpl, *mockPardotClientImpl, func(), func()) {
58-
mockClient, clientImpl := newMockPardotClientImpl()
57+
mockClient := newMockPardotClientImpl()
5958
exporter := NewExporterImpl(mockClient, nil, 1000000, 5, metrics.NoopRegisterer, blog.NewMock())
6059
daemonCtx, cancel := context.WithCancel(context.Background())
61-
return exporter, clientImpl,
60+
return exporter, mockClient,
6261
func() { exporter.Start(daemonCtx) },
6362
func() {
6463
cancel()
@@ -167,7 +166,7 @@ func TestSendContactDeduplication(t *testing.T) {
167166
t.Parallel()
168167

169168
cache := NewHashedEmailCache(1000, metrics.NoopRegisterer)
170-
mockClient, clientImpl := newMockPardotClientImpl()
169+
mockClient := newMockPardotClientImpl()
171170
exporter := NewExporterImpl(mockClient, cache, 1000000, 5, metrics.NoopRegisterer, blog.NewMock())
172171

173172
daemonCtx, cancel := context.WithCancel(context.Background())
@@ -182,7 +181,7 @@ func TestSendContactDeduplication(t *testing.T) {
182181
cancel()
183182
exporter.Drain()
184183

185-
contacts := clientImpl.getCreatedContacts()
184+
contacts := mockClient.getCreatedContacts()
186185
test.AssertEquals(t, 1, len(contacts))
187186
test.AssertEquals(t, "[email protected]", contacts[0])
188187

nonce/nonce.go

Lines changed: 51 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ import (
3232
"google.golang.org/grpc"
3333
"google.golang.org/protobuf/types/known/emptypb"
3434

35+
berrors "github.com/letsencrypt/boulder/errors"
3536
noncepb "github.com/letsencrypt/boulder/nonce/proto"
3637
)
3738

@@ -63,6 +64,7 @@ func DerivePrefix(grpcAddr string, key []byte) string {
6364

6465
// NonceService generates, cancels, and tracks Nonces.
6566
type NonceService struct {
67+
noncepb.UnsafeNonceServiceServer
6668
mu sync.Mutex
6769
latest int64
6870
earliest int64
@@ -73,7 +75,9 @@ type NonceService struct {
7375
prefix string
7476
nonceCreates prometheus.Counter
7577
nonceEarliest prometheus.Gauge
78+
nonceLatest prometheus.Gauge
7679
nonceRedeems *prometheus.CounterVec
80+
nonceAges *prometheus.HistogramVec
7781
nonceHeapLatency prometheus.Histogram
7882
}
7983

@@ -143,11 +147,22 @@ func NewNonceService(stats prometheus.Registerer, maxUsed int, prefix string) (*
143147
Help: "A gauge with the current earliest valid nonce value",
144148
})
145149
stats.MustRegister(nonceEarliest)
150+
nonceLatest := prometheus.NewGauge(prometheus.GaugeOpts{
151+
Name: "nonce_latest",
152+
Help: "A gauge with the current latest valid nonce value",
153+
})
154+
stats.MustRegister(nonceLatest)
146155
nonceRedeems := prometheus.NewCounterVec(prometheus.CounterOpts{
147156
Name: "nonce_redeems",
148157
Help: "A counter of nonce validations labelled by result",
149158
}, []string{"result", "error"})
150159
stats.MustRegister(nonceRedeems)
160+
nonceAges := prometheus.NewHistogramVec(prometheus.HistogramOpts{
161+
Name: "nonce_ages",
162+
Help: "A histogram of nonce ages at the time they were (attempted to be) redeemed, expressed as fractions of the valid nonce window",
163+
Buckets: []float64{-0.01, 0, .1, .2, .3, .4, .5, .6, .7, .8, .9, 1, 1.1, 1.2, 1.5, 2, 5},
164+
}, []string{"result"})
165+
stats.MustRegister(nonceAges)
151166
nonceHeapLatency := prometheus.NewHistogram(prometheus.HistogramOpts{
152167
Name: "nonce_heap_latency",
153168
Help: "A histogram of latencies of heap pop operations",
@@ -164,7 +179,9 @@ func NewNonceService(stats prometheus.Registerer, maxUsed int, prefix string) (*
164179
prefix: prefix,
165180
nonceCreates: nonceCreates,
166181
nonceEarliest: nonceEarliest,
182+
nonceLatest: nonceLatest,
167183
nonceRedeems: nonceRedeems,
184+
nonceAges: nonceAges,
168185
nonceHeapLatency: nonceHeapLatency,
169186
}, nil
170187
}
@@ -232,40 +249,53 @@ func (ns *NonceService) decrypt(nonce string) (int64, error) {
232249
return ctr.Int64(), nil
233250
}
234251

235-
// Nonce provides a new Nonce.
236-
func (ns *NonceService) Nonce() (string, error) {
252+
// nonce provides a new Nonce.
253+
func (ns *NonceService) nonce() (string, error) {
237254
ns.mu.Lock()
238255
ns.latest++
239256
latest := ns.latest
240257
ns.mu.Unlock()
241-
defer ns.nonceCreates.Inc()
258+
ns.nonceCreates.Inc()
259+
ns.nonceLatest.Set(float64(latest))
242260
return ns.encrypt(latest)
243261
}
244262

245-
// Valid determines whether the provided Nonce string is valid, returning
263+
// valid determines whether the provided Nonce string is valid, returning
246264
// true if so.
247-
func (ns *NonceService) Valid(nonce string) bool {
265+
func (ns *NonceService) valid(nonce string) error {
248266
c, err := ns.decrypt(nonce)
249267
if err != nil {
250268
ns.nonceRedeems.WithLabelValues("invalid", "decrypt").Inc()
251-
return false
269+
return berrors.BadNonceError("unable to decrypt nonce: %s", err)
252270
}
253271

254272
ns.mu.Lock()
255273
defer ns.mu.Unlock()
256-
if c > ns.latest {
274+
275+
// age represents how "far back" in the valid nonce window this nonce is.
276+
// If it is very recent, then the numerator is very small and the age is close
277+
// to zero. If it is old but still valid, the numerator is slightly smaller
278+
// than the denominator, and the age is close to one. If it is too old, then
279+
// the age is greater than one. If it is magically too new (i.e. greater than
280+
// the largest nonce we've actually handed out), then the age is negative.
281+
age := float64(ns.latest-c) / float64(ns.latest-ns.earliest)
282+
283+
if c > ns.latest { // i.e. age < 0
257284
ns.nonceRedeems.WithLabelValues("invalid", "too high").Inc()
258-
return false
285+
ns.nonceAges.WithLabelValues("invalid").Observe(age)
286+
return berrors.BadNonceError("nonce greater than highest dispensed nonce: %d > %d", c, ns.latest)
259287
}
260288

261-
if c <= ns.earliest {
289+
if c <= ns.earliest { // i.e. age >= 1
262290
ns.nonceRedeems.WithLabelValues("invalid", "too low").Inc()
263-
return false
291+
ns.nonceAges.WithLabelValues("invalid").Observe(age)
292+
return berrors.BadNonceError("nonce less than lowest eligible nonce: %d < %d", c, ns.earliest)
264293
}
265294

266295
if ns.used[c] {
267296
ns.nonceRedeems.WithLabelValues("invalid", "already used").Inc()
268-
return false
297+
ns.nonceAges.WithLabelValues("invalid").Observe(age)
298+
return berrors.BadNonceError("nonce already marked as used: %d in [%d]used", c, len(ns.used))
269299
}
270300

271301
ns.used[c] = true
@@ -279,7 +309,8 @@ func (ns *NonceService) Valid(nonce string) bool {
279309
}
280310

281311
ns.nonceRedeems.WithLabelValues("valid", "").Inc()
282-
return true
312+
ns.nonceAges.WithLabelValues("valid").Observe(age)
313+
return nil
283314
}
284315

285316
// splitNonce splits a nonce into a prefix and a body.
@@ -290,27 +321,18 @@ func (ns *NonceService) splitNonce(nonce string) (string, string, error) {
290321
return nonce[:PrefixLen], nonce[PrefixLen:], nil
291322
}
292323

293-
// NewServer returns a new Server, wrapping a NonceService.
294-
func NewServer(inner *NonceService) *Server {
295-
return &Server{inner: inner}
296-
}
297-
298-
// Server implements the gRPC nonce service.
299-
type Server struct {
300-
noncepb.UnsafeNonceServiceServer
301-
inner *NonceService
302-
}
303-
304-
var _ noncepb.NonceServiceServer = (*Server)(nil)
305-
306324
// Redeem accepts a nonce from a gRPC client and redeems it using the inner nonce service.
307-
func (ns *Server) Redeem(ctx context.Context, msg *noncepb.NonceMessage) (*noncepb.ValidMessage, error) {
308-
return &noncepb.ValidMessage{Valid: ns.inner.Valid(msg.Nonce)}, nil
325+
func (ns *NonceService) Redeem(ctx context.Context, msg *noncepb.NonceMessage) (*noncepb.ValidMessage, error) {
326+
err := ns.valid(msg.Nonce)
327+
if err != nil {
328+
return nil, err
329+
}
330+
return &noncepb.ValidMessage{Valid: true}, nil
309331
}
310332

311333
// Nonce generates a nonce and sends it to a gRPC client.
312-
func (ns *Server) Nonce(_ context.Context, _ *emptypb.Empty) (*noncepb.NonceMessage, error) {
313-
nonce, err := ns.inner.Nonce()
334+
func (ns *NonceService) Nonce(_ context.Context, _ *emptypb.Empty) (*noncepb.NonceMessage, error) {
335+
nonce, err := ns.nonce()
314336
if err != nil {
315337
return nil, err
316338
}

0 commit comments

Comments
 (0)