Skip to content

Commit 908421b

Browse files
authored
crl-updater: lease CRL shards to prevent races (#6941)
Add a new feature flag, LeaseCRLShards, which controls certain aspects of crl-updater's behavior. When this flag is enabled, crl-updater calls the new SA.LeaseCRLShard method before beginning work on a shard. This prevents it from stepping on the toes of another crl-updater instance which may be working on the same shard. This is important to prevent two competing instances from accidentally updating a CRL's Number (which is an integer representation of its thisUpdate timestamp) *backwards*, which would be a compliance violation. When this flag is enabled, crl-updater also calls the new SA.UpdateCRLShard method after finishing work on a shard. In the future, additional work will be done to make crl-updater use the "give me the oldest available shard" mode of the LeaseCRLShard method. Fixes #6897
1 parent 2bf5b26 commit 908421b

File tree

10 files changed

+199
-51
lines changed

10 files changed

+199
-51
lines changed

cmd/crl-updater/main.go

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,13 @@ type Config struct {
8181
// less than the UpdatePeriod.
8282
UpdateOffset config.Duration
8383

84+
// UpdateTimeout controls how long a single CRL shard is allowed to attempt
85+
// to update before being timed out. The total CRL updating process may take
86+
// significantly longer, since a full update cycle may consist of updating
87+
// many shards with varying degrees of parallelism. This value must be
88+
// strictly less than the UpdatePeriod. Defaults to 1 hour.
89+
UpdateTimeout config.Duration `validate:"-"`
90+
8491
// MaxParallelism controls how many workers may be running in parallel.
8592
// A higher value reduces the total time necessary to update all CRL shards
8693
// that this updater is responsible for, but also increases the memory used
@@ -142,10 +149,13 @@ func main() {
142149
if c.CRLUpdater.LookbackPeriod.Duration == 0 {
143150
c.CRLUpdater.LookbackPeriod.Duration = 24 * time.Hour
144151
}
152+
if c.CRLUpdater.UpdateTimeout.Duration == 0 {
153+
c.CRLUpdater.UpdateTimeout.Duration = 1 * time.Hour
154+
}
145155

146156
saConn, err := bgrpc.ClientSetup(c.CRLUpdater.SAService, tlsConfig, scope, clk)
147157
cmd.FailOnError(err, "Failed to load credentials and create gRPC connection to SA")
148-
sac := sapb.NewStorageAuthorityReadOnlyClient(saConn)
158+
sac := sapb.NewStorageAuthorityClient(saConn)
149159

150160
caConn, err := bgrpc.ClientSetup(c.CRLUpdater.CRLGeneratorService, tlsConfig, scope, clk)
151161
cmd.FailOnError(err, "Failed to load credentials and create gRPC connection to CRLGenerator")
@@ -162,6 +172,7 @@ func main() {
162172
c.CRLUpdater.LookbackPeriod.Duration,
163173
c.CRLUpdater.UpdatePeriod.Duration,
164174
c.CRLUpdater.UpdateOffset.Duration,
175+
c.CRLUpdater.UpdateTimeout.Duration,
165176
c.CRLUpdater.MaxParallelism,
166177
c.CRLUpdater.MaxAttempts,
167178
sac,

crl/updater/updater.go

Lines changed: 40 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,12 +15,14 @@ import (
1515
"github.com/jmhodges/clock"
1616
"github.com/prometheus/client_golang/prometheus"
1717
"google.golang.org/protobuf/types/known/emptypb"
18+
"google.golang.org/protobuf/types/known/timestamppb"
1819

1920
capb "github.com/letsencrypt/boulder/ca/proto"
2021
"github.com/letsencrypt/boulder/core"
2122
"github.com/letsencrypt/boulder/core/proto"
2223
"github.com/letsencrypt/boulder/crl"
2324
cspb "github.com/letsencrypt/boulder/crl/storer/proto"
25+
"github.com/letsencrypt/boulder/features"
2426
"github.com/letsencrypt/boulder/issuance"
2527
blog "github.com/letsencrypt/boulder/log"
2628
sapb "github.com/letsencrypt/boulder/sa/proto"
@@ -33,10 +35,11 @@ type crlUpdater struct {
3335
lookbackPeriod time.Duration
3436
updatePeriod time.Duration
3537
updateOffset time.Duration
38+
updateTimeout time.Duration
3639
maxParallelism int
3740
maxAttempts int
3841

39-
sa sapb.StorageAuthorityReadOnlyClient
42+
sa sapb.StorageAuthorityClient
4043
ca capb.CRLGeneratorClient
4144
cs cspb.CRLStorerClient
4245

@@ -54,9 +57,10 @@ func NewUpdater(
5457
lookbackPeriod time.Duration,
5558
updatePeriod time.Duration,
5659
updateOffset time.Duration,
60+
updateTimeout time.Duration,
5761
maxParallelism int,
5862
maxAttempts int,
59-
sa sapb.StorageAuthorityReadOnlyClient,
63+
sa sapb.StorageAuthorityClient,
6064
ca capb.CRLGeneratorClient,
6165
cs cspb.CRLStorerClient,
6266
stats prometheus.Registerer,
@@ -80,6 +84,10 @@ func NewUpdater(
8084
return nil, fmt.Errorf("update offset must be less than period: %s !< %s", updateOffset, updatePeriod)
8185
}
8286

87+
if updateTimeout >= updatePeriod {
88+
return nil, fmt.Errorf("update timeout must be less than period: %s !< %s", updateTimeout, updatePeriod)
89+
}
90+
8391
if lookbackPeriod < 2*updatePeriod {
8492
return nil, fmt.Errorf("lookbackPeriod must be at least 2x updatePeriod: %s !< 2 * %s", lookbackPeriod, updatePeriod)
8593
}
@@ -112,6 +120,7 @@ func NewUpdater(
112120
lookbackPeriod,
113121
updatePeriod,
114122
updateOffset,
123+
updateTimeout,
115124
maxParallelism,
116125
maxAttempts,
117126
sa,
@@ -250,10 +259,12 @@ func (cu *crlUpdater) tickIssuer(ctx context.Context, atTime time.Time, issuerNa
250259
case <-ctx.Done():
251260
return
252261
default:
262+
ctx, cancel := context.WithTimeout(ctx, cu.updateTimeout)
253263
out <- shardResult{
254264
shardIdx: idx,
255265
err: cu.tickShardWithRetry(ctx, atTime, issuerNameID, idx, shardMap[idx]),
256266
}
267+
cancel()
257268
}
258269
}
259270
}
@@ -335,6 +346,24 @@ func (cu *crlUpdater) tickShard(ctx context.Context, atTime time.Time, issuerNam
335346
cu.log.Infof(
336347
"Generating CRL shard: id=[%s] numChunks=[%d]", crlID, len(chunks))
337348

349+
if features.Enabled(features.LeaseCRLShards) {
350+
// Notify the database that we're working on this shard.
351+
deadline, ok := ctx.Deadline()
352+
if !ok {
353+
return fmt.Errorf("context has no deadline")
354+
}
355+
356+
_, err = cu.sa.LeaseCRLShard(ctx, &sapb.LeaseCRLShardRequest{
357+
IssuerNameID: int64(issuerNameID),
358+
MinShardIdx: int64(shardIdx),
359+
MaxShardIdx: int64(shardIdx),
360+
Until: timestamppb.New(deadline.Add(-time.Second)),
361+
})
362+
if err != nil {
363+
return fmt.Errorf("leasing shard: %w", err)
364+
}
365+
}
366+
338367
// Get the full list of CRL Entries for this shard from the SA.
339368
var crlEntries []*proto.CRLEntry
340369
for _, chunk := range chunks {
@@ -452,6 +481,15 @@ func (cu *crlUpdater) tickShard(ctx context.Context, atTime time.Time, issuerNam
452481
return fmt.Errorf("closing CRLStorer upload stream: %w", err)
453482
}
454483

484+
if features.Enabled(features.LeaseCRLShards) {
485+
// Notify the database that that we're done.
486+
_, err = cu.sa.UpdateCRLShard(ctx, &sapb.UpdateCRLShardRequest{
487+
IssuerNameID: int64(issuerNameID),
488+
ShardIdx: int64(shardIdx),
489+
ThisUpdate: timestamppb.New(atTime),
490+
})
491+
}
492+
455493
cu.log.Infof(
456494
"Generated CRL shard: id=[%s] size=[%d] hash=[%x]",
457495
crlID, crlLen, crlHash.Sum(nil))

crl/updater/updater_test.go

Lines changed: 36 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ import (
1515
capb "github.com/letsencrypt/boulder/ca/proto"
1616
corepb "github.com/letsencrypt/boulder/core/proto"
1717
cspb "github.com/letsencrypt/boulder/crl/storer/proto"
18+
"github.com/letsencrypt/boulder/features"
1819
"github.com/letsencrypt/boulder/issuance"
1920
blog "github.com/letsencrypt/boulder/log"
2021
"github.com/letsencrypt/boulder/metrics"
@@ -50,19 +51,27 @@ func (f *fakeGRCC) Recv() (*corepb.CRLEntry, error) {
5051
// fakeGRCC to be used as the return value for calls to GetRevokedCerts, and a
5152
// fake timestamp to serve as the database's maximum notAfter value.
5253
type fakeSAC struct {
53-
mocks.StorageAuthorityReadOnly
54+
mocks.StorageAuthority
5455
grcc fakeGRCC
5556
maxNotAfter time.Time
57+
leaseError error
5658
}
5759

58-
func (f *fakeSAC) GetRevokedCerts(ctx context.Context, _ *sapb.GetRevokedCertsRequest, _ ...grpc.CallOption) (sapb.StorageAuthorityReadOnly_GetRevokedCertsClient, error) {
60+
func (f *fakeSAC) GetRevokedCerts(ctx context.Context, _ *sapb.GetRevokedCertsRequest, _ ...grpc.CallOption) (sapb.StorageAuthority_GetRevokedCertsClient, error) {
5961
return &f.grcc, nil
6062
}
6163

6264
func (f *fakeSAC) GetMaxExpiration(_ context.Context, req *emptypb.Empty, _ ...grpc.CallOption) (*timestamppb.Timestamp, error) {
6365
return timestamppb.New(f.maxNotAfter), nil
6466
}
6567

68+
func (f *fakeSAC) LeaseCRLShard(_ context.Context, req *sapb.LeaseCRLShardRequest, _ ...grpc.CallOption) (*sapb.LeaseCRLShardResponse, error) {
69+
if f.leaseError != nil {
70+
return nil, f.leaseError
71+
}
72+
return &sapb.LeaseCRLShardResponse{IssuerNameID: req.IssuerNameID, ShardIdx: req.MinShardIdx}, nil
73+
}
74+
6675
// fakeGCC is a fake capb.CRLGenerator_GenerateCRLClient which can be
6776
// populated with some CRL entries or an error for use as the return value of
6877
// a faked GenerateCRL call.
@@ -140,13 +149,15 @@ func TestTickShard(t *testing.T) {
140149
test.AssertNotError(t, err, "loading test issuer")
141150

142151
sentinelErr := errors.New("oops")
152+
ctx, cancel := context.WithTimeout(context.Background(), time.Minute)
153+
defer cancel()
143154

144155
clk := clock.NewFake()
145156
clk.Set(time.Date(2020, time.January, 1, 0, 0, 0, 0, time.UTC))
146157
cu, err := NewUpdater(
147158
[]*issuance.Certificate{e1, r3},
148159
2, 18*time.Hour, 24*time.Hour,
149-
6*time.Hour, 1*time.Minute, 1, 1,
160+
6*time.Hour, time.Minute, time.Hour, 1, 1,
150161
&fakeSAC{grcc: fakeGRCC{}, maxNotAfter: clk.Now().Add(90 * 24 * time.Hour)},
151162
&fakeCGC{gcc: fakeGCC{}},
152163
&fakeCSC{ucc: fakeUCC{}},
@@ -159,16 +170,29 @@ func TestTickShard(t *testing.T) {
159170
}
160171

161172
// Ensure that getting no results from the SA still works.
162-
err = cu.tickShard(context.Background(), cu.clk.Now(), e1.NameID(), 0, testChunks)
173+
err = cu.tickShard(ctx, cu.clk.Now(), e1.NameID(), 0, testChunks)
163174
test.AssertNotError(t, err, "empty CRL")
164175
test.AssertMetricWithLabelsEquals(t, cu.updatedCounter, prometheus.Labels{
165176
"issuer": "(TEST) Elegant Elephant E1", "result": "success",
166177
}, 1)
167178
cu.updatedCounter.Reset()
168179

180+
// With leasing enabled, errors while leasing should bubble up early.
181+
_ = features.Set(map[string]bool{"LeaseCRLShards": true})
182+
cu.sa.(*fakeSAC).leaseError = sentinelErr
183+
err = cu.tickShard(ctx, cu.clk.Now(), e1.NameID(), 0, testChunks)
184+
test.AssertError(t, err, "leasing error")
185+
test.AssertContains(t, err.Error(), "leasing shard")
186+
test.AssertErrorIs(t, err, sentinelErr)
187+
test.AssertMetricWithLabelsEquals(t, cu.updatedCounter, prometheus.Labels{
188+
"issuer": "(TEST) Elegant Elephant E1", "result": "failed",
189+
}, 1)
190+
cu.updatedCounter.Reset()
191+
features.Reset()
192+
169193
// Errors closing the Storer upload stream should bubble up.
170194
cu.cs = &fakeCSC{ucc: fakeUCC{recvErr: sentinelErr}}
171-
err = cu.tickShard(context.Background(), cu.clk.Now(), e1.NameID(), 0, testChunks)
195+
err = cu.tickShard(ctx, cu.clk.Now(), e1.NameID(), 0, testChunks)
172196
test.AssertError(t, err, "storer error")
173197
test.AssertContains(t, err.Error(), "closing CRLStorer upload stream")
174198
test.AssertErrorIs(t, err, sentinelErr)
@@ -179,7 +203,7 @@ func TestTickShard(t *testing.T) {
179203

180204
// Errors sending to the Storer should bubble up sooner.
181205
cu.cs = &fakeCSC{ucc: fakeUCC{sendErr: sentinelErr}}
182-
err = cu.tickShard(context.Background(), cu.clk.Now(), e1.NameID(), 0, testChunks)
206+
err = cu.tickShard(ctx, cu.clk.Now(), e1.NameID(), 0, testChunks)
183207
test.AssertError(t, err, "storer error")
184208
test.AssertContains(t, err.Error(), "sending CRLStorer metadata")
185209
test.AssertErrorIs(t, err, sentinelErr)
@@ -190,7 +214,7 @@ func TestTickShard(t *testing.T) {
190214

191215
// Errors reading from the CA should bubble up sooner.
192216
cu.ca = &fakeCGC{gcc: fakeGCC{recvErr: sentinelErr}}
193-
err = cu.tickShard(context.Background(), cu.clk.Now(), e1.NameID(), 0, testChunks)
217+
err = cu.tickShard(ctx, cu.clk.Now(), e1.NameID(), 0, testChunks)
194218
test.AssertError(t, err, "CA error")
195219
test.AssertContains(t, err.Error(), "receiving CRL bytes")
196220
test.AssertErrorIs(t, err, sentinelErr)
@@ -201,7 +225,7 @@ func TestTickShard(t *testing.T) {
201225

202226
// Errors sending to the CA should bubble up sooner.
203227
cu.ca = &fakeCGC{gcc: fakeGCC{sendErr: sentinelErr}}
204-
err = cu.tickShard(context.Background(), cu.clk.Now(), e1.NameID(), 0, testChunks)
228+
err = cu.tickShard(ctx, cu.clk.Now(), e1.NameID(), 0, testChunks)
205229
test.AssertError(t, err, "CA error")
206230
test.AssertContains(t, err.Error(), "sending CA metadata")
207231
test.AssertErrorIs(t, err, sentinelErr)
@@ -212,7 +236,7 @@ func TestTickShard(t *testing.T) {
212236

213237
// Errors reading from the SA should bubble up soonest.
214238
cu.sa = &fakeSAC{grcc: fakeGRCC{err: sentinelErr}, maxNotAfter: clk.Now().Add(90 * 24 * time.Hour)}
215-
err = cu.tickShard(context.Background(), cu.clk.Now(), e1.NameID(), 0, testChunks)
239+
err = cu.tickShard(ctx, cu.clk.Now(), e1.NameID(), 0, testChunks)
216240
test.AssertError(t, err, "database error")
217241
test.AssertContains(t, err.Error(), "retrieving entry from SA")
218242
test.AssertErrorIs(t, err, sentinelErr)
@@ -237,7 +261,7 @@ func TestTickShardWithRetry(t *testing.T) {
237261
cu, err := NewUpdater(
238262
[]*issuance.Certificate{e1, r3},
239263
2, 18*time.Hour, 24*time.Hour,
240-
6*time.Hour, 1*time.Minute, 1, 1,
264+
6*time.Hour, time.Minute, time.Hour, 1, 1,
241265
&fakeSAC{grcc: fakeGRCC{err: sentinelErr}, maxNotAfter: clk.Now().Add(90 * 24 * time.Hour)},
242266
&fakeCGC{gcc: fakeGCC{}},
243267
&fakeCSC{ucc: fakeUCC{}},
@@ -283,7 +307,7 @@ func TestTickIssuer(t *testing.T) {
283307
cu, err := NewUpdater(
284308
[]*issuance.Certificate{e1, r3},
285309
2, 18*time.Hour, 24*time.Hour,
286-
6*time.Hour, 1*time.Minute, 1, 1,
310+
6*time.Hour, time.Minute, time.Hour, 1, 1,
287311
&fakeSAC{grcc: fakeGRCC{err: errors.New("db no worky")}, maxNotAfter: clk.Now().Add(90 * 24 * time.Hour)},
288312
&fakeCGC{gcc: fakeGCC{}},
289313
&fakeCSC{ucc: fakeUCC{}},
@@ -319,7 +343,7 @@ func TestTick(t *testing.T) {
319343
cu, err := NewUpdater(
320344
[]*issuance.Certificate{e1, r3},
321345
2, 18*time.Hour, 24*time.Hour,
322-
6*time.Hour, 1*time.Minute, 1, 1,
346+
6*time.Hour, time.Minute, time.Hour, 1, 1,
323347
&fakeSAC{grcc: fakeGRCC{err: errors.New("db no worky")}, maxNotAfter: clk.Now().Add(90 * 24 * time.Hour)},
324348
&fakeCGC{gcc: fakeGCC{}},
325349
&fakeCSC{ucc: fakeUCC{}},

features/featureflag_string.go

Lines changed: 3 additions & 2 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

features/features.go

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,11 @@ const (
6666
// According to the BRs Section 7.1.4.2.2(a), the commonName field is
6767
// Deprecated, and its inclusion is discouraged but not (yet) prohibited.
6868
RequireCommonName
69+
70+
// LeaseCRLShards causes the crl-updater to use the database to control which
71+
// instance of crl-updater is responsible for updating each shard. This flag
72+
// should only be enabled if the `crlShards` table exists in the database.
73+
LeaseCRLShards
6974
)
7075

7176
// List of features and their default value, protected by fMu
@@ -86,6 +91,7 @@ var features = map[FeatureFlag]bool{
8691
CertCheckerRequiresValidations: false,
8792
AsyncFinalize: false,
8893
RequireCommonName: true,
94+
LeaseCRLShards: false,
8995

9096
StoreLintingCertificateInsteadOfPrecertificate: false,
9197
}

0 commit comments

Comments
 (0)