Skip to content

Commit a79a830

Browse files
kruti-s and pgporada authored
ratelimits: Auto pause zombie clients (#7763)
- Added a new key-value ratelimit `FailedAuthorizationsForPausingPerDomainPerAccount` which is incremented each time a client fails a validation. - As long as capacity exists in the bucket, a successful validation attempt will reset the bucket back to full capacity. - Upon exhausting bucket capacity, the RA will send a gRPC to the SA to pause the `account:identifier`. Further validation attempts will be rejected by the [WFE](#7599). - Added a new feature flag, `AutomaticallyPauseZombieClients`, which enables automatic pausing of zombie clients in the RA. - Added a new RA metric `paused_pairs{"paused":[bool], "repaused":[bool], "grace":[bool]}` to monitor use of this new functionality. - Updated `ra_test.go` `initAuthorities` to allow accessing the `*ratelimits.RedisSource` for checking that the new ratelimit functions as intended. Co-authored-by: @pgporada Fixes #7738 --------- Co-authored-by: Phil Porada <[email protected]> Co-authored-by: Phil Porada <[email protected]>
1 parent 2058d98 commit a79a830

14 files changed

+598
-84
lines changed

features/features.go

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -119,6 +119,13 @@ type Config struct {
119119
// unique "INSERT ... RETURNING" functionality.
120120
InsertAuthzsIndividually bool
121121

122+
// AutomaticallyPauseZombieClients configures the RA to use the rate
123+
// limiter to track failed validations per accountID:identifier pair,
124+
// automatically pausing clients who systematically fail every validation
125+
// attempt. When disabled, only manually paused accountID:identifier pairs
126+
// will be rejected.
127+
AutomaticallyPauseZombieClients bool
128+
122129
// IncrementRateLimits uses Redis' IncrBy, instead of Set, for rate limit
123130
// accounting. This catches and denies spikes of requests much more
124131
// reliably.

ra/ra.go

Lines changed: 66 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -122,6 +122,7 @@ type RegistrationAuthorityImpl struct {
122122
orderAges *prometheus.HistogramVec
123123
inflightFinalizes prometheus.Gauge
124124
certCSRMismatch prometheus.Counter
125+
pauseCounter *prometheus.CounterVec
125126
}
126127

127128
var _ rapb.RegistrationAuthorityServer = (*RegistrationAuthorityImpl)(nil)
@@ -241,6 +242,12 @@ func NewRegistrationAuthorityImpl(
241242
})
242243
stats.MustRegister(certCSRMismatch)
243244

245+
pauseCounter := prometheus.NewCounterVec(prometheus.CounterOpts{
246+
Name: "paused_pairs",
247+
Help: "Number of times a pause operation is performed, labeled by paused=[bool], repaused=[bool], grace=[bool]",
248+
}, []string{"paused", "repaused", "grace"})
249+
stats.MustRegister(pauseCounter)
250+
244251
issuersByNameID := make(map[issuance.NameID]*issuance.Certificate)
245252
for _, issuer := range issuers {
246253
issuersByNameID[issuer.NameID()] = issuer
@@ -276,6 +283,7 @@ func NewRegistrationAuthorityImpl(
276283
orderAges: orderAges,
277284
inflightFinalizes: inflightFinalizes,
278285
certCSRMismatch: certCSRMismatch,
286+
pauseCounter: pauseCounter,
279287
}
280288
return ra
281289
}
@@ -1810,15 +1818,17 @@ func (ra *RegistrationAuthorityImpl) recordValidation(ctx context.Context, authI
18101818
}
18111819

18121820
// countFailedValidation increments the failed authorizations per domain per
1813-
// account rate limit. There is no reason to surface errors from this function
1814-
// to the Subscriber, spends against this limit are best effort.
1815-
func (ra *RegistrationAuthorityImpl) countFailedValidation(ctx context.Context, regId int64, name string) {
1821+
// account rate limit. If the AutomaticallyPauseZombieClients feature has been
1822+
// enabled, it also increments the failed authorizations for pausing per domain
1823+
// per account rate limit. There is no reason to surface errors from this
1824+
// function to the Subscriber, spends against this limit are best effort.
1825+
func (ra *RegistrationAuthorityImpl) countFailedValidation(ctx context.Context, regId int64, ident identifier.ACMEIdentifier) {
18161826
if ra.limiter == nil || ra.txnBuilder == nil {
18171827
// Limiter is disabled.
18181828
return
18191829
}
18201830

1821-
txn, err := ra.txnBuilder.FailedAuthorizationsPerDomainPerAccountSpendOnlyTransaction(regId, name)
1831+
txn, err := ra.txnBuilder.FailedAuthorizationsPerDomainPerAccountSpendOnlyTransaction(regId, ident.Value)
18221832
if err != nil {
18231833
ra.log.Warningf("building rate limit transaction for the %s rate limit: %s", ratelimits.FailedAuthorizationsPerDomainPerAccount, err)
18241834
}
@@ -1830,6 +1840,54 @@ func (ra *RegistrationAuthorityImpl) countFailedValidation(ctx context.Context,
18301840
}
18311841
ra.log.Warningf("spending against the %s rate limit: %s", ratelimits.FailedAuthorizationsPerDomainPerAccount, err)
18321842
}
1843+
1844+
if features.Get().AutomaticallyPauseZombieClients {
1845+
txn, err = ra.txnBuilder.FailedAuthorizationsForPausingPerDomainPerAccountTransaction(regId, ident.Value)
1846+
if err != nil {
1847+
ra.log.Warningf("building rate limit transaction for the %s rate limit: %s", ratelimits.FailedAuthorizationsForPausingPerDomainPerAccount, err)
1848+
}
1849+
1850+
decision, err := ra.limiter.Spend(ctx, txn)
1851+
if err != nil {
1852+
if errors.Is(err, context.Canceled) || errors.Is(err, context.DeadlineExceeded) {
1853+
return
1854+
}
1855+
ra.log.Warningf("spending against the %s rate limit: %s", ratelimits.FailedAuthorizationsForPausingPerDomainPerAccount, err)
1856+
}
1857+
1858+
if decision.Result(ra.clk.Now()) != nil {
1859+
resp, err := ra.SA.PauseIdentifiers(ctx, &sapb.PauseRequest{
1860+
RegistrationID: regId,
1861+
Identifiers: []*corepb.Identifier{
1862+
{
1863+
Type: string(ident.Type),
1864+
Value: ident.Value,
1865+
},
1866+
},
1867+
})
1868+
if err != nil {
1869+
ra.log.Warningf("failed to pause %d/%q: %s", regId, ident.Value, err)
1870+
}
1871+
ra.pauseCounter.With(prometheus.Labels{
1872+
"paused": strconv.FormatBool(resp.Paused > 0),
1873+
"repaused": strconv.FormatBool(resp.Repaused > 0),
1874+
"grace": strconv.FormatBool(resp.Paused <= 0 && resp.Repaused <= 0),
1875+
}).Inc()
1876+
}
1877+
}
1878+
}
1879+
1880+
// resetAccountPausingLimit resets bucket to maximum capacity for given account.
1881+
// There is no reason to surface errors from this function to the Subscriber.
1882+
func (ra *RegistrationAuthorityImpl) resetAccountPausingLimit(ctx context.Context, regId int64, ident identifier.ACMEIdentifier) {
1883+
bucketKey, err := ratelimits.NewRegIdDomainBucketKey(ratelimits.FailedAuthorizationsForPausingPerDomainPerAccount, regId, ident.Value)
1884+
if err != nil {
1885+
ra.log.Warningf("creating bucket key for regID=[%d] identifier=[%s]: %s", regId, ident.Value, err)
1886+
}
1887+
err = ra.limiter.Reset(ctx, bucketKey)
1888+
if err != nil {
1889+
ra.log.Warningf("resetting bucket for regID=[%d] identifier=[%s]: %s", regId, ident.Value, err)
1890+
}
18331891
}
18341892

18351893
// PerformValidation initiates validation for a specific challenge associated
@@ -1953,9 +2011,12 @@ func (ra *RegistrationAuthorityImpl) PerformValidation(
19532011
if prob != nil {
19542012
challenge.Status = core.StatusInvalid
19552013
challenge.Error = prob
1956-
go ra.countFailedValidation(vaCtx, authz.RegistrationID, authz.Identifier.Value)
2014+
go ra.countFailedValidation(vaCtx, authz.RegistrationID, authz.Identifier)
19572015
} else {
19582016
challenge.Status = core.StatusValid
2017+
if features.Get().AutomaticallyPauseZombieClients {
2018+
ra.resetAccountPausingLimit(vaCtx, authz.RegistrationID, authz.Identifier)
2019+
}
19592020
}
19602021
challenge.Validated = &vStart
19612022
authz.Challenges[challIndex] = *challenge

0 commit comments

Comments
 (0)