Skip to content

Commit 1bedd25

Browse files
craig[bot]jbowensjeffswenson
committed
151795: go.mod: bump Pebble to 9be04468f598 r=RaduBerinde a=jbowens Changes: * [`9be04468`](cockroachdb/pebble@9be04468) db: add more logs to TestCompactionCorruption * [`e52a9b1d`](cockroachdb/pebble@e52a9b1d) wal: hint sequential reads when reading WAL files * [`43d95d5d`](cockroachdb/pebble@43d95d5d) internal/testutils: move Logger * [`71cce7bd`](cockroachdb/pebble@71cce7bd) db: take ScanInternal options through an options struct * [`ab9ffc9d`](cockroachdb/pebble@ab9ffc9d) block,cache: trace and stats for wait duration (for concurrent reads) * [`1f091ae1`](cockroachdb/pebble@1f091ae1) blockkind: remove the Index Kind * [`0c927430`](cockroachdb/pebble@0c927430) block: don't obtain decompressor twice * [`f4ef14f2`](cockroachdb/pebble@f4ef14f2) tool: show compression stats in sstable properties * [`222c46bb`](cockroachdb/pebble@222c46bb) db: propagate *PhysicalBlobFile to reportCorruption * [`bbd7e034`](cockroachdb/pebble@bbd7e034) base: add DiskFile interface * [`cbfea02d`](cockroachdb/pebble@cbfea02d) base: move BlobReferenceID, BlobFileMapping * [`436c169a`](cockroachdb/pebble@436c169a) db: remove EnableColumnarBlocks option * [`453a7219`](cockroachdb/pebble@453a7219) arenaskl,batchskl: add test ensuring node contains no pointer types Release note: none. Epic: none. 151817: amazon: add settings to control retry behavior r=jeffswenson a=jeffswenson This change adds two settings to control the S3 client retry behavior: 1. `cloudstorage.s3.max_retries`: this replaces a constant that was hardcoded to 10. It controls how many times the s3 client will retry an error. 2. `cloudstorage.s3.enable_client_retry_token_bucket`: this disables the client retry token bucket. The token bucket has no time based refill, and can only refill when new operations are started, so it can get stuck in an empty state with fixed concurrency operations like backup and restore. There is also a minor improvement to verbose logging in the S3 client: * --vmodule=s3_storage=1 will enable retry logging and deprecation warnings. * --vmodule=s3_storage=2 logs requests and responses. * --vmodule=s3_storage=3 logs headers+bodies of messages. Informs: #151748 Release note: Tunes S3 client retry behavior to be more reliable in the presence of correlated errors. Co-authored-by: Jackson Owens <[email protected]> Co-authored-by: Jeff Swenson <[email protected]>
3 parents 54bcb71 + a44397a + 43c4b6a commit 1bedd25

File tree

11 files changed

+101
-40
lines changed

11 files changed

+101
-40
lines changed

DEPS.bzl

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1850,10 +1850,10 @@ def go_deps():
18501850
patches = [
18511851
"@com_github_cockroachdb_cockroach//build/patches:com_github_cockroachdb_pebble.patch",
18521852
],
1853-
sha256 = "e47b5d6c32ebd359e64d4332866e2935861a82cff227ccfe184741384955b199",
1854-
strip_prefix = "github.com/cockroachdb/[email protected]20250806123517-258531bdf87e",
1853+
sha256 = "668f4a5ade4ee097a7f6f504672e202032329b11f1d0191361d75223922b1986",
1854+
strip_prefix = "github.com/cockroachdb/[email protected]20250812190039-9be04468f598",
18551855
urls = [
1856-
"https://storage.googleapis.com/cockroach-godeps/gomod/github.com/cockroachdb/pebble/com_github_cockroachdb_pebble-v0.0.0-20250806123517-258531bdf87e.zip",
1856+
"https://storage.googleapis.com/cockroach-godeps/gomod/github.com/cockroachdb/pebble/com_github_cockroachdb_pebble-v0.0.0-20250812190039-9be04468f598.zip",
18571857
],
18581858
)
18591859
go_repository(

build/bazelutil/distdir_files.bzl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -359,7 +359,7 @@ DISTDIR_FILES = {
359359
"https://storage.googleapis.com/cockroach-godeps/gomod/github.com/cockroachdb/gostdlib/com_github_cockroachdb_gostdlib-v1.19.0.zip": "c4d516bcfe8c07b6fc09b8a9a07a95065b36c2855627cb3514e40c98f872b69e",
360360
"https://storage.googleapis.com/cockroach-godeps/gomod/github.com/cockroachdb/logtags/com_github_cockroachdb_logtags-v0.0.0-20241215232642-bb51bb14a506.zip": "920068af09e3846d9ebb4e4a7787ff1dd10f3989c5f940ad861b0f6a9f824f6e",
361361
"https://storage.googleapis.com/cockroach-godeps/gomod/github.com/cockroachdb/metamorphic/com_github_cockroachdb_metamorphic-v0.0.0-20231108215700-4ba948b56895.zip": "28c8cf42192951b69378cf537be5a9a43f2aeb35542908cc4fe5f689505853ea",
362-
"https://storage.googleapis.com/cockroach-godeps/gomod/github.com/cockroachdb/pebble/com_github_cockroachdb_pebble-v0.0.0-20250806123517-258531bdf87e.zip": "e47b5d6c32ebd359e64d4332866e2935861a82cff227ccfe184741384955b199",
362+
"https://storage.googleapis.com/cockroach-godeps/gomod/github.com/cockroachdb/pebble/com_github_cockroachdb_pebble-v0.0.0-20250812190039-9be04468f598.zip": "668f4a5ade4ee097a7f6f504672e202032329b11f1d0191361d75223922b1986",
363363
"https://storage.googleapis.com/cockroach-godeps/gomod/github.com/cockroachdb/redact/com_github_cockroachdb_redact-v1.1.6.zip": "018eccb5fb9ca52d43ec9eaf213539d01c1f2b94e0e822406ebfb2e9321ef6cf",
364364
"https://storage.googleapis.com/cockroach-godeps/gomod/github.com/cockroachdb/returncheck/com_github_cockroachdb_returncheck-v0.0.0-20200612231554-92cdbca611dd.zip": "ce92ba4352deec995b1f2eecf16eba7f5d51f5aa245a1c362dfe24c83d31f82b",
365365
"https://storage.googleapis.com/cockroach-godeps/gomod/github.com/cockroachdb/stress/com_github_cockroachdb_stress-v0.0.0-20220803192808-1806698b1b7b.zip": "3fda531795c600daf25532a4f98be2a1335cd1e5e182c72789bca79f5f69fcc1",

go.mod

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -136,7 +136,7 @@ require (
136136
github.com/cockroachdb/errors v1.12.0
137137
github.com/cockroachdb/gostdlib v1.19.0
138138
github.com/cockroachdb/logtags v0.0.0-20241215232642-bb51bb14a506
139-
github.com/cockroachdb/pebble v0.0.0-20250806123517-258531bdf87e
139+
github.com/cockroachdb/pebble v0.0.0-20250812190039-9be04468f598
140140
github.com/cockroachdb/redact v1.1.6
141141
github.com/cockroachdb/returncheck v0.0.0-20200612231554-92cdbca611dd
142142
github.com/cockroachdb/tokenbucket v0.0.0-20250429170803-42689b6311bb

go.sum

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -583,8 +583,8 @@ github.com/cockroachdb/logtags v0.0.0-20241215232642-bb51bb14a506 h1:ASDL+UJcILM
583583
github.com/cockroachdb/logtags v0.0.0-20241215232642-bb51bb14a506/go.mod h1:Mw7HqKr2kdtu6aYGn3tPmAftiP3QPX63LdK/zcariIo=
584584
github.com/cockroachdb/metamorphic v0.0.0-20231108215700-4ba948b56895 h1:XANOgPYtvELQ/h4IrmPAohXqe2pWA8Bwhejr3VQoZsA=
585585
github.com/cockroachdb/metamorphic v0.0.0-20231108215700-4ba948b56895/go.mod h1:aPd7gM9ov9M8v32Yy5NJrDyOcD8z642dqs+F0CeNXfA=
586-
github.com/cockroachdb/pebble v0.0.0-20250806123517-258531bdf87e h1:MqHNboOEEG68bOzAvh+crJX7j9CmkvyTiGWtTphR0Eg=
587-
github.com/cockroachdb/pebble v0.0.0-20250806123517-258531bdf87e/go.mod h1:86lLSKhilQEdaYPIVAG2mIlDGhxlKX1CUkr4Z09nCzA=
586+
github.com/cockroachdb/pebble v0.0.0-20250812190039-9be04468f598 h1:38IyfrdbyCapHZZS2A6uCBROisJ3Cy0IirleW955AD0=
587+
github.com/cockroachdb/pebble v0.0.0-20250812190039-9be04468f598/go.mod h1:86lLSKhilQEdaYPIVAG2mIlDGhxlKX1CUkr4Z09nCzA=
588588
github.com/cockroachdb/redact v1.1.3/go.mod h1:BVNblN9mBWFyMyqK1k3AAiSxhvhfK2oOZZ2lK+dpvRg=
589589
github.com/cockroachdb/redact v1.1.6 h1:zXJBwDZ84xJNlHl1rMyCojqyIxv+7YUpQiJLQ7n4314=
590590
github.com/cockroachdb/redact v1.1.6/go.mod h1:BVNblN9mBWFyMyqK1k3AAiSxhvhfK2oOZZ2lK+dpvRg=

pkg/cloud/amazon/BUILD.bazel

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ go_library(
2828
"//pkg/util/timeutil",
2929
"//pkg/util/tracing",
3030
"@com_github_aws_aws_sdk_go_v2//aws",
31+
"@com_github_aws_aws_sdk_go_v2//aws/ratelimit",
3132
"@com_github_aws_aws_sdk_go_v2//aws/retry",
3233
"@com_github_aws_aws_sdk_go_v2//aws/transport/http",
3334
"@com_github_aws_aws_sdk_go_v2_config//:config",

pkg/cloud/amazon/aws_kms.go

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,6 @@ import (
1919
"github.com/aws/aws-sdk-go-v2/service/sts"
2020
"github.com/cockroachdb/cockroach/pkg/cloud"
2121
"github.com/cockroachdb/cockroach/pkg/settings"
22-
"github.com/cockroachdb/cockroach/pkg/util/log"
2322
"github.com/cockroachdb/cockroach/pkg/util/metamorphic"
2423
"github.com/cockroachdb/cockroach/pkg/util/syncutil"
2524
"github.com/cockroachdb/errors"
@@ -51,7 +50,7 @@ type kmsURIParams struct {
5150
auth string
5251
roleProvider roleProvider
5352
delegateRoleProviders []roleProvider
54-
verbose bool
53+
logMode aws.ClientLogMode
5554
}
5655

5756
var reuseKMSSession = settings.RegisterBoolSetting(
@@ -85,7 +84,7 @@ func resolveKMSURIParams(kmsURI cloud.ConsumeURL) (kmsURIParams, error) {
8584
auth: kmsURI.ConsumeParam(cloud.AuthParam),
8685
roleProvider: assumeRoleProvider,
8786
delegateRoleProviders: delegateProviders,
88-
verbose: log.V(2),
87+
logMode: getLogMode(),
8988
}
9089

9190
// Validate that all the passed in parameters are supported.
@@ -134,8 +133,8 @@ func MakeAWSKMS(ctx context.Context, uri string, env cloud.KMSEnv) (cloud.KMS, e
134133
loadOptions = append(loadOptions, option)
135134
}
136135
addLoadOption(config.WithLogger(newLogAdapter(ctx)))
137-
if kmsURIParams.verbose {
138-
addLoadOption(config.WithClientLogMode(awsVerboseLogging))
136+
if kmsURIParams.logMode != 0 {
137+
addLoadOption(config.WithClientLogMode(kmsURIParams.logMode))
139138
}
140139

141140
var endpointURI string

pkg/cloud/amazon/s3_storage.go

Lines changed: 60 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ import (
1818
"time"
1919

2020
"github.com/aws/aws-sdk-go-v2/aws"
21+
"github.com/aws/aws-sdk-go-v2/aws/ratelimit"
2122
"github.com/aws/aws-sdk-go-v2/aws/retry"
2223
awshttp "github.com/aws/aws-sdk-go-v2/aws/transport/http"
2324
"github.com/aws/aws-sdk-go-v2/config"
@@ -89,9 +90,6 @@ const (
8990
scheme = "s3"
9091

9192
checksumAlgorithm = types.ChecksumAlgorithmSha256
92-
93-
// TODO(yevgeniy): Revisit retry logic. Retrying 10 times seems arbitrary.
94-
defaultRetryMaxAttempts = 10
9593
)
9694

9795
// NightlyEnvVarS3Params maps param keys that get added to an S3
@@ -125,16 +123,6 @@ type s3Storage struct {
125123
cached *s3Client
126124
}
127125

128-
// retryMaxAttempts defines how many times we will retry a
129-
// S3 request on a retriable error.
130-
var retryMaxAttempts = defaultRetryMaxAttempts
131-
132-
// InjectTestingRetryMaxAttempts is used to change the
133-
// default retries for tests that need quick fail.
134-
func InjectTestingRetryMaxAttempts(maxAttempts int) {
135-
retryMaxAttempts = maxAttempts
136-
}
137-
138126
// customRetryer implements the `request.Retryer` interface and allows for
139127
// customization of the retry behaviour of an AWS client.
140128
type customRetryer struct{}
@@ -186,6 +174,39 @@ var usePutObject = settings.RegisterBoolSetting(
186174
false,
187175
)
188176

177+
var maxRetries = settings.RegisterIntSetting(
178+
settings.ApplicationLevel,
179+
"cloudstorage.s3.max_retries",
180+
"the maximum number of retries per S3 operation",
181+
10)
182+
183+
// The v2 S3 client includes a client side retry token bucket. The high level
184+
// behavior of the token bucket is:
185+
//
186+
// 1. The token bucket starts full with 500 tokens.
187+
// 2. Each request that completes on the first attempt adds 1 token to the bucket.
188+
// 3. Each failed retry consumes 5 tokens from the bucket.
189+
//
190+
// When the token bucket runs out, the only way to make forward progress is to
191+
// start a new request that succeeds on the first attempt. This is sensible for
192+
// an RPC service that can bubble up a retryable error to the RPC client, but
193+
// it doesn't make sense in the context of something like backup/restrore,
194+
// where we have a fixed number of workers that are sending requests until they
195+
// complete all of their work. Since there are no client side requests to
196+
// refill the token bucket, a handful of errors can permanently exhaust the
197+
// token bucket.
198+
//
199+
// TODO(jeffswenson): consider deleting this after we've had time to evaluate
200+
// it in production. This setting mostly exists so we can keep it turned on by
201+
// default in the backport.
202+
var enableClientRetryTokenBucket = settings.RegisterBoolSetting(
203+
settings.ApplicationLevel,
204+
"cloudstorage.s3.client_retry_token_bucket.enabled",
205+
"enable the client side retry token bucket in the AWS S3 client",
206+
// TODO(jeffswenson): change this to false in a seperate PR. This is false in
207+
// the backports to stay true to the backport policy.
208+
true)
209+
189210
// roleProvider contains fields about the role that needs to be assumed
190211
// in order to access the external storage.
191212
type roleProvider struct {
@@ -219,8 +240,20 @@ type s3ClientConfig struct {
219240

220241
skipChecksum bool
221242
skipTLSVerify bool
222-
// log.V(2) decides session init params so include it in key.
223-
verbose bool
243+
logMode aws.ClientLogMode
244+
}
245+
246+
func getLogMode() aws.ClientLogMode {
247+
switch {
248+
case log.VDepth(3, 1):
249+
return awsVLevel3Logging
250+
case log.VDepth(2, 1):
251+
return awsVLevel2Logging
252+
case log.VDepth(1, 1):
253+
return awsVLevel1Logging
254+
default:
255+
return 0
256+
}
224257
}
225258

226259
func clientConfig(conf *cloudpb.ExternalStorage_S3) s3ClientConfig {
@@ -259,7 +292,7 @@ func clientConfig(conf *cloudpb.ExternalStorage_S3) s3ClientConfig {
259292
secret: conf.Secret,
260293
tempToken: conf.TempToken,
261294
auth: conf.Auth,
262-
verbose: log.V(2),
295+
logMode: getLogMode(),
263296
assumeRoleProvider: assumeRoleProvider,
264297
delegateRoleProviders: delegateRoleProviders,
265298
}
@@ -560,7 +593,9 @@ func newLogAdapter(ctx context.Context) *awsLogAdapter {
560593
}
561594
}
562595

563-
var awsVerboseLogging = aws.LogRequestEventMessage | aws.LogResponseEventMessage | aws.LogRetries | aws.LogSigning
596+
const awsVLevel1Logging = aws.LogRetries | aws.LogDeprecatedUsage
597+
const awsVLevel2Logging = awsVLevel1Logging | aws.LogRequestEventMessage | aws.LogResponseEventMessage | aws.LogRequest | aws.LogResponse
598+
const awsVLevel3Logging = awsVLevel2Logging | aws.LogSigning
564599

565600
func constructEndpointURI(endpoint string) (string, error) {
566601
parsedURL, err := url.Parse(endpoint)
@@ -586,6 +621,8 @@ func constructEndpointURI(endpoint string) (string, error) {
586621
// configures the client with it as well as returning it (so the caller can
587622
// remember it for future calls).
588623
func (s *s3Storage) newClient(ctx context.Context) (s3Client, string, error) {
624+
// TODO(jeffswenson): we should include the settings in the cache key so that
625+
// changing the cluster settings invalidates the cached instance.
589626

590627
// Open a span if client creation will do IO/RPCs to find creds/bucket region.
591628
if s.opts.region == "" || s.opts.auth == cloud.AuthParamImplicit {
@@ -611,16 +648,20 @@ func (s *s3Storage) newClient(ctx context.Context) (s3Client, string, error) {
611648
}
612649
addLoadOption(config.WithHTTPClient(client))
613650

651+
retryMaxAttempts := int(maxRetries.Get(&s.settings.SV))
614652
addLoadOption(config.WithRetryMaxAttempts(retryMaxAttempts))
615653

616654
addLoadOption(config.WithLogger(newLogAdapter(ctx)))
617-
if s.opts.verbose {
618-
addLoadOption(config.WithClientLogMode(awsVerboseLogging))
655+
if s.opts.logMode != 0 {
656+
addLoadOption(config.WithClientLogMode(s.opts.logMode))
619657
}
620658
config.WithRetryer(func() aws.Retryer {
621659
return retry.NewStandard(func(opts *retry.StandardOptions) {
622660
opts.MaxAttempts = retryMaxAttempts
623661
opts.Retryables = append(opts.Retryables, &customRetryer{})
662+
if !enableClientRetryTokenBucket.Get(&s.settings.SV) {
663+
opts.RateLimiter = ratelimit.None
664+
}
624665
})
625666
})
626667

pkg/cloud/amazon/s3_storage_test.go

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -476,7 +476,8 @@ func TestPutS3Endpoint(t *testing.T) {
476476
testSettings := cluster.MakeTestingClusterSettings()
477477
clientFactory := blobs.TestBlobServiceClient("")
478478
// Force to fail quickly.
479-
InjectTestingRetryMaxAttempts(1)
479+
480+
maxRetries.Override(ctx, &testSettings.SV, 1)
480481
storage, err := cloud.MakeExternalStorage(ctx, conf, ioConf, testSettings, clientFactory,
481482
nil, nil, cloud.NilMetrics)
482483
if err != nil {

pkg/cmd/roachtest/tests/backup_fixtures.go

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -188,7 +188,9 @@ func (bd *backupDriver) prepareCluster(ctx context.Context) {
188188
// there is a snapshot backlog, which makes the snapshot backlog worse
189189
// because add sst causes ranges to fall behind and need recovery snapshots
190190
// to catch up.
191-
"kv.snapshot_rebalance.max_rate": "256 MiB",
191+
"kv.snapshot_rebalance.max_rate": "256 MiB",
192+
"server.debug.default_vmodule": "s3_storage=2",
193+
"cloudstorage.s3.enable_client_retry_token_bucket": "false",
192194
},
193195
install.EnvOption{
194196
fmt.Sprintf("COCKROACH_AZURE_APPLICATION_CREDENTIALS_FILE=%s", azureCredentialsFilePath),

pkg/storage/BUILD.bazel

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -104,7 +104,6 @@ go_library(
104104
"@com_github_cockroachdb_pebble//rangekey",
105105
"@com_github_cockroachdb_pebble//replay",
106106
"@com_github_cockroachdb_pebble//sstable",
107-
"@com_github_cockroachdb_pebble//sstable/block",
108107
"@com_github_cockroachdb_pebble//vfs",
109108
"@com_github_cockroachdb_pebble//wal",
110109
"@com_github_cockroachdb_redact//:redact",

0 commit comments

Comments
 (0)