Skip to content

Commit 38c0cd9

Browse files
craig[bot]jbowensjeffswensonalyshanjahani-crlcockroach-teamcity
committed
152299: storage: avoid eager MVCC scanner value retrieval r=sumeerbhola a=jbowens Previously pebbleMVCCScanner.getOne eagerly retrieved and decoded values of KVs it considered. If the scanner doesn't need to check for uncertainty, this is unnecessary. Additionally, future work may avoid the value retrieval even when performing uncertainty checks if the value is known to not encode a local timestamp. Epic: none Informs #144715. Release note: none 152524: cloud: add fault testing to S3 client r=jeffswenson a=jeffswenson This adds a cloud nemisis test suite and uses it to ensure the S3 client is tolerant of 30 second brown outs. This test is designed to be a general purpose fault injection test and it is confirmed to reliably reproduce the token bucket exhaustion errors that are caused by the S3 client. Release note: none Informs: #151748 152613: contention: Remove unlimited retries in the contention resolver r=alyshanjahani-crl a=alyshanjahani-crl Unlimited retries in the resolver for in progress transactions results in contention events for aborted transactions getting stuck in the resolver until the aborted transaction is removed from the txn id cache on the respective gateway node. If the gateway node is getting 100 txns per second, that roughly translates to 7 hours. If the transaction got aborted, we'd prefer to have the partially resolved contention event available in a timely manner. This commit changes unlimited retries for in progress transactions to 60 retries (30 minutes). Fixes: #139258 Release note: None 152620: release: released CockroachDB version v25.4.0-alpha.0. Next version: v25.4.0-alpha.1 r=celiala a=cockroach-teamcity Release note: None Epic: None Release justification: non-production (release infra) change. Co-authored-by: Jackson Owens <[email protected]> Co-authored-by: Jeff Swenson <[email protected]> Co-authored-by: Alyshan Jahani <[email protected]> Co-authored-by: Justin Beaver <[email protected]>
5 parents 8ab959a + f214915 + 84ad44f + 968b528 + 5cdaaf4 commit 38c0cd9

File tree

13 files changed

+461
-60
lines changed

13 files changed

+461
-60
lines changed

pkg/build/version.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
v25.4.0-alpha.00000000
1+
v25.4.0-alpha.1

pkg/cloud/amazon/BUILD.bazel

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,7 @@ go_test(
6767
"//pkg/testutils",
6868
"//pkg/testutils/skip",
6969
"//pkg/util/leaktest",
70+
"//pkg/util/log",
7071
"//pkg/util/uuid",
7172
"@com_github_aws_aws_sdk_go//aws/awserr",
7273
"@com_github_aws_aws_sdk_go//aws/request",

pkg/cloud/amazon/s3_storage.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -114,6 +114,7 @@ type s3Storage struct {
114114
bucket *string
115115
conf *cloudpb.ExternalStorage_S3
116116
ioConf base.ExternalIODirConfig
117+
middleware cloud.HttpMiddleware
117118
settings *cluster.Settings
118119
prefix string
119120
metrics *cloud.Metrics
@@ -514,6 +515,7 @@ func MakeS3Storage(
514515
bucket: aws.String(conf.Bucket),
515516
conf: conf,
516517
ioConf: args.IOConf,
518+
middleware: args.HttpMiddleware,
517519
prefix: conf.Prefix,
518520
metrics: args.MetricsRecorder,
519521
settings: args.Settings,
@@ -611,6 +613,7 @@ func (s *s3Storage) newClient(ctx context.Context) (s3Client, string, error) {
611613
Client: s.storageOptions.ClientName,
612614
Cloud: "aws",
613615
InsecureSkipVerify: s.opts.skipTLSVerify,
616+
HttpMiddleware: s.middleware,
614617
})
615618
if err != nil {
616619
return s3Client{}, "", err

pkg/cloud/amazon/s3_storage_test.go

Lines changed: 48 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ import (
1414
"os"
1515
"strings"
1616
"testing"
17+
"time"
1718

1819
"github.com/aws/aws-sdk-go-v2/config"
1920
"github.com/aws/aws-sdk-go-v2/service/s3/types"
@@ -30,32 +31,34 @@ import (
3031
"github.com/cockroachdb/cockroach/pkg/testutils"
3132
"github.com/cockroachdb/cockroach/pkg/testutils/skip"
3233
"github.com/cockroachdb/cockroach/pkg/util/leaktest"
34+
"github.com/cockroachdb/cockroach/pkg/util/log"
3335
"github.com/cockroachdb/cockroach/pkg/util/uuid"
3436
"github.com/cockroachdb/errors"
3537
"github.com/stretchr/testify/require"
3638
)
3739

3840
func makeS3Storage(
39-
ctx context.Context, uri string, user username.SQLUsername,
41+
ctx context.Context, uri string, user username.SQLUsername, middleware cloud.HttpMiddleware,
4042
) (cloud.ExternalStorage, error) {
4143
conf, err := cloud.ExternalStorageConfFromURI(uri, user)
4244
if err != nil {
4345
return nil, err
4446
}
4547
testSettings := cluster.MakeTestingClusterSettings()
4648

47-
// Setup a sink for the given args.
48-
s, err := cloud.MakeExternalStorage(ctx, conf, base.ExternalIODirConfig{}, testSettings,
49-
blobs.TestEmptyBlobClientFactory,
50-
nil, /* db */
51-
nil, /* limiters */
52-
cloud.NilMetrics,
53-
)
54-
if err != nil {
55-
return nil, err
49+
// TODO(jeffswenson): remove this once we change the default to false.
50+
enableClientRetryTokenBucket.Override(ctx, &testSettings.SV, false)
51+
52+
args := cloud.EarlyBootExternalStorageContext{
53+
IOConf: base.ExternalIODirConfig{},
54+
Settings: testSettings,
55+
Options: nil,
56+
Limiters: nil,
57+
MetricsRecorder: cloud.NilMetrics,
58+
HttpMiddleware: middleware,
5659
}
5760

58-
return s, nil
61+
return MakeS3Storage(ctx, args, conf)
5962
}
6063

6164
// You can create an IAM that can access S3 in the AWS console, then
@@ -70,6 +73,7 @@ func skipIfNoDefaultConfig(t *testing.T, ctx context.Context) {
7073
require.NoError(t, err)
7174
_, err = cfg.Credentials.Retrieve(ctx)
7275
if err != nil {
76+
t.Logf("config: %+v", cfg)
7377
skip.IgnoreLintf(t, "%s: %s", helpMsg, err)
7478
}
7579
}
@@ -192,7 +196,7 @@ func TestPutS3(t *testing.T) {
192196
cloud.AuthParam, cloud.AuthParamImplicit, AWSServerSideEncryptionMode,
193197
"unsupported-algorithm")
194198

195-
_, err = makeS3Storage(ctx, invalidSSEModeURI, user)
199+
_, err = makeS3Storage(ctx, invalidSSEModeURI, user, nil)
196200
require.True(t, testutils.IsError(err, "unsupported server encryption mode unsupported-algorithm. Supported values are `aws:kms` and `AES256"))
197201

198202
// Specify aws:kms encryption mode but don't specify kms ID.
@@ -201,13 +205,44 @@ func TestPutS3(t *testing.T) {
201205
bucket, "backup-test-sse-256",
202206
cloud.AuthParam, cloud.AuthParamImplicit, AWSServerSideEncryptionMode,
203207
"aws:kms")
204-
_, err = makeS3Storage(ctx, invalidKMSURI, user)
208+
_, err = makeS3Storage(ctx, invalidKMSURI, user, nil)
205209
require.True(t, testutils.IsError(err, "AWS_SERVER_KMS_ID param must be set when using aws:kms server side encryption mode."))
206210
})
207211
})
208212
}
209213
}
210214

215+
func TestS3FaultInjection(t *testing.T) {
216+
defer leaktest.AfterTest(t)()
217+
218+
ctx := context.Background()
219+
skipIfNoDefaultConfig(t, ctx)
220+
221+
baseBucket := os.Getenv("AWS_S3_BUCKET")
222+
if baseBucket == "" {
223+
skip.IgnoreLint(t, "AWS_S3_BUCKET env var must be set")
224+
}
225+
226+
// Enable cloud transport logging.
227+
defer log.Scope(t).Close(t)
228+
prevVModule := log.GetVModule()
229+
defer func() { _ = log.SetVModule(prevVModule) }()
230+
require.NoError(t, log.SetVModule("cloud_logging_transport=1"))
231+
232+
uri := fmt.Sprintf(
233+
"s3://%s/%d-fault-injection-test?AUTH=implicit",
234+
baseBucket, cloudtestutils.NewTestID(),
235+
)
236+
237+
// Inject faults for 15-45 seconds after the storage is opened.
238+
middleware := cloudtestutils.BrownoutMiddleware(time.Second*15, time.Second*45)
239+
storage, err := makeS3Storage(ctx, uri, username.RootUserName(), middleware)
240+
require.NoError(t, err)
241+
defer storage.Close()
242+
243+
cloudtestutils.RunCloudNemesisTest(t, storage)
244+
}
245+
211246
func TestPutS3AssumeRole(t *testing.T) {
212247
defer leaktest.AfterTest(t)()
213248

pkg/cloud/azure/azure_storage.go

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -223,9 +223,10 @@ func makeAzureStorage(
223223
options := args.ExternalStorageOptions()
224224
t, err := cloud.MakeHTTPClient(args.Settings, args.MetricsRecorder,
225225
cloud.HTTPClientConfig{
226-
Bucket: dest.AzureConfig.Container,
227-
Client: options.ClientName,
228-
Cloud: "azure",
226+
Bucket: dest.AzureConfig.Container,
227+
Client: options.ClientName,
228+
Cloud: "azure",
229+
HttpMiddleware: args.HttpMiddleware,
229230
})
230231
if err != nil {
231232
return nil, errors.Wrap(err, "azure: unable to create transport")

pkg/cloud/cloud_io.go

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,8 @@ type HTTPClientConfig struct {
4141
// accepts any certificate presented by the server and any host name in that
4242
// certificate. In this mode, TLS is susceptible to machine-in-the-middle attacks.
4343
InsecureSkipVerify bool
44+
45+
HttpMiddleware HttpMiddleware
4446
}
4547

4648
// Timeout is a cluster setting used for cloud storage interactions.
@@ -115,7 +117,7 @@ func MakeHTTPClientForTransport(t http.RoundTripper) (*http.Client, error) {
115117
// Prefer MakeHTTPClient where possible.
116118
func MakeTransport(
117119
settings *cluster.Settings, metrics *Metrics, config HTTPClientConfig,
118-
) (*http.Transport, error) {
120+
) (http.RoundTripper, error) {
119121
var tlsConf *tls.Config
120122
if config.InsecureSkipVerify {
121123
tlsConf = &tls.Config{InsecureSkipVerify: true}
@@ -129,7 +131,6 @@ func MakeTransport(
129131
}
130132
tlsConf = &tls.Config{RootCAs: roots}
131133
}
132-
133134
t := http.DefaultTransport.(*http.Transport).Clone()
134135

135136
// Add our custom CA.
@@ -140,7 +141,12 @@ func MakeTransport(
140141
if metrics != nil {
141142
t.DialContext = metrics.NetMetrics.Wrap(t.DialContext, config.Cloud, config.Bucket, config.Client)
142143
}
143-
return t, nil
144+
145+
var roundTripper http.RoundTripper = t
146+
if config.HttpMiddleware != nil {
147+
roundTripper = config.HttpMiddleware(roundTripper)
148+
}
149+
return roundTripper, nil
144150
}
145151

146152
// MaxDelayedRetryAttempts is the number of times the delayedRetry method will

pkg/cloud/cloudtestutils/BUILD.bazel

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,11 @@ load("@io_bazel_rules_go//go:def.bzl", "go_library")
22

33
go_library(
44
name = "cloudtestutils",
5-
srcs = ["cloud_test_helpers.go"],
5+
srcs = [
6+
"cloud_nemesis.go",
7+
"cloud_test_helpers.go",
8+
"http_middleware.go",
9+
],
610
importpath = "github.com/cockroachdb/cockroach/pkg/cloud/cloudtestutils",
711
visibility = ["//visibility:public"],
812
deps = [
@@ -16,8 +20,11 @@ go_library(
1620
"//pkg/testutils",
1721
"//pkg/util/ioctx",
1822
"//pkg/util/randutil",
23+
"//pkg/util/syncutil",
1924
"//pkg/util/sysutil",
25+
"//pkg/util/timeutil",
2026
"@com_github_cockroachdb_errors//:errors",
2127
"@com_github_stretchr_testify//require",
28+
"@org_golang_x_sync//errgroup",
2229
],
2330
)

0 commit comments

Comments
 (0)