Skip to content

Commit 84ad44f

Browse files
committed
cloud: add fault testing to S3 client
This adds a cloud nemesis test suite and uses it to ensure the S3 client is tolerant of 30 second brown outs. This test is designed to be a general purpose fault injection test and it is confirmed to reliably reproduce the token bucket exhaustion errors that are caused by the S3 client. Release note: none Informs: #151748
1 parent 579ec24 commit 84ad44f

File tree

10 files changed

+400
-23
lines changed

10 files changed

+400
-23
lines changed

pkg/cloud/amazon/BUILD.bazel

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,7 @@ go_test(
6767
"//pkg/testutils",
6868
"//pkg/testutils/skip",
6969
"//pkg/util/leaktest",
70+
"//pkg/util/log",
7071
"//pkg/util/uuid",
7172
"@com_github_aws_aws_sdk_go//aws/awserr",
7273
"@com_github_aws_aws_sdk_go//aws/request",

pkg/cloud/amazon/s3_storage.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -114,6 +114,7 @@ type s3Storage struct {
114114
bucket *string
115115
conf *cloudpb.ExternalStorage_S3
116116
ioConf base.ExternalIODirConfig
117+
middleware cloud.HttpMiddleware
117118
settings *cluster.Settings
118119
prefix string
119120
metrics *cloud.Metrics
@@ -514,6 +515,7 @@ func MakeS3Storage(
514515
bucket: aws.String(conf.Bucket),
515516
conf: conf,
516517
ioConf: args.IOConf,
518+
middleware: args.HttpMiddleware,
517519
prefix: conf.Prefix,
518520
metrics: args.MetricsRecorder,
519521
settings: args.Settings,
@@ -611,6 +613,7 @@ func (s *s3Storage) newClient(ctx context.Context) (s3Client, string, error) {
611613
Client: s.storageOptions.ClientName,
612614
Cloud: "aws",
613615
InsecureSkipVerify: s.opts.skipTLSVerify,
616+
HttpMiddleware: s.middleware,
614617
})
615618
if err != nil {
616619
return s3Client{}, "", err

pkg/cloud/amazon/s3_storage_test.go

Lines changed: 48 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ import (
1414
"os"
1515
"strings"
1616
"testing"
17+
"time"
1718

1819
"github.com/aws/aws-sdk-go-v2/config"
1920
"github.com/aws/aws-sdk-go-v2/service/s3/types"
@@ -30,32 +31,34 @@ import (
3031
"github.com/cockroachdb/cockroach/pkg/testutils"
3132
"github.com/cockroachdb/cockroach/pkg/testutils/skip"
3233
"github.com/cockroachdb/cockroach/pkg/util/leaktest"
34+
"github.com/cockroachdb/cockroach/pkg/util/log"
3335
"github.com/cockroachdb/cockroach/pkg/util/uuid"
3436
"github.com/cockroachdb/errors"
3537
"github.com/stretchr/testify/require"
3638
)
3739

3840
func makeS3Storage(
39-
ctx context.Context, uri string, user username.SQLUsername,
41+
ctx context.Context, uri string, user username.SQLUsername, middleware cloud.HttpMiddleware,
4042
) (cloud.ExternalStorage, error) {
4143
conf, err := cloud.ExternalStorageConfFromURI(uri, user)
4244
if err != nil {
4345
return nil, err
4446
}
4547
testSettings := cluster.MakeTestingClusterSettings()
4648

47-
// Setup a sink for the given args.
48-
s, err := cloud.MakeExternalStorage(ctx, conf, base.ExternalIODirConfig{}, testSettings,
49-
blobs.TestEmptyBlobClientFactory,
50-
nil, /* db */
51-
nil, /* limiters */
52-
cloud.NilMetrics,
53-
)
54-
if err != nil {
55-
return nil, err
49+
// TODO(jeffswenson): remove this once we change the default to false.
50+
enableClientRetryTokenBucket.Override(ctx, &testSettings.SV, false)
51+
52+
args := cloud.EarlyBootExternalStorageContext{
53+
IOConf: base.ExternalIODirConfig{},
54+
Settings: testSettings,
55+
Options: nil,
56+
Limiters: nil,
57+
MetricsRecorder: cloud.NilMetrics,
58+
HttpMiddleware: middleware,
5659
}
5760

58-
return s, nil
61+
return MakeS3Storage(ctx, args, conf)
5962
}
6063

6164
// You can create an IAM that can access S3 in the AWS console, then
@@ -70,6 +73,7 @@ func skipIfNoDefaultConfig(t *testing.T, ctx context.Context) {
7073
require.NoError(t, err)
7174
_, err = cfg.Credentials.Retrieve(ctx)
7275
if err != nil {
76+
t.Logf("config: %+v", cfg)
7377
skip.IgnoreLintf(t, "%s: %s", helpMsg, err)
7478
}
7579
}
@@ -192,7 +196,7 @@ func TestPutS3(t *testing.T) {
192196
cloud.AuthParam, cloud.AuthParamImplicit, AWSServerSideEncryptionMode,
193197
"unsupported-algorithm")
194198

195-
_, err = makeS3Storage(ctx, invalidSSEModeURI, user)
199+
_, err = makeS3Storage(ctx, invalidSSEModeURI, user, nil)
196200
require.True(t, testutils.IsError(err, "unsupported server encryption mode unsupported-algorithm. Supported values are `aws:kms` and `AES256"))
197201

198202
// Specify aws:kms encryption mode but don't specify kms ID.
@@ -201,13 +205,44 @@ func TestPutS3(t *testing.T) {
201205
bucket, "backup-test-sse-256",
202206
cloud.AuthParam, cloud.AuthParamImplicit, AWSServerSideEncryptionMode,
203207
"aws:kms")
204-
_, err = makeS3Storage(ctx, invalidKMSURI, user)
208+
_, err = makeS3Storage(ctx, invalidKMSURI, user, nil)
205209
require.True(t, testutils.IsError(err, "AWS_SERVER_KMS_ID param must be set when using aws:kms server side encryption mode."))
206210
})
207211
})
208212
}
209213
}
210214

215+
func TestS3FaultInjection(t *testing.T) {
216+
defer leaktest.AfterTest(t)()
217+
218+
ctx := context.Background()
219+
skipIfNoDefaultConfig(t, ctx)
220+
221+
baseBucket := os.Getenv("AWS_S3_BUCKET")
222+
if baseBucket == "" {
223+
skip.IgnoreLint(t, "AWS_S3_BUCKET env var must be set")
224+
}
225+
226+
// Enable cloud transport logging.
227+
defer log.Scope(t).Close(t)
228+
prevVModule := log.GetVModule()
229+
defer func() { _ = log.SetVModule(prevVModule) }()
230+
require.NoError(t, log.SetVModule("cloud_logging_transport=1"))
231+
232+
uri := fmt.Sprintf(
233+
"s3://%s/%d-fault-injection-test?AUTH=implicit",
234+
baseBucket, cloudtestutils.NewTestID(),
235+
)
236+
237+
// Inject faults for 15-45 seconds after the storage is opened.
238+
middleware := cloudtestutils.BrownoutMiddleware(time.Second*15, time.Second*45)
239+
storage, err := makeS3Storage(ctx, uri, username.RootUserName(), middleware)
240+
require.NoError(t, err)
241+
defer storage.Close()
242+
243+
cloudtestutils.RunCloudNemesisTest(t, storage)
244+
}
245+
211246
func TestPutS3AssumeRole(t *testing.T) {
212247
defer leaktest.AfterTest(t)()
213248

pkg/cloud/azure/azure_storage.go

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -223,9 +223,10 @@ func makeAzureStorage(
223223
options := args.ExternalStorageOptions()
224224
t, err := cloud.MakeHTTPClient(args.Settings, args.MetricsRecorder,
225225
cloud.HTTPClientConfig{
226-
Bucket: dest.AzureConfig.Container,
227-
Client: options.ClientName,
228-
Cloud: "azure",
226+
Bucket: dest.AzureConfig.Container,
227+
Client: options.ClientName,
228+
Cloud: "azure",
229+
HttpMiddleware: args.HttpMiddleware,
229230
})
230231
if err != nil {
231232
return nil, errors.Wrap(err, "azure: unable to create transport")

pkg/cloud/cloud_io.go

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,8 @@ type HTTPClientConfig struct {
4141
// accepts any certificate presented by the server and any host name in that
4242
// certificate. In this mode, TLS is susceptible to machine-in-the-middle attacks.
4343
InsecureSkipVerify bool
44+
45+
HttpMiddleware HttpMiddleware
4446
}
4547

4648
// Timeout is a cluster setting used for cloud storage interactions.
@@ -115,7 +117,7 @@ func MakeHTTPClientForTransport(t http.RoundTripper) (*http.Client, error) {
115117
// Prefer MakeHTTPClient where possible.
116118
func MakeTransport(
117119
settings *cluster.Settings, metrics *Metrics, config HTTPClientConfig,
118-
) (*http.Transport, error) {
120+
) (http.RoundTripper, error) {
119121
var tlsConf *tls.Config
120122
if config.InsecureSkipVerify {
121123
tlsConf = &tls.Config{InsecureSkipVerify: true}
@@ -129,7 +131,6 @@ func MakeTransport(
129131
}
130132
tlsConf = &tls.Config{RootCAs: roots}
131133
}
132-
133134
t := http.DefaultTransport.(*http.Transport).Clone()
134135

135136
// Add our custom CA.
@@ -140,7 +141,12 @@ func MakeTransport(
140141
if metrics != nil {
141142
t.DialContext = metrics.NetMetrics.Wrap(t.DialContext, config.Cloud, config.Bucket, config.Client)
142143
}
143-
return t, nil
144+
145+
var roundTripper http.RoundTripper = t
146+
if config.HttpMiddleware != nil {
147+
roundTripper = config.HttpMiddleware(roundTripper)
148+
}
149+
return roundTripper, nil
144150
}
145151

146152
// MaxDelayedRetryAttempts is the number of times the delayedRetry method will

pkg/cloud/cloudtestutils/BUILD.bazel

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,11 @@ load("@io_bazel_rules_go//go:def.bzl", "go_library")
22

33
go_library(
44
name = "cloudtestutils",
5-
srcs = ["cloud_test_helpers.go"],
5+
srcs = [
6+
"cloud_nemesis.go",
7+
"cloud_test_helpers.go",
8+
"http_middleware.go",
9+
],
610
importpath = "github.com/cockroachdb/cockroach/pkg/cloud/cloudtestutils",
711
visibility = ["//visibility:public"],
812
deps = [
@@ -16,8 +20,11 @@ go_library(
1620
"//pkg/testutils",
1721
"//pkg/util/ioctx",
1822
"//pkg/util/randutil",
23+
"//pkg/util/syncutil",
1924
"//pkg/util/sysutil",
25+
"//pkg/util/timeutil",
2026
"@com_github_cockroachdb_errors//:errors",
2127
"@com_github_stretchr_testify//require",
28+
"@org_golang_x_sync//errgroup",
2229
],
2330
)

0 commit comments

Comments
 (0)