Skip to content

Commit d8d00d8

Browse files
craig[bot]jeffswenson
andcommitted
152746: azure: increase default number of retries r=jeffswenson a=jeffswenson This adds the cloudstorage.azure.max_retries setting to mirror the setting present in the S3 cloud client. The default retry count is changed from 3 to 10. This allows Azure to pass the 30 second brown out test added by #152692. Release note: Increase the default number of times CRDB will retry failures on Azure. Epic: CRDB-53946 152748: cloud: add a list test to cloud nemesis r=jeffswenson a=jeffswenson This adds a list test to the cloud nemesis test. Testing list's fault tolerance is important because it is used by backup and restore. Release note: none Epic: CRDB-53946 Co-authored-by: Jeff Swenson <[email protected]>
3 parents 0d92408 + 620460c + a6c8281 commit d8d00d8

File tree

2 files changed

+75
-1
lines changed

2 files changed

+75
-1
lines changed

pkg/cloud/azure/azure_storage.go

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,12 @@ var maxConcurrentUploadBuffers = settings.RegisterIntSetting(
4242
1,
4343
settings.WithPublic)
4444

45+
var maxRetries = settings.RegisterIntSetting(
46+
settings.ApplicationLevel,
47+
"cloudstorage.azure.max_retries",
48+
"the maximum number of retries per Azure operation",
49+
10)
50+
4551
// A note on Azure authentication:
4652
//
4753
// The standardized way to authenticate a third-party identity to the Azure
@@ -233,6 +239,9 @@ func makeAzureStorage(
233239
}
234240
var opts service.ClientOptions
235241
opts.Transport = t
242+
// Azure SDK defaults to 3 retries, which is too low to survive the 30 second
243+
// brownout in TestAzureFaultInjection.
244+
opts.Retry.MaxRetries = int32(maxRetries.Get(&args.Settings.SV))
236245

237246
var azClient *service.Client
238247
switch conf.Auth {

pkg/cloud/cloudtestutils/cloud_nemesis.go

Lines changed: 66 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ import (
1111
"fmt"
1212
"io"
1313
"math/rand"
14+
"strings"
1415
"sync/atomic"
1516
"testing"
1617
"time"
@@ -30,12 +31,12 @@ import (
3031
// or corrupting data.
3132
//
3233
// TODO(jeffswenson): use this to test GCS and Azure
33-
// TODO(jeffswenson): add a list operation
3434
func RunCloudNemesisTest(t *testing.T, storage cloud.ExternalStorage) {
3535
nemesis := &cloudNemesis{
3636
storage: storage,
3737
writeConcurrency: 1,
3838
readConcurrency: 40,
39+
listConcurrency: 1,
3940
}
4041

4142
// We create a context here because we don't want to support a caller supplied
@@ -47,15 +48,18 @@ func RunCloudNemesisTest(t *testing.T, storage cloud.ExternalStorage) {
4748

4849
require.Greater(t, nemesis.writeSuccesses.Load(), int64(5), "not enough completed writes")
4950
require.Greater(t, nemesis.readSuccesses.Load(), int64(200), "not enough completed reads")
51+
require.Greater(t, nemesis.listSuccesses.Load(), int64(5), "not enough completed lists")
5052
}
5153

5254
type cloudNemesis struct {
5355
storage cloud.ExternalStorage
5456
writeConcurrency int
5557
readConcurrency int
58+
listConcurrency int
5659

5760
readSuccesses atomic.Int64
5861
writeSuccesses atomic.Int64
62+
listSuccesses atomic.Int64
5963

6064
mu struct {
6165
syncutil.Mutex
@@ -118,6 +122,22 @@ func (c *cloudNemesis) run(ctx context.Context, duration time.Duration) error {
118122
})
119123
}
120124

125+
for i := 0; i < c.listConcurrency; i++ {
126+
g.Go(func() error {
127+
for {
128+
time.Sleep(time.Millisecond)
129+
select {
130+
case <-done:
131+
return nil
132+
default:
133+
if err := c.listObjects(ctx); err != nil {
134+
return err
135+
}
136+
}
137+
}
138+
})
139+
}
140+
121141
// We shouldn't observe any errors from the client. We are injecting errors
122142
// that should be transparently retried.
123143
return g.Wait()
@@ -213,6 +233,42 @@ func (c *cloudNemesis) readObject(ctx context.Context) (err error) {
213233
return nil
214234
}
215235

236+
func (c *cloudNemesis) listObjects(ctx context.Context) (err error) {
237+
before := c.snapshotObjects()
238+
239+
listedFiles := map[string]bool{}
240+
err = c.storage.List(ctx, "", "", func(filename string) error {
241+
listedFiles[strings.TrimPrefix(filename, "/")] = true
242+
return nil
243+
})
244+
if err != nil {
245+
return errors.Wrap(err, "unable to list files")
246+
}
247+
248+
// Check if there are any missing files in the listing.
249+
for _, o := range before {
250+
if o.finished {
251+
if !listedFiles[o.name] {
252+
return errors.AssertionFailedf("expected to find object %s in listing", o.name)
253+
}
254+
}
255+
}
256+
257+
// Check if there are any unexpected files in the listing.
258+
afterFiles := map[string]bool{}
259+
for _, o := range c.snapshotObjects() {
260+
afterFiles[o.name] = true
261+
}
262+
for filename := range listedFiles {
263+
if !afterFiles[filename] {
264+
return errors.AssertionFailedf("found unexpected object %s in listing", filename)
265+
}
266+
}
267+
268+
c.listSuccesses.Add(1)
269+
return nil
270+
}
271+
216272
func (c *cloudNemesis) newObject() cloudObject {
217273
c.mu.Lock()
218274
defer c.mu.Unlock()
@@ -249,6 +305,15 @@ func (c *cloudNemesis) randomObject() cloudObject {
249305
return c.mu.objects[rand.Intn(len(c.mu.objects))]
250306
}
251307

308+
func (c *cloudNemesis) snapshotObjects() []cloudObject {
309+
c.mu.Lock()
310+
defer c.mu.Unlock()
311+
312+
snapshot := make([]cloudObject, len(c.mu.objects))
313+
copy(snapshot, c.mu.objects)
314+
return snapshot
315+
}
316+
252317
// generatedObject is a deterministic implementation of io.Reader.
253318
type generatedObject struct {
254319
seed int64

0 commit comments

Comments
 (0)