Skip to content

Commit b8c2405

Browse files
craig[bot] and aa-joshi
committed
Merge #150398
150398: cli(debug.zip): improve tsdump upload speed r=aa-joshi a=aa-joshi Previously, tsdump upload to Datadog was taking more time compared to staging tsdump in roachprod. This was inadequate because it would increase the MTTD (Mean Time To Detect) of an issue. This change introduces an `upload-workers` flag to set the number of Datadog upload workers. The default value is 50. This change also includes changes to the retry configuration to further improve performance. Epic: None Part of: CRDB-52094 Release note: None ----- tsdump size: 11.19GB roachprod upload time: <img width="980" height="77" alt="Screenshot 2025-07-17 at 12 04 01 PM" src="https://github.com/user-attachments/assets/718d5f2b-142e-438c-9008-6739295e6930" /> Tsdump upload time (before changes): <img width="904" height="17" alt="Screenshot 2025-07-17 at 11 52 37 AM" src="https://github.com/user-attachments/assets/9b844a58-7b5b-4e67-bad0-142f6c37673b" /> Tsdump upload time with default (after changes): <img width="903" height="20" alt="Screenshot 2025-07-17 at 11 51 34 AM" src="https://github.com/user-attachments/assets/e6556e2a-d9bd-411b-acd1-591cc9971784" /> Co-authored-by: Akshay Joshi <[email protected]>
2 parents 4a0f95a + 0f4b146 commit b8c2405

File tree

4 files changed

+53
-27
lines changed

4 files changed

+53
-27
lines changed

pkg/cli/debug.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1617,6 +1617,7 @@ func init() {
16171617
f.StringVar(&debugTimeSeriesDumpOpts.userName, "user-name", "", "name of the user to perform datadog upload")
16181618
f.StringVar(&debugTimeSeriesDumpOpts.storeToNodeMapYAMLFile, "store-to-node-map-file", "", "yaml file path which contains the mapping of store ID to node ID for datadog upload.")
16191619
f.BoolVar(&debugTimeSeriesDumpOpts.dryRun, "dry-run", false, "run in dry-run mode without making any actual uploads")
1620+
f.IntVar(&debugTimeSeriesDumpOpts.noOfUploadWorkers, "upload-workers", 50, "number of workers to upload the time series data in parallel")
16201621

16211622
f = debugSendKVBatchCmd.Flags()
16221623
f.StringVar(&debugSendKVBatchContext.traceFormat, "trace", debugSendKVBatchContext.traceFormat,

pkg/cli/tsdump.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,7 @@ var debugTimeSeriesDumpOpts = struct {
5252
userName string
5353
storeToNodeMapYAMLFile string
5454
dryRun bool
55+
noOfUploadWorkers int
5556
}{
5657
format: tsDumpText,
5758
from: timestampValue{},
@@ -121,6 +122,7 @@ will then convert it to the --format requested in the current invocation.
121122
debugTimeSeriesDumpOpts.ddApiKey,
122123
100,
123124
hostNameOverride,
125+
debugTimeSeriesDumpOpts.noOfUploadWorkers,
124126
)
125127
if err != nil {
126128
return err

pkg/cli/tsdump_test.go

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -216,11 +216,12 @@ func TestTsDumpFormatsDataDriven(t *testing.T) {
216216
debugTimeSeriesDumpOpts.zendeskTicket = "zd-test"
217217
debugTimeSeriesDumpOpts.organizationName = "test-org"
218218
debugTimeSeriesDumpOpts.userName = "test-user"
219+
debugTimeSeriesDumpOpts.noOfUploadWorkers = 50
219220
var series int
220221
d.ScanArgs(t, "series-threshold", &series)
221222
var ddwriter, err = makeDatadogWriter(
222223
defaultDDSite, d.Cmd == "format-datadog-init", "api-key", series,
223-
server.Listener.Addr().String(),
224+
server.Listener.Addr().String(), debugTimeSeriesDumpOpts.noOfUploadWorkers,
224225
)
225226
require.NoError(t, err)
226227

pkg/cli/tsdump_upload.go

Lines changed: 48 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -22,8 +22,10 @@ import (
2222

2323
"github.com/DataDog/datadog-api-client-go/v2/api/datadog"
2424
"github.com/DataDog/datadog-api-client-go/v2/api/datadogV2"
25+
"github.com/cockroachdb/cockroach/pkg/base"
2526
"github.com/cockroachdb/cockroach/pkg/roachpb"
2627
"github.com/cockroachdb/cockroach/pkg/ts"
28+
"github.com/cockroachdb/cockroach/pkg/util/retry"
2729
"github.com/cockroachdb/cockroach/pkg/util/syncutil"
2830
"github.com/cockroachdb/cockroach/pkg/util/timeutil"
2931
"github.com/cockroachdb/errors"
@@ -89,15 +91,21 @@ type datadogWriter struct {
8991
datadogContext context.Context
9092
// namePrefix sets the string to prepend to all metric names. The
9193
// names are kept with `.` delimiters.
92-
namePrefix string
93-
threshold int
94-
uploadTime time.Time
95-
storeToNodeMap map[string]string
96-
metricTypeMap map[string]string
94+
namePrefix string
95+
threshold int
96+
uploadTime time.Time
97+
storeToNodeMap map[string]string
98+
metricTypeMap map[string]string
99+
noOfUploadWorkers int
97100
}
98101

99102
func makeDatadogWriter(
100-
ddSite string, init bool, apiKey string, threshold int, hostNameOverride string,
103+
ddSite string,
104+
init bool,
105+
apiKey string,
106+
threshold int,
107+
hostNameOverride string,
108+
noOfUploadWorkers int,
101109
) (*datadogWriter, error) {
102110
currentTime := getCurrentTime()
103111

@@ -128,6 +136,9 @@ func makeDatadogWriter(
128136
ctx = context.WithValue(ctx, datadog.ContextServerVariables, map[string]string{
129137
"site": host,
130138
})
139+
140+
// The Datadog retry configuration is used when we receive error codes
141+
// 429 and >= 500 from the Datadog.
131142
configuration := datadog.NewConfiguration()
132143
configuration.RetryConfiguration.EnableRetry = true
133144
configuration.RetryConfiguration.BackOffMultiplier = 1
@@ -140,16 +151,17 @@ func makeDatadogWriter(
140151
}
141152

142153
return &datadogWriter{
143-
datadogContext: ctx,
144-
apiClient: apiClient,
145-
apiKey: apiKey,
146-
uploadID: newTsdumpUploadID(currentTime),
147-
init: init,
148-
namePrefix: "crdb.tsdump.", // Default pre-set prefix to distinguish these uploads.
149-
threshold: threshold,
150-
uploadTime: currentTime,
151-
storeToNodeMap: make(map[string]string),
152-
metricTypeMap: metricTypeMap,
154+
datadogContext: ctx,
155+
apiClient: apiClient,
156+
apiKey: apiKey,
157+
uploadID: newTsdumpUploadID(currentTime),
158+
init: init,
159+
namePrefix: "crdb.tsdump.", // Default pre-set prefix to distinguish these uploads.
160+
threshold: threshold,
161+
uploadTime: currentTime,
162+
storeToNodeMap: make(map[string]string),
163+
metricTypeMap: metricTypeMap,
164+
noOfUploadWorkers: noOfUploadWorkers,
153165
}, nil
154166
}
155167

@@ -346,11 +358,25 @@ func (d *datadogWriter) flush(data []datadogV2.MetricSeries) error {
346358
}
347359

348360
api := datadogV2.NewMetricsApi(d.apiClient)
349-
_, _, err := api.SubmitMetrics(d.datadogContext, datadogV2.MetricPayload{
350-
Series: data,
351-
}, datadogV2.SubmitMetricsOptionalParameters{
352-
ContentEncoding: datadogV2.METRICCONTENTENCODING_GZIP.Ptr(),
353-
})
361+
// The retry configuration is used when we receive any error code from upload.
362+
// We have seen 408 error codes from Datadog when the upload is too large which
363+
// is not handled by the default retry configuration that Datadog API client provides.
364+
retryOpts := base.DefaultRetryOptions()
365+
retryOpts.MaxBackoff = 20 * time.Millisecond
366+
retryOpts.MaxRetries = 100
367+
err := error(nil)
368+
369+
for retryAttempts := retry.Start(retryOpts); retryAttempts.Next(); {
370+
_, _, err = api.SubmitMetrics(d.datadogContext, datadogV2.MetricPayload{
371+
Series: data,
372+
}, datadogV2.SubmitMetricsOptionalParameters{
373+
ContentEncoding: datadogV2.METRICCONTENTENCODING_GZIP.Ptr(),
374+
})
375+
376+
if err == nil {
377+
return nil
378+
}
379+
}
354380
if err != nil {
355381
fmt.Printf("error submitting metrics to datadog: %v\n", err)
356382
}
@@ -417,11 +443,7 @@ func (d *datadogWriter) upload(fileName string) error {
417443
metricsUploadState.isSingleUploadSucceeded = true
418444
})
419445

420-
// Note(davidh): This was previously set at 1000 and we'd get regular
421-
// 400s from Datadog with the cryptic `Unable to decompress payload`
422-
// error. We reduced this to 20 and was able to upload a 3.2GB tsdump
423-
// in 6m20s without any errors.
424-
for i := 0; i < 20; i++ {
446+
for i := 0; i < d.noOfUploadWorkers; i++ {
425447
go func() {
426448
for data := range ch {
427449
emittedMetrics, err := d.emitDataDogMetrics(data)

0 commit comments

Comments
 (0)