Skip to content

Commit 761afd7

Browse files
committed
cli(tsdump): introduce partial retries for tsdump Datadog upload
Previously, the tsdump Datadog upload was all or nothing: the user had to retry the entire upload even if only a single request failed. This was inefficient because a tsdump upload can take a few hours, and most of the retried requests would be redundant because they had already been uploaded successfully. This patch adds partial retries for the tsdump upload. If any requests fail during the upload, they are captured in a file named `tsdump_failed_requests_{uploadID}.json`. The user can retry those requests by providing the path to that file along with the `--retry-failed-requests` flag. Any requests that fail during the retry are also captured in the same file. This patch also adds a cap of 100 on the number of upload workers used during the Datadog upload. Epic: CRDB-52093 Part of: CRDB-44836 Release note: None
1 parent 08d1e04 commit 761afd7

File tree

6 files changed

+474
-47
lines changed

6 files changed

+474
-47
lines changed

pkg/cli/debug.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1618,6 +1618,7 @@ func init() {
16181618
f.StringVar(&debugTimeSeriesDumpOpts.storeToNodeMapYAMLFile, "store-to-node-map-file", "", "yaml file path which contains the mapping of store ID to node ID for datadog upload.")
16191619
f.BoolVar(&debugTimeSeriesDumpOpts.dryRun, "dry-run", false, "run in dry-run mode without making any actual uploads")
16201620
f.IntVar(&debugTimeSeriesDumpOpts.noOfUploadWorkers, "upload-workers", 50, "number of workers to upload the time series data in parallel")
1621+
f.BoolVar(&debugTimeSeriesDumpOpts.retryFailedRequests, "retry-failed-requests", false, "retry previously failed requests from file")
16211622

16221623
f = debugSendKVBatchCmd.Flags()
16231624
f.StringVar(&debugSendKVBatchContext.traceFormat, "trace", debugSendKVBatchContext.traceFormat,
Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
2+
upload-datadog
3+
cr.node.admission.admitted.elastic-cpu,2025-05-26T08:32:00Z,1,1
4+
cr.node.sql.query.count,2021-01-01T00:00:00Z,1,100.5
5+
cr.node.sql.query.count,2021-01-01T00:00:10Z,1,102.3
6+
cr.store.rocksdb.block.cache.usage,2021-01-01T00:00:00Z,2,75.2
7+
----
8+
----
9+
{"series":[{"interval":10,"metric":"crdb.tsdump.admission.admitted.elastic-cpu","points":[{"timestamp":1748248320,"value":1}],"tags":["node_id:1","cluster_type:SELF_HOSTED","cluster_label:test-cluster","cluster_id:test-cluster-id","zendesk_ticket:zd-test","org_name:test-org","user_name:test-user","upload_id:","upload_timestamp:2024-11-14 00:00:00","upload_year:2024","upload_month:11","upload_day:14"],"type":0}]}
10+
11+
{"series":[{"interval":10,"metric":"crdb.tsdump.rocksdb.block.cache.usage","points":[{"timestamp":1609459200,"value":75.2}],"tags":["store:2","cluster_type:SELF_HOSTED","cluster_label:test-cluster","cluster_id:test-cluster-id","zendesk_ticket:zd-test","org_name:test-org","user_name:test-user","upload_id:","upload_timestamp:2024-11-14 00:00:00","upload_year:2024","upload_month:11","upload_day:14"],"type":0}]}
12+
13+
[{"ddsource":"tsdump_upload","ddtags":"cluster_type:SELF_HOSTED,cluster_label:test-cluster,cluster_id:test-cluster-id,zendesk_ticket:zd-test,org_name:test-org,user_name:test-user,upload_id:,upload_timestamp:2024-11-14 00:00:00,upload_year:2024,upload_month:11,upload_day:14,series_uploaded:4","dry_run":"false","duration":"0","estimated_cost":"0.000186986301369863","hostname":"hostname","message":"tsdump upload completed: uploaded 4 series overall","series_uploaded":"4","service":"tsdump_upload","success":"false"}]
14+
15+
{"metric_series":[{"interval":10,"metric":"crdb.tsdump.sql.query.count","points":[{"timestamp":1609459200,"value":100.5}],"tags":["node_id:1","cluster_type:SELF_HOSTED","cluster_label:test-cluster","cluster_id:test-cluster-id","zendesk_ticket:zd-test","org_name:test-org","user_name:test-user","upload_id:","upload_timestamp:2024-11-14 00:00:00","upload_year:2024","upload_month:11","upload_day:14"],"type":0}],"upload_id":"","timestamp":"2024-11-14T00:00:00Z","error":"409 Conflict"}
16+
{"metric_series":[{"interval":10,"metric":"crdb.tsdump.sql.query.count","points":[{"timestamp":1609459210,"value":102.3}],"tags":["node_id:1","cluster_type:SELF_HOSTED","cluster_label:test-cluster","cluster_id:test-cluster-id","zendesk_ticket:zd-test","org_name:test-org","user_name:test-user","upload_id:","upload_timestamp:2024-11-14 00:00:00","upload_year:2024","upload_month:11","upload_day:14"],"type":0}],"upload_id":"","timestamp":"2024-11-14T00:00:00Z","error":"409 Conflict"}
17+
----
18+
----
19+
20+
partial-upload
21+
{"metric_series":[{"interval":25,"metric":"crdb.tsdump.sql.query.count","points":[{"timestamp":1609459210,"value":102.3}],"tags":["node_id:1","cluster_type:SELF_HOSTED","cluster_label:test-cluster","cluster_id:test-cluster-id","zendesk_ticket:zd-test","org_name:test-org","user_name:test-user","upload_id:test-cluster-20241114000000","upload_timestamp:2024-11-14 00:00:00","upload_year:2024","upload_month:11","upload_day:14"],"type":0}],"upload_id":"test-cluster-20241114000000","timestamp":"2024-11-14T00:00:00Z","error":"409 Conflict"}
22+
{"metric_series":[{"interval":25,"metric":"crdb.tsdump.sql.query.count","points":[{"timestamp":1609459200,"value":100.5}],"tags":["node_id:1","cluster_type:SELF_HOSTED","cluster_label:test-cluster","cluster_id:test-cluster-id","zendesk_ticket:zd-test","org_name:test-org","user_name:test-user","upload_id:test-cluster-20241114000000","upload_timestamp:2024-11-14 00:00:00","upload_year:2024","upload_month:11","upload_day:14"],"type":0}],"upload_id":"test-cluster-20241114000000","timestamp":"2024-11-14T00:00:00Z","error":"409 Conflict"}
23+
----
24+
----
25+
{"series":[{"interval":25,"metric":"crdb.tsdump.sql.query.count","points":[{"timestamp":1609459210,"value":102.3}],"tags":["node_id:1","cluster_type:SELF_HOSTED","cluster_label:test-cluster","cluster_id:test-cluster-id","zendesk_ticket:zd-test","org_name:test-org","user_name:test-user","upload_id:test-cluster-20241114000000","upload_timestamp:2024-11-14 00:00:00","upload_year:2024","upload_month:11","upload_day:14"],"type":0}]}
26+
27+
{"series":[{"interval":25,"metric":"crdb.tsdump.sql.query.count","points":[{"timestamp":1609459200,"value":100.5}],"tags":["node_id:1","cluster_type:SELF_HOSTED","cluster_label:test-cluster","cluster_id:test-cluster-id","zendesk_ticket:zd-test","org_name:test-org","user_name:test-user","upload_id:test-cluster-20241114000000","upload_timestamp:2024-11-14 00:00:00","upload_year:2024","upload_month:11","upload_day:14"],"type":0}]}
28+
----
29+
----

pkg/cli/tsdump.go

Lines changed: 27 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -53,17 +53,27 @@ var debugTimeSeriesDumpOpts = struct {
5353
storeToNodeMapYAMLFile string
5454
dryRun bool
5555
noOfUploadWorkers int
56+
retryFailedRequests bool
5657
}{
57-
format: tsDumpText,
58-
from: timestampValue{},
59-
to: timestampValue(timeutil.Now().Add(24 * time.Hour)),
60-
clusterLabel: "",
61-
yaml: "/tmp/tsdump.yaml",
58+
format: tsDumpText,
59+
from: timestampValue{},
60+
to: timestampValue(timeutil.Now().Add(24 * time.Hour)),
61+
clusterLabel: "",
62+
yaml: "/tmp/tsdump.yaml",
63+
retryFailedRequests: false,
6264
}
6365

6466
// hostNameOverride is used to override the hostname for testing purpose.
6567
var hostNameOverride string
6668

69+
// datadogSeriesThreshold holds the threshold for the number of series
70+
// that will be uploaded to Datadog in a single request. We have capped it to 100
71+
// to avoid hitting the Datadog API limits.
72+
var datadogSeriesThreshold = 100
73+
74+
const uploadWorkerErrorMessage = "--upload-workers is set to an invalid value." +
75+
" please select a value which between 1 and 100."
76+
6777
var debugTimeSeriesDumpCmd = &cobra.Command{
6878
Use: "tsdump",
6979
Short: "dump all the raw timeseries values in a cluster",
@@ -116,17 +126,28 @@ will then convert it to the --format requested in the current invocation.
116126
return errors.New("no input file provided")
117127
}
118128

129+
if debugTimeSeriesDumpOpts.noOfUploadWorkers <= 0 || debugTimeSeriesDumpOpts.noOfUploadWorkers > 100 {
130+
return errors.New(uploadWorkerErrorMessage)
131+
}
132+
119133
datadogWriter, err := makeDatadogWriter(
120134
debugTimeSeriesDumpOpts.ddSite,
121135
cmd == tsDumpDatadogInit,
122136
debugTimeSeriesDumpOpts.ddApiKey,
123-
100,
137+
datadogSeriesThreshold,
124138
hostNameOverride,
125139
debugTimeSeriesDumpOpts.noOfUploadWorkers,
140+
debugTimeSeriesDumpOpts.retryFailedRequests,
126141
)
127142
if err != nil {
128143
return err
129144
}
145+
146+
// Handle retry of failed requests if flag is set
147+
if datadogWriter.isPartialUploadOfFailedRequests {
148+
return datadogWriter.retryFailedRequests(args[0])
149+
}
150+
130151
return datadogWriter.upload(args[0])
131152
case tsDumpOpenMetrics:
132153
if debugTimeSeriesDumpOpts.targetURL != "" {

pkg/cli/tsdump_test.go

Lines changed: 35 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,39 @@ func TestDebugTimeSeriesDumpCmd(t *testing.T) {
7474
require.NoError(t, err)
7575
require.NotEmpty(t, yamlContents)
7676
})
77+
78+
t.Run("debug tsdump datadog upload with invalid upload workers", func(t *testing.T) {
79+
// Create a temporary gob file for testing
80+
tmpFile, err := os.CreateTemp("", "test_tsdump_*.gob")
81+
require.NoError(t, err)
82+
defer func(name string) {
83+
err := os.Remove(name)
84+
if err != nil {
85+
t.Fatalf("failed to remove temporary file %s: %v", name, err)
86+
}
87+
}(tmpFile.Name())
88+
89+
// Test with upload workers > 100
90+
out, _ := c.RunWithCapture(fmt.Sprintf(
91+
"debug tsdump --format=datadog --dd-api-key=test-key --cluster-label=test --upload-workers=101 %s",
92+
tmpFile.Name(),
93+
))
94+
require.Contains(t, out, "--upload-workers is set to an invalid value. please select a value which between 1 and 100.")
95+
96+
// Test with upload workers = 0
97+
out, _ = c.RunWithCapture(fmt.Sprintf(
98+
"debug tsdump --format=datadog --dd-api-key=test-key --cluster-label=test --upload-workers=0 %s",
99+
tmpFile.Name(),
100+
))
101+
require.Contains(t, out, "--upload-workers is set to an invalid value. please select a value which between 1 and 100.")
102+
103+
// Test with negative upload workers
104+
out, _ = c.RunWithCapture(fmt.Sprintf(
105+
"debug tsdump --format=datadog --dd-api-key=test-key --cluster-label=test --upload-workers=-1 %s",
106+
tmpFile.Name(),
107+
))
108+
require.Contains(t, out, "--upload-workers is set to an invalid value. please select a value which between 1 and 100.")
109+
})
77110
}
78111

79112
func TestMakeOpenMetricsWriter(t *testing.T) {
@@ -217,12 +250,13 @@ func TestTsDumpFormatsDataDriven(t *testing.T) {
217250
debugTimeSeriesDumpOpts.organizationName = "test-org"
218251
debugTimeSeriesDumpOpts.userName = "test-user"
219252
debugTimeSeriesDumpOpts.noOfUploadWorkers = 50
253+
debugTimeSeriesDumpOpts.retryFailedRequests = false
220254
var series int
221255
d.ScanArgs(t, "series-threshold", &series)
222256
var ddwriter, err = makeDatadogWriter(
223257
defaultDDSite, d.Cmd == "format-datadog-init", "api-key", series,
224258
server.Listener.Addr().String(), debugTimeSeriesDumpOpts.noOfUploadWorkers,
225-
)
259+
debugTimeSeriesDumpOpts.retryFailedRequests)
226260
require.NoError(t, err)
227261

228262
parseDDInput(t, d.Input, ddwriter)

0 commit comments

Comments
 (0)