Skip to content

Commit 66d7fd2

Browse files
craig[bot] and arjunmahishi
committed
Merge #153524
153524: cli(tsdump): remove tsdump file dependency from datadoginit r=arjunmahishi a=arjunmahishi The `datadoginit` format previously required a tsdump file as input to extract metric names, then override their values to zero and timestamps to current time before uploading to Datadog. This created an unnecessary dependency and could miss metrics not present in the specific tsdump file. This change makes `datadoginit` format work independently by leveraging the existing `generateMetricList()` mechanism that starts a test server and scrapes all metric registries. This ensures complete metric coverage and eliminates the tsdump file dependency. The implementation uses a simple sequential approach without concurrency since there are only ~3153 metrics total, completing uploads in under 10 seconds. Additionally, adds a hidden `--dd-metric-interval` flag to make the metric interval configurable for `datadoginit` uploads (defaults to 10 seconds). The flag is hidden to avoid confusing users as it only applies to `datadoginit`. Release note: None Part of: CRDB-52597 Epic: None --- _Use of AI: Initially vibe coded the full end to end flow. The result was unnecessarily complex. Manually modified most of it_ Co-authored-by: Arjun Mahishi <[email protected]>
2 parents d8cab49 + 59eef08 commit 66d7fd2

File tree

4 files changed

+167
-2
lines changed

4 files changed

+167
-2
lines changed

pkg/cli/debug.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1620,6 +1620,8 @@ func init() {
16201620
f.IntVar(&debugTimeSeriesDumpOpts.noOfUploadWorkers, "upload-workers", 75, "number of workers to upload the time series data in parallel")
16211621
f.BoolVar(&debugTimeSeriesDumpOpts.retryFailedRequests, "retry-failed-requests", false, "retry previously failed requests from file")
16221622
f.BoolVar(&debugTimeSeriesDumpOpts.disableDeltaProcessing, "disable-delta-processing", false, "disable delta calculation for counter metrics (enabled by default)")
1623+
f.Int64Var(&debugTimeSeriesDumpOpts.ddMetricInterval, "dd-metric-interval", debugTimeSeriesDumpOpts.ddMetricInterval, "interval in seconds for datadoginit format only (default 10). Regular datadog format uses actual intervals from tsdump.")
1624+
f.Lookup("dd-metric-interval").Hidden = true // this is for internal use only
16231625

16241626
f = debugSendKVBatchCmd.Flags()
16251627
f.StringVar(&debugSendKVBatchContext.traceFormat, "trace", debugSendKVBatchContext.traceFormat,

pkg/cli/tsdump.go

Lines changed: 22 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,7 @@ var debugTimeSeriesDumpOpts = struct {
5757
noOfUploadWorkers int
5858
retryFailedRequests bool
5959
disableDeltaProcessing bool
60+
ddMetricInterval int64 // interval for datadoginit format only
6061
}{
6162
format: tsDumpText,
6263
from: timestampValue{},
@@ -65,6 +66,10 @@ var debugTimeSeriesDumpOpts = struct {
6566
yaml: "/tmp/tsdump.yaml",
6667
retryFailedRequests: false,
6768
disableDeltaProcessing: false, // delta processing enabled by default
69+
70+
// default to 10 seconds interval for datadoginit.
71+
// This is based on the scrape interval that is currently set across all managed clusters
72+
ddMetricInterval: 10,
6873
}
6974

7075
// hostNameOverride is used to override the hostname for testing purpose.
@@ -125,7 +130,22 @@ will then convert it to the --format requested in the current invocation.
125130
10_000_000, /* threshold */
126131
doRequest,
127132
)
128-
case tsDumpDatadogInit, tsDumpDatadog:
133+
case tsDumpDatadogInit:
134+
datadogWriter, err := makeDatadogWriter(
135+
debugTimeSeriesDumpOpts.ddSite,
136+
true, /* init */
137+
debugTimeSeriesDumpOpts.ddApiKey,
138+
datadogSeriesThreshold,
139+
hostNameOverride,
140+
debugTimeSeriesDumpOpts.noOfUploadWorkers,
141+
false, /* retryFailedRequests not applicable for init */
142+
)
143+
if err != nil {
144+
return err
145+
}
146+
147+
return datadogWriter.uploadInitMetrics()
148+
case tsDumpDatadog:
129149
if len(args) < 1 {
130150
return errors.New("no input file provided")
131151
}
@@ -136,7 +156,7 @@ will then convert it to the --format requested in the current invocation.
136156

137157
datadogWriter, err := makeDatadogWriter(
138158
debugTimeSeriesDumpOpts.ddSite,
139-
cmd == tsDumpDatadogInit,
159+
false, /* init */
140160
debugTimeSeriesDumpOpts.ddApiKey,
141161
datadogSeriesThreshold,
142162
hostNameOverride,

pkg/cli/tsdump_upload.go

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -933,3 +933,67 @@ func loadMetricTypesMap(ctx context.Context) (map[string]string, error) {
933933

934934
return metricTypeMap, nil
935935
}
936+
937+
// uploadInitMetrics uploads all available metrics with zero values and current timestamp
938+
// This eliminates the need for a tsdump file in init mode
939+
func (d *datadogWriter) uploadInitMetrics() error {
940+
if debugTimeSeriesDumpOpts.dryRun {
941+
fmt.Println("Dry-run mode enabled. Not actually uploading data to Datadog.")
942+
}
943+
944+
// get list of all metrics
945+
metricLayers, err := generateMetricList(context.Background(), true /* skipFiltering */)
946+
if err != nil {
947+
return errors.Wrap(err, "failed to generate metric list for init upload")
948+
}
949+
950+
currentTimestamp := getCurrentTime().Unix()
951+
var successfulUploads, skippedMetrics int
952+
953+
// batch metrics based on threshold
954+
currentBatch := make([]datadogV2.MetricSeries, 0, d.threshold)
955+
for _, layer := range metricLayers {
956+
for _, category := range layer.Categories {
957+
for _, metric := range category.Metrics {
958+
series := datadogV2.MetricSeries{
959+
Metric: metric.Name,
960+
Tags: []string{},
961+
Type: d.resolveMetricType(metric.Name),
962+
Points: []datadogV2.MetricPoint{{
963+
Value: datadog.PtrFloat64(0),
964+
Timestamp: datadog.PtrInt64(currentTimestamp),
965+
}},
966+
Interval: datadog.PtrInt64(debugTimeSeriesDumpOpts.ddMetricInterval),
967+
}
968+
969+
currentBatch = append(currentBatch, series)
970+
971+
// flush batch when threshold is reached
972+
if len(currentBatch) >= d.threshold {
973+
_, err := d.emitDataDogMetrics(currentBatch)
974+
if err != nil {
975+
fmt.Printf("Warning: Failed to upload batch of %d metrics: %v\n", len(currentBatch), err)
976+
skippedMetrics += len(currentBatch)
977+
} else {
978+
successfulUploads += len(currentBatch)
979+
}
980+
currentBatch = make([]datadogV2.MetricSeries, 0, d.threshold)
981+
}
982+
}
983+
}
984+
}
985+
986+
// flush remaining metrics in the last batch
987+
if len(currentBatch) > 0 {
988+
_, err := d.emitDataDogMetrics(currentBatch)
989+
if err != nil {
990+
fmt.Printf("Warning: Failed to upload final batch of %d metrics: %v\n", len(currentBatch), err)
991+
skippedMetrics += len(currentBatch)
992+
} else {
993+
successfulUploads += len(currentBatch)
994+
}
995+
}
996+
997+
fmt.Printf("Init upload completed: successfully uploaded %d metrics, skipped %d metrics\n", successfulUploads, skippedMetrics)
998+
return nil
999+
}

pkg/cli/tsdump_upload_test.go

Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ import (
1010
"compress/gzip"
1111
"encoding/csv"
1212
"encoding/gob"
13+
"encoding/json"
1314
"fmt"
1415
"io"
1516
"math/rand"
@@ -751,3 +752,81 @@ func TestDeltaCalculationWithUnsortedTimestamps(t *testing.T) {
751752
require.Equal(t, 50.0, *series.Points[1].Value) // delta: 150 - 100
752753
require.Equal(t, 150.0, *series.Points[2].Value) // delta: 300 - 150
753754
}
755+
756+
func TestDatadogInit(t *testing.T) {
757+
defer leaktest.AfterTest(t)()
758+
defer log.Scope(t).Close(t)
759+
760+
now := time.Date(2025, 9, 23, 0, 0, 0, 0, time.UTC)
761+
defer testutils.TestingHook(&getCurrentTime, func() time.Time {
762+
return now
763+
})()
764+
765+
var (
766+
reqCount int
767+
expectedInterval = 30
768+
)
769+
770+
// Checking "all" metrics will make this test flaky as new metrics are added.
771+
// So, we can instead check for one representative metric from each namespace.
772+
expectedMetrics := map[string]struct{}{
773+
"cockroachdb.sql.txns.open": {},
774+
"cockroachdb.storage.disk-slow": {},
775+
"cockroachdb.storeliveness.callbacks.processing_duration": {},
776+
"cockroachdb.sys.cgo.allocbytes": {},
777+
"cockroachdb.timeseries.write.bytes": {},
778+
"cockroachdb.totalbytes": {},
779+
"cockroachdb.txn.durations": {},
780+
"cockroachdb.valbytes": {},
781+
}
782+
783+
receivedMetrics := make(map[string]struct{})
784+
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
785+
require.Equal(t, "gzip", r.Header.Get("Content-Encoding"))
786+
787+
reader, err := gzip.NewReader(r.Body)
788+
require.NoError(t, err)
789+
defer reader.Close()
790+
791+
body, err := io.ReadAll(reader)
792+
require.NoError(t, err)
793+
794+
var request struct {
795+
Series []datadogV2.MetricSeries `json:"series"`
796+
}
797+
798+
require.NoError(t, json.Unmarshal(body, &request))
799+
if len(request.Series) > 0 {
800+
for _, series := range request.Series {
801+
receivedMetrics[series.Metric] = struct{}{}
802+
require.NotNil(t, series.Interval, "interval should be set for datadoginit format")
803+
require.EqualValues(t, expectedInterval, *series.Interval, "interval should match dd-metric-interval flag")
804+
805+
require.Contains(t, series.Tags, "cluster_label:\"test-cluster\"", "should include cluster_label tag")
806+
require.Contains(t, series.Tags, "cluster_type:SELF_HOSTED", "should include cluster_type tag")
807+
808+
require.Len(t, series.Points, 1, "should have exactly one point for init")
809+
require.EqualValues(t, now.Unix(), *series.Points[0].Timestamp, "timestamp should match mocked current time")
810+
require.EqualValues(t, float64(0), *series.Points[0].Value, "value should be 0 for datadoginit")
811+
}
812+
}
813+
814+
w.WriteHeader(http.StatusOK)
815+
reqCount++
816+
}))
817+
defer server.Close()
818+
defer testutils.TestingHook(&hostNameOverride, server.Listener.Addr().String())()
819+
820+
cmd := `debug tsdump --format=datadoginit --dd-api-key="test-api-key" --cluster-label="test-cluster" --dd-metric-interval=30`
821+
c := NewCLITest(TestCLIParams{})
822+
defer c.Cleanup()
823+
824+
_, err := c.RunWithCapture(cmd)
825+
require.NoError(t, err)
826+
require.NotZero(t, reqCount, "should have made at least one request to the server")
827+
828+
// verify that our expected metrics are present in the received metrics
829+
for expectedMetric := range expectedMetrics {
830+
require.Contains(t, receivedMetrics, expectedMetric, "expected metric %s should be present", expectedMetric)
831+
}
832+
}

0 commit comments

Comments
 (0)