Skip to content

Commit 9eef0da

Browse files
craig[bot] and angeladietz committed
161285: roachtest: test secondary stores get bootstrapped with correct metrics r=angeladietz a=angeladietz This adds a roachtest to validate that secondary stores eventually get the same set of metrics as the primary store for a node. This will help prevent introducing new bugs related to store metrics which stem from the fact that bootstrapping "additional" stores is an asynchronous process. Release note: none Fixes cockroachdb#159681 Co-authored-by: Angela Dietz <dietz@cockroachlabs.com>
2 parents 1b48fd8 + db41da1 commit 9eef0da

File tree

10 files changed

+2274
-44
lines changed

10 files changed

+2274
-44
lines changed

docs/generated/metrics/metrics.yaml

Lines changed: 1666 additions & 26 deletions
Large diffs are not rendered by default.

pkg/cmd/roachtest/tests/BUILD.bazel

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -194,6 +194,7 @@ go_library(
194194
"sqlsmith.go",
195195
"status_server.go",
196196
"stop_and_copy.go",
197+
"store_metrics.go",
197198
"sysbench.go",
198199
"tlp.go",
199200
"tombstones.go",
@@ -364,6 +365,7 @@ go_library(
364365
"@com_github_stretchr_testify//require",
365366
"@org_golang_google_protobuf//proto",
366367
"@org_golang_x_exp//maps",
368+
"@org_golang_x_exp//slices",
367369
"@org_golang_x_oauth2//clientcredentials",
368370
"@org_golang_x_text//cases",
369371
"@org_golang_x_text//language",

pkg/cmd/roachtest/tests/registry.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -160,6 +160,7 @@ func RegisterTests(r registry.Registry) {
160160
registerSecondaryIndexesMultiVersionCluster(r)
161161
registerSequelize(r)
162162
registerSlowDrain(r)
163+
registerStoreMetrics(r)
163164
registerSysbench(r)
164165
registerTLP(r)
165166
registerTPCC(r)
Lines changed: 257 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,257 @@
1+
// Copyright 2026 The Cockroach Authors.
2+
//
3+
// Use of this software is governed by the CockroachDB Software License
4+
// included in the /LICENSE file.
5+
6+
package tests
7+
8+
import (
9+
"bufio"
10+
"context"
11+
"fmt"
12+
"reflect"
13+
"strings"
14+
"time"
15+
16+
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/cluster"
17+
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/option"
18+
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/registry"
19+
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/roachtestutil"
20+
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/spec"
21+
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/test"
22+
"github.com/cockroachdb/cockroach/pkg/roachprod/install"
23+
"github.com/cockroachdb/cockroach/pkg/util/retry"
24+
"github.com/cockroachdb/errors"
25+
"golang.org/x/exp/slices"
26+
)
27+
28+
// storeMetricsStoresPerNode is the number of stores configured on the single
// test node; the test asserts every store-labeled metric is reported by
// exactly this many stores.
const storeMetricsStoresPerNode = 4
29+
30+
func registerStoreMetrics(r registry.Registry) {
31+
r.Add(registry.TestSpec{
32+
Name: "store-metrics",
33+
Owner: registry.OwnerKV,
34+
// Single node with multiple stores to test that all stores have the same
35+
// set of metrics.
36+
Cluster: r.MakeClusterSpec(1, spec.Disks(storeMetricsStoresPerNode)),
37+
CompatibleClouds: registry.OnlyGCE,
38+
Suites: registry.Suites(registry.Weekly),
39+
Timeout: 5 * time.Minute,
40+
EncryptionSupport: registry.EncryptionMetamorphic,
41+
Leases: registry.MetamorphicLeases,
42+
Run: runStoreMetrics,
43+
})
44+
}
45+
46+
// metricKey identifies a metric family for counting purposes: the metric name
// plus whether the sample carried a node_id label. Counts are aggregated per
// key across all stores.
type metricKey struct {
	// name is the Prometheus metric name.
	name string
	// hasNode records whether a node_id label was present on the sample.
	hasNode bool
}
50+
51+
// runStoreMetrics verifies that store metrics are correctly labeled, including
// the node label for asynchronously bootstrapped stores. See issue #159046 for
// an example of such a bug.
//
// This test will catch inconsistency bugs across the entire system, not just
// store-related metrics. Failures of this test should be forwarded to the
// appropriate team based on the metric.
//
// The test:
// 1. Starts a single-node cluster with multiple stores.
// 2. Fetches /_status/vars and parses store-labeled metrics (excluding histogram buckets).
// 3. Builds a map keyed by (metric name, hasNodeLabel) -> count of stores.
// 4. Asserts (with retry) that each count equals the store count.
// 5. Stops and restarts the node.
// 6. Repeats the check and compares the two maps.
func runStoreMetrics(ctx context.Context, t test.Test, c cluster.Cluster) {
	startOpts := option.DefaultStartOpts()
	startOpts.RoachprodOpts.StoreCount = storeMetricsStoresPerNode
	startSettings := install.MakeClusterSettings()

	// getMetricsMap polls /_status/vars until every store-labeled metric is
	// reported by exactly storeMetricsStoresPerNode stores, or the 30s retry
	// budget is exhausted. Retrying is needed because secondary stores are
	// bootstrapped asynchronously (see the test doc comment).
	getMetricsMap := func(ctx context.Context) (map[metricKey]int, error) {
		var result map[metricKey]int
		// Inconsistencies from the most recent attempt, retained so they can
		// be logged if the retry loop ultimately gives up.
		var lastInconsistencies []string
		if err := retry.ForDuration(30*time.Second, func() error {
			metricsMap, err := buildMetricsData(ctx, t, c)
			if err != nil {
				// A fetch/parse failure is not a metric inconsistency; clear
				// any stale list so the final log reflects the last attempt.
				lastInconsistencies = nil
				return err
			}
			var inconsistencies []string
			for key, count := range metricsMap {
				if count != storeMetricsStoresPerNode {
					inconsistencies = append(
						inconsistencies,
						fmt.Sprintf("metric %q (has_node=%t) has %d entries; want %d",
							key.name, key.hasNode, count, storeMetricsStoresPerNode),
					)
				}
			}
			lastInconsistencies = inconsistencies
			if len(inconsistencies) > 0 {
				// Returning an error makes retry.ForDuration try again.
				return errors.New("metric inconsistencies found")
			}
			result = metricsMap
			return nil
		}); err != nil {
			// Cap the logged list at 20 entries to keep the log readable.
			if len(lastInconsistencies) > 20 {
				t.L().Printf("found %d metric inconsistencies (showing first 20)", len(lastInconsistencies))
				lastInconsistencies = lastInconsistencies[:20]
			} else {
				t.L().Printf("found %d metric inconsistencies: ", len(lastInconsistencies))
			}
			t.L().Printf("%s", strings.Join(lastInconsistencies, " "))
			return nil, err
		}
		return result, nil
	}

	t.Status("starting cluster with multiple stores")
	c.Start(ctx, t.L(), startOpts, startSettings, c.Node(1))

	t.Status("checking metrics parity between stores after initial start")
	metricsMap1, err := getMetricsMap(ctx)
	if err != nil {
		t.Fatal(err)
	}

	t.Status("stopping node")
	c.Stop(ctx, t.L(), option.DefaultStopOpts(), c.Node(1))

	t.Status("restarting node")
	c.Start(ctx, t.L(), startOpts, startSettings, c.Node(1))

	t.Status("checking metrics parity between stores after restart")
	metricsMap2, err := getMetricsMap(ctx)
	if err != nil {
		t.Fatal(err)
	}

	// The metric sets must match before and after a restart; a difference
	// means some metric is registered on only one of the two startup paths.
	// Note that getMetricsMap already forced every count to equal the store
	// count, so any mismatch here manifests as a key present in only one map.
	if !reflect.DeepEqual(metricsMap1, metricsMap2) {
		var diffs []string

		// Find metrics in map1 but not in map2.
		for key, count1 := range metricsMap1 {
			if _, ok := metricsMap2[key]; !ok {
				diffs = append(diffs, fmt.Sprintf("metric %q (has_node=%t): in map1 (count=%d) but NOT in map2",
					key.name, key.hasNode, count1))
			}
		}

		// Find metrics in map2 but not in map1.
		for key, count2 := range metricsMap2 {
			if _, ok := metricsMap1[key]; !ok {
				diffs = append(diffs, fmt.Sprintf("metric %q (has_node=%t): in map2 (count=%d) but NOT in map1",
					key.name, key.hasNode, count2))
			}
		}

		// Sort for deterministic log output.
		slices.Sort(diffs)
		t.L().Printf("found %d metric differences between before and after restart:", len(diffs))
		for _, diff := range diffs {
			t.L().Printf(" %s", diff)
		}
		t.Fatalf("metrics map mismatch after restart")
	}
}
157+
158+
func buildMetricsData(
159+
ctx context.Context, t test.Test, c cluster.Cluster,
160+
) (map[metricKey]int, error) {
161+
adminUIAddrs, err := c.ExternalAdminUIAddr(
162+
ctx, t.L(), c.Node(1), option.VirtualClusterName(install.SystemInterfaceName),
163+
)
164+
if err != nil {
165+
return nil, err
166+
}
167+
url := "https://" + adminUIAddrs[0] + "/_status/vars"
168+
client := roachtestutil.DefaultHTTPClient(
169+
c, t.L(), roachtestutil.VirtualCluster(install.SystemInterfaceName),
170+
)
171+
resp, err := client.Get(ctx, url)
172+
if err != nil {
173+
return nil, err
174+
}
175+
defer resp.Body.Close()
176+
if resp.StatusCode != 200 {
177+
return nil, errors.Newf("invalid non-200 status code %v from %s", resp.StatusCode, url)
178+
}
179+
180+
metricsMap := make(map[metricKey]int)
181+
scanner := bufio.NewScanner(resp.Body)
182+
for scanner.Scan() {
183+
name, labels, ok := parsePromLine(scanner.Text())
184+
if !ok {
185+
continue
186+
}
187+
hasStore, hasNode, hasLE, err := parseLabelInfo(labels)
188+
if err != nil {
189+
return nil, err
190+
}
191+
if hasLE || !hasStore {
192+
continue
193+
}
194+
key := metricKey{name: name, hasNode: hasNode}
195+
metricsMap[key]++
196+
}
197+
if err := scanner.Err(); err != nil {
198+
return nil, err
199+
}
200+
if len(metricsMap) == 0 {
201+
return nil, errors.New("no store metrics found in /_status/vars")
202+
}
203+
204+
return metricsMap, nil
205+
}
206+
207+
// parsePromLine extracts the metric name and the label section from a
// Prometheus exposition line. It skips empty lines and comment/metadata lines
// that start with '#'. The expected format is:
//
//	metric_name{key="value",other="value"} <number>
//
// It returns the metric name and the raw label string between the first '{'
// and the last '}'.
func parsePromLine(line string) (name, labels string, ok bool) {
	line = strings.TrimSpace(line)
	if line == "" || strings.HasPrefix(line, "#") {
		return "", "", false
	}
	// Metric names cannot contain '{', so the first '{' opens the label set.
	start := strings.Index(line, "{")
	if start == -1 {
		return "", "", false
	}
	// Use the LAST '}' rather than the first: quoted label values may legally
	// contain '}' (e.g. foo{a="x}y"} 1), and the text after the closing brace
	// is the sample value, which never contains '}'. Searching for the first
	// '}' would truncate such label sets.
	end := strings.LastIndex(line, "}")
	if end <= start {
		return "", "", false
	}
	name = strings.TrimSpace(line[:start])
	if name == "" {
		return "", "", false
	}
	return name, line[start+1 : end], true
}
235+
236+
// parseLabelInfo scans a raw label string and reports which of the labels of
// interest are present. The labels string is expected to be the
// comma-separated key=value pairs inside {...}
// (e.g. `store="1",node_id="1",le="0.5"`).
//
// The error result is always nil; it is kept for interface stability with the
// call site.
func parseLabelInfo(labels string) (hasStore bool, hasNode bool, hasLE bool, err error) {
	for _, pair := range strings.Split(labels, ",") {
		pair = strings.TrimSpace(pair)
		if pair == "" {
			continue
		}
		// Only the label name matters; the value (after '=') is ignored.
		name, _, _ := strings.Cut(pair, "=")
		switch name {
		case "store":
			hasStore = true
		case "node_id":
			hasNode = true
		case "le":
			hasLE = true
		}
	}
	return hasStore, hasNode, hasLE, nil
}

pkg/kv/kvserver/metrics.go

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4623,6 +4623,7 @@ func newStoreMetrics(histogramWindow time.Duration) *StoreMetrics {
46234623
}
46244624

46254625
sm.categoryIterMetrics.init(storeRegistry)
4626+
sm.categoryDiskWriteMetrics.init(storeRegistry)
46264627

46274628
storeRegistry.AddMetricStruct(sm)
46284629
storeRegistry.AddMetricStruct(sm.LoadSplitterMetrics)
@@ -5143,6 +5144,36 @@ type pebbleCategoryDiskWriteMetricsContainer struct {
51435144
metricsMap syncutil.Map[vfs.DiskWriteCategory, pebbleCategoryDiskWriteMetrics]
51445145
}
51455146

5147+
// knownDiskWriteCategories contains all known disk write categories that should
// be eagerly initialized. This includes both Pebble internal categories and
// CockroachDB-defined categories.
//
// NOTE(review): the Pebble-internal category names are duplicated here as
// string literals; if Pebble renames a category this list silently drifts —
// confirm there is no exported Pebble constant that can be referenced instead.
var knownDiskWriteCategories = []vfs.DiskWriteCategory{
	// Pebble internal categories
	"pebble-wal",
	"pebble-memtable-flush",
	"pebble-compaction",
	"pebble-manifest",
	// CockroachDB categories (from pkg/storage/fs/category.go)
	vfs.WriteCategoryUnspecified,
	fs.RaftSnapshotWriteCategory,
	fs.SQLColumnSpillWriteCategory,
	fs.PebbleIngestionWriteCategory,
	fs.CRDBLogWriteCategory,
	fs.EncryptionRegistryWriteCategory,
}
5164+
5165+
// init records the registry and eagerly registers per-category disk write
// metrics for every known category, so all stores expose the same metric set
// from startup rather than lazily on the first write in each category.
func (m *pebbleCategoryDiskWriteMetricsContainer) init(registry *metric.Registry) {
	m.registry = registry
	// Eagerly initialize metrics for all known disk write categories to ensure
	// consistent metric visibility across all stores from startup.
	for _, category := range knownDiskWriteCategories {
		cm, ok := m.metricsMap.LoadOrStore(category, makePebbleCategorizedWriteMetrics(category))
		if !ok {
			// The category was newly stored (not previously present), so its
			// metrics struct has not been registered yet. NOTE(review): this
			// assumes LoadOrStore's second result reports "was already
			// present" — confirm against syncutil.Map.
			m.registry.AddMetricStruct(cm)
		}
	}
}
5176+
51465177
func (m *pebbleCategoryDiskWriteMetricsContainer) update(stats []vfs.DiskWriteStatsAggregate) {
51475178
for _, s := range stats {
51485179
cm, ok := m.metricsMap.Load(s.Category)

0 commit comments

Comments
 (0)