|
| 1 | +// Copyright 2026 The Cockroach Authors. |
| 2 | +// |
| 3 | +// Use of this software is governed by the CockroachDB Software License |
| 4 | +// included in the /LICENSE file. |
| 5 | + |
| 6 | +package tests |
| 7 | + |
| 8 | +import ( |
| 9 | + "bufio" |
| 10 | + "context" |
| 11 | + "fmt" |
| 12 | + "reflect" |
| 13 | + "strings" |
| 14 | + "time" |
| 15 | + |
| 16 | + "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/cluster" |
| 17 | + "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/option" |
| 18 | + "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/registry" |
| 19 | + "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/roachtestutil" |
| 20 | + "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/spec" |
| 21 | + "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/test" |
| 22 | + "github.com/cockroachdb/cockroach/pkg/roachprod/install" |
| 23 | + "github.com/cockroachdb/cockroach/pkg/util/retry" |
| 24 | + "github.com/cockroachdb/errors" |
| 25 | + "golang.org/x/exp/slices" |
| 26 | +) |
| 27 | + |
| 28 | +const storeMetricsStoresPerNode = 4 |
| 29 | + |
| 30 | +func registerStoreMetrics(r registry.Registry) { |
| 31 | + r.Add(registry.TestSpec{ |
| 32 | + Name: "store-metrics", |
| 33 | + Owner: registry.OwnerKV, |
| 34 | + // Single node with multiple stores to test that all stores have the same |
| 35 | + // set of metrics. |
| 36 | + Cluster: r.MakeClusterSpec(1, spec.Disks(storeMetricsStoresPerNode)), |
| 37 | + CompatibleClouds: registry.OnlyGCE, |
| 38 | + Suites: registry.Suites(registry.Weekly), |
| 39 | + Timeout: 5 * time.Minute, |
| 40 | + EncryptionSupport: registry.EncryptionMetamorphic, |
| 41 | + Leases: registry.MetamorphicLeases, |
| 42 | + Run: runStoreMetrics, |
| 43 | + }) |
| 44 | +} |
| 45 | + |
| 46 | +type metricKey struct { |
| 47 | + name string |
| 48 | + hasNode bool |
| 49 | +} |
| 50 | + |
| 51 | +// runStoreMetrics verifies that store metrics are correctly labeled, including |
| 52 | +// the node label for asynchronously bootstrapped stores. See issue #159046 for |
| 53 | +// an example of such a bug. |
| 54 | +// |
| 55 | +// This test will catch inconsistency bugs across the entire system, not just |
| 56 | +// store-related metrics. Failures of this test should be forwarded to the |
| 57 | +// appropriate team based on the metric. |
| 58 | +// |
| 59 | +// The test: |
| 60 | +// 1. Starts a single-node cluster with multiple stores. |
| 61 | +// 2. Fetches /_status/vars and parses store-labeled metrics (excluding histogram buckets). |
| 62 | +// 3. Builds a map keyed by (metric name, hasNodeLabel) -> count of stores. |
| 63 | +// 4. Asserts (with retry) that each count equals the store count. |
| 64 | +// 5. Stops and restarts the node. |
| 65 | +// 6. Repeats the check and compares the two maps. |
| 66 | +func runStoreMetrics(ctx context.Context, t test.Test, c cluster.Cluster) { |
| 67 | + startOpts := option.DefaultStartOpts() |
| 68 | + startOpts.RoachprodOpts.StoreCount = storeMetricsStoresPerNode |
| 69 | + startSettings := install.MakeClusterSettings() |
| 70 | + |
| 71 | + getMetricsMap := func(ctx context.Context) (map[metricKey]int, error) { |
| 72 | + var result map[metricKey]int |
| 73 | + var lastInconsistencies []string |
| 74 | + if err := retry.ForDuration(30*time.Second, func() error { |
| 75 | + metricsMap, err := buildMetricsData(ctx, t, c) |
| 76 | + if err != nil { |
| 77 | + lastInconsistencies = nil |
| 78 | + return err |
| 79 | + } |
| 80 | + var inconsistencies []string |
| 81 | + for key, count := range metricsMap { |
| 82 | + if count != storeMetricsStoresPerNode { |
| 83 | + inconsistencies = append( |
| 84 | + inconsistencies, |
| 85 | + fmt.Sprintf("metric %q (has_node=%t) has %d entries; want %d", |
| 86 | + key.name, key.hasNode, count, storeMetricsStoresPerNode), |
| 87 | + ) |
| 88 | + } |
| 89 | + } |
| 90 | + lastInconsistencies = inconsistencies |
| 91 | + if len(inconsistencies) > 0 { |
| 92 | + return errors.New("metric inconsistencies found") |
| 93 | + } |
| 94 | + result = metricsMap |
| 95 | + return nil |
| 96 | + }); err != nil { |
| 97 | + if len(lastInconsistencies) > 20 { |
| 98 | + t.L().Printf("found %d metric inconsistencies (showing first 20)", len(lastInconsistencies)) |
| 99 | + lastInconsistencies = lastInconsistencies[:20] |
| 100 | + } else { |
| 101 | + t.L().Printf("found %d metric inconsistencies: ", len(lastInconsistencies)) |
| 102 | + } |
| 103 | + t.L().Printf("%s", strings.Join(lastInconsistencies, " ")) |
| 104 | + return nil, err |
| 105 | + } |
| 106 | + return result, nil |
| 107 | + } |
| 108 | + |
| 109 | + t.Status("starting cluster with multiple stores") |
| 110 | + c.Start(ctx, t.L(), startOpts, startSettings, c.Node(1)) |
| 111 | + |
| 112 | + t.Status("checking metrics parity between stores after initial start") |
| 113 | + metricsMap1, err := getMetricsMap(ctx) |
| 114 | + if err != nil { |
| 115 | + t.Fatal(err) |
| 116 | + } |
| 117 | + |
| 118 | + t.Status("stopping node") |
| 119 | + c.Stop(ctx, t.L(), option.DefaultStopOpts(), c.Node(1)) |
| 120 | + |
| 121 | + t.Status("restarting node") |
| 122 | + c.Start(ctx, t.L(), startOpts, startSettings, c.Node(1)) |
| 123 | + |
| 124 | + t.Status("checking metrics parity between stores after restart") |
| 125 | + metricsMap2, err := getMetricsMap(ctx) |
| 126 | + if err != nil { |
| 127 | + t.Fatal(err) |
| 128 | + } |
| 129 | + |
| 130 | + if !reflect.DeepEqual(metricsMap1, metricsMap2) { |
| 131 | + var diffs []string |
| 132 | + |
| 133 | + // Find metrics in map1 but not in map2. |
| 134 | + for key, count1 := range metricsMap1 { |
| 135 | + if _, ok := metricsMap2[key]; !ok { |
| 136 | + diffs = append(diffs, fmt.Sprintf("metric %q (has_node=%t): in map1 (count=%d) but NOT in map2", |
| 137 | + key.name, key.hasNode, count1)) |
| 138 | + } |
| 139 | + } |
| 140 | + |
| 141 | + // Find metrics in map2 but not in map1. |
| 142 | + for key, count2 := range metricsMap2 { |
| 143 | + if _, ok := metricsMap1[key]; !ok { |
| 144 | + diffs = append(diffs, fmt.Sprintf("metric %q (has_node=%t): in map2 (count=%d) but NOT in map1", |
| 145 | + key.name, key.hasNode, count2)) |
| 146 | + } |
| 147 | + } |
| 148 | + |
| 149 | + slices.Sort(diffs) |
| 150 | + t.L().Printf("found %d metric differences between before and after restart:", len(diffs)) |
| 151 | + for _, diff := range diffs { |
| 152 | + t.L().Printf(" %s", diff) |
| 153 | + } |
| 154 | + t.Fatalf("metrics map mismatch after restart") |
| 155 | + } |
| 156 | +} |
| 157 | + |
| 158 | +func buildMetricsData( |
| 159 | + ctx context.Context, t test.Test, c cluster.Cluster, |
| 160 | +) (map[metricKey]int, error) { |
| 161 | + adminUIAddrs, err := c.ExternalAdminUIAddr( |
| 162 | + ctx, t.L(), c.Node(1), option.VirtualClusterName(install.SystemInterfaceName), |
| 163 | + ) |
| 164 | + if err != nil { |
| 165 | + return nil, err |
| 166 | + } |
| 167 | + url := "https://" + adminUIAddrs[0] + "/_status/vars" |
| 168 | + client := roachtestutil.DefaultHTTPClient( |
| 169 | + c, t.L(), roachtestutil.VirtualCluster(install.SystemInterfaceName), |
| 170 | + ) |
| 171 | + resp, err := client.Get(ctx, url) |
| 172 | + if err != nil { |
| 173 | + return nil, err |
| 174 | + } |
| 175 | + defer resp.Body.Close() |
| 176 | + if resp.StatusCode != 200 { |
| 177 | + return nil, errors.Newf("invalid non-200 status code %v from %s", resp.StatusCode, url) |
| 178 | + } |
| 179 | + |
| 180 | + metricsMap := make(map[metricKey]int) |
| 181 | + scanner := bufio.NewScanner(resp.Body) |
| 182 | + for scanner.Scan() { |
| 183 | + name, labels, ok := parsePromLine(scanner.Text()) |
| 184 | + if !ok { |
| 185 | + continue |
| 186 | + } |
| 187 | + hasStore, hasNode, hasLE, err := parseLabelInfo(labels) |
| 188 | + if err != nil { |
| 189 | + return nil, err |
| 190 | + } |
| 191 | + if hasLE || !hasStore { |
| 192 | + continue |
| 193 | + } |
| 194 | + key := metricKey{name: name, hasNode: hasNode} |
| 195 | + metricsMap[key]++ |
| 196 | + } |
| 197 | + if err := scanner.Err(); err != nil { |
| 198 | + return nil, err |
| 199 | + } |
| 200 | + if len(metricsMap) == 0 { |
| 201 | + return nil, errors.New("no store metrics found in /_status/vars") |
| 202 | + } |
| 203 | + |
| 204 | + return metricsMap, nil |
| 205 | +} |
| 206 | + |
| 207 | +// parsePromLine extracts the metric name and the label section from a |
| 208 | +// Prometheus exposition line. It skips empty lines and comment/metadata lines |
| 209 | +// that start with '#'. The expected format is: |
| 210 | +// |
| 211 | +// metric_name{key="value",other="value"} <number> |
| 212 | +// |
| 213 | +// It returns the metric name and the raw label string between { and }. |
| 214 | +func parsePromLine(line string) (name, labels string, ok bool) { |
| 215 | + line = strings.TrimSpace(line) |
| 216 | + if line == "" || strings.HasPrefix(line, "#") { |
| 217 | + return "", "", false |
| 218 | + } |
| 219 | + start := strings.Index(line, "{") |
| 220 | + if start == -1 { |
| 221 | + return "", "", false |
| 222 | + } |
| 223 | + end := strings.Index(line[start+1:], "}") |
| 224 | + if end == -1 { |
| 225 | + return "", "", false |
| 226 | + } |
| 227 | + end += start + 1 |
| 228 | + name = strings.TrimSpace(line[:start]) |
| 229 | + labels = line[start+1 : end] |
| 230 | + if name == "" { |
| 231 | + return "", "", false |
| 232 | + } |
| 233 | + return name, labels, true |
| 234 | +} |
| 235 | + |
| 236 | +// parseLabelInfo scans a raw label string and extracts presence and values. |
| 237 | +// The labels string is expected to be the comma-separated key=value pairs |
| 238 | +// inside {...} (e.g. `store="1",node_id="1",le="0.5"`). |
| 239 | +func parseLabelInfo(labels string) (hasStore bool, hasNode bool, hasLE bool, err error) { |
| 240 | + for _, part := range strings.Split(labels, ",") { |
| 241 | + part = strings.TrimSpace(part) |
| 242 | + if part == "" { |
| 243 | + continue |
| 244 | + } |
| 245 | + kv := strings.SplitN(part, "=", 2) |
| 246 | + key := kv[0] |
| 247 | + switch key { |
| 248 | + case "store": |
| 249 | + hasStore = true |
| 250 | + case "node_id": |
| 251 | + hasNode = true |
| 252 | + case "le": |
| 253 | + hasLE = true |
| 254 | + } |
| 255 | + } |
| 256 | + return hasStore, hasNode, hasLE, nil |
| 257 | +} |
0 commit comments