Merge #143251 #144790 #145402

craig[bot] · yuzefovich · dhartunian · craig[bot] · commit 81377d1e2a49 · 2025-04-30T15:42:45.000Z
143251: sql: decode all plan gists in test builds r=yuzefovich a=yuzefovich **explain: remove a couple redundant nil checks** `OutputBuilder.Expr` already checks for `nil` argument, so we can skip doing that in a few places. **sql: decode all plan gists in test builds** This commit adds the logic to always decode plan gists in test builds right after the gist was created. This should have trivial overhead while providing extra test coverage for the feature. Note that we had `TestExplainGist` that was targeting this coverage, so to a certain degree that test is now obsolete. However, I decided to not completely remove it since the test has been good at finding issues unrelated to gists (it is effectively the `sqlsmith` roachtest that runs as a unit test). Informs: #143211 Epic: None Release note: None 144790: metric: document metrics in yaml format r=dhartunian a=dhartunian The generated metrics file is now in YAML format instead of HTML, which makes it easier to consume with automated tooling. The file is now structured in a hierarchical format by layer, then category, then in a list of metric names. Some new fields have been added: - `exported_name`: metric name used in the Prometheus endpoint that ends up shown in 3rd party observability tooling. - `essential`: boolean that flags a metric as essential for customers to use. - `how_to_use`: extended description that is included with essential metrics Resolves: #142571 Release note: None 145402: workflows: run automerge on schedule only r=jlinder a=rail Previously, the automerge workflow was triggered on every push to any branch. This commit changes the workflow to run only on a schedule, also removing the `on: workflow_dispatch` trigger. Epic: none Release note: none Co-authored-by: Yahor Yuzefovich <yahor@cockroachlabs.com> Co-authored-by: David Hartunian <davidh@cockroachlabs.com> Co-authored-by: Rail Aliiev <rail@iqchoice.com>
diff --git a/.github/workflows/auto-merge-backports.yml b/.github/workflows/auto-merge-backports.yml
@@ -3,8 +3,6 @@ name: Auto-Merge Test Backport PRs
 on:
   schedule:
     - cron: "0 * * * *"  # Every hour
-  workflow_dispatch:
-  push:
 
 jobs:
   auto-merge:
diff --git a/docs/generated/metrics/BUILD.bazel b/docs/generated/metrics/BUILD.bazel
@@ -1,7 +1,7 @@
 genrule(
     name = "metrics",
-    outs = ["metrics.html"],
-    cmd = "$(location //pkg/cmd/cockroach-short) gen metric-list --format=unnumbered-html --logtostderr=NONE > $@",
+    outs = ["metrics.yaml"],
+    cmd = "$(location //pkg/cmd/cockroach-short) gen metric-list --logtostderr=NONE > $@",
     tools = ["//pkg/cmd/cockroach-short"],
     visibility = [
         ":__pkg__",
diff --git a/docs/generated/metrics/metrics.html b/docs/generated/metrics/metrics.html
diff --git a/docs/generated/metrics/metrics.yaml b/docs/generated/metrics/metrics.yaml
diff --git a/pkg/cli/gen.go b/pkg/cli/gen.go
@@ -33,6 +33,7 @@ import (
 	slugify "github.com/mozillazg/go-slugify"
 	"github.com/spf13/cobra"
 	"github.com/spf13/cobra/doc"
+	"gopkg.in/yaml.v2"
 )
 
 var manPath string
@@ -335,32 +336,119 @@ Output the list of metrics typical for a node.
 			return err
 		}
 
-		// Sort by layer then metric name.
+		// Sort by layer then category name.
 		sort.Slice(sections, func(i, j int) bool {
 			return sections[i].MetricLayer < sections[j].MetricLayer ||
 				(sections[i].MetricLayer == sections[j].MetricLayer &&
 					sections[i].Title < sections[j].Title)
 		})
 
-		// Populate the resulting table.
-		cols := []string{"Layer", "Metric", "Description", "Y-Axis Label", "Type", "Unit", "Aggregation", "Derivative"}
-		var rows [][]string
+		// Structure for file is:
+		// layers:
+		//  - name: layer_name
+		//    categories:
+		//      - name: category_name
+		//        metrics:
+		//          - name: metric_name
+		//            exported_name: metric_exported_name
+		//            description: metric_description
+		//            y_axis_label: metric_y_axis_label
+		//            etc.
+
+		type MetricInfo struct {
+			Name         string `yaml:"name"`
+			ExportedName string `yaml:"exported_name"`
+			Description  string `yaml:"description"`
+			YAxisLabel   string `yaml:"y_axis_label"`
+			Type         string `yaml:"type"`
+			Unit         string `yaml:"unit"`
+			Aggregation  string `yaml:"aggregation"`
+			Derivative   string `yaml:"derivative"`
+			HowToUse     string `yaml:"how_to_use,omitempty"`
+			Essential    bool   `yaml:"essential,omitempty"`
+		}
+
+		type Category struct {
+			Name    string
+			Metrics []MetricInfo
+		}
+
+		type Layer struct {
+			Name       string
+			Categories []Category
+		}
+
+		type YAMLOutput struct {
+			Layers []*Layer
+		}
+
+		layers := make(map[string]*Layer)
 		for _, section := range sections {
-			rows = append(rows,
-				[]string{
-					section.MetricLayer.String(),
-					section.Title,
-					section.Charts[0].Metrics[0].Help,
-					section.Charts[0].AxisLabel,
-					section.Charts[0].Metrics[0].MetricType.String(),
-					section.Charts[0].Units.String(),
-					section.Charts[0].Aggregator.String(),
-					section.Charts[0].Derivative.String(),
+			// Get or create the layer that the current section is in
+			layerName := section.MetricLayer.String()
+			layer, ok := layers[layerName]
+			if !ok {
+				layer = &Layer{
+					Name:       layerName,
+					Categories: []Category{},
+				}
+				layers[layerName] = layer
+			}
+
+			// Every section is a separate category
+			category := Category{
+				Name: section.Title,
+			}
+
+			for _, chart := range section.Charts {
+				// There are many charts, but only 1 metric per chart.
+				metric := MetricInfo{
+					Name:         chart.Metrics[0].Name,
+					ExportedName: chart.Metrics[0].ExportedName,
+					Description:  chart.Metrics[0].Help,
+					YAxisLabel:   chart.AxisLabel,
+					Type:         chart.Metrics[0].MetricType.String(),
+					Unit:         chart.Units.String(),
+					Aggregation:  chart.Aggregator.String(),
+					Derivative:   chart.Derivative.String(),
+					HowToUse:     chart.Metrics[0].HowToUse,
+					Essential:    chart.Metrics[0].Essential,
+				}
+				category.Metrics = append(category.Metrics, metric)
+			}
+
+			layer.Categories = append(layer.Categories, category)
+		}
+
+		// Sort metrics within each category by name
+		for _, layer := range layers {
+			for _, cat := range layer.Categories {
+				sort.Slice(cat.Metrics, func(i, j int) bool {
+					return cat.Metrics[i].Name < cat.Metrics[j].Name
 				})
+
+			}
 		}
-		align := "dddddddd"
-		sliceIter := clisqlexec.NewRowSliceIter(rows, align)
-		return sqlExecCtx.PrintQueryOutput(os.Stdout, stderr, cols, sliceIter)
+
+		output := YAMLOutput{}
+
+		var layerNames []string
+		for name := range layers {
+			layerNames = append(layerNames, name)
+		}
+		sort.Strings(layerNames)
+
+		for _, layer := range layerNames {
+			output.Layers = append(output.Layers, layers[layer])
+		}
+
+		// Output YAML
+		yamlData, err := yaml.Marshal(output)
+		if err != nil {
+			return err
+		}
+		fmt.Fprintf(os.Stdout, "%s", yamlData)
+		return nil
 	},
 }
 
@@ -404,5 +492,7 @@ func init() {
 		[]string{"system-only", "system-visible", "application"},
 		"label to use in the output for the various setting classes")
 
+	genMetricListCmd.Flags().Bool("essential", false, "only emit essential metrics")
+
 	GenCmd.AddCommand(genCmds...)
 }
diff --git a/pkg/gen/docs.bzl b/pkg/gen/docs.bzl
@@ -8,7 +8,7 @@ DOCS_SRCS = [
     "//docs/generated/http:nodes-other.md",
     "//docs/generated/http:nodes-request.md",
     "//docs/generated/http:nodes-response.md",
-    "//docs/generated/metrics:metrics.html",
+    "//docs/generated/metrics:metrics.yaml",
     "//docs/generated/settings:settings-for-tenants.txt",
     "//docs/generated/settings:settings.html",
     "//docs/generated/sql/bnf:abort_stmt.bnf",
diff --git a/pkg/server/status/runtime.go b/pkg/server/status/runtime.go
@@ -173,6 +173,12 @@ var (
 		Help:        "Current user+system cpu percentage consumed by the CRDB process, normalized 0-1 by number of cores",
 		Measurement: "CPU Time",
 		Unit:        metric.Unit_PERCENT,
+		Essential:   true,
+		Category:    metric.Metadata_HARDWARE,
+		HowToUse: `This metric gives the CPU utilization percentage by the CockroachDB process. 
+		If it is equal to 1 (or 100%), then the CPU is overloaded. The CockroachDB process should 
+		not be running with over 80% utilization for extended periods of time (hours). This metric 
+		is used in the DB Console CPU Percent graph.`,
 	}
 	metaCPUNowNS = metric.Metadata{
 		Name:        "sys.cpu.now.ns",
diff --git a/pkg/sql/conn_executor_exec.go b/pkg/sql/conn_executor_exec.go
@@ -32,6 +32,7 @@ import (
 	"github.com/cockroachdb/cockroach/pkg/sql/execinfra"
 	"github.com/cockroachdb/cockroach/pkg/sql/execstats"
 	"github.com/cockroachdb/cockroach/pkg/sql/isql"
+	"github.com/cockroachdb/cockroach/pkg/sql/opt/cat"
 	"github.com/cockroachdb/cockroach/pkg/sql/opt/exec/explain"
 	"github.com/cockroachdb/cockroach/pkg/sql/paramparse"
 	"github.com/cockroachdb/cockroach/pkg/sql/parser"
@@ -3348,6 +3349,22 @@ func (ex *connExecutor) makeExecPlan(
 	// Include gist in error reports.
 	ih := &planner.instrumentation
 	ctx = withPlanGist(ctx, ih.planGist.String())
+	if buildutil.CrdbTestBuild && ih.planGist.String() != "" {
+		// Ensure that the gist can be decoded in test builds.
+		//
+		// In 50% of cases, use a nil catalog.
+		var catalog cat.Catalog
+		if ex.rng.internal.Float64() < 0.5 && !planner.SessionData().AllowRoleMembershipsToChangeDuringTransaction {
+			// For some reason, TestAllowRoleMembershipsToChangeDuringTransaction
+			// times out with non-nil catalog, so we'll keep it as nil when the
+			// session var is set to 'true' ('false' is the default).
+			catalog = planner.optPlanningCtx.catalog
+		}
+		_, err := explain.DecodePlanGistToRows(ctx, &planner.extendedEvalCtx.Context, ih.planGist.String(), catalog)
+		if err != nil {
+			return ctx, errors.NewAssertionErrorWithWrappedErrf(err, "failed to decode plan gist: %q", ih.planGist.String())
+		}
+	}
 
 	// Now that we have the plan gist, check whether we should get a bundle for
 	// it.
diff --git a/pkg/sql/exec_util.go b/pkg/sql/exec_util.go
@@ -1102,6 +1102,9 @@ var (
 		Unit:         metric.Unit_COUNT,
 		LabeledName:  "sql.count",
 		StaticLabels: metric.MakeLabelPairs(metric.LabelQueryType, "select"),
+		Essential:    true,
+		Category:     metric.Metadata_SQL,
+		HowToUse:     "This high-level metric reflects workload volume. Monitor this metric to identify abnormal application behavior or patterns over time. If abnormal patterns emerge, apply the metric's time range to the SQL Activity pages to investigate interesting outliers or patterns. For example, on the Transactions page and the Statements page, sort on the Execution Count column. To find problematic sessions, on the Sessions page, sort on the Transaction Count column. Find the sessions with high transaction counts and trace back to a user or application.",
 	}
 	MetaUpdateExecuted = metric.Metadata{
 		Name:         "sql.update.count",
@@ -1310,6 +1313,8 @@ func getMetricMeta(meta metric.Metadata, internal bool) metric.Metadata {
 		meta.Name += ".internal"
 		meta.Help += " (internal queries)"
 		meta.Measurement = "SQL Internal Statements"
+		meta.Essential = false
+		meta.HowToUse = ""
 		if meta.LabeledName != "" {
 			meta.StaticLabels = append(meta.StaticLabels, metric.MakeLabelPairs(metric.LabelQueryInternal, "true")...)
 		}
diff --git a/pkg/sql/opt/exec/explain/emit.go b/pkg/sql/opt/exec/explain/emit.go
@@ -740,12 +740,8 @@ func (e *emitter) emitNodeAttributes(ctx context.Context, evalCtx *eval.Context,
 
 	case limitOp:
 		a := n.args.(*limitArgs)
-		if a.Limit != nil {
-			ob.Expr("count", a.Limit, nil /* columns */)
-		}
-		if a.Offset != nil {
-			ob.Expr("offset", a.Offset, nil /* columns */)
-		}
+		ob.Expr("count", a.Limit, nil /* columns */)
+		ob.Expr("offset", a.Offset, nil /* columns */)
 
 	case sortOp:
 		a := n.args.(*sortArgs)
@@ -848,9 +844,7 @@ func (e *emitter) emitNodeAttributes(ctx context.Context, evalCtx *eval.Context,
 
 	case applyJoinOp:
 		a := n.args.(*applyJoinArgs)
-		if a.OnCond != nil {
-			ob.Expr("pred", a.OnCond, appendColumns(a.Left.Columns(), a.RightColumns...))
-		}
+		ob.Expr("pred", a.OnCond, appendColumns(a.Left.Columns(), a.RightColumns...))
 
 	case lookupJoinOp:
 		a := n.args.(*lookupJoinArgs)
diff --git a/pkg/ts/catalog/catalog_generator.go b/pkg/ts/catalog/catalog_generator.go
@@ -6,6 +6,8 @@
 package catalog
 
 import (
+	"fmt"
+
 	"github.com/cockroachdb/cockroach/pkg/ts/tspb"
 	"github.com/cockroachdb/cockroach/pkg/util/metric"
 	prometheusgo "github.com/prometheus/client_model/go"
@@ -39,6 +41,8 @@ func generateInternal(
 	metadata map[string]metric.Metadata, sl []ChartSection, metricLayer MetricLayer,
 ) []ChartSection {
 	avgAgg := tspb.TimeSeriesQueryAggregator_AVG
+	chartSections := make(map[metric.Metadata_Category]*ChartSection)
+
 	for name, meta := range metadata {
 		der := tspb.TimeSeriesQueryDerivative_NONE
 		if meta.MetricType == prometheusgo.MetricType_COUNTER {
@@ -50,33 +54,53 @@ func generateInternal(
 			dstUnit = AxisUnits(candidate)
 		}
 
-		sl = append(sl, ChartSection{
+		if meta.Essential {
+			if meta.Category == metric.Metadata_UNSET {
+				panic(fmt.Sprintf("Metric %s is essential but has no Category", name))
+			}
+			if meta.HowToUse == "" {
+				panic(fmt.Sprintf("Metric %s is essential but has no HowToUse", name))
+			}
+		}
+
+		section, ok := chartSections[meta.Category]
+		if !ok {
+			chartSections[meta.Category] = &ChartSection{
+				Title:           meta.Category.String(),
+				LongTitle:       meta.Category.String(),
+				CollectionTitle: meta.Category.String(),
+				Description:     meta.Category.String(),
+				Level:           0,
+				MetricLayer:     metricLayer,
+			}
+			section = chartSections[meta.Category]
+		}
+
+		section.Charts = append(section.Charts, &IndividualChart{
 			Title:           name,
 			LongTitle:       name,
 			CollectionTitle: name,
-			Description:     name,
-			Level:           0,
-			MetricLayer:     metricLayer,
-			Charts: []*IndividualChart{{
-				Title:           name,
-				LongTitle:       name,
-				CollectionTitle: name,
-				Downsampler:     &avgAgg,
-				Aggregator:      &avgAgg,
-				Derivative:      &der,
-				Units:           dstUnit,
-				AxisLabel:       meta.Measurement,
-				Metrics: []ChartMetric{
-					{
-						Name:           name,
-						Help:           meta.Help,
-						AxisLabel:      meta.Measurement,
-						PreferredUnits: dstUnit,
-						MetricType:     meta.MetricType,
-					},
+			Downsampler:     &avgAgg,
+			Aggregator:      &avgAgg,
+			Derivative:      &der,
+			Units:           dstUnit,
+			AxisLabel:       meta.Measurement,
+			Metrics: []ChartMetric{
+				{
+					ExportedName:   metric.ExportedName(name),
+					Name:           name,
+					Help:           meta.Help,
+					AxisLabel:      meta.Measurement,
+					PreferredUnits: dstUnit,
+					MetricType:     meta.MetricType,
+					Essential:      meta.Essential,
+					HowToUse:       meta.HowToUse,
 				},
-			}},
+			},
 		})
 	}
+	for _, s := range chartSections {
+		sl = append(sl, *s)
+	}
 	return sl
 }
diff --git a/pkg/ts/catalog/chart_catalog.proto b/pkg/ts/catalog/chart_catalog.proto
@@ -88,6 +88,12 @@ message ChartMetric {
   // should be of the same type to ensure the information displays behaves in
   // expected ways.
   optional io.prometheus.client.MetricType metricType = 5 [(gogoproto.nullable) = false];
+  // exportedName is the name of the metric as seen by external scrapers.
+  required string exportedName = 6 [(gogoproto.nullable) = false];
+  // essential is true when the metric is part of the essential list for customers to monitor.
+  required bool essential = 7 [(gogoproto.nullable) = false];
+  // howToUse is the usage instructions for the metric.
+  optional string howToUse = 8 [(gogoproto.nullable) = false];
 }
 
 // IndividualChart describes both the properties necessary to display
diff --git a/pkg/util/metric/metric.proto b/pkg/util/metric/metric.proto
diff --git a/pkg/util/metric/prometheus_exporter.go b/pkg/util/metric/prometheus_exporter.go
diff --git a/pkg/util/metric/registry.go b/pkg/util/metric/registry.go