Skip to content

Commit 66514d6

Browse files
alyshanjahani-crldhartunian
authored andcommitted
metric: Report aggregate metric when no child metrics are present
SQLMetrics are PrometheusIterable, they are the first and only instance of metrics that dynamically add and remove childset metrics. This commit fixes a bug where the includeAggregate flag being false (cluster setting disabled) resulted in these SQLMetrics not being reported. When a PrometheusIterable has no child metrics it should report the aggregate regardless of the cluster setting. Fixes: #149481 Release note (bug fix): When child metrics are enabled, include_aggregate is disabled, and the sql.metric.application_name/database_name are disabled, a handful of `sql` metrics were not being reported.
1 parent 71bcba2 commit 66514d6

File tree

3 files changed

+125
-29
lines changed

3 files changed

+125
-29
lines changed

pkg/server/status/recorder.go

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -98,6 +98,16 @@ var includeAggregateMetricsEnabled = settings.RegisterBoolSetting(
9898
true,
9999
settings.WithPublic)
100100

101+
// bugfix149481Enabled is a (temporary) cluster setting that fixes
102+
// https://github.com/cockroachdb/cockroach/issues/149481. It is true (enabled) by default on master,
103+
// and false on backports.
104+
var bugfix149481Enabled = settings.RegisterBoolSetting(
105+
settings.ApplicationLevel, "server.child_metrics.bugfix_missing_sql_metrics_149481.enabled",
106+
"fixes bug where certain SQL metrics are not being reported, see: "+
107+
"https://github.com/cockroachdb/cockroach/issues/149481",
108+
true,
109+
settings.WithVisibility(settings.Reserved))
110+
101111
// MetricsRecorder is used to periodically record the information in a number of
102112
// metric registries.
103113
//
@@ -348,10 +358,12 @@ func (mr *MetricsRecorder) ScrapeIntoPrometheusWithStaticLabels(
348358

349359
includeChildMetrics := ChildMetricsEnabled.Get(&mr.settings.SV)
350360
includeAggregateMetrics := includeAggregateMetricsEnabled.Get(&mr.settings.SV)
361+
reinitialisableBugFixEnabled := bugfix149481Enabled.Get(&mr.settings.SV)
351362
scrapeOptions := []metric.ScrapeOption{
352363
metric.WithIncludeChildMetrics(includeChildMetrics),
353364
metric.WithIncludeAggregateMetrics(includeAggregateMetrics),
354365
metric.WithUseStaticLabels(useStaticLabels),
366+
metric.WithReinitialisableBugFixEnabled(reinitialisableBugFixEnabled),
355367
}
356368
if mr.mu.nodeRegistry == nil {
357369
// We haven't yet processed initialization information; output nothing.

pkg/util/metric/aggmetric/agg_metric_test.go

Lines changed: 73 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -33,40 +33,87 @@ func TestExcludeAggregateMetrics(t *testing.T) {
3333
counter.AddChild("xyz")
3434
r.AddMetric(counter)
3535

36-
// Don't include child metrics, so only report the aggregate.
37-
// Flipping the includeAggregateMetrics flag should have no effect.
38-
testutils.RunTrueAndFalse(t, "includeChildMetrics=false,includeAggregateMetrics", func(t *testing.T, includeAggregateMetrics bool) {
36+
testutils.RunTrueAndFalse(t, "reinitialisableBugFixEnabled", func(t *testing.T, includeBugFix bool) {
37+
// Don't include child metrics, so only report the aggregate.
38+
// Flipping the includeAggregateMetrics flag should have no effect.
39+
testutils.RunTrueAndFalse(t, "includeChildMetrics=false,includeAggregateMetrics", func(t *testing.T, includeAggregateMetrics bool) {
40+
pe := metric.MakePrometheusExporter()
41+
pe.ScrapeRegistry(r, metric.WithIncludeChildMetrics(false), metric.WithIncludeAggregateMetrics(includeAggregateMetrics), metric.WithReinitialisableBugFixEnabled(includeBugFix))
42+
families, err := pe.Gather()
43+
require.NoError(t, err)
44+
require.Equal(t, 1, len(families))
45+
require.Equal(t, "counter", families[0].GetName())
46+
require.Equal(t, 1, len(families[0].GetMetric()))
47+
require.Equal(t, 0, len(families[0].GetMetric()[0].GetLabel()))
48+
})
49+
50+
testutils.RunTrueAndFalse(t, "includeChildMetrics=true,includeAggregateMetrics", func(t *testing.T, includeAggregateMetrics bool) {
51+
pe := metric.MakePrometheusExporter()
52+
pe.ScrapeRegistry(r, metric.WithIncludeChildMetrics(true), metric.WithIncludeAggregateMetrics(includeAggregateMetrics), metric.WithReinitialisableBugFixEnabled(includeBugFix))
53+
families, err := pe.Gather()
54+
require.NoError(t, err)
55+
require.Equal(t, 1, len(families))
56+
require.Equal(t, "counter", families[0].GetName())
57+
var labelPair *prometheusgo.LabelPair
58+
if includeAggregateMetrics {
59+
require.Equal(t, 2, len(families[0].GetMetric()))
60+
require.Equal(t, 0, len(families[0].GetMetric()[0].GetLabel()))
61+
require.Equal(t, 1, len(families[0].GetMetric()[1].GetLabel()))
62+
labelPair = families[0].GetMetric()[1].GetLabel()[0]
63+
} else {
64+
require.Equal(t, 1, len(families[0].GetMetric()))
65+
require.Equal(t, 1, len(families[0].GetMetric()[0].GetLabel()))
66+
labelPair = families[0].GetMetric()[0].GetLabel()[0]
67+
}
68+
require.Equal(t, "child_label", *labelPair.Name)
69+
require.Equal(t, "xyz", *labelPair.Value)
70+
})
71+
})
72+
73+
t.Run("reinitialisable metrics", func(t *testing.T) {
74+
r := metric.NewRegistry()
75+
counter := NewSQLCounter(metric.Metadata{Name: "counter"})
76+
r.AddMetric(counter)
77+
78+
// Aggregate disabled, but no children, so the aggregate is still reported.
3979
pe := metric.MakePrometheusExporter()
40-
pe.ScrapeRegistry(r, metric.WithIncludeChildMetrics(false), metric.WithIncludeAggregateMetrics(includeAggregateMetrics))
80+
pe.ScrapeRegistry(r, metric.WithIncludeChildMetrics(true), metric.WithIncludeAggregateMetrics(false), metric.WithReinitialisableBugFixEnabled(true))
4181
families, err := pe.Gather()
4282
require.NoError(t, err)
4383
require.Equal(t, 1, len(families))
44-
require.Equal(t, "counter", families[0].GetName())
4584
require.Equal(t, 1, len(families[0].GetMetric()))
46-
require.Equal(t, 0, len(families[0].GetMetric()[0].GetLabel()))
47-
})
48-
49-
testutils.RunTrueAndFalse(t, "includeChildMetrics=true,includeAggregateMetrics", func(t *testing.T, includeAggregateMetrics bool) {
50-
pe := metric.MakePrometheusExporter()
51-
pe.ScrapeRegistry(r, metric.WithIncludeChildMetrics(true), metric.WithIncludeAggregateMetrics(includeAggregateMetrics))
52-
families, err := pe.Gather()
85+
require.Equal(t, 0, len(families[0].GetMetric()[0].GetLabel())) // The aggregate has no labels.
86+
87+
// Add children (app and db label). Now only the childset metric is reported.
88+
r.ReinitialiseChildMetrics(true, true)
89+
counter.Inc(1, "db_foo", "app_foo")
90+
pe = metric.MakePrometheusExporter()
91+
pe.ScrapeRegistry(r, metric.WithIncludeChildMetrics(true), metric.WithIncludeAggregateMetrics(false), metric.WithReinitialisableBugFixEnabled(true))
92+
families, err = pe.Gather()
5393
require.NoError(t, err)
5494
require.Equal(t, 1, len(families))
55-
require.Equal(t, "counter", families[0].GetName())
56-
var labelPair *prometheusgo.LabelPair
57-
if includeAggregateMetrics {
58-
require.Equal(t, 2, len(families[0].GetMetric()))
59-
require.Equal(t, 0, len(families[0].GetMetric()[0].GetLabel()))
60-
require.Equal(t, 1, len(families[0].GetMetric()[1].GetLabel()))
61-
labelPair = families[0].GetMetric()[1].GetLabel()[0]
62-
} else {
63-
require.Equal(t, 1, len(families[0].GetMetric()))
64-
require.Equal(t, 1, len(families[0].GetMetric()[0].GetLabel()))
65-
labelPair = families[0].GetMetric()[0].GetLabel()[0]
66-
}
67-
require.Equal(t, "child_label", *labelPair.Name)
68-
require.Equal(t, "xyz", *labelPair.Value)
95+
require.Equal(t, 1, len(families[0].GetMetric()))
96+
// Childset metric labels.
97+
labelPair := families[0].GetMetric()[0].GetLabel()[0]
98+
require.Equal(t, "database", *labelPair.Name)
99+
labelPair = families[0].GetMetric()[0].GetLabel()[1]
100+
require.Equal(t, "application_name", *labelPair.Name)
101+
102+
// Enable aggregate. Now reporting two metrics (the aggregate and the childset).
103+
pe = metric.MakePrometheusExporter()
104+
pe.ScrapeRegistry(r, metric.WithIncludeChildMetrics(true), metric.WithIncludeAggregateMetrics(true), metric.WithReinitialisableBugFixEnabled(true))
105+
families, err = pe.Gather()
106+
require.NoError(t, err)
107+
require.Equal(t, 1, len(families))
108+
require.Equal(t, 2, len(families[0].GetMetric()))
109+
require.Equal(t, 0, len(families[0].GetMetric()[0].GetLabel())) // The aggregate has no labels.
110+
// Childset metric labels.
111+
labelPair = families[0].GetMetric()[1].GetLabel()[0]
112+
require.Equal(t, "database", *labelPair.Name)
113+
labelPair = families[0].GetMetric()[1].GetLabel()[1]
114+
require.Equal(t, "application_name", *labelPair.Name)
69115
})
116+
70117
}
71118

72119
func TestAggMetric(t *testing.T) {

pkg/util/metric/prometheus_exporter.go

Lines changed: 40 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -78,9 +78,10 @@ func (pm *PrometheusExporter) findOrCreateFamily(
7878
}
7979

8080
type scrapeOptions struct {
81-
includeChildMetrics bool
82-
includeAggregateMetrics bool
83-
useStaticLabels bool
81+
includeChildMetrics bool
82+
includeAggregateMetrics bool
83+
useStaticLabels bool
84+
reinitialisableBugFixEnabled bool
8485
}
8586

8687
// ScrapeOption is a function that modifies scrapeOptions
@@ -107,6 +108,13 @@ func WithUseStaticLabels(use bool) ScrapeOption {
107108
}
108109
}
109110

111+
// WithReinitialisableBugFixEnabled returns an option to set whether the reinitialisable bug fix is enabled
112+
func WithReinitialisableBugFixEnabled(enabled bool) ScrapeOption {
113+
return func(o *scrapeOptions) {
114+
o.reinitialisableBugFixEnabled = enabled
115+
}
116+
}
117+
110118
// applyScrapeOptions creates a new scrapeOptions with the given options applied
111119
func applyScrapeOptions(options ...ScrapeOption) *scrapeOptions {
112120
opts := &scrapeOptions{
@@ -125,6 +133,7 @@ func applyScrapeOptions(options ...ScrapeOption) *scrapeOptions {
125133
func (pm *PrometheusExporter) ScrapeRegistry(registry *Registry, options ...ScrapeOption) {
126134
o := applyScrapeOptions(options...)
127135
labels := registry.GetLabels()
136+
128137
f := func(name string, v interface{}) {
129138
switch prom := v.(type) {
130139
case PrometheusVector:
@@ -138,6 +147,34 @@ func (pm *PrometheusExporter) ScrapeRegistry(registry *Registry, options ...Scra
138147
}
139148

140149
case PrometheusExportable:
150+
if _, ok := v.(PrometheusReinitialisable); ok && o.reinitialisableBugFixEnabled {
151+
m := prom.ToPrometheusMetric()
152+
// Set registry and metric labels.
153+
m.Label = append(labels, prom.GetLabels(o.useStaticLabels)...)
154+
family := pm.findOrCreateFamily(prom, o)
155+
156+
if o.includeAggregateMetrics {
157+
family.Metric = append(family.Metric, m)
158+
}
159+
160+
promIter, ok := v.(PrometheusIterable)
161+
numChildren := 0
162+
if ok && o.includeChildMetrics {
163+
promIter.Each(m.Label, func(metric *prometheusgo.Metric) {
164+
family.Metric = append(family.Metric, metric)
165+
numChildren += 1
166+
})
167+
}
168+
169+
// PrometheusReinitialisable metrics (like SQLMetric) dynamically
170+
// add child metrics. If no child metrics are present we want to ensure
171+
// we report the aggregate regardless of the respective cluster setting.
172+
if numChildren == 0 && !o.includeAggregateMetrics {
173+
family.Metric = append(family.Metric, m)
174+
}
175+
return
176+
}
177+
141178
m := prom.ToPrometheusMetric()
142179
// Set registry and metric labels.
143180
m.Label = append(labels, prom.GetLabels(o.useStaticLabels)...)

0 commit comments

Comments
 (0)