Skip to content

Commit 730a164

Browse files
craig[bot]alyshanjahani-crl
andcommitted
Merge #141601
141601: metric: Add functionality to disable aggregate metric reporting r=alyshanjahani-crl a=alyshanjahani-crl Previously when server.child_metrics.enabled was true, the prometheus exporter would include the time series with the child labels as well as the aggregate time series. This behaviour is not ideal as it can lead to double counting. For example, consider the following metric: counter_foo{node_id="1"} 6 counter_foo{node_id="1", child_metric_label="bar"} 3 counter_foo{node_id="1", child_metric_label="xyz"} 3 The aggregate metric is always the sum of all the child metrics. If a user were to export these metrics and query them like so: sum(counter_foo) by node_id They would get back 6+3+3=12 when really they should be getting back 6. This commit introduces a cluster setting server.child_metrics.include_aggregate.enabled which is false by default. Fixes: https://cockroachlabs.atlassian.net/browse/CC-31395 Release note (ops change): Modifies the default behaviour of prometheus metric reporting (/_status/vars) by not including the aggregate time series. This prevents issues with double counting when querying metrics. Note that this reporting behaviour can be toggled by a new cluster setting server.child_metrics.include_aggregate.enabled. By default it is false, but by setting it to true, the (undesired) behaviour of reporting the aggregate timeseries is restored. Co-authored-by: Alyshan Jahani <[email protected]>
2 parents d404b01 + f7119d1 commit 730a164

File tree

12 files changed

+92
-29
lines changed

12 files changed

+92
-29
lines changed

docs/generated/settings/settings-for-tenants.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -111,6 +111,7 @@ server.auth_log.sql_connections.enabled boolean false if set, log SQL client con
111111
server.auth_log.sql_sessions.enabled boolean false if set, log verbose SQL session authentication events to the SESSIONS log channel (note: may hinder performance on loaded nodes). Session start and end events are always logged regardless of this setting; disable the SESSIONS log channel to suppress them. application
112112
server.authentication_cache.enabled boolean true enables a cache used during authentication to avoid lookups to system tables when retrieving per-user authentication-related information application
113113
server.child_metrics.enabled boolean false enables the exporting of child metrics, additional prometheus time series with extra labels application
114+
server.child_metrics.include_aggregate.enabled boolean true include the reporting of the aggregate time series when child metrics are enabled. This cluster setting has no effect if child metrics are disabled. application
114115
server.client_cert_expiration_cache.capacity integer 1000 the maximum number of client cert expirations stored application
115116
server.clock.forward_jump_check.enabled (alias: server.clock.forward_jump_check_enabled) boolean false if enabled, forward clock jumps > max_offset/2 will cause a panic application
116117
server.clock.persist_upper_bound_interval duration 0s the interval between persisting the wall time upper bound of the clock. The clock does not generate a wall time greater than the persisted timestamp and will panic if it sees a wall time greater than this value. When cockroach starts, it waits for the wall time to catch-up till this persisted timestamp. This guarantees monotonic wall time across server restarts. Not setting this or setting a value of 0 disables this feature. application

docs/generated/settings/settings.html

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -141,6 +141,7 @@
141141
<tr><td><div id="setting-server-auth-log-sql-sessions-enabled" class="anchored"><code>server.auth_log.sql_sessions.enabled</code></div></td><td>boolean</td><td><code>false</code></td><td>if set, log verbose SQL session authentication events to the SESSIONS log channel (note: may hinder performance on loaded nodes). Session start and end events are always logged regardless of this setting; disable the SESSIONS log channel to suppress them.</td><td>Serverless/Dedicated/Self-Hosted</td></tr>
142142
<tr><td><div id="setting-server-authentication-cache-enabled" class="anchored"><code>server.authentication_cache.enabled</code></div></td><td>boolean</td><td><code>true</code></td><td>enables a cache used during authentication to avoid lookups to system tables when retrieving per-user authentication-related information</td><td>Serverless/Dedicated/Self-Hosted</td></tr>
143143
<tr><td><div id="setting-server-child-metrics-enabled" class="anchored"><code>server.child_metrics.enabled</code></div></td><td>boolean</td><td><code>false</code></td><td>enables the exporting of child metrics, additional prometheus time series with extra labels</td><td>Serverless/Dedicated/Self-Hosted</td></tr>
144+
<tr><td><div id="setting-server-child-metrics-include-aggregate-enabled" class="anchored"><code>server.child_metrics.include_aggregate.enabled</code></div></td><td>boolean</td><td><code>true</code></td><td>include the reporting of the aggregate time series when child metrics are enabled. This cluster setting has no effect if child metrics are disabled.</td><td>Serverless/Dedicated/Self-Hosted</td></tr>
144145
<tr><td><div id="setting-server-client-cert-expiration-cache-capacity" class="anchored"><code>server.client_cert_expiration_cache.capacity</code></div></td><td>integer</td><td><code>1000</code></td><td>the maximum number of client cert expirations stored</td><td>Serverless/Dedicated/Self-Hosted</td></tr>
145146
<tr><td><div id="setting-server-clock-forward-jump-check-enabled" class="anchored"><code>server.clock.forward_jump_check.enabled<br />(alias: server.clock.forward_jump_check_enabled)</code></div></td><td>boolean</td><td><code>false</code></td><td>if enabled, forward clock jumps &gt; max_offset/2 will cause a panic</td><td>Serverless/Dedicated/Self-Hosted</td></tr>
146147
<tr><td><div id="setting-server-clock-persist-upper-bound-interval" class="anchored"><code>server.clock.persist_upper_bound_interval</code></div></td><td>duration</td><td><code>0s</code></td><td>the interval between persisting the wall time upper bound of the clock. The clock does not generate a wall time greater than the persisted timestamp and will panic if it sees a wall time greater than this value. When cockroach starts, it waits for the wall time to catch-up till this persisted timestamp. This guarantees monotonic wall time across server restarts. Not setting this or setting a value of 0 disables this feature.</td><td>Serverless/Dedicated/Self-Hosted</td></tr>

pkg/ccl/sqlproxyccl/server.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -149,7 +149,7 @@ func (s *Server) handleVars(w http.ResponseWriter, r *http.Request) {
149149
contentType := expfmt.Negotiate(r.Header)
150150
w.Header().Set(httputil.ContentTypeHeader, string(contentType))
151151
scrape := func(pm *metric.PrometheusExporter) {
152-
pm.ScrapeRegistry(s.metricsRegistry, true /* includeChildMetrics*/)
152+
pm.ScrapeRegistry(s.metricsRegistry, true /* includeChildMetrics*/, true /* includeAggregateMetrics */)
153153
}
154154
if err := s.prometheusExporter.ScrapeAndPrintAsText(w, contentType, scrape); err != nil {
155155
log.Errorf(r.Context(), "%v", err)

pkg/kv/kvserver/client_tenant_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -99,7 +99,7 @@ func TestTenantsStorageMetricsOnSplit(t *testing.T) {
9999
})
100100
ex := metric.MakePrometheusExporter()
101101
scrape := func(ex *metric.PrometheusExporter) {
102-
ex.ScrapeRegistry(store.Registry(), true /* includeChildMetrics */)
102+
ex.ScrapeRegistry(store.Registry(), true /* includeChildMetrics */, true)
103103
}
104104
var in bytes.Buffer
105105
if err := ex.ScrapeAndPrintAsText(&in, expfmt.FmtText, scrape); err != nil {

pkg/server/load_endpoint.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -89,7 +89,7 @@ func newLoadEndpoint(
8989

9090
// Exporter for the load vars that are provided only by the load handler.
9191
func (le *loadEndpoint) scrapeLoadVarsIntoPrometheus(pm *metric.PrometheusExporter) {
92-
pm.ScrapeRegistry(le.registry, true)
92+
pm.ScrapeRegistry(le.registry, true, true)
9393
}
9494

9595
// Handler responsible for serving the instant values of selected

pkg/server/status/recorder.go

Lines changed: 16 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -89,6 +89,15 @@ var ChildMetricsEnabled = settings.RegisterBoolSetting(
8989
false,
9090
settings.WithPublic)
9191

92+
// includeAggregateMetricsEnabled enables the exporting of the aggregate time series when child
93+
// metrics are enabled.
94+
var includeAggregateMetricsEnabled = settings.RegisterBoolSetting(
95+
settings.ApplicationLevel, "server.child_metrics.include_aggregate.enabled",
96+
"include the reporting of the aggregate time series when child metrics are enabled. This cluster setting "+
97+
"has no effect if child metrics are disabled.",
98+
true,
99+
settings.WithPublic)
100+
92101
// MetricsRecorder is used to periodically record the information in a number of
93102
// metric registries.
94103
//
@@ -336,15 +345,16 @@ func (mr *MetricsRecorder) ScrapeIntoPrometheus(pm *metric.PrometheusExporter) {
336345
}
337346
}
338347
includeChildMetrics := ChildMetricsEnabled.Get(&mr.settings.SV)
339-
pm.ScrapeRegistry(mr.mu.nodeRegistry, includeChildMetrics)
340-
pm.ScrapeRegistry(mr.mu.appRegistry, includeChildMetrics)
341-
pm.ScrapeRegistry(mr.mu.logRegistry, includeChildMetrics)
342-
pm.ScrapeRegistry(mr.mu.sysRegistry, includeChildMetrics)
348+
includeAggregateMetrics := includeAggregateMetricsEnabled.Get(&mr.settings.SV)
349+
pm.ScrapeRegistry(mr.mu.nodeRegistry, includeChildMetrics, includeAggregateMetrics)
350+
pm.ScrapeRegistry(mr.mu.appRegistry, includeChildMetrics, includeAggregateMetrics)
351+
pm.ScrapeRegistry(mr.mu.logRegistry, includeChildMetrics, includeAggregateMetrics)
352+
pm.ScrapeRegistry(mr.mu.sysRegistry, includeChildMetrics, includeAggregateMetrics)
343353
for _, reg := range mr.mu.storeRegistries {
344-
pm.ScrapeRegistry(reg, includeChildMetrics)
354+
pm.ScrapeRegistry(reg, includeChildMetrics, includeAggregateMetrics)
345355
}
346356
for _, tenantRegistry := range mr.mu.tenantRegistries {
347-
pm.ScrapeRegistry(tenantRegistry, includeChildMetrics)
357+
pm.ScrapeRegistry(tenantRegistry, includeChildMetrics, includeAggregateMetrics)
348358
}
349359
}
350360

pkg/sql/mvcc_statistics_update_job_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -127,7 +127,7 @@ func TestTenantGlobalAggregatedLivebytes(t *testing.T) {
127127
&in,
128128
expfmt.FmtText,
129129
func(ex *metric.PrometheusExporter) {
130-
ex.ScrapeRegistry(r, true /* includeChildMetrics */)
130+
ex.ScrapeRegistry(r, true /* includeChildMetrics */, true)
131131
},
132132
)
133133
require.NoError(t, err)

pkg/testutils/metrictestutils/metrics_text.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ import (
2121
func GetMetricsText(registry *metric.Registry, re *regexp.Regexp) (string, error) {
2222
ex := metric.MakePrometheusExporter()
2323
scrape := func(ex *metric.PrometheusExporter) {
24-
ex.ScrapeRegistry(registry, true /* includeChildMetrics */)
24+
ex.ScrapeRegistry(registry, true /* includeChildMetrics */, true)
2525
}
2626
var in bytes.Buffer
2727
if err := ex.ScrapeAndPrintAsText(&in, expfmt.FmtText, scrape); err != nil {

pkg/util/metric/aggmetric/BUILD.bazel

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ go_test(
3131
deps = [
3232
"//pkg/base",
3333
"//pkg/roachpb",
34+
"//pkg/testutils",
3435
"//pkg/testutils/datapathutils",
3536
"//pkg/testutils/echotest",
3637
"//pkg/util/leaktest",

pkg/util/metric/aggmetric/agg_metric_test.go

Lines changed: 44 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ import (
1515

1616
"github.com/cockroachdb/cockroach/pkg/base"
1717
"github.com/cockroachdb/cockroach/pkg/roachpb"
18+
"github.com/cockroachdb/cockroach/pkg/testutils"
1819
"github.com/cockroachdb/cockroach/pkg/testutils/datapathutils"
1920
"github.com/cockroachdb/cockroach/pkg/testutils/echotest"
2021
"github.com/cockroachdb/cockroach/pkg/util/leaktest"
@@ -24,6 +25,48 @@ import (
2425
"github.com/stretchr/testify/require"
2526
)
2627

28+
func TestExcludeAggregateMetrics(t *testing.T) {
29+
r := metric.NewRegistry()
30+
counter := NewCounter(metric.Metadata{Name: "counter"}, "child_label")
31+
counter.AddChild("xyz")
32+
r.AddMetric(counter)
33+
34+
// Don't include child metrics, so only report the aggregate.
35+
// Flipping the includeAggregateMetrics flag should have no effect.
36+
testutils.RunTrueAndFalse(t, "includeChildMetrics=false,includeAggregateMetrics", func(t *testing.T, includeAggregateMetrics bool) {
37+
pe := metric.MakePrometheusExporter()
38+
pe.ScrapeRegistry(r, false, includeAggregateMetrics)
39+
families, err := pe.Gather()
40+
require.NoError(t, err)
41+
require.Equal(t, 1, len(families))
42+
require.Equal(t, "counter", families[0].GetName())
43+
require.Equal(t, 1, len(families[0].GetMetric()))
44+
require.Equal(t, 0, len(families[0].GetMetric()[0].GetLabel()))
45+
})
46+
47+
testutils.RunTrueAndFalse(t, "includeChildMetrics=true,includeAggregateMetrics", func(t *testing.T, includeAggregateMetrics bool) {
48+
pe := metric.MakePrometheusExporter()
49+
pe.ScrapeRegistry(r, true, includeAggregateMetrics)
50+
families, err := pe.Gather()
51+
require.NoError(t, err)
52+
require.Equal(t, 1, len(families))
53+
require.Equal(t, "counter", families[0].GetName())
54+
var labelPair *prometheusgo.LabelPair
55+
if includeAggregateMetrics {
56+
require.Equal(t, 2, len(families[0].GetMetric()))
57+
require.Equal(t, 0, len(families[0].GetMetric()[0].GetLabel()))
58+
require.Equal(t, 1, len(families[0].GetMetric()[1].GetLabel()))
59+
labelPair = families[0].GetMetric()[1].GetLabel()[0]
60+
} else {
61+
require.Equal(t, 1, len(families[0].GetMetric()))
62+
require.Equal(t, 1, len(families[0].GetMetric()[0].GetLabel()))
63+
labelPair = families[0].GetMetric()[0].GetLabel()[0]
64+
}
65+
require.Equal(t, "child_label", *labelPair.Name)
66+
require.Equal(t, "xyz", *labelPair.Value)
67+
})
68+
}
69+
2770
func TestAggMetric(t *testing.T) {
2871
defer leaktest.AfterTest(t)()
2972

@@ -32,7 +75,7 @@ func TestAggMetric(t *testing.T) {
3275
var in bytes.Buffer
3376
ex := metric.MakePrometheusExporter()
3477
scrape := func(ex *metric.PrometheusExporter) {
35-
ex.ScrapeRegistry(r, true /* includeChildMetrics */)
78+
ex.ScrapeRegistry(r, true /* includeChildMetrics */, true)
3679
}
3780
require.NoError(t, ex.ScrapeAndPrintAsText(&in, expfmt.FmtText, scrape))
3881
var lines []string

0 commit comments

Comments
 (0)