Skip to content

Commit 9742966

Browse files
committed
telemetry recording rule
This commit adds generation code and manifest marshalling for a telemetry recording rule resource. This takes the telemetry matches and generates a recording rule that is evaluated every $telemetry_interval (currently 4m30s). Every match creates a new time series with the `telemetry:` prefix. The telemetry metrics are sent via remote_write after relabel rules have removed the telemetry prefix and other unneeded labels. Signed-off-by: Jan Fajerski <[email protected]>
1 parent 2758b32 commit 9742966

File tree

8 files changed

+524
-20
lines changed

8 files changed

+524
-20
lines changed

assets/telemetry-recording-rules/prometheus-rule.yaml

Lines changed: 406 additions & 0 deletions
Large diffs are not rendered by default.
Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
// Parse the telemetry config to extract properly grouped matchers
2+
local telemetryConfigYaml = std.parseYaml(importstr '../../manifests/0000_50_cluster-monitoring-operator_04-config.yaml');
3+
local telemetryMatches = std.parseYaml(telemetryConfigYaml.data['metrics.yaml']).matches;
4+
5+
// Extract the metric name from a telemetry match expression such as
// '{__name__="up"}' or '{__name__=~"cluster:usage:.*"}'.
//
// The value of the __name__ matcher is used wherever that matcher appears in
// the expression. If the expression has no __name__ matcher, the first
// double-quoted string is used instead. For regex matchers
// every '.*' in the name is replaced with the literal 'wildcard'.
//
// Fails with a descriptive assertion message when no quoted value is found.
local extractMetricName(expr) =
  local exactMarker = '__name__="';
  local regexMarker = '__name__=~"';
  local exactMatch = std.findSubstr(exactMarker, expr);
  local regexMatch = std.findSubstr(regexMarker, expr);
  local isRegex = std.length(regexMatch) > 0;
  local quotes = std.findSubstr('"', expr);
  // Index of the quote that opens the name value (-1 when there is none).
  local openIdx =
    if isRegex then
      regexMatch[0] + std.length(regexMarker) - 1
    else if std.length(exactMatch) > 0 then
      exactMatch[0] + std.length(exactMarker) - 1
    else if std.length(quotes) > 0 then
      quotes[0]
    else
      -1;
  // Quotes located after the opening quote; the first one closes the value.
  local closing = [q for q in quotes if q > openIdx];
  assert openIdx >= 0 && std.length(closing) > 0 : 'telemetry match %s contains no quoted metric name' % expr;
  local name = expr[openIdx + 1:closing[0]];
  if isRegex then
    std.strReplace(name, '.*', 'wildcard')
  else
    name;
17+
18+
// Wrap expr in label_replace() when it selects metrics with a regex
// __name__ matcher; otherwise return expr unchanged.
//
// The wrapper copies __name__ into the "name_label" label. Without it
// Prometheus logs `execution: vector cannot contain metrics with the same labelset`,
// because the metric name is dropped while querying (see
// https://github.com/prometheus/prometheus/issues/11397). The original name is
// reinstated from "name_label" in the remote_write config.
local maybeAddNameLabel(expr) =
  local hasRegexNameMatcher = std.length(std.findSubstr('__name__=~"', expr)) != 0;
  if !hasRegexNameMatcher then
    expr
  else
    std.format('label_replace(%s,"name_label","$1","__name__", "(.+)")', [expr]);
28+
29+
// Build one recording rule per entry of telemetryMatches. Each rule records
// under 'telemetry:' + the extracted metric name and evaluates the match
// expression (wrapped by maybeAddNameLabel when it uses a regex name matcher).
local generateTelemetryRules() =
  local toRule(matcher) = {
    record: 'telemetry:' + extractMetricName(matcher),
    expr: maybeAddNameLabel(matcher),
  };
  std.map(toRule, telemetryMatches);
37+
38+
// Component entry point: returns an object whose prometheusRule field is a
// PrometheusRule manifest holding all generated telemetry recording rules in
// a single group. params must provide `namespace` and `commonLabels`.
function(params) {
  local rules = generateTelemetryRules(),
  local ruleGroup = {
    name: 'telemetry-recording.rules',
    interval: '4m30s',
    rules: rules,
  },

  prometheusRule: {
    apiVersion: 'monitoring.coreos.com/v1',
    kind: 'PrometheusRule',
    metadata: {
      name: 'telemetry-recording-rules',
      namespace: params.namespace,
      labels: params.commonLabels + { role: 'telemetry-rules' },
    },
    spec: {
      groups: [ruleGroup],
    },
  },
}

jsonnet/main.jsonnet

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ local thanosQuerier = import './components/thanos-querier.libsonnet';
2626

2727
local openshiftStateMetrics = import './components/openshift-state-metrics.libsonnet';
2828
local telemeterClient = import './components/telemeter-client.libsonnet';
29+
local telemetryRecordingRules = import './components/telemetry-recording-rules.libsonnet';
2930

3031
// Common configuration
3132
local commonConfig = {
@@ -386,6 +387,10 @@ local inCluster =
386387
},
387388
},
388389
},
390+
telemetryRecordingRules: {
391+
namespace: $.values.common.namespace,
392+
commonLabels+: $.values.common.commonLabels,
393+
},
389394
},
390395

391396
// Objects
@@ -430,6 +435,7 @@ local inCluster =
430435
telemeterClient: telemeterClient($.values.telemeterClient),
431436
monitoringPlugin: monitoringPlugin($.values.monitoringPlugin),
432437
openshiftStateMetrics: openshiftStateMetrics($.values.openshiftStateMetrics),
438+
telemetryRecordingRules: telemetryRecordingRules($.values.telemetryRecordingRules),
433439
} +
434440
(import './utils/anti-affinity.libsonnet') +
435441
(import 'github.com/prometheus-operator/kube-prometheus/jsonnet/kube-prometheus/addons/ksm-lite.libsonnet') +
@@ -535,6 +541,7 @@ setTerminationMessagePolicy(
535541
{ ['thanos-querier/' + name]: inCluster.thanosQuerier[name] for name in std.objectFields(inCluster.thanosQuerier) } +
536542
{ ['thanos-ruler/' + name]: inCluster.thanosRuler[name] for name in std.objectFields(inCluster.thanosRuler) } +
537543
{ ['control-plane/' + name]: inCluster.controlPlane[name] for name in std.objectFields(inCluster.controlPlane) } +
544+
{ ['telemetry-recording-rules/' + name]: inCluster.telemetryRecordingRules[name] for name in std.objectFields(inCluster.telemetryRecordingRules) } +
538545
{ ['manifests/' + name]: inCluster.manifests[name] for name in std.objectFields(inCluster.manifests) } +
539546
{}
540547
)

pkg/manifests/config.go

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -441,7 +441,7 @@ func (c *Config) applyDefaults() {
441441
}
442442
if c.ClusterMonitoringConfiguration.TelemeterClientConfig == nil {
443443
c.ClusterMonitoringConfiguration.TelemeterClientConfig = &TelemeterClientConfig{
444-
TelemeterServerURL: "https://infogw.api.openshift.com/",
444+
TelemeterServerURL: "https://infogw.api.openshift.com/metrics/v1/receive",
445445
}
446446
}
447447

@@ -515,9 +515,6 @@ func (c *Config) SetTelemetryMatches(matches []string) {
515515

516516
func (c *Config) SetRemoteWrite(rw bool) {
517517
c.RemoteWrite = rw
518-
if c.RemoteWrite && c.ClusterMonitoringConfiguration.TelemeterClientConfig.TelemeterServerURL == "https://infogw.api.openshift.com/" {
519-
c.ClusterMonitoringConfiguration.TelemeterClientConfig.TelemeterServerURL = "https://infogw.api.openshift.com/metrics/v1/receive"
520-
}
521518
}
522519

523520
func (c *Config) LoadClusterID(load func() (*configv1.ClusterVersion, error)) error {

pkg/manifests/manifests.go

Lines changed: 34 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -51,8 +51,6 @@ import (
5151
apiregistrationv1 "k8s.io/kube-aggregator/pkg/apis/apiregistration/v1"
5252
"k8s.io/utils/ptr"
5353
k8syaml "sigs.k8s.io/yaml"
54-
55-
"github.com/openshift/cluster-monitoring-operator/pkg/promqlgen"
5654
)
5755

5856
const (
@@ -258,6 +256,8 @@ var (
258256
TelemeterClientKubeRbacProxySecret = "telemeter-client/kube-rbac-proxy-secret.yaml"
259257
TelemeterClientPrometheusRule = "telemeter-client/prometheus-rule.yaml"
260258

259+
TelemetryRecordingRulesPrometheusRule = "telemetry-recording-rules/prometheus-rule.yaml"
260+
261261
ThanosQuerierDeployment = "thanos-querier/deployment.yaml"
262262
ThanosQuerierPodDisruptionBudget = "thanos-querier/pod-disruption-budget.yaml"
263263
ThanosQuerierService = "thanos-querier/service.yaml"
@@ -1383,12 +1383,7 @@ func (f *Factory) PrometheusK8s(grpcTLS *v1.Secret, telemetrySecret *v1.Secret)
13831383
}
13841384

13851385
clusterID := f.config.ClusterMonitoringConfiguration.TelemeterClientConfig.ClusterID
1386-
if f.config.ClusterMonitoringConfiguration.TelemeterClientConfig.IsEnabled() && f.config.RemoteWrite {
1387-
selectorRelabelConfig, err := promqlgen.LabelSelectorsToRelabelConfig(f.config.ClusterMonitoringConfiguration.PrometheusK8sConfig.TelemetryMatches)
1388-
if err != nil {
1389-
return nil, fmt.Errorf("generate label selector relabel config: %w", err)
1390-
}
1391-
1386+
if f.config.ClusterMonitoringConfiguration.TelemeterClientConfig.IsEnabled() {
13921387
p.Spec.Secrets = append(p.Spec.Secrets, telemetrySecret.GetName())
13931388

13941389
spec := monv1.RemoteWriteSpec{
@@ -1415,7 +1410,33 @@ func (f *Factory) PrometheusK8s(grpcTLS *v1.Secret, telemetrySecret *v1.Secret)
14151410
MaxBackoff: ptr.To(monv1.Duration("256s")),
14161411
},
14171412
WriteRelabelConfigs: []monv1.RelabelConfig{
1418-
*selectorRelabelConfig,
1413+
// Only send telemetry recording rules (metrics with telemetry: prefix)
1414+
{
1415+
SourceLabels: []monv1.LabelName{"__name__"},
1416+
Regex: "telemetry:.*",
1417+
Action: "keep",
1418+
},
1419+
// To support a regex matcher we track the
1420+
// original metric name in the recording rule.
1421+
// Here we reinstate the original name and drop
1422+
// the temp name.
1423+
// See also jsonnet/components/telemetry-recording-rules.libsonnet
1424+
{
1425+
SourceLabels: []monv1.LabelName{"__name__", "name_label"},
1426+
TargetLabel: "__name__",
1427+
Regex: "telemetry:.*;(.*)",
1428+
Replacement: ptr.To("$1"),
1429+
},
1430+
{
1431+
SourceLabels: []monv1.LabelName{"name_label"},
1432+
Action: "labeldrop",
1433+
},
1434+
{
1435+
SourceLabels: []monv1.LabelName{"__name__"},
1436+
TargetLabel: "__name__",
1437+
Regex: "telemetry:(.*)",
1438+
Replacement: ptr.To("$1"),
1439+
},
14191440
{
14201441
TargetLabel: "_id",
14211442
Replacement: ptr.To(clusterID),
@@ -3009,6 +3030,10 @@ func (f *Factory) TelemeterClientPrometheusRule() (*monv1.PrometheusRule, error)
30093030
return f.NewPrometheusRule(f.assets.MustNewAssetSlice(TelemeterClientPrometheusRule))
30103031
}
30113032

3033+
// TelemetryRecordingRulesPrometheusRule returns the PrometheusRule built from
// the asset registered under the TelemetryRecordingRulesPrometheusRule path.
func (f *Factory) TelemetryRecordingRulesPrometheusRule() (*monv1.PrometheusRule, error) {
3034+
return f.NewPrometheusRule(f.assets.MustNewAssetSlice(TelemetryRecordingRulesPrometheusRule))
3035+
}
3036+
30123037
// TelemeterClientDeployment generates a new Deployment for Telemeter client.
30133038
// If the passed ConfigMap is not empty it mounts the Trusted CA Bundle as a VolumeMount to
30143039
// /etc/pki/ca-trust/extracted/pem/ location.

pkg/manifests/manifests_test.go

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1110,8 +1110,11 @@ func TestPrometheusK8sRemoteWriteURLs(t *testing.T) {
11101110

11111111
return c
11121112
},
1113+
telemetrySecret: telemetrySecret,
11131114

1114-
expectedRemoteWriteURLs: nil,
1115+
expectedRemoteWriteURLs: []string{
1116+
"https://infogw.api.openshift.com/metrics/v1/receive",
1117+
},
11151118
},
11161119
{
11171120
name: "legacy telemetry and custom remote write",
@@ -1124,9 +1127,11 @@ func TestPrometheusK8sRemoteWriteURLs(t *testing.T) {
11241127

11251128
return c
11261129
},
1130+
telemetrySecret: telemetrySecret,
11271131

11281132
expectedRemoteWriteURLs: []string{
11291133
"http://custom",
1134+
"https://infogw.api.openshift.com/metrics/v1/receive",
11301135
},
11311136
},
11321137
{

pkg/tasks/clustermonitoringoperator.go

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -140,6 +140,15 @@ func (t *ClusterMonitoringOperatorTask) Run(ctx context.Context) error {
140140
return fmt.Errorf("reconciling cluster-monitoring-operator rules PrometheusRule failed: %w", err)
141141
}
142142

143+
trr, err := t.factory.TelemetryRecordingRulesPrometheusRule()
144+
if err != nil {
145+
return fmt.Errorf("initializing telemetry recording rules PrometheusRule failed: %w", err)
146+
}
147+
err = t.client.CreateOrUpdatePrometheusRule(ctx, trr)
148+
if err != nil {
149+
return fmt.Errorf("reconciling telemetry recording rules PrometheusRule failed: %w", err)
150+
}
151+
143152
smcmo, err := t.factory.ClusterMonitoringOperatorServiceMonitor()
144153
if err != nil {
145154
return fmt.Errorf("initializing Cluster Monitoring Operator ServiceMonitor failed: %w", err)

pkg/tasks/prometheus.go

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -346,16 +346,11 @@ func (t *PrometheusTask) create(ctx context.Context) error {
346346
return fmt.Errorf("initializing Prometheus telemetry secret failed: %w", err)
347347
}
348348

349-
if t.config.ClusterMonitoringConfiguration.TelemeterClientConfig.IsEnabled() && t.config.RemoteWrite {
349+
if t.config.ClusterMonitoringConfiguration.TelemeterClientConfig.IsEnabled() {
350350
klog.V(4).Info("updating Prometheus telemetry secret")
351351
if err = t.client.CreateOrUpdateSecret(ctx, telemetrySecret); err != nil {
352352
return fmt.Errorf("reconciling Prometheus telemetry secret failed: %w", err)
353353
}
354-
} else {
355-
klog.V(4).Info("deleting Prometheus telemetry secret")
356-
if err = t.client.DeleteSecret(ctx, telemetrySecret); err != nil {
357-
return fmt.Errorf("deleting Prometheus telemetry secret failed: %w", err)
358-
}
359354
}
360355

361356
{

0 commit comments

Comments
 (0)