diff --git a/CHANGELOG.md b/CHANGELOG.md index 7cf701b1b1..727552abca 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,9 @@ # Note: This CHANGELOG is only for the monitoring team to track all monitoring related changes. Please see OpenShift release notes for official changes. +## 4.XX + +- [#2694](https://github.com/openshift/cluster-monitoring-operator/pull/2694) Add "telemetry" profile to the set of supported collection profiles. Switching to this profile will disable collection of all metrics except those required for telemetry purposes. + ## 4.20 - [#2595](https://github.com/openshift/cluster-monitoring-operator/pull/2595) Multi-tenant support for KSM's CRS feature-set downstream. diff --git a/assets/alertmanager/telemetry-service-monitor.yaml b/assets/alertmanager/telemetry-service-monitor.yaml new file mode 100644 index 0000000000..0e9ec32f46 --- /dev/null +++ b/assets/alertmanager/telemetry-service-monitor.yaml @@ -0,0 +1,34 @@ +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + labels: + app.kubernetes.io/component: alert-router + app.kubernetes.io/instance: main + app.kubernetes.io/managed-by: cluster-monitoring-operator + app.kubernetes.io/name: alertmanager + app.kubernetes.io/part-of: openshift-monitoring + app.kubernetes.io/version: 0.28.1 + monitoring.openshift.io/collection-profile: telemetry + name: alertmanager-main-telemetry + namespace: openshift-monitoring +spec: + endpoints: + - bearerTokenFile: "" + interval: 30s + metricRelabelings: + - action: keep + regex: (alertmanager_integrations|scrape_samples_post_metric_relabeling|scrape_series_added|up) + sourceLabels: + - __name__ + port: metrics + scheme: https + tlsConfig: + insecureSkipVerify: false + serverName: alertmanager-main.openshift-monitoring.svc + scrapeClass: tls-client-certificate-auth + selector: + matchLabels: + app.kubernetes.io/component: alert-router + app.kubernetes.io/instance: main + app.kubernetes.io/name: alertmanager + app.kubernetes.io/part-of: 
openshift-monitoring diff --git a/assets/cluster-monitoring-operator/telemetry-service-monitor.yaml b/assets/cluster-monitoring-operator/telemetry-service-monitor.yaml new file mode 100644 index 0000000000..f0aa8fdc2f --- /dev/null +++ b/assets/cluster-monitoring-operator/telemetry-service-monitor.yaml @@ -0,0 +1,27 @@ +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + labels: + app.kubernetes.io/managed-by: cluster-monitoring-operator + app.kubernetes.io/name: cluster-monitoring-operator + app.kubernetes.io/part-of: openshift-monitoring + monitoring.openshift.io/collection-profile: telemetry + name: cluster-monitoring-operator-telemetry + namespace: openshift-monitoring +spec: + endpoints: + - bearerTokenFile: "" + metricRelabelings: + - action: keep + regex: (cluster_monitoring_operator_collection_profile|scrape_samples_post_metric_relabeling|scrape_series_added|up) + sourceLabels: + - __name__ + port: https + scheme: https + tlsConfig: + insecureSkipVerify: false + serverName: cluster-monitoring-operator.openshift-monitoring.svc + scrapeClass: tls-client-certificate-auth + selector: + matchLabels: + app.kubernetes.io/name: cluster-monitoring-operator diff --git a/assets/control-plane/telemetry-service-monitor-kubelet.yaml b/assets/control-plane/telemetry-service-monitor-kubelet.yaml new file mode 100644 index 0000000000..0388cb5e40 --- /dev/null +++ b/assets/control-plane/telemetry-service-monitor-kubelet.yaml @@ -0,0 +1,118 @@ +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + labels: + app.kubernetes.io/component: kubernetes + app.kubernetes.io/managed-by: cluster-monitoring-operator + app.kubernetes.io/name: kubelet + app.kubernetes.io/part-of: openshift-monitoring + k8s-app: kubelet + monitoring.openshift.io/collection-profile: telemetry + name: kubelet-telemetry + namespace: openshift-monitoring +spec: + attachMetadata: + node: true + endpoints: + - bearerTokenFile: "" + honorLabels: true + interval: 30s + 
metricRelabelings: + - action: keep + regex: (apiserver_storage_objects|container_cpu_usage_seconds_total|container_memory_working_set_bytes|kubelet_containers_per_pod_count_sum|up) + sourceLabels: + - __name__ + port: https-metrics + relabelings: + - action: replace + sourceLabels: + - __metrics_path__ + targetLabel: metrics_path + scheme: https + scrapeTimeout: 30s + tlsConfig: + caFile: /etc/prometheus/configmaps/kubelet-serving-ca-bundle/ca-bundle.crt + insecureSkipVerify: false + - bearerTokenFile: "" + honorLabels: true + honorTimestamps: true + interval: 30s + metricRelabelings: + - action: labeldrop + regex: __tmp_keep_metric + - action: keep + regex: (apiserver_storage_objects|container_cpu_usage_seconds_total|container_memory_working_set_bytes|kubelet_containers_per_pod_count_sum|up) + sourceLabels: + - __name__ + path: /metrics/cadvisor + port: https-metrics + relabelings: + - action: replace + sourceLabels: + - __metrics_path__ + targetLabel: metrics_path + scheme: https + scrapeTimeout: 30s + tlsConfig: + caFile: /etc/prometheus/configmaps/kubelet-serving-ca-bundle/ca-bundle.crt + insecureSkipVerify: false + trackTimestampsStaleness: true + - bearerTokenFile: "" + honorLabels: true + interval: 30s + metricRelabelings: + - action: keep + regex: (apiserver_storage_objects|container_cpu_usage_seconds_total|container_memory_working_set_bytes|kubelet_containers_per_pod_count_sum|up) + sourceLabels: + - __name__ + path: /metrics/probes + port: https-metrics + relabelings: + - action: replace + sourceLabels: + - __metrics_path__ + targetLabel: metrics_path + scheme: https + scrapeTimeout: 30s + tlsConfig: + caFile: /etc/prometheus/configmaps/kubelet-serving-ca-bundle/ca-bundle.crt + insecureSkipVerify: false + - bearerTokenFile: "" + interval: 30s + metricRelabelings: + - action: keep + regex: (apiserver_storage_objects|container_cpu_usage_seconds_total|container_memory_working_set_bytes|kubelet_containers_per_pod_count_sum|up) + sourceLabels: + - __name__ + 
port: https-metrics + relabelings: + - action: keep + regex: (linux|) + sourceLabels: + - __meta_kubernetes_node_label_kubernetes_io_os + - action: replace + regex: (.+)(?::\d+) + replacement: $1:9637 + sourceLabels: + - __address__ + targetLabel: __address__ + - action: replace + replacement: crio + sourceLabels: + - endpoint + targetLabel: endpoint + - action: replace + replacement: crio + targetLabel: job + scheme: https + tlsConfig: + caFile: /etc/prometheus/configmaps/kubelet-serving-ca-bundle/ca-bundle.crt + insecureSkipVerify: false + jobLabel: k8s-app + namespaceSelector: + matchNames: + - kube-system + scrapeClass: tls-client-certificate-auth + selector: + matchLabels: + k8s-app: kubelet diff --git a/assets/kube-state-metrics/telemetry-service-monitor.yaml b/assets/kube-state-metrics/telemetry-service-monitor.yaml new file mode 100644 index 0000000000..d723519bd2 --- /dev/null +++ b/assets/kube-state-metrics/telemetry-service-monitor.yaml @@ -0,0 +1,53 @@ +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + labels: + app.kubernetes.io/component: exporter + app.kubernetes.io/managed-by: cluster-monitoring-operator + app.kubernetes.io/name: kube-state-metrics + app.kubernetes.io/part-of: openshift-monitoring + app.kubernetes.io/version: 2.17.0 + monitoring.openshift.io/collection-profile: telemetry + name: kube-state-metrics-telemetry + namespace: openshift-monitoring +spec: + endpoints: + - bearerTokenFile: "" + honorLabels: true + interval: 1m + metricRelabelings: + - action: labeldrop + regex: instance + - action: keep + regex: (kube_node_labels|kube_node_role|kube_node_spec_unschedulable|kube_node_status_capacity|kube_node_status_condition|kube_pod_info|kube_pod_restart_policy|kube_running_pod_ready|scrape_samples_post_metric_relabeling|scrape_series_added|up) + sourceLabels: + - __name__ + port: https-main + relabelings: + - action: labeldrop + regex: pod + scheme: https + scrapeTimeout: 1m + tlsConfig: + insecureSkipVerify: false + 
serverName: kube-state-metrics.openshift-monitoring.svc + - bearerTokenFile: "" + interval: 1m + metricRelabelings: + - action: keep + regex: (kube_node_labels|kube_node_role|kube_node_spec_unschedulable|kube_node_status_capacity|kube_node_status_condition|kube_pod_info|kube_pod_restart_policy|kube_running_pod_ready|scrape_samples_post_metric_relabeling|scrape_series_added|up) + sourceLabels: + - __name__ + port: https-self + scheme: https + scrapeTimeout: 1m + tlsConfig: + insecureSkipVerify: false + serverName: kube-state-metrics.openshift-monitoring.svc + jobLabel: app.kubernetes.io/name + scrapeClass: tls-client-certificate-auth + selector: + matchLabels: + app.kubernetes.io/component: exporter + app.kubernetes.io/name: kube-state-metrics + app.kubernetes.io/part-of: openshift-monitoring diff --git a/assets/metrics-server/telemetry-service-monitor.yaml b/assets/metrics-server/telemetry-service-monitor.yaml new file mode 100644 index 0000000000..bc71cc98c6 --- /dev/null +++ b/assets/metrics-server/telemetry-service-monitor.yaml @@ -0,0 +1,30 @@ +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + labels: + app.kubernetes.io/component: metrics-server + app.kubernetes.io/managed-by: cluster-monitoring-operator + app.kubernetes.io/name: metrics-server + app.kubernetes.io/part-of: openshift-monitoring + monitoring.openshift.io/collection-profile: telemetry + name: metrics-server-telemetry + namespace: openshift-monitoring +spec: + endpoints: + - bearerTokenFile: "" + metricRelabelings: + - action: keep + regex: (scrape_samples_post_metric_relabeling|scrape_series_added|up) + sourceLabels: + - __name__ + port: https + scheme: https + tlsConfig: + insecureSkipVerify: false + serverName: metrics-server.openshift-monitoring.svc + scrapeClass: tls-client-certificate-auth + selector: + matchLabels: + app.kubernetes.io/component: metrics-server + app.kubernetes.io/name: metrics-server + app.kubernetes.io/part-of: 
openshift-monitoring diff --git a/assets/node-exporter/telemetry-service-monitor.yaml b/assets/node-exporter/telemetry-service-monitor.yaml new file mode 100644 index 0000000000..4ccdeb4b3f --- /dev/null +++ b/assets/node-exporter/telemetry-service-monitor.yaml @@ -0,0 +1,48 @@ +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + labels: + app.kubernetes.io/component: exporter + app.kubernetes.io/managed-by: cluster-monitoring-operator + app.kubernetes.io/name: node-exporter + app.kubernetes.io/part-of: openshift-monitoring + app.kubernetes.io/version: 1.9.1 + monitoring.openshift.io/collection-profile: telemetry + name: node-exporter-telemetry + namespace: openshift-monitoring +spec: + endpoints: + - bearerTokenFile: "" + interval: 15s + metricRelabelings: + - action: replace + regex: (node_mountstats_nfs_read_bytes_total|node_mountstats_nfs_write_bytes_total|node_mountstats_nfs_operations_requests_total) + replacement: "true" + sourceLabels: + - __name__ + targetLabel: __tmp_keep + - action: labeldrop + regex: __tmp_keep + - action: keep + regex: (node_cpu_info|virt_platform|node_memory_MemTotal_bytes|node_memory_MemAvailable_bytes|node_cpu_seconds_total|up|scrape_series_added|scrape_samples_post_metric_relabeling|node_accelerator_card_info) + sourceLabels: + - __name__ + port: https + relabelings: + - action: replace + regex: (.*) + replacement: $1 + sourceLabels: + - __meta_kubernetes_pod_node_name + targetLabel: instance + scheme: https + tlsConfig: + insecureSkipVerify: false + serverName: node-exporter.openshift-monitoring.svc + jobLabel: app.kubernetes.io/name + scrapeClass: tls-client-certificate-auth + selector: + matchLabels: + app.kubernetes.io/component: exporter + app.kubernetes.io/name: node-exporter + app.kubernetes.io/part-of: openshift-monitoring diff --git a/assets/openshift-state-metrics/telemetry-service-monitor.yaml b/assets/openshift-state-metrics/telemetry-service-monitor.yaml new file mode 100644 index 0000000000..d68d51b6e8 
--- /dev/null +++ b/assets/openshift-state-metrics/telemetry-service-monitor.yaml @@ -0,0 +1,46 @@ +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + labels: + app.kubernetes.io/managed-by: cluster-monitoring-operator + app.kubernetes.io/part-of: openshift-monitoring + k8s-app: openshift-state-metrics + monitoring.openshift.io/collection-profile: telemetry + name: openshift-state-metrics-telemetry + namespace: openshift-monitoring +spec: + endpoints: + - bearerTokenFile: "" + honorLabels: true + interval: 2m + metricRelabelings: + - action: keep + regex: (scrape_samples_post_metric_relabeling|scrape_series_added|up) + sourceLabels: + - __name__ + port: https-main + scheme: https + scrapeTimeout: 2m + tlsConfig: + caFile: /etc/prometheus/configmaps/serving-certs-ca-bundle/service-ca.crt + insecureSkipVerify: false + serverName: openshift-state-metrics.openshift-monitoring.svc + - bearerTokenFile: "" + interval: 2m + metricRelabelings: + - action: keep + regex: (scrape_samples_post_metric_relabeling|scrape_series_added|up) + sourceLabels: + - __name__ + port: https-self + scheme: https + scrapeTimeout: 2m + tlsConfig: + caFile: /etc/prometheus/configmaps/serving-certs-ca-bundle/service-ca.crt + insecureSkipVerify: false + serverName: openshift-state-metrics.openshift-monitoring.svc + jobLabel: k8s-app + scrapeClass: tls-client-certificate-auth + selector: + matchLabels: + k8s-app: openshift-state-metrics diff --git a/assets/prometheus-k8s/telemetry-service-monitor-thanos-sidecar.yaml b/assets/prometheus-k8s/telemetry-service-monitor-thanos-sidecar.yaml new file mode 100644 index 0000000000..a55751d15b --- /dev/null +++ b/assets/prometheus-k8s/telemetry-service-monitor-thanos-sidecar.yaml @@ -0,0 +1,34 @@ +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + labels: + app.kubernetes.io/component: thanos-sidecar + app.kubernetes.io/instance: k8s + app.kubernetes.io/managed-by: cluster-monitoring-operator + 
app.kubernetes.io/name: prometheus + app.kubernetes.io/part-of: openshift-monitoring + app.kubernetes.io/version: 3.5.0 + monitoring.openshift.io/collection-profile: telemetry + name: thanos-sidecar-telemetry + namespace: openshift-monitoring +spec: + endpoints: + - bearerTokenFile: "" + interval: 30s + metricRelabelings: + - action: keep + regex: (ALERTS|prometheus_tsdb_head_series|up|scrape_samples_post_metric_relabeling|scrape_series_added) + sourceLabels: + - __name__ + port: thanos-proxy + scheme: https + tlsConfig: + caFile: /etc/prometheus/configmaps/serving-certs-ca-bundle/service-ca.crt + certFile: /etc/prometheus/secrets/metrics-client-certs/tls.crt + insecureSkipVerify: false + keyFile: /etc/prometheus/secrets/metrics-client-certs/tls.key + serverName: thanos-sidecar.openshift-monitoring.svc + scrapeClass: tls-client-certificate-auth + selector: + matchLabels: + app.kubernetes.io/component: thanos-sidecar diff --git a/assets/prometheus-k8s/telemetry-service-monitor.yaml b/assets/prometheus-k8s/telemetry-service-monitor.yaml new file mode 100644 index 0000000000..80baf201af --- /dev/null +++ b/assets/prometheus-k8s/telemetry-service-monitor.yaml @@ -0,0 +1,34 @@ +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + labels: + app.kubernetes.io/component: prometheus + app.kubernetes.io/instance: k8s + app.kubernetes.io/managed-by: cluster-monitoring-operator + app.kubernetes.io/name: prometheus + app.kubernetes.io/part-of: openshift-monitoring + app.kubernetes.io/version: 3.5.0 + monitoring.openshift.io/collection-profile: telemetry + name: prometheus-k8s-telemetry + namespace: openshift-monitoring +spec: + endpoints: + - bearerTokenFile: "" + interval: 30s + metricRelabelings: + - action: keep + regex: (prometheus_tsdb_head_series|up|scrape_samples_post_metric_relabeling|scrape_series_added) + sourceLabels: + - __name__ + port: metrics + scheme: https + tlsConfig: + insecureSkipVerify: false + serverName: 
prometheus-k8s.openshift-monitoring.svc + scrapeClass: tls-client-certificate-auth + selector: + matchLabels: + app.kubernetes.io/component: prometheus + app.kubernetes.io/instance: k8s + app.kubernetes.io/name: prometheus + app.kubernetes.io/part-of: openshift-monitoring diff --git a/assets/prometheus-operator/telemetry-service-monitor.yaml b/assets/prometheus-operator/telemetry-service-monitor.yaml new file mode 100644 index 0000000000..0249006b8f --- /dev/null +++ b/assets/prometheus-operator/telemetry-service-monitor.yaml @@ -0,0 +1,33 @@ +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + labels: + app.kubernetes.io/component: controller + app.kubernetes.io/managed-by: cluster-monitoring-operator + app.kubernetes.io/name: prometheus-operator + app.kubernetes.io/part-of: openshift-monitoring + app.kubernetes.io/version: 0.85.0 + monitoring.openshift.io/collection-profile: telemetry + name: prometheus-operator-telemetry + namespace: openshift-monitoring +spec: + endpoints: + - bearerTokenFile: "" + honorLabels: true + metricRelabelings: + - action: keep + regex: (scrape_samples_post_metric_relabeling|scrape_series_added|up) + sourceLabels: + - __name__ + port: https + scheme: https + tlsConfig: + insecureSkipVerify: false + serverName: prometheus-operator.openshift-monitoring.svc + scrapeClass: tls-client-certificate-auth + selector: + matchLabels: + app.kubernetes.io/component: controller + app.kubernetes.io/name: prometheus-operator + app.kubernetes.io/part-of: openshift-monitoring + app.kubernetes.io/version: 0.85.0 diff --git a/assets/telemeter-client/telemetry-service-monitor.yaml b/assets/telemeter-client/telemetry-service-monitor.yaml new file mode 100644 index 0000000000..d3ee4d443f --- /dev/null +++ b/assets/telemeter-client/telemetry-service-monitor.yaml @@ -0,0 +1,29 @@ +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + labels: + app.kubernetes.io/managed-by: cluster-monitoring-operator + 
app.kubernetes.io/part-of: openshift-monitoring + k8s-app: telemeter-client + monitoring.openshift.io/collection-profile: telemetry + name: telemeter-client-telemetry + namespace: openshift-monitoring +spec: + endpoints: + - bearerTokenFile: "" + interval: 30s + metricRelabelings: + - action: keep + regex: (federate_filtered_samples|federate_samples|up) + sourceLabels: + - __name__ + port: https + scheme: https + tlsConfig: + insecureSkipVerify: false + serverName: telemeter-client.openshift-monitoring.svc + jobLabel: k8s-app + scrapeClass: tls-client-certificate-auth + selector: + matchLabels: + k8s-app: telemeter-client diff --git a/assets/thanos-querier/telemetry-service-monitor.yaml b/assets/thanos-querier/telemetry-service-monitor.yaml new file mode 100644 index 0000000000..6637c3b314 --- /dev/null +++ b/assets/thanos-querier/telemetry-service-monitor.yaml @@ -0,0 +1,34 @@ +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + labels: + app.kubernetes.io/component: query-layer + app.kubernetes.io/instance: thanos-querier + app.kubernetes.io/managed-by: cluster-monitoring-operator + app.kubernetes.io/name: thanos-query + app.kubernetes.io/part-of: openshift-monitoring + app.kubernetes.io/version: 0.39.2 + monitoring.openshift.io/collection-profile: telemetry + name: thanos-querier-telemetry + namespace: openshift-monitoring +spec: + endpoints: + - bearerTokenFile: "" + interval: 30s + metricRelabelings: + - action: keep + regex: (scrape_samples_post_metric_relabeling|scrape_series_added|up) + sourceLabels: + - __name__ + port: metrics + scheme: https + tlsConfig: + insecureSkipVerify: false + serverName: thanos-querier.openshift-monitoring.svc + scrapeClass: tls-client-certificate-auth + selector: + matchLabels: + app.kubernetes.io/component: query-layer + app.kubernetes.io/instance: thanos-querier + app.kubernetes.io/name: thanos-query + app.kubernetes.io/part-of: openshift-monitoring diff --git a/jsonnet/components/alertmanager.libsonnet 
b/jsonnet/components/alertmanager.libsonnet index 896623b9fa..bc12b68ec7 100644 --- a/jsonnet/components/alertmanager.libsonnet +++ b/jsonnet/components/alertmanager.libsonnet @@ -7,6 +7,7 @@ local withDescription = (import '../utils/add-annotations.libsonnet').withDescri local testFilePlaceholder = (import '../utils/add-annotations.libsonnet').testFilePlaceholder; local requiredRoles = (import '../utils/add-annotations.libsonnet').requiredRoles; local requiredClusterRoles = (import '../utils/add-annotations.libsonnet').requiredClusterRoles; +local generateServiceMonitor = import '../utils/generate-service-monitors.libsonnet'; function(params) local cfg = params { @@ -226,6 +227,15 @@ function(params) }, }, + telemetryServiceMonitor: generateServiceMonitor.telemetry( + self.serviceMonitor, std.join('|', [ + 'alertmanager_integrations', + 'scrape_samples_post_metric_relabeling', + 'scrape_series_added', + 'up', + ]) + ), + alertmanager+: { metadata+: { annotations+: { diff --git a/jsonnet/components/cluster-monitoring-operator.libsonnet b/jsonnet/components/cluster-monitoring-operator.libsonnet index c9d2b9b8f5..d1471f17a5 100644 --- a/jsonnet/components/cluster-monitoring-operator.libsonnet +++ b/jsonnet/components/cluster-monitoring-operator.libsonnet @@ -1,5 +1,6 @@ local metrics = import 'github.com/openshift/telemeter/jsonnet/telemeter/metrics.jsonnet'; +local generateServiceMonitor = import '../utils/generate-service-monitors.libsonnet'; local cmoRules = import './../rules.libsonnet'; local kubePrometheus = import 'github.com/prometheus-operator/kube-prometheus/jsonnet/kube-prometheus/components/mixin/custom.libsonnet'; @@ -159,6 +160,15 @@ function(params) { }, }, + telemetryServiceMonitor: generateServiceMonitor.telemetry( + self.serviceMonitor, std.join('|', [ + 'cluster_monitoring_operator_collection_profile', + 'scrape_samples_post_metric_relabeling', + 'scrape_series_added', + 'up', + ]) + ), + // This is the base for the cluster-monitoring-operator 
ClusterRole. It will // be extended with the rules from all other ClusterRoles in main.jsonnet. clusterRole: { diff --git a/jsonnet/components/control-plane.libsonnet b/jsonnet/components/control-plane.libsonnet index 1f0eb9a859..a6ef24939f 100644 --- a/jsonnet/components/control-plane.libsonnet +++ b/jsonnet/components/control-plane.libsonnet @@ -167,6 +167,17 @@ function(params) ]) ), + telemetryServiceMonitorKubelet: generateServiceMonitor.telemetry( + self.serviceMonitorKubelet, std.join('|', [ + 'apiserver_storage_objects', + 'container_cpu_usage_seconds_total', + 'container_memory_working_set_bytes', + 'kubelet_containers_per_pod_count_sum', + 'up', + ]) + ), + + // This avoids creating service monitors which are already managed by the respective operators. serviceMonitorApiserver:: {}, serviceMonitorKubeScheduler:: {}, diff --git a/jsonnet/components/kube-state-metrics.libsonnet b/jsonnet/components/kube-state-metrics.libsonnet index 0e825e1fee..861486c5c0 100644 --- a/jsonnet/components/kube-state-metrics.libsonnet +++ b/jsonnet/components/kube-state-metrics.libsonnet @@ -170,6 +170,22 @@ function(params) ]) ), + telemetryServiceMonitor: generateServiceMonitor.telemetry( + self.serviceMonitor, std.join('|', [ + 'kube_node_labels', + 'kube_node_role', + 'kube_node_spec_unschedulable', + 'kube_node_status_capacity', + 'kube_node_status_condition', + 'kube_pod_info', + 'kube_pod_restart_policy', + 'kube_running_pod_ready', + 'scrape_samples_post_metric_relabeling', + 'scrape_series_added', + 'up', + ]) + ), + kubeRbacProxySecret: generateSecret.staticAuthSecret(cfg.namespace, cfg.commonLabels, 'kube-state-metrics-kube-rbac-proxy-config'), // This removes the upstream addon-resizer and all resource requests and diff --git a/jsonnet/components/metrics-server.libsonnet b/jsonnet/components/metrics-server.libsonnet index fb57677fa8..de535fd636 100644 --- a/jsonnet/components/metrics-server.libsonnet +++ b/jsonnet/components/metrics-server.libsonnet @@ -1,4 +1,5 @@ 
local withDescription = (import '../utils/add-annotations.libsonnet').withDescription; +local generateServiceMonitor = import '../utils/generate-service-monitors.libsonnet'; function(params) { local cfg = params, @@ -373,4 +374,11 @@ function(params) { }, }, }, + telemetryServiceMonitor: generateServiceMonitor.telemetry( + self.serviceMonitor, std.join('|', [ + 'scrape_samples_post_metric_relabeling', + 'scrape_series_added', + 'up', + ]) + ), } diff --git a/jsonnet/components/node-exporter.libsonnet b/jsonnet/components/node-exporter.libsonnet index a9cfbe0853..61868537c4 --- a/jsonnet/components/node-exporter.libsonnet +++ b/jsonnet/components/node-exporter.libsonnet @@ -223,6 +223,20 @@ ]) ), + telemetryServiceMonitor: generateServiceMonitor.telemetry( + self.serviceMonitor, std.join('|', [ + 'node_cpu_info', + 'virt_platform', + 'node_memory_MemTotal_bytes', + 'node_memory_MemAvailable_bytes', + 'node_cpu_seconds_total', + 'up', + 'scrape_series_added', + 'scrape_samples_post_metric_relabeling', + 'node_accelerator_card_info', + ]) + ), + securityContextConstraints: { allowHostDirVolumePlugin: true, allowHostNetwork: true, diff --git a/jsonnet/components/openshift-state-metrics.libsonnet b/jsonnet/components/openshift-state-metrics.libsonnet index 21d00f35a1..cdea8289ba --- a/jsonnet/components/openshift-state-metrics.libsonnet +++ b/jsonnet/components/openshift-state-metrics.libsonnet @@ -1,5 +1,6 @@ local generateSecret = import '../utils/generate-secret.libsonnet'; local withDescription = (import '../utils/add-annotations.libsonnet').withDescription; +local generateServiceMonitor = import '../utils/generate-service-monitors.libsonnet'; function(params) { local cfg = params, @@ -96,5 +97,11 @@ function(params) { }, }, serviceMonitor: osm.openshiftStateMetrics.serviceMonitor, - + telemetryServiceMonitor: generateServiceMonitor.telemetry( + self.serviceMonitor, std.join('|', [ + 
'scrape_samples_post_metric_relabeling', + 'scrape_series_added', + 'up', + ]) + ), } diff --git a/jsonnet/components/prometheus-operator.libsonnet b/jsonnet/components/prometheus-operator.libsonnet index f6a5d2ae87..1d964df147 100644 --- a/jsonnet/components/prometheus-operator.libsonnet +++ b/jsonnet/components/prometheus-operator.libsonnet @@ -6,6 +6,7 @@ local conversionWebhook = import 'github.com/prometheus-operator/prometheus-oper local generateSecret = import '../utils/generate-secret.libsonnet'; local rbac = import '../utils/rbac.libsonnet'; local withDescription = (import '../utils/add-annotations.libsonnet').withDescription; +local generateServiceMonitor = import '../utils/generate-service-monitors.libsonnet'; function(params) local po = operator(params); @@ -178,4 +179,12 @@ function(params) ], }, }, + + telemetryServiceMonitor: generateServiceMonitor.telemetry( + self.serviceMonitor, std.join('|', [ + 'scrape_samples_post_metric_relabeling', + 'scrape_series_added', + 'up', + ]) + ), } diff --git a/jsonnet/components/prometheus.libsonnet b/jsonnet/components/prometheus.libsonnet index 4a81ff4440..c82faa4f65 100644 --- a/jsonnet/components/prometheus.libsonnet +++ b/jsonnet/components/prometheus.libsonnet @@ -5,6 +5,7 @@ local generateSecret = import '../utils/generate-secret.libsonnet'; local prometheus = import 'github.com/prometheus-operator/kube-prometheus/jsonnet/kube-prometheus/components/prometheus.libsonnet'; local withDescription = (import '../utils/add-annotations.libsonnet').withDescription; local requiredClusterRoles = (import '../utils/add-annotations.libsonnet').requiredClusterRoles; +local generateServiceMonitor = import '../utils/generate-service-monitors.libsonnet'; function(params) local cfg = params; @@ -282,6 +283,15 @@ function(params) }, }, + telemetryServiceMonitor: generateServiceMonitor.telemetry( + self.serviceMonitor, std.join('|', [ + 'prometheus_tsdb_head_series', + 'up', + 'scrape_samples_post_metric_relabeling', + 
'scrape_series_added', + ]) + ), + serviceThanosSidecar+: { metadata+: { annotations+: { @@ -315,6 +325,16 @@ function(params) }, }, + telemetryServiceMonitorThanosSidecar: generateServiceMonitor.telemetry( + self.serviceMonitorThanosSidecar, std.join('|', [ + 'ALERTS', + 'prometheus_tsdb_head_series', + 'up', + 'scrape_samples_post_metric_relabeling', + 'scrape_series_added', + ]) + ), + // These patches inject the kube-rbac-proxy as a sidecar and configures it with // TLS. Additionally as the Alertmanager is protected with TLS, authN and // authZ it requires some additonal configuration. diff --git a/jsonnet/components/telemeter-client.libsonnet b/jsonnet/components/telemeter-client.libsonnet index 987126bfa5..2c4e8a8830 100644 --- a/jsonnet/components/telemeter-client.libsonnet +++ b/jsonnet/components/telemeter-client.libsonnet @@ -1,6 +1,7 @@ local generateCertInjection = import '../utils/generate-certificate-injection.libsonnet'; local generateSecret = import '../utils/generate-secret.libsonnet'; local withDescription = (import '../utils/add-annotations.libsonnet').withDescription; +local generateServiceMonitor = import '../utils/generate-service-monitors.libsonnet'; function(params) { local cfg = params, @@ -43,6 +44,13 @@ function(params) { ], }, }, + telemetryServiceMonitor: generateServiceMonitor.telemetry( + self.serviceMonitor, std.join('|', [ + 'federate_filtered_samples', + 'federate_samples', + 'up', + ]) + ), secret: tc.telemeterClient.secret, servingCertsCABundle: tc.telemeterClient.servingCertsCABundle, kubeRbacProxySecret: generateSecret.staticAuthSecret(cfg.namespace, cfg.commonLabels, 'telemeter-client-kube-rbac-proxy-config'), diff --git a/jsonnet/components/thanos-querier.libsonnet b/jsonnet/components/thanos-querier.libsonnet index 0f86af86f3..d3c237cc11 100644 --- a/jsonnet/components/thanos-querier.libsonnet +++ b/jsonnet/components/thanos-querier.libsonnet @@ -3,6 +3,7 @@ local querier = import 
'github.com/thanos-io/kube-thanos/jsonnet/kube-thanos/kub local withDescription = (import '../utils/add-annotations.libsonnet').withDescription; local requiredRoles = (import '../utils/add-annotations.libsonnet').requiredRoles; local requiredClusterRoles = (import '../utils/add-annotations.libsonnet').requiredClusterRoles; +local generateServiceMonitor = import '../utils/generate-service-monitors.libsonnet'; function(params) local cfg = params; @@ -254,6 +255,15 @@ function(params) }, }, + + telemetryServiceMonitor: generateServiceMonitor.telemetry( + self.serviceMonitor, std.join('|', [ + 'scrape_samples_post_metric_relabeling', + 'scrape_series_added', + 'up', + ]) + ), + deployment+: { metadata+: { labels+: { diff --git a/jsonnet/utils/generate-service-monitors.libsonnet b/jsonnet/utils/generate-service-monitors.libsonnet index 0e4ee92558..ed6407276c 100644 --- a/jsonnet/utils/generate-service-monitors.libsonnet +++ b/jsonnet/utils/generate-service-monitors.libsonnet @@ -2,13 +2,16 @@ local minimalLabel = { 'monitoring.openshift.io/collection-profile': 'minimal', }, - // 1. Add the prefix minimal to the ServiceMonitor name - // 2. Add the minimal label "monitoring.openshift.io/collection-profile: minimal" + local telemetryLabel = { + 'monitoring.openshift.io/collection-profile': 'telemetry', + }, + // 1. Add the profile prefix to the ServiceMonitor name + // 2. Add the profile label "monitoring.openshift.io/collection-profile: " // 3. 
Add a metricRelabelings with action keep and regex equal to metrics - local minimal(sm, metrics) = sm { + local run(sm, metrics, label) = sm { metadata+: { - name+: '-minimal', - labels+: minimalLabel, + name+: '-' + label['monitoring.openshift.io/collection-profile'], + labels+: label, }, spec+: { endpoints: std.map( @@ -39,5 +42,6 @@ }, }, - minimal(sm, metrics): minimal(removeDrop(sm), metrics), + minimal(sm, metrics): run(removeDrop(sm), metrics, minimalLabel), + telemetry(sm, metrics): run(removeDrop(sm), metrics, telemetryLabel), } diff --git a/pkg/manifests/types.go b/pkg/manifests/types.go index 287d1bc61e..cfbb9e07d9 100644 --- a/pkg/manifests/types.go +++ b/pkg/manifests/types.go @@ -24,11 +24,16 @@ type CollectionProfiles []CollectionProfile type ExternalLabels map[string]string const ( - FullCollectionProfile = "full" - MinimalCollectionProfile = "minimal" + FullCollectionProfile = "full" + MinimalCollectionProfile = "minimal" + TelemetryCollectionProfile = "telemetry" ) -var SupportedCollectionProfiles = CollectionProfiles{FullCollectionProfile, MinimalCollectionProfile} +var SupportedCollectionProfiles = CollectionProfiles{ + FullCollectionProfile, + MinimalCollectionProfile, + TelemetryCollectionProfile, +} // The `ClusterMonitoringConfiguration` resource defines settings that // customize the default platform monitoring stack through the