diff --git a/Documentation/api.md b/Documentation/api.md index 7af2f31c0d..b97abb921f 100644 --- a/Documentation/api.md +++ b/Documentation/api.md @@ -136,7 +136,7 @@ The `ClusterMonitoringConfiguration` resource defines settings that customize th | prometheusOperator | *[PrometheusOperatorConfig](#prometheusoperatorconfig) | `PrometheusOperatorConfig` defines settings for the Prometheus Operator component. | | prometheusOperatorAdmissionWebhook | *[PrometheusOperatorAdmissionWebhookConfig](#prometheusoperatoradmissionwebhookconfig) | `PrometheusOperatorAdmissionWebhookConfig` defines settings for the Prometheus Operator's admission webhook component. | | openshiftStateMetrics | *[OpenShiftStateMetricsConfig](#openshiftstatemetricsconfig) | `OpenShiftMetricsConfig` defines settings for the `openshift-state-metrics` agent. | -| telemeterClient | *[TelemeterClientConfig](#telemeterclientconfig) | `TelemeterClientConfig` defines settings for the Telemeter Client component. | +| telemetryConfig | *[TelemetryConfig](#telemetryconfig) | `TelemetryConfig` defines settings for telemetry reporting. | | thanosQuerier | *[ThanosQuerierConfig](#thanosquerierconfig) | `ThanosQuerierConfig` defines settings for the Thanos Querier component. | | nodeExporter | [NodeExporterConfig](#nodeexporterconfig) | `NodeExporterConfig` defines settings for the `node-exporter` agent. | | monitoringPlugin | *[MonitoringPluginConfig](#monitoringpluginconfig) | `MonitoringPluginConfig` defines settings for the monitoring `console-plugin`. | @@ -568,9 +568,6 @@ The `TLSConfig` resource configures the settings for TLS connections. #### Required - ` nodeSelector ` - ` tolerations ` - -appears in: [ClusterMonitoringConfiguration](#clustermonitoringconfiguration) - | Property | Type | Description | | -------- | ---- | ----------- | | nodeSelector | map[string]string | Defines the nodes on which the pods are scheduled. 
| diff --git a/Documentation/openshiftdocs/modules/clustermonitoringconfiguration.adoc b/Documentation/openshiftdocs/modules/clustermonitoringconfiguration.adoc index 8a98389412..70cb791327 100644 --- a/Documentation/openshiftdocs/modules/clustermonitoringconfiguration.adoc +++ b/Documentation/openshiftdocs/modules/clustermonitoringconfiguration.adoc @@ -33,7 +33,7 @@ The `ClusterMonitoringConfiguration` resource defines settings that customize th |openshiftStateMetrics|*link:openshiftstatemetricsconfig.adoc[OpenShiftStateMetricsConfig]|`OpenShiftMetricsConfig` defines settings for the `openshift-state-metrics` agent. -|telemeterClient|*link:telemeterclientconfig.adoc[TelemeterClientConfig]|`TelemeterClientConfig` defines settings for the Telemeter Client component. +|telemetryConfig|*link:telemetryconfig.adoc[TelemetryConfig]|`TelemetryConfig` defines settings for telemetry reporting. |thanosQuerier|*link:thanosquerierconfig.adoc[ThanosQuerierConfig]|`ThanosQuerierConfig` defines settings for the Thanos Querier component. 
diff --git a/Documentation/openshiftdocs/modules/telemeterclientconfig.adoc b/Documentation/openshiftdocs/modules/telemeterclientconfig.adoc index 00a903454a..22dfd9d37d 100644 --- a/Documentation/openshiftdocs/modules/telemeterclientconfig.adoc +++ b/Documentation/openshiftdocs/modules/telemeterclientconfig.adoc @@ -15,9 +15,6 @@ * `nodeSelector` * `tolerations` - -Appears in: link:clustermonitoringconfiguration.adoc[ClusterMonitoringConfiguration] - [options="header"] |=== | Property | Type | Description diff --git a/assets/telemetry-recording-rules/prometheus-rule.yaml b/assets/telemetry-recording-rules/prometheus-rule.yaml new file mode 100644 index 0000000000..89e63ecd70 --- /dev/null +++ b/assets/telemetry-recording-rules/prometheus-rule.yaml @@ -0,0 +1,406 @@ +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + labels: + app.kubernetes.io/managed-by: cluster-monitoring-operator + app.kubernetes.io/part-of: openshift-monitoring + role: telemetry-rules + name: telemetry-recording-rules + namespace: openshift-monitoring +spec: + groups: + - interval: 4m30s + name: telemetry-recording.rules + rules: + - expr: label_replace(sum without(pod, container) ({__name__=~"cluster:usage:.*"}),"original_name_label","$1","__name__", "(.+)") + record: telemetry:metric + - expr: label_replace(sum without(pod, container) ({__name__="count:up0"}),"original_name_label","$1","__name__", "(.+)") + record: telemetry:metric + - expr: label_replace(sum without(pod, container) ({__name__="count:up1"}),"original_name_label","$1","__name__", "(.+)") + record: telemetry:metric + - expr: label_replace(sum without(pod, container) ({__name__="cluster_version"}),"original_name_label","$1","__name__", "(.+)") + record: telemetry:metric + - expr: label_replace(sum without(pod, container) ({__name__="cluster_version_available_updates"}),"original_name_label","$1","__name__", "(.+)") + record: telemetry:metric + - expr: label_replace(sum without(pod, container) 
({__name__="cluster_version_capability"}),"original_name_label","$1","__name__", "(.+)") + record: telemetry:metric + - expr: label_replace(sum without(pod, container) ({__name__="cluster_operator_up"}),"original_name_label","$1","__name__", "(.+)") + record: telemetry:metric + - expr: label_replace(sum without(pod, container) ({__name__="cluster_operator_conditions"}),"original_name_label","$1","__name__", "(.+)") + record: telemetry:metric + - expr: label_replace(sum without(pod, container) ({__name__="cluster_version_payload"}),"original_name_label","$1","__name__", "(.+)") + record: telemetry:metric + - expr: label_replace(sum without(pod, container) ({__name__="cluster_installer"}),"original_name_label","$1","__name__", "(.+)") + record: telemetry:metric + - expr: label_replace(sum without(pod, container) ({__name__="cluster_infrastructure_provider"}),"original_name_label","$1","__name__", "(.+)") + record: telemetry:metric + - expr: label_replace(sum without(pod, container) ({__name__="cluster_feature_set"}),"original_name_label","$1","__name__", "(.+)") + record: telemetry:metric + - expr: label_replace(sum without(pod, container) ({__name__="instance:etcd_object_counts:sum"}),"original_name_label","$1","__name__", "(.+)") + record: telemetry:metric + - expr: label_replace(sum without(pod, container) ({__name__="ALERTS",alertstate="firing",severity=~"critical|warning|info|none"}),"original_name_label","$1","__name__", "(.+)") + record: telemetry:metric + - expr: label_replace(sum without(pod, container) ({__name__="code:apiserver_request_total:rate:sum"}),"original_name_label","$1","__name__", "(.+)") + record: telemetry:metric + - expr: label_replace(sum without(pod, container) ({__name__="cluster:capacity_cpu_cores:sum"}),"original_name_label","$1","__name__", "(.+)") + record: telemetry:metric + - expr: label_replace(sum without(pod, container) ({__name__="cluster:capacity_memory_bytes:sum"}),"original_name_label","$1","__name__", "(.+)") + record: 
telemetry:metric + - expr: label_replace(sum without(pod, container) ({__name__="cluster:cpu_usage_cores:sum"}),"original_name_label","$1","__name__", "(.+)") + record: telemetry:metric + - expr: label_replace(sum without(pod, container) ({__name__="cluster:memory_usage_bytes:sum"}),"original_name_label","$1","__name__", "(.+)") + record: telemetry:metric + - expr: label_replace(sum without(pod, container) ({__name__="openshift:cpu_usage_cores:sum"}),"original_name_label","$1","__name__", "(.+)") + record: telemetry:metric + - expr: label_replace(sum without(pod, container) ({__name__="openshift:memory_usage_bytes:sum"}),"original_name_label","$1","__name__", "(.+)") + record: telemetry:metric + - expr: label_replace(sum without(pod, container) ({__name__="workload:cpu_usage_cores:sum"}),"original_name_label","$1","__name__", "(.+)") + record: telemetry:metric + - expr: label_replace(sum without(pod, container) ({__name__="workload:memory_usage_bytes:sum"}),"original_name_label","$1","__name__", "(.+)") + record: telemetry:metric + - expr: label_replace(sum without(pod, container) ({__name__="cluster:virt_platform_nodes:sum"}),"original_name_label","$1","__name__", "(.+)") + record: telemetry:metric + - expr: label_replace(sum without(pod, container) ({__name__="cluster:node_instance_type_count:sum"}),"original_name_label","$1","__name__", "(.+)") + record: telemetry:metric + - expr: label_replace(sum without(pod, container) ({__name__="cnv:vmi_status_running:count"}),"original_name_label","$1","__name__", "(.+)") + record: telemetry:metric + - expr: label_replace(sum without(pod, container) ({__name__="cnv_abnormal", reason=~"memory_working_set_delta_from_request|memory_rss_delta_from_request"}),"original_name_label","$1","__name__", "(.+)") + record: telemetry:metric + - expr: label_replace(sum without(pod, container) ({__name__="cluster:vmi_request_cpu_cores:sum"}),"original_name_label","$1","__name__", "(.+)") + record: telemetry:metric + - expr: 
label_replace(sum without(pod, container) ({__name__="node_role_os_version_machine:cpu_capacity_cores:sum"}),"original_name_label","$1","__name__", "(.+)") + record: telemetry:metric + - expr: label_replace(sum without(pod, container) ({__name__="node_role_os_version_machine:cpu_capacity_sockets:sum"}),"original_name_label","$1","__name__", "(.+)") + record: telemetry:metric + - expr: label_replace(sum without(pod, container) ({__name__="subscription_sync_total"}),"original_name_label","$1","__name__", "(.+)") + record: telemetry:metric + - expr: label_replace(sum without(pod, container) ({__name__="olm_resolution_duration_seconds"}),"original_name_label","$1","__name__", "(.+)") + record: telemetry:metric + - expr: label_replace(sum without(pod, container) ({__name__="csv_succeeded"}),"original_name_label","$1","__name__", "(.+)") + record: telemetry:metric + - expr: label_replace(sum without(pod, container) ({__name__="csv_abnormal"}),"original_name_label","$1","__name__", "(.+)") + record: telemetry:metric + - expr: label_replace(sum without(pod, container) ({__name__="cluster:kube_persistentvolumeclaim_resource_requests_storage_bytes:provisioner:sum"}),"original_name_label","$1","__name__", "(.+)") + record: telemetry:metric + - expr: label_replace(sum without(pod, container) ({__name__="cluster:kubelet_volume_stats_used_bytes:provisioner:sum"}),"original_name_label","$1","__name__", "(.+)") + record: telemetry:metric + - expr: label_replace(sum without(pod, container) ({__name__="ceph_cluster_total_bytes"}),"original_name_label","$1","__name__", "(.+)") + record: telemetry:metric + - expr: label_replace(sum without(pod, container) ({__name__="ceph_cluster_total_used_raw_bytes"}),"original_name_label","$1","__name__", "(.+)") + record: telemetry:metric + - expr: label_replace(sum without(pod, container) ({__name__="ceph_health_status"}),"original_name_label","$1","__name__", "(.+)") + record: telemetry:metric + - expr: label_replace(sum without(pod, container) 
({__name__="odf_system_raw_capacity_total_bytes"}),"original_name_label","$1","__name__", "(.+)") + record: telemetry:metric + - expr: label_replace(sum without(pod, container) ({__name__="odf_system_raw_capacity_used_bytes"}),"original_name_label","$1","__name__", "(.+)") + record: telemetry:metric + - expr: label_replace(sum without(pod, container) ({__name__="odf_system_health_status"}),"original_name_label","$1","__name__", "(.+)") + record: telemetry:metric + - expr: label_replace(sum without(pod, container) ({__name__="job:ceph_osd_metadata:count"}),"original_name_label","$1","__name__", "(.+)") + record: telemetry:metric + - expr: label_replace(sum without(pod, container) ({__name__="job:kube_pv:count"}),"original_name_label","$1","__name__", "(.+)") + record: telemetry:metric + - expr: label_replace(sum without(pod, container) ({__name__="job:odf_system_pvs:count"}),"original_name_label","$1","__name__", "(.+)") + record: telemetry:metric + - expr: label_replace(sum without(pod, container) ({__name__="job:ceph_pools_iops:total"}),"original_name_label","$1","__name__", "(.+)") + record: telemetry:metric + - expr: label_replace(sum without(pod, container) ({__name__="job:ceph_pools_iops_bytes:total"}),"original_name_label","$1","__name__", "(.+)") + record: telemetry:metric + - expr: label_replace(sum without(pod, container) ({__name__="job:ceph_versions_running:count"}),"original_name_label","$1","__name__", "(.+)") + record: telemetry:metric + - expr: label_replace(sum without(pod, container) ({__name__="job:noobaa_total_unhealthy_buckets:sum"}),"original_name_label","$1","__name__", "(.+)") + record: telemetry:metric + - expr: label_replace(sum without(pod, container) ({__name__="job:noobaa_bucket_count:sum"}),"original_name_label","$1","__name__", "(.+)") + record: telemetry:metric + - expr: label_replace(sum without(pod, container) ({__name__="job:noobaa_total_object_count:sum"}),"original_name_label","$1","__name__", "(.+)") + record: telemetry:metric + 
- expr: label_replace(sum without(pod, container) ({__name__="odf_system_bucket_count", system_type="OCS", system_vendor="Red Hat"}),"original_name_label","$1","__name__", "(.+)") + record: telemetry:metric + - expr: label_replace(sum without(pod, container) ({__name__="odf_system_objects_total", system_type="OCS", system_vendor="Red Hat"}),"original_name_label","$1","__name__", "(.+)") + record: telemetry:metric + - expr: label_replace(sum without(pod, container) ({__name__="noobaa_accounts_num"}),"original_name_label","$1","__name__", "(.+)") + record: telemetry:metric + - expr: label_replace(sum without(pod, container) ({__name__="noobaa_total_usage"}),"original_name_label","$1","__name__", "(.+)") + record: telemetry:metric + - expr: label_replace(sum without(pod, container) ({__name__="console_url"}),"original_name_label","$1","__name__", "(.+)") + record: telemetry:metric + - expr: label_replace(sum without(pod, container) ({__name__="cluster:console_auth_login_requests_total:sum"}),"original_name_label","$1","__name__", "(.+)") + record: telemetry:metric + - expr: label_replace(sum without(pod, container) ({__name__="cluster:console_auth_login_successes_total:sum"}),"original_name_label","$1","__name__", "(.+)") + record: telemetry:metric + - expr: label_replace(sum without(pod, container) ({__name__="cluster:console_auth_login_failures_total:sum"}),"original_name_label","$1","__name__", "(.+)") + record: telemetry:metric + - expr: label_replace(sum without(pod, container) ({__name__="cluster:console_auth_logout_requests_total:sum"}),"original_name_label","$1","__name__", "(.+)") + record: telemetry:metric + - expr: label_replace(sum without(pod, container) ({__name__="cluster:console_usage_users:max"}),"original_name_label","$1","__name__", "(.+)") + record: telemetry:metric + - expr: label_replace(sum without(pod, container) ({__name__="cluster:console_plugins_info:max"}),"original_name_label","$1","__name__", "(.+)") + record: telemetry:metric + - expr: 
label_replace(sum without(pod, container) ({__name__="cluster:console_customization_perspectives_info:max"}),"original_name_label","$1","__name__", "(.+)") + record: telemetry:metric + - expr: label_replace(sum without(pod, container) ({__name__="cluster:ovnkube_controller_egress_routing_via_host:max"}),"original_name_label","$1","__name__", "(.+)") + record: telemetry:metric + - expr: label_replace(sum without(pod, container) ({__name__="cluster:ovnkube_controller_admin_network_policies_db_objects:max",table_name=~"ACL|Address_Set"}),"original_name_label","$1","__name__", "(.+)") + record: telemetry:metric + - expr: label_replace(sum without(pod, container) ({__name__="cluster:ovnkube_controller_baseline_admin_network_policies_db_objects:max",table_name=~"ACL|Address_Set"}),"original_name_label","$1","__name__", "(.+)") + record: telemetry:metric + - expr: label_replace(sum without(pod, container) ({__name__="cluster:ovnkube_controller_admin_network_policies_rules:max",direction=~"Ingress|Egress",action=~"Pass|Allow|Deny"}),"original_name_label","$1","__name__", "(.+)") + record: telemetry:metric + - expr: label_replace(sum without(pod, container) ({__name__="cluster:ovnkube_controller_baseline_admin_network_policies_rules:max",direction=~"Ingress|Egress",action=~"Allow|Deny"}),"original_name_label","$1","__name__", "(.+)") + record: telemetry:metric + - expr: label_replace(sum without(pod, container) ({__name__="cluster:network_attachment_definition_instances:max"}),"original_name_label","$1","__name__", "(.+)") + record: telemetry:metric + - expr: label_replace(sum without(pod, container) ({__name__="cluster:network_attachment_definition_enabled_instance_up:max"}),"original_name_label","$1","__name__", "(.+)") + record: telemetry:metric + - expr: label_replace(sum without(pod, container) ({__name__="cluster:ingress_controller_aws_nlb_active:sum"}),"original_name_label","$1","__name__", "(.+)") + record: telemetry:metric + - expr: label_replace(sum without(pod, 
container) ({__name__="cluster:route_metrics_controller_routes_per_shard:min"}),"original_name_label","$1","__name__", "(.+)") + record: telemetry:metric + - expr: label_replace(sum without(pod, container) ({__name__="cluster:route_metrics_controller_routes_per_shard:max"}),"original_name_label","$1","__name__", "(.+)") + record: telemetry:metric + - expr: label_replace(sum without(pod, container) ({__name__="cluster:route_metrics_controller_routes_per_shard:avg"}),"original_name_label","$1","__name__", "(.+)") + record: telemetry:metric + - expr: label_replace(sum without(pod, container) ({__name__="cluster:route_metrics_controller_routes_per_shard:median"}),"original_name_label","$1","__name__", "(.+)") + record: telemetry:metric + - expr: label_replace(sum without(pod, container) ({__name__="cluster:openshift_route_info:tls_termination:sum"}),"original_name_label","$1","__name__", "(.+)") + record: telemetry:metric + - expr: label_replace(sum without(pod, container) ({__name__="insightsclient_request_send_total"}),"original_name_label","$1","__name__", "(.+)") + record: telemetry:metric + - expr: label_replace(sum without(pod, container) ({__name__="cam_app_workload_migrations"}),"original_name_label","$1","__name__", "(.+)") + record: telemetry:metric + - expr: label_replace(sum without(pod, container) ({__name__="cluster:apiserver_current_inflight_requests:sum:max_over_time:2m"}),"original_name_label","$1","__name__", "(.+)") + record: telemetry:metric + - expr: label_replace(sum without(pod, container) ({__name__="cluster:alertmanager_integrations:max"}),"original_name_label","$1","__name__", "(.+)") + record: telemetry:metric + - expr: label_replace(sum without(pod, container) ({__name__="cluster:telemetry_selected_series:count"}),"original_name_label","$1","__name__", "(.+)") + record: telemetry:metric + - expr: label_replace(sum without(pod, container) ({__name__="openshift:prometheus_tsdb_head_series:sum"}),"original_name_label","$1","__name__", "(.+)") + 
record: telemetry:metric + - expr: label_replace(sum without(pod, container) ({__name__="openshift:prometheus_tsdb_head_samples_appended_total:sum"}),"original_name_label","$1","__name__", "(.+)") + record: telemetry:metric + - expr: label_replace(sum without(pod, container) ({__name__="monitoring:container_memory_working_set_bytes:sum"}),"original_name_label","$1","__name__", "(.+)") + record: telemetry:metric + - expr: label_replace(sum without(pod, container) ({__name__="namespace_job:scrape_series_added:topk3_sum1h"}),"original_name_label","$1","__name__", "(.+)") + record: telemetry:metric + - expr: label_replace(sum without(pod, container) ({__name__="namespace_job:scrape_samples_post_metric_relabeling:topk3"}),"original_name_label","$1","__name__", "(.+)") + record: telemetry:metric + - expr: label_replace(sum without(pod, container) ({__name__="monitoring:haproxy_server_http_responses_total:sum"}),"original_name_label","$1","__name__", "(.+)") + record: telemetry:metric + - expr: label_replace(sum without(pod, container) ({__name__="profile:cluster_monitoring_operator_collection_profile:max"}),"original_name_label","$1","__name__", "(.+)") + record: telemetry:metric + - expr: label_replace(sum without(pod, container) ({__name__="vendor_model:node_accelerator_cards:sum",vendor=~"NVIDIA|AMD|GAUDI|INTEL|QUALCOMM"}),"original_name_label","$1","__name__", "(.+)") + record: telemetry:metric + - expr: label_replace(sum without(pod, container) ({__name__="rhmi_status"}),"original_name_label","$1","__name__", "(.+)") + record: telemetry:metric + - expr: label_replace(sum without(pod, container) ({__name__="status:upgrading:version:rhoam_state:max"}),"original_name_label","$1","__name__", "(.+)") + record: telemetry:metric + - expr: label_replace(sum without(pod, container) ({__name__="state:rhoam_critical_alerts:max"}),"original_name_label","$1","__name__", "(.+)") + record: telemetry:metric + - expr: label_replace(sum without(pod, container) 
({__name__="state:rhoam_warning_alerts:max"}),"original_name_label","$1","__name__", "(.+)") + record: telemetry:metric + - expr: label_replace(sum without(pod, container) ({__name__="rhoam_7d_slo_percentile:max"}),"original_name_label","$1","__name__", "(.+)") + record: telemetry:metric + - expr: label_replace(sum without(pod, container) ({__name__="rhoam_7d_slo_remaining_error_budget:max"}),"original_name_label","$1","__name__", "(.+)") + record: telemetry:metric + - expr: label_replace(sum without(pod, container) ({__name__="cluster_legacy_scheduler_policy"}),"original_name_label","$1","__name__", "(.+)") + record: telemetry:metric + - expr: label_replace(sum without(pod, container) ({__name__="cluster_master_schedulable"}),"original_name_label","$1","__name__", "(.+)") + record: telemetry:metric + - expr: label_replace(sum without(pod, container) ({__name__="che_workspace_status"}),"original_name_label","$1","__name__", "(.+)") + record: telemetry:metric + - expr: label_replace(sum without(pod, container) ({__name__="che_workspace_started_total"}),"original_name_label","$1","__name__", "(.+)") + record: telemetry:metric + - expr: label_replace(sum without(pod, container) ({__name__="che_workspace_failure_total"}),"original_name_label","$1","__name__", "(.+)") + record: telemetry:metric + - expr: label_replace(sum without(pod, container) ({__name__="che_workspace_start_time_seconds_sum"}),"original_name_label","$1","__name__", "(.+)") + record: telemetry:metric + - expr: label_replace(sum without(pod, container) ({__name__="che_workspace_start_time_seconds_count"}),"original_name_label","$1","__name__", "(.+)") + record: telemetry:metric + - expr: label_replace(sum without(pod, container) ({__name__="cco_credentials_mode"}),"original_name_label","$1","__name__", "(.+)") + record: telemetry:metric + - expr: label_replace(sum without(pod, container) ({__name__="cluster:kube_persistentvolume_plugin_type_counts:sum"}),"original_name_label","$1","__name__", "(.+)") + 
record: telemetry:metric + - expr: label_replace(sum without(pod, container) ({__name__="acm_managed_cluster_info"}),"original_name_label","$1","__name__", "(.+)") + record: telemetry:metric + - expr: label_replace(sum without(pod, container) ({__name__="acm_managed_cluster_worker_cores:max"}),"original_name_label","$1","__name__", "(.+)") + record: telemetry:metric + - expr: label_replace(sum without(pod, container) ({__name__="acm_console_page_count:sum", page=~"overview-classic|overview-fleet|search|search-details|clusters|application|governance"}),"original_name_label","$1","__name__", "(.+)") + record: telemetry:metric + - expr: label_replace(sum without(pod, container) ({__name__="cluster:vsphere_vcenter_info:sum"}),"original_name_label","$1","__name__", "(.+)") + record: telemetry:metric + - expr: label_replace(sum without(pod, container) ({__name__="cluster:vsphere_esxi_version_total:sum"}),"original_name_label","$1","__name__", "(.+)") + record: telemetry:metric + - expr: label_replace(sum without(pod, container) ({__name__="cluster:vsphere_node_hw_version_total:sum"}),"original_name_label","$1","__name__", "(.+)") + record: telemetry:metric + - expr: label_replace(sum without(pod, container) ({__name__="openshift:build_by_strategy:sum"}),"original_name_label","$1","__name__", "(.+)") + record: telemetry:metric + - expr: label_replace(sum without(pod, container) ({__name__="rhods_aggregate_availability"}),"original_name_label","$1","__name__", "(.+)") + record: telemetry:metric + - expr: label_replace(sum without(pod, container) ({__name__="rhods_total_users"}),"original_name_label","$1","__name__", "(.+)") + record: telemetry:metric + - expr: label_replace(sum without(pod, container) ({__name__="instance:etcd_disk_wal_fsync_duration_seconds:histogram_quantile",quantile="0.99"}),"original_name_label","$1","__name__", "(.+)") + record: telemetry:metric + - expr: label_replace(sum without(pod, container) 
({__name__="instance:etcd_mvcc_db_total_size_in_bytes:sum"}),"original_name_label","$1","__name__", "(.+)") + record: telemetry:metric + - expr: label_replace(sum without(pod, container) ({__name__="instance:etcd_network_peer_round_trip_time_seconds:histogram_quantile",quantile="0.99"}),"original_name_label","$1","__name__", "(.+)") + record: telemetry:metric + - expr: label_replace(sum without(pod, container) ({__name__="instance:etcd_mvcc_db_total_size_in_use_in_bytes:sum"}),"original_name_label","$1","__name__", "(.+)") + record: telemetry:metric + - expr: label_replace(sum without(pod, container) ({__name__="instance:etcd_disk_backend_commit_duration_seconds:histogram_quantile",quantile="0.99"}),"original_name_label","$1","__name__", "(.+)") + record: telemetry:metric + - expr: label_replace(sum without(pod, container) ({__name__="jaeger_operator_instances_storage_types"}),"original_name_label","$1","__name__", "(.+)") + record: telemetry:metric + - expr: label_replace(sum without(pod, container) ({__name__="jaeger_operator_instances_strategies"}),"original_name_label","$1","__name__", "(.+)") + record: telemetry:metric + - expr: label_replace(sum without(pod, container) ({__name__="jaeger_operator_instances_agent_strategies"}),"original_name_label","$1","__name__", "(.+)") + record: telemetry:metric + - expr: label_replace(sum without(pod, container) ({__name__="type:tempo_operator_tempostack_storage_backend:sum",type=~"azure|gcs|s3"}),"original_name_label","$1","__name__", "(.+)") + record: telemetry:metric + - expr: label_replace(sum without(pod, container) ({__name__="state:tempo_operator_tempostack_managed:sum",state=~"Managed|Unmanaged"}),"original_name_label","$1","__name__", "(.+)") + record: telemetry:metric + - expr: label_replace(sum without(pod, container) ({__name__="type:tempo_operator_tempostack_multi_tenancy:sum",type=~"static|openshift|disabled"}),"original_name_label","$1","__name__", "(.+)") + record: telemetry:metric + - expr: 
label_replace(sum without(pod, container) ({__name__="enabled:tempo_operator_tempostack_jaeger_ui:sum",enabled=~"true|false"}),"original_name_label","$1","__name__", "(.+)") + record: telemetry:metric + - expr: label_replace(sum without(pod, container) ({__name__="type:opentelemetry_collector_receivers:sum",type=~"jaeger|hostmetrics|opencensus|prometheus|zipkin|kafka|filelog|journald|k8sevents|kubeletstats|k8scluster|k8sobjects|otlp"}),"original_name_label","$1","__name__", "(.+)") + record: telemetry:metric + - expr: label_replace(sum without(pod, container) ({__name__="type:opentelemetry_collector_exporters:sum",type=~"debug|logging|otlp|otlphttp|prometheus|lokiexporter|kafka|awscloudwatchlogs|loadbalancing"}),"original_name_label","$1","__name__", "(.+)") + record: telemetry:metric + - expr: label_replace(sum without(pod, container) ({__name__="type:opentelemetry_collector_processors:sum",type=~"batch|memorylimiter|attributes|resource|span|k8sattributes|resourcedetection|filter|routing|cumulativetodelta|groupbyattrs"}),"original_name_label","$1","__name__", "(.+)") + record: telemetry:metric + - expr: label_replace(sum without(pod, container) ({__name__="type:opentelemetry_collector_extensions:sum",type=~"zpages|ballast|memorylimiter|jaegerremotesampling|healthcheck|pprof|oauth2clientauth|oidcauth|bearertokenauth|filestorage"}),"original_name_label","$1","__name__", "(.+)") + record: telemetry:metric + - expr: label_replace(sum without(pod, container) ({__name__="type:opentelemetry_collector_connectors:sum",type=~"spanmetrics|forward"}),"original_name_label","$1","__name__", "(.+)") + record: telemetry:metric + - expr: label_replace(sum without(pod, container) ({__name__="type:opentelemetry_collector_info:sum",type=~"deployment|daemonset|sidecar|statefulset"}),"original_name_label","$1","__name__", "(.+)") + record: telemetry:metric + - expr: label_replace(sum without(pod, container) 
({__name__="appsvcs:cores_by_product:sum"}),"original_name_label","$1","__name__", "(.+)") + record: telemetry:metric + - expr: label_replace(sum without(pod, container) ({__name__="nto_custom_profiles:count"}),"original_name_label","$1","__name__", "(.+)") + record: telemetry:metric + - expr: label_replace(sum without(pod, container) ({__name__="openshift_csi_share_configmap"}),"original_name_label","$1","__name__", "(.+)") + record: telemetry:metric + - expr: label_replace(sum without(pod, container) ({__name__="openshift_csi_share_secret"}),"original_name_label","$1","__name__", "(.+)") + record: telemetry:metric + - expr: label_replace(sum without(pod, container) ({__name__="openshift_csi_share_mount_failures_total"}),"original_name_label","$1","__name__", "(.+)") + record: telemetry:metric + - expr: label_replace(sum without(pod, container) ({__name__="openshift_csi_share_mount_requests_total"}),"original_name_label","$1","__name__", "(.+)") + record: telemetry:metric + - expr: label_replace(sum without(pod, container) ({__name__="eo_es_storage_info"}),"original_name_label","$1","__name__", "(.+)") + record: telemetry:metric + - expr: label_replace(sum without(pod, container) ({__name__="eo_es_redundancy_policy_info"}),"original_name_label","$1","__name__", "(.+)") + record: telemetry:metric + - expr: label_replace(sum without(pod, container) ({__name__="eo_es_defined_delete_namespaces_total"}),"original_name_label","$1","__name__", "(.+)") + record: telemetry:metric + - expr: label_replace(sum without(pod, container) ({__name__="eo_es_misconfigured_memory_resources_info"}),"original_name_label","$1","__name__", "(.+)") + record: telemetry:metric + - expr: label_replace(sum without(pod, container) ({__name__="cluster:eo_es_data_nodes_total:max"}),"original_name_label","$1","__name__", "(.+)") + record: telemetry:metric + - expr: label_replace(sum without(pod, container) 
({__name__="cluster:eo_es_documents_created_total:sum"}),"original_name_label","$1","__name__", "(.+)") + record: telemetry:metric + - expr: label_replace(sum without(pod, container) ({__name__="cluster:eo_es_documents_deleted_total:sum"}),"original_name_label","$1","__name__", "(.+)") + record: telemetry:metric + - expr: label_replace(sum without(pod, container) ({__name__="pod:eo_es_shards_total:max"}),"original_name_label","$1","__name__", "(.+)") + record: telemetry:metric + - expr: label_replace(sum without(pod, container) ({__name__="eo_es_cluster_management_state_info"}),"original_name_label","$1","__name__", "(.+)") + record: telemetry:metric + - expr: label_replace(sum without(pod, container) ({__name__="imageregistry:imagestreamtags_count:sum"}),"original_name_label","$1","__name__", "(.+)") + record: telemetry:metric + - expr: label_replace(sum without(pod, container) ({__name__="imageregistry:operations_count:sum"}),"original_name_label","$1","__name__", "(.+)") + record: telemetry:metric + - expr: label_replace(sum without(pod, container) ({__name__="log_logging_info"}),"original_name_label","$1","__name__", "(.+)") + record: telemetry:metric + - expr: label_replace(sum without(pod, container) ({__name__="log_collector_error_count_total"}),"original_name_label","$1","__name__", "(.+)") + record: telemetry:metric + - expr: label_replace(sum without(pod, container) ({__name__="log_forwarder_pipeline_info"}),"original_name_label","$1","__name__", "(.+)") + record: telemetry:metric + - expr: label_replace(sum without(pod, container) ({__name__="log_forwarder_input_info"}),"original_name_label","$1","__name__", "(.+)") + record: telemetry:metric + - expr: label_replace(sum without(pod, container) ({__name__="log_forwarder_output_info"}),"original_name_label","$1","__name__", "(.+)") + record: telemetry:metric + - expr: label_replace(sum without(pod, container) ({__name__="cluster:log_collected_bytes_total:sum"}),"original_name_label","$1","__name__", 
"(.+)") + record: telemetry:metric + - expr: label_replace(sum without(pod, container) ({__name__="cluster:log_logged_bytes_total:sum"}),"original_name_label","$1","__name__", "(.+)") + record: telemetry:metric + - expr: label_replace(sum without(pod, container) ({__name__="openshift_logging:log_forwarder_pipelines:sum"}),"original_name_label","$1","__name__", "(.+)") + record: telemetry:metric + - expr: label_replace(sum without(pod, container) ({__name__="openshift_logging:log_forwarders:sum"}),"original_name_label","$1","__name__", "(.+)") + record: telemetry:metric + - expr: label_replace(sum without(pod, container) ({__name__="openshift_logging:log_forwarder_input_type:sum"}),"original_name_label","$1","__name__", "(.+)") + record: telemetry:metric + - expr: label_replace(sum without(pod, container) ({__name__="openshift_logging:log_forwarder_output_type:sum"}),"original_name_label","$1","__name__", "(.+)") + record: telemetry:metric + - expr: label_replace(sum without(pod, container) ({__name__="openshift_logging:vector_component_received_bytes_total:rate5m"}),"original_name_label","$1","__name__", "(.+)") + record: telemetry:metric + - expr: label_replace(sum without(pod, container) ({__name__="cluster:kata_monitor_running_shim_count:sum"}),"original_name_label","$1","__name__", "(.+)") + record: telemetry:metric + - expr: label_replace(sum without(pod, container) ({__name__="platform:hypershift_hostedclusters:max"}),"original_name_label","$1","__name__", "(.+)") + record: telemetry:metric + - expr: label_replace(sum without(pod, container) ({__name__="platform:hypershift_nodepools:max"}),"original_name_label","$1","__name__", "(.+)") + record: telemetry:metric + - expr: label_replace(sum without(pod, container) ({__name__="cluster_name:hypershift_nodepools_size:sum"}),"original_name_label","$1","__name__", "(.+)") + record: telemetry:metric + - expr: label_replace(sum without(pod, container) 
({__name__="cluster_name:hypershift_nodepools_available_replicas:sum"}),"original_name_label","$1","__name__", "(.+)") + record: telemetry:metric + - expr: label_replace(sum without(pod, container) ({__name__="namespace:noobaa_unhealthy_bucket_claims:max"}),"original_name_label","$1","__name__", "(.+)") + record: telemetry:metric + - expr: label_replace(sum without(pod, container) ({__name__="namespace:noobaa_buckets_claims:max"}),"original_name_label","$1","__name__", "(.+)") + record: telemetry:metric + - expr: label_replace(sum without(pod, container) ({__name__="namespace:noobaa_unhealthy_namespace_resources:max"}),"original_name_label","$1","__name__", "(.+)") + record: telemetry:metric + - expr: label_replace(sum without(pod, container) ({__name__="namespace:noobaa_namespace_resources:max"}),"original_name_label","$1","__name__", "(.+)") + record: telemetry:metric + - expr: label_replace(sum without(pod, container) ({__name__="namespace:noobaa_unhealthy_namespace_buckets:max"}),"original_name_label","$1","__name__", "(.+)") + record: telemetry:metric + - expr: label_replace(sum without(pod, container) ({__name__="namespace:noobaa_namespace_buckets:max"}),"original_name_label","$1","__name__", "(.+)") + record: telemetry:metric + - expr: label_replace(sum without(pod, container) ({__name__="namespace:noobaa_accounts:max"}),"original_name_label","$1","__name__", "(.+)") + record: telemetry:metric + - expr: label_replace(sum without(pod, container) ({__name__="namespace:noobaa_usage:max"}),"original_name_label","$1","__name__", "(.+)") + record: telemetry:metric + - expr: label_replace(sum without(pod, container) ({__name__="namespace:noobaa_system_health_status:max"}),"original_name_label","$1","__name__", "(.+)") + record: telemetry:metric + - expr: label_replace(sum without(pod, container) ({__name__="ocs_advanced_feature_usage"}),"original_name_label","$1","__name__", "(.+)") + record: telemetry:metric + - expr: label_replace(sum without(pod, container) 
({__name__="os_image_url_override:sum"}),"original_name_label","$1","__name__", "(.+)") + record: telemetry:metric + - expr: label_replace(sum without(pod, container) ({__name__="cluster:vsphere_topology_tags:max"}),"original_name_label","$1","__name__", "(.+)") + record: telemetry:metric + - expr: label_replace(sum without(pod, container) ({__name__="cluster:vsphere_infrastructure_failure_domains:max"}),"original_name_label","$1","__name__", "(.+)") + record: telemetry:metric + - expr: label_replace(sum without(pod, container) ({__name__="apiserver_list_watch_request_success_total:rate:sum", verb=~"LIST|WATCH"}),"original_name_label","$1","__name__", "(.+)") + record: telemetry:metric + - expr: label_replace(sum without(pod, container) ({__name__="rhacs:telemetry:rox_central_info"}),"original_name_label","$1","__name__", "(.+)") + record: telemetry:metric + - expr: label_replace(sum without(pod, container) ({__name__="rhacs:telemetry:rox_central_secured_clusters"}),"original_name_label","$1","__name__", "(.+)") + record: telemetry:metric + - expr: label_replace(sum without(pod, container) ({__name__="rhacs:telemetry:rox_central_secured_nodes"}),"original_name_label","$1","__name__", "(.+)") + record: telemetry:metric + - expr: label_replace(sum without(pod, container) ({__name__="rhacs:telemetry:rox_central_secured_vcpus"}),"original_name_label","$1","__name__", "(.+)") + record: telemetry:metric + - expr: label_replace(sum without(pod, container) ({__name__="rhacs:telemetry:rox_sensor_info"}),"original_name_label","$1","__name__", "(.+)") + record: telemetry:metric + - expr: label_replace(sum without(pod, container) ({__name__="cluster:volume_manager_selinux_pod_context_mismatch_total"}),"original_name_label","$1","__name__", "(.+)") + record: telemetry:metric + - expr: label_replace(sum without(pod, container) ({__name__="cluster:volume_manager_selinux_volume_context_mismatch_warnings_total"}),"original_name_label","$1","__name__", "(.+)") + record: 
telemetry:metric + - expr: label_replace(sum without(pod, container) ({__name__="cluster:volume_manager_selinux_volume_context_mismatch_errors_total"}),"original_name_label","$1","__name__", "(.+)") + record: telemetry:metric + - expr: label_replace(sum without(pod, container) ({__name__="cluster:volume_manager_selinux_volumes_admitted_total"}),"original_name_label","$1","__name__", "(.+)") + record: telemetry:metric + - expr: label_replace(sum without(pod, container) ({__name__="ols:provider_model_configuration"}),"original_name_label","$1","__name__", "(.+)") + record: telemetry:metric + - expr: label_replace(sum without(pod, container) ({__name__="ols:rest_api_query_calls_total:2xx"}),"original_name_label","$1","__name__", "(.+)") + record: telemetry:metric + - expr: label_replace(sum without(pod, container) ({__name__="ols:rest_api_query_calls_total:4xx"}),"original_name_label","$1","__name__", "(.+)") + record: telemetry:metric + - expr: label_replace(sum without(pod, container) ({__name__="ols:rest_api_query_calls_total:5xx"}),"original_name_label","$1","__name__", "(.+)") + record: telemetry:metric + - expr: label_replace(sum without(pod, container) ({__name__="openshift:openshift_network_operator_ipsec_state:info"}),"original_name_label","$1","__name__", "(.+)") + record: telemetry:metric + - expr: label_replace(sum without(pod, container) ({__name__="cluster:health:group_severity:count", severity=~"critical|warning|info|none"}),"original_name_label","$1","__name__", "(.+)") + record: telemetry:metric + - expr: label_replace(sum without(pod, container) ({__name__="cluster:controlplane_topology:info", mode=~"HighlyAvailable|HighlyAvailableArbiter|SingleReplica|DualReplica|External"}),"original_name_label","$1","__name__", "(.+)") + record: telemetry:metric + - expr: label_replace(sum without(pod, container) ({__name__="cluster:infrastructure_topology:info", mode=~"HighlyAvailable|SingleReplica"}),"original_name_label","$1","__name__", "(.+)") + record: 
telemetry:metric diff --git a/jsonnet/components/telemetry-recording-rules.libsonnet b/jsonnet/components/telemetry-recording-rules.libsonnet new file mode 100644 index 0000000000..14401c6461 --- /dev/null +++ b/jsonnet/components/telemetry-recording-rules.libsonnet @@ -0,0 +1,41 @@ +// Parse the telemetry config to extract properly grouped matchers +local telemetryConfigYaml = std.parseYaml(importstr '../../manifests/0000_50_cluster-monitoring-operator_04-config.yaml'); +local telemetryMatches = std.parseYaml(telemetryConfigYaml.data['metrics.yaml']).matches; + +// Generate individual recording rules for each properly grouped telemetry matcher +local generateTelemetryRules() = [ + { + record: 'telemetry:metric', + # We keep track of the metric name in a label. For regex matchers this is + # required, so we might as well do it consistently. + # Otherwise Prometheus can log `execution: vector cannot contain metrics with the same labelset` + # since the metric name is dropped while querying. See also https://github.com/prometheus/prometheus/issues/11397 + # We reset the correct label name in the remote_write config. 
+ expr: 'label_replace(sum without(pod, container) (%s),"original_name_label","$1","__name__", "(.+)")' % match + } + for match in telemetryMatches +]; + +function(params) { + local cfg = params, + local telemetryRules = generateTelemetryRules(), + + prometheusRule: { + apiVersion: 'monitoring.coreos.com/v1', + kind: 'PrometheusRule', + metadata: { + labels: cfg.commonLabels + { + 'role': 'telemetry-rules', + }, + name: 'telemetry-recording-rules', + namespace: cfg.namespace, + }, + spec: { + groups: [{ + name: 'telemetry-recording.rules', + interval: '4m30s', + rules: telemetryRules, + }], + }, + }, +} diff --git a/jsonnet/main.jsonnet b/jsonnet/main.jsonnet index b65e564bb5..79470ea146 100644 --- a/jsonnet/main.jsonnet +++ b/jsonnet/main.jsonnet @@ -26,6 +26,7 @@ local thanosQuerier = import './components/thanos-querier.libsonnet'; local openshiftStateMetrics = import './components/openshift-state-metrics.libsonnet'; local telemeterClient = import './components/telemeter-client.libsonnet'; +local telemetryRecordingRules = import './components/telemetry-recording-rules.libsonnet'; // Common configuration local commonConfig = { @@ -386,6 +387,10 @@ local inCluster = }, }, }, + telemetryRecordingRules: { + namespace: $.values.common.namespace, + commonLabels+: $.values.common.commonLabels, + }, }, // Objects @@ -430,6 +435,7 @@ local inCluster = telemeterClient: telemeterClient($.values.telemeterClient), monitoringPlugin: monitoringPlugin($.values.monitoringPlugin), openshiftStateMetrics: openshiftStateMetrics($.values.openshiftStateMetrics), + telemetryRecordingRules: telemetryRecordingRules($.values.telemetryRecordingRules), } + (import './utils/anti-affinity.libsonnet') + (import 'github.com/prometheus-operator/kube-prometheus/jsonnet/kube-prometheus/addons/ksm-lite.libsonnet') + @@ -535,6 +541,7 @@ setTerminationMessagePolicy( { ['thanos-querier/' + name]: inCluster.thanosQuerier[name] for name in std.objectFields(inCluster.thanosQuerier) } + { ['thanos-ruler/' 
+ name]: inCluster.thanosRuler[name] for name in std.objectFields(inCluster.thanosRuler) } + { ['control-plane/' + name]: inCluster.controlPlane[name] for name in std.objectFields(inCluster.controlPlane) } + + { ['telemetry-recording-rules/' + name]: inCluster.telemetryRecordingRules[name] for name in std.objectFields(inCluster.telemetryRecordingRules) } + { ['manifests/' + name]: inCluster.manifests[name] for name in std.objectFields(inCluster.manifests) } + {} ) diff --git a/manifests/image-references b/manifests/image-references index eeb6f121ef..7cb49d0452 100644 --- a/manifests/image-references +++ b/manifests/image-references @@ -46,10 +46,6 @@ spec: from: kind: DockerImage name: quay.io/openshift/origin-kube-rbac-proxy:latest - - name: telemeter - from: - kind: DockerImage - name: quay.io/openshift/origin-telemeter:latest - name: prom-label-proxy from: kind: DockerImage diff --git a/pkg/manifests/config.go b/pkg/manifests/config.go index b9d281580a..88e2c3ff18 100644 --- a/pkg/manifests/config.go +++ b/pkg/manifests/config.go @@ -293,6 +293,20 @@ type Audit struct { Profile auditv1.Level `json:"profile"` } +func (cfg *TelemetryConfig) IsEnabled() bool { + if cfg == nil { + return false + } + + if (cfg.Enabled != nil && !*cfg.Enabled) || + cfg.ClusterID == "" || + cfg.Token == "" { + return false + } + + return true +} + func (cfg *TelemeterClientConfig) IsEnabled() bool { if cfg == nil { return false @@ -439,9 +453,18 @@ func (c *Config) applyDefaults() { if c.ClusterMonitoringConfiguration.HTTPConfig == nil { c.ClusterMonitoringConfiguration.HTTPConfig = &HTTPConfig{} } - if c.ClusterMonitoringConfiguration.TelemeterClientConfig == nil { - c.ClusterMonitoringConfiguration.TelemeterClientConfig = &TelemeterClientConfig{ - TelemeterServerURL: "https://infogw.api.openshift.com/", + if c.ClusterMonitoringConfiguration.TelemetryConfig == nil { + if c.ClusterMonitoringConfiguration.TelemeterClientConfig != nil { + c.ClusterMonitoringConfiguration.TelemetryConfig 
= &TelemetryConfig{ + ClusterID: c.ClusterMonitoringConfiguration.TelemeterClientConfig.ClusterID, + Enabled: c.ClusterMonitoringConfiguration.TelemeterClientConfig.Enabled, + TelemeterServerURL: c.ClusterMonitoringConfiguration.TelemeterClientConfig.TelemeterServerURL, + Token: c.ClusterMonitoringConfiguration.TelemeterClientConfig.Token, + } + } else { + c.ClusterMonitoringConfiguration.TelemetryConfig = &TelemetryConfig{ + TelemeterServerURL: "https://infogw.api.openshift.com/metrics/v1/receive", + } } } @@ -515,13 +538,10 @@ func (c *Config) SetTelemetryMatches(matches []string) { func (c *Config) SetRemoteWrite(rw bool) { c.RemoteWrite = rw - if c.RemoteWrite && c.ClusterMonitoringConfiguration.TelemeterClientConfig.TelemeterServerURL == "https://infogw.api.openshift.com/" { - c.ClusterMonitoringConfiguration.TelemeterClientConfig.TelemeterServerURL = "https://infogw.api.openshift.com/metrics/v1/receive" - } } func (c *Config) LoadClusterID(load func() (*configv1.ClusterVersion, error)) error { - if c.ClusterMonitoringConfiguration.TelemeterClientConfig.ClusterID != "" { + if c.ClusterMonitoringConfiguration.TelemetryConfig.ClusterID != "" { return nil } @@ -530,12 +550,12 @@ func (c *Config) LoadClusterID(load func() (*configv1.ClusterVersion, error)) er return fmt.Errorf("error loading cluster version: %w", err) } - c.ClusterMonitoringConfiguration.TelemeterClientConfig.ClusterID = string(cv.Spec.ClusterID) + c.ClusterMonitoringConfiguration.TelemetryConfig.ClusterID = string(cv.Spec.ClusterID) return nil } func (c *Config) LoadToken(load func() (*v1.Secret, error)) error { - if c.ClusterMonitoringConfiguration.TelemeterClientConfig.Token != "" { + if c.ClusterMonitoringConfiguration.TelemetryConfig.Token != "" { return nil } @@ -560,7 +580,7 @@ func (c *Config) LoadToken(load func() (*v1.Secret, error)) error { return fmt.Errorf("unmarshaling pull secret failed: %w", err) } - c.ClusterMonitoringConfiguration.TelemeterClientConfig.Token = ps.Auths.COC.Auth + 
c.ClusterMonitoringConfiguration.TelemetryConfig.Token = ps.Auths.COC.Auth return nil } @@ -639,13 +659,23 @@ func (c *Config) Precheck() error { } // Highlight deprecated config fields. - var d float64 - if c.ClusterMonitoringConfiguration.K8sPrometheusAdapter != nil { - klog.Infof("k8sPrometheusAdapter is a deprecated config use metricsServer instead") - d = 1 + { + var d float64 + if c.ClusterMonitoringConfiguration.K8sPrometheusAdapter != nil { + klog.Infof("k8sPrometheusAdapter is a deprecated config use metricsServer instead") + d = 1 + } + // Prometheus-Adapter is replaced with Metrics Server by default from 4.16 + metrics.DeprecatedConfig.WithLabelValues("openshift-monitoring/cluster-monitoring-config", "k8sPrometheusAdapter", "4.16").Set(d) + } + { + var d float64 + if c.ClusterMonitoringConfiguration.TelemeterClientConfig != nil { + klog.Infof("telemeterClientConfig is a deprecated config use telemetryConfig instead") + d = 1 + } + metrics.DeprecatedConfig.WithLabelValues("openshift-monitoring/cluster-monitoring-config", "telemeterClientConfig", "4.21").Set(d) } - // Prometheus-Adapter is replaced with Metrics Server by default from 4.16 - metrics.DeprecatedConfig.WithLabelValues("openshift-monitoring/cluster-monitoring-config", "k8sPrometheusAdapter", "4.16").Set(d) // TODO: remove after 4.19 // Only to assist with the migration to Prometheus 3; fail early if Alertmanager v1 is still in use. 
diff --git a/pkg/manifests/config_test.go b/pkg/manifests/config_test.go index 923a390e40..460ae98ad1 100644 --- a/pkg/manifests/config_test.go +++ b/pkg/manifests/config_test.go @@ -17,7 +17,9 @@ package manifests import ( "context" "errors" + "fmt" "os" + "strings" "testing" "github.com/openshift/cluster-monitoring-operator/pkg/metrics" @@ -357,6 +359,105 @@ prometheus: } } +func TestTelemetryConfig(t *testing.T) { + truev, falsev := true, false + + tcs := []struct { + enabled bool + cfg *TelemetryConfig + }{ + { + cfg: nil, + enabled: false, + }, + { + cfg: &TelemetryConfig{}, + enabled: false, + }, + { + cfg: &TelemetryConfig{ + Enabled: &truev, + }, + enabled: false, + }, + { + cfg: &TelemetryConfig{ + Enabled: &falsev, + }, + enabled: false, + }, + { + cfg: &TelemetryConfig{ + ClusterID: "test", + }, + enabled: false, + }, + { + cfg: &TelemetryConfig{ + ClusterID: "test", + Enabled: &falsev, + }, + enabled: false, + }, + { + cfg: &TelemetryConfig{ + ClusterID: "test", + Enabled: &truev, + }, + enabled: false, + }, + { + cfg: &TelemetryConfig{ + Token: "test", + }, + enabled: false, + }, + { + cfg: &TelemetryConfig{ + Token: "test", + Enabled: &falsev, + }, + enabled: false, + }, + { + cfg: &TelemetryConfig{ + Token: "test", + Enabled: &truev, + }, + enabled: false, + }, + { + cfg: &TelemetryConfig{ + ClusterID: "test", + Token: "test", + }, + enabled: true, // opt-in by default + }, + { + cfg: &TelemetryConfig{ + ClusterID: "test", + Token: "test", + Enabled: &truev, + }, + enabled: true, + }, + { + cfg: &TelemetryConfig{ + ClusterID: "test", + Token: "test", + Enabled: &falsev, // explicitly opt-out + }, + enabled: false, + }, + } + + for i, tc := range tcs { + if got := tc.cfg.IsEnabled(); got != tc.enabled { + t.Errorf("testcase %d: expected enabled %t, got %t", i, tc.enabled, got) + } + } +} + func TestTelemeterClientConfig(t *testing.T) { truev, falsev := true, false @@ -803,9 +904,9 @@ func TestCollectionProfilePreCheck(t *testing.T) { func 
TestDeprecatedConfig(t *testing.T) { for _, tc := range []struct { - name string - config string - expectedMetricValue float64 + name string + config string + expected [][]interface{} }{ { name: "setting a field in k8sPrometheusAdapter", @@ -815,18 +916,27 @@ func TestDeprecatedConfig(t *testing.T) { cpu: 1m memory: 20Mi `, - expectedMetricValue: 1, + expected: [][]interface{}{ + {"openshift-monitoring/cluster-monitoring-config", "4.16", "k8sPrometheusAdapter", "1"}, + {"openshift-monitoring/cluster-monitoring-config", "4.21", "telemeterClientConfig", "0"}, + }, }, { name: "k8sPrometheusAdapter nil", config: `k8sPrometheusAdapter: `, - expectedMetricValue: 0, + expected: [][]interface{}{ + {"openshift-monitoring/cluster-monitoring-config", "4.16", "k8sPrometheusAdapter", "0"}, + {"openshift-monitoring/cluster-monitoring-config", "4.21", "telemeterClientConfig", "0"}, + }, }, { - name: "no config set", - config: "", - expectedMetricValue: 0, + name: "no config set", + config: "", + expected: [][]interface{}{ + {"openshift-monitoring/cluster-monitoring-config", "4.16", "k8sPrometheusAdapter", "0"}, + {"openshift-monitoring/cluster-monitoring-config", "4.21", "telemeterClientConfig", "0"}, + }, }, } { t.Run(tc.name, func(t *testing.T) { @@ -834,7 +944,20 @@ func TestDeprecatedConfig(t *testing.T) { require.NoError(t, err) err = c.Precheck() require.NoError(t, err) - require.Equal(t, tc.expectedMetricValue, prom_testutil.ToFloat64(metrics.DeprecatedConfig)) + meta := ` + # HELP cluster_monitoring_operator_deprecated_config_in_use [ALPHA] Set to 1 for deprecated configuration fields that are still in use, else 0. 
+ # TYPE cluster_monitoring_operator_deprecated_config_in_use gauge + ` + metric := ` + cluster_monitoring_operator_deprecated_config_in_use {configmap="%s", deprecation_version = "%s", field = "%s"} %s + ` + var b strings.Builder + b.WriteString(meta) + for _, e := range tc.expected { + b.WriteString(fmt.Sprintf(metric, e...)) + } + err = prom_testutil.CollectAndCompare(metrics.DeprecatedConfig, strings.NewReader(b.String())) + require.NoError(t, err) }) } } diff --git a/pkg/manifests/manifests.go b/pkg/manifests/manifests.go index b28c87a956..dcc1b1c71b 100644 --- a/pkg/manifests/manifests.go +++ b/pkg/manifests/manifests.go @@ -51,8 +51,6 @@ import ( apiregistrationv1 "k8s.io/kube-aggregator/pkg/apis/apiregistration/v1" "k8s.io/utils/ptr" k8syaml "sigs.k8s.io/yaml" - - "github.com/openshift/cluster-monitoring-operator/pkg/promqlgen" ) const ( @@ -258,6 +256,8 @@ var ( TelemeterClientKubeRbacProxySecret = "telemeter-client/kube-rbac-proxy-secret.yaml" TelemeterClientPrometheusRule = "telemeter-client/prometheus-rule.yaml" + TelemetryRecordingRulesPrometheusRule = "telemetry-recording-rules/prometheus-rule.yaml" + ThanosQuerierDeployment = "thanos-querier/deployment.yaml" ThanosQuerierPodDisruptionBudget = "thanos-querier/pod-disruption-budget.yaml" ThanosQuerierService = "thanos-querier/service.yaml" @@ -1301,8 +1301,8 @@ func (f *Factory) PrometheusK8sTelemetrySecret() (*v1.Secret, error) { return nil, err } compositeToken, err := json.Marshal(map[string]string{ - "cluster_id": f.config.ClusterMonitoringConfiguration.TelemeterClientConfig.ClusterID, - "authorization_token": f.config.ClusterMonitoringConfiguration.TelemeterClientConfig.Token, + "cluster_id": f.config.ClusterMonitoringConfiguration.TelemetryConfig.ClusterID, + "authorization_token": f.config.ClusterMonitoringConfiguration.TelemetryConfig.Token, }) if err != nil { return nil, err @@ -1382,17 +1382,12 @@ func (f *Factory) PrometheusK8s(grpcTLS *v1.Secret, telemetrySecret *v1.Secret) return nil, err 
} - clusterID := f.config.ClusterMonitoringConfiguration.TelemeterClientConfig.ClusterID - if f.config.ClusterMonitoringConfiguration.TelemeterClientConfig.IsEnabled() && f.config.RemoteWrite { - selectorRelabelConfig, err := promqlgen.LabelSelectorsToRelabelConfig(f.config.ClusterMonitoringConfiguration.PrometheusK8sConfig.TelemetryMatches) - if err != nil { - return nil, fmt.Errorf("generate label selector relabel config: %w", err) - } - + clusterID := f.config.ClusterMonitoringConfiguration.TelemetryConfig.ClusterID + if f.config.ClusterMonitoringConfiguration.TelemetryConfig.IsEnabled() { p.Spec.Secrets = append(p.Spec.Secrets, telemetrySecret.GetName()) spec := monv1.RemoteWriteSpec{ - URL: f.config.ClusterMonitoringConfiguration.TelemeterClientConfig.TelemeterServerURL, + URL: f.config.ClusterMonitoringConfiguration.TelemetryConfig.TelemeterServerURL, BearerTokenFile: fmt.Sprintf("/etc/prometheus/secrets/%s/%s", telemetrySecret.GetName(), telemetryTokenSecretKey), QueueConfig: &monv1.QueueConfig{ // Amount of samples to load from the WAL into the in-memory @@ -1415,7 +1410,28 @@ func (f *Factory) PrometheusK8s(grpcTLS *v1.Secret, telemetrySecret *v1.Secret) MaxBackoff: ptr.To(monv1.Duration("256s")), }, WriteRelabelConfigs: []monv1.RelabelConfig{ - *selectorRelabelConfig, + // Only send telemetry recording rules (metrics with telemetry: prefix) + { + SourceLabels: []monv1.LabelName{"__name__"}, + Regex: "telemetry:.*", + Action: "keep", + }, + // To support a regex matcher we track the + // original metric names in the recording rule. + // Here we reinstate the original name and drop + // the temp name. 
+ // See also jsonnet/components/telemetry-recording-rules.libsonnet + { + SourceLabels: []monv1.LabelName{"original_name_label"}, + TargetLabel: "__name__", + Regex: "(.*)", + Replacement: ptr.To("$1"), + }, + // drop unwanted labels + { + Regex: "original_name_label", + Action: "labeldrop", + }, { TargetLabel: "_id", Replacement: ptr.To(clusterID), @@ -1729,7 +1745,7 @@ func (f *Factory) PrometheusUserWorkload(grpcTLS *v1.Secret) (*monv1.Prometheus, if len(f.config.UserWorkloadConfiguration.Prometheus.RemoteWrite) > 0 { p.Spec.RemoteWrite = addRemoteWriteConfigs( - f.config.ClusterMonitoringConfiguration.TelemeterClientConfig.ClusterID, + f.config.ClusterMonitoringConfiguration.TelemetryConfig.ClusterID, p.Spec.RemoteWrite, f.config.UserWorkloadConfiguration.Prometheus.RemoteWrite...) @@ -3009,6 +3025,10 @@ func (f *Factory) TelemeterClientPrometheusRule() (*monv1.PrometheusRule, error) return f.NewPrometheusRule(f.assets.MustNewAssetSlice(TelemeterClientPrometheusRule)) } +func (f *Factory) TelemetryRecordingRulesPrometheusRule() (*monv1.PrometheusRule, error) { + return f.NewPrometheusRule(f.assets.MustNewAssetSlice(TelemetryRecordingRulesPrometheusRule)) +} + // TelemeterClientDeployment generates a new Deployment for Telemeter client. // If the passed ConfigMap is not empty it mounts the Trusted CA Bundle as a VolumeMount to // /etc/pki/ca-trust/extracted/pem/ location. 
@@ -3018,77 +3038,6 @@ func (f *Factory) TelemeterClientDeployment(proxyCABundleCM *v1.ConfigMap, s *v1 return nil, err } - // Set annotation on deployment to trigger redeployments - if s != nil { - h := fnv.New64() - h.Write(s.Data["token"]) - d.Spec.Template.Annotations["telemeter-token-hash"] = strconv.FormatUint(h.Sum64(), 32) - } - - for i, container := range d.Spec.Template.Spec.Containers { - switch container.Name { - case "telemeter-client": - d.Spec.Template.Spec.Containers[i].Image = f.config.Images.TelemeterClient - - if f.config.ClusterMonitoringConfiguration.TelemeterClientConfig.Resources != nil { - d.Spec.Template.Spec.Containers[i].Resources = *f.config.ClusterMonitoringConfiguration.TelemeterClientConfig.Resources - } - - if f.config.ClusterMonitoringConfiguration.TelemeterClientConfig.ClusterID != "" { - setContainerEnvironmentVariable(&d.Spec.Template.Spec.Containers[i], "ID", f.config.ClusterMonitoringConfiguration.TelemeterClientConfig.ClusterID) - } - if f.config.ClusterMonitoringConfiguration.TelemeterClientConfig.TelemeterServerURL != "" { - setContainerEnvironmentVariable(&d.Spec.Template.Spec.Containers[i], "TO", f.config.ClusterMonitoringConfiguration.TelemeterClientConfig.TelemeterServerURL) - } - - f.injectProxyVariables(&d.Spec.Template.Spec.Containers[i]) - - cmd := []string{} - // Note: matchers are read only during CMO bootstrap. This mechanism was chosen as CMO image will be reloaded during upgrades - // and matchers shouldn't change during runtime. It offers similar amount of protection against unwanted configuration changes - // while not having any performance penalty. However, it should be changed to usual reconciliation mechanism after CMO performance - // issues are solved. 
- for _, a := range d.Spec.Template.Spec.Containers[i].Command { - if !strings.HasPrefix(a, "--match=") { - cmd = append(cmd, a) - } - } - for _, m := range f.config.ClusterMonitoringConfiguration.PrometheusK8sConfig.TelemetryMatches { - cmd = append(cmd, fmt.Sprintf("--match=%s", m)) - } - cmd = append(cmd, "--limit-bytes=5242880") - d.Spec.Template.Spec.Containers[i].Command = cmd - - if proxyCABundleCM != nil { - volumeName := "telemeter-trusted-ca-bundle" - d.Spec.Template.Spec.Containers[i].VolumeMounts = append(d.Spec.Template.Spec.Containers[i].VolumeMounts, trustedCABundleVolumeMount(volumeName)) - volume := trustedCABundleVolume(proxyCABundleCM.Name, volumeName) - volume.VolumeSource.ConfigMap.Items = append(volume.VolumeSource.ConfigMap.Items, v1.KeyToPath{ - Key: TrustedCABundleKey, - Path: "tls-ca-bundle.pem", - }) - d.Spec.Template.Spec.Volumes = append(d.Spec.Template.Spec.Volumes, volume) - } - - case "reload": - d.Spec.Template.Spec.Containers[i].Image = f.config.Images.PrometheusConfigReloader - case "kube-rbac-proxy": - d.Spec.Template.Spec.Containers[i].Image = f.config.Images.KubeRbacProxy - d.Spec.Template.Spec.Containers[i].Args = f.setTLSSecurityConfiguration(container.Args, KubeRbacProxyTLSCipherSuitesFlag, KubeRbacProxyMinTLSVersionFlag) - } - } - - if len(f.config.ClusterMonitoringConfiguration.TelemeterClientConfig.NodeSelector) > 0 { - d.Spec.Template.Spec.NodeSelector = f.config.ClusterMonitoringConfiguration.TelemeterClientConfig.NodeSelector - } - if len(f.config.ClusterMonitoringConfiguration.TelemeterClientConfig.Tolerations) > 0 { - d.Spec.Template.Spec.Tolerations = f.config.ClusterMonitoringConfiguration.TelemeterClientConfig.Tolerations - } - if len(f.config.ClusterMonitoringConfiguration.TelemeterClientConfig.TopologySpreadConstraints) > 0 { - d.Spec.Template.Spec.TopologySpreadConstraints = - f.config.ClusterMonitoringConfiguration.TelemeterClientConfig.TopologySpreadConstraints - } - d.Namespace = f.namespace return d, nil } 
diff --git a/pkg/manifests/manifests_test.go b/pkg/manifests/manifests_test.go index 4c13677dc5..130f7d2c6f 100644 --- a/pkg/manifests/manifests_test.go +++ b/pkg/manifests/manifests_test.go @@ -1105,13 +1105,16 @@ func TestPrometheusK8sRemoteWriteURLs(t *testing.T) { config: func() *Config { c := NewDefaultConfig() - c.ClusterMonitoringConfiguration.TelemeterClientConfig.ClusterID = "123" - c.ClusterMonitoringConfiguration.TelemeterClientConfig.Token = "secret" + c.ClusterMonitoringConfiguration.TelemetryConfig.ClusterID = "123" + c.ClusterMonitoringConfiguration.TelemetryConfig.Token = "secret" return c }, + telemetrySecret: telemetrySecret, - expectedRemoteWriteURLs: nil, + expectedRemoteWriteURLs: []string{ + "https://infogw.api.openshift.com/metrics/v1/receive", + }, }, { name: "legacy telemetry and custom remote write", @@ -1119,14 +1122,16 @@ func TestPrometheusK8sRemoteWriteURLs(t *testing.T) { config: func() *Config { c := NewDefaultConfig() c.ClusterMonitoringConfiguration.PrometheusK8sConfig.RemoteWrite = []RemoteWriteSpec{{URL: "http://custom"}} - c.ClusterMonitoringConfiguration.TelemeterClientConfig.ClusterID = "123" - c.ClusterMonitoringConfiguration.TelemeterClientConfig.Token = "secret" + c.ClusterMonitoringConfiguration.TelemetryConfig.ClusterID = "123" + c.ClusterMonitoringConfiguration.TelemetryConfig.Token = "secret" return c }, + telemetrySecret: telemetrySecret, expectedRemoteWriteURLs: []string{ "http://custom", + "https://infogw.api.openshift.com/metrics/v1/receive", }, }, { @@ -1135,8 +1140,8 @@ func TestPrometheusK8sRemoteWriteURLs(t *testing.T) { config: func() *Config { c := NewDefaultConfig() c.SetRemoteWrite(true) - c.ClusterMonitoringConfiguration.TelemeterClientConfig.ClusterID = "123" - c.ClusterMonitoringConfiguration.TelemeterClientConfig.Token = "secret" + c.ClusterMonitoringConfiguration.TelemetryConfig.ClusterID = "123" + c.ClusterMonitoringConfiguration.TelemetryConfig.Token = "secret" return c }, @@ -1153,8 +1158,8 @@ func 
TestPrometheusK8sRemoteWriteURLs(t *testing.T) { c := NewDefaultConfig() c.SetRemoteWrite(true) c.ClusterMonitoringConfiguration.PrometheusK8sConfig.RemoteWrite = []RemoteWriteSpec{{URL: "http://custom"}} - c.ClusterMonitoringConfiguration.TelemeterClientConfig.ClusterID = "123" - c.ClusterMonitoringConfiguration.TelemeterClientConfig.Token = "secret" + c.ClusterMonitoringConfiguration.TelemetryConfig.ClusterID = "123" + c.ClusterMonitoringConfiguration.TelemetryConfig.Token = "secret" return c }, @@ -1171,10 +1176,10 @@ func TestPrometheusK8sRemoteWriteURLs(t *testing.T) { config: func() *Config { c := NewDefaultConfig() c.SetRemoteWrite(true) - c.ClusterMonitoringConfiguration.TelemeterClientConfig.TelemeterServerURL = "http://custom-telemeter" + c.ClusterMonitoringConfiguration.TelemetryConfig.TelemeterServerURL = "http://custom-telemeter" c.ClusterMonitoringConfiguration.PrometheusK8sConfig.RemoteWrite = []RemoteWriteSpec{{URL: "http://custom-remote-write"}} - c.ClusterMonitoringConfiguration.TelemeterClientConfig.ClusterID = "123" - c.ClusterMonitoringConfiguration.TelemeterClientConfig.Token = "secret" + c.ClusterMonitoringConfiguration.TelemetryConfig.ClusterID = "123" + c.ClusterMonitoringConfiguration.TelemetryConfig.Token = "secret" return c }, @@ -1185,6 +1190,28 @@ func TestPrometheusK8sRemoteWriteURLs(t *testing.T) { "http://custom-telemeter", }, }, + { + name: "remote write telemetry with custom url via deprecated telemeterClientConfig", + + config: func() *Config { + c, err := NewConfigFromString(` +telemeterClient: + telemeterServerURL: http://custom-telemeter + clusterID: "123" + token: secret +`, false) + if err != nil { + t.Fatal(err) + } + c.SetRemoteWrite(true) + return c + }, + telemetrySecret: telemetrySecret, + + expectedRemoteWriteURLs: []string{ + "http://custom-telemeter", + }, + }, } { t.Run(tc.name, func(t *testing.T) { c := tc.config() @@ -4089,85 +4116,6 @@ grpc: } } -func TestTelemeterConfiguration(t *testing.T) { - config := 
`telemeterClient: - resources: - requests: - cpu: 100m - memory: 100Mi - limits: - cpu: 200m - memory: 200Mi - topologySpreadConstraints: - - maxSkew: 1 - topologyKey: type - whenUnsatisfiable: DoNotSchedule - labelSelector: - matchLabels: - foo: bar` - - c, err := NewConfigFromString(config, false) - if err != nil { - t.Fatal(err) - } - f := NewFactory("openshift-monitoring", "openshift-user-workload-monitoring", c, defaultInfrastructureReader(), &fakeProxyReader{}, NewAssets(assetsPath), &APIServerConfig{}, &configv1.Console{}) - d, err := f.TelemeterClientDeployment(&v1.ConfigMap{ObjectMeta: metav1.ObjectMeta{Name: "foo"}}, &v1.Secret{Data: map[string][]byte{"token": []byte("test")}}) - if err != nil { - t.Fatal(err) - } - - kubeRbacProxyTLSCipherSuitesArg := "" - kubeRbacProxyMinTLSVersionArg := "" - for _, container := range d.Spec.Template.Spec.Containers { - switch container.Name { - case "telemeter-client": - volumeName := "telemeter-trusted-ca-bundle" - if !volumeConfigured(d.Spec.Template.Spec.Volumes, volumeName) { - t.Fatalf("trusted CA bundle volume for %s is not configured correctly", container.Name) - } - if !volumeMountsConfigured(container.VolumeMounts, volumeName) { - t.Fatalf("trusted CA bundle volume mount for %s is not configured correctly", container.Name) - } - if !reflect.DeepEqual(container.Resources, *f.config.ClusterMonitoringConfiguration.TelemeterClientConfig.Resources) { - t.Fatal("telemeter-client resources not configured correctly") - } - case "kube-rbac-proxy": - kubeRbacProxyTLSCipherSuitesArg = getContainerArgValue(d.Spec.Template.Spec.Containers, KubeRbacProxyTLSCipherSuitesFlag, container.Name) - kubeRbacProxyMinTLSVersionArg = getContainerArgValue(d.Spec.Template.Spec.Containers, KubeRbacProxyMinTLSVersionFlag, container.Name) - } - } - - expectedTokenHash := "8o29vfqfspfr9" - - if tokenHash, ok := d.Spec.Template.Annotations["telemeter-token-hash"]; !ok { - t.Fatalf("telemeter-token-hash annotation not set in telemeter-client 
deployment") - } else if expectedTokenHash != tokenHash { - t.Fatalf("incorrect token hash on telemeter-token-hash annotation, \n got %s, \nwant %s", tokenHash, expectedTokenHash) - } - - expectedKubeRbacProxyTLSCipherSuitesArg := fmt.Sprintf("%s%s", - KubeRbacProxyTLSCipherSuitesFlag, - strings.Join(crypto.OpenSSLToIANACipherSuites(APIServerDefaultTLSCiphers), ",")) - - if expectedKubeRbacProxyTLSCipherSuitesArg != kubeRbacProxyTLSCipherSuitesArg { - t.Fatalf("incorrect TLS ciphers, \n got %s, \nwant %s", kubeRbacProxyTLSCipherSuitesArg, expectedKubeRbacProxyTLSCipherSuitesArg) - } - - expectedKubeRbacProxyMinTLSVersionArg := fmt.Sprintf("%s%s", - KubeRbacProxyMinTLSVersionFlag, APIServerDefaultMinTLSVersion) - if expectedKubeRbacProxyMinTLSVersionArg != kubeRbacProxyMinTLSVersionArg { - t.Fatalf("incorrect TLS version \n got %s, \nwant %s", kubeRbacProxyMinTLSVersionArg, expectedKubeRbacProxyMinTLSVersionArg) - } - - if d.Spec.Template.Spec.TopologySpreadConstraints[0].MaxSkew != 1 { - t.Fatal("Telemeter topology spread constraints MaxSkew not configured correctly") - } - - if d.Spec.Template.Spec.TopologySpreadConstraints[0].WhenUnsatisfiable != "DoNotSchedule" { - t.Fatal("Telemeter topology spread constraints WhenUnsatisfiable not configured correctly") - } -} - func TestTelemeterClientSecret(t *testing.T) { for _, tc := range []struct { name string diff --git a/pkg/manifests/types.go b/pkg/manifests/types.go index 287d1bc61e..019ead85c3 100644 --- a/pkg/manifests/types.go +++ b/pkg/manifests/types.go @@ -58,7 +58,9 @@ type ClusterMonitoringConfiguration struct { PrometheusOperatorAdmissionWebhookConfig *PrometheusOperatorAdmissionWebhookConfig `json:"prometheusOperatorAdmissionWebhook,omitempty"` // `OpenShiftMetricsConfig` defines settings for the `openshift-state-metrics` agent. 
OpenShiftMetricsConfig *OpenShiftStateMetricsConfig `json:"openshiftStateMetrics,omitempty"` - // `TelemeterClientConfig` defines settings for the Telemeter Client + // TelemetryConfig defines settings for telemetry reporting. + TelemetryConfig *TelemetryConfig `json:"telemetryConfig,omitempty"` + // OmitFromDoc: `TelemeterClientConfig` defines settings for the Telemeter Client // component. TelemeterClientConfig *TelemeterClientConfig `json:"telemeterClient,omitempty"` // `ThanosQuerierConfig` defines settings for the Thanos Querier component. @@ -279,6 +281,19 @@ type OpenShiftStateMetricsConfig struct { TopologySpreadConstraints []v1.TopologySpreadConstraint `json:"topologySpreadConstraints,omitempty"` } +// `TelemetryConfig` defines settings for telemetry reporting. It +// supersedes the deprecated `TelemeterClientConfig`. +type TelemetryConfig struct { + // OmitFromDoc + ClusterID string `json:"clusterID,omitempty"` + // OmitFromDoc + Enabled *bool `json:"enabled,omitempty"` + // OmitFromDoc + TelemeterServerURL string `json:"telemeterServerURL,omitempty"` + // OmitFromDoc + Token string `json:"token,omitempty"` +} + // `TelemeterClientConfig` defines settings for the Telemeter Client // component. 
type TelemeterClientConfig struct { diff --git a/pkg/operator/operator.go b/pkg/operator/operator.go index 91f8e6ecc7..d38aa4fc95 100644 --- a/pkg/operator/operator.go +++ b/pkg/operator/operator.go @@ -161,7 +161,6 @@ const ( // see https://github.com/kubernetes/apiserver/blob/b571c70e6e823fd78910c3f5b9be895a756f4cbb/pkg/server/options/authentication.go#L239 apiAuthenticationConfigMap = "kube-system/extension-apiserver-authentication" kubeletServingCAConfigMap = "openshift-config-managed/kubelet-serving-ca" - telemeterCABundleConfigMap = "openshift-monitoring/telemeter-trusted-ca-bundle" alertmanagerCABundleConfigMap = "openshift-monitoring/alertmanager-trusted-ca-bundle" grpcTLS = "openshift-monitoring/grpc-tls" metricsClientCerts = "openshift-monitoring/metrics-client-certs" @@ -480,30 +479,6 @@ func New( return nil, fmt.Errorf("failed to create client certificate controller: %w", err) } - // csrFederateController runs a controller that requests a client TLS - // certificate for the telemeter client. This certificate is used to - // authenticate against the Prometheus /federate API endpoint. 
- csrFederateController, err := csr.NewClientCertificateController( - csr.ClientCertOption{ - SecretNamespace: "openshift-monitoring", - SecretName: "federate-client-certs", - AdditionalAnnotations: certrotation.AdditionalAnnotations{ - JiraComponent: "Monitoring", - }, - }, - csrOption, - kubeInformersOperatorNS.Certificates().V1().CertificateSigningRequests(), - o.client.KubernetesInterface().CertificatesV1().CertificateSigningRequests(), - kubeInformersOperatorNS.Core().V1().Secrets(), - o.client.KubernetesInterface().CoreV1(), - o.client.EventRecorder(), - "OpenShiftMonitoringTelemeterClientCertRequester", - ) - - if err != nil { - return nil, fmt.Errorf("failed to create federate certificate controller: %w", err) - } - csrMetricsServerController, err := csr.NewClientCertificateController( csr.ClientCertOption{ SecretNamespace: "openshift-monitoring", @@ -527,7 +502,6 @@ func New( o.controllersToRunFunc = append( o.controllersToRunFunc, - csrFederateController.Run, csrController.Run, csrMetricsServerController.Run, o.ruleController.Run, @@ -664,7 +638,6 @@ func (o *Operator) handleEvent(obj interface{}) { case apiAuthenticationConfigMap: case kubeletServingCAConfigMap: case metricsServerClientCerts: - case telemeterCABundleConfigMap: case alertmanagerCABundleConfigMap: case grpcTLS: case metricsClientCerts: @@ -1008,7 +981,7 @@ func (o *Operator) Config(ctx context.Context, key string) (*manifests.Config, e } // Only fetch the token and cluster ID if they have not been specified in the config. 
- if c.ClusterMonitoringConfiguration.TelemeterClientConfig.ClusterID == "" || c.ClusterMonitoringConfiguration.TelemeterClientConfig.Token == "" { + if c.ClusterMonitoringConfiguration.TelemetryConfig.ClusterID == "" || c.ClusterMonitoringConfiguration.TelemetryConfig.Token == "" { err := c.LoadClusterID(func() (*configv1.ClusterVersion, error) { return o.client.GetClusterVersion(ctx, "version") }) diff --git a/pkg/tasks/clustermonitoringoperator.go b/pkg/tasks/clustermonitoringoperator.go index 843f15ad59..9fb51e443f 100644 --- a/pkg/tasks/clustermonitoringoperator.go +++ b/pkg/tasks/clustermonitoringoperator.go @@ -140,6 +140,15 @@ func (t *ClusterMonitoringOperatorTask) Run(ctx context.Context) error { return fmt.Errorf("reconciling cluster-monitoring-operator rules PrometheusRule failed: %w", err) } + trr, err := t.factory.TelemetryRecordingRulesPrometheusRule() + if err != nil { + return fmt.Errorf("initializing telemetry recording rules PrometheusRule failed: %w", err) + } + err = t.client.CreateOrUpdatePrometheusRule(ctx, trr) + if err != nil { + return fmt.Errorf("reconciling telemetry recording rules PrometheusRule failed: %w", err) + } + smcmo, err := t.factory.ClusterMonitoringOperatorServiceMonitor() if err != nil { return fmt.Errorf("initializing Cluster Monitoring Operator ServiceMonitor failed: %w", err) diff --git a/pkg/tasks/prometheus.go b/pkg/tasks/prometheus.go index ba67e70641..4978cf02d3 100644 --- a/pkg/tasks/prometheus.go +++ b/pkg/tasks/prometheus.go @@ -346,16 +346,11 @@ func (t *PrometheusTask) create(ctx context.Context) error { return fmt.Errorf("initializing Prometheus telemetry secret failed: %w", err) } - if t.config.ClusterMonitoringConfiguration.TelemeterClientConfig.IsEnabled() && t.config.RemoteWrite { + if t.config.ClusterMonitoringConfiguration.TelemetryConfig.IsEnabled() { klog.V(4).Info("updating Prometheus telemetry secret") if err = t.client.CreateOrUpdateSecret(ctx, telemetrySecret); err != nil { return 
fmt.Errorf("reconciling Prometheus telemetry secret failed: %w", err) } - } else { - klog.V(4).Info("deleting Prometheus telemetry secret") - if err = t.client.DeleteSecret(ctx, telemetrySecret); err != nil { - return fmt.Errorf("deleting Prometheus telemetry secret failed: %w", err) - } } { diff --git a/pkg/tasks/telemeter.go b/pkg/tasks/telemeter.go index 65c5cb539e..77303eb0b8 100644 --- a/pkg/tasks/telemeter.go +++ b/pkg/tasks/telemeter.go @@ -18,7 +18,6 @@ import ( "context" "fmt" - apierrors "k8s.io/apimachinery/pkg/api/errors" "k8s.io/klog/v2" "github.com/openshift/cluster-monitoring-operator/pkg/client" @@ -40,163 +39,8 @@ func NewTelemeterClientTask(client *client.Client, factory *manifests.Factory, c } func (t *TelemeterClientTask) Run(ctx context.Context) error { - if t.config.ClusterMonitoringConfiguration.TelemeterClientConfig.IsEnabled() && !t.config.RemoteWrite { - return t.create(ctx) - } - - var reason string - switch { - case !t.config.ClusterMonitoringConfiguration.TelemeterClientConfig.IsEnabled(): - reason = "telemetry is explicitly disabled" - case t.config.ClusterMonitoringConfiguration.TelemeterClientConfig.IsEnabled() && t.config.RemoteWrite: - reason = "remote-write is enabled instead" - } - - if reason != "" { - klog.V(3).Infof("Telemeter client is disabled (because %s), existing related resources are to be destroyed.", reason) - return t.destroy(ctx) - } - - return nil -} - -func (t *TelemeterClientTask) create(ctx context.Context) error { - cacm, err := t.factory.TelemeterClientServingCertsCABundle() - if err != nil { - return fmt.Errorf("initializing Telemeter Client serving certs CA Bundle ConfigMap failed: %w", err) - } - - _, err = t.client.CreateIfNotExistConfigMap(ctx, cacm) - if err != nil { - return fmt.Errorf("creating Telemeter Client serving certs CA Bundle ConfigMap failed: %w", err) - } - - sa, err := t.factory.TelemeterClientServiceAccount() - if err != nil { - return fmt.Errorf("initializing Telemeter client Service 
failed: %w", err) - } - - err = t.client.CreateOrUpdateServiceAccount(ctx, sa) - if err != nil { - return fmt.Errorf("reconciling Telemeter client ServiceAccount failed: %w", err) - } - - cr, err := t.factory.TelemeterClientClusterRole() - if err != nil { - return fmt.Errorf("initializing Telemeter client ClusterRole failed: %w", err) - } - - err = t.client.CreateOrUpdateClusterRole(ctx, cr) - if err != nil { - return fmt.Errorf("reconciling Telemeter client ClusterRole failed: %w", err) - } - - crb, err := t.factory.TelemeterClientClusterRoleBinding() - if err != nil { - return fmt.Errorf("initializing Telemeter client ClusterRoleBinding failed: %w", err) - } - - err = t.client.CreateOrUpdateClusterRoleBinding(ctx, crb) - if err != nil { - return fmt.Errorf("reconciling Telemeter client ClusterRoleBinding failed: %w", err) - } - - crb, err = t.factory.TelemeterClientClusterRoleBindingView() - if err != nil { - return fmt.Errorf("initializing Telemeter client cluster monitoring view ClusterRoleBinding failed: %w", err) - } - - err = t.client.CreateOrUpdateClusterRoleBinding(ctx, crb) - if err != nil { - return fmt.Errorf("reconciling Telemeter client cluster monitoring view ClusterRoleBinding failed: %w", err) - } - - svc, err := t.factory.TelemeterClientService() - if err != nil { - return fmt.Errorf("initializing Telemeter client Service failed: %w", err) - } - - err = t.client.CreateOrUpdateService(ctx, svc) - if err != nil { - return fmt.Errorf("reconciling Telemeter client Service failed: %w", err) - } - - s, err := t.factory.TelemeterClientSecret() - if err != nil { - return fmt.Errorf("initializing Telemeter client Secret failed: %w", err) - } - - oldS, err := t.client.GetSecret(ctx, s.Namespace, s.Name) - if err != nil && !apierrors.IsNotFound(err) { - return fmt.Errorf("getting Telemeter Client Secret failed: %w", err) - } - if oldS != nil && string(oldS.Data["token"]) == t.config.ClusterMonitoringConfiguration.TelemeterClientConfig.Token { - s.Data = 
oldS.Data - } - - err = t.client.CreateOrUpdateSecret(ctx, s) - if err != nil { - return fmt.Errorf("reconciling Telemeter client Secret failed: %w", err) - } - - krs, err := t.factory.TelemeterClientKubeRbacProxySecret() - if err != nil { - return fmt.Errorf("initializing Telemeter client kube rbac proxy secret failed: %w", err) - } - - err = t.client.CreateOrUpdateSecret(ctx, krs) - if err != nil { - return fmt.Errorf("reconciling Telemeter client kube rbac proxy secret failed: %w", err) - } - - { - // Create trusted CA bundle ConfigMap. - trustedCA, err := t.factory.TelemeterTrustedCABundle() - if err != nil { - return fmt.Errorf("initializing Telemeter client trusted CA bundle ConfigMap failed: %w", err) - } - - cbs := &caBundleSyncer{ - client: t.client, - factory: t.factory, - prefix: "telemeter", - } - trustedCA, err = cbs.syncTrustedCABundle(ctx, trustedCA) - if err != nil { - return fmt.Errorf("syncing Telemeter client CA bundle ConfigMap failed: %w", err) - } - - dep, err := t.factory.TelemeterClientDeployment(trustedCA, s) - if err != nil { - return fmt.Errorf("initializing Telemeter client Deployment failed: %w", err) - } - - err = t.client.CreateOrUpdateDeployment(ctx, dep) - if err != nil { - return fmt.Errorf("reconciling Telemeter client Deployment failed: %w", err) - } - } - - rule, err := t.factory.TelemeterClientPrometheusRule() - if err != nil { - return fmt.Errorf("initializing Telemeter client Prometheus Rule failed: %w", err) - } - - err = t.client.CreateOrUpdatePrometheusRule(ctx, rule) - if err != nil { - return fmt.Errorf("reconciling Telemeter client Prometheus Rule failed: %w", err) - } - - sm, err := t.factory.TelemeterClientServiceMonitor() - if err != nil { - return fmt.Errorf("initializing Telemeter client ServiceMonitor failed: %w", err) - } - - err = t.client.CreateOrUpdateServiceMonitor(ctx, sm) - if err != nil { - return fmt.Errorf("reconciling Telemeter client ServiceMonitor failed: %w", err) - } - return nil + 
klog.V(3).Infof("Telemeter client is deprecated and no longer used, existing related resources are to be destroyed.") + return t.destroy(ctx) } func (t *TelemeterClientTask) destroy(ctx context.Context) error { diff --git a/test/e2e/config_test.go b/test/e2e/config_test.go index 171ba853ed..d583355f45 100644 --- a/test/e2e/config_test.go +++ b/test/e2e/config_test.go @@ -427,83 +427,6 @@ func TestClusterMonitorOSMConfig(t *testing.T) { } } -func TestClusterMonitorTelemeterClientConfig(t *testing.T) { - const ( - deploymentName = "telemeter-client" - ) - - data := `telemeterClient: - tolerations: - - operator: "Exists" -` - f.MustCreateOrUpdateConfigMap(t, f.BuildCMOConfigMap(t, data)) - - for _, tc := range []scenario{ - { - name: "test the telemeter-client deployment is rolled out", - assertion: f.AssertDeploymentExistsAndRollout(deploymentName, f.Ns), - }, - { - name: "assert pod configuration is as expected", - assertion: f.AssertPodConfiguration( - f.Ns, - "app.kubernetes.io/name=telemeter-client", - []framework.PodAssertion{ - expectCatchAllToleration(), - }, - ), - }, - } { - if ok := t.Run(tc.name, tc.assertion); !ok { - t.Fatalf("scenario %q failed", tc.name) - } - } -} - -func TestTelemeterClientSecret(t *testing.T) { - for _, tc := range []struct { - name string - oldC string - newC string - tokenChanged bool - }{ - { - name: "Existing Secret", - oldC: `telemeterClient: - token: mySecretToken -`, - newC: `telemeterClient: - token: mySecretToken -`, - tokenChanged: false, - }, - { - name: "Existing Secret, new token", - oldC: `telemeterClient: - token: mySecretToken -`, - newC: `telemeterClient: - token: myNewSecretToken -`, - tokenChanged: true, - }, - } { - - t.Run(tc.name, func(t *testing.T) { - f.MustCreateOrUpdateConfigMap(t, f.BuildCMOConfigMap(t, tc.oldC)) - oldS := f.MustGetSecret(t, "telemeter-client", f.Ns) - f.MustCreateOrUpdateConfigMap(t, f.BuildCMOConfigMap(t, tc.newC)) - if tc.tokenChanged { - f.AssertValueInSecretNotEquals(oldS.GetName(), 
oldS.GetNamespace(), "token", string(oldS.Data["token"])) - f.AssertValueInSecretNotEquals(oldS.GetName(), oldS.GetNamespace(), "salt", string(oldS.Data["salt"])) - return - } - f.AssertValueInSecretEquals(oldS.GetName(), oldS.GetNamespace(), "token", string(oldS.Data["token"])) - f.AssertValueInSecretEquals(oldS.GetName(), oldS.GetNamespace(), "salt", string(oldS.Data["salt"])) - }) - } -} - func TestClusterMonitorThanosQuerierConfig(t *testing.T) { const ( deploymentName = "thanos-querier" diff --git a/test/e2e/main_test.go b/test/e2e/main_test.go index 2aebbb254c..e449ed8dbf 100644 --- a/test/e2e/main_test.go +++ b/test/e2e/main_test.go @@ -165,7 +165,6 @@ func testTargetsUp(t *testing.T) { "alertmanager-main", "cluster-monitoring-operator", "openshift-state-metrics", - "telemeter-client", "thanos-querier", } diff --git a/test/e2e/prometheus_test.go b/test/e2e/prometheus_test.go index f88ef0e460..fee8af262a 100644 --- a/test/e2e/prometheus_test.go +++ b/test/e2e/prometheus_test.go @@ -39,7 +39,7 @@ func TestPrometheusMetrics(t *testing.T) { "alertmanager-main": 2, "kube-state-metrics": 2, // one for the kube metrics + one for the metrics of the process itself. "openshift-state-metrics": 2, // ditto. - "telemeter-client": 1, + "telemeter-client": 0, } for service, metric := range expected { diff --git a/test/e2e/telemeter_test.go b/test/e2e/telemeter_test.go deleted file mode 100644 index 72c3de388f..0000000000 --- a/test/e2e/telemeter_test.go +++ /dev/null @@ -1,118 +0,0 @@ -// Copyright 2019 The Cluster Monitoring Operator Authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package e2e - -import ( - "context" - "errors" - "fmt" - "testing" - "time" - - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/types" -) - -// TestTelemeterRemoteWrite verifies that the monitoring stack can send data to -// the telemeter server using the native Prometheus remote write endpoint. -func TestTelemeterRemoteWrite(t *testing.T) { - cm := f.BuildCMOConfigMap(t, "{}") - f.MustCreateOrUpdateConfigMap(t, cm) - - t.Cleanup(func() { - f.MustDeleteConfigMap(t, cm) - }) - - // Put CMO deployment into unmanaged state and enable telemetry via remote-write manually. 
- ctx := context.Background() - patch := []byte(`{ - "spec": { - "overrides": [{ - "group": "apps", - "kind": "Deployment", - "name": "cluster-monitoring-operator", - "namespace": "openshift-monitoring", - "unmanaged": true - }] - } -}`) - _, err := f.OpenShiftConfigClient.ConfigV1().ClusterVersions().Patch(ctx, "version", types.MergePatchType, patch, metav1.PatchOptions{}) - if err != nil { - t.Fatal(err) - } - - t.Cleanup(func() { - patch := []byte(`{"spec": {"overrides": []}}`) - _, _ = f.OpenShiftConfigClient.ConfigV1().ClusterVersions().Patch(ctx, "version", types.MergePatchType, patch, metav1.PatchOptions{}) - }) - - dep, err := f.KubeClient.AppsV1().Deployments(f.Ns).Get(ctx, "cluster-monitoring-operator", metav1.GetOptions{}) - if err != nil { - t.Fatal(err) - } - - for i, c := range dep.Spec.Template.Spec.Containers { - if c.Name != "cluster-monitoring-operator" { - continue - } - dep.Spec.Template.Spec.Containers[i].Args = append(dep.Spec.Template.Spec.Containers[i].Args, "-enabled-remote-write=true") - } - dep, err = f.KubeClient.AppsV1().Deployments(f.Ns).Update(ctx, dep, metav1.UpdateOptions{}) - if err != nil { - t.Fatal(err) - } - - // Check that Prometheus sends samples to Telemeter. - f.PrometheusK8sClient.WaitForQueryReturn( - t, - 5*time.Minute, - `min without(pod,instance) (rate(prometheus_remote_storage_samples_total{job="prometheus-k8s",url=~"https://infogw.api.openshift.com.+"}[5m]))`, - func(v float64) error { - if v == 0 { - return errors.New("expecting samples to be sent via Prometheus remote write but got none") - } - return nil - }, - ) -} - -// TestTelemeterClient verifies that the telemeter client can collect metrics from the monitoring stack and forward them to the telemeter server. 
-func TestTelemeterClient(t *testing.T) { - { - f.PrometheusK8sClient.WaitForQueryReturn( - t, - 5*time.Minute, - `metricsclient_request_send{client="federate_to",job="telemeter-client",status_code="200"}`, - func(v float64) error { - if v == 0 { - return fmt.Errorf("expecting metricsclient request send more than 0 but got none") - } - return nil - }, - ) - - f.PrometheusK8sClient.WaitForQueryReturn( - t, - 5*time.Minute, - `federate_samples{job="telemeter-client"}`, - func(v float64) error { - if v < 10 { - return fmt.Errorf("expecting federate samples from telemeter client greater than or equal to 10 but got %f", v) - } - return nil - }, - ) - } -} diff --git a/test/e2e/tls_security_profile_test.go b/test/e2e/tls_security_profile_test.go index e3788e408b..b37a426ee0 100644 --- a/test/e2e/tls_security_profile_test.go +++ b/test/e2e/tls_security_profile_test.go @@ -55,9 +55,6 @@ func TestDefaultTLSSecurityProfileConfiguration(t *testing.T) { assertCorrectTLSConfiguration(t, "node-exporter", "daemonset", manifests.KubeRbacProxyTLSCipherSuitesFlag, manifests.KubeRbacProxyMinTLSVersionFlag, configv1.TLSProfiles[configv1.TLSProfileIntermediateType].Ciphers, "VersionTLS12") - assertCorrectTLSConfiguration(t, "telemeter-client", "deployment", - manifests.KubeRbacProxyTLSCipherSuitesFlag, - manifests.KubeRbacProxyMinTLSVersionFlag, configv1.TLSProfiles[configv1.TLSProfileIntermediateType].Ciphers, "VersionTLS12") assertCorrectTLSConfiguration(t, "thanos-querier", "deployment", manifests.KubeRbacProxyTLSCipherSuitesFlag, manifests.KubeRbacProxyMinTLSVersionFlag, configv1.TLSProfiles[configv1.TLSProfileIntermediateType].Ciphers, "VersionTLS12")