Skip to content

Commit 2d1a5d9

Browse files
authored
o11y: Split MonitoringStack by environment (#7509)
* o11y: Split MonitoringStack by environment. This is an attempt to speed up the pipeline whenever changes to this MonitoringStack definition are made. This is especially crucial if we need to revert a breaking change to this definition. Additionally, this allows us to test changes to this definition in dev or stage first before applying them to production. * Share MonitoringStack definition for staging and development * Move common resources for staging and env into their own dir * Use kustomize patches for env-specific configuration * Use safer defaults
1 parent c88a0c2 commit 2d1a5d9

File tree

9 files changed

+438
-208
lines changed

9 files changed

+438
-208
lines changed

components/monitoring/prometheus/base/monitoringstack/monitoringstack.yaml

Lines changed: 2 additions & 206 deletions
Large diffs are not rendered by default.

components/monitoring/prometheus/development/monitoringstack/cluster-type-patch.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
---
2-
- op: add
2+
- op: replace
33
path: /spec/endpoints/0/relabelings/0
44
value:
55
targetLabel: source_environment

components/monitoring/prometheus/development/monitoringstack/kustomization.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@ apiVersion: kustomize.config.k8s.io/v1beta1
22
kind: Kustomization
33
resources:
44
- ../../base/observability-operator
5-
- ../../base/monitoringstack
5+
- ../../staging/base/monitoringstack
66
patches:
77
- path: cluster-type-patch.yaml
88
target:
Lines changed: 191 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,191 @@
1+
---
2+
- op: replace
3+
path: /spec/endpoints/0/params
4+
value:
5+
'match[]': # scrape only required metrics from in-cluster prometheus
6+
- '{__name__="pipeline_service_schedule_overhead_percentage_sum"}'
7+
- '{__name__="pipeline_service_schedule_overhead_percentage_count"}'
8+
- '{__name__="pipeline_service_execution_overhead_percentage_sum"}'
9+
- '{__name__="pipeline_service_execution_overhead_percentage_count"}'
10+
- '{__name__="pipelinerun_duration_scheduled_seconds_sum"}'
11+
- '{__name__="pipelinerun_duration_scheduled_seconds_count"}'
12+
- '{__name__="pipelinerun_gap_between_taskruns_milliseconds_sum"}'
13+
- '{__name__="pipelinerun_gap_between_taskruns_milliseconds_count"}'
14+
- '{__name__="pipelinerun_kickoff_not_attempted_count"}'
15+
- '{__name__="pending_resolutionrequest_count"}'
16+
- '{__name__="taskrun_pod_create_not_attempted_or_pending_count"}'
17+
- '{__name__="tekton_pipelines_controller_pipelinerun_count"}'
18+
- '{__name__="tekton_pipelines_controller_running_pipelineruns_count"}'
19+
- '{__name__="tekton_pipelines_controller_running_taskruns_throttled_by_quota_count"}'
20+
- '{__name__="tekton_pipelines_controller_running_taskruns_throttled_by_node_count"}'
21+
- '{__name__="tekton_pipelines_controller_running_taskruns_throttled_by_quota"}'
22+
- '{__name__="tekton_pipelines_controller_running_taskruns_throttled_by_node"}'
23+
- '{__name__="tekton_pipelines_controller_pipelinerun_duration_seconds_sum"}'
24+
- '{__name__="tekton_pipelines_controller_pipelinerun_duration_seconds_count"}'
25+
- '{__name__="watcher_workqueue_depth"}'
26+
- '{__name__="watcher_client_latency_bucket"}'
27+
- '{__name__="pac_watcher_work_queue_depth"}'
28+
- '{__name__="pac_watcher_client_latency_bucket"}'
29+
- '{__name__="grpc_server_handled_total", namespace=~"tekton-results|openshift-pipelines"}'
30+
- '{__name__="grpc_server_handled_total", namespace=~"openshift-etcd"}'
31+
- '{__name__="grpc_server_handling_seconds_bucket", namespace=~"tekton-results|openshift-pipelines"}'
32+
- '{__name__="grpc_server_handling_seconds_bucket", namespace="openshift-etcd"}'
33+
- '{__name__="grpc_server_msg_received_total", namespace="openshift-etcd"}'
34+
- '{__name__="controller_runtime_reconcile_errors_total", namespace!~".*-tenant|openshift-.*|kube-.*"}'
35+
- '{__name__="controller_runtime_reconcile_total", namespace!~".*-tenant|openshift-.*|kube-.*"}'
36+
- '{__name__="kube_lease_owner", namespace="openshift-pipelines", lease=~"controller.tektonresolverframework.bundleresolver..*"}'
37+
- '{__name__="kube_lease_owner", namespace="openshift-pipelines", lease=~"tekton-pipelines-controller.github.com.tektoncd.pipeline.pkg.reconciler..*"}'
38+
- '{__name__="kube_pod_status_unschedulable", namespace!~".*-tenant|openshift-.*|kube-.*"}'
39+
- '{__name__="kube_pod_container_status_restarts_total", namespace=~"openshift-pipelines|release-service"}'
40+
- '{__name__="kube_pod_container_status_waiting_reason", namespace!~".*-tenant|openshift-.*|kube-.*"}'
41+
- '{__name__="kube_pod_status_phase", namespace!~".*-tenant|openshift-.*|kube-.*"}'
42+
- '{__name__="kube_pod_container_resource_limits", namespace="release-service"}'
43+
- '{__name__="kube_pod_container_status_terminated_reason", namespace="release-service"}'
44+
- '{__name__="kube_pod_container_status_last_terminated_reason", namespace="release-service"}'
45+
- '{__name__="kube_pod_container_status_ready", namespace="release-service"}'
46+
- '{__name__="kube_persistentvolume_status_phase", namespace!~".*-tenant|openshift-.*|kube-.*"}'
47+
- '{__name__="kube_resourcequota", namespace!~".*-tenant|openshift-.*|kube-.*"}'
48+
- '{__name__="kube_statefulset_status_replicas_ready", namespace="gitops-service-argocd"}'
49+
- '{__name__="kube_statefulset_replicas", namespace="gitops-service-argocd"}'
50+
- '{__name__="openshift_route_status", namespace="gitops-service-argocd"}'
51+
52+
- '{__name__="kube_deployment_status_replicas_ready", namespace="gitops-service-argocd"}'
53+
- '{__name__="kube_deployment_spec_replicas", namespace=~"gitops-service-argocd"}'
54+
55+
# Namespace (expression): "build-service"
56+
- '{__name__="kube_deployment_status_replicas_ready", namespace="build-service"}'
57+
- '{__name__="kube_deployment_status_replicas_available", namespace="build-service"}'
58+
- '{__name__="kube_deployment_spec_replicas", namespace="build-service"}'
59+
60+
# Namespace (expression): "integration-service"
61+
- '{__name__="kube_deployment_status_replicas_ready", namespace="integration-service"}'
62+
- '{__name__="kube_deployment_status_replicas_available", namespace="integration-service"}'
63+
- '{__name__="kube_deployment_spec_replicas", namespace="integration-service"}'
64+
65+
# Namespace (expression): "konflux-ui"
66+
- '{__name__="kube_deployment_status_replicas_ready", namespace="konflux-ui"}'
67+
- '{__name__="kube_deployment_status_replicas_available", namespace="konflux-ui"}'
68+
- '{__name__="kube_deployment_spec_replicas", namespace="konflux-ui"}'
69+
- '{__name__="kube_running_pods_ready", namespace="konflux-ui"}'
70+
- '{__name__="kube_endpoint_address", namespace="konflux-ui"}'
71+
- '{__name__="kube_pod_container_status_restarts_total", namespace="konflux-ui"}'
72+
73+
# Namespace (expression): "mintmaker"
74+
- '{__name__="kube_deployment_status_replicas_ready", namespace="mintmaker"}'
75+
- '{__name__="kube_deployment_status_replicas_available", namespace="mintmaker"}'
76+
- '{__name__="kube_deployment_spec_replicas", namespace="mintmaker"}'
77+
- '{__name__="cluster_ram_requested_perc"}'
78+
- '{__name__="node_memory_pressured_perc"}'
79+
- '{__name__="redis_node_memory_usage_perc"}'
80+
81+
# Namespace (expression): ~".*monitoring.*"
82+
- '{__name__="kube_deployment_status_replicas_ready", namespace=~".*monitoring.*"}'
83+
- '{__name__="kube_deployment_status_replicas_available", namespace=~".*monitoring.*"}'
84+
- '{__name__="kube_deployment_spec_replicas", namespace=~".*monitoring.*"}'
85+
86+
# Namespace (expression): "multi-platform-controller"
87+
- '{__name__="kube_deployment_status_replicas_ready", namespace="multi-platform-controller"}'
88+
- '{__name__="kube_deployment_status_replicas_available", namespace="multi-platform-controller"}'
89+
- '{__name__="kube_deployment_spec_replicas", namespace="multi-platform-controller"}'
90+
91+
# Namespace (expression): "namespace-lister"
92+
- '{__name__="kube_deployment_status_replicas_ready", namespace="namespace-lister"}'
93+
- '{__name__="kube_deployment_status_replicas_available", namespace="namespace-lister"}'
94+
- '{__name__="kube_deployment_spec_replicas", namespace="namespace-lister"}'
95+
96+
# Namespace (expression): "openshift-pipelines"
97+
- '{__name__="kube_deployment_status_replicas_ready", namespace="openshift-pipelines"}'
98+
- '{__name__="kube_deployment_status_replicas_available", namespace="openshift-pipelines"}'
99+
- '{__name__="kube_deployment_spec_replicas", namespace="openshift-pipelines"}'
100+
101+
# Namespace (expression): "product-kubearchive"
102+
- '{__name__="kube_deployment_status_replicas_ready", namespace="product-kubearchive"}'
103+
- '{__name__="kube_deployment_status_replicas_available", namespace="product-kubearchive"}'
104+
- '{__name__="kube_deployment_spec_replicas", namespace="product-kubearchive"}'
105+
106+
# Namespace (expression): "project-controller"
107+
- '{__name__="kube_deployment_status_replicas_ready", namespace="project-controller"}'
108+
- '{__name__="kube_deployment_status_replicas_available", namespace="project-controller"}'
109+
- '{__name__="kube_deployment_spec_replicas", namespace="project-controller"}'
110+
111+
# Namespace (expression): "release-service"
112+
- '{__name__="kube_deployment_status_replicas_ready", namespace="release-service"}'
113+
- '{__name__="kube_deployment_status_replicas_available", namespace="release-service"}'
114+
- '{__name__="kube_deployment_spec_replicas", namespace="release-service"}'
115+
116+
# Namespace (expression): ~"smee.*"
117+
- '{__name__="kube_deployment_status_replicas_ready", namespace=~"smee.*"}'
118+
- '{__name__="kube_deployment_status_replicas_available", namespace=~"smee.*"}'
119+
- '{__name__="kube_deployment_spec_replicas", namespace=~"smee.*"}'
120+
121+
# Namespace (expression): "openshift-apiserver"
122+
- '{__name__="kube_deployment_status_replicas_ready", namespace="openshift-apiserver"}'
123+
- '{__name__="kube_deployment_status_replicas_available", namespace="openshift-apiserver"}'
124+
- '{__name__="kube_deployment_spec_replicas", namespace="openshift-apiserver"}'
125+
126+
# Namespace (expression): "openshift-oauth-apiserver"
127+
- '{__name__="kube_deployment_status_replicas_ready", namespace="openshift-oauth-apiserver"}'
128+
- '{__name__="kube_deployment_status_replicas_available", namespace="openshift-oauth-apiserver"}'
129+
- '{__name__="kube_deployment_spec_replicas", namespace="openshift-oauth-apiserver"}'
130+
131+
# Namespace (expression): "konflux-kyverno"
132+
- '{__name__="kube_deployment_status_replicas_ready", namespace="konflux-kyverno"}'
133+
- '{__name__="kube_deployment_status_replicas_available", namespace="konflux-kyverno"}'
134+
- '{__name__="kube_deployment_spec_replicas", namespace="konflux-kyverno"}'
135+
136+
# Namespace (expression): "openshift-kube-apiserver"
137+
- '{__name__="kube_deployment_status_replicas_ready", namespace="openshift-kube-apiserver"}'
138+
- '{__name__="kube_deployment_status_replicas_available", namespace="openshift-kube-apiserver"}'
139+
- '{__name__="kube_deployment_spec_replicas", namespace="openshift-kube-apiserver"}'
140+
141+
# Namespace (expression): "konflux-user-support"
142+
- '{__name__="kube_deployment_status_replicas_available", namespace="konflux-user-support"}'
143+
- '{__name__="kube_deployment_spec_replicas", namespace="konflux-user-support"}'
144+
145+
- '{__name__="argocd_app_reconcile_bucket", namespace="gitops-service-argocd"}'
146+
- '{__name__="argocd_app_info", namespace="gitops-service-argocd"}'
147+
- '{__name__="container_cpu_usage_seconds_total", namespace="release-service"}'
148+
- '{__name__="container_cpu_usage_seconds_total", namespace="openshift-etcd"}'
149+
- '{__name__="container_memory_usage_bytes", namespace="release-service"}'
150+
- '{__name__="container_memory_usage_bytes", namespace="openshift-etcd"}'
151+
- '{__name__="etcd_disk_wal_fsync_duration_seconds_bucket"}'
152+
- '{__name__="etcd_disk_backend_commit_duration_seconds_bucket"}'
153+
- '{__name__="etcd_server_proposals_failed_total"}'
154+
- '{__name__="etcd_server_leader_changes_seen_total", namespace="openshift-etcd"}'
155+
- '{__name__="etcd_server_has_leader", namespace="openshift-etcd"}'
156+
- '{__name__="etcd_server_is_leader", namespace="openshift-etcd"}'
157+
- '{__name__="etcd_server_id", namespace="openshift-etcd"}'
158+
- '{__name__="etcd_server_quota_backend_bytes", namespace="openshift-etcd"}'
159+
- '{__name__="etcd_mvcc_db_total_size_in_bytes", namespace="openshift-etcd"}'
160+
- '{__name__="etcd_server_received_total", namespace="openshift-etcd"}'
161+
- '{__name__="etcd_network_active_peers", namespace="openshift-etcd"}'
162+
- '{__name__="etcd_network_peer_round_trip_time_seconds_bucket"}'
163+
- '{__name__="etcd_disk_defrag_inflight"}'
164+
- '{__name__="kube_job_spec_completions"}'
165+
- '{__name__="kube_job_status_succeeded"}'
166+
- '{__name__="kube_job_status_failed"}'
167+
- '{__name__="node_cpu_seconds_total", mode="idle"}'
168+
- '{__name__="node_memory_MemTotal_bytes"}'
169+
- '{__name__="node_memory_MemAvailable_bytes"}'
170+
- '{__name__="platform:hypershift_hostedclusters:max"}'
171+
- '{__name__="kube_node_role"}'
172+
- '{__name__="etcd_shield_trigger"}'
173+
- '{__name__="etcd_shield_alert_triggered"}'
174+
- '{__name__="apiserver_admission_webhook_rejection_count", name="vpipelineruns.konflux-ci.dev"}'
175+
- '{__name__="apiserver_watch_events_total"}'
176+
- '{__name__="apiserver_storage_objects"}'
177+
- '{__name__="apiserver_current_inflight_requests"}'
178+
- '{__name__="resource_verb:apiserver_request_total:rate5m"}'
179+
- '{__name__="code:apiserver_request_total:rate5m"}'
180+
- '{__name__="instance:apiserver_request_total:rate5m"}'
181+
- '{__name__="prometheus_ready"}'
182+
- '{__name__="process_cpu_seconds_total", job="apiserver"}'
183+
- '{__name__="namespace:container_memory_usage_bytes:sum", namespace=~"openshift-etcd|openshift-kube-apiserver|build-service|image-controller|integration-service|konflux-ui|product-kubearchive|openshift-kueue-operator|tekton-kueue|kueue-external-admission|mintmaker|multi-platform-controller|namespace-lister|openshift-pipelines|tekton-results|project-controller|smee|smee-client"}'
184+
- '{__name__="namespace:container_cpu_usage:sum", namespace=~"openshift-etcd|openshift-kube-apiserver|build-service|image-controller|integration-service|konflux-ui|product-kubearchive|openshift-kueue-operator|tekton-kueue|kueue-external-admission|mintmaker|multi-platform-controller|namespace-lister|openshift-pipelines|tekton-results|project-controller|smee|smee-client"}'
185+
- '{__name__="node_namespace_pod:kube_pod_info:", namespace=~"openshift-etcd|openshift-kube-apiserver|build-service|image-controller|integration-service|konflux-ui|product-kubearchive|openshift-kueue-operator|tekton-kueue|kueue-external-admission|mintmaker|multi-platform-controller|namespace-lister|openshift-pipelines|tekton-results|project-controller|smee|smee-client"}'
186+
- '{__name__="kube_node_status_allocatable", resource=~"cpu|memory"}'
187+
- '{__name__="kube_node_status_condition", condition="MemoryPressure", status="true"}'
188+
- '{__name__="namespace_memory:kube_pod_container_resource_requests:sum", namespace=~"openshift-etcd|openshift-kube-apiserver|build-service|image-controller|integration-service|konflux-ui|product-kubearchive|openshift-kueue-operator|tekton-kueue|kueue-external-admission|mintmaker|multi-platform-controller|namespace-lister|openshift-pipelines|tekton-results|project-controller|smee|smee-client"}'
189+
- '{__name__="namespace_cpu:kube_pod_container_resource_requests:sum", namespace=~"openshift-etcd|openshift-kube-apiserver|build-service|image-controller|integration-service|konflux-ui|product-kubearchive|openshift-kueue-operator|tekton-kueue|kueue-external-admission|mintmaker|multi-platform-controller|namespace-lister|openshift-pipelines|tekton-results|project-controller|smee|smee-client"}'
190+
- '{__name__="namespace_memory:kube_pod_container_resource_limits:sum", namespace=~"openshift-etcd|openshift-kube-apiserver|build-service|image-controller|integration-service|konflux-ui|product-kubearchive|openshift-kueue-operator|tekton-kueue|kueue-external-admission|mintmaker|multi-platform-controller|namespace-lister|openshift-pipelines|tekton-results|project-controller|smee|smee-client"}'
191+
- '{__name__="namespace_cpu:kube_pod_container_resource_limits:sum", namespace=~"openshift-etcd|openshift-kube-apiserver|build-service|image-controller|integration-service|konflux-ui|product-kubearchive|openshift-kueue-operator|tekton-kueue|kueue-external-admission|mintmaker|multi-platform-controller|namespace-lister|openshift-pipelines|tekton-results|project-controller|smee|smee-client"}'

components/monitoring/prometheus/production/base/monitoringstack/kustomization.yaml

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,10 @@ patches:
77
target:
88
name: appstudio-federate-smon
99
kind: ServiceMonitor
10+
- path: endpoints-params.yaml
11+
target:
12+
name: appstudio-federate-smon
13+
kind: ServiceMonitor
1014
- path: cluster-type-patch.yaml
1115
target:
1216
name: appstudio-federate-uwm-smon
@@ -15,6 +19,10 @@ patches:
1519
target:
1620
name: appstudio-federate-ms
1721
kind: MonitoringStack
22+
- path: writeRelabelConfigs.yaml
23+
target:
24+
name: appstudio-federate-ms
25+
kind: MonitoringStack
1826

1927
commonAnnotations:
2028
argocd.argoproj.io/sync-options: SkipDryRunOnMissingResource=true
Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
---
2+
- op: replace
3+
path: /spec/prometheusConfig/remoteWrite/0/writeRelabelConfigs
4+
value:
5+
- action: LabelKeep
6+
regex: "__name__|source_environment|source_cluster|namespace|app|pod|container|\
7+
label_pipelines_appstudio_openshift_io_type|health_status|dest_namespace|\
8+
controller|service|reason|phase|type|resource|resourcequota|le|app|image|\
9+
commit_hash|job|operation|tokenName|rateLimited|state|persistentvolumeclaim|\
10+
storageclass|volumename|release_reason|instance|result|deployment_reason|\
11+
validation_reason|strategy|succeeded|target|name|method|code|sp|le|\
12+
unexpected_status|failure|hostname|label_app_kubernetes_io_managed_by|status|\
13+
pipeline|pipelinename|pipelinerun|schedule|check|grpc_service|grpc_code|\
14+
grpc_method|lease|lease_holder|deployment|platform|mode|cpu|role|node|kind|\
15+
verb|request_kind|tested_cluster|resource_type|exported_job|http_method|\
16+
http_route|http_status_code|gin_errors|rule_result|rule_execution_cause|\
17+
policy_name|policy_background_mode|rule_type|policy_type|policy_validation_mode|\
18+
resource_request_operation|resource_kind|policy_change_type|event_type"

0 commit comments

Comments
 (0)