Commit b095570 (parent 38a3146)

Cleanup service monitors, add new stackable-generic service monitor that works with all products.

1 file changed: stacks/monitoring/prometheus-service-monitors.yaml (+61 -98 lines)
@@ -40,10 +40,12 @@ spec:
           - airflow
           - druid
           - hive
-          - nifi # This only works for NiFi 1, NiFi 2 has a special ServiceMonitor below
+          - kafka
+          - nifi # This only works for NiFi 1, NiFi 2 works via stackable-generic
           - opa
           - superset
           - trino
+          - zookeeper
   endpoints:
     - scheme: http
       port: metrics
@@ -55,10 +57,25 @@ spec:
     - app.kubernetes.io/role-group
     - app.kubernetes.io/version
 ---
+# Utilize `prometheus.io/scheme`, `prometheus.io/port`, `prometheus.io/path` annotations set by the operators
+# to scrape all Stackable products.
+# [x] Airflow - relabel drop filter on airflow container
+# [x] Druid
+# [x] HBase
+# [x] Hadoop HDFS - relabel drop filter on empty container
+# [x] Hive
+# [~] Kafka - TODO: listener services have metrics?
+# [x] NiFi 1 + 2
+# [ ] OpenSearch
+# [x] Spark: Connect, HistoryServer
+# [x] Superset - relabel drop filter on superset container
+# [x] Trino
+# [x] ZooKeeper
+# [x] OPA
 apiVersion: monitoring.coreos.com/v1
 kind: ServiceMonitor
 metadata:
-  name: stackable-native-metrics
+  name: stackable-generic
   labels:
     stackable.tech/vendor: Stackable
     release: prometheus
@@ -69,46 +86,49 @@ spec:
     matchLabels:
       stackable.tech/vendor: Stackable
       prometheus.io/scrape: "true"
-    matchExpressions:
-      - key: app.kubernetes.io/name
-        operator: In
-        values:
-          - zookeeper
   endpoints:
-    - scheme: http
-      port: native-metrics
-      path: /metrics
-  podTargetLabels:
-    - app.kubernetes.io/name
-    - app.kubernetes.io/instance
-    - app.kubernetes.io/component
-    - app.kubernetes.io/role-group
-    - app.kubernetes.io/version
----
-# Kafka is special in that the operator totally messes up services:
-# 1. The metrics Service is missing
-# 2. The role level simple-kafka-broker-default has the prometheus.io/scrape label, but exposes no ports...
-# 3. The role level simple-kafka-broker-default is labeled with app.kubernetes.io/name: listener???
-# So we have a dedicated config for it
-apiVersion: monitoring.coreos.com/v1
-kind: ServiceMonitor
-metadata:
-  name: stackable-kafka
-  labels:
-    stackable.tech/vendor: Stackable
-    release: prometheus
-spec:
-  namespaceSelector:
-    any: true
-  selector:
-    matchLabels:
-      stackable.tech/vendor: Stackable
-      app.kubernetes.io/name: listener # Dafuq?
-      app.kubernetes.io/component: broker # We need to filter on brokers instead, as the app.kubernetes.io/name is messed up
-  endpoints:
-    - scheme: http
-      port: metrics
-      path: /metrics
+    - relabelings:
+        - sourceLabels:
+            - __meta_kubernetes_pod_container_name
+          # Pods show up twice due to multiple containers, we only keep the main / product container.
+          # Except for Airflow and Superset, where we chose the metrics container (otherwise scheduler, worker etc.
+          # which only have the metrics container are not getting picked up).
+          # - airflow: airflow
+          # - superset: superset
+          # - empty: filter when container label does not exist: hdfs
+          regex: ^(airflow|superset|)$
+          action: drop
+        - sourceLabels:
+            - __meta_kubernetes_service_annotation_prometheus_io_scheme
+          action: replace
+          targetLabel: __scheme__
+          regex: (https?)
+        - sourceLabels:
+            - __meta_kubernetes_service_annotation_prometheus_io_path
+          action: replace
+          targetLabel: __metrics_path__
+          regex: (.+)
+        - sourceLabels:
+            - __meta_kubernetes_service_name
+            - __meta_kubernetes_namespace
+            - __meta_kubernetes_service_annotation_prometheus_io_port
+          action: replace
+          targetLabel: __address__
+          regex: (.+);(.+);(\d+)
+          # TODO: We could set the cluster domain via annotation as well and pick it up here.
+          replacement: $1.$2.svc.cluster.local:$3
+      tlsConfig:
+        ca:
+          secret:
+            name: prometheus-tls-certificate
+            key: ca.crt
+        cert:
+          secret:
+            name: prometheus-tls-certificate
+            key: tls.crt
+        keySecret:
+          name: prometheus-tls-certificate
+          key: tls.key
   podTargetLabels:
     - app.kubernetes.io/name
     - app.kubernetes.io/instance
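
The stackable-generic monitor only discovers Services that carry the labels and annotations the operators set. For illustration, a minimal sketch of such a Service; the name, namespace, and port values below are hypothetical, only the labels and annotation keys come from the diff above:

apiVersion: v1
kind: Service
metadata:
  name: simple-trino-coordinator  # hypothetical name
  namespace: default              # hypothetical namespace
  labels:
    stackable.tech/vendor: Stackable  # matched by selector.matchLabels
    prometheus.io/scrape: "true"      # matched by selector.matchLabels
  annotations:
    prometheus.io/scheme: http    # relabeled into __scheme__
    prometheus.io/path: /metrics  # relabeled into __metrics_path__
    prometheus.io/port: "8081"    # hypothetical; becomes the port of __address__
spec:
  ports:
    - name: metrics
      port: 8081

With these values the address relabeling joins service name, namespace, and port annotation into `simple-trino-coordinator;default;8081`, which the regex `(.+);(.+);(\d+)` rewrites to the scrape target `simple-trino-coordinator.default.svc.cluster.local:8081`.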
@@ -219,63 +239,6 @@ spec:
     - app.kubernetes.io/role-group
     - app.kubernetes.io/version
 ---
-# NiFI 2 is a beast on it's own...
-# We need to use mTLS (otherwise we get a 401) and can not use the PodIP
-apiVersion: monitoring.coreos.com/v1
-kind: ServiceMonitor
-metadata:
-  name: stackable-nifi-2
-  labels:
-    stackable.tech/vendor: Stackable
-    release: prometheus
-spec:
-  namespaceSelector:
-    any: true
-  selector:
-    matchLabels:
-      stackable.tech/vendor: Stackable
-      prometheus.io/scrape: "true"
-    matchExpressions:
-      - key: app.kubernetes.io/name
-        operator: In
-        values:
-          - nifi
-  endpoints:
-    - scheme: https
-      port: https
-      path: /nifi-api/flow/metrics/prometheus
-      # See https://github.com/prometheus-operator/prometheus-operator/blob/main/Documentation/api-reference/api.md#monitoring.coreos.com/v1.TLSConfig
-      tlsConfig:
-        ca:
-          secret:
-            name: prometheus-tls-certificate
-            key: ca.crt
-        cert:
-          secret:
-            name: prometheus-tls-certificate
-            key: tls.crt
-        keySecret:
-          name: prometheus-tls-certificate
-          key: tls.key
-      # We need to talk to the Pod via the FQDN of the Pod because of the stupid SNI check of NiFi.
-      # We can not use the typical PodIP, as it is not contained in the NiFi certificate,
-      # see https://github.com/stackabletech/secret-operator/issues/620
-      relabelings:
-        - sourceLabels:
-            - __meta_kubernetes_pod_name
-            - __meta_kubernetes_service_name
-            - __meta_kubernetes_namespace
-            - __meta_kubernetes_pod_container_port_number
-          targetLabel: __address__
-          replacement: ${1}.${2}-headless.${3}.svc.cluster.local:${4}
-          regex: (.+);(.+?)(?:-metrics)?;(.+);(.+)
-  podTargetLabels:
-    - app.kubernetes.io/name
-    - app.kubernetes.io/instance
-    - app.kubernetes.io/component
-    - app.kubernetes.io/role-group
-    - app.kubernetes.io/version
----
 # spark-k8s-operator does not deploy any Services at all (at least for SparkApplications).
 # We currently only scrape the driver, going forward we might want to scrape the executors as well.
 # In the future we might also want to scrape SparkConnect and HistoryServers.
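
For reference, the NiFi 2 relabeling removed above assembled the Pod FQDN from discovery metadata to satisfy NiFi's SNI check. A worked trace, with hypothetical pod, service, and port values:

# Inputs (hypothetical values), joined with ";" before the regex is applied:
#   __meta_kubernetes_pod_name                  = simple-nifi-node-default-0
#   __meta_kubernetes_service_name              = simple-nifi-node-default-metrics
#   __meta_kubernetes_namespace                 = default
#   __meta_kubernetes_pod_container_port_number = 8443
# The regex (.+);(.+?)(?:-metrics)?;(.+);(.+) strips the optional -metrics
# suffix from the Service name, so the replacement produces:
#   __address__ = simple-nifi-node-default-0.simple-nifi-node-default-headless.default.svc.cluster.local:8443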
