
Commit 8279391

RatulDawar authored and nineinchnick committed
Add ServiceMonitor support to gateway chart
- Add ServiceMonitor template for Prometheus Operator integration
- Add serviceMonitor configuration in values.yaml:
  - enabled: false (default)
  - labels for Prometheus selector
  - scrape interval
- Add ServiceMonitor test to verify Prometheus discovers the target
- Update test.sh to install Prometheus for complete_values test
- Disable ServiceMonitor in nodeport/https tests (no Prometheus)
- Update README.md with new configuration options
1 parent 5f55347 · commit 8279391

File tree: 8 files changed, +212 -1 lines changed

charts/gateway/README.md

Lines changed: 9 additions & 0 deletions

@@ -205,6 +205,15 @@ A Helm chart for Trino Gateway
 * `strategy` - object, default: `{"rollingUpdate":{"maxSurge":"25%","maxUnavailable":"25%"},"type":"RollingUpdate"}`

   The deployment strategy to use to replace existing pods with new ones.
+* `serviceMonitor.enabled` - bool, default: `false`
+
+  Set to true to create resources for the [prometheus-operator](https://github.com/prometheus-operator/prometheus-operator).
+* `serviceMonitor.labels` - object, default: `{"prometheus":"kube-prometheus"}`
+
+  Labels for serviceMonitor, so that Prometheus can select it
+* `serviceMonitor.interval` - string, default: `"30s"`
+
+  The serviceMonitor web endpoint interval

 ----------------------------------------------
 Autogenerated from chart metadata using [helm-docs v1.14.2](https://github.com/norwoodj/helm-docs/releases/v1.14.2)
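
For reference, a user-supplied values file that turns these options on might look like the following minimal sketch (the values mirror the documented defaults, with `enabled` flipped to `true`):

    serviceMonitor:
      enabled: true
      labels:
        prometheus: kube-prometheus
      interval: "30s"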
Lines changed: 22 additions & 0 deletions

@@ -0,0 +1,22 @@
+{{- if .Values.serviceMonitor.enabled -}}
+apiVersion: monitoring.coreos.com/v1
+kind: ServiceMonitor
+metadata:
+  name: {{ include "trino-gateway.fullname" . }}
+  namespace: {{ .Release.Namespace }}
+  labels:
+    {{- include "trino-gateway.labels" . | nindent 4 }}
+    {{- if .Values.serviceMonitor.labels }}
+    {{- toYaml .Values.serviceMonitor.labels | nindent 4 }}
+    {{- end }}
+spec:
+  selector:
+    matchLabels:
+      {{- include "trino-gateway.selectorLabels" . | nindent 6 }}
+  namespaceSelector:
+    matchNames:
+      - {{ .Release.Namespace }}
+  endpoints:
+    - port: gateway
+      interval: {{ .Values.serviceMonitor.interval }}
+{{- end }}
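
For illustration, with `serviceMonitor.enabled=true` and the default values, the template above renders to roughly the following manifest. The release name `my-gateway` and namespace `default` are hypothetical placeholders, and the exact name and label set come from the chart's `trino-gateway.fullname`, `trino-gateway.labels`, and `trino-gateway.selectorLabels` helpers:

    apiVersion: monitoring.coreos.com/v1
    kind: ServiceMonitor
    metadata:
      name: my-gateway                            # from trino-gateway.fullname (hypothetical)
      namespace: default
      labels:
        app.kubernetes.io/name: trino-gateway     # sketch of the common chart labels
        app.kubernetes.io/instance: my-gateway
        prometheus: kube-prometheus               # merged in from serviceMonitor.labels
    spec:
      selector:
        matchLabels:
          app.kubernetes.io/name: trino-gateway
          app.kubernetes.io/instance: my-gateway
      namespaceSelector:
        matchNames:
          - default
      endpoints:
        - port: gateway
          interval: 30s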
Lines changed: 123 additions & 0 deletions

@@ -0,0 +1,123 @@
+{{- if .Values.serviceMonitor.enabled -}}
+apiVersion: v1
+kind: Pod
+metadata:
+  name: {{ include "trino-gateway.fullname" . }}-test-servicemonitor
+  labels:
+    {{- include "trino-gateway.labels" . | nindent 4 }}
+    app.kubernetes.io/component: test
+    test: servicemonitor
+  annotations:
+    "helm.sh/hook": test
+    "helm.sh/hook-delete-policy": hook-succeeded
+spec:
+  containers:
+    - name: service-monitor
+      image: python:3-slim
+      command: ["python", "/tests/test.py"]
+      args: ["{{ include "trino-gateway.fullname" . }}", "{{ .Values.serviceName }}"]
+      volumeMounts:
+        - name: tests
+          mountPath: /tests
+  volumes:
+    - name: tests
+      configMap:
+        name: {{ include "trino-gateway.fullname" . }}-test-servicemonitor
+  restartPolicy: Never
+---
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: {{ include "trino-gateway.fullname" . }}-test-servicemonitor
+  labels:
+    {{- include "trino-gateway.labels" . | nindent 4 }}
+    app.kubernetes.io/component: test
+    test: servicemonitor
+  annotations:
+    "helm.sh/hook": test
+    "helm.sh/hook-delete-policy": hook-succeeded
+data:
+  test.py: |
+    from urllib.request import urlopen
+    from urllib.error import URLError, HTTPError
+    import json
+    import logging
+    import sys
+    import time
+
+    logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s')
+    logger = logging.getLogger(__name__)
+
+    servicemonitor_name = sys.argv[1]
+    expected_service = sys.argv[2]
+    namespace = "{{ .Release.Namespace }}"
+    url = f"http://prometheus-operator-kube-p-prometheus:9090/api/v1/targets?scrapePool=serviceMonitor/{namespace}/{servicemonitor_name}/0&state=active"
+    all_targets_url = "http://prometheus-operator-kube-p-prometheus:9090/api/v1/targets"
+
+    max_retries = 90  # 3 minutes max (90 * 2 seconds)
+    retry_count = 0
+
+    logger.info(f"Looking for ServiceMonitor '{servicemonitor_name}' in namespace '{namespace}'")
+    logger.info(f"Expected service name: '{expected_service}'")
+
+    while retry_count < max_retries:
+        try:
+            with urlopen(url, timeout=10) as response:
+                data = json.load(response)
+        except (URLError, HTTPError) as e:
+            retry_count += 1
+            logger.warning(f"Error fetching targets (attempt {retry_count}/{max_retries}), Prometheus service might not be ready: {e}")
+            if retry_count >= max_retries:
+                logger.error(f"Failed to connect to Prometheus after {max_retries} attempts")
+                sys.exit(1)
+            time.sleep(2)  # Retry after 2 seconds
+            continue
+
+        try:
+            active_targets = data.get("data", {}).get("activeTargets", [])
+            if not active_targets:
+                retry_count += 1
+                # Log diagnostic info every 10 attempts
+                if retry_count % 10 == 0:
+                    try:
+                        with urlopen(all_targets_url, timeout=10) as all_response:
+                            all_data = json.load(all_response)
+                        all_active = all_data.get("data", {}).get("activeTargets", [])
+                        logger.info(f"Prometheus has {len(all_active)} total active targets")
+                        # Find ServiceMonitor scrape pools
+                        servicemonitor_pools = [t.get("scrapePool", "") for t in all_active if "serviceMonitor" in t.get("scrapePool", "")]
+                        if servicemonitor_pools:
+                            logger.info(f"Found ServiceMonitor scrape pools: {servicemonitor_pools[:5]}")  # Show first 5
+                    except Exception as e:
+                        logger.debug(f"Could not fetch all targets for diagnostics: {e}")
+                logger.warning(f"No active targets found (attempt {retry_count}/{max_retries}), waiting for ServiceMonitor to be discovered...")
+                if retry_count >= max_retries:
+                    logger.error(f"No active targets found after {max_retries} attempts")
+                    logger.error(f"ServiceMonitor '{servicemonitor_name}' was not discovered by Prometheus")
+                    sys.exit(1)
+                time.sleep(2)  # Retry after 2 seconds
+                continue
+            service_name = active_targets[0]["discoveredLabels"]["__meta_kubernetes_service_name"]
+        except (KeyError, IndexError) as e:
+            retry_count += 1
+            logger.warning(f"Invalid Prometheus response (attempt {retry_count}/{max_retries}): {e}")
+            if retry_count >= max_retries:
+                logger.error(f"Invalid Prometheus response after {max_retries} attempts")
+                sys.exit(1)
+            time.sleep(2)  # Retry after 2 seconds
+            continue
+
+        if service_name == expected_service:
+            logger.info(f"Found expected service '{service_name}' in Prometheus targets!")
+            sys.exit(0)
+        else:
+            retry_count += 1
+            logger.warning(f"Service name mismatch: expected '{expected_service}', got '{service_name}' (attempt {retry_count}/{max_retries})")
+            if retry_count >= max_retries:
+                logger.error(f"Service name mismatch after {max_retries} attempts")
+                sys.exit(1)
+            time.sleep(2)
+
+    logger.error(f"Test failed after {max_retries} attempts")
+    sys.exit(1)
+{{- end }}
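
For context, the `/api/v1/targets` response that test.py parses has roughly this shape (shown here in YAML form, abridged; the field names come from the Prometheus HTTP API, the concrete values are hypothetical):

    status: success
    data:
      activeTargets:
        - discoveredLabels:
            __meta_kubernetes_service_name: trino-gateway   # compared against expected_service
            # ...other __meta_kubernetes_* discovery labels...
          scrapePool: serviceMonitor/<namespace>/<servicemonitor_name>/0
          health: up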

charts/gateway/values.yaml

Lines changed: 10 additions & 0 deletions

@@ -265,3 +265,13 @@ strategy:
   rollingUpdate:
     maxSurge: 25%
     maxUnavailable: 25%
+
+serviceMonitor:
+  # -- Set to true to create resources for the
+  # [prometheus-operator](https://github.com/prometheus-operator/prometheus-operator).
+  enabled: false
+  # -- Labels for serviceMonitor, so that Prometheus can select it
+  labels:
+    prometheus: kube-prometheus
+  # -- The serviceMonitor web endpoint interval
+  interval: "30s"

tests/gateway/test-https.yaml

Lines changed: 4 additions & 0 deletions

@@ -5,6 +5,10 @@ command:
   cat /etc/certificates/tls.crt /etc/certificates/tls.key > /etc/scratch/tls.pem && \
   java -XX:MinRAMPercentage=80.0 -XX:MaxRAMPercentage=80.0 -jar /usr/lib/trino-gateway/gateway-ha-jar-with-dependencies.jar /etc/trino-gateway/config.yaml
 
+# Disable ServiceMonitor - Prometheus is only installed for complete_values test
+serviceMonitor:
+  enabled: false
+
 config:
   serverConfig:
     http-server.http.enabled: false

tests/gateway/test-nodeport.yaml

Lines changed: 4 additions & 0 deletions

@@ -6,6 +6,10 @@ config:
     http-server.https.port: 8443
     http-server.https.keystore.path: /etc/scratch/tls.pem
 
+# Disable ServiceMonitor - Prometheus is only installed for complete_values test
+serviceMonitor:
+  enabled: false
+
 service:
   type: NodePort
   ports:

tests/gateway/test-values.yaml

Lines changed: 6 additions & 0 deletions

@@ -34,3 +34,9 @@ resources:
   requests:
     cpu: 250m
     memory: 256Mi
+
+serviceMonitor:
+  enabled: true
+  labels:
+    prometheus: default
+  interval: "1s"
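
Note that `prometheus: default` here has to match the ServiceMonitor selector of the Prometheus instance that test.sh installs below; after the `--set` flags in test.sh, the relevant part of the Prometheus spec is effectively:

    serviceMonitorSelector:
      matchLabels:
        prometheus: default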

tests/gateway/test.sh

Lines changed: 34 additions & 1 deletion

@@ -31,6 +31,38 @@ DB_NAMESPACE=postgres-gateway
 kubectl create namespace "${NAMESPACE}" --dry-run=client --output yaml | kubectl apply --filename -
 kubectl create namespace "${DB_NAMESPACE}" --dry-run=client --output yaml | kubectl apply --filename -
 
+# install the Prometheus Helm chart when running the `complete_values` test
+if printf '%s\0' "${TEST_NAMES[@]}" | grep -qwz complete_values; then
+    helm repo add prometheus-community https://prometheus-community.github.io/helm-charts
+    helm upgrade --install prometheus-operator prometheus-community/kube-prometheus-stack -n "$NAMESPACE" \
+        --version "68.2.1" \
+        --set prometheus.prometheusSpec.serviceMonitorSelectorNilUsesHelmValues=false \
+        --set prometheus.prometheusSpec.serviceMonitorSelector.matchLabels.prometheus=default \
+        --set grafana.enabled=false \
+        --set alertmanager.enabled=false \
+        --set kubeApiServer.enabled=false \
+        --set kubelet.enabled=false \
+        --set kubeControllerManager.enabled=false \
+        --set coreDns.enabled=false \
+        --set kubeEtcd.enabled=false \
+        --set kubeScheduler.enabled=false \
+        --set kubeProxy.enabled=false \
+        --set kubeStateMetrics.enabled=false \
+        --set nodeExporter.enabled=false \
+        --set prometheusOperator.admissionWebhooks.enabled=false \
+        --set prometheusOperator.kubeletService.enabled=false \
+        --set prometheusOperator.tls.enabled=false \
+        --set prometheusOperator.serviceMonitor.selfMonitor=false \
+        --set prometheus.serviceMonitor.selfMonitor=false
+    kubectl rollout status --watch deployments -l release=prometheus-operator -n "$NAMESPACE"
+    # Wait for Prometheus pod to be ready and give it time to discover ServiceMonitors
+    echo 1>&2 "Waiting for Prometheus to be ready..."
+    kubectl wait --for=condition=ready pod -l app.kubernetes.io/name=prometheus -n "$NAMESPACE" --timeout=300s || true
+    # Give Prometheus Operator time to reconcile and discover ServiceMonitors
+    echo 1>&2 "Waiting for Prometheus to discover ServiceMonitors..."
+    sleep 10
+fi
+
 echo 1>&2 "Generating a self-signed TLS certificate"
 NODE_IP=$(kubectl get nodes -o json -o jsonpath='{.items[0].status.addresses[0].address}')
 openssl req -new -newkey rsa:4096 -days 365 -nodes -x509 \

@@ -121,7 +153,7 @@ for test_name in "${TEST_NAMES[@]}"; do
         echo 1>&2 "✅ Test $test_name completed"
     fi
     if [ "$CLEANUP_NAMESPACE" == "true" ]; then
-        for release in $(helm --namespace "$NAMESPACE" ls --all --short | grep -v 'prometheus-operator'); do
+        for release in $(helm --namespace "$NAMESPACE" ls --short | grep -v 'prometheus-operator'); do
            echo 1>&2 "Cleaning up Helm release $release"
            helm --namespace "$NAMESPACE" delete "$release"
        done

@@ -131,6 +163,7 @@ done
 if [ "$CLEANUP_NAMESPACE" == "true" ]; then
     helm -n "$DB_NAMESPACE" uninstall gateway-backend-db --ignore-not-found
     kubectl delete namespace "$DB_NAMESPACE" --ignore-not-found
+    helm -n "$NAMESPACE" uninstall prometheus-operator --ignore-not-found
     kubectl delete namespace "$NAMESPACE" --ignore-not-found
     mapfile -t crds < <(kubectl api-resources --api-group=monitoring.coreos.com --output name)
     if [ ${#crds[@]} -ne 0 ]; then
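
As a readability aid, the long `--set` list above is equivalent to installing the chart with a values file along these lines (a mechanical translation of the same overrides into file form, which could be passed via `-f`; the file name would be the user's choice):

    prometheus:
      prometheusSpec:
        serviceMonitorSelectorNilUsesHelmValues: false
        serviceMonitorSelector:
          matchLabels:
            prometheus: default
      serviceMonitor:
        selfMonitor: false
    grafana:
      enabled: false
    alertmanager:
      enabled: false
    kubeApiServer:
      enabled: false
    kubelet:
      enabled: false
    kubeControllerManager:
      enabled: false
    coreDns:
      enabled: false
    kubeEtcd:
      enabled: false
    kubeScheduler:
      enabled: false
    kubeProxy:
      enabled: false
    kubeStateMetrics:
      enabled: false
    nodeExporter:
      enabled: false
    prometheusOperator:
      admissionWebhooks:
        enabled: false
      kubeletService:
        enabled: false
      tls:
        enabled: false
      serviceMonitor:
        selfMonitor: false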
