Skip to content

Commit 1529635

Browse files
zetxqxkfswain
authored and committed
add gke monitoring helm support. (#1600)
* fix gke monitoring.
* change to namespaced resources as much as possible.
* update helm chart readme.
* resolve nits.
* move autopilot to provider.gke.
1 parent ca6aa73 commit 1529635

File tree

4 files changed

+99
-6
lines changed

4 files changed

+99
-6
lines changed

config/charts/inferencepool/README.md

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -113,7 +113,9 @@ inferenceExtension:
113113
114114
**Note:** Prometheus monitoring requires the Prometheus Operator and ServiceMonitor CRD to be installed in the cluster.
115115
116-
For GKE environments, monitoring is automatically configured when `provider.name` is set to `gke`.
116+
For GKE environments, monitoring is enabled by setting `provider.name` to `gke` and `inferenceExtension.monitoring.gke.enabled` to `true`. This will create the necessary `PodMonitoring` and RBAC resources for metrics collection.
117+
118+
If you are using a GKE Autopilot cluster, you also need to set `provider.gke.autopilot` to `true`.
117119

118120
Then apply it with:
119121

@@ -152,8 +154,10 @@ The following table lists the configurable parameters of the chart.
152154
| `inferenceExtension.monitoring.interval` | Metrics scraping interval for monitoring. Defaults to `10s`. |
153155
| `inferenceExtension.monitoring.secret.name` | Name of the service account token secret for metrics authentication. Defaults to `inference-gateway-sa-metrics-reader-secret`. |
154156
| `inferenceExtension.monitoring.prometheus.enabled` | Enable Prometheus ServiceMonitor creation for EPP metrics collection. Defaults to `false`. |
157+
| `inferenceExtension.monitoring.gke.enabled` | Enable GKE monitoring resources (`PodMonitoring` and RBAC). Defaults to `false`. |
155158
| `inferenceExtension.pluginsCustomConfig` | Custom config that is passed to EPP as inline yaml. |
156159
| `provider.name` | Name of the Inference Gateway implementation being used. Possible values: `gke`, `none`. Defaults to `none`. |
160+
| `provider.gke.autopilot` | Set to `true` if the cluster is a GKE Autopilot cluster. This is only used if `provider.name` is `gke`. Defaults to `false`. |
157161

158162
## Notes
159163

config/charts/inferencepool/templates/epp-sa-token-secret.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
{{- if or .Values.inferenceExtension.monitoring.prometheus.enabled .Values.inferenceExtension.monitoring.gke.enabled }}
1+
{{- if .Values.inferenceExtension.monitoring.prometheus.enabled }}
22
apiVersion: v1
33
kind: Secret
44
metadata:

config/charts/inferencepool/templates/gke.yaml

Lines changed: 85 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -35,11 +35,44 @@ spec:
3535
timeoutSec: 300 # 5-minute timeout (adjust as needed)
3636
logging:
3737
enabled: true # log all requests by default
38+
{{- if .Values.inferenceExtension.monitoring.gke.enabled }}
39+
{{- $metricsReadSA := printf "%s-metrics-reader-sa" .Release.Name -}}
40+
{{- $metricsReadSecretName := printf "%s-metrics-reader-secret" .Release.Name -}}
41+
{{- $metricsReadRoleName := printf "%s-%s-metrics-reader" .Release.Namespace .Release.Name -}}
42+
{{- $metricsReadRoleBindingName := printf "%s-%s-metrics-reader-role-binding" .Release.Namespace .Release.Name -}}
43+
{{- $secretReadRoleName := printf "%s-metrics-reader-secret-read" .Release.Name -}}
44+
{{- $gmpNamespace := "gmp-system" -}}
45+
{{- $isAutopilot := false -}}
46+
{{- with .Values.provider.gke }}
47+
{{- $isAutopilot = .autopilot | default false -}}
48+
{{- end }}
49+
{{- if $isAutopilot -}}
50+
{{- $gmpNamespace = "gke-gmp-system" -}}
51+
{{- end -}}
52+
{{- $gmpCollectorRoleBindingName := printf "%s:collector:%s-%s-metrics-reader-secret-read" $gmpNamespace .Release.Namespace .Release.Name -}}
53+
---
54+
apiVersion: v1
55+
kind: ServiceAccount
56+
metadata:
57+
name: {{ $metricsReadSA }}
58+
namespace: {{ .Release.Namespace }}
59+
---
60+
apiVersion: v1
61+
kind: Secret
62+
metadata:
63+
name: {{ $metricsReadSecretName }}
64+
namespace: {{ .Release.Namespace }}
65+
labels:
66+
{{- include "gateway-api-inference-extension.labels" . | nindent 4 }}
67+
annotations:
68+
kubernetes.io/service-account.name: {{ $metricsReadSA }}
69+
type: kubernetes.io/service-account-token
3870
---
3971
apiVersion: monitoring.googleapis.com/v1
40-
kind: ClusterPodMonitoring
72+
kind: PodMonitoring
4173
metadata:
42-
name: {{ .Release.Namespace }}-{{ .Release.Name }}
74+
name: {{ .Release.Name }}
75+
namespace: {{ .Release.Namespace }}
4376
labels:
4477
{{- include "gateway-api-inference-extension.labels" . | nindent 4 }}
4578
spec:
@@ -52,10 +85,58 @@ spec:
5285
type: Bearer
5386
credentials:
5487
secret:
55-
name: {{ .Values.inferenceExtension.monitoring.secret.name }}
88+
name: {{ $metricsReadSecretName }}
5689
key: token
57-
namespace: {{ .Release.Namespace }}
5890
selector:
5991
matchLabels:
6092
{{- include "gateway-api-inference-extension.selectorLabels" . | nindent 8 }}
93+
---
94+
apiVersion: rbac.authorization.k8s.io/v1
95+
kind: ClusterRole
96+
metadata:
97+
name: {{ $metricsReadRoleName }}
98+
rules:
99+
- nonResourceURLs:
100+
- /metrics
101+
verbs:
102+
- get
103+
---
104+
apiVersion: rbac.authorization.k8s.io/v1
105+
kind: ClusterRoleBinding
106+
metadata:
107+
name: {{ $metricsReadRoleBindingName }}
108+
subjects:
109+
- kind: ServiceAccount
110+
name: {{ $metricsReadSA }}
111+
namespace: {{ .Release.Namespace }}
112+
roleRef:
113+
kind: ClusterRole
114+
name: {{ $metricsReadRoleName }}
115+
apiGroup: rbac.authorization.k8s.io
116+
---
117+
apiVersion: rbac.authorization.k8s.io/v1
118+
kind: Role
119+
metadata:
120+
name: {{ $secretReadRoleName }}
121+
rules:
122+
- resources:
123+
- secrets
124+
apiGroups: [""]
125+
verbs: ["get", "list", "watch"]
126+
resourceNames: [{{ $metricsReadSecretName | quote }}]
127+
---
128+
apiVersion: rbac.authorization.k8s.io/v1
129+
kind: RoleBinding
130+
metadata:
131+
name: {{ $gmpCollectorRoleBindingName }}
132+
namespace: {{ .Release.Namespace }}
133+
roleRef:
134+
name: {{ $secretReadRoleName }}
135+
kind: Role
136+
apiGroup: rbac.authorization.k8s.io
137+
subjects:
138+
- name: collector
139+
namespace: {{ $gmpNamespace }}
140+
kind: ServiceAccount
141+
{{- end }}
61142
{{- end }}

config/charts/inferencepool/values.yaml

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,9 @@ inferenceExtension:
5050
# Prometheus ServiceMonitor will be created when enabled for EPP metrics collection
5151
prometheus:
5252
enabled: false
53+
54+
gke:
55+
enabled: false
5356

5457
inferencePool:
5558
targetPorts:
@@ -67,3 +70,8 @@ inferencePool:
6770
provider:
6871
name: none
6972

73+
# GKE-specific configuration.
74+
# This block is only used if name is "gke".
75+
gke:
76+
# Set to true if the cluster is an Autopilot cluster.
77+
autopilot: false

0 commit comments

Comments
 (0)