From 947a9317d7e8e26ded510bccf195c26d30400b9e Mon Sep 17 00:00:00 2001
From: sallyom
Date: Wed, 13 Aug 2025 23:07:40 -0400
Subject: [PATCH] add epp servicemonitor

Signed-off-by: sallyom
---
Review notes (text between "---" and the diffstat is not part of the commit):

- What the patch adds: an `llm-d-modelservice.eppLabels` helper, a
  ServiceMonitor for the EPP Service, a `metrics` Service port (9090), a
  ClusterRole/ClusterRoleBinding for the EPP ServiceAccount, and a
  service-account token Secret referenced by the ServiceMonitor. Everything
  except the ServiceAccount and token Secret is gated solely on
  `routing.epp.monitoring.servicemonitor.enabled`.
- Fix: `routing.epp.monitoring.servicemonitor.path` is not defined in the
  values.yaml hunk, so the ClusterRole `nonResourceURLs` entry had no value to
  render while the ServiceMonitor hard-coded "/metrics". Both templates now
  read the same value with `default "/metrics" | quote`. Added-line counts
  are unchanged, so hunk headers and the diffstat still hold.
- NOTE(review): line breaks and indentation in this copy were reconstructed;
  check the context lines against the tree before applying.
- NOTE(review): the values.yaml hunk shows fewer context lines than its header
  declares and the usual "-- " trailer is absent, so this copy looks
  truncated; regenerate it with `git format-patch` before applying.
- The abbreviated blob ids on the "index" lines of the two edited new files
  are carried over unchanged and no longer match their content.

 .../llm-d-modelservice/templates/_helpers.tpl |  5 +++
 .../templates/epp-clusterrolebinding.yaml     | 16 +++++++++
 .../templates/epp-deployment.yaml             |  7 ++--
 .../templates/epp-metrics-clusterrole.yaml    | 33 +++++++++++++++++++
 .../templates/epp-sa-token-secret.yaml        | 11 +++++++
 .../llm-d-modelservice/templates/epp-sa.yaml  |  2 +-
 .../templates/epp-service.yaml                |  9 ++++-
 .../templates/epp-servicemonitor.yaml         | 25 ++++++++++++++
 charts/llm-d-modelservice/values.yaml         |  6 ++++
 9 files changed, 109 insertions(+), 5 deletions(-)
 create mode 100644 charts/llm-d-modelservice/templates/epp-clusterrolebinding.yaml
 create mode 100644 charts/llm-d-modelservice/templates/epp-metrics-clusterrole.yaml
 create mode 100644 charts/llm-d-modelservice/templates/epp-sa-token-secret.yaml
 create mode 100644 charts/llm-d-modelservice/templates/epp-servicemonitor.yaml

diff --git a/charts/llm-d-modelservice/templates/_helpers.tpl b/charts/llm-d-modelservice/templates/_helpers.tpl
index 45767a5..a619bae 100644
--- a/charts/llm-d-modelservice/templates/_helpers.tpl
+++ b/charts/llm-d-modelservice/templates/_helpers.tpl
@@ -255,6 +255,11 @@ resources:
 {{- end -}}
 {{- end }}

+{{/* EPP labels */}}
+{{- define "llm-d-modelservice.eppLabels" -}}
+llm-d.ai/epp: {{ include "llm-d-modelservice.eppName" . }}
+{{- end }}
+
 {{/* default http route name */}}
 {{- define "llm-d-modelservice.httpRouteName" -}}
 {{ include "llm-d-modelservice.fullname" . }}
diff --git a/charts/llm-d-modelservice/templates/epp-clusterrolebinding.yaml b/charts/llm-d-modelservice/templates/epp-clusterrolebinding.yaml
new file mode 100644
index 0000000..b47c5f2
--- /dev/null
+++ b/charts/llm-d-modelservice/templates/epp-clusterrolebinding.yaml
@@ -0,0 +1,16 @@
+{{- if .Values.routing.epp.monitoring.servicemonitor.enabled }}
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRoleBinding
+metadata:
+  name: {{ include "llm-d-modelservice.eppServiceAccountName" . }}
+  labels:
+    {{- include "llm-d-modelservice.labels" . | nindent 4 }}
+roleRef:
+  apiGroup: rbac.authorization.k8s.io
+  kind: ClusterRole
+  name: {{ include "llm-d-modelservice.eppName" . }}-metrics
+subjects:
+- kind: ServiceAccount
+  name: {{ include "llm-d-modelservice.eppServiceAccountName" . }}
+  namespace: {{ .Release.Namespace }}
+{{- end }}
diff --git a/charts/llm-d-modelservice/templates/epp-deployment.yaml b/charts/llm-d-modelservice/templates/epp-deployment.yaml
index f7c5011..54d68f5 100644
--- a/charts/llm-d-modelservice/templates/epp-deployment.yaml
+++ b/charts/llm-d-modelservice/templates/epp-deployment.yaml
@@ -4,17 +4,18 @@ kind: Deployment
 metadata:
   name: {{ include "llm-d-modelservice.eppName" . }}
   labels:
-    llm-d.ai/epp: {{ include "llm-d-modelservice.eppName" . }}
+    {{- include "llm-d-modelservice.labels" . | nindent 4 }}
+    {{- include "llm-d-modelservice.eppLabels" . | nindent 4 }}
   namespace: {{ .Release.Namespace }}
 spec:
   replicas: {{ default 1 .Values.routing.epp.replicas }}
   selector:
     matchLabels:
-      llm-d.ai/epp: {{ include "llm-d-modelservice.eppName" . }}
+      {{- include "llm-d-modelservice.eppLabels" . | nindent 6 }}
   template:
     metadata:
       labels:
-        llm-d.ai/epp: {{ include "llm-d-modelservice.eppName" . }}
+        {{- include "llm-d-modelservice.eppLabels" . | nindent 8 }}
     spec:
       containers:
       - name: epp
diff --git a/charts/llm-d-modelservice/templates/epp-metrics-clusterrole.yaml b/charts/llm-d-modelservice/templates/epp-metrics-clusterrole.yaml
new file mode 100644
index 0000000..cd0dfc4
--- /dev/null
+++ b/charts/llm-d-modelservice/templates/epp-metrics-clusterrole.yaml
@@ -0,0 +1,33 @@
+{{- if .Values.routing.epp.monitoring.servicemonitor.enabled }}
+kind: ClusterRole
+apiVersion: rbac.authorization.k8s.io/v1
+metadata:
+  name: {{ include "llm-d-modelservice.eppName" . }}-metrics
+  labels:
+    {{- include "llm-d-modelservice.labels" . | nindent 4 }}
+rules:
+- apiGroups:
+  - ""
+  resources:
+  - pods
+  verbs:
+  - get
+  - watch
+  - list
+- apiGroups:
+  - authentication.k8s.io
+  resources:
+  - tokenreviews
+  verbs:
+  - create
+- apiGroups:
+  - authorization.k8s.io
+  resources:
+  - subjectaccessreviews
+  verbs:
+  - create
+- nonResourceURLs:
+  - {{ .Values.routing.epp.monitoring.servicemonitor.path | default "/metrics" | quote }}
+  verbs:
+  - get
+{{- end }}
diff --git a/charts/llm-d-modelservice/templates/epp-sa-token-secret.yaml b/charts/llm-d-modelservice/templates/epp-sa-token-secret.yaml
new file mode 100644
index 0000000..217a7f4
--- /dev/null
+++ b/charts/llm-d-modelservice/templates/epp-sa-token-secret.yaml
@@ -0,0 +1,11 @@
+{{- if or .Values.routing.epp.create .Values.routing.epp.monitoring.servicemonitor.enabled }}
+apiVersion: v1
+kind: Secret
+metadata:
+  name: {{ include "llm-d-modelservice.eppServiceAccountName" . }}-token
+  labels:
+    {{- include "llm-d-modelservice.labels" . | nindent 4 }}
+  annotations:
+    kubernetes.io/service-account.name: {{ include "llm-d-modelservice.eppServiceAccountName" . }}
+type: kubernetes.io/service-account-token
+{{- end }}
diff --git a/charts/llm-d-modelservice/templates/epp-sa.yaml b/charts/llm-d-modelservice/templates/epp-sa.yaml
index 53c4f18..bf9d14c 100644
--- a/charts/llm-d-modelservice/templates/epp-sa.yaml
+++ b/charts/llm-d-modelservice/templates/epp-sa.yaml
@@ -1,4 +1,4 @@
-{{- if .Values.routing.epp.create -}}
+{{- if or .Values.routing.epp.create .Values.routing.epp.monitoring.servicemonitor.enabled }}
 apiVersion: v1
 kind: ServiceAccount
 metadata:
diff --git a/charts/llm-d-modelservice/templates/epp-service.yaml b/charts/llm-d-modelservice/templates/epp-service.yaml
index 95cb85f..1fb10f0 100644
--- a/charts/llm-d-modelservice/templates/epp-service.yaml
+++ b/charts/llm-d-modelservice/templates/epp-service.yaml
@@ -5,6 +5,7 @@ metadata:
   name: {{ include "llm-d-modelservice.eppServiceName" . }}
   labels:
     {{- include "llm-d-modelservice.labels" . | nindent 4 }}
+    {{- include "llm-d-modelservice.eppLabels" . | nindent 4 }}
 spec:
   type: {{ .Values.routing.epp.service.type }}
   ports:
@@ -13,9 +14,15 @@ spec:
       targetPort: {{ .Values.routing.epp.service.targetPort }}
       protocol: TCP
       appProtocol: {{ .Values.routing.epp.service.appProtocol }}
+    {{- if .Values.routing.epp.monitoring.servicemonitor.enabled }}
+    - name: metrics
+      port: 9090
+      targetPort: 9090
+      protocol: TCP
+    {{- end }}
     {{- with .Values.routing.epp.service.extraPorts }}
     {{- toYaml . | nindent 4 }}
     {{- end }}
   selector:
-    llm-d.ai/epp: {{ include "llm-d-modelservice.eppName" . }}
+    {{- include "llm-d-modelservice.eppLabels" . | nindent 4 }}
 {{- end }}
diff --git a/charts/llm-d-modelservice/templates/epp-servicemonitor.yaml b/charts/llm-d-modelservice/templates/epp-servicemonitor.yaml
new file mode 100644
index 0000000..0f83581
--- /dev/null
+++ b/charts/llm-d-modelservice/templates/epp-servicemonitor.yaml
@@ -0,0 +1,25 @@
+{{- if .Values.routing.epp.monitoring.servicemonitor.enabled }}
+apiVersion: monitoring.coreos.com/v1
+kind: ServiceMonitor
+metadata:
+  name: {{ include "llm-d-modelservice.eppServiceAccountName" . }}-monitor
+  namespace: {{ .Release.Namespace }}
+  labels:
+    {{- include "llm-d-modelservice.labels" . | nindent 4 }}
+spec:
+  endpoints:
+  - interval: {{ .Values.routing.epp.monitoring.servicemonitor.interval }}
+    port: "metrics"
+    path: {{ .Values.routing.epp.monitoring.servicemonitor.path | default "/metrics" | quote }}
+    authorization:
+      credentials:
+        key: token
+        name: {{ include "llm-d-modelservice.eppServiceAccountName" . }}-token
+  jobLabel: {{ include "llm-d-modelservice.eppServiceAccountName" . }}
+  namespaceSelector:
+    matchNames:
+    - {{ .Release.Namespace }}
+  selector:
+    matchLabels:
+      {{- include "llm-d-modelservice.eppLabels" . | nindent 6 }}
+{{- end }}
diff --git a/charts/llm-d-modelservice/values.yaml b/charts/llm-d-modelservice/values.yaml
index 342f647..0a10af3 100644
--- a/charts/llm-d-modelservice/values.yaml
+++ b/charts/llm-d-modelservice/values.yaml
@@ -218,6 +218,12 @@ routing:
     # - name: ENABLE_KVCACHE_AWARE_SCORER
     #   value: "false"
+    # Monitoring configuration for EPP
+    monitoring:
+      # ServiceMonitor configuration for EPP metrics collection with Prometheus Operator
+      servicemonitor:
+        enabled: false
+        interval: "10s"
   # @schema
   # additionalProperties: true