diff --git a/config/charts/inferencepool/README.md b/config/charts/inferencepool/README.md
index 82be6b85c..f5ed0496f 100644
--- a/config/charts/inferencepool/README.md
+++ b/config/charts/inferencepool/README.md
@@ -117,6 +117,30 @@ Then apply it with:
 helm install vllm-llama3-8b-instruct ./config/charts/inferencepool -f values.yaml
 ```
 
+### Install with Monitoring
+
+To enable metrics collection and monitoring for the EndpointPicker, you can configure Prometheus ServiceMonitor creation:
+
+```yaml
+inferenceExtension:
+  monitoring:
+    interval: "10s"
+    prometheus:
+      enabled: true
+    secret:
+      name: inference-gateway-sa-metrics-reader-secret
+```
+
+**Note:** Prometheus monitoring requires the Prometheus Operator and ServiceMonitor CRD to be installed in the cluster.
+
+For GKE environments, monitoring is automatically configured when `provider.name` is set to `gke`.
+
+Then apply it with:
+
+```txt
+helm install vllm-llama3-8b-instruct ./config/charts/inferencepool -f values.yaml
+```
+
 ## Uninstall
 
 Run the following command to uninstall the chart:
@@ -146,6 +170,9 @@ The following table list the configurable parameters of the chart.
 | `inferenceExtension.affinity` | Affinity for the endpoint picker. Defaults to `{}`. |
 | `inferenceExtension.tolerations` | Tolerations for the endpoint picker. Defaults to `[]`. |
 | `inferenceExtension.flags.has-enable-leader-election` | Enable leader election for high availability. When enabled, only one EPP pod (the leader) will be ready to serve traffic. |
+| `inferenceExtension.monitoring.interval` | Metrics scraping interval for monitoring. Defaults to `10s`. |
+| `inferenceExtension.monitoring.secret.name` | Name of the service account token secret for metrics authentication. Defaults to `inference-gateway-sa-metrics-reader-secret`. |
+| `inferenceExtension.monitoring.prometheus.enabled` | Enable Prometheus ServiceMonitor creation for EPP metrics collection. Defaults to `false`. |
 | `inferenceExtension.pluginsCustomConfig` | Custom config that is passed to EPP as inline yaml. |
 | `provider.name` | Name of the Inference Gateway implementation being used. Possible values: `gke`. Defaults to `none`. |
 
diff --git a/config/charts/inferencepool/templates/epp-sa-token-secret.yaml b/config/charts/inferencepool/templates/epp-sa-token-secret.yaml
new file mode 100644
index 000000000..9abee0fcd
--- /dev/null
+++ b/config/charts/inferencepool/templates/epp-sa-token-secret.yaml
@@ -0,0 +1,19 @@
+{{- /*
+Service account token Secret used as the bearer-token credential for scraping
+EPP metrics. It is rendered when the Prometheus ServiceMonitor is enabled or
+when the provider is GKE. The GKE check uses `provider.name` because this
+chart's values.yaml defines no `inferenceExtension.monitoring.gke` key;
+dereferencing `.gke.enabled` on the missing map aborts rendering.
+*/}}
+{{- if or .Values.inferenceExtension.monitoring.prometheus.enabled (eq .Values.provider.name "gke") }}
+apiVersion: v1
+kind: Secret
+metadata:
+  name: {{ .Values.inferenceExtension.monitoring.secret.name }}
+  namespace: {{ .Release.Namespace }}
+  labels:
+    {{- include "gateway-api-inference-extension.labels" . | nindent 4 }}
+  annotations:
+    kubernetes.io/service-account.name: {{ include "gateway-api-inference-extension.name" . }}
+type: kubernetes.io/service-account-token
+{{- end }}
diff --git a/config/charts/inferencepool/templates/epp-servicemonitor.yaml b/config/charts/inferencepool/templates/epp-servicemonitor.yaml
new file mode 100644
index 000000000..e4788ba83
--- /dev/null
+++ b/config/charts/inferencepool/templates/epp-servicemonitor.yaml
@@ -0,0 +1,30 @@
+{{- /*
+Prometheus Operator ServiceMonitor for the endpoint picker. Scrapes /metrics on
+the "http-metrics" port at the configured interval, authenticating with the
+`token` key of the monitoring secret. Requires the ServiceMonitor CRD.
+*/}}
+{{- if .Values.inferenceExtension.monitoring.prometheus.enabled }}
+apiVersion: monitoring.coreos.com/v1
+kind: ServiceMonitor
+metadata:
+  name: {{ include "gateway-api-inference-extension.name" . }}-monitor
+  namespace: {{ .Release.Namespace }}
+  labels:
+    {{- include "gateway-api-inference-extension.labels" . | nindent 4 }}
+spec:
+  endpoints:
+  - interval: {{ .Values.inferenceExtension.monitoring.interval }}
+    port: "http-metrics"
+    path: "/metrics"
+    authorization:
+      credentials:
+        key: token
+        name: {{ .Values.inferenceExtension.monitoring.secret.name }}
+  jobLabel: {{ include "gateway-api-inference-extension.name" . }}
+  namespaceSelector:
+    matchNames:
+    - {{ .Release.Namespace }}
+  selector:
+    matchLabels:
+      {{- include "gateway-api-inference-extension.labels" . | nindent 6 }}
+{{- end }}
diff --git a/config/charts/inferencepool/templates/gke.yaml b/config/charts/inferencepool/templates/gke.yaml
index 92010c0d0..e3b92ea6f 100644
--- a/config/charts/inferencepool/templates/gke.yaml
+++ b/config/charts/inferencepool/templates/gke.yaml
@@ -46,15 +46,15 @@ spec:
   endpoints:
   - port: metrics
     scheme: http
-    interval: 5s
+    interval: {{ .Values.inferenceExtension.monitoring.interval }}
     path: /metrics
     authorization:
       type: Bearer
       credentials:
         secret:
-          name: {{ .Values.gke.monitoringSecret.name }}
+          name: {{ .Values.inferenceExtension.monitoring.secret.name }}
           key: token
-          namespace: {{ .Values.gke.monitoringSecret.namespace }}
+          namespace: {{ .Release.Namespace }}
   selector:
     matchLabels:
       {{- include "gateway-api-inference-extension.selectorLabels" . | nindent 8 }}
diff --git a/config/charts/inferencepool/templates/rbac.yaml b/config/charts/inferencepool/templates/rbac.yaml
index a8d891c32..da380df66 100644
--- a/config/charts/inferencepool/templates/rbac.yaml
+++ b/config/charts/inferencepool/templates/rbac.yaml
@@ -17,6 +17,12 @@ rules:
   - subjectaccessreviews
   verbs:
   - create
+{{- if .Values.inferenceExtension.monitoring.prometheus.enabled }}
+- nonResourceURLs:
+  - "/metrics"
+  verbs:
+  - get
+{{- end }}
 ---
 kind: ClusterRoleBinding
 apiVersion: rbac.authorization.k8s.io/v1
diff --git a/config/charts/inferencepool/values.yaml b/config/charts/inferencepool/values.yaml
index d45e6ed39..f61b64e37 100644
--- a/config/charts/inferencepool/values.yaml
+++ b/config/charts/inferencepool/values.yaml
@@ -40,6 +40,17 @@ inferenceExtension:
 
   tolerations: []
 
+  # Monitoring configuration for EPP
+  monitoring:
+    interval: "10s"
+    # Service account token secret for authentication
+    secret:
+      name: inference-gateway-sa-metrics-reader-secret
+
+    # Prometheus ServiceMonitor will be created when enabled for EPP metrics collection
+    prometheus:
+      enabled: false
+
 inferencePool:
   targetPorts:
     - number: 8000
@@ -56,7 +67,3 @@ inferencePool:
 provider:
   name: none
 
-gke:
-  monitoringSecret:
-    name: inference-gateway-sa-metrics-reader-secret
-    namespace: default