From c3d9469ea3b76bb6fe2123e91e6b97e6405917ef Mon Sep 17 00:00:00 2001
From: clcc2019
Date: Tue, 11 Nov 2025 11:03:25 +0800
Subject: [PATCH 1/4] =?UTF-8?q?feat:=20=E6=B7=BB=E5=8A=A0Prometheus=20Serv?=
 =?UTF-8?q?iceMonitor=E6=94=AF=E6=8C=81?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- 为scheduler和device-plugin组件添加ServiceMonitor模板
- 在values.yaml中添加servicemonitor配置选项
- 支持自定义监控间隔、标签和注解
- 默认启用ServiceMonitor功能

Signed-off-by: clcc2019
---
Review note: user-supplied label/annotation maps are rendered with
`{{- toYaml <map> | nindent N }}`. The earlier `{{ toYaml <map> | indent N }}`
form sat on an indented template line, so the first rendered key received the
line's own leading whitespace in addition to N and no longer lined up with its
sibling keys. The leading `-` trims that whitespace and `nindent` supplies the
newline plus a uniform indent for every key.

 .../device-plugin/servicemonitor.yaml         | 33 +++++++++++++++++++
 .../templates/scheduler/servicemonitor.yaml   | 33 +++++++++++++++++++
 charts/hami/values.yaml                       | 14 ++++++++
 3 files changed, 80 insertions(+)
 create mode 100644 charts/hami/templates/device-plugin/servicemonitor.yaml
 create mode 100644 charts/hami/templates/scheduler/servicemonitor.yaml

diff --git a/charts/hami/templates/device-plugin/servicemonitor.yaml b/charts/hami/templates/device-plugin/servicemonitor.yaml
new file mode 100644
index 000000000..11cf994ba
--- /dev/null
+++ b/charts/hami/templates/device-plugin/servicemonitor.yaml
@@ -0,0 +1,33 @@
+{{- if .Values.scheduler.servicemonitor.enabled }}
+apiVersion: monitoring.coreos.com/v1
+kind: ServiceMonitor
+metadata:
+  {{- if .Values.devicePlugin.monitor.servicemonitor.annotations }}
+  annotations:
+    {{- toYaml .Values.devicePlugin.monitor.servicemonitor.annotations | nindent 4 }}
+  {{- end }}
+  name: {{ include "hami-vgpu.device-plugin" . }}
+  namespace: {{ include "hami-vgpu.namespace" . }}
+  labels:
+    {{- include "hami-vgpu.labels" . | nindent 4 }}
+    {{- if .Values.devicePlugin.monitor.servicemonitor.labels }}
+    {{- toYaml .Values.devicePlugin.monitor.servicemonitor.labels | nindent 4 }}
+    {{- end }}
+spec:
+  endpoints:
+    - path: /metrics
+      port: monitorport
+      scheme: http
+      interval: {{ .Values.devicePlugin.monitor.servicemonitor.interval | default "15s" }}
+      honorLabels: {{ .Values.devicePlugin.monitor.servicemonitor.honorLabels | default false }}
+  namespaceSelector:
+    matchNames:
+      - {{ include "hami-vgpu.namespace" . }}
+  selector:
+    matchLabels:
+      app.kubernetes.io/component: hami-device-plugin
+      {{- include "hami-vgpu.labels" . | nindent 6 }}
+      {{- if .Values.devicePlugin.service.labels }} # Use devicePlugin instead of scheduler
+      {{- toYaml .Values.devicePlugin.service.labels | nindent 6 }}
+      {{- end }}
+{{- end }}
\ No newline at end of file
diff --git a/charts/hami/templates/scheduler/servicemonitor.yaml b/charts/hami/templates/scheduler/servicemonitor.yaml
new file mode 100644
index 000000000..b63c659c5
--- /dev/null
+++ b/charts/hami/templates/scheduler/servicemonitor.yaml
@@ -0,0 +1,33 @@
+{{- if .Values.scheduler.servicemonitor.enabled }}
+apiVersion: monitoring.coreos.com/v1
+kind: ServiceMonitor
+metadata:
+  {{- if .Values.scheduler.servicemonitor.annotations }}
+  annotations:
+    {{- toYaml .Values.scheduler.servicemonitor.annotations | nindent 4 }}
+  {{- end }}
+  name: {{ include "hami-vgpu.scheduler" . }}
+  namespace: {{ include "hami-vgpu.namespace" . }}
+  labels:
+    {{- include "hami-vgpu.labels" . | nindent 4 }}
+    {{- if .Values.scheduler.servicemonitor.labels }}
+    {{- toYaml .Values.scheduler.servicemonitor.labels | nindent 4 }}
+    {{- end }}
+spec:
+  endpoints:
+    - path: /metrics
+      port: monitor
+      scheme: http
+      interval: {{ .Values.scheduler.servicemonitor.interval | default "15s" }}
+      honorLabels: {{ .Values.scheduler.servicemonitor.honorLabels | default false }}
+  namespaceSelector:
+    matchNames:
+      - {{ include "hami-vgpu.namespace" . }}
+  selector:
+    matchLabels:
+      app.kubernetes.io/component: hami-scheduler
+      {{- include "hami-vgpu.labels" . | nindent 6 }}
+      {{- if .Values.scheduler.service.labels }}
+      {{- toYaml .Values.scheduler.service.labels | nindent 6 }}
+      {{- end }}
+{{- end }}
\ No newline at end of file
diff --git a/charts/hami/values.yaml b/charts/hami/values.yaml
index 863643766..a6915e18e 100644
--- a/charts/hami/values.yaml
+++ b/charts/hami/values.yaml
@@ -235,6 +235,13 @@ scheduler:
     httpTargetPort: 443
     labels: {}
     annotations: {}
+  # scheduler ServiceMonitor configuration
+  servicemonitor:
+    enabled: true
+    labels: {}
+    annotations: {}
+    interval: "15s"
+    honorLabels: false
 
 devicePlugin:
   enabled: true
@@ -283,6 +290,13 @@ devicePlugin:
       pullSecrets: []
     ctrPath: /usr/local/vgpu/containers
     resyncInterval: "5m"
+    # ServiceMonitor configuration
+    servicemonitor:
+      enabled: true
+      labels: {}
+      annotations: {}
+      interval: "15s"
+      honorLabels: false
   deviceSplitCount: 10
   deviceMemoryScaling: 1
   deviceCoreScaling: 1

From 491a4b15b64b9180d7f61ac032354a8363d18daa Mon Sep 17 00:00:00 2001
From: clcc2019
Date: Tue, 11 Nov 2025 11:11:58 +0800
Subject: [PATCH 2/4] =?UTF-8?q?docs:=20=E5=9C=A8chart=20README.md=E4=B8=AD?=
 =?UTF-8?q?=E6=B7=BB=E5=8A=A0ServiceMonitor=E9=85=8D=E7=BD=AE=E5=8F=82?=
 =?UTF-8?q?=E6=95=B0=E8=AF=B4=E6=98=8E?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- 为scheduler组件添加ServiceMonitor配置参数文档
- 为device-plugin组件添加ServiceMonitor配置参数文档
- 包含enabled、labels、annotations、interval和honorLabels参数说明
- 提供默认值和详细描述

Signed-off-by: clcc2019
---
 charts/hami/README.md | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/charts/hami/README.md b/charts/hami/README.md
index fd3ab3fe4..03f552d24 100644
--- a/charts/hami/README.md
+++ b/charts/hami/README.md
@@ -137,6 +137,16 @@ This document provides detailed descriptions of all configurable values paramete
 | `scheduler.service.monitorPort` | Monitor port | `31993` |
 | `scheduler.service.monitorTargetPort` | Monitor target port | `9395` |
 
+### Scheduler ServiceMonitor Configuration
+
+| Parameter | Description | Default Value |
+|-----------|-------------|---------------|
+| `scheduler.servicemonitor.enabled` | Whether to enable ServiceMonitor for Prometheus monitoring | `false` |
+| `scheduler.servicemonitor.labels` | Additional labels for ServiceMonitor | `{}` |
+| `scheduler.servicemonitor.annotations` | Additional annotations for ServiceMonitor | `{}` |
+| `scheduler.servicemonitor.interval` | Scrape interval for metrics collection | `"15s"` |
+| `scheduler.servicemonitor.honorLabels` | Whether to honor labels from the target | `false` |
+
 ## Device Plugin Configuration
 
 | Parameter | Description | Default Value |
@@ -158,6 +168,16 @@ This document provides detailed descriptions of all configurable values paramete
 | `devicePlugin.monitor.image.pullSecrets` | Monitor image pull secrets | `[]` |
 | `devicePlugin.monitor.ctrPath` | Container path | `/usr/local/vgpu/containers` |
 
+### Device Plugin ServiceMonitor Configuration
+
+| Parameter | Description | Default Value |
+|-----------|-------------|---------------|
+| `devicePlugin.monitor.servicemonitor.enabled` | Whether to enable ServiceMonitor for Prometheus monitoring | `false` |
+| `devicePlugin.monitor.servicemonitor.labels` | Additional labels for ServiceMonitor | `{}` |
+| `devicePlugin.monitor.servicemonitor.annotations` | Additional annotations for ServiceMonitor | `{}` |
+| `devicePlugin.monitor.servicemonitor.interval` | Scrape interval for metrics collection | `"15s"` |
+| `devicePlugin.monitor.servicemonitor.honorLabels` | Whether to honor labels from the target | `false` |
+
 ### Device Plugin Other Configuration
 
 | Parameter | Description | Default Value |

From efc42d78370ba637aedb31af55eaca1742e12d32 Mon Sep 17 00:00:00 2001
From: clcc2019
Date: Tue, 11 Nov 2025 11:14:11 +0800
Subject: [PATCH 3/4] chore: disable ServiceMonitor for scheduler and devicePlugin

- Updated values.yaml to set servicemonitor.enabled to false for both scheduler and devicePlugin components, disabling the ServiceMonitor feature by default.

Signed-off-by: clcc2019
---
 charts/hami/values.yaml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/charts/hami/values.yaml b/charts/hami/values.yaml
index a6915e18e..c63c98b7b 100644
--- a/charts/hami/values.yaml
+++ b/charts/hami/values.yaml
@@ -237,7 +237,7 @@ scheduler:
     annotations: {}
   # scheduler ServiceMonitor configuration
   servicemonitor:
-    enabled: true
+    enabled: false
     labels: {}
     annotations: {}
     interval: "15s"
@@ -292,7 +292,7 @@ devicePlugin:
     resyncInterval: "5m"
     # ServiceMonitor configuration
     servicemonitor:
-      enabled: true
+      enabled: false
       labels: {}
       annotations: {}
       interval: "15s"

From 62cb67d8ec37e45e844b1a1fd76da05986f353de Mon Sep 17 00:00:00 2001
From: clcc2019
Date: Tue, 11 Nov 2025 13:52:18 +0800
Subject: [PATCH 4/4] fix: correct servicemonitor configuration

- Fix device-plugin servicemonitor to use correct Values path
- Add missing newline at end of servicemonitor files

Signed-off-by: clcc2019
---
 charts/hami/templates/device-plugin/servicemonitor.yaml | 6 +++---
 charts/hami/templates/scheduler/servicemonitor.yaml     | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/charts/hami/templates/device-plugin/servicemonitor.yaml b/charts/hami/templates/device-plugin/servicemonitor.yaml
index 11cf994ba..cd859a611 100644
--- a/charts/hami/templates/device-plugin/servicemonitor.yaml
+++ b/charts/hami/templates/device-plugin/servicemonitor.yaml
@@ -1,4 +1,4 @@
-{{- if .Values.scheduler.servicemonitor.enabled }}
+{{- if .Values.devicePlugin.monitor.servicemonitor.enabled }}
 apiVersion: monitoring.coreos.com/v1
 kind: ServiceMonitor
 metadata:
@@ -27,7 +27,7 @@ spec:
     matchLabels:
       app.kubernetes.io/component: hami-device-plugin
       {{- include "hami-vgpu.labels" . | nindent 6 }}
-      {{- if .Values.devicePlugin.service.labels }} # Use devicePlugin instead of scheduler
+      {{- if .Values.devicePlugin.service.labels }}
       {{- toYaml .Values.devicePlugin.service.labels | nindent 6 }}
       {{- end }}
-{{- end }}
\ No newline at end of file
+{{- end }}
diff --git a/charts/hami/templates/scheduler/servicemonitor.yaml b/charts/hami/templates/scheduler/servicemonitor.yaml
index b63c659c5..9a59500f2 100644
--- a/charts/hami/templates/scheduler/servicemonitor.yaml
+++ b/charts/hami/templates/scheduler/servicemonitor.yaml
@@ -30,4 +30,4 @@ spec:
       {{- if .Values.scheduler.service.labels }}
       {{- toYaml .Values.scheduler.service.labels | nindent 6 }}
       {{- end }}
-{{- end }}
\ No newline at end of file
+{{- end }}