diff --git a/charts/hami/README.md b/charts/hami/README.md
index fd3ab3fe4..03f552d24 100644
--- a/charts/hami/README.md
+++ b/charts/hami/README.md
@@ -137,6 +137,16 @@ This document provides detailed descriptions of all configurable values paramete
 | `scheduler.service.monitorPort` | Monitor port | `31993` |
 | `scheduler.service.monitorTargetPort` | Monitor target port | `9395` |
 
+### Scheduler ServiceMonitor Configuration
+
+| Parameter | Description | Default Value |
+|-----------|-------------|---------------|
+| `scheduler.servicemonitor.enabled` | Whether to enable ServiceMonitor for Prometheus monitoring | `false` |
+| `scheduler.servicemonitor.labels` | Additional labels for ServiceMonitor | `{}` |
+| `scheduler.servicemonitor.annotations` | Additional annotations for ServiceMonitor | `{}` |
+| `scheduler.servicemonitor.interval` | Scrape interval for metrics collection | `"15s"` |
+| `scheduler.servicemonitor.honorLabels` | Whether to honor labels from the target | `false` |
+
 ## Device Plugin Configuration
 
 | Parameter | Description | Default Value |
@@ -158,6 +168,16 @@ This document provides detailed descriptions of all configurable values paramete
 | `devicePlugin.monitor.image.pullSecrets` | Monitor image pull secrets | `[]` |
 | `devicePlugin.monitor.ctrPath` | Container path | `/usr/local/vgpu/containers` |
 
+### Device Plugin ServiceMonitor Configuration
+
+| Parameter | Description | Default Value |
+|-----------|-------------|---------------|
+| `devicePlugin.monitor.servicemonitor.enabled` | Whether to enable ServiceMonitor for Prometheus monitoring | `false` |
+| `devicePlugin.monitor.servicemonitor.labels` | Additional labels for ServiceMonitor | `{}` |
+| `devicePlugin.monitor.servicemonitor.annotations` | Additional annotations for ServiceMonitor | `{}` |
+| `devicePlugin.monitor.servicemonitor.interval` | Scrape interval for metrics collection | `"15s"` |
+| `devicePlugin.monitor.servicemonitor.honorLabels` | Whether to honor labels from the target | `false` |
+
 ### Device Plugin Other Configuration
 
 | Parameter | Description | Default Value |
diff --git a/charts/hami/templates/device-plugin/servicemonitor.yaml b/charts/hami/templates/device-plugin/servicemonitor.yaml
new file mode 100644
index 000000000..cd859a611
--- /dev/null
+++ b/charts/hami/templates/device-plugin/servicemonitor.yaml
@@ -0,0 +1,38 @@
+{{- /*
+ServiceMonitor for the HAMi device-plugin monitor endpoint (path /metrics, service port "monitorport").
+Rendered only when devicePlugin.monitor.servicemonitor.enabled is true.
+User-supplied maps use a left-trimmed toYaml piped to nindent so every key lands at the same column.
+*/ -}}
+{{- if .Values.devicePlugin.monitor.servicemonitor.enabled }}
+apiVersion: monitoring.coreos.com/v1
+kind: ServiceMonitor
+metadata:
+  {{- if .Values.devicePlugin.monitor.servicemonitor.annotations }}
+  annotations:
+    {{- toYaml .Values.devicePlugin.monitor.servicemonitor.annotations | nindent 4 }}
+  {{- end }}
+  name: {{ include "hami-vgpu.device-plugin" . }}
+  namespace: {{ include "hami-vgpu.namespace" . }}
+  labels:
+    {{- include "hami-vgpu.labels" . | nindent 4 }}
+    {{- if .Values.devicePlugin.monitor.servicemonitor.labels }}
+    {{- toYaml .Values.devicePlugin.monitor.servicemonitor.labels | nindent 4 }}
+    {{- end }}
+spec:
+  endpoints:
+    - path: /metrics
+      port: monitorport
+      scheme: http
+      interval: {{ .Values.devicePlugin.monitor.servicemonitor.interval | default "15s" }}
+      honorLabels: {{ .Values.devicePlugin.monitor.servicemonitor.honorLabels | default false }}
+  namespaceSelector:
+    matchNames:
+      - {{ include "hami-vgpu.namespace" . }}
+  selector:
+    matchLabels:
+      app.kubernetes.io/component: hami-device-plugin
+      {{- include "hami-vgpu.labels" . | nindent 6 }}
+      {{- if .Values.devicePlugin.service.labels }}
+      {{- toYaml .Values.devicePlugin.service.labels | nindent 6 }}
+      {{- end }}
+{{- end }}
diff --git a/charts/hami/templates/scheduler/servicemonitor.yaml b/charts/hami/templates/scheduler/servicemonitor.yaml
new file mode 100644
index 000000000..9a59500f2
--- /dev/null
+++ b/charts/hami/templates/scheduler/servicemonitor.yaml
@@ -0,0 +1,38 @@
+{{- /*
+ServiceMonitor for the HAMi scheduler metrics endpoint (path /metrics, service port "monitor").
+Rendered only when scheduler.servicemonitor.enabled is true.
+User-supplied maps use a left-trimmed toYaml piped to nindent so every key lands at the same column.
+*/ -}}
+{{- if .Values.scheduler.servicemonitor.enabled }}
+apiVersion: monitoring.coreos.com/v1
+kind: ServiceMonitor
+metadata:
+  {{- if .Values.scheduler.servicemonitor.annotations }}
+  annotations:
+    {{- toYaml .Values.scheduler.servicemonitor.annotations | nindent 4 }}
+  {{- end }}
+  name: {{ include "hami-vgpu.scheduler" . }}
+  namespace: {{ include "hami-vgpu.namespace" . }}
+  labels:
+    {{- include "hami-vgpu.labels" . | nindent 4 }}
+    {{- if .Values.scheduler.servicemonitor.labels }}
+    {{- toYaml .Values.scheduler.servicemonitor.labels | nindent 4 }}
+    {{- end }}
+spec:
+  endpoints:
+    - path: /metrics
+      port: monitor
+      scheme: http
+      interval: {{ .Values.scheduler.servicemonitor.interval | default "15s" }}
+      honorLabels: {{ .Values.scheduler.servicemonitor.honorLabels | default false }}
+  namespaceSelector:
+    matchNames:
+      - {{ include "hami-vgpu.namespace" . }}
+  selector:
+    matchLabels:
+      app.kubernetes.io/component: hami-scheduler
+      {{- include "hami-vgpu.labels" . | nindent 6 }}
+      {{- if .Values.scheduler.service.labels }}
+      {{- toYaml .Values.scheduler.service.labels | nindent 6 }}
+      {{- end }}
+{{- end }}
diff --git a/charts/hami/values.yaml b/charts/hami/values.yaml
index 863643766..c63c98b7b 100644
--- a/charts/hami/values.yaml
+++ b/charts/hami/values.yaml
@@ -235,6 +235,13 @@ scheduler:
     httpTargetPort: 443
     labels: {}
     annotations: {}
+  # scheduler ServiceMonitor configuration
+  servicemonitor:
+    enabled: false
+    labels: {}
+    annotations: {}
+    interval: "15s"
+    honorLabels: false
 
 devicePlugin:
   enabled: true
@@ -283,6 +290,13 @@ devicePlugin:
       pullSecrets: []
     ctrPath: /usr/local/vgpu/containers
     resyncInterval: "5m"
+    # ServiceMonitor configuration
+    servicemonitor:
+      enabled: false
+      labels: {}
+      annotations: {}
+      interval: "15s"
+      honorLabels: false
   deviceSplitCount: 10
   deviceMemoryScaling: 1
   deviceCoreScaling: 1