Skip to content

Commit 9d2ec89

Browse files
committed
Helm for lens plugin
Signed-off-by: Ritika Gupta <[email protected]>
1 parent 2f2f790 commit 9d2ec89

File tree

11 files changed

+1044
-0
lines changed

11 files changed

+1044
-0
lines changed
Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
---
# Helm chart metadata for the OCI Scanner Plugin.
apiVersion: v2
type: application
name: oci-scanner-plugin
description: >-
  OCI Scanner Plugin for AMD GPU monitoring and health checks
home: https://github.com/oracle/oci-lens-quickstart

# Chart version and the version of the application it packages.
# Both are quoted so they are always read as strings.
version: "0.1.0"
appVersion: "1.0.0"

# Search keywords for the chart.
keywords: [oci, amd, gpu, monitoring, prometheus]

maintainers:
  - name: OCI Lens Team
Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
# OCI Scanner Plugin Helm Chart
2+
3+
AMD GPU monitoring and health check solution for OCI compute instances.
4+
5+
## Components
6+
7+
- **Go Plugin**: Main metric collector
8+
- **Node Exporter**: System metrics
9+
- **AMD GPU Exporter**: GPU metrics
10+
- **Metrics Push Job**: Automated metrics forwarding
11+
- **Health Check**: GPU performance testing (optional)
12+
- **DRHPC**: DRHPC monitoring DaemonSet (optional; enable with `--set drhpc.enabled=true`)
13+
14+
## Configuration
15+
16+
```bash
17+
# Custom push gateway
18+
helm install oci-scanner-plugin ./oci-scanner-plugin-helm \
19+
--set global.pushGatewayUrl=http://my-pushgateway:9091/
20+
21+
# Enable health check
22+
helm install oci-scanner-plugin ./oci-scanner-plugin-helm \
23+
--set healthCheck.enabled=true
24+
25+
# Uninstall
26+
helm uninstall oci-scanner-plugin
27+
```
28+
29+
## Requirements
30+
31+
- Kubernetes cluster with AMD GPU nodes
32+
- Prometheus Pushgateway reachable from the cluster
33+
- AMD GPU drivers installed on nodes
Lines changed: 139 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,139 @@
{{- if .Values.amdGpuExporter.enabled }}
---
# AMD GPU exporter DaemonSet.
# Rendered only when .Values.amdGpuExporter.enabled is set. Pods are restricted
# to amd64 nodes, share the host network, and serve metrics on port 5000.
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: amd-gpu-exporter
  namespace: {{ .Values.namespace.name | quote }}
  labels:
    app: amd-gpu-exporter
    component: gpu-monitoring
spec:
  selector:
    matchLabels:
      app: amd-gpu-exporter
  template:
    metadata:
      labels:
        app: amd-gpu-exporter
        component: gpu-monitoring
      # Prometheus scrape hints; values are quoted because annotation values
      # must be strings.
      annotations:
        prometheus.io/scrape: "true"
        prometheus.io/port: "5000"
        prometheus.io/path: "/metrics"
    spec:
      nodeSelector:
        kubernetes.io/arch: amd64

      hostNetwork: true

      # Host paths exposed to the exporter container.
      volumes:
      - name: dri
        hostPath:
          path: /dev/dri
          type: Directory
      - name: kfd
        hostPath:
          path: /dev/kfd
          # /dev/kfd is a character device node, not a directory. With
          # `type: Directory` the hostPath type check fails and the pod
          # cannot start, so the type must be CharDevice.
          type: CharDevice
      - name: sys
        hostPath:
          path: /sys
          type: Directory
      - name: proc
        hostPath:
          path: /proc
          type: Directory
      - name: var-lib-dpkg
        hostPath:
          path: /var/lib/dpkg
          # NOTE(review): `Directory` requires this path to already exist on
          # the node; presumably nodes are Debian/Ubuntu based - confirm.
          type: Directory

      containers:
      - name: amd-gpu-exporter
        image: "{{ .Values.amdGpuExporter.image.repository }}:{{ .Values.amdGpuExporter.image.tag }}"
        imagePullPolicy: {{ .Values.amdGpuExporter.image.pullPolicy | quote }}
        ports:
        - name: metrics
          containerPort: 5000
          protocol: TCP

        # Device nodes are mounted read-write; host state is read-only.
        volumeMounts:
        - name: dri
          mountPath: /dev/dri
        - name: kfd
          mountPath: /dev/kfd
        - name: sys
          mountPath: /sys
          readOnly: true
        - name: proc
          mountPath: /proc
          readOnly: true
        - name: var-lib-dpkg
          mountPath: /var/lib/dpkg
          readOnly: true

        # NOTE(review): Kubernetes documents allowPrivilegeEscalation as always
        # true when a container has CAP_SYS_ADMIN, so the `false` below may
        # not take effect - confirm the intended hardening.
        # NOTE(review): with `privileged: false`, access to hostPath-mounted
        # device nodes may be blocked by the container's device cgroup -
        # verify the exporter can actually open /dev/kfd and /dev/dri.
        securityContext:
          privileged: false
          readOnlyRootFilesystem: false
          allowPrivilegeEscalation: false
          capabilities:
            add:
            - SYS_ADMIN
            - NET_ADMIN

        # Requests/limits come straight from values.
        resources:
          {{- toYaml .Values.amdGpuExporter.resources | nindent 10 }}

        livenessProbe:
          httpGet:
            path: /health
            port: 5000
          initialDelaySeconds: 60
          periodSeconds: 30
          timeoutSeconds: 10
          failureThreshold: 3

        readinessProbe:
          httpGet:
            path: /health
            port: 5000
          initialDelaySeconds: 30
          periodSeconds: 10
          timeoutSeconds: 5
          failureThreshold: 3

        env:
        - name: ROCR_VISIBLE_DEVICES
          value: "all"
        # NOTE(review): hard-coded GFX version override; confirm it matches
        # the GPUs present on the target nodes.
        - name: HSA_OVERRIDE_GFX_VERSION
          value: "11.0.0"

      # Tolerates only the exact taint amd.com/gpu=present:NoSchedule.
      tolerations:
      - key: amd.com/gpu
        operator: Equal
        value: "present"
        effect: NoSchedule

      restartPolicy: Always
      terminationGracePeriodSeconds: 30

---
# ClusterIP Service selecting the exporter pods on port 5000.
apiVersion: v1
kind: Service
metadata:
  name: amd-gpu-exporter
  namespace: {{ .Values.namespace.name | quote }}
  labels:
    app: amd-gpu-exporter
    component: gpu-monitoring
spec:
  selector:
    app: amd-gpu-exporter
  ports:
  - name: metrics
    port: 5000
    targetPort: 5000
    protocol: TCP
  type: ClusterIP
{{- end }}
Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
{{- if .Values.drhpc.enabled }}
---
# DRHPC DaemonSet.
# Rendered only when .Values.drhpc.enabled is set. Runs a privileged container
# with the host PID/IPC/network namespaces and the host root filesystem
# mounted at /host.
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: oci-lens-dr-hpc
  namespace: {{ .Values.namespace.name | quote }}
  labels:
    app: oci-lens-dr-hpc
    component: gpu-monitoring
spec:
  selector:
    matchLabels:
      app: oci-lens-dr-hpc
  template:
    metadata:
      labels:
        app: oci-lens-dr-hpc
    spec:
      nodeSelector:
        kubernetes.io/arch: amd64

      # Schedule only onto nodes labelled amd.com/gpu=true or amd.com/gpu=present.
      affinity:
        nodeAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
            nodeSelectorTerms:
            - matchExpressions:
              - key: amd.com/gpu
                operator: In
                values: ["true", "present"]

      # Tolerates any amd.com/gpu NoSchedule taint, whatever its value.
      tolerations:
      - key: amd.com/gpu
        operator: Exists
        effect: NoSchedule

      priorityClassName: system-node-critical
      terminationGracePeriodSeconds: 0

      volumes:
      - name: root
        hostPath:
          path: "/"

      hostPID: true
      hostIPC: true
      hostNetwork: true

      containers:
      - name: oke-dr-hpc-prod
        image: "{{ .Values.global.imageRegistry }}/{{ .Values.drhpc.image.repository }}:{{ .Values.drhpc.image.tag }}"
        imagePullPolicy: {{ .Values.drhpc.image.pullPolicy | quote }}

        securityContext:
          privileged: true
          capabilities:
            add: [SYS_ADMIN]

        volumeMounts:
        - name: root
          mountPath: /host

        # Requests/limits are read from .Values.drhpc.resources so they can be
        # tuned per cluster; an unset value renders `{}`, matching the
        # previous hard-coded empty resources.
        resources:
          {{- toYaml (.Values.drhpc.resources | default (dict)) | nindent 10 }}

        env:
        - name: PUSH_GATEWAY
          value: {{ .Values.global.pushGatewayUrl | quote }}
        - name: JOB_NAME
          value: "oci_lens_drhpc_metrics"
{{- end }}

0 commit comments

Comments
 (0)