File tree Expand file tree Collapse file tree 11 files changed +1044
-0
lines changed
oci-scanner-plugin-amd-helm Expand file tree Collapse file tree 11 files changed +1044
-0
# Helm chart metadata for the OCI Scanner Plugin (AMD GPU variant).
apiVersion: v2
name: oci-scanner-plugin
description: OCI Scanner Plugin for AMD GPU monitoring and health checks
type: application
# Chart version (SemVer); bump on every change to the chart itself.
version: 0.1.0
# Version of the packaged application; quoted so it always stays a string.
appVersion: "1.0.0"
keywords:
- oci
- amd
- gpu
- monitoring
- prometheus
home: https://github.com/oracle/oci-lens-quickstart
maintainers:
- name: OCI Lens Team

# OCI Scanner Plugin Helm Chart

AMD GPU monitoring and health check solution for OCI compute instances.

## Components

- **Go Plugin**: Main metric collector
- **Node Exporter**: System metrics
- **AMD GPU Exporter**: GPU metrics
- **Metrics Push Job**: Automated metrics forwarding
- **Health Check**: GPU performance testing (optional)
- **DRHPC**: DRHPC monitoring (optional)

## Configuration

```bash
# Custom push gateway
helm install oci-scanner-plugin ./oci-scanner-plugin-amd-helm \
  --set global.pushGatewayUrl=http://my-pushgateway:9091/

# Enable health check
helm install oci-scanner-plugin ./oci-scanner-plugin-amd-helm \
  --set healthCheck.enabled=true

# Uninstall
helm uninstall oci-scanner-plugin
```

## Requirements

- Kubernetes cluster with AMD GPU nodes
- Prometheus Push Gateway accessible from the cluster
- AMD GPU drivers installed on nodes
{{- /*
AMD GPU exporter: a DaemonSet that serves GPU metrics on port 5000, plus a
ClusterIP Service in front of it. Both objects are rendered only when
.Values.amdGpuExporter.enabled is true.
*/ -}}
{{- if .Values.amdGpuExporter.enabled }}
---
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: amd-gpu-exporter
  namespace: {{ .Values.namespace.name }}
  labels:
    app: amd-gpu-exporter
    component: gpu-monitoring
spec:
  selector:
    matchLabels:
      app: amd-gpu-exporter
  template:
    metadata:
      labels:
        app: amd-gpu-exporter
        component: gpu-monitoring
      annotations:
        # Annotation values must be strings, hence the quotes.
        prometheus.io/scrape: "true"
        prometheus.io/port: "5000"
        prometheus.io/path: "/metrics"
    spec:
      nodeSelector:
        kubernetes.io/arch: amd64

      # The exporter listens directly on the node's network namespace.
      hostNetwork: true

      volumes:
      - name: dri
        hostPath:
          path: /dev/dri
          type: Directory
      - name: kfd
        hostPath:
          path: /dev/kfd
          # /dev/kfd is a character device node, not a directory; declaring
          # it as Directory makes the kubelet's hostPath type check fail.
          type: CharDevice
      - name: sys
        hostPath:
          path: /sys
          type: Directory
      - name: proc
        hostPath:
          path: /proc
          type: Directory
      - name: var-lib-dpkg
        hostPath:
          # NOTE(review): this path only exists on dpkg-based hosts; with
          # type Directory the pod cannot start elsewhere - confirm node OS.
          path: /var/lib/dpkg
          type: Directory

      containers:
      - name: amd-gpu-exporter
        image: "{{ .Values.amdGpuExporter.image.repository }}:{{ .Values.amdGpuExporter.image.tag }}"
        imagePullPolicy: {{ .Values.amdGpuExporter.image.pullPolicy }}
        ports:
        - name: metrics
          containerPort: 5000
          protocol: TCP

        volumeMounts:
        - name: dri
          mountPath: /dev/dri
        - name: kfd
          mountPath: /dev/kfd
        - name: sys
          mountPath: /sys
          readOnly: true
        - name: proc
          mountPath: /proc
          readOnly: true
        - name: var-lib-dpkg
          mountPath: /var/lib/dpkg
          readOnly: true

        securityContext:
          privileged: false
          readOnlyRootFilesystem: false
          # NOTE(review): Kubernetes documents allowPrivilegeEscalation as
          # always true for containers holding CAP_SYS_ADMIN; confirm that
          # combining "false" here with SYS_ADMIN below is intended.
          allowPrivilegeEscalation: false
          capabilities:
            add:
            - SYS_ADMIN
            - NET_ADMIN

        resources:
          {{- toYaml .Values.amdGpuExporter.resources | nindent 10 }}

        livenessProbe:
          httpGet:
            path: /health
            port: 5000
          initialDelaySeconds: 60
          periodSeconds: 30
          timeoutSeconds: 10
          failureThreshold: 3

        readinessProbe:
          httpGet:
            path: /health
            port: 5000
          initialDelaySeconds: 30
          periodSeconds: 10
          timeoutSeconds: 5
          failureThreshold: 3

        env:
        - name: ROCR_VISIBLE_DEVICES
          value: "all"
        # NOTE(review): hard-coded GFX version override; confirm it matches
        # the GPU architecture of the target nodes.
        - name: HSA_OVERRIDE_GFX_VERSION
          value: "11.0.0"

      tolerations:
      - key: amd.com/gpu
        operator: Equal
        value: "present"
        effect: NoSchedule

      restartPolicy: Always
      terminationGracePeriodSeconds: 30

---
apiVersion: v1
kind: Service
metadata:
  name: amd-gpu-exporter
  namespace: {{ .Values.namespace.name }}
  labels:
    app: amd-gpu-exporter
    component: gpu-monitoring
spec:
  # Selects the exporter pods created by the DaemonSet above.
  selector:
    app: amd-gpu-exporter
  ports:
  - name: metrics
    port: 5000
    targetPort: 5000
    protocol: TCP
  type: ClusterIP
{{- end }}
{{- /*
DRHPC monitoring DaemonSet. Rendered only when .Values.drhpc.enabled is true.
The container is privileged, shares the host PID/IPC/network namespaces and
mounts the host root filesystem at /host.
*/ -}}
{{- if .Values.drhpc.enabled }}
---
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: oci-lens-dr-hpc
  namespace: {{ .Values.namespace.name }}
  labels:
    app: oci-lens-dr-hpc
    component: gpu-monitoring
spec:
  selector:
    matchLabels:
      app: oci-lens-dr-hpc
  template:
    metadata:
      labels:
        app: oci-lens-dr-hpc
    spec:
      nodeSelector:
        kubernetes.io/arch: amd64

      # Schedule only onto nodes labelled amd.com/gpu=true or =present.
      affinity:
        nodeAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
            nodeSelectorTerms:
            - matchExpressions:
              - key: amd.com/gpu
                operator: In
                values: ["true", "present"]

      # Tolerate the amd.com/gpu NoSchedule taint whatever its value.
      tolerations:
      - key: amd.com/gpu
        operator: Exists
        effect: NoSchedule

      priorityClassName: system-node-critical
      terminationGracePeriodSeconds: 0

      volumes:
      - name: root
        hostPath:
          path: "/"

      hostPID: true
      hostIPC: true
      hostNetwork: true

      containers:
      - name: oke-dr-hpc-prod
        image: "{{ .Values.global.imageRegistry }}/{{ .Values.drhpc.image.repository }}:{{ .Values.drhpc.image.tag }}"
        imagePullPolicy: {{ .Values.drhpc.image.pullPolicy }}

        securityContext:
          privileged: true
          capabilities:
            add: [SYS_ADMIN]

        volumeMounts:
        - name: root
          mountPath: /host

        # No requests or limits are set for this container.
        resources: {}

        env:
        # Passed through from the chart's global push gateway setting.
        - name: PUSH_GATEWAY
          value: {{ .Values.global.pushGatewayUrl | quote }}
        - name: JOB_NAME
          value: "oci_lens_drhpc_metrics"
{{- end }}
You can’t perform that action at this time.
0 commit comments