File tree Expand file tree Collapse file tree 12 files changed +202
-163
lines changed
oci-scanner-plugin-amd-helm Expand file tree Collapse file tree 12 files changed +202
-163
lines changed Original file line number Diff line number Diff line change 11apiVersion : v2
2- name : oci-scanner-plugin
2+ name : oci-gpu-scanner-plugin
33description : OCI Scanner Plugin for AMD GPU monitoring and health checks
44type : application
55version : 0.1.0
@@ -13,4 +13,11 @@ keywords:
1313home : https://github.com/oracle/oci-lens-quickstart
1414maintainers :
1515 - name : OCI Lens Team
16- 16+ 17+
18+ dependencies :
19+ - name : prometheus-node-exporter
20+ version : 4.40.0
21+ repository : https://prometheus-community.github.io/helm-charts
22+ condition : nodeExporter.enabled
23+ namespace : oci-gpu-scanner-plugin
Original file line number Diff line number Diff line change @@ -14,16 +14,19 @@ AMD GPU monitoring and health check solution for OCI compute instances.
1414## Configuration
1515
1616``` bash
17- # Custom push gateway
18- helm install oci-scanner-plugin ./oci-scanner-plugin-helm \
19- --set global.pushGatewayUrl=http://my-pushgateway:9091/
17+ helm dependency build
18+ helm dependency update
19+
20+ helm install oci-gpu-scanner-plugin . -f values.yaml -n oci-gpu-scanner-plugin \
21+ --set global.pushGatewayUrl="<your-push-gateway-url>" \
22+ --create-namespace
2023
2124# Enable health check
22- helm install oci-scanner-plugin ./oci-scanner-plugin-helm \
25+ helm install oci-gpu-scanner-plugin ./oci-scanner-plugin-amd-helm \
2326 --set healthCheck.enabled=true
2427
2528# Uninstall
26- helm uninstall oci-scanner-plugin
29+ helm uninstall oci-gpu-scanner-plugin -n oci-gpu-scanner-plugin
2730```
2831
2932## Requirements
Original file line number Diff line number Diff line change @@ -25,17 +25,20 @@ spec:
2525 nodeSelector :
2626 kubernetes.io/arch : amd64
2727
28- hostNetwork : true
28+ affinity :
29+ nodeAffinity :
30+ requiredDuringSchedulingIgnoredDuringExecution :
31+ nodeSelectorTerms :
32+ - matchExpressions :
33+ - key : amd.com/gpu
34+ operator : In
35+ values : ["true", "present"]
2936
3037 volumes :
3138 - name : dri
3239 hostPath :
3340 path : /dev/dri
3441 type : Directory
35- - name : kfd
36- hostPath :
37- path : /dev/kfd
38- type : Directory
3942 - name : sys
4043 hostPath :
4144 path : /sys
6164 volumeMounts :
6265 - name : dri
6366 mountPath : /dev/dri
64- - name : kfd
65- mountPath : /dev/kfd
6667 - name : sys
6768 mountPath : /sys
6869 readOnly : true
@@ -111,8 +112,7 @@ spec:
111112
112113 tolerations :
113114 - key : amd.com/gpu
114- operator : Equal
115- value : "present"
115+ operator : Exists
116116 effect : NoSchedule
117117
118118 restartPolicy : Always
Original file line number Diff line number Diff line change 4848
4949 containers :
5050 - name : oke-dr-hpc-prod
51- image : "{{ .Values.global.imageRegistry }}/{{ .Values.drhpc.image.repository }}:{{ .Values.drhpc.image.tag }}"
51+ image : "{{ .Values.global.ociImageRegistry }}/{{ .Values.drhpc.image.repository }}:{{ .Values.drhpc.image.tag }}"
5252 imagePullPolicy : {{ .Values.drhpc.image.pullPolicy }}
5353
5454 securityContext :
Original file line number Diff line number Diff line change 7373
7474 containers :
7575 - name : oci-lens-plugin
76- image : "{{ .Values.global.imageRegistry }}/{{ .Values.goPlugin.image.repository }}:{{ .Values.goPlugin.image.tag }}"
76+ image : "{{ .Values.global.ociImageRegistry }}/{{ .Values.goPlugin.image.repository }}:{{ .Values.goPlugin.image.tag }}"
7777 imagePullPolicy : {{ .Values.goPlugin.image.pullPolicy }}
7878
7979 env :
Original file line number Diff line number Diff line change 4848
4949 containers :
5050 - name : amd-gpu-healthcheck
51- image : "{{ .Values.global.imageRegistry }}/{{ .Values.healthCheck.image.repository }}:{{ .Values.healthCheck.image.tag }}"
51+ image : "{{ .Values.global.ociImageRegistry }}/{{ .Values.healthCheck.image.repository }}:{{ .Values.healthCheck.image.tag }}"
5252 imagePullPolicy : {{ .Values.healthCheck.image.pullPolicy }}
5353
5454 securityContext :
Original file line number Diff line number Diff line change 7474 return 1
7575 fi
7676
77- local push_url="${PUSHGATEWAY_URL}/job/${job_name}/instance/${instance_name}"
77+ local push_url="${PUSHGATEWAY_URL%/}/metrics/job/${job_name}/instance/${instance_name}"
7878 log "Pushing $job_name metrics to: $push_url"
7979
8080 if echo "$metrics" | curl -k --connect-timeout 10 --max-time 30 --data-binary @- "$push_url" 2>/dev/null; then
Load Diff This file was deleted.
Load Diff This file was deleted.
Original file line number Diff line number Diff line change 1+ {{- if .Values.podNodeMapper.enabled }}
2+ ---
3+ apiVersion : apps/v1
4+ kind : Deployment
5+ metadata :
6+ name : pod-node-mapper
7+ namespace : {{ .Values.namespace.name }}
8+ labels :
9+ app : pod-node-mapper
10+ component : monitoring
11+ spec :
12+ replicas : {{ .Values.podNodeMapper.replicas }}
13+ selector :
14+ matchLabels :
15+ app : pod-node-mapper
16+ template :
17+ metadata :
18+ labels :
19+ app : pod-node-mapper
20+ component : monitoring
21+ spec :
22+ serviceAccountName : pod-node-mapper-sa
23+
24+ nodeSelector :
25+ kubernetes.io/arch : amd64
26+
27+ affinity :
28+ nodeAffinity :
29+ requiredDuringSchedulingIgnoredDuringExecution :
30+ nodeSelectorTerms :
31+ - matchExpressions :
32+ - key : nvidia.com/gpu
33+ operator : DoesNotExist
34+ - key : amd.com/gpu
35+ operator : DoesNotExist
36+
37+ tolerations :
38+ - key : "node-role.kubernetes.io/control-plane"
39+ operator : "Exists"
40+ effect : "NoSchedule"
41+ - key : "node-role.kubernetes.io/master"
42+ operator : "Exists"
43+ effect : "NoSchedule"
44+
45+ containers :
46+ - name : pod-node-mapper
47+ image : "{{ .Values.global.ociImageRegistry }}/{{ .Values.podNodeMapper.image.repository }}:{{ .Values.podNodeMapper.image.tag }}"
48+ imagePullPolicy : {{ .Values.podNodeMapper.image.pullPolicy }}
49+
50+ env :
51+ - name : PUSH_GATEWAY
52+ value : {{ .Values.global.pushGatewayUrl | quote }}
53+ - name : JOB_NAME
54+ value : {{ .Values.podNodeMapper.jobName | quote }}
55+ - name : CLUSTER_NAME
56+ value : {{ .Values.podNodeMapper.clusterName | quote }}
57+
58+ resources :
59+ {{- toYaml .Values.podNodeMapper.resources | nindent 10 }}
60+
61+ livenessProbe :
62+ exec :
63+ command :
64+ - pgrep
65+ - -f
66+ - python3
67+ initialDelaySeconds : 30
68+ periodSeconds : 30
69+ timeoutSeconds : 5
70+ failureThreshold : 3
71+
72+ readinessProbe :
73+ exec :
74+ command :
75+ - pgrep
76+ - -f
77+ - python3
78+ initialDelaySeconds : 10
79+ periodSeconds : 10
80+ timeoutSeconds : 5
81+ failureThreshold : 3
82+ {{- end }}
You can’t perform that action at this time.
0 commit comments