Skip to content

Commit a872311

Browse files
committed
AMD Helm cleanup and adding code for pod mapper
Signed-off-by: Ritika Gupta <[email protected]>
1 parent 9d2ec89 commit a872311

File tree

12 files changed

+202
-163
lines changed

12 files changed

+202
-163
lines changed

oci-scanner-plugin-amd-helm/Chart.yaml

Lines changed: 9 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,5 @@
11
apiVersion: v2
2-
name: oci-scanner-plugin
2+
name: oci-gpu-scanner-plugin
33
description: OCI Scanner Plugin for AMD GPU monitoring and health checks
44
type: application
55
version: 0.1.0
@@ -13,4 +13,11 @@ keywords:
1313
home: https://github.com/oracle/oci-lens-quickstart
1414
maintainers:
1515
- name: OCI Lens Team
16-
16+
17+
18+
dependencies:
19+
- name: prometheus-node-exporter
20+
version: 4.40.0
21+
repository: https://prometheus-community.github.io/helm-charts
22+
condition: nodeExporter.enabled
23+
namespace: oci-gpu-scanner-plugin

oci-scanner-plugin-amd-helm/README.md

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -14,16 +14,19 @@ AMD GPU monitoring and health check solution for OCI compute instances.
1414
## Configuration
1515

1616
```bash
17-
# Custom push gateway
18-
helm install oci-scanner-plugin ./oci-scanner-plugin-helm \
19-
--set global.pushGatewayUrl=http://my-pushgateway:9091/
17+
helm dependency build
18+
helm dependency update
19+
20+
helm install oci-gpu-scanner-plugin . -f values.yaml -n oci-gpu-scanner-plugin \
21+
--set global.pushGatewayUrl="<your-push-gateway-url>" \
22+
--create-namespace
2023

2124
# Enable health check
22-
helm install oci-scanner-plugin ./oci-scanner-plugin-helm \
25+
helm install oci-gpu-scanner-plugin ./oci-scanner-plugin-amd-helm \
2326
--set healthCheck.enabled=true
2427

2528
# Uninstall
26-
helm uninstall oci-scanner-plugin
29+
helm uninstall oci-gpu-scanner-plugin -n oci-gpu-scanner-plugin
2730
```
2831

2932
## Requirements

oci-scanner-plugin-amd-helm/templates/amd-gpu-exporter.yaml

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -25,17 +25,20 @@ spec:
2525
nodeSelector:
2626
kubernetes.io/arch: amd64
2727

28-
hostNetwork: true
28+
affinity:
29+
nodeAffinity:
30+
requiredDuringSchedulingIgnoredDuringExecution:
31+
nodeSelectorTerms:
32+
- matchExpressions:
33+
- key: amd.com/gpu
34+
operator: In
35+
values: ["true", "present"]
2936

3037
volumes:
3138
- name: dri
3239
hostPath:
3340
path: /dev/dri
3441
type: Directory
35-
- name: kfd
36-
hostPath:
37-
path: /dev/kfd
38-
type: Directory
3942
- name: sys
4043
hostPath:
4144
path: /sys
@@ -61,8 +64,6 @@ spec:
6164
volumeMounts:
6265
- name: dri
6366
mountPath: /dev/dri
64-
- name: kfd
65-
mountPath: /dev/kfd
6667
- name: sys
6768
mountPath: /sys
6869
readOnly: true
@@ -111,8 +112,7 @@ spec:
111112

112113
tolerations:
113114
- key: amd.com/gpu
114-
operator: Equal
115-
value: "present"
115+
operator: Exists
116116
effect: NoSchedule
117117

118118
restartPolicy: Always

oci-scanner-plugin-amd-helm/templates/drhpc.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,7 @@ spec:
4848

4949
containers:
5050
- name: oke-dr-hpc-prod
51-
image: "{{ .Values.global.imageRegistry }}/{{ .Values.drhpc.image.repository }}:{{ .Values.drhpc.image.tag }}"
51+
image: "{{ .Values.global.ociImageRegistry }}/{{ .Values.drhpc.image.repository }}:{{ .Values.drhpc.image.tag }}"
5252
imagePullPolicy: {{ .Values.drhpc.image.pullPolicy }}
5353

5454
securityContext:

oci-scanner-plugin-amd-helm/templates/go-plugin.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -73,7 +73,7 @@ spec:
7373

7474
containers:
7575
- name: oci-lens-plugin
76-
image: "{{ .Values.global.imageRegistry }}/{{ .Values.goPlugin.image.repository }}:{{ .Values.goPlugin.image.tag }}"
76+
image: "{{ .Values.global.ociImageRegistry }}/{{ .Values.goPlugin.image.repository }}:{{ .Values.goPlugin.image.tag }}"
7777
imagePullPolicy: {{ .Values.goPlugin.image.pullPolicy }}
7878

7979
env:

oci-scanner-plugin-amd-helm/templates/health-check.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,7 @@ spec:
4848

4949
containers:
5050
- name: amd-gpu-healthcheck
51-
image: "{{ .Values.global.imageRegistry }}/{{ .Values.healthCheck.image.repository }}:{{ .Values.healthCheck.image.tag }}"
51+
image: "{{ .Values.global.ociImageRegistry }}/{{ .Values.healthCheck.image.repository }}:{{ .Values.healthCheck.image.tag }}"
5252
imagePullPolicy: {{ .Values.healthCheck.image.pullPolicy }}
5353

5454
securityContext:

oci-scanner-plugin-amd-helm/templates/metrics-push-job.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -74,7 +74,7 @@ spec:
7474
return 1
7575
fi
7676
77-
local push_url="${PUSHGATEWAY_URL}/job/${job_name}/instance/${instance_name}"
77+
local push_url="${PUSHGATEWAY_URL%/}/metrics/job/${job_name}/instance/${instance_name}"
7878
log "Pushing $job_name metrics to: $push_url"
7979
8080
if echo "$metrics" | curl -k --connect-timeout 10 --max-time 30 --data-binary @- "$push_url" 2>/dev/null; then

oci-scanner-plugin-amd-helm/templates/namespace.yaml

Lines changed: 0 additions & 9 deletions
This file was deleted.

oci-scanner-plugin-amd-helm/templates/node-exporter.yaml

Lines changed: 0 additions & 125 deletions
This file was deleted.
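(New file; its path was not captured on this page. The content below is the Helm template for the pod-node-mapper Deployment.)

Lines changed: 82 additions & 0 deletions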
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,82 @@
1+
{{- if .Values.podNodeMapper.enabled }}
2+
---
3+
apiVersion: apps/v1
4+
kind: Deployment
5+
metadata:
6+
name: pod-node-mapper
7+
namespace: {{ .Values.namespace.name }}
8+
labels:
9+
app: pod-node-mapper
10+
component: monitoring
11+
spec:
12+
replicas: {{ .Values.podNodeMapper.replicas }}
13+
selector:
14+
matchLabels:
15+
app: pod-node-mapper
16+
template:
17+
metadata:
18+
labels:
19+
app: pod-node-mapper
20+
component: monitoring
21+
spec:
22+
serviceAccountName: pod-node-mapper-sa
23+
24+
nodeSelector:
25+
kubernetes.io/arch: amd64
26+
27+
affinity:
28+
nodeAffinity:
29+
requiredDuringSchedulingIgnoredDuringExecution:
30+
nodeSelectorTerms:
31+
- matchExpressions:
32+
- key: nvidia.com/gpu
33+
operator: DoesNotExist
34+
- key: amd.com/gpu
35+
operator: DoesNotExist
36+
37+
tolerations:
38+
- key: "node-role.kubernetes.io/control-plane"
39+
operator: "Exists"
40+
effect: "NoSchedule"
41+
- key: "node-role.kubernetes.io/master"
42+
operator: "Exists"
43+
effect: "NoSchedule"
44+
45+
containers:
46+
- name: pod-node-mapper
47+
image: "{{ .Values.global.ociImageRegistry }}/{{ .Values.podNodeMapper.image.repository }}:{{ .Values.podNodeMapper.image.tag }}"
48+
imagePullPolicy: {{ .Values.podNodeMapper.image.pullPolicy }}
49+
50+
env:
51+
- name: PUSH_GATEWAY
52+
value: {{ .Values.global.pushGatewayUrl | quote }}
53+
- name: JOB_NAME
54+
value: {{ .Values.podNodeMapper.jobName | quote }}
55+
- name: CLUSTER_NAME
56+
value: {{ .Values.podNodeMapper.clusterName | quote }}
57+
58+
resources:
59+
{{- toYaml .Values.podNodeMapper.resources | nindent 10 }}
60+
61+
livenessProbe:
62+
exec:
63+
command:
64+
- pgrep
65+
- -f
66+
- python3
67+
initialDelaySeconds: 30
68+
periodSeconds: 30
69+
timeoutSeconds: 5
70+
failureThreshold: 3
71+
72+
readinessProbe:
73+
exec:
74+
command:
75+
- pgrep
76+
- -f
77+
- python3
78+
initialDelaySeconds: 10
79+
periodSeconds: 10
80+
timeoutSeconds: 5
81+
failureThreshold: 3
82+
{{- end }}

0 commit comments

Comments
 (0)