# Diff summary: 1 file changed, +53 -0 lines changed.
---
# DaemonSet "vf-config": on every node whose instance type is BM.GPU.H100.8,
# run a privileged pod that chroots into the host filesystem and resets the
# SR-IOV virtual-function count and MTU on the host's rdma* interfaces.
# After the one-time configuration the container idles so the DaemonSet pod
# stays Running instead of being restarted.
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: vf-config
  namespace: kube-system
spec:
  selector:
    matchLabels:
      app: vf-config
  template:
    metadata:
      labels:
        app: vf-config
    spec:
      priorityClassName: system-node-critical
      hostNetwork: true
      # A toleration with only `operator: Exists` matches every taint.
      tolerations:
        - operator: "Exists"
      # 0 = the container is killed immediately on pod deletion; raise this
      # if the script's signal trap should get a chance to run.
      terminationGracePeriodSeconds: 0
      nodeSelector:
        node.kubernetes.io/instance-type: BM.GPU.H100.8
      # The host's root filesystem, mounted below at /host for the chroot.
      volumes:
        - name: root
          hostPath:
            path: "/"
      containers:
        - name: vf-config
          image: oraclelinux:9
          imagePullPolicy: Always
          securityContext:
            privileged: true
            capabilities:
              # Kubernetes capability names omit the CAP_ prefix.
              add: [SYS_ADMIN]
          volumeMounts:
            - name: root
              mountPath: /host
          resources: {}
          command:
            - /usr/bin/bash
            - -c
            - |
              set -e -o pipefail
              # Record a stop request instead of dying mid-script. SIGTERM is
              # what container runtimes send on stop; SIGINT kept for parity.
              trap 'stop=1' SIGINT SIGTERM

              # Everything in the here-document runs on the host via chroot.
              # The delimiter is quoted, so the outer shell expands nothing.
              chroot /host /usr/bin/bash -ex <<'EOF'
              # Block until the oci-hpc-mlx-configure log reports completion.
              while true ; do
                grep "Fully Configured" /var/log/oracle-cloud-agent/plugins/oci-hpc/oci-hpc-configure/oci-hpc-mlx-configure.log && break
                sleep 15 ; continue ;
              done

              # Stop the agent while the VFs are being changed (best effort).
              snap stop oracle-cloud-agent || true
              # Drop the VF count to 0 on every rdma* interface.
              echo 0 | tee /sys/class/net/rdma*/device/sriov_numvfs || true
              sleep 30
              rdma system show
              # NOTE(review): oci-vf-config is a host-provided tool; presumably
              # it recreates the VFs - confirm against the host image.
              oci-vf-config; sleep 2
              echo 4220 | tee /sys/class/net/rdma*/mtu || true
              snap start oracle-cloud-agent || echo "Error restarting" >&2
              # Force-remove the last listed pod sandbox matching
              # "sriov-device" (presumably so it restarts and sees the new
              # VFs - confirm).
              crictl rmp -f "$(crictl pods | grep sriov-device | awk '{print $1}' | tail -1)" || true
              EOF

              # Idle until a trapped signal sets `stop`, then exit cleanly.
              while [[ -z ${stop:-} ]]; do sleep 1; done
# (GitHub page footer: "You can't perform that action at this time." / 0 commit comments)