Skip to content

Commit 63edb70

Browse files
Create vf-config.yaml
1 parent 75c48b0 commit 63edb70

File tree

1 file changed

+53
-0
lines changed

1 file changed

+53
-0
lines changed

manifests/vf-config.yaml

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
---
# DaemonSet that runs a one-shot host-level network setup script on every
# node whose instance type is BM.GPU.H100.8, then idles so the pod stays
# Running. The container chroots into the node's root filesystem (mounted at
# /host) and therefore acts directly on the host, not on the container image.
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: vf-config
  namespace: kube-system
spec:
  selector:
    matchLabels:
      app: vf-config
  template:
    metadata:
      labels:
        app: vf-config
    spec:
      priorityClassName: system-node-critical
      hostNetwork: true
      # Tolerate every taint so the pod is scheduled on all matching nodes.
      tolerations:
        - operator: "Exists"
      terminationGracePeriodSeconds: 0
      nodeSelector:
        node.kubernetes.io/instance-type: BM.GPU.H100.8
      volumes:
        # The node's entire root filesystem; used as the chroot target below.
        - name: root
          hostPath:
            path: "/"
      containers:
        - name: vf-config
          image: oraclelinux:9
          imagePullPolicy: Always
          securityContext:
            privileged: true
            capabilities:
              # Kubernetes expects capability names without the CAP_ prefix.
              # Redundant with privileged: true, kept for explicitness.
              add: [SYS_ADMIN]
          volumeMounts:
            - name: root
              mountPath: /host
          resources: {}
          command:
            - /usr/bin/bash
            - -c
            - |
              # Abort on the first failing command. Remember a stop request
              # instead of dying mid-script. The kubelet stops containers with
              # SIGTERM by default, so trap it alongside SIGINT.
              # NOTE(review): with terminationGracePeriodSeconds: 0 the pod may
              # be killed before this handler ever runs - confirm intent.
              set -e -o pipefail; trap 'stop_requested=1' SIGINT SIGTERM

              # Everything up to EOF runs inside the host root filesystem.
              # The heredoc is unquoted, so a dollar sign meant for the inner
              # shell must stay backslash-escaped.
              chroot /host /usr/bin/bash -ex <<EOF
                # Poll every 15 seconds until the configure log contains the
                # "Fully Configured" marker.
                while true ; do
                  grep "Fully Configured" /var/log/oracle-cloud-agent/plugins/oci-hpc/oci-hpc-configure/oci-hpc-mlx-configure.log && break
                  sleep 15 ; continue ;
                done

                # Stop the cloud agent while the interfaces are reconfigured.
                snap stop oracle-cloud-agent || true
                # Write 0 to sriov_numvfs on every rdma* interface (best effort).
                echo 0 | tee /sys/class/net/rdma*/device/sriov_numvfs || true
                sleep 30
                rdma system show
                # NOTE(review): oci-vf-config is a host-provided tool, presumably
                # it recreates the virtual functions - confirm.
                oci-vf-config; sleep 2
                # Set the MTU to 4220 on every rdma* interface (best effort).
                echo 4220 | tee /sys/class/net/rdma*/mtu || true
                snap start oracle-cloud-agent || echo "Error restarting" >&2
                # Force-remove the last listed pod sandbox whose crictl line
                # matches sriov-device; errors (including no match) are ignored.
                crictl rmp -f "\$(crictl pods | grep sriov-device | awk '{print \$1}' | tail -1)" || true
              EOF

              # Idle so the pod stays Running; leave the loop (and exit 0) once
              # a trapped signal has set the flag.
              until [[ $stop_requested ]]; do sleep 1; done

0 commit comments

Comments
 (0)