Skip to content

Commit f118709

Browse files
Merge pull request #9 from OguzPastirmaci/main
Update docs for Oracle Linux 8
2 parents 56cebaf + f1456d8 commit f118709

21 files changed: +914 additions, −620 deletions

README.md

Lines changed: 269 additions & 5 deletions
Large diffs are not rendered by default.

docs/running-non-rdma-workloads-on-oke.md

Lines changed: 0 additions & 71 deletions
This file was deleted.

docs/running-rdma-workloads-on-oke.md

Lines changed: 214 additions & 142 deletions
Large diffs are not rendered by default.

manifests/a100-nccl-test.yaml

Lines changed: 141 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,141 @@
---
# manifests/a100-nccl-test.yaml
# Volcano Job that runs the NCCL all_reduce_perf benchmark: one mpimaster
# launcher pod plus two mpiworker pods, each requesting 8 GPUs and
# 16 SR-IOV RDMA VFs (16 MPI ranks total, 8 per node). Targets A100 nodes
# per the filename — TODO confirm against the shape docs.
apiVersion: batch.volcano.sh/v1alpha1
kind: Job
metadata:
  name: nccl-allreduce-job0
spec:
  minAvailable: 1
  schedulerName: volcano
  plugins:
    ssh: []  # distribute SSH keys so mpirun can reach the workers
    svc: []  # generate /etc/volcano/*.host host lists
  queue: default
  tasks:
    - replicas: 1
      name: mpimaster
      policies:
        # End the whole Job once the launcher task finishes.
        - event: TaskCompleted
          action: CompleteJob
      template:
        spec:
          volumes:
            - name: topo
              configMap:
                name: nccl-topology
                items:
                  - key: topo.xml
                    path: topo.xml
            - name: root
              hostPath:
                path: /
                type: Directory
          initContainers:
            # Block until all mpiworker pods exist and report Ready before
            # the launcher starts mpirun.
            - command:
                - /bin/bash
                - -c
                - |
                  until [[ "$(kubectl get pod -l volcano.sh/job-name=nccl-allreduce-job0,volcano.sh/task-spec=mpiworker -o json | jq '.items | length')" != 0 ]]; do
                    echo "Waiting for MPI worker pods..."
                    sleep 3
                  done
                  echo "Waiting for MPI worker pods to be ready..."
                  kubectl wait pod -l volcano.sh/job-name=nccl-allreduce-job0,volcano.sh/task-spec=mpiworker --for=condition=Ready --timeout=600s && sleep 2
              image: aga.ocir.io/hpc_limited_availability/oke/kubectl:latest
              name: wait-for-workers
              serviceAccount: mpi-worker-view
              terminationGracePeriodSeconds: 2
          containers:
            # Launcher: start sshd, then run the benchmark over UCX.
            # NOTE(review): MPI_HOST is computed but never used (mpirun reads
            # -hostfile directly). NOTE(review): NCCL_SOCKET_IFNAME==eth0
            # passes the value '=eth0' (NCCL exact-match prefix); the H100
            # manifest uses plain 'eth0' — confirm which is intended.
            - command:
                - /bin/bash
                - -c
                - |
                  MPI_HOST=$(cat /etc/volcano/mpiworker.host | tr "\n" ",")
                  mkdir -p /var/run/sshd; /usr/sbin/sshd
                  mpirun --allow-run-as-root \
                    -np 16 -npernode 8 --bind-to numa \
                    -hostfile /etc/volcano/mpiworker.host \
                    --mca pml ucx -mca coll ^hcoll \
                    -x HCOLL_ENABLE_MCAST_ALL=0 \
                    -x coll_hcoll_enable=0 \
                    -x UCX_NET_DEVICES=eth0 \
                    -x NCCL_CROSS_NIC=1 \
                    -x NCCL_IB_GID_INDEX=3 \
                    -x NCCL_SOCKET_IFNAME==eth0 \
                    -x NCCL_IB_QPS_PER_CONNECTION=4 \
                    -x NCCL_IB_TC=41 \
                    -x NCCL_IB_SL=0 \
                    -x NCCL_IB_HCA=mlx5 \
                    -x NCCL_TOPO_FILE=/topo/topo.xml \
                    /workspace/nccl-tests/build/all_reduce_perf -b 8 -f 2 -g 1 -e 8G -c 1; sleep 3600
              image: iad.ocir.io/hpc_limited_availability/nccl-tests:pytorch-23.10-nccl-2.19.3-1
              volumeMounts:
                - { mountPath: /topo, name: topo }
                - { mountPath: /host, name: root }
              securityContext:
                capabilities:
                  add: ["IPC_LOCK"]
              name: mpimaster
              ports:
                - containerPort: 22
                  name: mpijob-port
              workingDir: /workspace
              resources:
                requests:
                  cpu: 2
                  memory: 128Mi
                  ephemeral-storage: 16Gi
          restartPolicy: OnFailure
    - replicas: 2
      minAvailable: 2
      name: mpiworker
      template:
        metadata:
          annotations:
            # 16 secondary RDMA interfaces — one per requested sriov_rdma_vf.
            k8s.v1.cni.cncf.io/networks: oci-rdma-sriov,oci-rdma-sriov,oci-rdma-sriov,oci-rdma-sriov,oci-rdma-sriov,oci-rdma-sriov,oci-rdma-sriov,oci-rdma-sriov,oci-rdma-sriov,oci-rdma-sriov,oci-rdma-sriov,oci-rdma-sriov,oci-rdma-sriov,oci-rdma-sriov,oci-rdma-sriov,oci-rdma-sriov
        spec:
          containers:
            # Worker: only runs sshd in the foreground; mpirun on the master
            # starts the actual benchmark ranks over SSH.
            - name: mpiworker
              command:
                - /bin/bash
                - -c
                - mkdir -p /var/run/sshd; /usr/sbin/sshd -D;
              image: iad.ocir.io/hpc_limited_availability/nccl-tests:pytorch-23.10-nccl-2.19.3-1
              securityContext:
                capabilities:
                  add: ["IPC_LOCK"]
              ports:
                - containerPort: 22
                  name: mpijob-port
              workingDir: /workspace
              resources:
                requests:
                  nvidia.com/gpu: 8
                  nvidia.com/sriov_rdma_vf: 16
                  ephemeral-storage: 1Gi
                limits:
                  nvidia.com/gpu: 8
                  nvidia.com/sriov_rdma_vf: 16
                  ephemeral-storage: 1Gi
              volumeMounts:
                - { mountPath: /topo, name: topo }
                - mountPath: /dev/shm
                  name: shm
          restartPolicy: OnFailure
          terminationGracePeriodSeconds: 15
          tolerations:
            - key: nvidia.com/gpu
              operator: Exists
          volumes:
            - name: topo
              configMap:
                name: nccl-topology
                items:
                  - key: topo.xml
                    path: topo.xml
            - name: root
              hostPath:
                path: /
                type: Directory
            - name: shm
              emptyDir:
                medium: Memory
                sizeLimit: 8Gi

manifests/h100-nccl-test.yaml

Lines changed: 150 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,150 @@
---
# manifests/h100-nccl-test.yaml
# Volcano Job that runs the NCCL all_reduce_perf benchmark: one mpimaster
# launcher pod plus two mpiworker pods, each requesting 8 GPUs and
# 16 SR-IOV RDMA VFs (16 MPI ranks total, 8 per node). Targets H100 nodes
# per the filename — TODO confirm against the shape docs.
apiVersion: batch.volcano.sh/v1alpha1
kind: Job
metadata:
  name: nccl-allreduce-job0
spec:
  minAvailable: 1
  schedulerName: volcano
  plugins:
    ssh: []  # distribute SSH keys so mpirun can reach the workers
    svc: []  # generate /etc/volcano/*.host host lists
  queue: default
  tasks:
    - replicas: 1
      name: mpimaster
      policies:
        # End the whole Job once the launcher task finishes.
        - event: TaskCompleted
          action: CompleteJob
      template:
        spec:
          volumes:
            - name: topo
              configMap:
                name: nccl-topology
                items:
                  - key: topo.xml
                    path: topo.xml
            - name: root
              hostPath:
                path: /
                type: Directory
          initContainers:
            # Block until all mpiworker pods exist and report Ready before
            # the launcher starts mpirun.
            - command:
                - /bin/bash
                - -c
                - |
                  until [[ "$(kubectl get pod -l volcano.sh/job-name=nccl-allreduce-job0,volcano.sh/task-spec=mpiworker -o json | jq '.items | length')" != 0 ]]; do
                    echo "Waiting for MPI worker pods..."
                    sleep 3
                  done
                  echo "Waiting for MPI worker pods to be ready..."
                  kubectl wait pod -l volcano.sh/job-name=nccl-allreduce-job0,volcano.sh/task-spec=mpiworker --for=condition=Ready --timeout=600s && sleep 2
              image: aga.ocir.io/hpc_limited_availability/oke/kubectl:latest
              name: wait-for-workers
              serviceAccount: mpi-worker-view
              terminationGracePeriodSeconds: 2
          containers:
            # Launcher: start sshd, then run the benchmark.
            # NOTE(review): MPI_HOST is computed but never used (mpirun reads
            # -hostfile directly).
            - command:
                - /bin/bash
                - -c
                - |
                  MPI_HOST=$(cat /etc/volcano/mpiworker.host | tr "\n" ",")
                  mkdir -p /var/run/sshd; /usr/sbin/sshd
                  mpirun --allow-run-as-root \
                    -np 16 -npernode 8 --bind-to numa \
                    -hostfile /etc/volcano/mpiworker.host \
                    -x NCCL_CROSS_NIC=1 \
                    -x NCCL_SOCKET_NTHREADS=16 \
                    -x NCCL_DEBUG=WARN \
                    -x NCCL_CUMEM_ENABLE=0 \
                    -x NCCL_IB_SPLIT_DATA_ON_QPS=0 \
                    -x NCCL_IB_QPS_PER_CONNECTION=16 \
                    -x NCCL_IB_GID_INDEX=3 \
                    -x NCCL_IB_TC=41 \
                    -x NCCL_IB_SL=0 \
                    -x NCCL_IB_TIMEOUT=22 \
                    -x NCCL_NET_PLUGIN=none \
                    -x HCOLL_ENABLE_MCAST_ALL=0 \
                    -x coll_hcoll_enable=0 \
                    -x UCX_TLS=tcp \
                    -x UCX_NET_DEVICES=eth0 \
                    -x RX_QUEUE_LEN=8192 \
                    -x IB_RX_QUEUE_LEN=8192 \
                    -x NCCL_SOCKET_IFNAME=eth0 \
                    -x NCCL_IGNORE_CPU_AFFINITY=1 \
                    -x NCCL_TOPO_FILE=/topo/topo.xml \
                    -mca coll_hcoll_enable 0 -mca coll ^hcoll \
                    /workspace/nccl-tests/build/all_reduce_perf -b 8 -f 2 -g 1 -e 8G -c 1; sleep 3600
              image: iad.ocir.io/hpc_limited_availability/nccl-tests:pytorch-23.10-nccl-2.19.3-1
              volumeMounts:
                - { mountPath: /topo, name: topo }
                - { mountPath: /host, name: root }
              securityContext:
                capabilities:
                  add: ["IPC_LOCK"]
              name: mpimaster
              ports:
                - containerPort: 22
                  name: mpijob-port
              workingDir: /workspace
              resources:
                requests:
                  cpu: 2
                  memory: 128Mi
                  ephemeral-storage: 16Gi
          restartPolicy: OnFailure
    - replicas: 2
      minAvailable: 2
      name: mpiworker
      template:
        metadata:
          annotations:
            # 16 secondary RDMA interfaces — one per requested sriov_rdma_vf.
            k8s.v1.cni.cncf.io/networks: oci-rdma-sriov,oci-rdma-sriov,oci-rdma-sriov,oci-rdma-sriov,oci-rdma-sriov,oci-rdma-sriov,oci-rdma-sriov,oci-rdma-sriov,oci-rdma-sriov,oci-rdma-sriov,oci-rdma-sriov,oci-rdma-sriov,oci-rdma-sriov,oci-rdma-sriov,oci-rdma-sriov,oci-rdma-sriov
        spec:
          containers:
            # Worker: only runs sshd in the foreground; mpirun on the master
            # starts the actual benchmark ranks over SSH.
            - name: mpiworker
              command:
                - /bin/bash
                - -c
                - mkdir -p /var/run/sshd; /usr/sbin/sshd -D;
              image: iad.ocir.io/hpc_limited_availability/nccl-tests:pytorch-23.10-nccl-2.19.3-1
              securityContext:
                capabilities:
                  add: ["IPC_LOCK"]
              ports:
                - containerPort: 22
                  name: mpijob-port
              workingDir: /workspace
              resources:
                requests:
                  nvidia.com/gpu: 8
                  nvidia.com/sriov_rdma_vf: 16
                  ephemeral-storage: 1Gi
                limits:
                  nvidia.com/gpu: 8
                  nvidia.com/sriov_rdma_vf: 16
                  ephemeral-storage: 1Gi
              volumeMounts:
                - { mountPath: /topo, name: topo }
                - mountPath: /dev/shm
                  name: shm
          restartPolicy: OnFailure
          terminationGracePeriodSeconds: 15
          tolerations:
            - key: nvidia.com/gpu
              operator: Exists
          volumes:
            - name: topo
              configMap:
                name: nccl-topology
                items:
                  - key: topo.xml
                    path: topo.xml
            - name: root
              hostPath:
                path: /
                type: Directory
            - name: shm
              emptyDir:
                medium: Memory
                sizeLimit: 8Gi

manifests/ip-pool.yaml

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
---
# manifests/ip-pool.yaml
# NVIDIA nv-ipam IPPool: each node is assigned a block of 100 addresses
# carved out of 192.168.0.0/16, all using gateway 192.168.0.1.
apiVersion: nv-ipam.nvidia.com/v1alpha1
kind: IPPool
metadata:
  name: default
  namespace: network-operator
spec:
  subnet: 192.168.0.0/16
  perNodeBlockSize: 100
  gateway: 192.168.0.1

0 commit comments

Comments (0)