Skip to content

Commit f118709

Browse files
Merge pull request #9 from OguzPastirmaci/main
Update docs for Oracle Linux 8
2 parents 56cebaf + f1456d8 commit f118709

21 files changed: +914 additions, −620 deletions

README.md

Lines changed: 269 additions & 5 deletions
Large diffs are not rendered by default.

docs/running-non-rdma-workloads-on-oke.md

Lines changed: 0 additions & 71 deletions
This file was deleted.

docs/running-rdma-workloads-on-oke.md

Lines changed: 214 additions & 142 deletions
Large diffs are not rendered by default.

manifests/a100-nccl-test.yaml

Lines changed: 141 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,141 @@
---
# manifests/a100-nccl-test.yaml
# Volcano Job that runs the NCCL all_reduce_perf benchmark: one mpimaster
# launcher pod plus two mpiworker pods, each requesting 8 GPUs and
# 16 SR-IOV RDMA VFs (16 MPI ranks total, 8 per node). Targets A100 nodes
# per the filename — TODO confirm against the shape docs.
apiVersion: batch.volcano.sh/v1alpha1
kind: Job
metadata:
  name: nccl-allreduce-job0
spec:
  minAvailable: 1
  schedulerName: volcano
  plugins:
    ssh: []  # distribute SSH keys so mpirun can reach the workers
    svc: []  # generate /etc/volcano/*.host host lists
  queue: default
  tasks:
    - replicas: 1
      name: mpimaster
      policies:
        # End the whole Job once the launcher task finishes.
        - event: TaskCompleted
          action: CompleteJob
      template:
        spec:
          volumes:
            - name: topo
              configMap:
                name: nccl-topology
                items:
                  - key: topo.xml
                    path: topo.xml
            - name: root
              hostPath:
                path: /
                type: Directory
          initContainers:
            # Block until all mpiworker pods exist and report Ready before
            # the launcher starts mpirun.
            - command:
                - /bin/bash
                - -c
                - |
                  until [[ "$(kubectl get pod -l volcano.sh/job-name=nccl-allreduce-job0,volcano.sh/task-spec=mpiworker -o json | jq '.items | length')" != 0 ]]; do
                    echo "Waiting for MPI worker pods..."
                    sleep 3
                  done
                  echo "Waiting for MPI worker pods to be ready..."
                  kubectl wait pod -l volcano.sh/job-name=nccl-allreduce-job0,volcano.sh/task-spec=mpiworker --for=condition=Ready --timeout=600s && sleep 2
              image: aga.ocir.io/hpc_limited_availability/oke/kubectl:latest
              name: wait-for-workers
              serviceAccount: mpi-worker-view
              terminationGracePeriodSeconds: 2
          containers:
            # Launcher: start sshd, then run the benchmark over UCX.
            # NOTE(review): MPI_HOST is computed but never used (mpirun reads
            # -hostfile directly). NOTE(review): NCCL_SOCKET_IFNAME==eth0
            # passes the value '=eth0' (NCCL exact-match prefix); the H100
            # manifest uses plain 'eth0' — confirm which is intended.
            - command:
                - /bin/bash
                - -c
                - |
                  MPI_HOST=$(cat /etc/volcano/mpiworker.host | tr "\n" ",")
                  mkdir -p /var/run/sshd; /usr/sbin/sshd
                  mpirun --allow-run-as-root \
                    -np 16 -npernode 8 --bind-to numa \
                    -hostfile /etc/volcano/mpiworker.host \
                    --mca pml ucx -mca coll ^hcoll \
                    -x HCOLL_ENABLE_MCAST_ALL=0 \
                    -x coll_hcoll_enable=0 \
                    -x UCX_NET_DEVICES=eth0 \
                    -x NCCL_CROSS_NIC=1 \
                    -x NCCL_IB_GID_INDEX=3 \
                    -x NCCL_SOCKET_IFNAME==eth0 \
                    -x NCCL_IB_QPS_PER_CONNECTION=4 \
                    -x NCCL_IB_TC=41 \
                    -x NCCL_IB_SL=0 \
                    -x NCCL_IB_HCA=mlx5 \
                    -x NCCL_TOPO_FILE=/topo/topo.xml \
                    /workspace/nccl-tests/build/all_reduce_perf -b 8 -f 2 -g 1 -e 8G -c 1; sleep 3600
              image: iad.ocir.io/hpc_limited_availability/nccl-tests:pytorch-23.10-nccl-2.19.3-1
              volumeMounts:
                - { mountPath: /topo, name: topo }
                - { mountPath: /host, name: root }
              securityContext:
                capabilities:
                  add: ["IPC_LOCK"]
              name: mpimaster
              ports:
                - containerPort: 22
                  name: mpijob-port
              workingDir: /workspace
              resources:
                requests:
                  cpu: 2
                  memory: 128Mi
                  ephemeral-storage: 16Gi
          restartPolicy: OnFailure
    - replicas: 2
      minAvailable: 2
      name: mpiworker
      template:
        metadata:
          annotations:
            # 16 secondary RDMA interfaces — one per requested sriov_rdma_vf.
            k8s.v1.cni.cncf.io/networks: oci-rdma-sriov,oci-rdma-sriov,oci-rdma-sriov,oci-rdma-sriov,oci-rdma-sriov,oci-rdma-sriov,oci-rdma-sriov,oci-rdma-sriov,oci-rdma-sriov,oci-rdma-sriov,oci-rdma-sriov,oci-rdma-sriov,oci-rdma-sriov,oci-rdma-sriov,oci-rdma-sriov,oci-rdma-sriov
        spec:
          containers:
            # Worker: only runs sshd in the foreground; mpirun on the master
            # starts the actual benchmark ranks over SSH.
            - name: mpiworker
              command:
                - /bin/bash
                - -c
                - mkdir -p /var/run/sshd; /usr/sbin/sshd -D;
              image: iad.ocir.io/hpc_limited_availability/nccl-tests:pytorch-23.10-nccl-2.19.3-1
              securityContext:
                capabilities:
                  add: ["IPC_LOCK"]
              ports:
                - containerPort: 22
                  name: mpijob-port
              workingDir: /workspace
              resources:
                requests:
                  nvidia.com/gpu: 8
                  nvidia.com/sriov_rdma_vf: 16
                  ephemeral-storage: 1Gi
                limits:
                  nvidia.com/gpu: 8
                  nvidia.com/sriov_rdma_vf: 16
                  ephemeral-storage: 1Gi
              volumeMounts:
                - { mountPath: /topo, name: topo }
                - mountPath: /dev/shm
                  name: shm
          restartPolicy: OnFailure
          terminationGracePeriodSeconds: 15
          tolerations:
            - key: nvidia.com/gpu
              operator: Exists
          volumes:
            - name: topo
              configMap:
                name: nccl-topology
                items:
                  - key: topo.xml
                    path: topo.xml
            - name: root
              hostPath:
                path: /
                type: Directory
            - name: shm
              emptyDir:
                medium: Memory
                sizeLimit: 8Gi

manifests/h100-nccl-test.yaml

Lines changed: 150 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,150 @@
---
# manifests/h100-nccl-test.yaml
# Volcano Job that runs the NCCL all_reduce_perf benchmark: one mpimaster
# launcher pod plus two mpiworker pods, each requesting 8 GPUs and
# 16 SR-IOV RDMA VFs (16 MPI ranks total, 8 per node). Targets H100 nodes
# per the filename — TODO confirm against the shape docs.
apiVersion: batch.volcano.sh/v1alpha1
kind: Job
metadata:
  name: nccl-allreduce-job0
spec:
  minAvailable: 1
  schedulerName: volcano
  plugins:
    ssh: []  # distribute SSH keys so mpirun can reach the workers
    svc: []  # generate /etc/volcano/*.host host lists
  queue: default
  tasks:
    - replicas: 1
      name: mpimaster
      policies:
        # End the whole Job once the launcher task finishes.
        - event: TaskCompleted
          action: CompleteJob
      template:
        spec:
          volumes:
            - name: topo
              configMap:
                name: nccl-topology
                items:
                  - key: topo.xml
                    path: topo.xml
            - name: root
              hostPath:
                path: /
                type: Directory
          initContainers:
            # Block until all mpiworker pods exist and report Ready before
            # the launcher starts mpirun.
            - command:
                - /bin/bash
                - -c
                - |
                  until [[ "$(kubectl get pod -l volcano.sh/job-name=nccl-allreduce-job0,volcano.sh/task-spec=mpiworker -o json | jq '.items | length')" != 0 ]]; do
                    echo "Waiting for MPI worker pods..."
                    sleep 3
                  done
                  echo "Waiting for MPI worker pods to be ready..."
                  kubectl wait pod -l volcano.sh/job-name=nccl-allreduce-job0,volcano.sh/task-spec=mpiworker --for=condition=Ready --timeout=600s && sleep 2
              image: aga.ocir.io/hpc_limited_availability/oke/kubectl:latest
              name: wait-for-workers
              serviceAccount: mpi-worker-view
              terminationGracePeriodSeconds: 2
          containers:
            # Launcher: start sshd, then run the benchmark.
            # NOTE(review): MPI_HOST is computed but never used (mpirun reads
            # -hostfile directly).
            - command:
                - /bin/bash
                - -c
                - |
                  MPI_HOST=$(cat /etc/volcano/mpiworker.host | tr "\n" ",")
                  mkdir -p /var/run/sshd; /usr/sbin/sshd
                  mpirun --allow-run-as-root \
                    -np 16 -npernode 8 --bind-to numa \
                    -hostfile /etc/volcano/mpiworker.host \
                    -x NCCL_CROSS_NIC=1 \
                    -x NCCL_SOCKET_NTHREADS=16 \
                    -x NCCL_DEBUG=WARN \
                    -x NCCL_CUMEM_ENABLE=0 \
                    -x NCCL_IB_SPLIT_DATA_ON_QPS=0 \
                    -x NCCL_IB_QPS_PER_CONNECTION=16 \
                    -x NCCL_IB_GID_INDEX=3 \
                    -x NCCL_IB_TC=41 \
                    -x NCCL_IB_SL=0 \
                    -x NCCL_IB_TIMEOUT=22 \
                    -x NCCL_NET_PLUGIN=none \
                    -x HCOLL_ENABLE_MCAST_ALL=0 \
                    -x coll_hcoll_enable=0 \
                    -x UCX_TLS=tcp \
                    -x UCX_NET_DEVICES=eth0 \
                    -x RX_QUEUE_LEN=8192 \
                    -x IB_RX_QUEUE_LEN=8192 \
                    -x NCCL_SOCKET_IFNAME=eth0 \
                    -x NCCL_IGNORE_CPU_AFFINITY=1 \
                    -x NCCL_TOPO_FILE=/topo/topo.xml \
                    -mca coll_hcoll_enable 0 -mca coll ^hcoll \
                    /workspace/nccl-tests/build/all_reduce_perf -b 8 -f 2 -g 1 -e 8G -c 1; sleep 3600
              image: iad.ocir.io/hpc_limited_availability/nccl-tests:pytorch-23.10-nccl-2.19.3-1
              volumeMounts:
                - { mountPath: /topo, name: topo }
                - { mountPath: /host, name: root }
              securityContext:
                capabilities:
                  add: ["IPC_LOCK"]
              name: mpimaster
              ports:
                - containerPort: 22
                  name: mpijob-port
              workingDir: /workspace
              resources:
                requests:
                  cpu: 2
                  memory: 128Mi
                  ephemeral-storage: 16Gi
          restartPolicy: OnFailure
    - replicas: 2
      minAvailable: 2
      name: mpiworker
      template:
        metadata:
          annotations:
            # 16 secondary RDMA interfaces — one per requested sriov_rdma_vf.
            k8s.v1.cni.cncf.io/networks: oci-rdma-sriov,oci-rdma-sriov,oci-rdma-sriov,oci-rdma-sriov,oci-rdma-sriov,oci-rdma-sriov,oci-rdma-sriov,oci-rdma-sriov,oci-rdma-sriov,oci-rdma-sriov,oci-rdma-sriov,oci-rdma-sriov,oci-rdma-sriov,oci-rdma-sriov,oci-rdma-sriov,oci-rdma-sriov
        spec:
          containers:
            # Worker: only runs sshd in the foreground; mpirun on the master
            # starts the actual benchmark ranks over SSH.
            - name: mpiworker
              command:
                - /bin/bash
                - -c
                - mkdir -p /var/run/sshd; /usr/sbin/sshd -D;
              image: iad.ocir.io/hpc_limited_availability/nccl-tests:pytorch-23.10-nccl-2.19.3-1
              securityContext:
                capabilities:
                  add: ["IPC_LOCK"]
              ports:
                - containerPort: 22
                  name: mpijob-port
              workingDir: /workspace
              resources:
                requests:
                  nvidia.com/gpu: 8
                  nvidia.com/sriov_rdma_vf: 16
                  ephemeral-storage: 1Gi
                limits:
                  nvidia.com/gpu: 8
                  nvidia.com/sriov_rdma_vf: 16
                  ephemeral-storage: 1Gi
              volumeMounts:
                - { mountPath: /topo, name: topo }
                - mountPath: /dev/shm
                  name: shm
          restartPolicy: OnFailure
          terminationGracePeriodSeconds: 15
          tolerations:
            - key: nvidia.com/gpu
              operator: Exists
          volumes:
            - name: topo
              configMap:
                name: nccl-topology
                items:
                  - key: topo.xml
                    path: topo.xml
            - name: root
              hostPath:
                path: /
                type: Directory
            - name: shm
              emptyDir:
                medium: Memory
                sizeLimit: 8Gi

manifests/ip-pool.yaml

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
---
# manifests/ip-pool.yaml
# NVIDIA nv-ipam IPPool: each node is assigned a block of 100 addresses
# carved out of 192.168.0.0/16, all using gateway 192.168.0.1.
apiVersion: nv-ipam.nvidia.com/v1alpha1
kind: IPPool
metadata:
  name: default
  namespace: network-operator
spec:
  subnet: 192.168.0.0/16
  perNodeBlockSize: 100
  gateway: 192.168.0.1

0 commit comments

Comments (0)