|
| 1 | +// Module included in the following assemblies: |
| 2 | +// |
| 3 | +// * hardware_accelerators/rdma-remote-direct-memory-access.adoc |
| 4 | + |
| 5 | +:_mod-docs-content-type: PROCEDURE |
| 6 | +[id="rdma-creating-host-device-rdma-roce_{context}"] |
| 7 | + |
| 8 | += Creating a host device RDMA on RoCE |
| 9 | + |
| 10 | +Create the workload pods for a host device Remote Direct Memory Access (RDMA) for the NVIDIA Network Operator and test the pod configuration. |
| 11 | + |
| 12 | +.Prerequisites |
| 13 | + |
| 14 | +* Ensure that the NVIDIA Network Operator is running.
| 15 | + |
| 16 | +* Delete the `NicClusterPolicy` custom resource (CR), if it exists. |
| 17 | + |
| 18 | +.Procedure |
| 19 | + |
| 20 | +. Generate a new host device `NicClusterPolicy` CR, as shown in the following example:
| 21 | ++ |
| 22 | +[source,yaml] |
| 23 | +---- |
| 24 | +$ cat <<EOF > network-hostdev-nic-cluster-policy.yaml |
| 25 | +apiVersion: mellanox.com/v1alpha1 |
| 26 | +kind: NicClusterPolicy |
| 27 | +metadata: |
| 28 | + name: nic-cluster-policy |
| 29 | +spec: |
| 30 | + ofedDriver: |
| 31 | + image: doca-driver |
| 32 | + repository: nvcr.io/nvidia/mellanox |
| 33 | + version: 24.10-0.7.0.0-0 |
| 34 | + startupProbe: |
| 35 | + initialDelaySeconds: 10 |
| 36 | + periodSeconds: 20 |
| 37 | + livenessProbe: |
| 38 | + initialDelaySeconds: 30 |
| 39 | + periodSeconds: 30 |
| 40 | + readinessProbe: |
| 41 | + initialDelaySeconds: 10 |
| 42 | + periodSeconds: 30 |
| 43 | + env: |
| 44 | + - name: UNLOAD_STORAGE_MODULES |
| 45 | + value: "true" |
| 46 | + - name: RESTORE_DRIVER_ON_POD_TERMINATION |
| 47 | + value: "true" |
| 48 | + - name: CREATE_IFNAMES_UDEV |
| 49 | + value: "true" |
| 50 | + sriovDevicePlugin: |
| 51 | + image: sriov-network-device-plugin |
| 52 | + repository: ghcr.io/k8snetworkplumbingwg |
| 53 | + version: v3.7.0 |
| 54 | + config: | |
| 55 | + { |
| 56 | + "resourceList": [ |
| 57 | + { |
| 58 | + "resourcePrefix": "nvidia.com", |
| 59 | + "resourceName": "hostdev", |
| 60 | + "selectors": { |
| 61 | + "vendors": ["15b3"], |
| 62 | + "isRdma": true |
| 63 | + } |
| 64 | + } |
| 65 | + ] |
| 66 | + } |
| 67 | +EOF |
| 68 | +---- |
| 69 | + |
| 70 | +. Create the `NicClusterPolicy` CR on the cluster by using the following command: |
| 71 | ++ |
| 72 | +[source,terminal] |
| 73 | +---- |
| 74 | +$ oc create -f network-hostdev-nic-cluster-policy.yaml |
| 75 | +---- |
| 76 | ++ |
| 77 | +.Example output |
| 78 | ++ |
| 79 | +[source,terminal] |
| 80 | +---- |
| 81 | +nicclusterpolicy.mellanox.com/nic-cluster-policy created |
| 82 | +---- |
| 83 | + |
| 84 | +. Verify that the host device `NicClusterPolicy` CR is applied and that the DOCA/MOFED and device plugin pods are running by using the following command:
| 85 | ++ |
| 86 | +[source,terminal] |
| 87 | +---- |
| 88 | +$ oc get pods -n nvidia-network-operator |
| 89 | +---- |
| 90 | ++ |
| 91 | +.Example output |
| 92 | ++ |
| 93 | +[source,terminal] |
| 94 | +---- |
| 95 | +NAME READY STATUS RESTARTS AGE |
| 96 | +mofed-rhcos4.16-696886fcb4-ds-9sgvd 2/2 Running 0 2m37s |
| 97 | +mofed-rhcos4.16-696886fcb4-ds-lkjd4 2/2 Running 0 2m37s |
| 98 | +nvidia-network-operator-controller-manager-68d547dbbd-qsdkf 1/1 Running 0 141m |
| 99 | +sriov-device-plugin-6v2nz 1/1 Running 0 2m14s |
| 100 | +sriov-device-plugin-hc4t8 1/1 Running 0 2m14s |
| 101 | +---- |
| 102 | + |
| 103 | +. Confirm that the resources appear in the cluster `oc describe node` section by using the following command: |
| 104 | ++ |
| 105 | +[source,terminal] |
| 106 | +---- |
| 107 | +$ oc describe node -l node-role.kubernetes.io/worker= | grep -E 'Capacity:|Allocatable:' -A7
| 108 | +---- |
| 109 | ++ |
| 110 | +.Example output |
| 111 | ++ |
| 112 | +[source,terminal] |
| 113 | +---- |
| 114 | +Capacity: |
| 115 | + cpu: 128 |
| 116 | + ephemeral-storage: 1561525616Ki |
| 117 | + hugepages-1Gi: 0 |
| 118 | + hugepages-2Mi: 0 |
| 119 | + memory: 263596708Ki |
| 120 | + nvidia.com/hostdev: 2 |
| 121 | + pods: 250 |
| 122 | +Allocatable: |
| 123 | + cpu: 127500m |
| 124 | + ephemeral-storage: 1438028263499 |
| 125 | + hugepages-1Gi: 0 |
| 126 | + hugepages-2Mi: 0 |
| 127 | + memory: 262445732Ki |
| 128 | + nvidia.com/hostdev: 2 |
| 129 | + pods: 250 |
| 130 | +-- |
| 131 | +Capacity: |
| 132 | + cpu: 128 |
| 133 | + ephemeral-storage: 1561525616Ki |
| 134 | + hugepages-1Gi: 0 |
| 135 | + hugepages-2Mi: 0 |
| 136 | + memory: 263596704Ki |
| 137 | + nvidia.com/hostdev: 2 |
| 138 | + pods: 250 |
| 139 | +Allocatable: |
| 140 | + cpu: 127500m |
| 141 | + ephemeral-storage: 1438028263499 |
| 142 | + hugepages-1Gi: 0 |
| 143 | + hugepages-2Mi: 0 |
| 144 | + memory: 262445728Ki |
| 145 | + nvidia.com/hostdev: 2 |
| 146 | + pods: 250 |
| 147 | +---- |
| 148 | + |
| 149 | +. Create a `HostDeviceNetwork` CR file: |
| 150 | ++ |
| 151 | +[source,yaml] |
| 152 | +---- |
| 153 | +$ cat <<EOF > hostdev-network.yaml |
| 154 | +apiVersion: mellanox.com/v1alpha1 |
| 155 | +kind: HostDeviceNetwork |
| 156 | +metadata: |
| 157 | + name: hostdev-net |
| 158 | +spec: |
| 159 | + networkNamespace: "default" |
| 160 | + resourceName: "hostdev" |
| 161 | + ipam: | |
| 162 | + { |
| 163 | + "type": "whereabouts", |
| 164 | + "range": "192.168.3.225/28", |
| 165 | + "exclude": [ |
| 166 | + "192.168.3.229/30", |
| 167 | + "192.168.3.236/32" |
| 168 | + ] |
| 169 | + } |
| 170 | +EOF |
| 171 | +---- |
| 172 | + |
| 173 | +. Create the `HostDeviceNetwork` resource on the cluster by using the following command: |
| 174 | ++ |
| 175 | +[source,terminal] |
| 176 | +---- |
| 177 | +$ oc create -f hostdev-network.yaml |
| 178 | +---- |
| 179 | ++ |
| 180 | +.Example output |
| 181 | ++ |
| 182 | +[source,terminal] |
| 183 | +---- |
| 184 | +hostdevicenetwork.mellanox.com/hostdev-net created |
| 185 | +---- |
| 186 | + |
| 187 | +. Confirm that the resources appear in the cluster `oc describe node` section by using the following command: |
| 188 | ++ |
| 189 | +[source,terminal] |
| 190 | +---- |
| 191 | +$ oc describe node -l node-role.kubernetes.io/worker= | grep -E 'Capacity:|Allocatable:' -A8
| 192 | +---- |
| 193 | ++ |
| 194 | +.Example output |
| 195 | ++ |
| 196 | +[source,terminal] |
| 197 | +---- |
| 198 | +Capacity: |
| 199 | + cpu: 128 |
| 200 | + ephemeral-storage: 1561525616Ki |
| 201 | + hugepages-1Gi: 0 |
| 202 | + hugepages-2Mi: 0 |
| 203 | + memory: 263596708Ki |
| 204 | + nvidia.com/gpu: 2 |
| 205 | + nvidia.com/hostdev: 2 |
| 206 | + pods: 250 |
| 207 | +Allocatable: |
| 208 | + cpu: 127500m |
| 209 | + ephemeral-storage: 1438028263499 |
| 210 | + hugepages-1Gi: 0 |
| 211 | + hugepages-2Mi: 0 |
| 212 | + memory: 262445732Ki |
| 213 | + nvidia.com/gpu: 2 |
| 214 | + nvidia.com/hostdev: 2 |
| 215 | + pods: 250 |
| 216 | +-- |
| 217 | +Capacity: |
| 218 | + cpu: 128 |
| 219 | + ephemeral-storage: 1561525616Ki |
| 220 | + hugepages-1Gi: 0 |
| 221 | + hugepages-2Mi: 0 |
| 222 | + memory: 263596680Ki |
| 223 | + nvidia.com/gpu: 2 |
| 224 | + nvidia.com/hostdev: 2 |
| 225 | + pods: 250 |
| 226 | +Allocatable: |
| 227 | + cpu: 127500m |
| 228 | + ephemeral-storage: 1438028263499 |
| 229 | + hugepages-1Gi: 0 |
| 230 | + hugepages-2Mi: 0 |
| 231 | + memory: 262445704Ki |
| 232 | + nvidia.com/gpu: 2 |
| 233 | + nvidia.com/hostdev: 2 |
| 234 | + pods: 250 |
| 235 | +---- |
0 commit comments