
Commit d83d614

Merge pull request #96176 from StephenJamesSmith/TELCODOCS-2232
TELCODOCS-2232: Initial draft
2 parents 7c98e55 + e013bc4 commit d83d614

8 files changed: +1640 -1 lines changed

hardware_accelerators/rdma-remote-direct-memory-access.adoc

Lines changed: 15 additions & 1 deletion
@@ -6,7 +6,7 @@ include::_attributes/common-attributes.adoc[]

toc::[]

-NVIDIA GPUDirect Remote Direct Memory Access (RDMA) allows for the memory in one computer to directly access the memory of another computer without needing access through the operating system. This provides the ability to bypass kernel intervention in the process, freeing up resources and greatly reducing the CPU overhead normally needed to process network communications. This is useful for distributing GPU-accelerated workloads across clusters. And because RDMA is so suited toward high bandwidth and low latency applications, this makes it ideal for big data and machine learning applications.
+NVIDIA GPUDirect Remote Direct Memory Access (RDMA) allows an application in one computer to directly access the memory of another computer without going through the operating system. This bypasses kernel intervention, freeing up resources and greatly reducing the CPU overhead normally needed to process network communications. This is useful for distributing GPU-accelerated workloads across clusters, and because RDMA is well suited to high-bandwidth, low-latency applications, it is ideal for big data and machine learning applications.

There are currently three configuration methods for NVIDIA GPUDirect RDMA:

@@ -43,3 +43,17 @@ include::modules/rdma-configuring-the-nvidia-network-operator.adoc[leveloffset=+
include::modules/rdma-configuring-the-gpu-operator.adoc[leveloffset=+1]

include::modules/rdma-creating-the-machine-configuration.adoc[leveloffset=+1]

include::modules/rdma-creating-workload-pods.adoc[leveloffset=+1]

include::modules/rdma-creating-shared-device-rdma-roce.adoc[leveloffset=+2]

include::modules/rdma-creating-host-device-rdma-roce.adoc[leveloffset=+2]

include::modules/rdma-creating-sriov-legacy-mode-rdma-roce.adoc[leveloffset=+2]

include::modules/rdma-creating-shared-device-rdma-ib.adoc[leveloffset=+2]

include::modules/rdma-verifying-rdma-connectivity.adoc[leveloffset=+1]
modules/rdma-creating-host-device-rdma-roce.adoc

Lines changed: 235 additions & 0 deletions

@@ -0,0 +1,235 @@
// Module included in the following assemblies:
//
// * hardware_accelerators/rdma-remote-direct-memory-access.adoc

:_mod-docs-content-type: PROCEDURE
[id="rdma-creating-host-device-rdma-roce_{context}"]
= Creating a host device RDMA on RoCE

Create the workload pods for a host device Remote Direct Memory Access (RDMA) configuration for the NVIDIA Network Operator and test the pod configuration.

.Prerequisites

* Ensure that the NVIDIA Network Operator is running.

* Delete the `NicClusterPolicy` custom resource (CR), if it exists.

.Procedure

. Generate a new host device `NicClusterPolicy` CR, as shown below:
+
[source,yaml]
----
$ cat <<EOF > network-hostdev-nic-cluster-policy.yaml
apiVersion: mellanox.com/v1alpha1
kind: NicClusterPolicy
metadata:
  name: nic-cluster-policy
spec:
  ofedDriver:
    image: doca-driver
    repository: nvcr.io/nvidia/mellanox
    version: 24.10-0.7.0.0-0
    startupProbe:
      initialDelaySeconds: 10
      periodSeconds: 20
    livenessProbe:
      initialDelaySeconds: 30
      periodSeconds: 30
    readinessProbe:
      initialDelaySeconds: 10
      periodSeconds: 30
    env:
    - name: UNLOAD_STORAGE_MODULES
      value: "true"
    - name: RESTORE_DRIVER_ON_POD_TERMINATION
      value: "true"
    - name: CREATE_IFNAMES_UDEV
      value: "true"
  sriovDevicePlugin:
    image: sriov-network-device-plugin
    repository: ghcr.io/k8snetworkplumbingwg
    version: v3.7.0
    config: |
      {
        "resourceList": [
          {
            "resourcePrefix": "nvidia.com",
            "resourceName": "hostdev",
            "selectors": {
              "vendors": ["15b3"],
              "isRdma": true
            }
          }
        ]
      }
EOF
----
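+
The `sriovDevicePlugin` selector in this policy matches NICs by the Mellanox PCI vendor ID `15b3`. As an optional sanity check before applying the policy, you can list the matching devices on a worker node. This is a minimal sketch that assumes a hypothetical worker node name and that `lspci` is available on the host:
+
[source,terminal]
----
$ oc debug node/<worker-node-name> -- chroot /host lspci -nn | grep 15b3
----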
. Create the `NicClusterPolicy` CR on the cluster by using the following command:
+
[source,terminal]
----
$ oc create -f network-hostdev-nic-cluster-policy.yaml
----
+
.Example output
+
[source,terminal]
----
nicclusterpolicy.mellanox.com/nic-cluster-policy created
----
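+
Optionally, check the overall state that the Operator reports for the policy. This is a minimal sketch, assuming the `NicClusterPolicy` CR exposes a `status.state` field:
+
[source,terminal]
----
$ oc get nicclusterpolicy nic-cluster-policy -o jsonpath='{.status.state}'
----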
. Verify that the host device `NicClusterPolicy` CR is deployed and that the DOCA/MOFED container pods are running by using the following command:
+
[source,terminal]
----
$ oc get pods -n nvidia-network-operator
----
+
.Example output
+
[source,terminal]
----
NAME                                                          READY   STATUS    RESTARTS   AGE
mofed-rhcos4.16-696886fcb4-ds-9sgvd                           2/2     Running   0          2m37s
mofed-rhcos4.16-696886fcb4-ds-lkjd4                           2/2     Running   0          2m37s
nvidia-network-operator-controller-manager-68d547dbbd-qsdkf   1/1     Running   0          141m
sriov-device-plugin-6v2nz                                     1/1     Running   0          2m14s
sriov-device-plugin-hc4t8                                     1/1     Running   0          2m14s
----
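+
You can also confirm that the DOCA/MOFED driver loaded the Mellanox kernel modules on a worker node. This is a minimal sketch that assumes a hypothetical worker node name:
+
[source,terminal]
----
$ oc debug node/<worker-node-name> -- chroot /host lsmod | grep mlx5
----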
. Confirm that the `nvidia.com/hostdev` resource appears in the output of the `oc describe node` command by using the following command:
+
[source,terminal]
----
$ oc describe node -l node-role.kubernetes.io/worker= | grep -E 'Capacity:|Allocatable:' -A7
----
+
.Example output
+
[source,terminal]
----
Capacity:
  cpu:                 128
  ephemeral-storage:   1561525616Ki
  hugepages-1Gi:       0
  hugepages-2Mi:       0
  memory:              263596708Ki
  nvidia.com/hostdev:  2
  pods:                250
Allocatable:
  cpu:                 127500m
  ephemeral-storage:   1438028263499
  hugepages-1Gi:       0
  hugepages-2Mi:       0
  memory:              262445732Ki
  nvidia.com/hostdev:  2
  pods:                250
--
Capacity:
  cpu:                 128
  ephemeral-storage:   1561525616Ki
  hugepages-1Gi:       0
  hugepages-2Mi:       0
  memory:              263596704Ki
  nvidia.com/hostdev:  2
  pods:                250
Allocatable:
  cpu:                 127500m
  ephemeral-storage:   1438028263499
  hugepages-1Gi:       0
  hugepages-2Mi:       0
  memory:              262445728Ki
  nvidia.com/hostdev:  2
  pods:                250
----
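+
Alternatively, you can list the allocatable `nvidia.com/hostdev` count per worker node in one line. This is a minimal sketch using the standard `custom-columns` output format:
+
[source,terminal]
----
$ oc get nodes -l node-role.kubernetes.io/worker= -o custom-columns='NAME:.metadata.name,HOSTDEV:.status.allocatable.nvidia\.com/hostdev'
----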
. Create a `HostDeviceNetwork` CR file:
+
[source,yaml]
----
$ cat <<EOF > hostdev-network.yaml
apiVersion: mellanox.com/v1alpha1
kind: HostDeviceNetwork
metadata:
  name: hostdev-net
spec:
  networkNamespace: "default"
  resourceName: "hostdev"
  ipam: |
    {
      "type": "whereabouts",
      "range": "192.168.3.225/28",
      "exclude": [
        "192.168.3.229/30",
        "192.168.3.236/32"
      ]
    }
EOF
----
. Create the `HostDeviceNetwork` resource on the cluster by using the following command:
+
[source,terminal]
----
$ oc create -f hostdev-network.yaml
----
+
.Example output
+
[source,terminal]
----
hostdevicenetwork.mellanox.com/hostdev-net created
----
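+
Optionally, confirm that the network is available to pods in the target namespace. This is a minimal sketch, assuming that the Operator renders the `HostDeviceNetwork` as a `NetworkAttachmentDefinition` of the same name in the `default` namespace:
+
[source,terminal]
----
$ oc get network-attachment-definitions -n default
----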
. Confirm that the `nvidia.com/gpu` and `nvidia.com/hostdev` resources appear in the output of the `oc describe node` command by using the following command:
+
[source,terminal]
----
$ oc describe node -l node-role.kubernetes.io/worker= | grep -E 'Capacity:|Allocatable:' -A8
----
+
.Example output
+
[source,terminal]
----
Capacity:
  cpu:                 128
  ephemeral-storage:   1561525616Ki
  hugepages-1Gi:       0
  hugepages-2Mi:       0
  memory:              263596708Ki
  nvidia.com/gpu:      2
  nvidia.com/hostdev:  2
  pods:                250
Allocatable:
  cpu:                 127500m
  ephemeral-storage:   1438028263499
  hugepages-1Gi:       0
  hugepages-2Mi:       0
  memory:              262445732Ki
  nvidia.com/gpu:      2
  nvidia.com/hostdev:  2
  pods:                250
--
Capacity:
  cpu:                 128
  ephemeral-storage:   1561525616Ki
  hugepages-1Gi:       0
  hugepages-2Mi:       0
  memory:              263596680Ki
  nvidia.com/gpu:      2
  nvidia.com/hostdev:  2
  pods:                250
Allocatable:
  cpu:                 127500m
  ephemeral-storage:   1438028263499
  hugepages-1Gi:       0
  hugepages-2Mi:       0
  memory:              262445704Ki
  nvidia.com/gpu:      2
  nvidia.com/hostdev:  2
  pods:                250
----
modules/rdma-creating-shared-device-rdma-ib.adoc

Lines changed: 106 additions & 0 deletions

@@ -0,0 +1,106 @@
// Module included in the following assemblies:
//
// * hardware_accelerators/rdma-remote-direct-memory-access.adoc

:_mod-docs-content-type: PROCEDURE
[id="rdma-creating-shared-device-rdma-ib_{context}"]
= Creating a shared device RDMA on InfiniBand

Create the workload pods for a shared device Remote Direct Memory Access (RDMA) configuration on an InfiniBand installation.

.Procedure

. Generate custom pod resources:
+
[source,yaml]
----
$ cat <<EOF > rdma-ib-32-workload.yaml
apiVersion: v1
kind: Pod
metadata:
  name: rdma-ib-32-workload
  namespace: default
  annotations:
    k8s.v1.cni.cncf.io/networks: example-ipoibnetwork
spec:
  nodeSelector:
    kubernetes.io/hostname: nvd-srv-32.nvidia.eng.rdu2.dc.redhat.com
  containers:
  - image: quay.io/edge-infrastructure/nvidia-tools:0.1.5
    name: rdma-ib-32-workload
    resources:
      limits:
        nvidia.com/gpu: 1
        rdma/rdma_shared_device_ib: 1
      requests:
        nvidia.com/gpu: 1
        rdma/rdma_shared_device_ib: 1
EOF

$ cat <<EOF > rdma-ib-33-workload.yaml
apiVersion: v1
kind: Pod
metadata:
  name: rdma-ib-33-workload
  namespace: default
  annotations:
    k8s.v1.cni.cncf.io/networks: example-ipoibnetwork
spec:
  nodeSelector:
    kubernetes.io/hostname: nvd-srv-33.nvidia.eng.rdu2.dc.redhat.com
  containers:
  - image: quay.io/edge-infrastructure/nvidia-tools:0.1.5
    name: rdma-ib-33-workload
    securityContext:
      capabilities:
        add: [ "IPC_LOCK" ]
    resources:
      limits:
        nvidia.com/gpu: 1
        rdma/rdma_shared_device_ib: 1
      requests:
        nvidia.com/gpu: 1
        rdma/rdma_shared_device_ib: 1
EOF
----
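+
Both pods attach to a secondary network through the `k8s.v1.cni.cncf.io/networks: example-ipoibnetwork` annotation, so an `IPoIBNetwork` resource with that name must already exist in the cluster. If you still need to create one, the following is a minimal sketch only; the `master` interface name and IP range shown here are hypothetical and must match your environment:
+
[source,yaml]
----
apiVersion: mellanox.com/v1alpha1
kind: IPoIBNetwork
metadata:
  name: example-ipoibnetwork
spec:
  networkNamespace: "default"
  master: "ibs2f0"
  ipam: |
    {
      "type": "whereabouts",
      "range": "192.168.6.225/28"
    }
----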
. Create the pods on the cluster by using the following commands:
+
[source,terminal]
----
$ oc create -f rdma-ib-32-workload.yaml
----
+
.Example output
[source,terminal]
----
pod/rdma-ib-32-workload created
----
+
[source,terminal]
----
$ oc create -f rdma-ib-33-workload.yaml
----
+
.Example output
[source,terminal]
----
pod/rdma-ib-33-workload created
----

. Verify that the pods are running by using the following command:
+
[source,terminal]
----
$ oc get pods
----
+
.Example output
+
[source,terminal]
----
NAME                  READY   STATUS    RESTARTS   AGE
rdma-ib-32-workload   1/1     Running   0          10s
rdma-ib-33-workload   1/1     Running   0          3s
----
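+
As a quick check before running the full RDMA connectivity verification, you can confirm that the InfiniBand device is visible inside one of the pods. This is a minimal sketch, assuming the `nvidia-tools` image includes the standard `ibv_devinfo` utility:
+
[source,terminal]
----
$ oc exec rdma-ib-32-workload -- ibv_devinfo
----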
