TELCODOCS-758-GCP: First draft

StephenJamesSmith · StephenJamesSmith · commit 1b239eb0ebf5 · 2023-01-18T14:24:46.000-05:00
diff --git a/machine_management/creating_machinesets/creating-machineset-gcp.adoc b/machine_management/creating_machinesets/creating-machineset-gcp.adoc
@@ -36,3 +36,9 @@ include::modules/machineset-enabling-customer-managed-encryption.adoc[leveloffse
 //Enabling GPU support for a compute machine set
 include::modules/machineset-gcp-enabling-gpu-support.adoc[leveloffset=+1]
 //TODO break out procedure as a L2
+
+//Adding a GPU node to a machine set (stesmith)
+include::modules/nvidia-gpu-gcp-adding-a-gpu-node.adoc[leveloffset=+1]
+
+//Deploying the Node Feature Discovery Operator (stesmith)
+include::modules/nvidia-gpu-aws-deploying-the-node-feature-discovery-operator.adoc[leveloffset=+1]
diff --git a/modules/nvidia-gpu-aws-deploying-the-node-feature-discovery-operator.adoc b/modules/nvidia-gpu-aws-deploying-the-node-feature-discovery-operator.adoc
@@ -1,6 +1,7 @@
 // Module included in the following assemblies:
 //
 //  * machine_management/creating_machinesets/creating-machineset-aws.adoc
+//  * machine_management/creating_machinesets/creating-machineset-gcp.adoc
 
 :_content-type: PROCEDURE
 [id="nvidia-gpu-aws-deploying-the-node-feature-discovery-operator_{context}"]
diff --git a/modules/nvidia-gpu-gcp-adding-a-gpu-node.adoc b/modules/nvidia-gpu-gcp-adding-a-gpu-node.adoc
@@ -0,0 +1,323 @@
+// Module included in the following assemblies:
+//
+//  * machine_management/creating-machinesets/creating-machineset-aws.adoc
+
+:_content-type: PROCEDURE
+[id="nvidia-gpu-gcp-adding-a-gpu-node_{context}"]
+= Adding a GPU node to an existing {product-title} cluster
+
+You can copy and modify a default compute machine set configuration to create a GPU-enabled machine set and machines for the GCP cloud provider.
+
+The following table lists the validated instance types:
+
+[cols="1,1,1,1"]
+|===
+|Instance type |NVIDIA GPU accelerator |Maximum number of GPUs |Architecture
+
+|`a2-highgpu-1g`
+|A100
+|1
+|x86
+
+|`n1-standard-4`
+|T4
+|1
+|x86
+|===
+
+.Procedure
+
+. Make a copy of an existing `MachineSet`.
+
+. In the new copy, change the machine set `name` in `metadata.name` and in both instances of `machine.openshift.io/cluster-api-machineset`.
+
+. Change the instance type to add the following two lines to the newly copied `MachineSet`:
++
+----
+machineType: a2-highgpu-1g
+onHostMaintenance: Terminate
+----
++
+.Example `a2-highgpu-1g.json` file
++
+[source,json]
+----
+{
+    "apiVersion": "machine.openshift.io/v1beta1",
+    "kind": "MachineSet",
+    "metadata": {
+        "annotations": {
+            "machine.openshift.io/GPU": "0",
+            "machine.openshift.io/memoryMb": "16384",
+            "machine.openshift.io/vCPU": "4"
+        },
+        "creationTimestamp": "2023-01-13T17:11:02Z",
+        "generation": 1,
+        "labels": {
+            "machine.openshift.io/cluster-api-cluster": "myclustername-2pt9p"
+        },
+        "name": "myclustername-2pt9p-worker-gpu-a",
+        "namespace": "openshift-machine-api",
+        "resourceVersion": "20185",
+        "uid": "2daf4712-733e-4399-b4b4-d43cb1ed32bd"
+    },
+    "spec": {
+        "replicas": 1,
+        "selector": {
+            "matchLabels": {
+                "machine.openshift.io/cluster-api-cluster": "myclustername-2pt9p",
+                "machine.openshift.io/cluster-api-machineset": "myclustername-2pt9p-worker-gpu-a"
+            }
+        },
+        "template": {
+            "metadata": {
+                "labels": {
+                    "machine.openshift.io/cluster-api-cluster": "myclustername-2pt9p",
+                    "machine.openshift.io/cluster-api-machine-role": "worker",
+                    "machine.openshift.io/cluster-api-machine-type": "worker",
+                    "machine.openshift.io/cluster-api-machineset": "myclustername-2pt9p-worker-gpu-a"
+                }
+            },
+            "spec": {
+                "lifecycleHooks": {},
+                "metadata": {},
+                "providerSpec": {
+                    "value": {
+                        "apiVersion": "machine.openshift.io/v1beta1",
+                        "canIPForward": false,
+                        "credentialsSecret": {
+                            "name": "gcp-cloud-credentials"
+                        },
+                        "deletionProtection": false,
+                        "disks": [
+                            {
+                                "autoDelete": true,
+                                "boot": true,
+                                "image": "projects/rhcos-cloud/global/images/rhcos-412-86-202212081411-0-gcp-x86-64",
+                                "labels": null,
+                                "sizeGb": 128,
+                                "type": "pd-ssd"
+                            }
+                        ],
+                        "kind": "GCPMachineProviderSpec",
+                        "machineType": "a2-highgpu-1g",
+                        "onHostMaintenance": "Terminate",
+                        "metadata": {
+                            "creationTimestamp": null
+                        },
+                        "networkInterfaces": [
+                            {
+                                "network": "myclustername-2pt9p-network",
+                                "subnetwork": "myclustername-2pt9p-worker-subnet"
+                            }
+                        ],
+                        "preemptible": true,
+                        "projectID": "myteam",
+                        "region": "us-central1",
+                        "serviceAccounts": [
+                            {
+                                "email": "myclustername-2pt9p-w@myteam.iam.gserviceaccount.com",
+                                "scopes": [
+                                    "https://www.googleapis.com/auth/cloud-platform"
+                                ]
+                            }
+                        ],
+                        "tags": [
+                            "myclustername-2pt9p-worker"
+                        ],
+                        "userDataSecret": {
+                            "name": "worker-user-data"
+                        },
+                        "zone": "us-central1-a"
+                    }
+                }
+            }
+        }
+    },
+    "status": {
+        "availableReplicas": 1,
+        "fullyLabeledReplicas": 1,
+        "observedGeneration": 1,
+        "readyReplicas": 1,
+        "replicas": 1
+    }
+}
+----
+
+. View the existing nodes, machines, and machine sets by running the following command. Note that each node is an instance of a machine definition with a specific GCP region and {product-title} role.
++
+[source,terminal]
+----
+$ oc get nodes
+----
++
+.Example output
++
+[source,terminal]
+----
+NAME                                                             STATUS     ROLES                  AGE     VERSION
+myclustername-2pt9p-master-0.c.openshift-qe.internal             Ready      control-plane,master   8h      v1.25.4+77bec7a
+myclustername-2pt9p-master-1.c.openshift-qe.internal             Ready      control-plane,master   8h      v1.25.4+77bec7a
+myclustername-2pt9p-master-2.c.openshift-qe.internal             Ready      control-plane,master   8h      v1.25.4+77bec7a
+myclustername-2pt9p-worker-a-mxtnz.c.openshift-qe.internal       Ready      worker                 8h      v1.25.4+77bec7a
+myclustername-2pt9p-worker-b-9pzzn.c.openshift-qe.internal       Ready      worker                 8h      v1.25.4+77bec7a
+myclustername-2pt9p-worker-c-6pbg6.c.openshift-qe.internal       Ready      worker                 8h      v1.25.4+77bec7a
+myclustername-2pt9p-worker-gpu-a-wxcr6.c.openshift-qe.internal   Ready      worker                 4h35m   v1.25.4+77bec7a
+----
+
+. View the machines and machine sets that exist in the `openshift-machine-api` namespace by running the following command. Each compute machine set is associated with a different availability zone within the GCP region. The installer automatically load balances compute machines across availability zones.
++
+[source,terminal]
+----
+$ oc get machinesets -n openshift-machine-api
+----
++
+.Example output
++
+[source,terminal]
+----
+NAME                               DESIRED   CURRENT   READY   AVAILABLE   AGE
+myclustername-2pt9p-worker-a       1         1         1       1           8h
+myclustername-2pt9p-worker-b       1         1         1       1           8h
+myclustername-2pt9p-worker-c       1         1                             8h
+myclustername-2pt9p-worker-f       0         0                             8h
+----
+
+. View the machines that exist in the `openshift-machine-api` namespace by running the following command. You can only configure one compute machine per set, although you can scale a compute machine set to add a node in a particular region and zone.
++
+[source,terminal]
+----
+$ oc get machines -n openshift-machine-api | grep worker
+----
++
+.Example output
++
+[source,terminal]
+----
+myclustername-2pt9p-worker-a-mxtnz       Running   n2-standard-4   us-central1   us-central1-a   8h
+myclustername-2pt9p-worker-b-9pzzn       Running   n2-standard-4   us-central1   us-central1-b   8h
+myclustername-2pt9p-worker-c-6pbg6       Running   n2-standard-4   us-central1   us-central1-c   8h
+----
+
+. Make a copy of one of the existing compute `MachineSet` definitions and output the result to a JSON file by running the following command. This will be the basis for the GPU-enabled compute machine set definition.
++
+[source,terminal]
+----
+$ oc get machineset myclustername-2pt9p-worker-a -n openshift-machine-api -o json  > <output_file.json>
+----
+
+. Edit the JSON file to make the following changes to the new `MachineSet` definition:
++
+* Rename the machine set `name` by inserting the substring `gpu` in `metadata.name` and in both instances of `machine.openshift.io/cluster-api-machineset`.
+* Change the `machineType` of the new `MachineSet` definition to `a2-highgpu-1g`, which includes an NVIDIA A100 GPU.
++
+[source,terminal]
+----
+jq .spec.template.spec.providerSpec.value.machineType ocp_4.12_machineset-a2-highgpu-1g.json
+
+"a2-highgpu-1g"
+----
++
+The `<output_file.json>` file is saved as `ocp_4.12_machineset-a2-highgpu-1g.json`.
+
+. Update the following fields in `ocp_4.12_machineset-a2-highgpu-1g.json`:
++
+* Change `.metadata.name` to a name containing `gpu`.
+
+* Change `.spec.selector.matchLabels["machine.openshift.io/cluster-api-machineset"]` to
+match the new `.metadata.name`.
+
+* Change `.spec.template.metadata.labels["machine.openshift.io/cluster-api-machineset"]`
+to match the new `.metadata.name`.
+
+* Change `.spec.template.spec.providerSpec.value.MachineType` to `a2-highgpu-1g`.
+
+* Add the following line under `machineType`: `"onHostMaintenance": "Terminate". For example:
++
+[source,json]
+----
+"machineType": "a2-highgpu-1g",
+"onHostMaintenance": "Terminate",
+----
+
+. To verify your changes, perform a `diff` of the original compute definition and the new GPU-enabled node definition by running the following command:
++
+[source,terminal]
+----
+$ oc get machineset/myclustername-2pt9p-worker-a -n openshift-machine-api -o json | diff ocp_4.12_machineset-a2-highgpu-1g.json -
+----
++
+.Example output
++
+[source,terminal]
+----
+15c15
+<         "name": "myclustername-2pt9p-worker-gpu-a",
+---
+>         "name": "myclustername-2pt9p-worker-a",
+25c25
+<                 "machine.openshift.io/cluster-api-machineset": "myclustername-2pt9p-worker-gpu-a"
+---
+>                 "machine.openshift.io/cluster-api-machineset": "myclustername-2pt9p-worker-a"
+34c34
+<                     "machine.openshift.io/cluster-api-machineset": "myclustername-2pt9p-worker-gpu-a"
+---
+>                     "machine.openshift.io/cluster-api-machineset": "myclustername-2pt9p-worker-a"
+59,60c59
+<                         "machineType": "a2-highgpu-1g",
+<                         "onHostMaintenance": "Terminate",
+---
+>                         "machineType": "n2-standard-4",
+----
+
+. Create the GPU-enabled compute machine set from the definition file by running the following command:
++
+[source,terminal]
+----
+$ oc create -f ocp_4.12_machineset-a2-highgpu-1g.json
+----
++
+.Example output
++
+[source,terminal]
+----
+machineset.machine.openshift.io/myclustername-2pt9p-worker-gpu-a created
+----
+
+.Verification
+
+. View the machine set you created by running the following command:
++
+[source,terminal]
+----
+$ oc -n openshift-machine-api get machinesets | grep gpu
+----
++
+The MachineSet replica count is set to `1` so a new `Machine` object is created automatically.
+
++
+.Example output
++
+[source,terminal]
+----
+myclustername-2pt9p-worker-gpu-a   1         1         1       1           5h24m
+----
+
+. View the `Machine` object that the machine set created by running the following command:
++
+[source,terminal]
+----
+$ oc -n openshift-machine-api get machines | grep gpu
+----
++
+.Example output
++
+[source,terminal]
+----
+myclustername-2pt9p-worker-gpu-a-wxcr6   Running   a2-highgpu-1g   us-central1   us-central1-a   5h25m
+----
+
+[NOTE]
+====
+Note that there is no need to specify a namespace for the node. The node definition is cluster scoped.
+====