|
| 1 | +// Module included in the following assemblies: |
| 2 | +// |
| 3 | +// * machine_management/creating-machinesets/creating-machineset-gcp.adoc |
| 4 | + |
| 5 | +:_content-type: PROCEDURE |
| 6 | +[id="nvidia-gpu-gcp-adding-a-gpu-node_{context}"] |
| 7 | += Adding a GPU node to an existing {product-title} cluster |
| 8 | + |
| 9 | +You can copy and modify a default compute machine set configuration to create a GPU-enabled machine set and machines for the GCP cloud provider. |
| 10 | + |
| 11 | +The following table lists the validated instance types: |
| 12 | + |
| 13 | +[cols="1,1,1,1"] |
| 14 | +|=== |
| 15 | +|Instance type |NVIDIA GPU accelerator |Maximum number of GPUs |Architecture |
| 16 | + |
| 17 | +|`a2-highgpu-1g` |
| 18 | +|A100 |
| 19 | +|1 |
| 20 | +|x86 |
| 21 | + |
| 22 | +|`n1-standard-4` |
| 23 | +|T4 |
| 24 | +|1 |
| 25 | +|x86 |
| 26 | +|=== |
| 27 | + |
| 28 | +.Procedure |
| 29 | + |
| 30 | +. Make a copy of an existing `MachineSet`. |
| 31 | + |
| 32 | +. In the new copy, change the machine set `name` in `metadata.name` and in both instances of `machine.openshift.io/cluster-api-machineset`. |
| 33 | + |
| 34 | +. Change the instance type by adding the following two lines to the newly copied `MachineSet`: |
| 35 | ++ |
| 36 | +---- |
| 37 | +machineType: a2-highgpu-1g |
| 38 | +onHostMaintenance: Terminate |
| 39 | +---- |
| 40 | ++ |
| 41 | +.Example `a2-highgpu-1g.json` file |
| 42 | ++ |
| 43 | +[source,json] |
| 44 | +---- |
| 45 | +{ |
| 46 | + "apiVersion": "machine.openshift.io/v1beta1", |
| 47 | + "kind": "MachineSet", |
| 48 | + "metadata": { |
| 49 | + "annotations": { |
| 50 | + "machine.openshift.io/GPU": "0", |
| 51 | + "machine.openshift.io/memoryMb": "16384", |
| 52 | + "machine.openshift.io/vCPU": "4" |
| 53 | + }, |
| 54 | + "creationTimestamp": "2023-01-13T17:11:02Z", |
| 55 | + "generation": 1, |
| 56 | + "labels": { |
| 57 | + "machine.openshift.io/cluster-api-cluster": "myclustername-2pt9p" |
| 58 | + }, |
| 59 | + "name": "myclustername-2pt9p-worker-gpu-a", |
| 60 | + "namespace": "openshift-machine-api", |
| 61 | + "resourceVersion": "20185", |
| 62 | + "uid": "2daf4712-733e-4399-b4b4-d43cb1ed32bd" |
| 63 | + }, |
| 64 | + "spec": { |
| 65 | + "replicas": 1, |
| 66 | + "selector": { |
| 67 | + "matchLabels": { |
| 68 | + "machine.openshift.io/cluster-api-cluster": "myclustername-2pt9p", |
| 69 | + "machine.openshift.io/cluster-api-machineset": "myclustername-2pt9p-worker-gpu-a" |
| 70 | + } |
| 71 | + }, |
| 72 | + "template": { |
| 73 | + "metadata": { |
| 74 | + "labels": { |
| 75 | + "machine.openshift.io/cluster-api-cluster": "myclustername-2pt9p", |
| 76 | + "machine.openshift.io/cluster-api-machine-role": "worker", |
| 77 | + "machine.openshift.io/cluster-api-machine-type": "worker", |
| 78 | + "machine.openshift.io/cluster-api-machineset": "myclustername-2pt9p-worker-gpu-a" |
| 79 | + } |
| 80 | + }, |
| 81 | + "spec": { |
| 82 | + "lifecycleHooks": {}, |
| 83 | + "metadata": {}, |
| 84 | + "providerSpec": { |
| 85 | + "value": { |
| 86 | + "apiVersion": "machine.openshift.io/v1beta1", |
| 87 | + "canIPForward": false, |
| 88 | + "credentialsSecret": { |
| 89 | + "name": "gcp-cloud-credentials" |
| 90 | + }, |
| 91 | + "deletionProtection": false, |
| 92 | + "disks": [ |
| 93 | + { |
| 94 | + "autoDelete": true, |
| 95 | + "boot": true, |
| 96 | + "image": "projects/rhcos-cloud/global/images/rhcos-412-86-202212081411-0-gcp-x86-64", |
| 97 | + "labels": null, |
| 98 | + "sizeGb": 128, |
| 99 | + "type": "pd-ssd" |
| 100 | + } |
| 101 | + ], |
| 102 | + "kind": "GCPMachineProviderSpec", |
| 103 | + "machineType": "a2-highgpu-1g", |
| 104 | + "onHostMaintenance": "Terminate", |
| 105 | + "metadata": { |
| 106 | + "creationTimestamp": null |
| 107 | + }, |
| 108 | + "networkInterfaces": [ |
| 109 | + { |
| 110 | + "network": "myclustername-2pt9p-network", |
| 111 | + "subnetwork": "myclustername-2pt9p-worker-subnet" |
| 112 | + } |
| 113 | + ], |
| 114 | + "preemptible": true, |
| 115 | + "projectID": "myteam", |
| 116 | + "region": "us-central1", |
| 117 | + "serviceAccounts": [ |
| 118 | + { |
| 119 | + |
| 120 | + "scopes": [ |
| 121 | + "https://www.googleapis.com/auth/cloud-platform" |
| 122 | + ] |
| 123 | + } |
| 124 | + ], |
| 125 | + "tags": [ |
| 126 | + "myclustername-2pt9p-worker" |
| 127 | + ], |
| 128 | + "userDataSecret": { |
| 129 | + "name": "worker-user-data" |
| 130 | + }, |
| 131 | + "zone": "us-central1-a" |
| 132 | + } |
| 133 | + } |
| 134 | + } |
| 135 | + } |
| 136 | + }, |
| 137 | + "status": { |
| 138 | + "availableReplicas": 1, |
| 139 | + "fullyLabeledReplicas": 1, |
| 140 | + "observedGeneration": 1, |
| 141 | + "readyReplicas": 1, |
| 142 | + "replicas": 1 |
| 143 | + } |
| 144 | +} |
| 145 | +---- |
| 146 | + |
| 147 | +. View the existing nodes by running the following command. Note that each node is an instance of a machine definition with a specific GCP region and {product-title} role. |
| 148 | ++ |
| 149 | +[source,terminal] |
| 150 | +---- |
| 151 | +$ oc get nodes |
| 152 | +---- |
| 153 | ++ |
| 154 | +.Example output |
| 155 | ++ |
| 156 | +[source,terminal] |
| 157 | +---- |
| 158 | +NAME STATUS ROLES AGE VERSION |
| 159 | +myclustername-2pt9p-master-0.c.openshift-qe.internal Ready control-plane,master 8h v1.25.4+77bec7a |
| 160 | +myclustername-2pt9p-master-1.c.openshift-qe.internal Ready control-plane,master 8h v1.25.4+77bec7a |
| 161 | +myclustername-2pt9p-master-2.c.openshift-qe.internal Ready control-plane,master 8h v1.25.4+77bec7a |
| 162 | +myclustername-2pt9p-worker-a-mxtnz.c.openshift-qe.internal Ready worker 8h v1.25.4+77bec7a |
| 163 | +myclustername-2pt9p-worker-b-9pzzn.c.openshift-qe.internal Ready worker 8h v1.25.4+77bec7a |
| 164 | +myclustername-2pt9p-worker-c-6pbg6.c.openshift-qe.internal Ready worker 8h v1.25.4+77bec7a |
| 165 | +myclustername-2pt9p-worker-gpu-a-wxcr6.c.openshift-qe.internal Ready worker 4h35m v1.25.4+77bec7a |
| 166 | +---- |
| 167 | + |
| 168 | +. View the machine sets that exist in the `openshift-machine-api` namespace by running the following command. Each compute machine set is associated with a different availability zone within the GCP region. The installer automatically load balances compute machines across availability zones. |
| 169 | ++ |
| 170 | +[source,terminal] |
| 171 | +---- |
| 172 | +$ oc get machinesets -n openshift-machine-api |
| 173 | +---- |
| 174 | ++ |
| 175 | +.Example output |
| 176 | ++ |
| 177 | +[source,terminal] |
| 178 | +---- |
| 179 | +NAME DESIRED CURRENT READY AVAILABLE AGE |
| 180 | +myclustername-2pt9p-worker-a 1 1 1 1 8h |
| 181 | +myclustername-2pt9p-worker-b 1 1 1 1 8h |
| 182 | +myclustername-2pt9p-worker-c 1 1 8h |
| 183 | +myclustername-2pt9p-worker-f 0 0 8h |
| 184 | +---- |
| 185 | + |
| 186 | +. View the machines that exist in the `openshift-machine-api` namespace by running the following command. You can only configure one compute machine per set, although you can scale a compute machine set to add a node in a particular region and zone. |
| 187 | ++ |
| 188 | +[source,terminal] |
| 189 | +---- |
| 190 | +$ oc get machines -n openshift-machine-api | grep worker |
| 191 | +---- |
| 192 | ++ |
| 193 | +.Example output |
| 194 | ++ |
| 195 | +[source,terminal] |
| 196 | +---- |
| 197 | +myclustername-2pt9p-worker-a-mxtnz Running n2-standard-4 us-central1 us-central1-a 8h |
| 198 | +myclustername-2pt9p-worker-b-9pzzn Running n2-standard-4 us-central1 us-central1-b 8h |
| 199 | +myclustername-2pt9p-worker-c-6pbg6 Running n2-standard-4 us-central1 us-central1-c 8h |
| 200 | +---- |
| 201 | + |
| 202 | +. Make a copy of one of the existing compute `MachineSet` definitions and output the result to a JSON file by running the following command. This will be the basis for the GPU-enabled compute machine set definition. |
| 203 | ++ |
| 204 | +[source,terminal] |
| 205 | +---- |
| 206 | +$ oc get machineset myclustername-2pt9p-worker-a -n openshift-machine-api -o json > <output_file.json> |
| 207 | +---- |
| 208 | + |
| 209 | +. Edit the JSON file to make the following changes to the new `MachineSet` definition: |
| 210 | ++ |
| 211 | +* Rename the machine set `name` by inserting the substring `gpu` in `metadata.name` and in both instances of `machine.openshift.io/cluster-api-machineset`. |
| 212 | +* Change the `machineType` of the new `MachineSet` definition to `a2-highgpu-1g`, which includes an NVIDIA A100 GPU. |
| 213 | ++ |
| 214 | +[source,terminal] |
| 215 | +---- |
| 216 | +$ jq .spec.template.spec.providerSpec.value.machineType ocp_4.12_machineset-a2-highgpu-1g.json |
| 217 | + |
| 218 | +"a2-highgpu-1g" |
| 219 | +---- |
| 220 | ++ |
| 221 | +The `<output_file.json>` file is saved as `ocp_4.12_machineset-a2-highgpu-1g.json`. |
| 222 | + |
| 223 | +. Update the following fields in `ocp_4.12_machineset-a2-highgpu-1g.json`: |
| 224 | ++ |
| 225 | +* Change `.metadata.name` to a name containing `gpu`. |
| 226 | + |
| 227 | +* Change `.spec.selector.matchLabels["machine.openshift.io/cluster-api-machineset"]` to |
| 228 | +match the new `.metadata.name`. |
| 229 | +
|
| 230 | +* Change `.spec.template.metadata.labels["machine.openshift.io/cluster-api-machineset"]` |
| 231 | +to match the new `.metadata.name`. |
| 232 | +
|
| 233 | +* Change `.spec.template.spec.providerSpec.value.machineType` to `a2-highgpu-1g`. |
| 234 | +
|
| 235 | +* Add the following line under `machineType`: `"onHostMaintenance": "Terminate"`. For example: |
| 236 | ++ |
| 237 | +[source,json] |
| 238 | +---- |
| 239 | +"machineType": "a2-highgpu-1g", |
| 240 | +"onHostMaintenance": "Terminate", |
| 241 | +---- |
| 242 | +
|
| 243 | +. To verify your changes, perform a `diff` of the original compute definition and the new GPU-enabled node definition by running the following command: |
| 244 | ++ |
| 245 | +[source,terminal] |
| 246 | +---- |
| 247 | +$ oc get machineset/myclustername-2pt9p-worker-a -n openshift-machine-api -o json | diff ocp_4.12_machineset-a2-highgpu-1g.json - |
| 248 | +---- |
| 249 | ++ |
| 250 | +.Example output |
| 251 | ++ |
| 252 | +[source,terminal] |
| 253 | +---- |
| 254 | +15c15 |
| 255 | +< "name": "myclustername-2pt9p-worker-gpu-a", |
| 256 | +--- |
| 257 | +> "name": "myclustername-2pt9p-worker-a", |
| 258 | +25c25 |
| 259 | +< "machine.openshift.io/cluster-api-machineset": "myclustername-2pt9p-worker-gpu-a" |
| 260 | +--- |
| 261 | +> "machine.openshift.io/cluster-api-machineset": "myclustername-2pt9p-worker-a" |
| 262 | +34c34 |
| 263 | +< "machine.openshift.io/cluster-api-machineset": "myclustername-2pt9p-worker-gpu-a" |
| 264 | +--- |
| 265 | +> "machine.openshift.io/cluster-api-machineset": "myclustername-2pt9p-worker-a" |
| 266 | +59,60c59 |
| 267 | +< "machineType": "a2-highgpu-1g", |
| 268 | +< "onHostMaintenance": "Terminate", |
| 269 | +--- |
| 270 | +> "machineType": "n2-standard-4", |
| 271 | +---- |
| 272 | + |
| 273 | +. Create the GPU-enabled compute machine set from the definition file by running the following command: |
| 274 | ++ |
| 275 | +[source,terminal] |
| 276 | +---- |
| 277 | +$ oc create -f ocp_4.12_machineset-a2-highgpu-1g.json |
| 278 | +---- |
| 279 | ++ |
| 280 | +.Example output |
| 281 | ++ |
| 282 | +[source,terminal] |
| 283 | +---- |
| 284 | +machineset.machine.openshift.io/myclustername-2pt9p-worker-gpu-a created |
| 285 | +---- |
| 286 | + |
| 287 | +.Verification |
| 288 | + |
| 289 | +. View the machine set you created by running the following command: |
| 290 | ++ |
| 291 | +[source,terminal] |
| 292 | +---- |
| 293 | +$ oc -n openshift-machine-api get machinesets | grep gpu |
| 294 | +---- |
| 295 | ++ |
| 296 | +The `MachineSet` replica count is set to `1`, so a new `Machine` object is created automatically. |
| 297 | + |
| 298 | ++ |
| 299 | +.Example output |
| 300 | ++ |
| 301 | +[source,terminal] |
| 302 | +---- |
| 303 | +myclustername-2pt9p-worker-gpu-a 1 1 1 1 5h24m |
| 304 | +---- |
| 305 | + |
| 306 | +. View the `Machine` object that the machine set created by running the following command: |
| 307 | ++ |
| 308 | +[source,terminal] |
| 309 | +---- |
| 310 | +$ oc -n openshift-machine-api get machines | grep gpu |
| 311 | +---- |
| 312 | ++ |
| 313 | +.Example output |
| 314 | ++ |
| 315 | +[source,terminal] |
| 316 | +---- |
| 317 | +myclustername-2pt9p-worker-gpu-a-wxcr6 Running a2-highgpu-1g us-central1 us-central1-a 5h25m |
| 318 | +---- |
| 319 | + |
| 320 | +[NOTE] |
| 321 | +==== |
| 322 | +You do not need to specify a namespace for the node because the node definition is cluster scoped. |
| 323 | +==== |
0 commit comments