Skip to content

Commit 1b239eb

Browse files
TELCODOCS-758-GCP: First draft
1 parent 76cd9c8 commit 1b239eb

File tree

3 files changed

+330
-0
lines changed

3 files changed

+330
-0
lines changed

machine_management/creating_machinesets/creating-machineset-gcp.adoc

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,3 +36,9 @@ include::modules/machineset-enabling-customer-managed-encryption.adoc[leveloffse
3636
//Enabling GPU support for a compute machine set
3737
include::modules/machineset-gcp-enabling-gpu-support.adoc[leveloffset=+1]
3838
//TODO break out procedure as a L2
39+
40+
//Adding a GPU node to a machine set (stesmith)
41+
include::modules/nvidia-gpu-gcp-adding-a-gpu-node.adoc[leveloffset=+1]
42+
43+
//Deploying the Node Feature Discovery Operator (stesmith)
44+
include::modules/nvidia-gpu-aws-deploying-the-node-feature-discovery-operator.adoc[leveloffset=+1]

modules/nvidia-gpu-aws-deploying-the-node-feature-discovery-operator.adoc

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
// Module included in the following assemblies:
22
//
33
// * machine_management/creating_machinesets/creating-machineset-aws.adoc
4+
// * machine_management/creating_machinesets/creating-machineset-gcp.adoc
45

56
:_content-type: PROCEDURE
67
[id="nvidia-gpu-aws-deploying-the-node-feature-discovery-operator_{context}"]
Lines changed: 323 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,323 @@
1+
// Module included in the following assemblies:
2+
//
3+
// * machine_management/creating_machinesets/creating-machineset-gcp.adoc
4+
5+
:_content-type: PROCEDURE
6+
[id="nvidia-gpu-gcp-adding-a-gpu-node_{context}"]
7+
= Adding a GPU node to an existing {product-title} cluster
8+
9+
You can copy and modify a default compute machine set configuration to create a GPU-enabled machine set and machines for the GCP cloud provider.
10+
11+
The following table lists the validated instance types:
12+
13+
[cols="1,1,1,1"]
14+
|===
15+
|Instance type |NVIDIA GPU accelerator |Maximum number of GPUs |Architecture
16+
17+
|`a2-highgpu-1g`
18+
|A100
19+
|1
20+
|x86
21+
22+
|`n1-standard-4`
23+
|T4
24+
|1
25+
|x86
26+
|===
27+
28+
.Procedure
29+
30+
. Make a copy of an existing `MachineSet`.
31+
32+
. In the new copy, change the machine set `name` in `metadata.name` and in both instances of `machine.openshift.io/cluster-api-machineset`.
33+
34+
. Change the instance type by adding the following two lines to the newly copied `MachineSet`:
35+
+
36+
----
37+
machineType: a2-highgpu-1g
38+
onHostMaintenance: Terminate
39+
----
40+
+
41+
.Example `a2-highgpu-1g.json` file
42+
+
43+
[source,json]
44+
----
45+
{
46+
"apiVersion": "machine.openshift.io/v1beta1",
47+
"kind": "MachineSet",
48+
"metadata": {
49+
"annotations": {
50+
"machine.openshift.io/GPU": "0",
51+
"machine.openshift.io/memoryMb": "16384",
52+
"machine.openshift.io/vCPU": "4"
53+
},
54+
"creationTimestamp": "2023-01-13T17:11:02Z",
55+
"generation": 1,
56+
"labels": {
57+
"machine.openshift.io/cluster-api-cluster": "myclustername-2pt9p"
58+
},
59+
"name": "myclustername-2pt9p-worker-gpu-a",
60+
"namespace": "openshift-machine-api",
61+
"resourceVersion": "20185",
62+
"uid": "2daf4712-733e-4399-b4b4-d43cb1ed32bd"
63+
},
64+
"spec": {
65+
"replicas": 1,
66+
"selector": {
67+
"matchLabels": {
68+
"machine.openshift.io/cluster-api-cluster": "myclustername-2pt9p",
69+
"machine.openshift.io/cluster-api-machineset": "myclustername-2pt9p-worker-gpu-a"
70+
}
71+
},
72+
"template": {
73+
"metadata": {
74+
"labels": {
75+
"machine.openshift.io/cluster-api-cluster": "myclustername-2pt9p",
76+
"machine.openshift.io/cluster-api-machine-role": "worker",
77+
"machine.openshift.io/cluster-api-machine-type": "worker",
78+
"machine.openshift.io/cluster-api-machineset": "myclustername-2pt9p-worker-gpu-a"
79+
}
80+
},
81+
"spec": {
82+
"lifecycleHooks": {},
83+
"metadata": {},
84+
"providerSpec": {
85+
"value": {
86+
"apiVersion": "machine.openshift.io/v1beta1",
87+
"canIPForward": false,
88+
"credentialsSecret": {
89+
"name": "gcp-cloud-credentials"
90+
},
91+
"deletionProtection": false,
92+
"disks": [
93+
{
94+
"autoDelete": true,
95+
"boot": true,
96+
"image": "projects/rhcos-cloud/global/images/rhcos-412-86-202212081411-0-gcp-x86-64",
97+
"labels": null,
98+
"sizeGb": 128,
99+
"type": "pd-ssd"
100+
}
101+
],
102+
"kind": "GCPMachineProviderSpec",
103+
"machineType": "a2-highgpu-1g",
104+
"onHostMaintenance": "Terminate",
105+
"metadata": {
106+
"creationTimestamp": null
107+
},
108+
"networkInterfaces": [
109+
{
110+
"network": "myclustername-2pt9p-network",
111+
"subnetwork": "myclustername-2pt9p-worker-subnet"
112+
}
113+
],
114+
"preemptible": true,
115+
"projectID": "myteam",
116+
"region": "us-central1",
117+
"serviceAccounts": [
118+
{
119+
"email": "myclustername-2pt9p-w@myteam.iam.gserviceaccount.com",
120+
"scopes": [
121+
"https://www.googleapis.com/auth/cloud-platform"
122+
]
123+
}
124+
],
125+
"tags": [
126+
"myclustername-2pt9p-worker"
127+
],
128+
"userDataSecret": {
129+
"name": "worker-user-data"
130+
},
131+
"zone": "us-central1-a"
132+
}
133+
}
134+
}
135+
}
136+
},
137+
"status": {
138+
"availableReplicas": 1,
139+
"fullyLabeledReplicas": 1,
140+
"observedGeneration": 1,
141+
"readyReplicas": 1,
142+
"replicas": 1
143+
}
144+
}
145+
----
146+
147+
. View the existing nodes by running the following command. Note that each node is an instance of a machine definition with a specific GCP region and {product-title} role.
148+
+
149+
[source,terminal]
150+
----
151+
$ oc get nodes
152+
----
153+
+
154+
.Example output
155+
+
156+
[source,terminal]
157+
----
158+
NAME STATUS ROLES AGE VERSION
159+
myclustername-2pt9p-master-0.c.openshift-qe.internal Ready control-plane,master 8h v1.25.4+77bec7a
160+
myclustername-2pt9p-master-1.c.openshift-qe.internal Ready control-plane,master 8h v1.25.4+77bec7a
161+
myclustername-2pt9p-master-2.c.openshift-qe.internal Ready control-plane,master 8h v1.25.4+77bec7a
162+
myclustername-2pt9p-worker-a-mxtnz.c.openshift-qe.internal Ready worker 8h v1.25.4+77bec7a
163+
myclustername-2pt9p-worker-b-9pzzn.c.openshift-qe.internal Ready worker 8h v1.25.4+77bec7a
164+
myclustername-2pt9p-worker-c-6pbg6.c.openshift-qe.internal Ready worker 8h v1.25.4+77bec7a
165+
myclustername-2pt9p-worker-gpu-a-wxcr6.c.openshift-qe.internal Ready worker 4h35m v1.25.4+77bec7a
166+
----
167+
168+
. View the machines and machine sets that exist in the `openshift-machine-api` namespace by running the following command. Each compute machine set is associated with a different availability zone within the GCP region. The installer automatically load balances compute machines across availability zones.
169+
+
170+
[source,terminal]
171+
----
172+
$ oc get machinesets -n openshift-machine-api
173+
----
174+
+
175+
.Example output
176+
+
177+
[source,terminal]
178+
----
179+
NAME DESIRED CURRENT READY AVAILABLE AGE
180+
myclustername-2pt9p-worker-a 1 1 1 1 8h
181+
myclustername-2pt9p-worker-b 1 1 1 1 8h
182+
myclustername-2pt9p-worker-c 1 1 8h
183+
myclustername-2pt9p-worker-f 0 0 8h
184+
----
185+
186+
. View the machines that exist in the `openshift-machine-api` namespace by running the following command. You can configure only one compute machine per set, although you can scale a compute machine set to add a node in a particular region and zone.
187+
+
188+
[source,terminal]
189+
----
190+
$ oc get machines -n openshift-machine-api | grep worker
191+
----
192+
+
193+
.Example output
194+
+
195+
[source,terminal]
196+
----
197+
myclustername-2pt9p-worker-a-mxtnz Running n2-standard-4 us-central1 us-central1-a 8h
198+
myclustername-2pt9p-worker-b-9pzzn Running n2-standard-4 us-central1 us-central1-b 8h
199+
myclustername-2pt9p-worker-c-6pbg6 Running n2-standard-4 us-central1 us-central1-c 8h
200+
----
201+
202+
. Make a copy of one of the existing compute `MachineSet` definitions and output the result to a JSON file by running the following command. This will be the basis for the GPU-enabled compute machine set definition.
203+
+
204+
[source,terminal]
205+
----
206+
$ oc get machineset myclustername-2pt9p-worker-a -n openshift-machine-api -o json > <output_file.json>
207+
----
208+
209+
. Edit the JSON file to make the following changes to the new `MachineSet` definition:
210+
+
211+
* Rename the machine set `name` by inserting the substring `gpu` in `metadata.name` and in both instances of `machine.openshift.io/cluster-api-machineset`.
212+
* Change the `machineType` of the new `MachineSet` definition to `a2-highgpu-1g`, which includes an NVIDIA A100 GPU.
213+
+
214+
[source,terminal]
215+
----
216+
$ jq .spec.template.spec.providerSpec.value.machineType ocp_4.12_machineset-a2-highgpu-1g.json
217+
218+
"a2-highgpu-1g"
219+
----
220+
+
221+
The `<output_file.json>` file is saved as `ocp_4.12_machineset-a2-highgpu-1g.json`.
222+
223+
. Update the following fields in `ocp_4.12_machineset-a2-highgpu-1g.json`:
224+
+
225+
* Change `.metadata.name` to a name containing `gpu`.
226+
227+
* Change `.spec.selector.matchLabels["machine.openshift.io/cluster-api-machineset"]` to
228+
match the new `.metadata.name`.
229+
230+
* Change `.spec.template.metadata.labels["machine.openshift.io/cluster-api-machineset"]`
231+
to match the new `.metadata.name`.
232+
233+
* Change `.spec.template.spec.providerSpec.value.machineType` to `a2-highgpu-1g`.
234+
235+
* Add the following line under `machineType`: `"onHostMaintenance": "Terminate"`. For example:
236+
+
237+
[source,json]
238+
----
239+
"machineType": "a2-highgpu-1g",
240+
"onHostMaintenance": "Terminate",
241+
----
242+
243+
. To verify your changes, perform a `diff` of the original compute definition and the new GPU-enabled node definition by running the following command:
244+
+
245+
[source,terminal]
246+
----
247+
$ oc get machineset/myclustername-2pt9p-worker-a -n openshift-machine-api -o json | diff ocp_4.12_machineset-a2-highgpu-1g.json -
248+
----
249+
+
250+
.Example output
251+
+
252+
[source,terminal]
253+
----
254+
15c15
255+
< "name": "myclustername-2pt9p-worker-gpu-a",
256+
---
257+
> "name": "myclustername-2pt9p-worker-a",
258+
25c25
259+
< "machine.openshift.io/cluster-api-machineset": "myclustername-2pt9p-worker-gpu-a"
260+
---
261+
> "machine.openshift.io/cluster-api-machineset": "myclustername-2pt9p-worker-a"
262+
34c34
263+
< "machine.openshift.io/cluster-api-machineset": "myclustername-2pt9p-worker-gpu-a"
264+
---
265+
> "machine.openshift.io/cluster-api-machineset": "myclustername-2pt9p-worker-a"
266+
59,60c59
267+
< "machineType": "a2-highgpu-1g",
268+
< "onHostMaintenance": "Terminate",
269+
---
270+
> "machineType": "n2-standard-4",
271+
----
272+
273+
. Create the GPU-enabled compute machine set from the definition file by running the following command:
274+
+
275+
[source,terminal]
276+
----
277+
$ oc create -f ocp_4.12_machineset-a2-highgpu-1g.json
278+
----
279+
+
280+
.Example output
281+
+
282+
[source,terminal]
283+
----
284+
machineset.machine.openshift.io/myclustername-2pt9p-worker-gpu-a created
285+
----
286+
287+
.Verification
288+
289+
. View the machine set you created by running the following command:
290+
+
291+
[source,terminal]
292+
----
293+
$ oc -n openshift-machine-api get machinesets | grep gpu
294+
----
295+
+
296+
The `MachineSet` replica count is set to `1`, so a new `Machine` object is created automatically.
297+
298+
+
299+
.Example output
300+
+
301+
[source,terminal]
302+
----
303+
myclustername-2pt9p-worker-gpu-a 1 1 1 1 5h24m
304+
----
305+
306+
. View the `Machine` object that the machine set created by running the following command:
307+
+
308+
[source,terminal]
309+
----
310+
$ oc -n openshift-machine-api get machines | grep gpu
311+
----
312+
+
313+
.Example output
314+
+
315+
[source,terminal]
316+
----
317+
myclustername-2pt9p-worker-gpu-a-wxcr6 Running a2-highgpu-1g us-central1 us-central1-a 5h25m
318+
----
319+
320+
[NOTE]
321+
====
322+
You do not need to specify a namespace for the node. The node definition is cluster scoped.
323+
====

0 commit comments

Comments
 (0)