Skip to content

Commit 0509444

Browse files
author
Joanna Grycz
committed
feat: tpu_vm_create_startup_script
1 parent e0c0d8a commit 0509444

File tree

5 files changed

+206
-34
lines changed

5 files changed

+206
-34
lines changed

tpu/createStartupScriptVM.js

Lines changed: 96 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,96 @@
1+
/*
2+
* Copyright 2024 Google LLC
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* https://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
17+
'use strict';
18+
19+
async function main(nodeName, zone, tpuType, tpuSoftwareVersion) {
  // [START tpu_vm_create_startup_script]
  // Load the TPU library: the v2 client and the v2 message classes
  const tpu = require('@google-cloud/tpu');
  const {TpuClient} = tpu.v2;
  const {Node, NetworkConfig} = tpu.protos.google.cloud.tpu.v2;

  // Create the client used for every call below
  const tpuClient = new TpuClient();

  /**
   * TODO(developer): Update/uncomment these variables before running the sample.
   */
  // Project ID or project number of the Google Cloud project in which the node is created.
  const projectId = await tpuClient.getProjectId();

  // Name of the network the TPU node connects to. The network should be assigned to your project.
  const networkName = 'compute-tpu-network';

  // Region of the network the TPU node connects to.
  const region = 'europe-west4';

  // Name for your TPU.
  // nodeName = 'node-name-1';

  // Zone in which the TPU is created.
  // Supported TPU types per zone are listed at
  // https://cloud.google.com/tpu/docs/regions-zones
  // zone = 'europe-west4-a';

  // Accelerator type: the version and size of the Cloud TPU to create.
  // Supported accelerator types per TPU version are listed at
  // https://cloud.google.com/tpu/docs/system-architecture-tpu-vm#versions.
  // tpuType = 'v2-8';

  // Software version: the TPU runtime to install. For more information,
  // see https://cloud.google.com/tpu/docs/runtimes
  // tpuSoftwareVersion = 'tpu-vm-tf-2.17.0-pod-pjrt';

  async function callCreateTpuVMStartupScript() {
    // The script writes a greeting to a log file, then upgrades numpy and
    // appends the pip output to the same file.
    const startupScript = `#!/bin/bash
echo "Hello World" > /var/log/hello.log
sudo pip3 install --upgrade numpy >> /var/log/hello.log 2>&1`;

    // Network settings for the node
    const networkConfig = new NetworkConfig({
      enableExternalIps: true,
      network: `projects/${projectId}/global/networks/${networkName}`,
      subnetwork: `projects/${projectId}/regions/${region}/subnetworks/${networkName}`,
    });

    // Node definition, with the startup script passed as metadata
    const node = new Node({
      name: nodeName,
      zone,
      acceleratorType: tpuType,
      runtimeVersion: tpuSoftwareVersion,
      networkConfig,
      metadata: {
        'startup-script': startupScript,
      },
    });

    const [operation] = await tpuClient.createNode({
      parent: `projects/${projectId}/locations/${zone}`,
      node,
      nodeId: nodeName,
    });

    // Wait until the create operation has finished.
    const [createdNode] = await operation.promise();

    console.log(JSON.stringify(createdNode));
  }
  await callCreateTpuVMStartupScript();
  // [END tpu_vm_create_startup_script]
}
92+
93+
// Run the sample with the command-line arguments. On failure, print the error
// and set a non-zero exit code.
const cliArgs = process.argv.slice(2);
main(...cliArgs).catch(error => {
  console.error(error);
  process.exitCode = 1;
});

tpu/vmCreateTopology.js renamed to tpu/createTopologyVM.js

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,7 @@ async function main(nodeName, zone, tpuSoftwareVersion) {
5252

5353
// The version of the Cloud TPU you want to create.
5454
// Available options: TYPE_UNSPECIFIED = 0, V2 = 2, V3 = 4, V4 = 7
55-
const tpuVersion = AcceleratorConfig.Type.V2;
55+
const tpuVersion = AcceleratorConfig.Type.V3;
5656

5757
// The physical topology of your TPU slice.
5858
// For more information about topology for each TPU version,
@@ -87,7 +87,6 @@ async function main(nodeName, zone, tpuSoftwareVersion) {
8787
const [response] = await operation.promise();
8888

8989
console.log(JSON.stringify(response));
90-
console.log(`TPU VM: ${nodeName} created.`);
9190
}
9291
await callCreateTpuVMTopology();
9392
// [END tpu_vm_create_topology]
Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
/*
2+
* Copyright 2024 Google LLC
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* https://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
17+
'use strict';
18+
19+
const path = require('path');
20+
const assert = require('node:assert/strict');
21+
const {after, describe, it} = require('mocha');
22+
const cp = require('child_process');
23+
const {getStaleNodes, deleteNode} = require('./util');
24+
25+
// Runs a shell command synchronously and returns its output as a UTF-8 string.
// Options supplied by the caller (for example `cwd`) are forwarded to
// child_process.execSync and take precedence over the default encoding.
const execSync = (cmd, options = {}) =>
  cp.execSync(cmd, {encoding: 'utf-8', ...options});
26+
const cwd = path.join(__dirname, '..');
27+
28+
// Integration test for the startup-script sample: runs the sample as a child
// process and checks the metadata of the node it reports.
// The suite callback is synchronous because Mocha ignores values returned by
// `describe` callbacks.
describe('Compute tpu', () => {
  const nodePrefix = 'node-name-startup-script-2a2b3c';
  const nodeName = `${nodePrefix}${Math.floor(Math.random() * 1000 + 1)}`;
  const zone = 'us-central1-b';
  const tpuType = 'v3-8';
  const tpuSoftwareVersion = 'tpu-vm-tf-2.17.0-pod-pjrt';
  // Set once the sample process has finished successfully, so that cleanup
  // knows there is a node from this run to remove.
  let nodeCreated = false;

  after(async () => {
    // Clean-up resources: stale nodes with this prefix plus the node created
    // by this run. getStaleNodes is documented to return nodes older than one
    // hour, so the fresh node is added explicitly; keying by zone and name
    // prevents deleting the same node twice.
    const staleNodes = await getStaleNodes(nodePrefix);
    const targets = new Map(
      staleNodes.map(node => [`${node.zone}/${node.nodeName}`, node])
    );
    if (nodeCreated) {
      targets.set(`${zone}/${nodeName}`, {zone, nodeName});
    }
    await Promise.all(
      [...targets.values()].map(node => deleteNode(node.zone, node.nodeName))
    );
  });

  it('should create a new tpu with startup script', () => {
    const metadata = {
      'startup-script':
        '#!/bin/bash\n echo "Hello World" > /var/log/hello.log\n sudo pip3 install --upgrade numpy >> /var/log/hello.log 2>&1',
    };

    const output = execSync(
      `node ./createStartupScriptVM.js ${nodeName} ${zone} ${tpuType} ${tpuSoftwareVersion}`,
      {
        cwd,
      }
    );
    nodeCreated = true;

    const response = JSON.parse(output);

    assert.deepEqual(response.metadata, metadata);
  });
});

tpu/test/vmTopology.test.js renamed to tpu/test/createTopologyVM.test.js

Lines changed: 17 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@
1818

1919
const path = require('path');
2020
const assert = require('node:assert/strict');
21-
const {before, after, describe, it} = require('mocha');
21+
const {after, describe, it} = require('mocha');
2222
const cp = require('child_process');
2323
const {getStaleNodes, deleteNode} = require('./util');
2424

@@ -28,27 +28,27 @@ const cwd = path.join(__dirname, '..');
2828
// Integration test for the topology sample: runs the sample as a child process
// and checks the accelerator configuration of the node it reports.
// The suite callback is synchronous because Mocha ignores values returned by
// `describe` callbacks.
describe('Compute tpu with topology', () => {
  const nodePrefix = 'topology-node-name-2a2b3c';
  const nodeName = `${nodePrefix}${Math.floor(Math.random() * 1000 + 1)}`;
  const zone = 'us-central1-a';
  const tpuSoftwareVersion = 'tpu-vm-tf-2.17.0-pod-pjrt';
  // Set once the sample process has finished successfully, so that cleanup
  // knows there is a node from this run to remove.
  let nodeCreated = false;

  after(async () => {
    // Clean-up resources: stale nodes with this prefix plus the node created
    // by this run. The fresh node is added explicitly rather than relying on
    // the stale-node query to include it; keying by zone and name prevents
    // deleting the same node twice.
    const staleNodes = await getStaleNodes(nodePrefix);
    const targets = new Map(
      staleNodes.map(node => [`${node.zone}/${node.nodeName}`, node])
    );
    if (nodeCreated) {
      targets.set(`${zone}/${nodeName}`, {zone, nodeName});
    }
    await Promise.all(
      [...targets.values()].map(node => deleteNode(node.zone, node.nodeName))
    );
  });

  it('should create a new tpu with topology', () => {
    const acceleratorConfig = {type: 'V3', topology: '2x2'};

    const output = execSync(
      `node ./createTopologyVM.js ${nodeName} ${zone} ${tpuSoftwareVersion}`,
      {
        cwd,
      }
    );
    nodeCreated = true;

    const response = JSON.parse(output);

    assert.deepEqual(response.acceleratorConfig, acceleratorConfig);
  });
});

tpu/test/util.js

Lines changed: 34 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -18,30 +18,49 @@ const {TpuClient} = require('@google-cloud/tpu').v2;
1818

1919
const tpuClient = new TpuClient();
2020

21+
/**
 * Collect the distinct zone names found in the accelerator types listed for
 * the current project.
 *
 * @returns {Promise<string[]>} Zone names, in order of first appearance.
 */
async function getTpuZones() {
  const projectId = await tpuClient.getProjectId();

  // List zones for the project
  const [acceleratorTypes] = await tpuClient.listAcceleratorTypes({
    parent: `projects/${projectId}/locations/-`,
  });

  // The fourth '/'-separated segment of each resource name is taken as the
  // zone; the Set removes duplicates.
  const zoneNames = acceleratorTypes.map(({name}) => name.split('/')[3]);

  return [...new Set(zoneNames)];
}
33+
2134
/**
 * Get nodes created more than one hour ago whose name starts with `prefix`,
 * looking in every zone returned by getTpuZones().
 *
 * Nodes younger than one hour are not returned; callers that need to remove a
 * node they have just created must delete it explicitly.
 *
 * @param {string} prefix - Prefix the node name must start with.
 * @returns {Promise<Array<{zone: string, nodeName: string, timestamp: object}>>}
 *   One entry per matching node; `timestamp` is the node's `createTime`.
 */
async function getStaleNodes(prefix) {
  const projectId = await tpuClient.getProjectId();
  const result = [];
  const cutoff = new Date();
  cutoff.setHours(cutoff.getHours() - 1);

  const zones = await getTpuZones();

  for (const zone of zones) {
    const [list] = await tpuClient.listNodes({
      parent: `projects/${projectId}/locations/${zone}`,
    });

    for (const tpuObject of list) {
      const name = tpuObject.name.split('/').slice(-1)[0];
      // createTime is a protobuf Timestamp: whole seconds since the epoch plus
      // a nanosecond remainder. Both parts are needed; nanos alone would place
      // every node at the start of 1970.
      const {seconds, nanos} = tpuObject.createTime;
      const createdAt = new Date(
        Number(seconds) * 1000 + (nanos || 0) / 1000000
      );
      if (createdAt < cutoff && name.startsWith(prefix)) {
        result.push({
          zone,
          nodeName: name,
          timestamp: tpuObject.createTime,
        });
      }
    }
  }
  return result;
}
4766

@@ -52,7 +71,7 @@ async function deleteNode(zone, nodeName) {
5271
name: `projects/${projectId}/locations/${zone}/nodes/${nodeName}`,
5372
};
5473

55-
console.log('Deleting node: ', nodeName);
74+
console.log('Deleting node:', nodeName);
5675

5776
const [operation] = await tpuClient.deleteNode(request);
5877

0 commit comments

Comments
 (0)