Skip to content

Commit f46cbc0

Browse files
committed
Add managed lustre and add tags in tests
1 parent fd81af0 commit f46cbc0

File tree

11 files changed

+68
-67
lines changed

11 files changed

+68
-67
lines changed

examples/machine-learning/a3-highgpu-8g/a3high-slurm-blueprint.yaml

Lines changed: 21 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,6 @@ blueprint_name: a3high-slurm
1616

1717
vars:
1818
sys_net_range: 172.16.0.0/16
19-
filestore_ip_range: 192.168.0.0/29
2019
source_image_project_id: ubuntu-os-cloud
2120
source_image_family: ubuntu-2204-lts
2221
local_mount_homefs: /home
@@ -31,6 +30,14 @@ vars:
3130
a3_reservation_name: "" # supply reservation name
3231
a3_dws_flex_enabled: false
3332
a3_enable_spot_vm: false
33+
# Managed-Lustre instance name. This should be unique for each deployment.
34+
lustre_instance_id: lustre-instance
35+
# The values of size_gib and per_unit_storage_throughput are correlated.
36+
# Please refer to https://cloud.google.com/managed-lustre/docs/create-instance#performance-tiers
37+
# Storage capacity of the lustre instance in GiB
38+
lustre_size_gib: 36000
39+
# Maximum throughput of the lustre instance in MBps per TiB
40+
per_unit_storage_throughput: 500
3441

3542
deployment_groups:
3643
- group: base
@@ -58,15 +65,23 @@ deployment_groups:
5865
network_count: 4
5966
subnetwork_cidr_suffix: 20
6067

68+
- id: private_service_access
69+
source: modules/network/private-service-access
70+
use:
71+
- sysnet
72+
73+
# REPLACE: Filestore with Managed Lustre
6174
- id: homefs
62-
source: modules/file-system/filestore
75+
source: modules/file-system/managed-lustre
6376
use:
6477
- sysnet
78+
- private_service_access
6579
settings:
66-
filestore_tier: BASIC_SSD
67-
size_gb: 2560
68-
reserved_ip_range: $(vars.filestore_ip_range)
80+
size_gib: $(vars.lustre_size_gib)
81+
name: $(vars.lustre_instance_id)
6982
local_mount: $(vars.local_mount_homefs)
83+
remote_mount: lustrefs
84+
per_unit_storage_throughput: $(vars.per_unit_storage_throughput)
7085

7186
- group: build-script
7287
modules:
@@ -294,6 +309,7 @@ deployment_groups:
294309
"install_cuda": false,
295310
"install_gcsfuse": true,
296311
"install_lustre": false,
312+
"install_managed_lustre": true,
297313
"install_ompi": true,
298314
"monitoring_agent": "cloud-ops",
299315
"nvidia_version": "latest",

examples/machine-learning/a3-megagpu-8g/a3mega-slurm-blueprint.yaml

Lines changed: 19 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,16 @@ vars:
4141
a3mega_dws_flex_enabled: false
4242
a3mega_enable_spot_vm: false
4343

44+
# Managed-Lustre instance name. This should be unique for each deployment.
45+
lustre_instance_id: lustre-instance
46+
47+
# The values of size_gib and per_unit_storage_throughput are correlated.
48+
# Please refer to https://cloud.google.com/managed-lustre/docs/create-instance#performance-tiers
49+
# Storage capacity of the lustre instance in GiB
50+
lustre_size_gib: 36000
51+
# Maximum throughput of the lustre instance in MBps per TiB
52+
per_unit_storage_throughput: 500
53+
4454
deployment_groups:
4555
- group: primary
4656
modules:
@@ -135,7 +145,7 @@ deployment_groups:
135145
"install_cuda": false,
136146
"install_ompi": true,
137147
"install_lustre": false,
138-
"install_managed_lustre": false,
148+
"install_managed_lustre": true,
139149
"install_gcsfuse": true,
140150
"monitoring_agent": "cloud-ops",
141151
"use_open_drivers": true
@@ -408,19 +418,19 @@ deployment_groups:
408418
source: modules/network/private-service-access
409419
use:
410420
- sysnet
421+
422+
# 2. Add Managed Lustre as homefs
411423
- id: homefs
412-
source: modules/file-system/filestore
424+
source: modules/file-system/managed-lustre
413425
use:
414426
- sysnet
415427
- private_service_access
416428
settings:
417-
filestore_tier: HIGH_SCALE_SSD
418-
size_gb: 10240
419-
local_mount: /home
420-
mount_options: "defaults,hard"
421-
deletion_protection:
422-
enabled: true
423-
reason: Avoid data loss
429+
size_gib: $(vars.lustre_size_gib)
430+
name: $(vars.lustre_instance_id)
431+
local_mount: $(vars.local_mount_homefs)
432+
remote_mount: lustrefs
433+
per_unit_storage_throughput: $(vars.per_unit_storage_throughput)
424434
outputs:
425435
- network_storage
426436

examples/machine-learning/a4-highgpu-8g/a4high-slurm-blueprint.yaml

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,6 @@ vars:
2828
image_build_machine_type: n2-standard-16
2929
build_slurm_from_git_ref: 6.10.10
3030
# Cluster env settings
31-
# net0 and filestore ranges must not overlap
3231
net0_range: 192.168.0.0/19
3332
net1_range: 192.168.64.0/18
3433
rdma_net_range: 192.168.128.0/18
@@ -50,7 +49,6 @@ vars:
5049

5150
# Managed-Lustre instance name. This should be unique for each deployment.
5251
lustre_instance_id: lustre-instance
53-
5452
# The values of size_gib and per_unit_storage_throughput are correlated.
5553
# Please refer to https://cloud.google.com/managed-lustre/docs/create-instance#performance-tiers
5654
# Storage capacity of the lustre instance in GiB
@@ -322,8 +320,6 @@ deployment_groups:
322320
ip_range: $(vars.rdma_net_range)
323321
region: $(vars.region)
324322

325-
# To use Managed Lustre as for the shared /home directory:
326-
# 1. Comment out the filestore block above and the`filestore_ip_range` line in the vars block.
327323
- id: private_service_access
328324
source: modules/network/private-service-access
329325
use: [a4high-slurm-net-0]

examples/machine-learning/a4x-highgpu-4g/a4xhigh-slurm-blueprint.yaml

Lines changed: 14 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -32,9 +32,7 @@ vars:
3232
built_image_family: slurm-ubuntu2404-accelerator-arm64-64k
3333
build_slurm_from_git_ref: 6.10.10
3434
# Cluster env settings
35-
# net0 and filestore ranges must not overlap
3635
net0_range: 192.168.0.0/19
37-
filestore_ip_range: 192.168.32.0/24
3836
net1_range: 192.168.64.0/19
3937
rdma_net_range: 192.168.128.0/18
4038
# Cluster Settings
@@ -47,21 +45,14 @@ vars:
4745
a4x_reservation_name: "" # supply reservation name
4846
benchmark_dir: $(ghpc_stage("system_benchmarks"))
4947

50-
# To enable Managed-Lustre please uncomment this section and fill out the settings.
51-
# Additionally, please uncomment the private_service_access and managed-lustre modules.
52-
# Managed Lustre is only supported in specific regions and zones
53-
# Please refer https://cloud.google.com/managed-lustre/docs/locations
54-
5548
# Managed-Lustre instance name. This should be unique for each deployment.
56-
# lustre_instance_id: lustre-instance
57-
49+
lustre_instance_id: lustre-instance
5850
# The values of size_gib and per_unit_storage_throughput are correlated.
5951
# Please refer to https://cloud.google.com/managed-lustre/docs/create-instance#performance-tiers
6052
# Storage capacity of the lustre instance in GiB
61-
# lustre_size_gib: 36000
62-
53+
lustre_size_gib: 36000
6354
# Maximum throughput of the lustre instance in MBps per TiB
64-
# per_unit_storage_throughput: 500
55+
per_unit_storage_throughput: 500
6556

6657
deployment_groups:
6758
- group: image-env
@@ -151,7 +142,7 @@ deployment_groups:
151142
"install_cuda": false,
152143
"allow_kernel_upgrades": false,
153144
"monitoring_agent": "cloud-ops",
154-
install_managed_lustre: false,
145+
install_managed_lustre: true,
155146
}
156147
- type: shell
157148
destination: install_slurm.sh
@@ -333,43 +324,24 @@ deployment_groups:
333324
ip_range: $(vars.rdma_net_range)
334325
region: $(vars.region)
335326

327+
- id: private_service_access
328+
source: modules/network/private-service-access
329+
use: [a4x-slurm-net-0]
330+
336331
- id: homefs
337-
source: modules/file-system/filestore
332+
source: modules/file-system/managed-lustre
338333
use:
339334
- a4x-slurm-net-0
335+
- private_service_access
340336
settings:
341-
filestore_tier: HIGH_SCALE_SSD
342-
size_gb: 10240
337+
size_gib: $(vars.lustre_size_gib)
338+
name: $(vars.lustre_instance_id)
343339
local_mount: /home
344-
reserved_ip_range: $(vars.filestore_ip_range)
345-
deletion_protection:
346-
enabled: true
347-
reason: Avoid data loss
340+
remote_mount: lustrefs
341+
per_unit_storage_throughput: $(vars.per_unit_storage_throughput)
348342
outputs:
349343
- network_storage
350344

351-
# To use Managed Lustre as for the shared /home directory:
352-
# 1. Comment out the filestore block above and the`filestore_ip_range` line in the vars block.
353-
# 2. Uncomment the managed-lustre and private-service-access blocks
354-
# 3. Change the value for "install_managed_lustre" in /var/tmp/slurm_vars.json above to true
355-
# - id: private_service_access
356-
# source: modules/network/private-service-access
357-
# use: [a4x-slurm-net-0]
358-
359-
# - id: homefs
360-
# source: modules/file-system/managed-lustre
361-
# use:
362-
# - a4x-slurm-net-0
363-
# - private_service_access
364-
# settings:
365-
# size_gib: $(vars.lustre_size_gib)
366-
# name: $(vars.lustre_instance_id)
367-
# local_mount: /home
368-
# remote_mount: lustrefs
369-
# per_unit_storage_throughput: $(vars.per_unit_storage_throughput)
370-
# outputs:
371-
# - network_storage
372-
373345
# The following four modules create and mount a Cloud Storage Bucket with
374346
# gcsfuse. They are optional but recommended for many use cases.
375347
# (Optional) The following creates a GCS bucket that will be mounted

tools/cloud-build/daily-tests/builds/ml-a3-highgpu-onspot-slurm.yaml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,8 @@
1515
---
1616
tags:
1717
- slurm6
18-
- m.filestore
18+
- m.managed-lustre
19+
- m.private-service-access
1920
- m.multivpc
2021
- m.schedmd-slurm-gcp-v6-controller
2122
- m.schedmd-slurm-gcp-v6-login

tools/cloud-build/daily-tests/builds/ml-a3-highgpu-slurm.yaml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,8 @@
1515
---
1616
tags:
1717
- slurm6
18-
- m.filestore
18+
- m.managed-lustre
19+
- m.private-service-access
1920
- m.multivpc
2021
- m.schedmd-slurm-gcp-v6-controller
2122
- m.schedmd-slurm-gcp-v6-login

tools/cloud-build/daily-tests/builds/ml-a3-megagpu-onspot-slurm-ubuntu.yaml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,8 @@ tags:
1717
- m.custom-image
1818
- m.startup-script
1919
- slurm6
20-
- m.filestore
20+
- m.managed-lustre
21+
- m.private-service-access
2122
- m.schedmd-slurm-gcp-v6-controller
2223
- m.schedmd-slurm-gcp-v6-login
2324
- m.schedmd-slurm-gcp-v6-nodeset

tools/cloud-build/daily-tests/builds/ml-a3-megagpu-slurm-ubuntu.yaml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,8 @@ tags:
1717
- m.custom-image
1818
- m.startup-script
1919
- slurm6
20-
- m.filestore
20+
- m.managed-lustre
21+
- m.private-service-access
2122
- m.schedmd-slurm-gcp-v6-controller
2223
- m.schedmd-slurm-gcp-v6-login
2324
- m.schedmd-slurm-gcp-v6-nodeset

tools/cloud-build/daily-tests/builds/ml-a3-ultragpu-onspot-slurm.yaml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,8 @@ tags:
1717
- m.custom-image
1818
- m.cloud-storage-bucket
1919
- m.pre-existing-network-storage
20-
- m.filestore
20+
- m.managed-lustre
21+
- m.private-service-access
2122
- m.gpu-rdma-vpc
2223
- m.schedmd-slurm-gcp-v6-controller
2324
- m.schedmd-slurm-gcp-v6-login

tools/cloud-build/daily-tests/builds/ml-a3-ultragpu-slurm.yaml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,8 @@ tags:
1717
- m.custom-image
1818
- m.cloud-storage-bucket
1919
- m.pre-existing-network-storage
20-
- m.filestore
20+
- m.managed-lustre
21+
- m.private-service-access
2122
- m.gpu-rdma-vpc
2223
- m.schedmd-slurm-gcp-v6-controller
2324
- m.schedmd-slurm-gcp-v6-login

0 commit comments

Comments
 (0)