Skip to content

Commit dff3234

Browse files
committed
use lustre in a3u a4h
1 parent 3db9949 commit dff3234

File tree

2 files changed

+27
-72
lines changed

2 files changed

+27
-72
lines changed

examples/machine-learning/a3-ultragpu-8g/a3ultra-slurm-blueprint.yaml

Lines changed: 11 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -28,9 +28,7 @@ vars:
2828
image_build_machine_type: n2-standard-16
2929
build_slurm_from_git_ref: 6.10.10
3030
# Cluster env settings
31-
# net0 and filestore ranges must not overlap
3231
net0_range: 192.168.0.0/19
33-
filestore_ip_range: 192.168.32.0/24
3432
net1_range: 192.168.64.0/18
3533
rdma_net_range: 192.168.128.0/18
3634
# Cluster Settings
@@ -126,7 +124,7 @@ deployment_groups:
126124
"install_cuda": false,
127125
"install_gcsfuse": true,
128126
"install_lustre": false,
129-
"install_managed_lustre": false,
127+
"install_managed_lustre": true,
130128
"install_nvidia_repo": true,
131129
"install_ompi": true,
132130
"allow_kernel_upgrades": false,
@@ -326,43 +324,24 @@ deployment_groups:
326324
ip_range: $(vars.rdma_net_range)
327325
region: $(vars.region)
328326

327+
- id: private_service_access
328+
source: modules/network/private-service-access
329+
use: [a3ultra-slurm-net-0]
330+
329331
- id: homefs
330-
source: modules/file-system/filestore
332+
source: modules/file-system/managed-lustre
331333
use:
332334
- a3ultra-slurm-net-0
335+
- private_service_access
333336
settings:
334-
filestore_tier: HIGH_SCALE_SSD
335-
size_gb: 10240
337+
size_gib: $(vars.lustre_size_gib)
338+
name: $(vars.lustre_instance_id)
336339
local_mount: /home
337-
reserved_ip_range: $(vars.filestore_ip_range)
338-
deletion_protection:
339-
enabled: true
340-
reason: Avoid data loss
340+
remote_mount: lustrefs
341+
per_unit_storage_throughput: $(vars.per_unit_storage_throughput)
341342
outputs:
342343
- network_storage
343344

344-
# To use Managed Lustre as for the shared /home directory:
345-
# 1. Comment out the filestore block above and the`filestore_ip_range` line in the vars block.
346-
# 2. Uncomment the managed-lustre and private-service-access blocks
347-
# 3. Change the value for "install_managed_lustre" in /var/tmp/slurm_vars.json above to true
348-
# - id: private_service_access
349-
# source: modules/network/private-service-access
350-
# use: [a3ultra-slurm-net-0]
351-
352-
# - id: homefs
353-
# source: modules/file-system/managed-lustre
354-
# use:
355-
# - a3ultra-slurm-net-0
356-
# - private_service_access
357-
# settings:
358-
# size_gib: $(vars.lustre_size_gib)
359-
# name: $(vars.lustre_instance_id)
360-
# local_mount: /home
361-
# remote_mount: lustrefs
362-
# per_unit_storage_throughput: $(vars.per_unit_storage_throughput)
363-
# outputs:
364-
# - network_storage
365-
366345
# The following four modules create and mount a Cloud Storage Bucket with
367346
# gcsfuse. They are optional but recommended for many use cases.
368347
# (Optional) The following creates a GCS bucket that will be mounted

examples/machine-learning/a4-highgpu-8g/a4high-slurm-blueprint.yaml

Lines changed: 16 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,6 @@ vars:
3030
# Cluster env settings
3131
# net0 and filestore ranges must not overlap
3232
net0_range: 192.168.0.0/19
33-
filestore_ip_range: 192.168.32.0/24
3433
net1_range: 192.168.64.0/18
3534
rdma_net_range: 192.168.128.0/18
3635
# Cluster Settings
@@ -49,21 +48,15 @@ vars:
4948
a4h_dws_flex_enabled: false
5049
a4h_enable_spot_vm: false
5150

52-
# To enable Managed-Lustre please uncomment this section and fill out the settings.
53-
# Additionally, please uncomment the private_service_access and managed-lustre modules.
54-
# Managed Lustre is only supported in specific regions and zones
55-
# Please refer https://cloud.google.com/managed-lustre/docs/locations
56-
5751
# Managed-Lustre instance name. This should be unique for each deployment.
58-
# lustre_instance_id: lustre-instance
52+
lustre_instance_id: lustre-instance
5953

6054
# The values of size_gib and per_unit_storage_throughput are correlated
6155
# Please refer to https://cloud.google.com/managed-lustre/docs/create-instance#performance-tiers
6256
# Storage capacity of the lustre instance in GiB
63-
# lustre_size_gib: 36000
64-
57+
lustre_size_gib: 36000
6558
# Maximum throughput of the lustre instance in MBps per TiB
66-
# per_unit_storage_throughput: 500
59+
per_unit_storage_throughput: 500
6760

6861
deployment_groups:
6962
- group: image-env
@@ -127,7 +120,7 @@ deployment_groups:
127120
"install_cuda": false,
128121
"install_gcsfuse": true,
129122
"install_lustre": false,
130-
"install_managed_lustre": false,
123+
"install_managed_lustre": true,
131124
"install_nvidia_repo": true,
132125
"install_ompi": true,
133126
"allow_kernel_upgrades": false,
@@ -329,43 +322,26 @@ deployment_groups:
329322
ip_range: $(vars.rdma_net_range)
330323
region: $(vars.region)
331324

325+
# To use Managed Lustre for the shared /home directory:
326+
# 1. Comment out the filestore block above and the `filestore_ip_range` line in the vars block.
327+
- id: private_service_access
328+
source: modules/network/private-service-access
329+
use: [a4high-slurm-net-0]
330+
332331
- id: homefs
333-
source: modules/file-system/filestore
332+
source: modules/file-system/managed-lustre
334333
use:
335334
- a4high-slurm-net-0
335+
- private_service_access
336336
settings:
337-
filestore_tier: HIGH_SCALE_SSD
338-
size_gb: 10240
337+
size_gib: $(vars.lustre_size_gib)
338+
name: $(vars.lustre_instance_id)
339339
local_mount: /home
340-
reserved_ip_range: $(vars.filestore_ip_range)
341-
deletion_protection:
342-
enabled: true
343-
reason: Avoid data loss
340+
remote_mount: lustrefs
341+
per_unit_storage_throughput: $(vars.per_unit_storage_throughput)
344342
outputs:
345343
- network_storage
346344

347-
# To use Managed Lustre as for the shared /home directory:
348-
# 1. Comment out the filestore block above and the`filestore_ip_range` line in the vars block.
349-
# 2. Uncomment the managed-lustre and private-service-access blocks
350-
# 3. Change the value for "install_managed_lustre" in /var/tmp/slurm_vars.json above to true
351-
# - id: private_service_access
352-
# source: modules/network/private-service-access
353-
# use: [a4high-slurm-net-0]
354-
355-
# - id: homefs
356-
# source: modules/file-system/managed-lustre
357-
# use:
358-
# - a4high-slurm-net-0
359-
# - private_service_access
360-
# settings:
361-
# size_gib: $(vars.lustre_size_gib)
362-
# name: $(vars.lustre_instance_id)
363-
# local_mount: /home
364-
# remote_mount: lustrefs
365-
# per_unit_storage_throughput: $(vars.per_unit_storage_throughput)
366-
# outputs:
367-
# - network_storage
368-
369345
# The following four modules create and mount a Cloud Storage Bucket with
370346
# gcsfuse. They are optional but recommended for many use cases.
371347
# (Optional) The following creates a GCS bucket that will be mounted

0 commit comments

Comments
 (0)