diff --git a/examples/machine-learning/a3-highgpu-8g/a3high-slurm-blueprint.yaml b/examples/machine-learning/a3-highgpu-8g/a3high-slurm-blueprint.yaml index 45f3b494cb..eb2d833351 100644 --- a/examples/machine-learning/a3-highgpu-8g/a3high-slurm-blueprint.yaml +++ b/examples/machine-learning/a3-highgpu-8g/a3high-slurm-blueprint.yaml @@ -16,7 +16,6 @@ blueprint_name: a3high-slurm vars: sys_net_range: 172.16.0.0/16 - filestore_ip_range: 192.168.0.0/29 source_image_project_id: ubuntu-os-cloud source_image_family: ubuntu-2204-lts local_mount_homefs: /home @@ -31,6 +30,14 @@ vars: a3_reservation_name: "" # supply reservation name a3_dws_flex_enabled: false a3_enable_spot_vm: false + # Managed-Lustre instance name. This should be unique for each deployment. + lustre_instance_id: $(vars.deployment_name)-lustre + # The values of size_gib and per_unit_storage_throughput are correlated + # Please refer to https://cloud.google.com/managed-lustre/docs/create-instance#performance-tiers + # Storage capacity of the lustre instance in GiB + lustre_size_gib: 36000 + # Maximum throughput of the lustre instance in MBps per TiB + per_unit_storage_throughput: 500 deployment_groups: - group: base @@ -58,15 +65,22 @@ deployment_groups: network_count: 4 subnetwork_cidr_suffix: 20 + - id: private_service_access + source: modules/network/private-service-access + use: + - sysnet + - id: homefs - source: modules/file-system/filestore + source: modules/file-system/managed-lustre use: - sysnet + - private_service_access settings: - filestore_tier: BASIC_SSD - size_gb: 2560 - reserved_ip_range: $(vars.filestore_ip_range) + size_gib: $(vars.lustre_size_gib) + name: $(vars.lustre_instance_id) local_mount: $(vars.local_mount_homefs) + remote_mount: lustrefs + per_unit_storage_throughput: $(vars.per_unit_storage_throughput) - group: build-script modules: @@ -294,6 +308,7 @@ deployment_groups: "install_cuda": false, "install_gcsfuse": true, "install_lustre": false, + "install_managed_lustre": true, 
"install_ompi": true, "monitoring_agent": "cloud-ops", "nvidia_version": "latest", diff --git a/examples/machine-learning/a3-megagpu-8g/a3mega-slurm-blueprint.yaml b/examples/machine-learning/a3-megagpu-8g/a3mega-slurm-blueprint.yaml index a1e3601a6d..1b110fac05 100644 --- a/examples/machine-learning/a3-megagpu-8g/a3mega-slurm-blueprint.yaml +++ b/examples/machine-learning/a3-megagpu-8g/a3mega-slurm-blueprint.yaml @@ -41,6 +41,16 @@ vars: a3mega_dws_flex_enabled: false a3mega_enable_spot_vm: false + # Managed-Lustre instance name. This should be unique for each deployment. + lustre_instance_id: $(vars.deployment_name)-lustre + + # The values of size_gib and per_unit_storage_throughput are correlated + # Please refer to https://cloud.google.com/managed-lustre/docs/create-instance#performance-tiers + # Storage capacity of the lustre instance in GiB + lustre_size_gib: 36000 + # Maximum throughput of the lustre instance in MBps per TiB + per_unit_storage_throughput: 500 + deployment_groups: - group: primary modules: @@ -135,7 +145,7 @@ deployment_groups: "install_cuda": false, "install_ompi": true, "install_lustre": false, - "install_managed_lustre": false, + "install_managed_lustre": true, "install_gcsfuse": true, "monitoring_agent": "cloud-ops", "use_open_drivers": true @@ -408,19 +418,18 @@ deployment_groups: source: modules/network/private-service-access use: - sysnet + - id: homefs - source: modules/file-system/filestore + source: modules/file-system/managed-lustre use: - sysnet - private_service_access settings: - filestore_tier: HIGH_SCALE_SSD - size_gb: 10240 - local_mount: /home - mount_options: "defaults,hard" - deletion_protection: - enabled: true - reason: Avoid data loss + size_gib: $(vars.lustre_size_gib) + name: $(vars.lustre_instance_id) + local_mount: $(vars.local_mount_homefs) + remote_mount: lustrefs + per_unit_storage_throughput: $(vars.per_unit_storage_throughput) outputs: - network_storage diff --git 
a/examples/machine-learning/a3-ultragpu-8g/a3ultra-slurm-blueprint.yaml b/examples/machine-learning/a3-ultragpu-8g/a3ultra-slurm-blueprint.yaml index 6a6b1b06f1..145d8b5014 100644 --- a/examples/machine-learning/a3-ultragpu-8g/a3ultra-slurm-blueprint.yaml +++ b/examples/machine-learning/a3-ultragpu-8g/a3ultra-slurm-blueprint.yaml @@ -28,9 +28,7 @@ vars: image_build_machine_type: n2-standard-16 build_slurm_from_git_ref: 6.12.1 # Cluster env settings - # net0 and filestore ranges must not overlap net0_range: 192.168.0.0/19 - filestore_ip_range: 192.168.32.0/24 net1_range: 192.168.64.0/18 rdma_net_range: 192.168.128.0/18 # Cluster Settings @@ -48,21 +46,18 @@ vars: a3u_dws_flex_enabled: false a3u_enable_spot_vm: false - # To enable Managed-Lustre please uncomment this section and fill out the settings. - # Additionally, please uncomment the private_service_access and managed-lustre modules. # Managed Lustre is only supported in specific regions and zones # Please refer https://cloud.google.com/managed-lustre/docs/locations - # Managed-Lustre instance name. This should be unique for each deployment. 
- # lustre_instance_id: lustre-instance + lustre_instance_id: $(vars.deployment_name)-lustre # The values of size_gib and per_unit_storage_throughput are co-related # Please refer https://cloud.google.com/managed-lustre/docs/create-instance#performance-tiers # Storage capacity of the lustre instance in GiB - # lustre_size_gib: 36000 + lustre_size_gib: 36000 # Maximum throughput of the lustre instance in MBps per TiB - # per_unit_storage_throughput: 500 + per_unit_storage_throughput: 500 deployment_groups: - group: image-env @@ -126,7 +121,7 @@ deployment_groups: "install_cuda": false, "install_gcsfuse": true, "install_lustre": false, - "install_managed_lustre": false, + "install_managed_lustre": true, "install_nvidia_repo": true, "install_ompi": true, "allow_kernel_upgrades": false, @@ -326,43 +321,24 @@ deployment_groups: ip_range: $(vars.rdma_net_range) region: $(vars.region) + - id: private_service_access + source: modules/network/private-service-access + use: [a3ultra-slurm-net-0] + - id: homefs - source: modules/file-system/filestore + source: modules/file-system/managed-lustre use: - a3ultra-slurm-net-0 + - private_service_access settings: - filestore_tier: HIGH_SCALE_SSD - size_gb: 10240 + size_gib: $(vars.lustre_size_gib) + name: $(vars.lustre_instance_id) local_mount: /home - reserved_ip_range: $(vars.filestore_ip_range) - deletion_protection: - enabled: true - reason: Avoid data loss + remote_mount: lustrefs + per_unit_storage_throughput: $(vars.per_unit_storage_throughput) outputs: - network_storage - # To use Managed Lustre as for the shared /home directory: - # 1. Comment out the filestore block above and the`filestore_ip_range` line in the vars block. - # 2. Uncomment the managed-lustre and private-service-access blocks - # 3. 
Change the value for "install_managed_lustre" in /var/tmp/slurm_vars.json above to true - # - id: private_service_access - # source: modules/network/private-service-access - # use: [a3ultra-slurm-net-0] - - # - id: homefs - # source: modules/file-system/managed-lustre - # use: - # - a3ultra-slurm-net-0 - # - private_service_access - # settings: - # size_gib: $(vars.lustre_size_gib) - # name: $(vars.lustre_instance_id) - # local_mount: /home - # remote_mount: lustrefs - # per_unit_storage_throughput: $(vars.per_unit_storage_throughput) - # outputs: - # - network_storage - # The following four modules create and mount a Cloud Storage Bucket with # gcsfuse. They are optional but recommended for many use cases. # (Optional) The following creates a GCS bucket that will be mounted diff --git a/examples/machine-learning/a4-highgpu-8g/a4high-slurm-blueprint.yaml b/examples/machine-learning/a4-highgpu-8g/a4high-slurm-blueprint.yaml index f46f4b310f..3b44c6bfd3 100644 --- a/examples/machine-learning/a4-highgpu-8g/a4high-slurm-blueprint.yaml +++ b/examples/machine-learning/a4-highgpu-8g/a4high-slurm-blueprint.yaml @@ -28,9 +28,7 @@ vars: image_build_machine_type: n2-standard-16 build_slurm_from_git_ref: 6.12.1 # Cluster env settings - # net0 and filestore ranges must not overlap net0_range: 192.168.0.0/19 - filestore_ip_range: 192.168.32.0/24 net1_range: 192.168.64.0/18 rdma_net_range: 192.168.128.0/18 # Cluster Settings @@ -49,21 +47,14 @@ vars: a4h_dws_flex_enabled: false a4h_enable_spot_vm: false - # To enable Managed-Lustre please uncomment this section and fill out the settings. - # Additionally, please uncomment the private_service_access and managed-lustre modules. - # Managed Lustre is only supported in specific regions and zones - # Please refer https://cloud.google.com/managed-lustre/docs/locations - # Managed-Lustre instance name. This should be unique for each deployment. 
- # lustre_instance_id: lustre-instance - + lustre_instance_id: $(vars.deployment_name)-lustre # The values of size_gib and per_unit_storage_throughput are co-related # Please refer https://cloud.google.com/managed-lustre/docs/create-instance#performance-tiers # Storage capacity of the lustre instance in GiB - # lustre_size_gib: 36000 - + lustre_size_gib: 36000 # Maximum throughput of the lustre instance in MBps per TiB - # per_unit_storage_throughput: 500 + per_unit_storage_throughput: 500 deployment_groups: - group: image-env @@ -127,7 +118,7 @@ deployment_groups: "install_cuda": false, "install_gcsfuse": true, "install_lustre": false, - "install_managed_lustre": false, + "install_managed_lustre": true, "install_nvidia_repo": true, "install_ompi": true, "allow_kernel_upgrades": false, @@ -329,43 +320,24 @@ deployment_groups: ip_range: $(vars.rdma_net_range) region: $(vars.region) + - id: private_service_access + source: modules/network/private-service-access + use: [a4high-slurm-net-0] + - id: homefs - source: modules/file-system/filestore + source: modules/file-system/managed-lustre use: - a4high-slurm-net-0 + - private_service_access settings: - filestore_tier: HIGH_SCALE_SSD - size_gb: 10240 + size_gib: $(vars.lustre_size_gib) + name: $(vars.lustre_instance_id) local_mount: /home - reserved_ip_range: $(vars.filestore_ip_range) - deletion_protection: - enabled: true - reason: Avoid data loss + remote_mount: lustrefs + per_unit_storage_throughput: $(vars.per_unit_storage_throughput) outputs: - network_storage - # To use Managed Lustre as for the shared /home directory: - # 1. Comment out the filestore block above and the`filestore_ip_range` line in the vars block. - # 2. Uncomment the managed-lustre and private-service-access blocks - # 3. 
Change the value for "install_managed_lustre" in /var/tmp/slurm_vars.json above to true - # - id: private_service_access - # source: modules/network/private-service-access - # use: [a4high-slurm-net-0] - - # - id: homefs - # source: modules/file-system/managed-lustre - # use: - # - a4high-slurm-net-0 - # - private_service_access - # settings: - # size_gib: $(vars.lustre_size_gib) - # name: $(vars.lustre_instance_id) - # local_mount: /home - # remote_mount: lustrefs - # per_unit_storage_throughput: $(vars.per_unit_storage_throughput) - # outputs: - # - network_storage - # The following four modules create and mount a Cloud Storage Bucket with # gcsfuse. They are optional but recommended for many use cases. # (Optional) The following creates a GCS bucket that will be mounted diff --git a/examples/machine-learning/a4x-highgpu-4g/a4xhigh-slurm-blueprint.yaml b/examples/machine-learning/a4x-highgpu-4g/a4xhigh-slurm-blueprint.yaml index c7c0112326..52fd95f0ab 100644 --- a/examples/machine-learning/a4x-highgpu-4g/a4xhigh-slurm-blueprint.yaml +++ b/examples/machine-learning/a4x-highgpu-4g/a4xhigh-slurm-blueprint.yaml @@ -32,9 +32,7 @@ vars: built_image_family: slurm-ubuntu2404-accelerator-arm64-64k build_slurm_from_git_ref: 6.12.1 # Cluster env settings - # net0 and filestore ranges must not overlap net0_range: 192.168.0.0/19 - filestore_ip_range: 192.168.32.0/24 net1_range: 192.168.64.0/19 rdma_net_range: 192.168.128.0/18 # Cluster Settings @@ -47,21 +45,14 @@ vars: a4x_reservation_name: "" # supply reservation name benchmark_dir: $(ghpc_stage("system_benchmarks")) - # To enable Managed-Lustre please uncomment this section and fill out the settings. - # Additionally, please uncomment the private_service_access and managed-lustre modules. - # Managed Lustre is only supported in specific regions and zones - # Please refer https://cloud.google.com/managed-lustre/docs/locations - # Managed-Lustre instance name. This should be unique for each deployment. 
- # lustre_instance_id: lustre-instance - + lustre_instance_id: $(vars.deployment_name)-lustre # The values of size_gib and per_unit_storage_throughput are co-related # Please refer https://cloud.google.com/managed-lustre/docs/create-instance#performance-tiers # Storage capacity of the lustre instance in GiB - # lustre_size_gib: 36000 - + lustre_size_gib: 36000 # Maximum throughput of the lustre instance in MBps per TiB - # per_unit_storage_throughput: 500 + per_unit_storage_throughput: 500 deployment_groups: - group: image-env @@ -151,7 +142,7 @@ deployment_groups: "install_cuda": false, "allow_kernel_upgrades": false, "monitoring_agent": "cloud-ops", - install_managed_lustre: false, + "install_managed_lustre": true, } - type: shell destination: install_slurm.sh @@ -333,43 +324,24 @@ deployment_groups: ip_range: $(vars.rdma_net_range) region: $(vars.region) + - id: private_service_access + source: modules/network/private-service-access + use: [a4x-slurm-net-0] + - id: homefs - source: modules/file-system/filestore + source: modules/file-system/managed-lustre use: - a4x-slurm-net-0 + - private_service_access settings: - filestore_tier: HIGH_SCALE_SSD - size_gb: 10240 + size_gib: $(vars.lustre_size_gib) + name: $(vars.lustre_instance_id) local_mount: /home - reserved_ip_range: $(vars.filestore_ip_range) - deletion_protection: - enabled: true - reason: Avoid data loss + remote_mount: lustrefs + per_unit_storage_throughput: $(vars.per_unit_storage_throughput) outputs: - network_storage - # To use Managed Lustre as for the shared /home directory: - # 1. Comment out the filestore block above and the`filestore_ip_range` line in the vars block. - # 2. Uncomment the managed-lustre and private-service-access blocks - # 3. 
Change the value for "install_managed_lustre" in /var/tmp/slurm_vars.json above to true - # - id: private_service_access - # source: modules/network/private-service-access - # use: [a4x-slurm-net-0] - - # - id: homefs - # source: modules/file-system/managed-lustre - # use: - # - a4x-slurm-net-0 - # - private_service_access - # settings: - # size_gib: $(vars.lustre_size_gib) - # name: $(vars.lustre_instance_id) - # local_mount: /home - # remote_mount: lustrefs - # per_unit_storage_throughput: $(vars.per_unit_storage_throughput) - # outputs: - # - network_storage - # The following four modules create and mount a Cloud Storage Bucket with # gcsfuse. They are optional but recommended for many use cases. # (Optional) The following creates a GCS bucket that will be mounted diff --git a/tools/cloud-build/daily-tests/builds/ml-a3-highgpu-onspot-slurm.yaml b/tools/cloud-build/daily-tests/builds/ml-a3-highgpu-onspot-slurm.yaml index f9c3e90b86..a160cb2568 100644 --- a/tools/cloud-build/daily-tests/builds/ml-a3-highgpu-onspot-slurm.yaml +++ b/tools/cloud-build/daily-tests/builds/ml-a3-highgpu-onspot-slurm.yaml @@ -15,7 +15,8 @@ --- tags: - slurm6 -- m.filestore +- m.managed-lustre +- m.private-service-access - m.multivpc - m.schedmd-slurm-gcp-v6-controller - m.schedmd-slurm-gcp-v6-login diff --git a/tools/cloud-build/daily-tests/builds/ml-a3-highgpu-slurm.yaml b/tools/cloud-build/daily-tests/builds/ml-a3-highgpu-slurm.yaml index 74e917d617..00d4d6f833 100644 --- a/tools/cloud-build/daily-tests/builds/ml-a3-highgpu-slurm.yaml +++ b/tools/cloud-build/daily-tests/builds/ml-a3-highgpu-slurm.yaml @@ -15,7 +15,8 @@ --- tags: - slurm6 -- m.filestore +- m.managed-lustre +- m.private-service-access - m.multivpc - m.schedmd-slurm-gcp-v6-controller - m.schedmd-slurm-gcp-v6-login diff --git a/tools/cloud-build/daily-tests/builds/ml-a3-megagpu-onspot-slurm-ubuntu.yaml b/tools/cloud-build/daily-tests/builds/ml-a3-megagpu-onspot-slurm-ubuntu.yaml index dd42a42c58..8db09c2720 100644 --- 
a/tools/cloud-build/daily-tests/builds/ml-a3-megagpu-onspot-slurm-ubuntu.yaml +++ b/tools/cloud-build/daily-tests/builds/ml-a3-megagpu-onspot-slurm-ubuntu.yaml @@ -17,7 +17,8 @@ tags: - m.custom-image - m.startup-script - slurm6 -- m.filestore +- m.managed-lustre +- m.private-service-access - m.schedmd-slurm-gcp-v6-controller - m.schedmd-slurm-gcp-v6-login - m.schedmd-slurm-gcp-v6-nodeset diff --git a/tools/cloud-build/daily-tests/builds/ml-a3-megagpu-slurm-ubuntu.yaml b/tools/cloud-build/daily-tests/builds/ml-a3-megagpu-slurm-ubuntu.yaml index 912c2f9918..2282009cc3 100644 --- a/tools/cloud-build/daily-tests/builds/ml-a3-megagpu-slurm-ubuntu.yaml +++ b/tools/cloud-build/daily-tests/builds/ml-a3-megagpu-slurm-ubuntu.yaml @@ -17,7 +17,8 @@ tags: - m.custom-image - m.startup-script - slurm6 -- m.filestore +- m.managed-lustre +- m.private-service-access - m.schedmd-slurm-gcp-v6-controller - m.schedmd-slurm-gcp-v6-login - m.schedmd-slurm-gcp-v6-nodeset diff --git a/tools/cloud-build/daily-tests/builds/ml-a3-ultragpu-onspot-slurm.yaml b/tools/cloud-build/daily-tests/builds/ml-a3-ultragpu-onspot-slurm.yaml index da1706136e..95b3bd23c2 100644 --- a/tools/cloud-build/daily-tests/builds/ml-a3-ultragpu-onspot-slurm.yaml +++ b/tools/cloud-build/daily-tests/builds/ml-a3-ultragpu-onspot-slurm.yaml @@ -17,7 +17,8 @@ tags: - m.custom-image - m.cloud-storage-bucket - m.pre-existing-network-storage -- m.filestore +- m.managed-lustre +- m.private-service-access - m.gpu-rdma-vpc - m.schedmd-slurm-gcp-v6-controller - m.schedmd-slurm-gcp-v6-login diff --git a/tools/cloud-build/daily-tests/builds/ml-a3-ultragpu-slurm.yaml b/tools/cloud-build/daily-tests/builds/ml-a3-ultragpu-slurm.yaml index 10d3ca3118..b099c2db0a 100644 --- a/tools/cloud-build/daily-tests/builds/ml-a3-ultragpu-slurm.yaml +++ b/tools/cloud-build/daily-tests/builds/ml-a3-ultragpu-slurm.yaml @@ -17,7 +17,8 @@ tags: - m.custom-image - m.cloud-storage-bucket - m.pre-existing-network-storage -- m.filestore +- 
m.managed-lustre +- m.private-service-access - m.gpu-rdma-vpc - m.schedmd-slurm-gcp-v6-controller - m.schedmd-slurm-gcp-v6-login diff --git a/tools/cloud-build/daily-tests/builds/ml-a4-highgpu-onspot-slurm.yaml b/tools/cloud-build/daily-tests/builds/ml-a4-highgpu-onspot-slurm.yaml index 807a6d9dc9..3aa5b7b377 100644 --- a/tools/cloud-build/daily-tests/builds/ml-a4-highgpu-onspot-slurm.yaml +++ b/tools/cloud-build/daily-tests/builds/ml-a4-highgpu-onspot-slurm.yaml @@ -17,7 +17,8 @@ tags: - m.custom-image - m.cloud-storage-bucket - m.pre-existing-network-storage -- m.filestore +- m.managed-lustre +- m.private-service-access - m.gpu-rdma-vpc - m.schedmd-slurm-gcp-v6-controller - m.schedmd-slurm-gcp-v6-login