Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@ blueprint_name: a3high-slurm

vars:
sys_net_range: 172.16.0.0/16
filestore_ip_range: 192.168.0.0/29
source_image_project_id: ubuntu-os-cloud
source_image_family: ubuntu-2204-lts
local_mount_homefs: /home
Expand All @@ -31,6 +30,14 @@ vars:
a3_reservation_name: "" # supply reservation name
a3_dws_flex_enabled: false
a3_enable_spot_vm: false
# Managed-Lustre instance name. This should be unique for each deployment.
lustre_instance_id: $(vars.deployment_name)-lustre
# The values of size_gib and per_unit_storage_throughput are correlated.
# Please refer to https://cloud.google.com/managed-lustre/docs/create-instance#performance-tiers
# Storage capacity of the lustre instance in GiB
lustre_size_gib: 36000
# Maximum throughput of the lustre instance in MBps per TiB
per_unit_storage_throughput: 500

deployment_groups:
- group: base
Expand Down Expand Up @@ -58,15 +65,22 @@ deployment_groups:
network_count: 4
subnetwork_cidr_suffix: 20

- id: private_service_access
source: modules/network/private-service-access
use:
- sysnet

- id: homefs
source: modules/file-system/filestore
source: modules/file-system/managed-lustre
use:
- sysnet
- private_service_access
settings:
filestore_tier: BASIC_SSD
size_gb: 2560
reserved_ip_range: $(vars.filestore_ip_range)
size_gib: $(vars.lustre_size_gib)
name: $(vars.lustre_instance_id)
local_mount: $(vars.local_mount_homefs)
remote_mount: lustrefs
per_unit_storage_throughput: $(vars.per_unit_storage_throughput)

- group: build-script
modules:
Expand Down Expand Up @@ -294,6 +308,7 @@ deployment_groups:
"install_cuda": false,
"install_gcsfuse": true,
"install_lustre": false,
"install_managed_lustre": true,
"install_ompi": true,
"monitoring_agent": "cloud-ops",
"nvidia_version": "latest",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,16 @@ vars:
a3mega_dws_flex_enabled: false
a3mega_enable_spot_vm: false

# Managed-Lustre instance name. This should be unique for each deployment.
lustre_instance_id: $(vars.deployment_name)-lustre

# The values of size_gib and per_unit_storage_throughput are correlated.
# Please refer to https://cloud.google.com/managed-lustre/docs/create-instance#performance-tiers
# Storage capacity of the lustre instance in GiB
lustre_size_gib: 36000
# Maximum throughput of the lustre instance in MBps per TiB
per_unit_storage_throughput: 500

deployment_groups:
- group: primary
modules:
Expand Down Expand Up @@ -135,7 +145,7 @@ deployment_groups:
"install_cuda": false,
"install_ompi": true,
"install_lustre": false,
"install_managed_lustre": false,
"install_managed_lustre": true,
"install_gcsfuse": true,
"monitoring_agent": "cloud-ops",
"use_open_drivers": true
Expand Down Expand Up @@ -408,19 +418,18 @@ deployment_groups:
source: modules/network/private-service-access
use:
- sysnet

- id: homefs
source: modules/file-system/filestore
source: modules/file-system/managed-lustre
use:
- sysnet
- private_service_access
settings:
filestore_tier: HIGH_SCALE_SSD
size_gb: 10240
local_mount: /home
mount_options: "defaults,hard"
deletion_protection:
enabled: true
reason: Avoid data loss
size_gib: $(vars.lustre_size_gib)
name: $(vars.lustre_instance_id)
local_mount: $(vars.local_mount_homefs)
remote_mount: lustrefs
per_unit_storage_throughput: $(vars.per_unit_storage_throughput)
outputs:
- network_storage

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,9 +28,7 @@ vars:
image_build_machine_type: n2-standard-16
build_slurm_from_git_ref: 6.12.1
# Cluster env settings
# net0 and filestore ranges must not overlap
net0_range: 192.168.0.0/19
filestore_ip_range: 192.168.32.0/24
net1_range: 192.168.64.0/18
rdma_net_range: 192.168.128.0/18
# Cluster Settings
Expand All @@ -48,21 +46,18 @@ vars:
a3u_dws_flex_enabled: false
a3u_enable_spot_vm: false

# To enable Managed-Lustre please uncomment this section and fill out the settings.
# Additionally, please uncomment the private_service_access and managed-lustre modules.
# Managed Lustre is only supported in specific regions and zones.
# Please refer to https://cloud.google.com/managed-lustre/docs/locations

# Managed-Lustre instance name. This should be unique for each deployment.
# lustre_instance_id: lustre-instance
lustre_instance_id: $(vars.deployment_name)-lustre

# The values of size_gib and per_unit_storage_throughput are correlated.
# Please refer to https://cloud.google.com/managed-lustre/docs/create-instance#performance-tiers
# Storage capacity of the lustre instance in GiB
# lustre_size_gib: 36000
lustre_size_gib: 36000

# Maximum throughput of the lustre instance in MBps per TiB
# per_unit_storage_throughput: 500
per_unit_storage_throughput: 500

deployment_groups:
- group: image-env
Expand Down Expand Up @@ -126,7 +121,7 @@ deployment_groups:
"install_cuda": false,
"install_gcsfuse": true,
"install_lustre": false,
"install_managed_lustre": false,
"install_managed_lustre": true,
"install_nvidia_repo": true,
"install_ompi": true,
"allow_kernel_upgrades": false,
Expand Down Expand Up @@ -326,43 +321,24 @@ deployment_groups:
ip_range: $(vars.rdma_net_range)
region: $(vars.region)

- id: private_service_access
source: modules/network/private-service-access
use: [a3ultra-slurm-net-0]

- id: homefs
source: modules/file-system/filestore
source: modules/file-system/managed-lustre
use:
- a3ultra-slurm-net-0
- private_service_access
settings:
filestore_tier: HIGH_SCALE_SSD
size_gb: 10240
size_gib: $(vars.lustre_size_gib)
name: $(vars.lustre_instance_id)
local_mount: /home
reserved_ip_range: $(vars.filestore_ip_range)
deletion_protection:
enabled: true
reason: Avoid data loss
remote_mount: lustrefs
per_unit_storage_throughput: $(vars.per_unit_storage_throughput)
outputs:
- network_storage

# To use Managed Lustre for the shared /home directory:
# 1. Comment out the filestore block above and the `filestore_ip_range` line in the vars block.
# 2. Uncomment the managed-lustre and private-service-access blocks
# 3. Change the value for "install_managed_lustre" in /var/tmp/slurm_vars.json above to true
# - id: private_service_access
# source: modules/network/private-service-access
# use: [a3ultra-slurm-net-0]

# - id: homefs
# source: modules/file-system/managed-lustre
# use:
# - a3ultra-slurm-net-0
# - private_service_access
# settings:
# size_gib: $(vars.lustre_size_gib)
# name: $(vars.lustre_instance_id)
# local_mount: /home
# remote_mount: lustrefs
# per_unit_storage_throughput: $(vars.per_unit_storage_throughput)
# outputs:
# - network_storage

# The following four modules create and mount a Cloud Storage Bucket with
# gcsfuse. They are optional but recommended for many use cases.
# (Optional) The following creates a GCS bucket that will be mounted
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,9 +28,7 @@ vars:
image_build_machine_type: n2-standard-16
build_slurm_from_git_ref: 6.12.1
# Cluster env settings
# net0 and filestore ranges must not overlap
net0_range: 192.168.0.0/19
filestore_ip_range: 192.168.32.0/24
net1_range: 192.168.64.0/18
rdma_net_range: 192.168.128.0/18
# Cluster Settings
Expand All @@ -49,21 +47,14 @@ vars:
a4h_dws_flex_enabled: false
a4h_enable_spot_vm: false

# To enable Managed-Lustre please uncomment this section and fill out the settings.
# Additionally, please uncomment the private_service_access and managed-lustre modules.
# Managed Lustre is only supported in specific regions and zones.
# Please refer to https://cloud.google.com/managed-lustre/docs/locations

# Managed-Lustre instance name. This should be unique for each deployment.
# lustre_instance_id: lustre-instance

lustre_instance_id: $(vars.deployment_name)-lustre
# The values of size_gib and per_unit_storage_throughput are correlated.
# Please refer to https://cloud.google.com/managed-lustre/docs/create-instance#performance-tiers
# Storage capacity of the lustre instance in GiB
# lustre_size_gib: 36000

lustre_size_gib: 36000
# Maximum throughput of the lustre instance in MBps per TiB
# per_unit_storage_throughput: 500
per_unit_storage_throughput: 500

deployment_groups:
- group: image-env
Expand Down Expand Up @@ -127,7 +118,7 @@ deployment_groups:
"install_cuda": false,
"install_gcsfuse": true,
"install_lustre": false,
"install_managed_lustre": false,
"install_managed_lustre": true,
"install_nvidia_repo": true,
"install_ompi": true,
"allow_kernel_upgrades": false,
Expand Down Expand Up @@ -329,43 +320,24 @@ deployment_groups:
ip_range: $(vars.rdma_net_range)
region: $(vars.region)

- id: private_service_access
source: modules/network/private-service-access
use: [a4high-slurm-net-0]

- id: homefs
source: modules/file-system/filestore
source: modules/file-system/managed-lustre
use:
- a4high-slurm-net-0
- private_service_access
settings:
filestore_tier: HIGH_SCALE_SSD
size_gb: 10240
size_gib: $(vars.lustre_size_gib)
name: $(vars.lustre_instance_id)
local_mount: /home
reserved_ip_range: $(vars.filestore_ip_range)
deletion_protection:
enabled: true
reason: Avoid data loss
remote_mount: lustrefs
per_unit_storage_throughput: $(vars.per_unit_storage_throughput)
outputs:
- network_storage

# To use Managed Lustre for the shared /home directory:
# 1. Comment out the filestore block above and the `filestore_ip_range` line in the vars block.
# 2. Uncomment the managed-lustre and private-service-access blocks
# 3. Change the value for "install_managed_lustre" in /var/tmp/slurm_vars.json above to true
# - id: private_service_access
# source: modules/network/private-service-access
# use: [a4high-slurm-net-0]

# - id: homefs
# source: modules/file-system/managed-lustre
# use:
# - a4high-slurm-net-0
# - private_service_access
# settings:
# size_gib: $(vars.lustre_size_gib)
# name: $(vars.lustre_instance_id)
# local_mount: /home
# remote_mount: lustrefs
# per_unit_storage_throughput: $(vars.per_unit_storage_throughput)
# outputs:
# - network_storage

# The following four modules create and mount a Cloud Storage Bucket with
# gcsfuse. They are optional but recommended for many use cases.
# (Optional) The following creates a GCS bucket that will be mounted
Expand Down
Loading
Loading