diff --git a/a3/terraform/modules/cluster/mig-cos/README.md b/a3/terraform/modules/cluster/mig-cos/README.md index e65e9bd3..6b591d9b 100644 --- a/a3/terraform/modules/cluster/mig-cos/README.md +++ b/a3/terraform/modules/cluster/mig-cos/README.md @@ -45,7 +45,7 @@ No resources. | [enable\_install\_gpu](#input\_enable\_install\_gpu) | Setting this to false will disable a built-in startup script which:
- installs GPU drivers
- configures docker auth
- installs iptable rules
- installs NCCL and GPUDirectTCPX plugin

Any installation replacements should be in the startup\_script variable | `bool` | `true` | no | | [filestore\_new](#input\_filestore\_new) | Configurations to mount newly created network storage. Each object describes NFS file-servers to be hosted in Filestore.

Related docs: [hpc-toolkit](https://github.com/GoogleCloudPlatform/hpc-toolkit/tree/main/modules/file-system/filestore#inputs).

------------
`filestore_new.filestore_tier`

The service tier of the instance.

Possible values: `["BASIC_HDD", "BASIC_SSD", "HIGH_SCALE_SSD", "ENTERPRISE"]`.

Related docs: [hpc-toolkit](https://github.com/GoogleCloudPlatform/hpc-toolkit/tree/main/modules/file-system/filestore#input_filestore_tier), [gcloud](https://cloud.google.com/sdk/gcloud/reference/filestore/instances/create#--tier).

------------
`filestore_new.local_mount`

Mountpoint for this filestore instance.

Related docs: [hpc-toolkit](https://github.com/GoogleCloudPlatform/hpc-toolkit/tree/main/modules/file-system/filestore#input_local_mount).

------------
`filestore_new.size_gb`

Storage size of the filestore instance in GB.

Related docs: [hpc-toolkit](https://github.com/GoogleCloudPlatform/hpc-toolkit/tree/main/modules/file-system/filestore#input_size_gb), [gcloud](https://cloud.google.com/sdk/gcloud/reference/filestore/instances/create#--file-share).

------------
`filestore_new.zone`

Location for filestore instance.

Related docs: [hpc-toolkit](https://github.com/GoogleCloudPlatform/hpc-toolkit/tree/main/modules/file-system/filestore#input_zone). |
list(object({
filestore_tier = string
local_mount = string
size_gb = number
zone = string
}))
| `[]` | no | | [gcsfuse\_existing](#input\_gcsfuse\_existing) | Configurations to mount existing network storage. Each object describes Cloud Storage Buckets to be mounted with Cloud Storage FUSE.

Related docs: [hpc-toolkit](https://github.com/GoogleCloudPlatform/hpc-toolkit/tree/main/modules/file-system/pre-existing-network-storage#inputs).

------------
`gcsfuse_existing.local_mount`

The mount point where the contents of the device may be accessed after mounting.

Related docs: [hpc-toolkit](https://github.com/GoogleCloudPlatform/hpc-toolkit/tree/main/modules/file-system/pre-existing-network-storage#input_local_mount).

------------
`gcsfuse_existing.remote_mount`

Bucket name without “gs://”.

Related docs: [hpc-toolkit](https://github.com/GoogleCloudPlatform/hpc-toolkit/tree/main/modules/file-system/pre-existing-network-storage#input_remote_mount). |
list(object({
local_mount = string
remote_mount = string
}))
| `[]` | no | -| [instance\_groups](#input\_instance\_groups) | Required Fields:
- `target_size`: The number of running instances for this managed instance group. Related docs: [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_instance_group_manager#target_size), [gcloud](https://cloud.google.com/sdk/gcloud/reference/compute/instance-groups/managed/create#--size).
- `zone`: The zone that instances in this group should be created in. Related docs: [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_instance_group_manager#zone), [gcloud](https://cloud.google.com/sdk/gcloud/reference/compute/instance-groups/managed/create#--zone).
- `machine_type`: (Optional)The name of a Google Compute Engine machine type. There are [many possible values](https://cloud.google.com/compute/docs/machine-resource). Related docs: [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_instance_template#machine_type), [gcloud](https://cloud.google.com/sdk/gcloud/reference/compute/instance-templates/create#--machine-type).
- `existing_resource_policy_name`: (Optional) The existing resource policy. |
list(object({
zone = string
target_size = number
machine_type = optional(string, "a3-highgpu-8g")
existing_resource_policy_name = optional(string, null)
}))
| n/a | yes | +| [instance\_groups](#input\_instance\_groups) | Required Fields:
- `target_size`: The number of running instances for this managed instance group. Related docs: [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_instance_group_manager#target_size), [gcloud](https://cloud.google.com/sdk/gcloud/reference/compute/instance-groups/managed/create#--size).
- `zone`: The zone that instances in this group should be created in. Related docs: [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_instance_group_manager#zone), [gcloud](https://cloud.google.com/sdk/gcloud/reference/compute/instance-groups/managed/create#--zone).
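- `machine_type`: (Optional) The name of a Google Compute Engine machine type. There are [many possible values](https://cloud.google.com/compute/docs/machine-resource). Related docs: [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_instance_template#machine_type), [gcloud](https://cloud.google.com/sdk/gcloud/reference/compute/instance-templates/create#--machine-type).
- `compact_placement_policy`: (Optional) Placement settings for the instances in this group. Set `new_policy` to `true` to create a new compact placement policy, or set `existing_policy_name` to use an existing resource policy. Set `specific_reservation` to the name of a reservation the instances should consume. Related docs: [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_instance_template#resource_policies). |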
list(object({
zone = string
target_size = number
machine_type = optional(string, "a3-highgpu-8g")
compact_placement_policy = optional(object({
new_policy = optional(bool, false)
existing_policy_name = optional(string)
specific_reservation = optional(string)
}))
}))
| n/a | yes | | [labels](#input\_labels) | The resource labels (a map of key/value pairs) to be applied to the GPU cluster.

Related docs: [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_instance_template#labels), [gcloud](https://cloud.google.com/sdk/gcloud/reference/compute/instance-templates/create#--labels). | `map(string)` | `{}` | no | | [machine\_image](#input\_machine\_image) | The image with which this disk will initialize. This image must be in the project `cos-cloud`.

Related docs: [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_instance_template#source_image).

------------
`machine_image.family`

The family of images from which the latest non-deprecated image will be selected. Conflicts with `machine_image.name`.

Related docs: [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/data-sources/compute_image#family), [gcloud](https://cloud.google.com/sdk/gcloud/reference/compute/instance-templates/create#--image-family).

------------
`machine_image.name`

The name of a specific image. Conflicts with `machine_image.family`.

Related docs: [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/data-sources/compute_image#name), [gcloud](https://cloud.google.com/sdk/gcloud/reference/compute/instance-templates/create#--image).

------------
`machine_image.project`

The project\_id to which this image belongs.

Related docs: [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/data-sources/compute_image#project), [gcloud](https://cloud.google.com/sdk/gcloud/reference/compute/instance-templates/create#--image-project). |
object({
family = string
name = string
project = string
})
|
{
"family": "cos-stable",
"name": null,
"project": "cos-cloud"
}
| no | | [maintenance\_interval](#input\_maintenance\_interval) | Specifies the frequency of planned maintenance events. 'PERIODIC' is the only supported value for maintenance\_interval.

Related docs: [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_instance_template#maintenance_interval). | `string` | `null` | no | diff --git a/a3/terraform/modules/cluster/mig-cos/main.tf b/a3/terraform/modules/cluster/mig-cos/main.tf index 701581b3..59be3357 100644 --- a/a3/terraform/modules/cluster/mig-cos/main.tf +++ b/a3/terraform/modules/cluster/mig-cos/main.tf @@ -70,22 +70,21 @@ module "compute_instance_template" { source = "../../common/instance_template" count = length(var.instance_groups) - disk_size_gb = var.disk_size_gb - disk_type = var.disk_type - machine_image = var.machine_image - machine_type = var.instance_groups[count.index].machine_type - maintenance_interval = var.maintenance_interval - metadata = local.metadata - project_id = var.project_id - region = var.region - resource_prefix = var.resource_prefix - service_account = var.service_account - use_compact_placement_policy = var.use_compact_placement_policy - existing_resource_policy_name = var.instance_groups[count.index].existing_resource_policy_name - startup_script = null - subnetwork_self_links = module.network.subnetwork_self_links - network_self_links = module.network.network_self_links - labels = merge(var.labels, { ghpc_role = "compute" }) + compact_placement_policy = var.instance_groups[count.index].compact_placement_policy + disk_size_gb = var.disk_size_gb + disk_type = var.disk_type + machine_image = var.machine_image + machine_type = var.instance_groups[count.index].machine_type + maintenance_interval = var.maintenance_interval + metadata = local.metadata + project_id = var.project_id + region = var.region + resource_prefix = var.resource_prefix + service_account = var.service_account + startup_script = null + subnetwork_self_links = module.network.subnetwork_self_links + network_self_links = module.network.network_self_links + labels = merge(var.labels, { ghpc_role = "compute" }) } module "compute_instance_group_manager" { diff 
--git a/a3/terraform/modules/cluster/mig-cos/variables.tf b/a3/terraform/modules/cluster/mig-cos/variables.tf index 8af2c762..dc241001 100644 --- a/a3/terraform/modules/cluster/mig-cos/variables.tf +++ b/a3/terraform/modules/cluster/mig-cos/variables.tf @@ -20,13 +20,16 @@ variable "instance_groups" { - `target_size`: The number of running instances for this managed instance group. Related docs: [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_instance_group_manager#target_size), [gcloud](https://cloud.google.com/sdk/gcloud/reference/compute/instance-groups/managed/create#--size). - `zone`: The zone that instances in this group should be created in. Related docs: [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_instance_group_manager#zone), [gcloud](https://cloud.google.com/sdk/gcloud/reference/compute/instance-groups/managed/create#--zone). - `machine_type`: (Optional)The name of a Google Compute Engine machine type. There are [many possible values](https://cloud.google.com/compute/docs/machine-resource). Related docs: [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_instance_template#machine_type), [gcloud](https://cloud.google.com/sdk/gcloud/reference/compute/instance-templates/create#--machine-type). - - `existing_resource_policy_name`: (Optional) The existing resource policy. 
EOT type = list(object({ - zone = string - target_size = number - machine_type = optional(string, "a3-highgpu-8g") - existing_resource_policy_name = optional(string, null) + zone = string + target_size = number + machine_type = optional(string, "a3-highgpu-8g") + compact_placement_policy = optional(object({ + new_policy = optional(bool, false) + existing_policy_name = optional(string) + specific_reservation = optional(string) + })) })) nullable = false diff --git a/a3/terraform/modules/common/instance_template/README.md b/a3/terraform/modules/common/instance_template/README.md index e827b803..7978e830 100644 --- a/a3/terraform/modules/common/instance_template/README.md +++ b/a3/terraform/modules/common/instance_template/README.md @@ -42,9 +42,9 @@ No requirements. | Name | Description | Type | Default | Required | |------|-------------|------|---------|:--------:| +| [compact\_placement\_policy](#input\_compact\_placement\_policy) | The flag to create and use a superblock level compact placement policy for the instances. Currently GCE supports using only 1 placement policy.

Related docs: [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_instance_template#resource_policies). |
object({
new_policy = optional(bool, false)
existing_policy_name = optional(string)
specific_reservation = optional(string)
})
| `null` | no | | [disk\_size\_gb](#input\_disk\_size\_gb) | The size of the image in gigabytes for the boot disk of each instance.

Related docs: [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_instance_template#disk_size_gb), [gcloud](https://cloud.google.com/sdk/gcloud/reference/compute/instance-templates/create#--boot-disk-size). | `number` | n/a | yes | | [disk\_type](#input\_disk\_type) | The GCE disk type for the boot disk of each instance.

Possible values: `["pd-ssd", "local-ssd", "pd-balanced", "pd-standard"]`

Related docs: [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_instance_template#disk_type), [gcloud](https://cloud.google.com/sdk/gcloud/reference/compute/instance-templates/create#--boot-disk-type). | `string` | n/a | yes | -| [existing\_resource\_policy\_name](#input\_existing\_resource\_policy\_name) | The name of the existing resource policy.

Related docs: [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_resource_policy#name). | `string` | `null` | no | | [labels](#input\_labels) | A set of key/value label pairs to assign to instances created from this template.

Related docs: [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_instance_template#labels), [gcloud](https://cloud.google.com/sdk/gcloud/reference/compute/instance-templates/create#--labels). | `map(string)` | n/a | yes | | [machine\_image](#input\_machine\_image) | The image with which this disk will initialize.

Related docs: [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_instance_template#source_image).

------------
`machine_image.family`

The family of images from which the latest non-deprecated image will be selected. Conflicts with `machine_image.name`.

Related docs: [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/data-sources/compute_image#family), [gcloud](https://cloud.google.com/sdk/gcloud/reference/compute/instance-templates/create#--image-family).

------------
`machine_image.name`

The name of a specific image. Conflicts with `machine_image.family`.

Related docs: [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/data-sources/compute_image#name), [gcloud](https://cloud.google.com/sdk/gcloud/reference/compute/instance-templates/create#--image).

------------
`machine_image.project`

The project\_id to which this image belongs.

Related docs: [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/data-sources/compute_image#project), [gcloud](https://cloud.google.com/sdk/gcloud/reference/compute/instance-templates/create#--image-project). |
object({
family = string
name = string
project = string
})
| n/a | yes | | [machine\_type](#input\_machine\_type) | The name of a Google Compute Engine machine type. There are [many possible values](https://cloud.google.com/compute/docs/machine-resource).

Related docs: [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_instance_template#machine_type), [gcloud](https://cloud.google.com/sdk/gcloud/reference/compute/instance-templates/create#--machine-type). | `string` | n/a | yes | @@ -57,7 +57,6 @@ No requirements. | [service\_account](#input\_service\_account) | Service account to attach to the instance.

Related docs: [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_instance_template#service_account).

------------
`service_account.email`

The service account e-mail address. If not given, the default Google Compute Engine service account is used.

Related docs: [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_instance_template#email), [gcloud](https://cloud.google.com/sdk/gcloud/reference/compute/instance-templates/create#--service-account).

------------
`service_account.scopes`

A list of service scopes. Both OAuth2 URLs and gcloud short names are supported. To allow full access to all Cloud APIs, use the `"cloud-platform"` scope. See a complete list of scopes [here](https://cloud.google.com/sdk/gcloud/reference/alpha/compute/instances/set-scopes#--scopes).

Related docs: [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_instance_template#scopes), [gcloud](https://cloud.google.com/sdk/gcloud/reference/compute/instance-templates/create#--scopes). |
object({
email = string,
scopes = set(string)
})
| n/a | yes | | [startup\_script](#input\_startup\_script) | Script to run at boot on each instance. This is here for convenience and will just be appended to `metadata` under the key `"startup-script"`. | `string` | n/a | yes | | [subnetwork\_self\_links](#input\_subnetwork\_self\_links) | The subnet self-links for all the VPCs. | `list(string)` | n/a | yes | -| [use\_compact\_placement\_policy](#input\_use\_compact\_placement\_policy) | The flag to create and use a superblock level compact placement policy for the instances. Currently GCE supports using only 1 placement policy.

Related docs: [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_instance_template#resource_policies). | `bool` | `false` | no | | [use\_static\_naming](#input\_use\_static\_naming) | Flag to determine whether to use static naming for instance\_template name. If used static naming, then instance\_template cannot be updated. it needs to be destroyed and then recreated.

Related docs: [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_instance_template#name_prefix). | `bool` | `false` | no | ## Outputs diff --git a/a3/terraform/modules/common/instance_template/main.tf b/a3/terraform/modules/common/instance_template/main.tf index 578b90f9..69698f3c 100644 --- a/a3/terraform/modules/common/instance_template/main.tf +++ b/a3/terraform/modules/common/instance_template/main.tf @@ -73,27 +73,28 @@ data "google_compute_image" "image" { module "resource_policy" { source = "../resource_policy" - count = var.use_compact_placement_policy ? 1 : 0 + count = try( + var.compact_placement_policy.new_policy || var.compact_placement_policy.existing_policy_name != null + , false + ) ? 1 : 0 project_id = var.project_id region = var.region - new_resource_policy_name = var.existing_resource_policy_name == null ? var.resource_prefix : null - existing_resource_policy_name = var.existing_resource_policy_name == null ? null : var.existing_resource_policy_name + new_resource_policy_name = var.compact_placement_policy.new_policy ? var.resource_prefix : null + existing_resource_policy_name = var.compact_placement_policy.existing_policy_name } resource "google_compute_instance_template" "template" { provider = google-beta - labels = var.labels - machine_type = var.machine_type - metadata = local.metadata - name = var.use_static_naming ? var.resource_prefix : null - name_prefix = var.use_static_naming ? null : var.resource_prefix - project = var.project_id - region = var.region - resource_policies = var.use_compact_placement_policy ? [ - module.resource_policy[0].resource_self_link - ] : [] + labels = var.labels + machine_type = var.machine_type + metadata = local.metadata + name = var.use_static_naming ? var.resource_prefix : null + name_prefix = var.use_static_naming ? 
null : var.resource_prefix + project = var.project_id + region = var.region + resource_policies = module.resource_policy[*].resource_self_link disk { auto_delete = true @@ -141,4 +142,19 @@ resource "google_compute_instance_template" "template" { metadata["ssh-keys"], ] } + + dynamic "reservation_affinity" { + for_each = try( + var.compact_placement_policy.specific_reservation, + null, + ) != null ? [var.compact_placement_policy.specific_reservation] : [] + + content { + type = "SPECIFIC_RESERVATION" + specific_reservation { + key = "compute.googleapis.com/reservation-name" + values = [reservation_affinity.value] + } + } + } } diff --git a/a3/terraform/modules/common/instance_template/variables.tf b/a3/terraform/modules/common/instance_template/variables.tf index a4ef627d..9568ad59 100644 --- a/a3/terraform/modules/common/instance_template/variables.tf +++ b/a3/terraform/modules/common/instance_template/variables.tf @@ -120,24 +120,18 @@ variable "maintenance_interval" { } } -variable "use_compact_placement_policy" { +variable "compact_placement_policy" { description = <<-EOT The flag to create and use a superblock level compact placement policy for the instances. Currently GCE supports using only 1 placement policy. Related docs: [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_instance_template#resource_policies). EOT - type = bool - default = false -} - -variable "existing_resource_policy_name" { - description = <<-EOT - The name of the existing resource policy. - - Related docs: [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_resource_policy#name). - EOT - type = string - default = null + type = object({ + new_policy = optional(bool, false) + existing_policy_name = optional(string) + specific_reservation = optional(string) + }) + default = null } variable "metadata" {