Skip to content
Draft
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
43 changes: 4 additions & 39 deletions modules/internal/gpu-definition/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
*/

variable "machine_type" {
description = "Machine type to use for the instance creation"
Expand Down Expand Up @@ -45,44 +45,9 @@ locals {
# type = "https://www.googleapis.com/compute/beta/projects/PROJECT/zones/ZONE/acceleratorTypes/nvidia-tesla-a100"
# },
# ]
accelerator_machines = {
"a2-highgpu-1g" = { type = "nvidia-tesla-a100", count = 1 },
"a2-highgpu-2g" = { type = "nvidia-tesla-a100", count = 2 },
"a2-highgpu-4g" = { type = "nvidia-tesla-a100", count = 4 },
"a2-highgpu-8g" = { type = "nvidia-tesla-a100", count = 8 },
"a2-megagpu-16g" = { type = "nvidia-tesla-a100", count = 16 },
"a2-ultragpu-1g" = { type = "nvidia-a100-80gb", count = 1 },
"a2-ultragpu-2g" = { type = "nvidia-a100-80gb", count = 2 },
"a2-ultragpu-4g" = { type = "nvidia-a100-80gb", count = 4 },
"a2-ultragpu-8g" = { type = "nvidia-a100-80gb", count = 8 },
"a3-highgpu-1g" = { type = "nvidia-h100-80gb", count = 1 },
"a3-highgpu-2g" = { type = "nvidia-h100-80gb", count = 2 },
"a3-highgpu-4g" = { type = "nvidia-h100-80gb", count = 4 },
"a3-highgpu-8g" = { type = "nvidia-h100-80gb", count = 8 },
"a3-megagpu-8g" = { type = "nvidia-h100-mega-80gb", count = 8 },
"a3-ultragpu-8g" = { type = "nvidia-h200-141gb", count = 8 },
"a4-highgpu-8g-lowmem" = { type = "nvidia-b200", count = 8 },
"a4-highgpu-8g" = { type = "nvidia-b200", count = 8 },
"a4x-highgpu-4g" = { type = "nvidia-gb200", count = 4 },
"a4x-highgpu-4g-nolssd" = { type = "nvidia-gb200", count = 4 },
"a4x-maxgpu-4g-metal" = { type = "nvidia-gb300", count = 4 },
"a4x-maxgpu-4g-metal-nolssd" = { type = "nvidia-gb300", count = 4 },
"g2-standard-4" = { type = "nvidia-l4", count = 1 },
"g2-standard-8" = { type = "nvidia-l4", count = 1 },
"g2-standard-12" = { type = "nvidia-l4", count = 1 },
"g2-standard-16" = { type = "nvidia-l4", count = 1 },
"g2-standard-24" = { type = "nvidia-l4", count = 2 },
"g2-standard-32" = { type = "nvidia-l4", count = 1 },
"g2-standard-48" = { type = "nvidia-l4", count = 4 },
"g2-standard-96" = { type = "nvidia-l4", count = 8 },
"g4-standard-6" = { type = "nvidia-rtx-pro-6000", count = 1 },
"g4-standard-12" = { type = "nvidia-rtx-pro-6000", count = 1 },
"g4-standard-24" = { type = "nvidia-rtx-pro-6000", count = 1 },
"g4-standard-48" = { type = "nvidia-rtx-pro-6000", count = 1 },
"g4-standard-96" = { type = "nvidia-rtx-pro-6000", count = 2 },
"g4-standard-192" = { type = "nvidia-rtx-pro-6000", count = 4 },
"g4-standard-384" = { type = "nvidia-rtx-pro-6000", count = 8 },
}
accelerators_json = jsondecode(file("${path.module}/../../../pkg/config/accelerators.json"))
accelerator_machines = local.accelerators_json.gpus

generated_guest_accelerator = try([local.accelerator_machines[var.machine_type]], [])

# Select in priority order:
Expand Down
30 changes: 7 additions & 23 deletions modules/internal/tpu-definition/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,12 @@
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
*/

locals {
# Load shared JSON
accelerators_json = jsondecode(file("${path.module}/../../../pkg/config/accelerators.json"))

# Determine if this is a TPU node pool by checking if the machine_type exists in our authoritative map of TPU machine types.
is_tpu = contains(keys(local.tpu_chip_count_map), var.machine_type)

Expand All @@ -33,31 +36,12 @@ locals {
"tpu7x" = "tpu7x" # TPU v7x
}

# Map specific GCE machine types to the number of TPU chips per node (VM).
# The machine-type map must be updated to reflect new TPU releases with reference to public documentation: https://docs.cloud.google.com/tpu/docs/intro-to-tpu
# Project shared JSON into the expected format for tpu_chip_count_map (machine_type -> count)
tpu_chip_count_map = {
# v4 - ct4p
"ct4p-hightpu-4t" = 4

# v5e - ct5lp
"ct5lp-hightpu-1t" = 1
"ct5lp-hightpu-4t" = 4
"ct5lp-hightpu-8t" = 8

# v5p - ct5p
"ct5p-hightpu-1t" = 1
"ct5p-hightpu-2t" = 2
"ct5p-hightpu-4t" = 4

# v6e - ct6e
"ct6e-standard-1t" = 1
"ct6e-standard-4t" = 4
"ct6e-standard-8t" = 8

# v7x - tpu7x
"tpu7x-standard-4t" = 4
for k, v in local.accelerators_json.tpus : k => v.count
}


# Robustly extract the machine family prefix (e.g., "ct6e").
tpu_machine_family = local.is_tpu ? element(split("-", var.machine_type), 0) : ""
tpu_accelerator_type = local.is_tpu ? lookup(local.tpu_accelerator_map, local.tpu_machine_family, null) : null
Expand Down
50 changes: 50 additions & 0 deletions pkg/config/accelerators.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
{
"gpus": {
"a2-highgpu-1g": { "count": 1, "type": "nvidia-tesla-a100" },
"a2-highgpu-2g": { "count": 2, "type": "nvidia-tesla-a100" },
"a2-highgpu-4g": { "count": 4, "type": "nvidia-tesla-a100" },
"a2-highgpu-8g": { "count": 8, "type": "nvidia-tesla-a100" },
"a2-megagpu-16g": { "count": 16, "type": "nvidia-tesla-a100" },
"a2-ultragpu-1g": { "count": 1, "type": "nvidia-a100-80gb" },
"a2-ultragpu-2g": { "count": 2, "type": "nvidia-a100-80gb" },
"a2-ultragpu-4g": { "count": 4, "type": "nvidia-a100-80gb" },
"a2-ultragpu-8g": { "count": 8, "type": "nvidia-a100-80gb" },
"a3-highgpu-1g": { "count": 1, "type": "nvidia-h100-80gb" },
"a3-highgpu-2g": { "count": 2, "type": "nvidia-h100-80gb" },
"a3-highgpu-4g": { "count": 4, "type": "nvidia-h100-80gb" },
"a3-highgpu-8g": { "count": 8, "type": "nvidia-h100-80gb" },
"a3-megagpu-8g": { "count": 8, "type": "nvidia-h100-mega-80gb" },
"a3-ultragpu-8g": { "count": 8, "type": "nvidia-h200-141gb" },
"a4-highgpu-8g-lowmem": { "count": 8, "type": "nvidia-b200" },
"a4-highgpu-8g": { "count": 8, "type": "nvidia-b200" },
"a4x-highgpu-4g": { "count": 4, "type": "nvidia-gb200" },
"a4x-highgpu-4g-nolssd": { "count": 4, "type": "nvidia-gb200" },
"a4x-maxgpu-4g-metal": { "count": 4, "type": "nvidia-gb300" },
"a4x-maxgpu-4g-metal-nolssd": { "count": 4, "type": "nvidia-gb300" },
"g2-standard-4": { "count": 1, "type": "nvidia-l4" },
"g2-standard-8": { "count": 1, "type": "nvidia-l4" },
"g2-standard-12": { "count": 1, "type": "nvidia-l4" },
"g2-standard-16": { "count": 1, "type": "nvidia-l4" },
"g2-standard-24": { "count": 2, "type": "nvidia-l4" },
"g2-standard-32": { "count": 1, "type": "nvidia-l4" },
"g2-standard-48": { "count": 4, "type": "nvidia-l4" },
"g2-standard-96": { "count": 8, "type": "nvidia-l4" },
"g4-standard-6": { "count": 1, "type": "nvidia-rtx-pro-6000" },
"g4-standard-12": { "count": 1, "type": "nvidia-rtx-pro-6000" },
"g4-standard-24": { "count": 1, "type": "nvidia-rtx-pro-6000" },
"g4-standard-48": { "count": 1, "type": "nvidia-rtx-pro-6000" },
"g4-standard-96": { "count": 2, "type": "nvidia-rtx-pro-6000" },
"g4-standard-192": { "count": 4, "type": "nvidia-rtx-pro-6000" },
"g4-standard-384": { "count": 8, "type": "nvidia-rtx-pro-6000" }
},
"tpus": {
"ct4p-hightpu-4t": { "count": 4, "type": "tpu-v4-podslice" },
"ct5lp-hightpu-1t": { "count": 1, "type": "tpu-v5-lite-podslice" },
"ct5lp-hightpu-4t": { "count": 4, "type": "tpu-v5-lite-podslice" },
"ct5lp-hightpu-8t": { "count": 8, "type": "tpu-v5-lite-podslice" },
"ct5p-hightpu-1t": { "count": 1, "type": "tpu-v5p-slice" },
"ct5p-hightpu-2t": { "count": 2, "type": "tpu-v5p-slice" },
"ct5p-hightpu-4t": { "count": 4, "type": "tpu-v5p-slice" },
"ct6e-standard-1t": { "count": 1, "type": "tpu-v6e-slice" },
"ct6e-standard-4t": { "count": 4, "type": "tpu-v6e-slice" },
"ct6e-standard-8t": { "count": 8, "type": "tpu-v6e-slice" },
"tpu7x-standard-4t": { "count": 4, "type": "tpu7x" }
}
}
Loading