diff --git a/goldens/Basic_cluster_create.txt b/goldens/Basic_cluster_create.txt index 014f9359d..d245db1e4 100644 --- a/goldens/Basic_cluster_create.txt +++ b/goldens/Basic_cluster_create.txt @@ -37,11 +37,11 @@ kubectl wait deployment/coredns --for=condition=Available=true --namespace=kube- [XPK] Task: `Determine current gke master version` is implemented by the following command not running since it is a dry run. gcloud beta container clusters describe golden-cluster --location us-central1 --project golden-project --format="value(currentMasterVersion)" [XPK] Creating 1 node pool or pools of tpu7x-8 -We assume that the underlying system is: SystemCharacteristics(topology='2x2x1', vms_per_slice=1, gke_accelerator='tpu7x', gce_machine_type='tpu7x-standard-4t', chips_per_vm=4, accelerator_type=TPU, device_type='tpu7x-8', supports_sub_slicing=False, supports_super_slicing=False, supports_accelerator_network_profile=False, docker_platform=, requires_workload_policy=False, gpu_config=None) +We assume that the underlying system is: SystemCharacteristics(topology='2x2x1', vms_per_slice=1, gke_accelerator='tpu7x', gce_machine_type='tpu7x-standard-4t', chips_per_vm=4, accelerator_type=TPU, device_type='tpu7x-8', supports_sub_slicing=False, supports_super_slicing=False, supports_accelerator_network_profile=False, docker_platform=, requires_workload_policy=False, gpu_config=None, parallel_containers=2) [XPK] Task: `Get All Node Pools` is implemented by the following command not running since it is a dry run. gcloud beta container node-pools list --cluster golden-cluster --project=golden-project --location=us-central1 --format="csv[no-heading](name)" [XPK] Creating 1 node pool or pools of tpu7x-8 -Underlyingly, we assume that means: SystemCharacteristics(topology='2x2x1', vms_per_slice=1, gke_accelerator='tpu7x', gce_machine_type='tpu7x-standard-4t', chips_per_vm=4, accelerator_type=TPU, device_type='tpu7x-8', supports_sub_slicing=False, supports_super_slicing=False, supports_accelerator_network_profile=False, docker_platform=, requires_workload_policy=False, gpu_config=None) +Underlyingly, we assume that means: SystemCharacteristics(topology='2x2x1', vms_per_slice=1, gke_accelerator='tpu7x', gce_machine_type='tpu7x-standard-4t', chips_per_vm=4, accelerator_type=TPU, device_type='tpu7x-8', supports_sub_slicing=False, supports_super_slicing=False, supports_accelerator_network_profile=False, docker_platform=, requires_workload_policy=False, gpu_config=None, parallel_containers=2) [XPK] Task: `Get Node Pool Zone` is implemented by the following command not running since it is a dry run. gcloud beta container node-pools describe 0 --cluster golden-cluster --project=golden-project --location=us-central1 --format="value(locations)" [XPK] Task: `GKE Cluster Get ConfigMap` is implemented by the following command not running since it is a dry run. diff --git a/goldens/Cluster_create_for_multi-host_nodepool.txt b/goldens/Cluster_create_for_multi-host_nodepool.txt index 2e222613a..3d01f997c 100644 --- a/goldens/Cluster_create_for_multi-host_nodepool.txt +++ b/goldens/Cluster_create_for_multi-host_nodepool.txt @@ -37,11 +37,11 @@ kubectl wait deployment/coredns --for=condition=Available=true --namespace=kube- [XPK] Task: `Determine current gke master version` is implemented by the following command not running since it is a dry run. gcloud beta container clusters describe golden-cluster --location us-central1 --project golden-project --format="value(currentMasterVersion)" [XPK] Creating 1 node pool or pools of tpu7x-16 -We assume that the underlying system is: SystemCharacteristics(topology='2x2x2', vms_per_slice=2, gke_accelerator='tpu7x', gce_machine_type='tpu7x-standard-4t', chips_per_vm=4, accelerator_type=TPU, device_type='tpu7x-16', supports_sub_slicing=False, supports_super_slicing=False, supports_accelerator_network_profile=False, docker_platform=, requires_workload_policy=True, gpu_config=None) +We assume that the underlying system is: SystemCharacteristics(topology='2x2x2', vms_per_slice=2, gke_accelerator='tpu7x', gce_machine_type='tpu7x-standard-4t', chips_per_vm=4, accelerator_type=TPU, device_type='tpu7x-16', supports_sub_slicing=False, supports_super_slicing=False, supports_accelerator_network_profile=False, docker_platform=, requires_workload_policy=True, gpu_config=None, parallel_containers=2) [XPK] Task: `Get All Node Pools` is implemented by the following command not running since it is a dry run. gcloud beta container node-pools list --cluster golden-cluster --project=golden-project --location=us-central1 --format="csv[no-heading](name)" [XPK] Creating 1 node pool or pools of tpu7x-16 -Underlyingly, we assume that means: SystemCharacteristics(topology='2x2x2', vms_per_slice=2, gke_accelerator='tpu7x', gce_machine_type='tpu7x-standard-4t', chips_per_vm=4, accelerator_type=TPU, device_type='tpu7x-16', supports_sub_slicing=False, supports_super_slicing=False, supports_accelerator_network_profile=False, docker_platform=, requires_workload_policy=True, gpu_config=None) +Underlyingly, we assume that means: SystemCharacteristics(topology='2x2x2', vms_per_slice=2, gke_accelerator='tpu7x', gce_machine_type='tpu7x-standard-4t', chips_per_vm=4, accelerator_type=TPU, device_type='tpu7x-16', supports_sub_slicing=False, supports_super_slicing=False, supports_accelerator_network_profile=False, docker_platform=, requires_workload_policy=True, gpu_config=None, parallel_containers=2) [XPK] Task: `Get Node Pool Zone` is implemented by the following command not running since it is a dry run. gcloud beta container node-pools describe 0 --cluster golden-cluster --project=golden-project --location=us-central1 --format="value(locations)" [XPK] Task: `GKE Cluster Get ConfigMap` is implemented by the following command not running since it is a dry run. diff --git a/goldens/Cluster_create_for_single-host_nodepool.txt b/goldens/Cluster_create_for_single-host_nodepool.txt index 6f2d11c86..0bbe5170d 100644 --- a/goldens/Cluster_create_for_single-host_nodepool.txt +++ b/goldens/Cluster_create_for_single-host_nodepool.txt @@ -37,11 +37,11 @@ kubectl wait deployment/coredns --for=condition=Available=true --namespace=kube- [XPK] Task: `Determine current gke master version` is implemented by the following command not running since it is a dry run. gcloud beta container clusters describe golden-cluster --location us-central1 --project golden-project --format="value(currentMasterVersion)" [XPK] Creating 1 node pool or pools of v4-8 -We assume that the underlying system is: SystemCharacteristics(topology='2x2x1', vms_per_slice=1, gke_accelerator='tpu-v4-podslice', gce_machine_type='ct4p-hightpu-4t', chips_per_vm=4, accelerator_type=TPU, device_type='v4-8', supports_sub_slicing=False, supports_super_slicing=False, supports_accelerator_network_profile=False, docker_platform=, requires_workload_policy=False, gpu_config=None) +We assume that the underlying system is: SystemCharacteristics(topology='2x2x1', vms_per_slice=1, gke_accelerator='tpu-v4-podslice', gce_machine_type='ct4p-hightpu-4t', chips_per_vm=4, accelerator_type=TPU, device_type='v4-8', supports_sub_slicing=False, supports_super_slicing=False, supports_accelerator_network_profile=False, docker_platform=, requires_workload_policy=False, gpu_config=None, parallel_containers=1) [XPK] Task: `Get All Node Pools` is implemented by the following command not running since it is a dry run. gcloud beta container node-pools list --cluster golden-cluster --project=golden-project --location=us-central1 --format="csv[no-heading](name)" [XPK] Creating 1 node pool or pools of v4-8 -Underlyingly, we assume that means: SystemCharacteristics(topology='2x2x1', vms_per_slice=1, gke_accelerator='tpu-v4-podslice', gce_machine_type='ct4p-hightpu-4t', chips_per_vm=4, accelerator_type=TPU, device_type='v4-8', supports_sub_slicing=False, supports_super_slicing=False, supports_accelerator_network_profile=False, docker_platform=, requires_workload_policy=False, gpu_config=None) +Underlyingly, we assume that means: SystemCharacteristics(topology='2x2x1', vms_per_slice=1, gke_accelerator='tpu-v4-podslice', gce_machine_type='ct4p-hightpu-4t', chips_per_vm=4, accelerator_type=TPU, device_type='v4-8', supports_sub_slicing=False, supports_super_slicing=False, supports_accelerator_network_profile=False, docker_platform=, requires_workload_policy=False, gpu_config=None, parallel_containers=1) [XPK] Task: `Get Node Pool Zone` is implemented by the following command not running since it is a dry run. gcloud beta container node-pools describe 0 --cluster golden-cluster --project=golden-project --location=us-central1 --format="value(locations)" [XPK] Task: `GKE Cluster Get ConfigMap` is implemented by the following command not running since it is a dry run. diff --git a/goldens/Cluster_create_private.txt b/goldens/Cluster_create_private.txt index fc39d50cb..2d44af8bf 100644 --- a/goldens/Cluster_create_private.txt +++ b/goldens/Cluster_create_private.txt @@ -43,13 +43,13 @@ kubectl wait deployment/coredns --for=condition=Available=true --namespace=kube- [XPK] Task: `Determine current gke master version` is implemented by the following command not running since it is a dry run. gcloud beta container clusters describe golden-cluster-private --location us-central1 --project golden-project --format="value(currentMasterVersion)" [XPK] Creating 1 node pool or pools of v5p-8 -We assume that the underlying system is: SystemCharacteristics(topology='2x2x1', vms_per_slice=1, gke_accelerator='tpu-v5p-slice', gce_machine_type='ct5p-hightpu-4t', chips_per_vm=4, accelerator_type=TPU, device_type='v5p-8', supports_sub_slicing=False, supports_super_slicing=False, supports_accelerator_network_profile=False, docker_platform=, requires_workload_policy=False, gpu_config=None) +We assume that the underlying system is: SystemCharacteristics(topology='2x2x1', vms_per_slice=1, gke_accelerator='tpu-v5p-slice', gce_machine_type='ct5p-hightpu-4t', chips_per_vm=4, accelerator_type=TPU, device_type='v5p-8', supports_sub_slicing=False, supports_super_slicing=False, supports_accelerator_network_profile=False, docker_platform=, requires_workload_policy=False, gpu_config=None, parallel_containers=1) [XPK] Task: `Get All Node Pools` is implemented by the following command not running since it is a dry run. gcloud beta container node-pools list --cluster golden-cluster-private --project=golden-project --location=us-central1 --format="csv[no-heading](name)" [XPK] Task: `Describe reservation` is implemented by the following command not running since it is a dry run. gcloud beta compute reservations describe golden-reservation --project=golden-project --zone=us-central1-a [XPK] Creating 1 node pool or pools of v5p-8 -Underlyingly, we assume that means: SystemCharacteristics(topology='2x2x1', vms_per_slice=1, gke_accelerator='tpu-v5p-slice', gce_machine_type='ct5p-hightpu-4t', chips_per_vm=4, accelerator_type=TPU, device_type='v5p-8', supports_sub_slicing=False, supports_super_slicing=False, supports_accelerator_network_profile=False, docker_platform=, requires_workload_policy=False, gpu_config=None) +Underlyingly, we assume that means: SystemCharacteristics(topology='2x2x1', vms_per_slice=1, gke_accelerator='tpu-v5p-slice', gce_machine_type='ct5p-hightpu-4t', chips_per_vm=4, accelerator_type=TPU, device_type='v5p-8', supports_sub_slicing=False, supports_super_slicing=False, supports_accelerator_network_profile=False, docker_platform=, requires_workload_policy=False, gpu_config=None, parallel_containers=1) [XPK] Task: `Get Node Pool Zone` is implemented by the following command not running since it is a dry run. gcloud beta container node-pools describe 0 --cluster golden-cluster-private --project=golden-project --location=us-central1 --format="value(locations)" [XPK] Task: `GKE Cluster Get ConfigMap` is implemented by the following command not running since it is a dry run. diff --git a/goldens/Cluster_create_sub-slicing.txt b/goldens/Cluster_create_sub-slicing.txt index 37a563510..ed42ef52f 100644 --- a/goldens/Cluster_create_sub-slicing.txt +++ b/goldens/Cluster_create_sub-slicing.txt @@ -41,13 +41,13 @@ kubectl wait deployment/coredns --for=condition=Available=true --namespace=kube- [XPK] Task: `Determine current gke master version` is implemented by the following command not running since it is a dry run. gcloud beta container clusters describe golden-cluster --location us-central1 --project golden-project --format="value(currentMasterVersion)" [XPK] Creating 1 node pool or pools of v6e-4x4 -We assume that the underlying system is: SystemCharacteristics(topology='4x4', vms_per_slice=4, gke_accelerator='tpu-v6e-slice', gce_machine_type='ct6e-standard-4t', chips_per_vm=4, accelerator_type=TPU, device_type='v6e-16', supports_sub_slicing=True, supports_super_slicing=False, supports_accelerator_network_profile=True, docker_platform=, requires_workload_policy=False, gpu_config=None) +We assume that the underlying system is: SystemCharacteristics(topology='4x4', vms_per_slice=4, gke_accelerator='tpu-v6e-slice', gce_machine_type='ct6e-standard-4t', chips_per_vm=4, accelerator_type=TPU, device_type='v6e-16', supports_sub_slicing=True, supports_super_slicing=False, supports_accelerator_network_profile=True, docker_platform=, requires_workload_policy=False, gpu_config=None, parallel_containers=1) [XPK] Task: `Get All Node Pools` is implemented by the following command not running since it is a dry run. gcloud beta container node-pools list --cluster golden-cluster --project=golden-project --location=us-central1 --format="csv[no-heading](name)" [XPK] Task: `Describe reservation` is implemented by the following command not running since it is a dry run. gcloud beta compute reservations describe golden-reservation --project=golden-project --zone=us-central1-a [XPK] Creating 1 node pool or pools of v6e-16 -Underlyingly, we assume that means: SystemCharacteristics(topology='4x4', vms_per_slice=4, gke_accelerator='tpu-v6e-slice', gce_machine_type='ct6e-standard-4t', chips_per_vm=4, accelerator_type=TPU, device_type='v6e-16', supports_sub_slicing=True, supports_super_slicing=False, supports_accelerator_network_profile=True, docker_platform=, requires_workload_policy=False, gpu_config=None) +Underlyingly, we assume that means: SystemCharacteristics(topology='4x4', vms_per_slice=4, gke_accelerator='tpu-v6e-slice', gce_machine_type='ct6e-standard-4t', chips_per_vm=4, accelerator_type=TPU, device_type='v6e-16', supports_sub_slicing=True, supports_super_slicing=False, supports_accelerator_network_profile=True, docker_platform=, requires_workload_policy=False, gpu_config=None, parallel_containers=1) [XPK] Task: `Get Node Pool Zone` is implemented by the following command not running since it is a dry run. gcloud beta container node-pools describe 0 --cluster golden-cluster --project=golden-project --location=us-central1 --format="value(locations)" [XPK] Task: `GKE Cluster Get ConfigMap` is implemented by the following command not running since it is a dry run. diff --git a/goldens/Cluster_create_super-slicing.txt b/goldens/Cluster_create_super-slicing.txt index 77b3a148d..875995872 100644 --- a/goldens/Cluster_create_super-slicing.txt +++ b/goldens/Cluster_create_super-slicing.txt @@ -41,13 +41,13 @@ kubectl wait deployment/coredns --for=condition=Available=true --namespace=kube- [XPK] Task: `Determine current gke master version` is implemented by the following command not running since it is a dry run. gcloud beta container clusters describe golden-cluster --location us-central1 --project golden-project --format="value(currentMasterVersion)" [XPK] Creating 5 node pool or pools of tpu7x-4x4x4 -We assume that the underlying system is: SystemCharacteristics(topology='4x4x4', vms_per_slice=16, gke_accelerator='tpu7x', gce_machine_type='tpu7x-standard-4t', chips_per_vm=4, accelerator_type=TPU, device_type='tpu7x-128', supports_sub_slicing=False, supports_super_slicing=True, supports_accelerator_network_profile=False, docker_platform=, requires_workload_policy=True, gpu_config=None) +We assume that the underlying system is: SystemCharacteristics(topology='4x4x4', vms_per_slice=16, gke_accelerator='tpu7x', gce_machine_type='tpu7x-standard-4t', chips_per_vm=4, accelerator_type=TPU, device_type='tpu7x-128', supports_sub_slicing=False, supports_super_slicing=True, supports_accelerator_network_profile=False, docker_platform=, requires_workload_policy=True, gpu_config=None, parallel_containers=2) [XPK] Task: `Get All Node Pools` is implemented by the following command not running since it is a dry run. gcloud beta container node-pools list --cluster golden-cluster --project=golden-project --location=us-central1 --format="csv[no-heading](name)" [XPK] Task: `Describe reservation` is implemented by the following command not running since it is a dry run. gcloud beta compute reservations describe golden-reservation --project=golden-project --zone=us-central1-a [XPK] Creating 5 node pool or pools of tpu7x-128 -Underlyingly, we assume that means: SystemCharacteristics(topology='4x4x4', vms_per_slice=16, gke_accelerator='tpu7x', gce_machine_type='tpu7x-standard-4t', chips_per_vm=4, accelerator_type=TPU, device_type='tpu7x-128', supports_sub_slicing=False, supports_super_slicing=True, supports_accelerator_network_profile=False, docker_platform=, requires_workload_policy=True, gpu_config=None) +Underlyingly, we assume that means: SystemCharacteristics(topology='4x4x4', vms_per_slice=16, gke_accelerator='tpu7x', gce_machine_type='tpu7x-standard-4t', chips_per_vm=4, accelerator_type=TPU, device_type='tpu7x-128', supports_sub_slicing=False, supports_super_slicing=True, supports_accelerator_network_profile=False, docker_platform=, requires_workload_policy=True, gpu_config=None, parallel_containers=2) [XPK] Task: `Get Node Pool Zone` is implemented by the following command not running since it is a dry run. gcloud beta container node-pools describe 0 --cluster golden-cluster --project=golden-project --location=us-central1 --format="value(locations)" [XPK] Task: `GKE Cluster Get ConfigMap` is implemented by the following command not running since it is a dry run. diff --git a/goldens/Cluster_create_with_CPU_and_memory_limits_above_capacity.txt b/goldens/Cluster_create_with_CPU_and_memory_limits_above_capacity.txt index 9e99b3396..15ff54d3d 100644 --- a/goldens/Cluster_create_with_CPU_and_memory_limits_above_capacity.txt +++ b/goldens/Cluster_create_with_CPU_and_memory_limits_above_capacity.txt @@ -37,11 +37,11 @@ kubectl wait deployment/coredns --for=condition=Available=true --namespace=kube- [XPK] Task: `Determine current gke master version` is implemented by the following command not running since it is a dry run. gcloud beta container clusters describe golden-cluster --location us-central1 --project golden-project --format="value(currentMasterVersion)" [XPK] Creating 1 node pool or pools of tpu7x-8 -We assume that the underlying system is: SystemCharacteristics(topology='2x2x1', vms_per_slice=1, gke_accelerator='tpu7x', gce_machine_type='tpu7x-standard-4t', chips_per_vm=4, accelerator_type=TPU, device_type='tpu7x-8', supports_sub_slicing=False, supports_super_slicing=False, supports_accelerator_network_profile=False, docker_platform=, requires_workload_policy=False, gpu_config=None) +We assume that the underlying system is: SystemCharacteristics(topology='2x2x1', vms_per_slice=1, gke_accelerator='tpu7x', gce_machine_type='tpu7x-standard-4t', chips_per_vm=4, accelerator_type=TPU, device_type='tpu7x-8', supports_sub_slicing=False, supports_super_slicing=False, supports_accelerator_network_profile=False, docker_platform=, requires_workload_policy=False, gpu_config=None, parallel_containers=2) [XPK] Task: `Get All Node Pools` is implemented by the following command not running since it is a dry run. gcloud beta container node-pools list --cluster golden-cluster --project=golden-project --location=us-central1 --format="csv[no-heading](name)" [XPK] Creating 1 node pool or pools of tpu7x-8 -Underlyingly, we assume that means: SystemCharacteristics(topology='2x2x1', vms_per_slice=1, gke_accelerator='tpu7x', gce_machine_type='tpu7x-standard-4t', chips_per_vm=4, accelerator_type=TPU, device_type='tpu7x-8', supports_sub_slicing=False, supports_super_slicing=False, supports_accelerator_network_profile=False, docker_platform=, requires_workload_policy=False, gpu_config=None) +Underlyingly, we assume that means: SystemCharacteristics(topology='2x2x1', vms_per_slice=1, gke_accelerator='tpu7x', gce_machine_type='tpu7x-standard-4t', chips_per_vm=4, accelerator_type=TPU, device_type='tpu7x-8', supports_sub_slicing=False, supports_super_slicing=False, supports_accelerator_network_profile=False, docker_platform=, requires_workload_policy=False, gpu_config=None, parallel_containers=2) [XPK] Task: `Get Node Pool Zone` is implemented by the following command not running since it is a dry run. gcloud beta container node-pools describe 0 --cluster golden-cluster --project=golden-project --location=us-central1 --format="value(locations)" [XPK] Task: `GKE Cluster Get ConfigMap` is implemented by the following command not running since it is a dry run. diff --git a/goldens/Cluster_create_with_CPU_and_memory_limits_below_capacity.txt b/goldens/Cluster_create_with_CPU_and_memory_limits_below_capacity.txt index fe2718b27..7ddc2ecd3 100644 --- a/goldens/Cluster_create_with_CPU_and_memory_limits_below_capacity.txt +++ b/goldens/Cluster_create_with_CPU_and_memory_limits_below_capacity.txt @@ -37,11 +37,11 @@ kubectl wait deployment/coredns --for=condition=Available=true --namespace=kube- [XPK] Task: `Determine current gke master version` is implemented by the following command not running since it is a dry run. gcloud beta container clusters describe golden-cluster --location us-central1 --project golden-project --format="value(currentMasterVersion)" [XPK] Creating 1 node pool or pools of tpu7x-8 -We assume that the underlying system is: SystemCharacteristics(topology='2x2x1', vms_per_slice=1, gke_accelerator='tpu7x', gce_machine_type='tpu7x-standard-4t', chips_per_vm=4, accelerator_type=TPU, device_type='tpu7x-8', supports_sub_slicing=False, supports_super_slicing=False, supports_accelerator_network_profile=False, docker_platform=, requires_workload_policy=False, gpu_config=None) +We assume that the underlying system is: SystemCharacteristics(topology='2x2x1', vms_per_slice=1, gke_accelerator='tpu7x', gce_machine_type='tpu7x-standard-4t', chips_per_vm=4, accelerator_type=TPU, device_type='tpu7x-8', supports_sub_slicing=False, supports_super_slicing=False, supports_accelerator_network_profile=False, docker_platform=, requires_workload_policy=False, gpu_config=None, parallel_containers=2) [XPK] Task: `Get All Node Pools` is implemented by the following command not running since it is a dry run. gcloud beta container node-pools list --cluster golden-cluster --project=golden-project --location=us-central1 --format="csv[no-heading](name)" [XPK] Creating 1 node pool or pools of tpu7x-8 -Underlyingly, we assume that means: SystemCharacteristics(topology='2x2x1', vms_per_slice=1, gke_accelerator='tpu7x', gce_machine_type='tpu7x-standard-4t', chips_per_vm=4, accelerator_type=TPU, device_type='tpu7x-8', supports_sub_slicing=False, supports_super_slicing=False, supports_accelerator_network_profile=False, docker_platform=, requires_workload_policy=False, gpu_config=None) +Underlyingly, we assume that means: SystemCharacteristics(topology='2x2x1', vms_per_slice=1, gke_accelerator='tpu7x', gce_machine_type='tpu7x-standard-4t', chips_per_vm=4, accelerator_type=TPU, device_type='tpu7x-8', supports_sub_slicing=False, supports_super_slicing=False, supports_accelerator_network_profile=False, docker_platform=, requires_workload_policy=False, gpu_config=None, parallel_containers=2) [XPK] Task: `Get Node Pool Zone` is implemented by the following command not running since it is a dry run. gcloud beta container node-pools describe 0 --cluster golden-cluster --project=golden-project --location=us-central1 --format="value(locations)" [XPK] Task: `GKE Cluster Get ConfigMap` is implemented by the following command not running since it is a dry run. diff --git a/goldens/Cluster_create_with_Managed_Lustre_driver.txt b/goldens/Cluster_create_with_Managed_Lustre_driver.txt index 10377281e..d2123e354 100644 --- a/goldens/Cluster_create_with_Managed_Lustre_driver.txt +++ b/goldens/Cluster_create_with_Managed_Lustre_driver.txt @@ -42,11 +42,11 @@ gcloud container clusters update golden-cluster --project=golden-project --locat [XPK] Task: `Determine current gke master version` is implemented by the following command not running since it is a dry run. gcloud beta container clusters describe golden-cluster --location us-central1 --project golden-project --format="value(currentMasterVersion)" [XPK] Creating 1 node pool or pools of tpu7x-8 -We assume that the underlying system is: SystemCharacteristics(topology='2x2x1', vms_per_slice=1, gke_accelerator='tpu7x', gce_machine_type='tpu7x-standard-4t', chips_per_vm=4, accelerator_type=TPU, device_type='tpu7x-8', supports_sub_slicing=False, supports_super_slicing=False, supports_accelerator_network_profile=False, docker_platform=, requires_workload_policy=False, gpu_config=None) +We assume that the underlying system is: SystemCharacteristics(topology='2x2x1', vms_per_slice=1, gke_accelerator='tpu7x', gce_machine_type='tpu7x-standard-4t', chips_per_vm=4, accelerator_type=TPU, device_type='tpu7x-8', supports_sub_slicing=False, supports_super_slicing=False, supports_accelerator_network_profile=False, docker_platform=, requires_workload_policy=False, gpu_config=None, parallel_containers=2) [XPK] Task: `Get All Node Pools` is implemented by the following command not running since it is a dry run. gcloud beta container node-pools list --cluster golden-cluster --project=golden-project --location=us-central1 --format="csv[no-heading](name)" [XPK] Creating 1 node pool or pools of tpu7x-8 -Underlyingly, we assume that means: SystemCharacteristics(topology='2x2x1', vms_per_slice=1, gke_accelerator='tpu7x', gce_machine_type='tpu7x-standard-4t', chips_per_vm=4, accelerator_type=TPU, device_type='tpu7x-8', supports_sub_slicing=False, supports_super_slicing=False, supports_accelerator_network_profile=False, docker_platform=, requires_workload_policy=False, gpu_config=None) +Underlyingly, we assume that means: SystemCharacteristics(topology='2x2x1', vms_per_slice=1, gke_accelerator='tpu7x', gce_machine_type='tpu7x-standard-4t', chips_per_vm=4, accelerator_type=TPU, device_type='tpu7x-8', supports_sub_slicing=False, supports_super_slicing=False, supports_accelerator_network_profile=False, docker_platform=, requires_workload_policy=False, gpu_config=None, parallel_containers=2) [XPK] Task: `Get Node Pool Zone` is implemented by the following command not running since it is a dry run. gcloud beta container node-pools describe 0 --cluster golden-cluster --project=golden-project --location=us-central1 --format="value(locations)" [XPK] Task: `GKE Cluster Get ConfigMap` is implemented by the following command not running since it is a dry run. diff --git a/goldens/Cluster_create_with_Managed_Lustre_driver_and_legacy_port.txt b/goldens/Cluster_create_with_Managed_Lustre_driver_and_legacy_port.txt index bde2f2bfc..7fa872e64 100644 --- a/goldens/Cluster_create_with_Managed_Lustre_driver_and_legacy_port.txt +++ b/goldens/Cluster_create_with_Managed_Lustre_driver_and_legacy_port.txt @@ -42,11 +42,11 @@ gcloud container clusters update golden-cluster --project=golden-project --locat [XPK] Task: `Determine current gke master version` is implemented by the following command not running since it is a dry run. gcloud beta container clusters describe golden-cluster --location us-central1 --project golden-project --format="value(currentMasterVersion)" [XPK] Creating 1 node pool or pools of tpu7x-8 -We assume that the underlying system is: SystemCharacteristics(topology='2x2x1', vms_per_slice=1, gke_accelerator='tpu7x', gce_machine_type='tpu7x-standard-4t', chips_per_vm=4, accelerator_type=TPU, device_type='tpu7x-8', supports_sub_slicing=False, supports_super_slicing=False, supports_accelerator_network_profile=False, docker_platform=, requires_workload_policy=False, gpu_config=None) +We assume that the underlying system is: SystemCharacteristics(topology='2x2x1', vms_per_slice=1, gke_accelerator='tpu7x', gce_machine_type='tpu7x-standard-4t', chips_per_vm=4, accelerator_type=TPU, device_type='tpu7x-8', supports_sub_slicing=False, supports_super_slicing=False, supports_accelerator_network_profile=False, docker_platform=, requires_workload_policy=False, gpu_config=None, parallel_containers=2) [XPK] Task: `Get All Node Pools` is implemented by the following command not running since it is a dry run. gcloud beta container node-pools list --cluster golden-cluster --project=golden-project --location=us-central1 --format="csv[no-heading](name)" [XPK] Creating 1 node pool or pools of tpu7x-8 -Underlyingly, we assume that means: SystemCharacteristics(topology='2x2x1', vms_per_slice=1, gke_accelerator='tpu7x', gce_machine_type='tpu7x-standard-4t', chips_per_vm=4, accelerator_type=TPU, device_type='tpu7x-8', supports_sub_slicing=False, supports_super_slicing=False, supports_accelerator_network_profile=False, docker_platform=, requires_workload_policy=False, gpu_config=None) +Underlyingly, we assume that means: SystemCharacteristics(topology='2x2x1', vms_per_slice=1, gke_accelerator='tpu7x', gce_machine_type='tpu7x-standard-4t', chips_per_vm=4, accelerator_type=TPU, device_type='tpu7x-8', supports_sub_slicing=False, supports_super_slicing=False, supports_accelerator_network_profile=False, docker_platform=, requires_workload_policy=False, gpu_config=None, parallel_containers=2) [XPK] Task: `Get Node Pool Zone` is implemented by the following command not running since it is a dry run. gcloud beta container node-pools describe 0 --cluster golden-cluster --project=golden-project --location=us-central1 --format="value(locations)" [XPK] Task: `GKE Cluster Get ConfigMap` is implemented by the following command not running since it is a dry run. diff --git a/goldens/Cluster_create_with_gb200-4.txt b/goldens/Cluster_create_with_gb200-4.txt index 7e20395dd..0ec4289b7 100644 --- a/goldens/Cluster_create_with_gb200-4.txt +++ b/goldens/Cluster_create_with_gb200-4.txt @@ -39,13 +39,13 @@ kubectl wait deployment/coredns --for=condition=Available=true --namespace=kube- [XPK] Task: `Determine current gke master version` is implemented by the following command not running since it is a dry run. gcloud beta container clusters describe golden-cluster --location us-central1 --project golden-project --format="value(currentMasterVersion)" [XPK] Creating 1 node pool or pools of gb200-4 -We assume that the underlying system is: SystemCharacteristics(topology='1x72', vms_per_slice=1, gke_accelerator='nvidia-gb200', gce_machine_type='a4x-highgpu-4g', chips_per_vm=4, accelerator_type=GPU, device_type='gb200-4', supports_sub_slicing=False, supports_super_slicing=False, supports_accelerator_network_profile=True, docker_platform=, requires_workload_policy=True, gpu_config=GpuConfig(requires_topology=True, gpu_direct_name='rdma', nccl_installer='https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/master/gpudirect-rdma/nccl-rdma-installer-a4x.yaml', jobset_decorator_fn=)) +We assume that the underlying system is: SystemCharacteristics(topology='1x72', vms_per_slice=1, gke_accelerator='nvidia-gb200', gce_machine_type='a4x-highgpu-4g', chips_per_vm=4, accelerator_type=GPU, device_type='gb200-4', supports_sub_slicing=False, supports_super_slicing=False, supports_accelerator_network_profile=True, docker_platform=, requires_workload_policy=True, gpu_config=GpuConfig(requires_topology=True, gpu_direct_name='rdma', nccl_installer='https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/master/gpudirect-rdma/nccl-rdma-installer-a4x.yaml', jobset_decorator_fn=), parallel_containers=1) [XPK] Task: `Get All Node Pools` is implemented by the following command not running since it is a dry run. gcloud beta container node-pools list --cluster golden-cluster --project=golden-project --location=us-central1 --format="csv[no-heading](name)" [XPK] Task: `Describe reservation` is implemented by the following command not running since it is a dry run. gcloud beta compute reservations describe golden-reservation --project=golden-project --zone=us-central1-a [XPK] Creating 1 node pool with 2 nodes of gb200-4 -Underlyingly, we assume that means: SystemCharacteristics(topology='1x72', vms_per_slice=1, gke_accelerator='nvidia-gb200', gce_machine_type='a4x-highgpu-4g', chips_per_vm=4, accelerator_type=GPU, device_type='gb200-4', supports_sub_slicing=False, supports_super_slicing=False, supports_accelerator_network_profile=True, docker_platform=, requires_workload_policy=True, gpu_config=GpuConfig(requires_topology=True, gpu_direct_name='rdma', nccl_installer='https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/master/gpudirect-rdma/nccl-rdma-installer-a4x.yaml', jobset_decorator_fn=)) +Underlyingly, we assume that means: SystemCharacteristics(topology='1x72', vms_per_slice=1, gke_accelerator='nvidia-gb200', gce_machine_type='a4x-highgpu-4g', chips_per_vm=4, accelerator_type=GPU, device_type='gb200-4', supports_sub_slicing=False, supports_super_slicing=False, supports_accelerator_network_profile=True, docker_platform=, requires_workload_policy=True, gpu_config=GpuConfig(requires_topology=True, gpu_direct_name='rdma', nccl_installer='https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/master/gpudirect-rdma/nccl-rdma-installer-a4x.yaml', jobset_decorator_fn=), parallel_containers=1) [XPK] Task: `Get Node Pool Zone` is implemented by the following command not running since it is a dry run. gcloud beta container node-pools describe 0 --cluster golden-cluster --project=golden-project --location=us-central1 --format="value(locations)" [XPK] Task: `GKE Cluster Get ConfigMap` is implemented by the following command not running since it is a dry run. diff --git a/goldens/Cluster_create_with_shared_reservation.txt b/goldens/Cluster_create_with_shared_reservation.txt index 36b177209..0a03c30c6 100644 --- a/goldens/Cluster_create_with_shared_reservation.txt +++ b/goldens/Cluster_create_with_shared_reservation.txt @@ -39,13 +39,13 @@ kubectl wait deployment/coredns --for=condition=Available=true --namespace=kube- [XPK] Task: `Determine current gke master version` is implemented by the following command not running since it is a dry run. gcloud beta container clusters describe golden-cluster --location us-central1 --project golden-project --format="value(currentMasterVersion)" [XPK] Creating 1 node pool or pools of tpu7x-8 -We assume that the underlying system is: SystemCharacteristics(topology='2x2x1', vms_per_slice=1, gke_accelerator='tpu7x', gce_machine_type='tpu7x-standard-4t', chips_per_vm=4, accelerator_type=TPU, device_type='tpu7x-8', supports_sub_slicing=False, supports_super_slicing=False, supports_accelerator_network_profile=False, docker_platform=, requires_workload_policy=False, gpu_config=None) +We assume that the underlying system is: SystemCharacteristics(topology='2x2x1', vms_per_slice=1, gke_accelerator='tpu7x', gce_machine_type='tpu7x-standard-4t', chips_per_vm=4, accelerator_type=TPU, device_type='tpu7x-8', supports_sub_slicing=False, supports_super_slicing=False, supports_accelerator_network_profile=False, docker_platform=, requires_workload_policy=False, gpu_config=None, parallel_containers=2) [XPK] Task: `Get All Node Pools` is implemented by the following command not running since it is a dry run. gcloud beta container node-pools list --cluster golden-cluster --project=golden-project --location=us-central1 --format="csv[no-heading](name)" [XPK] Task: `Describe reservation` is implemented by the following command not running since it is a dry run. gcloud beta compute reservations describe golden-reservation --project=reservation-project --zone=us-central1-a [XPK] Creating 1 node pool or pools of tpu7x-8 -Underlyingly, we assume that means: SystemCharacteristics(topology='2x2x1', vms_per_slice=1, gke_accelerator='tpu7x', gce_machine_type='tpu7x-standard-4t', chips_per_vm=4, accelerator_type=TPU, device_type='tpu7x-8', supports_sub_slicing=False, supports_super_slicing=False, supports_accelerator_network_profile=False, docker_platform=, requires_workload_policy=False, gpu_config=None) +Underlyingly, we assume that means: SystemCharacteristics(topology='2x2x1', vms_per_slice=1, gke_accelerator='tpu7x', gce_machine_type='tpu7x-standard-4t', chips_per_vm=4, accelerator_type=TPU, device_type='tpu7x-8', supports_sub_slicing=False, supports_super_slicing=False, supports_accelerator_network_profile=False, docker_platform=, requires_workload_policy=False, gpu_config=None, parallel_containers=2) [XPK] Task: `Get Node Pool Zone` is implemented by the following command not running since it is a dry run. gcloud beta container node-pools describe 0 --cluster golden-cluster --project=golden-project --location=us-central1 --format="value(locations)" [XPK] Task: `GKE Cluster Get ConfigMap` is implemented by the following command not running since it is a dry run. diff --git a/goldens/NAP_cluster-create.txt b/goldens/NAP_cluster-create.txt index 255331414..cc206893b 100644 --- a/goldens/NAP_cluster-create.txt +++ b/goldens/NAP_cluster-create.txt @@ -37,11 +37,11 @@ kubectl wait deployment/coredns --for=condition=Available=true --namespace=kube- [XPK] Task: `Determine current gke master version` is implemented by the following command not running since it is a dry run. gcloud beta container clusters describe golden-cluster --location us-central1 --project golden-project --format="value(currentMasterVersion)" [XPK] Creating 1 node pool or pools of tpu7x-8 -We assume that the underlying system is: SystemCharacteristics(topology='2x2x1', vms_per_slice=1, gke_accelerator='tpu7x', gce_machine_type='tpu7x-standard-4t', chips_per_vm=4, accelerator_type=TPU, device_type='tpu7x-8', supports_sub_slicing=False, supports_super_slicing=False, supports_accelerator_network_profile=False, docker_platform=, requires_workload_policy=False, gpu_config=None) +We assume that the underlying system is: SystemCharacteristics(topology='2x2x1', vms_per_slice=1, gke_accelerator='tpu7x', gce_machine_type='tpu7x-standard-4t', chips_per_vm=4, accelerator_type=TPU, device_type='tpu7x-8', supports_sub_slicing=False, supports_super_slicing=False, supports_accelerator_network_profile=False, docker_platform=, requires_workload_policy=False, gpu_config=None, parallel_containers=2) [XPK] Task: `Get All Node Pools` is implemented by the following command not running since it is a dry run. gcloud beta container node-pools list --cluster golden-cluster --project=golden-project --location=us-central1 --format="csv[no-heading](name)" [XPK] Creating 1 node pool or pools of tpu7x-8 -Underlyingly, we assume that means: SystemCharacteristics(topology='2x2x1', vms_per_slice=1, gke_accelerator='tpu7x', gce_machine_type='tpu7x-standard-4t', chips_per_vm=4, accelerator_type=TPU, device_type='tpu7x-8', supports_sub_slicing=False, supports_super_slicing=False, supports_accelerator_network_profile=False, docker_platform=, requires_workload_policy=False, gpu_config=None) +Underlyingly, we assume that means: SystemCharacteristics(topology='2x2x1', vms_per_slice=1, gke_accelerator='tpu7x', gce_machine_type='tpu7x-standard-4t', chips_per_vm=4, accelerator_type=TPU, device_type='tpu7x-8', supports_sub_slicing=False, supports_super_slicing=False, supports_accelerator_network_profile=False, docker_platform=, requires_workload_policy=False, gpu_config=None, parallel_containers=2) [XPK] Task: `Get Node Pool Zone` is implemented by the following command not running since it is a dry run. gcloud beta container node-pools describe 0 --cluster golden-cluster --project=golden-project --location=us-central1 --format="value(locations)" [XPK] Task: `GKE Cluster Get ConfigMap` is implemented by the following command not running since it is a dry run. diff --git a/goldens/NAP_cluster-create_with_pathways.txt b/goldens/NAP_cluster-create_with_pathways.txt index 798149ca8..55cef080e 100644 --- a/goldens/NAP_cluster-create_with_pathways.txt +++ b/goldens/NAP_cluster-create_with_pathways.txt @@ -39,11 +39,11 @@ kubectl wait deployment/coredns --for=condition=Available=true --namespace=kube- [XPK] Task: `Determine current gke master version` is implemented by the following command not running since it is a dry run. gcloud beta container clusters describe golden-cluster --location us-central1 --project golden-project --format="value(currentMasterVersion)" [XPK] Creating 1 node pool or pools of tpu7x-8 -We assume that the underlying system is: SystemCharacteristics(topology='2x2x1', vms_per_slice=1, gke_accelerator='tpu7x', gce_machine_type='tpu7x-standard-4t', chips_per_vm=4, accelerator_type=TPU, device_type='tpu7x-8', supports_sub_slicing=False, supports_super_slicing=False, supports_accelerator_network_profile=False, docker_platform=, requires_workload_policy=False, gpu_config=None) +We assume that the underlying system is: SystemCharacteristics(topology='2x2x1', vms_per_slice=1, gke_accelerator='tpu7x', gce_machine_type='tpu7x-standard-4t', chips_per_vm=4, accelerator_type=TPU, device_type='tpu7x-8', supports_sub_slicing=False, supports_super_slicing=False, supports_accelerator_network_profile=False, docker_platform=, requires_workload_policy=False, gpu_config=None, parallel_containers=2) [XPK] Task: `Get All Node Pools` is implemented by the following command not running since it is a dry run. gcloud beta container node-pools list --cluster golden-cluster --project=golden-project --location=us-central1 --format="csv[no-heading](name)" [XPK] Creating 1 node pool or pools of tpu7x-8 -Underlyingly, we assume that means: SystemCharacteristics(topology='2x2x1', vms_per_slice=1, gke_accelerator='tpu7x', gce_machine_type='tpu7x-standard-4t', chips_per_vm=4, accelerator_type=TPU, device_type='tpu7x-8', supports_sub_slicing=False, supports_super_slicing=False, supports_accelerator_network_profile=False, docker_platform=, requires_workload_policy=False, gpu_config=None) +Underlyingly, we assume that means: SystemCharacteristics(topology='2x2x1', vms_per_slice=1, gke_accelerator='tpu7x', gce_machine_type='tpu7x-standard-4t', chips_per_vm=4, accelerator_type=TPU, device_type='tpu7x-8', supports_sub_slicing=False, supports_super_slicing=False, supports_accelerator_network_profile=False, docker_platform=, requires_workload_policy=False, gpu_config=None, parallel_containers=2) [XPK] Task: `Get Node Pool Zone` is implemented by the following command not running since it is a dry run. gcloud beta container node-pools describe 0 --cluster golden-cluster --project=golden-project --location=us-central1 --format="value(locations)" [XPK] Task: `GKE Cluster Get ConfigMap` is implemented by the following command not running since it is a dry run. diff --git a/goldens/Workload_create.txt b/goldens/Workload_create.txt index 3917fa271..c967b8949 100644 --- a/goldens/Workload_create.txt +++ b/goldens/Workload_create.txt @@ -35,7 +35,7 @@ docker buildx build --platform=linux/amd64 -f 4b6736a12db8ea0f78ce793fd0d4ee0c94 docker tag dry-run-runner gcr.io/golden-project/dry-run-runner:prefix-current [XPK] Task: `Upload Docker Image` is implemented by the following command not running since it is a dry run. docker push gcr.io/golden-project/dry-run-runner:prefix-current -[XPK] Temp file (9c33f52b12eab63cfb9ce1aba6ff74f4642cb9e40e2a0ad9517d189ee41f09a5) content: +[XPK] Temp file (06487845b6acc9dc2eccb8d04d9b40823797dcc10e23b999766f18f7ab1643ab) content: apiVersion: jobset.x-k8s.io/v1alpha2 kind: JobSet metadata: @@ -64,6 +64,7 @@ spec: podFailurePolicy: rules: + - action: FailJob onPodConditions: [] onExitCodes: @@ -94,14 +95,11 @@ spec: dnsPolicy: ClusterFirstWithHostNet terminationGracePeriodSeconds: 30 containers: + - name: jax-tpu image: gcr.io/golden-project/dry-run-runner:prefix-current env: - ports: - - containerPort: 8471 - - containerPort: 8080 - securityContext: privileged: true command: @@ -146,7 +144,7 @@ spec: [XPK] Task: `Creating Workload` is implemented by the following command not running since it is a dry run. -kubectl apply -f 9c33f52b12eab63cfb9ce1aba6ff74f4642cb9e40e2a0ad9517d189ee41f09a5 +kubectl apply -f 06487845b6acc9dc2eccb8d04d9b40823797dcc10e23b999766f18f7ab1643ab [XPK] Task: `GKE Dashboard List` is implemented by the following command not running since it is a dry run. gcloud monitoring dashboards list --project=golden-project --filter="displayName:'GKE - TPU Monitoring Dashboard'" --format="value(name)" --verbosity=error [XPK] Check statistics and outlier mode of GKE metrics here: https://console.cloud.google.com/monitoring/dashboards/builder/0?project=golden-project&f.rlabel.cluster_name.ClusterName=golden-cluster. To view the metric data for your workload, select golden-workload from the JobName filter on the dashboard. diff --git a/goldens/Workload_create_pathways.txt b/goldens/Workload_create_pathways.txt index 3e4870660..ec565da5c 100644 --- a/goldens/Workload_create_pathways.txt +++ b/goldens/Workload_create_pathways.txt @@ -37,7 +37,7 @@ docker buildx build --platform=linux/amd64 -f 4b6736a12db8ea0f78ce793fd0d4ee0c94 docker tag dry-run-runner gcr.io/golden-project/dry-run-runner:prefix-current [XPK] Task: `Upload Docker Image` is implemented by the following command not running since it is a dry run. docker push gcr.io/golden-project/dry-run-runner:prefix-current -[XPK] Temp file (6fb0f350cf4e0dccc71f77392d12db3de6371d5148519657046613794358bfce) content: +[XPK] Temp file (321584e701d68faa848df77a0e87ecbec8ce31e2b2aeb0d1e3ddb7027acc5021) content: apiVersion: pathways-job.pathways.domain/v1 kind: PathwaysJob @@ -74,13 +74,11 @@ docker push gcr.io/golden-project/dry-run-runner:prefix-current metadata: spec: containers: + - name: jax-tpu image: gcr.io/golden-project/dry-run-runner:prefix-current imagePullPolicy: Always env: - ports: - - securityContext: privileged: true command: @@ -126,7 +124,7 @@ docker push gcr.io/golden-project/dry-run-runner:prefix-current [XPK] Task: `Creating Workload` is implemented by the following command not running since it is a dry run. -kubectl apply -f 6fb0f350cf4e0dccc71f77392d12db3de6371d5148519657046613794358bfce +kubectl apply -f 321584e701d68faa848df77a0e87ecbec8ce31e2b2aeb0d1e3ddb7027acc5021 [XPK] Task: `GKE Dashboard List` is implemented by the following command not running since it is a dry run. gcloud monitoring dashboards list --project=golden-project --filter="displayName:'GKE - TPU Monitoring Dashboard'" --format="value(name)" --verbosity=error [XPK] Check statistics and outlier mode of GKE metrics here: https://console.cloud.google.com/monitoring/dashboards/builder/0?project=golden-project&f.rlabel.cluster_name.ClusterName=golden-cluster. To view the metric data for your workload, select golden-workload from the JobName filter on the dashboard. diff --git a/goldens/Workload_create_sub-slicing.txt b/goldens/Workload_create_sub-slicing.txt index 677af3dc0..69b5e3450 100644 --- a/goldens/Workload_create_sub-slicing.txt +++ b/goldens/Workload_create_sub-slicing.txt @@ -39,7 +39,7 @@ docker buildx build --platform=linux/amd64 -f 4b6736a12db8ea0f78ce793fd0d4ee0c94 docker tag dry-run-runner gcr.io/golden-project/dry-run-runner:prefix-current [XPK] Task: `Upload Docker Image` is implemented by the following command not running since it is a dry run. docker push gcr.io/golden-project/dry-run-runner:prefix-current -[XPK] Temp file (8d5155a477cf99bc463104e0b22de0d21ee90548f51297fe429cdaa721d70a63) content: +[XPK] Temp file (d1c742386d222824f96cb2ad177a93717c45c02b8614bb89181ba8f273e81104) content: apiVersion: jobset.x-k8s.io/v1alpha2 kind: JobSet metadata: @@ -68,6 +68,7 @@ spec: podFailurePolicy: rules: + - action: FailJob onPodConditions: [] onExitCodes: @@ -99,14 +100,11 @@ spec: dnsPolicy: ClusterFirstWithHostNet terminationGracePeriodSeconds: 30 containers: + - name: jax-tpu image: gcr.io/golden-project/dry-run-runner:prefix-current env: - ports: - - containerPort: 8471 - - containerPort: 8080 - securityContext: privileged: true command: @@ -151,7 +149,7 @@ spec: [XPK] Task: `Creating Workload` is implemented by the following command not running since it is a dry run. -kubectl apply -f 8d5155a477cf99bc463104e0b22de0d21ee90548f51297fe429cdaa721d70a63 +kubectl apply -f d1c742386d222824f96cb2ad177a93717c45c02b8614bb89181ba8f273e81104 [XPK] Task: `GKE Dashboard List` is implemented by the following command not running since it is a dry run. gcloud monitoring dashboards list --project=golden-project --filter="displayName:'GKE - TPU Monitoring Dashboard'" --format="value(name)" --verbosity=error [XPK] Check statistics and outlier mode of GKE metrics here: https://console.cloud.google.com/monitoring/dashboards/builder/0?project=golden-project&f.rlabel.cluster_name.ClusterName=golden-cluster. To view the metric data for your workload, select golden-workload from the JobName filter on the dashboard. diff --git a/goldens/Workload_create_super-slicing.txt b/goldens/Workload_create_super-slicing.txt index 059b84d4a..a32b6f2a1 100644 --- a/goldens/Workload_create_super-slicing.txt +++ b/goldens/Workload_create_super-slicing.txt @@ -39,7 +39,7 @@ docker buildx build --platform=linux/amd64 -f 4b6736a12db8ea0f78ce793fd0d4ee0c94 docker tag dry-run-runner gcr.io/golden-project/dry-run-runner:prefix-current [XPK] Task: `Upload Docker Image` is implemented by the following command not running since it is a dry run. docker push gcr.io/golden-project/dry-run-runner:prefix-current -[XPK] Temp file (23a6075e1e928b8d7162df16f5495fe1d33c1c5b5d35a215d28f429091a84a63) content: +[XPK] Temp file (061d3a2bbf745a333f39650b7ec847e27183a2b74442c5af97e37052a7965857) content: apiVersion: jobset.x-k8s.io/v1alpha2 kind: JobSet metadata: @@ -68,10 +68,17 @@ spec: podFailurePolicy: rules: + + - action: FailJob + onPodConditions: [] + onExitCodes: + containerName: jax-tpu-1 + operator: NotIn + values: [42,127,128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191,192,193,194,195,196,197,198,199,200,201,202,203,204,205,206,207,208,209,210,211,212,213,214,215,216,217,218,219,220,221,222,223,224,225,226,227,228,229,230,231,232,233,234,235,236,237,238,239,240,241,242,243,244,245,246,247,248,249,250,251,252,253,254,255] - action: FailJob onPodConditions: [] onExitCodes: - containerName: jax-tpu + containerName: jax-tpu-2 operator: NotIn values: [42,127,128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191,192,193,194,195,196,197,198,199,200,201,202,203,204,205,206,207,208,209,210,211,212,213,214,215,216,217,218,219,220,221,222,223,224,225,226,227,228,229,230,231,232,233,234,235,236,237,238,239,240,241,242,243,244,245,246,247,248,249,250,251,252,253,254,255] template: @@ -98,14 +105,46 @@ spec: dnsPolicy: ClusterFirstWithHostNet terminationGracePeriodSeconds: 30 containers: - - name: jax-tpu + + - name: jax-tpu-1 image: gcr.io/golden-project/dry-run-runner:prefix-current env: - ports: - - containerPort: 8471 - - containerPort: 8080 + securityContext: + privileged: true + command: + - bash + - -c + - | + echo XPK Start: $(date); + _sigterm() (kill -SIGTERM $! 2>/dev/null;); + trap _sigterm SIGTERM; + + (bash hello) & PID=$!; + while kill -0 $PID 2>/dev/null; + do sleep 5; + done; + wait $PID; + EXIT_CODE=$?; + + echo XPK End: $(date); + echo EXIT_CODE=$EXIT_CODE; + + + exit $EXIT_CODE + resources: + limits: + google.com/tpu: 2 + + volumeMounts: + - mountPath: /dev/shm + name: dshm-2 + + - name: jax-tpu-2 + image: gcr.io/golden-project/dry-run-runner:prefix-current + + env: securityContext: privileged: true command: @@ -130,7 +169,7 @@ spec: exit $EXIT_CODE resources: limits: - google.com/tpu: 4 + google.com/tpu: 2 volumeMounts: - mountPath: /dev/shm @@ -150,7 +189,7 @@ spec: [XPK] Task: `Creating Workload` is implemented by the following command not running since it is a dry run. -kubectl apply -f 23a6075e1e928b8d7162df16f5495fe1d33c1c5b5d35a215d28f429091a84a63 +kubectl apply -f 061d3a2bbf745a333f39650b7ec847e27183a2b74442c5af97e37052a7965857 [XPK] Task: `GKE Dashboard List` is implemented by the following command not running since it is a dry run. gcloud monitoring dashboards list --project=golden-project --filter="displayName:'GKE - TPU Monitoring Dashboard'" --format="value(name)" --verbosity=error [XPK] Check statistics and outlier mode of GKE metrics here: https://console.cloud.google.com/monitoring/dashboards/builder/0?project=golden-project&f.rlabel.cluster_name.ClusterName=golden-cluster. To view the metric data for your workload, select golden-workload from the JobName filter on the dashboard. diff --git a/goldens/Workload_create_with_output-manifest-file.txt b/goldens/Workload_create_with_output-manifest-file.txt index 6874f0e9e..52b6cb563 100644 --- a/goldens/Workload_create_with_output-manifest-file.txt +++ b/goldens/Workload_create_with_output-manifest-file.txt @@ -36,7 +36,7 @@ docker tag dry-run-runner gcr.io/golden-project/dry-run-runner:prefix-current [XPK] Task: `Upload Docker Image` is implemented by the following command not running since it is a dry run. docker push gcr.io/golden-project/dry-run-runner:prefix-current [XPK] Workload golden-workload manifest written to /var/tmp/manifest.yaml -[XPK] Temp file (9c33f52b12eab63cfb9ce1aba6ff74f4642cb9e40e2a0ad9517d189ee41f09a5) content: +[XPK] Temp file (06487845b6acc9dc2eccb8d04d9b40823797dcc10e23b999766f18f7ab1643ab) content: apiVersion: jobset.x-k8s.io/v1alpha2 kind: JobSet metadata: @@ -65,6 +65,7 @@ spec: podFailurePolicy: rules: + - action: FailJob onPodConditions: [] onExitCodes: @@ -95,14 +96,11 @@ spec: dnsPolicy: ClusterFirstWithHostNet terminationGracePeriodSeconds: 30 containers: + - name: jax-tpu image: gcr.io/golden-project/dry-run-runner:prefix-current env: - ports: - - containerPort: 8471 - - containerPort: 8080 - securityContext: privileged: true command: @@ -147,7 +145,7 @@ spec: [XPK] Task: `Creating Workload` is implemented by the following command not running since it is a dry run. -kubectl apply -f 9c33f52b12eab63cfb9ce1aba6ff74f4642cb9e40e2a0ad9517d189ee41f09a5 +kubectl apply -f 06487845b6acc9dc2eccb8d04d9b40823797dcc10e23b999766f18f7ab1643ab [XPK] Task: `GKE Dashboard List` is implemented by the following command not running since it is a dry run. gcloud monitoring dashboards list --project=golden-project --filter="displayName:'GKE - TPU Monitoring Dashboard'" --format="value(name)" --verbosity=error [XPK] Check statistics and outlier mode of GKE metrics here: https://console.cloud.google.com/monitoring/dashboards/builder/0?project=golden-project&f.rlabel.cluster_name.ClusterName=golden-cluster. To view the metric data for your workload, select golden-workload from the JobName filter on the dashboard. diff --git a/src/xpk/commands/workload.py b/src/xpk/commands/workload.py index ac8139e26..48eda6928 100644 --- a/src/xpk/commands/workload.py +++ b/src/xpk/commands/workload.py @@ -490,13 +490,21 @@ def workload_create(args) -> None: - PodFailurePolicy""" restart_on_exit_codes_list = get_restart_exit_codes(args) restart_on_exit_codes = ','.join(map(str, restart_on_exit_codes_list)) - pod_failure_policy = f""" + + pod_failure_policy = """ podFailurePolicy: rules: + """ + docker_image = get_main_container_docker_image(args, workload_system) + for i in range(workload_system.parallel_containers): + docker_image_sufix = ( + f'-{i + 1}' if workload_system.parallel_containers > 1 else '' + ) + pod_failure_policy += f""" - action: FailJob onPodConditions: [] onExitCodes: - containerName: {get_main_container_docker_image(args, workload_system)} + containerName: {docker_image}{docker_image_sufix} operator: NotIn values: [{restart_on_exit_codes}]""" diff --git a/src/xpk/commands/workload_test.py b/src/xpk/commands/workload_test.py index 160f52695..a59e1b4b0 100644 --- a/src/xpk/commands/workload_test.py +++ b/src/xpk/commands/workload_test.py @@ -23,6 +23,7 @@ from ..core.system_characteristics import DockerPlatform, SystemCharacteristics, AcceleratorType, UserFacingNameToSystemCharacteristics, GpuConfig from .workload import workload_create from .cluster_test import construct_args +from ..core.docker_container import get_user_workload_container as real_get_user_workload_container SYSTEM_CHARACTERISTICS = SystemCharacteristics( @@ -206,3 +207,69 @@ def test_workload_create_dry_run_with_output_file(mocker): written_content = mock_open.return_value.write.call_args[0][0] assert 'test-workload' in written_content assert 'cloud.google.com/gke-tpu-topology: 8x8' in written_content + + +def test_workload_create_multi_container_for_tpu7x( + workload_create_mocks: _WorkloadCreateMocks, + mocker, +): + """Tests that the generated YAML for a multi-container workload has correct pod failure policy and container structure.""" + + # Enable dry_run to prevent external calls like get_storages_to_mount -> gcloud + mocker.patch('xpk.utils.execution_context.dry_run', True) + + # Mock dependencies required by get_user_workload_container -> get_main_container + mocker.patch( + 'xpk.core.docker_container.setup_docker_image', + return_value=(0, 'dummy-image'), + ) + mocker.patch( + 'xpk.core.docker_container.get_gke_debugging_dashboard', return_value=None + ) + + # Use the real get_user_workload_container to test integration + workload_create_mocks.get_user_workload_container.side_effect = ( + real_get_user_workload_container + ) + + args = construct_args( + workload='test-workload', + command='echo hello', + num_nodes=1, + tpu_type='tpu7x-2x2x2', + restart_on_exit_codes=None, + docker_name='test-docker', + deploy_stacktrace_sidecar=False, + enable_debug_logs=False, + scheduler='default-scheduler', + ) + workload_create(args) + + assert workload_create_mocks.write_tmp_file.called + yaml_content = workload_create_mocks.write_tmp_file.call_args[0][0] + jobset = yaml.safe_load(yaml_content) + + # Verify Pod Failure Policy + pod_failure_rules = jobset['spec']['replicatedJobs'][0]['template']['spec'][ + 'podFailurePolicy' + ]['rules'] + # Should have 2 rules for multi_container + assert len(pod_failure_rules) == 2 + assert pod_failure_rules[0]['onExitCodes']['containerName'].endswith('-1') + assert pod_failure_rules[1]['onExitCodes']['containerName'].endswith('-2') + + # Verify Containers + # Navigate to the containers list in the YAML + containers = jobset['spec']['replicatedJobs'][0]['template']['spec'][ + 'template' + ]['spec']['containers'] + + assert len(containers) == 2 + assert containers[0]['name'].endswith('-1') + assert containers[1]['name'].endswith('-2') + assert containers[0]['image'] == 'dummy-image' + assert containers[1]['image'] == 'dummy-image' + + # Check if resources are split correctly (4 chips / 2 containers = 2 chips) + assert containers[0]['resources']['limits']['google.com/tpu'] == 2 + assert containers[1]['resources']['limits']['google.com/tpu'] == 2 diff --git a/src/xpk/core/docker_container.py b/src/xpk/core/docker_container.py index 1708a99da..422d72c13 100644 --- a/src/xpk/core/docker_container.py +++ b/src/xpk/core/docker_container.py @@ -17,9 +17,7 @@ from ..utils.console import xpk_exit, xpk_print from .docker_image import setup_docker_image from .docker_resources import ( - add_container_ports, add_image_pull_policy_for_pw_or_gpu, - add_jax_coordinator_port, get_env_container, get_main_container_resources, get_volume_mounts, @@ -112,13 +110,12 @@ def get_main_container(args, system, docker_image, resource_type) -> str: 'touch /shared-volume/stacktrace_signal; ' ) - yaml = """- name: {docker_name} + containers = [] + container_yaml = """ + - name: {docker_name} image: {docker_image} {image_pull_policy} env: {env} - ports: - {container_ports} - {jax_coordinator_port} securityContext: privileged: true command: @@ -145,29 +142,39 @@ def get_main_container(args, system, docker_image, resource_type) -> str: limits: {resources} """ + docker_name = get_main_container_docker_image(args, system) volume_mounts = get_volume_mounts(args, system) if volume_mounts != '': - yaml += """ + container_yaml += """ volumeMounts: {volume_mounts} """ - return yaml.format( - args=args, - system=system, - image_pull_policy=add_image_pull_policy_for_pw_or_gpu(args, system), - env=get_env_container(args, system), - container_ports=add_container_ports(args, system), - jax_coordinator_port=add_jax_coordinator_port(system), - docker_name=get_main_container_docker_image(args, system), - docker_image=docker_image, - gsutil_test_command=gsutil_test_command, - command=command, - tpu_stacktrace_terminate_command=tpu_stacktrace_terminate_command, - gpu_workload_terminate_command=gpu_workload_terminate_command, - xpk_internal_commands=xpk_internal_commands, - resources=get_main_container_resources(args, system, resource_type), - volume_mounts=volume_mounts, - ) + # pathways job running on 2 parallel containers is not verified yet + if args.use_pathways: + system.parallel_containers = 1 + + env = get_env_container(args, system) + image_pull_policy = add_image_pull_policy_for_pw_or_gpu(args, system) + for i in range(system.parallel_containers): + docker_name_sufix = f'-{i + 1}' if system.parallel_containers > 1 else '' + containers.append( + container_yaml.format( + args=args, + system=system, + image_pull_policy=image_pull_policy, + env=env, + docker_name=f'{docker_name}{docker_name_sufix}', + docker_image=docker_image, + gsutil_test_command=gsutil_test_command, + command=command, + tpu_stacktrace_terminate_command=tpu_stacktrace_terminate_command, + gpu_workload_terminate_command=gpu_workload_terminate_command, + xpk_internal_commands=xpk_internal_commands, + resources=get_main_container_resources(args, system, resource_type), + volume_mounts=volume_mounts, + ) + ) + return ''.join(containers) def get_user_workload_container(args, system: SystemCharacteristics): diff --git a/src/xpk/core/docker_resources.py b/src/xpk/core/docker_resources.py index 031d74fc5..4167fc07a 100644 --- a/src/xpk/core/docker_resources.py +++ b/src/xpk/core/docker_resources.py @@ -53,7 +53,10 @@ def get_main_container_resources( offset_vCPUs = int(system.chips_per_vm) * 0.95 return f'{resource_type}: {offset_vCPUs}' - return f'{resource_type}: {system.chips_per_vm}' + return ( + f'{resource_type}:' + f' {int(system.chips_per_vm / system.parallel_containers)}' + ) def get_env_container(args, system: SystemCharacteristics) -> str: diff --git a/src/xpk/core/system_characteristics.py b/src/xpk/core/system_characteristics.py index 9ab97321d..47c67e8b6 100644 --- a/src/xpk/core/system_characteristics.py +++ b/src/xpk/core/system_characteristics.py @@ -131,6 +131,8 @@ class SystemCharacteristics: supports_super_slicing: Whether the Super-slicing feature is supported. requires_workload_policy: A boolean indicating if a GCE resource workload policy is required. This is automatically set to True for GPUs. + parallel_containers: The number of containers running on a single VM. + """ topology: str @@ -146,6 +148,7 @@ class SystemCharacteristics: docker_platform: DockerPlatform requires_workload_policy: bool = False gpu_config: Optional[GpuConfig] = None + parallel_containers: int = 1 def __post_init__(self): if self.accelerator_type == AcceleratorType.GPU: @@ -239,6 +242,7 @@ def get_tpu_system_characteristics_map( default_topologies: set[str] | None = None, sub_slicing_topologies: set[str] | None = None, super_slicing_topologies: set[str] | None = None, + parallel_containers: int = 1, ) -> dict[str, SystemCharacteristics]: system_characteristics_map = {} default_topologies = default_topologies or set() @@ -263,6 +267,7 @@ def get_tpu_system_characteristics_map( supports_super_slicing=topology in super_slicing_topologies, supports_accelerator_network_profile=supports_accelerator_network_profile, docker_platform=docker_platform, + parallel_containers=parallel_containers, ) system_characteristics_map[f'{prefix}-{topology}'] = system if ( @@ -544,6 +549,7 @@ def compute_vms_per_slice(topology: str) -> int: tpu_type_requires_workload_policy=True, supports_accelerator_network_profile=False, docker_platform=AMD_PLATFORM, + parallel_containers=2, supported_topologies=generate_tpu_topologies(max_cubes=144), super_slicing_topologies=set(['4x4x4']), default_topologies=set([