@@ -67,6 +67,120 @@ cleanup_tpu_nodes() {
6767 done
6868}
6969
#######################################
# Probes a zone for TPU capacity by really creating TPU VMs, listing what
# exists afterwards, and then handing the result to cleanup_tpu_nodes.
# Globals:
#   NUM_NODES, FULL_INSTANCE_PREFIX, PROJECT_ID, ACCELERATOR_TYPE,
#   TPU_RUNTIME_VERSION (all read only)
# Arguments:
#   $1 - zone to probe
#   $2 - provisioning model; "SPOT" appends --spot to the create command
# Outputs:
#   INFO lines on stdout, ERROR lines on stderr
# Returns:
#   0 if the list call reported at least one node matching the prefix,
#   non-zero otherwise
#######################################
check_tpu_capacity() {
  local zone=$1
  local provisioning_model=$2
  # Everything below is local so repeated probes (per zone, per model) cannot
  # see stale values from a previous call or clobber the caller's variables.
  local i tpu_name tpu_create_output tpu_list_output
  local tpu_created_count=0
  local -a gcloud_tpu_cmd=()
  local -a accepted_tpu_names=()
  local -a created_tpu_names=()
  local -a cleanup_names=()

  # --- TPU Capacity Check ---
  for ((i = 1; i <= NUM_NODES; i++)); do
    # Names are "<prefix>NN" with a zero-padded two-digit index.
    printf -v tpu_name '%s%02d' "${FULL_INSTANCE_PREFIX}" "${i}"
    gcloud_tpu_cmd=(
      gcloud compute tpus tpu-vm create "${tpu_name}"
      --project="${PROJECT_ID}"
      --zone="${zone}"
      --accelerator-type="${ACCELERATOR_TYPE}"
      --version="${TPU_RUNTIME_VERSION}"
      --quiet
    )
    if [[ "${provisioning_model}" == "SPOT" ]]; then
      gcloud_tpu_cmd+=(--spot)
    fi

    if tpu_create_output=$("${gcloud_tpu_cmd[@]}" 2>&1); then
      # Remember nodes whose create call succeeded so they can still be
      # cleaned up if the list call below fails or comes back empty.
      accepted_tpu_names+=("${tpu_name}")
    else
      echo "ERROR: Unexpected error during TPU create in ${zone} (${provisioning_model}): ${tpu_create_output}" >&2
    fi
  done

  # NOTE: stderr of the list call is discarded, so on failure
  # ${tpu_list_output} only holds whatever reached stdout (usually nothing).
  if tpu_list_output=$(gcloud compute tpus tpu-vm list \
    --project="${PROJECT_ID}" --zone="${zone}" \
    --filter="name ~ ${FULL_INSTANCE_PREFIX}" \
    --format='value(name)' 2>/dev/null); then
    if [[ -n "${tpu_list_output}" ]]; then
      readarray -t created_tpu_names <<< "${tpu_list_output}"
      tpu_created_count=${#created_tpu_names[@]}
      echo "INFO: Found ${tpu_created_count} TPU nodes: ${created_tpu_names[*]}"
    else
      echo "INFO: No matching TPU nodes found in ${zone} via list."
    fi
  else
    echo "ERROR: Failed to list TPU nodes in ${zone}: ${tpu_list_output}" >&2
  fi

  # Prefer the listed names; fall back to the names whose create succeeded so
  # a failed/empty list does not leave freshly created nodes behind.
  if ((tpu_created_count > 0)); then
    cleanup_names=("${created_tpu_names[@]}")
  elif ((${#accepted_tpu_names[@]} > 0)); then
    cleanup_names=("${accepted_tpu_names[@]}")
  fi

  # The ${arr[@]+...} form keeps an empty array from tripping `set -u` on
  # Bash versions older than 4.4.
  cleanup_tpu_nodes "${PROJECT_ID}" "${zone}" \
    ${cleanup_names[@]+"${cleanup_names[@]}"}

  # Capacity is considered available only when the list call saw a node.
  ((tpu_created_count >= 1))
}
118+
#######################################
# Probes a zone for VM capacity by bulk-creating instances, counting how many
# exist afterwards, and then calling cleanup_vm_instances for the prefix.
# Globals:
#   FULL_INSTANCE_PREFIX, NUM_NODES, MIN_NODES, PROJECT_ID, MACHINE_TYPE,
#   TERMINATION_ACTION (all read only)
# Arguments:
#   $1 - zone to probe
#   $2 - provisioning model passed to --provisioning-model; "SPOT" adds
#        --instance-termination-action, anything else adds
#        --on-host-maintenance=TERMINATE
# Outputs:
#   INFO lines on stdout, ERROR lines on stderr
# Returns:
#   0 if bulk create and list both succeeded and at least MIN_NODES instances
#   matching the prefix were listed, non-zero otherwise
#######################################
check_vm_capacity() {
  local zone=$1
  local provisioning_model=$2
  local success=false
  # Keep all scratch state local so repeated probes do not leak into, or read
  # stale values from, the caller's scope.
  local instance_names_str create_output instance_list_output
  local num_created=0
  local -a instance_names_array=()
  local -a created_instances=()
  local -a gcloud_cmd=()

  readarray -t instance_names_array < <(
    generate_instance_names "${FULL_INSTANCE_PREFIX}" "${NUM_NODES}"
  )
  # Join with commas inside a subshell so the IFS change stays contained.
  instance_names_str=$(
    IFS=,
    echo "${instance_names_array[*]}"
  )

  gcloud_cmd=(
    gcloud compute instances bulk create
    --predefined-names="${instance_names_str}"
    --project="${PROJECT_ID}"
    --zone="${zone}"
    --machine-type="${MACHINE_TYPE}"
    --provisioning-model="${provisioning_model}"
    --no-address
    --quiet
    --min-count="${MIN_NODES}"
  )

  if [[ "${provisioning_model}" == "SPOT" ]]; then
    gcloud_cmd+=(--instance-termination-action="${TERMINATION_ACTION}")
  else
    gcloud_cmd+=(--on-host-maintenance="TERMINATE")
  fi

  echo "INFO: Attempting to bulk create ${NUM_NODES} VMs in ${zone} (Model: ${provisioning_model})..."
  if create_output=$("${gcloud_cmd[@]}" 2>&1); then
    if instance_list_output=$(gcloud compute instances list \
      --project="${PROJECT_ID}" --zones="${zone}" \
      --filter="name ~ ^${FULL_INSTANCE_PREFIX}" \
      --format='value(name)'); then
      if [[ -n "${instance_list_output}" ]]; then
        readarray -t created_instances <<< "${instance_list_output}"
        num_created=${#created_instances[@]}

        if [[ "${num_created}" -ge "${MIN_NODES}" ]]; then
          echo "INFO: Found sufficient VM capacity in ${zone}."
          success=true
        else
          echo "ERROR: Bulk create & list succeeded in ${zone}, but only ${num_created} instances found, less than min_count ${MIN_NODES}." >&2
        fi
      else
        echo "ERROR: Bulk create command apparently succeeded in ${zone}, but LIST command found no instances with the prefix." >&2
      fi
    else
      echo "ERROR: Bulk create command succeeded in ${zone}, but the command to list instances failed." >&2
    fi
  else
    # Capacity exhaustion is an expected outcome of probing; anything else is
    # reported as an unexpected error together with gcloud's output.
    if [[ "${create_output}" != *"INSUFFICIENT_CAPACITY"* &&
      "${create_output}" != *"ZONE_RESOURCE_POOL_EXHAUSTED"* ]]; then
      echo "ERROR: Unexpected error during bulk create in ${zone}: ${create_output}" >&2
    else
      echo "INFO: Insufficient VM capacity for bulk create in ${zone} (Model: ${provisioning_model})."
    fi
  fi

  # Single cleanup point: every path above ends here, so probe instances are
  # removed regardless of which branch was taken.
  cleanup_vm_instances "${PROJECT_ID}" "${zone}" "${FULL_INSTANCE_PREFIX}"

  [[ "${success}" == "true" ]]
}
183+
70184if ! GCS_CONTENT=$( gcloud storage cat " ${OPTIONS_GCS_PATH} " ) ; then
71185 echo " ERROR: Failed to read ${OPTIONS_GCS_PATH} ." >&2
72186 exit 1
86200SELECTED_ZONE=" "
87201SUCCESS=false
88202
89- # Loop through all zones to find capacity
90- for ZONE in " ${ZONES_ARRAY[@]} " ; do
91- if [[ " ${MACHINE_TYPE} " == " tpu" ]]; then
92- # --- TPU Capacity Check ---
93- for i in $( seq 1 " ${NUM_NODES} " ) ; do
94- TPU_NAME=" ${FULL_INSTANCE_PREFIX} $( printf " %02d" " $i " ) "
95- declare -a GCLOUD_TPU_CMD
96- GCLOUD_TPU_CMD=(
97- gcloud compute tpus tpu-vm create " ${TPU_NAME} "
98- --project=" ${PROJECT_ID} "
99- --zone=" ${ZONE} "
100- --accelerator-type=" ${ACCELERATOR_TYPE} "
101- --version=" ${TPU_RUNTIME_VERSION} "
102- --spot
103- --quiet
104- )
105-
106- if ! TPU_CREATE_OUTPUT=$( " ${GCLOUD_TPU_CMD[@]} " 2>&1 ) ; then
107- echo " ERROR: Unexpected error during TPU create in ${ZONE} : ${TPU_CREATE_OUTPUT} " >&2
108- fi
109- done
110-
111- declare -a CREATED_TPU_NAMES=()
112- TPU_CREATED_COUNT=0
113- if tpu_list_output=$( gcloud compute tpus tpu-vm list --project=" ${PROJECT_ID} " --zone=" ${ZONE} " \
114- --filter=" name ~ ${FULL_INSTANCE_PREFIX} " --format=' value(name)' 2> /dev/null) ; then
115- if [[ -n " ${tpu_list_output} " ]]; then
116- readarray -t CREATED_TPU_NAMES <<< " ${tpu_list_output}"
117- TPU_CREATED_COUNT=${# CREATED_TPU_NAMES[@]}
118- echo " INFO: Found ${TPU_CREATED_COUNT} TPU nodes: ${CREATED_TPU_NAMES[*]} "
119- else
120- echo " INFO: No matching TPU nodes found in ${ZONE} via list."
121- fi
122- else
123- echo " ERROR: Failed to list TPU nodes in ${ZONE} : ${tpu_list_output} " >&2
124- fi
125-
126- if [[ " ${TPU_CREATED_COUNT} " -ge 1 ]]; then
127- SELECTED_ZONE=" ${ZONE} "
128- SUCCESS=true
129- fi
203+ declare -a PROVISIONING_MODELS=(" SPOT" )
204+ if [[ " ${ENABLE_SPOT_FALLBACK:- false} " == " true" ]]; then
205+ PROVISIONING_MODELS+=(" STANDARD" )
206+ fi
130207
131- cleanup_tpu_nodes " ${PROJECT_ID} " " ${ZONE} " " ${CREATED_TPU_NAMES[@]} "
132- else
133- readarray -t INSTANCE_NAMES_ARRAY < <( generate_instance_names " ${FULL_INSTANCE_PREFIX} " " ${NUM_NODES} " )
134- instance_names_str=$(
135- IFS=,
136- echo " ${INSTANCE_NAMES_ARRAY[*]} "
137- )
208+ for PROVISIONING_MODEL in " ${PROVISIONING_MODELS[@]} " ; do
209+ echo " INFO: Trying provisioning model: ${PROVISIONING_MODEL} "
138210
139- declare -a GCLOUD_CMD
140- GCLOUD_CMD=(
141- gcloud compute instances bulk create
142- --predefined-names=" ${instance_names_str} "
143- --project=" ${PROJECT_ID} "
144- --zone=" ${ZONE} "
145- --machine-type=" ${MACHINE_TYPE} "
146- --provisioning-model=" ${PROVISIONING_MODEL} "
147- --instance-termination-action=" ${TERMINATION_ACTION} "
148- --no-address
149- --quiet
150- --min-count=" ${MIN_NODES} "
151- )
152- echo " INFO: Attempting to bulk create ${NUM_NODES} VMs in ${ZONE} ..."
153- if CREATE_OUTPUT=$( " ${GCLOUD_CMD[@]} " 2>&1 ) ; then
154- if instance_list_output=$( gcloud compute instances list --project=" ${PROJECT_ID} " --zones=" ${ZONE} " \
155- --filter=" name ~ ^${FULL_INSTANCE_PREFIX} " --format=' value(name)' ) ; then
156- if [[ -n " ${instance_list_output} " ]]; then
157- readarray -t created_instances < <( echo " ${instance_list_output} " )
158- NUM_CREATED=$(( ${# created_instances[@]} ))
159- cleanup_vm_instances " ${PROJECT_ID} " " ${ZONE} " " ${FULL_INSTANCE_PREFIX} "
160-
161- if [[ " ${NUM_CREATED} " -ge " ${MIN_NODES} " ]]; then
162- SELECTED_ZONE=" ${ZONE} "
163- SUCCESS=true
164- echo " INFO: Found sufficient VM capacity in ${ZONE} ."
165- else
166- echo " ERROR: Bulk create & list succeeded in ${ZONE} , but only ${NUM_CREATED} instances found, less than min_count ${MIN_NODES} ." >&2
167- fi
168- else
169- echo " ERROR: Bulk create command apparently succeeded in ${ZONE} , but LIST command found no instances with the prefix." >&2
170- cleanup_vm_instances " ${PROJECT_ID} " " ${ZONE} " " ${FULL_INSTANCE_PREFIX} "
171- fi
172- else
173- echo " ERROR: Bulk create command succeeded in ${ZONE} , but the command to list instances failed." >&2
174- cleanup_vm_instances " ${PROJECT_ID} " " ${ZONE} " " ${FULL_INSTANCE_PREFIX} "
211+ for ZONE in " ${ZONES_ARRAY[@]} " ; do
212+ if [[ " ${MACHINE_TYPE} " == " tpu" ]]; then
213+ if check_tpu_capacity " ${ZONE} " " ${PROVISIONING_MODEL} " ; then
214+ SELECTED_ZONE=" ${ZONE} "
215+ SUCCESS=true
216+ break
175217 fi
176218 else
177- if [[ " ${CREATE_OUTPUT} " != * " INSUFFICIENT_CAPACITY" * &&
178- " ${CREATE_OUTPUT} " != * " ZONE_RESOURCE_POOL_EXHAUSTED" * ]]; then
179- echo " ERROR: Unexpected error during bulk create in ${ZONE} : ${CREATE_OUTPUT} " >&2
180- else
181- echo " INFO: Insufficient VM capacity for bulk create in ${ZONE} ."
219+ if check_vm_capacity " ${ZONE} " " ${PROVISIONING_MODEL} " ; then
220+ SELECTED_ZONE=" ${ZONE} "
221+ SUCCESS=true
222+ break
182223 fi
183- cleanup_vm_instances " ${PROJECT_ID} " " ${ZONE} " " ${FULL_INSTANCE_PREFIX} "
184224 fi
185- fi
225+ done
186226
187227 if [[ " ${SUCCESS} " == " true" ]]; then
188228 break
189229 fi
190230done
191231
192232if [[ " ${SUCCESS} " == " true" ]]; then
193- echo " Deploying in ZONE: ${SELECTED_ZONE} "
233+ echo " Deploying in ZONE: ${SELECTED_ZONE} , MODEL: ${PROVISIONING_MODEL} "
194234 export ZONE=" ${SELECTED_ZONE} "
235+ export PROVISIONING_MODEL=" ${PROVISIONING_MODEL} "
195236else
196237 echo " --- DEPLOYMENT FAILED(Couldn't find a zone to deploy) ---" >&2
197238 exit 1
0 commit comments