Skip to content

Commit f7b69e3

Browse files
multi-tier provisioning strategy (#5226)
1 parent e94ba49 commit f7b69e3

File tree

2 files changed

+148
-94
lines changed

2 files changed

+148
-94
lines changed

tools/cloud-build/daily-tests/builds/h4d-vm.yaml

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@ steps:
4343
- "INSTANCE_PREFIX=h4dsp"
4444
- "BUILD_ID=$BUILD_ID"
4545
- "OPTIONS_GCS_PATH=gs://hpc-ctk1357/h4doptions.txt"
46+
- "ENABLE_SPOT_FALLBACK=true"
4647
args:
4748
- -c
4849
- |
@@ -69,13 +70,25 @@ steps:
6970
sed -i -e '/deletion_protection:/{n;s/enabled: true/enabled: false/}' $${BLUEPRINT}
7071
sed -i -e '/reason:/d' $${BLUEPRINT}
7172
sed -i '/ - id: h4d-vms/,/ - id: wait-for-vms/ { / settings:/a \
72-
provisioning_model: "SPOT"
73+
provisioning_model: "'"$$PROVISIONING_MODEL"'"
7374
}' $${BLUEPRINT}
75+
76+
ENABLE_SPOT="true"
77+
H4D_VARS_FILE="tools/cloud-build/daily-tests/tests/h4d-vm.yml"
78+
if [[ "$$PROVISIONING_MODEL" == "STANDARD" ]]; then
79+
ENABLE_SPOT="false"
80+
sed -i '/instance_labels:/,+1d' "$${H4D_VARS_FILE}"
81+
sed -i '/enable_spot: true/d' "$${H4D_VARS_FILE}"
82+
else
83+
echo "INFO: Using $${H4D_VARS_FILE} as it is for SPOT provisioning."
84+
fi
85+
7486
ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/base-integration-test.yml \
7587
--user=sa_106486320838376751393 \
7688
--extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT}" \
7789
--extra-vars="region=$${REGION} zone=$${ZONE}" \
78-
--extra-vars="@tools/cloud-build/daily-tests/tests/h4d-vm.yml"
90+
--extra-vars="enable_spot=$${ENABLE_SPOT}" \
91+
--extra-vars="@$${H4D_VARS_FILE}"
7992
secretEnv: ['GCLUSTER_GCS_PATH']
8093
availableSecrets:
8194
secretManager:

tools/cloud-build/find_available_zone.sh

Lines changed: 133 additions & 92 deletions
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,120 @@ cleanup_tpu_nodes() {
6767
done
6868
}
6969

70+
#######################################
# Probe a zone for TPU capacity by creating throw-away TPU VMs, listing what
# exists under the instance prefix, and then deleting the probe nodes.
# Globals:
#   NUM_NODES, FULL_INSTANCE_PREFIX, PROJECT_ID, ACCELERATOR_TYPE,
#   TPU_RUNTIME_VERSION (all read)
# Arguments:
#   $1 - zone to probe
#   $2 - provisioning model; "SPOT" adds --spot to the create command, any
#        other value creates the nodes without it
# Outputs:
#   INFO lines on stdout, ERROR lines on stderr
# Returns:
#   0 if at least one TPU node matching the prefix was listed in the zone,
#   non-zero otherwise
#######################################
check_tpu_capacity() {
  local zone=$1
  local provisioning_model=$2
  local success=false
  # Keep every scratch variable local so repeated probes (one per zone and
  # provisioning model) cannot leak state into the calling script.
  local i tpu_name tpu_create_output tpu_list_output
  local gcloud_tpu_cmd=()
  local started_tpu_names=()

  # --- TPU Capacity Check ---
  for i in $(seq 1 "${NUM_NODES}"); do
    tpu_name="${FULL_INSTANCE_PREFIX}$(printf "%02d" "$i")"
    gcloud_tpu_cmd=(
      gcloud compute tpus tpu-vm create "${tpu_name}"
      --project="${PROJECT_ID}"
      --zone="${zone}"
      --accelerator-type="${ACCELERATOR_TYPE}"
      --version="${TPU_RUNTIME_VERSION}"
      --quiet
    )
    if [[ "${provisioning_model}" == "SPOT" ]]; then
      gcloud_tpu_cmd+=(--spot)
    fi

    if tpu_create_output=$("${gcloud_tpu_cmd[@]}" 2>&1); then
      # Remember nodes whose create call succeeded so they can still be
      # deleted if the list call below fails.
      started_tpu_names+=("${tpu_name}")
    else
      echo "ERROR: Unexpected error during TPU create in ${zone} (${provisioning_model}): ${tpu_create_output}" >&2
    fi
  done

  local created_tpu_names=()
  local tpu_created_count=0
  # stderr of the list call is discarded, so tpu_list_output only ever holds
  # the command's stdout.
  if tpu_list_output=$(gcloud compute tpus tpu-vm list --project="${PROJECT_ID}" --zone="${zone}" \
    --filter="name ~ ${FULL_INSTANCE_PREFIX}" --format='value(name)' 2>/dev/null); then
    if [[ -n "${tpu_list_output}" ]]; then
      readarray -t created_tpu_names <<<"${tpu_list_output}"
      tpu_created_count=${#created_tpu_names[@]}
      echo "INFO: Found ${tpu_created_count} TPU nodes: ${created_tpu_names[*]}"
    else
      echo "INFO: No matching TPU nodes found in ${zone} via list."
    fi
  else
    echo "ERROR: Failed to list TPU nodes in ${zone}: ${tpu_list_output}" >&2
    # Without a listing the probe still counts as failed (count stays 0), but
    # nodes that were created must not be left behind.
    # NOTE(review): this passes the short node names used at create time;
    # confirm cleanup_tpu_nodes accepts them as well as listed names.
    created_tpu_names=(${started_tpu_names[@]+"${started_tpu_names[@]}"})
  fi

  if [[ "${tpu_created_count}" -ge 1 ]]; then
    success=true
  fi

  # The ${arr[@]+...} form expands to nothing for an empty array, which keeps
  # this call safe under `set -u` on bash releases older than 4.4.
  cleanup_tpu_nodes "${PROJECT_ID}" "${zone}" ${created_tpu_names[@]+"${created_tpu_names[@]}"}
  [[ "${success}" == "true" ]]
}
118+
119+
#######################################
# Probe a zone for VM capacity by bulk-creating throw-away instances, counting
# the instances that exist under the instance prefix, and deleting them again.
# Globals:
#   FULL_INSTANCE_PREFIX, NUM_NODES, MIN_NODES, PROJECT_ID, MACHINE_TYPE,
#   TERMINATION_ACTION (all read)
# Arguments:
#   $1 - zone to probe
#   $2 - provisioning model passed to --provisioning-model; "SPOT" also sets
#        --instance-termination-action, any other value sets
#        --on-host-maintenance=TERMINATE instead
# Outputs:
#   INFO lines on stdout, ERROR lines on stderr
# Returns:
#   0 if the bulk create succeeded and at least MIN_NODES instances matching
#   the prefix were listed, non-zero otherwise
#######################################
check_vm_capacity() {
  local zone=$1
  local provisioning_model=$2
  local success=false
  # Keep every scratch variable local so repeated probes (one per zone and
  # provisioning model) cannot leak state into the calling script.
  local instance_names_str create_output instance_list_output num_created
  local instance_names_array=()
  local created_instances=()
  local gcloud_cmd=()

  readarray -t instance_names_array < <(generate_instance_names "${FULL_INSTANCE_PREFIX}" "${NUM_NODES}")
  # Join the names with commas inside a subshell so the IFS change stays
  # contained.
  instance_names_str=$(
    IFS=,
    echo "${instance_names_array[*]}"
  )

  gcloud_cmd=(
    gcloud compute instances bulk create
    --predefined-names="${instance_names_str}"
    --project="${PROJECT_ID}"
    --zone="${zone}"
    --machine-type="${MACHINE_TYPE}"
    --provisioning-model="${provisioning_model}"
    --no-address
    --quiet
    --min-count="${MIN_NODES}"
  )

  if [[ "${provisioning_model}" == "SPOT" ]]; then
    gcloud_cmd+=(--instance-termination-action="${TERMINATION_ACTION}")
  else
    gcloud_cmd+=(--on-host-maintenance="TERMINATE")
  fi

  echo "INFO: Attempting to bulk create ${NUM_NODES} VMs in ${zone} (Model: ${provisioning_model})..."
  if ! create_output=$("${gcloud_cmd[@]}" 2>&1); then
    # Capacity shortfalls are the expected failure mode; anything else is
    # reported as an error with the full command output.
    if [[ "${create_output}" != *"INSUFFICIENT_CAPACITY"* && "${create_output}" != *"ZONE_RESOURCE_POOL_EXHAUSTED"* ]]; then
      echo "ERROR: Unexpected error during bulk create in ${zone}: ${create_output}" >&2
    else
      echo "INFO: Insufficient VM capacity for bulk create in ${zone} (Model: ${provisioning_model})."
    fi
  elif ! instance_list_output=$(gcloud compute instances list --project="${PROJECT_ID}" --zones="${zone}" \
    --filter="name ~ ^${FULL_INSTANCE_PREFIX}" --format='value(name)'); then
    echo "ERROR: Bulk create command succeeded in ${zone}, but the command to list instances failed." >&2
  elif [[ -z "${instance_list_output}" ]]; then
    echo "ERROR: Bulk create command apparently succeeded in ${zone}, but LIST command found no instances with the prefix." >&2
  else
    readarray -t created_instances <<<"${instance_list_output}"
    num_created=${#created_instances[@]}
    if [[ "${num_created}" -ge "${MIN_NODES}" ]]; then
      echo "INFO: Found sufficient VM capacity in ${zone}."
      success=true
    else
      echo "ERROR: Bulk create & list succeeded in ${zone}, but only ${num_created} instances found, less than min_count ${MIN_NODES}." >&2
    fi
  fi

  # Every outcome must remove the probe instances, so clean up exactly once
  # here rather than repeating the call in each branch.
  cleanup_vm_instances "${PROJECT_ID}" "${zone}" "${FULL_INSTANCE_PREFIX}"

  [[ "${success}" == "true" ]]
}
183+
70184
if ! GCS_CONTENT=$(gcloud storage cat "${OPTIONS_GCS_PATH}"); then
71185
echo "ERROR: Failed to read ${OPTIONS_GCS_PATH}." >&2
72186
exit 1
@@ -86,112 +200,39 @@ fi
86200
SELECTED_ZONE=""
87201
SUCCESS=false
88202

89-
# Loop through all zones to find capacity
90-
for ZONE in "${ZONES_ARRAY[@]}"; do
91-
if [[ "${MACHINE_TYPE}" == "tpu" ]]; then
92-
# --- TPU Capacity Check ---
93-
for i in $(seq 1 "${NUM_NODES}"); do
94-
TPU_NAME="${FULL_INSTANCE_PREFIX}$(printf "%02d" "$i")"
95-
declare -a GCLOUD_TPU_CMD
96-
GCLOUD_TPU_CMD=(
97-
gcloud compute tpus tpu-vm create "${TPU_NAME}"
98-
--project="${PROJECT_ID}"
99-
--zone="${ZONE}"
100-
--accelerator-type="${ACCELERATOR_TYPE}"
101-
--version="${TPU_RUNTIME_VERSION}"
102-
--spot
103-
--quiet
104-
)
105-
106-
if ! TPU_CREATE_OUTPUT=$("${GCLOUD_TPU_CMD[@]}" 2>&1); then
107-
echo "ERROR: Unexpected error during TPU create in ${ZONE}: ${TPU_CREATE_OUTPUT}" >&2
108-
fi
109-
done
110-
111-
declare -a CREATED_TPU_NAMES=()
112-
TPU_CREATED_COUNT=0
113-
if tpu_list_output=$(gcloud compute tpus tpu-vm list --project="${PROJECT_ID}" --zone="${ZONE}" \
114-
--filter="name ~ ${FULL_INSTANCE_PREFIX}" --format='value(name)' 2>/dev/null); then
115-
if [[ -n "${tpu_list_output}" ]]; then
116-
readarray -t CREATED_TPU_NAMES <<<"${tpu_list_output}"
117-
TPU_CREATED_COUNT=${#CREATED_TPU_NAMES[@]}
118-
echo "INFO: Found ${TPU_CREATED_COUNT} TPU nodes: ${CREATED_TPU_NAMES[*]}"
119-
else
120-
echo "INFO: No matching TPU nodes found in ${ZONE} via list."
121-
fi
122-
else
123-
echo "ERROR: Failed to list TPU nodes in ${ZONE}: ${tpu_list_output}" >&2
124-
fi
125-
126-
if [[ "${TPU_CREATED_COUNT}" -ge 1 ]]; then
127-
SELECTED_ZONE="${ZONE}"
128-
SUCCESS=true
129-
fi
203+
declare -a PROVISIONING_MODELS=("SPOT")
204+
if [[ "${ENABLE_SPOT_FALLBACK:-false}" == "true" ]]; then
205+
PROVISIONING_MODELS+=("STANDARD")
206+
fi
130207

131-
cleanup_tpu_nodes "${PROJECT_ID}" "${ZONE}" "${CREATED_TPU_NAMES[@]}"
132-
else
133-
readarray -t INSTANCE_NAMES_ARRAY < <(generate_instance_names "${FULL_INSTANCE_PREFIX}" "${NUM_NODES}")
134-
instance_names_str=$(
135-
IFS=,
136-
echo "${INSTANCE_NAMES_ARRAY[*]}"
137-
)
208+
for PROVISIONING_MODEL in "${PROVISIONING_MODELS[@]}"; do
209+
echo "INFO: Trying provisioning model: ${PROVISIONING_MODEL}"
138210

139-
declare -a GCLOUD_CMD
140-
GCLOUD_CMD=(
141-
gcloud compute instances bulk create
142-
--predefined-names="${instance_names_str}"
143-
--project="${PROJECT_ID}"
144-
--zone="${ZONE}"
145-
--machine-type="${MACHINE_TYPE}"
146-
--provisioning-model="${PROVISIONING_MODEL}"
147-
--instance-termination-action="${TERMINATION_ACTION}"
148-
--no-address
149-
--quiet
150-
--min-count="${MIN_NODES}"
151-
)
152-
echo "INFO: Attempting to bulk create ${NUM_NODES} VMs in ${ZONE}..."
153-
if CREATE_OUTPUT=$("${GCLOUD_CMD[@]}" 2>&1); then
154-
if instance_list_output=$(gcloud compute instances list --project="${PROJECT_ID}" --zones="${ZONE}" \
155-
--filter="name ~ ^${FULL_INSTANCE_PREFIX}" --format='value(name)'); then
156-
if [[ -n "${instance_list_output}" ]]; then
157-
readarray -t created_instances < <(echo "${instance_list_output}")
158-
NUM_CREATED=$((${#created_instances[@]}))
159-
cleanup_vm_instances "${PROJECT_ID}" "${ZONE}" "${FULL_INSTANCE_PREFIX}"
160-
161-
if [[ "${NUM_CREATED}" -ge "${MIN_NODES}" ]]; then
162-
SELECTED_ZONE="${ZONE}"
163-
SUCCESS=true
164-
echo "INFO: Found sufficient VM capacity in ${ZONE}."
165-
else
166-
echo "ERROR: Bulk create & list succeeded in ${ZONE}, but only ${NUM_CREATED} instances found, less than min_count ${MIN_NODES}." >&2
167-
fi
168-
else
169-
echo "ERROR: Bulk create command apparently succeeded in ${ZONE}, but LIST command found no instances with the prefix." >&2
170-
cleanup_vm_instances "${PROJECT_ID}" "${ZONE}" "${FULL_INSTANCE_PREFIX}"
171-
fi
172-
else
173-
echo "ERROR: Bulk create command succeeded in ${ZONE}, but the command to list instances failed." >&2
174-
cleanup_vm_instances "${PROJECT_ID}" "${ZONE}" "${FULL_INSTANCE_PREFIX}"
211+
for ZONE in "${ZONES_ARRAY[@]}"; do
212+
if [[ "${MACHINE_TYPE}" == "tpu" ]]; then
213+
if check_tpu_capacity "${ZONE}" "${PROVISIONING_MODEL}"; then
214+
SELECTED_ZONE="${ZONE}"
215+
SUCCESS=true
216+
break
175217
fi
176218
else
177-
if [[ "${CREATE_OUTPUT}" != *"INSUFFICIENT_CAPACITY"* &&
178-
"${CREATE_OUTPUT}" != *"ZONE_RESOURCE_POOL_EXHAUSTED"* ]]; then
179-
echo "ERROR: Unexpected error during bulk create in ${ZONE}: ${CREATE_OUTPUT}" >&2
180-
else
181-
echo "INFO: Insufficient VM capacity for bulk create in ${ZONE}."
219+
if check_vm_capacity "${ZONE}" "${PROVISIONING_MODEL}"; then
220+
SELECTED_ZONE="${ZONE}"
221+
SUCCESS=true
222+
break
182223
fi
183-
cleanup_vm_instances "${PROJECT_ID}" "${ZONE}" "${FULL_INSTANCE_PREFIX}"
184224
fi
185-
fi
225+
done
186226

187227
if [[ "${SUCCESS}" == "true" ]]; then
188228
break
189229
fi
190230
done
191231

192232
if [[ "${SUCCESS}" == "true" ]]; then
193-
echo "Deploying in ZONE: ${SELECTED_ZONE}"
233+
echo "Deploying in ZONE: ${SELECTED_ZONE}, MODEL: ${PROVISIONING_MODEL}"
194234
export ZONE="${SELECTED_ZONE}"
235+
export PROVISIONING_MODEL="${PROVISIONING_MODEL}"
195236
else
196237
echo "--- DEPLOYMENT FAILED(Couldn't find a zone to deploy) ---" >&2
197238
exit 1

0 commit comments

Comments
 (0)