@@ -226,8 +226,6 @@ def _get_user_data_script(
226226 """
227227 ).strip ()
228228
229- platform_logger = ':' # no-op; GCP logging is via stdout + log file
230-
231229 gpu_wait_block = ''
232230 if gpu_expected and assert_gpu :
233231 gpu_wait_block = textwrap .dedent (
@@ -287,7 +285,6 @@ def _get_user_data_script(
287285 msg="$*"
288286 echo "$msg"
289287 echo "$msg" | sudo tee -a "$LOG_FILE" >/dev/null
290- { platform_logger } <<<"$msg" >/dev/null 2>&1 || true
291288 }}
292289
293290 run() {{
@@ -296,6 +293,11 @@ def _get_user_data_script(
296293 return ${{PIPESTATUS[0]}}
297294 }}
298295
296+ run_secret() {{
297+ "$@" 2>&1 | sudo tee -a "$LOG_FILE"
298+ return ${{PIPESTATUS[0]}}
299+ }}
300+
299301 { log_upload_fn }
300302
301303 { delete_fn }
@@ -324,8 +326,8 @@ def _get_user_data_script(
324326 echo '/swapfile none swap sw 0 0' | sudo tee -a /etc/fstab >/dev/null
325327
326328 log "======== Configure AWS CLI =========="
327- run aws configure set aws_access_key_id '{ aws_key } '
328- run aws configure set aws_secret_access_key '{ aws_secret } '
329+ run_secret aws configure set aws_access_key_id '{ aws_key } '
330+ run_secret aws configure set aws_secret_access_key '{ aws_secret } '
329331 run aws configure set default.region '{ S3_REGION } '
330332
331333 log "======== Create Virtual Environment =========="
@@ -383,7 +385,8 @@ def _run_on_gcp(
383385
384386 instance_name = _make_instance_name (config ['name_prefix' ])
385387 print ( # noqa: T201
386- f'Launching instance: { instance_name } (service=gcp project={ gcp_project } zone={ gcp_zone } )'
388+ f'Launching instance: { instance_name } '
389+ f'(service=gcp project={ gcp_project } zone={ gcp_zone } )'
387390 )
388391
389392 startup_script = _get_user_data_script (
@@ -398,7 +401,9 @@ def _run_on_gcp(
398401 source_disk_image = config ['source_image' ]
399402
400403 gpu = compute_v1 .AcceleratorConfig (
401- accelerator_type = (f'zones/{ gcp_zone } /acceleratorTypes/{ config ["gpu_type" ]} ' ),
404+ accelerator_type = (
405+ f'zones/{ gcp_zone } /acceleratorTypes/{ config ["gpu_type" ]} '
406+ ),
402407 accelerator_count = int (config ['gpu_count' ]),
403408 )
404409
@@ -449,11 +454,25 @@ def _run_on_gcp(
449454 )
450455
451456 instance_client = compute_v1 .InstancesClient (credentials = gcp_creds )
452- instance_client .insert (
457+ operation = instance_client .insert (
453458 project = gcp_project ,
454459 zone = gcp_zone ,
455460 instance_resource = instance ,
456461 )
462+
463+ op_client = compute_v1 .ZoneOperationsClient (credentials = gcp_creds )
464+ operation = op_client .wait (
465+ project = gcp_project ,
466+ zone = gcp_zone ,
467+ operation = operation .name ,
468+ )
469+
470+ if operation .error and operation .error .errors :
471+ messages = [e .message for e in operation .error .errors if e .message ]
472+ joined = '; ' .join (messages ) if messages else str (operation .error )
473+ raise RuntimeError (f'GCP instance creation failed: { joined } ' )
474+
475+ print (f'Instance created: { instance_name } ' ) # noqa: T201
457476 return instance_name
458477
459478
0 commit comments