Skip to content

Commit a66730b

Browse files
committed
update gcp machine
1 parent 5896701 commit a66730b

File tree

2 files changed

+40
-16
lines changed

2 files changed

+40
-16
lines changed

sdgym/_benchmark/benchmark.py

Lines changed: 27 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -226,8 +226,6 @@ def _get_user_data_script(
226226
"""
227227
).strip()
228228

229-
platform_logger = ':' # no-op; GCP logging is via stdout + log file
230-
231229
gpu_wait_block = ''
232230
if gpu_expected and assert_gpu:
233231
gpu_wait_block = textwrap.dedent(
@@ -287,7 +285,6 @@ def _get_user_data_script(
287285
msg="$*"
288286
echo "$msg"
289287
echo "$msg" | sudo tee -a "$LOG_FILE" >/dev/null
290-
{platform_logger} <<<"$msg" >/dev/null 2>&1 || true
291288
}}
292289
293290
run() {{
@@ -296,6 +293,11 @@ def _get_user_data_script(
296293
return ${{PIPESTATUS[0]}}
297294
}}
298295
296+
run_secret() {{
297+
"$@" 2>&1 | sudo tee -a "$LOG_FILE"
298+
return ${{PIPESTATUS[0]}}
299+
}}
300+
299301
{log_upload_fn}
300302
301303
{delete_fn}
@@ -324,8 +326,8 @@ def _get_user_data_script(
324326
echo '/swapfile none swap sw 0 0' | sudo tee -a /etc/fstab >/dev/null
325327
326328
log "======== Configure AWS CLI =========="
327-
run aws configure set aws_access_key_id '{aws_key}'
328-
run aws configure set aws_secret_access_key '{aws_secret}'
329+
run_secret aws configure set aws_access_key_id '{aws_key}'
330+
run_secret aws configure set aws_secret_access_key '{aws_secret}'
329331
run aws configure set default.region '{S3_REGION}'
330332
331333
log "======== Create Virtual Environment =========="
@@ -383,7 +385,8 @@ def _run_on_gcp(
383385

384386
instance_name = _make_instance_name(config['name_prefix'])
385387
print( # noqa: T201
386-
f'Launching instance: {instance_name} (service=gcp project={gcp_project} zone={gcp_zone})'
388+
f'Launching instance: {instance_name} '
389+
f'(service=gcp project={gcp_project} zone={gcp_zone})'
387390
)
388391

389392
startup_script = _get_user_data_script(
@@ -398,7 +401,9 @@ def _run_on_gcp(
398401
source_disk_image = config['source_image']
399402

400403
gpu = compute_v1.AcceleratorConfig(
401-
accelerator_type=(f'zones/{gcp_zone}/acceleratorTypes/{config["gpu_type"]}'),
404+
accelerator_type=(
405+
f'zones/{gcp_zone}/acceleratorTypes/{config["gpu_type"]}'
406+
),
402407
accelerator_count=int(config['gpu_count']),
403408
)
404409

@@ -449,11 +454,25 @@ def _run_on_gcp(
449454
)
450455

451456
instance_client = compute_v1.InstancesClient(credentials=gcp_creds)
452-
instance_client.insert(
457+
operation = instance_client.insert(
453458
project=gcp_project,
454459
zone=gcp_zone,
455460
instance_resource=instance,
456461
)
462+
463+
op_client = compute_v1.ZoneOperationsClient(credentials=gcp_creds)
464+
operation = op_client.wait(
465+
project=gcp_project,
466+
zone=gcp_zone,
467+
operation=operation.name,
468+
)
469+
470+
if operation.error and operation.error.errors:
471+
messages = [e.message for e in operation.error.errors if e.message]
472+
joined = '; '.join(messages) if messages else str(operation.error)
473+
raise RuntimeError(f'GCP instance creation failed: {joined}')
474+
475+
print(f'Instance created: {instance_name}') # noqa: T201
457476
return instance_name
458477

459478

sdgym/_benchmark/config_utils.py

Lines changed: 13 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -2,23 +2,28 @@
22
'common': {
33
'swap_gb': 32,
44
'disk_size_gb': 100,
5-
'sdgym_install': 'sdgym[all] @ git+https://github.com/sdv-dev/SDGym.git@gcp-benchmark-romain',
5+
'sdgym_install': (
6+
'sdgym[all] @ git+https://github.com/sdv-dev/SDGym.git@gcp-benchmark-romain'
7+
),
68
'install_s3fs': True,
7-
'assert_gpu': True, # if GPU is expected, fail if not available
9+
'assert_gpu': True,
810
'gpu_wait_seconds': 10 * 60,
911
'gpu_wait_interval_seconds': 10,
1012
'upload_logs_to_s3': True,
1113
},
1214
'gcp': {
1315
'name_prefix': 'sdgym-run',
14-
'machine_type': 'g2-standard-16',
15-
'source_image': 'projects/debian-cloud/global/images/family/debian-12',
16-
'gpu_type': 'nvidia-l4',
16+
'machine_type': 'n1-standard-8',
17+
'source_image': (
18+
'projects/deeplearning-platform-release/global/images/family/'
19+
'common-cu128-ubuntu-2204-nvidia-570'
20+
),
21+
'gpu_type': 'nvidia-tesla-t4',
1722
'gpu_count': 1,
18-
'install_nvidia_driver': True,
23+
'install_nvidia_driver': False, # DLVM already has drivers/tooling
1924
'delete_on_success': True,
20-
'delete_on_error': True, # you can make this False if you prefer
21-
'stop_fallback': True, # if delete fails, shutdown
25+
'delete_on_error': True,
26+
'stop_fallback': True,
2227
},
2328
'aws': {
2429
'name_prefix': 'sdgym-run',

0 commit comments

Comments
 (0)