Commit db72058

Merge pull request #321 from DataBiosphere/0.5.2
porting over changes from verily-src/dsub 0.5.2
2 parents: 11e3631 + a7ca7ee

11 files changed: +675 -10 lines

docs/code.md

Lines changed: 10 additions & 0 deletions
@@ -8,6 +8,7 @@ container in order to run:
 * --image "Docker image"
 * --input "path to file in cloud storage"
 * --input-recursive "path to directory in cloud storage"
+* --boot-disk-image "Custom boot disk image to use"

 The following explains each option and how to choose which to use.

@@ -231,3 +232,12 @@ chmod u+x ${SCRIPT_DIR}/*.sh
 ${SCRIPT_DIR}/script1.sh
 ${SCRIPT_DIR}/script2.sh
 ```
+
+## --boot-disk-image "Custom boot disk image to use"
+
+For most uses, the underlying virtual machine image is transparent to your tasks; the provider default is sufficient. However, there are cases where you may want to specify a custom boot disk image.
+
+When using GPU accelerators, an image with GPU drivers is needed. The `google-batch` provider will automatically select the `batch-debian` image when `--accelerator-type` and `--accelerator-count` are specified.
+
+If your `dsub` task does not have internet access, it may fail as this image will attempt to update the GPU drivers. You may instead pre-build a custom image with the drivers installed, specify it with `--boot-disk-image`, and set `--install-gpu-drivers` to `false`.
+Specifically for the `google-batch` provider, information about available images can be found [here](https://cloud.google.com/batch/docs/view-os-images).
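
A sketch of how the two flags documented above are meant to combine for a job without internet access, assuming a pre-built boot image that already contains the NVIDIA drivers (the project, bucket, image, and boot-image names below are hypothetical placeholders):

  dsub \
    --provider google-batch \
    --project my-cloud-project \
    --regions us-central1 \
    --logging gs://my-bucket/logging/ \
    --image my-registry/my-gpu-tool:latest \
    --accelerator-type nvidia-tesla-t4 \
    --accelerator-count 1 \
    --boot-disk-image my-custom-gpu-boot-image \
    --install-gpu-drivers false \
    --command 'nvidia-smi' \
    --wait

With `--install-gpu-drivers false`, the job should skip the driver download that would otherwise require internet access; with `--boot-disk-image` left unset, a GPU job would instead fall back to the `batch-debian` default described above.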

dsub/_dsub_version.py

Lines changed: 3 additions & 1 deletion
@@ -26,4 +26,6 @@
   0.1.3.dev0 -> 0.1.3 -> 0.1.4.dev0 -> ...
 """

-DSUB_VERSION = '0.5.1'
+
+DSUB_VERSION = '0.5.2.dev0'
+

dsub/commands/dsub.py

Lines changed: 17 additions & 1 deletion
@@ -501,6 +501,20 @@ def _parse_arguments(prog, argv):
           following third-party software onto your job's Compute Engine
           instances: NVIDIA(R) Tesla(R) drivers and NVIDIA(R) CUDA toolkit.
           (default: 0)""")
+  google_common.add_argument(
+      '--boot-disk-image',
+      help="""Custom boot disk image to use (e.g., a deeplearning image with
+          GPU drivers pre-installed). If not specified and an accelerator is
+          present, the `google-batch` provider defaults to 'batch-debian'.
+          (default: None)""")
+  google_common.add_argument(
+      '--install-gpu-drivers',
+      type=lambda x: {'true': True, 'false': False}[x.lower()],
+      default=None,
+      help="""Whether to install GPU drivers. Defaults to true when an
+          accelerator is present. Set to false when
+          using images with pre-installed drivers. Valid values: true, false.
+          (default: auto-detect)""")
   google_common.add_argument(
       '--credentials-file',
       type=str,
@@ -645,7 +659,9 @@ def _get_job_resources(args):
       enable_stackdriver_monitoring=args.enable_stackdriver_monitoring,
       max_retries=args.retries,
       max_preemptible_attempts=args.preemptible,
-      block_external_network=args.block_external_network)
+      block_external_network=args.block_external_network,
+      boot_disk_image=args.boot_disk_image,
+      install_gpu_drivers=args.install_gpu_drivers)


 def _get_job_metadata(provider, user_id, job_name, script, task_ids,
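
One note on the new `--install-gpu-drivers` flag added above: its type converter only accepts the case-insensitive strings true and false, and any other value would raise a KeyError while arguments are parsed. An illustrative (hypothetical) set of values, with all other flags omitted:

  --install-gpu-drivers true     # accepted
  --install-gpu-drivers FALSE    # accepted (value is lowercased before lookup)
  --install-gpu-drivers yes      # rejected: not in the true/false map

Leaving the flag off entirely keeps the default of None, which lets the provider decide based on whether an accelerator was requested.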

dsub/lib/job_model.py

Lines changed: 9 additions & 2 deletions
@@ -445,6 +445,8 @@ class Resources(
         'max_retries',
         'max_preemptible_attempts',
         'block_external_network',
+        'boot_disk_image',
+        'install_gpu_drivers',
     ])):
   """Job resource parameters related to CPUs, memory, and disk.

@@ -484,6 +486,8 @@ class Resources(
       representing always preemtible.
     block_external_network (bool): Prevents the containers from accessing the
       external network.
+    boot_disk_image (str): Custom boot disk image to use
+    install_gpu_drivers (bool): Whether to install GPU drivers.
   """
   __slots__ = ()

@@ -515,7 +519,9 @@ def __new__(cls,
              enable_stackdriver_monitoring=None,
              max_retries=None,
              max_preemptible_attempts=None,
-              block_external_network=None):
+              block_external_network=None,
+              boot_disk_image=None,
+              install_gpu_drivers=None):
    return super(Resources,
                 cls).__new__(cls, min_cores, min_ram, machine_type, disk_size,
                              disk_type, boot_disk_size, preemptible, image,
@@ -525,7 +531,8 @@ def __new__(cls,
                              accelerator_count, nvidia_driver_version, timeout,
                              log_interval, ssh, enable_stackdriver_monitoring,
                              max_retries, max_preemptible_attempts,
-                              block_external_network)
+                              block_external_network, boot_disk_image,
+                              install_gpu_drivers)


 def ensure_job_params_are_complete(job_params):

dsub/providers/google_batch.py

Lines changed: 27 additions & 3 deletions
@@ -698,6 +698,7 @@ def _create_batch_request(
             entrypoint='/bin/bash',
             volumes=[f'{_VOLUME_MOUNT_POINT}:{_DATA_MOUNT_POINT}'],
             commands=['-c', continuous_logging_cmd],
+            options=None
         )
     )

@@ -711,6 +712,7 @@
             entrypoint='/bin/bash',
             volumes=[f'{_VOLUME_MOUNT_POINT}:{_DATA_MOUNT_POINT}'],
             commands=['-c', prepare_command],
+            options=None
         )
     )

@@ -732,12 +734,15 @@
                     cp_loop=google_utils.LOCALIZATION_LOOP,
                 ),
             ],
+            options=None
         )
     )

     user_command_volumes = [f'{_VOLUME_MOUNT_POINT}:{_DATA_MOUNT_POINT}']
     for gcs_volume in self._get_gcs_volumes_for_user_command(mounts):
       user_command_volumes.append(gcs_volume)
+    # Add --gpus all option for GPU-enabled containers
+    container_options = "--gpus all" if job_resources.accelerator_type and job_resources.accelerator_type.startswith('nvidia') else None
     runnables.append(
         # user-command
         google_batch_operations.build_runnable(
@@ -756,6 +761,7 @@
                     user_script=script_path,
                 ),
             ],
+            options=container_options,
         )
     )

@@ -777,6 +783,7 @@
                     cp_loop=google_utils.DELOCALIZATION_LOOP,
                 ),
             ],
+            options=None
         )
     )

@@ -790,6 +797,7 @@
             entrypoint='/bin/bash',
             volumes=[f'{_VOLUME_MOUNT_POINT}:{_DATA_MOUNT_POINT}'],
             commands=['-c', logging_cmd],
+            options=None
         ),
     )

@@ -800,13 +808,23 @@
     boot_disk_size = (
         job_resources.boot_disk_size if job_resources.boot_disk_size else 0
     )
+    # Determine boot disk image: use user-specified value, or default to batch-debian for GPU jobs
+    if job_resources.boot_disk_image:
+      boot_disk_image = job_resources.boot_disk_image
+    elif job_resources.accelerator_type and job_resources.accelerator_type.startswith('nvidia'):
+      boot_disk_image = "batch-debian"
+    else:
+      boot_disk_image = None
+
     boot_disk = google_batch_operations.build_persistent_disk(
         size_gb=max(boot_disk_size, job_model.LARGE_BOOT_DISK_SIZE),
         disk_type=job_model.DEFAULT_DISK_TYPE,
+        image=boot_disk_image,
     )
     disk = google_batch_operations.build_persistent_disk(
         size_gb=job_resources.disk_size,
         disk_type=job_resources.disk_type or job_model.DEFAULT_DISK_TYPE,
+        image=None
     )
     attached_disk = google_batch_operations.build_attached_disk(
         disk=disk, device_name=google_utils.DATA_DISK_NAME
@@ -834,11 +852,17 @@
         provisioning_model=self._get_provisioning_model(task_resources),
     )

+    # Determine whether to install GPU drivers: use user-specified value, or default to True for GPU jobs
+    if job_resources.install_gpu_drivers is not None:
+      install_gpu_drivers = job_resources.install_gpu_drivers
+    elif job_resources.accelerator_type is not None:
+      install_gpu_drivers = True
+    else:
+      install_gpu_drivers = False
+
     ipt = google_batch_operations.build_instance_policy_or_template(
         instance_policy=instance_policy,
-        install_gpu_drivers=True
-        if job_resources.accelerator_type is not None
-        else False,
+        install_gpu_drivers=install_gpu_drivers,
     )

     if job_resources.service_account:
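
Taken together, the two defaulting blocks above mean a GPU job needs no image-related flags at all. A hypothetical minimal `google-batch` submission (project, bucket, and image names are placeholders) where the provider is expected to select the `batch-debian` boot image and install drivers automatically, because neither `--boot-disk-image` nor `--install-gpu-drivers` is supplied:

  dsub \
    --provider google-batch \
    --project my-cloud-project \
    --regions us-central1 \
    --logging gs://my-bucket/logging/ \
    --image nvidia/cuda:12.2.0-base-ubuntu22.04 \
    --accelerator-type nvidia-tesla-t4 \
    --accelerator-count 1 \
    --command 'nvidia-smi' \
    --wait

Because the accelerator type starts with "nvidia", the user-command container should also receive the "--gpus all" option set a few hunks earlier.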

dsub/providers/google_batch_operations.py

Lines changed: 7 additions & 3 deletions
@@ -211,13 +211,14 @@ def build_task_group(


 def build_container(
-    image_uri: str, entrypoint: str, volumes: List[str], commands: List[str]
+    image_uri: str, entrypoint: str, volumes: List[str], commands: List[str], options: Optional[str]
 ) -> batch_v1.types.task.Runnable.Container:
   container = batch_v1.types.task.Runnable.Container()
   container.image_uri = image_uri
   container.entrypoint = entrypoint
   container.commands = commands
   container.volumes = volumes
+  container.options = options
   return container


@@ -229,6 +230,7 @@ def build_runnable(
     entrypoint: str,
     volumes: List[str],
     commands: List[str],
+    options: Optional[str],
 ) -> batch_v1.types.task.Runnable:
   """Build a Runnable object for a Batch request.

@@ -241,11 +243,12 @@
     entrypoint (str): Docker image entrypoint path
     volumes (List[str]): List of volume mounts (host_path:container_path)
     commands (List[str]): Command arguments to pass to the entrypoint
+    options (str): Container options such as "--gpus all"

   Returns:
     An object representing a Runnable
   """
-  container = build_container(image_uri, entrypoint, volumes, commands)
+  container = build_container(image_uri, entrypoint, volumes, commands, options)
   runnable = batch_v1.Runnable()
   runnable.container = container
   runnable.background = run_in_background
@@ -401,11 +404,12 @@


 def build_persistent_disk(
-    size_gb: int, disk_type: str
+    size_gb: int, disk_type: str, image: str
 ) -> batch_v1.types.AllocationPolicy.Disk:
   disk = batch_v1.AllocationPolicy.Disk()
   disk.type = disk_type
   disk.size_gb = size_gb
+  disk.image = image
   return disk


Lines changed: 124 additions & 0 deletions
@@ -0,0 +1,124 @@
+#!/bin/bash
+
+# Copyright 2025 Verily Life Sciences Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -o errexit
+set -o nounset
+
+# Test GPU support in Google Batch provider.
+# Validates that NVIDIA accelerators trigger:
+#   1. --gpus all container option
+#   2. batch-debian boot disk image
+#   3. Actual GPU access in the running container
+#
+# Required environment variables:
+#   DOCKER_IMAGE - Google Artifact Registry image with GPU support
+#     Example: us-central1-docker.pkg.dev/my-project/my-repo/parabricks:latest
+#   PET_SA_EMAIL - Service account with access to GAR image and GPU resources
+#     Example: my-service-account@my-project.iam.gserviceaccount.com
+#
+# Optional environment variables (for VPC-SC or custom networking):
+#   GPU_NETWORK - Network configuration
+#     Example: projects/my-project/global/networks/my-network
+#   GPU_SUBNETWORK - Subnetwork configuration
+#     Example: projects/my-project/regions/us-central1/subnetworks/my-subnet
+#   GPU_USE_PRIVATE_ADDRESS - Set to any value to use private address
+
+readonly SCRIPT_DIR="$(dirname "${0}")"
+
+# Do standard test setup
+source "${SCRIPT_DIR}/test_setup_e2e.sh"
+
+# Check GPU-specific prerequisites
+if [[ -z "${DOCKER_IMAGE:-}" ]]; then
+  1>&2 echo "ERROR: DOCKER_IMAGE environment variable is not set."
+  1>&2 echo "This test requires a Google Artifact Registry image with GPU support."
+  1>&2 echo "Set it with: export DOCKER_IMAGE='REGION-docker.pkg.dev/PROJECT/REPO/IMAGE:TAG'"
+  1>&2 echo "Example: export DOCKER_IMAGE='us-central1-docker.pkg.dev/my-project/my-repo/parabricks:latest'"
+  exit 1
+fi
+
+if [[ -z "${PET_SA_EMAIL:-}" ]]; then
+  1>&2 echo "ERROR: PET_SA_EMAIL environment variable is not set."
+  1>&2 echo "This test requires a service account with access to the GAR image and GPU resources."
+  1>&2 echo "Set it with: export PET_SA_EMAIL='my-service-account@my-project.iam.gserviceaccount.com'"
+  exit 1
+fi
+
+echo "Launching GPU pipeline with Google Batch provider..."
+echo "  Using image: ${DOCKER_IMAGE}"
+echo "  Using service account: ${PET_SA_EMAIL}"
+
+# Test nvidia accelerator enables GPU features
+# Uses DOCKER_IMAGE and PET_SA_EMAIL environment variables (required)
+# Optionally uses GPU_NETWORK, GPU_SUBNETWORK, and GPU_USE_PRIVATE_ADDRESS if set
+run_dsub \
+  --provider 'google-batch' \
+  --image "${DOCKER_IMAGE}" \
+  --service-account "${PET_SA_EMAIL}" \
+  ${GPU_NETWORK:+--network "${GPU_NETWORK}"} \
+  ${GPU_SUBNETWORK:+--subnetwork "${GPU_SUBNETWORK}"} \
+  ${GPU_USE_PRIVATE_ADDRESS:+--use-private-address} \
+  --accelerator-type 'nvidia-tesla-t4' \
+  --accelerator-count 1 \
+  --env NVIDIA_VISIBLE_DEVICES=all \
+  --command '\
+    echo "=== GPU Detection Test ===" && \
+    nvidia-smi && \
+    echo "=== Boot Image Test ===" && \
+    cat /etc/os-release | grep "ID=" && \
+    echo "=== Container GPU Access Test ===" && \
+    nvidia-smi -L' \
+  --wait
+
+echo
+echo "Checking GPU detection output..."
+
+# Check that GPU was detected and accessible
+RESULT="$(gsutil cat "${STDOUT_LOG}")"
+
+# Validate GPU hardware was detected
+if ! echo "${RESULT}" | grep -qi "Tesla T4"; then
+  1>&2 echo "ERROR: Tesla T4 GPU not detected in nvidia-smi output!"
+  1>&2 echo "stdout content:"
+  1>&2 echo "${RESULT}"
+  exit 1
+fi
+
+# Validate GPU memory info is present
+if ! echo "${RESULT}" | grep -qi "GPU.*Memory"; then
+  1>&2 echo "ERROR: GPU Memory information not found!"
+  1>&2 echo "stdout content:"
+  1>&2 echo "${RESULT}"
+  exit 1
+fi
+
+# Validate container has GPU access (nvidia-smi -L should list GPUs)
+if ! echo "${RESULT}" | grep -qi "GPU 0:"; then
+  1>&2 echo "ERROR: Container does not have GPU access (nvidia-smi -L failed)!"
+  1>&2 echo "stdout content:"
+  1>&2 echo "${RESULT}"
+  exit 1
+fi
+
+echo
+echo "GPU test output (showing GPU was accessible):"
+echo "*****************************"
+echo "${RESULT}"
+echo "*****************************"
+echo "SUCCESS: GPU accelerator test passed!"
+echo "- GPU hardware detected"
+echo "- Container has GPU access"
+echo "- batch-debian image used (implied by successful GPU access)"
