Commit db72058

Merge pull request #321 from DataBiosphere/0.5.2
porting over changes from verily-src/dsub 0.5.2
2 parents: 11e3631 + a7ca7ee

11 files changed: +675 -10 lines

docs/code.md

Lines changed: 10 additions & 0 deletions
@@ -8,6 +8,7 @@ container in order to run:
 * --image "Docker image"
 * --input "path to file in cloud storage"
 * --input-recursive "path to directory in cloud storage"
+* --boot-disk-image "Custom boot disk image to use"

 The following explains each option and how to choose which to use.

@@ -231,3 +232,12 @@ chmod u+x ${SCRIPT_DIR}/*.sh
 ${SCRIPT_DIR}/script1.sh
 ${SCRIPT_DIR}/script2.sh
 ```
+
+## --boot-disk-image "Custom boot disk image to use"
+
+For most uses, the underlying virtual machine image is transparent to your tasks; the provider default is sufficient. However, there are cases where you may want to specify a custom boot disk image.
+
+When using GPU accelerators, an image with GPU drivers is needed. The `google-batch` provider will automatically select the `batch-debian` image when `--accelerator-type` and `--accelerator-count` are specified.
+
+If your `dsub` task does not have internet access, it may fail as this image will attempt to update the GPU drivers. You may instead pre-build a custom image with the drivers installed, specify it with `--boot-disk-image`, and set `--install-gpu-drivers` to `false`.
+Specifically for the `google-batch` provider, information about available images can be found [here](https://cloud.google.com/batch/docs/view-os-images).
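
A sketch of how the two flags documented above are meant to combine for a job without internet access, assuming a pre-built boot image that already contains the NVIDIA drivers (the project, bucket, image, and boot-image names below are hypothetical placeholders):

  dsub \
    --provider google-batch \
    --project my-cloud-project \
    --regions us-central1 \
    --logging gs://my-bucket/logging/ \
    --image my-registry/my-gpu-tool:latest \
    --accelerator-type nvidia-tesla-t4 \
    --accelerator-count 1 \
    --boot-disk-image my-custom-gpu-boot-image \
    --install-gpu-drivers false \
    --command 'nvidia-smi' \
    --wait

With `--install-gpu-drivers false`, the job should skip the driver download that would otherwise require internet access; with `--boot-disk-image` left unset, a GPU job would instead fall back to the `batch-debian` default described above.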

dsub/_dsub_version.py

Lines changed: 3 additions & 1 deletion
@@ -26,4 +26,6 @@
   0.1.3.dev0 -> 0.1.3 -> 0.1.4.dev0 -> ...
 """

-DSUB_VERSION = '0.5.1'
+
+DSUB_VERSION = '0.5.2.dev0'
+

dsub/commands/dsub.py

Lines changed: 17 additions & 1 deletion
@@ -501,6 +501,20 @@ def _parse_arguments(prog, argv):
           following third-party software onto your job's Compute Engine
           instances: NVIDIA(R) Tesla(R) drivers and NVIDIA(R) CUDA toolkit.
           (default: 0)""")
+  google_common.add_argument(
+      '--boot-disk-image',
+      help="""Custom boot disk image to use (e.g., a deeplearning image with
+          GPU drivers pre-installed). If not specified and an accelerator is
+          present, the `google-batch` provider defaults to 'batch-debian'.
+          (default: None)""")
+  google_common.add_argument(
+      '--install-gpu-drivers',
+      type=lambda x: {'true': True, 'false': False}[x.lower()],
+      default=None,
+      help="""Whether to install GPU drivers. Defaults to true when an
+          accelerator is present. Set to false when
+          using images with pre-installed drivers. Valid values: true, false.
+          (default: auto-detect)""")
   google_common.add_argument(
       '--credentials-file',
       type=str,
@@ -645,7 +659,9 @@ def _get_job_resources(args):
       enable_stackdriver_monitoring=args.enable_stackdriver_monitoring,
       max_retries=args.retries,
       max_preemptible_attempts=args.preemptible,
-      block_external_network=args.block_external_network)
+      block_external_network=args.block_external_network,
+      boot_disk_image=args.boot_disk_image,
+      install_gpu_drivers=args.install_gpu_drivers)


 def _get_job_metadata(provider, user_id, job_name, script, task_ids,
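
One note on the new `--install-gpu-drivers` flag added above: its type converter only accepts the case-insensitive strings true and false, and any other value would raise a KeyError while arguments are parsed. An illustrative (hypothetical) set of values, with all other flags omitted:

  --install-gpu-drivers true     # accepted
  --install-gpu-drivers FALSE    # accepted (value is lowercased before lookup)
  --install-gpu-drivers yes      # rejected: not in the true/false map

Leaving the flag off entirely keeps the default of None, which lets the provider decide based on whether an accelerator was requested.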

dsub/lib/job_model.py

Lines changed: 9 additions & 2 deletions
@@ -445,6 +445,8 @@ class Resources(
         'max_retries',
         'max_preemptible_attempts',
         'block_external_network',
+        'boot_disk_image',
+        'install_gpu_drivers',
     ])):
   """Job resource parameters related to CPUs, memory, and disk.

@@ -484,6 +486,8 @@ class Resources(
       representing always preemtible.
     block_external_network (bool): Prevents the containers from accessing the
       external network.
+    boot_disk_image (str): Custom boot disk image to use
+    install_gpu_drivers (bool): Whether to install GPU drivers.
   """
   __slots__ = ()

@@ -515,7 +519,9 @@ def __new__(cls,
              enable_stackdriver_monitoring=None,
              max_retries=None,
              max_preemptible_attempts=None,
-              block_external_network=None):
+              block_external_network=None,
+              boot_disk_image=None,
+              install_gpu_drivers=None):
    return super(Resources,
                 cls).__new__(cls, min_cores, min_ram, machine_type, disk_size,
                              disk_type, boot_disk_size, preemptible, image,
@@ -525,7 +531,8 @@ def __new__(cls,
                              accelerator_count, nvidia_driver_version, timeout,
                              log_interval, ssh, enable_stackdriver_monitoring,
                              max_retries, max_preemptible_attempts,
-                              block_external_network)
+                              block_external_network, boot_disk_image,
+                              install_gpu_drivers)


 def ensure_job_params_are_complete(job_params):

dsub/providers/google_batch.py

Lines changed: 27 additions & 3 deletions
@@ -698,6 +698,7 @@ def _create_batch_request(
             entrypoint='/bin/bash',
             volumes=[f'{_VOLUME_MOUNT_POINT}:{_DATA_MOUNT_POINT}'],
             commands=['-c', continuous_logging_cmd],
+            options=None
         )
     )

@@ -711,6 +712,7 @@
             entrypoint='/bin/bash',
             volumes=[f'{_VOLUME_MOUNT_POINT}:{_DATA_MOUNT_POINT}'],
             commands=['-c', prepare_command],
+            options=None
         )
     )

@@ -732,12 +734,15 @@
                     cp_loop=google_utils.LOCALIZATION_LOOP,
                 ),
             ],
+            options=None
         )
     )

     user_command_volumes = [f'{_VOLUME_MOUNT_POINT}:{_DATA_MOUNT_POINT}']
     for gcs_volume in self._get_gcs_volumes_for_user_command(mounts):
       user_command_volumes.append(gcs_volume)
+    # Add --gpus all option for GPU-enabled containers
+    container_options = "--gpus all" if job_resources.accelerator_type and job_resources.accelerator_type.startswith('nvidia') else None
     runnables.append(
         # user-command
         google_batch_operations.build_runnable(
@@ -756,6 +761,7 @@
                     user_script=script_path,
                 ),
             ],
+            options=container_options,
         )
     )

@@ -777,6 +783,7 @@
                     cp_loop=google_utils.DELOCALIZATION_LOOP,
                 ),
             ],
+            options=None
         )
     )

@@ -790,6 +797,7 @@
             entrypoint='/bin/bash',
             volumes=[f'{_VOLUME_MOUNT_POINT}:{_DATA_MOUNT_POINT}'],
             commands=['-c', logging_cmd],
+            options=None
         ),
     )

@@ -800,13 +808,23 @@
     boot_disk_size = (
         job_resources.boot_disk_size if job_resources.boot_disk_size else 0
     )
+    # Determine boot disk image: use user-specified value, or default to batch-debian for GPU jobs
+    if job_resources.boot_disk_image:
+      boot_disk_image = job_resources.boot_disk_image
+    elif job_resources.accelerator_type and job_resources.accelerator_type.startswith('nvidia'):
+      boot_disk_image = "batch-debian"
+    else:
+      boot_disk_image = None
+
     boot_disk = google_batch_operations.build_persistent_disk(
         size_gb=max(boot_disk_size, job_model.LARGE_BOOT_DISK_SIZE),
         disk_type=job_model.DEFAULT_DISK_TYPE,
+        image=boot_disk_image,
     )
     disk = google_batch_operations.build_persistent_disk(
         size_gb=job_resources.disk_size,
         disk_type=job_resources.disk_type or job_model.DEFAULT_DISK_TYPE,
+        image=None
     )
     attached_disk = google_batch_operations.build_attached_disk(
         disk=disk, device_name=google_utils.DATA_DISK_NAME
@@ -834,11 +852,17 @@
         provisioning_model=self._get_provisioning_model(task_resources),
     )

+    # Determine whether to install GPU drivers: use user-specified value, or default to True for GPU jobs
+    if job_resources.install_gpu_drivers is not None:
+      install_gpu_drivers = job_resources.install_gpu_drivers
+    elif job_resources.accelerator_type is not None:
+      install_gpu_drivers = True
+    else:
+      install_gpu_drivers = False
+
     ipt = google_batch_operations.build_instance_policy_or_template(
         instance_policy=instance_policy,
-        install_gpu_drivers=True
-        if job_resources.accelerator_type is not None
-        else False,
+        install_gpu_drivers=install_gpu_drivers,
     )

     if job_resources.service_account:
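
Taken together, the two defaulting blocks above mean a GPU job needs no image-related flags at all. A hypothetical minimal `google-batch` submission (project, bucket, and image names are placeholders) where the provider is expected to select the `batch-debian` boot image and install drivers automatically, because neither `--boot-disk-image` nor `--install-gpu-drivers` is supplied:

  dsub \
    --provider google-batch \
    --project my-cloud-project \
    --regions us-central1 \
    --logging gs://my-bucket/logging/ \
    --image nvidia/cuda:12.2.0-base-ubuntu22.04 \
    --accelerator-type nvidia-tesla-t4 \
    --accelerator-count 1 \
    --command 'nvidia-smi' \
    --wait

Because the accelerator type starts with "nvidia", the user-command container should also receive the "--gpus all" option set a few hunks earlier.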

dsub/providers/google_batch_operations.py

Lines changed: 7 additions & 3 deletions
@@ -211,13 +211,14 @@ def build_task_group(


 def build_container(
-    image_uri: str, entrypoint: str, volumes: List[str], commands: List[str]
+    image_uri: str, entrypoint: str, volumes: List[str], commands: List[str], options: Optional[str]
 ) -> batch_v1.types.task.Runnable.Container:
   container = batch_v1.types.task.Runnable.Container()
   container.image_uri = image_uri
   container.entrypoint = entrypoint
   container.commands = commands
   container.volumes = volumes
+  container.options = options
   return container


@@ -229,6 +230,7 @@ def build_runnable(
     entrypoint: str,
     volumes: List[str],
     commands: List[str],
+    options: Optional[str],
 ) -> batch_v1.types.task.Runnable:
   """Build a Runnable object for a Batch request.

@@ -241,11 +243,12 @@
     entrypoint (str): Docker image entrypoint path
     volumes (List[str]): List of volume mounts (host_path:container_path)
     commands (List[str]): Command arguments to pass to the entrypoint
+    options (str): Container options such as "--gpus all"

   Returns:
     An object representing a Runnable
   """
-  container = build_container(image_uri, entrypoint, volumes, commands)
+  container = build_container(image_uri, entrypoint, volumes, commands, options)
   runnable = batch_v1.Runnable()
   runnable.container = container
   runnable.background = run_in_background
@@ -401,11 +404,12 @@


 def build_persistent_disk(
-    size_gb: int, disk_type: str
+    size_gb: int, disk_type: str, image: str
 ) -> batch_v1.types.AllocationPolicy.Disk:
   disk = batch_v1.AllocationPolicy.Disk()
   disk.type = disk_type
   disk.size_gb = size_gb
+  disk.image = image
   return disk


Lines changed: 124 additions & 0 deletions
@@ -0,0 +1,124 @@
+#!/bin/bash
+
+# Copyright 2025 Verily Life Sciences Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -o errexit
+set -o nounset
+
+# Test GPU support in Google Batch provider.
+# Validates that NVIDIA accelerators trigger:
+#   1. --gpus all container option
+#   2. batch-debian boot disk image
+#   3. Actual GPU access in the running container
+#
+# Required environment variables:
+#   DOCKER_IMAGE - Google Artifact Registry image with GPU support
+#     Example: us-central1-docker.pkg.dev/my-project/my-repo/parabricks:latest
+#   PET_SA_EMAIL - Service account with access to GAR image and GPU resources
+#     Example: my-service-account@my-project.iam.gserviceaccount.com
+#
+# Optional environment variables (for VPC-SC or custom networking):
+#   GPU_NETWORK - Network configuration
+#     Example: projects/my-project/global/networks/my-network
+#   GPU_SUBNETWORK - Subnetwork configuration
+#     Example: projects/my-project/regions/us-central1/subnetworks/my-subnet
+#   GPU_USE_PRIVATE_ADDRESS - Set to any value to use private address
+
+readonly SCRIPT_DIR="$(dirname "${0}")"
+
+# Do standard test setup
+source "${SCRIPT_DIR}/test_setup_e2e.sh"
+
+# Check GPU-specific prerequisites
+if [[ -z "${DOCKER_IMAGE:-}" ]]; then
+  1>&2 echo "ERROR: DOCKER_IMAGE environment variable is not set."
+  1>&2 echo "This test requires a Google Artifact Registry image with GPU support."
+  1>&2 echo "Set it with: export DOCKER_IMAGE='REGION-docker.pkg.dev/PROJECT/REPO/IMAGE:TAG'"
+  1>&2 echo "Example: export DOCKER_IMAGE='us-central1-docker.pkg.dev/my-project/my-repo/parabricks:latest'"
+  exit 1
+fi
+
+if [[ -z "${PET_SA_EMAIL:-}" ]]; then
+  1>&2 echo "ERROR: PET_SA_EMAIL environment variable is not set."
+  1>&2 echo "This test requires a service account with access to the GAR image and GPU resources."
+  1>&2 echo "Set it with: export PET_SA_EMAIL='my-service-account@my-project.iam.gserviceaccount.com'"
+  exit 1
+fi
+
+echo "Launching GPU pipeline with Google Batch provider..."
+echo "  Using image: ${DOCKER_IMAGE}"
+echo "  Using service account: ${PET_SA_EMAIL}"
+
+# Test nvidia accelerator enables GPU features
+# Uses DOCKER_IMAGE and PET_SA_EMAIL environment variables (required)
+# Optionally uses GPU_NETWORK, GPU_SUBNETWORK, and GPU_USE_PRIVATE_ADDRESS if set
+run_dsub \
+  --provider 'google-batch' \
+  --image "${DOCKER_IMAGE}" \
+  --service-account "${PET_SA_EMAIL}" \
+  ${GPU_NETWORK:+--network "${GPU_NETWORK}"} \
+  ${GPU_SUBNETWORK:+--subnetwork "${GPU_SUBNETWORK}"} \
+  ${GPU_USE_PRIVATE_ADDRESS:+--use-private-address} \
+  --accelerator-type 'nvidia-tesla-t4' \
+  --accelerator-count 1 \
+  --env NVIDIA_VISIBLE_DEVICES=all \
+  --command '\
+    echo "=== GPU Detection Test ===" && \
+    nvidia-smi && \
+    echo "=== Boot Image Test ===" && \
+    cat /etc/os-release | grep "ID=" && \
+    echo "=== Container GPU Access Test ===" && \
+    nvidia-smi -L' \
+  --wait
+
+echo
+echo "Checking GPU detection output..."
+
+# Check that GPU was detected and accessible
+RESULT="$(gsutil cat "${STDOUT_LOG}")"
+
+# Validate GPU hardware was detected
+if ! echo "${RESULT}" | grep -qi "Tesla T4"; then
+  1>&2 echo "ERROR: Tesla T4 GPU not detected in nvidia-smi output!"
+  1>&2 echo "stdout content:"
+  1>&2 echo "${RESULT}"
+  exit 1
+fi
+
+# Validate GPU memory info is present
+if ! echo "${RESULT}" | grep -qi "GPU.*Memory"; then
+  1>&2 echo "ERROR: GPU Memory information not found!"
+  1>&2 echo "stdout content:"
+  1>&2 echo "${RESULT}"
+  exit 1
+fi
+
+# Validate container has GPU access (nvidia-smi -L should list GPUs)
+if ! echo "${RESULT}" | grep -qi "GPU 0:"; then
+  1>&2 echo "ERROR: Container does not have GPU access (nvidia-smi -L failed)!"
+  1>&2 echo "stdout content:"
+  1>&2 echo "${RESULT}"
+  exit 1
+fi
+
+echo
+echo "GPU test output (showing GPU was accessible):"
+echo "*****************************"
+echo "${RESULT}"
+echo "*****************************"
+echo "SUCCESS: GPU accelerator test passed!"
+echo "- GPU hardware detected"
+echo "- Container has GPU access"
+echo "- batch-debian image used (implied by successful GPU access)"
