Skip to content

Commit 7e52b5f

Browse files
committed
feat: Enable spark-rapids on Dataproc 2.1 Rocky Linux 8
This commit integrates changes to enable the spark-rapids initialization action on Dataproc 2.1-rocky8 images.

- Updates the NVIDIA driver installation process in `spark-rapids.sh` for Rocky Linux:
  - Uses `curl` with retry and fail-fast options for downloading the CUDA installer.
  - Executes the NVIDIA installer with `--silent --driver --toolkit --no-opengl-libs` flags and wraps it in `execute_with_retries`.
- Modifies `test_spark_rapids.py` to enable tests for Rocky Linux on Dataproc 2.1 and below, while keeping them skipped for 2.2+ (Rocky 9).

This resolves the installation issues on Rocky 8. Further work is required to support Rocky 9 (Dataproc 2.2).
1 parent d8e2156 commit 7e52b5f

File tree

2 files changed

+8
-8
lines changed

2 files changed

+8
-8
lines changed

spark-rapids/spark-rapids.sh

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -505,11 +505,11 @@ function install_nvidia_gpu_driver() {
505505
execute_with_retries "dnf install -y kernel-devel-$(uname -r) kernel-headers-$(uname -r)"
506506

507507
# Download the CUDA installer run file
508-
curl -o driver.run \
508+
curl -fsSL --retry-connrefused --retry 3 --retry-max-time 30 -o driver.run \
509509
"https://developer.download.nvidia.com/compute/cuda/${CUDA_VERSION}/local_installers/cuda_${CUDA_VERSION}_${NVIDIA_DRIVER_VERSION}_linux.run"
510510

511511
# Run the installer in silent mode
512-
bash driver.run --silent
512+
execute_with_retries "bash driver.run --silent --driver --toolkit --no-opengl-libs"
513513

514514
# Remove the installer file after installation to clean up
515515
rm driver.run

spark-rapids/test_spark_rapids.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -58,8 +58,8 @@ def verify_spark_job_sql(self):
5858
("STANDARD", ["w-0"], GPU_T4))
5959
def test_spark_rapids(self, configuration, machine_suffixes, accelerator):
6060

61-
if self.getImageOs() == "rocky":
62-
self.skipTest("Not supported for Rocky OS")
61+
if self.getImageVersion() > pkg_resources.parse_version("2.0") and self.getImageOs() == "rocky":
62+
self.skipTest("Not supported for Rocky 9")
6363

6464
if self.getImageVersion() <= pkg_resources.parse_version("2.0"):
6565
self.skipTest("Not supported in 2.0 and earlier images")
@@ -88,8 +88,8 @@ def test_spark_rapids(self, configuration, machine_suffixes, accelerator):
8888
("STANDARD", ["w-0"], GPU_T4))
8989
def test_spark_rapids_sql(self, configuration, machine_suffixes, accelerator):
9090

91-
if self.getImageOs() == "rocky":
92-
self.skipTest("Not supported for Rocky OS")
91+
if self.getImageVersion() > pkg_resources.parse_version("2.0") and self.getImageOs() == "rocky":
92+
self.skipTest("Not supported for Rocky 9")
9393

9494
if self.getImageVersion() <= pkg_resources.parse_version("2.0"):
9595
self.skipTest("Not supported in 2.0 and earlier images")
@@ -118,8 +118,8 @@ def test_spark_rapids_sql(self, configuration, machine_suffixes, accelerator):
118118
def test_non_default_cuda_versions(self, configuration, machine_suffixes,
119119
accelerator, cuda_version, driver_version):
120120

121-
if self.getImageOs() == "rocky":
122-
self.skipTest("Not supported for Rocky OS")
121+
if self.getImageVersion() > pkg_resources.parse_version("2.0") and self.getImageOs() == "rocky":
122+
self.skipTest("Not supported for Rocky 9")
123123

124124
if self.getImageVersion() <= pkg_resources.parse_version("2.0"):
125125
self.skipTest("Not supported in 2.0 and earlier images")

0 commit comments

Comments
 (0)