From a9e0e9676ac25683e98ccddd9429a73f7249aa44 Mon Sep 17 00:00:00 2001 From: Ting Lu Date: Tue, 4 Nov 2025 00:36:46 -0800 Subject: [PATCH 1/6] Fix driver installation for ubuntu image --- .github/actions/setup-nvidia/action.yml | 32 +++++++++++++++---------- 1 file changed, 19 insertions(+), 13 deletions(-) diff --git a/.github/actions/setup-nvidia/action.yml b/.github/actions/setup-nvidia/action.yml index 6d7adc8e70..7eab42206b 100644 --- a/.github/actions/setup-nvidia/action.yml +++ b/.github/actions/setup-nvidia/action.yml @@ -49,7 +49,7 @@ runs: ) } - install_nvidia_docker2_ubuntu20() { + install_nvidia_docker2_ubuntu() { ( set -x # Install nvidia-driver package if not installed @@ -101,13 +101,19 @@ runs: if [ "$HAS_NVIDIA_DRIVER" -eq 0 ]; then # CAUTION: this may need to be updated in future - if [ "${DISTRIBUTION}" != ubuntu20.04 ]; then - sudo yum groupinstall -y "Development Tools" - # ensure our kernel install is the same as our underlying kernel, - # groupinstall "Development Tools" has a habit of mismatching kernel headers - sudo yum install -y "kernel-devel-uname-r == $(uname -r)" - sudo modprobe backlight - fi + case "${DISTRIBUTION}" in + ubuntu*) + # Ubuntu doesn't need yum packages + ;; + *) + # Amazon Linux and others need Development Tools + sudo yum groupinstall -y "Development Tools" + # ensure our kernel install is the same as our underlying kernel, + # groupinstall "Development Tools" has a habit of mismatching kernel headers + sudo yum install -y "kernel-devel-uname-r == $(uname -r)" + sudo modprobe backlight + ;; + esac sudo curl -fsL -o /tmp/nvidia_driver "https://s3.amazonaws.com/ossci-linux/nvidia_driver/$DRIVER_FN" set +e @@ -214,7 +220,7 @@ runs: ) } - install_nvidia_driver_ubuntu20() { + install_nvidia_driver_ubuntu() { ( set -x install_nvidia_driver_common @@ -227,8 +233,8 @@ runs: amzn*) install_nvidia_driver_amzn2 ;; - ubuntu20.04) - install_nvidia_driver_ubuntu20 + ubuntu*) + install_nvidia_driver_ubuntu ;; *) echo "ERROR: Unknown 
distribution ${DISTRIBUTION}" @@ -242,8 +248,8 @@ runs: amzn*) install_nvidia_docker2_amzn2 ;; - ubuntu20.04) - install_nvidia_docker2_ubuntu20 + ubuntu*) + install_nvidia_docker2_ubuntu ;; *) echo "ERROR: Unknown distribution ${DISTRIBUTION}" From eb66729c780f02cf03b144fbe11ed729deda6c64 Mon Sep 17 00:00:00 2001 From: Ting Lu Date: Tue, 4 Nov 2025 10:08:50 -0800 Subject: [PATCH 2/6] make lspci optional --- .github/actions/setup-nvidia/action.yml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/actions/setup-nvidia/action.yml b/.github/actions/setup-nvidia/action.yml index 7eab42206b..710d1d63c7 100644 --- a/.github/actions/setup-nvidia/action.yml +++ b/.github/actions/setup-nvidia/action.yml @@ -72,8 +72,8 @@ runs: ( # Try to gather more information about the runner and its existing NVIDIA driver if any echo "Before installing NVIDIA driver" - lspci - lsmod + lspci || true + lsmod || true modinfo nvidia || true HAS_NVIDIA_DRIVER=0 @@ -136,7 +136,7 @@ runs: fi if [ "$RESET_GPU" -eq 1 ]; then - NVIDIA_DEVICES=$(lspci -D | grep -i NVIDIA | cut -d' ' -f1) + NVIDIA_DEVICES=$(lspci -D 2>/dev/null | grep -i NVIDIA | cut -d' ' -f1 || true) # The GPU can get stuck in a failure state if somehow the test crashs the GPU microcode. 
When this # happens, we'll try to reset all NVIDIA devices https://github.com/pytorch/pytorch/issues/88388 for PCI_ID in $NVIDIA_DEVICES; do @@ -159,8 +159,8 @@ runs: ( sudo modprobe nvidia || true echo "After installing NVIDIA driver" - lspci - lsmod + lspci || true + lsmod || true modinfo nvidia || true ( From 83697851ff5287f7ab43f8a9446ccf76e6764167 Mon Sep 17 00:00:00 2001 From: Ting Lu Date: Tue, 4 Nov 2025 13:36:36 -0800 Subject: [PATCH 3/6] nvidia-docker2 is deprecated; check for nvidia-container-toolkit instead --- .github/actions/setup-nvidia/action.yml | 26 ++++++++++++++++++++++--- 1 file changed, 23 insertions(+), 3 deletions(-) diff --git a/.github/actions/setup-nvidia/action.yml b/.github/actions/setup-nvidia/action.yml index 710d1d63c7..d4e38eed85 100644 --- a/.github/actions/setup-nvidia/action.yml +++ b/.github/actions/setup-nvidia/action.yml @@ -52,11 +52,24 @@ runs: install_nvidia_docker2_ubuntu() { ( set -x - # Install nvidia-driver package if not installed - status="$(dpkg-query -W --showformat='${db:Status-Status}' nvidia-docker2 2>&1)" - if [ ! $? = 0 ] || [ ! "$status" = installed ]; then + # Check if nvidia-container-toolkit or nvidia-docker2 is installed + set +e + toolkit_status="$(dpkg-query -W --showformat='${db:Status-Status}' nvidia-container-toolkit 2>&1)" + toolkit_result=$? + docker2_status="$(dpkg-query -W --showformat='${db:Status-Status}' nvidia-docker2 2>&1)" + docker2_result=$? + set -e + + # Install if neither package is installed + if [ "$toolkit_result" -ne 0 ] && [ "$docker2_result" -ne 0 ]; then + echo "Installing nvidia-container-toolkit..."
+ sudo apt-get update -qq sudo apt-get install -y nvidia-container-toolkit-1.17.8 sudo systemctl restart docker + elif [ "$toolkit_result" -eq 0 ] && [ "$toolkit_status" = "installed" ]; then + echo "nvidia-container-toolkit is already installed" + elif [ "$docker2_result" -eq 0 ] && [ "$docker2_status" = "installed" ]; then + echo "nvidia-docker2 (legacy) is already installed, skipping" fi ) } @@ -92,6 +105,13 @@ runs: # Turn off persistent mode so that the installation script can unload the kernel module sudo killall nvidia-persistenced || true + + # Unload NVIDIA kernel modules to allow new driver installation + echo "Attempting to unload NVIDIA kernel modules..." + sudo rmmod nvidia_uvm || true + sudo rmmod nvidia_drm || true + sudo rmmod nvidia_modeset || true + sudo rmmod nvidia || true else HAS_NVIDIA_DRIVER=1 echo "NVIDIA driver ($INSTALLED_DRIVER_VERSION) has already been installed. Skipping NVIDIA driver installation" From d7bc8e03017bce55c3b013e8b929d00deec105e7 Mon Sep 17 00:00:00 2001 From: Ting Lu Date: Tue, 4 Nov 2025 15:57:36 -0800 Subject: [PATCH 4/6] Add nvidia-container-toolkit version --- .github/actions/setup-nvidia/action.yml | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/.github/actions/setup-nvidia/action.yml b/.github/actions/setup-nvidia/action.yml index d4e38eed85..2166ff273f 100644 --- a/.github/actions/setup-nvidia/action.yml +++ b/.github/actions/setup-nvidia/action.yml @@ -63,8 +63,17 @@ runs: # Install if neither package is installed if [ "$toolkit_result" -ne 0 ] && [ "$docker2_result" -ne 0 ]; then echo "Installing nvidia-container-toolkit..." + + # Add NVIDIA Docker repository + distribution=$(. 
/etc/os-release;echo $ID$VERSION_ID) + curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey | sudo gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg + curl -s -L https://nvidia.github.io/libnvidia-container/$distribution/libnvidia-container.list | \ + sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' | \ + sudo tee /etc/apt/sources.list.d/nvidia-container-toolkit.list + + # Update and install sudo apt-get update -qq - sudo apt-get install -y nvidia-container-toolkit-1.17.8 + sudo apt-get install -y nvidia-container-toolkit=1.17.8-1 sudo systemctl restart docker elif [ "$toolkit_result" -eq 0 ] && [ "$toolkit_status" = "installed" ]; then echo "nvidia-container-toolkit is already installed" From c2a8096dfca93c0f677385095ec904de28acf7c2 Mon Sep 17 00:00:00 2001 From: Ting Lu Date: Tue, 4 Nov 2025 22:17:43 -0800 Subject: [PATCH 5/6] fix gpg exit error --- .github/actions/setup-nvidia/action.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/actions/setup-nvidia/action.yml b/.github/actions/setup-nvidia/action.yml index 2166ff273f..021fcf485c 100644 --- a/.github/actions/setup-nvidia/action.yml +++ b/.github/actions/setup-nvidia/action.yml @@ -66,7 +66,7 @@ runs: # Add NVIDIA Docker repository distribution=$(. 
/etc/os-release;echo $ID$VERSION_ID) - curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey | sudo gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg + curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey | sudo gpg --batch --yes --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg curl -s -L https://nvidia.github.io/libnvidia-container/$distribution/libnvidia-container.list | \ sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' | \ sudo tee /etc/apt/sources.list.d/nvidia-container-toolkit.list From d3aa762676b66cf85ee0a369b705c77dae113685 Mon Sep 17 00:00:00 2001 From: Ting Lu Date: Wed, 5 Nov 2025 00:00:06 -0800 Subject: [PATCH 6/6] Don't pin the nvidia-container-toolkit version --- .github/actions/setup-nvidia/action.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/actions/setup-nvidia/action.yml b/.github/actions/setup-nvidia/action.yml index 021fcf485c..4ba7cb9868 100644 --- a/.github/actions/setup-nvidia/action.yml +++ b/.github/actions/setup-nvidia/action.yml @@ -73,7 +73,7 @@ runs: # Update and install sudo apt-get update -qq - sudo apt-get install -y nvidia-container-toolkit=1.17.8-1 + sudo apt-get install -y nvidia-container-toolkit sudo systemctl restart docker elif [ "$toolkit_result" -eq 0 ] && [ "$toolkit_status" = "installed" ]; then echo "nvidia-container-toolkit is already installed"