Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
82 changes: 82 additions & 0 deletions download_azure_grid_driver.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
#!/usr/bin/env bash
#
# Downloads the Azure-specific NVIDIA GRID driver installer (.run file) for a
# given driver version into GRID_INSTALLER_DIR.
#
# Usage: <script> <driver_version>
# Environment: GRID_INSTALLER_DIR (optional) - destination directory.

# Abort on any unhandled command failure (-e) and on use of unset variables (-u).
set -eu

# GRID_INSTALLER_DIR is provided by Dockerfile ENV
# Falls back to /opt/nvidia-grid-install when unset or empty.
GRID_INSTALLER_DIR=${GRID_INSTALLER_DIR:-/opt/nvidia-grid-install}

# Available Azure GRID driver versions
# Human-readable list shown by print_usage; it must be kept in sync by hand
# with the version-to-URL mapping in get_grid_azure_url.
AVAILABLE_VERSIONS="550.144.06, 550.144.03, 535.161.08, 535.154.05, 535.54.03, 525.105.17, 525.85.05, 525.60.13"

# Prints the invocation synopsis and the list of supported driver versions.
# Globals:  AVAILABLE_VERSIONS (read)
# Outputs:  two lines on stdout
print_usage() {
  printf '%s\n' \
    "Usage: $0 <driver_version>" \
    "Available versions: $AVAILABLE_VERSIONS"
}

# Maps a GRID driver version to its Azure download URL.
# Arguments: $1 - driver version; a trailing suffix after a known version is
#                 accepted because the patterns end in '*'.
# Outputs:   the URL on stdout, or an empty line when the version is unknown
# Returns:   0 when a URL was found, 1 otherwise
#
# NOTE(review): a reviewer suggested that not all of these versions need to be
# supported, especially since they are hardcoded anyway; keeping only the
# latest version per driver branch would shorten this table.
get_grid_azure_url() {
  # ${1:-} keeps a missing argument from aborting the script under `set -u`;
  # an empty version simply falls through to the "unknown" arm.
  local version="${1:-}"
  local release download_path

  # Azure GRID driver version mapping: version -> path below /download/
  case "$version" in
    550.144.06*)
      release="550.144.06"
      download_path="c5319e92-672e-4067-8d85-ab66a7a64db3"
      ;;
    550.144.03*)
      release="550.144.03"
      download_path="c/3/4/c3484f19-fe76-4495-a65d-a5222ead9517"
      ;;
    535.161.08*)
      release="535.161.08"
      download_path="8/d/a/8da4fb8e-3a9b-4e6a-bc9a-72ff64d7a13c"
      ;;
    535.154.05*)
      release="535.154.05"
      download_path="1/4/4/14450d0e-a3f2-4b0a-9bb4-a8e729e986c4"
      ;;
    535.54.03*)
      release="535.54.03"
      download_path="2/e/8/2e85b622-d376-4166-be95-38fd60f18eda"
      ;;
    525.105.17*)
      release="525.105.17"
      download_path="6/b/d/6bd2850f-5883-4e2a-9a35-edbd3dd6808c"
      ;;
    525.85.05*)
      release="525.85.05"
      download_path="c/e/9/ce913061-ccf1-4c88-94ff-294e48c55439"
      ;;
    525.60.13*)
      release="525.60.13"
      download_path="1/e/8/1e82a212-9e77-4d74-9455-828d430a39f1"
      ;;
    *)
      # Unknown version: emit an empty line so callers capturing stdout get "".
      echo ""
      return 1
      ;;
  esac

  echo "https://download.microsoft.com/download/${download_path}/NVIDIA-Linux-x86_64-${release}-grid-azure.run"
  return 0
}

# Downloads the Azure GRID driver installer for the requested version and marks
# it executable.
# Globals:   GRID_INSTALLER_DIR (read) - destination directory, created if missing
# Arguments: $1 - driver version (must be one known to get_grid_azure_url)
# Outputs:   progress on stdout; errors and usage on stderr
# Returns:   exits the script with status 1 when the version is missing or
#            unsupported; a curl/chmod failure propagates as a non-zero status
fetch_grid_azure_installer() {
  # ${1:-} so that a missing argument reaches the friendly error below instead
  # of aborting with "unbound variable" under `set -u`.
  local driver_version="${1:-}"
  local download_url filename target

  if [ -z "$driver_version" ]; then
    echo "ERROR: Driver version must be provided as an argument" >&2
    print_usage >&2
    exit 1
  fi

  mkdir -p "$GRID_INSTALLER_DIR"

  # Declared separately from the assignment so the exit status is not masked
  # by `local`; the `||` keeps `set -e` from exiting before the error message.
  download_url=$(get_grid_azure_url "$driver_version") || download_url=""

  if [ -z "$download_url" ]; then
    echo "ERROR: No Azure GRID driver URL found for version $driver_version" >&2
    print_usage >&2
    exit 1
  fi

  filename="${download_url##*/}"
  target="$GRID_INSTALLER_DIR/$filename"
  echo "Downloading GRID driver from: $download_url"

  # -f: fail on HTTP errors, -sS: quiet but still show errors,
  # -L: follow redirects (the original lowercase -l is --list-only, which does
  # not follow redirects and would save a redirect response as the installer).
  curl -fsSL -o "$target" "$download_url"
  chmod +x "$target"

  echo "GRID installer downloaded successfully to $target"
}

# Entry point: pass the script's command-line arguments (the driver version)
# straight through to the fetcher.
fetch_grid_azure_installer "$@"
19 changes: 19 additions & 0 deletions precompiled.Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -97,10 +97,29 @@ RUN . /versions.env && \
DEP_PACKAGES=$(apt-rdepends $BASE_PACKAGES_NAMES | grep -v "^ " | grep -v "^debconf-2.0$" | grep -v "^linux-image-unsigned-") && \
apt-get install -y --download-only --no-install-recommends --reinstall $BASE_PACKAGES $DEP_PACKAGES

# Remove cuda repository before downloading dkms to avoid version conflicts

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

could you gather all the build required steps in a single block and make them only run on Azure?

# CUDA repo has dkms 1:3.3.0 but Ubuntu has 2.8.7 - we need Ubuntu version for runtime
# Note: We remove repo files but don't run apt-get update to preserve package cache
# for runtime installation of precompiled driver packages
RUN rm -f /etc/apt/sources.list.d/cuda*

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I know removing the /etc/apt/sources.list.d/cuda* file has been an issue in some cases where we could not find some packages. Can you try doing apt install nvlsm for instance?


# Download kernel headers, dkms, linux-modules (for video.ko) for GRID driver support
# linux-modules contains video.ko which nvidia-modeset depends on for __acpi_video_get_backlight_type symbol
# --download-only: the packages are only fetched into the apt cache at build time;
# the GRID install path in the nvidia-driver script installs them later with --no-download.
RUN . /versions.env && \
apt-get install -y --download-only --no-install-recommends \
linux-headers-${KERNEL_VERSION} \
linux-modules-${KERNEL_VERSION} \
dkms

RUN mkdir -p /opt/nvidia-driver/bin
COPY ubuntu22.04/precompiled/nvidia-driver /opt/nvidia-driver/bin/nvidia-driver
COPY nvidia-driver-wrapper.sh /usr/local/bin/nvidia-driver

# COPY is preferred over ADD for a plain local file: no archive extraction or
# URL fetching is needed, and it matches the other COPY instructions here.
COPY download_azure_grid_driver.sh /tmp/

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit but for consistency reasons could you use COPY (also it is officially recommended to use COPY)

# TODO: Azure supports only a limited set of GRID driver versions, so the version is temporarily hardcoded.
# RUN . /versions.env && /tmp/download_azure_grid_driver.sh "$DRIVER_VERSION"
RUN /tmp/download_azure_grid_driver.sh "550.144.06"

WORKDIR /drivers

ENTRYPOINT ["nvidia-driver", "init"]
110 changes: 109 additions & 1 deletion ubuntu22.04/precompiled/nvidia-driver
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ NVIDIA_PEERMEM_MODULE_PARAMS=()
TARGETARCH=${TARGETARCH:?"Missing TARGETARCH env"}

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is more or less the upstream nvidia-driver script. For the sake of keeping it easy to rebase, could you please reduce to a bare minimum (a line of script import) all changes that are related to Azure specificities and put everything you add in a separate script?

KERNEL_MODULE_TYPE=${KERNEL_MODULE_TYPE:-auto}
MODPROBE_CONFIG_DIR="/etc/modprobe.d"
GRID_INSTALLER_DIR=${GRID_INSTALLER_DIR:-/opt/nvidia-grid-install}

fabricmanager_install() {
apt-get install -y --no-install-recommends --no-download nvidia-fabricmanager-${DRIVER_BRANCH}=${FULL_DRIVER_VERSION}
Expand Down Expand Up @@ -390,7 +391,7 @@ _resolve_kernel_type() {
}

# Link and install the kernel modules from a precompiled packages
_install_driver() {
_install_precompiled_driver() {
# Install necessary driver userspace packages
apt-get install -y --no-install-recommends --no-download \
nvidia-utils-${DRIVER_BRANCH}-server=${FULL_DRIVER_VERSION} \
Expand All @@ -415,6 +416,113 @@ _install_driver() {
fi
}

# Installs the NVIDIA GRID driver from the Azure .run installer found in
# GRID_INSTALLER_DIR, then writes /etc/nvidia/gridd.conf from its template.
# Globals:   GRID_INSTALLER_DIR (read), KERNEL_VERSION (read)
# Outputs:   progress on stdout; errors on stderr
# Returns:   exits the script with status 1 when the installer directory or
#            .run file is missing, or when the installer fails
_install_grid_driver() {
  echo "Installing NVIDIA GRID driver from Azure package..."

  if [ ! -d "$GRID_INSTALLER_DIR" ]; then
    echo "ERROR: GRID installer directory not found: $GRID_INSTALLER_DIR" >&2
    exit 1
  fi

  # Find the .run installer file (first match only). Declared separately from
  # the assignment so a failing find is not masked by `local`.
  local installer_file
  installer_file=$(find "$GRID_INSTALLER_DIR" -maxdepth 1 -type f \
    -name "NVIDIA-Linux-*.run" -print -quit)

  if [ -z "$installer_file" ]; then
    echo "ERROR: GRID installer .run file not found in $GRID_INSTALLER_DIR" >&2
    exit 1
  fi

  echo "Using GRID installer: $installer_file"

  # Install kernel headers and modules required for DKMS
  # linux-modules provides video.ko which nvidia-modeset depends on for
  # __acpi_video_get_backlight_type symbol
  echo "Installing kernel headers and modules for ${KERNEL_VERSION}..."
  apt-get install --no-install-recommends --no-download -y \
    "linux-headers-${KERNEL_VERSION}" \
    "linux-modules-${KERNEL_VERSION}" \
    dkms

  # Create temporary directory for installer
  local tmpdir="$GRID_INSTALLER_DIR/nvidia-grid-tmp"
  mkdir -p "$tmpdir"

  # Install GRID driver using the .run installer
  # -s (--silent): non-interactive silent mode
  # --dkms: use DKMS to build and load kernel modules automatically
  # --tmpdir: specify temporary directory for installation
  # Note: GRID drivers do not support --skip-module-load option
  #
  # The installer is invoked with properly quoted arguments instead of a
  # string-built `bash -c` command, so paths containing spaces work. The
  # status is captured with `||` so that cleanup and the error message below
  # still run when the caller has errexit enabled.
  local exit_code=0
  sh "$installer_file" -s --dkms --tmpdir "$tmpdir" || exit_code=$?

  # Clean up temporary directory
  rm -rf -- "$tmpdir"

  if [ "$exit_code" -ne 0 ]; then
    echo "ERROR: GRID driver installation failed with exit code $exit_code" >&2
    exit 1
  fi

  # Updating gridd.conf
  # NOTE(review): a reviewer asked for a link to the documentation here,
  # because it is not obvious why gridd.conf is updated at this point. These
  # settings appear to follow Azure's N-series Linux GRID driver setup
  # instructions - TODO confirm and add the link.
  echo "Creating GRID config"
  local gridd_conf="/etc/nvidia/gridd.conf"
  cp /etc/nvidia/gridd.conf.template "$gridd_conf"

  local key
  for key in EnableUI IgnoreSP; do
    # Replace the setting in place (handles both commented and uncommented)
    sed -i "s/^#\?[[:space:]]*${key}=.*/${key}=FALSE/" "$gridd_conf"

    # Add the setting if not present anywhere in the file
    grep -q "^${key}=" "$gridd_conf" || echo "${key}=FALSE" >> "$gridd_conf"
  done

  # Comment out FeatureType if uncommented
  sed -i 's/^FeatureType=/#FeatureType=/' "$gridd_conf"

  echo "GRID driver installed successfully"
}

# Checks whether an NVIDIA A10 GPU (vendor 0x10de, device 0x2236) is present.
# NVIDIA A10 requires GRID driver on Azure.
# Arguments: $1 - (optional) directory holding one sub-directory per PCI
#                 device, each with `vendor` and `device` files; defaults to
#                 /sys/bus/pci/devices
# Outputs:   a one-line detection message on stdout when an A10 is found
# Returns:   0 when an A10 is present, 1 otherwise
_has_nvidia_a10_gpu() {
  local pci_devices_dir="${1:-/sys/bus/pci/devices}"
  # Loop variables are local so they do not leak into the rest of the script.
  local dev vendor device

  for dev in "$pci_devices_dir"/*; do
    # Skips entries lacking the id files; this also covers the literal,
    # unexpanded glob when the directory is empty or missing.
    if [ ! -f "$dev/vendor" ] || [ ! -f "$dev/device" ]; then
      continue
    fi

    vendor=$(cat "$dev/vendor")
    device=$(cat "$dev/device")

    if [ "$vendor" = "0x10de" ] && [ "$device" = "0x2236" ]; then
      echo "Detected NVIDIA A10 GPU at ${dev##*/}, GRID driver required"
      return 0 # A10 GPU present
    fi
  done

  return 1 # A10 GPU not present
}

# Decides whether the GRID driver must be installed instead of the
# precompiled packages.
# Globals:  KERNEL_VERSION (read)
# Returns:  0 when the kernel flavour (the text after the last '-' in
#           KERNEL_VERSION) is "azure" and an NVIDIA A10 GPU is detected,
#           1 otherwise
_is_grid_driver_required() {
  # Anything other than an Azure kernel never needs the GRID driver.
  case "${KERNEL_VERSION##*-}" in
    azure) ;;
    *) return 1 ;;
  esac

  # On Azure, only instances with an NVIDIA A10 GPU need it.
  _has_nvidia_a10_gpu || return 1

  return 0
}

# Installs the driver flavour appropriate for this node: the GRID driver when
# _is_grid_driver_required reports it is needed, the precompiled driver
# packages otherwise.
# Returns:  the status of whichever installer was run
_install_driver() {
  if ! _is_grid_driver_required; then
    _install_precompiled_driver
    return
  fi

  _install_grid_driver
}

# Mount the driver rootfs into the run directory with the exception of sysfs.
_mount_rootfs() {
echo "Mounting NVIDIA driver rootfs..."
Expand Down