-
Notifications
You must be signed in to change notification settings - Fork 0
Enable GPU operator to install GRID driver on Azure NV instances #6
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: datadog
Are you sure you want to change the base?
Changes from all commits
5084376
5bdc758
c05e53f
86cfb3f
d29ff40
a888111
3e1efa4
dbea80f
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,82 @@ | ||
| #!/usr/bin/env bash | ||
|
|
||
| set -eu | ||
|
|
||
| # GRID_INSTALLER_DIR is provided by Dockerfile ENV | ||
| GRID_INSTALLER_DIR=${GRID_INSTALLER_DIR:-/opt/nvidia-grid-install} | ||
|
|
||
| # Available Azure GRID driver versions | ||
| AVAILABLE_VERSIONS="550.144.06, 550.144.03, 535.161.08, 535.154.05, 535.54.03, 525.105.17, 525.85.05, 525.60.13" | ||
|
|
||
# Print the command synopsis followed by the list of supported driver
# versions (taken from the AVAILABLE_VERSIONS global) to stdout.
print_usage() {
  printf '%s\n' \
    "Usage: $0 <driver_version>" \
    "Available versions: $AVAILABLE_VERSIONS"
}
|
|
||
#######################################
# Map a GRID driver version to its Azure download URL.
# Arguments: $1 - driver version; matched as a prefix, so any trailing
#                 suffix after a known version is accepted. May be omitted.
# Outputs:   the URL on stdout, or an empty line when the version is unknown
# Returns:   0 when a URL was found, 1 otherwise
#######################################
get_grid_azure_url() {
  # ${1:-} keeps a missing argument from aborting the caller with
  # "unbound variable" under `set -u`; it falls through to the *) arm.
  local version="${1:-}"
  local url=""

  # Azure GRID driver version mapping
  case "$version" in
    550.144.06*)
      url="https://download.microsoft.com/download/c5319e92-672e-4067-8d85-ab66a7a64db3/NVIDIA-Linux-x86_64-550.144.06-grid-azure.run"
      ;;
    550.144.03*)
      url="https://download.microsoft.com/download/c/3/4/c3484f19-fe76-4495-a65d-a5222ead9517/NVIDIA-Linux-x86_64-550.144.03-grid-azure.run"
      ;;
    535.161.08*)
      url="https://download.microsoft.com/download/8/d/a/8da4fb8e-3a9b-4e6a-bc9a-72ff64d7a13c/NVIDIA-Linux-x86_64-535.161.08-grid-azure.run"
      ;;
    535.154.05*)
      url="https://download.microsoft.com/download/1/4/4/14450d0e-a3f2-4b0a-9bb4-a8e729e986c4/NVIDIA-Linux-x86_64-535.154.05-grid-azure.run"
      ;;
    535.54.03*)
      url="https://download.microsoft.com/download/2/e/8/2e85b622-d376-4166-be95-38fd60f18eda/NVIDIA-Linux-x86_64-535.54.03-grid-azure.run"
      ;;
    525.105.17*)
      url="https://download.microsoft.com/download/6/b/d/6bd2850f-5883-4e2a-9a35-edbd3dd6808c/NVIDIA-Linux-x86_64-525.105.17-grid-azure.run"
      ;;
    525.85.05*)
      url="https://download.microsoft.com/download/c/e/9/ce913061-ccf1-4c88-94ff-294e48c55439/NVIDIA-Linux-x86_64-525.85.05-grid-azure.run"
      ;;
    525.60.13*)
      url="https://download.microsoft.com/download/1/e/8/1e82a212-9e77-4d74-9455-828d430a39f1/NVIDIA-Linux-x86_64-525.60.13-grid-azure.run"
      ;;
  esac

  # Always emit exactly one line so callers using $(...) see "" for unknown
  # versions, then signal the lookup result through the exit status.
  printf '%s\n' "$url"
  [ -n "$url" ]
}
|
|
||
#######################################
# Download the Azure GRID driver installer for a given driver version and
# mark it executable.
# Globals:   GRID_INSTALLER_DIR (read) - directory the installer is saved in
# Arguments: $1 - driver version to fetch
# Outputs:   progress messages on stdout, errors and usage on stderr
# Returns:   exits the shell with status 1 when the version is missing or
#            unknown, or when the download fails
#######################################
fetch_grid_azure_installer() {
  # ${1:-} so a missing argument reaches the friendly error below instead of
  # tripping `set -u` with "unbound variable".
  local driver_version="${1:-}"

  if [ -z "$driver_version" ]; then
    echo "ERROR: Driver version must be provided as an argument" >&2
    print_usage >&2
    exit 1
  fi

  # Declared separately from the assignment, and guarded with ||, so a failed
  # lookup is reported here rather than aborting silently under `set -e`.
  local download_url
  download_url=$(get_grid_azure_url "$driver_version") || download_url=""

  if [ -z "$download_url" ]; then
    echo "ERROR: No Azure GRID driver URL found for version $driver_version" >&2
    print_usage >&2
    exit 1
  fi

  mkdir -p "$GRID_INSTALLER_DIR"

  local filename="${download_url##*/}"
  local target="$GRID_INSTALLER_DIR/$filename"
  echo "Downloading GRID driver from: $download_url"

  # -f fail on HTTP errors, -sS quiet but still print errors, -L follow
  # redirects (the previous lowercase -l meant --list-only).
  if ! curl -fsSL -o "$target" "$download_url"; then
    echo "ERROR: Failed to download GRID driver from $download_url" >&2
    rm -f -- "$target" # do not leave a truncated installer behind
    exit 1
  fi
  chmod +x "$target"

  echo "GRID installer downloaded successfully to $target"
}
|
|
||
| fetch_grid_azure_installer "$@" | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -97,10 +97,29 @@ RUN . /versions.env && \ | |
| DEP_PACKAGES=$(apt-rdepends $BASE_PACKAGES_NAMES | grep -v "^ " | grep -v "^debconf-2.0$" | grep -v "^linux-image-unsigned-") && \ | ||
| apt-get install -y --download-only --no-install-recommends --reinstall $BASE_PACKAGES $DEP_PACKAGES | ||
|
|
||
| # Remove cuda repository before downloading dkms to avoid version conflicts | ||
|
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Could you gather all the steps required for the build in a single block and make them run only on Azure? |
||
| # CUDA repo has dkms 1:3.3.0 but Ubuntu has 2.8.7 - we need Ubuntu version for runtime | ||
| # Note: We remove repo files but don't run apt-get update to preserve package cache | ||
| # for runtime installation of precompiled driver packages | ||
| RUN rm -f /etc/apt/sources.list.d/cuda* | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I know removing the |
||
|
|
||
| # Download kernel headers, dkms, linux-modules (for video.ko) for GRID driver support | ||
| # linux-modules contains video.ko which nvidia-modeset depends on for __acpi_video_get_backlight_type symbol | ||
| RUN . /versions.env && \ | ||
| apt-get install -y --download-only --no-install-recommends \ | ||
| linux-headers-${KERNEL_VERSION} \ | ||
| linux-modules-${KERNEL_VERSION} \ | ||
| dkms | ||
|
|
||
| RUN mkdir -p /opt/nvidia-driver/bin | ||
| COPY ubuntu22.04/precompiled/nvidia-driver /opt/nvidia-driver/bin/nvidia-driver | ||
| COPY nvidia-driver-wrapper.sh /usr/local/bin/nvidia-driver | ||
|
|
||
| ADD download_azure_grid_driver.sh /tmp | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit but for consistency reasons could you use |
||
| # TODO: Azure supports only a few GRID driver versions. Temporarily hardcode the version. | ||
| # RUN . /versions.env && /tmp/download_azure_grid_driver.sh "$DRIVER_VERSION" | ||
| RUN /tmp/download_azure_grid_driver.sh "550.144.06" | ||
|
|
||
| WORKDIR /drivers | ||
|
|
||
| ENTRYPOINT ["nvidia-driver", "init"] | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -19,6 +19,7 @@ NVIDIA_PEERMEM_MODULE_PARAMS=() | |
| TARGETARCH=${TARGETARCH:?"Missing TARGETARCH env"} | ||
|
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This is more or less the upstream nvidia-driver script. To keep it easy to rebase, could you please reduce all Azure-specific changes to a bare minimum (a single line that imports a script) and put everything you add in a separate script? |
||
| KERNEL_MODULE_TYPE=${KERNEL_MODULE_TYPE:-auto} | ||
| MODPROBE_CONFIG_DIR="/etc/modprobe.d" | ||
| GRID_INSTALLER_DIR=${GRID_INSTALLER_DIR:-/opt/nvidia-grid-install} | ||
|
|
||
| fabricmanager_install() { | ||
| apt-get install -y --no-install-recommends --no-download nvidia-fabricmanager-${DRIVER_BRANCH}=${FULL_DRIVER_VERSION} | ||
|
|
@@ -390,7 +391,7 @@ _resolve_kernel_type() { | |
| } | ||
|
|
||
| # Link and install the kernel modules from a precompiled packages | ||
| _install_driver() { | ||
| _install_precompiled_driver() { | ||
| # Install necessary driver userspace packages | ||
| apt-get install -y --no-install-recommends --no-download \ | ||
| nvidia-utils-${DRIVER_BRANCH}-server=${FULL_DRIVER_VERSION} \ | ||
|
|
@@ -415,6 +416,113 @@ _install_driver() { | |
| fi | ||
| } | ||
|
|
||
#######################################
# Install the NVIDIA GRID driver from the Azure .run installer that was
# downloaded into GRID_INSTALLER_DIR, then write /etc/nvidia/gridd.conf.
# Globals:   GRID_INSTALLER_DIR (read), KERNEL_VERSION (read)
# Arguments: none
# Outputs:   progress and error messages on stdout
# Returns:   exits the shell with status 1 when the installer directory or
#            file is missing, or when the installer itself fails
#######################################
_install_grid_driver() {
  echo "Installing NVIDIA GRID driver from Azure package..."

  if [ ! -d "$GRID_INSTALLER_DIR" ]; then
    echo "ERROR: GRID installer directory not found: $GRID_INSTALLER_DIR"
    exit 1
  fi

  # Find the .run installer file (first match wins).
  local installer_file
  installer_file=$(find "$GRID_INSTALLER_DIR" -maxdepth 1 -type f -name "NVIDIA-Linux-*.run" | head -n 1)

  if [ -z "$installer_file" ]; then
    echo "ERROR: GRID installer .run file not found in $GRID_INSTALLER_DIR"
    exit 1
  fi

  echo "Using GRID installer: $installer_file"

  # Install kernel headers and modules required for DKMS.
  # linux-modules provides video.ko which nvidia-modeset depends on for the
  # __acpi_video_get_backlight_type symbol.
  echo "Installing kernel headers and modules for ${KERNEL_VERSION}..."
  apt-get install --no-install-recommends --no-download -y \
    "linux-headers-${KERNEL_VERSION}" \
    "linux-modules-${KERNEL_VERSION}" \
    dkms

  # Scratch directory handed to the installer via --tmpdir.
  local tmpdir="$GRID_INSTALLER_DIR/nvidia-grid-tmp"
  mkdir -p "$tmpdir"

  # Install GRID driver using the .run installer
  #   -s (--silent): non-interactive silent mode
  #   --dkms: use DKMS to build and load kernel modules automatically
  #   --tmpdir: specify temporary directory for installation
  # Note: GRID drivers do not support --skip-module-load option
  #
  # The installer is invoked directly with quoted arguments (no string-built
  # `bash -c`), and its status is captured with `||` so that the cleanup and
  # the error message below still run when errexit is active.
  local exit_code=0
  "$installer_file" -s --dkms --tmpdir "$tmpdir" || exit_code=$?

  # Clean up temporary directory
  rm -rf -- "$tmpdir"

  if [ "$exit_code" -ne 0 ]; then
    echo "ERROR: GRID driver installation failed with exit code $exit_code"
    exit 1
  fi

  # Create gridd.conf from the template shipped by the driver and apply the
  # settings Azure asks for on NV-series VMs: IgnoreSP=FALSE, EnableUI=FALSE
  # and no FeatureType entry. See the "Install GRID drivers" section of
  # https://learn.microsoft.com/en-us/azure/virtual-machines/linux/n-series-driver-setup
  echo "Creating GRID config"
  cp /etc/nvidia/gridd.conf.template /etc/nvidia/gridd.conf

  # Replace EnableUI in place (handles both commented and uncommented)
  sed -i 's/^#\?[[:space:]]*EnableUI=.*/EnableUI=FALSE/' /etc/nvidia/gridd.conf

  # Add EnableUI if not present anywhere in the file
  grep -q '^EnableUI=' /etc/nvidia/gridd.conf || echo "EnableUI=FALSE" >> /etc/nvidia/gridd.conf

  # Replace IgnoreSP in place (handles both commented and uncommented)
  sed -i 's/^#\?[[:space:]]*IgnoreSP=.*/IgnoreSP=FALSE/' /etc/nvidia/gridd.conf

  # Add IgnoreSP if not present anywhere in the file
  grep -q '^IgnoreSP=' /etc/nvidia/gridd.conf || echo "IgnoreSP=FALSE" >> /etc/nvidia/gridd.conf

  # Comment out FeatureType if uncommented
  sed -i 's/^FeatureType=/#FeatureType=/' /etc/nvidia/gridd.conf

  echo "GRID driver installed successfully"
}
|
|
||
#######################################
# Check whether an NVIDIA A10 GPU (vendor 0x10de, device 0x2236) is present
# on the PCI bus. NVIDIA A10 requires the GRID driver on Azure.
# Arguments: $1 - optional directory holding one sub-directory per PCI
#                 device; defaults to /sys/bus/pci/devices
# Outputs:   a one-line detection message on stdout when an A10 is found
# Returns:   0 when an A10 GPU is present, 1 otherwise
#######################################
_has_nvidia_a10_gpu() {
  local pci_root="${1:-/sys/bus/pci/devices}"
  # Keep the loop variables local so they do not leak into the caller's scope.
  local dev vendor device

  for dev in "$pci_root"/*; do
    # Skip entries that lack the ID files (this also covers an unmatched glob).
    if [ ! -f "$dev/vendor" ] || [ ! -f "$dev/device" ]; then
      continue
    fi

    vendor=$(<"$dev/vendor")
    device=$(<"$dev/device")

    if [ "$vendor" = "0x10de" ] && [ "$device" = "0x2236" ]; then
      echo "Detected NVIDIA A10 GPU at ${dev##*/}, GRID driver required"
      return 0 # A10 GPU present
    fi
  done

  return 1 # A10 GPU not present
}
|
|
||
# Decide whether the GRID driver must be installed instead of the
# precompiled one.
# Globals: KERNEL_VERSION (read)
# Returns: 0 when the kernel flavour is "azure" and _has_nvidia_a10_gpu
#          succeeds, 1 in every other case
_is_grid_driver_required() {
  # The kernel flavour is whatever follows the last '-' in KERNEL_VERSION.
  local kernel_flavour="${KERNEL_VERSION##*-}"

  # Not an Azure kernel: GRID driver not required.
  [ "$kernel_flavour" = "azure" ] || return 1

  # Azure kernel but no NVIDIA A10 GPU: GRID driver not required.
  _has_nvidia_a10_gpu || return 1

  return 0 # GRID driver required
}
|
|
||
# Install the driver variant this node needs: the precompiled packages by
# default, or the GRID driver when _is_grid_driver_required reports it.
_install_driver() {
  if ! _is_grid_driver_required; then
    _install_precompiled_driver
    return
  fi

  _install_grid_driver
}
|
|
||
| # Mount the driver rootfs into the run directory with the exception of sysfs. | ||
| _mount_rootfs() { | ||
| echo "Mounting NVIDIA driver rootfs..." | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I don't think we need to support all those versions, especially since they are hardcoded anyway. Only keeping 1 (the latest) per driver branch would shorten the script a little bit