|
#!/usr/bin/env bash

set -o errexit
set -o nounset
# pipefail: the build pipes through jq/sort/tee below; without it a failing
# middle stage would be silently ignored by errexit
set -o pipefail
set -o xtrace

# Nothing to do unless this build requested NVIDIA acceleration. Default the
# variable so an unset ENABLE_ACCELERATOR means "no accelerator" (clean exit)
# rather than a nounset crash.
if [ "${ENABLE_ACCELERATOR:-}" != "nvidia" ]; then
  exit 0
fi
| 10 | + |
| 11 | +##### UTILITY FUNCTIONS ###### |
| 12 | + |
# utility function for pulling rpms from an S3 bucket
#
# Arguments: one or more rpm file names (relative to the bucket's rpms/ prefix)
# Globals:   BINARY_BUCKET_NAME, BINARY_BUCKET_REGION, WORKING_DIR (read)
# Each rpm is downloaded, installed via yum localinstall, then removed.
function rpm_install() {
  local RPMS=("$@")
  echo "pulling and installing rpms:(${RPMS[*]}) from s3 bucket: (${BINARY_BUCKET_NAME}) in region: (${BINARY_BUCKET_REGION})"
  for RPM in "${RPMS[@]}"; do
    # we're pulling these rpms from the same bucket as the binaries, because those
    # can be replicated up to highside easily
    # (quote every expansion so bucket/region/dir values with unusual characters
    # cannot word-split or glob)
    aws s3 cp --region "${BINARY_BUCKET_REGION}" "s3://${BINARY_BUCKET_NAME}/rpms/${RPM}" "${WORKING_DIR}/${RPM}"
    sudo yum localinstall -y "${WORKING_DIR}/${RPM}"
    # the WORKING_DIR will be cleaned up at the end of the build
    rm "${WORKING_DIR}/${RPM}"
  done
}
| 26 | + |
# utility function to resolve the latest driver version from a driver prefix
#
# Arguments: $1 - driver version prefix, optionally ending in '*' (e.g. "560*")
# Outputs:   stdout - the resolved full version (falls back to the prefix on error);
#            all diagnostics go to stderr so they don't pollute the captured value
# Returns:   non-zero when the releases JSON cannot be fetched or has no match
function resolve_latest_driver_version_from_json() {
  local DRIVER_PREFIX="$1"
  local JSON_URL=""
  local TEMP_JSON_FILE
  # use an unpredictable temp name instead of a fixed path in /tmp
  TEMP_JSON_FILE=$(mktemp /tmp/nvidia_releases.XXXXXX.json)
  DRIVER_PREFIX=$(echo "${DRIVER_PREFIX}" | sed 's/\*$//')
  if [[ $AWS_REGION == cn-* ]]; then
    DOMAIN="nvidia.cn"
  else
    DOMAIN="nvidia.com"
  fi

  JSON_URL="https://docs.${DOMAIN}/datacenter/tesla/drivers/releases.json"

  echo "Resolving latest NVIDIA driver version for prefix: ${DRIVER_PREFIX}" >&2
  echo "Using JSON URL: ${JSON_URL}" >&2

  # -f: treat HTTP errors (e.g. 404) as failures instead of saving the error page
  if ! curl -sf -o "${TEMP_JSON_FILE}" "${JSON_URL}"; then
    echo "Failed to download NVIDIA driver releases JSON. Unable to resolve runfile from the provided prefix: ${DRIVER_PREFIX}" >&2
    echo "${DRIVER_PREFIX}"
    rm -f "${TEMP_JSON_FILE}"
    return 1
  fi

  local LATEST_VERSION
  # `|| true` keeps errexit from killing the subshell when jq -e reports null,
  # so the explicit null/empty check below can run and emit a useful message
  LATEST_VERSION=$(jq -e -r --arg prefix "${DRIVER_PREFIX}" '.[$prefix].driver_info[0].release_version' "${TEMP_JSON_FILE}") || true

  # clean the temp file up on every path, not only on success
  rm -f "${TEMP_JSON_FILE}"

  if [ -z "${LATEST_VERSION}" ] || [ "${LATEST_VERSION}" = "null" ]; then
    echo "No matching driver version found for prefix ${DRIVER_PREFIX}" >&2
    echo "${DRIVER_PREFIX}"
    return 1
  fi

  echo "Resolved latest driver version: ${LATEST_VERSION}" >&2
  echo "${LATEST_VERSION}"
}
| 63 | + |
# A utility function to ensure that nvidia-open-supported-devices.txt is correctly generated
#
# Arguments: $1 - major version of the open kernel module (part of the file name)
# Exits non-zero if the generated file is missing, empty, has no GPU entries,
# or contains no correctly formatted "0xXXXX GPU_NAME" line.
validate_nvidia_supported_devices_file() {
  local KMOD_MAJOR_VERSION="$1"
  # add some quick validations to ensure that the build fails if
  GENERATED_SUPPORTED_DEVICES_FILE="/etc/eks/nvidia-open-supported-devices-${KMOD_MAJOR_VERSION}.txt"
  if [ ! -s "$GENERATED_SUPPORTED_DEVICES_FILE" ]; then
    echo "ERROR: Generated supported devices file is empty or missing"
    exit 1
  fi

  # count the GPU entries. NB: grep -c already prints "0" on zero matches (and
  # exits 1), so only fall back when grep itself fails (e.g. unreadable file).
  # The previous `|| echo "0"` produced the two-line value "0\n0" on zero
  # matches, which broke the numeric comparison below.
  TOTAL_SUPPORTED_GPU_ENTRY_COUNT=$(grep -c "^0x" "$GENERATED_SUPPORTED_DEVICES_FILE" 2> /dev/null) || TOTAL_SUPPORTED_GPU_ENTRY_COUNT="0"
  echo "Count of GPU entries in ${GENERATED_SUPPORTED_DEVICES_FILE}: ${TOTAL_SUPPORTED_GPU_ENTRY_COUNT}"
  if [ "$TOTAL_SUPPORTED_GPU_ENTRY_COUNT" -eq 0 ]; then
    echo "ERROR: No GPU entries found in generated nvidia-open-supported-devices.txt file"
    exit 1
  fi

  # check to ensure that the format of the file is correct
  if ! grep -E "^0x[0-9A-F]{4} .+" "$GENERATED_SUPPORTED_DEVICES_FILE" > /dev/null; then
    echo "ERROR: Generated file contains malformed entries"
    echo "Expected format: '0xXXXX GPU_NAME'"
    exit 1
  fi
}
| 89 | + |
# function that downloads the nvidia driver .run file and then builds and archives the kernel modules
#
# Builds the proprietary and the open-source kernel modules (archiving each with
# kmod-util), then reinstalls only the user-space components. Reads
# RESOLVED_DRIVER_VERSION, NVIDIA_ARCH, WORKING_DIR, AWS_REGION and the
# BINARY_BUCKET_* variables from the environment.
function install_nvidia_driver() {
  local NVIDIA_RUNFILE_URL=""
  local EXTRACT_DIR="${WORKING_DIR}/NVIDIA-Linux-extract"
  local NVIDIA_RUNFILE_NAME="NVIDIA-Linux-${NVIDIA_ARCH}-${RESOLVED_DRIVER_VERSION}.run"
  echo "Installing NVIDIA driver ${RESOLVED_DRIVER_VERSION} for ${NVIDIA_ARCH} using runfile method"

  if gpu-ami-util is-isolated-partition || [[ $AWS_REGION == cn-* ]]; then
    # isolated/china partitions cannot reach nvidia.com, so the runfile is
    # staged in the binary bucket instead
    NVIDIA_DRIVER_MAJOR_VERSION=$(echo "$RESOLVED_DRIVER_VERSION" | cut -d. -f1)
    NVIDIA_RUNFILE_URL="s3://${BINARY_BUCKET_NAME}/bin/nvidia-runfiles/${NVIDIA_DRIVER_MAJOR_VERSION}/${NVIDIA_RUNFILE_NAME}"
    echo "S3 download URL: ${NVIDIA_RUNFILE_URL}"
    aws s3 cp --region "${BINARY_BUCKET_REGION}" "${NVIDIA_RUNFILE_URL}" "${WORKING_DIR}/${NVIDIA_RUNFILE_NAME}"
  else
    DOMAIN="us.download.nvidia.com"
    NVIDIA_RUNFILE_URL="https://${DOMAIN}/tesla/${RESOLVED_DRIVER_VERSION}/${NVIDIA_RUNFILE_NAME}"

    echo "Download URL: ${NVIDIA_RUNFILE_URL}"
    echo "Downloading NVIDIA driver runfile..."
    wget -O "${WORKING_DIR}/${NVIDIA_RUNFILE_NAME}" "${NVIDIA_RUNFILE_URL}"
  fi

  chmod +x "${WORKING_DIR}/${NVIDIA_RUNFILE_NAME}"

  echo "Extracting NVIDIA driver runfile..."
  sudo "${WORKING_DIR}/${NVIDIA_RUNFILE_NAME}" --extract-only --target "${EXTRACT_DIR}"

  pushd "${EXTRACT_DIR}"

  # install proprietary kernel modules
  echo "Installing NVIDIA driver with proprietary kernel modules..."
  # on failure, surface the installer log and abort; the previous bare
  # `|| sudo cat …` masked the failure and let the build continue
  sudo ./nvidia-installer \
    --kernel-module-type proprietary \
    --dkms \
    --silent || {
    sudo cat /var/log/nvidia-installer.log
    exit 1
  }

  # archive and remove proprietary modules
  echo "Archiving proprietary kernel modules..."
  sudo kmod-util archive nvidia
  sudo kmod-util remove nvidia

  # The DKMS package name differs between the RPM and the dkms.conf in the OSS kmod sources
  # TODO: can be removed if this is merged: https://github.com/NVIDIA/open-gpu-kernel-modules/pull/567
  echo "Modifying DKMS configuration for open-source modules..."
  sudo sed -i 's/PACKAGE_NAME="nvidia"/PACKAGE_NAME="nvidia-open"/g' kernel-open/dkms.conf

  # install open-source kernel modules
  echo "Installing NVIDIA driver with open-source kernel modules..."
  sudo ./nvidia-installer \
    --kernel-module-type open \
    --dkms \
    --silent \
    --kernel-module-source-dir="nvidia-open-${RESOLVED_DRIVER_VERSION}" || {
    sudo cat /var/log/nvidia-installer.log
    exit 1
  }

  KMOD_MAJOR_VERSION=$(sudo kmod-util module-version nvidia-open | cut -d. -f1)
  # assemble the list of supported nvidia devices for the open kernel modules
  echo -e "# This file was generated from supported-gpus/supported-gpus.json\n$(sed -e 's/^/# /g' supported-gpus/LICENSE)" \
    | sudo tee -a "/etc/eks/nvidia-open-supported-devices-${KMOD_MAJOR_VERSION}.txt"

  jq -r '.chips[] | select(.features[] | contains("kernelopen")) | "\(.devid) \(.name)"' supported-gpus/supported-gpus.json \
    | sort -u \
    | sudo tee -a "/etc/eks/nvidia-open-supported-devices-${KMOD_MAJOR_VERSION}.txt"

  validate_nvidia_supported_devices_file "$KMOD_MAJOR_VERSION"

  # archive and remove open-source modules
  echo "Archiving open-source kernel modules..."
  sudo kmod-util archive nvidia-open
  sudo kmod-util remove nvidia-open

  # uninstall everything before doing a clean install of just the user-space components
  echo "Uninstalling previous driver components..."
  sudo ./nvidia-installer --uninstall --silent
  sudo rm -rf /usr/src/nvidia*
  sudo rm -rf /usr/src/nvidia-open*

  # install user-space components only
  echo "Installing NVIDIA driver user-space components..."
  sudo ./nvidia-installer \
    --no-kernel-modules \
    --silent

  popd
  sudo rm -rf "${EXTRACT_DIR}"
  # removing the downloaded runfile
  sudo rm "${WORKING_DIR}/${NVIDIA_RUNFILE_NAME}"
}
| 177 | + |
# Sets up and enables a systemd unit for nvidia-persistenced.
#
# The nvidia-persistenced rpms for 570 drivers contain pre-install scripts that set up
# the necessary group and user for nvidia-persistenced service. Ex. rpm -qp --scripts nvidia-persistenced-latest-dkms-550.163.01-1.el7.x86_64.rpm
# When we install drivers from the run files, nvidia-persistenced binaries are created but the corresponding .service file and user groups need to be created
# Ref: https://download.nvidia.com/XFree86/Linux-x86_64/570.195.03/README/nvidia-persistenced.html
function create_nvidia_persistenced_service() {
  if [ -f /usr/bin/nvidia-persistenced ]; then
    echo "Setting up nvidia-persistenced service..."

    # mirror the RPM preinstall scriptlet setup; guard with getent so a
    # pre-existing group/user (or a build re-run) does not abort under errexit
    getent group nvidia-persistenced > /dev/null || sudo groupadd -r nvidia-persistenced
    getent passwd nvidia-persistenced > /dev/null \
      || sudo useradd -r -g nvidia-persistenced -d /var/run/nvidia-persistenced -s /sbin/nologin \
        -c "NVIDIA persistent software state" nvidia-persistenced

    # quoted heredoc delimiter: unit file content is written literally
    sudo tee /usr/lib/systemd/system/nvidia-persistenced.service > /dev/null << 'EOF'
[Unit]
Description=NVIDIA Persistence Daemon
After=syslog.target

[Service]
Type=forking
PIDFile=/var/run/nvidia-persistenced/nvidia-persistenced.pid
Restart=always
ExecStart=/usr/bin/nvidia-persistenced --verbose
ExecStopPost=/bin/rm -rf /var/run/nvidia-persistenced/*
TimeoutSec=300

[Install]
WantedBy=multi-user.target
EOF

    sudo systemctl daemon-reload
    sudo systemctl enable nvidia-persistenced

    echo "nvidia-persistenced service set up successfully."
  else
    echo "Error: nvidia-persistenced binary not found!"
    exit 1
  fi

}
| 218 | + |
# Installs nvidia-fabric-manager matching the resolved driver version and
# enables its systemd unit. The package source depends on the partition.
function install_nvidia_fabric_manager() {
  if ! gpu-ami-util is-isolated-partition; then
    # Standard and china regions: the fabric manager comes from the rhel8 repo
    sudo yum install -y "nvidia-fabric-manager-${RESOLVED_DRIVER_VERSION}"

    # Exclude nvidia-fabricmanager packages from cuda-rhel8.repo to prevent version conflicts during yum updates
    echo "exclude=nvidia-fabricmanager*" | sudo tee -a /etc/yum.repos.d/cuda-rhel8.repo
  else
    # Isolated regions cannot reach the NVIDIA repos, so pull the rpm from S3
    rpm_install "nvidia-fabric-manager-${RESOLVED_DRIVER_VERSION}-1.x86_64.rpm"
  fi
  sudo systemctl enable nvidia-fabricmanager
}
| 233 | + |
# Installs the NVIDIA container toolkit, either from pinned rpms in S3
# (isolated/china partitions) or from NVIDIA's public yum repository.
function install_nvidia_container_toolkit() {
  if gpu-ami-util is-isolated-partition || [[ $AWS_REGION == cn-* ]]; then
    # Pinned rpms from the binary bucket, installed in dependency order:
    # dependency of libnvidia-container-tools
    rpm_install "libnvidia-container1-1.17.8-1.x86_64.rpm"
    # dependencies of nvidia-container-toolkit
    rpm_install "nvidia-container-toolkit-base-1.17.8-1.x86_64.rpm" "libnvidia-container-tools-1.17.8-1.x86_64.rpm"
    rpm_install "nvidia-container-toolkit-1.17.8-1.x86_64.rpm"
    return
  fi

  # Install nvidia container toolkit, based on
  # https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html#installing-with-yum-or-dnf
  sudo yum-config-manager --add-repo=https://nvidia.github.io/libnvidia-container/stable/rpm/nvidia-container-toolkit.repo
  sudo yum --setopt=timeout=60 --setopt=retries=10 --setopt=retrydelay=10 install -y nvidia-container-toolkit
}
| 248 | + |
#############################

if [[ "${NVIDIA_DRIVER_VERSION}" == *"."*"."* ]]; then
  # if the full driver version is provided, no need to resolve it from the releases.json
  RESOLVED_DRIVER_VERSION="${NVIDIA_DRIVER_VERSION}"
  echo "Provided driver version: ${RESOLVED_DRIVER_VERSION}"
else
  RESOLVED_DRIVER_VERSION=$(resolve_latest_driver_version_from_json "${NVIDIA_DRIVER_VERSION}")
fi

NVIDIA_ARCH=$(uname -m)

# installing required dependencies for building kernel modules and runfile installation
# The kernel* versionlocks are added by install-worker.sh provisioner in the upstream:
# https://github.com/awslabs/amazon-eks-ami/blob/main/templates/al2/provisioners/install-worker.sh#L59
sudo yum install -y "kernel-devel-$(uname -r)" "kernel-headers-$(uname -r)" gcc make dkms jq

if gpu-ami-util is-isolated-partition; then
  # these are required in order to build kmod-nvidia-open-dkms, and would
  # normally be available from epel but that isn't reachable in ADC
  rpm_install "opencl-filesystem-1.0-5.el7.noarch.rpm" "ocl-icd-2.2.12-1.el7.x86_64.rpm"
else
  sudo amazon-linux-extras install epel -y

  if [[ $AWS_REGION == cn-* ]]; then
    DOMAIN="nvidia.cn"
  else
    DOMAIN="nvidia.com"
  fi

  # Add NVIDIA RHEL8 repo for nvidia-fabricmanager
  sudo yum-config-manager --add-repo="https://developer.download.${DOMAIN}/compute/cuda/repos/rhel8/${NVIDIA_ARCH}/cuda-rhel8.repo"

fi

# The driver setup will happen in five steps:
# 1. install the nvidia driver: we install the open-source, closed-source kernel modules as well as the user-space modules. We archive the kernel modules.
# 2. install the nvidia fabric manager
# 3. set up nvidia-persistenced service and enable it
# 4. install nvidia container toolkit libraries

install_nvidia_driver
install_nvidia_fabric_manager
create_nvidia_persistenced_service
install_nvidia_container_toolkit

# We versionlock the NVIDIA packages, because our archived kernel modules will only work with the accompanying userland packages on the same version
# (quote the patterns so the shell cannot glob-expand them against files in cwd)
sudo yum versionlock \
  'nvidia-*' 'libnvidia-*'

# writing latest installed driver version to a text file to provide it to nvidia-kmod-load.sh
# that determines if the instance supports an open-source nvidia-driver
echo "Writing driver version to /etc/eks/nvidia-latest-driver-version.txt"
# /etc requires root: the mkdir previously ran without sudo (and was duplicated)
sudo mkdir -p /etc/eks
echo "${RESOLVED_DRIVER_VERSION}" | sudo tee /etc/eks/nvidia-latest-driver-version.txt

# Show the NVIDIA EULA at startup
sudo mv "${WORKING_DIR}/accelerator/nvidia-eula.sh" /etc/eks/
sudo mv "${WORKING_DIR}/accelerator/nvidia-eula.service" /etc/systemd/system/

# Add a systemd unit that will load NVIDIA kernel modules on applicable instance types
sudo mv "${WORKING_DIR}/accelerator/nvidia-kmod-load.service" /etc/systemd/system/
sudo mv "${WORKING_DIR}/accelerator/nvidia-kmod-load.sh" /etc/eks/
sudo systemctl daemon-reload
sudo systemctl enable nvidia-kmod-load

# Add a bootstrap helper that will configure containerd appropriately
sudo mv "${WORKING_DIR}/accelerator/bootstrap-gpu.sh" /etc/eks/
sudo mv "${WORKING_DIR}/accelerator/bootstrap-gpu-nvidia.sh" /etc/eks/
0 commit comments