Skip to content

Commit 3a62fbb

Browse files
committed
fix(al2023/nvidia): allow nvidia grid install bucket customization
1 parent ef7e0c0 commit 3a62fbb

File tree

4 files changed

+17
-19
lines changed

4 files changed

+17
-19
lines changed

doc/usage/al2023.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@
3434
| `nodeadm_build_image` | Image to use as a build environment for nodeadm |
3535
| `nvidia_driver_major_version` | To be used only when `enable_accelerator = nvidia`. Driver version to install; depends on what is available in the NVIDIA repository. |
3636
| `nvidia_repository_url` | YUM/DNF Repository override for the NVIDIA driver packages |
37+
| `nvidia_grid_runfile_bucket_name` | S3 bucket name (optionally followed by a key prefix, without the `s3://` scheme) for sourcing the NVIDIA GRID runfile |
3738
| `pause_container_image` | Image ref for the pause container image |
3839
| `remote_folder` | Directory path for shell provisioner scripts on the builder instance |
3940
| `runc_version` | |

templates/al2023/provisioners/install-nvidia-driver.sh

Lines changed: 12 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -29,15 +29,7 @@ echo "Installing NVIDIA ${NVIDIA_DRIVER_MAJOR_VERSION} drivers..."
2929
# of the AMI, we want to ensure that all three kernel modules (and also the userspace modules)
3030
# are on the same NVIDIA driver version. Currently, the script installs the NVIDIA GRID drivers
3131
# first and decides the full NVIDIA driver version that the AMI will adhere to
32-
if [[ "$AWS_REGION" == "us-isof-south-1" || "$AWS_REGION" == "eusc-de-east-1" || "$AWS_REGION" == "eu-isoe-west-1" ]]; then
33-
EC2_GRID_DRIVER_S3_BUCKET="${BINARY_BUCKET_NAME}"
34-
GRID_DRIVER_S3_SCAN_PATH="s3://${EC2_GRID_DRIVER_S3_BUCKET}/bin/nvidia-grid-drivers/"
35-
else
36-
EC2_GRID_DRIVER_S3_BUCKET="ec2-linux-nvidia-drivers"
37-
GRID_DRIVER_S3_SCAN_PATH="s3://${EC2_GRID_DRIVER_S3_BUCKET}/"
38-
fi
39-
40-
NVIDIA_DRIVER_FULL_VERSION=$(aws s3 ls --recursive ${GRID_DRIVER_S3_SCAN_PATH} \
32+
NVIDIA_DRIVER_FULL_VERSION=$(aws s3 ls --recursive ${EC2_GRID_DRIVER_S3_BUCKET} \
4133
| grep -Eo "(NVIDIA-Linux-x86_64-)${NVIDIA_DRIVER_MAJOR_VERSION}\.[0-9]+\.[0-9]+(-grid-aws\.run)" \
4234
| cut -d'-' -f4 \
4335
| sort -V \
@@ -174,7 +166,7 @@ function archive-open-kmods() {
174166

175167
function archive-grid-kmod() {
176168
local MACHINE
177-
local NVIDIA_GRID_RUNFILE_NAME
169+
local NVIDIA_GRID_RUNFILE_KEY
178170
local GRID_INSTALLATION_TEMP_DIR
179171
local EXTRACT_DIR
180172

@@ -187,26 +179,28 @@ function archive-grid-kmod() {
187179
fi
188180

189181
echo "Archiving NVIDIA GRID kernel modules for major version ${NVIDIA_DRIVER_MAJOR_VERSION}"
190-
NVIDIA_GRID_RUNFILE_NAME=$(aws s3 ls --recursive ${GRID_DRIVER_S3_SCAN_PATH} \
182+
NVIDIA_GRID_RUNFILE_KEY=$(aws s3 ls --recursive ${EC2_GRID_DRIVER_S3_BUCKET} \
191183
| grep "NVIDIA-Linux-x86_64-${NVIDIA_DRIVER_FULL_VERSION}" \
192184
| sort -k1,2 \
193185
| tail -1 \
194186
| awk '{print $4}')
195187

196-
if [[ -z "$NVIDIA_GRID_RUNFILE_NAME" ]]; then
188+
if [[ -z "$NVIDIA_GRID_RUNFILE_KEY" ]]; then
197189
echo "ERROR: No GRID driver found for driver version ${NVIDIA_DRIVER_FULL_VERSION} in EC2 S3 bucket"
198190
exit 1
199191
fi
200192

201-
echo "Found GRID runfile: ${NVIDIA_GRID_RUNFILE_NAME}"
202-
local GRID_RUNFILE_LOCAL_NAME
203-
GRID_RUNFILE_LOCAL_NAME=$(basename "${NVIDIA_GRID_RUNFILE_NAME}")
193+
echo "Found GRID runfile: ${NVIDIA_GRID_RUNFILE_KEY}"
194+
local GRID_RUNFILE_NAME
195+
GRID_RUNFILE_NAME=$(basename "${NVIDIA_GRID_RUNFILE_KEY}")
204196

205197
echo "Downloading GRID driver runfile..."
206-
aws s3 cp "s3://${EC2_GRID_DRIVER_S3_BUCKET}/${NVIDIA_GRID_RUNFILE_NAME}" "${GRID_INSTALLATION_TEMP_DIR}/${GRID_RUNFILE_LOCAL_NAME}"
207-
chmod +x "${GRID_INSTALLATION_TEMP_DIR}/${GRID_RUNFILE_LOCAL_NAME}"
198+
# This is the only command that requires the bare bucket name (no key prefix), because of how the
199+
# `aws s3 ls --recursive` output lists the full object key regardless of the supplied prefix
200+
aws s3 cp "s3://${EC2_GRID_DRIVER_S3_BUCKET%%/*}/${NVIDIA_GRID_RUNFILE_KEY}" "${GRID_INSTALLATION_TEMP_DIR}/${GRID_RUNFILE_NAME}"
201+
chmod +x "${GRID_INSTALLATION_TEMP_DIR}/${GRID_RUNFILE_NAME}"
208202
echo "Extracting NVIDIA GRID driver runfile..."
209-
sudo "${GRID_INSTALLATION_TEMP_DIR}/${GRID_RUNFILE_LOCAL_NAME}" --extract-only --target "${EXTRACT_DIR}"
203+
sudo "${GRID_INSTALLATION_TEMP_DIR}/${GRID_RUNFILE_NAME}" --extract-only --target "${EXTRACT_DIR}"
210204

211205
pushd "${EXTRACT_DIR}"
212206

templates/al2023/template.json

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@
3030
"nodeadm_build_image": null,
3131
"nvidia_driver_major_version": null,
3232
"nvidia_repository_url": null,
33+
"nvidia_grid_runfile_bucket_name": null,
3334
"pause_container_image": null,
3435
"remote_folder": null,
3536
"runc_version": null,
@@ -267,6 +268,7 @@
267268
"BINARY_BUCKET_REGION={{user `binary_bucket_region`}}",
268269
"NVIDIA_DRIVER_MAJOR_VERSION={{user `nvidia_driver_major_version`}}",
269270
"NVIDIA_REPOSITORY={{user `nvidia_repository_url`}}",
271+
"EC2_GRID_DRIVER_S3_BUCKET={{user `nvidia_grid_runfile_bucket_name`}}",
270272
"WORKING_DIR={{user `working_dir`}}"
271273
]
272274
},
@@ -338,4 +340,4 @@
338340
}
339341
}
340342
]
341-
}
343+
}

templates/al2023/variables-default.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
"nodeadm_build_image": "public.ecr.aws/eks-distro-build-tooling/golang:1.25",
2424
"nvidia_driver_major_version": "580",
2525
"nvidia_repository_url": null,
26+
"nvidia_grid_runfile_bucket_name": "ec2-linux-nvidia-drivers",
2627
"pause_container_image": "602401143452.dkr.ecr.us-west-2.amazonaws.com/eks/pause:3.10",
2728
"remote_folder": "/tmp",
2829
"runc_version": "*",

0 commit comments

Comments
 (0)