Skip to content

Commit 9fe061a

Browse files
committed
fix(neuron): install aws-neuronx-dkms-2.21 at boot on inf1
1 parent 379c4c5 commit 9fe061a

File tree

4 files changed

+82
-1
lines changed

4 files changed

+82
-1
lines changed

templates/al2023/provisioners/install-neuron-driver.sh

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,8 @@ set -o pipefail
44
set -o nounset
55
set -o errexit
66

7+
readonly PACKAGE_CACHE_PATH="/var/cache/eks/packages"
8+
79
if [ "$ENABLE_ACCELERATOR" != "neuron" ]; then
810
exit 0
911
fi
@@ -24,6 +26,27 @@ EOF
2426
# Manually install the GPG key, verifies repository can be reached
2527
sudo rpm --import https://yum.repos.neuron.amazonaws.com/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB
2628

29+
################################################################################
30+
### Cache packages for conditional install at boot #############################
31+
################################################################################
32+
# TODO: remove this section once inf1 is no longer supported
33+
34+
# Install and then remove aws-neuronx-dkms-2.21.x so that all of its
# dependencies are pre-installed; --noautoremove keeps those dependencies in
# place when the driver package itself is removed.
# The package spec is quoted so the shell never expands it against files in the
# current directory; the wildcard is passed through to dnf untouched.
sudo dnf install -y 'aws-neuronx-dkms-2.21.*'
sudo dnf remove -y --noautoremove aws-neuronx-dkms

# Cache the 2.21.x rpm for conditional boot-time install
sudo dnf download 'aws-neuronx-dkms-2.21.*'
sudo mkdir -p "$PACKAGE_CACHE_PATH"
# Unquoted on purpose: this glob must be expanded by the shell to the rpm
# file(s) that were just downloaded into the current directory.
sudo mv aws-neuronx-dkms-2.21.*.rpm "${PACKAGE_CACHE_PATH}/"

# Put the boot-time install script and its systemd unit in place
sudo mv "${WORKING_DIR}/gpu/neuron-package-install.sh" /etc/eks/
sudo mv "${WORKING_DIR}/gpu/neuron-package-install.service" /etc/systemd/system/

# Make systemd pick up the new unit and enable it
sudo systemctl daemon-reload
sudo systemctl enable neuron-package-install.service
49+
2750
################################################################################
2851
### Install packages ###########################################################
2952
################################################################################
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
[Unit]
2+
Description=Install Neuron packages
3+
# Run before cloud-init so that packages are installed
4+
# before any user data that may query the installed package information
5+
Before=cloud-init.service
6+
7+
[Service]
8+
# oneshot: the unit counts as started only after the script below has exited
Type=oneshot
9+
ExecStart=/etc/eks/neuron-package-install.sh
10+
# Keep the unit in the "active" state after the script has exited
RemainAfterExit=true
11+
12+
[Install]
13+
WantedBy=multi-user.target
Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
#!/usr/bin/env bash
2+
3+
# Strict mode: abort on command failure, on a failure anywhere in a pipeline,
# and on expansion of an unset variable.
set -o errexit
4+
set -o pipefail
5+
set -o nounset
6+
7+
# Directory holding the cached aws-neuronx-dkms rpm that this script installs.
readonly PACKAGE_CACHE_PATH="/var/cache/eks/packages"
8+
9+
# PCI vendor ID passed to "lspci -d" when probing for inf1 devices.
readonly AMAZON_VENDOR_CODE="1d0f"
10+
# Based on https://github.com/aws-neuron/aws-neuron-driver/blob/fca19f7df31b44cbbffaf121230a66df6f59d118/neuron_device.h#L36-L39
11+
# PCI device IDs that, combined with the vendor ID above, identify inf1.
readonly INF1_DEVICE_IDS=("7064" "7065" "7066" "7067")
12+
13+
# Report whether this host exposes an inf1 Neuron device.
# Globals:   AMAZON_VENDOR_CODE (read), INF1_DEVICE_IDS (read)
# Arguments: none
# Returns:   0 if lspci lists at least one device for any of the known
#            vendor:device pairs, 1 otherwise
function is-inf1() {
  # Keep the loop state local so nothing leaks into the caller's scope.
  local device_id matched_devices

  for device_id in "${INF1_DEVICE_IDS[@]}"; do
    # Count the PCI devices matching this exact vendor:device pair.
    matched_devices=$(lspci -d "${AMAZON_VENDOR_CODE}:${device_id}" | wc -l)
    if [[ "$matched_devices" -gt 0 ]]; then
      return 0
    fi
  done

  return 1
}
23+
24+
# No-op stand-in for update-pciids: it only reports that it was invoked.
# NOTE(review): this function is exported right before the rpm calls further
# down, presumably to shadow the real update-pciids while package scriptlets
# run -- confirm.
function update-pciids() {
  printf '%s\n' "update-pciids called: doing nothing"
}
27+
28+
# Print the version field that rpm reports for the aws-neuronx-dkms package.
# Outputs: whatever "rpm --query" writes to stdout for the package
function installed-neuron-driver-version() {
  rpm --query --queryformat '%{VERSION}' aws-neuronx-dkms
}
31+
32+
# On inf1 hardware make sure the cached 2.21.x driver is the one installed;
# on any other hardware, or when 2.21.x is already present, do nothing.
if is-inf1 && [[ $(installed-neuron-driver-version) != 2.21.* ]]; then
  echo "downgrading driver to 2.21"

  # Resolve the cached rpm with a shell glob. The pattern must stay outside the
  # quotes, otherwise a literal '*' is handed to rpm. Without nullglob an
  # unmatched pattern is left as-is, so verify that the first hit really exists
  # BEFORE erasing the current driver: failing here keeps the host from being
  # left without any driver.
  cached_rpms=("${PACKAGE_CACHE_PATH}"/aws-neuronx-dkms-2.21.*.rpm)
  if [[ ! -e "${cached_rpms[0]}" ]]; then
    echo "no cached aws-neuronx-dkms-2.21 rpm found in ${PACKAGE_CACHE_PATH}" >&2
    exit 1
  fi

  # "dnf downgrade" would fail because the post remove script for the package
  # does not fully remove the module, and then the post install script for the
  # downgraded version fails because of an attempt to probe an older version of
  # a loaded module without --force. relying on the rpm cli directly makes the
  # operations more intuitive
  export -f update-pciids
  # Only erase when the package is actually installed: "rpm --erase" on a
  # missing package fails, and under errexit that would abort the script
  # before the cached driver gets installed.
  if rpm -q aws-neuronx-dkms > /dev/null 2>&1; then
    rpm --erase aws-neuronx-dkms
  fi
  rpm -i "${cached_rpms[@]}"
else
  echo "nothing to do!"
fi

templates/al2023/template.json

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -251,7 +251,8 @@
251251
"script": "{{template_dir}}/provisioners/install-neuron-driver.sh",
252252
"environment_vars": [
253253
"AWS_REGION={{user `aws_region`}}",
254-
"ENABLE_ACCELERATOR={{user `enable_accelerator`}}"
254+
"ENABLE_ACCELERATOR={{user `enable_accelerator`}}",
255+
"WORKING_DIR={{user `working_dir`}}"
255256
]
256257
},
257258
{

0 commit comments

Comments
 (0)