Skip to content

Commit 6fa7f3b

Browse files
committed
fix(neuron): install aws-neuronx-dkms-2.21 at boot on inf1
1 parent 379c4c5 commit 6fa7f3b

File tree

3 files changed

+75
-0
lines changed

3 files changed

+75
-0
lines changed

templates/al2023/provisioners/install-neuron-driver.sh

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,8 @@ set -o pipefail
44
set -o nounset
55
set -o errexit
66

7+
readonly PACKAGE_CACHE_PATH="/var/cache/eks/packages"
8+
79
if [ "$ENABLE_ACCELERATOR" != "neuron" ]; then
810
exit 0
911
fi
@@ -24,6 +26,27 @@ EOF
2426
# Manually install the GPG key, verifies repository can be reached
2527
sudo rpm --import https://yum.repos.neuron.amazonaws.com/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB
2628

29+
################################################################################
### Cache packages for conditional install at boot #############################
################################################################################
# TODO: remove this section if inf1 is not supported

# Install and remove aws-neuronx-dkms-2.21.x to ensure all of its dependencies are
# pre-installed. --noautoremove keeps those dependencies on the image after the
# driver package itself is removed.
# -y is required because this runs unattended: without it dnf waits for a
# confirmation that nobody can give. The package pattern is quoted so that dnf,
# not the shell, resolves it (an unquoted glob could match files in the current
# directory).
sudo dnf install -y 'aws-neuronx-dkms-2.21.*'
sudo dnf remove -y --noautoremove aws-neuronx-dkms

# Cache the 2.21.x rpm for conditional boot-time install.
# "dnf download" writes the rpm into the current directory; it is then moved
# into the cache directory that the boot-time script reads from.
sudo dnf download 'aws-neuronx-dkms-2.21.*'
sudo mkdir -p "$PACKAGE_CACHE_PATH"
sudo mv aws-neuronx-dkms-*.rpm "${PACKAGE_CACHE_PATH}/"

# Install the boot-time installer script and the systemd unit that runs it.
sudo mv "${WORKING_DIR}/gpu/neuron-package-install.sh" /etc/eks/
sudo mv "${WORKING_DIR}/gpu/neuron-package-install.service" /etc/systemd/system/

sudo systemctl daemon-reload
sudo systemctl enable neuron-package-install.service
49+
2750
################################################################################
2851
### Install packages ###########################################################
2952
################################################################################
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
[Unit]
2+
Description=Install Neuron packages
3+
# Run before cloud-init so packages are installed
4+
# before user data that may query the installed information
5+
Before=cloud-init.service
6+
7+
[Service]
8+
Type=oneshot
9+
ExecStart=/etc/eks/neuron-package-install.sh
10+
RemainAfterExit=true
11+
12+
[Install]
13+
WantedBy=multi-user.target
Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
#!/usr/bin/env bash
2+
3+
set -o errexit
4+
set -o pipefail
5+
set -o nounset
6+
7+
readonly PACKAGE_CACHE_PATH="/var/cache/eks/packages"
8+
9+
readonly AMAZON_VENDOR_CODE="1d0f"
10+
# Based on https://github.com/aws-neuron/aws-neuron-driver/blob/fca19f7df31b44cbbffaf121230a66df6f59d118/neuron_device.h#L36-L39
11+
readonly INF1_DEVICE_IDS=("7064" "7065" "7066" "7067")
12+
13+
# Reports whether lspci lists a device matching any known inf1 device ID.
# Globals:  AMAZON_VENDOR_CODE (read), INF1_DEVICE_IDS (read)
# Returns:  0 if at least one matching PCI device is listed, 1 otherwise.
function is-inf1() {
  # Keep the loop state local so that calling this function does not create
  # or overwrite global variables in the calling script.
  local device_id matched_devices

  for device_id in "${INF1_DEVICE_IDS[@]}"; do
    # "lspci -d <vendor>:<device>" prints one line per matching device, so a
    # non-zero line count means the device is present.
    matched_devices=$(lspci -d "${AMAZON_VENDOR_CODE}:${device_id}" | wc -l)
    if [[ "$matched_devices" -gt 0 ]]; then
      return 0
    fi
  done

  return 1
}
23+
24+
# Prints the VERSION field that rpm reports for the aws-neuronx-dkms package.
# Outputs:  rpm's query output on stdout, with no trailing newline added by
#           the query format.
function installed-neuron-driver-version() {
  local -r driver_package="aws-neuronx-dkms"
  rpm --query --queryformat '%{VERSION}' "${driver_package}"
}
27+
28+
# On inf1 hardware, replace whatever driver version is installed with the
# cached 2.21.x build; on anything else (or when 2.21.x is already installed)
# leave the system untouched.
if is-inf1 && [[ $(installed-neuron-driver-version) != 2.21.* ]]; then
  echo "downgrading driver to 2.21"
  # "dnf downgrade" would fail because the post remove script for the package
  # does not fully remove the module, and then the post install script for the
  # downgraded version fails because of an attempt to probe an older version of
  # a loaded module without --force. relying on the rpm cli directly makes the
  # operations more intuitive
  sudo rpm --erase aws-neuronx-dkms
  # The glob is kept outside the quotes so the shell expands it to the cached
  # file name, instead of passing rpm a literal "*" pattern and depending on
  # rpm to expand it.
  sudo rpm -i "${PACKAGE_CACHE_PATH}"/aws-neuronx-dkms-2.21.*.rpm
else
  echo "nothing to do!"
fi

0 commit comments

Comments
 (0)