Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 23 additions & 0 deletions templates/al2023/provisioners/install-neuron-driver.sh
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@ set -o pipefail
set -o nounset
set -o errexit

# Directory that the cached driver rpm is moved into later in this script.
readonly PACKAGE_CACHE_PATH="/var/cache/eks/packages"

# Nothing to provision unless the build was asked for the neuron accelerator.
[ "$ENABLE_ACCELERATOR" = "neuron" ] || exit 0
Expand All @@ -24,6 +26,27 @@ EOF
# Manually install the GPG key, verifies repository can be reached
sudo rpm --import https://yum.repos.neuron.amazonaws.com/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB

################################################################################
### Cache packages for conditional install at boot #############################
################################################################################
# TODO: remove this section if inf1 is not supported

# Install and remove aws-neuronx-dkms-2.21.x to ensure all of its dependencies are
# pre-installed. The package spec is quoted so the wildcard reaches dnf intact
# instead of being expanded by the shell against files in the current directory.
sudo dnf install -y "aws-neuronx-dkms-2.21.*"
sudo dnf remove -y --noautoremove aws-neuronx-dkms

# Cache the 2.21.x rpm for conditional boot-time install
sudo dnf download "aws-neuronx-dkms-2.21.*"
sudo mkdir -p "$PACKAGE_CACHE_PATH"
# Intentionally unquoted: the shell must expand this glob to the downloaded rpm.
sudo mv aws-neuronx-dkms-2.21.*.rpm "${PACKAGE_CACHE_PATH}/"

# Install the boot-time helper script and the unit that runs it.
sudo mv "${WORKING_DIR}/gpu/neuron-package-install.sh" /etc/eks/
sudo mv "${WORKING_DIR}/gpu/neuron-package-install.service" /etc/systemd/system/

sudo systemctl daemon-reload
sudo systemctl enable neuron-package-install.service

################################################################################
### Install packages ###########################################################
################################################################################
Expand Down
13 changes: 13 additions & 0 deletions templates/al2023/runtime/gpu/neuron-package-install.service
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
[Unit]
Description=Install Neuron packages
# Run before cloud-init so packages are installed
# before user data that may query the installed information
Before=cloud-init.service
Comment on lines +3 to +5
Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We need some ordering relative to user data script execution to ensure deterministic behavior; each option has its trade-offs.

  • before user data: node joining and SSM agent registering are delayed by the time it takes to execute this service, which is ~45s from my testing. advantage is that customers can query the installed package version and act on it
  • after user data: node joining is not delayed, but the neuron device plugin may be scheduled before the driver is loaded, leading it into a crashloop. there's no clean way to establish an ordering there, and because of the exponential backoff that time could add up to quite a bit. customer user data queries of package version would also always return latest, regardless of what this service would load


[Service]
# oneshot: systemd waits for the script to exit before it considers the unit
# started
Type=oneshot
ExecStart=/etc/eks/neuron-package-install.sh
# keep the unit reported as active after the script has exited
RemainAfterExit=true

[Install]
WantedBy=multi-user.target
49 changes: 49 additions & 0 deletions templates/al2023/runtime/gpu/neuron-package-install.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
#!/usr/bin/env bash

# Abort on command failure, on a failing pipeline stage, and on use of an
# unset variable.
set -euo pipefail

# Directory the cached aws-neuronx-dkms rpm is installed from.
declare -r PACKAGE_CACHE_PATH="/var/cache/eks/packages"

# Vendor part of the "vendor:device" selector handed to lspci.
declare -r AMAZON_VENDOR_CODE="1d0f"
# Based on https://github.com/aws-neuron/aws-neuron-driver/blob/fca19f7df31b44cbbffaf121230a66df6f59d118/neuron_device.h#L36-L39
declare -ra INF1_DEVICE_IDS=("7064" "7065" "7066" "7067")

# Reports whether this host exposes an inf1 device.
# Globals:   AMAZON_VENDOR_CODE (read), INF1_DEVICE_IDS (read)
# Returns:   0 if lspci lists at least one device for any of the inf1 device
#            IDs, 1 otherwise
function is-inf1() {
  # loop state is local so the function does not create or clobber globals
  local device_id matched_devices
  for device_id in "${INF1_DEVICE_IDS[@]}"; do
    # count the lines lspci prints for this vendor:device selector
    matched_devices=$(lspci -d "${AMAZON_VENDOR_CODE}:${device_id}" | wc -l)
    if [[ "$matched_devices" -gt 0 ]]; then
      return 0
    fi
  done

  return 1
}

# No-op stand-in for update-pciids.
# The aws-neuronx-dkms pre-install script calls update-pciids, which hangs on
# a node that cannot reach https://pci-ids.ucw.cz/v2.2/pci.ids. The values
# pulled by the build-time install can be used in lieu of a fresh download,
# so this replacement only logs that it was invoked.
# Outputs:   one line on stdout
function update-pciids() {
  printf '%s\n' "update-pciids called: doing nothing"
}

# Prints the VERSION field that rpm reports for the aws-neuronx-dkms package.
# Outputs:   rpm's query output on stdout (no trailing newline is added here)
# Returns:   rpm's exit status
function installed-neuron-driver-version() {
  local -r driver_package="aws-neuronx-dkms"
  rpm -q "${driver_package}" --queryformat '%{VERSION}'
}

# Swap in the cached 2.21.x driver only on an inf1 host that is not already
# running a 2.21.x version; every other case is left untouched.
if ! is-inf1 || [[ $(installed-neuron-driver-version) == 2.21.* ]]; then
  echo "nothing to do!"
else
  echo "downgrading driver to 2.21"
  # rpm is driven directly rather than through "dnf downgrade": the package's
  # post remove script does not fully remove the module, and the downgraded
  # version's post install script then fails because it tries to probe an
  # older version of a loaded module without --force. Separate erase and
  # install steps keep the operations more intuitive.
  #
  # Export the no-op update-pciids so child processes started by rpm pick it
  # up instead of the real command.
  export -f update-pciids
  rpm --erase aws-neuronx-dkms
  # The wildcard sits inside the quotes, so the shell passes it through
  # literally; rpm expands globs in package file arguments itself.
  rpm -i "${PACKAGE_CACHE_PATH}/aws-neuronx-dkms-2.21.*.rpm"
fi
3 changes: 2 additions & 1 deletion templates/al2023/template.json
Original file line number Diff line number Diff line change
Expand Up @@ -251,7 +251,8 @@
"script": "{{template_dir}}/provisioners/install-neuron-driver.sh",
"environment_vars": [
"AWS_REGION={{user `aws_region`}}",
"ENABLE_ACCELERATOR={{user `enable_accelerator`}}"
"ENABLE_ACCELERATOR={{user `enable_accelerator`}}",
"WORKING_DIR={{user `working_dir`}}"
]
},
{
Expand Down
Loading