
Commit ed7cbf3

[HuggingFace][Neuronx] Training - DLC for Optimum-neuron 0.4.1 - Neuron SDK 2.26.0 PyTorch 2.8.0 - Transformers 4.55.4 (#5418)
Co-authored-by: Arjun Raman <[email protected]>
1 parent 4e63085 commit ed7cbf3
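
As a quick orientation to the version bundle named in the commit title, here is a minimal sanity check that could be run with the container's python3.10 (a sketch only; it assumes the distribution names below match what pip installed):

# Hedged sketch: print the versions of the components this commit pins.
from importlib.metadata import version, PackageNotFoundError

for dist in ("torch", "transformers", "datasets", "optimum-neuron", "neuronx-cc"):
    try:
        print(f"{dist}: {version(dist)}")
    except PackageNotFoundError:
        print(f"{dist}: not installed")

# Expected per this commit: torch 2.8.x, transformers 4.55.4,
# datasets 4.1.1, optimum-neuron 0.4.1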

File tree: 7 files changed (+462, -6 lines)

huggingface/pytorch/training/buildspec-neuronx.yml

Lines changed: 15 additions & 6 deletions
@@ -2,8 +2,8 @@ account_id: &ACCOUNT_ID <set-$ACCOUNT_ID-in-environment>
 region: &REGION <set-$REGION-in-environment>
 base_framework: &BASE_FRAMEWORK pytorch
 framework: &FRAMEWORK !join [ "huggingface_", *BASE_FRAMEWORK]
-version: &VERSION 2.7.0
-short_version: &SHORT_VERSION "2.7"
+version: &VERSION 2.8.0
+short_version: &SHORT_VERSION "2.8"
 contributor: huggingface
 arch_type: x86

@@ -25,19 +25,28 @@ context:
     deep_learning_container:
       source: ../../../src/deep_learning_container.py
       target: deep_learning_container.py
+    apex_setup:
+      source: docker/build_artifacts/apex_setup.py
+      target: apex_setup.py
+    nxdt_install_setup:
+      source: docker/build_artifacts/nxdt_install_setup.sh
+      target: nxdt_install_setup.sh
+    nxdt_requirements:
+      source: docker/build_artifacts/nxdt_requirements.txt
+      target: nxdt_requirements.txt
 
 images:
   BuildNeuronHFPytorchPy310TrainingDockerImage:
     <<: *TRAINING_REPOSITORY
     build: &HUGGINGFACE_PYTORCH_INF_TRAINING_PY3 false
-    image_size_baseline: 28000
+    image_size_baseline: 40000
     device_type: &DEVICE_TYPE neuronx
     python_version: &DOCKER_PYTHON_VERSION py3
     tag_python_version: &TAG_PYTHON_VERSION py310
-    neuron_sdk_version: &NEURON_SDK_VERSION sdk2.24.1
+    neuron_sdk_version: &NEURON_SDK_VERSION sdk2.26.0
     os_version: &OS_VERSION ubuntu22.04
-    transformers_version: &TRANSFORMERS_VERSION 4.51.0
-    datasets_version: &DATASETS_VERSION 2.18.0
+    transformers_version: &TRANSFORMERS_VERSION 4.55.4
+    datasets_version: &DATASETS_VERSION 4.1.1
     tag: !join [ *VERSION, '-', 'transformers', *TRANSFORMERS_VERSION, '-', *DEVICE_TYPE, '-', *TAG_PYTHON_VERSION,"-", *NEURON_SDK_VERSION, '-', *OS_VERSION ]
     docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /, *NEURON_SDK_VERSION, /Dockerfile., *DEVICE_TYPE ]
     context:
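
Resolved by hand, the !join anchors above produce the following image tag and Dockerfile path (a plain string-concatenation sketch using only values visible in this diff, not the actual build tooling):

# Sketch of how the buildspec's !join tags resolve after this change.
version = "2.8.0"
short_version = "2.8"
device_type = "neuronx"
docker_python_version = "py3"
tag_python_version = "py310"
neuron_sdk_version = "sdk2.26.0"
os_version = "ubuntu22.04"
transformers_version = "4.55.4"

tag = f"{version}-transformers{transformers_version}-{device_type}-{tag_python_version}-{neuron_sdk_version}-{os_version}"
docker_file = f"docker/{short_version}/{docker_python_version}/{neuron_sdk_version}/Dockerfile.{device_type}"

print(tag)          # 2.8.0-transformers4.55.4-neuronx-py310-sdk2.26.0-ubuntu22.04
print(docker_file)  # docker/2.8/py3/sdk2.26.0/Dockerfile.neuronx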
Lines changed: 333 additions & 0 deletions
@@ -0,0 +1,333 @@
+ARG BUILD_STAGE=prod
+
+FROM public.ecr.aws/docker/library/ubuntu:22.04 AS base
+
+LABEL maintainer="Amazon AI"
+LABEL dlc_major_version="2"
+
+ARG PYTHON=python3.10
+ARG PYTHON_VERSION=3.10.12
+ARG PIP=pip3
+ARG OMPI_VERSION=4.1.5
+
+# This arg required to stop docker build waiting for region configuration while installing tz data from ubuntu 22
+ARG DEBIAN_FRONTEND=noninteractive
+
+# Python won't try to write .pyc or .pyo files on the import of source modules
+# Force stdin, stdout and stderr to be totally unbuffered. Good for logging
+ENV PYTHONDONTWRITEBYTECODE=1
+ENV PYTHONUNBUFFERED=1
+ENV PYTHONIOENCODING=UTF-8
+ENV LANG=C.UTF-8
+ENV LC_ALL=C.UTF-8
+ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/opt/aws/neuron/lib"
+ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/opt/amazon/efa/lib"
+ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/opt/amazon/efa/lib64"
+ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/opt/amazon/openmpi/lib64"
+ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/usr/local/lib"
+ENV PATH="/opt/aws/neuron/bin:${PATH}"
+ENV SAGEMAKER_TRAINING_MODULE=sagemaker_pytorch_container.training:main
+ENV DGLBACKEND=pytorch
+
+RUN apt-get update \
+ && apt-get upgrade -y \
+ && apt-get install -y --no-install-recommends \
+    build-essential \
+    ca-certificates \
+    cmake \
+    curl \
+    emacs \
+    git \
+    gnupg2 \
+    gpg-agent \
+    jq \
+    libopencv-dev \
+    libglib2.0-0 \
+    libgl1-mesa-glx \
+    libsm6 \
+    libxext6 \
+    libxrender-dev \
+    libssl-dev \
+    libsqlite3-dev \
+    libgdbm-dev \
+    libc6-dev \
+    libbz2-dev \
+    libncurses-dev \
+    libffi-dev \
+    libcap-dev \
+    libhwloc-dev \
+    openjdk-8-jdk-headless \
+    openjdk-8-jdk \
+    openjdk-8-jre \
+    openjdk-11-jdk \
+    openssl \
+    software-properties-common \
+    tk-dev \
+    unzip \
+    wget \
+    vim \
+    zlib1g-dev \
+ && rm -rf /var/lib/apt/lists/* \
+ && rm -rf /tmp/tmp* \
+ && apt-get clean
+
+# Install Open MPI
+RUN mkdir -p /tmp/openmpi \
+ && cd /tmp/openmpi \
+ && wget --quiet https://download.open-mpi.org/release/open-mpi/v4.1/openmpi-${OMPI_VERSION}.tar.gz \
+ && tar zxf openmpi-${OMPI_VERSION}.tar.gz \
+ && cd openmpi-${OMPI_VERSION} \
+ && ./configure --enable-orterun-prefix-by-default \
+ && make -j $(nproc) all \
+ && make install \
+ && ldconfig \
+ && rm -rf /tmp/openmpi
+
+# Install packages and configure SSH for MPI operator in k8s
+RUN apt-get update && apt-get install -y openmpi-bin openssh-server \
+ && mkdir -p /var/run/sshd \
+ && echo " UserKnownHostsFile /dev/null" >> /etc/ssh/ssh_config \
+ && echo " StrictHostKeyChecking no" >> /etc/ssh/ssh_config \
+ && sed -i 's/#\(StrictModes \).*/\1no/g' /etc/ssh/sshd_config \
+ && rm -rf /var/lib/apt/lists/* \
+ && rm -rf /tmp/tmp* \
+ && apt-get clean
+
+# Install Python
+RUN wget -q https://www.python.org/ftp/python/$PYTHON_VERSION/Python-$PYTHON_VERSION.tgz \
+ && tar -xzf Python-$PYTHON_VERSION.tgz \
+ && cd Python-$PYTHON_VERSION \
+ && ./configure --enable-shared --prefix=/usr/local \
+ && make -j $(nproc) && make install \
+ && cd .. && rm -rf ../Python-$PYTHON_VERSION* \
+ && ln -s /usr/local/bin/pip3 /usr/bin/pip \
+ && ln -s /usr/local/bin/$PYTHON /usr/local/bin/python \
+ && ${PIP} --no-cache-dir install --upgrade pip \
+ && rm -rf ~/.cache/pip/*
+
+WORKDIR /
+
+# The ENV variables declared below are changed in the previous section
+# Grouping these ENV variables in the first section causes
+# ompi_info to fail. This is only observed in CPU containers
+ENV PATH="$PATH:/home/.openmpi/bin"
+ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/home/.openmpi/lib/"
+RUN ompi_info --parsable --all | grep mpi_built_with_cuda_support:value
+
+RUN ${PIP} install --no-cache-dir -U \
+    "bokeh>=2.3,<3" \
+    "awscli<2" \
+    scipy \
+    click \
+    "cryptography" \
+    "sagemaker>=2,<3" \
+    "sagemaker-pytorch-training" \
+    psutil==5.6.7 \
+    dataset \
+    Pillow \
+ && rm -rf ~/.cache/pip/*
+
+RUN mkdir -p /etc/pki/tls/certs && cp /etc/ssl/certs/ca-certificates.crt /etc/pki/tls/certs/ca-bundle.crt
+
+# attrs, neuronx-cc required: >=19.2.0, sagemaker <24,>=23.1.0
+# protobuf neuronx-cc<4, sagemaker-training >=3.9.2,<=3.20.3
+# awscli 1.25.47 has requirement docutils<0.17,>=0.10
+# etcd for kubernetes installation
+# awscli 1.27.127 has requirement rsa<4.8,>=3.1.2, but you have rsa 4.9.
+# awscli 1.27.127 requires urllib3 < 1.27, python-etcd requires urllib3 >= 1.7, latest urllib3 release is 2.0.2
+RUN ${PIP} install --no-cache-dir -U \
+    "attrs<24,>=23.1.0" \
+    "docutils>=0.10,<0.17" \
+    "rsa<4.8,>=3.1.2" \
+    "python-etcd" \
+    "urllib3>=1.26.0,<1.27" \
+# Install extra packages needed by sagemaker (for passing test_utility_packages_using_import)
+ && ${PIP} install --no-cache-dir -U \
+    "bokeh>=3.0.1,<4" \
+    "imageio>=2.22,<3" \
+    "opencv-python>=4.8.1.78" \
+    "plotly>=5.11,<6" \
+    "seaborn>=0.12,<1" \
+    "shap>=0.41,<1" \
+ && rm -rf ~/.cache/pip/*
+
+# EFA Installer does apt get. Make sure to run apt update before that
+RUN apt-get update \
+ && cd $HOME \
+ && curl -O https://efa-installer.amazonaws.com/aws-efa-installer-latest.tar.gz \
+ && wget https://efa-installer.amazonaws.com/aws-efa-installer.key && gpg --import aws-efa-installer.key \
+ && cat aws-efa-installer.key | gpg --fingerprint \
+ && wget https://efa-installer.amazonaws.com/aws-efa-installer-latest.tar.gz.sig && gpg --verify ./aws-efa-installer-latest.tar.gz.sig \
+ && tar -xf aws-efa-installer-latest.tar.gz \
+ && cd aws-efa-installer \
+ && ./efa_installer.sh -y -g --skip-kmod --skip-limit-conf --no-verify \
+ && cd $HOME \
+ && rm -rf /var/lib/apt/lists/* \
+ && rm -rf /tmp/tmp* \
+ && apt-get clean
+
+# Install some common packages used by training scripts
+# torchvision needed for MLP. since it depends on torch and torch neuron/torch
+# is already installed install it with nodeps
+RUN ${PIP} install --no-cache-dir --no-deps -U \
+    torchvision==0.23.0 \
+# Needed for running bert training scripts
+ && ${PIP} install --no-cache-dir -U \
+    graphviz \
+    tensorboard==2.6 \
+    accelerate \
+# Install NxDT dependencies
+ && ${PIP} install --no-cache-dir \
+    Cython \
+    wheel \
+ && rm -rf ~/.cache/pip/*
+
+# Copy workaround script for incorrect hostname
+COPY changehostname.c /
+COPY --chmod=755 start_with_right_hostname.sh deep_learning_container.py /usr/local/bin/
+
+RUN HOME_DIR=/root \
+ && curl -o ${HOME_DIR}/oss_compliance.zip https://aws-dlinfra-utilities.s3.amazonaws.com/oss_compliance.zip \
+ && unzip ${HOME_DIR}/oss_compliance.zip -d ${HOME_DIR}/ \
+ && cp ${HOME_DIR}/oss_compliance/test/testOSSCompliance /usr/local/bin/testOSSCompliance \
+ && chmod +x /usr/local/bin/testOSSCompliance \
+ && chmod +x ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh \
+ && ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh ${HOME_DIR} ${PYTHON} \
+ && rm -rf ${HOME_DIR}/oss_compliance* \
+ && rm -rf /tmp/tmp*
+
+RUN curl -o /license.txt https://aws-dlc-licenses.s3.amazonaws.com/pytorch-2.8/license.txt
+
+# Setting up APT and PIP repo for neuron artifacts
+ARG NEURON_APT_REPO=apt.repos.neuron.amazonaws.com
+ARG NEURON_APT_REPO_KEY
+ARG NEURON_PIP_REPO=pip.repos.neuron.amazonaws.com
+ARG NEURON_PIP_REPO_KEY
+RUN mkdir -p /etc/apt/keyrings \
+ && APT_REPO_PREFIX=$([ -n "${NEURON_APT_REPO_KEY}" ] && echo "${NEURON_APT_REPO_KEY}@" || echo "") \
+ && echo "deb [signed-by=/etc/apt/keyrings/neuron.gpg] https://${APT_REPO_PREFIX}${NEURON_APT_REPO} focal main" > /etc/apt/sources.list.d/neuron.list \
+ && curl $([ -n "${NEURON_APT_REPO_KEY}" ] && echo "-u ${NEURON_APT_REPO_KEY}") -sSL "https://${NEURON_APT_REPO}/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB" | gpg --dearmor > /etc/apt/keyrings/neuron.gpg \
+ && PIP_REPO_URL=$([ -n "${NEURON_PIP_REPO_KEY}" ] && echo "https://${NEURON_PIP_REPO_KEY}@${NEURON_PIP_REPO}" || echo "https://${NEURON_PIP_REPO}") \
+ && ${PIP} config set global.extra-index-url "${PIP_REPO_URL}"
+
+# Neuron SDK components
+ARG NEURONX_COLLECTIVES_LIB_VERSION=2.28.27.0-bc30ece58
+ARG NEURONX_RUNTIME_LIB_VERSION=2.28.23.0-dd5879008
+ARG NEURONX_TOOLS_VERSION=2.26.14.0
+ARG NEURONX_FRAMEWORK_VERSION=2.8.0.2.10.13553+1e4dd6ca
+ARG NEURONX_CC_VERSION=2.21.18209.0+043b1bf7
+ARG NEURONX_DISTRIBUTED_VERSION=0.15.22404+1f27bddf
+
+FROM base AS repo
+
+# Install Neuron components from the apt and pip repos (latest versions)
+RUN apt-get update \
+ && apt-get install -y \
+    aws-neuronx-tools \
+    aws-neuronx-collectives \
+    aws-neuronx-runtime-lib \
+ && rm -rf /var/lib/apt/lists/* \
+ && rm -rf /tmp/tmp* \
+ && apt-get clean
+
+RUN ${PIP} install --no-cache-dir --force-reinstall \
+    torch-neuronx \
+    neuronx-cc \
+    neuronx_distributed \
+ && rm -rf ~/.cache/pip/*
+
+FROM base AS prod
+
+# Install Neuron components with specific versions
+RUN apt-get update \
+ && apt-get install -y \
+    aws-neuronx-tools=$NEURONX_TOOLS_VERSION \
+    aws-neuronx-collectives=$NEURONX_COLLECTIVES_LIB_VERSION \
+    aws-neuronx-runtime-lib=$NEURONX_RUNTIME_LIB_VERSION \
+ && rm -rf /var/lib/apt/lists/* \
+ && rm -rf /tmp/tmp* \
+ && apt-get clean
+
+RUN ${PIP} install --force-reinstall \
+    torch-neuronx==$NEURONX_FRAMEWORK_VERSION \
+    neuronx-cc==$NEURONX_CC_VERSION \
+    neuronx_distributed==$NEURONX_DISTRIBUTED_VERSION \
+ && rm -rf ~/.cache/pip/*
+
+FROM ${BUILD_STAGE} AS final
+
+# Hugging Face version args
+ARG OPTIMUM_NEURON_VERSION=0.4.1
+ARG TRANSFORMERS_VERSION=4.55.4
+ARG DATASETS_VERSION=4.1.1
+ARG GEVENT_VERSION=24.10.3
+
+RUN apt-get remove -y --purge emacs && \
+    apt-get autoremove -y
+
+# We need to set this environment variable to avoid the following error when building KenLM:
+# https://github.com/kpu/kenlm/issues/462
+ENV CMAKE_POLICY_VERSION_MINIMUM=3.5
+
+# Install Hugging Face libraries and its dependencies
+# Install optimum-neuron with this exta starting from next release. \
+# "optimum-neuron[training]"==${OPTIMUM_NEURON_VERSION} \
+RUN ${PIP} install --no-cache-dir \
+    evaluate \
+    transformers[sklearn,sentencepiece,audio,vision]==${TRANSFORMERS_VERSION} \
+    datasets==${DATASETS_VERSION} \
+    optimum-neuron[training]==${OPTIMUM_NEURON_VERSION} \
+    gevent==${GEVENT_VERSION} \
+ && rm -rf ~/.cache/pip/*
+
+# Pin numpy to version required by neuronx-cc
+# Update Pillow, urllib, wandb versions to fix high and critical vulnerabilities
+# neuronx-cc has requirement networkx~=2.6
+RUN ${PIP} install -U \
+    "sagemaker>=2.237.0" \
+    sagemaker-training \
+    "sagemaker-pytorch-training<3.0.0" \
+    "tensorboard>=2.11.0" \
+    "numpy" \
+    "numba" \
+    "Pillow==10.3.0" \
+    "requests" \
+    wandb \
+    pytorch-lightning \
+    Jinja2 \
+    mlflow \
+    tornado \
+    "awscli<2" \
+    "boto3<2.0" \
+    "botocore<1.35.94,>=1.35.74" \
+    google-auth \
+    "urllib3>=1.26.17,<1.27" \
+    "networkx==2.8.8" \
+    bokeh \
+    "opencv-python<4.12.0" \
+    "fsspec==2025.9.0" \
+    "protobuf<4" \
+    "multiprocess<0.70.17" \
+ && rm -rf ~/.cache/pip/*
+
+RUN apt-get update \
+ && apt install -y --no-install-recommends \
+    git-lfs \
+    libgssapi-krb5-2 \
+    libexpat1 \
+    expat \
+    libarchive13 \
+    libgstreamer1.0-0 \
+    libgstreamer-plugins-base1.0-0 \
+ && apt-get upgrade -y apparmor \
+ && rm -rf /var/lib/apt/lists/* \
+ && rm -rf /tmp/tmp* \
+ && apt-get clean
+
+ENV WANDB_MODE=disabled
+
+# Starts framework
+ENTRYPOINT ["bash", "-m", "start_with_right_hostname.sh"]
+CMD ["/bin/bash"]
+
+HEALTHCHECK CMD curl --fail http://localhost:8080/ping || exit 1
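
The new Dockerfile is multi-stage: BUILD_STAGE selects either the prod stage (Neuron packages pinned to the ARG versions above) or the repo stage (latest packages from the Neuron apt/pip repositories), and the final stage layers the Hugging Face packages on top of whichever was chosen. A hedged sketch of driving that selection from a helper script; the Dockerfile path, image tag, and build context are illustrative and not taken from this commit:

# Hedged sketch: build against the "prod" stage with the default Neuron repo endpoints.
# Only ARG names declared in the Dockerfile above are used; all other values are examples.
import subprocess

build_args = {
    "BUILD_STAGE": "prod",  # "repo" would install the latest Neuron packages instead of the pinned ones
    "NEURON_APT_REPO": "apt.repos.neuron.amazonaws.com",
    "NEURON_PIP_REPO": "pip.repos.neuron.amazonaws.com",
}

cmd = ["docker", "build", "-f", "Dockerfile.neuronx", "-t", "hf-pytorch-training-neuronx:local", "."]
for name, value in build_args.items():
    cmd += ["--build-arg", f"{name}={value}"]

subprocess.run(cmd, check=True)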
Lines changed: 6 additions & 0 deletions
@@ -0,0 +1,6 @@
+{
+  "76839": "[pkg: gevent] [installed: 24.10.3]",
+  "71691": "[pkg: mlflow] [installed: 3.4.0]",
+  "77740": "[pkg: protobuf] [installed: 3.20.3]",
+  "77744": "[pkg: urllib3] [installed: 1.26.20]"
+}
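
The six-line JSON file reads as a scanner allowlist: each key is a vulnerability ID and each value records the package and installed version accepted for it. A minimal sketch of consuming a file of this shape; the filename and the note-format parsing are assumptions, since the diff does not show the file's path:

# Hedged sketch: load an allowlist shaped like the JSON above and list its entries.
import json
import re

with open("py_scan_allowlist.json") as f:  # path assumed; not shown in this diff
    allowlist = json.load(f)

note_pattern = re.compile(r"\[pkg: (?P<pkg>[^\]]+)\] \[installed: (?P<ver>[^\]]+)\]")
for vuln_id, note in allowlist.items():
    match = note_pattern.match(note)
    if match:
        print(f"ID {vuln_id}: {match['pkg']} pinned at {match['ver']}")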

0 commit comments
