Skip to content

Commit 79ceef3

Browse files
neuron-containers-ci and kaenafwi authored
Updated Dockerfiles (#138)
Updated Dockerfiles Co-authored-by: kaenafwi <[email protected]>
1 parent 78f2c3f commit 79ceef3

File tree

5 files changed

+528
-57
lines changed

5 files changed

+528
-57
lines changed

docker/common/nxdt_requirements.txt

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,8 @@
1-
hydra-core>=1.2.0,<1.3
1+
hydra-core>=1.3.0
22
omegaconf>=2.2,<2.3
33
pyyaml==6.0.1
44
torchmetrics>=0.4.1rc0,<=0.10.3
5-
trl==0.10.1
6-
transformers==4.48.0
5+
transformers==4.52.4
76
wandb
87
webdataset>=0.1.48,<=0.1.62
98
pandas
@@ -31,11 +30,11 @@ pytorch-lightning==2.5.0
3130
ipadic
3231
mecab-python3
3332
protobuf==3.20.3
34-
datasets==2.19.1
33+
datasets==3.0.0
3534
dill==0.3.8
3635
nemo_toolkit==2.1.0
3736
regex
38-
requests<2.32.0
37+
requests==2.32.4
3938
python-daemon
4039
huggingface_hub>=0.27.1
4140
multiprocess==0.70.16
@@ -44,4 +43,8 @@ numpy>=1.24.3,<=1.25.2
4443
rouge_score
4544
setuptools>=70.0
4645
lightning==2.5.0
47-
ml-dtypes==0.2.0
46+
ml-dtypes==0.5.0
47+
boto3==1.35.93
48+
botocore==1.35.93
49+
s3transfer==0.10.4
50+
s3fs

docker/jax/training/0.6/Dockerfile.neuronx

Lines changed: 8 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,8 @@ LABEL maintainer="Amazon AI"
77

88
# This arg required to stop docker build waiting for region configuration while installing tz data from ubuntu 22
99
ARG DEBIAN_FRONTEND=noninteractive
10-
ARG PYTHON=python3.10
11-
ARG PYTHON_VERSION=3.10.12
10+
ARG PYTHON=python3.11
11+
ARG PYTHON_VERSION=3.11.13
1212
ARG PIP=pip3
1313
ARG OMPI_VERSION=4.1.5
1414

@@ -160,16 +160,11 @@ RUN mkdir -p /etc/apt/keyrings \
160160
&& ${PIP} config set global.extra-index-url "${PIP_REPO_URL}"
161161

162162
# Neuron SDK components version numbers
163-
ARG NEURON_ARTIFACT_PATH=/root/neuron_artifacts
164-
ARG IGNORE_MISSING_NEURON_COMPONENTS=false
165-
RUN IGNORE_MISSING_NEURON_COMPONENTS=$(echo ${IGNORE_MISSING_NEURON_COMPONENTS} | tr '[:upper:]' '[:lower:]')
166-
167-
ARG NEURONX_RUNTIME_LIB_VERSION=2.27.23.0-8deec4dbf
168-
ARG NEURONX_COLLECTIVES_LIB_VERSION=2.27.34.0-ec8cd5e8b
169-
ARG NEURONX_TOOLS_VERSION=2.25.145.0
170-
171-
ARG NEURONX_CC_VERSION=2.20.9961.0
172-
ARG NEURONX_JAX_TRAINING_VERSION=0.6.1.1.0.3499+2edccbed
163+
ARG NEURONX_RUNTIME_LIB_VERSION=2.28.23.0-dd5879008
164+
ARG NEURONX_COLLECTIVES_LIB_VERSION=2.28.27.0-bc30ece58
165+
ARG NEURONX_TOOLS_VERSION=2.26.14.0
166+
ARG NEURONX_CC_VERSION=2.21.18209.0+043b1bf7
167+
ARG NEURONX_JAX_TRAINING_VERSION=0.6.2.1.0.6446+d8c0de77
173168

174169
FROM base AS repo
175170

@@ -213,4 +208,4 @@ FROM ${BUILD_STAGE} AS final
213208
ENTRYPOINT ["bash", "-m", "start_with_right_hostname.sh"]
214209
CMD ["/bin/bash"]
215210

216-
HEALTHCHECK CMD curl --fail http://localhost:8080/ping || exit 1
211+
HEALTHCHECK CMD curl --fail http://localhost:8080/ping || exit 1
Lines changed: 204 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,204 @@
1+
# Name of the stage the final image is built from (see "FROM ${BUILD_STAGE} AS final").
ARG BUILD_STAGE=prod

FROM public.ecr.aws/docker/library/ubuntu:22.04 AS base

LABEL dlc_major_version="1" \
      maintainer="Amazon AI" \
      com.amazonaws.sagemaker.capabilities.accept-bind-to-port=true

# Build-time settings. DEBIAN_FRONTEND is an ARG so it does not persist into
# the runtime environment.
ARG DEBIAN_FRONTEND=noninteractive

# Python interpreter and toolchain versions.
ARG PYTHON=python3.11
ARG PYTHON_VERSION=3.11.13
ARG PIP=pip3
ARG MINIFORGE_VERSION=25.3.1-0

# Serving stack versions.
ARG TORCHSERVE_VERSION=0.11.0
ARG SM_TOOLKIT_VERSION=2.0.25

# See http://bugs.python.org/issue19846
ENV LANG=C.UTF-8
# Put the Neuron and conda libraries/binaries ahead of the system defaults.
ENV LD_LIBRARY_PATH=/opt/aws/neuron/lib:/lib/x86_64-linux-gnu:/opt/conda/lib/:${LD_LIBRARY_PATH}
ENV PATH=/opt/conda/bin:/opt/aws/neuron/bin:${PATH}
21+
22+
# OS-level packages: upgrade what the base image ships, install the build
# tools, editors, JDK and shared libraries, then drop the apt lists and
# temporary files in the same layer.
RUN apt-get update \
    && apt-get upgrade -y \
    && apt-get install -y --no-install-recommends \
        apt-transport-https \
        build-essential \
        ca-certificates \
        cmake \
        curl \
        emacs \
        git \
        gnupg2 \
        gpg-agent \
        jq \
        libgl1-mesa-glx \
        libglib2.0-0 \
        libsm6 \
        libxext6 \
        libxrender-dev \
        libcap-dev \
        libhwloc-dev \
        openjdk-11-jdk \
        unzip \
        vim \
        wget \
        zlib1g-dev \
    && rm -rf /var/lib/apt/lists/* /tmp/tmp* \
    && apt-get clean
50+
51+
# Re-create the Java CA trust store as a JKS keystore and re-run the
# ca-certificates-java post-install hook. Background:
#   https://github.com/docker-library/openjdk/issues/261
#   https://github.com/docker-library/openjdk/pull/263/files
# NOTE(review): the steps are joined with ";" rather than "&&", so a failure
# in keytool or mv does not abort the build -- confirm this is intended.
RUN keytool -importkeystore \
        -srckeystore /etc/ssl/certs/java/cacerts \
        -destkeystore /etc/ssl/certs/java/cacerts.jks \
        -deststoretype JKS \
        -srcstorepass changeit \
        -deststorepass changeit \
        -noprompt; \
    mv /etc/ssl/certs/java/cacerts.jks /etc/ssl/certs/java/cacerts; \
    /var/lib/dpkg/info/ca-certificates-java.postinst configure;
55+
56+
# Download and run the Miniforge installer into /opt/conda, update conda, and
# use mamba to install the PYTHON_VERSION interpreter plus base packages from
# conda-forge. The installer and the conda caches are removed in the same layer.
RUN curl -L -o ~/miniforge.sh https://github.com/conda-forge/miniforge/releases/download/${MINIFORGE_VERSION}/Miniforge3-${MINIFORGE_VERSION}-Linux-x86_64.sh \
    && chmod +x ~/miniforge.sh \
    && ~/miniforge.sh -b -p /opt/conda \
    && rm ~/miniforge.sh \
    && /opt/conda/bin/conda update -y conda \
    && /opt/conda/bin/mamba install -c conda-forge -y \
        python=${PYTHON_VERSION} \
        pyopenssl \
        cython \
        mkl-include \
        mkl \
        parso \
        typing \
        # The next two come with the miniconda base but not with mamba,
        # so they are installed explicitly.
        conda-content-trust \
        charset-normalizer \
    && /opt/conda/bin/conda clean -ya
73+
74+
# Add scikit-learn, h5py and requests from conda-forge, upgrade pip, expose
# conda's pip as /usr/local/bin/pip3, and install a few helper packages.
# "-y" is passed to mamba so the install never waits on (or depends on the
# default answer of) a confirmation prompt during a non-interactive build;
# this matches the earlier mamba install in this file.
RUN /opt/conda/bin/mamba install -y -c conda-forge \
        python=$PYTHON_VERSION \
        scikit-learn \
        h5py \
        requests \
    && conda clean -ya \
    && pip install --upgrade pip \
        --trusted-host pypi.org --trusted-host files.pythonhosted.org \
    # Later steps call ${PIP} (pip3 by default); point it at conda's pip.
    && ln -s /opt/conda/bin/pip /usr/local/bin/pip3 \
    && pip install \
        packaging \
        enum-compat \
        ipython \
    && rm -rf ~/.cache/pip/*
88+
89+
# Python runtime dependencies plus TorchServe and its model archiver.
# Every requirement containing a shell metacharacter (">", "<", "*") is quoted.
# Unquoted, "opencv-python>=4.8.1.78" is parsed by the shell as the bare
# package "opencv-python" with stdout redirected to a file named "=4.8.1.78",
# which drops the version constraint and leaves a stray file behind.
# Unquoted "pandas==1.*" is subject to glob expansion.
RUN ${PIP} install --upgrade pip --trusted-host pypi.org --trusted-host files.pythonhosted.org \
    && ${PIP} install --no-cache-dir -U \
        "opencv-python>=4.8.1.78" \
        "numpy<1.24,>1.21" \
        "scipy>=1.8.0" \
        six \
        "awscli<2" \
        "pandas==1.*" \
        boto3 \
        cryptography \
        "protobuf>=3.18.3,<4" \
        torchserve==${TORCHSERVE_VERSION} \
        torch-model-archiver==${TORCHSERVE_VERSION} \
    && rm -rf ~/.cache/pip/*
103+
104+
# Serving entry module and the temp directory used at runtime.
ENV SAGEMAKER_SERVING_MODULE=sagemaker_pytorch_serving_container.serving:main
ENV TEMP=/home/model-server/tmp

# Create the model-server user and give it ownership of its home directory
# (including the TEMP directory above) and of the model directory.
RUN useradd -m model-server \
    && mkdir -p \
        /home/model-server/tmp \
        /opt/ml/model \
    && chown -R model-server \
        /home/model-server \
        /opt/ml/model

# Entrypoint scripts and helpers are copied as executables (mode 755);
# config.properties goes to the model-server home directory.
COPY --chmod=755 neuron-entrypoint.py /usr/local/bin/dockerd-entrypoint.py
COPY --chmod=755 neuron-monitor.sh deep_learning_container.py /usr/local/bin/
COPY --chmod=755 torchserve-neuron.sh /usr/local/bin/entrypoint.sh
COPY config.properties /home/model-server
115+
116+
# Install the SageMaker PyTorch inference toolkit, then patch its
# default_pytorch_inference_handler.py so "import torch" also imports
# torch_neuronx.
RUN ${PIP} install --no-cache-dir "sagemaker-pytorch-inference==${SM_TOOLKIT_VERSION}" \
    # Ask Python where the installed package lives instead of hard-coding site-packages.
    && TOOLKIT_DIR=$(python -c "import os.path, sagemaker_pytorch_serving_container; print(os.path.dirname(sagemaker_pytorch_serving_container.__file__))") \
    && HANDLER_FILE=${TOOLKIT_DIR}/default_pytorch_inference_handler.py \
    && sed -i "s/import torch/import torch, torch_neuronx/" ${HANDLER_FILE} \
    && rm -rf ~/.cache/pip/*
122+
123+
# Compliance: fetch the OSS compliance bundle, install its testOSSCompliance
# helper into /usr/local/bin, run the generator script with the home directory
# and ${PYTHON} as arguments, then remove the bundle.
RUN HOME_DIR=/root \
    && curl -o ${HOME_DIR}/oss_compliance.zip https://aws-dlinfra-utilities.s3.amazonaws.com/oss_compliance.zip \
    && unzip ${HOME_DIR}/oss_compliance.zip -d ${HOME_DIR}/ \
    && cp ${HOME_DIR}/oss_compliance/test/testOSSCompliance /usr/local/bin/testOSSCompliance \
    && chmod +x /usr/local/bin/testOSSCompliance \
    && chmod +x ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh \
    && ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh ${HOME_DIR} ${PYTHON} \
    && rm -rf ${HOME_DIR}/oss_compliance* \
    # "conda clean -ya" does not remove the empty /root/.cache/conda/notices.cache
    # file that conda leaves behind, so delete the directory here.
    && rm -rf ${HOME_DIR}/.cache/conda
134+
135+
# Fetch the license file shipped in the image. --fail makes the build stop on
# an HTTP error instead of saving the server's error body as /license.txt.
RUN curl --fail -o /license.txt https://aws-dlc-licenses.s3.amazonaws.com/pytorch-2.8/license.txt
136+
137+
# Setting up APT and PIP repo for neuron artifacts.
# NEURON_APT_REPO_KEY / NEURON_PIP_REPO_KEY are optional; when set they are
# embedded as credentials in the apt source URL and the pip extra-index URL.
# NOTE(review): build args are visible in the image history, and the keys are
# also written into neuron.list and the pip config -- consider BuildKit secret
# mounts if these are real credentials.
# NOTE(review): the apt suite is "focal" although the base image is
# ubuntu:22.04 -- confirm this is intended.
ARG NEURON_APT_REPO=apt.repos.neuron.amazonaws.com
ARG NEURON_APT_REPO_KEY
ARG NEURON_PIP_REPO=pip.repos.neuron.amazonaws.com
ARG NEURON_PIP_REPO_KEY
# The signing key is downloaded to a temporary file first (with --fail) and
# dearmored in a separate step. The original "curl | gpg" pipeline ran under
# /bin/sh without pipefail, so a failed download was not detected here.
RUN mkdir -p /etc/apt/keyrings \
    && APT_REPO_PREFIX=$([ -n "${NEURON_APT_REPO_KEY}" ] && echo "${NEURON_APT_REPO_KEY}@" || echo "") \
    && echo "deb [signed-by=/etc/apt/keyrings/neuron.gpg] https://${APT_REPO_PREFIX}${NEURON_APT_REPO} focal main" > /etc/apt/sources.list.d/neuron.list \
    && curl $([ -n "${NEURON_APT_REPO_KEY}" ] && echo "-u ${NEURON_APT_REPO_KEY}") -fsSL "https://${NEURON_APT_REPO}/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB" -o /tmp/neuron.pub \
    && gpg --dearmor < /tmp/neuron.pub > /etc/apt/keyrings/neuron.gpg \
    && rm -f /tmp/neuron.pub \
    && PIP_REPO_URL=$([ -n "${NEURON_PIP_REPO_KEY}" ] && echo "https://${NEURON_PIP_REPO_KEY}@${NEURON_PIP_REPO}" || echo "https://${NEURON_PIP_REPO}") \
    && ${PIP} config set global.extra-index-url "${PIP_REPO_URL}"
148+
149+
# Neuron SDK component versions.
# Versions of the aws-neuronx-* apt packages pinned in the "prod" stage.
ARG NEURONX_RUNTIME_LIB_VERSION=2.28.23.0-dd5879008
ARG NEURONX_COLLECTIVES_LIB_VERSION=2.28.27.0-bc30ece58
ARG NEURONX_TOOLS_VERSION=2.26.14.0

# Versions of the Neuron pip packages pinned in the "prod" stage.
ARG NEURONX_CC_VERSION=2.21.18209.0+043b1bf7
ARG NEURONX_FRAMEWORK_VERSION=2.8.0.2.10.13553+1e4dd6ca
ARG NEURONX_DISTRIBUTED_VERSION=0.15.22404+1f27bddf
ARG NEURONX_DISTRIBUTED_INFERENCE_VERSION=0.6.10598+a59fdc00
158+
159+
# "repo" stage: Neuron components are installed unpinned, i.e. whatever the
# configured apt and pip repositories currently serve.
FROM base AS repo

# Neuron tools, collectives and runtime library from the apt repo.
RUN apt-get update \
    && apt-get install -y \
        aws-neuronx-tools \
        aws-neuronx-collectives \
        aws-neuronx-runtime-lib \
    && rm -rf /var/lib/apt/lists/* /tmp/tmp* \
    && apt-get clean

# Neuron compiler, torch-neuronx and the distributed libraries from pip.
RUN ${PIP} install --no-cache-dir \
        neuronx-cc \
        torch-neuronx \
        neuronx_distributed \
        neuronx_distributed_inference \
    && rm -rf ~/.cache/pip/*
177+
178+
# "prod" stage: the same Neuron components as "repo", each pinned to the
# version given by the corresponding NEURONX_*_VERSION build argument.
FROM base AS prod

# Pinned apt packages.
RUN apt-get update \
    && apt-get install -y \
        aws-neuronx-tools=${NEURONX_TOOLS_VERSION} \
        aws-neuronx-collectives=${NEURONX_COLLECTIVES_LIB_VERSION} \
        aws-neuronx-runtime-lib=${NEURONX_RUNTIME_LIB_VERSION} \
    && rm -rf /var/lib/apt/lists/* /tmp/tmp* \
    && apt-get clean

# Pinned pip packages.
RUN ${PIP} install --no-cache-dir \
        neuronx-cc==${NEURONX_CC_VERSION} \
        torch-neuronx==${NEURONX_FRAMEWORK_VERSION} \
        neuronx_distributed==${NEURONX_DISTRIBUTED_VERSION} \
        neuronx_distributed_inference==${NEURONX_DISTRIBUTED_INFERENCE_VERSION} \
    && rm -rf ~/.cache/pip/*
196+
197+
# Final image: built on top of whichever stage BUILD_STAGE names.
FROM ${BUILD_STAGE} AS final

# NOTE(review): presumably the TorchServe inference (8080) and management
# (8081) ports -- confirm against config.properties.
EXPOSE 8080
EXPOSE 8081

# The Python entrypoint wrapper receives the serving script as its default argument.
ENTRYPOINT ["python", "/usr/local/bin/dockerd-entrypoint.py"]
CMD ["/usr/local/bin/entrypoint.sh"]

# The container is reported unhealthy when /ping on port 8080 does not answer successfully.
HEALTHCHECK CMD curl --fail http://localhost:8080/ping || exit 1

0 commit comments

Comments
 (0)