Skip to content

Commit 0a7317b

Browse files
Updated Dockerfiles for 2.25 Release
Co-authored-by: Ahsan Khan <[email protected]>
1 parent 5861f4a commit 0a7317b

File tree

4 files changed

+260
-22
lines changed

4 files changed

+260
-22
lines changed

docker/jax/training/0.6/Dockerfile.neuronx

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -164,12 +164,12 @@ ARG NEURON_ARTIFACT_PATH=/root/neuron_artifacts
164164
ARG IGNORE_MISSING_NEURON_COMPONENTS=false
165165
RUN IGNORE_MISSING_NEURON_COMPONENTS=$(echo ${IGNORE_MISSING_NEURON_COMPONENTS} | tr '[:upper:]' '[:lower:]')
166166

167-
ARG NEURONX_RUNTIME_LIB_VERSION=2.26.42.0-2ff3b5c7d
168-
ARG NEURONX_COLLECTIVES_LIB_VERSION=2.26.43.0-47cc904ea
169-
ARG NEURONX_TOOLS_VERSION=2.24.54.0
167+
ARG NEURONX_RUNTIME_LIB_VERSION=2.27.23.0-8deec4dbf
168+
ARG NEURONX_COLLECTIVES_LIB_VERSION=2.27.34.0-ec8cd5e8b
169+
ARG NEURONX_TOOLS_VERSION=2.25.145.0
170170

171-
ARG NEURONX_CC_VERSION=2.19.8089.0
172-
ARG NEURONX_JAX_TRAINING_VERSION=0.6.0.1.0.1296+1f770067
171+
ARG NEURONX_CC_VERSION=2.20.9961.0
172+
ARG NEURONX_JAX_TRAINING_VERSION=0.6.1.1.0.3499+2edccbed
173173

174174
FROM base AS repo
175175

docker/pytorch/inference/2.7.0/Dockerfile.neuronx

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -149,15 +149,15 @@ ARG NEURON_ARTIFACT_PATH=/root/neuron_artifacts
149149
ARG IGNORE_MISSING_NEURON_COMPONENTS=false
150150
RUN IGNORE_MISSING_NEURON_COMPONENTS=$(echo ${IGNORE_MISSING_NEURON_COMPONENTS} | tr '[:upper:]' '[:lower:]')
151151

152-
ARG NEURONX_COLLECTIVES_LIB_VERSION=2.26.43.0-47cc904ea
153-
ARG NEURONX_RUNTIME_LIB_VERSION=2.26.42.0-2ff3b5c7d
154-
ARG NEURONX_TOOLS_VERSION=2.24.54.0
155-
156-
ARG NEURONX_CC_VERSION=2.19.8089.0
157-
ARG NEURONX_FRAMEWORK_VERSION=2.7.0.2.8.6734+ac864f72
158-
ARG NEURONX_TRANSFORMERS_VERSION=0.13.985
159-
ARG NEURONX_DISTRIBUTED_VERSION=0.13.14393+b8569585
160-
ARG NEURONX_DISTRIBUTED_INFERENCE_VERSION=0.4.7422+9483d307
152+
ARG NEURONX_COLLECTIVES_LIB_VERSION=2.27.34.0-ec8cd5e8b
153+
ARG NEURONX_RUNTIME_LIB_VERSION=2.27.23.0-8deec4dbf
154+
ARG NEURONX_TOOLS_VERSION=2.25.145.0
155+
156+
ARG NEURONX_CC_VERSION=2.20.9961.0
157+
ARG NEURONX_FRAMEWORK_VERSION=2.7.0.2.9.9357+08e1f40d
158+
ARG NEURONX_TRANSFORMERS_VERSION=0.13.1216
159+
ARG NEURONX_DISTRIBUTED_VERSION=0.14.18461+9ac233f2
160+
ARG NEURONX_DISTRIBUTED_INFERENCE_VERSION=0.5.9230+dcf1e2da
161161

162162
FROM base AS repo
163163

docker/pytorch/training/2.7.0/Dockerfile.neuronx

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -224,14 +224,14 @@ ARG NEURON_ARTIFACT_PATH=/root/neuron_artifacts
224224
ARG IGNORE_MISSING_NEURON_COMPONENTS=false
225225
RUN IGNORE_MISSING_NEURON_COMPONENTS=$(echo ${IGNORE_MISSING_NEURON_COMPONENTS} | tr '[:upper:]' '[:lower:]')
226226

227-
ARG NEURONX_COLLECTIVES_LIB_VERSION=2.26.43.0-47cc904ea
228-
ARG NEURONX_RUNTIME_LIB_VERSION=2.26.42.0-2ff3b5c7d
229-
ARG NEURONX_TOOLS_VERSION=2.24.54.0
230-
231-
ARG NEURONX_FRAMEWORK_VERSION=2.7.0.2.8.6734+ac864f72
232-
ARG NEURONX_CC_VERSION=2.19.8089.0
233-
ARG NEURONX_DISTRIBUTED_VERSION=0.13.14393+b8569585
234-
ARG NEURONX_DISTRIBUTED_TRAINING_VERSION=1.4.1
227+
ARG NEURONX_COLLECTIVES_LIB_VERSION=2.27.34.0-ec8cd5e8b
228+
ARG NEURONX_RUNTIME_LIB_VERSION=2.27.23.0-8deec4dbf
229+
ARG NEURONX_TOOLS_VERSION=2.25.145.0
230+
231+
ARG NEURONX_FRAMEWORK_VERSION=2.7.0.2.9.9357+08e1f40d
232+
ARG NEURONX_CC_VERSION=2.20.9961.0
233+
ARG NEURONX_DISTRIBUTED_VERSION=0.14.18461+9ac233f2
234+
ARG NEURONX_DISTRIBUTED_TRAINING_VERSION=1.5.0
235235

236236
FROM base AS repo
237237

Lines changed: 238 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,238 @@
1+
ARG BUILD_STAGE=prod
2+
3+
FROM public.ecr.aws/docker/library/ubuntu:22.04 AS base
4+
5+
LABEL dlc_major_version="1"
6+
LABEL maintainer="Amazon AI"
7+
8+
ARG DEBIAN_FRONTEND=noninteractive
9+
ARG PIP=pip3
10+
ARG PYTHON=python3.10
11+
ARG PYTHON_VERSION=3.10.12
12+
ARG MAMBA_VERSION=23.1.0-4
13+
ARG TORCHSERVE_VERSION=0.11.0
14+
15+
16+
# See http://bugs.python.org/issue19846
17+
ENV LANG=C.UTF-8
18+
ENV LD_LIBRARY_PATH=/opt/aws/neuron/lib:/lib/x86_64-linux-gnu:/opt/conda/lib/:$LD_LIBRARY_PATH
19+
ENV PATH=/opt/conda/bin:/opt/aws/neuron/bin:$PATH
20+
ENV VLLM_TARGET_DEVICE=neuron
21+
22+
RUN apt-get update \
23+
&& apt-get upgrade -y \
24+
&& apt-get install -y --no-install-recommends \
25+
apt-transport-https \
26+
build-essential \
27+
ca-certificates \
28+
cmake \
29+
curl \
30+
emacs \
31+
ffmpeg \
32+
git \
33+
gnupg2 \
34+
gpg-agent \
35+
jq \
36+
libgl1 \
37+
libgl1-mesa-glx \
38+
libglib2.0-0 \
39+
libsm6 \
40+
libxext6 \
41+
libxrender-dev \
42+
libcap-dev \
43+
libhwloc-dev \
44+
openssh-client \
45+
openjdk-11-jdk \
46+
unzip \
47+
vim \
48+
wget \
49+
zlib1g-dev \
50+
&& rm -rf /var/lib/apt/lists/* \
51+
&& rm -rf /tmp/tmp* \
52+
&& apt-get clean
53+
54+
# https://github.com/docker-library/openjdk/issues/261 https://github.com/docker-library/openjdk/pull/263/files
55+
RUN keytool -importkeystore -srckeystore /etc/ssl/certs/java/cacerts -destkeystore /etc/ssl/certs/java/cacerts.jks -deststoretype JKS -srcstorepass changeit -deststorepass changeit -noprompt; \
56+
mv /etc/ssl/certs/java/cacerts.jks /etc/ssl/certs/java/cacerts; \
57+
/var/lib/dpkg/info/ca-certificates-java.postinst configure;
58+
59+
RUN curl -L -o ~/mambaforge.sh https://github.com/conda-forge/miniforge/releases/download/${MAMBA_VERSION}/Mambaforge-${MAMBA_VERSION}-Linux-x86_64.sh \
60+
&& chmod +x ~/mambaforge.sh \
61+
&& ~/mambaforge.sh -b -p /opt/conda \
62+
&& rm ~/mambaforge.sh \
63+
&& /opt/conda/bin/conda update -y conda \
64+
&& /opt/conda/bin/mamba install -c conda-forge -y \
65+
python=$PYTHON_VERSION \
66+
pyopenssl \
67+
cython \
68+
mkl-include \
69+
mkl \
70+
parso \
71+
typing \
72+
# Below 2 are included in miniconda base, but not mamba so need to install
73+
conda-content-trust \
74+
charset-normalizer \
75+
&& /opt/conda/bin/conda clean -ya
76+
77+
RUN /opt/conda/bin/mamba install -c conda-forge \
78+
scikit-learn \
79+
h5py \
80+
requests \
81+
&& conda clean -ya \
82+
&& pip install --upgrade pip \
83+
--trusted-host pypi.org --trusted-host files.pythonhosted.org \
84+
&& ln -s /opt/conda/bin/pip /usr/local/bin/pip3 \
85+
&& pip install \
86+
packaging \
87+
enum-compat \
88+
ipython \
89+
&& rm -rf ~/.cache/pip/*
90+
91+
COPY --chmod=755 vllm_entrypoint.py neuron-monitor.sh deep_learning_container.py /usr/local/bin/
92+
# Copy the Vllm Installation files
93+
COPY --chmod=755 vllm_requirements.txt /root/
94+
95+
### Mount Point ###
96+
# When launching the container, mount the code directory to /workspace
97+
# APP_MOUNT is the directory users are expected to bind-mount their code onto.
# VOLUME must be either valid JSON (double-quoted strings) or a plain
# whitespace-separated list. The previous `[ ${APP_MOUNT} ]` was neither valid
# JSON nor a single path, so it was split on whitespace and declared three
# volumes: "[", the mount path, and "]". The quoted JSON form declares only
# the intended path.
ARG APP_MOUNT=/workspace
98+
VOLUME ["${APP_MOUNT}"]
99+
WORKDIR ${APP_MOUNT}/vllm
100+
101+
RUN ${PIP} install --upgrade pip --trusted-host pypi.org --trusted-host files.pythonhosted.org \
102+
&& ${PIP} install --no-cache-dir -U \
103+
"opencv-python" \
104+
"scipy" \
105+
"awscli" \
106+
"pandas" \
107+
"boto3" \
108+
"cryptography" \
109+
"ninja" \
110+
"pytest" \
111+
"packaging" \
112+
"wheel" \
113+
"cmake>=3.26" \
114+
"setuptools-scm>=8" \
115+
"jinja2" \
116+
torchserve==${TORCHSERVE_VERSION} \
117+
torch-model-archiver==${TORCHSERVE_VERSION} \
118+
&& ${PIP} install --no-deps --no-cache-dir -U torchvision \
119+
&& ${PIP} install --no-cache-dir -r /root/vllm_requirements.txt \
120+
&& rm -rf ~/.cache/pip/*
121+
122+
RUN useradd -m model-server \
123+
&& mkdir -p /home/model-server/tmp /opt/ml/model \
124+
&& chown -R model-server /home/model-server /opt/ml/model
125+
COPY config.properties /home/model-server
126+
127+
# Compliance
128+
RUN HOME_DIR=/root \
129+
&& curl -o ${HOME_DIR}/oss_compliance.zip https://aws-dlinfra-utilities.s3.amazonaws.com/oss_compliance.zip \
130+
&& unzip ${HOME_DIR}/oss_compliance.zip -d ${HOME_DIR}/ \
131+
&& cp ${HOME_DIR}/oss_compliance/test/testOSSCompliance /usr/local/bin/testOSSCompliance \
132+
&& chmod +x /usr/local/bin/testOSSCompliance \
133+
&& chmod +x ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh \
134+
&& ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh ${HOME_DIR} ${PYTHON} \
135+
&& rm -rf ${HOME_DIR}/oss_compliance* \
136+
# conda leaves an empty /root/.cache/conda/notices.cache file which is not removed by conda clean -ya
137+
&& rm -rf ${HOME_DIR}/.cache/conda
138+
139+
# Setting up APT and PIP repo for neuron artifacts
140+
ARG NEURON_APT_REPO=https://apt.repos.neuron.amazonaws.com
141+
ARG NEURON_APT_REPO_KEY
142+
ARG NEURON_PIP_REPO=https://pip.repos.neuron.amazonaws.com
143+
ARG NEURON_PIP_REPO_KEY
144+
# Register the Neuron apt repository (with its signing key) and add the Neuron
# pip index as an extra index URL.
#   NEURON_APT_REPO / NEURON_PIP_REPO         repository host, with or without a scheme
#   NEURON_APT_REPO_KEY / NEURON_PIP_REPO_KEY optional credentials, injected as "<key>@"
# Every URL below is built as "https://[<key>@]<host>". The ${VAR#*://}
# expansion drops a leading "<scheme>://" if one is present, so a value such
# as "https://apt.repos.neuron.amazonaws.com" no longer produces
# "https://https://..." while a bare host name passes through unchanged.
RUN mkdir -p /etc/apt/keyrings \
145+
&& APT_REPO_PREFIX=$([ -n "${NEURON_APT_REPO_KEY}" ] && echo "${NEURON_APT_REPO_KEY}@" || echo "") \
146+
&& echo "deb [signed-by=/etc/apt/keyrings/neuron.gpg] https://${APT_REPO_PREFIX}${NEURON_APT_REPO#*://} focal main" > /etc/apt/sources.list.d/neuron.list \
147+
&& curl $([ -n "${NEURON_APT_REPO_KEY}" ] && echo "-u ${NEURON_APT_REPO_KEY}") -sSL "https://${NEURON_APT_REPO#*://}/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB" | gpg --dearmor > /etc/apt/keyrings/neuron.gpg \
148+
&& PIP_REPO_URL=$([ -n "${NEURON_PIP_REPO_KEY}" ] && echo "https://${NEURON_PIP_REPO_KEY}@${NEURON_PIP_REPO#*://}" || echo "https://${NEURON_PIP_REPO#*://}") \
149+
&& ${PIP} config set global.extra-index-url "${PIP_REPO_URL}"
150+
151+
# Neuron SDK components version numbers
152+
ARG NEURON_ARTIFACT_PATH=/root/neuron_artifacts
153+
ARG IGNORE_MISSING_NEURON_COMPONENTS=false
154+
# Lower-cases the IGNORE_MISSING_NEURON_COMPONENTS build argument.
# NOTE(review): the assignment happens inside this RUN's own shell, so the
# lower-cased value is not visible to any later instruction; nothing further
# down in this file reads it. Confirm whether this step is still needed.
RUN IGNORE_MISSING_NEURON_COMPONENTS=$(echo ${IGNORE_MISSING_NEURON_COMPONENTS} | tr '[:upper:]' '[:lower:]')
155+
156+
ARG NEURONX_COLLECTIVES_LIB_VERSION=2.27.34.0-ec8cd5e8b
157+
ARG NEURONX_RUNTIME_LIB_VERSION=2.27.23.0-8deec4dbf
158+
ARG NEURONX_TOOLS_VERSION=2.25.145.0
159+
160+
ARG NEURONX_CC_VERSION=2.20.9961.0
161+
ARG NEURONX_FRAMEWORK_VERSION=2.7.0.2.9.9357+08e1f40d
162+
ARG NEURONX_DISTRIBUTED_VERSION=0.14.18461+9ac233f2
163+
ARG NEURONX_DISTRIBUTED_INFERENCE_VERSION=0.5.9230+dcf1e2da
164+
165+
FROM base AS vllm-clone
166+
167+
RUN mkdir -p /root/.ssh && \
168+
echo "StrictHostKeyChecking no" >> /root/.ssh/config && \
169+
ssh-keyscan -t rsa github.com >> /root/.ssh/known_hosts
170+
171+
WORKDIR /vllm
172+
173+
# Clone the release-2.25 branch of the private vLLM staging repository into the
# current WORKDIR over SSH. The private key is supplied as the BuildKit secret
# "ssh_key" and is mounted only for the duration of this RUN, so it is not
# written into any image layer. The remote uses GitHub's SSH form
# (git@github.com:<org>/<repo>.git), matching the github.com host key collected
# earlier in this stage.
RUN --mount=type=secret,id=ssh_key,target=/root/.ssh/id_ed25519,mode=0600 \
174+
git clone -b release-2.25 git@github.com:aws-neuron/private-neuronx-vllm-staging.git .
175+
176+
FROM base AS repo
177+
178+
# Install Neuron components from the apt and pip repos (latest versions)
179+
RUN apt-get update \
180+
&& apt-get install -y \
181+
aws-neuronx-tools \
182+
aws-neuronx-collectives \
183+
aws-neuronx-runtime-lib \
184+
&& rm -rf /var/lib/apt/lists/* \
185+
&& rm -rf /tmp/tmp* \
186+
&& apt-get clean
187+
188+
RUN ${PIP} install --no-cache-dir \
189+
neuronx-cc \
190+
torch-neuronx \
191+
neuronx_distributed \
192+
neuronx_distributed_inference \
193+
&& rm -rf ~/.cache/pip/*
194+
195+
# Install VLLM from source
196+
COPY --from=vllm-clone /vllm /opt/vllm
197+
WORKDIR /opt/vllm
198+
199+
RUN ${PIP} install --no-cache-dir -r requirements/neuron.txt \
200+
&& VLLM_TARGET_DEVICE="neuron" ${PIP} install --no-cache-dir -e .
201+
202+
FROM base AS prod
203+
204+
# Install Neuron components with specific versions
205+
RUN apt-get update \
206+
&& apt-get install -y \
207+
aws-neuronx-tools=$NEURONX_TOOLS_VERSION \
208+
aws-neuronx-collectives=$NEURONX_COLLECTIVES_LIB_VERSION \
209+
aws-neuronx-runtime-lib=$NEURONX_RUNTIME_LIB_VERSION \
210+
&& rm -rf /var/lib/apt/lists/* \
211+
&& rm -rf /tmp/tmp* \
212+
&& apt-get clean
213+
214+
RUN ${PIP} install --no-cache-dir \
215+
neuronx-cc==$NEURONX_CC_VERSION \
216+
torch-neuronx==$NEURONX_FRAMEWORK_VERSION \
217+
neuronx_distributed==$NEURONX_DISTRIBUTED_VERSION \
218+
neuronx_distributed_inference==$NEURONX_DISTRIBUTED_INFERENCE_VERSION \
219+
&& rm -rf ~/.cache/pip/*
220+
221+
# Install VLLM from source
222+
# Install vLLM (Neuron fork) from source in editable mode.
# An editable install leaves the package in the checkout and only links it
# into site-packages, so the checkout has to outlive this RUN. It is cloned to
# /opt/vllm (the same location the "repo" stage uses) and kept; only the .git
# metadata is removed to save space, which is safe here because the version is
# supplied through SETUPTOOLS_SCM_PRETEND_VERSION rather than read from git.
# NOTE(review): the branch and pretend-version still say 2.24 while the other
# pins in this file target the 2.25 release - confirm this is intended.
RUN cd /opt \
223+
&& git clone -b neuron-2.24-vllm-v0.7.2 https://github.com/aws-neuron/upstreaming-to-vllm.git vllm \
224+
&& cd vllm \
225+
&& ${PIP} install --no-cache-dir -r requirements/neuron.txt \
226+
&& SETUPTOOLS_SCM_PRETEND_VERSION="2.24.0.0" VLLM_TARGET_DEVICE="neuron" ${PIP} install --no-cache-dir -e . \
227+
&& cd / \
228+
&& rm -rf /opt/vllm/.git
229+
230+
WORKDIR ${APP_MOUNT}/vllm
231+
232+
FROM ${BUILD_STAGE} AS final
233+
234+
EXPOSE 8080 8081
235+
236+
ENTRYPOINT ["python", "/usr/local/bin/vllm_entrypoint.py"]
237+
CMD ["/bin/bash"]
238+
HEALTHCHECK CMD curl --fail http://localhost:8080/ping || exit 1

0 commit comments

Comments
 (0)