1- FROM public.ecr.aws/docker/library/ubuntu:22.04
1+ ARG BUILD_STAGE=prod
2+
3+ FROM public.ecr.aws/docker/library/ubuntu:22.04 AS base
24
35LABEL dlc_major_version="1"
46LABEL maintainer="Amazon AI"
57
6- # Neuron SDK components version numbers
7- ARG NEURONX_RUNTIME_LIB_VERSION=2.24.53.0-f239092cc
8- ARG NEURONX_COLLECTIVES_LIB_VERSION=2.24.59.0-838c7fc8b
9- ARG NEURONX_TOOLS_VERSION=2.22.61.0
10- ARG NEURONX_CC_VERSION=2.17.194.0
11- ARG NEURONX_JAX_TRAINING_VERSION=0.1.3
12-
8+ # This arg required to stop docker build waiting for region configuration while installing tz data from ubuntu 22
9+ ARG DEBIAN_FRONTEND=noninteractive
1310ARG PYTHON=python3.10
1411ARG PYTHON_VERSION=3.10.12
1512ARG PIP=pip3
1613ARG OMPI_VERSION=4.1.5
1714
18- # This arg required to stop docker build waiting for region configuration while installing tz data from ubuntu 22
19- ARG DEBIAN_FRONTEND=noninteractive
20-
2115# Python won’t try to write .pyc or .pyo files on the import of source modules
2216# Force stdin, stdout and stderr to be totally unbuffered. Good for logging
2317ENV PYTHONDONTWRITEBYTECODE=1
@@ -30,6 +24,7 @@ ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/opt/amazon/efa/lib"
3024ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/opt/amazon/efa/lib64"
3125ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/opt/amazon/openmpi/lib64"
3226ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/usr/local/lib"
27+ ENV PATH="/opt/aws/neuron/bin:${PATH}"
3328
3429RUN apt-get update \
3530 && apt-get upgrade -y \
@@ -86,15 +81,17 @@ RUN mkdir -p /tmp/openmpi \
8681 && rm -rf /tmp/openmpi
8782
8883# Install packages and configure SSH for MPI operator in k8s
89- RUN apt-get update && apt-get install -y openmpi-bin openssh-server \
84+ RUN apt-get update \
85+ && apt-get install -y openmpi-bin openssh-server \
9086 && mkdir -p /var/run/sshd \
9187 && echo " UserKnownHostsFile /dev/null" >> /etc/ssh/ssh_config \
9288 && echo " StrictHostKeyChecking no" >> /etc/ssh/ssh_config \
9389 && sed -i 's/#\(StrictModes \).*/\1no/g' /etc/ssh/sshd_config \
9490 && rm -rf /var/lib/apt/lists/* \
91+ && rm -rf /tmp/tmp* \
9592 && apt-get clean
9693
97- # install Python
94+ # Install Python
9895RUN wget -q https://www.python.org/ftp/python/$PYTHON_VERSION/Python-$PYTHON_VERSION.tgz \
9996 && tar -xzf Python-$PYTHON_VERSION.tgz \
10097 && cd Python-$PYTHON_VERSION \
@@ -104,8 +101,26 @@ RUN wget -q https://www.python.org/ftp/python/$PYTHON_VERSION/Python-$PYTHON_VER
104101 && ln -s /usr/local/bin/pip3 /usr/bin/pip \
105102 && ln -s /usr/local/bin/$PYTHON /usr/local/bin/python \
106103 && ${PIP} --no-cache-dir install --upgrade \
104+ "awscli<2" \
107105 pip \
108- setuptools
106+ requests \
107+ setuptools \
108+ && rm -rf ~/.cache/pip/*
109+
110+ # Install EFA; the downloaded installer bundle (tarball, signature, key, extracted dir) is deleted in the same layer so it is not baked into the image
111+ RUN apt-get update \
112+ && cd $HOME \
113+ && curl -O https://efa-installer.amazonaws.com/aws-efa-installer-latest.tar.gz \
114+ && wget https://efa-installer.amazonaws.com/aws-efa-installer.key && gpg --import aws-efa-installer.key \
115+ && cat aws-efa-installer.key | gpg --fingerprint \
116+ && wget https://efa-installer.amazonaws.com/aws-efa-installer-latest.tar.gz.sig && gpg --verify ./aws-efa-installer-latest.tar.gz.sig \
117+ && tar -xf aws-efa-installer-latest.tar.gz \
118+ && cd aws-efa-installer \
119+ && ./efa_installer.sh -y -g --skip-kmod --skip-limit-conf --no-verify \
120+ && cd $HOME \
121+ && rm -rf aws-efa-installer* /var/lib/apt/lists/* \
122+ && rm -rf /tmp/tmp* \
123+ && apt-get clean
109124
110125WORKDIR /
111126
@@ -118,64 +133,141 @@ RUN ompi_info --parsable --all | grep mpi_built_with_cuda_support:value
118133
119134RUN mkdir -p /etc/pki/tls/certs && cp /etc/ssl/certs/ca-certificates.crt /etc/pki/tls/certs/ca-bundle.crt
120135
121- # Install Neuron Driver, Runtime and Tools
122- RUN echo "deb https://apt.repos.neuron.amazonaws.com focal main" > /etc/apt/sources.list.d/neuron.list
123- RUN wget -qO - https://apt.repos.neuron.amazonaws.com/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB | apt-key add -
136+ # Copy workaround script for incorrect hostname
137+ COPY changehostname.c /
138+ COPY --chmod=755 start_with_right_hostname.sh deep_learning_container.py /usr/local/bin/
124139
125- RUN apt-get update \
126- && apt-get install -y \
127- aws-neuronx-tools=$NEURONX_TOOLS_VERSION \
128- aws-neuronx-collectives=$NEURONX_COLLECTIVES_LIB_VERSION \
129- aws-neuronx-runtime-lib=$NEURONX_RUNTIME_LIB_VERSION \
140+ RUN HOME_DIR=/root \
141+ && curl -o ${HOME_DIR}/oss_compliance.zip https://aws-dlinfra-utilities.s3.amazonaws.com/oss_compliance.zip \
142+ && unzip ${HOME_DIR}/oss_compliance.zip -d ${HOME_DIR}/ \
143+ && cp ${HOME_DIR}/oss_compliance/test/testOSSCompliance /usr/local/bin/testOSSCompliance \
144+ && chmod +x /usr/local/bin/testOSSCompliance \
145+ && chmod +x ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh \
146+ && ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh ${HOME_DIR} ${PYTHON} \
147+ && rm -rf ${HOME_DIR}/oss_compliance* \
148+ && rm -rf /tmp/tmp*
149+
150+ # Set up the APT and PIP repos for Neuron artifacts. NEURON_*_REPO may be a bare host or an https:// URL: a leading "https://" is stripped before the URL is rebuilt, so the scheme is never duplicated
151+ ARG NEURON_APT_REPO=https://apt.repos.neuron.amazonaws.com
152+ ARG NEURON_APT_REPO_KEY
153+ ARG NEURON_PIP_REPO=https://pip.repos.neuron.amazonaws.com
154+ ARG NEURON_PIP_REPO_KEY
155+ RUN mkdir -p /etc/apt/keyrings \
156+ && APT_REPO_PREFIX=$([ -n "${NEURON_APT_REPO_KEY}" ] && echo "${NEURON_APT_REPO_KEY}@" || echo "") \
157+ && echo "deb [signed-by=/etc/apt/keyrings/neuron.gpg] https://${APT_REPO_PREFIX}${NEURON_APT_REPO#https://} focal main" > /etc/apt/sources.list.d/neuron.list \
158+ && curl $([ -n "${NEURON_APT_REPO_KEY}" ] && echo "-u ${NEURON_APT_REPO_KEY}") -sSL "https://${NEURON_APT_REPO#https://}/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB" | gpg --dearmor > /etc/apt/keyrings/neuron.gpg \
159+ && PIP_REPO_URL=$([ -n "${NEURON_PIP_REPO_KEY}" ] && echo "https://${NEURON_PIP_REPO_KEY}@${NEURON_PIP_REPO#https://}" || echo "https://${NEURON_PIP_REPO#https://}") \
160+ && ${PIP} config set global.extra-index-url "${PIP_REPO_URL}"
161+
162+ # Neuron SDK components version numbers
163+ ARG NEURON_ARTIFACT_PATH=/root/neuron_artifacts
164+ ARG IGNORE_MISSING_NEURON_COMPONENTS=false
165+ # NOTE: a shell assignment made in its own RUN is discarded when that RUN exits, so any case normalisation of IGNORE_MISSING_NEURON_COMPONENTS must happen inside the RUN that reads it
166+
167+ ARG NEURONX_RUNTIME_LIB_VERSION=2.25.57.0-166c7a468
168+ ARG NEURONX_COLLECTIVES_LIB_VERSION=2.25.65.0-9858ac9a1
169+ ARG NEURONX_TOOLS_VERSION=2.23.9.0
170+
171+ ARG NEURONX_CC_VERSION=2.18.121.0
172+ ARG NEURONX_JAX_TRAINING_VERSION=0.5.3.1.0.719+1d9c17be
173+
174+ FROM base AS dev
175+ # Install the Neuron apt packages: if the version ARG names a file in the bind-mounted artifact dir that file is installed, otherwise the pinned repo version is installed unless IGNORE_MISSING_NEURON_COMPONENTS (compared case-insensitively) is not "false"
176+ RUN --mount=type=bind,source=apt,target=${NEURON_ARTIFACT_PATH}/apt \
177+ install_apt_package() { \
178+ pkg_name=$1; \
179+ version_arg=$2; \
180+ if [ -f "${NEURON_ARTIFACT_PATH}/apt/${version_arg}" ]; then \
181+ apt-get install -y "${NEURON_ARTIFACT_PATH}/apt/${version_arg}"; \
182+ elif [ "$(echo "${IGNORE_MISSING_NEURON_COMPONENTS}" | tr '[:upper:]' '[:lower:]')" = "false" ]; then \
183+ apt-get install -y ${pkg_name}=${version_arg}; \
184+ else \
185+ echo "Ignoring package ${pkg_name}"; \
186+ fi; \
187+ } \
188+ && apt-get update \
189+ && install_apt_package "aws-neuronx-collectives" "${NEURONX_COLLECTIVES_LIB_VERSION}" \
190+ && install_apt_package "aws-neuronx-runtime-lib" "${NEURONX_RUNTIME_LIB_VERSION}" \
191+ && install_apt_package "aws-neuronx-tools" "${NEURONX_TOOLS_VERSION}" \
130192 && rm -rf /var/lib/apt/lists/* \
131193 && rm -rf /tmp/tmp* \
132194 && apt-get clean
133195
134- # Add Neuron PATH
135- ENV PATH="/opt/aws/neuron/bin:${PATH}"
196+ RUN --mount=type=bind,source=pip,target=${NEURON_ARTIFACT_PATH}/pip \
197+ install_pip_package() { \
198+ packages=""; \
199+ flags=""; \
200+ while [ "$#" -gt 0 ]; do \
201+ pkg_name=$(echo $1 | cut -d: -f1); \
202+ version_arg=$(echo $1 | cut -d: -f2); \
203+ extra_flags=$(echo $1 | cut -d: -f3); \
204+ if [ -f "${NEURON_ARTIFACT_PATH}/pip/${version_arg}" ]; then \
205+ packages="${packages} ${NEURON_ARTIFACT_PATH}/pip/${version_arg}"; \
206+ else \
207+ if [ "$(echo "${IGNORE_MISSING_NEURON_COMPONENTS}" | tr '[:upper:]' '[:lower:]')" = "false" ]; then \
208+ packages="${packages} ${pkg_name}==${version_arg}"; \
209+ else \
210+ echo "Ignoring package ${pkg_name}"; \
211+ fi; \
212+ fi; \
213+ # Each argument is "name:version:flags"; collect the space-separated flags once each so duplicates are not passed to pip
214+ if [ ! -z "${extra_flags}" ]; then \
215+ for flag in $(echo "${extra_flags}" | tr ' ' '\n'); do \
216+ case " ${flags} " in \
217+ *" ${flag} "*) ;; \
218+ *) flags="${flags} ${flag}" ;; \
219+ esac \
220+ done; \
221+ fi; \
222+ shift; \
223+ done; \
224+ if [ ! -z "${packages}" ]; then \
225+ echo "Installing packages: ${packages} with flags ${flags}"; \
226+ ${PIP} install --no-cache-dir --force-reinstall \
227+ --extra-index-url="file:///${NEURON_ARTIFACT_PATH}/pip" \
228+ ${packages} ${flags}; \
229+ fi; \
230+ } \
231+ && install_pip_package "neuronx-cc:${NEURONX_CC_VERSION}:" "jax-neuronx:${NEURONX_JAX_TRAINING_VERSION}:" \
232+ && rm -rf ~/.cache/pip/*
136233
137- # Install AWS CLI
138- RUN ${PIP} install --no-cache-dir -U "awscli<2"
234+ FROM base AS repo
139235
140- # Install JAX & Neuron CC
141- RUN ${PIP} config set global.extra-index-url https://pip.repos.neuron.amazonaws.com \
142- && ${PIP} install --force-reinstall neuronx-cc==$NEURONX_CC_VERSION --extra-index-url https://pip.repos.neuron.amazonaws.com \
143- && ${PIP} install --force-reinstall jax-neuronx==$NEURONX_JAX_TRAINING_VERSION --extra-index-url https://pip.repos.neuron.amazonaws.com
236+ # Repo stage: take whatever Neuron versions the configured apt and pip repos currently serve (no pins)
237+ RUN apt-get update \
238+ && apt-get install -y \
239+ aws-neuronx-collectives \
240+ aws-neuronx-runtime-lib \
241+ aws-neuronx-tools \
242+ && rm -rf /var/lib/apt/lists/* \
243+ && rm -rf /tmp/tmp* \
244+ && apt-get clean \
245+ && ${PIP} install --no-cache-dir --force-reinstall \
246+ jax-neuronx \
247+ neuronx-cc \
248+ && rm -rf ~/.cache/pip/*
144249
145- # EFA Installer does apt get. Make sure to run apt update before that
146- RUN apt-get update
147- RUN cd $HOME \
148- && curl -O https://efa-installer.amazonaws.com/aws-efa-installer-latest.tar.gz \
149- && wget https://efa-installer.amazonaws.com/aws-efa-installer.key && gpg --import aws-efa-installer.key \
150- && cat aws-efa-installer.key | gpg --fingerprint \
151- && wget https://efa-installer.amazonaws.com/aws-efa-installer-latest.tar.gz.sig && gpg --verify ./aws-efa-installer-latest.tar.gz.sig \
152- && tar -xf aws-efa-installer-latest.tar.gz \
153- && cd aws-efa-installer \
154- && ./efa_installer.sh -y -g -d --skip-kmod --skip-limit-conf --no-verify \
155- && cd $HOME
156250
157- # Clean up after apt update
158- RUN rm -rf /var/lib/apt/lists/* \
251+ FROM base AS prod
252+
253+ # Prod stage: Neuron system packages
254+ # Collectives, runtime library and tools, each pinned to its NEURONX_*_VERSION build arg
255+ RUN apt-get update \
256+ && apt-get install -y \
257+ aws-neuronx-collectives=${NEURONX_COLLECTIVES_LIB_VERSION} \
258+ aws-neuronx-runtime-lib=${NEURONX_RUNTIME_LIB_VERSION} \
259+ aws-neuronx-tools=${NEURONX_TOOLS_VERSION} \
260+ && rm -rf /var/lib/apt/lists/* \
159261 && rm -rf /tmp/tmp* \
160262 && apt-get clean
161263
162- # Copy workaround script for incorrect hostname
163- COPY changehostname.c /
164- COPY start_with_right_hostname.sh /usr/local/bin/start_with_right_hostname.sh
165- COPY deep_learning_container.py /usr/local/bin/deep_learning_container.py
166-
167- RUN chmod +x /usr/local/bin/start_with_right_hostname.sh \
168- && chmod +x /usr/local/bin/deep_learning_container.py
264+ # Prod stage: pinned JAX Neuron plugin and Neuron compiler from the configured pip index
265+ RUN ${PIP} install --no-cache-dir --force-reinstall \
266+ jax-neuronx==${NEURONX_JAX_TRAINING_VERSION} \
267+ neuronx-cc==${NEURONX_CC_VERSION} \
268+ && rm -rf ~/.cache/pip/*
169269
170- RUN HOME_DIR=/root \
171- && curl -o ${HOME_DIR}/oss_compliance.zip https://aws-dlinfra-utilities.s3.amazonaws.com/oss_compliance.zip \
172- && unzip ${HOME_DIR}/oss_compliance.zip -d ${HOME_DIR}/ \
173- && cp ${HOME_DIR}/oss_compliance/test/testOSSCompliance /usr/local/bin/testOSSCompliance \
174- && chmod +x /usr/local/bin/testOSSCompliance \
175- && chmod +x ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh \
176- && ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh ${HOME_DIR} ${PYTHON} \
177- && rm -rf ${HOME_DIR}/oss_compliance* \
178- && rm -rf /tmp/tmp*
270+ FROM ${BUILD_STAGE} AS final
179271
180272# Starts framework
181273ENTRYPOINT ["bash", "-m", "start_with_right_hostname.sh"]
0 commit comments