# Name of the stage that the last FROM in this file turns into the final image
# (the stages defined below are dev, repo and prod).
ARG BUILD_STAGE=prod

FROM public.ecr.aws/docker/library/ubuntu:22.04 AS base

LABEL dlc_major_version="1" \
      maintainer="Amazon AI"

# Build-time only: stops the build from waiting on the tzdata region prompt on Ubuntu 22
ARG DEBIAN_FRONTEND=noninteractive
# Interpreter name and exact release built from source, plus the pip launcher used by RUN steps
ARG PYTHON=python3.10
ARG PYTHON_VERSION=3.10.12
ARG PIP=pip3
# Open MPI release compiled from source
ARG OMPI_VERSION=4.1.5
14+
# PYTHONDONTWRITEBYTECODE: Python won't write .pyc or .pyo files when importing source modules.
# PYTHONUNBUFFERED: stdin, stdout and stderr are totally unbuffered, which is good for logging.
ENV PYTHONDONTWRITEBYTECODE=1 \
    PYTHONUNBUFFERED=1 \
    PYTHONIOENCODING=UTF-8 \
    LANG=C.UTF-8 \
    LC_ALL=C.UTF-8

# Library search path for the Neuron, EFA and Open MPI install locations.
# ${VAR:+...} prepends the inherited value only when it is non-empty. Expanding an
# unset LD_LIBRARY_PATH directly would leave a leading ':' (an empty entry), which
# the dynamic loader treats as "search the current directory".
ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH:+${LD_LIBRARY_PATH}:}/opt/aws/neuron/lib:/opt/amazon/efa/lib:/opt/amazon/efa/lib64:/opt/amazon/openmpi/lib64:/usr/local/lib"
ENV PATH="/opt/aws/neuron/bin:${PATH}"
28+
# Upgrade the base OS packages and install the build toolchain, development headers,
# JDKs, editors and CLI utilities used by later steps. The apt metadata and temp files
# are removed in the same layer.
RUN apt-get update \
 && apt-get upgrade -y \
 && apt-get install -y --no-install-recommends \
    build-essential \
    ca-certificates \
    cmake \
    curl \
    emacs \
    git \
    gnupg2 \
    gpg-agent \
    jq \
    libbz2-dev \
    libc6-dev \
    libcap-dev \
    libffi-dev \
    libgdbm-dev \
    libgl1-mesa-glx \
    libglib2.0-0 \
    libhwloc-dev \
    libncurses-dev \
    libopencv-dev \
    libsm6 \
    libsqlite3-dev \
    libssl-dev \
    libxext6 \
    libxrender-dev \
    openjdk-11-jdk \
    openjdk-8-jdk \
    openjdk-8-jdk-headless \
    openjdk-8-jre \
    openssl \
    software-properties-common \
    tk-dev \
    unzip \
    vim \
    wget \
    zlib1g-dev \
 && rm -rf /var/lib/apt/lists/* \
 && rm -rf /tmp/tmp* \
 && apt-get clean
70+
# Build Open MPI ${OMPI_VERSION} from the upstream tarball in a scratch directory
# (no --prefix is passed to configure), refresh the linker cache, then delete the sources.
RUN mkdir -p /tmp/openmpi \
 && cd /tmp/openmpi \
 && wget --quiet "https://download.open-mpi.org/release/open-mpi/v4.1/openmpi-${OMPI_VERSION}.tar.gz" \
 && tar zxf "openmpi-${OMPI_VERSION}.tar.gz" \
 && cd "openmpi-${OMPI_VERSION}" \
 && ./configure --enable-orterun-prefix-by-default \
 && make -j "$(nproc)" all \
 && make install \
 && ldconfig \
 && rm -rf /tmp/openmpi
82+
# Install the packaged Open MPI binaries and an SSH server for the MPI operator in k8s.
# The SSH client is told not to record or verify host keys, and sshd's StrictModes
# check is switched off.
RUN apt-get update \
 && apt-get install -y \
    openmpi-bin \
    openssh-server \
 && mkdir -p /var/run/sshd \
 && { echo " UserKnownHostsFile /dev/null"; \
      echo " StrictHostKeyChecking no"; } >> /etc/ssh/ssh_config \
 && sed -i 's/#\(StrictModes \).*/\1no/g' /etc/ssh/sshd_config \
 && rm -rf /var/lib/apt/lists/* \
 && rm -rf /tmp/tmp* \
 && apt-get clean
93+
# Build CPython ${PYTHON_VERSION} from source into /usr/local (shared libpython),
# expose `pip` and `python` aliases, then upgrade the base Python tooling.
# The build runs in a dedicated scratch directory that is removed by absolute path,
# so cleanup does not depend on which directory the RUN step happens to start in.
RUN mkdir -p /tmp/python-build \
 && cd /tmp/python-build \
 && wget -q "https://www.python.org/ftp/python/${PYTHON_VERSION}/Python-${PYTHON_VERSION}.tgz" \
 && tar -xzf "Python-${PYTHON_VERSION}.tgz" \
 && cd "Python-${PYTHON_VERSION}" \
 && ./configure --enable-shared --prefix=/usr/local \
 && make -j "$(nproc)" && make install \
 && cd / \
 && rm -rf /tmp/python-build \
 && ln -s /usr/local/bin/pip3 /usr/bin/pip \
 && ln -s /usr/local/bin/$PYTHON /usr/local/bin/python \
 && ${PIP} --no-cache-dir install --upgrade \
    "awscli<2" \
    pip \
    requests \
    setuptools \
 && rm -rf ~/.cache/pip/*
109+
# Install EFA: download the installer, import the AWS EFA public key, verify the
# tarball's detached signature, then run the installer without the kernel module
# and limits configuration.
# The downloaded tarball, key, signature and extracted tree are deleted in this same
# layer so they do not remain in the image.
# NOTE(review): "latest" is unpinned, so rebuilds can pick up a different EFA release —
# consider pinning a version.
RUN apt-get update \
 && cd $HOME \
 && curl --fail -O https://efa-installer.amazonaws.com/aws-efa-installer-latest.tar.gz \
 && wget https://efa-installer.amazonaws.com/aws-efa-installer.key && gpg --import aws-efa-installer.key \
 && gpg --fingerprint \
 && wget https://efa-installer.amazonaws.com/aws-efa-installer-latest.tar.gz.sig && gpg --verify ./aws-efa-installer-latest.tar.gz.sig \
 && tar -xf aws-efa-installer-latest.tar.gz \
 && cd aws-efa-installer \
 && ./efa_installer.sh -y -g --skip-kmod --skip-limit-conf --no-verify \
 && cd $HOME \
 && rm -rf aws-efa-installer \
    aws-efa-installer-latest.tar.gz \
    aws-efa-installer-latest.tar.gz.sig \
    aws-efa-installer.key \
 && rm -rf /var/lib/apt/lists/* \
 && rm -rf /tmp/tmp* \
 && apt-get clean
124+
WORKDIR /

# These PATH / LD_LIBRARY_PATH additions are deliberately kept out of the ENV block
# near the top of this stage: grouping them there causes ompi_info to fail
# (only observed in CPU containers).
# NOTE(review): no step in this file installs anything under /home/.openmpi (Open MPI is
# configured without --prefix) — confirm these entries are still needed.
ENV PATH="$PATH:/home/.openmpi/bin" \
    LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/home/.openmpi/lib/"

# Build-time sanity check: the step fails unless ompi_info prints a
# mpi_built_with_cuda_support:value entry.
RUN ompi_info --parsable --all | grep mpi_built_with_cuda_support:value

# Make the Ubuntu CA bundle available at /etc/pki/tls/certs/ca-bundle.crt as well
RUN mkdir -p /etc/pki/tls/certs \
 && cp /etc/ssl/certs/ca-certificates.crt /etc/pki/tls/certs/ca-bundle.crt
135+
# Workaround sources/scripts for the incorrect-hostname issue, plus the container helper script
COPY changehostname.c /
COPY --chmod=755 start_with_right_hostname.sh deep_learning_container.py /usr/local/bin/

# Download the OSS compliance bundle, install its test binary, run the generator
# against /root with the configured Python, then remove the bundle again.
RUN OSS_ROOT=/root \
 && curl -o ${OSS_ROOT}/oss_compliance.zip https://aws-dlinfra-utilities.s3.amazonaws.com/oss_compliance.zip \
 && unzip ${OSS_ROOT}/oss_compliance.zip -d ${OSS_ROOT}/ \
 && cp ${OSS_ROOT}/oss_compliance/test/testOSSCompliance /usr/local/bin/testOSSCompliance \
 && chmod +x /usr/local/bin/testOSSCompliance \
 && chmod +x ${OSS_ROOT}/oss_compliance/generate_oss_compliance.sh \
 && ${OSS_ROOT}/oss_compliance/generate_oss_compliance.sh ${OSS_ROOT} ${PYTHON} \
 && rm -rf ${OSS_ROOT}/oss_compliance* \
 && rm -rf /tmp/tmp*
149+
# Setting up APT and PIP repo for neuron artifacts.
# NEURON_APT_REPO / NEURON_PIP_REPO may be given either as a bare host or as a full
# URL. Any leading "<scheme>://" is stripped before the URLs are composed, because
# this step always prepends "https://" itself (and inserts the optional credentials
# between the scheme and the host). Without the strip, the defaults expand to
# "https://https://...".
# NOTE(review): *_REPO_KEY values passed as build args end up in the image history,
# in /etc/apt/sources.list.d/neuron.list and in the pip config — consider BuildKit
# secret mounts.
# NOTE(review): the apt source uses the "focal" suite while the base image is
# ubuntu:22.04 — confirm this is intended.
ARG NEURON_APT_REPO=https://apt.repos.neuron.amazonaws.com
ARG NEURON_APT_REPO_KEY
ARG NEURON_PIP_REPO=https://pip.repos.neuron.amazonaws.com
ARG NEURON_PIP_REPO_KEY
RUN mkdir -p /etc/apt/keyrings \
 && APT_REPO_HOST="${NEURON_APT_REPO#*://}" \
 && PIP_REPO_HOST="${NEURON_PIP_REPO#*://}" \
 && APT_REPO_PREFIX=$([ -n "${NEURON_APT_REPO_KEY}" ] && echo "${NEURON_APT_REPO_KEY}@" || echo "") \
 && echo "deb [signed-by=/etc/apt/keyrings/neuron.gpg] https://${APT_REPO_PREFIX}${APT_REPO_HOST} focal main" > /etc/apt/sources.list.d/neuron.list \
 && curl $([ -n "${NEURON_APT_REPO_KEY}" ] && echo "-u ${NEURON_APT_REPO_KEY}") -fsSL "https://${APT_REPO_HOST}/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB" | gpg --dearmor > /etc/apt/keyrings/neuron.gpg \
 && PIP_REPO_URL=$([ -n "${NEURON_PIP_REPO_KEY}" ] && echo "https://${NEURON_PIP_REPO_KEY}@${PIP_REPO_HOST}" || echo "https://${PIP_REPO_HOST}") \
 && ${PIP} config set global.extra-index-url "${PIP_REPO_URL}"
161+
# Neuron SDK components version numbers
# NEURON_ARTIFACT_PATH: directory under which the dev stage bind-mounts local apt/pip artifacts.
ARG NEURON_ARTIFACT_PATH=/root/neuron_artifacts
# IGNORE_MISSING_NEURON_COMPONENTS: when "false", components without a local artifact
# are installed from the configured repos; any other value skips them.
# A shell assignment made in a standalone RUN does not persist to later instructions,
# so no normalisation is attempted here — steps that read this flag must do any
# case-folding themselves.
ARG IGNORE_MISSING_NEURON_COMPONENTS=false

ARG NEURONX_RUNTIME_LIB_VERSION=2.26.42.0-2ff3b5c7d
ARG NEURONX_COLLECTIVES_LIB_VERSION=2.26.43.0-47cc904ea
ARG NEURONX_TOOLS_VERSION=2.24.54.0

ARG NEURONX_CC_VERSION=2.19.8089.0
ARG NEURONX_JAX_TRAINING_VERSION=0.6.0.1.0.1296+1f770067
173+
FROM base AS dev

# Install the Neuron apt components for the dev image.
# install_apt_package <package-name> <version-or-artifact-file>:
#   - if a file with that name exists in the bind-mounted apt artifact directory, install that file;
#   - otherwise, when IGNORE_MISSING_NEURON_COMPONENTS is "false" (case-insensitive),
#     install <package-name>=<version> from the configured apt repo;
#   - otherwise skip the package.
# The flag is lower-cased inside this RUN because a shell variable set in an earlier
# RUN is not visible here.
RUN --mount=type=bind,source=apt,target=${NEURON_ARTIFACT_PATH}/apt \
    ignore_missing=$(echo "${IGNORE_MISSING_NEURON_COMPONENTS}" | tr '[:upper:]' '[:lower:]'); \
    install_apt_package() { \
      pkg_name=$1; \
      version_arg=$2; \
      if [ -f "${NEURON_ARTIFACT_PATH}/apt/${version_arg}" ]; then \
        apt-get install -y "${NEURON_ARTIFACT_PATH}/apt/${version_arg}"; \
      elif [ "${ignore_missing}" = "false" ]; then \
        apt-get install -y "${pkg_name}=${version_arg}"; \
      else \
        echo "Ignoring package ${pkg_name}"; \
      fi; \
    } \
 && apt-get update \
 && install_apt_package "aws-neuronx-collectives" "${NEURONX_COLLECTIVES_LIB_VERSION}" \
 && install_apt_package "aws-neuronx-runtime-lib" "${NEURONX_RUNTIME_LIB_VERSION}" \
 && install_apt_package "aws-neuronx-tools" "${NEURONX_TOOLS_VERSION}" \
 && rm -rf /var/lib/apt/lists/* \
 && rm -rf /tmp/tmp* \
 && apt-get clean
195+
# Install the Neuron pip components for the dev image.
# install_pip_package takes any number of "name:version-or-wheel-file:extra pip flags" specs.
# For each spec:
#   - if a file with that name exists in the bind-mounted pip artifact directory, that file is installed;
#   - otherwise, when IGNORE_MISSING_NEURON_COMPONENTS is "false" (case-insensitive),
#     "name==version" is requested from the configured indexes;
#   - otherwise the package is skipped.
# Extra flags from all specs are de-duplicated and appended to a single pip invocation.
# The flag is lower-cased inside this RUN because a shell variable set in an earlier
# RUN is not visible here.
RUN --mount=type=bind,source=pip,target=${NEURON_ARTIFACT_PATH}/pip \
    ignore_missing=$(echo "${IGNORE_MISSING_NEURON_COMPONENTS}" | tr '[:upper:]' '[:lower:]'); \
    install_pip_package() { \
      packages=""; \
      flags=""; \
      for spec in "$@"; do \
        pkg_name=$(echo "${spec}" | cut -d: -f1); \
        version_arg=$(echo "${spec}" | cut -d: -f2); \
        extra_flags=$(echo "${spec}" | cut -d: -f3); \
        if [ -f "${NEURON_ARTIFACT_PATH}/pip/${version_arg}" ]; then \
          packages="${packages} ${NEURON_ARTIFACT_PATH}/pip/${version_arg}"; \
        elif [ "${ignore_missing}" = "false" ]; then \
          packages="${packages} ${pkg_name}==${version_arg}"; \
        else \
          echo "Ignoring package ${pkg_name}"; \
        fi; \
        # Store unique flags (extra_flags is split on whitespace; empty means no iterations)
        for flag in ${extra_flags}; do \
          case " ${flags} " in \
            *" ${flag} "*) ;; \
            *) flags="${flags} ${flag}" ;; \
          esac; \
        done; \
      done; \
      if [ -n "${packages}" ]; then \
        echo "Installing packages: ${packages} with flags ${flags}"; \
        ${PIP} install --no-cache-dir --force-reinstall \
          --extra-index-url="file:///${NEURON_ARTIFACT_PATH}/pip" \
          ${packages} ${flags}; \
      fi; \
    } \
 && install_pip_package "neuronx-cc:${NEURONX_CC_VERSION}:" "jax-neuronx:${NEURONX_JAX_TRAINING_VERSION}:" \
 && rm -rf ~/.cache/pip/*
233+
FROM base AS repo

# Unpinned install: take whatever versions of the Neuron apt and pip components
# the configured repos currently serve.
RUN apt-get update \
 && apt-get install -y \
    aws-neuronx-collectives \
    aws-neuronx-runtime-lib \
    aws-neuronx-tools \
 && rm -rf /var/lib/apt/lists/* \
 && rm -rf /tmp/tmp* \
 && apt-get clean \
 && ${PIP} install --no-cache-dir --force-reinstall \
    jax-neuronx \
    neuronx-cc \
 && rm -rf ~/.cache/pip/*
249+
250+
FROM base AS prod

# Pinned install of the Neuron apt components (tools, collectives, runtime library)
# at the versions given by the NEURONX_* build args.
RUN apt-get update \
 && apt-get install -y \
    aws-neuronx-collectives=${NEURONX_COLLECTIVES_LIB_VERSION} \
    aws-neuronx-runtime-lib=${NEURONX_RUNTIME_LIB_VERSION} \
    aws-neuronx-tools=${NEURONX_TOOLS_VERSION} \
 && rm -rf /var/lib/apt/lists/* \
 && rm -rf /tmp/tmp* \
 && apt-get clean

# Pinned install of the Neuron compiler and the JAX Neuron package
RUN ${PIP} install --no-cache-dir --force-reinstall \
    jax-neuronx==${NEURONX_JAX_TRAINING_VERSION} \
    neuronx-cc==${NEURONX_CC_VERSION} \
 && rm -rf ~/.cache/pip/*
269+
# The final image is whichever stage BUILD_STAGE names (defaults to prod)
FROM ${BUILD_STAGE} AS final

# Every container command is launched through the hostname workaround script;
# the default command is an interactive bash shell.
ENTRYPOINT ["bash", "-m", "start_with_right_hostname.sh"]
CMD ["/bin/bash"]

# Probe the local /ping endpoint on port 8080.
# NOTE(review): nothing in this file starts a service on port 8080 — confirm the probe target.
HEALTHCHECK CMD curl --fail http://localhost:8080/ping || exit 1
0 commit comments