|
| 1 | +ARG BUILD_STAGE=prod |
| 2 | + |
| 3 | +FROM public.ecr.aws/docker/library/ubuntu:22.04 AS base |
| 4 | + |
| 5 | +LABEL maintainer="Amazon AI" |
| 6 | +LABEL dlc_major_version="2" |
| 7 | + |
| 8 | +ARG PYTHON=python3.10 |
| 9 | +ARG PYTHON_VERSION=3.10.12 |
| 10 | +ARG PIP=pip3 |
| 11 | +ARG OMPI_VERSION=4.1.5 |
| 12 | + |
| 13 | +# This arg required to stop docker build waiting for region configuration while installing tz data from ubuntu 22 |
| 14 | +ARG DEBIAN_FRONTEND=noninteractive |
| 15 | + |
| 16 | +# Python won't try to write .pyc or .pyo files on the import of source modules |
| 17 | +# Force stdin, stdout and stderr to be totally unbuffered. Good for logging |
| 18 | +ENV PYTHONDONTWRITEBYTECODE=1 |
| 19 | +ENV PYTHONUNBUFFERED=1 |
| 20 | +ENV PYTHONIOENCODING=UTF-8 |
| 21 | +ENV LANG=C.UTF-8 |
| 22 | +ENV LC_ALL=C.UTF-8 |
| 23 | +ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/opt/aws/neuron/lib" |
| 24 | +ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/opt/amazon/efa/lib" |
| 25 | +ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/opt/amazon/efa/lib64" |
| 26 | +ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/opt/amazon/openmpi/lib64" |
| 27 | +ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/usr/local/lib" |
| 28 | +ENV PATH="/opt/aws/neuron/bin:${PATH}" |
| 29 | +ENV SAGEMAKER_TRAINING_MODULE=sagemaker_pytorch_container.training:main |
| 30 | +ENV DGLBACKEND=pytorch |
| 31 | + |
| 32 | +RUN apt-get update \ |
| 33 | + && apt-get upgrade -y \ |
| 34 | + && apt-get install -y --no-install-recommends \ |
| 35 | + build-essential \ |
| 36 | + ca-certificates \ |
| 37 | + cmake \ |
| 38 | + curl \ |
| 39 | + emacs \ |
| 40 | + git \ |
| 41 | + gnupg2 \ |
| 42 | + gpg-agent \ |
| 43 | + jq \ |
| 44 | + libopencv-dev \ |
| 45 | + libglib2.0-0 \ |
| 46 | + libgl1-mesa-glx \ |
| 47 | + libsm6 \ |
| 48 | + libxext6 \ |
| 49 | + libxrender-dev \ |
| 50 | + libssl-dev \ |
| 51 | + libsqlite3-dev \ |
| 52 | + libgdbm-dev \ |
| 53 | + libc6-dev \ |
| 54 | + libbz2-dev \ |
| 55 | + libncurses-dev \ |
| 56 | + libffi-dev \ |
| 57 | + libcap-dev \ |
| 58 | + libhwloc-dev \ |
| 59 | + openjdk-8-jdk-headless \ |
| 60 | + openjdk-8-jdk \ |
| 61 | + openjdk-8-jre \ |
| 62 | + openjdk-11-jdk \ |
| 63 | + openssl \ |
| 64 | + software-properties-common \ |
| 65 | + tk-dev \ |
| 66 | + unzip \ |
| 67 | + wget \ |
| 68 | + vim \ |
| 69 | + zlib1g-dev \ |
| 70 | + && rm -rf /var/lib/apt/lists/* \ |
| 71 | + && rm -rf /tmp/tmp* \ |
| 72 | + && apt-get clean |
| 73 | + |
| 74 | +# Install Open MPI |
| 75 | +RUN mkdir -p /tmp/openmpi \ |
| 76 | + && cd /tmp/openmpi \ |
| 77 | + && wget --quiet https://download.open-mpi.org/release/open-mpi/v4.1/openmpi-${OMPI_VERSION}.tar.gz \ |
| 78 | + && tar zxf openmpi-${OMPI_VERSION}.tar.gz \ |
| 79 | + && cd openmpi-${OMPI_VERSION} \ |
| 80 | + && ./configure --enable-orterun-prefix-by-default \ |
| 81 | + && make -j $(nproc) all \ |
| 82 | + && make install \ |
| 83 | + && ldconfig \ |
| 84 | + && rm -rf /tmp/openmpi |
| 85 | + |
| 86 | +# Install packages and configure SSH for MPI operator in k8s |
| 87 | +RUN apt-get update && apt-get install -y openmpi-bin openssh-server \ |
| 88 | + && mkdir -p /var/run/sshd \ |
| 89 | + && echo " UserKnownHostsFile /dev/null" >> /etc/ssh/ssh_config \ |
| 90 | + && echo " StrictHostKeyChecking no" >> /etc/ssh/ssh_config \ |
| 91 | + && sed -i 's/#\(StrictModes \).*/\1no/g' /etc/ssh/sshd_config \ |
| 92 | + && rm -rf /var/lib/apt/lists/* \ |
| 93 | + && rm -rf /tmp/tmp* \ |
| 94 | + && apt-get clean |
| 95 | + |
| 96 | +# Install Python |
| 97 | +RUN wget -q https://www.python.org/ftp/python/$PYTHON_VERSION/Python-$PYTHON_VERSION.tgz \ |
| 98 | + && tar -xzf Python-$PYTHON_VERSION.tgz \ |
| 99 | + && cd Python-$PYTHON_VERSION \ |
| 100 | + && ./configure --enable-shared --prefix=/usr/local \ |
| 101 | + && make -j $(nproc) && make install \ |
| 102 | + && cd .. && rm -rf ../Python-$PYTHON_VERSION* \ |
| 103 | + && ln -s /usr/local/bin/pip3 /usr/bin/pip \ |
| 104 | + && ln -s /usr/local/bin/$PYTHON /usr/local/bin/python \ |
| 105 | + && ${PIP} --no-cache-dir install --upgrade pip \ |
| 106 | + && rm -rf ~/.cache/pip/* |
| 107 | + |
| 108 | +WORKDIR / |
| 109 | + |
| 110 | +# The ENV variables declared below are changed in the previous section |
| 111 | +# Grouping these ENV variables in the first section causes |
| 112 | +# ompi_info to fail. This is only observed in CPU containers |
| 113 | +ENV PATH="$PATH:/home/.openmpi/bin" |
| 114 | +ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/home/.openmpi/lib/" |
| 115 | +RUN ompi_info --parsable --all | grep mpi_built_with_cuda_support:value |
| 116 | + |
| 117 | +RUN ${PIP} install --no-cache-dir -U \ |
| 118 | + "bokeh>=2.3,<3" \ |
| 119 | + "awscli<2" \ |
| 120 | + scipy \ |
| 121 | + click \ |
| 122 | + "cryptography" \ |
| 123 | + "sagemaker>=2,<3" \ |
| 124 | + "sagemaker-pytorch-training" \ |
| 125 | + psutil==5.6.7 \ |
| 126 | + dataset \ |
| 127 | + Pillow \ |
| 128 | + && rm -rf ~/.cache/pip/* |
| 129 | + |
| 130 | +RUN mkdir -p /etc/pki/tls/certs && cp /etc/ssl/certs/ca-certificates.crt /etc/pki/tls/certs/ca-bundle.crt |
| 131 | + |
| 132 | +# attrs, neuronx-cc required: >=19.2.0, sagemaker <24,>=23.1.0 |
| 133 | +# protobuf neuronx-cc<4, sagemaker-training >=3.9.2,<=3.20.3 |
| 134 | +# awscli 1.25.47 has requirement docutils<0.17,>=0.10 |
| 135 | +# etcd for kubernetes installation |
| 136 | +# awscli 1.27.127 has requirement rsa<4.8,>=3.1.2, but you have rsa 4.9. |
| 137 | +# awscli 1.27.127 requires urllib3 < 1.27, python-etcd requires urllib3 >= 1.7, latest urllib3 release is 2.0.2 |
| 138 | +RUN ${PIP} install --no-cache-dir -U \ |
| 139 | + "attrs<24,>=23.1.0" \ |
| 140 | + "docutils>=0.10,<0.17" \ |
| 141 | + "rsa<4.8,>=3.1.2" \ |
| 142 | + "python-etcd" \ |
| 143 | + "urllib3>=1.26.0,<1.27" \ |
| 144 | + # Install extra packages needed by sagemaker (for passing test_utility_packages_using_import) |
| 145 | + && ${PIP} install --no-cache-dir -U \ |
| 146 | + "bokeh>=3.0.1,<4" \ |
| 147 | + "imageio>=2.22,<3" \ |
| 148 | + "opencv-python>=4.8.1.78" \ |
| 149 | + "plotly>=5.11,<6" \ |
| 150 | + "seaborn>=0.12,<1" \ |
| 151 | + "shap>=0.41,<1" \ |
| 152 | + && rm -rf ~/.cache/pip/* |
| 153 | + |
| 154 | +# EFA installer runs apt-get itself, so refresh the package index first; the downloaded installer files are deleted in the same layer |
| 155 | +RUN apt-get update \ |
| 156 | + && cd $HOME \ |
| 157 | + && curl -O https://efa-installer.amazonaws.com/aws-efa-installer-latest.tar.gz \ |
| 158 | + && wget https://efa-installer.amazonaws.com/aws-efa-installer.key && gpg --import aws-efa-installer.key \ |
| 159 | + && cat aws-efa-installer.key | gpg --fingerprint \ |
| 160 | + && wget https://efa-installer.amazonaws.com/aws-efa-installer-latest.tar.gz.sig && gpg --verify ./aws-efa-installer-latest.tar.gz.sig \ |
| 161 | + && tar -xf aws-efa-installer-latest.tar.gz \ |
| 162 | + && cd aws-efa-installer \ |
| 163 | + && ./efa_installer.sh -y -g --skip-kmod --skip-limit-conf --no-verify \ |
| 164 | + && cd $HOME && rm -rf aws-efa-installer* \ |
| 165 | + && rm -rf /var/lib/apt/lists/* \ |
| 166 | + && rm -rf /tmp/tmp* \ |
| 167 | + && apt-get clean |
| 168 | + |
| 169 | +# Install some common packages used by training scripts |
| 170 | +# torchvision needed for MLP. since it depends on torch and torch neuron/torch |
| 171 | +# is already installed install it with nodeps |
| 172 | +RUN ${PIP} install --no-cache-dir --no-deps -U \ |
| 173 | + torchvision==0.23.0 \ |
| 174 | + # Needed for running bert training scripts |
| 175 | + && ${PIP} install --no-cache-dir -U \ |
| 176 | + graphviz \ |
| 177 | + tensorboard==2.6 \ |
| 178 | + accelerate \ |
| 179 | + # Install NxDT dependencies |
| 180 | + && ${PIP} install --no-cache-dir \ |
| 181 | + Cython \ |
| 182 | + wheel \ |
| 183 | + && rm -rf ~/.cache/pip/* |
| 184 | + |
| 185 | +# Copy workaround script for incorrect hostname |
| 186 | +COPY changehostname.c / |
| 187 | +COPY --chmod=755 start_with_right_hostname.sh deep_learning_container.py /usr/local/bin/ |
| 188 | + |
| 189 | +RUN HOME_DIR=/root \ |
| 190 | + && curl -o ${HOME_DIR}/oss_compliance.zip https://aws-dlinfra-utilities.s3.amazonaws.com/oss_compliance.zip \ |
| 191 | + && unzip ${HOME_DIR}/oss_compliance.zip -d ${HOME_DIR}/ \ |
| 192 | + && cp ${HOME_DIR}/oss_compliance/test/testOSSCompliance /usr/local/bin/testOSSCompliance \ |
| 193 | + && chmod +x /usr/local/bin/testOSSCompliance \ |
| 194 | + && chmod +x ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh \ |
| 195 | + && ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh ${HOME_DIR} ${PYTHON} \ |
| 196 | + && rm -rf ${HOME_DIR}/oss_compliance* \ |
| 197 | + && rm -rf /tmp/tmp* |
| 198 | + |
| 199 | +RUN curl -o /license.txt https://aws-dlc-licenses.s3.amazonaws.com/pytorch-2.8/license.txt |
| 200 | + |
| 201 | +# Setting up APT and PIP repo for neuron artifacts |
| 202 | +ARG NEURON_APT_REPO=apt.repos.neuron.amazonaws.com |
| 203 | +ARG NEURON_APT_REPO_KEY |
| 204 | +ARG NEURON_PIP_REPO=pip.repos.neuron.amazonaws.com |
| 205 | +ARG NEURON_PIP_REPO_KEY |
| 206 | +RUN mkdir -p /etc/apt/keyrings \ |
| 207 | + && APT_REPO_PREFIX=$([ -n "${NEURON_APT_REPO_KEY}" ] && echo "${NEURON_APT_REPO_KEY}@" || echo "") \ |
| 208 | + && echo "deb [signed-by=/etc/apt/keyrings/neuron.gpg] https://${APT_REPO_PREFIX}${NEURON_APT_REPO} focal main" > /etc/apt/sources.list.d/neuron.list \ |
| 209 | + && curl $([ -n "${NEURON_APT_REPO_KEY}" ] && echo "-u ${NEURON_APT_REPO_KEY}") -sSL "https://${NEURON_APT_REPO}/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB" | gpg --dearmor > /etc/apt/keyrings/neuron.gpg \ |
| 210 | + && PIP_REPO_URL=$([ -n "${NEURON_PIP_REPO_KEY}" ] && echo "https://${NEURON_PIP_REPO_KEY}@${NEURON_PIP_REPO}" || echo "https://${NEURON_PIP_REPO}") \ |
| 211 | + && ${PIP} config set global.extra-index-url "${PIP_REPO_URL}" |
| 212 | + |
| 213 | +# Neuron SDK components |
| 214 | +ARG NEURONX_COLLECTIVES_LIB_VERSION=2.28.27.0-bc30ece58 |
| 215 | +ARG NEURONX_RUNTIME_LIB_VERSION=2.28.23.0-dd5879008 |
| 216 | +ARG NEURONX_TOOLS_VERSION=2.26.14.0 |
| 217 | +ARG NEURONX_FRAMEWORK_VERSION=2.8.0.2.10.13553+1e4dd6ca |
| 218 | +ARG NEURONX_CC_VERSION=2.21.18209.0+043b1bf7 |
| 219 | +ARG NEURONX_DISTRIBUTED_VERSION=0.15.22404+1f27bddf |
| 220 | + |
| 221 | +FROM base AS repo |
| 222 | + |
| 223 | +# Install Neuron components from the apt and pip repos (latest versions) |
| 224 | +RUN apt-get update \ |
| 225 | + && apt-get install -y \ |
| 226 | + aws-neuronx-tools \ |
| 227 | + aws-neuronx-collectives \ |
| 228 | + aws-neuronx-runtime-lib \ |
| 229 | + && rm -rf /var/lib/apt/lists/* \ |
| 230 | + && rm -rf /tmp/tmp* \ |
| 231 | + && apt-get clean |
| 232 | + |
| 233 | +RUN ${PIP} install --no-cache-dir --force-reinstall \ |
| 234 | + torch-neuronx \ |
| 235 | + neuronx-cc \ |
| 236 | + neuronx_distributed \ |
| 237 | + && rm -rf ~/.cache/pip/* |
| 238 | + |
| 239 | +FROM base AS prod |
| 240 | + |
| 241 | +# Install Neuron components pinned to the NEURONX_* build args: apt packages first, then pip packages (no pip cache, same as the repo stage) |
| 242 | +RUN apt-get update \ |
| 243 | + && apt-get install -y \ |
| 244 | + aws-neuronx-tools=$NEURONX_TOOLS_VERSION \ |
| 245 | + aws-neuronx-collectives=$NEURONX_COLLECTIVES_LIB_VERSION \ |
| 246 | + aws-neuronx-runtime-lib=$NEURONX_RUNTIME_LIB_VERSION \ |
| 247 | + && rm -rf /var/lib/apt/lists/* \ |
| 248 | + && rm -rf /tmp/tmp* \ |
| 249 | + && apt-get clean |
| 250 | + |
| 251 | +RUN ${PIP} install --no-cache-dir --force-reinstall \ |
| 252 | + torch-neuronx==$NEURONX_FRAMEWORK_VERSION \ |
| 253 | + neuronx-cc==$NEURONX_CC_VERSION \ |
| 254 | + neuronx_distributed==$NEURONX_DISTRIBUTED_VERSION \ |
| 255 | + && rm -rf ~/.cache/pip/* |
| 256 | + |
| 257 | +FROM ${BUILD_STAGE} AS final |
| 258 | + |
| 259 | +# Hugging Face version args |
| 260 | +ARG OPTIMUM_NEURON_VERSION=0.4.1 |
| 261 | +ARG TRANSFORMERS_VERSION=4.55.4 |
| 262 | +ARG DATASETS_VERSION=4.1.1 |
| 263 | +ARG GEVENT_VERSION=24.10.3 |
| 264 | + |
| 265 | +RUN apt-get remove -y --purge emacs && \ |
| 266 | +apt-get autoremove -y |
| 267 | + |
| 268 | +# We need to set this environment variable to avoid the following error when building KenLM: |
| 269 | +# https://github.com/kpu/kenlm/issues/462 |
| 270 | +ENV CMAKE_POLICY_VERSION_MINIMUM=3.5 |
| 271 | + |
| 272 | +# Install Hugging Face libraries and its dependencies |
| 273 | +# Install optimum-neuron with this extra starting from next release. \ |
| 274 | +# "optimum-neuron[training]"==${OPTIMUM_NEURON_VERSION} \ |
| 275 | +RUN ${PIP} install --no-cache-dir \ |
| 276 | + evaluate \ |
| 277 | + transformers[sklearn,sentencepiece,audio,vision]==${TRANSFORMERS_VERSION} \ |
| 278 | + datasets==${DATASETS_VERSION} \ |
| 279 | + optimum-neuron[training]==${OPTIMUM_NEURON_VERSION} \ |
| 280 | + gevent==${GEVENT_VERSION} \ |
| 281 | + && rm -rf ~/.cache/pip/* |
| 282 | + |
| 283 | +# NOTE(review): numpy is listed unpinned below although neuronx-cc constrains it -- confirm the resolved version |
| 284 | +# Pillow and urllib3 are pinned/bounded to pick up fixes for high and critical vulnerabilities; wandb is upgraded to latest |
| 285 | +# neuronx-cc has requirement networkx~=2.6; --no-cache-dir keeps pip's download cache out of the layer |
| 286 | +RUN ${PIP} install --no-cache-dir -U \ |
| 287 | + "sagemaker>=2.237.0" \ |
| 288 | + sagemaker-training \ |
| 289 | + "sagemaker-pytorch-training<3.0.0" \ |
| 290 | + "tensorboard>=2.11.0" \ |
| 291 | + "numpy" \ |
| 292 | + "numba" \ |
| 293 | + "Pillow==10.3.0" \ |
| 294 | + "requests" \ |
| 295 | + wandb \ |
| 296 | + pytorch-lightning \ |
| 297 | + Jinja2 \ |
| 298 | + mlflow \ |
| 299 | + tornado \ |
| 300 | + "awscli<2" \ |
| 301 | + "boto3<2.0" \ |
| 302 | + "botocore<1.35.94,>=1.35.74" \ |
| 303 | + google-auth \ |
| 304 | + "urllib3>=1.26.17,<1.27" \ |
| 305 | + "networkx==2.8.8" \ |
| 306 | + bokeh \ |
| 307 | + "opencv-python<4.12.0" \ |
| 308 | + "fsspec==2025.9.0" \ |
| 309 | + "protobuf<4" \ |
| 310 | + "multiprocess<0.70.17" \ |
| 311 | + && rm -rf ~/.cache/pip/* |
| 312 | + |
| | +# Extra OS packages for the final image; apt-get is used (not apt) because apt's command-line interface is not stable for scripts |
| 313 | +RUN apt-get update \ |
| 314 | + && apt-get install -y --no-install-recommends \ |
| 315 | + git-lfs \ |
| 316 | + libgssapi-krb5-2 \ |
| 317 | + libexpat1 \ |
| 318 | + expat \ |
| 319 | + libarchive13 \ |
| 320 | + libgstreamer1.0-0 \ |
| 321 | + libgstreamer-plugins-base1.0-0 \ |
| 322 | + && apt-get upgrade -y apparmor \ |
| 323 | + && rm -rf /var/lib/apt/lists/* \ |
| 324 | + && rm -rf /tmp/tmp* \ |
| 325 | + && apt-get clean |
| 326 | + |
| 327 | +ENV WANDB_MODE=disabled |
| 328 | + |
| 329 | +# Starts framework |
| 330 | +ENTRYPOINT ["bash", "-m", "start_with_right_hostname.sh"] |
| 331 | +CMD ["/bin/bash"] |
| 332 | + |
| 333 | +HEALTHCHECK CMD curl --fail http://localhost:8080/ping || exit 1 |
0 commit comments