Skip to content
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
313 changes: 313 additions & 0 deletions pytorch/training/docker/2.9/py3/Dockerfile.cpu
Original file line number Diff line number Diff line change
@@ -0,0 +1,313 @@
ARG PYTHON=python3
ARG PYTHON_VERSION=3.12.10
ARG PYTHON_SHORT_VERSION=3.12
ARG PYTORCH_VERSION=2.9.0

ARG OPEN_MPI_VERSION=4.1.7

ARG TORCHTNT_VERSION=0.2.4
ARG TORCHDATA_VERSION=0.11.0
ARG TORCHAUDIO_VERSION=2.9.0
ARG TORCHVISION_VERSION=0.24.0

FROM ubuntu:22.04 AS base_image

# This arg required to stop docker build waiting for region configuration while installing tz data from ubuntu 20
ENV DEBIAN_FRONTEND=noninteractive
ENV LD_LIBRARY_PATH="/usr/local/lib:${LD_LIBRARY_PATH}"

RUN apt-get update \
&& apt-get upgrade -y \
&& apt-get autoremove -y \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/*

#################################################################
# ____
# / ___| ___ _ __ ___ _ __ ___ ___ _ __
# | | / _ \| '_ ` _ \| '_ ` _ \ / _ \| '_ \
# | |___ (_) | | | | | | | | | | | (_) | | | |
# \____|\___/|_| |_| |_|_| |_| |_|\___/|_| |_|
# ___ ____ _
# |_ _|_ __ ___ __ _ __ _ ___ | _ \ ___ ___(_)_ __ ___
# | || '_ ` _ \ / _` |/ _` |/ _ \ | |_) / _ \/ __| | '_ \ / _ \
# | || | | | | | (_| | (_| | __/ | _ < __/ (__| | |_) | __/
# |___|_| |_| |_|\__,_|\__, |\___| |_| \_\___|\___|_| .__/ \___|
# |___/ |_|
#################################################################

FROM base_image AS common

LABEL maintainer="Amazon AI"
LABEL dlc_major_version="1"

ARG PYTHON
ARG PYTHON_VERSION
ARG PYTHON_SHORT_VERSION
ARG PYTORCH_VERSION
ARG TORCHTNT_VERSION
ARG TORCHDATA_VERSION
ARG TORCHAUDIO_VERSION
ARG TORCHVISION_VERSION

ARG OPEN_MPI_VERSION

ENV LD_LIBRARY_PATH="/usr/local/lib:${LD_LIBRARY_PATH}"
ENV LD_LIBRARY_PATH="/lib/x86_64-linux-gnu:${LD_LIBRARY_PATH}"

# Python won’t try to write .pyc or .pyo files on the import of source modules
# Force stdin, stdout and stderr to be totally unbuffered. Good for logging
ENV PYTHONDONTWRITEBYTECODE=1
ENV PYTHONUNBUFFERED=1
ENV PYTHONIOENCODING=UTF-8
ENV LANG=C.UTF-8
ENV LC_ALL=C.UTF-8

ENV DLC_CONTAINER_TYPE=training
WORKDIR /

RUN apt-get update \
&& apt-get -y upgrade --only-upgrade systemd \
&& apt-get install -y --no-install-recommends \
automake \
build-essential \
ca-certificates \
cmake \
curl \
emacs \
git \
jq \
libcurl4-openssl-dev \
libglib2.0-0 \
libgl1-mesa-glx \
libsm6 \
libssl-dev \
libxext6 \
libxrender-dev \
zlib1g-dev \
unzip \
vim \
wget \
libbz2-dev \
libreadline-dev \
libsqlite3-dev \
llvm \
libncurses5-dev \
libncursesw5-dev \
xz-utils \
tk-dev \
liblzma-dev \
libffi-dev \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/*

# Install Open MPI
RUN wget https://www.open-mpi.org/software/ompi/v4.1/downloads/openmpi-${OPEN_MPI_VERSION}.tar.gz \
&& gunzip -c openmpi-${OPEN_MPI_VERSION}.tar.gz | tar xf - \
&& cd openmpi-${OPEN_MPI_VERSION} \
&& ./configure --prefix=/home/.openmpi \
&& make all install \
&& cd .. \
&& rm openmpi-${OPEN_MPI_VERSION}.tar.gz \
&& rm -rf openmpi-${OPEN_MPI_VERSION}

# The ENV variables declared below are changed in the previous section
# Grouping these ENV variables in the first section causes
# ompi_info to fail. This is only observed in CPU containers
ENV PATH="/home/.openmpi/bin:${PATH}"
ENV LD_LIBRARY_PATH="/home/.openmpi/lib:${LD_LIBRARY_PATH}"
RUN ompi_info --parsable --all | grep mpi_built_with_cuda_support:value

# Install OpenSSH for MPI to communicate between containers, allow OpenSSH to talk to containers without asking for confirmation
RUN apt-get update \
&& apt-get install -y --no-install-recommends openssh-client openssh-server \
&& mkdir -p /var/run/sshd \
&& cat /etc/ssh/ssh_config | grep -v StrictHostKeyChecking > /etc/ssh/ssh_config.new \
&& echo " StrictHostKeyChecking no" >> /etc/ssh/ssh_config.new \
&& mv /etc/ssh/ssh_config.new /etc/ssh/ssh_config \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/*

# Configure OpenSSH so that nodes can communicate with each other
RUN mkdir -p /var/run/sshd \
&& sed 's@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g' -i /etc/pam.d/sshd

RUN rm -rf /root/.ssh/ \
&& mkdir -p /root/.ssh/ \
&& ssh-keygen -q -t rsa -N '' -f /root/.ssh/id_rsa \
&& cp /root/.ssh/id_rsa.pub /root/.ssh/authorized_keys \
&& printf "Host *\n StrictHostKeyChecking no\n" >> /root/.ssh/config

# install python
RUN cd /tmp/ \
&& wget https://www.python.org/ftp/python/${PYTHON_VERSION}/Python-${PYTHON_VERSION}.tgz \
&& tar xzf Python-${PYTHON_VERSION}.tgz \
&& cd Python-${PYTHON_VERSION} \
&& ./configure --enable-optimizations --with-lto --with-computed-gotos --with-system-ffi \
&& make -j "$(nproc)" \
&& make altinstall \
&& cd .. \
&& rm -rf Python-${PYTHON_VERSION} \
&& rm Python-${PYTHON_VERSION}.tgz \
&& ln -s /usr/local/bin/python${PYTHON_SHORT_VERSION} /usr/local/bin/python \
&& ln -s /usr/local/bin/python${PYTHON_SHORT_VERSION} /usr/local/bin/python3 \
# This installation generate a .python_history file in the root directory leads sanity check to fail
&& rm -f /root/.python_history

# Python Path
ENV PATH="/usr/local/bin:${PATH}"

# this will add pip systemlink to pip${PYTHON_SHORT_VERSION}
RUN python -m pip install --upgrade pip --trusted-host pypi.org --trusted-host files.pythonhosted.org

# Install common packages
RUN pip install --no-cache-dir \
cython \
cryptography \
pyOpenSSL \
pybind11 \
mkl \
mkl-include \
parso \
typing \
charset-normalizer \
packaging \
boto3 \
PyYAML \
numpy \
scipy \
click \
psutil \
ipython \
ipykernel \
pillow \
h5py \
fsspec \
"idna>=3.7" \
"tqdm>=4.66.3" \
"requests>=2.32.0" \
"setuptools>=70.0.0" \
"urllib3>=2.5.0" \
"awscli" \
opencv-python==4.11.0.86 \
mpi4py \
jinja2>=3.1.6 \
tornado>=6.5.1

# Install PyTorch
RUN pip install --no-cache-dir -U torch==${PYTORCH_VERSION} \
torchvision==${TORCHVISION_VERSION} \
torchaudio==${TORCHAUDIO_VERSION} \
--index-url https://download.pytorch.org/whl/cpu \
&& pip install --no-cache-dir -U torchtnt==${TORCHTNT_VERSION} \
torchdata==${TORCHDATA_VERSION} \
s3torchconnector \
fastai \
accelerate \
# pin numpy requirement for fastai dependency
# requires explicit declaration of spacy, thic, blis
spacy \
# pin thinc due to incompatibility with numpy 1.26.4 (sagemaker doesn't support latest numpy)
thinc==8.3.4 \
blis \
numpy \
&& pip uninstall -y dataclasses

RUN curl -o /license.txt https://aws-dlc-licenses.s3.amazonaws.com/pytorch-2.8/license.txt

COPY deep_learning_container.py /usr/local/bin/deep_learning_container.py

RUN chmod +x /usr/local/bin/deep_learning_container.py

COPY bash_telemetry.sh /usr/local/bin/bash_telemetry.sh
RUN chmod +x /usr/local/bin/bash_telemetry.sh
RUN echo 'source /usr/local/bin/bash_telemetry.sh' >> /etc/bash.bashrc

# Removing the cache as it is needed for security verification
RUN rm -rf /root/.cache | true

########################################################
# _____ ____ ____ ___
# | ____/ ___|___ \ |_ _|_ __ ___ __ _ __ _ ___
# | _|| | __) | | || '_ ` _ \ / _` |/ _` |/ _ \
# | |__| |___ / __/ | || | | | | | (_| | (_| | __/
# |_____\____|_____| |___|_| |_| |_|\__,_|\__, |\___|
# |___/
# ____ _
# | _ \ ___ ___(_)_ __ ___
# | |_) / _ \/ __| | '_ \ / _ \
# | _ < __/ (__| | |_) | __/
# |_| \_\___|\___|_| .__/ \___|
# |_|
########################################################

FROM common AS ec2

WORKDIR /

COPY setup_oss_compliance.sh setup_oss_compliance.sh
RUN bash setup_oss_compliance.sh ${PYTHON} && rm setup_oss_compliance.sh

COPY dockerd_entrypoint.sh /usr/local/bin/dockerd_entrypoint.sh
RUN chmod +x /usr/local/bin/dockerd_entrypoint.sh
ENTRYPOINT ["bash", "-m", "dockerd_entrypoint.sh"]

# Starts framework
CMD ["/bin/bash"]

#################################################################
# ____ __ __ _
# / ___| __ _ __ _ ___| \/ | __ _| | _____ _ __
# \___ \ / _` |/ _` |/ _ \ |\/| |/ _` | |/ / _ \ '__|
# ___) | (_| | (_| | __/ | | | (_| | < __/ |
# |____/ \__,_|\__, |\___|_| |_|\__,_|_|\_\___|_|
# |___/
# ___ ____ _
# |_ _|_ __ ___ __ _ __ _ ___ | _ \ ___ ___(_)_ __ ___
# | || '_ ` _ \ / _` |/ _` |/ _ \ | |_) / _ \/ __| | '_ \ / _ \
# | || | | | | | (_| | (_| | __/ | _ < __/ (__| | |_) | __/
# |___|_| |_| |_|\__,_|\__, |\___| |_| \_\___|\___|_| .__/ \___|
# |___/ |_|
#################################################################

FROM common AS sagemaker

LABEL maintainer="Amazon AI"
LABEL dlc_major_version="1"

ENV SAGEMAKER_TRAINING_MODULE=sagemaker_pytorch_container.training:main

WORKDIR /

# Install SM packages
RUN pip install --no-cache-dir -U \
smclarify \
"sagemaker>=2" \
sagemaker-experiments \
sagemaker-pytorch-training \
sagemaker-training

# Install extra packages
RUN pip install --no-cache-dir -U \
bokeh \
imageio \
numba \
pandas \
plotly \
scikit-learn \
seaborn \
shap \
cloudpickle

# Copy workaround script for incorrect hostname
COPY changehostname.c /
COPY start_with_right_hostname.sh /usr/local/bin/start_with_right_hostname.sh

RUN chmod +x /usr/local/bin/start_with_right_hostname.sh

COPY setup_oss_compliance.sh setup_oss_compliance.sh
RUN bash setup_oss_compliance.sh ${PYTHON} && rm setup_oss_compliance.sh

ENTRYPOINT ["bash", "-m", "start_with_right_hostname.sh"]
CMD ["/bin/bash"]

Loading