Skip to content

Nearmap Base DLC #5058

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 7 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
54 changes: 54 additions & 0 deletions base/buildspec-cu126.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
# Buildspec for the "base" framework image: CUDA 12.6.3, GPU, x86_64.
#
# `!join` is a custom tag. NOTE(review): it presumably concatenates the listed
# items into a single string -- confirm against the buildspec loader.

# Placeholder values: the literal text names the environment variable that is
# expected to supply the real value.
account_id: &ACCOUNT_ID <set-$ACCOUNT_ID-in-environment>
prod_account_id: &PROD_ACCOUNT_ID 763104351884
region: &REGION <set-$REGION-in-environment>
framework: &FRAMEWORK base
version: &VERSION 12.6.3
# Quoted on purpose: an unquoted 12.6 would be parsed as a float.
short_version: &SHORT_VERSION "12.6"
arch_type: &ARCH_TYPE x86_64
# Kept as the string "False", not a YAML boolean.
autopatch_build: "False"

repository_info:
  # Repository settings merged into every image entry below via `<<:`.
  base_repository: &BASE_REPOSITORY
    image_type: &IMAGE_TYPE gpu
    root: .
    repository_name: &REPOSITORY_NAME !join [ pr, "-", *FRAMEWORK ]
    repository: &REPOSITORY !join [ *ACCOUNT_ID, .dkr.ecr., *REGION, .amazonaws.com/, *REPOSITORY_NAME ]
    release_repository_name: &RELEASE_REPOSITORY_NAME !join [ *FRAMEWORK ]
    release_repository: &RELEASE_REPOSITORY !join [ *PROD_ACCOUNT_ID, .dkr.ecr., *REGION, .amazonaws.com/, *RELEASE_REPOSITORY_NAME ]

context:
  # Each entry pairs a `source` path with the `target` file name.
  # NOTE(review): presumably these are the files staged into the Docker build
  # context -- confirm against the image builder.
  base_context: &BASE_CONTEXT
    deep_learning_container:
      source: src/deep_learning_container.py
      target: deep_learning_container.py
    install_python:
      source: scripts/install_python.sh
      target: install_python.sh
    install_cuda:
      source: scripts/install_cuda.sh
      target: install_cuda.sh
    install_efa:
      source: scripts/install_efa.sh
      target: install_efa.sh

images:
  base_x86_64_gpu_cuda126:
    # Merge keys are shallow: keys written explicitly here override merged ones.
    <<: *BASE_REPOSITORY
    context:
      <<: *BASE_CONTEXT
    image_size_baseline: 11000
    device_type: &DEVICE_TYPE gpu
    cuda_version: &CUDA_VERSION cu126
    python_version: &DOCKER_PYTHON_VERSION py3
    tag_python_version: &TAG_PYTHON_VERSION py312
    os_version: &OS_VERSION ubuntu22.04
    # If !join concatenates, both tags read
    # 12.6.3-gpu-py312-cu126-ubuntu22.04-ec2.
    tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-ec2" ]
    latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-ec2" ]
    # If !join concatenates, this is base/x86_64/gpu/cu126/Dockerfile.
    docker_file: !join [ *FRAMEWORK, /, *ARCH_TYPE, /, *DEVICE_TYPE, /, *CUDA_VERSION, /Dockerfile ]
    # Name of the Dockerfile stage to build.
    target: final
    build: true
    enable_common_stage_build: false
    test_configs:
      test_platforms:
        - sanity
        - security
54 changes: 54 additions & 0 deletions base/buildspec-cu128.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
# Buildspec for the "base" framework image: CUDA 12.8.1, GPU, x86_64.
#
# `!join` is a custom tag. NOTE(review): it presumably concatenates the listed
# items into a single string -- confirm against the buildspec loader.

# Placeholder values: the literal text names the environment variable that is
# expected to supply the real value.
account_id: &ACCOUNT_ID <set-$ACCOUNT_ID-in-environment>
prod_account_id: &PROD_ACCOUNT_ID 763104351884
region: &REGION <set-$REGION-in-environment>
framework: &FRAMEWORK base
version: &VERSION 12.8.1
# Quoted on purpose: an unquoted 12.8 would be parsed as a float.
short_version: &SHORT_VERSION "12.8"
arch_type: &ARCH_TYPE x86_64
# Kept as the string "False", not a YAML boolean.
autopatch_build: "False"

repository_info:
  # Repository settings merged into every image entry below via `<<:`.
  base_repository: &BASE_REPOSITORY
    image_type: &IMAGE_TYPE gpu
    root: .
    repository_name: &REPOSITORY_NAME !join [ pr, "-", *FRAMEWORK ]
    repository: &REPOSITORY !join [ *ACCOUNT_ID, .dkr.ecr., *REGION, .amazonaws.com/, *REPOSITORY_NAME ]
    release_repository_name: &RELEASE_REPOSITORY_NAME !join [ *FRAMEWORK ]
    release_repository: &RELEASE_REPOSITORY !join [ *PROD_ACCOUNT_ID, .dkr.ecr., *REGION, .amazonaws.com/, *RELEASE_REPOSITORY_NAME ]

context:
  # Each entry pairs a `source` path with the `target` file name.
  # NOTE(review): presumably these are the files staged into the Docker build
  # context -- confirm against the image builder.
  base_context: &BASE_CONTEXT
    deep_learning_container:
      source: src/deep_learning_container.py
      target: deep_learning_container.py
    install_python:
      source: scripts/install_python.sh
      target: install_python.sh
    install_cuda:
      source: scripts/install_cuda.sh
      target: install_cuda.sh
    install_efa:
      source: scripts/install_efa.sh
      target: install_efa.sh

images:
  base_x86_64_gpu_cuda128:
    # Merge keys are shallow: keys written explicitly here override merged ones.
    <<: *BASE_REPOSITORY
    context:
      <<: *BASE_CONTEXT
    image_size_baseline: 11000
    device_type: &DEVICE_TYPE gpu
    cuda_version: &CUDA_VERSION cu128
    python_version: &DOCKER_PYTHON_VERSION py3
    tag_python_version: &TAG_PYTHON_VERSION py312
    os_version: &OS_VERSION ubuntu24.04
    # If !join concatenates, both tags read
    # 12.8.1-gpu-py312-cu128-ubuntu24.04-ec2.
    tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-ec2" ]
    latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-ec2" ]
    # If !join concatenates, this is base/x86_64/gpu/cu128/Dockerfile.
    docker_file: !join [ *FRAMEWORK, /, *ARCH_TYPE, /, *DEVICE_TYPE, /, *CUDA_VERSION, /Dockerfile ]
    # Name of the Dockerfile stage to build.
    target: final
    build: true
    enable_common_stage_build: false
    test_configs:
      test_platforms:
        - sanity
        - security
55 changes: 1 addition & 54 deletions base/buildspec.yml
Original file line number Diff line number Diff line change
@@ -1,54 +1 @@
account_id: &ACCOUNT_ID <set-$ACCOUNT_ID-in-environment>
prod_account_id: &PROD_ACCOUNT_ID 763104351884
region: &REGION <set-$REGION-in-environment>
framework: &FRAMEWORK base
version: &VERSION 12.8.1
short_version: &SHORT_VERSION "12.8"
arch_type: &ARCH_TYPE x86_64
autopatch_build: "False"

repository_info:
base_repository: &BASE_REPOSITORY
image_type: &IMAGE_TYPE gpu
root: .
repository_name: &REPOSITORY_NAME !join [ pr, "-", *FRAMEWORK ]
repository: &REPOSITORY !join [ *ACCOUNT_ID, .dkr.ecr., *REGION, .amazonaws.com/, *REPOSITORY_NAME ]
release_repository_name: &RELEASE_REPOSITORY_NAME !join [ *FRAMEWORK ]
release_repository: &RELEASE_REPOSITORY !join [ *PROD_ACCOUNT_ID, .dkr.ecr., *REGION, .amazonaws.com/, *RELEASE_REPOSITORY_NAME ]

context:
base_context: &BASE_CONTEXT
deep_learning_container:
source: src/deep_learning_container.py
target: deep_learning_container.py
install_python:
source: scripts/install_python.sh
target: install_python.sh
install_cuda:
source: scripts/install_cuda.sh
target: install_cuda.sh
install_efa:
source: scripts/install_efa.sh
target: install_efa.sh

images:
base_x86_64_gpu_cuda128:
<<: *BASE_REPOSITORY
context:
<<: *BASE_CONTEXT
image_size_baseline: 11000
device_type: &DEVICE_TYPE gpu
cuda_version: &CUDA_VERSION cu128
python_version: &DOCKER_PYTHON_VERSION py3
tag_python_version: &TAG_PYTHON_VERSION py312
os_version: &OS_VERSION ubuntu24.04
tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-ec2" ]
latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-ec2" ]
docker_file: !join [ *FRAMEWORK, /, *ARCH_TYPE, /, *DEVICE_TYPE, /, *CUDA_VERSION, /Dockerfile ]
target: final
build: true
enable_common_stage_build: false
test_configs:
test_platforms:
- sanity
- security
# NOTE(review): presumably tells the build tooling to load the named buildspec
# (a sibling file in base/) in place of this one -- confirm against the loader.
buildspec_pointer: buildspec-cu126.yml
125 changes: 125 additions & 0 deletions base/x86_64/gpu/cu126/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,125 @@
# Base image with Python 3.12, the CUDA 12.6 stack and EFA on Ubuntu 22.04.
# Python and CUDA are built in throw-away stages and copied into `final`.
#
# Build args declared before the first FROM are global; every stage that uses
# one re-declares it with a bare `ARG NAME`.
ARG PYTHON="python3"
ARG PYTHON_VERSION="3.12.11"
ARG PYTHON_SHORT_VERSION="3.12"
ARG CUDA_MAJOR="12"
ARG CUDA_MINOR="6"
ARG EFA_VERSION="1.42.0"

##############################################################################
# base-builder: toolchain shared by the python-builder and cuda-builder stages.
FROM nvidia/cuda:12.6.3-base-ubuntu22.04 AS base-builder

# The compat directory is first moved out of /usr/local/cuda to /usr/local;
# the 12.6 path of install_cuda.sh deletes /usr/local/cuda* and later moves
# /usr/local/compat back.
RUN mv /usr/local/cuda/compat /usr/local \
    && apt-get update \
    && apt-get -y upgrade --only-upgrade systemd \
    && apt-get install -y --allow-change-held-packages --no-install-recommends \
        automake \
        build-essential \
        ca-certificates \
        cmake \
        curl \
        emacs \
        git \
        jq \
        libcurl4-openssl-dev \
        libglib2.0-0 \
        libegl1 \
        libgl1 \
        libsm6 \
        libssl-dev \
        libxext6 \
        libxrender-dev \
        zlib1g-dev \
        unzip \
        vim \
        wget \
        libhwloc-dev \
        libgomp1 \
        libibverbs-dev \
        libnuma1 \
        libnuma-dev \
        libtool \
        openssl \
        python3-dev \
        autoconf \
        pkg-config \
        check \
        libsubunit0 \
        libsubunit-dev \
        libffi-dev \
        libbz2-dev \
        liblzma-dev \
    && apt-get autoremove -y \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*

##############################################################################
# python-builder: runs install_python.sh for ${PYTHON_VERSION}.
FROM base-builder AS python-builder
ARG PYTHON_VERSION
COPY install_python.sh install_python.sh
RUN bash install_python.sh ${PYTHON_VERSION} && rm install_python.sh

##############################################################################
# cuda-builder: runs install_cuda.sh with "<major>.<minor>" (here "12.6").
FROM base-builder AS cuda-builder
ARG CUDA_MAJOR
ARG CUDA_MINOR
COPY install_cuda.sh install_cuda.sh
RUN bash install_cuda.sh "${CUDA_MAJOR}.${CUDA_MINOR}" && rm install_cuda.sh

##############################################################################
# final: starts again from the NVIDIA base image and receives only the built
# artifacts from the builder stages.
FROM nvidia/cuda:12.6.3-base-ubuntu22.04 AS final
ARG PYTHON
ARG PYTHON_SHORT_VERSION
ARG CUDA_MAJOR
ARG CUDA_MINOR
ARG EFA_VERSION
LABEL maintainer="Amazon AI"
LABEL dlc_major_version="1"
ENV DEBIAN_FRONTEND=noninteractive \
    LANG=C.UTF-8 \
    LC_ALL=C.UTF-8 \
    DLC_CONTAINER_TYPE=base \
    # Python won't try to write .pyc or .pyo files on the import of source modules
    # Force stdin, stdout and stderr to be totally unbuffered. Good for logging
    PYTHONDONTWRITEBYTECODE=1 \
    PYTHONUNBUFFERED=1 \
    PYTHONIOENCODING=UTF-8 \
    CUDA_HOME="/usr/local/cuda" \
    PATH="/opt/amazon/openmpi/bin:/opt/amazon/efa/bin:/usr/local/cuda/bin:${PATH}" \
    LD_LIBRARY_PATH="/usr/local/lib:/usr/local/cuda/lib64:/opt/amazon/ofi-nccl/lib/x86_64-linux-gnu:/opt/amazon/efa/lib:/opt/amazon/openmpi/lib:${LD_LIBRARY_PATH}"

WORKDIR /

# + python and pip packages (awscli, boto3, requests)
COPY --from=python-builder /usr/local/lib/python${PYTHON_SHORT_VERSION} /usr/local/lib/python${PYTHON_SHORT_VERSION}
COPY --from=python-builder /usr/local/include/python${PYTHON_SHORT_VERSION} /usr/local/include/python${PYTHON_SHORT_VERSION}
COPY --from=python-builder /usr/local/bin /usr/local/bin
# + cuda-toolkit, cudnn, nccl
COPY --from=cuda-builder /usr/local/cuda-${CUDA_MAJOR}.${CUDA_MINOR} /usr/local/cuda-${CUDA_MAJOR}.${CUDA_MINOR}
COPY install_efa.sh install_efa.sh
COPY deep_learning_container.py /usr/local/bin/deep_learning_container.py
COPY bash_telemetry.sh /usr/local/bin/bash_telemetry.sh
RUN chmod +x /usr/local/bin/deep_learning_container.py && \
    chmod +x /usr/local/bin/bash_telemetry.sh && \
    echo 'source /usr/local/bin/bash_telemetry.sh' >> /etc/bash.bashrc && \
    # Install EFA
    bash install_efa.sh ${EFA_VERSION} && \
    rm install_efa.sh && \
    # OSS compliance
    apt-get update && \
    apt-get upgrade -y && \
    # curl (and the CA bundle it needs for HTTPS) is installed explicitly:
    # it is invoked below, and nothing else visible in this stage installs it.
    apt-get install -y --allow-change-held-packages --no-install-recommends \
        ca-certificates \
        curl \
        unzip \
        wget && \
    apt-get clean && \
    HOME_DIR=/root && \
    # --fail makes an HTTP error abort the build here instead of saving an
    # error page that only breaks later in unzip.
    curl -fsSL -o ${HOME_DIR}/oss_compliance.zip https://aws-dlinfra-utilities.s3.amazonaws.com/oss_compliance.zip && \
    unzip ${HOME_DIR}/oss_compliance.zip -d ${HOME_DIR}/ && \
    cp ${HOME_DIR}/oss_compliance/test/testOSSCompliance /usr/local/bin/testOSSCompliance && \
    chmod +x /usr/local/bin/testOSSCompliance && \
    chmod +x ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh && \
    ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh ${HOME_DIR} ${PYTHON} && \
    rm -rf ${HOME_DIR}/oss_compliance* && \
    rm -rf /tmp/tmp* && \
    rm -rf /var/lib/apt/lists/* && \
    # Best-effort cache cleanup. The `|| true` is grouped so it applies to this
    # rm only; a bare `|| true` at the end of the && chain would also swallow
    # failures of every command above.
    { rm -rf /root/.cache || true; }

CMD ["/bin/bash"]
41 changes: 41 additions & 0 deletions scripts/install_cuda.sh
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,45 @@ function install_nvjpeg_for_cuda_below_129 {
rm -rf /tmp/nvjpeg
}

# Installs the CUDA 12.6.3 toolkit, cuDNN and NCCL under /usr/local/cuda
# (a symlink to /usr/local/cuda-12.6), then runs the shared nvjpeg install,
# prune_cuda and ldconfig.
# Takes no arguments. Exits the script with status 1 if a working directory
# cannot be entered, so later downloads/copies never run in the wrong place.
function install_cuda126_stack {
    # Left as globals (not `local`), as in the original: the helper functions
    # called at the end may read them.
    CUDNN_VERSION="9.7.0.66"
    NCCL_VERSION="v2.24.3-1"
    CUDA_HOME="/usr/local/cuda"

    local cuda_installer="cuda_12.6.3_560.35.05_linux.run"
    local cudnn_archive="cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive"

    # remove the existing cuda dirs shipped by nvidia/cuda:**.*.*-base-*
    # (cuda-compat is restored after the toolkit install below)
    rm -rf /usr/local/cuda-*
    rm -rf /usr/local/cuda

    # install CUDA 12.6.3
    wget -q "https://developer.download.nvidia.com/compute/cuda/12.6.3/local_installers/${cuda_installer}"
    chmod +x "${cuda_installer}"
    "./${cuda_installer}" --toolkit --silent
    rm -f "${cuda_installer}"
    ln -s /usr/local/cuda-12.6 /usr/local/cuda
    # bring back cuda-compat; tolerated if /usr/local/compat is absent
    mv /usr/local/compat /usr/local/cuda/compat 2>/dev/null || true

    # install cudnn
    mkdir -p /tmp/cudnn
    cd /tmp/cudnn || exit 1
    wget -q "https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/linux-x86_64/${cudnn_archive}.tar.xz" -O "${cudnn_archive}.tar.xz"
    tar xf "${cudnn_archive}.tar.xz"
    cp -a "${cudnn_archive}"/include/* /usr/local/cuda/include/
    cp -a "${cudnn_archive}"/lib/* /usr/local/cuda/lib64/

    # install nccl (built from the tagged source release)
    mkdir -p /tmp/nccl
    cd /tmp/nccl || exit 1
    git clone -b "${NCCL_VERSION}" --depth 1 https://github.com/NVIDIA/nccl.git
    cd nccl || exit 1
    make -j src.build
    cp -a build/include/* /usr/local/cuda/include/
    cp -a build/lib/* /usr/local/cuda/lib64/

    install_nvjpeg_for_cuda_below_129
    prune_cuda
    ldconfig
}

function install_cuda128_stack {
CUDNN_VERSION="9.8.0.87"
Expand Down Expand Up @@ -91,6 +130,8 @@ function install_cuda128_stack {
while test $# -gt 0
do
case "$1" in
12.6) install_cuda126_stack;
;;
12.8) install_cuda128_stack;
;;
*) echo "bad argument $1"; exit 1
Expand Down
3 changes: 2 additions & 1 deletion scripts/install_python.sh
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,8 @@ function install_python {

# this will add pip systemlink to pip${PYTHON_MAJOR_VERSION}
python -m pip install --upgrade pip --trusted-host pypi.org --trusted-host files.pythonhosted.org
python -m pip install --no-cache-dir awscli boto3 requests setuptools
python -m pip install --no-cache-dir awscli boto3 requests
python -m pip install --no-cache-dir "setuptools>=78.1.1"
}

# idiomatic parameter and option handling in sh
Expand Down