Skip to content

Commit 48a09aa

Browse files
Yadan-Wei and Yadan Wei authored
Add cu128 ul22 base (#5177)
* add cu128 ul22
* fix os_version arg
* fix cuda version
* build cu128 ul24
* build cu129 ul22
* fix typo
* revert toml

---------

Co-authored-by: Yadan Wei <[email protected]>
1 parent 28231cf commit 48a09aa

File tree

5 files changed

+247
-15
lines changed

5 files changed

+247
-15
lines changed

base/buildspec-cu128-ubuntu22.yml

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
account_id: &ACCOUNT_ID <set-$ACCOUNT_ID-in-environment>
2+
prod_account_id: &PROD_ACCOUNT_ID 763104351884
3+
region: &REGION <set-$REGION-in-environment>
4+
framework: &FRAMEWORK base
5+
version: &VERSION 12.8.0
6+
short_version: &SHORT_VERSION "12.8"
7+
arch_type: &ARCH_TYPE x86_64
8+
autopatch_build: "False"
9+
10+
repository_info:
11+
base_repository: &BASE_REPOSITORY
12+
image_type: &IMAGE_TYPE gpu
13+
root: .
14+
repository_name: &REPOSITORY_NAME !join [ pr, "-", *FRAMEWORK ]
15+
repository: &REPOSITORY !join [ *ACCOUNT_ID, .dkr.ecr., *REGION, .amazonaws.com/, *REPOSITORY_NAME ]
16+
release_repository_name: &RELEASE_REPOSITORY_NAME !join [ *FRAMEWORK ]
17+
release_repository: &RELEASE_REPOSITORY !join [ *PROD_ACCOUNT_ID, .dkr.ecr., *REGION, .amazonaws.com/, *RELEASE_REPOSITORY_NAME ]
18+
19+
context:
20+
base_context: &BASE_CONTEXT
21+
deep_learning_container:
22+
source: src/deep_learning_container.py
23+
target: deep_learning_container.py
24+
install_python:
25+
source: scripts/install_python.sh
26+
target: install_python.sh
27+
install_cuda:
28+
source: scripts/install_cuda.sh
29+
target: install_cuda.sh
30+
install_efa:
31+
source: scripts/install_efa.sh
32+
target: install_efa.sh
33+
34+
images:
35+
base_x86_64_gpu_cuda128:
36+
<<: *BASE_REPOSITORY
37+
context:
38+
<<: *BASE_CONTEXT
39+
image_size_baseline: 11000
40+
device_type: &DEVICE_TYPE gpu
41+
cuda_version: &CUDA_VERSION cu128
42+
python_version: &DOCKER_PYTHON_VERSION py3
43+
tag_python_version: &TAG_PYTHON_VERSION py312
44+
os_version: &OS_VERSION ubuntu22.04
45+
tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-ec2" ]
46+
latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-ec2" ]
47+
docker_file: !join [ *FRAMEWORK, /, *ARCH_TYPE, /, *DEVICE_TYPE, /, *CUDA_VERSION, /, *OS_VERSION, /Dockerfile ]
48+
target: final
49+
build: true
50+
enable_common_stage_build: false
51+
test_configs:
52+
test_platforms:
53+
- sanity
54+
- security

base/x86_64/gpu/cu128/ubuntu22.04/Dockerfile

Lines changed: 126 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,126 @@
1+
ARG PYTHON="python3"
2+
ARG PYTHON_VERSION="3.12.10"
3+
ARG PYTHON_SHORT_VERSION="3.12"
4+
ARG CUDA_MAJOR="12"
5+
ARG CUDA_MINOR="8"
6+
ARG EFA_VERSION="1.40.0"
7+
ARG OS_VERSION="ubuntu22.04"
8+
FROM nvidia/cuda:12.8.0-base-${OS_VERSION} AS base-builder
9+
10+
RUN mv /usr/local/cuda/compat /usr/local \
11+
&& apt-get update \
12+
&& apt-get -y upgrade --only-upgrade systemd \
13+
&& apt-get install -y --allow-change-held-packages --no-install-recommends \
14+
automake \
15+
build-essential \
16+
ca-certificates \
17+
cmake \
18+
curl \
19+
emacs \
20+
git \
21+
jq \
22+
libcurl4-openssl-dev \
23+
libglib2.0-0 \
24+
libegl1 \
25+
libgl1 \
26+
libsm6 \
27+
libssl-dev \
28+
libxext6 \
29+
libxrender-dev \
30+
zlib1g-dev \
31+
unzip \
32+
vim \
33+
wget \
34+
libhwloc-dev \
35+
libgomp1 \
36+
libibverbs-dev \
37+
libnuma1 \
38+
libnuma-dev \
39+
libtool \
40+
openssl \
41+
python3-dev \
42+
autoconf \
43+
pkg-config \
44+
check \
45+
libsubunit0 \
46+
libsubunit-dev \
47+
libffi-dev \
48+
libbz2-dev \
49+
liblzma-dev \
50+
&& apt-get autoremove -y \
51+
&& apt-get clean \
52+
&& rm -rf /var/lib/apt/lists/*
53+
54+
##############################################################################
55+
FROM base-builder AS python-builder
56+
ARG PYTHON_VERSION
57+
COPY install_python.sh install_python.sh
58+
RUN bash install_python.sh ${PYTHON_VERSION} && rm install_python.sh
59+
60+
##############################################################################
61+
FROM base-builder AS cuda-builder
62+
ARG CUDA_MAJOR
63+
ARG CUDA_MINOR
64+
ARG OS_VERSION
65+
COPY install_cuda.sh install_cuda.sh
66+
RUN bash install_cuda.sh "${CUDA_MAJOR}.${CUDA_MINOR}" "${OS_VERSION}" && rm install_cuda.sh
67+
68+
##############################################################################
69+
FROM nvidia/cuda:12.8.1-base-${OS_VERSION} AS final
70+
ARG PYTHON
71+
ARG PYTHON_SHORT_VERSION
72+
ARG CUDA_MAJOR
73+
ARG CUDA_MINOR
74+
ARG EFA_VERSION
75+
LABEL maintainer="Amazon AI"
76+
LABEL dlc_major_version="1"
77+
ENV DEBIAN_FRONTEND=noninteractive \
78+
LANG=C.UTF-8 \
79+
LC_ALL=C.UTF-8 \
80+
DLC_CONTAINER_TYPE=base \
81+
# Python won’t try to write .pyc or .pyo files on the import of source modules
82+
# Force stdin, stdout and stderr to be totally unbuffered. Good for logging
83+
PYTHONDONTWRITEBYTECODE=1 \
84+
PYTHONUNBUFFERED=1 \
85+
PYTHONIOENCODING=UTF-8 \
86+
CUDA_HOME="/usr/local/cuda" \
87+
PATH="/opt/amazon/openmpi/bin:/opt/amazon/efa/bin:/usr/local/cuda/bin:${PATH}" \
88+
LD_LIBRARY_PATH="/usr/local/lib:/usr/local/cuda/lib64:/opt/amazon/ofi-nccl/lib:/opt/amazon/efa/lib:/opt/amazon/openmpi/lib:${LD_LIBRARY_PATH}"
89+
90+
WORKDIR /
91+
92+
# + python and pip packages (awscli, boto3, requests)
93+
COPY --from=python-builder /usr/local/lib/python${PYTHON_SHORT_VERSION} /usr/local/lib/python${PYTHON_SHORT_VERSION}
94+
COPY --from=python-builder /usr/local/include/python${PYTHON_SHORT_VERSION} /usr/local/include/python${PYTHON_SHORT_VERSION}
95+
COPY --from=python-builder /usr/local/bin /usr/local/bin
96+
# + cuda-toolkit, cudnn, nccl
97+
COPY --from=cuda-builder /usr/local/cuda-${CUDA_MAJOR}.${CUDA_MINOR} /usr/local/cuda-${CUDA_MAJOR}.${CUDA_MINOR}
98+
COPY install_efa.sh install_efa.sh
99+
COPY deep_learning_container.py /usr/local/bin/deep_learning_container.py
100+
COPY bash_telemetry.sh /usr/local/bin/bash_telemetry.sh
101+
RUN chmod +x /usr/local/bin/deep_learning_container.py && \
102+
chmod +x /usr/local/bin/bash_telemetry.sh && \
103+
echo 'source /usr/local/bin/bash_telemetry.sh' >> /etc/bash.bashrc && \
104+
# Install EFA
105+
bash install_efa.sh ${EFA_VERSION} && \
106+
rm install_efa.sh && \
107+
# OSS compliance
108+
apt-get update && \
109+
apt-get upgrade -y && \
110+
apt-get install -y --allow-change-held-packages --no-install-recommends \
111+
unzip \
112+
wget && \
113+
apt-get clean && \
114+
HOME_DIR=/root && \
115+
curl -o ${HOME_DIR}/oss_compliance.zip https://aws-dlinfra-utilities.s3.amazonaws.com/oss_compliance.zip && \
116+
unzip ${HOME_DIR}/oss_compliance.zip -d ${HOME_DIR}/ && \
117+
cp ${HOME_DIR}/oss_compliance/test/testOSSCompliance /usr/local/bin/testOSSCompliance && \
118+
chmod +x /usr/local/bin/testOSSCompliance && \
119+
chmod +x ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh && \
120+
${HOME_DIR}/oss_compliance/generate_oss_compliance.sh ${HOME_DIR} ${PYTHON} && \
121+
rm -rf ${HOME_DIR}/oss_compliance* && \
122+
rm -rf /tmp/tmp* && \
123+
rm -rf /var/lib/apt/lists/* && \
124+
rm -rf /root/.cache | true
125+
126+
CMD ["/bin/bash"]

base/x86_64/gpu/cu128/ubuntu24.04/Dockerfile

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,8 @@ ARG PYTHON_SHORT_VERSION="3.12"
44
ARG CUDA_MAJOR="12"
55
ARG CUDA_MINOR="8"
66
ARG EFA_VERSION="1.42.0"
7-
FROM nvidia/cuda:12.8.1-base-ubuntu24.04 AS base-builder
7+
ARG OS_VERSION="ubuntu24.04"
8+
FROM nvidia/cuda:12.8.1-base-${OS_VERSION} AS base-builder
89

910

1011
RUN mv /usr/local/cuda/compat /usr/local \
@@ -61,11 +62,12 @@ RUN bash install_python.sh ${PYTHON_VERSION} && rm install_python.sh
6162
FROM base-builder AS cuda-builder
6263
ARG CUDA_MAJOR
6364
ARG CUDA_MINOR
65+
ARG OS_VERSION
6466
COPY install_cuda.sh install_cuda.sh
65-
RUN bash install_cuda.sh "${CUDA_MAJOR}.${CUDA_MINOR}" && rm install_cuda.sh
67+
RUN bash install_cuda.sh "${CUDA_MAJOR}.${CUDA_MINOR}" "${OS_VERSION}" && rm install_cuda.sh
6668

6769
##############################################################################
68-
FROM nvidia/cuda:12.8.1-base-ubuntu24.04 AS final
70+
FROM nvidia/cuda:12.8.1-base-${OS_VERSION} AS final
6971
ARG PYTHON
7072
ARG PYTHON_SHORT_VERSION
7173
ARG CUDA_MAJOR

base/x86_64/gpu/cu129/ubuntu22.04/Dockerfile

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,8 @@ ARG PYTHON_SHORT_VERSION="3.12"
44
ARG CUDA_MAJOR="12"
55
ARG CUDA_MINOR="9"
66
ARG EFA_VERSION="1.43.1"
7-
FROM nvidia/cuda:12.9.1-base-ubuntu22.04 AS base-builder
7+
ARG OS_VERSION="ubuntu22.04"
8+
FROM nvidia/cuda:12.9.1-base-${OS_VERSION} AS base-builder
89

910

1011
RUN mv /usr/local/cuda/compat /usr/local \
@@ -61,11 +62,12 @@ RUN bash install_python.sh ${PYTHON_VERSION} && rm install_python.sh
6162
FROM base-builder AS cuda-builder
6263
ARG CUDA_MAJOR
6364
ARG CUDA_MINOR
65+
ARG OS_VERSION
6466
COPY install_cuda.sh install_cuda.sh
65-
RUN bash install_cuda.sh "${CUDA_MAJOR}.${CUDA_MINOR}" && rm install_cuda.sh
67+
RUN bash install_cuda.sh "${CUDA_MAJOR}.${CUDA_MINOR}" "${OS_VERSION}" && rm install_cuda.sh
6668

6769
##############################################################################
68-
FROM nvidia/cuda:12.9.1-base-ubuntu22.04 AS final
70+
FROM nvidia/cuda:12.9.1-base-${OS_VERSION} AS final
6971
ARG PYTHON
7072
ARG PYTHON_SHORT_VERSION
7173
ARG CUDA_MAJOR

scripts/install_cuda.sh

Lines changed: 57 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,7 @@ function install_nvjpeg_for_cuda_below_129 {
4747
}
4848

4949

50-
function install_cuda128_stack {
50+
function install_cuda128_stack_ul24 {
5151
CUDNN_VERSION="9.8.0.87"
5252
NCCL_VERSION="v2.26.2-1"
5353
CUDA_HOME="/usr/local/cuda"
@@ -87,7 +87,47 @@ function install_cuda128_stack {
8787
ldconfig
8888
}
8989

90-
function install_cuda129_stack {
90+
function install_cuda128_stack_ul22 {
91+
CUDNN_VERSION="9.7.1.26"
92+
NCCL_VERSION="v2.26.2-1"
93+
CUDA_HOME="/usr/local/cuda"
94+
95+
# move cuda-compt and remove existing cuda dir from nvidia/cuda:**.*.*-base-*
96+
rm -rf /usr/local/cuda-*
97+
rm -rf /usr/local/cuda
98+
99+
# install CUDA
100+
wget -q https://developer.download.nvidia.com/compute/cuda/12.8.0/local_installers/cuda_12.8.0_570.86.10_linux.run
101+
chmod +x cuda_12.8.0_570.86.10_linux.run
102+
./cuda_12.8.0_570.86.10_linux.run --toolkit --silent
103+
rm -f cuda_12.8.0_570.86.10_linux.run
104+
ln -s /usr/local/cuda-12.8 /usr/local/cuda
105+
# bring back cuda-compat
106+
mv /usr/local/compat /usr/local/cuda/compat
107+
108+
# install cudnn
109+
mkdir -p /tmp/cudnn
110+
cd /tmp/cudnn
111+
wget -q https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/linux-x86_64/cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive.tar.xz -O cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive.tar.xz
112+
tar xf cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive.tar.xz
113+
cp -a cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive/include/* /usr/local/cuda/include/
114+
cp -a cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive/lib/* /usr/local/cuda/lib64/
115+
116+
# install nccl
117+
mkdir -p /tmp/nccl
118+
cd /tmp/nccl
119+
git clone -b $NCCL_VERSION --depth 1 https://github.com/NVIDIA/nccl.git
120+
cd nccl
121+
make -j src.build
122+
cp -a build/include/* /usr/local/cuda/include/
123+
cp -a build/lib/* /usr/local/cuda/lib64/
124+
125+
install_nvjpeg_for_cuda_below_129
126+
prune_cuda
127+
ldconfig
128+
}
129+
130+
function install_cuda129_stack_ul22 {
91131
CUDNN_VERSION="9.10.2.21"
92132
NCCL_VERSION="v2.27.3-1"
93133
CUDA_HOME="/usr/local/cuda"
@@ -130,12 +170,20 @@ function install_cuda129_stack {
130170
while test $# -gt 0
131171
do
132172
case "$1" in
133-
12.8) install_cuda128_stack;
134-
;;
135-
12.9) install_cuda129_stack;
136-
;;
137-
*) echo "bad argument $1"; exit 1
138-
;;
173+
12.8)
174+
case "$2" in
175+
"ubuntu22.04") install_cuda128_stack_ul22 ;;
176+
"ubuntu24.04") install_cuda128_stack_ul24 ;;
177+
*) echo "bad OS version $2"; exit 1 ;;
178+
esac
179+
;;
180+
12.9)
181+
case "$2" in
182+
"ubuntu22.04") install_cuda129_stack_ul22 ;;
183+
*) echo "bad OS version $2"; exit 1 ;;
184+
esac
185+
;;
186+
*) echo "bad CUDA version $1"; exit 1 ;;
139187
esac
140-
shift
188+
shift 2 # Skip both arguments at once
141189
done

0 commit comments

Comments
 (0)