diff --git a/base/buildspec-cu126.yml b/base/buildspec-cu126.yml new file mode 100644 index 000000000000..9def0adad1d9 --- /dev/null +++ b/base/buildspec-cu126.yml @@ -0,0 +1,54 @@ +account_id: &ACCOUNT_ID +prod_account_id: &PROD_ACCOUNT_ID 763104351884 +region: &REGION +framework: &FRAMEWORK base +version: &VERSION 12.6.3 +short_version: &SHORT_VERSION "12.6" +arch_type: &ARCH_TYPE x86_64 +autopatch_build: "False" + +repository_info: + base_repository: &BASE_REPOSITORY + image_type: &IMAGE_TYPE gpu + root: . + repository_name: &REPOSITORY_NAME !join [ pr, "-", *FRAMEWORK ] + repository: &REPOSITORY !join [ *ACCOUNT_ID, .dkr.ecr., *REGION, .amazonaws.com/, *REPOSITORY_NAME ] + release_repository_name: &RELEASE_REPOSITORY_NAME !join [ *FRAMEWORK ] + release_repository: &RELEASE_REPOSITORY !join [ *PROD_ACCOUNT_ID, .dkr.ecr., *REGION, .amazonaws.com/, *RELEASE_REPOSITORY_NAME ] + +context: + base_context: &BASE_CONTEXT + deep_learning_container: + source: src/deep_learning_container.py + target: deep_learning_container.py + install_python: + source: scripts/install_python.sh + target: install_python.sh + install_cuda: + source: scripts/install_cuda.sh + target: install_cuda.sh + install_efa: + source: scripts/install_efa.sh + target: install_efa.sh + +images: + base_x86_64_gpu_cuda126: + <<: *BASE_REPOSITORY + context: + <<: *BASE_CONTEXT + image_size_baseline: 11000 + device_type: &DEVICE_TYPE gpu + cuda_version: &CUDA_VERSION cu126 + python_version: &DOCKER_PYTHON_VERSION py3 + tag_python_version: &TAG_PYTHON_VERSION py312 + os_version: &OS_VERSION ubuntu22.04 + tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-ec2" ] + latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-ec2" ] + docker_file: !join [ *FRAMEWORK, /, *ARCH_TYPE, /, *DEVICE_TYPE, /, *CUDA_VERSION, /Dockerfile ] + target: final + build: true + enable_common_stage_build: false + 
test_configs: + test_platforms: + - sanity + - security diff --git a/base/buildspec-cu128.yml b/base/buildspec-cu128.yml new file mode 100644 index 000000000000..093a4f0caee1 --- /dev/null +++ b/base/buildspec-cu128.yml @@ -0,0 +1,54 @@ +account_id: &ACCOUNT_ID +prod_account_id: &PROD_ACCOUNT_ID 763104351884 +region: &REGION +framework: &FRAMEWORK base +version: &VERSION 12.8.1 +short_version: &SHORT_VERSION "12.8" +arch_type: &ARCH_TYPE x86_64 +autopatch_build: "False" + +repository_info: + base_repository: &BASE_REPOSITORY + image_type: &IMAGE_TYPE gpu + root: . + repository_name: &REPOSITORY_NAME !join [ pr, "-", *FRAMEWORK ] + repository: &REPOSITORY !join [ *ACCOUNT_ID, .dkr.ecr., *REGION, .amazonaws.com/, *REPOSITORY_NAME ] + release_repository_name: &RELEASE_REPOSITORY_NAME !join [ *FRAMEWORK ] + release_repository: &RELEASE_REPOSITORY !join [ *PROD_ACCOUNT_ID, .dkr.ecr., *REGION, .amazonaws.com/, *RELEASE_REPOSITORY_NAME ] + +context: + base_context: &BASE_CONTEXT + deep_learning_container: + source: src/deep_learning_container.py + target: deep_learning_container.py + install_python: + source: scripts/install_python.sh + target: install_python.sh + install_cuda: + source: scripts/install_cuda.sh + target: install_cuda.sh + install_efa: + source: scripts/install_efa.sh + target: install_efa.sh + +images: + base_x86_64_gpu_cuda128: + <<: *BASE_REPOSITORY + context: + <<: *BASE_CONTEXT + image_size_baseline: 11000 + device_type: &DEVICE_TYPE gpu + cuda_version: &CUDA_VERSION cu128 + python_version: &DOCKER_PYTHON_VERSION py3 + tag_python_version: &TAG_PYTHON_VERSION py312 + os_version: &OS_VERSION ubuntu24.04 + tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-ec2" ] + latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-ec2" ] + docker_file: !join [ *FRAMEWORK, /, *ARCH_TYPE, /, *DEVICE_TYPE, /, *CUDA_VERSION, /Dockerfile ] + target: 
final + build: true + enable_common_stage_build: false + test_configs: + test_platforms: + - sanity + - security diff --git a/base/buildspec.yml b/base/buildspec.yml index 093a4f0caee1..5cb7142804e9 100644 --- a/base/buildspec.yml +++ b/base/buildspec.yml @@ -1,54 +1 @@ -account_id: &ACCOUNT_ID -prod_account_id: &PROD_ACCOUNT_ID 763104351884 -region: &REGION -framework: &FRAMEWORK base -version: &VERSION 12.8.1 -short_version: &SHORT_VERSION "12.8" -arch_type: &ARCH_TYPE x86_64 -autopatch_build: "False" - -repository_info: - base_repository: &BASE_REPOSITORY - image_type: &IMAGE_TYPE gpu - root: . - repository_name: &REPOSITORY_NAME !join [ pr, "-", *FRAMEWORK ] - repository: &REPOSITORY !join [ *ACCOUNT_ID, .dkr.ecr., *REGION, .amazonaws.com/, *REPOSITORY_NAME ] - release_repository_name: &RELEASE_REPOSITORY_NAME !join [ *FRAMEWORK ] - release_repository: &RELEASE_REPOSITORY !join [ *PROD_ACCOUNT_ID, .dkr.ecr., *REGION, .amazonaws.com/, *RELEASE_REPOSITORY_NAME ] - -context: - base_context: &BASE_CONTEXT - deep_learning_container: - source: src/deep_learning_container.py - target: deep_learning_container.py - install_python: - source: scripts/install_python.sh - target: install_python.sh - install_cuda: - source: scripts/install_cuda.sh - target: install_cuda.sh - install_efa: - source: scripts/install_efa.sh - target: install_efa.sh - -images: - base_x86_64_gpu_cuda128: - <<: *BASE_REPOSITORY - context: - <<: *BASE_CONTEXT - image_size_baseline: 11000 - device_type: &DEVICE_TYPE gpu - cuda_version: &CUDA_VERSION cu128 - python_version: &DOCKER_PYTHON_VERSION py3 - tag_python_version: &TAG_PYTHON_VERSION py312 - os_version: &OS_VERSION ubuntu24.04 - tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-ec2" ] - latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-ec2" ] - docker_file: !join [ *FRAMEWORK, /, *ARCH_TYPE, /, *DEVICE_TYPE, /, 
*CUDA_VERSION, /Dockerfile ] - target: final - build: true - enable_common_stage_build: false - test_configs: - test_platforms: - - sanity - - security +buildspec_pointer: buildspec-cu126.yml \ No newline at end of file diff --git a/base/x86_64/gpu/cu126/Dockerfile b/base/x86_64/gpu/cu126/Dockerfile new file mode 100644 index 000000000000..769035f317d2 --- /dev/null +++ b/base/x86_64/gpu/cu126/Dockerfile @@ -0,0 +1,125 @@ +ARG PYTHON="python3" +ARG PYTHON_VERSION="3.12.11" +ARG PYTHON_SHORT_VERSION="3.12" +ARG CUDA_MAJOR="12" +ARG CUDA_MINOR="6" +ARG EFA_VERSION="1.42.0" +FROM nvidia/cuda:12.6.3-base-ubuntu22.04 AS base-builder + + +RUN mv /usr/local/cuda/compat /usr/local \ + && apt-get update \ + && apt-get -y upgrade --only-upgrade systemd \ + && apt-get install -y --allow-change-held-packages --no-install-recommends \ + automake \ + build-essential \ + ca-certificates \ + cmake \ + curl \ + emacs \ + git \ + jq \ + libcurl4-openssl-dev \ + libglib2.0-0 \ + libegl1 \ + libgl1 \ + libsm6 \ + libssl-dev \ + libxext6 \ + libxrender-dev \ + zlib1g-dev \ + unzip \ + vim \ + wget \ + libhwloc-dev \ + libgomp1 \ + libibverbs-dev \ + libnuma1 \ + libnuma-dev \ + libtool \ + openssl \ + python3-dev \ + autoconf \ + pkg-config \ + check \ + libsubunit0 \ + libsubunit-dev \ + libffi-dev \ + libbz2-dev \ + liblzma-dev \ + && apt-get autoremove -y \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* + +############################################################################## +FROM base-builder AS python-builder +ARG PYTHON_VERSION +COPY install_python.sh install_python.sh +RUN bash install_python.sh ${PYTHON_VERSION} && rm install_python.sh + +############################################################################## +FROM base-builder AS cuda-builder +ARG CUDA_MAJOR +ARG CUDA_MINOR +COPY install_cuda.sh install_cuda.sh +RUN bash install_cuda.sh "${CUDA_MAJOR}.${CUDA_MINOR}" && rm install_cuda.sh + 
+############################################################################## +FROM nvidia/cuda:12.6.3-base-ubuntu22.04 AS final +ARG PYTHON +ARG PYTHON_SHORT_VERSION +ARG CUDA_MAJOR +ARG CUDA_MINOR +ARG EFA_VERSION +LABEL maintainer="Amazon AI" +LABEL dlc_major_version="1" +ENV DEBIAN_FRONTEND=noninteractive \ + LANG=C.UTF-8 \ + LC_ALL=C.UTF-8 \ + DLC_CONTAINER_TYPE=base \ + # Python won’t try to write .pyc or .pyo files on the import of source modules + # Force stdin, stdout and stderr to be totally unbuffered. Good for logging + PYTHONDONTWRITEBYTECODE=1 \ + PYTHONUNBUFFERED=1 \ + PYTHONIOENCODING=UTF-8 \ + CUDA_HOME="/usr/local/cuda" \ + PATH="/opt/amazon/openmpi/bin:/opt/amazon/efa/bin:/usr/local/cuda/bin:${PATH}" \ + LD_LIBRARY_PATH="/usr/local/lib:/usr/local/cuda/lib64:/opt/amazon/ofi-nccl/lib/x86_64-linux-gnu:/opt/amazon/efa/lib:/opt/amazon/openmpi/lib:${LD_LIBRARY_PATH}" + +WORKDIR / + +# + python and pip packages (awscli, boto3, requests) +COPY --from=python-builder /usr/local/lib/python${PYTHON_SHORT_VERSION} /usr/local/lib/python${PYTHON_SHORT_VERSION} +COPY --from=python-builder /usr/local/include/python${PYTHON_SHORT_VERSION} /usr/local/include/python${PYTHON_SHORT_VERSION} +COPY --from=python-builder /usr/local/bin /usr/local/bin +# + cuda-toolkit, cudnn, nccl +COPY --from=cuda-builder /usr/local/cuda-${CUDA_MAJOR}.${CUDA_MINOR} /usr/local/cuda-${CUDA_MAJOR}.${CUDA_MINOR} +COPY install_efa.sh install_efa.sh +COPY deep_learning_container.py /usr/local/bin/deep_learning_container.py +COPY bash_telemetry.sh /usr/local/bin/bash_telemetry.sh +RUN chmod +x /usr/local/bin/deep_learning_container.py && \ + chmod +x /usr/local/bin/bash_telemetry.sh && \ + echo 'source /usr/local/bin/bash_telemetry.sh' >> /etc/bash.bashrc && \ + # Install EFA + bash install_efa.sh ${EFA_VERSION} && \ + rm install_efa.sh && \ + # OSS compliance + apt-get update && \ + apt-get upgrade -y && \ + apt-get install -y --allow-change-held-packages --no-install-recommends \ + unzip \ 
+ wget && \ + apt-get clean && \ + HOME_DIR=/root && \ + curl -o ${HOME_DIR}/oss_compliance.zip https://aws-dlinfra-utilities.s3.amazonaws.com/oss_compliance.zip && \ + unzip ${HOME_DIR}/oss_compliance.zip -d ${HOME_DIR}/ && \ + cp ${HOME_DIR}/oss_compliance/test/testOSSCompliance /usr/local/bin/testOSSCompliance && \ + chmod +x /usr/local/bin/testOSSCompliance && \ + chmod +x ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh && \ + ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh ${HOME_DIR} ${PYTHON} && \ + rm -rf ${HOME_DIR}/oss_compliance* && \ + rm -rf /tmp/tmp* && \ + rm -rf /var/lib/apt/lists/* && \ + rm -rf /root/.cache || true + +CMD ["/bin/bash"] \ No newline at end of file diff --git a/scripts/install_cuda.sh b/scripts/install_cuda.sh index cf176622ad39..60ee0810d3c0 100644 --- a/scripts/install_cuda.sh +++ b/scripts/install_cuda.sh @@ -46,6 +46,45 @@ function install_nvjpeg_for_cuda_below_129 { rm -rf /tmp/nvjpeg } +function install_cuda126_stack { + CUDNN_VERSION="9.7.0.66" + NCCL_VERSION="v2.24.3-1" + CUDA_HOME="/usr/local/cuda" + + # move cuda-compat and remove existing cuda dir from nvidia/cuda:**.*.*-base-* + rm -rf /usr/local/cuda-* + rm -rf /usr/local/cuda + + # install CUDA 12.6.3 + wget -q https://developer.download.nvidia.com/compute/cuda/12.6.3/local_installers/cuda_12.6.3_560.35.05_linux.run + chmod +x cuda_12.6.3_560.35.05_linux.run + ./cuda_12.6.3_560.35.05_linux.run --toolkit --silent + rm -f cuda_12.6.3_560.35.05_linux.run + ln -s /usr/local/cuda-12.6 /usr/local/cuda + # bring back cuda-compat + mv /usr/local/compat /usr/local/cuda/compat 2>/dev/null || true + + # install cudnn + mkdir -p /tmp/cudnn + cd /tmp/cudnn + wget -q https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/linux-x86_64/cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive.tar.xz -O cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive.tar.xz + tar xf cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive.tar.xz + cp -a 
cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive/include/* /usr/local/cuda/include/ + cp -a cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive/lib/* /usr/local/cuda/lib64/ + + # install nccl + mkdir -p /tmp/nccl + cd /tmp/nccl + git clone -b $NCCL_VERSION --depth 1 https://github.com/NVIDIA/nccl.git + cd nccl + make -j src.build + cp -a build/include/* /usr/local/cuda/include/ + cp -a build/lib/* /usr/local/cuda/lib64/ + + install_nvjpeg_for_cuda_below_129 + prune_cuda + ldconfig +} function install_cuda128_stack { CUDNN_VERSION="9.8.0.87" @@ -91,6 +130,8 @@ function install_cuda128_stack { while test $# -gt 0 do case "$1" in + 12.6) install_cuda126_stack; + ;; 12.8) install_cuda128_stack; ;; *) echo "bad argument $1"; exit 1 diff --git a/scripts/install_python.sh b/scripts/install_python.sh index c8579da2e4dc..acc572274443 100644 --- a/scripts/install_python.sh +++ b/scripts/install_python.sh @@ -23,7 +23,8 @@ function install_python { # this will add pip systemlink to pip${PYTHON_MAJOR_VERSION} python -m pip install --upgrade pip --trusted-host pypi.org --trusted-host files.pythonhosted.org - python -m pip install --no-cache-dir awscli boto3 requests setuptools + python -m pip install --no-cache-dir awscli boto3 requests + python -m pip install --no-cache-dir "setuptools>=78.1.1" } # idiomatic parameter and option handling in sh