diff --git a/base/buildspec.yml b/base/buildspec.yml index 093a4f0caee1..90f61fe24a7f 100644 --- a/base/buildspec.yml +++ b/base/buildspec.yml @@ -2,8 +2,8 @@ account_id: &ACCOUNT_ID prod_account_id: &PROD_ACCOUNT_ID 763104351884 region: &REGION framework: &FRAMEWORK base -version: &VERSION 12.8.1 -short_version: &SHORT_VERSION "12.8" +version: &VERSION 12.6.3 +short_version: &SHORT_VERSION "12.6" arch_type: &ARCH_TYPE x86_64 autopatch_build: "False" @@ -32,16 +32,16 @@ context: target: install_efa.sh images: - base_x86_64_gpu_cuda128: + base_x86_64_gpu_cuda126: <<: *BASE_REPOSITORY context: <<: *BASE_CONTEXT image_size_baseline: 11000 device_type: &DEVICE_TYPE gpu - cuda_version: &CUDA_VERSION cu128 + cuda_version: &CUDA_VERSION cu126 python_version: &DOCKER_PYTHON_VERSION py3 - tag_python_version: &TAG_PYTHON_VERSION py312 - os_version: &OS_VERSION ubuntu24.04 + tag_python_version: &TAG_PYTHON_VERSION py311 + os_version: &OS_VERSION ubuntu22.04 tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-ec2" ] latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-ec2" ] docker_file: !join [ *FRAMEWORK, /, *ARCH_TYPE, /, *DEVICE_TYPE, /, *CUDA_VERSION, /Dockerfile ] diff --git a/base/x86_64/gpu/cu128/Dockerfile b/base/x86_64/gpu/cu126/Dockerfile similarity index 95% rename from base/x86_64/gpu/cu128/Dockerfile rename to base/x86_64/gpu/cu126/Dockerfile index 89c60b7205c1..7b811e92e5b9 100644 --- a/base/x86_64/gpu/cu128/Dockerfile +++ b/base/x86_64/gpu/cu126/Dockerfile @@ -1,10 +1,10 @@ ARG PYTHON="python3" -ARG PYTHON_VERSION="3.12.10" -ARG PYTHON_SHORT_VERSION="3.12" +ARG PYTHON_VERSION="3.11.12" +ARG PYTHON_SHORT_VERSION="3.11" ARG CUDA_MAJOR="12" -ARG CUDA_MINOR="8" +ARG CUDA_MINOR="6" ARG EFA_VERSION="1.42.0" -FROM nvidia/cuda:12.8.1-base-ubuntu24.04 AS base-builder +FROM nvidia/cuda:12.6.3-base-ubuntu22.04 AS base-builder RUN mv 
/usr/local/cuda/compat /usr/local \ @@ -65,7 +65,7 @@ COPY install_cuda.sh install_cuda.sh RUN bash install_cuda.sh "${CUDA_MAJOR}.${CUDA_MINOR}" && rm install_cuda.sh ############################################################################## -FROM nvidia/cuda:12.8.1-base-ubuntu24.04 AS final +FROM nvidia/cuda:12.6.3-base-ubuntu22.04 AS final ARG PYTHON ARG PYTHON_SHORT_VERSION ARG CUDA_MAJOR @@ -97,6 +97,7 @@ COPY --from=cuda-builder /usr/local/cuda-${CUDA_MAJOR}.${CUDA_MINOR} /usr/local/ COPY install_efa.sh install_efa.sh COPY deep_learning_container.py /usr/local/bin/deep_learning_container.py COPY bash_telemetry.sh /usr/local/bin/bash_telemetry.sh + RUN chmod +x /usr/local/bin/deep_learning_container.py && \ chmod +x /usr/local/bin/bash_telemetry.sh && \ echo 'source /usr/local/bin/bash_telemetry.sh' >> /etc/bash.bashrc && \ diff --git a/dlc_developer_config.toml b/dlc_developer_config.toml index 1962bfd69e21..e30b390cec8b 100644 --- a/dlc_developer_config.toml +++ b/dlc_developer_config.toml @@ -37,7 +37,7 @@ deep_canary_mode = false [build] # Add in frameworks you would like to build. By default, builds are disabled unless you specify building an image. # available frameworks - ["base", "vllm", "autogluon", "huggingface_tensorflow", "huggingface_pytorch", "huggingface_tensorflow_trcomp", "huggingface_pytorch_trcomp", "pytorch_trcomp", "tensorflow", "pytorch", "stabilityai_pytorch"] -build_frameworks = [] +build_frameworks = ["base"] # By default we build both training and inference containers. Set true/false values to determine which to build. 
diff --git a/scripts/install_cuda.sh b/scripts/install_cuda.sh index cf176622ad39..5c0fc27700b5 100644 --- a/scripts/install_cuda.sh +++ b/scripts/install_cuda.sh @@ -46,6 +46,45 @@ function install_nvjpeg_for_cuda_below_129 { rm -rf /tmp/nvjpeg } +function install_cuda126_stack { + CUDNN_VERSION="9.7.0.66" + NCCL_VERSION="v2.24.3-1" + CUDA_HOME="/usr/local/cuda" + + # move cuda-compat and remove existing cuda dir from nvidia/cuda:**.*.*-base-* + rm -rf /usr/local/cuda-* + rm -rf /usr/local/cuda + + # install CUDA 12.6.3 + wget -q https://developer.download.nvidia.com/compute/cuda/12.6.3/local_installers/cuda_12.6.3_560.35.05_linux.run + chmod +x cuda_12.6.3_560.35.05_linux.run + ./cuda_12.6.3_560.35.05_linux.run --toolkit --silent + rm -f cuda_12.6.3_560.35.05_linux.run + ln -s /usr/local/cuda-12.6 /usr/local/cuda + # bring back cuda-compat + mv /usr/local/compat /usr/local/cuda/compat 2>/dev/null || true + + # install cudnn + mkdir -p /tmp/cudnn + cd /tmp/cudnn + wget -q https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/linux-x86_64/cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive.tar.xz -O cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive.tar.xz + tar xf cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive.tar.xz + cp -a cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive/include/* /usr/local/cuda/include/ + cp -a cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive/lib/* /usr/local/cuda/lib64/ + + # install nccl + mkdir -p /tmp/nccl + cd /tmp/nccl + git clone -b $NCCL_VERSION --depth 1 https://github.com/NVIDIA/nccl.git + cd nccl + make -j src.build + cp -a build/include/* /usr/local/cuda/include/ + cp -a build/lib/* /usr/local/cuda/lib64/ + + install_nvjpeg_for_cuda_below_129 + prune_cuda + ldconfig +} function install_cuda128_stack { CUDNN_VERSION="9.8.0.87" @@ -91,6 +130,8 @@ function install_cuda128_stack { while test $# -gt 0 do case "$1" in + 12.6) install_cuda126_stack; + ;; 12.8) install_cuda128_stack; ;; *) echo "bad argument $1"; exit 1 
diff --git a/scripts/install_python.sh b/scripts/install_python.sh index c8579da2e4dc..acc572274443 100644 --- a/scripts/install_python.sh +++ b/scripts/install_python.sh @@ -23,7 +23,8 @@ function install_python { # this will add pip systemlink to pip${PYTHON_MAJOR_VERSION} python -m pip install --upgrade pip --trusted-host pypi.org --trusted-host files.pythonhosted.org - python -m pip install --no-cache-dir awscli boto3 requests setuptools + python -m pip install --no-cache-dir awscli boto3 requests + python -m pip install --no-cache-dir "setuptools>=78.1.1" } # idiomatic parameter and option handling in sh