Skip to content

Commit 4cdefae

Browse files
authored
Merge pull request #256 from red-hat-data-services/konflux-dockerfiles
add konflux Dockerfiles
2 parents 4c02282 + b09deb8 commit 4cdefae

File tree

4 files changed

+375
-0
lines changed

4 files changed

+375
-0
lines changed
Lines changed: 89 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,89 @@
1+
# Training image: CUDA 12.1 / Python 3.11 / PyTorch 2.4.1 on UBI9.
# NOTE(review): base tag is unpinned :latest (hadolint DL3007) — consider pinning
# a specific tag or digest for reproducible builds.
FROM registry.access.redhat.com/ubi9/python-311:latest

# Copy license
COPY LICENSE.md /licenses/cuda-license.md

# Root is required for dnf installs below; dropped again before the final image.
USER 0
WORKDIR /app

# Upgrade requests (pinned) to pick up security fixes in the base image.
RUN pip install --no-cache-dir --upgrade requests==2.32.3

# Install CUDA
WORKDIR /opt/app-root/bin

# Ref: https://docs.nvidia.com/cuda/archive/12.1.0/cuda-toolkit-release-notes/
ENV CUDA_VERSION=12.1.0 \
    NVIDIA_REQUIRE_CUDA="cuda>=12.1 brand=tesla,driver>=470,driver<471 brand=unknown,driver>=470,driver<471 brand=nvidia,driver>=470,driver<471 brand=nvidiartx,driver>=470,driver<471 brand=geforce,driver>=470,driver<471 brand=geforcertx,driver>=470,driver<471 brand=quadro,driver>=470,driver<471 brand=quadrortx,driver>=470,driver<471 brand=titan,driver>=470,driver<471 brand=titanrtx,driver>=470,driver<471 brand=tesla,driver>=525,driver<526 brand=unknown,driver>=525,driver<526 brand=nvidia,driver>=525,driver<526 brand=nvidiartx,driver>=525,driver<526 brand=geforce,driver>=525,driver<526 brand=geforcertx,driver>=525,driver<526 brand=quadro,driver>=525,driver<526 brand=quadrortx,driver>=525,driver<526 brand=titan,driver>=525,driver<526 brand=titanrtx,driver>=525,driver<526" \
    NV_CUDA_LIB_VERSION=12.1.0-1 \
    NVIDIA_VISIBLE_DEVICES=all \
    NVIDIA_DRIVER_CAPABILITIES=compute,utility \
    NV_CUDA_CUDART_VERSION=12.1.55-1 \
    NV_CUDA_COMPAT_VERSION=530.30.02-1 \
    NV_CUDA_NVCC_VERSION=12.1.66-1

# Ref: https://gitlab.com/nvidia/container-images/cuda/-/blob/master/dist/12.1.1/ubi9/base/Dockerfile
# nvcc is required for Flash Attention
RUN dnf config-manager \
        --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel9/x86_64/cuda-rhel9.repo \
    && dnf install -y \
        cuda-cudart-12-1-${NV_CUDA_CUDART_VERSION} \
        cuda-compat-12-1-${NV_CUDA_COMPAT_VERSION} \
        cuda-nvcc-12-1-${NV_CUDA_NVCC_VERSION} \
    && echo "/usr/local/nvidia/lib" >> /etc/ld.so.conf.d/nvidia.conf \
    && echo "/usr/local/nvidia/lib64" >> /etc/ld.so.conf.d/nvidia.conf \
    && dnf clean all

ENV CUDA_HOME="/usr/local/cuda" \
    PATH="/usr/local/nvidia/bin:${CUDA_HOME}/bin:${PATH}" \
    LD_LIBRARY_PATH="/usr/local/nvidia/lib:/usr/local/nvidia/lib64:$CUDA_HOME/lib64:$CUDA_HOME/extras/CUPTI/lib64:$LD_LIBRARY_PATH"

# Install InfiniBand and RDMA packages
RUN dnf config-manager \
        --add-repo https://linux.mellanox.com/public/repo/mlnx_ofed/latest/rhel9.5/mellanox_mlnx_ofed.repo \
    && dnf install -y \
        libibverbs-utils \
        infiniband-diags \
        libibumad3 \
        librdmacm \
        librdmacm-utils \
        rdma-core \
        mlnx-tools \
    && dnf clean all \
    && rm -rf /var/cache/dnf/*

# Install Python packages

# Install micropipenv to deploy packages from Pipfile.lock
RUN pip install --no-cache-dir -U "micropipenv[toml]"

# Install Python dependencies from Pipfile.lock file
COPY Pipfile.lock ./

RUN micropipenv install -- --no-cache-dir && \
    rm -f ./Pipfile.lock && \
    # Fix permissions to support pip in OpenShift environments \
    chmod -R g+w /opt/app-root/lib/python3.11/site-packages && \
    fix-permissions /opt/app-root -P

# Install Flash Attention
# wheel is a build prerequisite for flash-attn; --no-cache-dir added (hadolint
# DL3042) for consistency with the other pip invocations in this file.
RUN pip install --no-cache-dir wheel
RUN pip install --no-cache-dir flash-attn==2.7.4.post1 --no-build-isolation

# Upgrade NCCL to a more recent version until we upgrade torch
RUN pip install --no-cache-dir nvidia-nccl-cu12==2.26.2 && \
    fix-permissions /opt/app-root -P

# Restore user workspace
USER 1001

WORKDIR /opt/app-root/src

LABEL name="rhoai/odh-training-cuda121-torch24-py311-rhel9" \
      com.redhat.component="odh-training-cuda121-torch24-py311-rhel9" \
      io.k8s.display-name="odh-training-cuda121-torch24-py311-rhel9" \
      summary="CUDA 12.1 Python 3.11 PyTorch 2.4.1 image based on UBI9 for Training" \
      description="CUDA 12.1 Python 3.11 PyTorch 2.4.1 image based on UBI9 for Training" \
      io.k8s.description="CUDA 12.1 Python 3.11 PyTorch 2.4.1 image based on UBI9 for Training" \
      com.redhat.license_terms="https://www.redhat.com/licenses/Red_Hat_Standard_EULA_20191108.pdf"
Lines changed: 89 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,89 @@
1+
# Training image: CUDA 12.4 / Python 3.11 / PyTorch 2.5.1 on UBI9.
# NOTE(review): base tag is unpinned :latest (hadolint DL3007) — consider pinning
# a specific tag or digest for reproducible builds.
FROM registry.access.redhat.com/ubi9/python-311:latest

# Copy license
COPY LICENSE.md /licenses/cuda-license.md

# Root is required for dnf installs below; dropped again before the final image.
USER 0
WORKDIR /app

# Upgrade requests (pinned) to pick up security fixes in the base image.
RUN pip install --no-cache-dir --upgrade requests==2.32.3

# Install CUDA
WORKDIR /opt/app-root/bin

# Ref: https://docs.nvidia.com/cuda/archive/12.4.0/cuda-toolkit-release-notes/
ENV CUDA_VERSION=12.4.0 \
    NVIDIA_REQUIRE_CUDA="cuda>=12.4 brand=tesla,driver>=470,driver<471 brand=unknown,driver>=470,driver<471 brand=nvidia,driver>=470,driver<471 brand=nvidiartx,driver>=470,driver<471 brand=geforce,driver>=470,driver<471 brand=geforcertx,driver>=470,driver<471 brand=quadro,driver>=470,driver<471 brand=quadrortx,driver>=470,driver<471 brand=titan,driver>=470,driver<471 brand=titanrtx,driver>=470,driver<471 brand=tesla,driver>=525,driver<526 brand=unknown,driver>=525,driver<526 brand=nvidia,driver>=525,driver<526 brand=nvidiartx,driver>=525,driver<526 brand=geforce,driver>=525,driver<526 brand=geforcertx,driver>=525,driver<526 brand=quadro,driver>=525,driver<526 brand=quadrortx,driver>=525,driver<526 brand=titan,driver>=525,driver<526 brand=titanrtx,driver>=525,driver<526" \
    NV_CUDA_LIB_VERSION=12.4.0-1 \
    NVIDIA_VISIBLE_DEVICES=all \
    NVIDIA_DRIVER_CAPABILITIES=compute,utility \
    NV_CUDA_CUDART_VERSION=12.4.99-1 \
    NV_CUDA_COMPAT_VERSION=550.54.14-1 \
    NV_CUDA_NVCC_VERSION=12.4.99-1

# Ref: https://gitlab.com/nvidia/container-images/cuda/-/blob/master/dist/12.4.1/ubi9/base/Dockerfile
# nvcc is required for Flash Attention
RUN dnf config-manager \
        --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel9/x86_64/cuda-rhel9.repo \
    && dnf install -y \
        cuda-cudart-12-4-${NV_CUDA_CUDART_VERSION} \
        cuda-compat-12-4-${NV_CUDA_COMPAT_VERSION} \
        cuda-nvcc-12-4-${NV_CUDA_NVCC_VERSION} \
    && echo "/usr/local/nvidia/lib" >> /etc/ld.so.conf.d/nvidia.conf \
    && echo "/usr/local/nvidia/lib64" >> /etc/ld.so.conf.d/nvidia.conf \
    && dnf clean all

ENV CUDA_HOME="/usr/local/cuda" \
    PATH="/usr/local/nvidia/bin:${CUDA_HOME}/bin:${PATH}" \
    LD_LIBRARY_PATH="/usr/local/nvidia/lib:/usr/local/nvidia/lib64:$CUDA_HOME/lib64:$CUDA_HOME/extras/CUPTI/lib64:$LD_LIBRARY_PATH"

# Install InfiniBand and RDMA packages
RUN dnf config-manager \
        --add-repo https://linux.mellanox.com/public/repo/mlnx_ofed/latest/rhel9.5/mellanox_mlnx_ofed.repo \
    && dnf install -y \
        libibverbs-utils \
        infiniband-diags \
        libibumad3 \
        librdmacm \
        librdmacm-utils \
        rdma-core \
        mlnx-tools \
    && dnf clean all \
    && rm -rf /var/cache/dnf/*

# Install Python packages

# Install micropipenv to deploy packages from Pipfile.lock
RUN pip install --no-cache-dir -U "micropipenv[toml]"

# Install Python dependencies from Pipfile.lock file
COPY Pipfile.lock ./

RUN micropipenv install -- --no-cache-dir && \
    rm -f ./Pipfile.lock && \
    # Fix permissions to support pip in OpenShift environments \
    chmod -R g+w /opt/app-root/lib/python3.11/site-packages && \
    fix-permissions /opt/app-root -P

# Install Flash Attention
# wheel is a build prerequisite for flash-attn; --no-cache-dir added (hadolint
# DL3042) for consistency with the other pip invocations in this file.
RUN pip install --no-cache-dir wheel
RUN pip install --no-cache-dir flash-attn==2.7.4.post1 --no-build-isolation

# Upgrade NCCL to a more recent version until we upgrade torch
RUN pip install --no-cache-dir nvidia-nccl-cu12==2.26.2 && \
    fix-permissions /opt/app-root -P

# Restore user workspace
USER 1001

WORKDIR /opt/app-root/src

LABEL name="rhoai/odh-training-cuda124-torch25-py311-rhel9" \
      com.redhat.component="odh-training-cuda124-torch25-py311-rhel9" \
      io.k8s.display-name="odh-training-cuda124-torch25-py311-rhel9" \
      summary="CUDA 12.4 Python 3.11 PyTorch 2.5.1 image based on UBI9 for Training" \
      description="CUDA 12.4 Python 3.11 PyTorch 2.5.1 image based on UBI9 for Training" \
      io.k8s.description="CUDA 12.4 Python 3.11 PyTorch 2.5.1 image based on UBI9 for Training" \
      com.redhat.license_terms="https://www.redhat.com/licenses/Red_Hat_Standard_EULA_20191108.pdf"
Lines changed: 99 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,99 @@
1+
# Training image: ROCm 6.2 / Python 3.11 / PyTorch 2.4.1 on UBI9.
# NOTE(review): base tag is unpinned :latest (hadolint DL3007) — consider pinning
# a specific tag or digest for reproducible builds.
FROM registry.access.redhat.com/ubi9/python-311:latest AS base

# Copy license
COPY LICENSE.md /licenses/rocm-license.md

# Root is required for dnf installs below; dropped again before the final image.
USER 0
WORKDIR /app

# Upgrade requests (pinned) to pick up security fixes in the base image.
RUN pip install --no-cache-dir --upgrade requests==2.32.3

# Install ROCm
WORKDIR /opt/app-root/bin

ARG ROCM_VERSION=6.2.4
ARG AMDGPU_VERSION=6.2.4

# Write the AMD repo definitions; $AMDGPU_VERSION/$ROCM_VERSION are expanded by
# the build shell because the inner heredoc delimiter (EOD) is unquoted.
RUN <<EOF
cat <<EOD > /etc/yum.repos.d/rocm.repo
[amdgpu]
name=amdgpu
baseurl=https://repo.radeon.com/amdgpu/$AMDGPU_VERSION/rhel/9.4/main/x86_64/
enabled=1
priority=50
gpgcheck=1
gpgkey=https://repo.radeon.com/rocm/rocm.gpg.key

[ROCm]
name=ROCm
baseurl=https://repo.radeon.com/rocm/rhel9/$ROCM_VERSION/main
enabled=1
priority=50
gpgcheck=1
gpgkey=https://repo.radeon.com/rocm/rocm.gpg.key
EOD
EOF

# cmake is needed for the bitsandbytes build below.
RUN dnf install -y cmake rocm-developer-tools rocm-ml-sdk rocm-opencl-sdk rocm-openmp-sdk rocm-utils && dnf clean all && rm -rf /var/cache/dnf

# Install Python packages

# Install micropipenv to deploy packages from Pipfile.lock
RUN pip install --no-cache-dir -U "micropipenv[toml]"

# Install Python dependencies from Pipfile.lock file
COPY Pipfile.lock ./

# -- --no-cache-dir is forwarded to pip, matching the sibling CUDA Dockerfiles.
RUN micropipenv install -- --no-cache-dir && \
    rm -f ./Pipfile.lock && \
    # Fix permissions to support pip in OpenShift environments \
    chmod -R g+w /opt/app-root/lib/python3.11/site-packages && \
    fix-permissions /opt/app-root -P

# Install Flash Attention
# Semicolon-separated cmake list of target GPU architectures, consumed by the
# bitsandbytes build below. Quoted for clarity; the value is unchanged.
ENV GPU_ARCHS="gfx90a;gfx941;gfx942"

RUN pip install --no-cache-dir wheel ninja

# Build flash-attention from source (no ROCm wheels on PyPI for this version).
RUN export TMP_DIR=$(mktemp -d) \
    && cd $TMP_DIR \
    && git clone --depth 1 --branch v2.7.4 https://github.com/Dao-AILab/flash-attention.git \
    && cd flash-attention \
    && git submodule update --init \
    && MAX_JOBS="16" python3 setup.py install --verbose \
    && rm -rf $TMP_DIR

# Install BitsAndBytes
RUN export TMP_DIR=$(mktemp -d) \
    && cd $TMP_DIR \
    && git clone --depth 1 --branch rocm_enabled_multi_backend https://github.com/ROCm/bitsandbytes.git \
    && cd bitsandbytes \
    && cmake -S . \
    && make \
    && cmake -DCOMPUTE_BACKEND=hip -DBNB_ROCM_ARCH="$GPU_ARCHS" -S . \
    && make \
    # Workaround: ship the ROCm 6.2 library under the 6.1 name as well so
    # consumers probing for either soname find it. \
    && cp bitsandbytes/libbitsandbytes_rocm62.so bitsandbytes/libbitsandbytes_rocm61.so \
    && pip install --no-cache-dir --no-deps . \
    && rm -rf $TMP_DIR

# Install a compatible version of pytorch-triton-rocm
RUN pip install --no-cache-dir --force-reinstall pytorch-triton-rocm==3.1.0 --index-url https://download.pytorch.org/whl/nightly/rocm6.2

# Reapply write permissions on site-packages (pip support in OpenShift).
RUN chmod -R g+w /opt/app-root/lib/python3.11/site-packages && \
    fix-permissions /opt/app-root -P

# Restore user workspace
USER 1001

WORKDIR /opt/app-root/src

LABEL name="rhoai/odh-training-rocm62-torch24-py311-rhel9" \
      com.redhat.component="odh-training-rocm62-torch24-py311-rhel9" \
      io.k8s.display-name="odh-training-rocm62-torch24-py311-rhel9" \
      summary="ROCm 6.2 Python 3.11 PyTorch 2.4.1 image based on UBI9 for Training" \
      description="ROCm 6.2 Python 3.11 PyTorch 2.4.1 image based on UBI9 for Training" \
      io.k8s.description="ROCm 6.2 Python 3.11 PyTorch 2.4.1 image based on UBI9 for Training" \
      com.redhat.license_terms="https://www.redhat.com/licenses/Red_Hat_Standard_EULA_20191108.pdf"
Lines changed: 98 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,98 @@
1+
# Training image: ROCm 6.2 / Python 3.11 / PyTorch 2.5.1 on UBI9.
# NOTE(review): base tag is unpinned :latest (hadolint DL3007) — consider pinning
# a specific tag or digest for reproducible builds.
FROM registry.access.redhat.com/ubi9/python-311:latest AS base

# Copy license
COPY LICENSE.md /licenses/rocm-license.md

# Root is required for dnf installs below; dropped again before the final image.
USER 0
WORKDIR /app

# Upgrade requests (pinned) to pick up security fixes in the base image.
RUN pip install --no-cache-dir --upgrade requests==2.32.3

# Install ROCm
WORKDIR /opt/app-root/bin

ARG ROCM_VERSION=6.2.4
ARG AMDGPU_VERSION=6.2.4

# Write the AMD repo definitions; $AMDGPU_VERSION/$ROCM_VERSION are expanded by
# the build shell because the inner heredoc delimiter (EOD) is unquoted.
RUN <<EOF
cat <<EOD > /etc/yum.repos.d/rocm.repo
[amdgpu]
name=amdgpu
baseurl=https://repo.radeon.com/amdgpu/$AMDGPU_VERSION/rhel/9.4/main/x86_64/
enabled=1
priority=50
gpgcheck=1
gpgkey=https://repo.radeon.com/rocm/rocm.gpg.key

[ROCm]
name=ROCm
baseurl=https://repo.radeon.com/rocm/rhel9/$ROCM_VERSION/main
enabled=1
priority=50
gpgcheck=1
gpgkey=https://repo.radeon.com/rocm/rocm.gpg.key
EOD
EOF

# cmake is needed for the bitsandbytes build below.
RUN dnf install -y cmake rocm-developer-tools rocm-ml-sdk rocm-opencl-sdk rocm-openmp-sdk rocm-utils && dnf clean all && rm -rf /var/cache/dnf

# Install Python packages

# Install micropipenv to deploy packages from Pipfile.lock
RUN pip install --no-cache-dir -U "micropipenv[toml]"

# Install Python dependencies from Pipfile.lock file
COPY Pipfile.lock ./

# -- --no-cache-dir is forwarded to pip, matching the sibling CUDA Dockerfiles.
RUN micropipenv install -- --no-cache-dir && \
    rm -f ./Pipfile.lock && \
    # Fix permissions to support pip in OpenShift environments \
    chmod -R g+w /opt/app-root/lib/python3.11/site-packages && \
    fix-permissions /opt/app-root -P

# Install Flash Attention
# Semicolon-separated cmake list of target GPU architectures, consumed by the
# bitsandbytes build below. Quoted for clarity; the value is unchanged.
ENV GPU_ARCHS="gfx90a;gfx941;gfx942"

RUN pip install --no-cache-dir wheel ninja

# Build flash-attention from source (no ROCm wheels on PyPI for this version).
RUN export TMP_DIR=$(mktemp -d) \
    && cd $TMP_DIR \
    && git clone --depth 1 --branch v2.7.4 https://github.com/Dao-AILab/flash-attention.git \
    && cd flash-attention \
    && git submodule update --init \
    && MAX_JOBS="16" pip install --no-cache-dir --no-build-isolation . \
    && rm -rf $TMP_DIR

# Install BitsAndBytes
RUN export TMP_DIR=$(mktemp -d) \
    && cd $TMP_DIR \
    && git clone --depth 1 --branch rocm_enabled_multi_backend https://github.com/ROCm/bitsandbytes.git \
    && cd bitsandbytes \
    && cmake -S . \
    && make \
    && cmake -DCOMPUTE_BACKEND=hip -DBNB_ROCM_ARCH="$GPU_ARCHS" -S . \
    && make \
    && pip install --no-cache-dir --no-deps . \
    && rm -rf $TMP_DIR

# Install a compatible version of pytorch-triton-rocm
RUN pip install --no-cache-dir --force-reinstall pytorch-triton-rocm==3.1.0 --index-url https://download.pytorch.org/whl/nightly/rocm6.2

# Reapply write permissions on site-packages (pip support in OpenShift).
RUN chmod -R g+w /opt/app-root/lib/python3.11/site-packages && \
    fix-permissions /opt/app-root -P

# Restore user workspace
USER 1001

WORKDIR /opt/app-root/src

LABEL name="rhoai/odh-training-rocm62-torch25-py311-rhel9" \
      com.redhat.component="odh-training-rocm62-torch25-py311-rhel9" \
      io.k8s.display-name="odh-training-rocm62-torch25-py311-rhel9" \
      summary="ROCm 6.2 Python 3.11 PyTorch 2.5.1 image based on UBI9 for Training" \
      description="ROCm 6.2 Python 3.11 PyTorch 2.5.1 image based on UBI9 for Training" \
      io.k8s.description="ROCm 6.2 Python 3.11 PyTorch 2.5.1 image based on UBI9 for Training" \
      com.redhat.license_terms="https://www.redhat.com/licenses/Red_Hat_Standard_EULA_20191108.pdf"

0 commit comments

Comments
 (0)