Skip to content

Commit 4cdefae

Browse files
authored
Merge pull request #256 from red-hat-data-services/konflux-dockerfiles
add konflux Dockerfiles
2 parents 4c02282 + b09deb8 commit 4cdefae

File tree

4 files changed

+375
-0
lines changed

4 files changed

+375
-0
lines changed
Lines changed: 89 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,89 @@
1+
# Training image: CUDA 12.1 / Python 3.11 / PyTorch 2.4.1 on UBI9.
# NOTE(review): base tag is unpinned :latest (hadolint DL3007) — consider pinning
# a specific tag or digest for reproducible builds.
FROM registry.access.redhat.com/ubi9/python-311:latest

# Copy license
COPY LICENSE.md /licenses/cuda-license.md

# Root is required for dnf installs below; dropped again before the final image.
USER 0
WORKDIR /app

# Upgrade requests (pinned) to pick up security fixes in the base image.
RUN pip install --no-cache-dir --upgrade requests==2.32.3

# Install CUDA
WORKDIR /opt/app-root/bin

# Ref: https://docs.nvidia.com/cuda/archive/12.1.0/cuda-toolkit-release-notes/
ENV CUDA_VERSION=12.1.0 \
    NVIDIA_REQUIRE_CUDA="cuda>=12.1 brand=tesla,driver>=470,driver<471 brand=unknown,driver>=470,driver<471 brand=nvidia,driver>=470,driver<471 brand=nvidiartx,driver>=470,driver<471 brand=geforce,driver>=470,driver<471 brand=geforcertx,driver>=470,driver<471 brand=quadro,driver>=470,driver<471 brand=quadrortx,driver>=470,driver<471 brand=titan,driver>=470,driver<471 brand=titanrtx,driver>=470,driver<471 brand=tesla,driver>=525,driver<526 brand=unknown,driver>=525,driver<526 brand=nvidia,driver>=525,driver<526 brand=nvidiartx,driver>=525,driver<526 brand=geforce,driver>=525,driver<526 brand=geforcertx,driver>=525,driver<526 brand=quadro,driver>=525,driver<526 brand=quadrortx,driver>=525,driver<526 brand=titan,driver>=525,driver<526 brand=titanrtx,driver>=525,driver<526" \
    NV_CUDA_LIB_VERSION=12.1.0-1 \
    NVIDIA_VISIBLE_DEVICES=all \
    NVIDIA_DRIVER_CAPABILITIES=compute,utility \
    NV_CUDA_CUDART_VERSION=12.1.55-1 \
    NV_CUDA_COMPAT_VERSION=530.30.02-1 \
    NV_CUDA_NVCC_VERSION=12.1.66-1

# Ref: https://gitlab.com/nvidia/container-images/cuda/-/blob/master/dist/12.1.1/ubi9/base/Dockerfile
# nvcc is required for Flash Attention
RUN dnf config-manager \
        --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel9/x86_64/cuda-rhel9.repo \
    && dnf install -y \
        cuda-cudart-12-1-${NV_CUDA_CUDART_VERSION} \
        cuda-compat-12-1-${NV_CUDA_COMPAT_VERSION} \
        cuda-nvcc-12-1-${NV_CUDA_NVCC_VERSION} \
    && echo "/usr/local/nvidia/lib" >> /etc/ld.so.conf.d/nvidia.conf \
    && echo "/usr/local/nvidia/lib64" >> /etc/ld.so.conf.d/nvidia.conf \
    && dnf clean all

ENV CUDA_HOME="/usr/local/cuda" \
    PATH="/usr/local/nvidia/bin:${CUDA_HOME}/bin:${PATH}" \
    LD_LIBRARY_PATH="/usr/local/nvidia/lib:/usr/local/nvidia/lib64:$CUDA_HOME/lib64:$CUDA_HOME/extras/CUPTI/lib64:$LD_LIBRARY_PATH"

# Install InfiniBand and RDMA packages
RUN dnf config-manager \
        --add-repo https://linux.mellanox.com/public/repo/mlnx_ofed/latest/rhel9.5/mellanox_mlnx_ofed.repo \
    && dnf install -y \
        libibverbs-utils \
        infiniband-diags \
        libibumad3 \
        librdmacm \
        librdmacm-utils \
        rdma-core \
        mlnx-tools \
    && dnf clean all \
    && rm -rf /var/cache/dnf/*

# Install Python packages

# Install micropipenv to deploy packages from Pipfile.lock
RUN pip install --no-cache-dir -U "micropipenv[toml]"

# Install Python dependencies from Pipfile.lock file
COPY Pipfile.lock ./

RUN micropipenv install -- --no-cache-dir && \
    rm -f ./Pipfile.lock && \
    # Fix permissions to support pip in OpenShift environments \
    chmod -R g+w /opt/app-root/lib/python3.11/site-packages && \
    fix-permissions /opt/app-root -P

# Install Flash Attention
# wheel is a build prerequisite for flash-attn; --no-cache-dir added (hadolint
# DL3042) for consistency with the other pip invocations in this file.
RUN pip install --no-cache-dir wheel
RUN pip install --no-cache-dir flash-attn==2.7.4.post1 --no-build-isolation

# Upgrade NCCL to a more recent version until we upgrade torch
RUN pip install --no-cache-dir nvidia-nccl-cu12==2.26.2 && \
    fix-permissions /opt/app-root -P

# Restore user workspace
USER 1001

WORKDIR /opt/app-root/src

LABEL name="rhoai/odh-training-cuda121-torch24-py311-rhel9" \
      com.redhat.component="odh-training-cuda121-torch24-py311-rhel9" \
      io.k8s.display-name="odh-training-cuda121-torch24-py311-rhel9" \
      summary="CUDA 12.1 Python 3.11 PyTorch 2.4.1 image based on UBI9 for Training" \
      description="CUDA 12.1 Python 3.11 PyTorch 2.4.1 image based on UBI9 for Training" \
      io.k8s.description="CUDA 12.1 Python 3.11 PyTorch 2.4.1 image based on UBI9 for Training" \
      com.redhat.license_terms="https://www.redhat.com/licenses/Red_Hat_Standard_EULA_20191108.pdf"
Lines changed: 89 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,89 @@
1+
# Training image: CUDA 12.4 / Python 3.11 / PyTorch 2.5.1 on UBI9.
# NOTE(review): base tag is unpinned :latest (hadolint DL3007) — consider pinning
# a specific tag or digest for reproducible builds.
FROM registry.access.redhat.com/ubi9/python-311:latest

# Copy license
COPY LICENSE.md /licenses/cuda-license.md

# Root is required for dnf installs below; dropped again before the final image.
USER 0
WORKDIR /app

# Upgrade requests (pinned) to pick up security fixes in the base image.
RUN pip install --no-cache-dir --upgrade requests==2.32.3

# Install CUDA
WORKDIR /opt/app-root/bin

# Ref: https://docs.nvidia.com/cuda/archive/12.4.0/cuda-toolkit-release-notes/
ENV CUDA_VERSION=12.4.0 \
    NVIDIA_REQUIRE_CUDA="cuda>=12.4 brand=tesla,driver>=470,driver<471 brand=unknown,driver>=470,driver<471 brand=nvidia,driver>=470,driver<471 brand=nvidiartx,driver>=470,driver<471 brand=geforce,driver>=470,driver<471 brand=geforcertx,driver>=470,driver<471 brand=quadro,driver>=470,driver<471 brand=quadrortx,driver>=470,driver<471 brand=titan,driver>=470,driver<471 brand=titanrtx,driver>=470,driver<471 brand=tesla,driver>=525,driver<526 brand=unknown,driver>=525,driver<526 brand=nvidia,driver>=525,driver<526 brand=nvidiartx,driver>=525,driver<526 brand=geforce,driver>=525,driver<526 brand=geforcertx,driver>=525,driver<526 brand=quadro,driver>=525,driver<526 brand=quadrortx,driver>=525,driver<526 brand=titan,driver>=525,driver<526 brand=titanrtx,driver>=525,driver<526" \
    NV_CUDA_LIB_VERSION=12.4.0-1 \
    NVIDIA_VISIBLE_DEVICES=all \
    NVIDIA_DRIVER_CAPABILITIES=compute,utility \
    NV_CUDA_CUDART_VERSION=12.4.99-1 \
    NV_CUDA_COMPAT_VERSION=550.54.14-1 \
    NV_CUDA_NVCC_VERSION=12.4.99-1

# Ref: https://gitlab.com/nvidia/container-images/cuda/-/blob/master/dist/12.4.1/ubi9/base/Dockerfile
# nvcc is required for Flash Attention
RUN dnf config-manager \
        --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel9/x86_64/cuda-rhel9.repo \
    && dnf install -y \
        cuda-cudart-12-4-${NV_CUDA_CUDART_VERSION} \
        cuda-compat-12-4-${NV_CUDA_COMPAT_VERSION} \
        cuda-nvcc-12-4-${NV_CUDA_NVCC_VERSION} \
    && echo "/usr/local/nvidia/lib" >> /etc/ld.so.conf.d/nvidia.conf \
    && echo "/usr/local/nvidia/lib64" >> /etc/ld.so.conf.d/nvidia.conf \
    && dnf clean all

ENV CUDA_HOME="/usr/local/cuda" \
    PATH="/usr/local/nvidia/bin:${CUDA_HOME}/bin:${PATH}" \
    LD_LIBRARY_PATH="/usr/local/nvidia/lib:/usr/local/nvidia/lib64:$CUDA_HOME/lib64:$CUDA_HOME/extras/CUPTI/lib64:$LD_LIBRARY_PATH"

# Install InfiniBand and RDMA packages
RUN dnf config-manager \
        --add-repo https://linux.mellanox.com/public/repo/mlnx_ofed/latest/rhel9.5/mellanox_mlnx_ofed.repo \
    && dnf install -y \
        libibverbs-utils \
        infiniband-diags \
        libibumad3 \
        librdmacm \
        librdmacm-utils \
        rdma-core \
        mlnx-tools \
    && dnf clean all \
    && rm -rf /var/cache/dnf/*

# Install Python packages

# Install micropipenv to deploy packages from Pipfile.lock
RUN pip install --no-cache-dir -U "micropipenv[toml]"

# Install Python dependencies from Pipfile.lock file
COPY Pipfile.lock ./

RUN micropipenv install -- --no-cache-dir && \
    rm -f ./Pipfile.lock && \
    # Fix permissions to support pip in OpenShift environments \
    chmod -R g+w /opt/app-root/lib/python3.11/site-packages && \
    fix-permissions /opt/app-root -P

# Install Flash Attention
# wheel is a build prerequisite for flash-attn; --no-cache-dir added (hadolint
# DL3042) for consistency with the other pip invocations in this file.
RUN pip install --no-cache-dir wheel
RUN pip install --no-cache-dir flash-attn==2.7.4.post1 --no-build-isolation

# Upgrade NCCL to a more recent version until we upgrade torch
RUN pip install --no-cache-dir nvidia-nccl-cu12==2.26.2 && \
    fix-permissions /opt/app-root -P

# Restore user workspace
USER 1001

WORKDIR /opt/app-root/src

LABEL name="rhoai/odh-training-cuda124-torch25-py311-rhel9" \
      com.redhat.component="odh-training-cuda124-torch25-py311-rhel9" \
      io.k8s.display-name="odh-training-cuda124-torch25-py311-rhel9" \
      summary="CUDA 12.4 Python 3.11 PyTorch 2.5.1 image based on UBI9 for Training" \
      description="CUDA 12.4 Python 3.11 PyTorch 2.5.1 image based on UBI9 for Training" \
      io.k8s.description="CUDA 12.4 Python 3.11 PyTorch 2.5.1 image based on UBI9 for Training" \
      com.redhat.license_terms="https://www.redhat.com/licenses/Red_Hat_Standard_EULA_20191108.pdf"
Lines changed: 99 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,99 @@
1+
# Training image: ROCm 6.2 / Python 3.11 / PyTorch 2.4.1 on UBI9.
# NOTE(review): base tag is unpinned :latest (hadolint DL3007) — consider pinning
# a specific tag or digest for reproducible builds.
FROM registry.access.redhat.com/ubi9/python-311:latest AS base

# Copy license
COPY LICENSE.md /licenses/rocm-license.md

# Root is required for dnf installs below; dropped again before the final image.
USER 0
WORKDIR /app

# Upgrade requests (pinned) to pick up security fixes in the base image.
RUN pip install --no-cache-dir --upgrade requests==2.32.3

# Install ROCm
WORKDIR /opt/app-root/bin

ARG ROCM_VERSION=6.2.4
ARG AMDGPU_VERSION=6.2.4

# Write the AMD repo definitions; $AMDGPU_VERSION/$ROCM_VERSION are expanded by
# the build shell because the inner heredoc delimiter (EOD) is unquoted.
RUN <<EOF
cat <<EOD > /etc/yum.repos.d/rocm.repo
[amdgpu]
name=amdgpu
baseurl=https://repo.radeon.com/amdgpu/$AMDGPU_VERSION/rhel/9.4/main/x86_64/
enabled=1
priority=50
gpgcheck=1
gpgkey=https://repo.radeon.com/rocm/rocm.gpg.key

[ROCm]
name=ROCm
baseurl=https://repo.radeon.com/rocm/rhel9/$ROCM_VERSION/main
enabled=1
priority=50
gpgcheck=1
gpgkey=https://repo.radeon.com/rocm/rocm.gpg.key
EOD
EOF

# cmake is needed for the bitsandbytes build below.
RUN dnf install -y cmake rocm-developer-tools rocm-ml-sdk rocm-opencl-sdk rocm-openmp-sdk rocm-utils && dnf clean all && rm -rf /var/cache/dnf

# Install Python packages

# Install micropipenv to deploy packages from Pipfile.lock
RUN pip install --no-cache-dir -U "micropipenv[toml]"

# Install Python dependencies from Pipfile.lock file
COPY Pipfile.lock ./

# -- --no-cache-dir is forwarded to pip, matching the sibling CUDA Dockerfiles.
RUN micropipenv install -- --no-cache-dir && \
    rm -f ./Pipfile.lock && \
    # Fix permissions to support pip in OpenShift environments \
    chmod -R g+w /opt/app-root/lib/python3.11/site-packages && \
    fix-permissions /opt/app-root -P

# Install Flash Attention
# Semicolon-separated cmake list of target GPU architectures, consumed by the
# bitsandbytes build below. Quoted for clarity; the value is unchanged.
ENV GPU_ARCHS="gfx90a;gfx941;gfx942"

RUN pip install --no-cache-dir wheel ninja

# Build flash-attention from source (no ROCm wheels on PyPI for this version).
RUN export TMP_DIR=$(mktemp -d) \
    && cd $TMP_DIR \
    && git clone --depth 1 --branch v2.7.4 https://github.com/Dao-AILab/flash-attention.git \
    && cd flash-attention \
    && git submodule update --init \
    && MAX_JOBS="16" python3 setup.py install --verbose \
    && rm -rf $TMP_DIR

# Install BitsAndBytes
RUN export TMP_DIR=$(mktemp -d) \
    && cd $TMP_DIR \
    && git clone --depth 1 --branch rocm_enabled_multi_backend https://github.com/ROCm/bitsandbytes.git \
    && cd bitsandbytes \
    && cmake -S . \
    && make \
    && cmake -DCOMPUTE_BACKEND=hip -DBNB_ROCM_ARCH="$GPU_ARCHS" -S . \
    && make \
    # Workaround: ship the ROCm 6.2 library under the 6.1 name as well so
    # consumers probing for either soname find it. \
    && cp bitsandbytes/libbitsandbytes_rocm62.so bitsandbytes/libbitsandbytes_rocm61.so \
    && pip install --no-cache-dir --no-deps . \
    && rm -rf $TMP_DIR

# Install a compatible version of pytorch-triton-rocm
RUN pip install --no-cache-dir --force-reinstall pytorch-triton-rocm==3.1.0 --index-url https://download.pytorch.org/whl/nightly/rocm6.2

# Reapply write permissions on site-packages (pip support in OpenShift).
RUN chmod -R g+w /opt/app-root/lib/python3.11/site-packages && \
    fix-permissions /opt/app-root -P

# Restore user workspace
USER 1001

WORKDIR /opt/app-root/src

LABEL name="rhoai/odh-training-rocm62-torch24-py311-rhel9" \
      com.redhat.component="odh-training-rocm62-torch24-py311-rhel9" \
      io.k8s.display-name="odh-training-rocm62-torch24-py311-rhel9" \
      summary="ROCm 6.2 Python 3.11 PyTorch 2.4.1 image based on UBI9 for Training" \
      description="ROCm 6.2 Python 3.11 PyTorch 2.4.1 image based on UBI9 for Training" \
      io.k8s.description="ROCm 6.2 Python 3.11 PyTorch 2.4.1 image based on UBI9 for Training" \
      com.redhat.license_terms="https://www.redhat.com/licenses/Red_Hat_Standard_EULA_20191108.pdf"
Lines changed: 98 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,98 @@
1+
# Training image: ROCm 6.2 / Python 3.11 / PyTorch 2.5.1 on UBI9.
# NOTE(review): base tag is unpinned :latest (hadolint DL3007) — consider pinning
# a specific tag or digest for reproducible builds.
FROM registry.access.redhat.com/ubi9/python-311:latest AS base

# Copy license
COPY LICENSE.md /licenses/rocm-license.md

# Root is required for dnf installs below; dropped again before the final image.
USER 0
WORKDIR /app

# Upgrade requests (pinned) to pick up security fixes in the base image.
RUN pip install --no-cache-dir --upgrade requests==2.32.3

# Install ROCm
WORKDIR /opt/app-root/bin

ARG ROCM_VERSION=6.2.4
ARG AMDGPU_VERSION=6.2.4

# Write the AMD repo definitions; $AMDGPU_VERSION/$ROCM_VERSION are expanded by
# the build shell because the inner heredoc delimiter (EOD) is unquoted.
RUN <<EOF
cat <<EOD > /etc/yum.repos.d/rocm.repo
[amdgpu]
name=amdgpu
baseurl=https://repo.radeon.com/amdgpu/$AMDGPU_VERSION/rhel/9.4/main/x86_64/
enabled=1
priority=50
gpgcheck=1
gpgkey=https://repo.radeon.com/rocm/rocm.gpg.key

[ROCm]
name=ROCm
baseurl=https://repo.radeon.com/rocm/rhel9/$ROCM_VERSION/main
enabled=1
priority=50
gpgcheck=1
gpgkey=https://repo.radeon.com/rocm/rocm.gpg.key
EOD
EOF

# cmake is needed for the bitsandbytes build below.
RUN dnf install -y cmake rocm-developer-tools rocm-ml-sdk rocm-opencl-sdk rocm-openmp-sdk rocm-utils && dnf clean all && rm -rf /var/cache/dnf

# Install Python packages

# Install micropipenv to deploy packages from Pipfile.lock
RUN pip install --no-cache-dir -U "micropipenv[toml]"

# Install Python dependencies from Pipfile.lock file
COPY Pipfile.lock ./

# -- --no-cache-dir is forwarded to pip, matching the sibling CUDA Dockerfiles.
RUN micropipenv install -- --no-cache-dir && \
    rm -f ./Pipfile.lock && \
    # Fix permissions to support pip in OpenShift environments \
    chmod -R g+w /opt/app-root/lib/python3.11/site-packages && \
    fix-permissions /opt/app-root -P

# Install Flash Attention
# Semicolon-separated cmake list of target GPU architectures, consumed by the
# bitsandbytes build below. Quoted for clarity; the value is unchanged.
ENV GPU_ARCHS="gfx90a;gfx941;gfx942"

RUN pip install --no-cache-dir wheel ninja

# Build flash-attention from source (no ROCm wheels on PyPI for this version).
RUN export TMP_DIR=$(mktemp -d) \
    && cd $TMP_DIR \
    && git clone --depth 1 --branch v2.7.4 https://github.com/Dao-AILab/flash-attention.git \
    && cd flash-attention \
    && git submodule update --init \
    && MAX_JOBS="16" pip install --no-cache-dir --no-build-isolation . \
    && rm -rf $TMP_DIR

# Install BitsAndBytes
RUN export TMP_DIR=$(mktemp -d) \
    && cd $TMP_DIR \
    && git clone --depth 1 --branch rocm_enabled_multi_backend https://github.com/ROCm/bitsandbytes.git \
    && cd bitsandbytes \
    && cmake -S . \
    && make \
    && cmake -DCOMPUTE_BACKEND=hip -DBNB_ROCM_ARCH="$GPU_ARCHS" -S . \
    && make \
    && pip install --no-cache-dir --no-deps . \
    && rm -rf $TMP_DIR

# Install a compatible version of pytorch-triton-rocm
RUN pip install --no-cache-dir --force-reinstall pytorch-triton-rocm==3.1.0 --index-url https://download.pytorch.org/whl/nightly/rocm6.2

# Reapply write permissions on site-packages (pip support in OpenShift).
RUN chmod -R g+w /opt/app-root/lib/python3.11/site-packages && \
    fix-permissions /opt/app-root -P

# Restore user workspace
USER 1001

WORKDIR /opt/app-root/src

LABEL name="rhoai/odh-training-rocm62-torch25-py311-rhel9" \
      com.redhat.component="odh-training-rocm62-torch25-py311-rhel9" \
      io.k8s.display-name="odh-training-rocm62-torch25-py311-rhel9" \
      summary="ROCm 6.2 Python 3.11 PyTorch 2.5.1 image based on UBI9 for Training" \
      description="ROCm 6.2 Python 3.11 PyTorch 2.5.1 image based on UBI9 for Training" \
      io.k8s.description="ROCm 6.2 Python 3.11 PyTorch 2.5.1 image based on UBI9 for Training" \
      com.redhat.license_terms="https://www.redhat.com/licenses/Red_Hat_Standard_EULA_20191108.pdf"

0 commit comments

Comments
 (0)