From 29e990e9aa8d09cf819a2f4948e5d127354cd050 Mon Sep 17 00:00:00 2001 From: Jonathan Dierksen Date: Thu, 12 Mar 2026 16:42:15 -0500 Subject: [PATCH 1/5] refactor(docker): consolidate 8 per-version Dockerfiles into 2 parameterized files MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace Dockerfile.cu{126,128,129,130}[.dev] with a single docker/Dockerfile and docker/Dockerfile.dev, parameterized via CUDA_BASE_IMAGE, CUDA_VERSION, NVIDIA_LIB_PATH, and INSTALL_TILELANG ARGs. Per-version values move into the CI workflow matrix, making it the single source of truth — adding a new CUDA version now requires adding one block to the matrix instead of copying a file. Also adds cu131 support, fixes the cu130.dev base image (13.0.0 → 13.0.1), and backfills TRITON_PTXAS_PATH for cu126 and cu128, which were previously missing it. AI-assisted Co-Authored-By: Claude Sonnet 4.6 --- .github/workflows/release-ci-docker.yml | 33 ++++++++- docker/{Dockerfile.cu129 => Dockerfile} | 14 +++- docker/Dockerfile.cu126 | 37 ---------- docker/Dockerfile.cu126.dev | 73 ------------------- docker/Dockerfile.cu128 | 37 ---------- docker/Dockerfile.cu128.dev | 73 ------------------- docker/Dockerfile.cu129.dev | 73 ------------------- docker/Dockerfile.cu130 | 40 ---------- .../{Dockerfile.cu130.dev => Dockerfile.dev} | 8 +- 9 files changed, 46 insertions(+), 342 deletions(-) rename docker/{Dockerfile.cu129 => Dockerfile} (74%) delete mode 100644 docker/Dockerfile.cu126 delete mode 100644 docker/Dockerfile.cu126.dev delete mode 100644 docker/Dockerfile.cu128 delete mode 100644 docker/Dockerfile.cu128.dev delete mode 100644 docker/Dockerfile.cu129.dev delete mode 100644 docker/Dockerfile.cu130 rename docker/{Dockerfile.cu130.dev => Dockerfile.dev} (89%) diff --git a/.github/workflows/release-ci-docker.yml b/.github/workflows/release-ci-docker.yml index 0f1686fe88..b57f81de98 100644 --- a/.github/workflows/release-ci-docker.yml +++
b/.github/workflows/release-ci-docker.yml @@ -36,7 +36,29 @@ jobs: needs: generate-tag strategy: matrix: - cuda: [cu126, cu128, cu129, cu130] + # cuda must stay a base axis: include entries that match no base key are merged into every arch combination and overwrite one another, leaving only the last entry + cuda: [cu126, cu128, cu129, cu130, cu131] + include: + - cuda: cu126 + base_image: nvidia/cuda:12.6.0-devel-ubuntu24.04 + nvidia_lib_path: nvidia/cublas/lib + install_tilelang: "false" + - cuda: cu128 + base_image: nvidia/cuda:12.8.0-devel-ubuntu24.04 + nvidia_lib_path: nvidia/cublas/lib + install_tilelang: "false" + - cuda: cu129 + base_image: nvidia/cuda:12.9.0-devel-ubuntu24.04 + nvidia_lib_path: nvidia/cublas/lib + install_tilelang: "false" + - cuda: cu130 + base_image: nvidia/cuda:13.0.1-devel-ubuntu24.04 + nvidia_lib_path: nvidia/cu13/lib + install_tilelang: "false" + - cuda: cu131 + base_image: nvidia/cuda:13.1.1-cudnn-devel-ubuntu24.04 + nvidia_lib_path: nvidia/cu13/lib + install_tilelang: "true" arch: [amd64, arm64] steps: - uses: actions/checkout@v4 @@ -58,7 +80,12 @@ jobs: uses: docker/build-push-action@v5 with: context: . - file: docker/Dockerfile.${{ matrix.cuda }} + file: docker/Dockerfile + build-args: | + CUDA_BASE_IMAGE=${{ matrix.base_image }} + CUDA_VERSION=${{ matrix.cuda }} + NVIDIA_LIB_PATH=${{ matrix.nvidia_lib_path }} + INSTALL_TILELANG=${{ matrix.install_tilelang }} platforms: linux/${{ matrix.arch }} push: ${{ github.event_name != 'pull_request' }} pull: true # Always pull the latest base image @@ -74,7 +101,7 @@ jobs: needs: [generate-tag, build] strategy: matrix: - cuda: [cu126, cu128, cu129, cu130] + cuda: [cu126, cu128, cu129, cu130, cu131] steps: - name: Set up Docker Buildx uses: docker/setup-buildx-action@v3 @@ -111,6 +138,7 @@ jobs: flashinfer/flashinfer-ci-cu128: ${DATE_SHA} flashinfer/flashinfer-ci-cu129: ${DATE_SHA} flashinfer/flashinfer-ci-cu130: ${DATE_SHA} + flashinfer/flashinfer-ci-cu131: ${DATE_SHA} EOF - name: Create Pull Request @@ -127,6 +155,7 @@ jobs: - flashinfer/flashinfer-ci-cu128:${{ needs.generate-tag.outputs.date_sha }} - flashinfer/flashinfer-ci-cu129:${{ needs.generate-tag.outputs.date_sha }} - 
flashinfer/flashinfer-ci-cu130:${{ needs.generate-tag.outputs.date_sha }} + - flashinfer/flashinfer-ci-cu131:${{ needs.generate-tag.outputs.date_sha }} Auto-generated by [release-ci-docker workflow](https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}) branch: update-docker-tags-${{ needs.generate-tag.outputs.date_sha }} diff --git a/docker/Dockerfile.cu129 b/docker/Dockerfile similarity index 74% rename from docker/Dockerfile.cu129 rename to docker/Dockerfile index e5607416ff..9bf810f2b3 100644 --- a/docker/Dockerfile.cu129 +++ b/docker/Dockerfile @@ -1,4 +1,5 @@ -FROM nvidia/cuda:12.9.0-devel-ubuntu24.04 +ARG CUDA_BASE_IMAGE +FROM ${CUDA_BASE_IMAGE} ENV DEBIAN_FRONTEND=noninteractive @@ -19,8 +20,9 @@ RUN echo "source activate py312" >> ~/.bashrc ENV PATH="/opt/conda/bin:$PATH" ENV PATH="/opt/conda/envs/py312/bin:$PATH" -# Ensure pip-installed nvidia-cublas takes precedence over system libraries -ENV LD_LIBRARY_PATH="/opt/conda/envs/py312/lib/python3.12/site-packages/nvidia/cublas/lib/:$LD_LIBRARY_PATH" +# Set LD_LIBRARY_PATH to ensure pip-installed nvidia libs take precedence over system libraries +ARG NVIDIA_LIB_PATH +ENV LD_LIBRARY_PATH="/opt/conda/envs/py312/lib/python3.12/site-packages/${NVIDIA_LIB_PATH}/:$LD_LIBRARY_PATH" # Triton ENV TRITON_PTXAS_PATH="/usr/local/cuda/bin/ptxas" @@ -28,7 +30,11 @@ ENV TRITON_PTXAS_PATH="/usr/local/cuda/bin/ptxas" # Install torch and other python packages COPY requirements.txt /install/requirements.txt COPY docker/install/install_python_packages.sh /install/install_python_packages.sh -RUN bash /install/install_python_packages.sh cu129 +ARG CUDA_VERSION +RUN bash /install/install_python_packages.sh ${CUDA_VERSION} + +ARG INSTALL_TILELANG=false +RUN if [ "$INSTALL_TILELANG" = "true" ]; then pip install tilelang cuda-tile; fi # Install mpi4py in the conda environment RUN conda install -n py312 -y mpi4py mpich diff --git a/docker/Dockerfile.cu126 b/docker/Dockerfile.cu126 deleted file mode 100644 index 
fda2f23b91..0000000000 --- a/docker/Dockerfile.cu126 +++ /dev/null @@ -1,37 +0,0 @@ -FROM nvidia/cuda:12.6.0-devel-ubuntu24.04 - -ENV DEBIAN_FRONTEND=noninteractive - -# Update package lists and install system dependencies -RUN apt-get update && apt-get install -y \ - curl \ - git \ - wget - -# Install python -COPY docker/install/install_python.sh /install/install_python.sh -RUN bash /install/install_python.sh /opt/conda py312 - -# Set home directory -WORKDIR /workspace - -RUN echo "source activate py312" >> ~/.bashrc -ENV PATH="/opt/conda/bin:$PATH" -ENV PATH="/opt/conda/envs/py312/bin:$PATH" - -# Ensure pip-installed nvidia-cublas takes precedence over system libraries -ENV LD_LIBRARY_PATH="/opt/conda/envs/py312/lib/python3.12/site-packages/nvidia/cublas/lib/:$LD_LIBRARY_PATH" - -# Install torch and other python packages -COPY requirements.txt /install/requirements.txt -COPY docker/install/install_python_packages.sh /install/install_python_packages.sh -RUN bash /install/install_python_packages.sh cu126 - -# Install mpi4py in the conda environment -RUN conda install -n py312 -y mpi4py mpich - -# Configure pip for user-site installations (allows arbitrary users to install packages) -# This enables 'pip install --user' and 'pip install -e .' 
to work for any user -RUN mkdir -p /opt/pip-user && chmod 1777 /opt/pip-user -ENV PYTHONUSERBASE=/opt/pip-user -ENV PATH="/opt/pip-user/bin:$PATH" diff --git a/docker/Dockerfile.cu126.dev b/docker/Dockerfile.cu126.dev deleted file mode 100644 index 816d5af619..0000000000 --- a/docker/Dockerfile.cu126.dev +++ /dev/null @@ -1,73 +0,0 @@ -FROM nvidia/cuda:12.6.0-devel-ubuntu24.04 - -ENV DEBIAN_FRONTEND=noninteractive - -# Update package lists and install system dependencies -RUN apt-get update && apt-get install -y \ - curl \ - git \ - wget \ - clang-format \ - clangd-19 \ - vim \ - zsh \ - && rm -rf /var/lib/apt/lists/* - -# Create a non-root user -ARG USERNAME=devuser -ARG USER_UID=1003 -ARG USER_GID=$USER_UID - -# Create the user -RUN groupadd --gid $USER_GID $USERNAME \ - && useradd --uid $USER_UID --gid $USER_GID -m $USERNAME \ - # [Optional] Add sudo support - && apt-get update \ - && apt-get install -y sudo \ - && echo $USERNAME ALL=\(root\) NOPASSWD:ALL > /etc/sudoers.d/$USERNAME \ - && chmod 0440 /etc/sudoers.d/$USERNAME \ - && rm -rf /var/lib/apt/lists/* - -# Remove default 'ubuntu' user (UID 1000) to prevent devcontainer permission conflicts -# Ref: https://github.com/rapidsai/devcontainers/pull/373 -RUN if grep ubuntu:x:1000:1000 /etc/passwd >/dev/null; then userdel -f -r ubuntu; fi - -# Switch to non-root user -USER $USERNAME -WORKDIR /home/$USERNAME - -# Install python -COPY docker/install/install_python.sh /install/install_python.sh -RUN bash /install/install_python.sh /home/$USERNAME/conda py312 - -RUN echo "source activate py312" >> ~/.bashrc -ENV PATH="/home/$USERNAME/conda/bin:$PATH" -ENV PATH="/home/$USERNAME/conda/envs/py312/bin:$PATH" - -# Install torch and other python packages -COPY requirements.txt /install/requirements.txt -COPY docker/install/install_python_packages.sh /install/install_python_packages.sh -RUN bash /install/install_python_packages.sh cu126 && pip3 install pre-commit - -# Install mpi4py in the conda environment -RUN conda 
install -n py312 -y mpi4py mpich - -# Install oh-my-zsh -RUN sh -c "$(curl -fsSL https://raw.githubusercontent.com/ohmyzsh/ohmyzsh/master/tools/install.sh)" "" --unattended - -# Install zsh-autosuggestions -RUN git clone https://github.com/zsh-users/zsh-autosuggestions ${ZSH_CUSTOM:-~/.oh-my-zsh/custom}/plugins/zsh-autosuggestions - -# Configure zsh -RUN sed -i 's/ZSH_THEME="robbyrussell"/ZSH_THEME="fino-time"/' ~/.zshrc && \ - sed -i 's/plugins=(git)/plugins=(git zsh-autosuggestions)/' ~/.zshrc - -# clangd -ENV PATH="/usr/lib/llvm-19/bin:$PATH" - -# Triton -ENV TRITON_PTXAS_PATH="/usr/local/cuda/bin/ptxas" - -# Set zsh as default shell -ENV SHELL=/bin/zsh -CMD [ "zsh" ] diff --git a/docker/Dockerfile.cu128 b/docker/Dockerfile.cu128 deleted file mode 100644 index b43ecfbcda..0000000000 --- a/docker/Dockerfile.cu128 +++ /dev/null @@ -1,37 +0,0 @@ -FROM nvidia/cuda:12.8.0-devel-ubuntu24.04 - -ENV DEBIAN_FRONTEND=noninteractive - -# Update package lists and install system dependencies -RUN apt-get update && apt-get install -y \ - curl \ - git \ - wget - -# Install python -COPY docker/install/install_python.sh /install/install_python.sh -RUN bash /install/install_python.sh /opt/conda py312 - -# Set home directory -WORKDIR /workspace - -RUN echo "source activate py312" >> ~/.bashrc -ENV PATH="/opt/conda/bin:$PATH" -ENV PATH="/opt/conda/envs/py312/bin:$PATH" - -# Ensure pip-installed nvidia-cublas takes precedence over system libraries -ENV LD_LIBRARY_PATH="/opt/conda/envs/py312/lib/python3.12/site-packages/nvidia/cublas/lib/:$LD_LIBRARY_PATH" - -# Install torch and other python packages -COPY requirements.txt /install/requirements.txt -COPY docker/install/install_python_packages.sh /install/install_python_packages.sh -RUN bash /install/install_python_packages.sh cu128 - -# Install mpi4py in the conda environment -RUN conda install -n py312 -y mpi4py mpich - -# Configure pip for user-site installations (allows arbitrary users to install packages) -# This enables 'pip 
install --user' and 'pip install -e .' to work for any user -RUN mkdir -p /opt/pip-user && chmod 1777 /opt/pip-user -ENV PYTHONUSERBASE=/opt/pip-user -ENV PATH="/opt/pip-user/bin:$PATH" diff --git a/docker/Dockerfile.cu128.dev b/docker/Dockerfile.cu128.dev deleted file mode 100644 index 4fb71d9eeb..0000000000 --- a/docker/Dockerfile.cu128.dev +++ /dev/null @@ -1,73 +0,0 @@ -FROM nvidia/cuda:12.8.0-devel-ubuntu24.04 - -ENV DEBIAN_FRONTEND=noninteractive - -# Update package lists and install system dependencies -RUN apt-get update && apt-get install -y \ - curl \ - git \ - wget \ - clang-format \ - clangd-19 \ - vim \ - zsh \ - && rm -rf /var/lib/apt/lists/* - -# Create a non-root user -ARG USERNAME=devuser -ARG USER_UID=1003 -ARG USER_GID=$USER_UID - -# Create the user -RUN groupadd --gid $USER_GID $USERNAME \ - && useradd --uid $USER_UID --gid $USER_GID -m $USERNAME \ - # [Optional] Add sudo support - && apt-get update \ - && apt-get install -y sudo \ - && echo $USERNAME ALL=\(root\) NOPASSWD:ALL > /etc/sudoers.d/$USERNAME \ - && chmod 0440 /etc/sudoers.d/$USERNAME \ - && rm -rf /var/lib/apt/lists/* - -# Remove default 'ubuntu' user (UID 1000) to prevent devcontainer permission conflicts -# Ref: https://github.com/rapidsai/devcontainers/pull/373 -RUN if grep ubuntu:x:1000:1000 /etc/passwd >/dev/null; then userdel -f -r ubuntu; fi - -# Switch to non-root user -USER $USERNAME -WORKDIR /home/$USERNAME - -# Install python -COPY docker/install/install_python.sh /install/install_python.sh -RUN bash /install/install_python.sh /home/$USERNAME/conda py312 - -RUN echo "source activate py312" >> ~/.bashrc -ENV PATH="/home/$USERNAME/conda/bin:$PATH" -ENV PATH="/home/$USERNAME/conda/envs/py312/bin:$PATH" - -# Install torch and other python packages -COPY requirements.txt /install/requirements.txt -COPY docker/install/install_python_packages.sh /install/install_python_packages.sh -RUN bash /install/install_python_packages.sh cu128 && pip3 install pre-commit - -# Install mpi4py 
in the conda environment -RUN conda install -n py312 -y mpi4py mpich - -# Install oh-my-zsh -RUN sh -c "$(curl -fsSL https://raw.githubusercontent.com/ohmyzsh/ohmyzsh/master/tools/install.sh)" "" --unattended - -# Install zsh-autosuggestions -RUN git clone https://github.com/zsh-users/zsh-autosuggestions ${ZSH_CUSTOM:-~/.oh-my-zsh/custom}/plugins/zsh-autosuggestions - -# Configure zsh -RUN sed -i 's/ZSH_THEME="robbyrussell"/ZSH_THEME="fino-time"/' ~/.zshrc && \ - sed -i 's/plugins=(git)/plugins=(git zsh-autosuggestions)/' ~/.zshrc - -# clangd -ENV PATH="/usr/lib/llvm-19/bin:$PATH" - -# Triton -ENV TRITON_PTXAS_PATH="/usr/local/cuda/bin/ptxas" - -# Set zsh as default shell -ENV SHELL=/bin/zsh -CMD [ "zsh" ] diff --git a/docker/Dockerfile.cu129.dev b/docker/Dockerfile.cu129.dev deleted file mode 100644 index bfba3d95e6..0000000000 --- a/docker/Dockerfile.cu129.dev +++ /dev/null @@ -1,73 +0,0 @@ -FROM nvidia/cuda:12.9.0-devel-ubuntu24.04 - -ENV DEBIAN_FRONTEND=noninteractive - -# Update package lists and install system dependencies -RUN apt-get update && apt-get install -y \ - curl \ - git \ - wget \ - clang-format \ - clangd-19 \ - vim \ - zsh \ - && rm -rf /var/lib/apt/lists/* - -# Create a non-root user -ARG USERNAME=devuser -ARG USER_UID=1003 -ARG USER_GID=$USER_UID - -# Create the user -RUN groupadd --gid $USER_GID $USERNAME \ - && useradd --uid $USER_UID --gid $USER_GID -m $USERNAME \ - # [Optional] Add sudo support - && apt-get update \ - && apt-get install -y sudo \ - && echo $USERNAME ALL=\(root\) NOPASSWD:ALL > /etc/sudoers.d/$USERNAME \ - && chmod 0440 /etc/sudoers.d/$USERNAME \ - && rm -rf /var/lib/apt/lists/* - -# Remove default 'ubuntu' user (UID 1000) to prevent devcontainer permission conflicts -# Ref: https://github.com/rapidsai/devcontainers/pull/373 -RUN if grep ubuntu:x:1000:1000 /etc/passwd >/dev/null; then userdel -f -r ubuntu; fi - -# Switch to non-root user -USER $USERNAME -WORKDIR /home/$USERNAME - -# Install python -COPY 
docker/install/install_python.sh /install/install_python.sh -RUN bash /install/install_python.sh /home/$USERNAME/conda py312 - -RUN echo "source activate py312" >> ~/.bashrc -ENV PATH="/home/$USERNAME/conda/bin:$PATH" -ENV PATH="/home/$USERNAME/conda/envs/py312/bin:$PATH" - -# Install torch and other python packages -COPY requirements.txt /install/requirements.txt -COPY docker/install/install_python_packages.sh /install/install_python_packages.sh -RUN bash /install/install_python_packages.sh cu129 && pip3 install pre-commit - -# Install mpi4py in the conda environment -RUN conda install -n py312 -y mpi4py mpich - -# Install oh-my-zsh -RUN sh -c "$(curl -fsSL https://raw.githubusercontent.com/ohmyzsh/ohmyzsh/master/tools/install.sh)" "" --unattended - -# Install zsh-autosuggestions -RUN git clone https://github.com/zsh-users/zsh-autosuggestions ${ZSH_CUSTOM:-~/.oh-my-zsh/custom}/plugins/zsh-autosuggestions - -# Configure zsh -RUN sed -i 's/ZSH_THEME="robbyrussell"/ZSH_THEME="fino-time"/' ~/.zshrc && \ - sed -i 's/plugins=(git)/plugins=(git zsh-autosuggestions)/' ~/.zshrc - -# clangd -ENV PATH="/usr/lib/llvm-19/bin:$PATH" - -# Triton -ENV TRITON_PTXAS_PATH="/usr/local/cuda/bin/ptxas" - -# Set zsh as default shell -ENV SHELL=/bin/zsh -CMD [ "zsh" ] diff --git a/docker/Dockerfile.cu130 b/docker/Dockerfile.cu130 deleted file mode 100644 index fb68419e5a..0000000000 --- a/docker/Dockerfile.cu130 +++ /dev/null @@ -1,40 +0,0 @@ -FROM nvidia/cuda:13.0.1-devel-ubuntu24.04 - -ENV DEBIAN_FRONTEND=noninteractive - -# Update package lists and install system dependencies -RUN apt-get update && apt-get install -y \ - curl \ - git \ - wget - -# Install python -COPY docker/install/install_python.sh /install/install_python.sh -RUN bash /install/install_python.sh /opt/conda py312 - -# Set home directory -WORKDIR /workspace - -RUN echo "source activate py312" >> ~/.bashrc -ENV PATH="/opt/conda/bin:$PATH" -ENV PATH="/opt/conda/envs/py312/bin:$PATH" - -# Set LD_LIBRARY_PATH to ensure 
pip-installed nvidia-cublas takes precedence over system libraries -ENV LD_LIBRARY_PATH="/opt/conda/envs/py312/lib/python3.12/site-packages/nvidia/cu13/lib/:$LD_LIBRARY_PATH" - -# Triton -ENV TRITON_PTXAS_PATH="/usr/local/cuda/bin/ptxas" - -# Install torch and other python packages -COPY requirements.txt /install/requirements.txt -COPY docker/install/install_python_packages.sh /install/install_python_packages.sh -RUN bash /install/install_python_packages.sh cu130 - -# Install mpi4py in the conda environment -RUN conda install -n py312 -y mpi4py mpich - -# Configure pip for user-site installations (allows arbitrary users to install packages) -# This enables 'pip install --user' and 'pip install -e .' to work for any user -RUN mkdir -p /opt/pip-user && chmod 1777 /opt/pip-user -ENV PYTHONUSERBASE=/opt/pip-user -ENV PATH="/opt/pip-user/bin:$PATH" diff --git a/docker/Dockerfile.cu130.dev b/docker/Dockerfile.dev similarity index 89% rename from docker/Dockerfile.cu130.dev rename to docker/Dockerfile.dev index 56762e242b..285953a836 100644 --- a/docker/Dockerfile.cu130.dev +++ b/docker/Dockerfile.dev @@ -1,4 +1,5 @@ -FROM nvidia/cuda:13.0.0-devel-ubuntu24.04 +ARG CUDA_BASE_IMAGE +FROM ${CUDA_BASE_IMAGE} ENV DEBIAN_FRONTEND=noninteractive @@ -47,7 +48,10 @@ ENV PATH="/home/$USERNAME/conda/envs/py312/bin:$PATH" # Install torch and other python packages COPY requirements.txt /install/requirements.txt COPY docker/install/install_python_packages.sh /install/install_python_packages.sh -RUN bash /install/install_python_packages.sh cu130 && pip3 install pre-commit +ARG CUDA_VERSION +ARG INSTALL_TILELANG=false +RUN bash /install/install_python_packages.sh ${CUDA_VERSION} && pip3 install pre-commit && \ + if [ "$INSTALL_TILELANG" = "true" ]; then pip install tilelang cuda-tile; fi # Install mpi4py in the conda environment RUN conda install -n py312 -y mpi4py mpich From bcba4505ac7ee0b891520d3b56dbc41b5c6f6265 Mon Sep 17 00:00:00 2001 From: Jonathan Dierksen Date: Tue, 17 Mar 2026 
13:41:15 -0500 Subject: [PATCH 2/5] refactor(docker): merge Dockerfile.dev into Dockerfile as a multi-stage build Add a shared `base` stage and named `prod`/`dev` targets in a single Dockerfile, replacing the two-file approach. CI uses --target prod; devcontainers use --target dev. ARGs are re-declared per stage per Docker scoping rules. AI-assisted Co-Authored-By: Claude Sonnet 4.6 --- .github/workflows/release-ci-docker.yml | 1 + docker/Dockerfile | 79 ++++++++++++++++++++++++- docker/Dockerfile.dev | 77 ------------------------ 3 files changed, 77 insertions(+), 80 deletions(-) delete mode 100644 docker/Dockerfile.dev diff --git a/.github/workflows/release-ci-docker.yml b/.github/workflows/release-ci-docker.yml index b57f81de98..aed0ad99a8 100644 --- a/.github/workflows/release-ci-docker.yml +++ b/.github/workflows/release-ci-docker.yml @@ -79,6 +79,7 @@ jobs: with: context: . file: docker/Dockerfile + target: prod build-args: | CUDA_BASE_IMAGE=${{ matrix.base_image }} CUDA_VERSION=${{ matrix.cuda }} diff --git a/docker/Dockerfile b/docker/Dockerfile index 9bf810f2b3..785b15f7d0 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -1,19 +1,20 @@ ARG CUDA_BASE_IMAGE -FROM ${CUDA_BASE_IMAGE} +FROM ${CUDA_BASE_IMAGE} AS base ENV DEBIAN_FRONTEND=noninteractive -# Update package lists and install system dependencies RUN apt-get update && apt-get install -y \ curl \ git \ wget +# ---- prod target ---- +FROM base AS prod + # Install python COPY docker/install/install_python.sh /install/install_python.sh RUN bash /install/install_python.sh /opt/conda py312 -# Set home directory WORKDIR /workspace RUN echo "source activate py312" >> ~/.bashrc @@ -44,3 +45,75 @@ RUN conda install -n py312 -y mpi4py mpich RUN mkdir -p /opt/pip-user && chmod 1777 /opt/pip-user ENV PYTHONUSERBASE=/opt/pip-user ENV PATH="/opt/pip-user/bin:$PATH" + +# ---- dev target ---- +FROM base AS dev + +RUN apt-get update && apt-get install -y \ + clang-format \ + clangd-19 \ + vim \ + zsh \ + && rm 
-rf /var/lib/apt/lists/* + +# Create a non-root user +ARG USERNAME=devuser +ARG USER_UID=1003 +ARG USER_GID=$USER_UID + +# Create the user +RUN groupadd --gid $USER_GID $USERNAME \ + && useradd --uid $USER_UID --gid $USER_GID -m $USERNAME \ + # [Optional] Add sudo support + && apt-get update \ + && apt-get install -y sudo \ + && echo $USERNAME ALL=\(root\) NOPASSWD:ALL > /etc/sudoers.d/$USERNAME \ + && chmod 0440 /etc/sudoers.d/$USERNAME \ + && rm -rf /var/lib/apt/lists/* + +# Remove default 'ubuntu' user (UID 1000) to prevent devcontainer permission conflicts +# Ref: https://github.com/rapidsai/devcontainers/pull/373 +RUN if grep ubuntu:x:1000:1000 /etc/passwd >/dev/null; then userdel -f -r ubuntu; fi + +# Switch to non-root user +USER $USERNAME +WORKDIR /home/$USERNAME + +# Install python +COPY docker/install/install_python.sh /install/install_python.sh +RUN bash /install/install_python.sh /home/$USERNAME/conda py312 + +RUN echo "source activate py312" >> ~/.bashrc +ENV PATH="/home/$USERNAME/conda/bin:$PATH" +ENV PATH="/home/$USERNAME/conda/envs/py312/bin:$PATH" + +# Install torch and other python packages +COPY requirements.txt /install/requirements.txt +COPY docker/install/install_python_packages.sh /install/install_python_packages.sh +ARG CUDA_VERSION +ARG INSTALL_TILELANG=false +RUN bash /install/install_python_packages.sh ${CUDA_VERSION} && pip3 install pre-commit && \ + if [ "$INSTALL_TILELANG" = "true" ]; then pip install tilelang cuda-tile; fi + +# Install mpi4py in the conda environment +RUN conda install -n py312 -y mpi4py mpich + +# Install oh-my-zsh +RUN sh -c "$(curl -fsSL https://raw.githubusercontent.com/ohmyzsh/ohmyzsh/master/tools/install.sh)" "" --unattended + +# Install zsh-autosuggestions +RUN git clone https://github.com/zsh-users/zsh-autosuggestions ${ZSH_CUSTOM:-~/.oh-my-zsh/custom}/plugins/zsh-autosuggestions + +# Configure zsh +RUN sed -i 's/ZSH_THEME="robbyrussell"/ZSH_THEME="fino-time"/' ~/.zshrc && \ + sed -i 
's/plugins=(git)/plugins=(git zsh-autosuggestions)/' ~/.zshrc + +# clangd +ENV PATH="/usr/lib/llvm-19/bin:$PATH" + +# Triton +ENV TRITON_PTXAS_PATH="/usr/local/cuda/bin/ptxas" + +# Set zsh as default shell +ENV SHELL=/bin/zsh +CMD [ "zsh" ] diff --git a/docker/Dockerfile.dev b/docker/Dockerfile.dev deleted file mode 100644 index 285953a836..0000000000 --- a/docker/Dockerfile.dev +++ /dev/null @@ -1,77 +0,0 @@ -ARG CUDA_BASE_IMAGE -FROM ${CUDA_BASE_IMAGE} - -ENV DEBIAN_FRONTEND=noninteractive - -# Update package lists and install system dependencies -RUN apt-get update && apt-get install -y \ - curl \ - git \ - wget \ - clang-format \ - clangd-19 \ - vim \ - zsh \ - && rm -rf /var/lib/apt/lists/* - -# Create a non-root user -ARG USERNAME=devuser -ARG USER_UID=1003 -ARG USER_GID=$USER_UID - -# Create the user -RUN groupadd --gid $USER_GID $USERNAME \ - && useradd --uid $USER_UID --gid $USER_GID -m $USERNAME \ - # [Optional] Add sudo support - && apt-get update \ - && apt-get install -y sudo \ - && echo $USERNAME ALL=\(root\) NOPASSWD:ALL > /etc/sudoers.d/$USERNAME \ - && chmod 0440 /etc/sudoers.d/$USERNAME \ - && rm -rf /var/lib/apt/lists/* - -# Remove default 'ubuntu' user (UID 1000) to prevent devcontainer permission conflicts -# Ref: https://github.com/rapidsai/devcontainers/pull/373 -RUN if grep ubuntu:x:1000:1000 /etc/passwd >/dev/null; then userdel -f -r ubuntu; fi - -# Switch to non-root user -USER $USERNAME -WORKDIR /home/$USERNAME - -# Install python -COPY docker/install/install_python.sh /install/install_python.sh -RUN bash /install/install_python.sh /home/$USERNAME/conda py312 - -RUN echo "source activate py312" >> ~/.bashrc -ENV PATH="/home/$USERNAME/conda/bin:$PATH" -ENV PATH="/home/$USERNAME/conda/envs/py312/bin:$PATH" - -# Install torch and other python packages -COPY requirements.txt /install/requirements.txt -COPY docker/install/install_python_packages.sh /install/install_python_packages.sh -ARG CUDA_VERSION -ARG INSTALL_TILELANG=false -RUN bash 
/install/install_python_packages.sh ${CUDA_VERSION} && pip3 install pre-commit && \ - if [ "$INSTALL_TILELANG" = "true" ]; then pip install tilelang cuda-tile; fi - -# Install mpi4py in the conda environment -RUN conda install -n py312 -y mpi4py mpich - -# Install oh-my-zsh -RUN sh -c "$(curl -fsSL https://raw.githubusercontent.com/ohmyzsh/ohmyzsh/master/tools/install.sh)" "" --unattended - -# Install zsh-autosuggestions -RUN git clone https://github.com/zsh-users/zsh-autosuggestions ${ZSH_CUSTOM:-~/.oh-my-zsh/custom}/plugins/zsh-autosuggestions - -# Configure zsh -RUN sed -i 's/ZSH_THEME="robbyrussell"/ZSH_THEME="fino-time"/' ~/.zshrc && \ - sed -i 's/plugins=(git)/plugins=(git zsh-autosuggestions)/' ~/.zshrc - -# clangd -ENV PATH="/usr/lib/llvm-19/bin:$PATH" - -# Triton -ENV TRITON_PTXAS_PATH="/usr/local/cuda/bin/ptxas" - -# Set zsh as default shell -ENV SHELL=/bin/zsh -CMD [ "zsh" ] From 05e8ebc3079ee83f82546ae44f18a8a4ba3d294b Mon Sep 17 00:00:00 2001 From: Jonathan Dierksen Date: Tue, 17 Mar 2026 13:42:41 -0500 Subject: [PATCH 3/5] refactor(docker): rename prod stage to test The images are for CI testing and local dev, not prod/staging environments. AI-assisted Co-Authored-By: Claude Sonnet 4.6 --- .github/workflows/release-ci-docker.yml | 2 +- docker/Dockerfile | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/release-ci-docker.yml b/.github/workflows/release-ci-docker.yml index aed0ad99a8..1a4ccbc4bf 100644 --- a/.github/workflows/release-ci-docker.yml +++ b/.github/workflows/release-ci-docker.yml @@ -79,7 +79,7 @@ jobs: with: context: . 
file: docker/Dockerfile - target: prod + target: test build-args: | CUDA_BASE_IMAGE=${{ matrix.base_image }} CUDA_VERSION=${{ matrix.cuda }} diff --git a/docker/Dockerfile b/docker/Dockerfile index 785b15f7d0..450ebf7d51 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -8,8 +8,8 @@ RUN apt-get update && apt-get install -y \ git \ wget -# ---- prod target ---- -FROM base AS prod +# ---- test target ---- +FROM base AS test # Install python COPY docker/install/install_python.sh /install/install_python.sh From 7e990f6e1e4be83770e9b88f2cab266d39d0cb86 Mon Sep 17 00:00:00 2001 From: Jonathan Dierksen Date: Tue, 17 Mar 2026 14:09:23 -0500 Subject: [PATCH 4/5] fix(docker): use PYTORCH_INDEX arg to decouple PyTorch wheel index from CUDA version PyTorch doesn't publish cu131 wheels yet, so cu131 images fall back to the cu130 index. A TODO comment marks where to update when cu131 wheels ship. PYTORCH_INDEX defaults to CUDA_VERSION so existing versions are unaffected. AI-assisted Co-Authored-By: Claude Sonnet 4.6 --- .github/workflows/release-ci-docker.yml | 6 ++++++ docker/Dockerfile | 6 ++++-- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/.github/workflows/release-ci-docker.yml b/.github/workflows/release-ci-docker.yml index 1a4ccbc4bf..7f27ad9e15 100644 --- a/.github/workflows/release-ci-docker.yml +++ b/.github/workflows/release-ci-docker.yml @@ -40,22 +40,27 @@ jobs: - cuda: cu126 base_image: nvidia/cuda:12.6.0-devel-ubuntu24.04 nvidia_lib_path: nvidia/cublas/lib + pytorch_index: cu126 install_tilelang: "false" - cuda: cu128 base_image: nvidia/cuda:12.8.0-devel-ubuntu24.04 nvidia_lib_path: nvidia/cublas/lib + pytorch_index: cu128 install_tilelang: "false" - cuda: cu129 base_image: nvidia/cuda:12.9.0-devel-ubuntu24.04 nvidia_lib_path: nvidia/cublas/lib + pytorch_index: cu129 install_tilelang: "false" - cuda: cu130 base_image: nvidia/cuda:13.0.1-devel-ubuntu24.04 nvidia_lib_path: nvidia/cu13/lib + pytorch_index: cu130 install_tilelang: "false" - 
cuda: cu131 base_image: nvidia/cuda:13.1.1-cudnn-devel-ubuntu24.04 nvidia_lib_path: nvidia/cu13/lib + pytorch_index: cu130 # TODO: update to cu131 when PyTorch publishes cu131 wheels install_tilelang: "true" arch: [amd64, arm64] steps: @@ -83,6 +88,7 @@ jobs: build-args: | CUDA_BASE_IMAGE=${{ matrix.base_image }} CUDA_VERSION=${{ matrix.cuda }} + PYTORCH_INDEX=${{ matrix.pytorch_index }} NVIDIA_LIB_PATH=${{ matrix.nvidia_lib_path }} INSTALL_TILELANG=${{ matrix.install_tilelang }} platforms: linux/${{ matrix.arch }} diff --git a/docker/Dockerfile b/docker/Dockerfile index 450ebf7d51..60fb95cdd4 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -32,7 +32,8 @@ ENV TRITON_PTXAS_PATH="/usr/local/cuda/bin/ptxas" COPY requirements.txt /install/requirements.txt COPY docker/install/install_python_packages.sh /install/install_python_packages.sh ARG CUDA_VERSION -RUN bash /install/install_python_packages.sh ${CUDA_VERSION} +ARG PYTORCH_INDEX=${CUDA_VERSION} +RUN bash /install/install_python_packages.sh ${PYTORCH_INDEX} ARG INSTALL_TILELANG=false RUN if [ "$INSTALL_TILELANG" = "true" ]; then pip install tilelang cuda-tile; fi @@ -91,8 +92,9 @@ ENV PATH="/home/$USERNAME/conda/envs/py312/bin:$PATH" COPY requirements.txt /install/requirements.txt COPY docker/install/install_python_packages.sh /install/install_python_packages.sh ARG CUDA_VERSION +ARG PYTORCH_INDEX=${CUDA_VERSION} ARG INSTALL_TILELANG=false -RUN bash /install/install_python_packages.sh ${CUDA_VERSION} && pip3 install pre-commit && \ +RUN bash /install/install_python_packages.sh ${PYTORCH_INDEX} && pip3 install pre-commit && \ if [ "$INSTALL_TILELANG" = "true" ]; then pip install tilelang cuda-tile; fi # Install mpi4py in the conda environment From f13c1b7dc319e5dcbced14a3a464898e39ad6b8b Mon Sep 17 00:00:00 2001 From: Jonathan Dierksen Date: Wed, 18 Mar 2026 11:02:00 -0500 Subject: [PATCH 5/5] fix(devcontainer): update devcontainer.json files to use consolidated Dockerfile Point all devcontainer configs at 
docker/Dockerfile with target=dev and per-version build args (CUDA_BASE_IMAGE, CUDA_VERSION, PYTORCH_INDEX), replacing the deleted per-version Dockerfile.cu*.dev references. AI-assisted Co-Authored-By: Claude Sonnet 4.6 --- .devcontainer/cu126/devcontainer.json | 10 ++++++++-- .devcontainer/cu128/devcontainer.json | 10 ++++++++-- .devcontainer/cu129/devcontainer.json | 10 ++++++++-- .devcontainer/cu130/devcontainer.json | 10 ++++++++-- 4 files changed, 32 insertions(+), 8 deletions(-) diff --git a/.devcontainer/cu126/devcontainer.json b/.devcontainer/cu126/devcontainer.json index 8c0e885fbb..3edf2bf3ea 100644 --- a/.devcontainer/cu126/devcontainer.json +++ b/.devcontainer/cu126/devcontainer.json @@ -1,8 +1,14 @@ { "name": "CUDA Development Container", "build": { - "dockerfile": "../../docker/Dockerfile.cu126.dev", - "context": "../../" + "dockerfile": "../../docker/Dockerfile", + "context": "../../", + "target": "dev", + "args": { + "CUDA_BASE_IMAGE": "nvidia/cuda:12.6.0-devel-ubuntu24.04", + "CUDA_VERSION": "cu126", + "PYTORCH_INDEX": "cu126" + } }, "runArgs": [ "--gpus=all" diff --git a/.devcontainer/cu128/devcontainer.json b/.devcontainer/cu128/devcontainer.json index b1afbf30a5..5bfa17dae1 100644 --- a/.devcontainer/cu128/devcontainer.json +++ b/.devcontainer/cu128/devcontainer.json @@ -1,8 +1,14 @@ { "name": "CUDA Development Container", "build": { - "dockerfile": "../../docker/Dockerfile.cu128.dev", - "context": "../../" + "dockerfile": "../../docker/Dockerfile", + "context": "../../", + "target": "dev", + "args": { + "CUDA_BASE_IMAGE": "nvidia/cuda:12.8.0-devel-ubuntu24.04", + "CUDA_VERSION": "cu128", + "PYTORCH_INDEX": "cu128" + } }, "runArgs": [ "--gpus=all" diff --git a/.devcontainer/cu129/devcontainer.json b/.devcontainer/cu129/devcontainer.json index 5829d9eae6..d25f365329 100644 --- a/.devcontainer/cu129/devcontainer.json +++ b/.devcontainer/cu129/devcontainer.json @@ -1,8 +1,14 @@ { "name": "CUDA Development Container", "build": { - "dockerfile": 
"../../docker/Dockerfile.cu129.dev", - "context": "../../" + "dockerfile": "../../docker/Dockerfile", + "context": "../../", + "target": "dev", + "args": { + "CUDA_BASE_IMAGE": "nvidia/cuda:12.9.0-devel-ubuntu24.04", + "CUDA_VERSION": "cu129", + "PYTORCH_INDEX": "cu129" + } }, "runArgs": [ "--gpus=all" diff --git a/.devcontainer/cu130/devcontainer.json b/.devcontainer/cu130/devcontainer.json index 08b8d763fd..4894bf67cb 100644 --- a/.devcontainer/cu130/devcontainer.json +++ b/.devcontainer/cu130/devcontainer.json @@ -1,8 +1,14 @@ { "name": "CUDA Development Container", "build": { - "dockerfile": "../../docker/Dockerfile.cu130.dev", - "context": "../../" + "dockerfile": "../../docker/Dockerfile", + "context": "../../", + "target": "dev", + "args": { + "CUDA_BASE_IMAGE": "nvidia/cuda:13.0.1-devel-ubuntu24.04", + "CUDA_VERSION": "cu130", + "PYTORCH_INDEX": "cu130" + } }, "runArgs": [ "--gpus=all"