# Universal image Dockerfile
#
# Base image:
# - Minimal Jupyter CUDA workbench with CUDA 12.8 and Python 3.12
# - Provides JupyterLab, Elyra integration, addons, and default ENTRYPOINT start-notebook.sh
# - Source: quay.io/opendatahub/workbench-images:cuda-jupyter-minimal-ubi9-python-3.12-2025a_20250903
#
# Design intent:
# - Preserve workbench behavior by default (no args → start-notebook.sh)
# - Add runtime capabilities on top (Python ML/training stack, RDMA/IB packages)
# - Avoid duplicating dependencies provided by the base image
# - Allow headless runtime mode when a command is provided (args → exec that command), as illustrated below
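#
# Example invocations (illustrative; the tag follows the LABEL below, and podman
# is interchangeable with docker here):
#   podman run --rm -p 8888:8888 universal:py312-cuda128-torch280
#       # no command → wrapper falls through to start-notebook.sh (workbench)
#   podman run --rm universal:py312-cuda128-torch280 python train.py
#       # command given → wrapper exec's it (headless runtime)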

FROM quay.io/opendatahub/workbench-images:cuda-jupyter-minimal-ubi9-python-3.12-2025a_20250903

LABEL name="universal:py312-cuda128-torch280" \
      summary="Universal CUDA 12.8 Python 3.12 image with PyTorch 2.8.0" \
      description="Universal image combining minimal Jupyter workbench and runtime ML stack (CUDA 12.8, PyTorch 2.8.0, FlashAttention 2.8.3) on UBI9" \
      io.k8s.display-name="Universal CUDA 12.8 Python 3.12 (Workbench + Runtime)" \
      io.k8s.description="Universal image: Jupyter workbench by default; runtime when command provided. Includes RDMA/IB libs, Torch 2.8.0 cu128, FlashAttention 2.8.3."

## TODO: Add license file
# COPY LICENSE.md /licenses/cuda-license.md

# For OS installs we need elevated privileges; the base image may default to user 1001
USER 0
WORKDIR /opt/app-root/bin

# Keep NVIDIA driver capability constraints consistent with runtime image behavior
ENV NVIDIA_VISIBLE_DEVICES=all \
    NVIDIA_DRIVER_CAPABILITIES=compute,utility \
    CUDA_VERSION=12.8 \
    PIP_DEFAULT_TIMEOUT=600 \
    PIP_DISABLE_PIP_VERSION_CHECK=1
# Mirror the runtime image: enable the CUDA and Mellanox OFED repositories for RDMA/IB packages.
# Note: The base image already includes the CUDA 12.8 runtime; we only add missing components (e.g., RDMA libs).
RUN dnf config-manager \
        --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel9/x86_64/cuda-rhel9.repo \
    && dnf config-manager \
        --add-repo https://linux.mellanox.com/public/repo/mlnx_ofed/latest/rhel9.5/mellanox_mlnx_ofed.repo \
    && dnf install -y --disablerepo="*" --enablerepo="cuda-rhel9-x86_64,mlnx_ofed_24.10-1.1.4.0_base,ubi-9-appstream-rpms,ubi-9-baseos-rpms" \
        libibverbs-utils \
        infiniband-diags \
        libibumad3 \
        librdmacm \
        librdmacm-utils \
        rdma-core \
        mlnx-tools \
    && dnf clean all \
    && rm -rf /var/cache/dnf/*
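
# A quick runtime smoke test for the RDMA stack (illustrative; needs IB/RoCE hardware
# exposed to the container, so it is not run at build time):
#   ibv_devices && ibv_devinfo   # from libibverbs-utils
#   ibstat                       # from infiniband-diags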

# Install CUDA NVCC and build toolchain required to build FlashAttention from source
# NOTE: Use command-line CUDA packages to avoid Nsight GUI deps (X11 libs) not available in UBI
RUN dnf install -y --disablerepo="*" --enablerepo="cuda-rhel9-x86_64,ubi-9-appstream-rpms,ubi-9-baseos-rpms" \
        cuda-command-line-tools-12-8 \
        cuda-cudart-devel-12-8 \
        cuda-nvcc-12-8-12.8.93-1 \
        gcc \
        gcc-c++ \
        make \
        python3-devel \
        cmake \
        git \
    && dnf clean all \
    && rm -rf /var/cache/dnf/*

# Ensure CUDA_HOME points to the toolkit and nvcc is discoverable, then sanity check nvcc
ENV CUDA_HOME=/usr/local/cuda \
    PATH=/usr/local/cuda/bin:$PATH \
    LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH \
    TORCH_CUDA_ARCH_LIST="8.0;8.6;8.9;9.0"
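
# TORCH_CUDA_ARCH_LIST above scopes the flash-attn source build to Ampere (8.0/8.6),
# Ada (8.9), and Hopper (9.0) GPUs; extend the list if you target other architectures.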

# NOTE: Optional build-time CUDA checks (remove if not needed for faster builds)
# Verify CUDA toolkit present and nvcc available
RUN /usr/local/cuda/bin/nvcc -V
# Verify key CUDA libs are discoverable
RUN ldconfig -p | grep -E 'libcudart|libcublas|libcudnn' || (echo "[fail-fast] CUDA libs not found in ldconfig" >&2; exit 1)

# Quick preflight: verify the torch wheel and flash-attn index are reachable, to fail fast before large downloads
ARG TORCH_WHEEL_FILE=https://download.pytorch.org/whl/cu128/torch-2.8.0%2Bcu128-cp312-cp312-manylinux_2_28_x86_64.whl
RUN curl -IfsS --connect-timeout 10 --max-time 20 "$TORCH_WHEEL_FILE" > /dev/null || (echo "[fail-fast] Torch cu128 wheel not reachable: $TORCH_WHEEL_FILE" >&2; exit 1)
RUN curl -IfsS --connect-timeout 10 --max-time 20 https://pypi.org/simple/flash-attn/ > /dev/null || (echo "[fail-fast] PyPI flash-attn index not reachable" >&2; exit 1)

# Switch back to the non-root user for Python environment changes
USER 1001

WORKDIR /opt/app-root/src

# Add runtime Python dependencies on top of the minimal Jupyter stack.
# We intentionally avoid re-installing minimal-provided packages (e.g., jupyterlab) to prevent downgrades.
# Torch/cu128 must match CUDA 12.8. FlashAttention is mandatory and currently supported only on amd64.
ARG TARGETARCH
# Enforce amd64 for FlashAttention wheel availability
RUN if [ "$TARGETARCH" != "amd64" ]; then echo "FlashAttention is mandatory and requires amd64 prebuilt wheels. Build with --platform linux/amd64." >&2; exit 1; fi
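
# Example build (illustrative; the tag is an assumption):
#   podman build --platform linux/amd64 -t universal:py312-cuda128-torch280 .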

# Install torch from the PyTorch CUDA index separately to avoid affecting other packages' index resolution
RUN pip install --retries 5 --timeout 300 --no-cache-dir torch==2.8.0 --index-url https://download.pytorch.org/whl/cu128

# NOTE: Optional build-time check (remove if not needed): verify the torch build has CUDA enabled
RUN python - <<'PY'
import torch, sys
print("[check] torch", torch.__version__, "cuda build:", torch.version.cuda)
sys.exit(0 if torch.backends.cuda.is_built() else 1)
PY

# Install numpy ahead of building extensions that expect it
RUN pip install --retries 5 --timeout 300 --no-cache-dir numpy==2.3.3

# Install the build backend for the VCS package, then the SDK itself (no build isolation so the backend is visible)
RUN pip install --retries 5 --timeout 300 --no-cache-dir hatchling hatch-vcs
RUN pip install --retries 5 --timeout 300 --no-cache-dir --no-build-isolation "git+https://github.com/briangallagher/sdk@training-hub"

# Provide ninja via pip (RHEL/UBI repo ninja-build may be unavailable)
RUN pip install --retries 5 --timeout 300 --no-cache-dir ninja
# Install remaining runtime packages (resolved from default PyPI), including FlashAttention
# Note: We intentionally do not use a Pipfile/lock here to avoid mixing resolvers with the base (uv lock),
# to control CUDA/FA install order and indexes, and to reduce lock churn across arches/ABI-specific wheels.
# Unpinned specifiers are quoted so the shell does not treat ">" as a redirection.
RUN pip install --retries 5 --timeout 300 --no-cache-dir --no-build-isolation \
    flash-attn==2.8.3 \
    accelerate==1.10.0 \
    transformers==4.55.2 \
    peft==0.17.0 \
    tqdm==4.67.1 \
    datasets==4.0.0 \
    "pydantic>=2.11.7" \
    aiofiles==24.1.0 \
    "protobuf>=5.28.0,<6.0.0" \
    "simpleeval>=0.9.13,<1.0" \
    safetensors==0.6.2 \
    packaging==25.0 \
    pyyaml==6.0.2 \
    py-cpuinfo==9.0.0 \
    numba==0.61.2 \
    rich==14.1.0 \
    tensorboard==2.19.0 \
    "bitsandbytes>=0.45.3" \
    liger-kernel==0.5.10 \
    "sentencepiece>=0.1.99,<0.3" \
    tokenizers==0.21.4 \
    training-hub==0.2.0 \
    trl==0.21.0 \
    "deepspeed>=0.14.3" \
    async-timeout==4.0.3 \
    aiohttp==3.12.15 \
    hf-xet==1.1.8 \
    huggingface-hub==0.34.4 \
    mlflow==3.4.0 \
    psutil==7.0.0 \
    && chmod -R g+w /opt/app-root/lib/python3.12/site-packages \
    && fix-permissions /opt/app-root -P
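
# NOTE: Optional build-time check (remove if not needed): confirm flash-attn imports
# against the installed torch (importing should not require a GPU device)
RUN python - <<'PY'
import flash_attn
print("[check] flash-attn", flash_attn.__version__)
PY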

# Provide a POSIX entrypoint wrapper to choose behavior based on invocation
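# A minimal sketch of the dispatch such a wrapper can implement (the actual script
# ships alongside this Dockerfile and may differ):
#   #!/bin/sh
#   # With the default CMD below, "$@" expands to start-notebook.sh (workbench mode);
#   # any user-supplied command replaces it and runs headless.
#   exec "$@"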
COPY --chmod=0755 entrypoint-universal.sh /usr/local/bin/entrypoint-universal.sh

# Set ENTRYPOINT to the wrapper so that providing a command runs headless.
# Default CMD maintains workbench behavior (no args → start-notebook.sh)
ENTRYPOINT ["/usr/local/bin/entrypoint-universal.sh"]
CMD ["start-notebook.sh"]