
Commit 5598440

ASDF
Signed-off-by: Brian Gallagher <[email protected]>
1 parent d4015cd commit 5598440

File tree

  • images/universal/training/py312-cuda128-torch280/Dockerfile

1 file changed: +25 -94 lines

images/universal/training/py312-cuda128-torch280/Dockerfile
Lines changed: 25 additions & 94 deletions
@@ -19,7 +19,8 @@ LABEL name="universal:py312-cuda128-torch280" \
       io.k8s.display-name="Universal CUDA 12.8 Python 3.12 (Workbench + Runtime)" \
       io.k8s.description="Universal image: Jupyter workbench by default; runtime when command provided. Includes RDMA/IB libs, Torch 2.8.0 cu128, FlashAttention 2.8.3."
 
-COPY ./images/universal/training/py312-cuda128-torch280/LICENSE.md /licenses/cuda-license.md
+## TODO: Add license file
+# COPY LICENSE.md /licenses/cuda-license.md
 
 # For OS installs we need elevated privileges; base may default to 1001
 USER 0
@@ -30,8 +31,7 @@ ENV NVIDIA_VISIBLE_DEVICES=all \
     NVIDIA_DRIVER_CAPABILITIES=compute,utility \
     CUDA_VERSION=12.8 \
     PIP_DEFAULT_TIMEOUT=600 \
-    PIP_DISABLE_PIP_VERSION_CHECK=1 \
-    PIP_UPGRADE_STRATEGY=only-if-needed
+    PIP_DISABLE_PIP_VERSION_CHECK=1
 
 # Follow runtime: enable CUDA and Mellanox OFED repositories for RDMA/IB packages.
 # Note: The base image already includes CUDA 12.8 runtime; we only add missing components (e.g., RDMA libs).
@@ -55,64 +55,38 @@ RUN dnf config-manager \
 RUN dnf install -y --disablerepo="*" --enablerepo="cuda-rhel9-x86_64,ubi-9-appstream-rpms,ubi-9-baseos-rpms" \
     cuda-command-line-tools-12-8 \
     cuda-cudart-devel-12-8 \
-    cuda-libraries-devel-12-8 \
-    cuda-compat-12-8 \
-    cuda-compiler-12-8 \
     cuda-nvcc-12-8-12.8.93-1 \
     gcc \
     gcc-c++ \
     make \
     python3-devel \
     cmake \
     git \
-    && echo "/usr/local/nvidia/lib" >> /etc/ld.so.conf.d/nvidia.conf \
-    && echo "/usr/local/nvidia/lib64" >> /etc/ld.so.conf.d/nvidia.conf \
     && dnf clean all \
     && rm -rf /var/cache/dnf/*
 
 # Ensure CUDA_HOME points to the toolkit and nvcc is discoverable, then sanity check nvcc
-ARG CUDA_ARCH_LIST=9.0
 ENV CUDA_HOME=/usr/local/cuda \
-    PATH=/usr/local/nvidia/bin:${CUDA_HOME}/bin:${PATH} \
-    LD_LIBRARY_PATH=/usr/local/nvidia/lib:/usr/local/nvidia/lib64:$CUDA_HOME/lib64:$CUDA_HOME/extras/CUPTI/lib64:$LD_LIBRARY_PATH \
-    TORCH_CUDA_ARCH_LIST="${CUDA_ARCH_LIST}"
-
-# Extra verbosity and progress for CUDA extension builds (Ninja progress, Torch verbose)
-ENV USE_NINJA=1 \
-    TORCH_CUDA_VERBOSE_BUILD=1 \
-    NINJA_STATUS="[%f/%t %es] "
+    PATH=/usr/local/cuda/bin:$PATH \
+    LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH \
+    TORCH_CUDA_ARCH_LIST="8.0;8.6;8.9;9.0"
+
 # NOTE: Optional build-time CUDA checks (remove if not needed for faster builds)
 # Verify CUDA toolkit present and nvcc available
 RUN /usr/local/cuda/bin/nvcc -V
 # Verify key CUDA libs are discoverable
 RUN ldconfig -p | grep -E 'libcudart|libcublas|libcudnn' || (echo "[fail-fast] CUDA libs not found in ldconfig" >&2; exit 1)
-RUN mkdir -p /opt/app-root/.ccache && chown -R 1001:0 /opt/app-root/.ccache
 
 # Quick preflight: verify torch wheel and flash-attn index are reachable to fail fast before large downloads
 ARG TORCH_WHEEL_FILE=https://download.pytorch.org/whl/cu128/torch-2.8.0%2Bcu128-cp312-cp312-manylinux_2_28_x86_64.whl
 RUN curl -IfsS --connect-timeout 10 --max-time 20 "$TORCH_WHEEL_FILE" > /dev/null || (echo "[fail-fast] Torch cu128 wheel not reachable: $TORCH_WHEEL_FILE" >&2; exit 1)
 RUN curl -IfsS --connect-timeout 10 --max-time 20 https://pypi.org/simple/flash-attn/ > /dev/null || (echo "[fail-fast] PyPI flash-attn index not reachable" >&2; exit 1)
-#
-# Additional diagnostics to help debug build env before Python installs
-RUN python -m pip debug --verbose || true
-RUN gcc --version | head -n1 || true
-RUN g++ --version | head -n1 || true
-RUN /usr/local/cuda/bin/nvcc -V | head -n1 || true
 
 # Switch back to the non-root user for Python environment changes
 USER 1001
 
 WORKDIR /opt/app-root/src
 
-# Ensure user installs land in app-root and are discoverable by Python
-ENV PYTHONUSERBASE=/opt/app-root \
-    PYTHONNOUSERSITE=0
-
-# Speed up repeated native builds using ccache (if present)
-ENV CCACHE_DIR=/opt/app-root/.ccache \
-    CCACHE_MAXSIZE=5G \
-    CCACHE_COMPRESS=1
-
 # Add runtime Python dependencies on top of the minimal Jupyter stack.
 # We intentionally avoid re-installing minimal-provided packages (e.g., jupyterlab) to prevent downgrades.
 # Torch/cu128 must match CUDA 12.8. FlashAttention is mandatory and currently supported on amd64.
@@ -121,11 +95,7 @@ ARG TARGETARCH
 RUN if [ "$TARGETARCH" != "amd64" ]; then echo "FlashAttention is mandatory and requires amd64 prebuilt wheels. Build with --platform linux/amd64." >&2; exit 1; fi
 
 # Install torch from the PyTorch CUDA index separately to avoid affecting other packages' index resolution
-RUN python -m pip install --retries 5 --timeout 300 --no-cache-dir torch==2.8.0 --index-url https://download.pytorch.org/whl/cu128
-
-# Diagnostics: show interpreter, sys.path, and user site locations
-RUN python -c "import sys,site,os; print('exe:',sys.executable); print('sys.path:',sys.path); print('userbase:',site.getuserbase()); print('usersite:',site.getusersitepackages()); print('PYTHONNOUSERSITE=',os.environ.get('PYTHONNOUSERSITE'))"
-RUN python -m pip show torch || true && python -c "import importlib.util; print('torch_spec:', importlib.util.find_spec('torch'))"
+RUN pip install --retries 5 --timeout 300 --no-cache-dir torch==2.8.0 --index-url https://download.pytorch.org/whl/cu128
 
 # NOTE: Optional build-time check (remove if not needed): verify torch build has CUDA enabled
 RUN python - <<'PY'
@@ -135,32 +105,19 @@ sys.exit(0 if torch.backends.cuda.is_built() else 1)
 PY
 
 # Install numpy ahead of building extensions that expect it
-RUN python -m pip install --retries 5 --timeout 300 --no-cache-dir numpy==2.3.4
+RUN pip install --retries 5 --timeout 300 --no-cache-dir numpy==2.3.3
 
 # Install build backend for VCS package and the SDK itself (no build isolation so backend is visible)
-RUN python -m pip install --retries 5 --timeout 300 --no-cache-dir hatchling hatch-vcs
-RUN python -m pip install --retries 5 --timeout 300 --no-cache-dir --no-build-isolation "git+https://github.com/briangallagher/sdk@training-hub"
-#
-# NOTE: kubeflow pulls kubernetes==34.x which requires urllib3<2.4, but
-# training-hub requires urllib3>=2.4. There is no kubernetes>=35 on PyPI yet.
-# We intentionally keep urllib3>=2.4 for training-hub and accept the mismatch.
-# To avoid resolution failure, we do NOT try to force-upgrade kubernetes here.
-# RUN python -m pip install --retries 5 --timeout 180 --no-cache-dir \
-#     "kubernetes>=35.0.0" "urllib3>=2.4,<3"
-
-# Ensure modern build tooling for extensions
-RUN python -m pip install --retries 5 --timeout 300 --no-cache-dir -U pip setuptools wheel ninja cmake
-#
-# Fail-fast: ensure binary wheels exist for packages that are expensive to build
-RUN mkdir -p /tmp/wheels && \
-    python -m pip download --retries 5 --timeout 120 --only-binary=:all: --no-deps -d /tmp/wheels \
-    numba==0.62.1 bitsandbytes==0.48.1 || \
-    (echo "[fail-fast] Missing binary wheel for numba or bitsandbytes on this platform (refusing to build from source)." >&2; exit 1)
+RUN pip install --retries 5 --timeout 300 --no-cache-dir hatchling hatch-vcs
+RUN pip install --retries 5 --timeout 300 --no-cache-dir --no-build-isolation "git+https://github.com/briangallagher/sdk@add-training-hub"
+
+# Provide ninja via pip (RHEL/UBI repo ninja-build may be unavailable)
+# RUN pip install --retries 5 --timeout 300 --no-cache-dir ninja
 
 # Install remaining runtime packages (resolved from default PyPI), including FlashAttention
 # Note: We intentionally do not use a Pipfile/lock here to avoid mixing resolvers with the base (uv lock),
 # to control CUDA/FA install order and indexes, and to reduce lock churn across arches/ABI-specific wheels.
-RUN python -m pip install --retries 5 --timeout 300 --no-cache-dir --prefer-binary --only-binary=numba,bitsandbytes --upgrade-strategy only-if-needed \
+RUN pip install --retries 5 --timeout 300 --no-cache-dir \
     flash-attn==2.8.3 --no-build-isolation \
     accelerate==1.10.0 \
     transformers==4.57.1 \
@@ -199,47 +156,21 @@ RUN python -m pip install --retries 5 --timeout 300 --no-cache-dir --prefer-bina
     rhai-innovation-mini-trainer==0.3.0 \
     && chmod -R g+w /opt/app-root/lib/python3.12/site-packages \
     && fix-permissions /opt/app-root -P
-#
-# WARNING: Skipping `pip check` due to known kubernetes(urllib3<2.4) vs
-# training-hub(urllib3>=2.4) requirement mismatch. Re-enable once upstream
-# loosens kubernetes urllib3 bounds or kubeflow no longer pins 34.x.
-# RUN python -m pip check || (python -m pip freeze; exit 1)
-#
-# Numba diagnostics (helps catch llvmlite/LLVM/NumPy mismatches quickly)
-RUN python -m numba -s || (echo "[diagnostics] numba sysinfo failed" >&2; exit 1)
-
-# Provide CUDA user-space libraries via pip, aligning with runtime for extension builds
-RUN python -m pip install --retries 5 --timeout 300 --no-cache-dir \
-    nvidia-nccl-cu12==2.27.3 \
-    nvidia-cublas-cu12==12.8.4.1 \
-    nvidia-cuda-cupti-cu12==12.8.90 \
-    nvidia-cuda-nvrtc-cu12==12.8.93 \
-    nvidia-cuda-runtime-cu12==12.8.90 \
-    nvidia-cudnn-cu12==9.10.2.21 \
-    nvidia-cufft-cu12==11.3.3.83 \
-    nvidia-cufile-cu12==1.13.1.3 \
-    nvidia-curand-cu12==10.3.9.90 \
-    nvidia-cusolver-cu12==11.7.3.90 \
-    nvidia-cusparse-cu12==12.5.8.93 \
-    nvidia-cusparselt-cu12==0.7.1 \
-    nvidia-nvjitlink-cu12==12.8.93 \
-    nvidia-nvtx-cu12==12.8.90 \
-    && fix-permissions /opt/app-root -P
 
-# Ensure cuDNN from pip is discoverable during source builds
-ENV LD_LIBRARY_PATH="/opt/app-root/lib/python3.12/site-packages/nvidia/cudnn/lib:${LD_LIBRARY_PATH}"
 
-# Deterministic 2-step: sub-dep first, then parent without deps (Python 3.12 env)
-RUN set -e; \
-    ARCH_NO_DOT="$(echo "${CUDA_ARCH_LIST}" | tr -d '.')" ; \
-    export TORCH_CUDA_ARCH_LIST="${CUDA_ARCH_LIST}" PYTORCH_NVCC_FLAGS="-gencode arch=compute_${ARCH_NO_DOT},code=sm_${ARCH_NO_DOT}"; \
-    python -m pip install --no-build-isolation --no-cache-dir causal-conv1d==1.5.3.post1 && \
-    rm -rf "$HOME/.cache/torch_extensions" /tmp/pip-* /tmp/pip-wheel-* || true && \
-    TORCH_CUDA_ARCH_LIST="${CUDA_ARCH_LIST}" PYTORCH_NVCC_FLAGS="${PYTORCH_NVCC_FLAGS}" python -m pip install --no-build-isolation --no-cache-dir mamba-ssm==2.2.6.post3 --no-deps && \
+# Build helpers to compile PyTorch extensions
+RUN pip install -U pip setuptools wheel ninja cmake
+
+# Optional: set GPU archs if you hit arch issues
+ENV TORCH_CUDA_ARCH_LIST="8.0;8.6;8.9;9.0"
+
+# Deterministic 2-step: sub-dep first, then parent without deps
+RUN pip install --no-build-isolation --no-cache-dir causal-conv1d==1.5.3.post1 && \
+    pip install --no-build-isolation --no-cache-dir mamba-ssm==2.2.6.post3 --no-deps && \
     fix-permissions /opt/app-root -P
 
 # Provide a POSIX entrypoint wrapper to choose behavior based on invocation
-COPY --chmod=0755 ./images/universal/training/py312-cuda128-torch280/entrypoint-universal.sh /usr/local/bin/entrypoint-universal.sh
+COPY --chmod=0755 entrypoint-universal.sh /usr/local/bin/entrypoint-universal.sh
 
 # Set ENTRYPOINT to the wrapper so that providing a command runs headless.
 # Default CMD maintains workbench behavior (no args → start-notebook.sh)

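Note: entrypoint-universal.sh itself is not part of this diff. A minimal sketch of what such a wrapper might look like, assuming the base image's start-notebook.sh launches the Jupyter workbench (the script body below is an assumption, not the committed file):

    #!/bin/sh
    # Hypothetical sketch; the committed entrypoint-universal.sh is not shown in this diff.
    # No arguments: preserve workbench behavior by starting the notebook server.
    # Arguments given: exec them so the container runs headless.
    if [ "$#" -eq 0 ]; then
        exec start-notebook.sh
    else
        exec "$@"
    fi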

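Because the TARGETARCH guard fails the build on anything other than amd64, cross-builds need an explicit platform flag. A typical invocation might look like the following; the tag is a placeholder, and the build context is assumed to be the image directory now that the COPY lines use paths relative to it:

    # Assumed build command; tag and context path are illustrative, not from the commit.
    docker build --platform linux/amd64 \
      -t universal:py312-cuda128-torch280 \
      images/universal/training/py312-cuda128-torch280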
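The optional build-time checks can be complemented by a runtime smoke test once the image is built. This is a sketch assuming an NVIDIA-enabled container runtime and the placeholder tag above:

    # Assumed smoke test: verify Torch reports CUDA and flash-attn imports cleanly.
    docker run --rm --gpus all universal:py312-cuda128-torch280 \
      python -c "import torch, flash_attn; print(torch.__version__, torch.version.cuda, torch.cuda.is_available())"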