@@ -30,7 +30,8 @@ ENV NVIDIA_VISIBLE_DEVICES=all \
3030 NVIDIA_DRIVER_CAPABILITIES=compute,utility \
3131 CUDA_VERSION=12.8 \
3232 PIP_DEFAULT_TIMEOUT=600 \
33- PIP_DISABLE_PIP_VERSION_CHECK=1
33+ PIP_DISABLE_PIP_VERSION_CHECK=1 \
34+ PIP_UPGRADE_STRATEGY=only-if-needed
3435
3536# Follow runtime: enable CUDA and Mellanox OFED repositories for RDMA/IB packages.
3637# Note: The base image already includes CUDA 12.8 runtime; we only add missing components (e.g., RDMA libs).
@@ -54,22 +55,32 @@ RUN dnf config-manager \
55+ # NOTE: after appending new dirs to /etc/ld.so.conf.d we must refresh the
55+ # linker cache here, otherwise the later `RUN ldconfig -p | grep …` sanity
55+ # check reads a stale cache and fails spuriously.
5455RUN dnf install -y --disablerepo="*" --enablerepo="cuda-rhel9-x86_64,ubi-9-appstream-rpms,ubi-9-baseos-rpms" \
5556    cuda-command-line-tools-12-8 \
5657    cuda-cudart-devel-12-8 \
58+     cuda-libraries-devel-12-8 \
59+     # NOTE(review): cuda-compat ships driver-compatibility userspace libs;
59+     # confirm it is really wanted in a devel image (it can shadow the host driver).
59+     cuda-compat-12-8 \
60+     cuda-compiler-12-8 \
5761    cuda-nvcc-12-8-12.8.93-1 \
5862    gcc \
5963    gcc-c++ \
6064    make \
6165    python3-devel \
6266    cmake \
6367    git \
68+     && echo "/usr/local/nvidia/lib" >> /etc/ld.so.conf.d/nvidia.conf \
69+     && echo "/usr/local/nvidia/lib64" >> /etc/ld.so.conf.d/nvidia.conf \
69+     && ldconfig \
6470    && dnf clean all \
6571    && rm -rf /var/cache/dnf/*
6672
6773# Ensure CUDA_HOME points to the toolkit and nvcc is discoverable, then sanity check nvcc
6874ENV CUDA_HOME=/usr/local/cuda \
69-    PATH=/usr/local/cuda/bin:$PATH \
70-    LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH \
71-    TORCH_CUDA_ARCH_LIST="8.0;8.6;8.9;9.0"
75+     # NOTE: keys defined in one ENV instruction cannot see sibling keys set in
75+     # the same instruction (Docker expands from the pre-instruction env), so
75+     # $CUDA_HOME would expand to empty here — spell out /usr/local/cuda.
75+     PATH=/usr/local/nvidia/bin:/usr/local/cuda/bin:${PATH} \
76+     LD_LIBRARY_PATH=/usr/local/nvidia/lib:/usr/local/nvidia/lib64:/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64:$LD_LIBRARY_PATH
77+ # NOTE(review): with TORCH_CUDA_ARCH_LIST unset, source builds (flash-attn,
77+ # mamba-ssm) fall back to torch's default arch list — confirm build time and
77+ # wheel size are acceptable before shipping.
77+ # TODO: revisit this, going with default list for now.
78+ # TORCH_CUDA_ARCH_LIST="8.0;8.6;8.9;9.0"
7279

80+ # Extra verbosity and progress for CUDA extension builds (Ninja progress, Torch verbose)
81+ ENV USE_NINJA=1 \
82+     TORCH_CUDA_VERBOSE_BUILD=1 \
83+     NINJA_STATUS="[%f/%t %es] "
7384# NOTE: Optional build-time CUDA checks (remove if not needed for faster builds)
7485# Verify CUDA toolkit present and nvcc available
7586RUN /usr/local/cuda/bin/nvcc -V
@@ -80,12 +91,22 @@ RUN ldconfig -p | grep -E 'libcudart|libcublas|libcudnn' || (echo "[fail-fast] C
8091ARG TORCH_WHEEL_FILE=https://download.pytorch.org/whl/cu128/torch-2.8.0%2Bcu128-cp312-cp312-manylinux_2_28_x86_64.whl
8192RUN curl -IfsS --connect-timeout 10 --max-time 20 "$TORCH_WHEEL_FILE" > /dev/null || (echo "[fail-fast] Torch cu128 wheel not reachable: $TORCH_WHEEL_FILE" >&2; exit 1)
8293RUN curl -IfsS --connect-timeout 10 --max-time 20 https://pypi.org/simple/flash-attn/ > /dev/null || (echo "[fail-fast] PyPI flash-attn index not reachable" >&2; exit 1)
94+ #
95+ # Additional diagnostics to help debug build env before Python installs
96+ RUN python -m pip debug --verbose || true
97+ RUN gcc --version | head -n1 || true
98+ RUN g++ --version | head -n1 || true
99+ RUN /usr/local/cuda/bin/nvcc -V | head -n1 || true
83100
84101# Switch back to the non-root user for Python environment changes
85102USER 1001
86103
87104WORKDIR /opt/app-root/src
88105
106+ # Ensure user installs land in app-root and are discoverable by Python.
107+ # NOTE: CPython checks PYTHONNOUSERSITE for *presence*, not value — any
108+ # non-empty value (including "0") DISABLES user site-packages, the opposite
109+ # of the intent here. An empty value is treated as unset, keeping it enabled.
110+ ENV PYTHONUSERBASE=/opt/app-root \
111+     PYTHONNOUSERSITE=
112+
89110# Add runtime Python dependencies on top of the minimal Jupyter stack.
90111# We intentionally avoid re-installing minimal-provided packages (e.g., jupyterlab) to prevent downgrades.
91112# Torch/cu128 must match CUDA 12.8. FlashAttention is mandatory and currently supported on amd64.
@@ -94,7 +115,11 @@ ARG TARGETARCH
94115RUN if [ "$TARGETARCH" != "amd64" ]; then echo "FlashAttention is mandatory and requires amd64 prebuilt wheels. Build with --platform linux/amd64." >&2; exit 1; fi
95116
96117# Install torch from the PyTorch CUDA index separately to avoid affecting other packages' index resolution
97- RUN pip install --retries 5 --timeout 300 --no-cache-dir torch==2.8.0 --index-url https://download.pytorch.org/whl/cu128
118+ RUN python -m pip install --retries 5 --timeout 300 --no-cache-dir torch==2.8.0 --index-url https://download.pytorch.org/whl/cu128
119+
120+ # Diagnostics: show interpreter, sys.path, and user site locations
121+ RUN python -c "import sys,site,os; print('exe:',sys.executable); print('sys.path:',sys.path); print('userbase:',site.getuserbase()); print('usersite:',site.getusersitepackages()); print('PYTHONNOUSERSITE=',os.environ.get('PYTHONNOUSERSITE'))"
122+ RUN python -m pip show torch || true && python -c "import importlib.util; print('torch_spec:', importlib.util.find_spec('torch'))"
98123
99124# NOTE: Optional build-time check (remove if not needed): verify torch build has CUDA enabled
100125RUN python - <<'PY'
@@ -104,19 +129,32 @@ sys.exit(0 if torch.backends.cuda.is_built() else 1)
104129PY
105130
106131# Install numpy ahead of building extensions that expect it
107- RUN pip install --retries 5 --timeout 300 --no-cache-dir numpy==2.3.3
132+ RUN python -m pip install --retries 5 --timeout 300 --no-cache-dir numpy==2.3.4
108133
109134# Install build backend for VCS package and the SDK itself (no build isolation so backend is visible)
110- RUN pip install --retries 5 --timeout 300 --no-cache-dir hatchling hatch-vcs
111- RUN pip install --retries 5 --timeout 300 --no-cache-dir --no-build-isolation "git+https://github.com/briangallagher/sdk@training-hub"
135+ RUN python -m pip install --retries 5 --timeout 300 --no-cache-dir hatchling hatch-vcs
136+ RUN python -m pip install --retries 5 --timeout 300 --no-cache-dir --no-build-isolation "git+https://github.com/briangallagher/sdk@training-hub"
137+ #
138+ # NOTE: kubeflow pulls kubernetes==34.x which requires urllib3<2.4, but
139+ # training-hub requires urllib3>=2.4. There is no kubernetes>=35 on PyPI yet.
140+ # We intentionally keep urllib3>=2.4 for training-hub and accept the mismatch.
141+ # To avoid resolution failure, we do NOT try to force-upgrade kubernetes here.
142+ # RUN python -m pip install --retries 5 --timeout 180 --no-cache-dir \
143+ # "kubernetes>=35.0.0" "urllib3>=2.4,<3"
112144
113- # Provide ninja via pip (RHEL/UBI repo ninja-build may be unavailable)
114- RUN pip install --retries 5 --timeout 300 --no-cache-dir ninja
145+ # Ensure modern build tooling for extensions
146+ RUN python -m pip install --retries 5 --timeout 300 --no-cache-dir -U pip setuptools wheel ninja cmake
147+ #
148+ # Fail-fast: ensure binary wheels exist for packages that are expensive to build
149+ RUN mkdir -p /tmp/wheels && \
150+ python -m pip download --retries 5 --timeout 120 --only-binary=:all: --no-deps -d /tmp/wheels \
151+ numba==0.62.1 bitsandbytes==0.48.1 || \
152+ (echo "[fail-fast] Missing binary wheel for numba or bitsandbytes on this platform (refusing to build from source)." >&2; exit 1)
115153
116154# Install remaining runtime packages (resolved from default PyPI), including FlashAttention
117155# Note: We intentionally do not use a Pipfile/lock here to avoid mixing resolvers with the base (uv lock),
118156# to control CUDA/FA install order and indexes, and to reduce lock churn across arches/ABI-specific wheels.
119- RUN pip install --retries 5 --timeout 300 --no-cache-dir \
157+ RUN python -m pip install --retries 5 --timeout 300 --no-cache-dir --prefer-binary --only-binary=numba,bitsandbytes --upgrade-strategy only-if-needed \
120158 flash-attn==2.8.3 --no-build-isolation \
121159 accelerate==1.10.0 \
122160 transformers==4.57.1 \
@@ -125,6 +163,8 @@ RUN pip install --retries 5 --timeout 300 --no-cache-dir \
125163 datasets==4.0.0 \
126164 pydantic>=2.11.7 \
127165 aiofiles==24.1.0 \
166+ deprecated==1.2.18 \
167+ typer==0.19.2 \
128168 "protobuf>=5.28.0,<6.0.0" \
129169 "simpleeval>=0.9.13,<1.0" \
130170 safetensors==0.6.2 \
@@ -140,9 +180,6 @@ RUN pip install --retries 5 --timeout 300 --no-cache-dir \
140180 kernels==0.10.3 \
141181 "sentencepiece>=0.1.99,<0.3" \
142182 tokenizers==0.22.1 \
143- instructlab-training==0.12.1 \
144- rhai-innovation-mini-trainer==0.3.0 \
145- training-hub==0.3.0 \
146183 trl==0.21.0 \
147184 deepspeed>=0.14.3 \
148185 async-timeout==4.0.3 \
@@ -151,12 +188,79 @@ RUN pip install --retries 5 --timeout 300 --no-cache-dir \
151188 huggingface-hub==0.34.4 \
152189 mlflow==3.4.0 \
153190 psutil==7.0.0 \
191+ training-hub==0.3.0 \
192+ instructlab-training==0.12.1 \
193+ rhai-innovation-mini-trainer==0.3.0 \
154194 && chmod -R g+w /opt/app-root/lib/python3.12/site-packages \
155195 && fix-permissions /opt/app-root -P
196+ #
197+ # WARNING: Skipping `pip check` due to known kubernetes(urllib3<2.4) vs
198+ # training-hub(urllib3>=2.4) requirement mismatch. Re-enable once upstream
199+ # loosens kubernetes urllib3 bounds or kubeflow no longer pins 34.x.
200+ # RUN python -m pip check || (python -m pip freeze; exit 1)
201+ #
202+ # Numba diagnostics (helps catch llvmlite/LLVM/NumPy mismatches quickly)
203+ RUN python -m numba -s || (echo "[diagnostics] numba sysinfo failed" >&2; exit 1)
204+
205+ # Provide CUDA user-space libraries via pip, aligning with runtime for extension builds
206+ RUN python -m pip install --retries 5 --timeout 300 --no-cache-dir \
207+ nvidia-nccl-cu12==2.27.3 \
208+ nvidia-cublas-cu12==12.8.4.1 \
209+ nvidia-cuda-cupti-cu12==12.8.90 \
210+ nvidia-cuda-nvrtc-cu12==12.8.93 \
211+ nvidia-cuda-runtime-cu12==12.8.90 \
212+ nvidia-cudnn-cu12==9.10.2.21 \
213+ nvidia-cufft-cu12==11.3.3.83 \
214+ nvidia-cufile-cu12==1.13.1.3 \
215+ nvidia-curand-cu12==10.3.9.90 \
216+ nvidia-cusolver-cu12==11.7.3.90 \
217+ nvidia-cusparse-cu12==12.5.8.93 \
218+ nvidia-cusparselt-cu12==0.7.1 \
219+ nvidia-nvjitlink-cu12==12.8.93 \
220+ nvidia-nvtx-cu12==12.8.90 \
221+ && fix-permissions /opt/app-root -P
222+
223+ # Ensure cuDNN from pip is discoverable during source builds
224+ ENV LD_LIBRARY_PATH="/opt/app-root/lib/python3.12/site-packages/nvidia/cudnn/lib:${LD_LIBRARY_PATH}"
156225
157- # Deterministic 2-step: sub-dep first, then parent without deps (align with runtime)
158- RUN pip install --retries 5 --timeout 300 --no-cache-dir --no-build-isolation causal-conv1d==1.5.3.post1 && \
159-     pip install --retries 5 --timeout 300 --no-cache-dir --no-build-isolation mamba-ssm==2.2.6.post3 --no-deps && \
226+ # Deterministic 2-step with verbose logging for debugging builds.
227+ # 1) Install causal-conv1d; 2) build the mamba-ssm wheel under a heartbeat
228+ #    watchdog — a BuildKit here-doc replaces the previous fragile
229+ #    `bash -lc 'cat > … <<'\''BASH'\''…'` nested quoting; 3) install the wheel.
230+ RUN python -m pip install --no-cache-dir --no-build-isolation -vv --log /tmp/pip-causal.log causal-conv1d==1.5.3.post1
231+ RUN <<'BASH'
232+ #!/usr/bin/env bash
233+ set -euo pipefail
234+ echo "[diag] nproc=$(nproc)"; df -h /tmp || true
235+ cmake --version || true; ninja --version || true
236+ command -v nvcc && nvcc -V || true
237+ command -v g++ && g++ --version | head -n1 || true
238+ # Export the build knobs so the pip subprocess actually inherits them; the
239+ # old form prefixed CMAKE_*/MAX_JOBS to a diagnostic `echo`, so the real
240+ # wheel build never saw them.
241+ export CMAKE_GENERATOR=Ninja CMAKE_BUILD_PARALLEL_LEVEL="$(nproc)" MAX_JOBS="$(nproc)"
242+ export CMAKE_ARGS='-DCMAKE_VERBOSE_MAKEFILE=ON -DCMAKE_BUILD_TYPE=Release'
243+ export PYTORCH_NVCC_FLAGS="-v -lineinfo" FORCE_CUDA=1 TORCH_EXTENSIONS_VERBOSE=1
244+ build_log=/tmp/pip-mamba-ssm.log
245+ mkdir -p /tmp/wheels
246+ : > "$build_log"
247+ echo "[diag] starting wheel build at: $(date)"
248+ python -m pip wheel --no-cache-dir --no-build-isolation -vv \
249+   --log "$build_log" \
250+   mamba-ssm==2.2.6.post3 --no-deps -w /tmp/wheels &
251+ pid=$!
252+ # Heartbeat: log progress once a minute for up to 40 minutes so CI output
253+ # does not stall silently during the long CUDA extension compile.
254+ for i in $(seq 1 40); do
255+   sleep 60
256+   kill -0 "$pid" 2>/dev/null || break
257+   echo "[heartbeat] $(date): minute=$i"
258+   echo "[heartbeat] recent log:"
259+   tail -n 80 "$build_log" || true
260+   echo "[heartbeat] compiled objects so far: $(find /tmp -type f -name '*.o' | wc -l || true)"
261+   df -h /tmp || true
262+ done
263+ if kill -0 "$pid" 2>/dev/null; then
264+   echo "[timeout] wheel build exceeded 40 minutes; killing..."
265+   kill "$pid" || true; sleep 10; kill -9 "$pid" || true
266+   echo "--- pip mamba-ssm log (tail) ---"
267+   tail -n 1000 "$build_log" || true
268+   exit 1
269+ fi
270+ # Propagate a failed build's exit status and surface the log tail.
271+ wait "$pid" || { echo '--- pip mamba-ssm log (tail) ---'; tail -n 1000 "$build_log"; exit 1; }
272+ python -m pip install --no-cache-dir /tmp/wheels/*.whl
273+ rm -rf /tmp/wheels
274+ fix-permissions /opt/app-root -P
275+ BASH
161265
162266# Provide a POSIX entrypoint wrapper to choose behavior based on invocation