@@ -54,20 +54,25 @@ RUN dnf config-manager \
# Install the CUDA 12.8 toolchain and the native build prerequisites, resolving packages only
# from the CUDA and UBI 9 repositories (every other repo is disabled for this transaction).
# Afterwards, append the NVIDIA library directories to an ld.so.conf.d drop-in and drop the
# dnf caches so they do not end up in the image layer.
# NOTE(review): ldconfig is not run in this instruction — presumably the linker cache is
# refreshed later or at container start; confirm.
RUN dnf install -y --disablerepo="*" --enablerepo="cuda-rhel9-x86_64,ubi-9-appstream-rpms,ubi-9-baseos-rpms" \
        cuda-command-line-tools-12-8 \
        cuda-cudart-devel-12-8 \
        cuda-libraries-devel-12-8 \
        cuda-compat-12-8 \
        cuda-compiler-12-8 \
        cuda-nvcc-12-8-12.8.93-1 \
        gcc \
        gcc-c++ \
        make \
        python3-devel \
        cmake \
        git \
    && echo "/usr/local/nvidia/lib" >> /etc/ld.so.conf.d/nvidia.conf \
    && echo "/usr/local/nvidia/lib64" >> /etc/ld.so.conf.d/nvidia.conf \
    && dnf clean all \
    && rm -rf /var/cache/dnf/*
6671
# Ensure CUDA_HOME points to the toolkit and nvcc is discoverable, then sanity check nvcc.
# CUDA_HOME is defined in its own ENV instruction on purpose: variable substitution inside a
# single ENV instruction uses the values from *before* that instruction, so referencing
# ${CUDA_HOME} in the instruction that defines it would expand to the previous (possibly
# empty) value and yield paths such as "/bin" and "/lib64".
ENV CUDA_HOME=/usr/local/cuda
ENV PATH=/usr/local/nvidia/bin:${CUDA_HOME}/bin:${PATH} \
    LD_LIBRARY_PATH=/usr/local/nvidia/lib:/usr/local/nvidia/lib64:${CUDA_HOME}/lib64:${CUDA_HOME}/extras/CUPTI/lib64:${LD_LIBRARY_PATH} \
    TORCH_CUDA_ARCH_LIST="8.0;8.6;8.9;9.0"
7277
7378# NOTE: Optional build-time CUDA checks (remove if not needed for faster builds)
@@ -110,8 +115,8 @@ RUN pip install --retries 5 --timeout 300 --no-cache-dir numpy==2.3.3
# Install the build backends first, then the SDK straight from git. The SDK is installed with
# --no-build-isolation, so it is built against the packages already present in the environment.
# NOTE(review): "training-hub" looks like a branch name; if so, this install is not
# reproducible — consider pinning to a commit SHA.
RUN pip install --retries 5 --timeout 300 --no-cache-dir hatchling hatch-vcs
RUN pip install --retries 5 --timeout 300 --no-cache-dir --no-build-isolation "git+https://github.com/briangallagher/sdk@training-hub"
112117
# Ensure modern build tooling for extensions.
# ninja is provided via pip because the RHEL/UBI repo ninja-build package may be unavailable.
RUN pip install --retries 5 --timeout 300 --no-cache-dir -U pip setuptools wheel ninja cmake
115120
116121# Install remaining runtime packages (resolved from default PyPI), including FlashAttention
117122# Note: We intentionally do not use a Pipfile/lock here to avoid mixing resolvers with the base (uv lock),
@@ -154,9 +159,37 @@ RUN pip install --retries 5 --timeout 300 --no-cache-dir \
154159 && chmod -R g+w /opt/app-root/lib/python3.12/site-packages \
155160 && fix-permissions /opt/app-root -P
156161
157- # Deterministic 2-step: sub-dep first, then parent without deps (align with runtime)
158- RUN pip install --retries 5 --timeout 300 --no-cache-dir --no-build-isolation causal-conv1d==1.5.3.post1 && \
159- pip install --retries 5 --timeout 300 --no-cache-dir --no-build-isolation mamba-ssm==2.2.6.post3 --no-deps && \
# Provide CUDA user-space libraries via pip, aligning with runtime for extension builds.
# Every package is pinned to an exact version; permissions under /opt/app-root are repaired
# afterwards with fix-permissions.
RUN pip install --retries 5 --timeout 300 --no-cache-dir \
        nvidia-nccl-cu12==2.27.3 \
        nvidia-cublas-cu12==12.8.4.1 \
        nvidia-cuda-cupti-cu12==12.8.90 \
        nvidia-cuda-nvrtc-cu12==12.8.93 \
        nvidia-cuda-runtime-cu12==12.8.90 \
        nvidia-cudnn-cu12==9.10.2.21 \
        nvidia-cufft-cu12==11.3.3.83 \
        nvidia-cufile-cu12==1.13.1.3 \
        nvidia-curand-cu12==10.3.9.90 \
        nvidia-cusolver-cu12==11.7.3.90 \
        nvidia-cusparse-cu12==12.5.8.93 \
        nvidia-cusparselt-cu12==0.7.1 \
        nvidia-nvjitlink-cu12==12.8.93 \
        nvidia-nvtx-cu12==12.8.90 \
    && fix-permissions /opt/app-root -P
179+
# Ensure cuDNN from pip is discoverable during source builds: prepend the pip-installed
# cuDNN library directory to the existing LD_LIBRARY_PATH.
ENV LD_LIBRARY_PATH="/opt/app-root/lib/python3.12/site-packages/nvidia/cudnn/lib:${LD_LIBRARY_PATH}"
182+
# Deterministic 2-step build with verbose logging for debugging:
#   1) install the sub-dependency causal-conv1d,
#   2) build the mamba-ssm wheel without dependencies, then install that local wheel.
# Each step has its own failure handler, so the tail of the log that belongs to the failing
# step is printed before the build aborts. The wheel is installed with --no-deps as well;
# otherwise pip would resolve mamba-ssm's dependencies again at install time and defeat the
# --no-deps used when building it. Wheels and pip logs are removed on success so they do
# not stay in the image layer.
RUN { pip install --retries 5 --timeout 300 --no-cache-dir --no-build-isolation -vv \
          --log /tmp/pip-causal.log \
          causal-conv1d==1.5.3.post1 \
        || { echo '--- pip causal-conv1d log (tail) ---'; tail -n 500 /tmp/pip-causal.log; exit 1; }; } \
    && { CMAKE_GENERATOR=Ninja CMAKE_ARGS="-DCMAKE_VERBOSE_MAKEFILE=ON" \
         pip wheel --retries 5 --timeout 300 --no-cache-dir --no-build-isolation -vv \
          --log /tmp/pip-mamba-ssm.log \
          mamba-ssm==2.2.6.post3 --no-deps -w /tmp/wheels \
        || { echo '--- pip mamba-ssm log (tail) ---'; tail -n 500 /tmp/pip-mamba-ssm.log; exit 1; }; } \
    && pip install --no-cache-dir --no-deps /tmp/wheels/*.whl \
    && rm -rf /tmp/wheels /tmp/pip-causal.log /tmp/pip-mamba-ssm.log \
    && fix-permissions /opt/app-root -P
161194
162195# Provide a POSIX entrypoint wrapper to choose behavior based on invocation
0 commit comments