3 files changed: +7 -2 lines changed
File tree: tensorrt_llm/_torch/modules
Original file line number | Diff line number | Diff line change
@@ -31,6 +31,11 @@ FROM base AS devel
3131# NB: PyTorch requires this to be < 1.0
3232ENV PYTORCH_CUDA_ALLOC_CONF="garbage_collection_threshold:0.99999"
3333
34+ # Set CUDA include path and Triton PTXAS path for Triton runtime compilation
35+ # Required for Triton to find CUDA headers when compiling cuda_utils.c on remote MPI nodes
36+ ENV CPATH=/usr/local/cuda/include
37+ ENV TRITON_PTXAS_PATH=/usr/local/cuda/bin/ptxas
38+
3439# Copy all installation scripts at once to reduce layers
3540COPY docker/common/install.sh \
3641 docker/common/install_base.sh \
Original file line number | Diff line number | Diff line change
11{
22 "commit_hash" : " 05b5336ab6135e368157600da1d16b090ee9a00a" ,
33 "timestamp" : " 2025-11-14T18:16:21Z"
4- }
4+ }
Original file line number | Diff line number | Diff line change
@@ -74,7 +74,7 @@ def load_weight_shard(
7474 # For integrated GPU systems (e.g., DGX Spark), CPU and GPU share limited physical memory.
7575 # Avoiding device transfers reduces memory consumption and unnecessary data copies,
7676 # enabling support for larger models on memory-constrained systems.
77- logger .warning (
77+ logger .debug (
7878 f"[load_weight_shard] Skipping device transfer from { weight .device } to { device } on integrated GPU to conserve shared memory."
7979 )
8080 device = weight .device
You can’t perform that action at this time.
0 commit comments