Commit b560ad6
Parent: cbabdae

Add cpath for mpi env var fix

Signed-off-by: Faraz Khoubsirat <58580514+farazkh80@users.noreply.github.com>

File tree: 3 files changed, +7 −2 lines


docker/Dockerfile.multi

Lines changed: 5 additions & 0 deletions

@@ -31,6 +31,11 @@ FROM base AS devel
 # NB: PyTorch requires this to be < 1.0
 ENV PYTORCH_CUDA_ALLOC_CONF="garbage_collection_threshold:0.99999"
 
+# Set CUDA include path and Triton PTXAS path for Triton runtime compilation
+# Required for Triton to find CUDA headers when compiling cuda_utils.c on remote MPI nodes
+ENV CPATH=/usr/local/cuda/include
+ENV TRITON_PTXAS_PATH=/usr/local/cuda/bin/ptxas
+
 # Copy all installation scripts at once to reduce layers
 COPY docker/common/install.sh \
 docker/common/install_base.sh \
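The two `ENV` lines above work because `CPATH` is a standard GCC/Clang environment variable that the compiler treats as an implicit list of `-I` include directories, so Triton's runtime build of `cuda_utils.c` can resolve CUDA headers without explicit flags. A minimal sketch of that equivalence (not TensorRT-LLM or Triton code; variable names and paths mirror the Dockerfile values):

```python
import os

# Environment as the Dockerfile would set it (illustrative copy, not the real container env).
env = dict(os.environ)
env["CPATH"] = "/usr/local/cuda/include"
env["TRITON_PTXAS_PATH"] = "/usr/local/cuda/bin/ptxas"

# The explicit compiler flags a CPATH-aware compiler effectively derives:
cflags = ["-I" + p for p in env["CPATH"].split(os.pathsep) if p]
print(cflags)  # ['-I/usr/local/cuda/include']
```

On remote MPI ranks the compile step inherits this environment from the container image rather than from an interactive shell, which is why baking the variables into the image fixes the header lookup there.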

security_scanning/metadata.json

Lines changed: 1 addition & 1 deletion

@@ -1,4 +1,4 @@
 {
   "commit_hash": "05b5336ab6135e368157600da1d16b090ee9a00a",
   "timestamp": "2025-11-14T18:16:21Z"
-}
+}

tensorrt_llm/_torch/modules/linear.py

Lines changed: 1 addition & 1 deletion

@@ -74,7 +74,7 @@ def load_weight_shard(
 # For integrated GPU systems (e.g., DGX Spark), CPU and GPU share limited physical memory.
 # Avoiding device transfers reduces memory consumption and unnecessary data copies,
 # enabling support for larger models on memory-constrained systems.
-logger.warning(
+logger.debug(
 f"[load_weight_shard] Skipping device transfer from {weight.device} to {device} on integrated GPU to conserve shared memory."
 )
 device = weight.device
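The one-line change demotes the skip message from WARNING to DEBUG: on integrated-GPU systems the skipped transfer is expected behaviour, not a problem, so it should not surface at the default log level. A self-contained sketch of the surrounding logic (the function name, parameters, and string device names here are simplified stand-ins, not the actual `load_weight_shard` signature):

```python
import logging

logger = logging.getLogger("load_weight_shard_sketch")

def resolve_target_device(weight_device: str, device: str, is_integrated_gpu: bool) -> str:
    # On integrated-GPU systems (shared CPU/GPU memory) the shard stays on its
    # current device; log the skip at DEBUG because it is routine, not a warning.
    if is_integrated_gpu:
        logger.debug(
            f"[load_weight_shard] Skipping device transfer from {weight_device} "
            f"to {device} on integrated GPU to conserve shared memory."
        )
        return weight_device
    return device

print(resolve_target_device("cpu", "cuda:0", is_integrated_gpu=True))   # cpu
print(resolve_target_device("cpu", "cuda:0", is_integrated_gpu=False))  # cuda:0
```

With Python's `logging`, DEBUG records are dropped unless a handler is configured at that level, so routine multi-rank runs stay quiet while the message remains available for troubleshooting.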

0 commit comments