Commit 72ed4b2
[Distributed] Fix broadcast_module_parameter for CPU-resident models
Use each rank's own GPU device for the NCCL broadcast instead of the module's execution device, which may be CPU or shared across ranks when the model is not GPU-resident.

Signed-off-by: Itay Etlis <itayetlis@gmail.com>
1 parent: 44d65e3

File tree

1 file changed (+3, −2 lines)

src/llmcompressor/utils/distributed.py

Lines changed: 3 additions & 2 deletions
```diff
@@ -6,7 +6,7 @@
 from typing import Dict, List, Tuple
 
 import torch
-from compressed_tensors.utils import get_execution_device, update_offload_parameter
+from compressed_tensors.utils import update_offload_parameter
 from loguru import logger
 from torch import distributed as dist
 from torch.nn import Module
@@ -147,7 +147,8 @@ def broadcast_module_parameter(
     if param is None:
         return
 
-    device = get_execution_device(module)
+    # NCCL requires each rank to use its own GPU
+    device = torch.device(f"cuda:{dist.get_rank()}")
     tensor = param.data.to(device)
     dist.broadcast(tensor, src=src_rank)
     update_offload_parameter(module, param_name, tensor)
```
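The fix can be sketched outside the diff context. The snippet below is a minimal, hypothetical illustration (the lookup via `Module.get_parameter` and the final copy-back stand in for the real helpers; only the device choice and the broadcast mirror the commit): with the NCCL backend, each rank must drive its own GPU, so the broadcast buffer is staged on `cuda:<rank>` even when the module itself lives on CPU.

```python
def nccl_device_for_rank(rank: int) -> str:
    """NCCL requires every rank to use its own GPU, so the broadcast
    buffer must live on cuda:<rank> regardless of where the module lives."""
    return f"cuda:{rank}"


def broadcast_module_parameter_sketch(module, param_name: str, src_rank: int = 0) -> None:
    # torch is imported lazily so the sketch stays importable without a GPU stack.
    import torch
    from torch import distributed as dist

    # Hypothetical parameter lookup; the real function's retrieval may differ.
    param = module.get_parameter(param_name)
    if param is None:
        return

    # The heart of the fix: target this rank's own GPU, not the module's
    # execution device (which may be CPU or shared across ranks).
    device = torch.device(nccl_device_for_rank(dist.get_rank()))
    tensor = param.data.to(device)          # stage on this rank's GPU
    dist.broadcast(tensor, src=src_rank)    # every rank receives src_rank's copy
    # Write back to wherever the parameter actually lives (possibly CPU);
    # this stands in for compressed_tensors' update_offload_parameter.
    param.data.copy_(tensor.to(param.data.device))
```

Before the fix, a CPU-resident module would hand NCCL a CPU tensor (or several ranks would hand it the same device), which NCCL rejects; staging on each rank's own `cuda:<rank>` satisfies NCCL's one-GPU-per-rank requirement.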
