fix(smoothquant): fix NCCL timeout in DDP example sample generation

David Zheng · dzhengAP · commit fae564f92c27 · 2026-03-19T12:25:57.000-07:00
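dispatch_model and generate require all ranks to participate. Previously
they ran only under `if rank == 0`, so the other ranks never joined and the
run ended in an NCCL timeout.
Add dist.barrier() before generation, run dispatch_model and generate on
every rank, and only log the output on rank 0.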

Signed-off-by: David Zheng <dzheng@apple.com>
diff --git a/examples/quantization_w8a8_int8/smoothquant_ddp_example.py b/examples/quantization_w8a8_int8/smoothquant_ddp_example.py
@@ -115,12 +115,14 @@ def tokenize(sample):
 # ---------------------------------------------------------------------------
 # Sample generation (rank 0 only)
 # ---------------------------------------------------------------------------
+# Runs on every rank: dispatch_model/generate need all ranks; only logging is rank 0.
+dist.barrier()
+dispatch_model(model)
+sample = tokenizer("Hello my name is", return_tensors="pt")
+sample = {k: v.to(model.device) for k, v in sample.items()}
+output = model.generate(**sample, max_new_tokens=50)
 if rank == 0:
     logger.info("\n========== SAMPLE GENERATION ==========")
-    dispatch_model(model)
-    sample = tokenizer("Hello my name is", return_tensors="pt")
-    sample = {k: v.to(model.device) for k, v in sample.items()}
-    output = model.generate(**sample, max_new_tokens=50)
     logger.info(tokenizer.decode(output[0]))
     logger.info("========================================\n")