
Commit d19f4f5

Updated amax_sync test to set const weights based on rank (#451)
Signed-off-by: Kinjal Patel <[email protected]>
1 parent 4476f21 commit d19f4f5

File tree: 2 files changed (+17, -1 lines)

CHANGELOG.rst

Lines changed: 1 addition & 0 deletions
@@ -11,6 +11,7 @@ Model Optimizer Changelog (Linux)
 - Support PTQ and fakequant in vLLM for fast evaluation of arbitrary quantization formats. See ``examples/vllm_serve`` for more details.
 - Add support for ``nemotron-post-training-dataset-v2`` and ``nemotron-post-training-dataset-v1`` in ``examples/llm_ptq``. Default to a mix of ``cnn_dailymail`` and ``nemotron-post-training-dataset-v2`` (gated dataset accessed using ``HF_TOKEN`` environment variable) if no dataset is specified.
 - Allow specifying ``calib_seq`` in ``examples/llm_ptq`` to set the maximum sequence length for calibration.
+- Add support for MCore MoE PTQ/QAT/QAD.

 **Documentation**

tests/gpu/torch/quantization/plugins/test_megatron.py

Lines changed: 16 additions & 1 deletion
@@ -673,7 +673,22 @@ def _test_expert_model_parallel_amax_sync(
         num_moe_experts=8,
         transformer_impl="modelopt",
     )
-    prompt_tokens = torch.randint(0, model.vocab_size, (2, model.max_sequence_length)).cuda()
+
+    # Initialize ALL weights based on rank to produce different amax values
+    # across ranks that need synchronization
+    weight_idx = 0
+    for name, param in model.named_parameters():
+        # Skip embeddings and any parameters without 'weight' in the name
+        if "embedding" in name.lower() or "weight" not in name.lower():
+            continue
+
+        if param.requires_grad and param.dim() >= 2:  # Only weight matrices, not biases
+            # Different constant value based on rank and parameter index
+            const_val = 0.1 + (rank * 0.5) + (weight_idx * 0.05)
+            param.data.fill_(const_val)
+            weight_idx += 1
+
+    prompt_tokens = (torch.ones((2, model.max_sequence_length)) * 0.05 + rank * 0.5).cuda().long()

     # force all expert routing
     for module in model.modules():
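
The change above makes each rank's weights a different constant, so each rank calibrates a different per-tensor amax; a correct amax sync must then bring all ranks back to the same value. The snippet below is a minimal standalone sketch of that idea, not part of the commit and not the modelopt API: local_amax and sync_amax are hypothetical helpers, and the MAX all-reduce over the default process group stands in for the library's expert-parallel amax synchronization.

    # Run with: torchrun --nproc_per_node=2 amax_sync_sketch.py
    import torch
    import torch.distributed as dist


    def local_amax(weight: torch.Tensor) -> torch.Tensor:
        # Per-tensor absolute maximum, the statistic a quantizer calibrates from.
        return weight.abs().amax()


    def sync_amax(amax: torch.Tensor, group=None) -> torch.Tensor:
        # Hypothetical stand-in for amax synchronization: a MAX all-reduce so
        # every rank ends up with the same calibration scale.
        synced = amax.clone()
        dist.all_reduce(synced, op=dist.ReduceOp.MAX, group=group)
        return synced


    if __name__ == "__main__":
        dist.init_process_group("nccl")
        rank = dist.get_rank()
        torch.cuda.set_device(rank)

        # Mirror the test: a constant weight whose value depends on the rank,
        # so the local amax differs across ranks before synchronization.
        weight = torch.full((16, 16), 0.1 + rank * 0.5, device="cuda")
        amax = local_amax(weight)
        print(f"rank {rank}: local amax = {amax.item():.3f}")

        amax = sync_amax(amax)
        print(f"rank {rank}: synced amax = {amax.item():.3f}")  # identical on all ranks

        dist.destroy_process_group()

After the sync, every rank reports the largest of the per-rank amax values, which is what the updated test asserts indirectly by checking that quantizer amax buffers agree across expert-parallel ranks.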
