@@ -140,7 +140,7 @@ def dict_to_config(
 
 
 def mcore_version_higher_than(target_version: str):
-    """Check if megatron-core is least this version."""
+    """Check if megatron-core is greater than this version."""
     return Version(megatron.core.__version__) > Version(target_version)
 
 
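Note on the hunk above: the docstring fix matters because the check is strictly greater-than, so a build exactly at `target_version` returns False. A minimal sketch of that semantics, assuming `Version` here is `packaging.version.Version` (the import is not shown in this diff):

```python
from packaging.version import Version

# Strict comparison: equal versions are NOT "higher than".
assert Version("0.9.0") > Version("0.8.0")
assert not Version("0.8.0") > Version("0.8.0")
```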
@@ -239,13 +239,13 @@ def set_multi_step_attention_mask(attn_mask, step):
     =======================================================================================================================
     """  # noqa: E501
     s = attn_mask.shape[-1]
-    for iter in range(2, step + 1):
-        # iter starts from 2nd step
+    for step_idx in range(2, step + 1):
+        # step_idx starts from 2nd step
         mask_0 = attn_mask.clone().detach()
-        mask_0[:, :, iter - 2, :] = True
+        mask_0[:, :, step_idx - 2, :] = True
         mask_0[:, :, :, :-1] = mask_0[:, :, :, 1:]
         mask_1 = attn_mask.new_ones(attn_mask.shape[0], attn_mask.shape[1], s, s).bool()
-        for i in range(iter - 1, s - 1):
+        for i in range(step_idx - 1, s - 1):
             mask_1[:, :, i, i] = False
 
         attn_mask = torch.cat((mask_0, mask_1), dim=-1)
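Note on the hunk above: the rename from `iter` also avoids shadowing Python's builtin `iter`. As a sanity check of the behavior, below is a standalone copy of the patched loop (the wrapper name `multi_step_mask_demo` and the True-means-masked convention are assumptions for illustration, not this module's API). Each pass appends an `s`-wide block, so the mask widens from `[b, h, s, s]` to `[b, h, s, s * step]`:

```python
import torch

def multi_step_mask_demo(attn_mask: torch.Tensor, step: int) -> torch.Tensor:
    """Standalone copy of the loop above, for illustration only."""
    s = attn_mask.shape[-1]
    for step_idx in range(2, step + 1):
        # Shift the accumulated mask one column left, fully masking row step_idx - 2.
        mask_0 = attn_mask.clone().detach()
        mask_0[:, :, step_idx - 2, :] = True
        mask_0[:, :, :, :-1] = mask_0[:, :, :, 1:]
        # Fresh s x s block; only diagonal entries of rows step_idx-1 .. s-2 stay visible.
        mask_1 = attn_mask.new_ones(attn_mask.shape[0], attn_mask.shape[1], s, s).bool()
        for i in range(step_idx - 1, s - 1):
            mask_1[:, :, i, i] = False
        attn_mask = torch.cat((mask_0, mask_1), dim=-1)
    return attn_mask

# Start from a 4-token causal mask (True = masked out) and expand it for 3 steps.
causal = torch.triu(torch.ones(1, 1, 4, 4, dtype=torch.bool), diagonal=1)
print(multi_step_mask_demo(causal, step=3).shape)  # torch.Size([1, 1, 4, 12])
```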