Skip to content

Commit 416687f

Browse files
Fix for sequence packing combined with sequence parallelism: pad the sequence length to a multiple of the tensor-parallel (TP) world size (#2574)
Co-authored-by: Jon Barker <19699370+jon-barker@users.noreply.github.com>
1 parent bcf07a2 commit 416687f

File tree

1 file changed

+25
-2
lines changed

1 file changed

+25
-2
lines changed

megatron/rl/rl_utils.py

Lines changed: 25 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@
2828
from megatron.core.num_microbatches_calculator import get_num_microbatches
2929
from megatron.core.optimizer import MegatronOptimizer
3030
from megatron.core.packed_seq_params import PackedSeqParams
31-
from megatron.core.parallel_state import get_tensor_model_parallel_src_rank
31+
from megatron.core.parallel_state import get_tensor_model_parallel_src_rank, get_tensor_model_parallel_world_size
3232
from megatron.core.rerun_state_machine import RerunDataIterator
3333
from megatron.core.transformer.cuda_graphs import _CudagraphGlobalRecord
3434
from megatron.core.transformer.utils import toggle_cuda_graphs
@@ -793,7 +793,30 @@ def get_logprobs(model, tokens, position_ids, attention_mask, no_grad=False, pac
793793
# No real tokens, skip packed path
794794
packed_seq_params = None
795795
else:
796-
# Slice inputs to remove padding
796+
# When sequence parallelism is enabled, the sequence length must be
797+
# divisible by the tensor parallel world size for reduce-scatter ops
798+
if model.config.sequence_parallel:
799+
tp_world_size = get_tensor_model_parallel_world_size()
800+
if actual_len % tp_world_size != 0:
801+
actual_len = ((actual_len + tp_world_size - 1) // tp_world_size) * tp_world_size
802+
# Update cu_seqlens to match the padded length.
803+
# The last entry of cu_seqlens must equal the tensor's sequence dimension.
804+
# Without this, TE attention/rotary ops see mismatched dimensions.
805+
if packed_seq_params.cu_seqlens_q[-1].item() != actual_len:
806+
# Clone to avoid modifying cached params
807+
new_cu_seqlens = packed_seq_params.cu_seqlens_q.clone()
808+
new_cu_seqlens[-1] = actual_len
809+
packed_seq_params = PackedSeqParams(
810+
qkv_format=packed_seq_params.qkv_format,
811+
cu_seqlens_q=new_cu_seqlens,
812+
cu_seqlens_kv=new_cu_seqlens,
813+
cu_seqlens_q_padded=packed_seq_params.cu_seqlens_q_padded,
814+
cu_seqlens_kv_padded=packed_seq_params.cu_seqlens_kv_padded,
815+
max_seqlen_q=packed_seq_params.max_seqlen_q,
816+
max_seqlen_kv=packed_seq_params.max_seqlen_kv,
817+
)
818+
819+
# Slice inputs to remove padding (or pad if needed for SP alignment)
797820
# dimension 0 is batch, with seq packing BS=1
798821
tokens = tokens[:, :actual_len]
799822
position_ids = position_ids[:, :actual_len]

0 commit comments

Comments (0)