Skip to content

Commit 76795cb

Browse files
authored
[misc] megatron grpo support non-padding-free (#7218)
1 parent 8c4c027 commit 76795cb

File tree

3 files changed

+158
-78
lines changed

3 files changed

+158
-78
lines changed

swift/megatron/trainers/grpo_trainer.py

Lines changed: 102 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -297,30 +297,49 @@ def _get_encoded_batch(rollout_batch):
297297
encoded_list = [template.encode(data, return_length=True) for data in rollout_batch]
298298
encoded_batch = to_device(
299299
template.data_collator(encoded_list, padding_to=get_padding_to(args)), self.device)
300-
if 'cu_seq_lens_q' in encoded_batch:
301-
cu_seq_lens_q = encoded_batch['cu_seq_lens_q']
302-
else:
303-
cu_seq_lens_q = get_packed_seq_params(encoded_batch['position_ids'])['cu_seq_lens_q']
304-
seq_lengths = cu_seq_lens_q[1:] - cu_seq_lens_q[:-1]
305300

306301
labels = encoded_batch['labels']
307302
batch_size = len(rollout_batch)
308-
max_seq_len = seq_lengths.max().item()
309-
assert self.template.padding_free
310303

311304
truncated_mask = torch.tensor([b['is_truncated'] for b in rollout_batch],
312305
dtype=torch.bool,
313306
device=self.device)
314307

315-
# completion_mask in rmpad format [1, total_tokens]
316-
completion_mask_rmpad = (labels != -100).float()
317-
completion_mask, _ = pad_logps_back_to_batch(
318-
logps_rmpad=completion_mask_rmpad,
319-
logits_to_keep=max_seq_len,
320-
batch_size=batch_size,
321-
seq_lengths=seq_lengths,
322-
pad_value=0.0)
323-
completion_mask = completion_mask.bool()
308+
if self.template.padding_free:
309+
# In padding_free mode, labels shape is [1, total_seq_len] (rmpad format)
310+
# Calculate seq_lengths from cu_seq_lens or position_ids
311+
if 'cu_seq_lens_q' in encoded_batch:
312+
cu_seq_lens_q = encoded_batch['cu_seq_lens_q']
313+
else:
314+
cu_seq_lens_q = get_packed_seq_params(encoded_batch['position_ids'])['cu_seq_lens_q']
315+
seq_lengths = cu_seq_lens_q[1:] - cu_seq_lens_q[:-1]
316+
max_seq_len = seq_lengths.max().item()
317+
318+
# completion_mask in rmpad format [1, total_tokens]
319+
completion_mask_rmpad = (labels != -100).float()
320+
completion_mask, _ = pad_logps_back_to_batch(
321+
logps_rmpad=completion_mask_rmpad,
322+
logits_to_keep=max_seq_len,
323+
batch_size=batch_size,
324+
seq_lengths=seq_lengths,
325+
pad_value=0.0)
326+
completion_mask = completion_mask.bool()
327+
else:
328+
# In non-padding_free mode, labels shape is [batch_size, seq_len] (batch format)
329+
# Calculate seq_lengths from attention_mask
330+
attention_mask = encoded_batch.get('attention_mask')
331+
if attention_mask is not None:
332+
# attention_mask shape: [batch_size, seq_len] or [batch_size, 1, 1, seq_len]
333+
if attention_mask.dim() == 4:
334+
attention_mask = attention_mask[:, 0, 0, :]
335+
seq_lengths = attention_mask.sum(dim=-1).to(torch.int64)
336+
else:
337+
# Fallback: assume full sequence length for each sample
338+
seq_lengths = torch.full((batch_size, ), labels.shape[-1], dtype=torch.int64, device=self.device)
339+
max_seq_len = labels.shape[-1]
340+
341+
# completion_mask is already [batch_size, seq_len] in non-padding_free mode
342+
completion_mask = (labels != -100)
324343

325344
encoded_batch.update({
326345
'completion_mask': completion_mask, # [batch_size, max_seq_len]
@@ -400,10 +419,10 @@ def _get_encoded_batch(rollout_batch):
400419

401420
if self.loss_type in ['cispo', 'dapo']:
402421
# Calculate num_items_in_batch
403-
# Count tokens from all mini_batch_data (this includes gathered data from rollout_group)
404-
total_token_count = sum(batch_data['seq_lengths'].sum().item() if self.template.
405-
padding_free else batch_data['completion_mask'].sum().item()
406-
for batch_data in mini_batch_data)
422+
# Count completion tokens from all mini_batch_data (this includes gathered data from rollout_group)
423+
# Use completion_mask.sum() for both padding_free and non-padding_free modes
424+
# since we want the count of actual completion tokens, not sequence lengths
425+
total_token_count = sum(batch_data['completion_mask'].sum().item() for batch_data in mini_batch_data)
407426

408427
# All-reduce across all ranks
409428
total_token_count_tensor = torch.tensor(total_token_count, dtype=torch.int, device=self.device)
@@ -873,22 +892,34 @@ def _maybe_compute_logps(self, batch: Dict[str, Any]) -> Dict[str, Any]:
873892
with torch.no_grad(), self.null_ref_context() as ref_models:
874893
assert len(ref_models) == 1, 'GRPO currently does not support VPP.'
875894
ref_model = ref_models[0]
876-
ref_per_token_logps_rmpad = self.model_forward(
895+
ref_per_token_logps_raw = self.model_forward(
877896
ref_model, iter([deepcopy(inputs)]), no_grad=True, per_token=True)['logps']
878-
ref_per_token_logps, _ = pad_logps_back_to_batch(
879-
logps_rmpad=ref_per_token_logps_rmpad,
880-
logits_to_keep=max_seq_len,
881-
batch_size=batch_size,
882-
seq_lengths=seq_lengths)
897+
if self.template.padding_free:
898+
# In padding_free mode, logps are in rmpad format [1, total_tokens]
899+
# Pad to batch format [batch_size, max_seq_len]
900+
ref_per_token_logps, _ = pad_logps_back_to_batch(
901+
logps_rmpad=ref_per_token_logps_raw,
902+
logits_to_keep=max_seq_len,
903+
batch_size=batch_size,
904+
seq_lengths=seq_lengths)
905+
else:
906+
# In non-padding_free mode, logps are already in batch format [batch_size, seq_len]
907+
ref_per_token_logps = ref_per_token_logps_raw
883908
batch['ref_per_token_logps'] = ref_per_token_logps
884909

885-
old_per_token_logps_rmpad = self.model_forward(
910+
old_per_token_logps_raw = self.model_forward(
886911
self.unwrapped_models[0], iter([deepcopy(inputs)]), no_grad=True, per_token=True)['logps']
887-
old_per_token_logps, _ = pad_logps_back_to_batch(
888-
logps_rmpad=old_per_token_logps_rmpad,
889-
logits_to_keep=max_seq_len,
890-
batch_size=batch_size,
891-
seq_lengths=seq_lengths)
912+
if self.template.padding_free:
913+
# In padding_free mode, logps are in rmpad format [1, total_tokens]
914+
# Pad to batch format [batch_size, max_seq_len]
915+
old_per_token_logps, _ = pad_logps_back_to_batch(
916+
logps_rmpad=old_per_token_logps_raw,
917+
logits_to_keep=max_seq_len,
918+
batch_size=batch_size,
919+
seq_lengths=seq_lengths)
920+
else:
921+
# In non-padding_free mode, logps are already in batch format [batch_size, seq_len]
922+
old_per_token_logps = old_per_token_logps_raw
892923
batch['old_per_token_logps'] = old_per_token_logps
893924

894925
return batch
@@ -985,7 +1016,16 @@ def build_pretraining_data_loader(*_args, **kwargs):
9851016
def forward_step(self, data_iterator, model):
9861017
# train_batch_size
9871018
# return: output_tensor, loss_func
988-
data = self.get_batch(data_iterator)
1019+
data = next(data_iterator)
1020+
advantages = data.pop('advantages')
1021+
truncated_mask = data.pop('truncated_mask')
1022+
seq_lengths = data.pop('seq_lengths')
1023+
data = self._prepare_batch(data)
1024+
data.update({
1025+
'advantages': advantages,
1026+
'truncated_mask': truncated_mask,
1027+
'seq_lengths': seq_lengths,
1028+
})
9891029
data.pop('loss_scale', None)
9901030
inputs = self._prepare_model_inputs(data)
9911031

@@ -995,29 +1035,36 @@ def forward_step(self, data_iterator, model):
9951035

9961036
@profiling_decorator
9971037
def loss_func(self, output_tensor: torch.Tensor, data: Dict[str, Any]):
1038+
args = get_args()
9981039
# Get pre-padded data in batch format [batch_size, max_seq_len]
9991040
advantages = data['advantages'] # [batch_size]
10001041
labels = data['labels']
10011042
completion_mask = data['completion_mask'] # [batch_size, max_seq_len]
1002-
packed_seq_params = data['packed_seq_params']
1043+
packed_seq_params = data.get('packed_seq_params')
10031044
truncated_mask = data['truncated_mask'] # [batch_size]
10041045
seq_lengths = data['seq_lengths'] # [batch_size]
10051046
max_seq_len = completion_mask.shape[1]
10061047
micro_batch_size = self.micro_batch_size
10071048

1008-
# Use full sequence lengths directly (get_logps returns full sequences in CP mode)
1009-
lengths = packed_seq_params.cu_seqlens_q[1:micro_batch_size
1010-
+ 1] - packed_seq_params.cu_seqlens_q[:micro_batch_size]
1011-
1012-
# get_logps with per_token=True returns rmpad format [1, total_tokens]
1013-
# Pad to batch format [batch_size, max_seq_len]
1014-
per_token_logps_rmpad = self.get_logps(
1015-
output_tensor, labels, packed_seq_params, packed_seq_params.num_samples, per_token=True)
1016-
per_token_logps, _ = pad_logps_back_to_batch(
1017-
logps_rmpad=per_token_logps_rmpad,
1018-
logits_to_keep=max_seq_len,
1019-
batch_size=micro_batch_size,
1020-
seq_lengths=seq_lengths)
1049+
if args.padding_free:
1050+
# Use full sequence lengths directly (get_logps returns full sequences in CP mode)
1051+
lengths = packed_seq_params.cu_seqlens_q[1:micro_batch_size
1052+
+ 1] - packed_seq_params.cu_seqlens_q[:micro_batch_size]
1053+
1054+
# get_logps with per_token=True returns rmpad format [1, total_tokens]
1055+
# Pad to batch format [batch_size, max_seq_len]
1056+
per_token_logps_rmpad = self.get_logps(
1057+
output_tensor, labels, packed_seq_params, packed_seq_params.num_samples, per_token=True)
1058+
per_token_logps, _ = pad_logps_back_to_batch(
1059+
logps_rmpad=per_token_logps_rmpad,
1060+
logits_to_keep=max_seq_len,
1061+
batch_size=micro_batch_size,
1062+
seq_lengths=seq_lengths)
1063+
else:
1064+
# In non-padding_free mode, get_logps with per_token=True returns [batch_size, seq_len]
1065+
# No need to pad, already in batch format
1066+
lengths = seq_lengths
1067+
per_token_logps = self.get_logps(output_tensor, labels, packed_seq_params, micro_batch_size, per_token=True)
10211068

10221069
# Get pre-padded ref/old/rollout logps from data
10231070
ref_per_token_logps = data.get('ref_per_token_logps') # [batch_size, max_seq_len] or None
@@ -1256,13 +1303,19 @@ def model_forward(self, model, data_iterator, no_grad=True, per_token=False):
12561303
with self.stimer(bdata=True):
12571304
data = self.get_batch(data_iterator)
12581305
data.pop('loss_scale', None)
1306+
input_ids = data.get('input_ids')
12591307
labels = data.get('labels')
12601308
context = torch.no_grad() if no_grad else nullcontext()
12611309
with context:
12621310
output_tensor = forward_step_helper(model, data)
1263-
packed_seq_params = data['packed_seq_params']
1311+
# packed_seq_params only exists in padding_free mode
1312+
packed_seq_params = data.get('packed_seq_params')
1313+
if packed_seq_params is not None:
1314+
num_samples = packed_seq_params.num_samples
1315+
else:
1316+
num_samples = input_ids.shape[0] if input_ids is not None else labels.shape[0]
12641317
data['logps'] = None if labels is None else self.get_logps(
1265-
output_tensor, labels, data['packed_seq_params'], packed_seq_params.num_samples, per_token=per_token)
1318+
output_tensor, labels, packed_seq_params, num_samples, per_token=per_token)
12661319
return data
12671320

12681321
def inputs2requests(self, inputs: Union[DataType, List[RolloutInferRequest]]) -> List[RolloutInferRequest]:

swift/megatron/trainers/rlhf_mixin.py

Lines changed: 54 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -86,12 +86,12 @@ def _postprocess_packed_tensor_cp(self, tensor, packed_seq_params, num_samples):
8686
Works for both logps (float) and masks (bool/int).
8787
8888
Args:
89-
tensor: [1, packed_len/cp_size] - CP-split tensor (any dtype)
90-
packed_seq_params: PackedSeqParams object
89+
tensor: [1, packed_len/cp_size] in padding_free mode, or [batch_size, seq_len/cp_size] otherwise
90+
packed_seq_params: PackedSeqParams object (None in non-padding_free mode)
9191
num_samples: Number of samples in the batch
9292
9393
Returns:
94-
output_full: [1, packed_len] - Full sequence tensor
94+
output_full: [1, packed_len] in padding_free mode, or [batch_size, seq_len] otherwise
9595
"""
9696
args = get_args()
9797
cp_size = args.context_parallel_size
@@ -102,36 +102,61 @@ def _postprocess_packed_tensor_cp(self, tensor, packed_seq_params, num_samples):
102102
torch.distributed.all_gather(output_list, tensor.contiguous(), group=mpu.get_context_parallel_group())
103103
output_list[cp_rank] = tensor
104104

105-
# Reconstruct full sequence
106-
# Shape: [1, packed_len/cp_size] -> [1, packed_len]
107-
cu_seqlens_full = packed_seq_params.cu_seqlens_q
108-
cu_seqlens_cp = cu_seqlens_full // cp_size
105+
if packed_seq_params is not None:
106+
# padding_free mode: [1, packed_len/cp_size] -> [1, packed_len]
107+
cu_seqlens_full = packed_seq_params.cu_seqlens_q
108+
cu_seqlens_cp = cu_seqlens_full // cp_size
109109

110-
# Calculate total packed length
111-
total_packed_len = cu_seqlens_full[num_samples].item()
112-
output_full = tensor.new_zeros(1, total_packed_len)
110+
# Calculate total packed length
111+
total_packed_len = cu_seqlens_full[num_samples].item()
112+
output_full = tensor.new_zeros(1, total_packed_len)
113113

114-
# Reconstruct each sequence
115-
for i in range(num_samples):
116-
start_full = cu_seqlens_full[i].item()
117-
end_full = cu_seqlens_full[i + 1].item()
118-
seq_len = end_full - start_full
114+
# Reconstruct each sequence
115+
for i in range(num_samples):
116+
start_full = cu_seqlens_full[i].item()
117+
end_full = cu_seqlens_full[i + 1].item()
118+
seq_len = end_full - start_full
119+
120+
# Length of each chunk after CP split
121+
chunk_len = seq_len // cp_size
122+
half_chunk = chunk_len // 2
123+
124+
# Concatenate from each CP rank's output (load-balanced split)
125+
for j in range(cp_size):
126+
o = output_list[j][0]
127+
start_cp = cu_seqlens_cp[i].item()
128+
129+
# Get two half chunks (CP's load-balanced split)
130+
o0 = o[start_cp:start_cp + half_chunk]
131+
o1 = o[start_cp + half_chunk:start_cp + chunk_len]
132+
133+
# Place back to full sequence
134+
output_full[0, start_full + j * half_chunk:start_full + (j + 1) * half_chunk] = o0
135+
output_full[0, end_full - (j + 1) * half_chunk:end_full - j * half_chunk] = o1
136+
else:
137+
# non-padding_free mode: [batch_size, seq_len/cp_size] -> [batch_size, seq_len]
138+
# Each CP rank has chunks split with load-balanced pattern (2*cp_size chunks)
139+
batch_size = tensor.shape[0]
140+
seq_len_per_cp = tensor.shape[1]
141+
full_seq_len = seq_len_per_cp * cp_size
119142

120-
# Length of each chunk after CP split
121-
chunk_len = seq_len // cp_size
122-
half_chunk = chunk_len // 2
143+
output_full = tensor.new_zeros(batch_size, full_seq_len)
123144

124-
# Concatenate from each CP rank's output (load-balanced split)
125-
for j in range(cp_size):
126-
o = output_list[j][0]
127-
start_cp = cu_seqlens_cp[i].item()
145+
# Each CP rank j holds chunks j and (2*cp_size - j - 1) from the original 2*cp_size split
146+
# Reconstruct the full sequence by placing chunks back in correct positions
147+
chunk_len = full_seq_len // (2 * cp_size)
128148

129-
# Get two half chunks (CP's load-balanced split)
130-
o0 = o[start_cp:start_cp + half_chunk]
131-
o1 = o[start_cp + half_chunk:start_cp + chunk_len]
132-
133-
# Place back to full sequence
134-
output_full[0, start_full + j * half_chunk:start_full + (j + 1) * half_chunk] = o0
135-
output_full[0, end_full - (j + 1) * half_chunk:end_full - j * half_chunk] = o1
149+
for j in range(cp_size):
150+
o = output_list[j] # [batch_size, seq_len_per_cp]
151+
# This rank holds 2 chunks: chunk j and chunk (2*cp_size - j - 1)
152+
half_len = seq_len_per_cp // 2
153+
o0 = o[:, :half_len] # First half -> chunk j
154+
o1 = o[:, half_len:] # Second half -> chunk (2*cp_size - j - 1)
155+
156+
# Place chunk j at position j * chunk_len
157+
output_full[:, j * chunk_len:(j + 1) * chunk_len] = o0
158+
# Place chunk (2*cp_size - j - 1) at position (2*cp_size - j - 1) * chunk_len
159+
reverse_idx = 2 * cp_size - j - 1
160+
output_full[:, reverse_idx * chunk_len:(reverse_idx + 1) * chunk_len] = o1
136161

137162
return output_full

swift/megatron/utils/utils.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -287,6 +287,8 @@ def forward_step_helper(model, inputs, dtype=None):
287287
args = get_args()
288288
if mpu.is_pipeline_first_stage():
289289
micro_batch_size = 1 # use qkv_format 'thd'
290+
if not args.padding_free:
291+
micro_batch_size = args.micro_batch_size
290292
seq_length = inputs['position_ids'].shape[-1]
291293
if args.sequence_parallel:
292294
seq_length //= mpu.get_tensor_model_parallel_world_size()

0 commit comments

Comments (0)