@@ -86,12 +86,12 @@ def _postprocess_packed_tensor_cp(self, tensor, packed_seq_params, num_samples):
8686 Works for both logps (float) and masks (bool/int).
8787
8888 Args:
89- tensor: [1, packed_len/cp_size] - CP-split tensor (any dtype)
90- packed_seq_params: PackedSeqParams object
89+ tensor: [1, packed_len/cp_size] in padding_free mode, or [batch_size, seq_len/cp_size] otherwise
90+ packed_seq_params: PackedSeqParams object (None in non-padding_free mode)
9191 num_samples: Number of samples in the batch
9292
9393 Returns:
94- output_full: [1, packed_len] - Full sequence tensor
94+ output_full: [1, packed_len] in padding_free mode, or [batch_size, seq_len] otherwise
9595 """
9696 args = get_args ()
9797 cp_size = args .context_parallel_size
@@ -102,36 +102,61 @@ def _postprocess_packed_tensor_cp(self, tensor, packed_seq_params, num_samples):
102102 torch .distributed .all_gather (output_list , tensor .contiguous (), group = mpu .get_context_parallel_group ())
103103 output_list [cp_rank ] = tensor
104104
105- # Reconstruct full sequence
106- # Shape : [1, packed_len/cp_size] -> [1, packed_len]
107- cu_seqlens_full = packed_seq_params .cu_seqlens_q
108- cu_seqlens_cp = cu_seqlens_full // cp_size
105+ if packed_seq_params is not None :
106+            # padding_free mode: [1, packed_len/cp_size] -> [1, packed_len]
107+ cu_seqlens_full = packed_seq_params .cu_seqlens_q
108+ cu_seqlens_cp = cu_seqlens_full // cp_size
109109
110- # Calculate total packed length
111- total_packed_len = cu_seqlens_full [num_samples ].item ()
112- output_full = tensor .new_zeros (1 , total_packed_len )
110+ # Calculate total packed length
111+ total_packed_len = cu_seqlens_full [num_samples ].item ()
112+ output_full = tensor .new_zeros (1 , total_packed_len )
113113
114- # Reconstruct each sequence
115- for i in range (num_samples ):
116- start_full = cu_seqlens_full [i ].item ()
117- end_full = cu_seqlens_full [i + 1 ].item ()
118- seq_len = end_full - start_full
114+ # Reconstruct each sequence
115+ for i in range (num_samples ):
116+ start_full = cu_seqlens_full [i ].item ()
117+ end_full = cu_seqlens_full [i + 1 ].item ()
118+ seq_len = end_full - start_full
119+
120+ # Length of each chunk after CP split
121+ chunk_len = seq_len // cp_size
122+ half_chunk = chunk_len // 2
123+
124+ # Concatenate from each CP rank's output (load-balanced split)
125+ for j in range (cp_size ):
126+ o = output_list [j ][0 ]
127+ start_cp = cu_seqlens_cp [i ].item ()
128+
129+ # Get two half chunks (CP's load-balanced split)
130+ o0 = o [start_cp :start_cp + half_chunk ]
131+ o1 = o [start_cp + half_chunk :start_cp + chunk_len ]
132+
133+ # Place back to full sequence
134+ output_full [0 , start_full + j * half_chunk :start_full + (j + 1 ) * half_chunk ] = o0
135+ output_full [0 , end_full - (j + 1 ) * half_chunk :end_full - j * half_chunk ] = o1
136+ else :
137+ # non-padding_free mode: [batch_size, seq_len/cp_size] -> [batch_size, seq_len]
138+ # Each CP rank has chunks split with load-balanced pattern (2*cp_size chunks)
139+ batch_size = tensor .shape [0 ]
140+ seq_len_per_cp = tensor .shape [1 ]
141+ full_seq_len = seq_len_per_cp * cp_size
119142
120- # Length of each chunk after CP split
121- chunk_len = seq_len // cp_size
122- half_chunk = chunk_len // 2
143+ output_full = tensor .new_zeros (batch_size , full_seq_len )
123144
124- # Concatenate from each CP rank's output (load-balanced split)
125- for j in range (cp_size ):
126- o = output_list [j ][0 ]
127- start_cp = cu_seqlens_cp [i ].item ()
145+ # Each CP rank j holds chunks j and (2*cp_size - j - 1) from the original 2*cp_size split
146+ # Reconstruct the full sequence by placing chunks back in correct positions
147+ chunk_len = full_seq_len // (2 * cp_size )
128148
129- # Get two half chunks (CP's load-balanced split)
130- o0 = o [start_cp :start_cp + half_chunk ]
131- o1 = o [start_cp + half_chunk :start_cp + chunk_len ]
132-
133- # Place back to full sequence
134- output_full [0 , start_full + j * half_chunk :start_full + (j + 1 ) * half_chunk ] = o0
135- output_full [0 , end_full - (j + 1 ) * half_chunk :end_full - j * half_chunk ] = o1
149+ for j in range (cp_size ):
150+ o = output_list [j ] # [batch_size, seq_len_per_cp]
151+ # This rank holds 2 chunks: chunk j and chunk (2*cp_size - j - 1)
152+ half_len = seq_len_per_cp // 2
153+ o0 = o [:, :half_len ] # First half -> chunk j
154+ o1 = o [:, half_len :] # Second half -> chunk (2*cp_size - j - 1)
155+
156+ # Place chunk j at position j * chunk_len
157+ output_full [:, j * chunk_len :(j + 1 ) * chunk_len ] = o0
158+ # Place chunk (2*cp_size - j - 1) at position (2*cp_size - j - 1) * chunk_len
159+ reverse_idx = 2 * cp_size - j - 1
160+ output_full [:, reverse_idx * chunk_len :(reverse_idx + 1 ) * chunk_len ] = o1
136161
137162 return output_full