
Commit e85fd56

[bugfix] fix megatron pp4 max_epochs (#5432)

1 parent 8d89d50 commit e85fd56

5 files changed: +17 -12 lines changed

docs/source/Instruction/支持的模型和数据集.md

Lines changed: 1 addition & 1 deletion

```diff
@@ -207,7 +207,7 @@
 |[swift/Qwen3-32B-AWQ](https://modelscope.cn/models/swift/Qwen3-32B-AWQ)|qwen3|qwen3|transformers>=4.51|✘|-|-|
 |[Qwen/Qwen3-4B-Instruct-2507](https://modelscope.cn/models/Qwen/Qwen3-4B-Instruct-2507)|qwen3|qwen3|transformers>=4.51|✔|-|[Qwen/Qwen3-4B-Instruct-2507](https://huggingface.co/Qwen/Qwen3-4B-Instruct-2507)|
 |[Qwen/Qwen3-4B-Instruct-2507-FP8](https://modelscope.cn/models/Qwen/Qwen3-4B-Instruct-2507-FP8)|qwen3|qwen3|transformers>=4.51|✘|-|[Qwen/Qwen3-4B-Instruct-2507-FP8](https://huggingface.co/Qwen/Qwen3-4B-Instruct-2507-FP8)|
-|[Qwen/Qwen3-4B-Thinking-2507](https://modelscope.cn/models/Qwen/Qwen3-4B-Thinking-2507)|qwen3_thinking|qwen3_thinking|transformers>=4.51|✘|-|[Qwen/Qwen3-4B-Thinking-2507](https://huggingface.co/Qwen/Qwen3-4B-Thinking-2507)|
+|[Qwen/Qwen3-4B-Thinking-2507](https://modelscope.cn/models/Qwen/Qwen3-4B-Thinking-2507)|qwen3_thinking|qwen3_thinking|transformers>=4.51|✔|-|[Qwen/Qwen3-4B-Thinking-2507](https://huggingface.co/Qwen/Qwen3-4B-Thinking-2507)|
 |[Qwen/Qwen3-4B-Thinking-2507-FP8](https://modelscope.cn/models/Qwen/Qwen3-4B-Thinking-2507-FP8)|qwen3_thinking|qwen3_thinking|transformers>=4.51|✘|-|[Qwen/Qwen3-4B-Thinking-2507-FP8](https://huggingface.co/Qwen/Qwen3-4B-Thinking-2507-FP8)|
 |[Qwen/Qwen3-30B-A3B-Base](https://modelscope.cn/models/Qwen/Qwen3-30B-A3B-Base)|qwen3_moe|qwen3|transformers>=4.51|✔|-|[Qwen/Qwen3-30B-A3B-Base](https://huggingface.co/Qwen/Qwen3-30B-A3B-Base)|
 |[Qwen/Qwen3-30B-A3B](https://modelscope.cn/models/Qwen/Qwen3-30B-A3B)|qwen3_moe|qwen3|transformers>=4.51|✔|-|[Qwen/Qwen3-30B-A3B](https://huggingface.co/Qwen/Qwen3-30B-A3B)|
```

docs/source_en/Instruction/Supported-models-and-datasets.md

Lines changed: 1 addition & 1 deletion

```diff
@@ -207,7 +207,7 @@ The table below introduces the models integrated with ms-swift:
 |[swift/Qwen3-32B-AWQ](https://modelscope.cn/models/swift/Qwen3-32B-AWQ)|qwen3|qwen3|transformers>=4.51|✘|-|-|
 |[Qwen/Qwen3-4B-Instruct-2507](https://modelscope.cn/models/Qwen/Qwen3-4B-Instruct-2507)|qwen3|qwen3|transformers>=4.51|✔|-|[Qwen/Qwen3-4B-Instruct-2507](https://huggingface.co/Qwen/Qwen3-4B-Instruct-2507)|
 |[Qwen/Qwen3-4B-Instruct-2507-FP8](https://modelscope.cn/models/Qwen/Qwen3-4B-Instruct-2507-FP8)|qwen3|qwen3|transformers>=4.51|✘|-|[Qwen/Qwen3-4B-Instruct-2507-FP8](https://huggingface.co/Qwen/Qwen3-4B-Instruct-2507-FP8)|
-|[Qwen/Qwen3-4B-Thinking-2507](https://modelscope.cn/models/Qwen/Qwen3-4B-Thinking-2507)|qwen3_thinking|qwen3_thinking|transformers>=4.51|✘|-|[Qwen/Qwen3-4B-Thinking-2507](https://huggingface.co/Qwen/Qwen3-4B-Thinking-2507)|
+|[Qwen/Qwen3-4B-Thinking-2507](https://modelscope.cn/models/Qwen/Qwen3-4B-Thinking-2507)|qwen3_thinking|qwen3_thinking|transformers>=4.51|✔|-|[Qwen/Qwen3-4B-Thinking-2507](https://huggingface.co/Qwen/Qwen3-4B-Thinking-2507)|
 |[Qwen/Qwen3-4B-Thinking-2507-FP8](https://modelscope.cn/models/Qwen/Qwen3-4B-Thinking-2507-FP8)|qwen3_thinking|qwen3_thinking|transformers>=4.51|✘|-|[Qwen/Qwen3-4B-Thinking-2507-FP8](https://huggingface.co/Qwen/Qwen3-4B-Thinking-2507-FP8)|
 |[Qwen/Qwen3-30B-A3B-Base](https://modelscope.cn/models/Qwen/Qwen3-30B-A3B-Base)|qwen3_moe|qwen3|transformers>=4.51|✔|-|[Qwen/Qwen3-30B-A3B-Base](https://huggingface.co/Qwen/Qwen3-30B-A3B-Base)|
 |[Qwen/Qwen3-30B-A3B](https://modelscope.cn/models/Qwen/Qwen3-30B-A3B)|qwen3_moe|qwen3|transformers>=4.51|✔|-|[Qwen/Qwen3-30B-A3B](https://huggingface.co/Qwen/Qwen3-30B-A3B)|
```

swift/megatron/argument/megatron_args.py

Lines changed: 4 additions & 1 deletion

```diff
@@ -357,7 +357,10 @@ def create_group(ranks=None, timeout=None, *args, **kwargs):
     def __post_init__(self):
         require_version('numpy<2.0', 'Please install numpy<2.0 by running: `pip install "numpy<2.0"`.')
         if self.train_type == 'lora':
-            require_version('peft>=0.12')
+            if self.num_experts is not None:
+                require_version('peft>=0.15')
+            else:
+                require_version('peft>=0.12')
         MegatronTunerMixin.__post_init__(self)
         os.environ['CUDA_DEVICE_MAX_CONNECTIONS'] = '1'
         self._set_default()
```
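The gate above means MoE LoRA training (when `num_experts` is set) now requires a newer peft release. A minimal standalone sketch of the same check, assuming `require_version` is the helper from `transformers.utils.versions`, which raises `ImportError` when the installed version misses the specifier; `check_lora_requirements` is a hypothetical wrapper, not ms-swift code:

```python
# Minimal sketch of the peft version gate (assumes transformers'
# require_version helper; check_lora_requirements is hypothetical).
from transformers.utils.versions import require_version


def check_lora_requirements(train_type: str, num_experts=None) -> None:
    if train_type == 'lora':
        if num_experts is not None:
            # MoE + LoRA needs the newer peft release.
            require_version('peft>=0.15')
        else:
            require_version('peft>=0.12')


check_lora_requirements('lora', num_experts=8)  # raises if peft < 0.15 is installed
```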

swift/megatron/trainers/base.py

Lines changed: 5 additions & 3 deletions

```diff
@@ -74,17 +74,18 @@ def initialize_megatron(*_args, **kwargs):
 def new_cyclic_iter(iterable):
     args = get_args()
     i = 0
+    n_batch = 0
     while True:
         is_training = getattr(args, 'is_training', False)
         if is_training:
             logger.info(f'The training of Epoch {i} starts...')
         if is_training and args.max_epochs and i >= args.max_epochs - 1:
             it = iter(iterable)
-            num_batches = args.global_batch_size // (args.micro_batch_size * args.data_parallel_size)
-            x = [next(it) for _ in range(num_batches)]
+            num_microbatches = args.global_batch_size // (args.micro_batch_size * args.data_parallel_size)
+            x = [next(it) for _ in range(num_microbatches - n_batch % num_microbatches)]
             while True:
                 try:
-                    next_x = [next(it) for _ in range(num_batches)]
+                    next_x = [next(it) for _ in range(num_microbatches)]
                 except StopIteration:
                     break
                 yield from x
@@ -94,6 +95,7 @@ def new_cyclic_iter(iterable):
             yield from x
         else:
             for x in iterable:
+                n_batch += 1
                 yield x
         i += 1
```
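This is the core of the pp4 max_epochs fix: the final-epoch branch hands out data in whole global batches (`num_microbatches` micro-batches per fetch), but the old code ignored how many micro-batches earlier epochs had already yielded, so the last epoch could start mid-global-batch. The new `n_batch` counter trims the first fetch so the stream realigns to a global-batch boundary. A toy sketch of the arithmetic, with made-up sizes rather than ms-swift defaults:

```python
# Toy illustration of the realignment arithmetic (hypothetical sizes,
# not ms-swift code).
num_microbatches = 4   # micro-batches consumed per global batch on this rank
n_batch = 10           # micro-batches yielded by the earlier, full epochs

# 10 % 4 == 2: the stream sits two micro-batches into a global batch,
# so the last epoch's first fetch takes only 4 - 2 == 2 items to land
# back on a global-batch boundary; every later fetch takes a full 4.
first_fetch = num_microbatches - n_batch % num_microbatches
assert first_fetch == 2
```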

swift/megatron/trainers/utils.py

Lines changed: 6 additions & 6 deletions

```diff
@@ -73,6 +73,9 @@ def _broadcast(item):
             _broadcast(batch['attention_mask'])
             _broadcast(batch['position_ids'])
             _broadcast(batch['loss_scale'])
+        else:
+            for key in ('input_ids', 'labels', 'attention_mask', 'position_ids', 'loss_scale'):
+                batch[key] = None

     else:
         flags = torch.empty((3), dtype=torch.int64, device=torch.cuda.current_device())
@@ -117,6 +120,8 @@ def _broadcast(item):
             _broadcast(attention_mask)
             _broadcast(position_ids)  # compat packing & cp
             _broadcast(loss_scale)
+        else:
+            input_ids, labels, attention_mask, position_ids, loss_scale = (None, ) * 5

         batch = {
             'input_ids': input_ids,
@@ -187,15 +192,10 @@ def get_batch_on_this_cp_rank(batch: Dict[str, Any]):

 def get_batch(data_iterator):
     """Generate a batch."""
-
-    # TODO: this is pretty hacky, find a better way
-    if (not mpu.is_pipeline_first_stage()) and (not mpu.is_pipeline_last_stage()):
-        return {key: None for key in ['input_ids', 'attention_mask', 'position_ids', 'loss_scale']}
-
     # get batches based on the TP rank you are on
     batch = get_batch_on_this_tp_rank(data_iterator)
     args = get_args()
-    if args.padding_free:
+    if args.padding_free and batch.get('position_ids') is not None:
         batch['packed_seq_params'] = get_packed_seq_params(batch['position_ids'])
     # slice batch along sequence dimension for context parallelism
     batch = get_batch_on_this_cp_rank(batch)
```
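Previously `get_batch` returned early with an all-None dict on intermediate pipeline stages (the "pretty hacky" TODO above), skipping `get_batch_on_this_tp_rank` and `get_batch_on_this_cp_rank` entirely and omitting the `labels` key. The None-filling now lives inside `get_batch_on_this_tp_rank`'s stage checks, and the `padding_free` path is guarded because `position_ids` is legitimately None on middle stages. A small sketch of that guard, with a placeholder `get_packed_seq_params` standing in for the real helper:

```python
# Sketch of the padding_free guard (get_packed_seq_params here is a
# placeholder, not the real ms-swift implementation).
from typing import Any, Dict, Optional


def get_packed_seq_params(position_ids) -> Dict[str, Any]:
    return {'cu_seqlens': position_ids}  # stand-in payload


def maybe_pack(batch: Dict[str, Optional[Any]], padding_free: bool) -> Dict[str, Any]:
    # On intermediate pipeline stages position_ids is None, so skip packing.
    if padding_free and batch.get('position_ids') is not None:
        batch['packed_seq_params'] = get_packed_seq_params(batch['position_ids'])
    return batch


print(maybe_pack({'position_ids': None}, padding_free=True))        # unchanged
print(maybe_pack({'position_ids': [0, 1, 2]}, padding_free=True))   # packed
```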
