Fix DeepSeek-V2 sequence-packing SFT (#444)

jerryli1981 · web-flow · commit 3b90f6832a66 · 2025-01-24T11:16:36.000+08:00
* Enhance DeepSeek-V2 236B mcore to hf conversion

* Fix DeepSeek-V2 sequence-packing SFT

---------

Co-authored-by: 同润 <jerry.lp@alibaba-inc.com>
diff --git a/examples/deepseek_v2/pretrain_deepseek.py b/examples/deepseek_v2/pretrain_deepseek.py
@@ -102,7 +102,7 @@ def get_batch(data_iterator):
         if args.train_mode == "pretrain":
             batch = get_batch_on_this_tp_rank(data_iterator)
         else:
-            batch = get_batch_on_this_tp_rank_idxmap_sft(data_iterator)
+            batch = get_batch_on_this_tp_rank_idxmap_sft(data_iterator, per_seq_average=True)
 
         packed_seq_params = None
         if args.reset_position_ids: