@@ -294,6 +294,10 @@ class LlmMetaConfig:
        ),
    ]

+    moe_attributes = [
+        ("moe_subbatch_token_num", int, 0, "The number of tokens in each subbatch for MoE model processing."),
+    ]
+
    @classmethod
    def _get_defaults(cls):
        ret = {}
@@ -302,6 +306,7 @@ def _get_defaults(cls):
            cls.hybrid_parallel_attributes,
            cls.recompute_attributes,
            cls.loss_attributes,
+            cls.moe_attributes,
        ]:
            for attr in attrs:
                # return dict of key and default values
@@ -316,6 +321,7 @@ def _get_all_meta(cls):
            cls.hybrid_parallel_attributes,
            cls.recompute_attributes,
            cls.loss_attributes,
+            cls.moe_attributes,
        ]:
            for attr in attrs:
                # return dict of key and default values
@@ -330,6 +336,7 @@ def _get_unsavable_keys(cls):
            cls.hybrid_parallel_attributes,
            cls.recompute_attributes,
            cls.loss_attributes,
+            cls.moe_attributes,
        ]:
            for attr in attrs:
                ret.add(attr[0])
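(Aside, not part of the diff: the sketch below illustrates how four-field attribute tuples like the new `moe_attributes` entry are folded into defaults by loops of the kind shown in these hunks. The `DemoMeta` class is hypothetical and only mirrors the visible pattern, where each entry is `(name, type, default, help text)`.)

```python
# Hypothetical miniature of the LlmMetaConfig attribute registry; not the real class.
class DemoMeta:
    moe_attributes = [
        ("moe_subbatch_token_num", int, 0, "The number of tokens in each subbatch for MoE model processing."),
    ]

    @classmethod
    def _get_defaults(cls):
        ret = {}
        for attrs in [cls.moe_attributes]:
            for attr in attrs:
                # keep attribute name -> default value
                ret[attr[0]] = attr[2]
        return ret


print(DemoMeta._get_defaults())  # {'moe_subbatch_token_num': 0}
```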
@@ -488,6 +495,8 @@ class PretrainedConfig:
        problem_type (`str`, *optional*):
            Problem type for `XxxForSequenceClassification` models. Can be one of `"regression"`,
            `"single_label_classification"` or `"multi_label_classification"`.
+        moe_subbatch_token_num (`int`, *optional*, defaults to 0):
+            The number of tokens in each subbatch for MoE model processing.

        > Parameters for general components

@@ -632,6 +641,8 @@ def __init__(self, **kwargs):
        self.dpo_config = kwargs.pop("dpo_config", None)
        self.kto_config = kwargs.pop("kto_config", None)

+        self.moe_subbatch_token_num = kwargs.pop("moe_subbatch_token_num", 0)
+
        # Tokenizer arguments TODO: eventually tokenizer and models should share the same config
        self.tokenizer_class = kwargs.pop("tokenizer_class", None)
        self.prefix = kwargs.pop("prefix", None)
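(Aside, not part of the diff: a minimal, self-contained sketch of the `kwargs.pop` pattern the last hunk relies on. `ToyConfig` is a hypothetical stand-in, not the project's `PretrainedConfig`; it only shows how an unspecified `moe_subbatch_token_num` falls back to the documented default of 0.)

```python
# Hypothetical stand-in mirroring only the kwargs.pop line added above.
class ToyConfig:
    def __init__(self, **kwargs):
        # Missing key falls back to the documented default of 0 (sub-batching disabled).
        self.moe_subbatch_token_num = kwargs.pop("moe_subbatch_token_num", 0)


print(ToyConfig().moe_subbatch_token_num)                             # 0
print(ToyConfig(moe_subbatch_token_num=1024).moe_subbatch_token_num)  # 1024
```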