Skip to content

Commit d841351

Browse files
authored
【GLM】subbatch performance and weight bug fix (#2661)
1 parent f22279a commit d841351

File tree

5 files changed

+212
-45
lines changed

5 files changed

+212
-45
lines changed

examples/run_finetune.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,12 @@ def main():
     training_args.print_config(model_args, "Model")
     training_args.print_config(data_args, "Data")

+    if training_args.pre_alloc_memory > 0:
+        memory_size = int(training_args.pre_alloc_memory * 1024 * 1024 * 1024)
+        x = paddle.empty([memory_size], dtype=paddle.uint8)
+        logger.info(f"pre_alloc_memory size {x.shape}")
+        del x
+
     # Setup GPU & distributed training
     paddle.set_device(training_args.device)
     set_seed(seed=training_args.seed)
@@ -134,6 +140,7 @@ def main():
     model_config.max_sequence_length = training_args.max_seq_len
     model_config.num_nextn_predict_layers = model_args.num_nextn_predict_layers
     model_config._attn_implementation = model_args.attn_impl
+    model_config.moe_subbatch_token_num = model_args.moe_subbatch_token_num
     logger.info(f"Final model config: {model_config}")
     logger.info("Creating model")
paddleformers/trainer/training_args.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1096,6 +1096,10 @@ class TrainingArguments:
         default=False,
         metadata={"help": "Controls the parallel execution order. False (pp first), True (sharding first)."},
     )
+    pre_alloc_memory: int = field(
+        default=0,
+        metadata={"help": "pre allocate memory size GB"},
+    )

     def __post_init__(self):
         world_size = paddle.distributed.get_world_size()

paddleformers/transformers/glm4_moe/configuration.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -158,6 +158,7 @@ def __init__(
         seq_aux=True,
         topk_method="noaux_tc",
         using_flex_token=True,
+        moe_subbatch_token_num=0,
         **kwargs,
     ):
         self.vocab_size = vocab_size
@@ -200,6 +201,7 @@
         self.topk_method = topk_method
         self.using_flex_token = using_flex_token
         self.use_fp8 = False
+        self.moe_subbatch_token_num = moe_subbatch_token_num

         self.pp_seg_method = pp_seg_method
         self.disable_ffn_model_parallel = disable_ffn_model_parallel

0 commit comments

Comments (0)