
Commit 8363f88

Support HF torch load & save (#2437)
Co-authored-by: llbdyiu66 <[email protected]>
1 parent 6aba537 commit 8363f88

22 files changed: +520 / −650 lines changed

paddleformers/__init__.py

Lines changed: 1 addition & 0 deletions
@@ -19,6 +19,7 @@
 from typing import TYPE_CHECKING

 from .utils.lazy_import import _LazyModule
+from .utils.paddle_patch import *

 PADDLEFORMERS_STABLE_VERSION = "PADDLEFORMERS_STABLE_VERSION"

paddleformers/nn/attention/eager_attention.py

Lines changed: 4 additions & 9 deletions
@@ -31,15 +31,10 @@ def eager_attention_forward(
     is_causal: Optional[bool] = None,
     **kwargs,
 ):
-    num_key_value_heads = None
-    if hasattr(module, "num_key_value_heads"):
-        num_key_value_heads = module.num_key_value_heads
-    elif hasattr(module, "num_key_value_groups"):
-        num_key_value_heads = module.num_key_value_groups
-
-    if num_key_value_heads is not None:
-        key = repeat_kv(key, module.num_key_value_heads)
-        value = repeat_kv(value, module.num_key_value_heads)
+    if hasattr(module, "num_key_value_groups"):
+        num_key_value_groups = module.num_key_value_groups
+        key = repeat_kv(key, num_key_value_groups)
+        value = repeat_kv(value, num_key_value_groups)

     perm = [0, 2, 1, 3]  # b l h d -> b h l d
     query = paddle.transpose(x=query, perm=perm)
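
Note: repeat_kv here is the usual grouped-query-attention expansion: each key/value head is duplicated num_key_value_groups times (num_attention_heads // num_key_value_heads) so the KV tensors match the query head count before the matmul. A minimal sketch of that pattern, with the [batch, seq, heads, dim] layout assumed for illustration rather than taken from the repo's utils.repeat_kv:

import paddle


def repeat_kv_sketch(x: paddle.Tensor, n_rep: int) -> paddle.Tensor:
    # x: [batch, seq_len, num_kv_heads, head_dim]; n_rep = num_key_value_groups.
    if n_rep == 1:
        return x
    b, s, h_kv, d = x.shape
    x = paddle.unsqueeze(x, axis=3)                       # [b, s, h_kv, 1, d]
    x = paddle.tile(x, repeat_times=[1, 1, 1, n_rep, 1])  # [b, s, h_kv, n_rep, d]
    return paddle.reshape(x, [b, s, h_kv * n_rep, d])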

paddleformers/nn/attention/sdpa_attention.py

Lines changed: 0 additions & 10 deletions
@@ -18,7 +18,6 @@
 import paddle.nn as nn

 from ...utils.masking_utils import _gen_from_sparse_attn_mask_indices
-from .utils import repeat_kv


 def sdpa_attention_forward(
@@ -34,15 +33,6 @@ def sdpa_attention_forward(
     **kwargs,
 ):
     # query: b l h d
-    num_key_value_heads = None
-    if hasattr(module, "num_key_value_heads"):
-        num_key_value_heads = module.num_key_value_heads
-    elif hasattr(module, "num_key_value_groups"):
-        num_key_value_heads = module.num_key_value_groups
-
-    if num_key_value_heads is not None:
-        key = repeat_kv(key, module.num_key_value_heads)
-        value = repeat_kv(value, module.num_key_value_heads)

     if is_causal is None and attn_mask_start_row_indices is None:
         is_causal = query.shape[1] > 1 and attention_mask is None and getattr(module, "is_causal", True)

paddleformers/trainer/trainer.py

Lines changed: 14 additions & 2 deletions
@@ -637,7 +637,9 @@ def _load_from_peft_checkpoint(self, resume_from_checkpoint=None):
         elif isinstance(self.model, LoKrModel):
             weights_file = os.path.join(resume_from_checkpoint, LOKR_WEIGHTS_NAME)
         elif isinstance(self.model, ReFTModel):
-            self.model.from_pretrained(resume_from_checkpoint, self.model.model)
+            self.model.from_pretrained(
+                resume_from_checkpoint, self.model.model, convert_from_hf=self.args.convert_from_hf
+            )
             return

         if self.args.dataset_rank == 0:
@@ -689,6 +691,7 @@ def _load_from_checkpoint(self, resume_from_checkpoint=None):
            self.unified_checkpoint_handler.load_unified_checkpoint(
                self.model,
                resume_from_checkpoint,
+               convert_from_hf=self.args.convert_from_hf,
            )
            if isinstance(self.model, LoRAModel) and self.model.lora_config.loraga:
                self.model.reinit_base_model = True
@@ -1452,6 +1455,7 @@ def fused_allreduce_gradients_no_sync(paramlist, hcg):
            self.unified_checkpoint_handler.load_unified_checkpoint(
                self.model,
                self.state.best_model_checkpoint,
+               convert_from_hf=self.args.convert_from_hf,
            )
            if self.args.sharding_parallel_degree > 1 or self.args.data_parallel_degree > 1:
                broadcast_dataset_rank0_model(self.model)
@@ -1502,6 +1506,7 @@ def _load_best_model_from_peft_checkpoint(self):
            self.unified_checkpoint_handler.load_unified_checkpoint(
                self.model,
                self.state.best_model_checkpoint,
+               convert_from_hf=self.args.convert_from_hf,
            )
            if self.args.sharding_parallel_degree > 1 or self.args.data_parallel_degree > 1:
                broadcast_dataset_rank0_model(self.model)
@@ -3010,7 +3015,9 @@ def _save(
         # backup and remove unified_checkpoint_config for not trine stage
         if not self.is_in_train:
             self.args.unified_checkpoint_config = []
-        self.unified_checkpoint_handler.save_unified_checkpoint(self.model, self.optimizer, output_dir, signal_dir)
+        self.unified_checkpoint_handler.save_unified_checkpoint(
+            self.model, self.optimizer, output_dir, signal_dir, save_to_hf=self.args.save_to_hf
+        )

         # recover unified_checkpoint_config for not trine stage
         if not self.is_in_train:
@@ -3034,6 +3041,7 @@ def _save(
                merge_tensor_parallel=merge_tensor_parallel,
                is_main_process=self.args.should_save,
                max_shard_size="1024GB",
+               save_to_hf=self.args.save_to_hf,
            )
            # TODO: @ZHUI unify unwrap_model(self.model) and self.model
        elif not isinstance(self.model, PretrainedModel):
@@ -3052,6 +3060,7 @@ def _save(
                save_function=self._save_ckpt_func,
                is_main_process=self.args.should_save,
                max_shard_size="1024GB",
+               save_to_hf=self.args.save_to_hf,
            )
        else:
            unwrap_model(self.model).save_pretrained(
@@ -3061,6 +3070,7 @@ def _save(
                save_function=self._save_ckpt_func,
                is_main_process=self.args.should_save,
                max_shard_size="1024GB",
+               save_to_hf=self.args.save_to_hf,
            )
        else:
            logger.info("Trainer.model is not a `PretrainedModel`, only saving its state dict.")
@@ -3093,6 +3103,7 @@ def _save(
                save_function=self._save_ckpt_func,
                is_main_process=self.args.should_save,
                max_shard_size="1024GB",
+               save_to_hf=self.args.save_to_hf,
            )
        else:
            self.model.save_pretrained(
@@ -3102,6 +3113,7 @@ def _save(
                save_function=self._save_ckpt_func,
                is_main_process=self.args.should_save,
                max_shard_size="1024GB",
+               save_to_hf=self.args.save_to_hf,
            )
        if self.args.should_save_sharding_stage1_model:
            model_meta = self.sharding_io.gather_distributed_model_meta()

paddleformers/trainer/training_args.py

Lines changed: 8 additions & 0 deletions
@@ -1080,6 +1080,14 @@ class TrainingArguments:
         default=False,
         metadata={"help": "是否开启单路sharding时global norm通信拆分全局通信组为pp通信和mp通信分别做"},
     )
+    convert_from_hf: Optional[bool] = field(
+        default=False,
+        metadata={"help": "Load model from HuggingFace safetensors."},
+    )
+    save_to_hf: Optional[bool] = field(
+        default=False,
+        metadata={"help": "Save model to HuggingFace safetensors."},
+    )

     def __post_init__(self):
         world_size = paddle.distributed.get_world_size()
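
Note: a hedged usage sketch of the two new arguments; only convert_from_hf and save_to_hf come from the diff above, while the import path and the other fields are assumptions:

from paddleformers.trainer import TrainingArguments

# Hypothetical configuration: load initial weights from HuggingFace torch
# safetensors and write checkpoints back in the same format.
args = TrainingArguments(
    output_dir="./checkpoints",
    convert_from_hf=True,  # read HF-format safetensors at load time
    save_to_hf=True,       # write HF-format safetensors at save time
)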

paddleformers/trainer/unified_checkpoint/async_handler.py

Lines changed: 19 additions & 7 deletions
@@ -21,6 +21,7 @@
 import paddle
 import paddle.distributed as dist

+from ...transformers.model_utils import prepare_safe_save_state_dict
 from ...transformers.utils import is_safetensors_available
 from ...utils.log import logger

@@ -70,16 +71,20 @@ def __init__(self, args):
         self._shared_save_optimizer_flag = multiprocessing.Array("i", 1)

     def _file_save_async_or_sync(
-        self, state_dict, path, signal_path=None, is_sync=True, state_dict_type="model_weight", ckpt_quant_stage="O0"
+        self,
+        state_dict,
+        path,
+        signal_path=None,
+        is_sync=True,
+        state_dict_type="model_weight",
+        ckpt_quant_stage="O0",
+        save_to_hf=False,
     ):
         if is_sync:
-            for k in list(state_dict.keys()):
-                if isinstance(state_dict[k], paddle.Tensor):
-                    state_dict[k] = state_dict.pop(k).cpu().numpy()
-
+            state_dict, metadata = prepare_safe_save_state_dict(state_dict, save_to_hf=save_to_hf)
             if state_dict_type == "optimizer_weight" and ckpt_quant_stage != "O0":
                 state_dict = quant_unified_optimizer(state_dict, state_dict_type, ckpt_quant_stage)
-            safe_save_file(state_dict, path, metadata={"format": "np"})
+            safe_save_file(state_dict, path, metadata=metadata)
         else:
             if len(state_dict.keys()) == 0:
                 saved_signal_path = os.path.join(signal_path, f".{state_dict_type}.done.{self.global_rank}")
@@ -107,6 +112,8 @@ def _file_save_async_or_sync(
                     self._lock,
                     state_dict_type,
                     self.global_rank,
+                    ckpt_quant_stage,
+                    save_to_hf,
                 ),
             )
             self._process_model_weight.start()
@@ -134,6 +141,8 @@ def _file_save_async_or_sync(
                    if "skip_save_model_weight" in self.args.unified_checkpoint_config
                    else state_dict_type,
                    self.global_rank,
+                   ckpt_quant_stage,
+                   save_to_hf,
                ),
            )
            self._process_master_weight.start()
@@ -160,6 +169,7 @@ def _file_save_async_or_sync(
                    state_dict_type,
                    self.global_rank,
                    ckpt_quant_stage,
+                   save_to_hf,
                ),
            )
            self._process_optimizer_weight.start()
@@ -191,6 +201,7 @@ def _save_file_async_in_process(
        state_dict_type,
        global_rank,
        ckpt_quant_stage="O0",
+       save_to_hf=False,
    ):
        shm = shared_memory.SharedMemory(name=shm_name)
        while True:
@@ -208,7 +219,8 @@
                state_dict = quant_unified_optimizer(
                    state_dict, state_dict_type, ckpt_quant_stage, async_save=True
                )  # ckpt quantization
-           safe_save_file(state_dict, path, {"format": "np"})
+           metadata = {"format": "pt"} if save_to_hf else {"format": "np"}
+           safe_save_file(state_dict, path, metadata=metadata)
            del state_dict
            saved_signal_path = os.path.join(signal_path, f".{state_dict_type}.done.{global_rank}")
            paddle.save(global_rank, saved_signal_path)
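
Note: in the save path, save_to_hf switches the safetensors header metadata from {"format": "np"} (PaddleFormers-native) to {"format": "pt"}, the convention used by HuggingFace torch checkpoints. A minimal sketch using safetensors' numpy backend directly, not the repo's safe_save_file wrapper:

import numpy as np
from safetensors.numpy import save_file

state_dict = {"linear.weight": np.zeros((4, 4), dtype=np.float32)}
# {"format": "pt"} corresponds to save_to_hf=True; the default path writes {"format": "np"}.
save_file(state_dict, "model.safetensors", metadata={"format": "pt"})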

paddleformers/trainer/unified_checkpoint/load_local.py

Lines changed: 10 additions & 2 deletions
@@ -50,7 +50,9 @@
 __all__ = ["load_unified_checkpoint_locally", "load_unified_optimizer_locally"]


-def load_unified_checkpoint_locally(args, model, resume_from_checkpoint: str, safe_serialization=False):
+def load_unified_checkpoint_locally(
+    args, model, resume_from_checkpoint: str, safe_serialization=False, convert_from_hf=False
+):
     """
     Only dataset_rank == 0 or using expert parallel can enter this function.
     """
@@ -114,8 +116,14 @@ def _remove_unused_keys(
        else:
            tp_actions = model.get_tensor_parallel_convert_actions(model.config, loaded_keys, ignore_error=True)
        # Here we use expected_keys to optimize weights loading for pipeline model. Only works for safetensors
+       transpose_weight_keys = getattr(model, "transpose_weight_keys", None)
        state_dict = load_state_dict(
-           shard_file, tp_actions if pre_tensor_parallel_split else None, expected_keys, device="expected"
+           shard_file,
+           tp_actions if pre_tensor_parallel_split else None,
+           expected_keys,
+           device="expected",
+           convert_from_hf=convert_from_hf,
+           transpose_weight_keys=transpose_weight_keys,
        )

        if not pre_tensor_parallel_split:
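
Note: transpose_weight_keys exists because torch nn.Linear stores weights as [out_features, in_features] while Paddle's nn.Linear stores [in_features, out_features], so selected 2-D weights must be transposed when converting from HF. An illustrative sketch of that idea, not the repo's load_state_dict internals (key matching is simplified):

import numpy as np


def transpose_selected_weights(state_dict, transpose_weight_keys):
    # Transpose only the 2-D weights whose names match the model-provided key list.
    if not transpose_weight_keys:
        return state_dict
    for name, tensor in list(state_dict.items()):
        if tensor.ndim == 2 and any(key in name for key in transpose_weight_keys):
            state_dict[name] = np.ascontiguousarray(tensor.T)
    return state_dict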

paddleformers/trainer/unified_checkpoint/load_save_single_card.py

Lines changed: 24 additions & 10 deletions
@@ -20,7 +20,12 @@
 import paddle

 from ...peft import LoRAModel, PrefixModelForCausalLM
-from ...transformers.model_utils import _load_state_dict_into_model, load_state_dict
+from ...transformers.conversion_utils import ConversionMixin
+from ...transformers.model_utils import (
+    _load_state_dict_into_model,
+    load_state_dict,
+    prepare_safe_save_state_dict,
+)
 from ...transformers.utils import (
     dtype_byte_size,
     get_checkpoint_shard_files,
@@ -54,17 +59,19 @@
 ]


-def save_file_sync(state_dict, path):
-    for k in list(state_dict.keys()):
-        if isinstance(state_dict[k], paddle.Tensor):
-            state_dict[k] = state_dict.pop(k).cpu().numpy()
-    safe_save_file(state_dict, path, metadata={"format": "np"})
+def save_file_sync(state_dict, path, save_to_hf=False):
+    state_dict, metadata = prepare_safe_save_state_dict(state_dict, save_to_hf=save_to_hf)
+    safe_save_file(state_dict, path, metadata=metadata)


-def save_single_card_checkpoint(model_to_save, output_dir):
+def save_single_card_checkpoint(model_to_save, output_dir, save_to_hf=False):
     """Save checkpoint for non-distributed environment."""

     state_dict = get_expected_state_dict(model_to_save, concat_additional_adapter=True)
+    if save_to_hf:
+        transpose_weight_keys = getattr(model_to_save, "transpose_weight_keys", None)
+        state_dict = ConversionMixin.convert_transpose_selected_weights(state_dict, transpose_weight_keys)
+
     if isinstance(model_to_save, LoRAModel) or isinstance(model_to_save, PrefixModelForCausalLM):
         weight_filename = "peft_model-00001-of-00001.safetensors"
         index_filename = SAFE_PEFT_WEIGHTS_INDEX_NAME
@@ -92,7 +99,7 @@ def save_single_card_checkpoint(model_to_save, output_dir):

     # save checkpoint, do no support asynchronous save for single card currently.
     logger.warning("Asynchronous saving is not supported for single card environment currently.")
-    save_file_sync(state_dict, path=os.path.join(output_dir, weight_filename))
+    save_file_sync(state_dict, path=os.path.join(output_dir, weight_filename), save_to_hf=save_to_hf)

     save_model_config(model_to_save, output_dir)

@@ -162,7 +169,7 @@ def save_single_card_optimizer(model, optimizer, output_dir):
        save_file_sync(master_weights, path=os.path.join(output_dir, "master_weights-00001-of-00001.safetensors"))


-def load_single_card_checkpoint(model, resume_from_checkpoint: str):
+def load_single_card_checkpoint(model, resume_from_checkpoint: str, convert_from_hf=False):
     if isinstance(model, LoRAModel) or isinstance(model, PrefixModelForCausalLM):
         index_filename = SAFE_PEFT_WEIGHTS_INDEX_NAME
     else:
@@ -180,7 +187,14 @@ def load_single_card_checkpoint(model, resume_from_checkpoint: str):
     if len(missing_keys) > 0:
         raise ValueError(f"Missing keys: {missing_keys}")

-    state_dict = load_state_dict(resolved_archive_file[0], None, expected_keys)
+    transpose_weight_keys = getattr(model, "transpose_weight_keys", None)
+    state_dict = load_state_dict(
+        resolved_archive_file[0],
+        None,
+        expected_keys,
+        convert_from_hf=convert_from_hf,
+        transpose_weight_keys=transpose_weight_keys,
+    )
     error_msgs = _load_state_dict_into_model(model, state_dict, "")
     del state_dict
     gc.collect()
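
Note: a hypothetical single-card round trip built on the signatures introduced above; model stands in for any PaddleFormers PretrainedModel instance and the directory name is arbitrary:

from paddleformers.trainer.unified_checkpoint.load_save_single_card import (
    load_single_card_checkpoint,
    save_single_card_checkpoint,
)


def hf_round_trip(model, ckpt_dir="./hf_ckpt"):
    # Write the checkpoint as HF torch-style safetensors, then reload it,
    # converting the selected weights back to Paddle layout on the way in.
    save_single_card_checkpoint(model, ckpt_dir, save_to_hf=True)
    load_single_card_checkpoint(model, ckpt_dir, convert_from_hf=True)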
