Commit 1e9b5a8

[unified checkpoint] Update unified checkpoint (#7730)
* replace with unwrap_optimizer
* remove dist.barrier(group=tp_group) for distributed_gather
* add warning when loading model weights as master weights
* update paddle.cast
* Revert "replace with unwrap_optimizer" (this reverts commit 3fd6a58)
* add unwrap_optimizer
* convert check_origin_checkpoint to is_unified_checkpoint
* update
1 parent d1e51e2 commit 1e9b5a8

3 files changed: +50 −47 lines changed
paddlenlp/trainer/plugins/unified_checkpoint.py

Lines changed: 16 additions & 26 deletions
@@ -116,7 +116,7 @@ def save_unified_checkpoint(args, model, optimizer, output_dir, safe_serializati
         raise ValueError("Unified checkpoint only supports PretrainedModel")
 
     if UnifiedCheckpointOption.SKIP_SAVE_MODEL_WEIGHT.value in args.unified_checkpoint_config:
-        if is_need_master_weight(args, optimizer):
+        if is_need_master_weight(optimizer, is_fp16_or_bp16=(args.fp16 or args.bf16)):
             logger.info(
                 f"With {UnifiedCheckpointOption.SKIP_SAVE_MODEL_WEIGHT.value}, skip the model checkpoint save."
                 "The master weight will be loaded as model weights for next resumption."
@@ -237,9 +237,6 @@ def _remove_unused_keys(
                 None, model.config, state_dict=state_dict, ignore_error=len(resolved_archive_file) > 1
             )
 
-        # confirm parameter cast is executed on the same device as model
-        # TODO: cast(FP32 -> FP16) has diff on different devices, need to fix it
-        state_dict = nested_copy_place(state_dict, place=paddle.framework._current_expected_place())
         error_msgs += _load_state_dict_into_model(model, state_dict, "")
 
         # force memory release
@@ -1388,7 +1385,6 @@ def merge_tensor_parallel_with_shard(state_dict, tp_actions, all_filter_keys):
             tensor = state_dict[key]
             if key in tp_actions:
                 ret = distributed_gather(tensor, dst=j, group=tp_group, offload=False)
-                dist.barrier(group=tp_group)
                 action = tp_actions.pop(key)
                 tensor = action(ret) if is_dst else None
             else:
@@ -1429,7 +1425,6 @@ def merge_tensor_parallel_for_optimizer(state_dict, tp_actions, all_filter_keys)
                     )  # Need broadcast when loaded
                 else:
                     ret = distributed_gather(tensor, dst=j, group=tp_group, offload=False)
-                    dist.barrier(group=tp_group)
                     action = tp_actions[model_key]
                     tensor = action(ret) if is_dst else None
             else:
@@ -1631,13 +1626,17 @@ def select_model_weight_index(args, model, resume_from_checkpoint, safe_serializ
 
 
 def update_master_weight_status(args, optimizer, has_master_weight, safe_serialization):
-    if is_need_master_weight(args, optimizer):
+    if is_need_master_weight(optimizer, is_fp16_or_bp16=(args.fp16 or args.bf16)):
         if not has_master_weight:
             if UnifiedCheckpointOption.MASTER_WEIGHT_COMPATIBLE.value in args.unified_checkpoint_config:
                 index_filename_master_weights = (
                     PADDLE_WEIGHTS_INDEX_NAME if not safe_serialization else SAFE_WEIGHTS_INDEX_NAME
                 )
                 has_master_weight = True
+                logger.warning(
+                    "The unified checkpoint does not contain master weight, "
+                    "the model weight will be loaded as master weight."
+                )
             else:
                 raise ValueError(
                     "Can't find a valid unified master weight checkpoint,"
@@ -1656,28 +1655,19 @@ def update_master_weight_status(args, optimizer, has_master_weight, safe_seriali
     return has_master_weight, index_filename_master_weights
 
 
-def is_need_master_weight(args, optimizer):
-    """
-    https://github.com/PaddlePaddle/Paddle/blob/4a9991fb6744443333638b65fb7e225fb2b00a13/python/paddle/amp/auto_cast.py#L485
-    """
+def unwrap_optimizer(optimizer):
+    while hasattr(optimizer, "_inner_opt") or hasattr(optimizer, "_optim"):
+        if hasattr(optimizer, "_inner_opt"):
+            optimizer = optimizer._inner_opt
+        if hasattr(optimizer, "_optim"):
+            optimizer = optimizer._optim
 
-    from paddle.distributed.fleet.meta_optimizers.dygraph_optimizer.dygraph_sharding_optimizer import (
-        DygraphShardingOptimizer,
-        DygraphShardingOptimizerV2,
-    )
-    from paddle.distributed.fleet.meta_optimizers.dygraph_optimizer.hybrid_parallel_optimizer import (
-        HybridParallelOptimizer,
-    )
-    from paddle.distributed.fleet.meta_parallel.sharding.group_sharded_optimizer_stage2 import (
-        GroupShardedOptimizerStage2,
-    )
+    return optimizer
 
-    if isinstance(optimizer, (DygraphShardingOptimizer, DygraphShardingOptimizerV2, HybridParallelOptimizer)):
-        optimizer = optimizer._inner_opt
-    elif isinstance(optimizer, GroupShardedOptimizerStage2):
-        optimizer = optimizer._optim
 
+def is_need_master_weight(optimizer, is_fp16_or_bp16):
+    optimizer = unwrap_optimizer(optimizer)
     if hasattr(optimizer, "_multi_precision"):
-        return optimizer._multi_precision and (args.bf16 or args.fp16)
+        return optimizer._multi_precision and is_fp16_or_bp16
     else:
         return False
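The rewritten helpers drop the fleet-specific imports and isinstance checks in favor of a generic unwrap loop over the private _inner_opt/_optim attributes. Below is a minimal usage sketch for a plain, unwrapped optimizer; it assumes the helpers are importable from paddlenlp.trainer.plugins.unified_checkpoint and that paddle.optimizer.AdamW stores its multi_precision flag on the private _multi_precision attribute (both assumptions, not guaranteed by this diff).

import paddle

from paddlenlp.trainer.plugins.unified_checkpoint import (
    is_need_master_weight,
    unwrap_optimizer,
)

linear = paddle.nn.Linear(4, 4)
opt = paddle.optimizer.AdamW(
    learning_rate=1e-3, parameters=linear.parameters(), multi_precision=True
)

# No _inner_opt/_optim wrapper attributes, so unwrap_optimizer returns the
# optimizer unchanged.
assert unwrap_optimizer(opt) is opt

# The check reduces to "_multi_precision and (fp16 or bf16)".
print(is_need_master_weight(opt, is_fp16_or_bp16=True))   # True
print(is_need_master_weight(opt, is_fp16_or_bp16=False))  # False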

paddlenlp/trainer/trainer.py

Lines changed: 25 additions & 19 deletions
@@ -88,9 +88,12 @@
 from ..utils.batch_sampler import DistributedBatchSampler as NlpDistributedBatchSampler
 from ..utils.env import (
     LORA_WEIGHTS_NAME,
+    PADDLE_MASTER_WEIGHTS_INDEX_NAME,
     PADDLE_WEIGHTS_INDEX_NAME,
     PADDLE_WEIGHTS_NAME,
     PREFIX_WEIGHTS_NAME,
+    SAFE_MASTER_WEIGHTS_INDEX_NAME,
+    SAFE_WEIGHTS_INDEX_NAME,
 )
 from ..utils.import_utils import is_datasets_available, is_paddle_cuda_available
 from ..utils.log import logger
@@ -507,9 +510,10 @@ def _load_from_checkpoint(self, resume_from_checkpoint=None):
 
         if self.args.unified_checkpoint:
             if resume_from_checkpoint is not None:
-                use_unified_checkpoint = True
-                if self.check_origin_checkpoint(resume_from_checkpoint):
-                    use_unified_checkpoint = False
+                use_unified_checkpoint = False
+                if self.is_unified_checkpoint(resume_from_checkpoint):
+                    use_unified_checkpoint = True
+                else:
                     logger.info("Loading origin checkpoint, the next checkpoint will be saved as unified checkpoint")
 
         if use_unified_checkpoint:
@@ -2285,11 +2289,11 @@ def _load_optimizer_and_scheduler(self, checkpoint):
                 checkpoint, OPTIMIZER_NAME, self.model_wrapped
             )
         else:
-            use_unified_checkpoint = False
             if self.args.unified_checkpoint:
-                use_unified_checkpoint = True
-                if self.check_origin_checkpoint(checkpoint):
-                    use_unified_checkpoint = False
+                use_unified_checkpoint = False
+                if self.is_unified_checkpoint(checkpoint):
+                    use_unified_checkpoint = True
+                else:
                     logger.info("Loading checkpoint, the next checkpoint will be saved as unified checkpoint")
 
             if not use_unified_checkpoint:
@@ -2940,20 +2944,22 @@ def print_config(self, args=None, key=""):
 
         logger.info("")
 
-    def check_origin_checkpoint(self, resume_from_checkpoint):
-        is_origin_checkpoint_type = False
-
-        weight_name = PADDLE_WEIGHTS_NAME
-        weight_index_name = PADDLE_WEIGHTS_INDEX_NAME
-        weights_file = os.path.join(
-            resume_from_checkpoint,
-            _add_variant(weight_name, self.args.weight_name_suffix),
+    def is_unified_checkpoint(self, resume_from_checkpoint, safe_serialization=True):
+        is_unified_checkpoint_type = False
+        weights_index_name = PADDLE_WEIGHTS_INDEX_NAME if not safe_serialization else SAFE_WEIGHTS_INDEX_NAME
+        master_weights_index_name = (
+            PADDLE_MASTER_WEIGHTS_INDEX_NAME if not safe_serialization else SAFE_MASTER_WEIGHTS_INDEX_NAME
         )
         weights_index_file = os.path.join(
            resume_from_checkpoint,
-            _add_variant(weight_index_name, self.args.weight_name_suffix),
+            weights_index_name,
+        )
+        master_weights_index_file = os.path.join(
+            resume_from_checkpoint,
+            master_weights_index_name,
         )
-        if distributed_isfile(weights_file) or distributed_isfile(weights_index_file):
-            is_origin_checkpoint_type = True
 
-        return is_origin_checkpoint_type
+        if distributed_isfile(weights_index_file) or distributed_isfile(master_weights_index_file):
+            is_unified_checkpoint_type = True
+
+        return is_unified_checkpoint_type
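The renamed check flips the default: a checkpoint directory is treated as an origin (non-unified) checkpoint unless a weights index or master-weights index file is present. The standalone sketch below mirrors that logic with os.path.isfile as a single-process stand-in for distributed_isfile; the index-file names are illustrative stand-ins, not the actual values of the env constants from paddlenlp.utils.env.

import os

# Illustrative stand-ins for SAFE_WEIGHTS_INDEX_NAME and
# SAFE_MASTER_WEIGHTS_INDEX_NAME (assumed values, for demonstration only).
WEIGHTS_INDEX_NAME = "model.safetensors.index.json"
MASTER_WEIGHTS_INDEX_NAME = "master_weights.safetensors.index.json"


def looks_like_unified_checkpoint(checkpoint_dir):
    """A directory counts as a unified checkpoint if either the weights index
    or the master-weights index file exists inside it."""
    weights_index_file = os.path.join(checkpoint_dir, WEIGHTS_INDEX_NAME)
    master_weights_index_file = os.path.join(checkpoint_dir, MASTER_WEIGHTS_INDEX_NAME)
    return os.path.isfile(weights_index_file) or os.path.isfile(master_weights_index_file)


# Callers now assume an origin checkpoint and only take the unified-checkpoint
# load path when the index files are found.
use_unified_checkpoint = looks_like_unified_checkpoint("./checkpoint-500")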

paddlenlp/transformers/model_utils.py

Lines changed: 9 additions & 2 deletions
@@ -735,15 +735,22 @@ def _convert_state_dict_dtype_and_shape(state_dict, model_to_load):
     def is_0d_or_1d(tensor):
         return len(tensor.shape) == 0 or list(tensor.shape) == [1]
 
+    expected_place = paddle.framework._current_expected_place()
     for key, value in model_to_load.state_dict().items():
         if key in state_dict:
             if isinstance(state_dict[key], np.ndarray):
                 raise ValueError(
                     "convert_state_dict_dtype expected paddle.Tensor not numpy.ndarray, plase convert numpy.ndarray to paddle.Tensor"
                 )
+            # confirm parameter cast is executed on the same device as model
+            # TODO: cast(FP32 -> FP16) has diff on different devices, need to fix it
             if state_dict[key].is_floating_point() and state_dict[key].dtype != value.dtype:
-                state_dict[key] = paddle.cast(state_dict.pop(key), value.dtype)
-
+                value_pop = state_dict.pop(key)
+                value_new_place = (
+                    value_pop if value_pop.place == expected_place else value_pop._copy_to(expected_place, False)
+                )
+                state_dict[key] = paddle.cast(value_new_place, value.dtype)._copy_to(value_pop.place, False)
+                del value_new_place
             # unified 0d and 1d tensor
             if is_0d_or_1d(value) and is_0d_or_1d(state_dict[key]):
                 if list(value.shape) != list(state_dict[key].shape):
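The updated cast path stages each tensor on the model's expected place before casting and copies the result back afterwards, because an FP32 -> FP16 cast can produce slightly different values on different devices. A standalone sketch of the same pattern, assuming a Paddle build where the private Tensor._copy_to(place, blocking) API used by the patch is available:

import paddle

# Cast on the model's expected place, then restore the tensor's original place.
expected_place = paddle.framework._current_expected_place()

src = paddle.rand([2, 3], dtype="float32")
orig_place = src.place

# Stage the tensor on the expected place only if it is not already there.
staged = src if src.place == expected_place else src._copy_to(expected_place, False)

# Cast where the model lives, then copy the result back to the source's place.
result = paddle.cast(staged, "float16")._copy_to(orig_place, False)
print(result.dtype, result.place)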
