Commit 2985f90

support quant ckpt limit strategy (#9494)
* support quant ckpt limit strategy
* bug fix
* bug fix
* fix bug
* add log, fix bug
Parent commit: 756bce7

File tree

5 files changed: +68 additions, -17 deletions

paddlenlp/quantization/unified_checkpoint_quantization.py

Lines changed: 21 additions & 9 deletions
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import numpy as np
 import paddle
 from paddle.distributed import fleet
 
@@ -33,7 +34,7 @@
 from paddlenlp.utils.log import logger
 
 
-def dequant_unified_optimizer(state_dict, ckpt_quant_stage, scale_dict):
+def dequant_unified_optimizer(state_dict, ckpt_quant_stage, scale_dict, use_pd=False):
     """
     dequantize unified optimizer state dict.
     Args:
@@ -44,6 +45,7 @@ def dequant_unified_optimizer(state_dict, ckpt_quant_stage, scale_dict):
         scale_dict (`int`):
            compression checkpoint scale dict.
     """
+    logger.info(f"Start unified checkpoint dequantization, stage {ckpt_quant_stage}.")
     tp_rank, tp_degree = -1, 1
     if paddle.distributed.get_world_size() > 1:
         hcg = fleet.get_hybrid_communicate_group()
@@ -68,7 +70,7 @@ def dequant_unified_optimizer(state_dict, ckpt_quant_stage, scale_dict):
                dequant=True,
                tp_rank=tp_rank,
                tp_degree=tp_degree,
-               use_pd=True,
+               use_pd=use_pd,
            )
            state_dict[quant_key] = weight
        elif is_moment2:
@@ -85,10 +87,13 @@ def dequant_unified_optimizer(state_dict, ckpt_quant_stage, scale_dict):
                dequant=True,
                tp_rank=tp_rank,
                tp_degree=tp_degree,
-               use_pd=True,
+               use_pd=use_pd,
            )
            # cal m2
-           weight = paddle.square(1.0 / weight - eps)
+           if use_pd:
+               weight = paddle.square(1.0 / weight - eps)
+           else:
+               weight = np.square(1.0 / weight - eps)
            state_dict[quant_key] = weight
    elif ckpt_quant_stage == "O2":
        # set eps
@@ -117,7 +122,7 @@ def dequant_unified_optimizer(state_dict, ckpt_quant_stage, scale_dict):
                quant=False,
                tp_rank=tp_rank,
                tp_degree=tp_degree,
-               use_pd=True,
+               use_pd=use_pd,
                symmetry=True,
            )
            ratio_weight = group_wise_quant_dequant(
@@ -128,14 +133,19 @@ def dequant_unified_optimizer(state_dict, ckpt_quant_stage, scale_dict):
                quant=False,
                tp_rank=tp_rank,
                tp_degree=tp_degree,
-               use_pd=True,
+               use_pd=use_pd,
            )
 
-           ratio_weight = paddle.square(1.0 / ratio_weight - eps)
+           if use_pd:
+               ratio_weight = paddle.square(1.0 / ratio_weight - eps)
+           else:
+               ratio_weight = np.square(1.0 / ratio_weight - eps)
            state_dict[quant_key] = ratio_weight
            m1_state_dict[quant_key[: -len(MOMENT2_KEYNAME)] + MOMENT1_KEYNAME] = m1_weight
        state_dict.update(m1_state_dict)
 
+    logger.info(f"Unified checkpoint dequantization done, stage {ckpt_quant_stage}.")
+
    return state_dict
 
 
@@ -152,14 +162,15 @@ def quant_unified_optimizer(state_dict, state_dict_type, ckpt_quant_stage, async
        async_save (`bool`):
            whether use async_save.
    """
+    logger.info(f"Start unified checkpoint quantization, stage {ckpt_quant_stage}.")
+
    quant = False
    if ckpt_quant_stage != "O0":
        quant = True
    del_key = []
    if quant and state_dict_type == "optimizer_weight":
        scales_dict = {}
-       opt_keys = state_dict.keys()
-       for k in opt_keys:
+       for k in state_dict.keys():
            momentum1 = k.endswith(MOMENT1_KEYNAME)
            momentum2 = k.endswith(MOMENT2_KEYNAME)
 
@@ -205,5 +216,6 @@ def quant_unified_optimizer(state_dict, state_dict_type, ckpt_quant_stage, async
        state_dict.pop(k, None)
 
    state_dict.update(scales_dict)
+    logger.info(f"Unified checkpoint quantization done, stage {ckpt_quant_stage}.")
 
    return state_dict
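
As the `# cal m2` step above suggests, the stored tensor approximates 1.0 / (sqrt(moment2) + eps), and the new `use_pd` flag only chooses whether that reconstruction runs on Paddle tensors or NumPy arrays. A minimal sketch of the branch; `reconstruct_moment2` and its sample inputs are illustrative, not PaddleNLP code:

import numpy as np
import paddle

def reconstruct_moment2(ratio, eps, use_pd=False):
    # The checkpoint stores roughly 1.0 / (sqrt(moment2) + eps); invert and
    # square to recover moment2, staying in Paddle or NumPy depending on use_pd.
    if use_pd:
        return paddle.square(1.0 / ratio - eps)
    return np.square(1.0 / ratio - eps)

# NumPy path (the new default when use_pd is not passed).
m2_np = reconstruct_moment2(np.array([0.5, 0.25], dtype="float32"), eps=1e-8)
# Paddle path (load_state_dict passes use_pd=True, see model_utils.py below).
m2_pd = reconstruct_moment2(paddle.to_tensor([0.5, 0.25]), eps=1e-8, use_pd=True)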

paddlenlp/trainer/unified_checkpoint/unified_checkpoint.py

Lines changed: 34 additions & 7 deletions
@@ -29,8 +29,10 @@
     unwrap_model,
 )
 from paddlenlp.transformers.utils import dtype_byte_size
+from paddlenlp.utils import infohub
 from paddlenlp.utils.env import (
     LORA_WEIGHTS_NAME,
+    MAX_QUANTIZATION_TIMES,
     PADDLE_MASTER_WEIGHTS_NAME,
     PADDLE_OPTIMIZER_NAME,
     PADDLE_WEIGHTS_NAME,
@@ -239,9 +241,16 @@ def save_non_merge_optimizer(self, model, optim_state_dict, master_weights, outp
        optimizer_name = _add_variant(SAFE_OPTIMIZER_NAME, self.args.optimizer_name_suffix)
        master_weights_name = _add_variant(SAFE_MASTER_WEIGHTS_NAME, self.args.optimizer_name_suffix)
 
+        sharded_optim_index = {}
        # save opt index json if checkpoint quantization is on.
-       if self.args.ckpt_quant_stage != "O0":
-           sharded_optim_index = {"ckpt_quant_stage": self.args.ckpt_quant_stage}
+        if self.args.ckpt_quant_stage != "O0" and "quant_reach_limit" not in infohub:
+            sharded_optim_index["ckpt_quant_stage"] = self.args.ckpt_quant_stage
+
+        sharded_optim_index["quant_ckpt_resume_times"] = (
+            infohub["quant_ckpt_resume_times"] if "quant_ckpt_resume_times" in infohub else 0
+        )
+
+        if len(sharded_optim_index) > 0:
            optimizer_index_name = SAFE_OPTIMIZER_INDEX_NAME
            path = os.path.join(output_dir, optimizer_index_name)
            if self.args.should_save:
@@ -257,7 +266,7 @@ def save_non_merge_optimizer(self, model, optim_state_dict, master_weights, outp
            signal_path=signal_dir,
            is_sync=is_sync_save,
            state_dict_type="optimizer_weight",
-           ckpt_quant_stage=self.args.ckpt_quant_stage,
+            ckpt_quant_stage=self.args.ckpt_quant_stage if "quant_reach_limit" not in infohub else "O0",
        )
        if master_weights is not None:
            self.async_handler._file_save_async_or_sync(
@@ -277,7 +286,7 @@ def load_non_merge_optimizer(self, model, optimizer, resume_from_checkpoint, ckp
        optimizer_path = os.path.join(resume_from_checkpoint, optimizer_name)
        master_weights_path = os.path.join(resume_from_checkpoint, master_weights_name)
        # no quantization & no master weight represent O1 AMP strategy.
-       is_amp_o1 = True if not os.path.isfile(master_weights_path) and ckpt_quant_stage == "O0" else False
+        is_amp_o1 = self.args.fp16_opt_level == "O1"
 
        model_state_dict = get_expected_state_dict(model)
        struct2static_name_mappings = {k: v.name for k, v in model_state_dict.items()}  # get optimizer param mappings
@@ -379,7 +388,7 @@ def save_unified_optimizer(self, model, optimizer, output_dir, signal_dir):
            signal_path=signal_dir,
            is_sync=is_sync_save,
            state_dict_type="optimizer_weight",
-           ckpt_quant_stage=self.args.ckpt_quant_stage,
+            ckpt_quant_stage=self.args.ckpt_quant_stage if "quant_reach_limit" not in infohub else "O0",
        )
        if master_weight_state_dict is not None:
            self.async_handler._file_save_async_or_sync(
@@ -429,10 +438,24 @@ def load_unified_optimizer(self, model, optimizer, resume_from_checkpoint):
        with open(os.path.join(resume_from_checkpoint, SAFE_OPTIMIZER_INDEX_NAME), "r") as f:
            index = json.loads(f.read())
 
+        # get quant ckpt info `ckpt_quant_stage` and `quant_ckpt_resume_times`
        ckpt_quant_stage = "O0"
        if "ckpt_quant_stage" in index:
            ckpt_quant_stage = index["ckpt_quant_stage"]
 
+        quant_ckpt_resume_times = 0
+        if "quant_ckpt_resume_times" in index:
+            quant_ckpt_resume_times = index["quant_ckpt_resume_times"]
+        # increment and save resume times in infohub
+        if ckpt_quant_stage != "O0":
+            quant_ckpt_resume_times += 1
+        infohub["quant_ckpt_resume_times"] = quant_ckpt_resume_times
+
+        # Quantization times exceeds the limit. Turn off the quantization strategy.
+        if quant_ckpt_resume_times >= MAX_QUANTIZATION_TIMES:
+            infohub["quant_reach_limit"] = True
+            logger.info("Checkpoint quantization time reach limit and will be closed.")
+
        # If not having merge optimizer, then load non-merge optimizer.
        if "weight_map" not in index:
            if self.args.data_parallel_rank == 0 or self.args.use_expert_parallel:
@@ -647,8 +670,12 @@ def unified_optimizer_into_shards(
            )
            sharded_optim_index = get_sharded_index(index_optimizer_filelist, total_optim_size_list)
 
-           if args.should_save and args.ckpt_quant_stage in ["O1", "O2"]:
-               sharded_optim_index["ckpt_quant_stage"] = args.ckpt_quant_stage
+            if args.should_save:
+                if args.ckpt_quant_stage in ["O1", "O2"] and "quant_reach_limit" not in infohub:
+                    sharded_optim_index["ckpt_quant_stage"] = args.ckpt_quant_stage
+                sharded_optim_index["quant_ckpt_resume_times"] = (
+                    infohub["quant_ckpt_resume_times"] if "quant_ckpt_resume_times" in infohub else 0
+                )
 
    if master_weights is not None:
        index_master_weight_filelist, total_master_weight_size_list = gather_sharded_object(
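
Taken together, these changes implement the checkpoint-quantization limit: each resume from a quantized checkpoint bumps a counter, and once it reaches MAX_QUANTIZATION_TIMES the optimizer state is saved without quantization (stage "O0"). A self-contained sketch of that bookkeeping, using a plain dict in place of `paddlenlp.utils.infohub`; the `update_quant_limit` helper is hypothetical, not part of the PR:

MAX_QUANTIZATION_TIMES = 1  # mirrors the constant added in paddlenlp/utils/env.py

def update_quant_limit(index, infohub):
    # Mimics the load path: read the resume counter from the optimizer index,
    # bump it if the checkpoint was quantized, and flip the limit flag.
    ckpt_quant_stage = index.get("ckpt_quant_stage", "O0")
    resume_times = index.get("quant_ckpt_resume_times", 0)
    if ckpt_quant_stage != "O0":
        resume_times += 1
    infohub["quant_ckpt_resume_times"] = resume_times
    if resume_times >= MAX_QUANTIZATION_TIMES:
        infohub["quant_reach_limit"] = True

infohub = {}
update_quant_limit({"ckpt_quant_stage": "O1", "quant_ckpt_resume_times": 0}, infohub)
# Mirrors the save path: once the limit flag is set, fall back to stage "O0".
stage_for_save = "O1" if "quant_reach_limit" not in infohub else "O0"
assert stage_for_save == "O0"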

paddlenlp/transformers/model_utils.py

Lines changed: 1 addition & 1 deletion
@@ -473,7 +473,7 @@ def load_state_dict(
        if len(scale_dict) != 0:
            if ckpt_quant_stage == "O0":
                raise ValueError('optimizer weight has quantization scales but `ckpt_quant_stage` is set to "O0"')
-           state_dict = dequant_unified_optimizer(state_dict, ckpt_quant_stage, scale_dict)
+            state_dict = dequant_unified_optimizer(state_dict, ckpt_quant_stage, scale_dict, use_pd=True)
 
    return state_dict

paddlenlp/utils/env.py

Lines changed: 1 addition & 0 deletions
@@ -123,6 +123,7 @@ def _get_bool_env(env_key: str, default_value: str) -> bool:
 SYMMETRY_QUANT_SCALE = "@scales"
 ASYMMETRY_QUANT_SCALE_MIN = "@min_scales"
 ASYMMETRY_QUANT_SCALE_MAX = "@max_scales"
+MAX_QUANTIZATION_TIMES = 1
 
 # LLM Inference related environment variables
 # Note(@Wanglongzhi2001): MAX_BSZ, SPECULATE_MAX_BSZ, MAX_DRAFT_TOKENS must be the same as definition in get_output / save_output
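
With MAX_QUANTIZATION_TIMES set to 1, the diff implies that optimizer state stays quantized only until the first resume from a quantized checkpoint; after that, the limit flag makes later saves fall back to stage "O0". For orientation only, a hypothetical optimizer index written while quantization is still active might look like this (keys come from the diff, values are made up):

import json

# Hypothetical SAFE_OPTIMIZER_INDEX_NAME payload (values are illustrative).
example_index = {
    "ckpt_quant_stage": "O1",       # quantization still active at save time
    "quant_ckpt_resume_times": 0,   # no resume from a quantized ckpt yet
}
print(json.dumps(example_index, indent=2))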

tests/llm/test_finetune.py

Lines changed: 11 additions & 0 deletions
@@ -13,11 +13,14 @@
 # limitations under the License.
 from __future__ import annotations
 
+import json
+import os
 import sys
 import unittest
 
 from parameterized import parameterized_class
 
+from paddlenlp.utils.env import SAFE_OPTIMIZER_INDEX_NAME
 from tests.parallel_launch import TestMultipleGpus
 from tests.testing_utils import argv_context_guard, load_test_config
 
@@ -92,8 +95,16 @@ def test_ckpt_quant(self):
        finetune_config["output_dir"] = self.output_dir
 
        self.runfirst(finetune_config)
+
+        # get `quant_ckpt_resume_times`
+        with open(os.path.join(self.output_dir, "checkpoint-1", SAFE_OPTIMIZER_INDEX_NAME), "r") as r:
+            index = json.loads(r.read())
+        quant_ckpt_resume_times = index["quant_ckpt_resume_times"]
+
        self.rerun(finetune_config)
 
+        self.assertEqual(quant_ckpt_resume_times, 0)
+
    def runfirst(self, train_args):
        self.run_n1c2(self.run_sft, **train_args)
