
Commit aecd6c4

Merge pull request #185 from ModelTC/dev_fixbug
Fix dp bugs
2 parents: 5eb0005 + 789ce18

File tree: 3 files changed, +169 additions, -155 deletions

llmc/__main__.py

Lines changed: 142 additions & 135 deletions
@@ -6,6 +6,7 @@
 import time

 import torch
+import torch.distributed as dist
 import yaml
 from easydict import EasyDict
 from loguru import logger
@@ -31,34 +32,36 @@ def main(config):
     logger.info(tokenizer)
     logger.info(model)

-    if 'eval' in config and len(config.eval.eval_pos):
-        eval_list = []
-        name_list = (
-            config.eval.name
-            if not isinstance(config.eval.name, str)
-            else [config.eval.name]
-        )
-        for name in name_list:
-            eval_config = copy.deepcopy(config.eval)
-            eval_config.name = name
-            if len(name_list) != 1:  # eval multi datasets
-                eval_config.path = os.path.join(config.eval.path, name)
+    if int(os.environ['RANK']) == 0:
+        if 'eval' in config and len(config.eval.eval_pos):
+            eval_list = []
+            name_list = (
+                config.eval.name
+                if not isinstance(config.eval.name, str)
+                else [config.eval.name]
+            )
+            for name in name_list:
+                eval_config = copy.deepcopy(config.eval)
+                eval_config.name = name
+                if len(name_list) != 1:  # eval multi datasets
+                    eval_config.path = os.path.join(config.eval.path, name)
+                if config.eval.type == 'acc':
+                    acc_eval = AccuracyEval(eval_config)
+                    eval_list.append(acc_eval)
+                else:
+                    ppl_eval = PerplexityEval(tokenizer.get_tokenizer(), eval_config)
+                    eval_list.append(ppl_eval)
+
+        if 'eval' in config and 'pretrain' in config.eval.eval_pos:
             if config.eval.type == 'acc':
-                acc_eval = AccuracyEval(eval_config)
-                eval_list.append(acc_eval)
+                for acc_eval in eval_list:
+                    acc = acc_eval.eval(model)
+                    logger.info(f'{config.eval.name} acc : {acc}')
             else:
-                ppl_eval = PerplexityEval(tokenizer.get_tokenizer(), eval_config)
-                eval_list.append(ppl_eval)
-
-    if 'eval' in config and 'pretrain' in config.eval.eval_pos:
-        if config.eval.type == 'acc':
-            for acc_eval in eval_list:
-                acc = acc_eval.eval(model)
-                logger.info(f'{config.eval.name} acc : {acc}')
-        else:
-            for ppl_eval in eval_list:
-                ppl = ppl_eval.eval(model)
-                logger.info(f'{ppl_eval.dataset} ppl : {ppl}')
+                for ppl_eval in eval_list:
+                    ppl = ppl_eval.eval(model)
+                    logger.info(f'{ppl_eval.dataset} ppl : {ppl}')
+
     if not config.get('calib', False):
         blockwise_opt = ALGO_REGISTRY[config.quant.method](
             model,
@@ -68,6 +71,7 @@ def main(config):
             config=config
         )
         blockwise_opt.run_block_loop()
+        dist.barrier()
     else:
         dataset = BaseDataset(tokenizer.get_tokenizer(), config.calib, model.batch_process)
         calib_data, padding_mask = dataset.get_calib_dataset()
@@ -93,121 +97,124 @@ def main(config):
             config
         )
         blockwise_opt.run_block_loop()
+        dist.barrier()

-        if 'eval' in config and 'transformed' in config.eval.eval_pos:
-            blockwise_opt.deploy('origin_float')
-            if config.eval.type == 'acc':
-                for acc_eval in eval_list:
-                    acc = acc_eval.eval(model)
-                    logger.info(f'{config.eval.name} acc : {acc}')
-            else:
-                for ppl_eval in eval_list:
-                    ppl = ppl_eval.eval(model)
-                    logger.info(f'{ppl_eval.dataset} ppl : {ppl}')
+        if int(os.environ['RANK']) == 0:
+            if 'eval' in config and 'transformed' in config.eval.eval_pos:
+                blockwise_opt.deploy('origin_float')
+                if config.eval.type == 'acc':
+                    for acc_eval in eval_list:
+                        acc = acc_eval.eval(model)
+                        logger.info(f'{config.eval.name} acc : {acc}')
+                else:
+                    for ppl_eval in eval_list:
+                        ppl = ppl_eval.eval(model)
+                        logger.info(f'{ppl_eval.dataset} ppl : {ppl}')

-        if 'save' in config and config.save.get('save_trans', False):
-            blockwise_opt.save_model(save_trans_path)
+            if 'save' in config and config.save.get('save_trans', False):
+                blockwise_opt.save_model(save_trans_path)

-        if 'save' in config and config.save.get('save_trtllm', False):
-            blockwise_opt.save_model(save_trtllm_trans_path)
-            from llmc.utils.export_trtllm import cvt_trtllm_engine
+            if 'save' in config and config.save.get('save_trtllm', False):
+                blockwise_opt.save_model(save_trtllm_trans_path)
+                from llmc.utils.export_trtllm import cvt_trtllm_engine

-            cvt_trtllm_engine(
-                save_trtllm_trans_path,
-                save_trtllm_engine_path,
-                config.save.get('trtllm_cfg'),
-            )
+                cvt_trtllm_engine(
+                    save_trtllm_trans_path,
+                    save_trtllm_engine_path,
+                    config.save.get('trtllm_cfg'),
+                )

-        if 'eval' in config and 'fake_quant' in config.eval.eval_pos:
-            blockwise_opt.deploy('fake_quant')
-            if config.eval.type == 'acc':
-                for acc_eval in eval_list:
-                    acc = acc_eval.eval(model)
-                    logger.info(f'{config.eval.name} acc : {acc}')
-            else:
-                for ppl_eval in eval_list:
-                    ppl = ppl_eval.eval(model)
-                    logger.info(f'{ppl_eval.dataset} ppl : {ppl}')
+            if 'eval' in config and 'fake_quant' in config.eval.eval_pos:
+                blockwise_opt.deploy('fake_quant')
+                if config.eval.type == 'acc':
+                    for acc_eval in eval_list:
+                        acc = acc_eval.eval(model)
+                        logger.info(f'{config.eval.name} acc : {acc}')
+                else:
+                    for ppl_eval in eval_list:
+                        ppl = ppl_eval.eval(model)
+                        logger.info(f'{ppl_eval.dataset} ppl : {ppl}')

-        if 'eval_token_consist' in config.eval and config.eval.eval_token_consist:
-            org_model = MODEL_REGISTRY[config.model.type](
-                config.model.path, config.model.torch_dtype
-            )
-            token_consist_eval = TokenConsistencyEval(tokenizer.get_tokenizer(),
-                                                      eval_config)
-            consistency_ratio = token_consist_eval.eval(model, org_model)
-            logger.info(f'Token consistency ratio: {consistency_ratio}')
-            del org_model
-
-        if 'save' in config and config.save.get('save_fake', False):
-            blockwise_opt.deploy('fake_quant')
-            blockwise_opt.save_model(save_fake_path)
-
-        if 'save' in config and config.save.get('save_vllm', False):
-            w, a = config.quant.weight, config.quant.get('act')
-            if isinstance(w.bit, str):
-                assert a, 'Only WA float quant is supported.'
-                assert w.symmetric and a.symmetric, 'Only symmetric quant is supported.'
-                assert w.bit == a.bit and w.bit in ['e4m3', 'e5m2'] and \
-                    a.bit in ['e4m3', 'e5m2'], 'Only WA FP8 quant is supported'
-            else:
-                assert w.symmetric, 'Only symmetric quant is supported.'
-                assert w.bit in [4, 8], 'Supported quant: w4a16, w8a16, w8a8.'
-                if a:
-                    assert a.symmetric, 'Only symmetric quant is supported.'
-                    assert a.bit == 8, 'Supported quant: w4a16, w8a16, w8a8.'
-            blockwise_opt.deploy('vllm_quant')
-            blockwise_opt.save_model(save_quant_path)
-            update_vllm_quant_config(blockwise_opt.model, config, save_quant_path)
-
-        if 'save' in config and config.save.get('save_sgl', False):
-            w, a = config.quant.weight, config.quant.get('act')
-            if isinstance(w.bit, str):
-                assert a, 'Only WA float quant is supported.'
-                assert w.symmetric and a.symmetric, 'Only symmetric quant is supported.'
-                assert w.bit == a.bit and w.bit in ['e4m3', 'e5m2'] and \
-                    a.bit in ['e4m3', 'e5m2'], 'Only WA FP8 quant is supported'
-            else:
-                assert w.symmetric, 'Only symmetric quant is supported.'
-                assert w.bit in [4, 8], 'Supported quant: w4a16, w8a16, w8a8.'
-                if a:
-                    assert a.symmetric, 'Only symmetric quant is supported.'
-                    assert a.bit == 8, 'Supported quant: w4a16, w8a16, w8a8.'
-            blockwise_opt.deploy('sgl_quant')
-            blockwise_opt.save_model(save_quant_path)
-            update_vllm_quant_config(blockwise_opt.model, config, save_quant_path)
-
-        if 'save' in config and config.save.get('save_autoawq', False):
-            assert config.quant.weight.bit in [4] and 'act' not in config.quant, \
-                'AutoAWQ supports only 4-bit weight-only quantization.'
-            assert not config.quant.weight.symmetric, 'Only asymmetric quant is supported.'
-
-            blockwise_opt.deploy('autoawq_quant')
-            blockwise_opt.save_model(save_quant_path)
-            update_autoawq_quant_config(config, save_quant_path)
-
-        if 'save' in config and config.save.get('save_mlcllm', False):
-            assert config.quant.weight.bit in [4] and 'act' not in config.quant, \
-                'MlcLLM supports only 4-bit weight-only quantization.'
-            assert not config.quant.weight.symmetric, 'Only asymmetric quant is supported.'
-
-            blockwise_opt.deploy('mlcllm_quant')
-            blockwise_opt.save_model(save_quant_path)
-            update_autoawq_quant_config(config, save_quant_path)
-
-        if 'opencompass' in config:
-            assert config.save.get('save_trans', False)
-            cfg_path = config['opencompass']['cfg_path']
-            output_path = config['opencompass']['output_path']
-            eval_model_path = os.path.abspath(save_trans_path)
-            opencompass_cmd = (
-                f'opencompass {cfg_path} -w {output_path} '
-                f'--llmc_cfg {args.config} '
-                f'--llmc_eval_mode quant '
-                f'--llmc_model_path {eval_model_path}'
-            )
-            logger.info(f'opencompass_cmd : {opencompass_cmd}')
-            os.system(opencompass_cmd)
+            if 'eval_token_consist' in config.eval and config.eval.eval_token_consist:
+                org_model = MODEL_REGISTRY[config.model.type](
+                    config.model.path, config.model.torch_dtype
+                )
+                token_consist_eval = TokenConsistencyEval(tokenizer.get_tokenizer(),
+                                                          eval_config)
+                consistency_ratio = token_consist_eval.eval(model, org_model)
+                logger.info(f'Token consistency ratio: {consistency_ratio}')
+                del org_model
+
+            if 'save' in config and config.save.get('save_fake', False):
+                blockwise_opt.deploy('fake_quant')
+                blockwise_opt.save_model(save_fake_path)
+
+            if 'save' in config and config.save.get('save_vllm', False):
+                w, a = config.quant.weight, config.quant.get('act')
+                if isinstance(w.bit, str):
+                    assert a, 'Only WA float quant is supported.'
+                    assert w.symmetric and a.symmetric, 'Only symmetric quant is supported.'
+                    assert w.bit == a.bit and w.bit in ['e4m3', 'e5m2'] and \
+                        a.bit in ['e4m3', 'e5m2'], 'Only WA FP8 quant is supported'
+                else:
+                    assert w.symmetric, 'Only symmetric quant is supported.'
+                    assert w.bit in [4, 8], 'Supported quant: w4a16, w8a16, w8a8.'
+                    if a:
+                        assert a.symmetric, 'Only symmetric quant is supported.'
+                        assert a.bit == 8, 'Supported quant: w4a16, w8a16, w8a8.'
+                blockwise_opt.deploy('vllm_quant')
+                blockwise_opt.save_model(save_quant_path)
+                update_vllm_quant_config(blockwise_opt.model, config, save_quant_path)
+
+            if 'save' in config and config.save.get('save_sgl', False):
+                w, a = config.quant.weight, config.quant.get('act')
+                if isinstance(w.bit, str):
+                    assert a, 'Only WA float quant is supported.'
+                    assert w.symmetric and a.symmetric, 'Only symmetric quant is supported.'
+                    assert w.bit == a.bit and w.bit in ['e4m3', 'e5m2'] and \
+                        a.bit in ['e4m3', 'e5m2'], 'Only WA FP8 quant is supported'
+                else:
+                    assert w.symmetric, 'Only symmetric quant is supported.'
+                    assert w.bit in [4, 8], 'Supported quant: w4a16, w8a16, w8a8.'
+                    if a:
+                        assert a.symmetric, 'Only symmetric quant is supported.'
+                        assert a.bit == 8, 'Supported quant: w4a16, w8a16, w8a8.'
+                blockwise_opt.deploy('sgl_quant')
+                blockwise_opt.save_model(save_quant_path)
+                update_vllm_quant_config(blockwise_opt.model, config, save_quant_path)
+
+            if 'save' in config and config.save.get('save_autoawq', False):
+                assert config.quant.weight.bit in [4] and 'act' not in config.quant, \
+                    'AutoAWQ supports only 4-bit weight-only quantization.'
+                assert not config.quant.weight.symmetric, 'Only asymmetric quant is supported.'
+
+                blockwise_opt.deploy('autoawq_quant')
+                blockwise_opt.save_model(save_quant_path)
+                update_autoawq_quant_config(config, save_quant_path)
+
+            if 'save' in config and config.save.get('save_mlcllm', False):
+                assert config.quant.weight.bit in [4] and 'act' not in config.quant, \
+                    'MlcLLM supports only 4-bit weight-only quantization.'
+                assert not config.quant.weight.symmetric, 'Only asymmetric quant is supported.'
+
+                blockwise_opt.deploy('mlcllm_quant')
+                blockwise_opt.save_model(save_quant_path)
+                update_autoawq_quant_config(config, save_quant_path)
+
+            if 'opencompass' in config:
+                assert config.save.get('save_trans', False)
+                cfg_path = config['opencompass']['cfg_path']
+                output_path = config['opencompass']['output_path']
+                eval_model_path = os.path.abspath(save_trans_path)
+                opencompass_cmd = (
+                    f'opencompass {cfg_path} -w {output_path} '
+                    f'--llmc_cfg {args.config} '
+                    f'--llmc_eval_mode quant '
+                    f'--llmc_model_path {eval_model_path}'
+                )
+                logger.info(f'opencompass_cmd : {opencompass_cmd}')
+                os.system(opencompass_cmd)
+        dist.barrier()


 if __name__ == '__main__':
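For reference, the hunks above rely on a common data-parallel pattern: only rank 0 performs evaluation, saving, and export, while every rank still reaches dist.barrier() so the process group stays synchronized. A minimal, self-contained sketch of that pattern, assuming the script is launched with torchrun (which sets RANK and WORLD_SIZE); the gloo backend is used only so the sketch runs without GPUs, whereas llmc itself would typically use nccl:

# Minimal sketch of the rank-0 gating plus barrier pattern shown above.
# Assumes launch via `torchrun --nproc_per_node=N sketch.py`.
import os

import torch.distributed as dist


def main():
    rank = int(os.environ['RANK'])

    # ... every rank does its share of the blockwise work here ...

    if rank == 0:
        # Side effects (evaluation, saving, exporting) run once, on rank 0,
        # instead of once per data-parallel rank.
        print('rank 0: evaluating / saving')

    # Every rank must reach the barrier; if only rank 0 called it,
    # the group would deadlock.
    dist.barrier()


if __name__ == '__main__':
    dist.init_process_group(backend='gloo')
    main()
    dist.destroy_process_group()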
@@ -270,7 +277,7 @@ def main(config):
         mkdirs(save_fake_path)

     # Synchronize all processes after directory creation
-    torch.distributed.barrier()
+    dist.barrier()

     main(config)
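The save_vllm and save_sgl branches above enforce a small set of supported export formats: symmetric integer weights at 4 or 8 bits (w4a16, w8a16, w8a8), or matching FP8 types (e4m3 / e5m2) for weights and activations. A standalone restatement of those checks, with hypothetical function and argument names rather than llmc API, in case the logic is easier to read outside the diff:

# Hypothetical standalone version of the export guardrails asserted above
# (function and argument names are illustrative, not llmc API).
def check_vllm_sgl_export(w_bit, w_symmetric, a_bit=None, a_symmetric=False):
    if isinstance(w_bit, str):
        # FP8 path: weights and activations must both be e4m3 or e5m2.
        assert a_bit is not None, 'Only WA float quant is supported.'
        assert w_symmetric and a_symmetric, 'Only symmetric quant is supported.'
        assert w_bit == a_bit and w_bit in ('e4m3', 'e5m2'), \
            'Only WA FP8 quant is supported'
    else:
        # Integer path: w4a16, w8a16 or w8a8, all symmetric.
        assert w_symmetric, 'Only symmetric quant is supported.'
        assert w_bit in (4, 8), 'Supported quant: w4a16, w8a16, w8a8.'
        if a_bit is not None:
            assert a_symmetric, 'Only symmetric quant is supported.'
            assert a_bit == 8, 'Supported quant: w4a16, w8a16, w8a8.'


check_vllm_sgl_export(4, True)                                       # w4a16
check_vllm_sgl_export(8, True, a_bit=8, a_symmetric=True)            # w8a8
check_vllm_sgl_export('e4m3', True, a_bit='e4m3', a_symmetric=True)  # WA FP8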

llmc/compression/blockwise_optimization.py

Lines changed: 13 additions & 4 deletions
@@ -47,10 +47,6 @@ def run_block_loop(self):
             os.makedirs(self.clip_path, exist_ok=True)
             torch.save(self.auto_clipper.weight_clips, os.path.join(self.clip_path, 'clips.pth'))

-    @abstractmethod
-    def block_opt(self, block):
-        pass
-
     def cache_input_hook(self, m, x, y, name, feat_dict):
         inputs = [i.detach().cpu() for i in x]
         if len(inputs) == 1:
@@ -60,3 +56,16 @@ def cache_input_hook(self, m, x, y, name, feat_dict):
             feat_dict[name].append(inp)
         else:
             feat_dict[name].append(tuple(inputs))
+
+    @abstractmethod
+    def block_opt(self, block):
+        pass
+
+    def layer_init(self, layer):
+        pass
+
+    def subset_init(self, subset):
+        pass
+
+    def block_init(self, block):
+        pass
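The net effect of the two hunks above is that block_opt stays abstract on the base class while layer_init, subset_init, and block_init gain no-op defaults there (they previously lived on the quantization subclass, see the next file), so subclasses only override the hooks they actually need. A simplified sketch of that shape, with illustrative class names rather than the real llmc ones:

# Simplified sketch of the hook layout after this change; class names are
# illustrative, not the actual llmc classes.
from abc import ABC, abstractmethod


class BlockwiseOptBase(ABC):
    @abstractmethod
    def block_opt(self, block):
        # Every concrete optimizer must implement the per-block pass.
        ...

    # Optional hooks with no-op defaults: subclasses override only if needed.
    def layer_init(self, layer):
        pass

    def subset_init(self, subset):
        pass

    def block_init(self, block):
        pass


class DummyOpt(BlockwiseOptBase):
    def block_opt(self, block):
        print(f'optimizing {block}')


DummyOpt().block_opt('block_0')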

llmc/compression/quantization/base_blockwise_quantization.py

Lines changed: 14 additions & 16 deletions
@@ -522,15 +522,6 @@ def rehook_next_subset(self, block, subset, next_subset):

         return input_feat_subset

-    def layer_init(self, layer):
-        pass
-
-    def subset_init(self, subset):
-        pass
-
-    def block_init(self, block):
-        pass
-
     def collect_layers_weights(self, layers, tensor_parallelize_style=None):
         weights = []
         for _m in layers:
@@ -566,13 +557,20 @@ def register_act_qparams(self, layers_dict, act_tensors):
         scales_list, zeros_list, qmin_list, qmax_list = (
             self.aquantizer.get_batch_tensors_qparams(act_tensors)
         )
-        for i in range(len(scales_list)):
-            scales, zeros, qmin, qmax = scales_list[i], zeros_list[i], qmin_list[i], qmax_list[i]
-            for name in layers_dict:
-                layers_dict[name].register_buffer(f'buf_act_scales_{i}', scales)
-                layers_dict[name].register_buffer(f'buf_act_zeros_{i}', zeros)
-                layers_dict[name].register_buffer(f'buf_act_qmin_{i}', qmin)
-                layers_dict[name].register_buffer(f'buf_act_qmax_{i}', qmax)
+        world_size = int(os.environ['WORLD_SIZE'])
+
+        for i, (scales, zeros, qmin, qmax) in enumerate(
+            zip(scales_list, zeros_list, qmin_list, qmax_list)
+        ):
+            scales = scales.cuda()
+            dist.all_reduce(scales, op=dist.ReduceOp.SUM)
+            scales = (scales / world_size).cpu()
+
+            for name, layer in layers_dict.items():
+                layer.register_buffer(f'buf_act_scales_{i}', scales)
+                layer.register_buffer(f'buf_act_zeros_{i}', zeros)
+                layer.register_buffer(f'buf_act_qmin_{i}', qmin)
+                layer.register_buffer(f'buf_act_qmax_{i}', qmax)

     @torch.no_grad()
     def apply_scale(self, scales, prev_op, layers):
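The rewritten loop above makes each data-parallel rank average its activation scales with the other ranks (sum via all_reduce, then divide by WORLD_SIZE) before registering them as buffers, so every rank ends up with identical quantization parameters. A standalone sketch of just that averaging step, assuming a torchrun launch; gloo and CPU tensors are used here only so it runs without GPUs, whereas the code above moves scales to CUDA first:

# Standalone sketch of the cross-rank scale averaging added above.
# Assumes `torchrun --nproc_per_node=N sketch.py`, which sets RANK and WORLD_SIZE.
import os

import torch
import torch.distributed as dist


def average_across_ranks(scales: torch.Tensor) -> torch.Tensor:
    world_size = int(os.environ['WORLD_SIZE'])
    dist.all_reduce(scales, op=dist.ReduceOp.SUM)  # in-place sum over all ranks
    return scales / world_size                     # arithmetic mean


if __name__ == '__main__':
    dist.init_process_group(backend='gloo')
    local_scales = torch.rand(4)  # stand-in for this rank's calibration scales
    avg = average_across_ranks(local_scales)
    print(f"rank {os.environ['RANK']} averaged scales: {avg}")
    dist.destroy_process_group()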
