Commit 362d521

fix dq issue with llama3-70b on single gpu
Signed-off-by: cliu-us <[email protected]>
1 parent 8e3f16a commit 362d521

4 files changed: 148 additions & 37 deletions

fms_mo/dq.py

Lines changed: 3 additions & 3 deletions
@@ -38,7 +38,7 @@
 from fms_mo import qconfig_init, qmodel_prep
 from fms_mo.fx.utils import model_size_Wb
 from fms_mo.quant.ptq import (
-    calibration_llm_1GPU,
+    calibration_llm_1GPU_v2,
     dq_llm,
     get_act_scales,
     get_act_scales_1gpu,
@@ -224,9 +224,9 @@ def run_dq(model_args, data_args, opt_args, fms_mo_args):
     if qcfg["qmodel_calibration_new"] > 0:
         logger.info("Starting to calibrate activation clip_val")
         if qcfg["large_model"]:
-            calibration_llm_1GPU(qcfg, model, dq_dataloader)
+            calibration_llm_1GPU_v2(qcfg, model, dq_dataloader)
         else:
-            model.to("cuda:0")
+            model.to("cuda")
             pbar = tqdm(
                 dq_dataloader,
                 desc=" calibration after applying smoothq scale and before inference",

fms_mo/prep.py

Lines changed: 16 additions & 3 deletions
@@ -177,7 +177,10 @@ def make_quant_module(module, curr_full_name, qcfg, verbose=False):
     is mappable, create a Qmodule and return, otherwise, return the original module. In the future,
     Qmodules need to have a .from_torch() or .from_nn() classmethod, and then this function will be
     greatly simplified.
-    NOTE: This func will check qskip_layer_name before creating the Qmodule
+    NOTE:
+    1. This func will check qskip_layer_name before creating the Qmodule
+    2. Qmodule will be created on "meta device" as a placeholder, which will skip params init and
+       mem alloc, as weights and bias will be reassigned to module.weight/.bias right after

     Args:
         module (nn.Module): the module which Qmodule will be based on
@@ -216,7 +219,7 @@ def make_quant_module(module, curr_full_name, qcfg, verbose=False):
     if hasattr(module, "__constants__"):
         base_params = {k: getattr(module, k) for k in module.__constants__}
         base_params["bias"] = module.bias is not None
-        base_params["device"] = next(module.parameters()).device  # usually cuda
+        base_params["device"] = "meta"

     module_output = module

@@ -499,8 +502,17 @@ def q_any_net_5(model: nn.Module, qcfg: dict, verbose: bool = False):
     """
     # Third Party
     from torch.ao.quantization.utils import _parent_name
+    from tqdm import tqdm
+
+    total_modules = len(list(model.named_modules()))
+    pbar = tqdm(
+        model.named_modules(),
+        total=total_modules,
+        desc="Mapping modules to target Qmodules.",
+    )
+    for name, module in pbar:
+        pbar.set_description(f"processing {name}")

-    for name, module in model.named_modules():
         parent_module_name, curr_mod_name = _parent_name(name)
         new_module = make_quant_module(module, name, qcfg)
         parent_module = model.get_submodule(parent_module_name)
@@ -525,6 +537,7 @@ def q_any_net_5(model: nn.Module, qcfg: dict, verbose: bool = False):
         if verbose:
             logger.info(f"Swap ({name}) from {type(module)} to {type(new_module)}")

+    pbar.close()
     return model


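The "meta device" note above can be illustrated with a small, hedged sketch; a plain nn.Linear stands in for the actual Qmodule class, and the helper name is hypothetical:

    import torch.nn as nn

    def make_placeholder_linear(module: nn.Linear) -> nn.Linear:
        # Build the replacement on the "meta" device: no param init, no memory alloc.
        new_mod = nn.Linear(
            module.in_features,
            module.out_features,
            bias=module.bias is not None,
            device="meta",
        )
        # Reassign the original parameters right after, as the NOTE describes.
        new_mod.weight = module.weight
        if module.bias is not None:
            new_mod.bias = module.bias
        return new_mod

    # usage: q = make_placeholder_linear(nn.Linear(4096, 4096))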

fms_mo/quant/ptq.py

Lines changed: 110 additions & 19 deletions
@@ -419,15 +419,25 @@ class PTQHookRecInOutLMv2(nn.Module):
     leave the special handling, e.g. reshape/cat/shuffling...etc, for later
     """

-    def __init__(self, qcfg, name=None, cls2rec=(nn.Conv2d,), recInOnly=False):
+    def __init__(
+        self,
+        qcfg,
+        name=None,
+        cls2rec=(nn.Conv2d, nn.Linear),
+        recInOnly=False,
+        stop_after_rec=False,
+        cache_dev="cuda",
+    ):
         super().__init__()
         self.name = name
         self.qcfg = qcfg
         self.cls2rec = cls2rec
         self.rec_input_only = recInOnly
         self.num_valid_input = -1
+        self.stop_after_rec = stop_after_rec
+        self.cache_dev = cache_dev

-    def __call__(self, mod, inputs, output):
+    def __call__(self, mod, inputs, *args, **_kwargs):
         # make sure this module/block's ptqmode is not 'q_out'
         submods = [m for m in mod.modules() if isinstance(m, self.cls2rec)]
         if any(sm.ptqmode == "q_out" for sm in submods):
@@ -448,7 +458,7 @@ def __call__(self, mod, inputs, output):
         # check available GPU memory, cache on GPU if possible:
         GPUmem_available, _GPUmem_total = torch.cuda.mem_get_info()
         # 1 block for SQUAD/BERT 500 batches*12/batch = ~10G
-        if GPUmem_available / 1e9 > 20:
+        if self.cache_dev == "cuda" and GPUmem_available / 1e9 > 20:
             cache_device = "cuda"
         else:
             cache_device = "cpu"
@@ -461,13 +471,15 @@ def __call__(self, mod, inputs, output):
         )

         # output could be a tuple of a single tensor or simply a tensor ?
-        assert isinstance(output, (torch.Tensor, tuple))
-        if not self.rec_input_only:
+        if not self.rec_input_only and "output" in args:
+            output = args["output"]
+            assert isinstance(output, (torch.Tensor, tuple))
             self.qcfg["cached_output"].append(
                 output[0].detach().to(cache_device)
                 if isinstance(output, tuple)
                 else output.detach().to(cache_device)
             )
+        assert not self.stop_after_rec


 # this hook is meant for ptq_loss_func == 'fisher_diag' and to temp hold the "Q_out" of the module
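A hedged, self-contained sketch of the record-then-stop pattern that the new stop_after_rec and cache_dev arguments enable (RecordInputsHook is an illustrative stand-in, not the repo's PTQHookRecInOutLMv2):

    import torch
    import torch.nn as nn

    class RecordInputsHook:
        """Cache a block's inputs, then abort the forward pass once recording is done."""

        def __init__(self, cache, stop_after_rec=True, cache_dev="cpu"):
            self.cache = cache
            self.stop_after_rec = stop_after_rec
            self.cache_dev = cache_dev

        def __call__(self, mod, inputs, *args, **_kwargs):
            self.cache.append(inputs[0].detach().to(self.cache_dev))
            assert not self.stop_after_rec  # deliberately halts the rest of the forward

    # usage: run one batch and catch the deliberate AssertionError
    model = nn.Sequential(nn.Linear(8, 8), nn.Linear(8, 8))
    cache = []
    handle = model[0].register_forward_hook(RecordInputsHook(cache))
    try:
        model(torch.randn(2, 8))
    except AssertionError:
        pass  # inputs to the first block are now in `cache`
    handle.remove()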
@@ -2021,7 +2033,7 @@ def get_blocks(model, model_type=None):
         "llama": (
             "model.layers",
             "model.embed_tokens",
-            None,
+            "model.rotary_emb",
             None,
             "model.norm",
             "lm_head",
@@ -2111,20 +2123,16 @@ def cache_block0_inputs(
     model, dloader, qcfg, blocks, emb=None, emb_pos=None, emb_ln=None, dev="cpu"
 ):
     """
-    To cache the input to the first transformer block.
+    To cache the input to the first transformer block. Basically a "forward_pre_hook"
+    NOTE, change caching from tensor to list to allow varying input length, slightly
+    increase memeory due to mask and alibi.
     """
     emb = emb.to(dev)
     if emb_pos is not None:
         emb_pos.to(dev)
     if emb_ln is not None:
         emb_ln = emb_ln.to(dev)
     blocks[0] = blocks[0].to(dev)
-    # NOTE, change caching from tensor to list to allow varying input length, slightly
-    # increase memeory due to mask and alibi.
-    qcfg["cached_block0_input"] = []
-    qcfg["cache_id"] = 0
-    qcfg["cached_mask"] = []
-    qcfg["cached_alibi"] = []
     # move block0 to GPU and excuting fwd() until finish block0
     if "fms" in qcfg["model_type"]:
         qcfg["kw_to_cache"] = {
@@ -2142,9 +2150,16 @@ def cache_block0_inputs(
         }
     blocks[0] = RunModule(blocks[0], qcfg)

+    # clear up old cache, if exists.
+    qcfg["cached_block0_input"] = []
+    qcfg["cache_id"] = 0
+    for kw in qcfg["kw_to_cache"].values():
+        if kw in qcfg:
+            qcfg[kw] = []
+
     if isinstance(dloader, torch.utils.data.DataLoader):
         pbar = tqdm(
-            dloader, desc="Phase 0: PTQ caching block0 input", total=qcfg["ptq_nbatch"]
+            dloader, desc="Phase 0: Caching block0 inputs", total=qcfg["ptq_nbatch"]
         )
         for data_mb, _ in zip(pbar, range(qcfg["ptq_nbatch"])):
             try:
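The updated docstring calls this function "basically a forward_pre_hook"; a minimal sketch of that idea, assuming qcfg["kw_to_cache"] maps forward kwarg names to the qcfg list names used later (the helper name is hypothetical):

    import torch.nn as nn

    def attach_block0_cache(block0: nn.Module, qcfg: dict):
        qcfg["cached_block0_input"] = []

        def pre_hook(_mod, args, kwargs):
            # cache the incoming hidden state plus any kwargs we care about
            qcfg["cached_block0_input"].append(args[0].detach().cpu())
            for kw_org, kw_new in qcfg.get("kw_to_cache", {}).items():
                if kw_org in kwargs:
                    qcfg.setdefault(kw_new, []).append(kwargs[kw_org])
            return args, kwargs

        return block0.register_forward_pre_hook(pre_hook, with_kwargs=True)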
@@ -2310,9 +2325,8 @@ def freeze_layers(m, layer_list):

 @torch.no_grad()
 def calibration_llm_1GPU(qcfg, model, dloader):
-    """
-    calibration for large models that can not fit the whole model on 1 GPU.
-    """
+    """Calibration for large models that can not fit on 1 GPU."""
+
     model.train()
     dev = "cuda"
     qcfg["batch_size"] = 1
@@ -2365,6 +2379,83 @@ def calibration_llm_1GPU(qcfg, model, dloader):
     logger.info("All blocks are calibrated")


+@torch.no_grad()
+def calibration_llm_1GPU_v2(qcfg, model, dloader):
+    """
+    Improved version of Calibration for large language models that can not fit on 1 GPU with new
+    (built-in) calibration mechanism.
+    NOTE:
+    1. Calibration only, NO update to weights!
+    2. Rely on a alternative "pre fwd hook" to cache all possible inputs.
+    3. As calibration usually cache a small number of data only, no need to move each batch back and
+    forth between GPU and CPU.
+    """
+
+    model.train()
+    dev = "cuda"
+    qcfg["batch_size"] = 1
+    qcfg["dtype"] = next(iter(model.parameters())).dtype
+    qcfg["n_samples"] = min(qcfg["ptq_nbatch"], qcfg["qmodel_calibration_new"])
+
+    assert "model_type" in qcfg, "Unknown model type. please check before proceed."
+    assert isinstance(
+        dloader, torch.utils.data.DataLoader
+    ), "Please provide a valid dataloader."
+    # --- Phase 0 cache the inputs of the block0---
+    model.config.use_cache = False
+    blocks, emb, emb_pos, emb_ln, _, _ = get_blocks(model, qcfg["model_type"])
+
+    cache_block0_inputs(
+        model,
+        dloader,
+        qcfg,
+        blocks,
+        emb=emb,
+        emb_pos=emb_pos,
+        emb_ln=emb_ln,
+        dev="cpu",
+    )
+    logger.info("Done, caching inputs to block0 for calibration")
+
+    # --- Phase 1 --- compute blocks and last linear layer
+    pbar = tqdm(
+        blocks, desc="Phase 1: Calibration for each block", position=0, leave=True
+    )
+    qcfg["cached_input"] = [
+        inp.clone().detach().to(dev) for inp in qcfg["cached_block0_input"]
+    ]
+    kw_to_use = {
+        kw_org: kw_new
+        for kw_org, kw_new in qcfg["kw_to_cache"].items()
+        if len(qcfg[kw_new]) == len(qcfg["cached_input"])
+    }
+    for _num_block, m in enumerate(pbar):
+        m.to(dev)
+        for i in tqdm(
+            range(qcfg["n_samples"]), desc="number of samples", position=1, leave=False
+        ):
+            if qcfg["cached_alibi"]:
+                cached_inp_prev_lay = qcfg["cached_input"][i].unsqueeze(0).to(dev)
+                data_mb = {
+                    "attention_mask": qcfg["cached_mask"][i].unsqueeze(0).to(dev),
+                    "alibi": qcfg["cached_alibi"][i].unsqueeze(0).to(dev),
+                }
+            else:
+                cached_inp_prev_lay = qcfg["cached_input"][i]
+                data_mb = {
+                    kw_org: move_to(qcfg[kw_new][i], dev)
+                    for kw_org, kw_new in kw_to_use.items()
+                }
+
+            with patch_torch_bmm(qcfg):
+                qcfg["cached_input"][i] = m(cached_inp_prev_lay, **data_mb)[0]
+
+        m.cpu()
+        torch.cuda.empty_cache()
+
+    logger.info("All blocks are calibrated")
+
+
 @torch.no_grad()
 def activation_stats(name, tensor, act_scales):
     # TODO if 'QBmm' in name: reshape the tensor.
@@ -2498,8 +2589,8 @@ def get_act_scales_1gpu(model, dloader, qcfg):

     assert "model_type" in qcfg, "Unknown model type. please check before proceed."
     assert (
-        qcfg["loader_len"] == qcfg["ptq_nbatch"]
-    ), "set batch_size=1 and PTQ samples== Nbatches"
+        qcfg["loader_len"] >= qcfg["ptq_nbatch"]
+    ), "Please make sure dataloader has enough data needed for PTQ (ie. check qcfg['ptq_nbatch'])."
     # --- Phase 0 cache the inputs of the block0---
     blocks, emb, emb_pos, emb_ln, _, _ = get_blocks(model, qcfg["model_type"])
     cache_block0_inputs(
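The Phase 1 loop in calibration_llm_1GPU_v2 follows the usual block-streaming pattern; a compact, hedged sketch with generic names (no fms_mo specifics):

    import torch

    @torch.no_grad()
    def run_blocks_one_gpu(blocks, cached_inputs, dev="cuda"):
        # One decoder block resident on the GPU at a time; cached activations
        # stay on-device and become the next block's inputs.
        for block in blocks:
            block.to(dev)
            for i, hid in enumerate(cached_inputs):
                out = block(hid)
                cached_inputs[i] = out[0] if isinstance(out, tuple) else out
            block.cpu()
            torch.cuda.empty_cache()
        return cached_inputs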

fms_mo/utils/eval_utils.py

Lines changed: 19 additions & 12 deletions
@@ -26,7 +26,7 @@

 # Local
 from fms_mo.quant.ptq import cache_block0_inputs, get_blocks
-from fms_mo.utils.utils import patch_torch_bmm
+from fms_mo.utils.utils import move_to, patch_torch_bmm

 logger = logging.getLogger(__name__)

@@ -35,11 +35,13 @@
 def eval_llm_1GPU(qcfg, model, test_dataset, pre_cache_func=None, **kwargs):  # pylint: disable=unused-argument
     """
     Evaluate causal LLM with 1GPU, return perplexity
-    Note: currently taking test_dataset as dict (instead of dataloader)
-    Used for models that cannot fit into a 1 GPU.
+    Note:
+    1. currently taking test_dataset as dict (instead of dataloader)
+    2. Used for models that cannot fit into a 1 GPU. Will need to move modules back and forth.
+    3. Keep hid_state on device to reduce uncessary data transfer.
     """
     model.eval()
-    dev = "cuda:0"  # cuda:0 is used for PTQ
+    dev = "cuda"
     qcfg["batch_size"] = 1  # for dataloading, always use batch_size of 1
     qcfg["dtype"] = next(iter(model.parameters())).dtype
     seq_len = qcfg["seq_len"]
@@ -63,7 +65,14 @@ def eval_llm_1GPU(qcfg, model, test_dataset, pre_cache_func=None, **kwargs): #
     # Phase 1: compute blocks and last linear layer
     pbar = tqdm(blocks, desc="evaluation: compute blocks")

-    qcfg["cached_input"] = [inp.clone().detach() for inp in qcfg["cached_block0_input"]]
+    qcfg["cached_input"] = [
+        inp.clone().detach().to(dev) for inp in qcfg["cached_block0_input"]
+    ]
+    kw_to_use = {
+        kw_org: kw_new
+        for kw_org, kw_new in qcfg["kw_to_cache"].items()
+        if len(qcfg[kw_new]) == len(qcfg["cached_input"])
+    }
     for block_id, m in enumerate(pbar):  # pylint: disable=unused-variable
         m.to(dev)
         for i in range(qcfg["n_samples"]):
@@ -74,16 +83,14 @@ def eval_llm_1GPU(qcfg, model, test_dataset, pre_cache_func=None, **kwargs): #
                     "alibi": qcfg["cached_alibi"][i].unsqueeze(0).to(dev),
                 }
             else:
-                cached_inp_prev_lay = qcfg["cached_input"][i].to(dev)
+                cached_inp_prev_lay = qcfg["cached_input"][i]
                 data_mb = {
-                    "attention_mask": qcfg["cached_mask"][i].to(dev)
-                    if len(qcfg["cached_mask"]) > 0
-                    else None,
-                    "position_ids": qcfg["position_ids"][i].to(dev),
+                    kw_org: move_to(qcfg[kw_new][i], dev)
+                    for kw_org, kw_new in kw_to_use.items()
                 }

-            with torch.no_grad(), patch_torch_bmm(qcfg):
-                qcfg["cached_input"][i] = m(cached_inp_prev_lay, **data_mb)[0].cpu()
+            with patch_torch_bmm(qcfg):
+                qcfg["cached_input"][i] = m(cached_inp_prev_lay, **data_mb)[0]

         m.cpu()
         torch.cuda.empty_cache()
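move_to (imported above from fms_mo.utils.utils) is called here as move_to(obj, dev); for readers without the repo at hand, a minimal stand-in with the same call shape might look like the following (the real helper may cover more container types):

    import torch

    def move_to_sketch(obj, dev):
        # Recursively move tensors inside common containers to `dev`;
        # non-tensor leaves (None, ints, ...) are returned unchanged.
        if isinstance(obj, torch.Tensor):
            return obj.to(dev)
        if isinstance(obj, dict):
            return {k: move_to_sketch(v, dev) for k, v in obj.items()}
        if isinstance(obj, (list, tuple)):
            return type(obj)(move_to_sketch(v, dev) for v in obj)
        return obj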
