@@ -1926,14 +1926,16 @@ def forward(
         ctx.chunk_size = chunk_size
         ctx.fp8_dyn = fp8_dyn
         ctx.clamp_acc_to_dl16 = clamp_acc_to_dl16
+        ctx.fp8_e4m3_max = torch.finfo(torch.float8_e4m3fn).max
+        ctx.fp8_e5m2_max = torch.finfo(torch.float8_e5m2).max
         ctx.dl8_min = 0.0087890625

+        x_scale = torch.tensor(1.0, device=x.device, dtype=org_dtype)
+        w_scale = x_scale.clone()
         if fp8_dyn:
             # use Q/dQ simulation for now, meaning still compute in fp16/bf16
             # if choosing per_token for input, use per_channel for W
             # (W saved as [out, in], reduce inCh-dim, => reduce_dim=1)
-            ctx.fp8_e4m3_max = torch.finfo(torch.float8_e4m3fn).max
-            ctx.fp8_e5m2_max = torch.finfo(torch.float8_e5m2).max
             reduce_dim = None if fp8_dyn == "per_tensor" else 1
             x_scale = (
                 x.abs().amax(dim=reduce_dim, keepdim=True) / ctx.fp8_e4m3_max
@@ -1942,22 +1944,30 @@ def forward(
                 weight.abs().amax(dim=reduce_dim, keepdim=True) / ctx.fp8_e4m3_max
             ).clamp(min=1e-5)

-            x = (x / x_scale).to(torch.float8_e4m3fn).to(org_dtype) * x_scale
-            weight = (weight / w_scale).to(torch.float8_e4m3fn).to(org_dtype) * w_scale
+            x = (x / x_scale).to(torch.float8_e4m3fn).to(torch.float32)
+            weight = (weight / w_scale).to(torch.float8_e4m3fn).to(torch.float32)
             if clamp_acc_to_dl16:
-                # NOTE For DL8@DL8 acc in DL16, as DL8 doesn't support subnorm numbers like PyTorch
-                # (whose real min for e4m3fn is 2^-9), need to flush subnorm numbers to 0
-                x.masked_fill_(x < ctx.dl8_min, 0)
-                weight.masked_fill_(weight < ctx.dl8_min, 0)
+                # at this point, x and W are clamped to PT's FP8 range (2^-9 to 448). But since DL8
+                # doesn't support subnorms like PyTorch, need to flush subnorms to 0 BEFORE descaling
+                x.masked_fill_(x.abs() < ctx.dl8_min, 0)
+                weight.masked_fill_(weight.abs() < ctx.dl8_min, 0)

         # triton kernel assumes 2D inputs and casts the return to input.dtype
-        output = tl_matmul(
-            x,
-            weight.t().to(org_dtype),
-            chunk_trun_bits=trun_bits,
-            chunk_size=chunk_size,
-            clamp_acc_to_dl16=clamp_acc_to_dl16,
-        ).reshape(target_shape_output)
+        output = (
+            (
+                tl_matmul(
+                    x,
+                    weight.t(),
+                    chunk_trun_bits=trun_bits,
+                    chunk_size=chunk_size,
+                    clamp_acc_to_dl16=clamp_acc_to_dl16,
+                )
+                * x_scale
+                * w_scale.t()
+            )
+            .to(org_dtype)
+            .reshape(target_shape_output)
+        )

         if bias is not None:
             output = output + bias.to(org_dtype)
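
For orientation, here is a minimal self-contained sketch of the per-tensor Q/dQ pattern the new forward path follows. It is an illustrative assumption, not this repository's code: fake_fp8_linear is a made-up name, a plain @ matmul stands in for tl_matmul, and dl8_min simply reuses the constant from the diff. The point it shows is the change above: quantize to float8_e4m3fn, flush subnormals the DL8 format cannot represent, run the matmul on the still-scaled values, and apply both scales only after the accumulation.

import torch

def fake_fp8_linear(x, weight, dl8_min=0.0087890625, flush_subnorm=True):
    # hypothetical helper, per-tensor scales only; mirrors the Q/dQ steps above
    e4m3_max = torch.finfo(torch.float8_e4m3fn).max  # 448.0
    x_scale = (x.abs().amax() / e4m3_max).clamp(min=1e-5)
    w_scale = (weight.abs().amax() / e4m3_max).clamp(min=1e-5)

    # quantize to FP8 and cast back to float32, but do NOT descale yet
    xq = (x / x_scale).to(torch.float8_e4m3fn).to(torch.float32)
    wq = (weight / w_scale).to(torch.float8_e4m3fn).to(torch.float32)

    if flush_subnorm:
        # flush values below the (assumed) DL8 minimum normal, as in the diff
        xq.masked_fill_(xq.abs() < dl8_min, 0)
        wq.masked_fill_(wq.abs() < dl8_min, 0)

    # accumulate on the quantized values, then apply both scales afterwards
    return (xq @ wq.t()) * x_scale * w_scale

x = torch.randn(4, 16)
w = torch.randn(8, 16)
print(fake_fp8_linear(x, w).shape)  # torch.Size([4, 8])
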
@@ -1977,44 +1987,54 @@ def backward(ctx, grad_output):
         target_shape_grad_input = grad_output.shape[:-1] + (in_dim,)
         grad_output_2D = grad_output.reshape(-1, out_dim).to(dtype_input)

+        x_scale = torch.tensor(1.0, device=x.device, dtype=dtype_input)
+        w_scale = x_scale.clone()
         if ctx.fp8_dyn:
             reduce_dim = None if ctx.fp8_dyn == "per_tensor" else 1
             x_scale = x.abs().amax(dim=reduce_dim) / ctx.fp8_e5m2_max
             w_scale = weight.abs().amax(dim=reduce_dim) / ctx.fp8_e5m2_max
             # always assume perT in this case
             grad_out_scale = grad_output_2D.abs().amax(dim=None) / ctx.fp8_e5m2_max

-            x = (x / x_scale).to(torch.float8_e5m2).to(dtype_input) * x_scale
-            weight = (weight / w_scale).to(torch.float8_e5m2).to(weight.dtype) * w_scale
-            grad_output_2D = (grad_output_2D / grad_out_scale).to(torch.float8_e5m2).to(
-                grad_output.dtype
-            ) * grad_out_scale
+            x = (x / x_scale).to(torch.float8_e5m2).to(torch.float)
+            weight = (weight / w_scale).to(torch.float8_e5m2).to(torch.float)
+            grad_output_2D = (
+                (grad_output_2D / grad_out_scale).to(torch.float8_e5m2).to(torch.float)
+            )
             if ctx.clamp_acc_to_dl16:
                 # flush subnorm numbers to 0 as DL8 doesn't support them
-                x.masked_fill_(x < ctx.dl8_min, 0)
-                weight.masked_fill_(weight < ctx.dl8_min, 0)
-                grad_output_2D.masked_fill_(grad_output_2D < ctx.dl8_min, 0)
+                x.masked_fill_(x.abs() < ctx.dl8_min, 0)
+                weight.masked_fill_(weight.abs() < ctx.dl8_min, 0)
+                grad_output_2D.masked_fill_(grad_output_2D.abs() < ctx.dl8_min, 0)

         # Compute grad_weight, shape = [out, in]
         # NOTE: this triton kernel requires A matrix to be contiguous
-        grad_weight = tl_matmul(
-            grad_output_2D.transpose(0, 1).contiguous(),
-            x,
-            chunk_trun_bits=trun_bits,
-            chunk_size=chunk_size,
-            clamp_acc_to_dl16=ctx.clamp_acc_to_dl16,
-        ).to(weight.dtype)
-        # Compute grad_input in 2D then reshape to target shape, could be 3D or 2D
-        grad_input = (
+        grad_weight = (
             tl_matmul(
-                grad_output_2D,
-                weight.to(dtype_input),
+                grad_output_2D.transpose(0, 1).contiguous(),
+                x,
                 chunk_trun_bits=trun_bits,
                 chunk_size=chunk_size,
                 clamp_acc_to_dl16=ctx.clamp_acc_to_dl16,
             )
-            .reshape(target_shape_grad_input)
+            * grad_out_scale.t()
+            * x_scale
+        ).to(weight.dtype)
+        # Compute grad_input in 2D then reshape to target shape, could be 3D or 2D
+        grad_input = (
+            (
+                tl_matmul(
+                    grad_output_2D,
+                    weight,
+                    chunk_trun_bits=trun_bits,
+                    chunk_size=chunk_size,
+                    clamp_acc_to_dl16=ctx.clamp_acc_to_dl16,
+                )
+                * grad_out_scale
+                * w_scale
+            )
             .to(dtype_input)
+            .reshape(target_shape_grad_input)
         )

         if not ctx.has_bias:
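
A matching sketch for the backward math under the same assumptions (per-tensor float8_e5m2 Q/dQ, plain matmuls standing in for tl_matmul, hypothetical function name): for y = x @ W.t(), grad_weight = grad_output.t() @ x and grad_input = grad_output @ W, with the scales again applied only after each accumulation, as in the updated backward above.

import torch

def fake_fp8_linear_backward(grad_output, x, weight):
    # hypothetical helper: per-tensor float8_e5m2 Q/dQ for all three operands
    e5m2_max = torch.finfo(torch.float8_e5m2).max  # 57344.0
    g_scale = (grad_output.abs().amax() / e5m2_max).clamp(min=1e-12)
    x_scale = (x.abs().amax() / e5m2_max).clamp(min=1e-12)
    w_scale = (weight.abs().amax() / e5m2_max).clamp(min=1e-12)

    gq = (grad_output / g_scale).to(torch.float8_e5m2).to(torch.float32)
    xq = (x / x_scale).to(torch.float8_e5m2).to(torch.float32)
    wq = (weight / w_scale).to(torch.float8_e5m2).to(torch.float32)

    # grad_weight is [out, in]; grad_input matches x; descale after each matmul
    grad_weight = (gq.t() @ xq) * g_scale * x_scale
    grad_input = (gq @ wq) * g_scale * w_scale
    return grad_input, grad_weight
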