Commit 988add0

Merge pull request #141 from chichun-charlie-liu/triton_aiu_sim

fix: feat: fix for new transformers (>4.48) and new QLinear for INT8 training with HW emulation

2 parents: b47b08f + cb1dfca

File tree: 10 files changed (+332 additions, −62 deletions)

fms_mo/calib.py (13 additions, 7 deletions)

```diff
@@ -574,13 +574,19 @@ def qmodel_calib(
                 f"Qmodel calibration (clip_val analysis) in progress: {i}/{Nbatch}"
             )
 
-    if "perCh" not in qcfg["qw_mode"]:
-        cv_sum_dict = {"layer": [], "value": []}
-        for k, v in tempmodel.state_dict().items():
-            if "clip" in k:
-                cv_sum_dict["layer"].append(k)
-                cv_sum_dict["value"].append(v.item())
-        logger.info(f"Observed clipvals: \n{ pd.DataFrame(cv_sum_dict) }")
+    cv_sum_dict = {"layer": [], "value": []}
+    for k, v in tempmodel.state_dict().items():
+        if "clip" not in k:
+            continue
+
+        if v.numel() > 1:
+            k = k + "*"
+            v = v.mean()
+        cv_sum_dict["layer"].append(k)
+        cv_sum_dict["value"].append(v.item())
+    logger.info(
+        f"Observed clipvals: ('*' if it's a vector) \n{ pd.DataFrame(cv_sum_dict) }"
+    )
 
     # Step 3: extract new clip_vals, params and buffers, then remove handles if needed
     temp_new_clipvals = {
```
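
The rewritten summary no longer skips per-channel (`perCh`) runs: a vector `clip_val` is collapsed to its mean and flagged with `*` in the logged table. A minimal sketch of the new behavior against a hypothetical state_dict:

```python
# Sketch of the new clip-value summary; the state_dict contents are made up,
# the loop body mirrors the diff above.
import pandas as pd
import torch

state_dict = {
    "layer1.quantize_weight.clip_val": torch.tensor([0.8, 1.2, 1.0]),  # perCh vector
    "layer1.quantize_feature.clip_val": torch.tensor([2.5]),           # scalar
}

cv_sum_dict = {"layer": [], "value": []}
for k, v in state_dict.items():
    if "clip" not in k:
        continue
    if v.numel() > 1:
        k = k + "*"   # flag vector clip_vals in the table
        v = v.mean()  # report a single scalar for them
    cv_sum_dict["layer"].append(k)
    cv_sum_dict["value"].append(v.item())

# Vector entry shows up as "...clip_val*" with value ~1.0, scalar as 2.5
print(pd.DataFrame(cv_sum_dict))
```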

fms_mo/dq.py (12 additions, 0 deletions)

```diff
@@ -36,6 +36,9 @@
 
 # Local
 from fms_mo import qconfig_init, qmodel_prep
+from fms_mo.custom_ext_kernels.utils import (
+    lower_qmodel_triton,  # pylint: disable=unused-import
+)
 from fms_mo.fx.utils import model_size_Wb
 from fms_mo.quant.ptq import (
     calibration_llm_1GPU_v2,
@@ -256,6 +259,15 @@ def run_dq(model_args, data_args, opt_args, fms_mo_args):
         model.save_pretrained(opt_args.output_dir, use_safetensors=True)
         tokenizer.save_pretrained(opt_args.output_dir)
 
+    if fms_mo_args.aiu_sim_triton:
+        lower_qmodel_triton(
+            model,
+            use_dyn_max_act=-1 if qcfg["qa_mode"] == "pertokenmax" else False,
+            max_acc_bits=qcfg.get("max_acc_bits", 32),
+            num_lsb_to_truncate=qcfg.get("lsb_trun_bits", 0),
+            chunk_size=qcfg.get("chunk_size", 1024),
+        )
+
     if fms_mo_args.eval_ppl:
         path_test = Path(data_args.test_data_path)
         arrow_files = list(path_test.glob("*.arrow"))
```
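
The new hook lowers the quantized model onto Triton kernels for AIU hardware emulation, driven by optional `qcfg` keys. A small sketch of the knobs the block reads (key names come from the diff; the example values are illustrative assumptions, and the `.get()` fallbacks are the defaults):

```python
# Illustrative qcfg fragment for the aiu_sim_triton path; values are assumptions.
qcfg = {
    "qa_mode": "pertokenmax",  # dynamic per-token max activation quantization
    "max_acc_bits": 24,        # emulate a narrower HW accumulator (fallback: 32)
    "lsb_trun_bits": 0,        # accumulator LSBs to truncate (fallback: 0)
    "chunk_size": 1024,        # chunked-accumulation length (fallback: 1024)
}

# -1 selects dynamic (per-token max) activation scaling in the lowered kernels;
# False keeps the static calibrated scales.
use_dyn_max_act = -1 if qcfg["qa_mode"] == "pertokenmax" else False
print(use_dyn_max_act, qcfg.get("max_acc_bits", 32))
```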

fms_mo/modules/bmm.py (1 addition, 1 deletion)

```diff
@@ -192,7 +192,7 @@ def forward(self, m1, m2):
             torch.Tensor: Output tensor after quantized bmm.
         """
         # pylint: disable = access-member-before-definition
-        if self.calib_counter:
+        if self.calib_counter > 0:
             with torch.no_grad():
                 qm1 = self.quantize_calib_m1(m1)
                 qm2 = self.quantize_calib_m2(m2)
```

fms_mo/modules/conv.py (1 addition, 1 deletion)

```diff
@@ -270,7 +270,7 @@ def forward(self, x):
             torch.Tensor: Output tensor of shape (batch_size, out_channels, out_height, out_width).
         """
         # pylint: disable = access-member-before-definition
-        if self.calib_counter:
+        if self.calib_counter > 0:
             with torch.no_grad():
                 qinput = self.quantize_calib_feature(x)
                 qweight = self.quantize_calib_weight(self.weight)
```
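
The bmm and conv modules get the same one-line guard change. A bare truthiness test and `> 0` agree when the counter hits zero, but diverge if the counter ever goes negative, e.g. if it is decremented once more after reaching zero. That overshoot scenario is an assumption used here for illustration; only the two `if` forms come from the diff:

```python
# Why `if calib_counter > 0` is safer than `if calib_counter`:
calib_counter = 1

calib_counter -= 1              # reaches 0: both checks stop calibration
assert not calib_counter and not (calib_counter > 0)

calib_counter -= 1              # hypothetical overshoot to -1
assert calib_counter            # old check: -1 is truthy, calibration wrongly re-fires
assert not (calib_counter > 0)  # new check: calibration stays off
```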

fms_mo/modules/linear.py (291 additions, 46 deletions)

Large diffs are not rendered by default.
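
Per the commit title, the bulk of this PR lives here: a new QLinear for INT8 training with HW emulation, but the diff is not rendered in this view. For orientation only, below is a generic sketch of fake-quantized INT8 linear training with a straight-through estimator and symmetric per-tensor scales. It is an assumption-laden illustration of the technique, not the QLinear added by this commit:

```python
# Generic fake-quantized INT8 linear layer for QAT (illustrative only).
import torch
import torch.nn as nn
import torch.nn.functional as F

class FakeQuantLinear(nn.Linear):
    """Linear layer that fake-quantizes weights and activations in forward."""

    @staticmethod
    def _fake_quant(x: torch.Tensor, num_bits: int = 8) -> torch.Tensor:
        # symmetric per-tensor scale from the running max
        levels = 2 ** (num_bits - 1) - 1
        scale = x.detach().abs().max().clamp(min=1e-5) / levels
        xq = (x / scale).round().clamp(-levels, levels) * scale
        # straight-through estimator: forward sees xq, backward sees x
        return x + (xq - x).detach()

    def forward(self, x):
        return F.linear(self._fake_quant(x), self._fake_quant(self.weight), self.bias)

layer = FakeQuantLinear(16, 8)
out = layer(torch.randn(2, 16))
out.sum().backward()  # gradients flow through the STE
```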

fms_mo/quant/quantizers.py (5 additions, 2 deletions)

```diff
@@ -3476,11 +3476,14 @@ def __init__(self, num_bits):
         """
         super().__init__()
         self.num_bits = num_bits
+        self.register_buffer("clip_val", torch.Tensor([0.0]))
+        self.register_buffer("clip_valn", torch.Tensor([0.0]))
 
     def forward(self, input_tensor):
-        scales = input_tensor.abs().max(dim=-1, keepdim=True)[0]
+        self.clip_val = input_tensor.abs().max(dim=-1, keepdim=True)[0]
+        self.clip_valn = -self.clip_val
         levels = 2 ** (self.num_bits - 1) - 1
-        scales.clamp_(min=1e-5).div_(levels)
+        scales = self.clip_val.clamp(min=1e-5).div(levels)
         input_tensor.div_(scales).round_().mul_(scales)
         return input_tensor
```
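
Registering `clip_val`/`clip_valn` as buffers exposes the dynamically observed per-token maxima through `state_dict()`, and switching from in-place `clamp_().div_()` to out-of-place ops keeps `clip_val` at the raw max instead of overwriting it with the derived scale. A self-contained sketch of the patched quantizer (the class name is assumed for illustration):

```python
# Per-token dynamic max quantizer, mirroring the diff above.
import torch
import torch.nn as nn

class PerTokenMaxQuantizer(nn.Module):
    def __init__(self, num_bits: int = 8):
        super().__init__()
        self.num_bits = num_bits
        self.register_buffer("clip_val", torch.Tensor([0.0]))
        self.register_buffer("clip_valn", torch.Tensor([0.0]))

    def forward(self, input_tensor):
        # per-token (last-dim) max, recorded in the buffers for later inspection
        self.clip_val = input_tensor.abs().max(dim=-1, keepdim=True)[0]
        self.clip_valn = -self.clip_val
        levels = 2 ** (self.num_bits - 1) - 1
        # out-of-place clamp/div so clip_val itself stays the raw max
        scales = self.clip_val.clamp(min=1e-5).div(levels)
        input_tensor.div_(scales).round_().mul_(scales)
        return input_tensor

q = PerTokenMaxQuantizer()
q(torch.randn(2, 4))
print(q.state_dict()["clip_val"].shape)  # torch.Size([2, 1]): one clip per token
```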

fms_mo/utils/aiu_utils.py (1 addition, 1 deletion)

```diff
@@ -309,7 +309,7 @@ def process_zero_shift(
     a_cvn = model.state_dict()[a_cvn_name]
 
     # compute "zero_shift" correction factor only for asymmetric activations
-    if a_cv and a_cvn and a_cv != -a_cvn:
+    if not (a_cv is None or a_cvn is None or torch.equal(a_cv, -a_cvn)):
         if weight_int is None:
             logger.info(
                 f"As weights appear to be not quantized, zero shift for {k} "
```

fms_mo/utils/eval_utils.py (3 additions, 1 deletion)

```diff
@@ -152,7 +152,9 @@ def evaluate(self, model, block_size=2048):
                 model.device
             )
             with torch.no_grad():
-                lm_logits = model(batch, return_dict=True).logits
+                mod_out = model(batch, return_dict=True)
+                # for newer transformers, model output could be simply a tuple
+                lm_logits = getattr(mod_out, "logits", mod_out[0])
             shift_logits = lm_logits[:, :-1, :].contiguous().float()
             shift_labels = self.dataset[:, (i * block_size) : ((i + 1) * block_size)][
                 :, 1:
```
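
The `getattr` fallback accepts both output conventions: an output object carrying a `.logits` attribute, or a plain tuple whose first element is the logits. Demonstrated with stand-in types (not actual transformers classes):

```python
# Both output styles resolve to the same logits tensor.
from collections import namedtuple
import torch

CausalLMOutput = namedtuple("CausalLMOutput", ["logits"])  # stand-in output class

logits = torch.randn(1, 4, 32)
as_object = CausalLMOutput(logits=logits)  # has a .logits attribute
as_tuple = (logits,)                       # bare tuple, no attribute

for mod_out in (as_object, as_tuple):
    lm_logits = getattr(mod_out, "logits", mod_out[0])
    assert torch.equal(lm_logits, logits)
```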

tests/models/test_model_utils.py (1 addition, 1 deletion)

```diff
@@ -232,7 +232,7 @@ def check_linear_dtypes(state_dict: dict, linear_names: list):
         if any(n in k for n in linear_names):
             if k.endswith(".weight"):
                 assert v.dtype == torch.int8
-            elif k.endswith(".zero_point"):
+            elif k.endswith(".zero_point") or k.endswith(".zero_shift"):
                 assert v.dtype == torch.float32
             else:
                 assert v.dtype == torch.float16
```

tests/triton_kernels/test_triton_mm.py (4 additions, 2 deletions)

```diff
@@ -69,8 +69,10 @@ def test_triton_matmul_fp(mkn, dtype_to_test):
         .to("cuda")
         .to(torch.float)
     )
-    tl_output_no_trun = tl_matmul(a, b).to(torch.float)
-    tl_output_trun_8b = tl_matmul(a, b, chunk_trun_bits=8).to(torch.float)
+    tl_output_no_trun = tl_matmul(a, b, truncate_then_accumulate=False).to(torch.float)
+    tl_output_trun_8b = tl_matmul(
+        a, b, chunk_trun_bits=8, truncate_then_accumulate=False
+    ).to(torch.float)
 
     diff_no_trun = torch_output - tl_output_no_trun
     diff_trun_8b = torch_output - tl_output_trun_8b
```
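
`chunk_trun_bits` sets how many accumulator LSBs are dropped per chunk, and the new `truncate_then_accumulate` flag appears to control whether that truncation is applied to each partial sum before or after it is folded into the running total (an inference from the parameter names; the kernel internals are not shown in this diff). What LSB truncation does to an integer accumulator, emulated in plain PyTorch:

```python
# Emulation of LSB truncation on integer partial sums; the real kernel
# performs this inside Triton on chunked accumulators.
import torch

def truncate_lsb(acc: torch.Tensor, trun_bits: int) -> torch.Tensor:
    # zero out the lowest `trun_bits` bits of an integer accumulator
    return (acc >> trun_bits) << trun_bits

acc = torch.tensor([1023, 4096, 70000], dtype=torch.int32)
print(truncate_lsb(acc, 8))  # tensor([  768,  4096, 69888], dtype=torch.int32)
```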
