Commit 049df45

fix 2 bugs in aiu_save funcs related to vector clipvals and zero_shift being fp32
Signed-off-by: cliu-us <[email protected]>
1 parent 7417662 commit 049df45

3 files changed (+15, -9 lines)

fms_mo/calib.py

Lines changed: 13 additions & 7 deletions
@@ -574,13 +574,19 @@ def qmodel_calib(
                 f"Qmodel calibration (clip_val analysis) in progress: {i}/{Nbatch}"
             )

-        if "perCh" not in qcfg["qw_mode"]:
-            cv_sum_dict = {"layer": [], "value": []}
-            for k, v in tempmodel.state_dict().items():
-                if "clip" in k:
-                    cv_sum_dict["layer"].append(k)
-                    cv_sum_dict["value"].append(v.item())
-            logger.info(f"Observed clipvals: \n{ pd.DataFrame(cv_sum_dict) }")
+        cv_sum_dict = {"layer": [], "value": []}
+        for k, v in tempmodel.state_dict().items():
+            if "clip" not in k:
+                continue
+
+            if v.numel() > 1:
+                k = k + "*"
+                v = v.mean()
+            cv_sum_dict["layer"].append(k)
+            cv_sum_dict["value"].append(v.item())
+        logger.info(
+            f"Observed clipvals: ('*' if it's a vector) \n{ pd.DataFrame(cv_sum_dict) }"
+        )

     # Step 3: extract new clip_vals, params and buffers, then remove handles if needed
     temp_new_clipvals = {
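Why this change: with per-channel ("perCh") weight quantizers the clip values are vectors, and calling .item() on a multi-element tensor raises a RuntimeError, so the old code skipped the summary entirely in that case. The new code summarizes every clip value, reducing vectors to their mean and flagging them with a trailing '*'. A minimal sketch of the new logic against a toy state dict (the key names and tensor values below are made up for illustration; tempmodel and logger from the original are replaced with stand-ins):

    import pandas as pd
    import torch

    # Toy stand-in for tempmodel.state_dict(): one scalar clip value, one
    # per-channel (vector) clip value, plus a non-clip entry that is skipped.
    state_dict = {
        "layer1.quantize_weight.clip_val": torch.tensor(2.5),
        "layer2.quantize_weight.clip_val": torch.tensor([1.0, 2.0, 3.0]),
        "layer1.weight": torch.randn(4, 4),
    }

    cv_sum_dict = {"layer": [], "value": []}
    for k, v in state_dict.items():
        if "clip" not in k:
            continue

        if v.numel() > 1:
            k = k + "*"   # flag vector clip values with '*'
            v = v.mean()  # reduce to the mean so .item() is well defined
        cv_sum_dict["layer"].append(k)
        cv_sum_dict["value"].append(v.item())

    print(pd.DataFrame(cv_sum_dict))
    # Roughly:
    #                               layer  value
    # 0   layer1.quantize_weight.clip_val    2.5
    # 1  layer2.quantize_weight.clip_val*    2.0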

fms_mo/utils/aiu_utils.py

Lines changed: 1 addition & 1 deletion
@@ -288,7 +288,7 @@ def process_zero_shift(
         a_cvn = model.state_dict()[a_cvn_name]

         # compute "zero_shift" correction factor only for asymmetric activations
-        if a_cv and a_cvn and a_cv != -a_cvn:
+        if a_cv is not None and a_cvn is not None and torch.equal(a_cv, -a_cvn):
             if weight_int is None:
                 logger.info(
                     f"As weights appear to be not quantized, zero shift for {k} "

tests/models/test_model_utils.py

Lines changed: 1 addition & 1 deletion
@@ -232,7 +232,7 @@ def check_linear_dtypes(state_dict: dict, linear_names: list):
         if any(n in k for n in linear_names):
             if k.endswith(".weight"):
                 assert v.dtype == torch.int8
-            elif k.endswith(".zero_point"):
+            elif k.endswith(".zero_point") or k.endswith(".zero_shift"):
                 assert v.dtype == torch.float32
             else:
                 assert v.dtype == torch.float16
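The test helper now also requires .zero_shift entries, like .zero_point, to be saved as fp32, matching the second bug named in the commit message. A condensed, self-contained version of the updated helper with a hypothetical state dict that satisfies it (the key names here are illustrative, not from the repo's fixtures):

    import torch

    def check_linear_dtypes(state_dict: dict, linear_names: list):
        # Weights must be int8; zero_point/zero_shift must stay fp32;
        # everything else belonging to the linear layers must be fp16.
        for k, v in state_dict.items():
            if any(n in k for n in linear_names):
                if k.endswith(".weight"):
                    assert v.dtype == torch.int8
                elif k.endswith(".zero_point") or k.endswith(".zero_shift"):
                    assert v.dtype == torch.float32
                else:
                    assert v.dtype == torch.float16

    # Hypothetical saved state dict for one quantized linear layer.
    state_dict = {
        "model.dense.weight": torch.zeros(4, 4, dtype=torch.int8),
        "model.dense.zero_point": torch.zeros(4, dtype=torch.float32),
        "model.dense.zero_shift": torch.zeros(4, dtype=torch.float32),
        "model.dense.bias": torch.zeros(4, dtype=torch.float16),
    }
    check_linear_dtypes(state_dict, linear_names=["dense"])  # passes silently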
