@@ -3110,14 +3110,29 @@ def forward(
         )  # original SAWB assumes odd number of bins when calc clip_val
         zero_point = torch.zeros_like(scale)  # centers around 0 and aligns 0
         # FIXME: fake quantize function only supports float.
-        output = torch.fake_quantize_per_channel_affine(
-            input.float(),
-            scale.float(),
-            zero_point.float(),
-            axis=0,
-            quant_min=int_l,
-            quant_max=int_u,
-        ).to(input.dtype)
+
+        if dequantize:
+            output = torch.fake_quantize_per_channel_affine(
+                input.float(),
+                scale.float(),
+                zero_point.float(),
+                axis=0,
+                quant_min=int_l,
+                quant_max=int_u,
+            ).to(input.dtype)
+        else:
+            output = (
+                torch.quantize_per_channel(
+                    input.float(),
+                    scale.float(),
+                    zero_point.float(),
+                    axis=0,
+                    dtype=torch.qint8,
+                )
+                .int_repr()
+                .clamp(int_l, int_u)
+            )
+
         return output

     @staticmethod
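The new `else` branch above swaps the float-only fake quantize for a real `torch.quantize_per_channel` followed by `int_repr()`, so callers can get raw integer codes instead of dequantized floats. A minimal sketch of why the two branches agree, assuming symmetric int4-style bounds (`int_l = -8`, `int_u = 7`) and toy per-channel scales (both assumptions, not taken from the commit):

```python
import torch

x = torch.randn(4, 16)                # quantize per channel along axis 0
scale = x.abs().amax(dim=1) / 7.0     # toy per-channel scales (assumption)
zero_point = torch.zeros_like(scale)  # symmetric: zero maps exactly to 0
int_l, int_u = -8, 7                  # int4-style bounds (assumption)

# Branch taken when dequantize=True: quantize then dequantize, staying in float.
dq = torch.fake_quantize_per_channel_affine(
    x.float(), scale.float(), zero_point.float(),
    axis=0, quant_min=int_l, quant_max=int_u,
)

# Branch taken when dequantize=False: keep the raw integer codes.
q = (
    torch.quantize_per_channel(
        x.float(), scale.float(), zero_point.long(),  # .long() here: some torch
        axis=0, dtype=torch.qint8,                    # versions reject float zps
    )
    .int_repr()
    .clamp(int_l, int_u)
)

# Rescaling the integer codes reproduces the fake-quantized floats.
assert torch.allclose(dq, q.float() * scale[:, None], atol=1e-5)
```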
@@ -3210,15 +3225,14 @@ def forward(
         ctx.mark_dirty(input)
         clip_val, clip_valn = clip_val.to(input.dtype), clip_valn.to(input.dtype)
         scale = (clip_val - clip_valn) / (2**num_bits - 1)
-        zero_point = torch.round(-clip_valn / scale).to(torch.int)
+        zero_point = torch.round(clip_valn / scale).to(torch.int)

-        output = input.clamp(clip_valn[:, None], clip_val[:, None])
-        output = torch.round(output / scale[:, None] - zero_point[:, None])
+        output = torch.round(input / scale[:, None] - zero_point[:, None])
         if dequantize:
             output = (output + zero_point[:, None]) * scale[:, None]
         else:
-            n_half = 2 ** (num_bits - 1)
-            output = (output - n_half).to(torch.int8)
+            output = output.to(torch.uint8)
+
         return output

     @staticmethod
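In the second hunk, flipping the sign makes `zero_point = round(clip_valn / scale)` a negative integer whenever `clip_valn < 0`, so `round(input / scale - zero_point)` lands directly in the unsigned range `[0, 2**num_bits - 1]` and can be stored as `uint8` without the old `n_half` shift into `int8`. A quick round-trip sketch under assumed toy clip values (per-channel min/max of the input, not taken from the commit):

```python
import torch

num_bits = 8
x = torch.randn(4, 16)
clip_val = x.amax(dim=1)   # toy per-channel upper clips (assumption)
clip_valn = x.amin(dim=1)  # toy per-channel lower clips (assumption)

scale = (clip_val - clip_valn) / (2**num_bits - 1)
zero_point = torch.round(clip_valn / scale).to(torch.int)  # negative int

# dequantize=False path: integer codes land in [0, 255], storable as uint8.
codes = torch.round(x / scale[:, None] - zero_point[:, None])
assert codes.min() >= 0 and codes.max() <= 2**num_bits - 1
stored = codes.to(torch.uint8)

# dequantize=True path: undo the shift and rescale; the round-trip error
# stays within half a quantization step per channel.
dequant = (codes + zero_point[:, None]) * scale[:, None]
assert ((dequant - x).abs() <= scale[:, None] / 2 + 1e-6).all()
```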