From 2d81e135716237062ca8fe520f71acb0ec9b8a26 Mon Sep 17 00:00:00 2001
From: Wenhua Cheng
Date: Fri, 15 Aug 2025 16:06:54 +0800
Subject: [PATCH 1/2] support bf16 scale and bias

---
 auto_round/autoround.py                         | 11 +++++++++++
 auto_round/export/export_to_autoround/export.py |  3 ++-
 auto_round/wrapper.py                           |  5 +++--
 auto_round_extension/torch/qlinear_torch.py     | 10 +++++-----
 auto_round_extension/torch/qlinear_torch_zp.py  | 12 ++++++------
 5 files changed, 27 insertions(+), 14 deletions(-)

diff --git a/auto_round/autoround.py b/auto_round/autoround.py
index 854f3f460..59e1e9edb 100644
--- a/auto_round/autoround.py
+++ b/auto_round/autoround.py
@@ -604,6 +604,17 @@ def parse_format_to_list(self, format: str) -> list:
             self.scale_dtype = torch.float32
             logger.info("change `scale_dtype` to `torch.float32`")

+        if self.model.dtype!=torch.float16 and self.scale_dtype==torch.float16:
+            only_auto_round = True
+            for format_ in formats:
+                if not ("auto_round" in format_ or "fake" in format_):
+                    only_auto_round = False
+                    break
+            if only_auto_round:
+                self.scale_dtype = torch.bfloat16
+                logger.info("change `scale_dtype` to `torch.bfloat16`")
+
+
         # Adjust format settings based on compatibility
         for index in range(len(formats)):
             format = formats[index]
diff --git a/auto_round/export/export_to_autoround/export.py b/auto_round/export/export_to_autoround/export.py
index 22ffe967d..75095d63f 100644
--- a/auto_round/export/export_to_autoround/export.py
+++ b/auto_round/export/export_to_autoround/export.py
@@ -221,13 +221,14 @@ def pack_layer(layer_name, model, backend):

         if bits != 4:
             logger.error("AutoAWQ format only supports 4-bits quantization.")
+
         qlayer = QuantLinear.from_linear(
             linear=layer,
             w_bit=bits,
             group_size=group_size,
             init_only=False,
             scales=scale,
-            zeros=zp,
+            zeros=zp
         )
         qlayer.to(device)
         set_module(model, layer_name, qlayer)
diff --git a/auto_round/wrapper.py b/auto_round/wrapper.py
index 591e5c567..17e3856b0 100644
--- a/auto_round/wrapper.py
+++ b/auto_round/wrapper.py
@@ -89,8 +89,9 @@ def __init__(
             weight_global_scale = calculate_gparam(self.orig_layer.weight, self.orig_layer.group_size)
             setattr(self, "weight_global_scale", weight_global_scale)
             self.weight_global_scale = self.weight_global_scale.to(self.orig_layer.weight.device)
-        if hasattr(self.orig_layer, "scale_dtype") and self.orig_layer.scale_dtype == torch.float32:
-            self.q_scale_thresh = 1e-8
+        if hasattr(self.orig_layer, "scale_dtype") and (
+                self.orig_layer.scale_dtype == torch.float32 or self.orig_layer.scale_dtype == torch.bfloat16):
+            self.q_scale_thresh = 1e-30
         else:
             self.q_scale_thresh = 1e-5
         self._init_tuning_params_and_quant_func()
diff --git a/auto_round_extension/torch/qlinear_torch.py b/auto_round_extension/torch/qlinear_torch.py
index c45e183df..2d1a07442 100644
--- a/auto_round_extension/torch/qlinear_torch.py
+++ b/auto_round_extension/torch/qlinear_torch.py
@@ -30,7 +30,7 @@ class QuantLinear(nn.Module):

     QUANT_TYPE = "torch"

-    def __init__(self, bits, group_size, infeatures, outfeatures, bias, trainable=False, **kwargs):
+    def __init__(self, bits, group_size, infeatures, outfeatures, bias, trainable=False, weight_dtype=torch.bfloat16, **kwargs):
         super().__init__()
         if bits not in [2, 3, 4, 8]:
             raise NotImplementedError("Only 2,3,4,8 bits are supported.")
@@ -62,7 +62,7 @@ def __init__(self, bits, group_size, infeatures, outfeatures, bias, trainable=Fa
             ),
         )
         if bias:
-            self.register_buffer("bias", torch.zeros((outfeatures), dtype=torch.float16))
+            self.register_buffer("bias", torch.zeros((outfeatures), dtype=weight_dtype))
         else:
             self.bias = None

@@ -89,8 +89,8 @@ def post_init(self):
     def pack(self, linear, scales, zeros, g_idx=None):
         scales_t = scales.t().contiguous()
         if linear.bias is not None:
-            self.bias = linear.bias.clone().half()
-        self.scales = scales_t.clone().half()
+            self.bias = linear.bias.clone().to(self.bias.dtype)
+        self.scales = scales_t.clone().to(self.scales.dtype)
         device = "cpu"
         if torch.cuda.is_available():
             device = "cuda:0"
@@ -160,7 +160,7 @@ def pack(self, linear, scales, zeros, g_idx=None):

         if isinstance(zeros, torch.Tensor):
             zeros = zeros.t().contiguous()
-            zeros = zeros.numpy().astype(np.uint32)
+            zeros = zeros.to(torch.float16).numpy().astype(np.uint32)
             qzeros = torch.zeros((zeros.shape[0], zeros.shape[1] // 32 * self.bits), dtype=torch.int32)
             i = 0
             col = 0
diff --git a/auto_round_extension/torch/qlinear_torch_zp.py b/auto_round_extension/torch/qlinear_torch_zp.py
index 2958d249c..db9b9b57a 100644
--- a/auto_round_extension/torch/qlinear_torch_zp.py
+++ b/auto_round_extension/torch/qlinear_torch_zp.py
@@ -31,7 +31,7 @@ class QuantLinear(nn.Module):

     QUANT_TYPE = "torch"

-    def __init__(self, bits, group_size, infeatures, outfeatures, bias, trainable=False, **kwargs):
+    def __init__(self, bits, group_size, infeatures, outfeatures, bias, trainable=False, weight_dtype=torch.bfloat16, **kwargs):
         super().__init__()
         if bits not in [2, 3, 4, 8]:
             raise NotImplementedError("Only 2,3,4,8 bits are supported.")
@@ -59,11 +59,11 @@ def __init__(self, bits, group_size, infeatures, outfeatures, bias, trainable=Fa
             "scales",
             torch.zeros(
                 (math.ceil(infeatures / self.group_size), outfeatures),
-                dtype=torch.float16,
+                dtype=weight_dtype,
             ),
         )
         if bias:
-            self.register_buffer("bias", torch.zeros((outfeatures), dtype=torch.float16))
+            self.register_buffer("bias", torch.zeros((outfeatures), dtype=weight_dtype))
         else:
             self.bias = None

@@ -90,8 +90,8 @@ def post_init(self):
     def pack(self, linear, scales, zeros, g_idx=None):
         scales_t = scales.t().contiguous()
         if linear.bias is not None:
-            self.bias = linear.bias.clone().half()
-        self.scales = scales_t.clone().half()
+            self.bias = linear.bias.clone().to(self.bias.dtype)
+        self.scales = scales_t.clone().to(self.scales.dtype)
         device = "cpu"
         if torch.cuda.is_available():
             device = "cuda:0"
@@ -161,7 +161,7 @@ def pack(self, linear, scales, zeros, g_idx=None):

             zeros = zeros.t().contiguous()
             zeros -= 1
-            zeros = zeros.numpy().astype(np.uint32)
+            zeros = zeros.to(torch.float16).numpy().astype(np.uint32)
             qzeros = torch.zeros((zeros.shape[0], zeros.shape[1] // 32 * self.bits), dtype=torch.int32)
             i = 0
             col = 0

From aa67a4e3c23e01c3b57f1ea5d074c838b261a974 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Fri, 15 Aug 2025 08:08:37 +0000
Subject: [PATCH 2/2] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 auto_round/autoround.py                         | 3 +--
 auto_round/export/export_to_autoround/export.py | 7 +------
 auto_round/wrapper.py                           | 3 ++-
 auto_round_extension/torch/qlinear_torch.py     | 4 +++-
 auto_round_extension/torch/qlinear_torch_zp.py  | 4 +++-
 5 files changed, 10 insertions(+), 11 deletions(-)

diff --git a/auto_round/autoround.py b/auto_round/autoround.py
index 59e1e9edb..13bd41064 100644
--- a/auto_round/autoround.py
+++ b/auto_round/autoround.py
@@ -604,7 +604,7 @@ def parse_format_to_list(self, format: str) -> list:
             self.scale_dtype = torch.float32
             logger.info("change `scale_dtype` to `torch.float32`")

-        if self.model.dtype!=torch.float16 and self.scale_dtype==torch.float16:
+        if self.model.dtype != torch.float16 and self.scale_dtype == torch.float16:
             only_auto_round = True
             for format_ in formats:
                 if not ("auto_round" in format_ or "fake" in format_):
@@ -614,7 +614,6 @@ def parse_format_to_list(self, format: str) -> list:
                 self.scale_dtype = torch.bfloat16
                 logger.info("change `scale_dtype` to `torch.bfloat16`")

-
         # Adjust format settings based on compatibility
         for index in range(len(formats)):
             format = formats[index]
diff --git a/auto_round/export/export_to_autoround/export.py b/auto_round/export/export_to_autoround/export.py
index 75095d63f..e48653a91 100644
--- a/auto_round/export/export_to_autoround/export.py
+++ b/auto_round/export/export_to_autoround/export.py
@@ -223,12 +223,7 @@ def pack_layer(layer_name, model, backend):
             logger.error("AutoAWQ format only supports 4-bits quantization.")

         qlayer = QuantLinear.from_linear(
-            linear=layer,
-            w_bit=bits,
-            group_size=group_size,
-            init_only=False,
-            scales=scale,
-            zeros=zp
+            linear=layer, w_bit=bits, group_size=group_size, init_only=False, scales=scale, zeros=zp
         )
         qlayer.to(device)
         set_module(model, layer_name, qlayer)
diff --git a/auto_round/wrapper.py b/auto_round/wrapper.py
index 17e3856b0..46acf8954 100644
--- a/auto_round/wrapper.py
+++ b/auto_round/wrapper.py
@@ -90,7 +90,8 @@ def __init__(
             setattr(self, "weight_global_scale", weight_global_scale)
             self.weight_global_scale = self.weight_global_scale.to(self.orig_layer.weight.device)
         if hasattr(self.orig_layer, "scale_dtype") and (
-                self.orig_layer.scale_dtype == torch.float32 or self.orig_layer.scale_dtype == torch.bfloat16):
+            self.orig_layer.scale_dtype == torch.float32 or self.orig_layer.scale_dtype == torch.bfloat16
+        ):
             self.q_scale_thresh = 1e-30
         else:
             self.q_scale_thresh = 1e-5
diff --git a/auto_round_extension/torch/qlinear_torch.py b/auto_round_extension/torch/qlinear_torch.py
index 2d1a07442..c3b24c821 100644
--- a/auto_round_extension/torch/qlinear_torch.py
+++ b/auto_round_extension/torch/qlinear_torch.py
@@ -30,7 +30,9 @@ class QuantLinear(nn.Module):

     QUANT_TYPE = "torch"

-    def __init__(self, bits, group_size, infeatures, outfeatures, bias, trainable=False, weight_dtype=torch.bfloat16, **kwargs):
+    def __init__(
+        self, bits, group_size, infeatures, outfeatures, bias, trainable=False, weight_dtype=torch.bfloat16, **kwargs
+    ):
         super().__init__()
         if bits not in [2, 3, 4, 8]:
             raise NotImplementedError("Only 2,3,4,8 bits are supported.")
diff --git a/auto_round_extension/torch/qlinear_torch_zp.py b/auto_round_extension/torch/qlinear_torch_zp.py
index db9b9b57a..eb37c8269 100644
--- a/auto_round_extension/torch/qlinear_torch_zp.py
+++ b/auto_round_extension/torch/qlinear_torch_zp.py
@@ -31,7 +31,9 @@ class QuantLinear(nn.Module):

     QUANT_TYPE = "torch"

-    def __init__(self, bits, group_size, infeatures, outfeatures, bias, trainable=False, weight_dtype=torch.bfloat16, **kwargs):
+    def __init__(
+        self, bits, group_size, infeatures, outfeatures, bias, trainable=False, weight_dtype=torch.bfloat16, **kwargs
+    ):
         super().__init__()
         if bits not in [2, 3, 4, 8]:
             raise NotImplementedError("Only 2,3,4,8 bits are supported.")
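
Reviewer note (not part of the patches): a minimal sketch of what the new `weight_dtype` keyword enables, assuming the zero-point variant `auto_round_extension.torch.qlinear_torch_zp.QuantLinear` as modified above; the layer sizes below are made up for illustration.

    # Sketch only: construct the torch QuantLinear with bf16 scales/bias,
    # as allowed by the `weight_dtype` argument added in PATCH 1/2.
    import torch

    from auto_round_extension.torch.qlinear_torch_zp import QuantLinear

    qlinear = QuantLinear(
        bits=4,
        group_size=128,
        infeatures=4096,
        outfeatures=4096,
        bias=True,
        weight_dtype=torch.bfloat16,  # scales/bias buffers follow this dtype instead of hard-coded fp16
    )
    assert qlinear.scales.dtype == torch.bfloat16
    assert qlinear.bias.dtype == torch.bfloat16

With this in place, `pack()` casts the original layer's bias and the computed scales to the buffer dtypes via `.to(self.bias.dtype)` / `.to(self.scales.dtype)`, so a bf16 model exported only to auto_round/fake formats no longer forces fp16 scales.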