From 2d81e135716237062ca8fe520f71acb0ec9b8a26 Mon Sep 17 00:00:00 2001
From: Wenhua Cheng
Date: Fri, 15 Aug 2025 16:06:54 +0800
Subject: [PATCH 1/2] support bf16 scale and bias

---
 auto_round/autoround.py                         | 11 +++++++++++
 auto_round/export/export_to_autoround/export.py |  3 ++-
 auto_round/wrapper.py                           |  5 +++--
 auto_round_extension/torch/qlinear_torch.py     | 10 +++++-----
 auto_round_extension/torch/qlinear_torch_zp.py  | 12 ++++++------
 5 files changed, 27 insertions(+), 14 deletions(-)

diff --git a/auto_round/autoround.py b/auto_round/autoround.py
index 854f3f460..59e1e9edb 100644
--- a/auto_round/autoround.py
+++ b/auto_round/autoround.py
@@ -604,6 +604,17 @@ def parse_format_to_list(self, format: str) -> list:
             self.scale_dtype = torch.float32
             logger.info("change `scale_dtype` to `torch.float32`")

+        if self.model.dtype!=torch.float16 and self.scale_dtype==torch.float16:
+            only_auto_round = True
+            for format_ in formats:
+                if not ("auto_round" in format_ or "fake" in format_):
+                    only_auto_round = False
+                    break
+            if only_auto_round:
+                self.scale_dtype = torch.bfloat16
+                logger.info("change `scale_dtype` to `torch.bfloat16`")
+
+
         # Adjust format settings based on compatibility
         for index in range(len(formats)):
             format = formats[index]
diff --git a/auto_round/export/export_to_autoround/export.py b/auto_round/export/export_to_autoround/export.py
index 22ffe967d..75095d63f 100644
--- a/auto_round/export/export_to_autoround/export.py
+++ b/auto_round/export/export_to_autoround/export.py
@@ -221,13 +221,14 @@ def pack_layer(layer_name, model, backend):

         if bits != 4:
             logger.error("AutoAWQ format only supports 4-bits quantization.")
+
         qlayer = QuantLinear.from_linear(
             linear=layer,
             w_bit=bits,
             group_size=group_size,
             init_only=False,
             scales=scale,
-            zeros=zp,
+            zeros=zp
         )
         qlayer.to(device)
         set_module(model, layer_name, qlayer)
diff --git a/auto_round/wrapper.py b/auto_round/wrapper.py
index 591e5c567..17e3856b0 100644
--- a/auto_round/wrapper.py
+++ b/auto_round/wrapper.py
@@ -89,8 +89,9 @@ def __init__(
             weight_global_scale = calculate_gparam(self.orig_layer.weight, self.orig_layer.group_size)
             setattr(self, "weight_global_scale", weight_global_scale)
             self.weight_global_scale = self.weight_global_scale.to(self.orig_layer.weight.device)
-        if hasattr(self.orig_layer, "scale_dtype") and self.orig_layer.scale_dtype == torch.float32:
-            self.q_scale_thresh = 1e-8
+        if hasattr(self.orig_layer, "scale_dtype") and (
+                self.orig_layer.scale_dtype == torch.float32 or self.orig_layer.scale_dtype == torch.bfloat16):
+            self.q_scale_thresh = 1e-30
         else:
             self.q_scale_thresh = 1e-5
         self._init_tuning_params_and_quant_func()
diff --git a/auto_round_extension/torch/qlinear_torch.py b/auto_round_extension/torch/qlinear_torch.py
index c45e183df..2d1a07442 100644
--- a/auto_round_extension/torch/qlinear_torch.py
+++ b/auto_round_extension/torch/qlinear_torch.py
@@ -30,7 +30,7 @@ class QuantLinear(nn.Module):

     QUANT_TYPE = "torch"

-    def __init__(self, bits, group_size, infeatures, outfeatures, bias, trainable=False, **kwargs):
+    def __init__(self, bits, group_size, infeatures, outfeatures, bias, trainable=False, weight_dtype=torch.bfloat16, **kwargs):
         super().__init__()
         if bits not in [2, 3, 4, 8]:
             raise NotImplementedError("Only 2,3,4,8 bits are supported.")
@@ -62,7 +62,7 @@ def __init__(self, bits, group_size, infeatures, outfeatures, bias, trainable=Fa
             ),
         )
         if bias:
-            self.register_buffer("bias", torch.zeros((outfeatures), dtype=torch.float16))
+            self.register_buffer("bias", torch.zeros((outfeatures), dtype=weight_dtype))
         else:
             self.bias = None

@@ -89,8 +89,8 @@ def post_init(self):
     def pack(self, linear, scales, zeros, g_idx=None):
         scales_t = scales.t().contiguous()
         if linear.bias is not None:
-            self.bias = linear.bias.clone().half()
-        self.scales = scales_t.clone().half()
+            self.bias = linear.bias.clone().to(self.bias.dtype)
+        self.scales = scales_t.clone().to(self.scales.dtype)
         device = "cpu"
         if torch.cuda.is_available():
             device = "cuda:0"
@@ -160,7 +160,7 @@ def pack(self, linear, scales, zeros, g_idx=None):

         if isinstance(zeros, torch.Tensor):
             zeros = zeros.t().contiguous()
-            zeros = zeros.numpy().astype(np.uint32)
+            zeros = zeros.to(torch.float16).numpy().astype(np.uint32)
             qzeros = torch.zeros((zeros.shape[0], zeros.shape[1] // 32 * self.bits), dtype=torch.int32)
             i = 0
             col = 0
diff --git a/auto_round_extension/torch/qlinear_torch_zp.py b/auto_round_extension/torch/qlinear_torch_zp.py
index 2958d249c..db9b9b57a 100644
--- a/auto_round_extension/torch/qlinear_torch_zp.py
+++ b/auto_round_extension/torch/qlinear_torch_zp.py
@@ -31,7 +31,7 @@ class QuantLinear(nn.Module):

     QUANT_TYPE = "torch"

-    def __init__(self, bits, group_size, infeatures, outfeatures, bias, trainable=False, **kwargs):
+    def __init__(self, bits, group_size, infeatures, outfeatures, bias, trainable=False, weight_dtype=torch.bfloat16, **kwargs):
         super().__init__()
         if bits not in [2, 3, 4, 8]:
             raise NotImplementedError("Only 2,3,4,8 bits are supported.")
@@ -59,11 +59,11 @@ def __init__(self, bits, group_size, infeatures, outfeatures, bias, trainable=Fa
             "scales",
             torch.zeros(
                 (math.ceil(infeatures / self.group_size), outfeatures),
-                dtype=torch.float16,
+                dtype=weight_dtype,
             ),
         )
         if bias:
-            self.register_buffer("bias", torch.zeros((outfeatures), dtype=torch.float16))
+            self.register_buffer("bias", torch.zeros((outfeatures), dtype=weight_dtype))
         else:
             self.bias = None

@@ -90,8 +90,8 @@ def post_init(self):
     def pack(self, linear, scales, zeros, g_idx=None):
         scales_t = scales.t().contiguous()
         if linear.bias is not None:
-            self.bias = linear.bias.clone().half()
-        self.scales = scales_t.clone().half()
+            self.bias = linear.bias.clone().to(self.bias.dtype)
+        self.scales = scales_t.clone().to(self.scales.dtype)
         device = "cpu"
         if torch.cuda.is_available():
             device = "cuda:0"
@@ -161,7 +161,7 @@ def pack(self, linear, scales, zeros, g_idx=None):

             zeros = zeros.t().contiguous()
             zeros -= 1
-            zeros = zeros.numpy().astype(np.uint32)
+            zeros = zeros.to(torch.float16).numpy().astype(np.uint32)
             qzeros = torch.zeros((zeros.shape[0], zeros.shape[1] // 32 * self.bits), dtype=torch.int32)
             i = 0
             col = 0

From aa67a4e3c23e01c3b57f1ea5d074c838b261a974 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Fri, 15 Aug 2025 08:08:37 +0000
Subject: [PATCH 2/2] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 auto_round/autoround.py                         | 3 +--
 auto_round/export/export_to_autoround/export.py | 7 +------
 auto_round/wrapper.py                           | 3 ++-
 auto_round_extension/torch/qlinear_torch.py     | 4 +++-
 auto_round_extension/torch/qlinear_torch_zp.py  | 4 +++-
 5 files changed, 10 insertions(+), 11 deletions(-)

diff --git a/auto_round/autoround.py b/auto_round/autoround.py
index 59e1e9edb..13bd41064 100644
--- a/auto_round/autoround.py
+++ b/auto_round/autoround.py
@@ -604,7 +604,7 @@ def parse_format_to_list(self, format: str) -> list:
             self.scale_dtype = torch.float32
             logger.info("change `scale_dtype` to `torch.float32`")

-        if self.model.dtype!=torch.float16 and self.scale_dtype==torch.float16:
+        if self.model.dtype != torch.float16 and self.scale_dtype == torch.float16:
             only_auto_round = True
             for format_ in formats:
                 if not ("auto_round" in format_ or "fake" in format_):
@@ -614,7 +614,6 @@ def parse_format_to_list(self, format: str) -> list:
                 self.scale_dtype = torch.bfloat16
                 logger.info("change `scale_dtype` to `torch.bfloat16`")

-
         # Adjust format settings based on compatibility
         for index in range(len(formats)):
             format = formats[index]
diff --git a/auto_round/export/export_to_autoround/export.py b/auto_round/export/export_to_autoround/export.py
index 75095d63f..e48653a91 100644
--- a/auto_round/export/export_to_autoround/export.py
+++ b/auto_round/export/export_to_autoround/export.py
@@ -223,12 +223,7 @@ def pack_layer(layer_name, model, backend):
             logger.error("AutoAWQ format only supports 4-bits quantization.")

         qlayer = QuantLinear.from_linear(
-            linear=layer,
-            w_bit=bits,
-            group_size=group_size,
-            init_only=False,
-            scales=scale,
-            zeros=zp
+            linear=layer, w_bit=bits, group_size=group_size, init_only=False, scales=scale, zeros=zp
         )
         qlayer.to(device)
         set_module(model, layer_name, qlayer)
diff --git a/auto_round/wrapper.py b/auto_round/wrapper.py
index 17e3856b0..46acf8954 100644
--- a/auto_round/wrapper.py
+++ b/auto_round/wrapper.py
@@ -90,7 +90,8 @@ def __init__(
             setattr(self, "weight_global_scale", weight_global_scale)
             self.weight_global_scale = self.weight_global_scale.to(self.orig_layer.weight.device)
         if hasattr(self.orig_layer, "scale_dtype") and (
-                self.orig_layer.scale_dtype == torch.float32 or self.orig_layer.scale_dtype == torch.bfloat16):
+            self.orig_layer.scale_dtype == torch.float32 or self.orig_layer.scale_dtype == torch.bfloat16
+        ):
             self.q_scale_thresh = 1e-30
         else:
             self.q_scale_thresh = 1e-5
diff --git a/auto_round_extension/torch/qlinear_torch.py b/auto_round_extension/torch/qlinear_torch.py
index 2d1a07442..c3b24c821 100644
--- a/auto_round_extension/torch/qlinear_torch.py
+++ b/auto_round_extension/torch/qlinear_torch.py
@@ -30,7 +30,9 @@ class QuantLinear(nn.Module):

     QUANT_TYPE = "torch"

-    def __init__(self, bits, group_size, infeatures, outfeatures, bias, trainable=False, weight_dtype=torch.bfloat16, **kwargs):
+    def __init__(
+        self, bits, group_size, infeatures, outfeatures, bias, trainable=False, weight_dtype=torch.bfloat16, **kwargs
+    ):
         super().__init__()
         if bits not in [2, 3, 4, 8]:
             raise NotImplementedError("Only 2,3,4,8 bits are supported.")
diff --git a/auto_round_extension/torch/qlinear_torch_zp.py b/auto_round_extension/torch/qlinear_torch_zp.py
index db9b9b57a..eb37c8269 100644
--- a/auto_round_extension/torch/qlinear_torch_zp.py
+++ b/auto_round_extension/torch/qlinear_torch_zp.py
@@ -31,7 +31,9 @@ class QuantLinear(nn.Module):

     QUANT_TYPE = "torch"

-    def __init__(self, bits, group_size, infeatures, outfeatures, bias, trainable=False, weight_dtype=torch.bfloat16, **kwargs):
+    def __init__(
+        self, bits, group_size, infeatures, outfeatures, bias, trainable=False, weight_dtype=torch.bfloat16, **kwargs
+    ):
         super().__init__()
         if bits not in [2, 3, 4, 8]:
             raise NotImplementedError("Only 2,3,4,8 bits are supported.")
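
Reviewer note (not part of the patches): a minimal sketch of what the new `weight_dtype` keyword enables, assuming the zero-point variant `auto_round_extension.torch.qlinear_torch_zp.QuantLinear` as modified above; the layer sizes below are made up for illustration.

    # Sketch only: construct the torch QuantLinear with bf16 scales/bias,
    # as allowed by the `weight_dtype` argument added in PATCH 1/2.
    import torch

    from auto_round_extension.torch.qlinear_torch_zp import QuantLinear

    qlinear = QuantLinear(
        bits=4,
        group_size=128,
        infeatures=4096,
        outfeatures=4096,
        bias=True,
        weight_dtype=torch.bfloat16,  # scales/bias buffers follow this dtype instead of hard-coded fp16
    )
    assert qlinear.scales.dtype == torch.bfloat16
    assert qlinear.bias.dtype == torch.bfloat16

With this in place, `pack()` casts the original layer's bias and the computed scales to the buffer dtypes via `.to(self.bias.dtype)` / `.to(self.scales.dtype)`, so a bf16 model exported only to auto_round/fake formats no longer forces fp16 scales.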