intel · n1ck-guo · Aug 4, 2025 · Aug 4, 2025 · Aug 12, 2025 · Aug 12, 2025
diff --git a/auto_round/__main__.py b/auto_round/__main__.py
@@ -40,6 +40,8 @@ def __init__(self, *args, **kwargs):
 
         self.add_argument("--eval", action="store_true", help="whether to use eval only mode")
 
+        self.add_argument("--sq", action="store_true", help="whether to use smoothquant")
+
         self.add_argument(
             "--scheme",
             default="W4A16",
@@ -470,6 +472,7 @@ def tune(args):
     autoround: BaseCompressor = AutoRound(
         model=model_name,
         scheme=scheme,
+        sq=args.sq,
         dataset=args.dataset,
         iters=args.iters,
         seqlen=args.seqlen,

diff --git a/auto_round/autoround.py b/auto_round/autoround.py
@@ -64,6 +64,7 @@ def __new__(
         model: Union[torch.nn.Module, str],
         tokenizer=None,
         scheme: Union[str, dict, QuantizationScheme] = "W4A16",
+        sq: bool = False,
         layer_config: dict[str, Union[str, dict, QuantizationScheme]] = None,
         dataset: Union[str, list, tuple, torch.utils.data.DataLoader] = "NeelNanda/pile-10k",
         iters: int = 200,
@@ -159,6 +160,7 @@ def __new__(
             model=model,
             tokenizer=tokenizer,
             scheme=scheme,
+            sq=sq,
             layer_config=layer_config,
             dataset=dataset,
             iters=iters,

diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py
@@ -130,6 +130,7 @@ def __init__(
         model: Union[torch.nn.Module, str],
         tokenizer=None,
         scheme: Union[str, dict, QuantizationScheme] = "W4A16",
+        sq: bool = False,
         layer_config: dict[str, Union[str, dict, QuantizationScheme]] = None,
         dataset: Union[str, list, tuple, torch.utils.data.DataLoader] = "NeelNanda/pile-10k",
         iters: int = 200,
@@ -385,6 +386,33 @@ def __init__(
             import habana_frameworks.torch.core as htcore  # pylint: disable=E0401
             import habana_frameworks.torch.hpu as hthpu  # pylint: disable=E0401]
 
+        # SmoothQuant pre-processing (experimental; currently for testing only).
+        if sq:
+            from auto_round.calib_dataset import get_dataloader
+
+            dataloader = get_dataloader(tokenizer, seqlen, bs=batch_size, nsamples=nsamples)
+            auto_alpha_args = {
+                "init_alpha": 0.5,
+                "alpha_min": 0.1,
+                "alpha_max": 1.0,
+                "alpha_step": 0.1,
+                "shared_criterion": "mean",
+                "n_samples": 512,  # TODO: confirm sample count (512 for CUDA, 128 for CPU?)
+                # "do_blockwise": True
+            }
+            from auto_round.smooth_quant import SmoothQuant
+
+            model = model.to(self.device)
+            sq = SmoothQuant(model, dataloader, device=model.device, group_size=-1)
+            model = sq.transform_model(
+                alpha=0.5,
+                # alpha="auto",
+                auto_alpha_args=auto_alpha_args,
+                folding=True,
+                op_types=[torch.nn.Linear, torch.nn.Conv2d],
+                calib_iter=100,
+            )
+
     def _set_device(self, device_map):
         if hasattr(self, "device") and self.device is not None:
             return

diff --git a/auto_round/smooth_quant/__init__.py b/auto_round/smooth_quant/__init__.py
@@ -0,0 +1,18 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+#
+# Copyright (c) 2024 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from auto_round.smooth_quant.sq import SmoothQuant