Skip to content
Open
Show file tree
Hide file tree
Changes from 5 commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
c482371
Make SmoothQuant more General
namgyu-youn Aug 11, 2025
e16edc2
refactor: use predefined ToyLinearModel
namgyu-youn Aug 11, 2025
5ec0dcf
fix incorrect parameters
namgyu-youn Aug 12, 2025
2475ad1
add type hint for dataclass
namgyu-youn Aug 12, 2025
ccb7b84
Merge branch 'main' into refactor-smoothquant
namgyu-youn Aug 15, 2025
ba89d03
use Quantization API for more generalized SmoothQuant API
namgyu-youn Aug 16, 2025
a6df6af
add PREPARE_FOR_LOADING mode for loading quantized weight
namgyu-youn Aug 19, 2025
0fc6539
update example and doc for updated SmoothQuant API
namgyu-youn Aug 19, 2025
46d5d31
remove overused/misunderstood parameters
namgyu-youn Aug 20, 2025
fc8ae4d
remove unused variable from SmoothQuant
namgyu-youn Aug 21, 2025
4f7def9
update SmoothQuant docs for user guide
namgyu-youn Aug 23, 2025
0bd922b
add benchmark comparison: base vs smoothquant
namgyu-youn Aug 26, 2025
3552a05
add benchmark: w4a8-dynamic
namgyu-youn Aug 28, 2025
4c2d55c
update docs for a4w8 benchmark
namgyu-youn Aug 28, 2025
ef9ab2c
replace Sec/Tokens with Tokens/Sec for metrics
namgyu-youn Aug 28, 2025
0d73344
update docs for SmoothQuant experiment
namgyu-youn Aug 28, 2025
6232f77
fix typo in README
namgyu-youn Aug 28, 2025
1802b5e
rename parser: repo to model
namgyu-youn Aug 28, 2025
0da2189
fix incorrect id: w4a8 -> w8a8
namgyu-youn Aug 28, 2025
7bfcf9c
remove args: precision dtype, `torch.compile`
namgyu-youn Aug 28, 2025
e571d5e
rename: precision -> precision dtype in benchmark table
namgyu-youn Aug 29, 2025
0a9ef9e
add args: bias
namgyu-youn Aug 29, 2025
62d986f
fix typo: W4A8 -> W8A8
namgyu-youn Aug 29, 2025
3f8d4ca
fix ci after adding is_bias args
namgyu-youn Aug 29, 2025
65698af
remove dead annotations in args: smoothing_factor
namgyu-youn Aug 29, 2025
d06c7ba
remove torch.compile from unittests
namgyu-youn Aug 30, 2025
ba58e89
refactor: use ToyLinearModel in AWQ
namgyu-youn Aug 30, 2025
b92caac
remove unused test case: dtype, alpha
namgyu-youn Aug 31, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
126 changes: 68 additions & 58 deletions test/prototype/test_smoothquant.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
#
# This source code is licensed under the BSD 3-Clause license found in the
# LICENSE file in the root directory of this source tree.
import tempfile
import unittest
from copy import deepcopy

Expand All @@ -13,39 +12,13 @@
from torchao.prototype.smoothquant import (
SmoothQuantConfig,
SmoothQuantObservedLinear,
insert_smooth_quant_observer_,
load_smooth_quant_recipe,
save_smooth_quant_recipe,
)
from torchao.quantization import quantize_
from torchao.quantization.utils import (
dequantize_per_channel,
dynamically_quantize_per_channel,
)


class ToyLinearModel(torch.nn.Module):
    """Three bias-free linear layers applied back to back.

    The layer widths are ``m -> n -> k -> 1``.

    Args:
        m: Input feature size of the first layer.
        n: Output size of the first layer / input size of the second.
        k: Output size of the second layer / input size of the third.
    """

    def __init__(self, m=512, n=256, k=128):
        super().__init__()
        self.linear1 = torch.nn.Linear(m, n, bias=False)
        self.linear2 = torch.nn.Linear(n, k, bias=False)
        self.linear3 = torch.nn.Linear(k, 1, bias=False)

    def example_inputs(
        self, batch_size, sequence_length=10, dtype=torch.bfloat16, device="cuda"
    ):
        """Build a list of random input tensors sized for ``linear1``.

        Args:
            batch_size: Number of tensors to generate.
            sequence_length: Size of the middle dimension of each tensor.
            dtype: Data type of the generated tensors.
            device: Device on which the tensors are created.

        Returns:
            A list of ``batch_size`` tensors, each of shape
            ``(1, sequence_length, linear1.in_features)``.
        """
        sample_shape = (1, sequence_length, self.linear1.in_features)
        # The loop index is not needed; each sample is an independent draw.
        return [
            torch.randn(*sample_shape, dtype=dtype, device=device)
            for _ in range(batch_size)
        ]

    def forward(self, x):
        """Apply the three linear layers in order and return the result."""
        x = self.linear1(x)
        x = self.linear2(x)
        x = self.linear3(x)
        return x
from torchao.testing.model_architectures import ToyLinearModel


@unittest.skipIf(torch.version.hip is not None, "Skipping tests in ROCm")
Expand Down Expand Up @@ -82,14 +55,15 @@ def forward(self, x):
test_data = torch.randn(2, 32, dtype=input_dtype, device=device)

# Step 1: Setup quantized model with observer insertion and calibration
insert_smooth_quant_observer_(m, alpha, quant_mode)
config = SmoothQuantConfig(step="prepare", alpha=alpha, quant_mode=quant_mode)
quantize_(m, config)

# Perform calibration with test data
m(test_data)

# Apply quantization configuration
is_observed_linear = lambda m, fqn: isinstance(m, SmoothQuantObservedLinear)
quantize_(m, SmoothQuantConfig(), is_observed_linear)
config.step = "convert"
quantize_(m, config)

# Apply compilation if supported
m = torch.compile(m, fullgraph=True)
Expand Down Expand Up @@ -169,43 +143,82 @@ def forward(self, x):
f"device={device}, dtype={input_dtype}",
)

def test_observer_insertion(self):
    """Verify the module swap performed by the prepare and convert steps.

    A single-layer model starts with a plain ``torch.nn.Linear``. After
    ``quantize_`` with ``step="prepare"`` the layer must be a
    ``SmoothQuantObservedLinear`` exposing an ``obs`` attribute. After one
    forward pass on random data followed by ``step="convert"`` it must be
    a ``torch.nn.Linear`` that is no longer the observed variant.
    """

    class SimpleLinear(torch.nn.Module):
        """Minimal wrapper around one 32x32 linear layer."""

        def __init__(self, bias: bool):
            super().__init__()
            self.fc = torch.nn.Linear(32, 32, bias)

        def forward(self, x):
            return self.fc(x)

    def assert_plain_linear(layer):
        # The same pair of checks applies before prepare and after convert.
        self.assertIsInstance(layer, torch.nn.Linear)
        self.assertNotIsInstance(layer, SmoothQuantObservedLinear)

    model = SimpleLinear(True).eval()

    # Untouched model: ordinary linear layer.
    assert_plain_linear(model.fc)

    # Prepare: the linear layer is expected to be replaced by the observed one.
    smooth_cfg = SmoothQuantConfig(step="prepare", alpha=0.5, quant_mode="dynamic")
    quantize_(model, smooth_cfg)

    observed = model.fc
    self.assertIsInstance(observed, SmoothQuantObservedLinear)
    self.assertTrue(hasattr(observed, "obs"))

    # Run one batch through the observed model as calibration.
    calibration_batch = torch.randn(2, 32)
    model(calibration_batch)

    # Convert: reuse the same config object with only the step switched.
    smooth_cfg.step = "convert"
    quantize_(model, smooth_cfg)

    # Converted model: back to a linear layer that is not the observed type.
    assert_plain_linear(model.fc)

@unittest.skip("This test is broken on recent PyTorch, TODO(#1639): fix it")
@common_utils.parametrize("alpha", [None, 0.5, 0.75])
@common_utils.parametrize("quant_mode", ["static", "dynamic"])
@common_utils.parametrize(
"device", ["cpu"] + (["cuda"] if torch.cuda.is_available() else [])
)
@common_utils.parametrize("input_dtype", [torch.float, torch.bfloat16, torch.half])
def test_save_load_recipe(self, alpha, quant_mode, device, input_dtype):
"""Test save/load recipe functionality."""
def test_two_step_quantization(self, alpha, quant_mode, device, input_dtype):
"""Test two-step quantization process (PREPARE -> CONVERT)."""
dataset_size = 20
layer_dims = (512, 256, 128) # Input, hidden, output dimensions
n_calib_examples = 10
sequence_length = 5

# Create two identical models for comparison
m = ToyLinearModel(*layer_dims).eval().to(input_dtype).to(device)
m_save_load = deepcopy(m)
m1 =
(*layer_dims).eval().to(input_dtype).to(device)
m2 = deepcopy(m1)

# Generate calibration dataset
dataset = m.example_inputs(
dataset = m1.example_inputs(
dataset_size,
sequence_length=sequence_length,
dtype=input_dtype,
device=device,
)
calibration_data = dataset[:n_calib_examples]

# Step 1: Setup first quantized model with observer insertion and calibration
insert_smooth_quant_observer_(m, alpha, quant_mode)
# Step 1: PREPARE - Insert observers
config = SmoothQuantConfig(step="prepare", alpha=alpha, quant_mode=quant_mode)
quantize_(m2, config)

# Perform calibration with calibration data
# Step 2: Calibration
for data in calibration_data:
m(data)
m2(data)

# Apply quantization configuration
is_observed_linear = lambda m, fqn: isinstance(m, SmoothQuantObservedLinear)
quantize_(m, SmoothQuantConfig(), is_observed_linear)
quantize_(m2, SmoothQuantConfig(), is_observed_linear)

# Apply compilation if supported
m = torch.compile(m, fullgraph=True)
Expand Down Expand Up @@ -239,26 +252,23 @@ def test_save_load_recipe(self, alpha, quant_mode, device, input_dtype):
original_output = m(input_tensor)
save_load_output = m_save_load(input_tensor)

original_outputs.append(original_output)
save_load_outputs.append(save_load_output)
for data in dataset:
# Remove batch dimension for model input
input_tensor = data.squeeze(0)
m2_output = m2(input_tensor)
m2_outputs.append(m2_output)

# Concatenate all outputs for comparison
original_result = torch.cat(original_outputs)
save_load_out = torch.cat(save_load_outputs)
# Concatenate all outputs
m2_result = torch.cat(m2_outputs)

self.assertIsNotNone(
original_result, "Original model output should not be None"
)
self.assertIsNotNone(
save_load_out, "Save/load model output should not be None"
)
self.assertIsNotNone(m2_result, "Quantized model output should not be None")

torch.testing.assert_close(
original_result,
save_load_out,
msg=f"Save/load recipe should produce identical results for "
f"alpha={alpha}, quant_mode={quant_mode}, device={device}, dtype={input_dtype}",
)
# Check that model produces reasonable outputs
self.assertFalse(
torch.isnan(m2_result).any(),
f"Quantized model should not produce NaN values for "
f"alpha={alpha}, quant_mode={quant_mode}, device={device}, dtype={input_dtype}",
)


common_utils.instantiate_parametrized_tests(TestSmoothQuant)
Expand Down
10 changes: 5 additions & 5 deletions torchao/prototype/smoothquant/README.md
Original file line number Diff line number Diff line change
@@ -1,26 +1,26 @@
# SmothQuant quantization
# SmoothQuant quantization
This is a native PyTorch implementation of the algorithm described in [this paper](https://arxiv.org/abs/2211.10438).

In this implementation, weights are smoothed (equalized) and quantized to int8 during quantization, while activations are smoothed and quantized to int8 at runtime. Activation quantization is done either dynamically or statically. With dynamic quantization, the activation qparams (i.e., scales) are found at runtime and activations are quantized per token. With static quantization, the qparams are found during quantization and activations are quantized per tensor. Generally, dynamic quantization produces better accuracy while static quantization has better latency. In both cases, weights and activations are symmetrically quantized.

## Quick start
Run the example code with
```bash
python example.py -m MODLE_ID --device=<cuda or cpu> --quant-mode=<dynamic or static>
python example.py -m MODEL_ID --device=<cuda or cpu> --quant-mode=<dynamic or static>
# An example
python example.py -m meta-llama/Llama-2-7b-hf --device=cuda --quant-mode=dynamic
```
To use `torch.compile` for a speedup, add `--compile`. You may also want to export `TORCHINDUCTOR_FREEZING=1` for even better performance.
```bash
TORCHINDUCTOR_FREEZING=1 python example.py -m MODLE_ID --device=<cuda or cpu> --quant-mode=<dynamic or static> --compile
TORCHINDUCTOR_FREEZING=1 python example.py -m MODEL_ID --device=<cuda or cpu> --quant-mode=<dynamic or static> --compile
```
To save a quantized model for reuse, specify `--model-save-path`
```bash
python example.py -m MODLE_ID --device=<cuda or cpu> --quant-mode=<dynamic or static> --model-save-path ./quantized_model.pt
python example.py -m MODEL_ID --device=<cuda or cpu> --quant-mode=<dynamic or static> --model-save-path ./quantized_model.pt
```
Then load it with `--model-load-path`:
```bash
python example.py -m MODLE_ID --device=<cuda or cpu> --quant-mode=<dynamic or static> --model-load-path ./quantized_model.pt
python example.py -m MODEL_ID --device=<cuda or cpu> --quant-mode=<dynamic or static> --model-load-path ./quantized_model.pt
```


Expand Down
16 changes: 7 additions & 9 deletions torchao/prototype/smoothquant/__init__.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,13 @@
from .api import (
SmoothQuantConfig,
insert_smooth_quant_observer_,
load_smooth_quant_recipe,
save_smooth_quant_recipe,
from .api import SmoothQuantConfig
from .core import (
SmoothQuantObservedLinear,
SmoothQuantObserver,
SmoothQuantStep,
)
from .core import SmoothQuantObservedLinear

__all__ = [
"insert_smooth_quant_observer_",
"load_smooth_quant_recipe",
"save_smooth_quant_recipe",
"SmoothQuantConfig",
"SmoothQuantStep",
"SmoothQuantObserver",
"SmoothQuantObservedLinear",
]
Loading
Loading