
Commit 57e29ee: Update Int8DynActOnlyConfig
Parent: b48499c

4 files changed (+37, -12 lines)

test/prototype/test_parq.py

Lines changed: 1 addition & 12 deletions
@@ -39,11 +39,7 @@
     quantize_,
 )
 from torchao.quantization.quant_primitives import MappingType
-from torchao.utils import (
-    TORCH_VERSION_AT_LEAST_2_4,
-    TORCH_VERSION_AT_LEAST_2_6,
-    check_cpu_version,
-)
+from torchao.utils import check_cpu_version

 _DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

@@ -208,7 +204,6 @@ class TestUnifTorchaoQuantizer(common_utils.TestCase):
     def setUp(self):
         torch.manual_seed(123)

-    @unittest.skipIf(not TORCH_VERSION_AT_LEAST_2_4, "Test only enabled for 2.4+")
     @common_utils.parametrize("group_size", [32, 256])
     def test_int4_weight_only(self, group_size: int = 32):
         model = M(m=512, n=512).to(_DEVICE, dtype=torch.bfloat16)
@@ -225,7 +220,6 @@ def test_int4_weight_only(self, group_size: int = 32):
             model, m_ref, Int4UnifTorchaoQuantizer(), b, group_size
         )

-    @unittest.skipIf(not TORCH_VERSION_AT_LEAST_2_6, "Test only enabled for 2.6+")
     @common_utils.parametrize("b", [2, 3, 4, 8])
     @common_utils.parametrize("group_size", [32, 512])
     def test_intx_weight_only(self, b: int = 2, group_size: int = 32):
@@ -243,7 +237,6 @@ def test_intx_weight_only(self, b: int = 2, group_size: int = 32):
         quantizer = UnifTorchaoQuantizer()
         compare_quantized_models(model, m_ref, quantizer, b, group_size)

-    @unittest.skipIf(not TORCH_VERSION_AT_LEAST_2_4, "Test only enabled for 2.4+")
     @unittest.skipIf(_DEVICE == "cpu", "Need GPU available")
     def test_int4_weight_only_e2e(self, group_size: int = 32):
         model = M(m=512, n=512).to(torch.bfloat16).to(_DEVICE)
@@ -265,7 +258,6 @@
         )
         compare_parq_convert(model, m_ref, optimizer, config)

-    @unittest.skipIf(not TORCH_VERSION_AT_LEAST_2_6, "Test only enabled for 2.6+")
     @unittest.skipIf(_DEVICE == "cpu", "Need GPU available")
     @common_utils.parametrize("b", [2, 3, 4, 8])
     def test_intx_weight_only_e2e(self, b: int = 2, group_size: int = 32):
@@ -315,7 +307,6 @@ def test_intx_weight_only_parq_equivalent(self, b: int = 2, group_size: int = 32
         torch.testing.assert_close(q, q_ref, atol=0, rtol=0)
         torch.testing.assert_close(Q, Q_ref, atol=0, rtol=0)

-    @unittest.skipIf(not TORCH_VERSION_AT_LEAST_2_6, "Test only enabled for 2.6+")
     @common_utils.parametrize("b", [2, 3])
     @common_utils.parametrize("group_size", [32, 512])
     def test_intx_weight_only(self, b: int = 2, group_size: int = 32):
@@ -337,7 +328,6 @@ def test_intx_weight_only(self, b: int = 2, group_size: int = 32):

         compare_quantized_models(model, m_ref, quantizer, b, group_size)

-    @unittest.skipIf(not TORCH_VERSION_AT_LEAST_2_6, "Test only enabled for 2.6+")
     @unittest.skipIf(_DEVICE == "cpu", "Need GPU available")
     @common_utils.parametrize("b", [2, 3])
     def test_intx_weight_only_e2e(self, b: int = 2, group_size: int = 32):
@@ -369,7 +359,6 @@ class TestInt8DynamicActivationTorchaoQuantizer(common_utils.TestCase):
     def setUp(self):
         torch.manual_seed(123)

-    @unittest.skipIf(not TORCH_VERSION_AT_LEAST_2_6, "Test only enabled for 2.6+")
     @common_utils.parametrize("b", [2, 3, 4, 8])
     @common_utils.parametrize("model_dtype", [torch.float16, torch.float32])
     @common_utils.parametrize("group_size", [32, 128])

torchao/quantization/__init__.py

Lines changed: 2 additions & 0 deletions
@@ -54,6 +54,7 @@
     GemliteUIntXWeightOnlyConfig,
     Int4DynamicActivationInt4WeightConfig,
     Int4WeightOnlyConfig,
+    Int8DynActOnlyConfig,
     Int8DynamicActivationInt4WeightConfig,
     Int8DynamicActivationInt8WeightConfig,
     Int8DynamicActivationIntxWeightConfig,
@@ -144,6 +145,7 @@
     "Int8DynamicActivationIntxWeightConfig",
     "Int4WeightOnlyConfig",
     "Float8DynamicActivationInt4WeightConfig",
+    "Int8DynActOnlyConfig",
     "Int8WeightOnlyConfig",
     "Float8WeightOnlyConfig",
     "Float8DynamicActivationFloat8WeightConfig",

torchao/quantization/qat/fake_quantize_config.py

Lines changed: 8 additions & 0 deletions
@@ -292,6 +292,7 @@ def _infer_fake_quantize_configs(
     # avoid circular imports
     from torchao.quantization import (
         Int4WeightOnlyConfig,
+        Int8DynActOnlyConfig,
         Int8DynamicActivationInt4WeightConfig,
     )

@@ -315,5 +316,12 @@ def _infer_fake_quantize_configs(
             zero_point_domain=base_config.zero_point_domain,
         )
         return (None, weight_config)
+    elif isinstance(base_config, Int8DynActOnlyConfig):
+        act_config = IntxFakeQuantizeConfig(
+            dtype=torch.int8,
+            granularity="per_token",
+            is_symmetric=base_config.is_symmetric,
+        )
+        return (act_config, None)
     else:
         raise ValueError("Unexpected base config: %s" % base_config)
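
For QAT flows, the new branch maps an Int8DynActOnlyConfig base config to an
activation-only fake-quantize pair: an int8 per-token config for activations
and None for weights. A minimal sketch of that equivalence, assuming
IntxFakeQuantizeConfig is importable from torchao.quantization.qat:

import torch
from torchao.quantization import Int8DynActOnlyConfig
from torchao.quantization.qat import IntxFakeQuantizeConfig

base = Int8DynActOnlyConfig(is_symmetric=True)

# _infer_fake_quantize_configs returns the equivalent of this pair:
act_config = IntxFakeQuantizeConfig(
    dtype=torch.int8,
    granularity="per_token",
    is_symmetric=base.is_symmetric,
)
weight_config = None  # weights are not fake-quantized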

torchao/quantization/quant_api.py

Lines changed: 26 additions & 0 deletions
@@ -148,6 +148,7 @@
     "gemlite_uintx_weight_only",
     "float8_dynamic_activation_float8_weight",
     "float8_static_activation_float8_weight",
+    "Int8DynActOnlyConfig",
     "Int8DynActInt4WeightQuantizer",
     "Float8DynamicActivationFloat8SemiSparseWeightConfig",
     "ModuleFqnToConfig",
@@ -1312,6 +1313,31 @@ def _float8_cutlass_quant_sparse(
     )


+@dataclass
+class Int8DynActOnlyConfig(AOBaseConfig):
+    """
+    Configuration for applying int8 dynamic per-token activation quantization to linear layers; weights are left unquantized.
+    Args:
+        is_symmetric: bool = False - Whether to use symmetric quantization for activations; asymmetric when False.
+    """
+
+    is_symmetric: bool = False
+
+
+@register_quantize_module_handler(Int8DynActOnlyConfig)
+def _int8_dynamic_activation_transform(
+    module: torch.nn.Module, config: Int8DynActOnlyConfig
+) -> torch.nn.Module:
+    weight = module.weight
+    if config.is_symmetric:
+        input_quant_func = _int8_symm_per_token_reduced_range_quant
+    else:
+        input_quant_func = _int8_asymm_per_token_quant
+    weight = to_linear_activation_quantized(weight, input_quant_func)
+    module.weight = torch.nn.Parameter(weight, requires_grad=False)
+    return module
+
+
 @dataclass
 class Int8DynamicActivationInt8WeightConfig(AOBaseConfig):
     """
