diff --git a/backends/apple/coreml/test/test_coreml_recipes.py b/backends/apple/coreml/test/test_coreml_recipes.py
index 313e24922d6..303d8cb78ed 100644
--- a/backends/apple/coreml/test/test_coreml_recipes.py
+++ b/backends/apple/coreml/test/test_coreml_recipes.py
@@ -3,6 +3,7 @@
 # Please refer to the license found in the LICENSE file in the root directory of the source tree.
 
 
+import copy
 import unittest
 
 import coremltools as ct
@@ -152,8 +153,9 @@ def forward(self, x):
         # Test with different group sizes
         for group_size in [8, 16, 32]:
             with self.subTest(group_size=group_size):
+                model_to_export = copy.deepcopy(model)
                 session = export(
-                    model=model,
+                    model=model_to_export,
                     example_inputs=example_inputs,
                     export_recipe=ExportRecipe.get_recipe(
                         CoreMLRecipeType.TORCHAO_INT4_WEIGHT_ONLY_PER_GROUP,
@@ -219,8 +221,9 @@ def forward(self, x):
         # Test with different group sizes
         for group_size in [16, 32, 64]:
             with self.subTest(group_size=group_size):
+                model_to_export = copy.deepcopy(model)
                 session = export(
-                    model=model,
+                    model=model_to_export,
                     example_inputs=example_inputs,
                     export_recipe=ExportRecipe.get_recipe(
                         CoreMLRecipeType.TORCHAO_INT8_WEIGHT_ONLY_PER_GROUP,
diff --git a/backends/vulkan/test/test_vulkan_delegate.py b/backends/vulkan/test/test_vulkan_delegate.py
index 01547d7140d..f8194f0b32c 100644
--- a/backends/vulkan/test/test_vulkan_delegate.py
+++ b/backends/vulkan/test/test_vulkan_delegate.py
@@ -2680,14 +2680,17 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
            def apply_8da4w_quantization(self):
                """Apply TorchAO 8da4w quantization (int8 dynamic activation + int4 weight)."""
                from torchao.quantization import (
-                   int8_dynamic_activation_int4_weight,
+                   Int8DynamicActivationIntxWeightConfig,
                    quantize_,
                )
+               from torchao.quantization.granularity import PerGroup
                from torchao.utils import unwrap_tensor_subclass
 
                quantize_(
                    self,
-                   int8_dynamic_activation_int4_weight(group_size=self.group_size),
+                   Int8DynamicActivationIntxWeightConfig(
+                       weight_dtype=torch.int4, weight_granularity=PerGroup(self.group_size)
+                   ),
                )
                unwrap_tensor_subclass(self)
                return self
diff --git a/backends/xnnpack/test/ops/test_linear.py b/backends/xnnpack/test/ops/test_linear.py
index 421e59c0b08..ac6fec25732 100644
--- a/backends/xnnpack/test/ops/test_linear.py
+++ b/backends/xnnpack/test/ops/test_linear.py
@@ -34,8 +34,9 @@
 from torch.export.graph_signature import ExportGraphSignature, InputKind
 
 try:
+    from torchao.quantization.granularity import PerGroup
     from torchao.quantization.quant_api import (
-        int8_dynamic_activation_int4_weight,
+        Int8DynamicActivationIntxWeightConfig,
         quantize_,
     )
     from torchao.utils import unwrap_tensor_subclass
@@ -391,7 +392,12 @@ def _test_groupwise_dq_linear(
        """
        Helper function to test groupwise dynamic quantized linear op with different configurations.
        """
-       quantize_(mod, int8_dynamic_activation_int4_weight(group_size=group_size))
+       quantize_(
+           mod,
+           Int8DynamicActivationIntxWeightConfig(
+               weight_dtype=torch.int4, weight_granularity=PerGroup(group_size)
+           ),
+       )
        unwrap_tensor_subclass(mod)
        DynamicallyQuantizedPartitioner = XnnpackPartitioner(
            config_precisions=ConfigPrecisionType.DYNAMIC_QUANT,
diff --git a/examples/models/llama/source_transformation/quantize.py b/examples/models/llama/source_transformation/quantize.py
index 9f2210b5c64..835972b7f3e 100644
--- a/examples/models/llama/source_transformation/quantize.py
+++ b/examples/models/llama/source_transformation/quantize.py
@@ -116,7 +116,6 @@ def quantize(  # noqa C901
        assert len(matches) == 1, f"Expected 1 match for pattern but got {len(matches)}"
        bitwidth = int(matches[0][0])
 
-       from torchao.dtypes import PackedLinearInt8DynamicActivationIntxWeightLayout
        from torchao.quantization.granularity import PerAxis, PerGroup
        from torchao.quantization.quant_api import (
            Int8DynamicActivationIntxWeightConfig,
@@ -136,7 +135,7 @@ def quantize(  # noqa C901
                    PerAxis(0) if group_size == 0 else PerGroup(group_size)
                ),
                weight_mapping_type=MappingType.SYMMETRIC,
-               layout=PackedLinearInt8DynamicActivationIntxWeightLayout(),
+               intx_packing_format="opaque_torchao_auto",
            ),
        )
        model = unwrap_tensor_subclass(model)
@@ -148,10 +147,21 @@ def quantize(  # noqa C901
        # TODO: Default value for group size for 8da4w. Need this here for refactor, will clean this up.
        group_size = 128
 
-       from torchao.quantization import int8_dynamic_activation_int4_weight, quantize_
+       from torchao.quantization import (
+           Int8DynamicActivationIntxWeightConfig,
+           quantize_,
+       )
+       from torchao.quantization.granularity import PerGroup
        from torchao.utils import unwrap_tensor_subclass
 
-       quantize_(model, int8_dynamic_activation_int4_weight(group_size=group_size))
+       quantize_(
+           model,
+           Int8DynamicActivationIntxWeightConfig(
+               weight_dtype=torch.int4,
+               weight_granularity=PerGroup(group_size),
+           ),
+       )
+       model = unwrap_tensor_subclass(model)
 
        # TODO: deal with checkpoint / computation dtype decoupling.
 
@@ -744,9 +754,9 @@ def get_quant_embedding_transform(
    dtype_override: Optional[DType] = None,
 ):
    if embedding_quantize.startswith("torchao:"):
-       from torchao.experimental.quant_api import (
+       from torchao.prototype.quantization.embedding.api import (
            EmbeddingQuantizer,
-           SharedEmbeddingQuantizer,
+           TiedEmbeddingQuantizer,
        )
        from torchao.quantization.granularity import PerAxis, PerGroup
        from torchao.quantization.quant_api import MappingType
@@ -780,7 +790,7 @@ def _torchao_embedding_quantizer(model):
                use_fallback=False,
            ).quantize(model)
        else:
-           TiedEmbeddingQuantizer(
+           TiedEmbeddingQuantizer(
                weight_dtype=weight_dtype,
                granularity=granularity,
                mapping_type=mapping_type,
diff --git a/third-party/ao b/third-party/ao
index f1acc1e2ade..b99904b34c0 160000
--- a/third-party/ao
+++ b/third-party/ao
@@ -1 +1 @@
-Subproject commit f1acc1e2ade01fef0129a3cee62b3d8e14e22602
+Subproject commit b99904b34c0fd98f8a63ec57cbc1dc4993f74793