
Commit a4b7de0

Bump torchao pin and use v2 torchao tensors (#14171)
This PR bumps the torchao pin in ExecuTorch and adjusts the code to rely less on deprecated features. In particular:

* The torchao/experimental folder is being deprecated, so the embedding / tied embedding quantizers are switched to their new home.
* v1 tensors based on AffineQuantizedTensor + QDQLayout are being deprecated, so ExecuTorch is switched to v2 tensors. See pytorch/ao#2967.
1 parent c638851 · commit a4b7de0
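In practice, the v1 → v2 migration applied across the call sites below has this shape. A minimal sketch, not the exact diff; the placeholder module and group size are illustrative:

```python
import torch
import torch.nn as nn

from torchao.quantization import Int8DynamicActivationIntxWeightConfig, quantize_
from torchao.quantization.granularity import PerGroup

model = nn.Sequential(nn.Linear(64, 32))  # placeholder module
group_size = 32

# v1 (deprecated): quantize_(model, int8_dynamic_activation_int4_weight(group_size=group_size))
# v2: an explicit config naming the weight dtype and granularity.
quantize_(
    model,
    Int8DynamicActivationIntxWeightConfig(
        weight_dtype=torch.int4,
        weight_granularity=PerGroup(group_size),
    ),
)
```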

File tree

5 files changed (+36, −14 lines):

* backends/apple/coreml/test/test_coreml_recipes.py
* backends/vulkan/test/test_vulkan_delegate.py
* backends/xnnpack/test/ops/test_linear.py
* examples/models/llama/source_transformation/quantize.py
* third-party/ao
backends/apple/coreml/test/test_coreml_recipes.py

Lines changed: 5 additions & 2 deletions
```diff
@@ -3,6 +3,7 @@
 # Please refer to the license found in the LICENSE file in the root directory of the source tree.
 
 
+import copy
 import unittest
 
 import coremltools as ct
@@ -152,8 +153,9 @@ def forward(self, x):
         # Test with different group sizes
         for group_size in [8, 16, 32]:
             with self.subTest(group_size=group_size):
+                model_to_export = copy.deepcopy(model)
                 session = export(
-                    model=model,
+                    model=model_to_export,
                     example_inputs=example_inputs,
                     export_recipe=ExportRecipe.get_recipe(
                         CoreMLRecipeType.TORCHAO_INT4_WEIGHT_ONLY_PER_GROUP,
@@ -219,8 +221,9 @@ def forward(self, x):
         # Test with different group sizes
         for group_size in [16, 32, 64]:
             with self.subTest(group_size=group_size):
+                model_to_export = copy.deepcopy(model)
                 session = export(
-                    model=model,
+                    model=model_to_export,
                     example_inputs=example_inputs,
                     export_recipe=ExportRecipe.get_recipe(
                         CoreMLRecipeType.TORCHAO_INT8_WEIGHT_ONLY_PER_GROUP,
```
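A note on the new `copy.deepcopy` calls: the presumed rationale is that these recipes quantize the module in place during export, so exporting the same instance for several group sizes would re-quantize an already-quantized model. A sketch of the pattern, with a stand-in for the test's model:

```python
import copy

import torch.nn as nn

model = nn.Sequential(nn.Linear(64, 32))  # stand-in for the test model

for group_size in [8, 16, 32]:
    # Presumed rationale: export/quantization mutates the module in
    # place, so each subtest gets its own pristine copy.
    model_to_export = copy.deepcopy(model)
```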

backends/vulkan/test/test_vulkan_delegate.py

Lines changed: 5 additions & 2 deletions
```diff
@@ -2680,14 +2680,17 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
     def apply_8da4w_quantization(self):
         """Apply TorchAO 8da4w quantization (int8 dynamic activation + int4 weight)."""
         from torchao.quantization import (
-            int8_dynamic_activation_int4_weight,
+            Int8DynamicActivationIntxWeightConfig,
             quantize_,
         )
+        from torchao.quantization.granularity import PerGroup
         from torchao.utils import unwrap_tensor_subclass
 
         quantize_(
             self,
-            int8_dynamic_activation_int4_weight(group_size=self.group_size),
+            Int8DynamicActivationIntxWeightConfig(
+                weight_dtype=torch.int4, granularity=PerGroup(self.group_size)
+            ),
         )
         unwrap_tensor_subclass(self)
         return self
```
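The "8da4w" scheme named in the docstring is int8 dynamic activation with int4 weights, quantized groupwise. A minimal standalone sketch of the new-style call, mirroring the kwargs in the diff above (the `nn.Linear` and group size are placeholders; the group size must divide the layer's `in_features`):

```python
import torch
import torch.nn as nn

from torchao.quantization import Int8DynamicActivationIntxWeightConfig, quantize_
from torchao.quantization.granularity import PerGroup
from torchao.utils import unwrap_tensor_subclass

module = nn.Sequential(nn.Linear(128, 64))  # placeholder; 32 divides 128
quantize_(
    module,
    Int8DynamicActivationIntxWeightConfig(
        weight_dtype=torch.int4, granularity=PerGroup(32)
    ),
)
unwrap_tensor_subclass(module)  # flatten tensor subclasses, as the test does before lowering
```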

backends/xnnpack/test/ops/test_linear.py

Lines changed: 8 additions & 2 deletions
```diff
@@ -34,8 +34,9 @@
 from torch.export.graph_signature import ExportGraphSignature, InputKind
 
 try:
+    from torchao.quantization.granularity import PerGroup
     from torchao.quantization.quant_api import (
-        int8_dynamic_activation_int4_weight,
+        Int8DynamicActivationIntxWeightConfig,
         quantize_,
     )
     from torchao.utils import unwrap_tensor_subclass
@@ -391,7 +392,12 @@ def _test_groupwise_dq_linear(
         """
         Helper function to test groupwise dynamic quantized linear op with different configurations.
         """
-        quantize_(mod, int8_dynamic_activation_int4_weight(group_size=group_size))
+        quantize_(
+            mod,
+            Int8DynamicActivationIntxWeightConfig(
+                weight_dtype=torch.int4, weight_granularity=PerGroup(group_size)
+            ),
+        )
         unwrap_tensor_subclass(mod)
         DynamicallyQuantizedPartitioner = XnnpackPartitioner(
             config_precisions=ConfigPrecisionType.DYNAMIC_QUANT,
```
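Note that the new imports stay inside the file's existing `try:` block, so environments without torchao keep skipping these tests instead of failing at import time. A sketch of that guard; the `torchao_installed` flag name is hypothetical, not from this file:

```python
try:
    from torchao.quantization.granularity import PerGroup
    from torchao.quantization.quant_api import (
        Int8DynamicActivationIntxWeightConfig,
        quantize_,
    )
    from torchao.utils import unwrap_tensor_subclass

    torchao_installed = True  # hypothetical flag consulted by the tests
except ImportError:
    torchao_installed = False
```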

examples/models/llama/source_transformation/quantize.py

Lines changed: 17 additions & 7 deletions
```diff
@@ -116,7 +116,6 @@ def quantize(  # noqa C901
         assert len(matches) == 1, f"Expected 1 match for pattern but got {len(matches)}"
         bitwidth = int(matches[0][0])
 
-        from torchao.dtypes import PackedLinearInt8DynamicActivationIntxWeightLayout
         from torchao.quantization.granularity import PerAxis, PerGroup
         from torchao.quantization.quant_api import (
             Int8DynamicActivationIntxWeightConfig,
@@ -136,7 +135,7 @@ def quantize(  # noqa C901
                     PerAxis(0) if group_size == 0 else PerGroup(group_size)
                 ),
                 weight_mapping_type=MappingType.SYMMETRIC,
-                layout=PackedLinearInt8DynamicActivationIntxWeightLayout(),
+                intx_packing_format="opaque_torchao_auto",
             ),
         )
         model = unwrap_tensor_subclass(model)
@@ -148,10 +147,21 @@ def quantize(  # noqa C901
         # TODO: Default value for group size for 8da4w. Need this here for refactor, will clean this up.
         group_size = 128
 
-        from torchao.quantization import int8_dynamic_activation_int4_weight, quantize_
+        from torchao.quantization import (
+            Int8DynamicActivationIntxWeightConfig,
+            quantize_,
+        )
+        from torchao.quantization.granularity import PerGroup
         from torchao.utils import unwrap_tensor_subclass
 
-        quantize_(model, int8_dynamic_activation_int4_weight(group_size=group_size))
+        quantize_(
+            model,
+            Int8DynamicActivationIntxWeightConfig(
+                weight_dtype=torch.int4,
+                weight_granularity=PerGroup(group_size),
+            ),
+        )
+
         model = unwrap_tensor_subclass(model)
 
         # TODO: deal with checkpoint / computation dtype decoupling.
@@ -744,9 +754,9 @@ def get_quant_embedding_transform(
     dtype_override: Optional[DType] = None,
 ):
     if embedding_quantize.startswith("torchao:"):
-        from torchao.experimental.quant_api import (
+        from torchao.prototype.quantization.embedding.api import (
             EmbeddingQuantizer,
-            SharedEmbeddingQuantizer,
+            TiedEmbeddingQuantizer,
         )
         from torchao.quantization.granularity import PerAxis, PerGroup
         from torchao.quantization.quant_api import MappingType
@@ -780,7 +790,7 @@ def _torchao_embedding_quantizer(model):
             use_fallback=False,
         ).quantize(model)
     else:
-        SharedEmbeddingQuantizer(
+        TiedEmbeddingQuantizer(
             weight_dtype=weight_dtype,
             granularity=granularity,
             mapping_type=mapping_type,
```
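Two migrations happen in this file: the `layout=` object is replaced by the string-valued `intx_packing_format="opaque_torchao_auto"`, and the embedding quantizers move from `torchao.experimental` to their new prototype home. A hedged sketch of the new import path; the constructor kwargs follow the call sites above, while the toy model and `PerGroup(32)` are placeholders:

```python
import torch
import torch.nn as nn

from torchao.prototype.quantization.embedding.api import EmbeddingQuantizer
from torchao.quantization.granularity import PerGroup
from torchao.quantization.quant_api import MappingType

model = nn.Sequential(nn.Embedding(1000, 64))  # placeholder model

# Quantize embedding weights to int4 with groupwise symmetric scales.
EmbeddingQuantizer(
    weight_dtype=torch.int4,
    granularity=PerGroup(32),
    mapping_type=MappingType.SYMMETRIC,
    use_fallback=False,
).quantize(model)
```

Per the else branch above, `TiedEmbeddingQuantizer` (the renamed `SharedEmbeddingQuantizer`) takes the same `weight_dtype` / `granularity` / `mapping_type` kwargs and covers models whose input and output embeddings share weights.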

third-party/ao

Submodule ao updated 103 files
