Commit 7e76ba2

Fix torchao quantization for deployment targets under iOS18 (#13896)
_construct_constexpr_dequant_op should use axis=None (the default) rather than -1, so that the axis is inferred from the shape of scales. On iOS18 this was not an issue because the axis is unused; on iOS16 the axis is used, and the hard-coded -1 typically caused failures during export.
1 parent accc65e commit 7e76ba2
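
As an illustration of that inference, here is a minimal sketch assuming per-axis (channelwise) scales; the helper name is hypothetical, and the real logic inside coremltools' _construct_constexpr_dequant_op may differ:

import numpy as np

def infer_quantization_axis(weight_shape, scales_shape):
    # Hypothetical sketch: with per-axis quantization, scales has size 1 on
    # every dimension except the quantization axis, where its size matches
    # the weight's. If every dimension is 1, quantization is per-tensor.
    for axis, (w, s) in enumerate(zip(weight_shape, scales_shape)):
        if s != 1 and s == w:
            return axis
    return None  # per-tensor: a single scale

# PerAxis(0)-quantized weights, as in the new test below:
weight = np.zeros((128, 64), dtype=np.int8)
scales = np.ones((128, 1), dtype=np.float32)
assert infer_quantization_axis(weight.shape, scales.shape) == 0  # not -1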

File tree: 2 files changed (+30, -3 lines)

backends/apple/coreml/compiler/torch_ops.py

Lines changed: 0 additions & 1 deletion
@@ -175,7 +175,6 @@ def dequantize_affine(context, node):
         int_data.astype(quantized_np_dtype),
         zero_point,
         scale,
-        axis=-1,
         name=node.name,
     )
     context.add(output, node.name)
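
This is also why the hard-coded axis failed for iOS16: a PerAxis(0)-quantized [out_channels, in_channels] weight carries scales of shape [out_channels, 1], which is inconsistent with axis=-1. A minimal sketch of the kind of shape check involved (illustrative only; coremltools' actual validation differs in detail):

import numpy as np

weight = np.zeros((128, 64), dtype=np.int8)   # [out_channels, in_channels]
scales = np.ones((128, 1), dtype=np.float32)  # one scale per output channel

def scales_match_axis(weight_shape, scales_shape, axis):
    # Per-axis scales must have size 1 on every dimension except `axis`.
    axis = axis % len(weight_shape)
    expected = [1] * len(weight_shape)
    expected[axis] = weight_shape[axis]
    return list(scales_shape) == expected

assert scales_match_axis(weight.shape, scales.shape, axis=0)       # inferred axis works
assert not scales_match_axis(weight.shape, scales.shape, axis=-1)  # the removed hard-coded value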

backends/apple/coreml/test/test_torch_ops.py

Lines changed: 30 additions & 2 deletions
@@ -27,9 +27,9 @@
 class TestTorchOps(unittest.TestCase):
     edge_compile_config = executorch.exir.EdgeCompileConfig()

-    def _coreml_partitioner(self):
+    def _coreml_partitioner(self, *, minimum_deployment_target=ct.target.iOS18):
         compile_specs = CoreMLBackend.generate_compile_specs(
-            minimum_deployment_target=ct.target.iOS18
+            minimum_deployment_target=minimum_deployment_target
         )
         return CoreMLPartitioner(compile_specs=compile_specs)

@@ -158,6 +158,33 @@ def test_dequantize_affine_c8w_embedding_b4w_linear(self):
         et_prog = delegated_program.to_executorch()
         self._compare_outputs(et_prog, model, example_inputs)

+    def test_dequantize_affine_c8w_embedding_c8w_linear_ios16(self):
+        model, example_inputs = self._get_test_model()
+        quantize_(
+            model,
+            IntxWeightOnlyConfig(weight_dtype=torch.int8, granularity=PerAxis(0)),
+            lambda m, fqn: isinstance(m, torch.nn.Embedding),
+        )
+        quantize_(
+            model,
+            IntxWeightOnlyConfig(weight_dtype=torch.int8, granularity=PerAxis(0)),
+        )
+        ep = torch.export.export(model, example_inputs)
+        delegated_program = executorch.exir.to_edge_transform_and_lower(
+            ep,
+            partitioner=[
+                self._coreml_partitioner(minimum_deployment_target=ct.target.iOS16)
+            ],
+        )
+        for node in delegated_program.exported_program().graph.nodes:
+            if node.op == "call_function":
+                assert node.target.__name__ in [
+                    "executorch_call_delegate",
+                    "getitem",
+                ], f"Got unexpected node target after delegation: {node.target.__name__}"
+        et_prog = delegated_program.to_executorch()
+        self._compare_outputs(et_prog, model, example_inputs)
+
     def test_dequantize_codebook_linear_per_grouped_col(self):
         model, example_inputs = self._get_test_model()
         quantize_(
@@ -298,6 +325,7 @@ def forward(self, x):
     test_runner.test_dequantize_affine_c4w_embedding()
     test_runner.test_dequantize_affine_c4w_linear()
     test_runner.test_dequantize_affine_c8w_embedding_b4w_linear()
+    test_runner.test_dequantize_affine_c8w_embedding_c8w_linear_ios16()
     test_runner.test_dequantize_codebook_linear_per_grouped_col()
     test_runner.test_dequantize_codebook_linear_per_grouped_row()
     test_runner.test_dequantize_codebook_embedding_per_grouped_col()
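
For reference, a minimal sketch of the torchao configuration exercised by the new test; the import paths are an assumption (the test file's own imports may differ):

import torch
from torchao.quantization.granularity import PerAxis
from torchao.quantization.quant_api import IntxWeightOnlyConfig, quantize_

# Quantize a linear layer channelwise to int8, as in the test above.
linear = torch.nn.Linear(64, 128, bias=False)
quantize_(linear, IntxWeightOnlyConfig(weight_dtype=torch.int8, granularity=PerAxis(0)))
# PerAxis(0) produces one scale per output channel, i.e. scales of shape
# [128, 1] for the [128, 64] weight, so the Core ML dequantize op must use
# axis 0 -- exactly the case the removed axis=-1 mishandled on iOS16.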
