Commit cd27b7e

Update on "Verify pte size at runtime"

Check that the file size is at least the size we expect from the extended header, i.e. header_offset + program_size + segment_data_size.

Differential Revision: [D81938296](https://our.internmc.facebook.com/intern/diff/D81938296/)

2 parents: f7476f0 + 00c742a
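The check reads naturally as a single comparison. Below is a minimal Python sketch of the idea, assuming a parsed extended header that exposes the three fields named above; the field names come from the commit message, while the actual check lives in ExecuTorch's C++ runtime and is not part of the visible diff on this page:

```python
import os
from dataclasses import dataclass


@dataclass
class ExtendedHeader:
    # Field names taken from the commit message; the real header is
    # parsed out of the .pte file by the ExecuTorch runtime.
    header_offset: int
    program_size: int
    segment_data_size: int


def verify_pte_size(path: str, header: ExtendedHeader) -> None:
    """Reject files smaller than what the extended header implies."""
    expected_min = (
        header.header_offset + header.program_size + header.segment_data_size
    )
    actual = os.path.getsize(path)
    if actual < expected_min:
        raise ValueError(
            f"Truncated .pte: need at least {expected_min} bytes, file has {actual}"
        )
```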

91 files changed: +2292 −522 lines changed


backends/apple/coreml/recipes/coreml_recipe_provider.py

Lines changed: 5 additions & 2 deletions
@@ -69,6 +69,7 @@ def create_recipe(
                 recipe_type, activation_dtype=torch.float32, **kwargs
             )
         elif recipe_type == CoreMLRecipeType.TORCHAO_INT4_WEIGHT_ONLY_PER_CHANNEL:
+            self._validate_and_set_deployment_target(kwargs, ct.target.iOS18, "torchao")
             return self._build_torchao_quantized_recipe(
                 recipe_type,
                 weight_dtype=torch.int4,
@@ -77,6 +78,7 @@ def create_recipe(
             )
         elif recipe_type == CoreMLRecipeType.TORCHAO_INT4_WEIGHT_ONLY_PER_GROUP:
             group_size = kwargs.pop("group_size", 32)
+            self._validate_and_set_deployment_target(kwargs, ct.target.iOS18, "torchao")
             return self._build_torchao_quantized_recipe(
                 recipe_type,
                 weight_dtype=torch.int4,
@@ -85,11 +87,14 @@ def create_recipe(
                 **kwargs,
             )
         elif recipe_type == CoreMLRecipeType.TORCHAO_INT8_WEIGHT_ONLY_PER_CHANNEL:
+            self._validate_and_set_deployment_target(kwargs, ct.target.iOS16, "torchao")
             return self._build_torchao_quantized_recipe(
                 recipe_type, weight_dtype=torch.int8, is_per_channel=True, **kwargs
             )
         elif recipe_type == CoreMLRecipeType.TORCHAO_INT8_WEIGHT_ONLY_PER_GROUP:
             group_size = kwargs.pop("group_size", 32)
+            # override minimum_deployment_target to ios18 for torchao (GH issue #13122)
+            self._validate_and_set_deployment_target(kwargs, ct.target.iOS18, "torchao")
             return self._build_torchao_quantized_recipe(
                 recipe_type,
                 weight_dtype=torch.int8,
@@ -312,8 +317,6 @@ def _build_torchao_quantized_recipe(
             ao_quantization_configs=[config],
         )
 
-        # override minimum_deployment_target to ios18 for torchao (GH issue #13122)
-        self._validate_and_set_deployment_target(kwargs, ct.target.iOS18, "torchao")
         lowering_recipe = self._get_coreml_lowering_recipe(**kwargs)
 
         return ExportRecipe(
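This moves the deployment-target override out of the shared `_build_torchao_quantized_recipe` helper and into each `create_recipe` branch, so the int8 per-channel recipe can keep a lower iOS16 floor while the int4 and per-group recipes still require iOS18. A hedged usage sketch of the resulting behavior; the provider class name, import paths, and the `minimum_deployment_target` kwarg below are assumptions inferred from the file paths and test names in this commit, not shown in the diff:

```python
import coremltools as ct

# Assumed import paths, inferred from the file layout in this commit.
from executorch.backends.apple.coreml.recipes.coreml_recipe_provider import (
    CoreMLRecipeProvider,
)
from executorch.backends.apple.coreml.recipes import CoreMLRecipeType

provider = CoreMLRecipeProvider()

# After this change, iOS16 should pass validation for the int8
# per-channel weight-only recipe...
recipe = provider.create_recipe(
    CoreMLRecipeType.TORCHAO_INT8_WEIGHT_ONLY_PER_CHANNEL,
    minimum_deployment_target=ct.target.iOS16,
)

# ...while int4 recipes are still validated against iOS18 by
# _validate_and_set_deployment_target.
recipe = provider.create_recipe(
    CoreMLRecipeType.TORCHAO_INT4_WEIGHT_ONLY_PER_CHANNEL,
    minimum_deployment_target=ct.target.iOS18,
)
```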

backends/apple/coreml/test/test_coreml_quantizer.py

Lines changed: 2 additions & 4 deletions
@@ -15,7 +15,7 @@
 )
 
 from executorch.backends.apple.coreml.quantizer import CoreMLQuantizer
-from torch.export import export_for_training
+from torch.export import export
 from torchao.quantization.pt2e.quantize_pt2e import (
     convert_pt2e,
     prepare_pt2e,
@@ -32,9 +32,7 @@ def quantize_and_compare(
     ) -> None:
         assert quantization_type in {"PTQ", "QAT"}
 
-        pre_autograd_aten_dialect = export_for_training(
-            model, example_inputs, strict=True
-        ).module()
+        pre_autograd_aten_dialect = export(model, example_inputs, strict=True).module()
 
         quantization_config = LinearQuantizerConfig.from_dict(
             {
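This is one of several files in this commit that retire `torch.export.export_for_training` in favor of `torch.export.export`; both return an `ExportedProgram` whose `.module()` yields the pre-autograd graph consumed by the torchao PT2E flow. A minimal sketch of the updated pattern, with a toy model standing in for the test models (the quantizer step is left as comments because it is backend-specific):

```python
import torch
from torch.export import export


class TinyModel(torch.nn.Module):
    def forward(self, x):
        return torch.nn.functional.relu(x + 1.0)


example_inputs = (torch.randn(1, 4),)

# export(...).module() replaces export_for_training(...).module(); the
# result is the GraphModule that prepare_pt2e expects.
gm = export(TinyModel().eval(), example_inputs, strict=True).module()

# Downstream flow, unchanged by this commit (quantizer is backend-specific):
#   prepared = prepare_pt2e(gm, SomeQuantizer())
#   quantized = convert_pt2e(prepared)
print(type(gm))  # a torch.fx GraphModule
```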

backends/apple/coreml/test/test_coreml_recipes.py

Lines changed: 1 addition & 1 deletion
@@ -501,7 +501,7 @@ def test_minimum_deployment_target_validation(self):
             (CoreMLRecipeType.TORCHAO_INT4_WEIGHT_ONLY_PER_GROUP, ct.target.iOS18, {}),
             (
                 CoreMLRecipeType.TORCHAO_INT8_WEIGHT_ONLY_PER_CHANNEL,
-                ct.target.iOS18,
+                ct.target.iOS16,
                 {},
             ),
             (CoreMLRecipeType.TORCHAO_INT8_WEIGHT_ONLY_PER_GROUP, ct.target.iOS18, {}),

backends/apple/mps/test/test_mps_utils.py

Lines changed: 1 addition & 1 deletion
@@ -206,7 +206,7 @@ def lower_module_and_test_output(
 
         expected_output = model(*sample_inputs)
 
-        model = torch.export.export_for_training(
+        model = torch.export.export(
            model, sample_inputs, dynamic_shapes=dynamic_shapes, strict=True
        ).module()
 

backends/cadence/aot/ref_implementations.py

Lines changed: 19 additions & 3 deletions
@@ -458,10 +458,21 @@ def quantized_conv_nhwc_per_tensor(
     - out_shift (int): Unused
     """
 
-    if not input_tensor.is_contiguous(memory_format=torch.channels_last):
-        raise ValueError("Input tensor must be in NHWC format")
+    # Convert to NCHW format to reuse the existing implementation
+    conv_is_1d = False
+    if len(input_tensor.shape) == 3:
+        conv_is_1d = True
+        input_tensor = input_tensor.movedim(-1, 1).contiguous()
+        if len(weight.shape) != 3:
+            raise ValueError("Weight tensor must be 3D if input is 3D")
+        weight = weight.movedim(-1, 1).contiguous()
+    else:
+        input_tensor = input_tensor.movedim(-1, -3)
+        if len(weight.shape) != 4:
+            raise ValueError("Weight tensor must be 4D if input is nd > 3")
+        weight = torch.permute(weight, (0, -1, 1, 2)).contiguous()
 
-    return quantized_conv_per_tensor(
+    nchw_out = quantized_conv_per_tensor(
         input_tensor,
         weight,
         bias,
@@ -478,6 +489,11 @@ def quantized_conv_nhwc_per_tensor(
         out_multiplier,
         out_shift,
     )
+    if conv_is_1d:
+        return nchw_out.movedim(1, -1).contiguous()
+    else:
+        return nchw_out.movedim(-3, -1).contiguous()
+
 
 def quantized_conv_variant(
     layout: str,
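The layout round-trips above are plain `Tensor.movedim` calls. A quick shape check illustrates both the 1D (NLC to NCL) and 2D (NHWC to NCHW) conversions; the shapes here are chosen arbitrarily:

```python
import torch

# 2D conv case: NHWC -> NCHW and back.
x_nhwc = torch.randn(2, 8, 8, 3)             # (N, H, W, C)
x_nchw = x_nhwc.movedim(-1, -3)              # (N, C, H, W)
assert x_nchw.shape == (2, 3, 8, 8)
assert x_nchw.movedim(-3, -1).shape == x_nhwc.shape

# 1D conv case: NLC -> NCL and back.
y_nlc = torch.randn(2, 16, 4)                # (N, L, C)
y_ncl = y_nlc.movedim(-1, 1)                 # (N, C, L)
assert y_ncl.shape == (2, 4, 16)
assert y_ncl.movedim(1, -1).shape == y_nlc.shape
```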

backends/cadence/aot/tests/test_ref_implementations.py

Lines changed: 17 additions & 8 deletions
@@ -449,7 +449,7 @@ def test_quantized_layer_norm_per_tensor(
             ),  # expected_output: [1+2, 2+3, 3+4] / 0.5 = [6, 10, 14]
             memory_format,
         )
-        for memory_format in [torch.contiguous_format]
+        for memory_format in [torch.contiguous_format, torch.channels_last]
     ],
     # Test case 5: Multiple output channels
     *[
@@ -686,10 +686,13 @@ def test_quantized_conv_per_tensor(
     ) -> None:
         assert memory_format in [torch.contiguous_format, torch.channels_last]
 
-        if len(input_tensor.shape) == 3 and memory_format == torch.channels_last:
-            self.fail("Channels last format is not supported for 3D input tensors")
-
-        input_tensor = input_tensor.to(memory_format=memory_format)
+        if memory_format == torch.channels_last:
+            if input_tensor.ndim == 3:
+                input_tensor = input_tensor.movedim(1, -1)
+                weight = weight.movedim(1, -1)
+            else:
+                input_tensor = input_tensor.movedim(-3, -1)
+                weight = weight.movedim(-3, -1)
 
         convs = [
             (
@@ -701,7 +704,7 @@ def test_quantized_conv_per_tensor(
 
         optimized_convs = []
         if input_tensor.dtype == torch.int8 and weight.dtype == torch.int8:
-            if input_tensor.is_contiguous(memory_format=torch.contiguous_format):
+            if memory_format == torch.contiguous_format:
                 optimized_convs = [
                     torch.ops.cadence.quantized_conv_nchw_asym8sxsym8s_asym8s.per_tensor,
                     torch.ops.cadence.quantized_conv_nchw_dilated_asym8sxsym8s_asym8s.per_tensor,
@@ -715,7 +718,7 @@ def test_quantized_conv_per_tensor(
                     torch.ops.cadence.quantized_conv_nhwc_depthwise_asym8sxsym8s_asym8s.per_tensor,
                 ]
         elif input_tensor.dtype == torch.uint8 and weight.dtype == torch.uint8:
-            if input_tensor.is_contiguous(memory_format=torch.contiguous_format):
+            if memory_format == torch.contiguous_format:
                 optimized_convs = [
                     torch.ops.cadence.quantized_conv_nchw_asym8uxsym8u_asym8u.per_tensor,
                     torch.ops.cadence.quantized_conv_nchw_dilated_asym8uxsym8u_asym8u.per_tensor,
@@ -746,7 +749,13 @@ def test_quantized_conv_per_tensor(
             output_zero_point,
             out_multiplier,
             out_shift,
-        ).to(memory_format=torch.contiguous_format)
+        )
+
+        if memory_format == torch.channels_last:
+            if input_tensor.ndim == 3:
+                output = output.movedim(-1, 1)
+            else:
+                output = output.movedim(-1, -3)
 
         # Verify output properties
         self.assertEqual(output.dtype, dtype, f"Output dtype should be {dtype}")

backends/cortex_m/test/test_quantize_op_fusion_pass.py

Lines changed: 3 additions & 7 deletions
@@ -23,7 +23,7 @@
     get_node_args,
 )
 from executorch.exir.dialects._ops import ops as exir_ops
-from torch.export import export, export_for_training
+from torch.export import export
 from torchao.quantization.pt2e.quantize_pt2e import convert_pt2e, prepare_pt2e
 
 
@@ -42,9 +42,7 @@ def _prepare_quantized_model(self, model_class):
         model = model_class()
 
         # Export and quantize
-        exported_model = export_for_training(
-            model.eval(), self.example_inputs, strict=True
-        ).module()
+        exported_model = export(model.eval(), self.example_inputs, strict=True).module()
         prepared_model = prepare_pt2e(exported_model, AddQuantizer())
         quantized_model = convert_pt2e(prepared_model)
 
@@ -242,9 +240,7 @@ def forward(self, x, y):
         inputs = (torch.randn(shape), torch.randn(shape))
 
         model = SingleAddModel()
-        exported_model = export_for_training(
-            model.eval(), inputs, strict=True
-        ).module()
+        exported_model = export(model.eval(), inputs, strict=True).module()
         prepared_model = prepare_pt2e(exported_model, AddQuantizer())
         quantized_model = convert_pt2e(prepared_model)
 

backends/example/test_example_delegate.py

Lines changed: 2 additions & 6 deletions
@@ -46,9 +46,7 @@ def get_example_inputs():
         )
 
         m = model.eval()
-        m = torch.export.export_for_training(
-            m, copy.deepcopy(example_inputs), strict=True
-        ).module()
+        m = torch.export.export(m, copy.deepcopy(example_inputs), strict=True).module()
         # print("original model:", m)
         quantizer = ExampleQuantizer()
         # quantizer = XNNPACKQuantizer()
@@ -84,9 +82,7 @@ def test_delegate_mobilenet_v2(self):
         )
 
         m = model.eval()
-        m = torch.export.export_for_training(
-            m, copy.deepcopy(example_inputs), strict=True
-        ).module()
+        m = torch.export.export(m, copy.deepcopy(example_inputs), strict=True).module()
         quantizer = ExampleQuantizer()
 
         m = prepare_pt2e(m, quantizer)

backends/mediatek/quantizer/annotator.py

Lines changed: 2 additions & 4 deletions
@@ -10,7 +10,7 @@
 from torch._ops import OpOverload
 from torch._subclasses import FakeTensor
 
-from torch.export import export_for_training
+from torch.export import export
 from torch.fx import Graph, Node
 from torch.fx.passes.utils.matcher_with_name_node_map_utils import (
     SubgraphMatcherWithNameNodeMap,
@@ -158,9 +158,7 @@ def forward(self, x):
         return norm, {}
 
     for pattern_cls in (ExecuTorchPattern, MTKPattern):
-        pattern_gm = export_for_training(
-            pattern_cls(), (torch.randn(3, 3),), strict=True
-        ).module()
+        pattern_gm = export(pattern_cls(), (torch.randn(3, 3),), strict=True).module()
         matcher = SubgraphMatcherWithNameNodeMap(
             pattern_gm, ignore_literals=True, remove_overlapping_matches=False
         )
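The pattern-matching idiom in this file (export a small pattern module, then match it against a target graph) works the same under `export` as it did under `export_for_training`. A condensed sketch with hypothetical pattern and target modules; the dict returned alongside the output is what populates `name_node_map` on each match, mirroring the `return norm, {}` convention above:

```python
import torch
from torch.export import export
from torch.fx.passes.utils.matcher_with_name_node_map_utils import (
    SubgraphMatcherWithNameNodeMap,
)


class ReluPattern(torch.nn.Module):  # hypothetical stand-in pattern
    def forward(self, x):
        out = torch.nn.functional.relu(x)
        # Returning named nodes in a dict populates name_node_map.
        return out, {"relu_out": out}


class Target(torch.nn.Module):  # hypothetical graph to search
    def forward(self, x):
        return torch.nn.functional.relu(x * 2)


pattern_gm = export(ReluPattern(), (torch.randn(3, 3),), strict=True).module()
target_gm = export(Target(), (torch.randn(3, 3),), strict=True).module()

matcher = SubgraphMatcherWithNameNodeMap(pattern_gm, ignore_literals=True)
for match in matcher.match(target_gm.graph):
    # Maps the pattern's "relu_out" to the corresponding node in the target.
    print(match.name_node_map["relu_out"])
```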

backends/qualcomm/_passes/TARGETS

Lines changed: 1 addition & 0 deletions
@@ -12,6 +12,7 @@ runtime.python_library(
     ],
     deps = [
         "//executorch/backends/transforms:addmm_mm_to_linear",
+        "//executorch/backends/transforms:decompose_sdpa",
         "//executorch/exir/backend:backend_details",
         "//executorch/exir/backend:compile_spec_schema",
     ],
