pytorch
diff --git a/‎.github/workflows/cuda.yml‎
Lines changed: 1 addition & 1 deletion b/‎.github/workflows/cuda.yml‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎backends/arm/_passes/convert_split_to_slice.py‎
Lines changed: 17 additions & 6 deletions b/‎backends/arm/_passes/convert_split_to_slice.py‎
Lines changed: 17 additions & 6 deletions
diff --git a/‎backends/arm/operator_support/tosa_profile_supported_op_lists.py‎
Lines changed: 2 additions & 0 deletions b/‎backends/arm/operator_support/tosa_profile_supported_op_lists.py‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎backends/arm/quantizer/quantization_annotator.py‎
Lines changed: 1 addition & 0 deletions b/‎backends/arm/quantizer/quantization_annotator.py‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎backends/arm/test/ops/test_slice.py‎
Lines changed: 10 additions & 1 deletion b/‎backends/arm/test/ops/test_slice.py‎
Lines changed: 10 additions & 1 deletion
diff --git a/‎backends/arm/test/ops/test_split.py‎
Lines changed: 85 additions & 11 deletions b/‎backends/arm/test/ops/test_split.py‎
Lines changed: 85 additions & 11 deletions
diff --git a/‎backends/cortex_m/passes/__init__.py‎
Lines changed: 9 additions & 0 deletions b/‎backends/cortex_m/passes/__init__.py‎
Lines changed: 9 additions & 0 deletions
diff --git a/‎backends/cortex_m/passes/cortex_m_pass_manager.py‎
Lines changed: 25 additions & 0 deletions b/‎backends/cortex_m/passes/cortex_m_pass_manager.py‎
Lines changed: 25 additions & 0 deletions
diff --git a/‎backends/cortex_m/test/tester.py‎
Lines changed: 3 additions & 17 deletions b/‎backends/cortex_m/test/tester.py‎
Lines changed: 3 additions & 17 deletions
diff --git a/‎backends/cuda/cuda_backend.py‎
Lines changed: 9 additions & 0 deletions b/‎backends/cuda/cuda_backend.py‎
Lines changed: 9 additions & 0 deletions
@@ -71,7 +71,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        model: [linear, add, add_mul, resnet18]
+        model: [linear, add, add_mul, resnet18, conv1d]
     with:
       timeout: 90
       runner: linux.g5.4xlarge.nvidia.gpu
 
@@ -46,13 +46,24 @@ def call(self, graph_module: torch.fx.GraphModule):
             dim = (dim + rank) % rank
 
             # Validate that split lengths cover the entire dimension
-            length_sum = sum(split_lengths)
+
             dim_size = shape[dim]
-            if length_sum != dim_size:
-                raise ValueError(
-                    f"Split sizes {split_lengths} sum to {length_sum}, "
-                    f"but dimension {dim} has size {dim_size}"
-                )
+            if isinstance(split_lengths, int):
+                if split_lengths <= 0:
+                    raise ValueError(
+                        f"Split size must be positive, got {split_lengths}"
+                    )
+                full_chunks, remainder = divmod(dim_size, split_lengths)
+                split_lengths = [split_lengths] * full_chunks
+                if remainder:
+                    split_lengths.append(remainder)
+            else:
+                length_sum = sum(split_lengths)
+                if length_sum != dim_size:
+                    raise ValueError(
+                        f"Split sizes {split_lengths} sum to {length_sum}, "
+                        f"but dimension {dim} has size {dim_size}"
+                    )
 
             # Convert split argument 'split_lengths' to slice arguments start and end.
             starts = [0] * len(split_lengths)
 
@@ -55,6 +55,7 @@
     exir_ops.edge.aten.log.default,
     exir_ops.edge.aten.linear.default,
     exir_ops.edge.aten.split_with_sizes_copy.default,
+    exir_ops.edge.aten.split_copy.Tensor,
     exir_ops.edge.aten.floor.default,
     exir_ops.edge.aten.full.default,
     exir_ops.edge.aten.full_like.default,
@@ -152,6 +153,7 @@
     exir_ops.edge.aten.log.default,
     exir_ops.edge.aten.linear.default,
     exir_ops.edge.aten.split_with_sizes_copy.default,
+    exir_ops.edge.aten.split_copy.Tensor,
     exir_ops.edge.aten.floor.default,
     exir_ops.edge.aten.full.default,
     exir_ops.edge.aten.full_like.default,
 
@@ -330,6 +330,7 @@ def _match_pattern(
     torch.ops.aten.slice_copy.Tensor,
     torch.ops.aten.split.Tensor,
     torch.ops.aten.split_with_sizes.default,
+    torch.ops.aten.split_copy.Tensor,
     torch.ops.aten.transpose.Dimname,
     torch.ops.aten.transpose.int,
     torch.ops.aten.transpose_copy.int,
 
@@ -7,6 +7,7 @@
 
 from typing import Tuple
 
+import pytest
 import torch
 from executorch.backends.arm.quantizer.arm_quantizer import (
     get_symmetric_a16w8_quantization_config,
@@ -43,7 +44,6 @@
 
 
 class Slice(torch.nn.Module):
-
     def forward(self, x: torch.Tensor, s: list[tuple[int, int]]):
         slices = [slice(*i) for i in s]
         return x[slices]
@@ -153,6 +153,9 @@ def get_symmetric_a16w8_slice_quantizer(per_channel_quantization=False):
 
 
 @common.parametrize("test_data", test_data_suite)
+@pytest.mark.xfail(
+    reason="missing int16 slice ops support; fails at TOSA reference model with Unsupported operation type or rank. See: https://github.com/pytorch/executorch/issues/13976"
+)
 def test_slice_tensor_16a8w_tosa_INT(test_data: torch.Tensor):
     """Test slice operation with 16A8W quantization (16-bit activations, 8-bit weights)"""
     per_channel_quantization = False
@@ -178,6 +181,9 @@ def test_slice_tensor_16a8w_tosa_INT(test_data: torch.Tensor):
 
 @common.parametrize("test_data", test_data_suite)
 @common.XfailIfNoCorstone300
+@pytest.mark.xfail(
+    reason="Vela compilation fails with 'Invalid arguments' for int16 slice operations"
+)
 def test_slice_tensor_16a8w_u55_INT16(test_data: torch.Tensor):
     """Test slice operation with 16A8W quantization on U55 (16-bit activations, 8-bit weights)"""
     per_channel_quantization = False
@@ -202,6 +208,9 @@ def test_slice_tensor_16a8w_u55_INT16(test_data: torch.Tensor):
 
 @common.parametrize("test_data", test_data_suite)
 @common.XfailIfNoCorstone320
+@pytest.mark.xfail(
+    reason="Vela compilation fails with 'Invalid arguments' for int16 slice operations"
+)
 def test_slice_tensor_16a8w_u85_INT16(test_data: torch.Tensor):
     """Test slice operation with 16A8W quantization on U85 (16-bit activations, 8-bit weights)"""
     per_channel_quantization = False
 
@@ -22,7 +22,6 @@
 
 
 class Split(torch.nn.Module):
-
     test_data = {
         "split_1d_2_size_0_dim": lambda: (torch.rand(10), 2, 0),
         "split_2d_3_size_1_dim": lambda: (torch.rand(10, 10), 3, 1),
@@ -60,12 +59,24 @@ def forward(
         return x.split(split_size=split_size_or_sections, dim=dim)[1:3]
 
 
+class SplitCopy(torch.nn.Module):
+    aten_op = "torch.ops.aten.split_copy.Tensor"
+    exir_op = "executorch_exir_dialects_edge__ops_aten_split_copy_Tensor"
+
+    def forward(
+        self,
+        x: torch.Tensor,
+        split_size: int,
+        dim: int,
+    ):
+        return torch.split_copy(x, split_size=split_size, dim=dim)
+
+
 @common.parametrize(
     "test_data",
     (Split.test_data | Split.test_data_list),
 )
 def test_split_with_sizes_tosa_FP(test_data: input_t1):
-
     pipeline = TosaPipelineFP[input_t1](
         Split(),
         test_data(),
@@ -77,7 +88,6 @@ def test_split_with_sizes_tosa_FP(test_data: input_t1):
 
 @common.parametrize("test_data", Split.test_data_list)
 def test_split_with_sizes_tosa_FP_2(test_data: input_t1):
-
     pipeline = TosaPipelineFP[input_t1](
         SplitWithSizes(),
         test_data(),
@@ -92,7 +102,6 @@ def test_split_with_sizes_tosa_FP_2(test_data: input_t1):
     (Split.test_data | Split.test_data_list),
 )
 def test_split_with_sizes_tosa_FP_one_out(test_data: input_t1):
-
     pipeline = TosaPipelineFP[input_t1](
         SplitSingleOut(),
         test_data(),
@@ -107,7 +116,6 @@ def test_split_with_sizes_tosa_FP_one_out(test_data: input_t1):
     (Split.test_data | Split.test_data_list),
 )
 def test_split_with_sizes_tosa_FP_two_out(test_data: input_t1):
-
     pipeline = TosaPipelineFP[input_t1](
         SplitTwoOut(),
         test_data(),
@@ -122,7 +130,6 @@ def test_split_with_sizes_tosa_FP_two_out(test_data: input_t1):
     (Split.test_data | Split.test_data_list),
 )
 def test_split_with_sizes_tosa_INT(test_data: input_t1):
-
     pipeline = TosaPipelineINT[input_t1](
         Split(),
         test_data(),
@@ -161,7 +168,6 @@ def test_split_with_sizes_u55_INT(test_data: input_t1):
 )
 @common.XfailIfNoCorstone320
 def test_split_with_sizes_u85_INT(test_data: input_t1):
-
     pipeline = EthosU85PipelineINT[input_t1](
         Split(),
         test_data(),
@@ -190,7 +196,6 @@ def test_split_with_sizes_vgf_FP(test_data: input_t1):
 @common.parametrize("test_data", Split.test_data_list)
 @common.SkipIfNoModelConverter
 def test_split_with_sizes_vgf_FP_2(test_data: input_t1):
-
     pipeline = VgfPipeline[input_t1](
         SplitWithSizes(),
         test_data(),
@@ -207,7 +212,6 @@ def test_split_with_sizes_vgf_FP_2(test_data: input_t1):
 )
 @common.SkipIfNoModelConverter
 def test_split_with_sizes_vgf_FP_one_out(test_data: input_t1):
-
     pipeline = VgfPipeline[input_t1](
         SplitSingleOut(),
         test_data(),
@@ -224,7 +228,6 @@ def test_split_with_sizes_vgf_FP_one_out(test_data: input_t1):
 )
 @common.SkipIfNoModelConverter
 def test_split_with_sizes_vgf_FP_two_out(test_data: input_t1):
-
     pipeline = VgfPipeline[input_t1](
         SplitTwoOut(),
         test_data(),
@@ -241,7 +244,6 @@ def test_split_with_sizes_vgf_FP_two_out(test_data: input_t1):
 )
 @common.SkipIfNoModelConverter
 def test_split_with_sizes_vgf_INT(test_data: input_t1):
-
     pipeline = VgfPipeline[input_t1](
         Split(),
         test_data(),
@@ -250,3 +252,75 @@ def test_split_with_sizes_vgf_INT(test_data: input_t1):
         tosa_version="TOSA-1.0+INT",
     )
     pipeline.run()
+
+
+@common.parametrize("test_data", Split.test_data)
+def test_split_tensor_tosa_FP(test_data: Tuple):
+    pipeline = TosaPipelineFP[input_t1](
+        SplitCopy(),
+        test_data(),
+        aten_op=SplitCopy.aten_op,
+        exir_op=SplitCopy.exir_op,
+    )
+    pipeline.run()
+
+
+@common.parametrize("test_data", Split.test_data)
+def test_split_tensor_tosa_INT(test_data: Tuple):
+    pipeline = TosaPipelineINT[input_t1](
+        SplitCopy(),
+        test_data(),
+        aten_op=SplitCopy.aten_op,
+        exir_op=SplitCopy.exir_op,
+    )
+    pipeline.run()
+
+
+@common.XfailIfNoCorstone300
+@common.parametrize("test_data", Split.test_data)
+def test_split_tensor_u55_INT(test_data: Tuple):
+    pipeline = EthosU55PipelineINT[input_t1](
+        SplitCopy(),
+        test_data(),
+        aten_ops=SplitCopy.aten_op,
+        exir_ops=SplitCopy.exir_op,
+    )
+    pipeline.run()
+
+
+@common.XfailIfNoCorstone320
+@common.parametrize("test_data", Split.test_data)
+def test_split_tensor_u85_INT(test_data: Tuple):
+    pipeline = EthosU85PipelineINT[input_t1](
+        SplitCopy(),
+        test_data(),
+        aten_ops=SplitCopy.aten_op,
+        exir_ops=SplitCopy.exir_op,
+    )
+    pipeline.run()
+
+
+@common.parametrize("test_data", Split.test_data)
+@common.SkipIfNoModelConverter
+def test_split_tensor_vgf_FP(test_data: Tuple):
+    pipeline = VgfPipeline[input_t1](
+        SplitCopy(),
+        test_data(),
+        aten_op=SplitCopy.aten_op,
+        exir_op=SplitCopy.exir_op,
+        tosa_version="TOSA-1.0+FP",
+    )
+    pipeline.run()
+
+
+@common.parametrize("test_data", Split.test_data)
+@common.SkipIfNoModelConverter
+def test_split_tensor_vgf_INT(test_data: Tuple):
+    pipeline = VgfPipeline[input_t1](
+        SplitCopy(),
+        test_data(),
+        aten_op=SplitCopy.aten_op,
+        exir_op=SplitCopy.exir_op,
+        tosa_version="TOSA-1.0+INT",
+    )
+    pipeline.run()
@@ -0,0 +1,9 @@
+# Copyright 2025 Arm Limited and/or its affiliates.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+from .quantized_linear_fusion_pass import QuantizedLinearFusionPass  # noqa
+from .quantized_op_fusion_pass import QuantizedOpFusionPass  # noqa
+from .replace_quant_nodes_pass import ReplaceQuantNodesPass  # noqa
+from .cortex_m_pass_manager import CortexMPassManager  # noqa  # usort: skip
@@ -0,0 +1,25 @@
+# Copyright 2025 Arm Limited and/or its affiliates.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+
+from executorch.backends.cortex_m.passes import (
+    QuantizedLinearFusionPass,
+    QuantizedOpFusionPass,
+    ReplaceQuantNodesPass,
+)
+from executorch.backends.xnnpack._passes import XNNPACKPassManager
+from executorch.exir.pass_base import ExportPass
+
+
+class CortexMPassManager(XNNPACKPassManager):
+
+    pass_list: list[ExportPass] = [
+        ReplaceQuantNodesPass,
+        QuantizedOpFusionPass,
+        QuantizedLinearFusionPass,
+    ]
+
+    def __init__(self, exported_program, passes=None):
+        super().__init__(exported_program, passes or self.pass_list)
@@ -10,16 +10,7 @@
 import torch
 from executorch.backends.arm.test.common import get_u55_compile_spec
 from executorch.backends.arm.test.tester.arm_tester import Serialize
-from executorch.backends.cortex_m.passes.quantized_linear_fusion_pass import (
-    QuantizedLinearFusionPass,
-)
-from executorch.backends.cortex_m.passes.quantized_op_fusion_pass import (
-    QuantizedOpFusionPass,
-)
-
-from executorch.backends.cortex_m.passes.replace_quant_nodes_pass import (
-    ReplaceQuantNodesPass,
-)
+from executorch.backends.cortex_m.passes.cortex_m_pass_manager import CortexMPassManager
 from executorch.backends.test.harness import Tester as TesterBase
 from executorch.backends.test.harness.stages import (
     Export,
@@ -29,7 +20,6 @@
     ToEdgeTransformAndLower,
     ToExecutorch,
 )
-from executorch.backends.xnnpack._passes import XNNPACKPassManager
 
 from executorch.backends.xnnpack.quantizer.xnnpack_quantizer import (
     get_symmetric_quantization_config,
@@ -47,12 +37,8 @@ def __init__(self):
 class CortexMRunPasses(RunPasses):
     def __init__(self):
         super().__init__(
-            XNNPACKPassManager,
-            pass_list=[
-                ReplaceQuantNodesPass,
-                QuantizedLinearFusionPass,
-                QuantizedOpFusionPass,
-            ],
+            CortexMPassManager,
+            CortexMPassManager.pass_list,
         )
 
 
 
@@ -24,9 +24,14 @@
 )
 from executorch.exir.backend.compile_spec_schema import CompileSpec
 from torch._inductor.codegen.cpp_wrapper_cpu import CppWrapperCpu
+from torch._inductor.decomposition import conv1d_to_conv2d
 from torch.export.passes import move_to_device_pass
 from torch.nn.attention import SDPBackend
 
+cuda_decomposition_table = {
+    torch.ops.aten.conv1d.default: conv1d_to_conv2d,
+}
+
 # exist fallback operators in et namespace;
 supported_fallback_kernels: Dict[str, Any] = {}
 
@@ -119,6 +124,10 @@ def preprocess(
         # replace slice_copy with slice
         ReplaceSliceCopyWithSlicePass()(cuda_edge_program.graph_module)
 
+        cuda_edge_program = cuda_edge_program.run_decompositions(
+            cuda_decomposition_table
+        )
+
         edge_program_module = cuda_edge_program.module()
 
         # Grab all input placeholders from the graph