Commit d67426f

Author: ssjia
Update on "[ET-VK] Add kInt8x4 dtype and GPUMemoryLayouts for packed quantized tensors"
## Motivation

Lay the foundations for being able to execute statically quantized CNNs with ET-VK. Unlike with dynamic quantization, static quantization allows the output of quantized operators to stay in integer representation and be fed directly to the next quantized operator.

## Context

Typically, int8 quantized tensors can be represented by simply having the tensor use the int8 data type. While this is possible in ET-VK, in practice quantized operators expect int8 quantized tensors to be packed so that 16 8-bit values are stored in each `ivec4`, meaning quantized int8 tensors load/store with a granularity of 16 elements. The reason for this is twofold:

* Support for the shader int8 / storage buffer int8 extensions is not guaranteed, meaning some devices do not allow using int8 types in shaders.
* We have found that load/store from storage buffers/textures that use int8 data types sometimes results in worse memory load performance, due to vectorized load/store instructions not being used.

Therefore, in ET-VK we need a way to mark that a quantized tensor should:

1. Use int32 as the underlying data type for the storage buffer/texture
2. Account for the block-packing that may be used

## Changes

First, introduce the `Int8x4` dtype that can be used for packed int8 tensors. This dtype is functionally the same as `Int`, but denotes that each int32 actually contains 4 packed 8-bit values.

Second, introduce new memory layouts: `kPackedInt8_4W4C` and `kPackedInt8_4H4W`. The former will be used for convolution, while the latter will be used for matrix multiplication. See the inline comments for more details about these memory layouts.

Then, update `QuantizedConvolution.cpp` and `QuantizedLinear.cpp` to use the new data type and memory layouts for the packed int8 input tensor.

Differential Revision: [D82542336](https://our.internmc.facebook.com/intern/diff/D82542336/)

[ghstack-poisoned]
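To illustrate the 4-per-int32 packing idea, here is a minimal NumPy sketch; the helper names are hypothetical, and the actual block ordering is defined by the memory layouts and their inline comments rather than by this snippet:

```python
import numpy as np

def pack_int8x4(values: np.ndarray) -> np.ndarray:
    """Pack a flat int8 array into int32s, four 8-bit values per int32."""
    assert values.dtype == np.int8 and values.size % 4 == 0
    # Reinterpreting each group of 4 bytes as one 32-bit integer mirrors how an
    # Int8x4 tensor stores its data in an int32 storage buffer/texture.
    return values.reshape(-1, 4).view(np.int32).reshape(-1)

def unpack_int8x4(packed: np.ndarray) -> np.ndarray:
    """Recover the original int8 values from the packed int32 buffer."""
    return packed.view(np.int8)

q = np.arange(-8, 8, dtype=np.int8)   # 16 int8 values = one ivec4 worth of packed data
packed = pack_int8x4(q)               # 4 int32s
assert np.array_equal(unpack_int8x4(packed), q)
```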
2 parents: ff401cb + 1d26e94

File tree

46 files changed: +794, -659 lines changed


.ci/scripts/test_backend_linux.sh

Lines changed: 6 additions & 1 deletion

@@ -39,12 +39,17 @@ if [[ "$FLOW" == *qnn* ]]; then
 fi
 
 if [[ "$FLOW" == *vulkan* ]]; then
-  # Setup swiftshader and Vulkan SDK which are required to build the Vulkan delegate
+  # Setup swiftshader and Vulkan SDK which are required to build the Vulkan delegate.
   source .ci/scripts/setup-vulkan-linux-deps.sh
 
   EXTRA_BUILD_ARGS+=" -DEXECUTORCH_BUILD_VULKAN=ON"
 fi
 
+if [[ "$FLOW" == *arm* ]]; then
+  # Setup ARM deps.
+  .ci/scripts/setup-arm-baremetal-tools.sh
+fi
+
 # We need the runner to test the built library.
 PYTHON_EXECUTABLE=python CMAKE_ARGS="$EXTRA_BUILD_ARGS" .ci/scripts/setup-linux.sh --build-tool cmake --build-mode Release --editable true

.ci/scripts/test_llava.sh

Lines changed: 1 addition & 1 deletion

@@ -149,7 +149,7 @@ run_and_verify() {
 
   # verify result.txt
   RESULT=$(cat result.txt)
-  EXPECTED_PREFIX="ASSISTANT: image captures a basketball game in progress, with"
+  EXPECTED_PREFIX="ASSISTANT: The image captures a basketball game in progress, with"
 
   if [[ "${RESULT}" == *"${EXPECTED_PREFIX}"* ]]; then
    echo "Expected result prefix: ${EXPECTED_PREFIX}"

.github/workflows/_link_check.yml

Lines changed: 26 additions & 0 deletions

@@ -55,3 +55,29 @@ jobs:
           echo "Or add \`@lint-ignore\` somewhere on the same line as the reference you want to skip checking."
           exit 1
         }
+
+  lint-file-size:
+    if: ${{ github.event_name == 'pull_request' }}
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+    with:
+      runner: linux.2xlarge
+      docker-image: ci-image:executorch-ubuntu-22.04-linter
+      submodules: false
+      fetch-depth: 0
+      ref: ${{ inputs.ref }}
+      timeout: 30
+      script: |
+        chmod +x ./scripts/lint_file_size.sh
+        ./scripts/lint_file_size.sh $(
+          if [ "${{ github.event_name }}" = "pull_request" ]; then
+            echo "${{ github.event.pull_request.base.sha }}" "${{ github.event.pull_request.head.sha }}"
+          else
+            echo "${{ github.event.before }}" "${{ github.sha }}"
+          fi
+        ) || {
+          echo
+          echo "File size lint failed: some files exceed the 1 MB limit."
+          echo "If you really need large files, consider using Git LFS or storing them elsewhere."
+          echo "If you really need to get unblocked and check in the file, can add it to the EXCEPTIONS list in scripts/lint_file_size.sh."
+          exit 1
+        }

.github/workflows/_test_backend.yml

Lines changed: 6 additions & 1 deletion

@@ -31,6 +31,11 @@ on:
         required: false
         type: boolean
         default: false
+      runner-linux:
+        description: 'Runner type for Linux jobs'
+        required: false
+        type: string
+        default: linux.4xlarge.memory
 
 jobs:
   test-backend-linux:
@@ -44,7 +49,7 @@ jobs:
     uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
     with:
       ref: ${{ inputs.ref }}
-      runner: linux.4xlarge.memory
+      runner: ${{ inputs.runner-linux }}
       docker-image: ci-image:executorch-ubuntu-22.04-clang12
       submodules: recursive
       timeout: ${{ inputs.timeout }}

.github/workflows/test-backend-arm.yml

Lines changed: 27 additions & 0 deletions

@@ -0,0 +1,27 @@
+name: Test ARM Backend
+
+on:
+  schedule:
+    - cron: 0 2 * * *
+  push:
+    tags:
+      - ciflow/nightly/*
+  pull_request:
+    paths:
+      - .github/workflows/test-backend-arm.yml
+      - .github/workflows/_test_backend.yml
+  workflow_dispatch:
+
+concurrency:
+  group: ${{ github.workflow }}--${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
+  cancel-in-progress: true
+
+jobs:
+  test-arm:
+    uses: ./.github/workflows/_test_backend.yml
+    with:
+      backend: arm
+      flows: '["arm_tosa"]'
+      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+      timeout: 120
+      run-linux: true

.github/workflows/test-backend-qnn.yml

Lines changed: 1 addition & 0 deletions

@@ -25,3 +25,4 @@ jobs:
       ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
       timeout: 120
       run-linux: true
+      runner-linux: linux.8xlarge.memory

backends/arm/test/tester/arm_tester.py

Lines changed: 14 additions & 2 deletions

@@ -57,6 +57,7 @@
 
 from executorch.backends.arm.vgf import VgfCompileSpec, VgfPartitioner
 
+from executorch.backends.test.harness.error_statistics import ErrorStatistics
 from executorch.backends.test.harness.stages import Stage, StageType
 from executorch.backends.xnnpack.test.tester import Tester
 from executorch.devtools.backend_debug import get_delegation_info
@@ -333,6 +334,7 @@ def to_edge_transform_and_lower(
         transform_passes: Optional[
             Union[Sequence[PassType], Dict[str, Sequence[PassType]]]
         ] = None,
+        generate_etrecord: bool = False,
     ):
         if transform_passes is not None:
             raise RuntimeError(
@@ -367,7 +369,9 @@ def to_edge_transform_and_lower(
         to_edge_and_lower_stage.partitioners = partitioners
         if edge_compile_config is not None:
             to_edge_and_lower_stage.edge_compile_conf = edge_compile_config
-        return super().to_edge_transform_and_lower(to_edge_and_lower_stage)
+        return super().to_edge_transform_and_lower(
+            to_edge_and_lower_stage, generate_etrecord=generate_etrecord
+        )
 
     def to_executorch(self, to_executorch_stage: Optional[ToExecutorch] | None = None):
         if to_executorch_stage is None:
@@ -402,6 +406,7 @@ def run_method_and_compare_outputs(
         qtol=0,
         error_callbacks=None,
         run_eager_mode=False,
+        statistics_callback: Callable[[ErrorStatistics], None] | None = None,
    ):
        """
        Compares the run_artifact output of 'stage' with the output of a reference stage.
@@ -657,10 +662,17 @@ def _compare_outputs(
         rtol=1e-03,
         qtol=0,
         error_callbacks=None,
+        statistics_callback: Callable[[ErrorStatistics], None] | None = None,
     ):
         try:
             super()._compare_outputs(
-                reference_output, stage_output, quantization_scale, atol, rtol, qtol
+                reference_output,
+                stage_output,
+                quantization_scale,
+                atol,
+                rtol,
+                qtol,
+                statistics_callback=statistics_callback,
             )
         except AssertionError as e:
             if error_callbacks is None:

backends/cadence/aot/TARGETS

Lines changed: 17 additions & 1 deletion

@@ -143,7 +143,23 @@ executorch_generated_lib(
     visibility = ["PUBLIC"],
     deps = [
         "//executorch/backends/cadence/generic/kernels:cadence_kernels",
-        "//executorch/backends/cadence/generic/operators:cadence_generic_ops",
+        # Individual operator targets instead of combined cadence_generic_ops
+        "//executorch/backends/cadence/generic/operators:op_add",
+        "//executorch/backends/cadence/generic/operators:op_embedding",
+        "//executorch/backends/cadence/generic/operators:op_full",
+        "//executorch/backends/cadence/generic/operators:op_requantize_out",
+        "//executorch/backends/cadence/generic/operators:op_view_copy",
+        "//executorch/backends/cadence/generic/operators:im2row_out",
+        "//executorch/backends/cadence/generic/operators:dequantize_per_tensor",
+        "//executorch/backends/cadence/generic/operators:quantize_per_tensor",
+        "//executorch/backends/cadence/generic/operators:quantized_add_out",
+        "//executorch/backends/cadence/generic/operators:quantized_conv_nchw_out",
+        "//executorch/backends/cadence/generic/operators:quantized_conv_nhwc_out",
+        "//executorch/backends/cadence/generic/operators:quantized_fully_connected_out",
+        "//executorch/backends/cadence/generic/operators:quantized_layer_norm",
+        "//executorch/backends/cadence/generic/operators:quantized_linear_out",
+        "//executorch/backends/cadence/generic/operators:quantized_matmul_out",
+        "//executorch/backends/cadence/generic/operators:quantized_relu_out",
         "//executorch/kernels/portable:executorch_all_ops",
         "//executorch/kernels/portable:operators",
     ],

backends/cadence/aot/ops_registrations.py

Lines changed: 39 additions & 0 deletions

@@ -324,6 +324,19 @@
     "rope.out(Tensor input, Tensor sin_tensor, Tensor cos_tensor, Tensor? pos, *, Tensor(a!) out) -> Tensor(a!)"
 )
 
+lib.define(
+    "quantized_softmax(Tensor input, Tensor mask, int dim, Tensor in_scale, Tensor in_zero_point, Tensor out_scale, Tensor out_zero_point) -> (Tensor out)"
+)
+lib.define(
+    "quantized_softmax.per_tensor(Tensor input, Tensor mask, int dim, float in_scale, int in_zero_point, float out_scale, int out_zero_point) -> (Tensor out)"
+)
+lib.define(
+    "quantized_softmax.out(Tensor input, Tensor mask, int dim, Tensor in_scale, Tensor in_zero_point, Tensor out_scale, Tensor out_zero_point, *, Tensor(a!) out) -> Tensor (a!)"
+)
+lib.define(
+    "quantized_softmax.per_tensor_out(Tensor input, Tensor mask, int dim, float in_scale, int in_zero_point, float out_scale, int out_zero_point, *, Tensor(a!) out) -> Tensor (a!)"
+)
+
 # Load/store with iDMA. These only exist before memory planning.
 # Post memory planning, we check that outputs/inputs for the load/store are in
 # DTCM and replace idma_load/idma_store with idma_copy.
@@ -2329,3 +2342,29 @@ def softmax_f32_f32_meta(
     half_to_float: Optional[bool] = None,
 ) -> torch.Tensor:
     return self.new_empty(self.size(), dtype=self.dtype)
+
+
+@register_fake("cadence::quantized_softmax")
+def quantized_softmax_meta(
+    input: torch.Tensor,
+    mask: torch.Tensor,
+    dim: int,
+    in_scale: torch.Tensor,
+    in_zero_point: torch.Tensor,
+    out_scale: torch.Tensor,
+    out_zero_point: torch.Tensor,
+) -> torch.Tensor:
+    return input.new_empty(input.size(), dtype=input.dtype)
+
+
+@register_fake("cadence::quantized_softmax.per_tensor")
+def quantized_softmax_per_tensor_meta(
+    input: torch.Tensor,
+    mask: torch.Tensor,
+    dim: int,
+    in_scale: float,
+    in_zero_point: int,
+    out_scale: float,
+    out_zero_point: int,
+) -> torch.Tensor:
+    return input.new_empty(input.size(), dtype=input.dtype)
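
With these registrations in place, the new schema can be exercised for shape propagation without a device kernel. The following is a hypothetical sketch, not part of this diff: the module path, shapes, and scale/zero-point values are illustrative, and it assumes the op library above has been imported.

```python
import torch
from torch._subclasses.fake_tensor import FakeTensorMode

# Importing the registrations module defines cadence::quantized_softmax.* and
# registers the fake (meta) kernels shown in the diff above.
import executorch.backends.cadence.aot.ops_registrations  # noqa: F401

with FakeTensorMode():
    x = torch.empty(1, 8, 128, dtype=torch.int8)
    mask = torch.empty(1, 8, 8, dtype=torch.int32)  # last dim = 128 // 16
    out = torch.ops.cadence.quantized_softmax.per_tensor(
        x, mask, -1, 0.05, 0, 0.05, 0
    )
    assert out.shape == x.shape and out.dtype == x.dtype
```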

backends/cadence/aot/quantizer/fusion_pass.py

Lines changed: 78 additions & 1 deletion

@@ -6,9 +6,10 @@
 
 # pyre-strict
 
-from typing import Any, Dict, List, Tuple
+from typing import Any, cast, Dict, List, Tuple
 
 import torch
+from executorch.backends.cadence.aot.compiler_utils import get_shape
 from executorch.backends.cadence.aot.quantizer.patterns import (
     AddmmPattern,
     AddPattern,
@@ -25,6 +26,7 @@
     MatmulPattern,
     ReluPattern0,
     ReluPattern1,
+    SoftmaxPattern,
 )
 from executorch.backends.cadence.aot.quantizer.utils import (
     check_out_zero_point_is_min_range,
@@ -388,6 +390,73 @@ def get_args_and_kwargs_relu(
     return args, kwargs
 
 
+def get_args_and_kwargs_softmax(
+    graph_module: GraphModule,
+    inputs_inputs: List[fx.Node],
+    dequants_inputs: List[fx.Node],
+    quant_node: fx.Node,
+    op_node: fx.Node,
+) -> Tuple[Tuple[ArgsType, ...], Dict[str, ArgsType]]:
+    # Make a dummy mask tensor
+    mask_shape = get_shape(graph_module, cast(fx.Node, quant_node.args[0]))
+    mask_shape = list(mask_shape) if mask_shape else []
+    mask_shape[-1] = mask_shape[-1] // 16
+    mask_tensor = graph_module.graph.call_function(
+        torch.ops.aten.full.default,
+        (
+            mask_shape,
+            0.0,
+        ),
+        {"dtype": torch.int32},
+    )
+    # Make the scale and zero_point tensors
+    in_scale_tensor = graph_module.graph.call_function(
+        torch.ops.aten.full.default,
+        (
+            [1],
+            dequants_inputs[0].args[1],
+        ),
+        {"dtype": torch.float32},
+    )
+    in_zero_point_tensor = graph_module.graph.call_function(
+        torch.ops.aten.full.default,
+        (
+            [1],
+            dequants_inputs[0].args[2],
+        ),
+        {"dtype": torch.int32},
+    )
+    out_scale_tensor = graph_module.graph.call_function(
+        torch.ops.aten.full.default,
+        (
+            [1],
+            quant_node.args[1],
+        ),
+        {"dtype": torch.float32},
+    )
+    out_zero_point_tensor = graph_module.graph.call_function(
+        torch.ops.aten.full.default,
+        (
+            [1],
+            quant_node.args[2],
+        ),
+        {"dtype": torch.int32},
+    )
+
+    # Make the args and kwargs for the replacement op
+    args = (
+        inputs_inputs[0],
+        mask_tensor,
+        op_node.args[1],
+        in_scale_tensor,
+        in_zero_point_tensor,
+        out_scale_tensor,
+        out_zero_point_tensor,
+    )
+    kwargs = {}
+    return args, kwargs
+
+
 class QuantFusion(ExportPass):
     # pyre-ignore[2]: Parameter `patterns` has no type specified
     def __init__(self, patterns) -> None:
@@ -543,6 +612,14 @@ def call(self, graph_module: fx.GraphModule) -> PassResult:  # noqa: C901
                         dequants_inputs,
                         quant_node,
                     )
+                elif isinstance(pattern, SoftmaxPattern):
+                    args, kwargs = get_args_and_kwargs_softmax(
+                        graph_module,
+                        inputs_inputs,
+                        dequants_inputs,
+                        quant_node,
+                        anchor_output_node,
+                    )
                 fused = graph_module.graph.call_function(
                     pattern.replacement_op(),
                     args,
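
For reference, the scale and zero-point values pulled from the dequant/quant nodes above follow the standard per-tensor affine quantization convention. A minimal sketch of that mapping (not part of this diff; the int8 range and rounding behavior are assumptions):

```python
import torch

def quantize_per_tensor(x: torch.Tensor, scale: float, zero_point: int) -> torch.Tensor:
    # q = clamp(round(x / scale) + zero_point, qmin, qmax), assuming an int8 range here.
    q = torch.round(x / scale) + zero_point
    return q.clamp(-128, 127).to(torch.int8)

def dequantize_per_tensor(q: torch.Tensor, scale: float, zero_point: int) -> torch.Tensor:
    # Inverse affine mapping back to floating point.
    return (q.to(torch.float32) - zero_point) * scale

x = torch.randn(2, 8).clamp(-6.0, 6.0)
q = quantize_per_tensor(x, scale=0.05, zero_point=0)
x_hat = dequantize_per_tensor(q, scale=0.05, zero_point=0)
assert torch.allclose(x, x_hat, atol=0.05)  # round-trip error bounded by scale / 2
```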

0 commit comments
