Commit 2df8439 (2 parents: 7e4be82 + 335de46)

Update
[ghstack-poisoned]

69 files changed: +3086, -996 lines

.ci/scripts/test_backend_linux.sh

Lines changed: 31 additions & 4 deletions

```diff
@@ -10,6 +10,8 @@ SUITE=$1
 FLOW=$2
 ARTIFACT_DIR=$3
 
+REPORT_FILE="$ARTIFACT_DIR/test-report-$FLOW-$SUITE.csv"
+
 echo "Running backend test job for suite $SUITE, flow $FLOW."
 echo "Saving job artifacts to $ARTIFACT_DIR."
 
@@ -18,13 +20,38 @@ eval "$(conda shell.bash hook)"
 CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
 conda activate "${CONDA_ENV}"
 
-# Setup swiftshader and Vulkan SDK which are required to build the Vulkan delegate
-source .ci/scripts/setup-vulkan-linux-deps.sh
+export PYTHON_EXECUTABLE=python
 
 # CMake options to use, in addition to the defaults.
-EXTRA_BUILD_ARGS="-DEXECUTORCH_BUILD_VULKAN=ON"
+EXTRA_BUILD_ARGS=""
+
+if [[ "$FLOW" == *qnn* ]]; then
+  # Setup QNN sdk and deps - note that this is a bit hacky due to the nature of the
+  # Qualcomm build. TODO (gjcomer) Clean this up once the QNN pybinding integration is
+  # cleaned up.
+  PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh --build-tool cmake
+  PYTHON_EXECUTABLE=python bash .ci/scripts/setup-qnn-deps.sh
+  PYTHON_EXECUTABLE=python bash .ci/scripts/build-qnn-sdk.sh
+  QNN_X86_LIB_DIR=$(realpath build-x86/lib/)
+  QNN_SDK_ROOT="/tmp/qnn/2.28.0.241029"
+  export LD_LIBRARY_PATH="$QNN_X86_LIB_DIR:$QNN_SDK_ROOT/lib/x86_64-linux-clang/:${LD_LIBRARY_PATH:-}"
+
+  # TODO Get SDK root from install scripts
+  EXTRA_BUILD_ARGS+=" -DEXECUTORCH_BUILD_QNN=ON -DQNN_SDK_ROOT=$QNN_SDK_ROOT"
+fi
+
+if [[ "$FLOW" == *vulkan* ]]; then
+  # Setup swiftshader and Vulkan SDK which are required to build the Vulkan delegate
+  source .ci/scripts/setup-vulkan-linux-deps.sh
+
+  EXTRA_BUILD_ARGS+=" -DEXECUTORCH_BUILD_VULKAN=ON"
+fi
 
 # We need the runner to test the built library.
 PYTHON_EXECUTABLE=python CMAKE_ARGS="$EXTRA_BUILD_ARGS" .ci/scripts/setup-linux.sh --build-tool cmake --build-mode Release --editable true
 
-python -m executorch.backends.test.suite.runner $SUITE --flow $FLOW --report "$ARTIFACT_DIR/test_results.csv"
+EXIT_CODE=0
+python -m executorch.backends.test.suite.runner $SUITE --flow $FLOW --report "$REPORT_FILE" || EXIT_CODE=$?
+
+# Generate markdown summary.
+python -m executorch.backends.test.suite.generate_markdown_summary "$REPORT_FILE" > ${GITHUB_STEP_SUMMARY:-"step_summary.md"} --exit-code $EXIT_CODE
```
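A note on the new summary step: the CSV report is always rendered into the GitHub step summary, and the runner's exit status is forwarded so the job can still fail. The generate_markdown_summary internals are not part of this diff; below is a minimal sketch of that kind of step, assuming a CSV report with a header row and a per-test result column (the "Result" column name is hypothetical).

```python
# Minimal sketch of a report-to-markdown step; NOT the actual
# executorch.backends.test.suite.generate_markdown_summary module.
# The "Result" column name is an assumption for illustration.
import argparse
import csv
import sys
from collections import Counter


def main() -> None:
    parser = argparse.ArgumentParser()
    parser.add_argument("report")  # CSV written by the test runner
    parser.add_argument("--exit-code", type=int, default=0)
    args = parser.parse_args()

    with open(args.report, newline="") as f:
        rows = list(csv.DictReader(f))

    counts = Counter(row.get("Result", "unknown") for row in rows)

    # stdout is redirected to $GITHUB_STEP_SUMMARY by the calling script.
    print(f"### Backend test summary ({len(rows)} tests)")
    for result, n in sorted(counts.items()):
        print(f"- {result}: {n}")

    # Propagate the runner's status so test failures still fail the job.
    sys.exit(args.exit_code)


if __name__ == "__main__":
    main()
```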

.ci/scripts/test_backend_macos.sh

Lines changed: 7 additions & 1 deletion

```diff
@@ -10,6 +10,8 @@ SUITE=$1
 FLOW=$2
 ARTIFACT_DIR=$3
 
+REPORT_FILE="$ARTIFACT_DIR/test-report-$FLOW-$SUITE.csv"
+
 echo "Running backend test job for suite $SUITE, flow $FLOW."
 echo "Saving job artifacts to $ARTIFACT_DIR."
 
@@ -21,4 +23,8 @@ eval "$(conda shell.bash hook)"
 PYTHON_EXECUTABLE=python
 ${CONDA_RUN} --no-capture-output .ci/scripts/setup-macos.sh --build-tool cmake --build-mode Release
 
-${CONDA_RUN} --no-capture-output python -m executorch.backends.test.suite.runner $SUITE --flow $FLOW --report "$ARTIFACT_DIR/test_results.csv"
+EXIT_CODE=0
+${CONDA_RUN} --no-capture-output python -m executorch.backends.test.suite.runner $SUITE --flow $FLOW --report "$REPORT_FILE" || EXIT_CODE=$?
+
+# Generate markdown summary.
+${CONDA_RUN} --no-capture-output python -m executorch.backends.test.suite.generate_markdown_summary "$REPORT_FILE" > ${GITHUB_STEP_SUMMARY:-"step_summary.md"} --exit-code $EXIT_CODE
```
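The macOS script uses the same defer-the-failure idiom as the Linux one: `|| EXIT_CODE=$?` stops an errexit shell from aborting before the summary is written, and the saved status is surfaced at the end. The equivalent control flow, sketched in Python with placeholder commands:

```python
# Sketch of the defer-and-propagate pattern shared by both CI scripts.
# The command lists are placeholders, not the real invocations.
import subprocess
import sys

# Run the tests but do not fail yet; just remember the status.
exit_code = subprocess.run(["python", "-m", "fake_test_runner"]).returncode

# Always produce the report, even when the tests failed...
subprocess.run(["python", "-m", "fake_summary_generator"], check=True)

# ...then surface the original status so CI still marks the failure.
sys.exit(exit_code)
```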

.github/workflows/nightly.yml

Lines changed: 8 additions & 9 deletions

```diff
@@ -42,7 +42,11 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        flow: [vulkan, xnnpack, xnnpack_static_int8_per_channel]
+        flow: [
+          qnn, qnn_16a16w, qnn_16a8w, qnn_16a4w, qnn_16a4w_block, qnn_8a8w,
+          vulkan, vulkan_static_int8_per_channel,
+          xnnpack, xnnpack_dynamic_int8_per_channel, xnnpack_static_int8_per_channel, xnnpack_static_int8_per_tensor
+        ]
         suite: [models, operators]
     with:
       ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
@@ -53,11 +57,8 @@ jobs:
       upload-artifact: test-report-${{ matrix.flow }}-${{ matrix.suite }}
       script: |
        set -eux
-        # Intentionally suppressing exit code for now.
-        # TODO (gjcomer) Remove this when jobs are stable.
-        EXIT_CODE=0
-        .ci/scripts/test_backend_linux.sh "${{ matrix.suite }}" "${{ matrix.flow }}" "${RUNNER_ARTIFACT_DIR}" || EXIT_CODE=$?
-        echo "Test run complete with exit code $EXIT_CODE."
+
+        source .ci/scripts/test_backend_linux.sh "${{ matrix.suite }}" "${{ matrix.flow }}" "${RUNNER_ARTIFACT_DIR}"
 
   backend-test-macos:
     uses: pytorch/test-infra/.github/workflows/macos_job.yml@main
@@ -82,6 +83,4 @@ jobs:
        # This is needed to get the prebuilt PyTorch wheel from S3
        ${CONDA_RUN} --no-capture-output pip install awscli==1.37.21
 
-        EXIT_CODE=0
-        .ci/scripts/test_backend_macos.sh "${{ matrix.suite }}" "${{ matrix.flow }}" "${RUNNER_ARTIFACT_DIR}" || EXIT_CODE=$?
-        echo "Test run complete with exit code $EXIT_CODE."
+        source .ci/scripts/test_backend_macos.sh "${{ matrix.suite }}" "${{ matrix.flow }}" "${RUNNER_ARTIFACT_DIR}"
```
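With fail-fast disabled, this matrix fans every flow out against both suites as independent jobs, and the upload-artifact name keys each report by combination. A quick sketch of the expansion (12 flows x 2 suites = 24 jobs):

```python
# Sketch: how the fail-fast: false matrix above fans out into jobs.
from itertools import product

flows = [
    "qnn", "qnn_16a16w", "qnn_16a8w", "qnn_16a4w", "qnn_16a4w_block", "qnn_8a8w",
    "vulkan", "vulkan_static_int8_per_channel",
    "xnnpack", "xnnpack_dynamic_int8_per_channel",
    "xnnpack_static_int8_per_channel", "xnnpack_static_int8_per_tensor",
]
suites = ["models", "operators"]

# One artifact name per combination, mirroring upload-artifact above.
artifacts = [f"test-report-{flow}-{suite}" for flow, suite in product(flows, suites)]
print(len(artifacts))  # 24 independent jobs
```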

backends/cadence/aot/functions.yaml

Lines changed: 35 additions & 0 deletions

```diff
@@ -249,6 +249,21 @@
     - arg_meta: null
       kernel_name: impl::reference::quantized_relu_asym8u_asym8u_per_tensor_out
 
+- func: cadence::quantized_add.per_tensor_out(Tensor X, float X_scale, int X_zero_point, Tensor Y, float Y_scale, int Y_zero_point, float out_scale, int out_zero_point, *, Tensor(a!) out) -> Tensor(a!)
+  kernels:
+    - arg_meta: null
+      kernel_name: impl::reference::quantized_add_per_tensor_out
+
+- func: cadence::quantized_add_asym8sxasym8s_asym8s.per_tensor_out(Tensor X, float X_scale, int X_zero_point, Tensor Y, float Y_scale, int Y_zero_point, float out_scale, int out_zero_point, *, Tensor(a!) out) -> Tensor(a!)
+  kernels:
+    - arg_meta: null
+      kernel_name: impl::reference::quantized_add_asym8sxasym8s_asym8s_per_tensor_out
+
+- func: cadence::quantized_add_asym8uxasym8u_asym8u.per_tensor_out(Tensor X, float X_scale, int X_zero_point, Tensor Y, float Y_scale, int Y_zero_point, float out_scale, int out_zero_point, *, Tensor(a!) out) -> Tensor(a!)
+  kernels:
+    - arg_meta: null
+      kernel_name: impl::reference::quantized_add_asym8uxasym8u_asym8u_per_tensor_out
+
 - func: cadence::quantized_matmul.out(Tensor X, int X_zero_point, Tensor Y, int Y_zero_point, Tensor? bias, int out_multiplier, int out_shift, int out_zero_point, bool transposed, *, Tensor(a!) out) -> Tensor(a!)
   kernels:
     - arg_meta: null
@@ -304,6 +319,26 @@
     - arg_meta: null
       kernel_name: impl::reference::quantized_conv_nhwc_asym8uxsym8u_asym8u_per_tensor_out
 
+- func: cadence::quantized_conv_nchw_dilated_asym8sxsym8s_asym8s.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!)
+  kernels:
+    - arg_meta: null
+      kernel_name: impl::reference::quantized_conv_nchw_dilated_asym8sxsym8s_asym8s_per_tensor_out
+
+- func: cadence::quantized_conv_nchw_dilated_asym8uxsym8u_asym8u.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!)
+  kernels:
+    - arg_meta: null
+      kernel_name: impl::reference::quantized_conv_nchw_dilated_asym8uxsym8u_asym8u_per_tensor_out
+
+- func: cadence::quantized_conv_nhwc_dilated_asym8sxsym8s_asym8s.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!)
+  kernels:
+    - arg_meta: null
+      kernel_name: impl::reference::quantized_conv_nhwc_dilated_asym8sxsym8s_asym8s_per_tensor_out
+
+- func: cadence::quantized_conv_nhwc_dilated_asym8uxsym8u_asym8u.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!)
+  kernels:
+    - arg_meta: null
+      kernel_name: impl::reference::quantized_conv_nhwc_dilated_asym8uxsym8u_asym8u_per_tensor_out
+
 - func: cadence::quantized_fully_connected.out(Tensor src, Tensor weight, Tensor bias, int src_zero_point, Tensor weight_zero_point, Tensor out_multiplier, Tensor out_shift, int out_zero_point, Tensor? offset, *, Tensor(a!) out) -> Tensor(a!)
   kernels:
     - arg_meta: null
```
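All three quantized_add registrations share the per_tensor signature: one scale and zero point for each operand and for the output. The C++ kernels are not shown in this diff, but per-tensor quantized addition is conventionally dequantize, add in float, requantize; a NumPy sketch of that arithmetic, offered as an assumption about the semantics rather than a transcription of the kernel:

```python
# Assumed per-tensor quantized add semantics (dequantize -> add -> requantize).
# Illustrative only; not the registered impl::reference kernel.
import numpy as np


def quantized_add_per_tensor(
    X: np.ndarray, X_scale: float, X_zero_point: int,
    Y: np.ndarray, Y_scale: float, Y_zero_point: int,
    out_scale: float, out_zero_point: int,
    dtype=np.int8,  # asym8s variants; the asym8u variants would use np.uint8
) -> np.ndarray:
    x_fp = (X.astype(np.float32) - X_zero_point) * X_scale
    y_fp = (Y.astype(np.float32) - Y_zero_point) * Y_scale
    q = np.round((x_fp + y_fp) / out_scale) + out_zero_point
    info = np.iinfo(dtype)
    return np.clip(q, info.min, info.max).astype(dtype)
```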

backends/cadence/aot/functions_hifi.yaml

Lines changed: 30 additions & 0 deletions

```diff
@@ -330,6 +330,26 @@
     - arg_meta: null
       kernel_name: cadence::impl::HiFi::quantized_conv_nhwc_asym8uxsym8u_asym8u_per_tensor_out
 
+- func: cadence::quantized_conv_nchw_dilated_asym8sxsym8s_asym8s.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!)
+  kernels:
+    - arg_meta: null
+      kernel_name: cadence::impl::HiFi::quantized_conv_nchw_dilated_asym8sxsym8s_asym8s_per_tensor_out
+
+- func: cadence::quantized_conv_nchw_dilated_asym8uxsym8u_asym8u.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!)
+  kernels:
+    - arg_meta: null
+      kernel_name: cadence::impl::HiFi::quantized_conv_nchw_dilated_asym8uxsym8u_asym8u_per_tensor_out
+
+- func: cadence::quantized_conv_nhwc_dilated_asym8sxsym8s_asym8s.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!)
+  kernels:
+    - arg_meta: null
+      kernel_name: cadence::impl::HiFi::quantized_conv_nhwc_dilated_asym8sxsym8s_asym8s_per_tensor_out
+
+- func: cadence::quantized_conv_nhwc_dilated_asym8uxsym8u_asym8u.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!)
+  kernels:
+    - arg_meta: null
+      kernel_name: cadence::impl::HiFi::quantized_conv_nhwc_dilated_asym8uxsym8u_asym8u_per_tensor_out
+
 - func: cadence::quantized_layer_norm.out(Tensor input, Tensor in_scale, Tensor in_zero_point, int[] normalized_shape, Tensor weight, Tensor bias, float eps, float output_scale, int output_zero_point, *, Tensor(a!) out) -> Tensor(a!)
   kernels:
     - arg_meta: null
@@ -384,6 +404,16 @@
     - arg_meta: null
      kernel_name: cadence::impl::HiFi::quantized_relu_asym8u_asym8u_per_tensor_out
 
+- func: cadence::quantized_add_asym8sxasym8s_asym8s.per_tensor_out(Tensor X, float X_scale, int X_zero_point, Tensor Y, float Y_scale, int Y_zero_point, float out_scale, int out_zero_point, *, Tensor(a!) out) -> Tensor(a!)
+  kernels:
+    - arg_meta: null
+      kernel_name: cadence::impl::HiFi::quantized_add_asym8sxasym8s_asym8s_per_tensor_out
+
+- func: cadence::quantized_add_asym8uxasym8u_asym8u.per_tensor_out(Tensor X, float X_scale, int X_zero_point, Tensor Y, float Y_scale, int Y_zero_point, float out_scale, int out_zero_point, *, Tensor(a!) out) -> Tensor(a!)
+  kernels:
+    - arg_meta: null
+      kernel_name: cadence::impl::HiFi::quantized_add_asym8uxasym8u_asym8u_per_tensor_out
+
 - func: cadence::quantized_matmul.out(Tensor X, int X_zero_point, Tensor Y, int Y_zero_point, Tensor? bias, int out_multiplier, int out_shift, int out_zero_point, bool transposed, *, Tensor(a!) out) -> Tensor(a!)
   kernels:
     - arg_meta: null
```
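The dilated variants specialize on the dilation already present in the signature: with dilation d, the taps inside the kernel window are spaced d apart, for an effective extent of (k - 1) * d + 1 per dimension. (The asym8sxsym8s_asym8s suffixes appear to encode asymmetric signed 8-bit activations times symmetric signed 8-bit weights producing asymmetric signed 8-bit output.) A minimal single-channel NumPy sketch of the dilation indexing only, stride 1, no padding, no quantization:

```python
# Single-channel 2-D dilated convolution, stride 1, no padding.
# Shows only the dilation indexing; the HiFi kernels additionally handle
# quantization parameters, bias, groups, strides, padding, and layout.
import numpy as np


def conv2d_dilated(inp: np.ndarray, weight: np.ndarray, dilation: int) -> np.ndarray:
    kh, kw = weight.shape
    eff_kh = (kh - 1) * dilation + 1  # effective receptive field height
    eff_kw = (kw - 1) * dilation + 1
    oh = inp.shape[0] - eff_kh + 1
    ow = inp.shape[1] - eff_kw + 1
    out = np.zeros((oh, ow), dtype=np.float32)
    for i in range(oh):
        for j in range(ow):
            # Taps are spaced `dilation` apart inside the window.
            window = inp[i : i + eff_kh : dilation, j : j + eff_kw : dilation]
            out[i, j] = float(np.sum(window.astype(np.float32) * weight))
    return out
```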

backends/cadence/aot/memory_planning.py

Lines changed: 8 additions & 5 deletions

```diff
@@ -116,6 +116,9 @@ def plan_spec(
         Greedily place the spec in the first memory that can fit it.
         """
         for spec.mem_id in range(1, self.get_num_memories()):
+            if placement_constraints.is_mem_id_in_blocklist(spec, spec.mem_id):
+                # Skip placement for blocked memory id.
+                continue
             prev_offset, smallest_gap = 0, float("inf")
             for allocated_spec in state.allocated_buffers[spec.mem_id]:
                 if not Verifier.lifetime_overlap(spec, allocated_spec):
@@ -141,11 +144,11 @@ def plan_spec(
             )
             if spec.mem_offset is None:
                 spec.mem_offset = prev_offset
-                if not self.is_valid_placement(spec, placement_constraints):
-                    spec.mem_offset = None
-                    continue
-            else:
-                spec.mem_offset = prev_offset
+
+            if not self.is_valid_placement(spec, placement_constraints):
+                # Skip placement for invalid memory id.
+                spec.mem_offset = None
+                continue
 
             state.place_spec(spec)
             # A data structure used for maintaining the tensor order
```
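For context, plan_spec is a greedy per-memory search: for each candidate mem_id (now skipping blocklisted ones up front), it scans buffers already allocated there with overlapping lifetimes and keeps the smallest gap that still fits, falling back to the end of the region. A simplified sketch of that placement loop, with placeholder types in place of the real TensorSpec machinery:

```python
# Simplified shape of the greedy gap search in plan_spec; placeholder
# Buffer type instead of the real TensorSpec/MemoryPlanningState objects.
from dataclasses import dataclass
from typing import Optional


@dataclass
class Buffer:
    offset: int
    size: int


def place_best_fit(size: int, allocated: list[Buffer], mem_size: int) -> Optional[int]:
    """Return an offset for a new buffer of `size`, or None if it does not fit."""
    prev_end, best_offset, smallest_gap = 0, None, float("inf")
    for buf in sorted(allocated, key=lambda b: b.offset):
        gap = buf.offset - prev_end
        if size <= gap < smallest_gap:
            best_offset, smallest_gap = prev_end, gap  # smallest workable gap so far
        prev_end = max(prev_end, buf.offset + buf.size)
    if best_offset is None and mem_size - prev_end >= size:
        best_offset = prev_end  # fall back to the tail of the memory
    return best_offset  # None mirrors the diff's `continue` to the next mem_id
```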

backends/cadence/aot/memory_planning_algo.py

Lines changed: 2 additions & 2 deletions

```diff
@@ -204,7 +204,7 @@ def _place_memory_id_pinned_specs(
             for spec, c in spec_with_abs_constraint.items()
             if c is not None and c.pinned_memory_id == mem_id and c.offset is None
         }
-        logging.error(f"Placing specs {mem_id_pinned_specs} for {mem_id=}")
+        logging.debug(f"Placing specs {mem_id_pinned_specs} for {mem_id=}")
 
         with self.block_memories_except(mem_id):
             self.plan(
@@ -220,7 +220,7 @@ def _place_memory_id_pinned_specs(
             if constraint is None:
                 continue
 
-            logging.error(f"Placing spec {spec} with {constraint}")
+            logging.debug(f"Placing spec {spec} with {constraint}")
 
             if not state.is_placed(spec):
                 raise MemoryError(
```