@@ -40,30 +40,46 @@
 from torch.ao.quantization.quantizer.composable_quantizer import ComposableQuantizer
 
 
-act_qspec = QuantizationSpec(
-    dtype=torch.uint8,
-    quant_min=0,
-    quant_max=255,
+act_qspec_asym8u = QuantizationSpec(
+    dtype=torch.int8,
+    quant_min=-128,
+    quant_max=127,
     qscheme=torch.per_tensor_affine,
     is_dynamic=False,
     observer_or_fake_quant_ctr=HistogramObserver.with_args(eps=2**-12),
 )
 
-wgt_qspec = QuantizationSpec(
-    dtype=torch.uint8,
-    quant_min=0,
-    quant_max=255,
+wgt_qspec_asym8u = QuantizationSpec(
+    dtype=torch.int8,
+    quant_min=-128,
+    quant_max=127,
     qscheme=torch.per_tensor_affine,
     is_dynamic=False,
     observer_or_fake_quant_ctr=MinMaxObserver,
 )
 
+wgt_qspec_asym8s = QuantizationSpec(
+    dtype=torch.int8,
+    quant_min=-128,
+    quant_max=127,
+    qscheme=torch.per_tensor_symmetric,
+    is_dynamic=False,
+    observer_or_fake_quant_ctr=MinMaxObserver,
+)
+
 bias_qspec: Optional[QuantizationSpec] = None
 
-_default_qconfig = QuantizationConfig(
-    act_qspec,
-    act_qspec,
-    wgt_qspec,
+qconfig_A8uW8u = QuantizationConfig(
+    act_qspec_asym8u,
+    act_qspec_asym8u,
+    wgt_qspec_asym8u,
+    None,
+)
+
+qconfig_A8uW8s = QuantizationConfig(
+    act_qspec_asym8u,
+    act_qspec_asym8u,
+    wgt_qspec_asym8s,
     None,
 )
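Aside, not part of the diff: the only field separating the new wgt_qspec_asym8s from wgt_qspec_asym8u is the qscheme, and its practical effect is the zero point. A minimal sketch using MinMaxObserver, the same observer constructor both weight specs reference, illustrates the difference; the tensor values are made up for illustration.

    import torch
    from torch.ao.quantization.observer import MinMaxObserver

    w = torch.tensor([-0.2, 0.1, 0.5, 1.0])  # toy weights with a skewed range

    # per_tensor_affine (wgt_qspec_asym8u): the zero point floats so that the
    # observed [min, max] maps onto the full [-128, 127] range.
    affine = MinMaxObserver(
        dtype=torch.int8, quant_min=-128, quant_max=127,
        qscheme=torch.per_tensor_affine,
    )
    affine(w)
    print(affine.calculate_qparams())  # scale ≈ 1.2 / 255, zero_point ≈ -86

    # per_tensor_symmetric (wgt_qspec_asym8s): the zero point is pinned to 0 and
    # the scale comes from max(|min|, |max|), so skewed data wastes some range.
    sym = MinMaxObserver(
        dtype=torch.int8, quant_min=-128, quant_max=127,
        qscheme=torch.per_tensor_symmetric,
    )
    sym(w)
    print(sym.calculate_qparams())  # scale ≈ 1.0 / 127.5, zero_point = 0

A weight zero point of 0 is what lets fixed-point conv kernels drop the weight zero-point correction term, which is presumably why the conv patterns below move to the symmetric-weight config.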
@@ -147,19 +163,17 @@ def get_supported_operators(cls) -> List[OperatorConfig]:
         return []
 
 
-def get_cadence_default_quantizer_list_with_config(
-    quantization_config: QuantizationConfig,
-) -> List[Quantizer]:
+def get_cadence_default_quantizers() -> List[Quantizer]:
     return [
-        CadenceAtenQuantizer(AddmmPattern(), quantization_config),
-        CadenceAtenQuantizer(BmmPattern(), quantization_config),
-        CadenceAtenQuantizer(Conv1dPattern(), quantization_config),
-        CadenceAtenQuantizer(Conv2dPattern(), quantization_config),
-        CadenceAtenQuantizer(LayerNormPattern(), quantization_config),
-        CadenceAtenQuantizer(LinearPattern(), quantization_config),
-        CadenceAtenQuantizer(MatmulPattern(), quantization_config),
-        CadenceAtenQuantizer(ReluPattern0(), quantization_config),
-        CadenceAtenQuantizer(ReluPattern1(), quantization_config),
+        CadenceAtenQuantizer(AddmmPattern(), qconfig_A8uW8u),
+        CadenceAtenQuantizer(BmmPattern(), qconfig_A8uW8u),
+        CadenceAtenQuantizer(Conv1dPattern(), qconfig_A8uW8s),
+        CadenceAtenQuantizer(Conv2dPattern(), qconfig_A8uW8s),
+        CadenceAtenQuantizer(LayerNormPattern(), qconfig_A8uW8u),
+        CadenceAtenQuantizer(LinearPattern(), qconfig_A8uW8u),
+        CadenceAtenQuantizer(MatmulPattern(), qconfig_A8uW8u),
+        CadenceAtenQuantizer(ReluPattern0(), qconfig_A8uW8u),
+        CadenceAtenQuantizer(ReluPattern1(), qconfig_A8uW8u),
     ]
 
 
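Aside, not part of the diff: the default list now binds a config per pattern (the conv patterns get the symmetric-weight qconfig_A8uW8s, everything else qconfig_A8uW8u) instead of threading one caller-supplied config through every pattern. A caller that wants a different mix builds the list itself; a sketch, with the ExecuTorch import paths assumed from this module's location:

    # Sketch only: import paths are assumptions based on the source tree layout.
    from executorch.backends.cadence.aot.quantizer.patterns import (
        Conv2dPattern,
        LinearPattern,
    )
    from executorch.backends.cadence.aot.quantizer.quantizer import (
        CadenceAtenQuantizer,
        qconfig_A8uW8s,
        qconfig_A8uW8u,
    )

    # Quantize only conv2d and linear; keep asymmetric weights for linear.
    custom = [
        CadenceAtenQuantizer(Conv2dPattern(), qconfig_A8uW8s),
        CadenceAtenQuantizer(LinearPattern(), qconfig_A8uW8u),
    ]

The list plugs into the reworked constructor shown in the next hunk.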
@@ -178,10 +192,9 @@ class CadenceDefaultQuantizer(CadenceQuantizer):
     Default quantizer for Cadence backend.
     """
 
-    def __init__(self, qconfig: Optional[QuantizationConfig] = None) -> None:
-        if qconfig is None:
-            qconfig = _default_qconfig
-        quantizers = get_cadence_default_quantizer_list_with_config(qconfig)
+    def __init__(self, quantizers: Optional[list[Quantizer]] = None) -> None:
+        if quantizers is None:
+            quantizers = get_cadence_default_quantizers()
         super().__init__(quantizers)
 
187200
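Aside, not part of the diff: the constructor is the migration point for downstream callers; it now accepts a ready-made quantizer list rather than a single QuantizationConfig. A before/after sketch, continuing the assumptions above:

    from executorch.backends.cadence.aot.quantizer.quantizer import (
        CadenceDefaultQuantizer,  # import path assumed, as above
    )

    # Old API (removed): CadenceDefaultQuantizer(qconfig=some_config)
    quantizer = CadenceDefaultQuantizer()  # defaults via get_cadence_default_quantizers()
    quantizer = CadenceDefaultQuantizer(quantizers=custom)  # `custom` from the previous sketch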