pytorch
diff --git a/‎.github/workflows/_link_check.yml‎
Lines changed: 10 additions & 10 deletions b/‎.github/workflows/_link_check.yml‎
Lines changed: 10 additions & 10 deletions
diff --git a/‎.github/workflows/android-perf.yml‎
Lines changed: 1 addition & 1 deletion b/‎.github/workflows/android-perf.yml‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎.github/workflows/pull.yml‎
Lines changed: 0 additions & 28 deletions b/‎.github/workflows/pull.yml‎
Lines changed: 0 additions & 28 deletions
diff --git a/‎.github/workflows/trunk.yml‎
Lines changed: 0 additions & 24 deletions b/‎.github/workflows/trunk.yml‎
Lines changed: 0 additions & 24 deletions
diff --git a/‎backends/arm/runtime/EthosUBackend.cpp‎
Lines changed: 4 additions & 9 deletions b/‎backends/arm/runtime/EthosUBackend.cpp‎
Lines changed: 4 additions & 9 deletions
diff --git a/‎backends/arm/scripts/build_executor_runner.sh‎
Lines changed: 1 addition & 1 deletion b/‎backends/arm/scripts/build_executor_runner.sh‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎backends/arm/test/test_arm_baremetal.sh‎
Lines changed: 6 additions & 3 deletions b/‎backends/arm/test/test_arm_baremetal.sh‎
Lines changed: 6 additions & 3 deletions
diff --git a/‎backends/arm/test/test_model.py‎
Lines changed: 1 addition & 1 deletion b/‎backends/arm/test/test_model.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎backends/cadence/aot/replace_ops.py‎
Lines changed: 0 additions & 26 deletions b/‎backends/cadence/aot/replace_ops.py‎
Lines changed: 0 additions & 26 deletions
diff --git a/‎backends/cadence/aot/tests/test_fusion_ops_passes.py‎
Lines changed: 24 additions & 25 deletions b/‎backends/cadence/aot/tests/test_fusion_ops_passes.py‎
Lines changed: 24 additions & 25 deletions
@@ -18,11 +18,11 @@ jobs:
       timeout: 120
       script: |
         ./scripts/lint_urls.sh $(
-          { [ "${{ github.event_name }}" = "pull_request" ] \
-              && git diff --name-only "${{ github.event.pull_request.base.sha }}...${{ github.event.pull_request.head.sha }}"; } \
-          || \
-          { [ "${{ github.event_name }}" = "push" ] \
-              && git diff --name-only "${{ github.event.before }}...${{ github.sha }}"; }
+          if [ "${{ github.event_name }}" = "pull_request" ]; then
+            echo "${{ github.event.pull_request.base.sha }}" "${{ github.event.pull_request.head.sha }}"
+          else
+            echo "${{ github.event.before }}" "${{ github.sha }}"
+          fi
         ) || {
           echo
           echo "URL lint failed."
@@ -43,11 +43,11 @@ jobs:
       timeout: 60
       script: |
         ./scripts/lint_xrefs.sh $(
-          { [ "${{ github.event_name }}" = "pull_request" ] \
-              && git diff --name-only "${{ github.event.pull_request.base.sha }}...${{ github.event.pull_request.head.sha }}"; } \
-          || \
-          { [ "${{ github.event_name }}" = "push" ] \
-              && git diff --name-only "${{ github.event.before }}...${{ github.sha }}"; }
+          if [ "${{ github.event_name }}" = "pull_request" ]; then
+            echo "${{ github.event.pull_request.base.sha }}" "${{ github.event.pull_request.head.sha }}"
+          else
+            echo "${{ github.event.before }}" "${{ github.sha }}"
+          fi
         ) || {
           echo
           echo "Xref lint failed."
 
@@ -367,7 +367,7 @@ jobs:
         PYTHON_EXECUTABLE=python bash .ci/scripts/build-qnn-sdk.sh
 
         mkdir -p aar-out
-        PYTHON_EXECUTABLE=python ANDROID_ABIS="arm64-v8a" BUILD_AAR_DIR=aar-out EXECUTORCH_BUILD_QNN=ON QNN_SDK_ROOT=/tmp/qnn/2.28.0.241029 bash scripts/build_android_library.sh
+        PYTHON_EXECUTABLE=python ANDROID_ABIS="arm64-v8a" BUILD_AAR_DIR=aar-out EXECUTORCH_BUILD_QNN=ON QNN_SDK_ROOT=/tmp/qnn/2.28.0.241029 EXECUTORCH_ANDROID_PROFILING=ON bash scripts/build_android_library.sh
         mkdir -p extension/benchmark/android/benchmark/app/libs
         cp aar-out/executorch.aar extension/benchmark/android/benchmark/app/libs
         pushd extension/benchmark/android/benchmark
 
@@ -343,34 +343,6 @@ jobs:
         PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh --build-tool "${BUILD_TOOL}"
         PYTHON_EXECUTABLE=python bash examples/xnnpack/quantization/test_quantize.sh "${BUILD_TOOL}" mv2
 
-  test-pybind-build-linux:
-    name: test-pybind-build-linux
-    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
-    permissions:
-      id-token: write
-      contents: read
-    strategy:
-      fail-fast: false
-    with:
-      runner: linux.2xlarge
-      docker-image: executorch-ubuntu-22.04-clang12
-      submodules: 'recursive'
-      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
-      timeout: 90
-      script: |
-        # The generic Linux job chooses to use base env, not the one setup by the image
-        CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
-        conda activate "${CONDA_ENV}"
-
-        # build module for executorch.extension.pybindings.portable_lib
-        BUILD_TOOL="cmake"
-        PYTHON_EXECUTABLE=python \
-        CMAKE_ARGS="-DEXECUTORCH_BUILD_PYBIND=ON -DEXECUTORCH_BUILD_XNNPACK=ON" \
-        bash .ci/scripts/setup-linux.sh --build-tool "${BUILD_TOOL}"
-
-        # see if we can import the module successfully
-        python -c "from executorch.extension.pybindings import portable_lib; print('success!')"
-
   test-binary-size-linux-gcc:
     name: test-binary-size-linux-gcc
     uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
 
@@ -289,30 +289,6 @@ jobs:
         # Build and test coreml delegate
         PYTHON_EXECUTABLE=python ${CONDA_RUN} bash backends/apple/coreml/scripts/build_all.sh
 
-  test-pybind-build-macos:
-    name: test-pybind-build-macos
-    uses: pytorch/test-infra/.github/workflows/macos_job.yml@main
-    strategy:
-      matrix:
-        include:
-          - build-tool: cmake
-      fail-fast: false
-    with:
-      runner: macos-m1-stable
-      python-version: '3.11'
-      submodules: 'recursive'
-      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
-      timeout: 180
-      script: |
-        bash .ci/scripts/setup-conda.sh
-
-        # build module for executorch.extension.pybindings.portable_lib
-        BUILD_TOOL=${{ matrix.build-tool }}
-        CMAKE_ARGS="-DEXECUTORCH_BUILD_PYBIND=ON" PYTHON_EXECUTABLE=python ${CONDA_RUN} bash .ci/scripts/setup-macos.sh --build-tool "${BUILD_TOOL}"
-
-        # see if we can import the module successfully
-        ${CONDA_RUN} python -c "from executorch.extension.pybindings import portable_lib; print('success!')"
-
   test-static-llama-ane:
     name: test-static-llama-ane
     uses: pytorch/test-infra/.github/workflows/macos_job.yml@main
 
@@ -189,8 +189,6 @@ class EthosUBackend final : public ::executorch::runtime::BackendInterface {
     // the end of the execution of the Ethos-U custom delegate
     char* ethosu_scratch =
         static_cast<char*>(temp_allocator->allocate(handles.scratch_data_size));
-    extern size_t ethosu_fast_scratch_size;
-    extern unsigned char* ethosu_fast_scratch;
     ET_LOG(
         Debug,
         "EthosUBackend::execute: Running program data:\n  cmd %p %zu\n  weight %p %zu\n  scratch %p %zu\n  fast scratch %p %zu\n",
@@ -200,8 +198,8 @@ class EthosUBackend final : public ::executorch::runtime::BackendInterface {
         handles.weight_data_size,
         ethosu_scratch,
         handles.scratch_data_size,
-        ethosu_fast_scratch,
-        ethosu_fast_scratch_size);
+        nullptr,
+        0);
 
     // Write argument values (from EValue tensor) into Ethos-U scratch
     // TODO(MLETORCH-123): Optimise into direct write from Vela into the SRAM
@@ -311,12 +309,9 @@ class EthosUBackend final : public ::executorch::runtime::BackendInterface {
         static_cast<uint64_t>(
             reinterpret_cast<uintptr_t>((handles.weight_data))),
         static_cast<uint64_t>(reinterpret_cast<uintptr_t>(ethosu_scratch)),
-        static_cast<uint64_t>(
-            reinterpret_cast<uintptr_t>(ethosu_fast_scratch))};
+        0};
     size_t bases_size[ETHOSU_NUM_BASE_ADDRS] = {
-        handles.weight_data_size,
-        handles.scratch_data_size,
-        ethosu_fast_scratch_size};
+        handles.weight_data_size, handles.scratch_data_size, 0};
     int result = 0;
     EXECUTORCH_PROF_START(
         event_tracer, event_tracer_local_scope, "+EthosUBackend::execute()NPU");
 
@@ -103,7 +103,7 @@ then
     memory_mode="Shared_Sram"
     if [[ ${target} =~ "ethos-u85" ]]
     then
-        memory_mode="Dedicated_Sram_384KB"
+        memory_mode="Sram_Only"
     fi
 fi
 
 
@@ -87,7 +87,7 @@ test_pytest_models() { # Test ops and other things
     backends/arm/scripts/build_executorch.sh
 
     # Run arm baremetal pytest tests without FVP
-    pytest  --verbose --color=yes --numprocesses=auto backends/arm/test/models
+    pytest  --verbose --color=yes backends/arm/test/models
     echo "${TEST_SUITE_NAME}: PASS"
 }
 
@@ -122,7 +122,7 @@ test_pytest_models_ethosu_fvp() { # Same as test_pytest but also sometime verify
     backends/arm/test/setup_testing.sh
 
     # Run arm baremetal pytest tests with FVP
-    pytest  --verbose --color=yes --numprocesses=auto backends/arm/test/models --arm_run_corstoneFVP
+    pytest  --verbose --color=yes backends/arm/test/models --arm_run_corstoneFVP
     echo "${TEST_SUITE_NAME}: PASS"
 }
 
@@ -210,7 +210,10 @@ test_models_ethos-u85() { # End to End model tests using model_test.py
     python3 backends/arm/test/test_model.py --test_output=arm_test/test_model --target=ethos-u85-512 --model=mv3 --extra_flags="-DET_ATOL=5.00 -DET_RTOL=5.00"
     python3 backends/arm/test/test_model.py --test_output=arm_test/test_model --target=ethos-u85-128 --model=lstm --extra_flags="-DET_ATOL=0.03 -DET_RTOL=0.03"
     python3 backends/arm/test/test_model.py --test_output=arm_test/test_model --target=ethos-u85-128 --model=w2l --extra_flags="-DET_ATOL=0.01 -DET_RTOL=0.01"
-    python3 backends/arm/test/test_model.py --test_output=arm_test/test_model --target=ethos-u85-256 --model=ic4 --extra_flags="-DET_ATOL=0.8 -DET_RTOL=0.8" --timeout=2400
+    # Temporarily do not test inception_v4 on Ethos-U85. To support inception_v4 properly on Ethos-U85, we need to run the model in Dedicated_Sram memory mode with
+    # 384KB (or another amount lower than 2MB) of SRAM passed as the fast scratch area. The PR adding support for Dedicated_Sram (https://github.com/pytorch/executorch/pull/10714)
+    # was reverted due to a change required in an internal variant of examples/arm/executor_runner/arm_executor_runner.cpp.
+    # python3 backends/arm/test/test_model.py --test_output=arm_test/test_model --target=ethos-u85-256 --model=ic4 --extra_flags="-DET_ATOL=0.8 -DET_RTOL=0.8" --timeout=2400
 
     echo "${TEST_SUITE_NAME}: PASS"
     }
 
@@ -81,7 +81,7 @@ def get_args():
         if "u55" in args.target:
             args.memory_mode = "Shared_Sram"
         elif "u85" in args.target:
-            args.memory_mode = "Dedicated_Sram_384KB"
+            args.memory_mode = "Sram_Only"
         else:
             raise RuntimeError(f"Invalid target name {args.target}")
 
 
@@ -283,31 +283,6 @@ def call_operator(self, op, args, kwargs, meta):
         return super().call_operator(op, args, kwargs, meta)
 
 
-@register_cadence_pass(CadencePassAttribute(opt_level=0))
-class ReplaceTCopyWithTransposePass(ExportPass):
-    """
-    Replace t_copy with transpose_copy.int. If the input is 1D, the t_copy is
-    a nop. t_copy is not supported, so this is an opt_level=0 pass.
-    """
-
-    def call_operator(self, op, args, kwargs, meta):
-        if get_edge_overload_packet(op) != exir_ops.edge.aten.t_copy:
-            return super().call_operator(op, args, kwargs, meta)
-
-        # Get the input tensor shape
-        in_tensor = args[0].to_tensor() if isinstance(args[0], ProxyValue) else args[0]
-
-        # If the input is a 1D tensor, this t_copy is a nop, so return the input
-        if in_tensor.dim() <= 1:
-            return args[0]
-
-        assert in_tensor.dim() == 2, "t_copy expects a tensor with <= 2 dimensions"
-        transpose_args = (args[0], 0, 1)
-        return super().call_operator(
-            exir_ops.edge.aten.transpose_copy.int, transpose_args, kwargs, meta
-        )
-
-
 @register_cadence_pass(CadencePassAttribute(opt_level=0))
 class ReplaceMMWithAddMMPass(ExportPass):
     """
@@ -2407,7 +2382,6 @@ class CadenceReplaceOpsInGraph:
     passes = [
         ReplaceEmptyTensorsWithFullPass,
         ReplaceFunctionallyEquivalentOpTargets,
-        ReplaceTCopyWithTransposePass,
         ReplacePermuteWithTransposePass,
         ReplaceScalarWithTensorArgPass,
         ReplaceConvolutionOptionalArgsWithConcreteArgsPass,
 
@@ -295,11 +295,12 @@ def test_no_replace_quant_permute_dequant_with_requantize(self):
             args=(permute, 4.5, 6, 0, 127, torch.int8),
         )
         builder.output(dequant)
-        graph_module = FuseQuantDequantToRequantizePass(
+        original_graph = builder.get_graph_module()
+        converted_graph = FuseQuantDequantToRequantizePass(
             force_quant_dequant_fusion=False
-        )(builder.get_graph_module()).graph_module
+        )(original_graph).graph_module
         self.check_op_counts(
-            graph_module,
+            converted_graph,
             expected_op_counts={
                 # Verify that no dequant/quant pair was replaced with requantize.
                 # quantize -> permute -> dequantize should not be replaced with requantize.
@@ -310,30 +311,28 @@ def test_no_replace_quant_permute_dequant_with_requantize(self):
         )
 
     def test_replace_quant_view_dequant_with_requantize(self):
-        class M(torch.nn.Module):
-            def __init__(self):
-                super().__init__()
-
-            def forward(self, x):
-                x = torch.ops.quantized_decomposed.quantize_per_tensor(
-                    x, 1.2, 3, 0, 127, torch.int8
-                )
-                x = x.view(-1)
-                x = torch.ops.quantized_decomposed.dequantize_per_tensor(
-                    x, 4.5, 6, 0, 127, torch.int8
-                )
-                return x
-
-        inputs = torch.randn(2, 12, 1, 6)
-        model = M()
-        graph_module = export_to_edge(model, (inputs,)).exported_program().graph_module
-        graph_module = FuseQuantDequantToRequantizePass()(graph_module).graph_module
-
+        builder = GraphBuilder()
+        x = builder.placeholder("x", torch.randn(2, 12, 1, 6, dtype=torch.float32))
+        quant = builder.call_operator(
+            op=exir_ops.edge.quantized_decomposed.quantize_per_tensor.default,
+            args=(x, 1.2, 3, 0, 127, torch.int8),
+        )
+        view = builder.call_operator(
+            op=exir_ops.edge.aten.view_copy.default, args=(quant, [-1])
+        )
+        dequant = builder.call_operator(
+            op=exir_ops.edge.quantized_decomposed.dequantize_per_tensor.default,
+            args=(view, 4.5, 6, 0, 127, torch.int8),
+        )
+        builder.output(dequant)
+        original_graph = builder.get_graph_module()
+        converted_graph = FuseQuantDequantToRequantizePass()(
+            original_graph
+        ).graph_module
         self.check_op_counts(
-            graph_module,
+            converted_graph,
             expected_op_counts={
-                # Verify that no dequant/quant pair was replaced with requantize.
-                # quantize -> permute -> dequantize should not be replaced with requantize.
+                # Verify that the quantize/dequantize pair was replaced with requantize.
                 exir_ops.edge.quantized_decomposed.quantize_per_tensor.default: 0,
                 exir_ops.edge.quantized_decomposed.dequantize_per_tensor.default: 0,
                 exir_ops.edge.cadence.requantize.default: 1,