Skip to content

Commit 0360486

Browse files
committed
Update
[ghstack-poisoned]
2 parents 056b85d + d7fd78b commit 0360486

File tree

69 files changed

+3024
-510
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

69 files changed

+3024
-510
lines changed

.ci/scripts/test_model.ps1

Lines changed: 15 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -30,20 +30,25 @@ function ExportModel-Portable {
3030

3131
function ExportModel-Xnnpack {
3232
param (
33-
[string]$model_name
33+
[string]$model_name,
34+
[bool]$quantize
3435
)
3536

36-
python -m examples.xnnpack.aot_compiler --model_name="${MODEL_NAME}" --delegate | Write-Host
37+
if ($quantize) {
38+
python -m examples.xnnpack.aot_compiler --model_name="${MODEL_NAME}" --delegate --quantize | Write-Host
39+
$modelFile = "$($modelName)_xnnpack_q8.pte"
40+
} else {
41+
python -m examples.xnnpack.aot_compiler --model_name="${MODEL_NAME}" --delegate | Write-Host
42+
$modelFile = "$($modelName)_xnnpack_fp32.pte"
43+
}
3744
if ($LASTEXITCODE -ne 0) {
3845
Write-Host "Model export failed. Exit code: $LASTEXITCODE."
3946
exit $LASTEXITCODE
4047
}
4148

42-
"$($modelName)_xnnpack_fp32.pte"
49+
$modelFile
4350
}
4451

45-
.ci/scripts/setup-windows.ps1
46-
4752
# Build the runner
4853
if (Test-Path -Path $buildDir) {
4954
Remove-Item -Path $buildDir -Recurse -Force
@@ -64,8 +69,11 @@ switch ($backend) {
6469
"portable" {
6570
$model_path = ExportModel-Portable -model_name $modelName -strict $strict
6671
}
67-
"xnnpack" {
68-
$model_path = ExportModel-Xnnpack -model_name $modelName
72+
"xnnpack-f32" {
73+
$model_path = ExportModel-Xnnpack -model_name $modelName -quantize $false
74+
}
75+
"xnnpack-q8" {
76+
$model_path = ExportModel-Xnnpack -model_name $modelName -quantize $true
6977
}
7078
default {
7179
Write-Host "Unknown backend $backend."

.ci/scripts/unittest-windows.ps1

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,6 @@ Set-PSDebug -Trace 1
66
$ErrorActionPreference = 'Stop'
77
$PSNativeCommandUseErrorActionPreference = $true
88

9-
.ci/scripts/setup-windows.ps1 -editable $editable
10-
119
# Run pytest with coverage
1210
# pytest -n auto --cov=./ --cov-report=xml
1311
pytest -v --full-trace -c pytest-windows.ini

.github/workflows/_unittest.yml

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -63,4 +63,11 @@ jobs:
6363
timeout: 120
6464
script: |
6565
conda init powershell
66+
67+
Set-PSDebug -Trace 1
68+
\$ErrorActionPreference = 'Stop'
69+
\$PSNativeCommandUseErrorActionPreference = \$true
70+
71+
.ci/scripts/setup-windows.ps1
72+
6673
powershell .ci/scripts/unittest-windows.ps1 -editable "${{ inputs.editable }}"

.github/workflows/trunk.yml

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -986,11 +986,18 @@ jobs:
986986
fail-fast: false
987987
matrix:
988988
model: [linear, add, add_mul, ic3, ic4, mv2, mv3, resnet18, resnet50, vit, w2l, mobilebert, emformer_join, emformer_transcribe]
989-
backend: [portable, xnnpack]
989+
backend: [portable, xnnpack-f32, xnnpack-q8]
990990
with:
991991
submodules: 'recursive'
992992
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
993993
timeout: 60
994994
script: |
995995
conda init powershell
996+
997+
Set-PSDebug -Trace 1
998+
\$ErrorActionPreference = 'Stop'
999+
\$PSNativeCommandUseErrorActionPreference = \$true
1000+
1001+
.ci/scripts/setup-windows.ps1
1002+
9961003
powershell .ci/scripts/test_model.ps1 -modelName ${{ matrix.model }} -backend ${{ matrix.backend }}

.gitmodules

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@
2727
url = https://github.com/google/pthreadpool.git
2828
[submodule "extension/llm/tokenizers"]
2929
path = extension/llm/tokenizers
30-
url = https://github.com/pytorch-labs/tokenizers.git
30+
url = https://github.com/meta-pytorch/tokenizers.git
3131
[submodule "kernels/optimized/third-party/eigen"]
3232
path = kernels/optimized/third-party/eigen
3333
url = https://gitlab.com/libeigen/eigen.git

backends/apple/coreml/runtime/delegate/ETCoreMLModelManager.mm

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -449,12 +449,14 @@ - (nullable NSURL *)compiledModelURLWithIdentifier:(NSString *)identifier
449449
case ModelAssetType::CompiledModel: {
450450
// The model is already compiled; no further action needed.
451451
// Return the existing model URL.
452+
ETCoreMLLogInfo("The model in the pte file is pre-compiled. Skipping compilation.");
452453
return modelURL;
453454
}
454455

455456
case ModelAssetType::Model: {
456457
// The model is not compiled yet.
457458
// Compile the model at the specified URL with a maximum wait time of 5 minutes.
459+
ETCoreMLLogInfo("The model in the pte file is not pre-compiled. Compiling with a 5 min timeout.");
458460
NSURL *compiledModelURL = [ETCoreMLModelCompiler compileModelAtURL:modelURL
459461
maxWaitTimeInSeconds:(5 * 60)
460462
error:error];
@@ -490,6 +492,7 @@ - (nullable ETCoreMLAsset *)compiledModelAssetWithMetadata:(const ModelMetadata&
490492
error:error];
491493
if (compiledModelURL) {
492494
// Move the compiled model to the asset manager to transfer ownership.
495+
ETCoreMLLogInfo("Storing compiled asset with identifier=%@ in the asset manager.", identifier);
493496
compiledModelAsset = [self.assetManager storeAssetAtURL:compiledModelURL withIdentifier:identifier error:error];
494497
}
495498
}];

backends/arm/_passes/convert_expand_copy_to_repeat.py

Lines changed: 32 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -8,12 +8,43 @@
88
import logging
99
from typing import cast
1010

11+
import torch
12+
1113
from executorch.exir.dialects._ops import ops as exir_ops
1214
from executorch.exir.pass_base import ExportPass
1315

1416
logger = logging.getLogger(__name__)
1517

1618

19+
def calculate_multiples(args):
20+
input_node_or_tensor = args[0]
21+
22+
if isinstance(input_node_or_tensor, torch.fx.node.Node):
23+
input_data = input_node_or_tensor.meta["val"]
24+
else:
25+
input_data = input_node_or_tensor.data
26+
27+
input_shape = input_data.shape
28+
29+
multiples = cast(list[int], args[1])
30+
expanded_rank = len(multiples)
31+
32+
# Expanded shape is 'input_shape' front-padded with ones.
33+
padding = expanded_rank - len(input_shape)
34+
extended_shape = [
35+
input_shape[i] if i >= 0 else 1 for i in range(-padding, len(input_shape))
36+
]
37+
38+
# To convert expand arg to repeat arg, non-repeated dims should have
39+
# multiples[dim] = 1. Passing -1 to expand arg means
40+
# not changing the size of that dimension.
41+
multiples = [
42+
multiples[i] if multiples[i] != -1 and extended_shape[i] == 1 else 1
43+
for i in range(expanded_rank)
44+
]
45+
return multiples
46+
47+
1748
class ConvertExpandCopyToRepeatPass(ExportPass):
1849
"""
1950
Replace expand copy with repeat since it is a repeat that can only repeat singleton dimensions.
@@ -26,23 +57,7 @@ def call_operator(self, op, args, kwargs, meta):
2657
if op != self.expand_copy:
2758
return super().call_operator(op, args, kwargs, meta)
2859

29-
input_shape = args[0].data.shape
30-
multiples = cast(list[int], args[1])
31-
expanded_rank = len(multiples)
32-
33-
# Expanded shape is 'input_shape' front-padded with ones.
34-
padding = expanded_rank - len(input_shape)
35-
extended_shape = [
36-
input_shape[i] if i >= 0 else 1 for i in range(-padding, len(input_shape))
37-
]
38-
39-
# To convert expand arg to repeat arg, non-repeated dims should have
40-
# multiples[dim] = 1. Passing -1 to expand arg means
41-
# not changing the size of that dimension.
42-
multiples = [
43-
multiples[i] if multiples[i] != -1 and extended_shape[i] == 1 else 1
44-
for i in range(expanded_rank)
45-
]
60+
multiples = calculate_multiples(args)
4661

4762
if all((x == 1 for x in multiples)):
4863
# All dimensions/repetitions occur only once. Remove node

backends/arm/_passes/remove_clone_pass.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,9 +6,13 @@
66

77
# pyre-unsafe
88

9+
import logging
10+
911
from executorch.exir.dialects._ops import ops as exir_ops
1012
from executorch.exir.pass_base import ExportPass
1113

14+
logger = logging.getLogger(__name__)
15+
1216

1317
class RemoveClonePass(ExportPass):
1418
"""Remove all clones from graph_module"""
@@ -21,4 +25,10 @@ def call_operator(self, op, args, kwargs, meta):
2125
raise ValueError(
2226
f"clone operator expects exactly one argument, got {len(args)}"
2327
)
28+
29+
if "memory_format" in kwargs:
30+
logger.warning(
31+
f"Removing clone with memory_format '{kwargs['memory_format']}'."
32+
)
33+
2434
return args[0]

backends/arm/debug/__init__.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
# Copyright 2025 Arm Limited and/or its affiliates.
2+
#
3+
# This source code is licensed under the BSD-style license found in the
4+
# LICENSE file in the root directory of this source tree.

backends/arm/debug/schema.py

Lines changed: 133 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,133 @@
1+
# Copyright 2025 Arm Limited and/or its affiliates.
2+
#
3+
# This source code is licensed under the BSD-style license found in the
4+
# LICENSE file in the root directory of this source tree.
5+
6+
from __future__ import annotations
7+
8+
import json
9+
10+
from dataclasses import asdict, dataclass
11+
from typing import Any
12+
13+
import serializer.tosa_serializer as ts # type: ignore
14+
import torch
15+
16+
from torch.fx.traceback import NodeSource
17+
18+
19+
@dataclass
20+
class TosaDebugSchema:
21+
node_name: str
22+
operator_name: str
23+
operator_id: int
24+
25+
26+
@dataclass
27+
class ATenDebugSchema:
28+
node_name: str
29+
operator_name: str
30+
31+
@staticmethod
32+
def from_node(node: torch.fx.Node) -> ATenDebugSchema:
33+
# node.target is Union[Callable[..., Any], str], so we need to access this correctly depending on the type
34+
if callable(node.target):
35+
operator_name = node.target.__name__
36+
else:
37+
operator_name = node.target
38+
39+
return ATenDebugSchema(node_name=node.name, operator_name=operator_name)
40+
41+
42+
@dataclass
43+
class TorchDebugSchema:
44+
stack_trace: list[str]
45+
node_trace: list[dict[str, Any]] | str
46+
nn_module_stack: dict[str, Any] | str
47+
torch_fn: tuple[str, str] | str
48+
49+
@staticmethod
50+
def serialize_node_trace(node_trace: list[NodeSource]) -> list[dict[str, Any]]:
51+
"""Flatten the from_node dictionary to remove nesting."""
52+
flattened = []
53+
node_stack = []
54+
55+
for n in node_trace:
56+
node_stack.append((n, -1))
57+
58+
while len(node_stack) > 0:
59+
node, parent_id = node_stack.pop()
60+
flattened.append(
61+
{
62+
"name": node.name,
63+
"target": node.target,
64+
"graph_id": node.graph_id,
65+
"pass_name": node.pass_name,
66+
"action": node._get_action_string(),
67+
"parent_graph_id": parent_id,
68+
}
69+
)
70+
71+
for n in node.from_node:
72+
node_stack.append((n, node.graph_id))
73+
74+
return flattened
75+
76+
@staticmethod
77+
def from_node(node: torch.fx.Node) -> TorchDebugSchema:
78+
node_trace: str | list[dict[str, Any]] = "No node trace available."
79+
80+
if "from_node" in node.meta:
81+
# Flatten the node_trace dictionary, so there is no nesting
82+
node_trace = TorchDebugSchema.serialize_node_trace(node.meta["from_node"])
83+
84+
return TorchDebugSchema(
85+
stack_trace=node.meta.get("stack_trace", "No stack trace available").split(
86+
"\n"
87+
),
88+
node_trace=node_trace,
89+
nn_module_stack=node.meta.get(
90+
"nn_module_stack", "No module stack trace available"
91+
),
92+
torch_fn=node.meta.get("torch_fn", "No torch_fn available"),
93+
)
94+
95+
96+
@dataclass
97+
class DebugSchema:
98+
event_id: int
99+
aten_info: ATenDebugSchema
100+
tosa_info: TosaDebugSchema
101+
torch_info: TorchDebugSchema
102+
103+
104+
class DebugHook:
105+
def __init__(self) -> None:
106+
self._debug_events: list[DebugSchema] = []
107+
self.__op_id_to_name = {}
108+
109+
# Build up a mapping from TOSA 1.0 operator IDs to their names
110+
for name, val in vars(ts.Op).items():
111+
self.__op_id_to_name[val] = name
112+
113+
def add(self, node: torch.fx.Node, tosa_op: Any, tosa_op_id: int) -> None:
114+
tosa_debug_info = TosaDebugSchema(
115+
node_name=str(tosa_op),
116+
operator_name=self.__op_id_to_name[tosa_op_id],
117+
operator_id=tosa_op_id,
118+
)
119+
120+
aten_debug_info = ATenDebugSchema.from_node(node)
121+
torch_debug_info = TorchDebugSchema.from_node(node)
122+
123+
self._debug_events.append(
124+
DebugSchema(
125+
event_id=len(self._debug_events),
126+
aten_info=aten_debug_info,
127+
tosa_info=tosa_debug_info,
128+
torch_info=torch_debug_info,
129+
)
130+
)
131+
132+
def serialize(self) -> str:
133+
return json.dumps([asdict(event) for event in self._debug_events], indent=4)

0 commit comments

Comments
 (0)