pytorch
diff --git a/‎.ci/docker/ci_commit_pins/pytorch.txt‎
Lines changed: 1 addition & 1 deletion b/‎.ci/docker/ci_commit_pins/pytorch.txt‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎.ci/scripts/test_qnn_static_llm.sh‎
Lines changed: 1 addition & 1 deletion b/‎.ci/scripts/test_qnn_static_llm.sh‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎.github/workflows/trunk.yml‎
Lines changed: 1 addition & 1 deletion b/‎.github/workflows/trunk.yml‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎backends/arm/_passes/convert_expand_copy_to_repeat.py‎
Lines changed: 4 additions & 3 deletions b/‎backends/arm/_passes/convert_expand_copy_to_repeat.py‎
Lines changed: 4 additions & 3 deletions
diff --git a/‎backends/arm/_passes/decompose_embedding_pass.py‎
Lines changed: 0 additions & 1 deletion b/‎backends/arm/_passes/decompose_embedding_pass.py‎
Lines changed: 0 additions & 1 deletion
diff --git a/‎backends/arm/_passes/decompose_meandim_pass.py‎
Lines changed: 80 additions & 2 deletions b/‎backends/arm/_passes/decompose_meandim_pass.py‎
Lines changed: 80 additions & 2 deletions
diff --git a/‎backends/arm/_passes/fuse_constant_ops_pass.py‎
Lines changed: 2 additions & 1 deletion b/‎backends/arm/_passes/fuse_constant_ops_pass.py‎
Lines changed: 2 additions & 1 deletion
diff --git a/‎backends/arm/_passes/insert_int32_casts_after_int64_placeholders.py‎
Lines changed: 2 additions & 0 deletions b/‎backends/arm/_passes/insert_int32_casts_after_int64_placeholders.py‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎backends/arm/ethosu/backend.py‎
Lines changed: 1 addition & 1 deletion b/‎backends/arm/ethosu/backend.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎backends/arm/operator_support/minmax_support.py‎
Lines changed: 18 additions & 1 deletion b/‎backends/arm/operator_support/minmax_support.py‎
Lines changed: 18 additions & 1 deletion
@@ -1 +1 @@
-e6f766c7d750d40603eee3f66c5915bac606b3ea
+556fc09a9f67f24ca5591ec049c5d0c347c5f62a
@@ -81,7 +81,7 @@ elif [[ "${TASK_NAME}" == "stories_260k_bc" ]]; then
     fi
 
 elif [[ "${TASK_NAME}" == "smollm2_135m" ]]; then
-    $PYTHON_EXECUTABLE backends/qualcomm/tests/test_qnn_delegate.py -k TestExampleLLMScript.test_static_smollm2 --model SM8650 --build_folder build-x86/ --executorch_root . --artifact_dir ./static_smollm2 --enable_x86_64
+    $PYTHON_EXECUTABLE backends/qualcomm/tests/test_qnn_delegate.py -k TestExampleLLMScript.test_static_llm_model --model_name smollm2_135m --model SM8650 --build_folder build-x86/ --executorch_root . --artifact_dir ./static_smollm2 --enable_x86_64
     exit_code1=$?
     if [ $exit_code1 -ne 0 ]; then
         exit 1
 
@@ -347,7 +347,7 @@ jobs:
         elif [[ ${{ matrix.os}} == "zephyr-preset" ]]; then
           setup_script_args="--target-toolchain zephyr"
           toolchain_prefix=arm-zephyr-eabi-
-          threshold="135240" # 132 KiB
+          threshold="135656" # 132 KiB
           toolchain_cmake=examples/zephyr/x86_64-linux-arm-zephyr-eabi-gcc.cmake
         else
           echo "Fail unsupport OS selection ${{ matrix.os }}"
 
@@ -20,6 +20,7 @@
 
 
 def calculate_multiples(args):
+    """Returns expand args converted to repeat args, and whether the expand changes the rank"""
     input_node_or_tensor = args[0]
 
     if isinstance(input_node_or_tensor, torch.fx.node.Node):
@@ -45,7 +46,7 @@ def calculate_multiples(args):
         multiples[i] if multiples[i] != -1 and extended_shape[i] == 1 else 1
         for i in range(expanded_rank)
     ]
-    return multiples
+    return multiples, expanded_rank != len(input_shape)
 
 
 class ConvertExpandCopyToRepeatPass(ArmPass):
@@ -62,9 +63,9 @@ def call_operator(self, op, args, kwargs, meta):
         if op != self.expand_copy:
             return super().call_operator(op, args, kwargs, meta)
 
-        multiples = calculate_multiples(args)
+        multiples, changes_rank = calculate_multiples(args)
 
-        if all((x == 1 for x in multiples)):
+        if all((x == 1 for x in multiples)) and not changes_rank:
             # All dimensions/repetitions occur only once. Remove node
             # altogether since it's in practice just a copy.
             logger.warning("Found redundant expand node (no-op). Removing it.")
 
@@ -17,7 +17,6 @@
 from .arm_pass_utils import create_node, get_first_fake_tensor
 
 logger = logging.getLogger(__name__)
-logger.setLevel(logging.WARNING)
 
 
 class DecomposeEmbeddingPass(ArmPass):
 
@@ -13,6 +13,7 @@
 from executorch.backends.arm._passes.decompose_sum_pass import DecomposeSumPass
 from executorch.backends.arm._passes.fuse_constant_ops_pass import ComputeConstantOpsAOT
 from executorch.backends.arm._passes.size_adjust_input_pass import SizeAdjustInputPass
+from executorch.backends.arm.constants import DQ_OPS, Q_OPS
 from executorch.exir.backend.utils import WhyNoPartitionReporter
 from executorch.exir.dialects._ops import ops as exir_ops
 from executorch.exir.pass_base import ExportPass
@@ -50,6 +51,15 @@ def get_view(op):
     raise RuntimeError(f"Can't get meandim decomposition for op {op}")
 
 
+def get_quantization(op):
+    """Return the (quant, dequant) op pair of the same kind (per-channel or per-tensor) as op if op is a dequant op, None otherwise."""
+    if op in DQ_OPS:
+        # The input of op can be a placeholder, so it can't be used to get the quant node directly; map DQ_OPS to Q_OPS by index instead.
+        quant_type_index = DQ_OPS.index(op)
+        return Q_OPS[quant_type_index], op
+    return None
+
+
 class DecomposeMeanDimPass(ArmPass):
     """
     Decomposes a meandim into avg_pool and/or sum + mul (1/N) depending on which dims the mean is taken for:
@@ -121,6 +131,7 @@ def call_operator(self, op, args, kwargs, meta):
                 dims_to_reduce = [dim - 1 for dim in dims_to_reduce]
 
             x = super().call_operator(view_op, (x, new_shape), {}, meta, True)
+            x = self._maybe_insert_q_dq_after(x, meta)
 
         # Reduce (h,w) dims by avg pool if possible
         x, dims_to_reduce = self._reduce_by_average_pool(op, x, dims_to_reduce, meta)
@@ -133,7 +144,7 @@ def call_operator(self, op, args, kwargs, meta):
             dims_to_reduce = [dim + len(original_dims) - 1 for dim in dims_to_reduce]
 
             x = super().call_operator(view_op, (x, temp_shape), {}, meta, True)
-
+            x = self._maybe_insert_q_dq_after(x, meta)
         # Reduce remaining dims by sum
         x = self._reduce_by_sum(op, x, dims_to_reduce, meta, dtype)
 
@@ -156,6 +167,45 @@ def _reduce_by_sum(self, op, input_node, dims, meta, dtype):
         full = super().call_operator(
             full_op, ([1] * len(output_shape), 1 / N), {"dtype": dtype}, meta, True
         )
+        if (quant_ops := get_quantization(input_node.node.target)) is not None:
+            # Insert Q and DQ nodes after full op.
+            # Since the value of full (1/N) is known, we can compute quant params such that dq(qmax) == 1/N.
+            q_op, dq_op = quant_ops
+            qmax = input_node.node.args[4]
+            full_quant_args = (
+                1 / (N * qmax),  # Scale to map qmax to 1/N
+                0,  # Zero point
+                *input_node.node.args[3:],
+            )
+            q_args = (full, *full_quant_args)
+            full = super().call_operator(
+                q_op,
+                q_args,
+                kwargs={},
+                meta=meta,
+                updated=True,
+            )
+            dq_args = (full, *full_quant_args)
+            full = super().call_operator(
+                dq_op, dq_args, kwargs={}, meta=meta, updated=True
+            )
+
+            # Insert Q and DQ nodes after sum op.
+            # Scale needs to be multiplied by N, since it was computed on data after the division by N.
+            sum_quant_args = (input_node.node.args[1] * N, *input_node.node.args[2:])
+            q_args = (sum, *sum_quant_args)
+            sum = super().call_operator(
+                q_op,
+                q_args,
+                kwargs={},
+                meta=meta,
+                updated=True,
+            )
+            dq_args = (sum, *sum_quant_args)
+            sum = super().call_operator(
+                dq_op, dq_args, kwargs={}, meta=meta, updated=True
+            )
+
         return super().call_operator(mul_op, (sum, full), {}, meta, True)
 
     def _reduce_by_average_pool(self, op, input_node, dims, meta):
@@ -190,10 +240,38 @@ def _reduce_by_average_pool(self, op, input_node, dims, meta):
         )
 
         if is_supported:
+            out = super().call_operator(avgpool_op, args, {}, meta, True)
+            out = self._maybe_insert_q_dq_after(out, meta)
             return (
-                super().call_operator(avgpool_op, args, {}, meta, True),
+                out,
                 dims_to_reduce_by_sum,
             )
 
         else:
             return input_node, dims
+
+    def _maybe_insert_q_dq_after(self, op, meta):
+        """If the input node of op is a dequant node, insert a q-dq pair after op with identical quantization parameters."""
+
+        if len(op.node.all_input_nodes) > 1:
+            raise ValueError(
+                f"Expected one input to {op.node}, got inputs {op.node.all_input_nodes}"
+            )
+        input_node = op.node.all_input_nodes[0]
+        if (quant_ops := get_quantization(input_node.target)) is not None:
+            q_op, dq_op = quant_ops
+            quant_args = list(input_node.args[1:])
+            q_args = (op, *quant_args)
+            out = super().call_operator(
+                q_op,
+                q_args,
+                kwargs={},
+                meta=meta,
+                updated=True,
+            )
+            dq_args = (out, *quant_args)
+            return super().call_operator(
+                dq_op, dq_args, kwargs={}, meta=meta, updated=True
+            )
+        else:
+            return op
@@ -65,7 +65,8 @@ def resolve_arg(arg):
             if isinstance(arg, torch.fx.Node) and arg in input_nodes:
                 idx = input_nodes.index(arg)
                 t = get_param_tensor(self.exported_program, arg)
-                if qparams:
+                # Check if qparams exist for this arg
+                if qparams and idx in qparams.keys():
                     t = qparams[idx].dequantize_value(t)
                 return t
             if isinstance(arg, tuple):
 
@@ -36,6 +36,8 @@ class InsertInt32CastsAfterInt64PlaceholdersPass(ArmPass):
     # Key: op overload; Value: zero-based indices of positional args that must be i64.
     I64_INPUT_ARG_POSITIONS = {
         torch.ops.aten.one_hot.default: (0,),
+        torch.ops.aten.index_copy_.default: (2,),
+        torch.ops.aten.index_copy.default: (2,),
     }
 
     def _insert_callsite_i32_to_i64_casts(self, graph_module: torch.fx.GraphModule):
 
@@ -63,7 +63,7 @@ def _compile_tosa_flatbuffer(
         binary = vela_compile(
             tosa_flatbuffer,
             compile_flags,
-            verbose=logger.getEffectiveLevel() == logging.INFO,
+            verbose=logger.getEffectiveLevel() <= logging.INFO,
             intermediate_path=compile_spec.get_intermediate_path(),
         )
         return binary
 
@@ -2,6 +2,12 @@
 #
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
+"""Declare operator support for min/max along a dimension in TOSA.
+
+Provide support checks ensuring that argmax/argmin indices are not consumed,
+restricting to float profiles until index quantization is supported.
+
+"""
 
 import torch.fx as fx
 from executorch.backends.arm.operator_support.tosa_supported_operators import (
@@ -14,6 +20,8 @@
 
 @register_tosa_support_check
 class MinMaxSupported(SupportedTOSAOperatorCheck):
+    """Provide TOSA support check for ``aten.max.dim`` and ``aten.min.dim``."""
+
     targets = [
         exir_ops.edge.aten.max.dim,
         exir_ops.edge.aten.min.dim,
@@ -24,7 +32,16 @@ class MinMaxSupported(SupportedTOSAOperatorCheck):
         TosaSpecification.create_from_string("TOSA-1.0+FP"),
     ]
 
-    def is_node_tosa_supported(self, node: fx.Node, tosa_spec: TosaSpecification):
+    def is_node_tosa_supported(
+        self, node: fx.Node, tosa_spec: TosaSpecification
+    ) -> bool:
+        """Return True if the node is supported by TOSA.
+
+        Allow max/min when the argmax/argmin output is unused or dropped (i.e.,
+        only the value is consumed). Disallow cases where arg indices are
+        further used.
+
+        """
         if node.target in [exir_ops.edge.aten.max.dim, exir_ops.edge.aten.min.dim]:
             no_argmax = len(node.users) == 1
             no_argmax_users = (len(node.users) == 2) and (
Original file line number	Diff line number	Diff line change
`@@ -1 +1 @@`
`1`		`-e6f766c7d750d40603eee3f66c5915bac606b3ea`
	`1`	`+556fc09a9f67f24ca5591ec049c5d0c347c5f62a`
Original file line number	Diff line number	Diff line change
`@@ -36,6 +36,8 @@ class InsertInt32CastsAfterInt64PlaceholdersPass(ArmPass):`
`36`	`36`	`# Key: op overload; Value: zero-based indices of positional args that must be i64.`
`37`	`37`	`I64_INPUT_ARG_POSITIONS = {`
`38`	`38`	`torch.ops.aten.one_hot.default: (0,),`
	`39`	`+ torch.ops.aten.index_copy_.default: (2,),`
	`40`	`+ torch.ops.aten.index_copy.default: (2,),`
`39`	`41`	`}`
`40`	`42`
`41`	`43`	`def _insert_callsite_i32_to_i64_casts(self, graph_module: torch.fx.GraphModule):`
Original file line number	Diff line number	Diff line change
`@@ -63,7 +63,7 @@ def _compile_tosa_flatbuffer(`
`63`	`63`	`binary = vela_compile(`
`64`	`64`	`tosa_flatbuffer,`
`65`	`65`	`compile_flags,`
`66`		`- verbose=logger.getEffectiveLevel() == logging.INFO,`
	`66`	`+ verbose=logger.getEffectiveLevel() <= logging.INFO,`
`67`	`67`	`intermediate_path=compile_spec.get_intermediate_path(),`
`68`	`68`	`)`
`69`	`69`	`return binary`