pytorch · shewu-quic · Nov 20, 2025 · Nov 21, 2025
@@ -24,6 +24,7 @@
 from .decompose_roll import DecomposeRoll
 from .decompose_silu import DecomposeSilu
 from .decompose_threshold import DecomposeThreshold
+from .decompose_triu import DecomposeTriu
 from .decompose_wrap_with_autocast import DecomposeWrapWithAutocast
 from .expand_broadcast_tensor_shape import ExpandBroadcastTensorShape
 from .fixed_linear_keep_dim import FixedLinearKeepDim
@@ -67,6 +68,7 @@
     DecomposeRoll,
     DecomposeSilu,
     DecomposeThreshold,
+    DecomposeTriu,
     DecomposeWrapWithAutocast,
     ExpandBroadcastTensorShape,
     FixedLinearKeepDim,

@@ -0,0 +1,65 @@
+# Copyright (c) Qualcomm Innovation Center, Inc.
+# All rights reserved
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+from typing import Dict
+
+import torch
+from executorch.exir.pass_base import ExportPass, PassResult
+from torch._decomp import get_decompositions
+from torch.fx.experimental.proxy_tensor import make_fx
+
+from .utils import merge_decomposed_graph
+
+
+class DecomposeTriu(ExportPass):
+    """
+    Decompose triu for quantization annotation to work properly.
+
+    Every ``aten.triu.default`` node is traced with ``make_fx`` using the
+    registered triu decomposition; the traced nodes are merged into the graph
+    ahead of the original node, which is then erased.
+    """
+
+    def __init__(self) -> None:
+        super().__init__()
+
+    def _replace_output(
+        self, node: torch.fx.Node, output_node: torch.fx.Node, remap: Dict
+    ):
+        """
+        Redirect all users of ``node`` to the merged result of the decomposition.
+
+        Args:
+            node: The original triu node that is being replaced.
+            output_node: The output node of the decomposed graph; its first
+                argument is looked up in ``remap``.
+            remap: Mapping from decomposed-graph nodes to the nodes copied
+                into the target graph.
+        """
+        for user in node.users.copy():
+            # remap
+            user.replace_input_with(
+                node,
+                remap[output_node.args[0]],
+            )
+
+    def call(self, graph_module: torch.fx.GraphModule) -> PassResult:
+        """
+        Replace each ``aten.triu.default`` node with its decomposition.
+
+        Args:
+            graph_module: The graph module to transform in place.
+
+        Returns:
+            A ``PassResult`` wrapping the recompiled ``graph_module``.
+        """
+        graph = graph_module.graph
+        decom_mappings = get_decompositions([torch.ops.aten.triu.default])
+
+        for node in graph.nodes:
+            if node.target == torch.ops.aten.triu.default:
+                # `diagonal` is optional in the aten.triu schema (default 0),
+                # so it may be absent from the positional args, e.g. for
+                # `torch.triu(x)`. Fall back to the kwarg, then the default.
+                if len(node.args) > 1:
+                    diagonal = node.args[1]
+                else:
+                    diagonal = node.kwargs.get("diagonal", 0)
+
+                decomposed_module = make_fx(
+                    node.target,
+                    decomposition_table=decom_mappings,
+                    tracing_mode="fake",
+                )(node.args[0].meta["val"], diagonal)
+
+                with graph.inserting_before(node):
+                    # remap is used to map original node values to new node values,
+                    # which ensures that reference to nodes are correctly updated in the new graph
+                    remap = {}
+                    # "arg0_1" is presumably the traced graph's placeholder
+                    # name for the tensor input -- verify against make_fx.
+                    remap["arg0_1"] = node.args[0]
+
+                    merge_decomposed_graph(
+                        remap=remap,
+                        target_node=node,
+                        target_graph=graph,
+                        decomposed_graph_module=decomposed_module,
+                        # Skip nodes named after "arg1_1" (presumably the
+                        # placeholder traced for the scalar diagonal argument).
+                        predicate=lambda decomp_node: "arg1_1" not in decomp_node.name,
+                        output_processor=self._replace_output,
+                    )
+                    graph.erase_node(node)
+
+        graph.eliminate_dead_code()
+        graph_module.recompile()
+        return PassResult(graph_module, True)
@@ -55,6 +55,7 @@ class TensorOpInfo:
     aten.where.ScalarOther: TensorOpInfo(aten.where.self, False, True),
     aten.where.Scalar: TensorOpInfo(aten.where.self, False, True),
     aten.masked_fill.Scalar: TensorOpInfo(aten.masked_fill.Tensor, False, False),
+    aten.masked_fill_.Scalar: TensorOpInfo(aten.masked_fill.Tensor, False, False),
     aten.bitwise_xor.Scalar: TensorOpInfo(aten.bitwise_xor.Tensor, False, False),
 }
 

@@ -29,6 +29,7 @@
     DecomposeRoll,
     DecomposeSilu,
     DecomposeThreshold,
+    DecomposeTriu,
     DecomposeWrapWithAutocast,
     ExpandBroadcastTensorShape,
     FixedLinearKeepDim,
@@ -203,6 +204,7 @@ def transform_for_annotation_pipeline(self, graph_module: GraphModule):
         self.add_pass(DecomposeRoll())
         self.add_pass(DecomposeSilu())
         self.add_pass(DecomposeThreshold())
+        self.add_pass(DecomposeTriu())
         self.add_pass(DecomposeWrapWithAutocast())
         self.add_pass(DecomposeEinsum())
         self.add_pass(DecomposeExpM1())
@@ -221,6 +223,7 @@ def transform_for_export_pipeline(
         self.add_pass(DecomposeScaledDotProductAttention())
         self.add_pass(DecomposeRoll())
         self.add_pass(DecomposeThreshold())
+        self.add_pass(DecomposeTriu())
         self.add_pass(DecomposeLinalgVectorNorm(quantization_capture=True))
         self.add_pass(DecomposeExpM1())
         self.add_pass(DecomposeWrapWithAutocast())

@@ -37,7 +37,7 @@ class MyModel(torch.nn.Module):
 ```
 At the time we try to lower it with Qualcomm backend:
 ```python
-from excutorch.examples.qualcomm.utils import build_executorch_binary
+from executorch.examples.qualcomm.utils import build_executorch_binary
 
 build_executorch_binary(
     model=MyModel(),

@@ -56,12 +56,12 @@ def define_node(
                 [-1, 1]
             )
 
-        weight_tensor = get_parameter(weight_node, self.edge_program)
+        weight_tensor = self.get_tensor(weight_node, node)
         weight_tensor_wrapper = self.define_tensor(
             weight_node,
             node,
             weight_tensor,
-            PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_STATIC,
+            PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_NATIVE,
             nodes_to_wrappers,
         )
         linear_input_tensors.append(weight_tensor_wrapper)

@@ -509,7 +509,7 @@ def _ensure_qnn_sdk_lib() -> bool:
     logger.info("[QNN] Loading %s", qnn_lib)
     lib_loaded = False
     try:
-        ctypes.CDLL(str(qnn_lib), mode=ctypes.RTLD_GLOBAL)
+        ctypes.CDLL(str(qnn_lib), mode=ctypes.RTLD_LOCAL)
         logger.info("[QNN] Loaded libQnnHtp.so from packaged SDK.")
         lib_loaded = True
     except OSError as e:
@@ -528,7 +528,7 @@ def _load_libcxx_libs(lib_path):
     logger.debug("sorted_candidates: %s", sorted_candidates)
     for sofile in sorted_candidates:
         try:
-            ctypes.CDLL(str(sofile), mode=ctypes.RTLD_GLOBAL)
+            ctypes.CDLL(str(sofile), mode=ctypes.RTLD_LOCAL)
             logger.info("Loaded %s", sofile.name)
         except OSError as e:
             logger.warning("[WARN] Failed to load %s: %s", sofile.name, e)