Commit c869d51

Qualcomm AI Engine Direct - GA Static Olmo-1b
Summary:
- e2e script for GA Static OLMo-1b
- perf: 16a4w block quant token rate in kv mode ~= 63 tokens/sec (SM8750)
- accuracy: wikitext PPL ~= 8.735 (fp) -> 9.945 (htp)
- add model params file & model weight converter
- add workaround pass for LayerNorm without weight & unit test
- fix layer_norm op builder & fix layer_norm quant annotator
1 parent 2845fd3 commit c869d51

22 files changed: +514 additions, -44 deletions

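As a quick orientation before the per-file diffs, a hedged sketch of driving the new e2e flow the same way the added test_static_olmo test does (the artifact dir, build folder, SoC model, and device serial below are placeholders, and the --ip/--port listener plumbing used by the CI test is omitted):

# Hedged sketch: invoke the GA Static OLMo-1b script with the flags exercised
# by test_static_olmo further down in this commit. Paths and serials are
# placeholders.
import subprocess

cmds = [
    "python",
    "examples/qualcomm/oss_scripts/llama/llama.py",
    "--decoder_model", "olmo-1b",
    "--model_mode", "kv",
    "--max_seq_len", "1024",
    "--prompt", "Simply put, the theory of relativity states that",
    "--temperature", "0",
    "--eval_perplexity",
    "--task", "wikitext",
    "--artifact", "./olmo_artifacts",   # placeholder output dir
    "--build_folder", "build-android",  # placeholder build folder
    "--model", "SM8750",                # SoC model, as in the test's speed refs
    "--device", "<adb-serial>",         # placeholder device serial
]
subprocess.run(cmds, check=True)
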
backends/qualcomm/_passes/__init__.py

Lines changed: 2 additions & 0 deletions
@@ -28,6 +28,7 @@
 from .fuse_consecutive_cast import FuseConsecutiveCast
 from .fuse_consecutive_transpose import FuseConsecutiveTranspose
 from .i64_to_i32 import I64toI32
+from .insert_frozen_layer_norm_weight import InsertFrozenLayerNormWeight
 from .insert_io_qdq import InsertIOQDQ
 from .insert_requantize import InsertRequantize
 from .layout_transform import LayoutTransform
@@ -67,6 +68,7 @@
     FuseConsecutiveCast,
     FuseConsecutiveTranspose,
     I64toI32,
+    InsertFrozenLayerNormWeight,
     InsertIOQDQ,
     InsertRequantize,
     LayoutTransform,

backends/qualcomm/_passes/insert_frozen_layer_norm_weight.py

Lines changed: 64 additions & 0 deletions

@@ -0,0 +1,64 @@
# Copyright (c) Qualcomm Innovation Center, Inc.
# All rights reserved
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.


import torch
from executorch.exir.pass_base import ExportPass, PassResult


# TODO: Remove this workaround once HTP fixes the bug; LayerNorm without weights
# should be supported.
class InsertFrozenLayerNormWeight(ExportPass):
    """
    This pass injects a frozen weight parameter (filled with ones) into LayerNorm ops
    that were exported without a weight (i.e., elementwise_affine=False), to satisfy
    backends that require the presence of a weight parameter.

    It operates at the ExportedProgram level, modifying both the FX graph and
    the graph_signature to include the new frozen parameter.

    Example transformation:

    Before:
        %out = aten.layer_norm(%x, normalized_shape=[128], weight=None, bias=None, eps=1e-5)

    After:
        %weight = get_attr("layer_norm_weight_0")
        %out = aten.layer_norm(%x, normalized_shape=[128], weight=%weight, bias=None, eps=1e-5)

    The injected weight is a frozen parameter with all values set to 1.0.
    """

    def __init__(self):
        super(InsertFrozenLayerNormWeight, self).__init__()
        self.layer_norm = torch.ops.aten.layer_norm.default

    def call(self, graph_module: torch.fx.GraphModule) -> PassResult:
        graph = graph_module.graph
        modified = False
        frozen_weight_idx = 0

        for node in graph.nodes:
            if node.op != "call_function" or node.target != self.layer_norm:
                continue

            # Detect LayerNorm ops missing the 'weight' argument
            if len(node.args) < 3:
                normalized_shape = node.args[1]

                # Create a frozen weight tensor filled with ones
                param_name = f"{self.layer_norm.__name__.split('.')[0]}_weight_{frozen_weight_idx}"
                frozen_weight = torch.ones(normalized_shape)
                graph_module.register_buffer(param_name, frozen_weight)
                with graph.inserting_before(node):
                    weight_node = graph.get_attr(param_name)
                node.args = (node.args[0], node.args[1], weight_node, *node.args[3:])

                frozen_weight_idx += 1
                modified = True

        graph.eliminate_dead_code()
        graph_module.recompile()
        return PassResult(graph_module, modified)
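
A minimal usage sketch of this pass on a weight-less LayerNorm, assuming torch.export keeps aten.layer_norm.default un-decomposed at this stage (the module and shapes below are illustrative, not part of this commit):

# Hedged sketch: run InsertFrozenLayerNormWeight on an exported graph that
# calls layer_norm with weight=None, bias=None. If the exporter records the
# Nones as explicit args (len(node.args) >= 3), the pass leaves the node alone.
import torch
from executorch.backends.qualcomm._passes import InsertFrozenLayerNormWeight


class WeightlessLayerNorm(torch.nn.Module):
    def forward(self, x):
        return torch.nn.functional.layer_norm(x, (128,), None, None, eps=1e-5)


ep = torch.export.export(WeightlessLayerNorm(), (torch.randn(4, 128),))
result = InsertFrozenLayerNormWeight().call(ep.graph_module)

for node in result.graph_module.graph.nodes:
    if node.target == torch.ops.aten.layer_norm.default:
        # After the pass, args[2] should be a get_attr node pointing at a
        # ones-filled buffer named like "layer_norm_weight_0".
        print(node.args)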

backends/qualcomm/_passes/qnn_pass_manager.py

Lines changed: 3 additions & 0 deletions
@@ -33,6 +33,7 @@
     FuseConsecutiveCast,
     FuseConsecutiveTranspose,
     I64toI32,
+    InsertFrozenLayerNormWeight,
     InsertIOQDQ,
     InsertRequantize,
     LayoutTransform,
@@ -201,6 +202,7 @@ def transform_for_annotation_pipeline(self, graph_module: GraphModule):
         self.add_pass(DecomposeEinsum())
         self.add_pass(DecomposeExpM1())
         self.add_pass(DecomposeLinalgVectorNorm(quantization_capture=True))
+        self.add_pass(InsertFrozenLayerNormWeight())
         self.add_pass(ReplaceInfValues())
         self.add_pass(LiftConstantScalarOperands())
         return self._transform(graph_module)
@@ -220,6 +222,7 @@ def transform_for_export_pipeline(
         if convert_linear_to_conv2d:
             self.add_pass(ConvertLinearToConv2d(exported_program))
         self.add_pass(ConvertSquareToPow())
+        self.add_pass(InsertFrozenLayerNormWeight())
         self.add_pass(LiftConstantScalarOperands())
         self._transform(exported_program.graph_module)
         ep = lift_constant_tensor_pass(exported_program)

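Both pipelines above are driven by QnnPassManager, so a rough usage sketch of where the new pass takes effect (the toy module and the call pattern are assumptions, not from this commit):

# Hedged sketch: the annotation pipeline now schedules InsertFrozenLayerNormWeight,
# so a weight-less LayerNorm picks up a frozen ones weight before the quantizer
# annotates the graph. The model below is a placeholder.
import torch
from executorch.backends.qualcomm._passes.qnn_pass_manager import QnnPassManager

model = torch.nn.Sequential(torch.nn.LayerNorm(8, elementwise_affine=False))
ep = torch.export.export(model, (torch.randn(2, 8),))

gm = QnnPassManager().transform_for_annotation_pipeline(ep.graph_module)
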
backends/qualcomm/builders/op_layer_norm.py

Lines changed: 11 additions & 10 deletions
@@ -40,6 +40,7 @@ def define_node(
             PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_NATIVE,
             nodes_to_wrappers,
         )
+        layer_norm_input_tensors = [input_tensor_wrapper]

         normalized_shapes = node.args[1]
         if (
@@ -55,16 +56,16 @@ def define_node(
         axis_shape = [len(axis)]

         weight_node = self.get_node(node.args[2])
-        weight_tensor = get_parameter(weight_node, self.edge_program)
-        weight_tensor_wrapper = self.define_tensor(
-            weight_node,
-            node,
-            weight_tensor,
-            PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_STATIC,
-            nodes_to_wrappers,
-        )
-
-        layer_norm_input_tensors = [input_tensor_wrapper, weight_tensor_wrapper]
+        if weight_node is not None:
+            weight_tensor = get_parameter(weight_node, self.edge_program)
+            weight_tensor_wrapper = self.define_tensor(
+                weight_node,
+                node,
+                weight_tensor,
+                PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_STATIC,
+                nodes_to_wrappers,
+            )
+            layer_norm_input_tensors.append(weight_tensor_wrapper)

         bias_node = self.get_node(node.args[3])
         if bias_node is not None:

backends/qualcomm/quantizer/annotators.py

Lines changed: 20 additions & 15 deletions
@@ -1235,9 +1235,12 @@ def annotate_getitem(node: Node, quantization_config: QuantizationConfig) -> None:
 @register_annotator([torch.ops.aten.layer_norm.default])
 def annotate_layer_norm(node: Node, quantization_config: QuantizationConfig) -> None:
     act_node = node.args[0]
-    weight_node = node.args[2]
-    bias_node = None
+    # OLMo uses LayerNorm with no learnable weight and bias.
+    weight_node = None
     if len(node.args) > 2:
+        weight_node = node.args[2]
+    bias_node = None
+    if len(node.args) > 3:
         bias_node = node.args[3]

     if _is_annotated([node]):
@@ -1249,19 +1252,21 @@ def annotate_layer_norm(node: Node, quantization_config: QuantizationConfig) ->
         act_node,
         input_act_qspec,
     )
-    if input_act_qspec.dtype == torch.int32:
-        annotate_input_qspec_map(
-            node,
-            weight_node,
-            get_16a16w_qnn_ptq_config().weight,
-        )
-    else:
-        annotate_input_qspec_map(
-            node,
-            weight_node,
-            input_act_qspec,
-        )
-    nodes_to_mark_annotated = [node, weight_node]
+    nodes_to_mark_annotated = [node]
+    if weight_node:
+        if input_act_qspec.dtype == torch.int32:
+            annotate_input_qspec_map(
+                node,
+                weight_node,
+                get_16a16w_qnn_ptq_config().weight,
+            )
+        else:
+            annotate_input_qspec_map(
+                node,
+                weight_node,
+                input_act_qspec,
+            )
+        nodes_to_mark_annotated.append(weight_node)
     if bias_node:
         annotate_input_qspec_map(
             node,

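The new length guards exist because, depending on the exporter, the trailing weight/bias arguments of aten.layer_norm.default may be dropped rather than recorded as None. A small inspection sketch (module and shapes are illustrative):

# Hedged sketch: print how layer_norm nodes actually carry their args when the
# module passes weight=None, bias=None. The arity observed here is what the
# len(node.args) > 2 / > 3 checks above defend against.
import torch


class WeightlessLayerNorm(torch.nn.Module):
    def forward(self, x):
        return torch.nn.functional.layer_norm(x, (64,), None, None, eps=1e-5)


ep = torch.export.export(WeightlessLayerNorm(), (torch.randn(2, 64),))
for node in ep.graph_module.graph.nodes:
    if node.target == torch.ops.aten.layer_norm.default:
        print(len(node.args), node.args)
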
backends/qualcomm/quantizer/custom_annotation.py

Lines changed: 40 additions & 4 deletions
@@ -3,6 +3,7 @@
 #
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
+from enum import Enum, unique
 from typing import Sequence

 import torch
@@ -31,6 +32,17 @@
 )


+@unique
+class StaticLLMQuantConfig(Enum):
+    """
+    Layer namespace configuration for Qualcomm's static LLaMA quantization.
+    """
+
+    wq_sha = "wq_sha"  # Query weight (single head)
+    wk_sha = "wk_sha"  # Key weight (single head)
+    wv_sha = "wv_sha"  # Value weight (single head)
+
+
 def annotate_eurobert(gm: torch.fx.GraphModule):
     """
     QNN does not support int32 -> signed 16bit quant
@@ -166,11 +178,35 @@ def annotate_prefill_kv_output(gm: torch.fx.GraphModule, kv_quant_attrs: dict):
         )


-def annotate_wv_sha(gm: torch.fx.GraphModule, quantization_config: QuantizationConfig):
+def annotate_qkv_proj_sha(
+    gm: torch.fx.GraphModule,
+    quantization_config: QuantizationConfig,
+    qkv_tags: set[StaticLLMQuantConfig],
+):
+    """
+    Annotates QKV projection layers in a GraphModule for quantization,
+    specifically layers defined in StaticLLMQuantConfig.
+
+    Args:
+        qkv_tags (set[StaticLLMQuantConfig]): A set of enum tags indicating which QKV
+            layers (e.g., wq, wk, wv) should be annotated for quantization. Only tags
+            defined in StaticLLMQuantConfig are allowed.
+
+    Raises:
+        ValueError: If any tag in `qkv_tags` is not among the allowed enum members.
+    """
+
+    # Get all valid tags from the StaticLLMQuantConfig enum
+    allowed_tags = set(StaticLLMQuantConfig)
+    invalid_tags = qkv_tags - allowed_tags
+    if invalid_tags:
+        raise ValueError(
+            f"Invalid qkv tags: {invalid_tags}. Allowed tags are: {allowed_tags}"
+        )
+
     for node in gm.graph.nodes:
-        if (
-            node.target == torch.ops.aten.conv2d.default
-            and "wv_sha" in node.meta["stack_trace"]
+        if node.target == torch.ops.aten.conv2d.default and any(
+            tag.value in node.meta["stack_trace"] for tag in qkv_tags
         ):
             input_qspec_map = {}
             input_qspec_map[node.args[0]] = quantization_config.input_activation

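A brief usage sketch of the renamed helper; the quantization_config here is a placeholder supplied by the caller's quantizer setup, only the tag plumbing is taken from this commit:

# Hedged sketch: pick which single-head QKV projections receive the custom
# annotation. Passing anything outside StaticLLMQuantConfig raises ValueError.
from executorch.backends.qualcomm.quantizer.custom_annotation import (
    StaticLLMQuantConfig,
    annotate_qkv_proj_sha,
)


def annotate_qkv(gm, quantization_config):
    qkv_tags = {
        StaticLLMQuantConfig.wq_sha,
        StaticLLMQuantConfig.wk_sha,
        StaticLLMQuantConfig.wv_sha,
    }
    annotate_qkv_proj_sha(gm, quantization_config, qkv_tags)
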
backends/qualcomm/tests/models.py

Lines changed: 11 additions & 0 deletions
@@ -1113,6 +1113,17 @@ def forward(self, x):
         return self.linear2(x1)


+class LayerNormWithoutParams(torch.nn.Module):
+    def __init__(self, hidden_size: int):
+        super().__init__()
+        self.normalized_shape = (hidden_size,)
+
+    def forward(self, x):
+        return torch.nn.functional.layer_norm(
+            x, self.normalized_shape, None, None, eps=1e-5
+        )
+
+
 class LayerNorm(torch.nn.Module):
     def __init__(self, bias=True):
         super().__init__()

backends/qualcomm/tests/test_qnn_delegate.py

Lines changed: 69 additions & 2 deletions
@@ -832,7 +832,11 @@ def test_qnn_backend_up_sampling_nearest_2d_with_size(self):
         self.lower_module_and_test_output(module, sample_input)

     def test_qnn_backend_layer_norm(self):
-        modules = [LayerNorm(), LayerNorm(bias=False)]  # noqa: F405
+        modules = [
+            LayerNorm(),  # noqa: F405
+            LayerNorm(bias=False),  # noqa: F405
+            LayerNormWithoutParams(768),  # noqa: F405
+        ]
         sample_input = (torch.randn(196, 768),)
         for i, module in enumerate(modules):
             with self.subTest(i=i):
@@ -2360,7 +2364,11 @@ def test_qnn_backend_up_sampling_nearest_2d_with_size(self):
         self.lower_module_and_test_output(module, sample_input)

     def test_qnn_backend_layer_norm(self):
-        modules = [LayerNorm(), LayerNorm(bias=False)]  # noqa: F405
+        modules = [
+            LayerNorm(),  # noqa: F405
+            LayerNorm(bias=False),  # noqa: F405
+            LayerNormWithoutParams(768),  # noqa: F405
+        ]
         sample_input = (torch.randn(196, 768),)
         for i, module in enumerate(modules):
             with self.subTest(i=i):
@@ -4863,6 +4871,65 @@ def test_llama_stories_110m(self):
         if not self.compile_only and not self.enable_x86_64:
             self.assertGreaterEqual(msg["inference_speed"], 220)  # Lanai

+    def test_static_olmo(self):
+        if not self.required_envs():
+            self.skipTest("missing required envs")
+
+        prompt = "Simply put, the theory of relativity states that"
+        cmds = [
+            "python",
+            f"{self.executorch_root}/examples/qualcomm/oss_scripts/llama/llama.py",
+            "--artifact",
+            self.artifact_dir,
+            "--build_folder",
+            self.build_folder,
+            "--model",
+            self.model,
+            "--ip",
+            self.ip,
+            "--port",
+            str(self.port),
+            "--prompt",
+            f"{prompt}",
+            "--decoder_model",
+            "olmo-1b",
+            "--model_mode",
+            "kv",
+            "--temperature",
+            "0",
+            "--max_seq_len",
+            "1024",
+            "--eval_perplexity",
+            "--task",
+            "wikitext",
+        ]
+        if self.compile_only:
+            cmds.extend(["--compile_only"])
+        elif self.device:
+            cmds.extend(["--device", self.device])
+            if self.host:
+                cmds.extend(["--host", self.host])
+        elif self.enable_x86_64:
+            cmds.extend(["--enable_x86_64"])
+        if self.pre_gen_pte:
+            cmds.extend(["--pre_gen_pte", self.pre_gen_pte])
+
+        p = subprocess.Popen(cmds, stdout=subprocess.DEVNULL)
+        with Listener((self.ip, self.port)) as listener:
+            conn = listener.accept()
+            p.communicate()
+            msg = json.loads(conn.recv())
+            if "Error" in msg:
+                self.fail(msg["Error"])
+            else:
+                inference_speed_ref = {"SM8650": 35, "SM8750": 60}
+                self.assertLessEqual(msg["wiki_ppl"], 10)
+                self.assertLessEqual(msg["pte_size"], 1_000_000_000)  # 1GB
+                if self.model in inference_speed_ref:
+                    self.assertGreaterEqual(
+                        msg["inference_speed"], inference_speed_ref[self.model]
+                    )
+
     def test_static_phi4(self):
         if not self.required_envs():
             self.skipTest("missing required envs")

examples/models/llama/model_args.py

Lines changed: 1 addition & 0 deletions
@@ -22,6 +22,7 @@ class ModelArgs:
     num_experts: int = 8  # Number of experts
     num_activated_experts: int = 2  # Number of experts to activate
     attention_type: str = "mha"  # Attention type, registered in attention.py
+    norm_type: str = "rmsnorm"  # Normalization type, registered in norm.py
     attention_qkv_bias: bool = False
     use_kv_cache: bool = False  # Use key/value cache
     use_sdpa_with_kv_cache_op: bool = (

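A hedged sketch of how a LayerNorm-based decoder such as OLMo could opt into the new field; the dimension values and the "layernorm" registry key below are illustrative assumptions, not the actual olmo-1b params file added by this commit:

# Hedged sketch: ModelArgs with the new norm_type field. All values here are
# placeholders; "layernorm" is assumed to be the key registered in norm.py.
from executorch.examples.models.llama.model_args import ModelArgs

args = ModelArgs(
    dim=2048,               # placeholder
    n_layers=16,            # placeholder
    n_heads=16,             # placeholder
    norm_type="layernorm",  # default remains "rmsnorm"
)
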
examples/models/olmo/__init__.py

Lines changed: 16 additions & 0 deletions
@@ -0,0 +1,16 @@
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

from executorch.examples.models.llama.model import Llama2Model
from executorch.examples.models.olmo.convert_weights import convert_weights


class OlmoModel(Llama2Model):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)


__all__ = [
    "OlmoModel",
    "convert_weights",
]
