diff --git a/.ci/scripts/test_model.sh b/.ci/scripts/test_model.sh
index 38a354eddf0..4f8dc7a30e5 100755
--- a/.ci/scripts/test_model.sh
+++ b/.ci/scripts/test_model.sh
@@ -188,6 +188,14 @@ test_model_with_qnn() {
     EXPORT_SCRIPT=edsr
     # Additional deps for edsr
     pip install piq
+  elif [[ "${MODEL_NAME}" == "albert" ]]; then
+    EXPORT_SCRIPT=albert
+  elif [[ "${MODEL_NAME}" == "bert" ]]; then
+    EXPORT_SCRIPT=bert
+  elif [[ "${MODEL_NAME}" == "distilbert" ]]; then
+    EXPORT_SCRIPT=distilbert
+  elif [[ "${MODEL_NAME}" == "eurobert" ]]; then
+    EXPORT_SCRIPT=eurobert
   else
     echo "Unsupported model $MODEL_NAME"
     exit 1
@@ -197,7 +205,25 @@ test_model_with_qnn() {
   # TODO(guangyang): Make QNN chipset matches the target device
   QNN_CHIPSET=SM8450

-  "${PYTHON_EXECUTABLE}" -m examples.qualcomm.scripts.${EXPORT_SCRIPT} -b ${CMAKE_OUTPUT_DIR} -m ${QNN_CHIPSET} --ci --compile_only $EXTRA_FLAGS
+  SCRIPT_FOLDER=""
+  case "${MODEL_NAME}" in
+    "dl3"|"mv3"|"mv2"|"ic4"|"ic3"|"vit"|"mb"|"w2l")
+      SCRIPT_FOLDER=scripts
+      ;;
+    "albert"|"bert"|"distilbert")
+      pip install evaluate
+      SCRIPT_FOLDER=oss_scripts
+      # Bert models running in 16-bit will encounter op validation failures on some operations,
+      # which requires CHIPSET >= SM8550.
+      QNN_CHIPSET=SM8550
+      ;;
+    *)
+      echo "Unsupported model $MODEL_NAME"
+      exit 1
+      ;;
+  esac
+
+  "${PYTHON_EXECUTABLE}" -m examples.qualcomm.${SCRIPT_FOLDER}.${EXPORT_SCRIPT} -b ${CMAKE_OUTPUT_DIR} -m ${QNN_CHIPSET} --ci --compile_only $EXTRA_FLAGS

   EXPORTED_MODEL=$(find "./${EXPORT_SCRIPT}" -type f -name "${MODEL_NAME}*.pte" -print -quit)
 }
diff --git a/.github/workflows/trunk.yml b/.github/workflows/trunk.yml
index 43da4e4b9b0..a4996459f8a 100644
--- a/.github/workflows/trunk.yml
+++ b/.github/workflows/trunk.yml
@@ -480,6 +480,32 @@ jobs:
         PYTHON_EXECUTABLE=python bash .ci/scripts/build-qnn-sdk.sh
         PYTHON_EXECUTABLE=python bash .ci/scripts/test_model.sh ${{ matrix.model }} "cmake" "qnn"

+  test-qnn-optimum-model:
+    name: test-qnn-optimum-model
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+    permissions:
+      id-token: write
+      contents: read
+    strategy:
+      matrix:
+        dtype: [fp32]
+        model: [albert, bert, distilbert] # eurobert requires transformers >= 4.48.0, skip for now
+      fail-fast: false
+    with:
+      runner: linux.2xlarge
+      docker-image: executorch-ubuntu-22.04-qnn-sdk
+      submodules: 'recursive'
+      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+      timeout: 900
+      script: |
+        # The generic Linux job chooses to use base env, not the one set up by the image
+        CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
+        conda activate "${CONDA_ENV}"
+        PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh --build-tool cmake
+        PYTHON_EXECUTABLE=python bash .ci/scripts/setup-qnn-deps.sh
+        PYTHON_EXECUTABLE=python bash .ci/scripts/build-qnn-sdk.sh
+        PYTHON_EXECUTABLE=python bash .ci/scripts/test_model.sh ${{ matrix.model }} "cmake" "qnn"
+
   test-apple-model:
     name: test-apple-model
     uses: pytorch/test-infra/.github/workflows/macos_job.yml@main
diff --git a/backends/qualcomm/_passes/__init__.py b/backends/qualcomm/_passes/__init__.py
index ca1aa78ef17..307756bff7f 100644
--- a/backends/qualcomm/_passes/__init__.py
+++ b/backends/qualcomm/_passes/__init__.py
@@ -19,6 +19,7 @@
 from .decompose_linalg_vector_norm import DecomposeLinalgVectorNorm
 from .decompose_roll import DecomposeRoll
 from .decompose_silu import DecomposeSilu
+from .decompose_wrap_with_autocast import DecomposeWrapWithAutocast
 from .expand_broadcast_tensor_shape import ExpandBroadcastTensorShape
 from .fixed_linear_keep_dim import FixedLinearKeepDim
 from .fold_qdq import FoldQDQ
@@ -56,6 +57,7 @@
     DecomposeLinalgVectorNorm,
     DecomposeRoll,
     DecomposeSilu,
+    DecomposeWrapWithAutocast,
     ExpandBroadcastTensorShape,
     FixedLinearKeepDim,
     FoldQDQ,
diff --git a/backends/qualcomm/_passes/decompose_wrap_with_autocast.py b/backends/qualcomm/_passes/decompose_wrap_with_autocast.py
new file mode 100644
index 00000000000..6c073bd309c
--- /dev/null
+++ b/backends/qualcomm/_passes/decompose_wrap_with_autocast.py
@@ -0,0 +1,88 @@
+# Copyright (c) Qualcomm Innovation Center, Inc.
+# All rights reserved
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import _operator
+from typing import Dict, Tuple
+
+import torch
+from executorch.exir.pass_base import ExportPass, PassResult
+
+from .utils import copy_nn_module_stack
+
+
+class DecomposeWrapWithAutocast(ExportPass):
+    """
+    Decompose the _higher_order_ops WrapWithAutocast
+    """
+
+    def __init__(self) -> None:
+        super().__init__()
+
+    def _get_submod(
+        self, gm: torch.fx.GraphModule, node: torch.fx.Node
+    ) -> Tuple[torch.fx.GraphModule, str]:
+        for a in node.args:
+            if isinstance(a, torch.fx.Node) and "submod" in a.target:
+                return getattr(gm, a.target), a.target
+
+    def _replace_output(
+        self, wwac_node: torch.fx.Node, output_node: torch.fx.Node, remap: Dict
+    ):
+        for user in wwac_node.users.copy():
+            arg_idx = 0
+            is_user_getitem = False
+
+            if user.target == _operator.getitem:
+                arg_idx = user.args[1]
+                is_user_getitem = True
+
+            user.replace_input_with(
+                wwac_node,
+                remap[output_node.args[0][arg_idx]],
+            )
+
+            if is_user_getitem:
+                for user_user in user.users.copy():
+                    user_user.replace_input_with(user, user.args[0])
+
+    def _replace(self, gm: torch.fx.GraphModule) -> None:
+        graph = gm.graph
+        for node in graph.nodes:
+            if isinstance(node.target, torch._higher_order_ops.wrap.WrapWithAutocast):
+                submod, submod_name = self._get_submod(gm, node)
+                n_args = node.args
+                input_submod = n_args[4]
+                decomposed_module = submod
+                with graph.inserting_before(node):
+                    # remap is used to map original node values to new node values,
+                    # which ensures that references to nodes are correctly updated in the new graph
+                    # remap = {"expand_1": node.args[5], "to_4": node.args[6]}
+                    remap = {n_args[i].name: n_args[i] for i in range(5, len(n_args))}
+
+                    for decomposed_node in decomposed_module.graph.nodes:
+                        copy_nn_module_stack(node, decomposed_node)
+                        # no need to copy the existing 'output'
+                        if decomposed_node.op == "output":
+                            self._replace_output(node, decomposed_node, remap)
+                        # no need to copy existing placeholders
+                        elif decomposed_node.op == "placeholder":
+                            # re-key the remap entry from placeholder name to graph node
+                            remap[decomposed_node] = remap.pop(decomposed_node.name)
+                        else:
+                            remap[decomposed_node] = graph.node_copy(
+                                decomposed_node,
+                                arg_transform=lambda x, remap=remap: remap[x],
+                            )
+
+                graph.erase_node(node)
+
+                graph.erase_node(input_submod)
+
+    def call(self, graph_module: torch.fx.GraphModule) -> PassResult:
+        self._replace(graph_module)
+        graph_module.graph.eliminate_dead_code()
+        graph_module.recompile()
+        return PassResult(graph_module, True)
diff --git a/backends/qualcomm/_passes/qnn_pass_manager.py b/backends/qualcomm/_passes/qnn_pass_manager.py
index bb6a4dd0a67..9364baccc7a 100644
--- a/backends/qualcomm/_passes/qnn_pass_manager.py
+++ b/backends/qualcomm/_passes/qnn_pass_manager.py
@@ -24,6 +24,7 @@
     DecomposeLinalgVectorNorm,
     DecomposeRoll,
     DecomposeSilu,
+    DecomposeWrapWithAutocast,
     ExpandBroadcastTensorShape,
     FixedLinearKeepDim,
     FoldQDQ,
@@ -194,6 +195,7 @@ def transform_for_annotation_pipeline(self, graph_module: GraphModule):
         self.add_pass(DecomposeScaledDotProductAttention())
         self.add_pass(DecomposeRoll())
         self.add_pass(DecomposeSilu())
+        self.add_pass(DecomposeWrapWithAutocast())
         self.add_pass(DecomposeEinsum())
         self.add_pass(DecomposeExpM1())
         self.add_pass(DecomposeLinalgVectorNorm(quantization_capture=True))
@@ -207,6 +209,7 @@ def transform_for_export_pipeline(self, exported_program: ExportedProgram):
         self.add_pass(DecomposeRoll())
         self.add_pass(DecomposeLinalgVectorNorm(quantization_capture=True))
         self.add_pass(DecomposeExpM1())
+        self.add_pass(DecomposeWrapWithAutocast())
         # this pass will rewrite state_dict, it needs to be accomplished before
         # to_edge_transform_and_lower
         self.add_pass(ConvertConv1dToConv2d(exported_program))
diff --git a/backends/qualcomm/_passes/remove_redundancy.py b/backends/qualcomm/_passes/remove_redundancy.py
index bff917be3da..22d476ef21b 100644
--- a/backends/qualcomm/_passes/remove_redundancy.py
+++ b/backends/qualcomm/_passes/remove_redundancy.py
@@ -43,6 +43,8 @@ def _dim_order_op_condition(self, node):
         dim_order = node.kwargs.get("dim_order")
         # skip if there contains layout hint
         # e.g. (0, 2, 3, 1) != (0, 1, 2, 3)
+        if node.meta["val"].dtype != node.args[0].meta["val"].dtype:
+            return False
         return dim_order != list(range(len(dim_order)))

     def _to_copy_op_condition(self, node):
@@ -53,19 +55,15 @@ def _default_condition(self, ndoe):

     def _remove(self, graph_module: torch.fx.GraphModule) -> torch.fx.GraphModule:
         for n in graph_module.graph.nodes:
-            if n.target not in self.redundant_ops or not self.redundant_ops[n.target](
-                n
-            ):
-                continue
-
-            to_be_remove = n
-            # assert_tensor_metadata op has no user
-            if len(n.users.keys()) == 0:
-                n.args = ()
-            # normal case
-            for user_n in list(n.users.keys()):
-                user_n.replace_input_with(n, n.args[0])
-            graph_module.graph.erase_node(to_be_remove)
+            if n.target in self.redundant_ops and self.redundant_ops[n.target](n):
+                to_be_remove = n
+                # assert_tensor_metadata op has no user
+                if len(n.users.keys()) == 0:
+                    n.args = ()
+                # normal case
+                for user_n in list(n.users.keys()):
+                    user_n.replace_input_with(n, n.args[0])
+                graph_module.graph.erase_node(to_be_remove)

     def call(self, graph_module: torch.fx.GraphModule):
         self._remove(graph_module)
diff --git a/backends/qualcomm/_passes/replace_inf_values.py b/backends/qualcomm/_passes/replace_inf_values.py
index c7e475f54f2..bffcea03a72 100644
--- a/backends/qualcomm/_passes/replace_inf_values.py
+++ b/backends/qualcomm/_passes/replace_inf_values.py
@@ -9,13 +9,13 @@

 class ReplaceInfValues(ExportPass):
     """
-    Due to limitation in Qnn, we need to change inf or -inf to arbitrary value in quantization.
+    Due to a QNN limitation, change inf or -inf to an arbitrary value during quantization.
     """

     def __init__(self):
         super(ReplaceInfValues, self).__init__()

-    def call(self, graph_module: torch.fx.GraphModule):
+    def call(self, graph_module: torch.fx.GraphModule):  # noqa: C901
         for buf_name, tensor in graph_module.named_buffers():
             if tensor.is_floating_point():
                 # 255 here is mainly for attention_mask in Llama for reasonable quant scale
@@ -38,5 +38,23 @@ def call(self, graph_module: torch.fx.GraphModule):
                     arg_list[2] = -255
                 node.args = tuple(arg_list)

+            if node.target in [
+                torch.ops.aten.masked_fill.Tensor,
+                torch.ops.aten.masked_fill.Scalar,
+            ]:
+                assert (
+                    len(node.args) == 3
+                ), f"Expecting {node.name} to have 3 arguments."
+                val = node.args[2]
+                if node.args[2] > torch.finfo(torch.float16).max:
+                    val = 255
+                elif node.args[2] < torch.finfo(torch.float16).min:
+                    val = -255
+                node.args = (
+                    node.args[0],
+                    node.args[1],
+                    val,
+                )
+
         graph_module.recompile()
         return PassResult(graph_module, True)
diff --git a/backends/qualcomm/quantizer/annotators.py b/backends/qualcomm/quantizer/annotators.py
index ecce4ee3ef0..eca889a1610 100644
--- a/backends/qualcomm/quantizer/annotators.py
+++ b/backends/qualcomm/quantizer/annotators.py
@@ -462,7 +462,7 @@ def annotate_hardtanh(node: Node, quantization_config: QuantizationConfig) -> No
     annotate_single_in_single_out(node, quantization_config)


-@register_annotator([torch.ops.aten.mean.default])
+@register_annotator([torch.ops.aten.mean.default, torch.ops.aten.mean.dim])
 def annotate_mean(node: Node, quantization_config: QuantizationConfig) -> None:
     annotate_single_in_single_out(node, quantization_config)

@@ -604,11 +604,6 @@ def annotate_select(node: Node, quantization_config: QuantizationConfig) -> None
     annotate_single_in_single_out(node, quantization_config)


-@register_annotator([torch.ops.aten.mean.dim])
-def annotate_mean_dim(node: Node, quantization_config: QuantizationConfig) -> None:
-    annotate_single_in_single_out(node, quantization_config)
-
-
 @register_annotator([torch.ops.aten.slice.Tensor])
 def annotate_slice(node: Node, quantization_config: QuantizationConfig) -> None:
     annotate_single_in_single_out(node, quantization_config)
diff --git a/backends/qualcomm/quantizer/custom_annotation.py b/backends/qualcomm/quantizer/custom_annotation.py
index 0e06015ed91..b5531cd492f 100644
--- a/backends/qualcomm/quantizer/custom_annotation.py
+++ b/backends/qualcomm/quantizer/custom_annotation.py
@@ -26,6 +26,35 @@
 )


+def annotate_eurobert(gm: torch.fx.GraphModule):
+    """
+    QNN does not support int32 -> signed 16-bit quantization.
+    We need to first annotate this to_fp node as 8-bit quant, so it will perform a requantize.
+    The final graph should look like: int32 -> convert -> cast -> matmul.args[1]
+
+    """
+    quantization_config_8a8w = get_8a8w_qnn_ptq_config()
+    for node in gm.graph.nodes:
+        # A little tricky here. This matmul node is wrapped inside a submodule after the 1st torch.export.
+        # There are actually 2 'to' ops that are redundant.
+        # It will look like: int64 -> to_fp -> to_fp -> matmul.args[1]
+        # Drawing out the graph after the 1st export will help visualize the submodule.
+ + if node.target == torch.ops.aten.matmul.default and node.args[1].args[0].args[ + 0 + ].meta["val"].dtype in [torch.int64, torch.int32]: + to_node = node.args[1] + input_qspec_map = {} + assert isinstance(to_node, Node) + input_spec = quantization_config_8a8w.input_activation + input_qspec_map[to_node] = input_spec + to_node.meta[QUANT_ANNOTATION_KEY] = QuantizationAnnotation( + input_qspec_map=input_qspec_map, + output_qspec=quantization_config_8a8w.output_activation, + _annotated=True, + ) + + def annotate_mimi_decoder(gm: torch.fx.GraphModule): """ The 1st transpose conv in mimi decoder is really sensitive to scale/offset in 16a8w, which causes execution failure. diff --git a/backends/qualcomm/tests/test_qnn_delegate.py b/backends/qualcomm/tests/test_qnn_delegate.py index 4f101db8e6e..7666e36b985 100644 --- a/backends/qualcomm/tests/test_qnn_delegate.py +++ b/backends/qualcomm/tests/test_qnn_delegate.py @@ -3891,6 +3891,74 @@ def test_llama_stories_110m(self): class TestExampleOssScript(TestQNN): + def test_albert(self): + if not self.required_envs([self.sentence_dataset]): + self.skipTest("missing required envs") + cmds = [ + "python", + f"{self.executorch_root}/examples/qualcomm/oss_scripts/albert.py", + "--dataset", + self.sentence_dataset, + "--artifact", + self.artifact_dir, + "--build_folder", + self.build_folder, + "--device", + self.device, + "--model", + self.model, + "--ip", + self.ip, + "--port", + str(self.port), + ] + if self.host: + cmds.extend(["--host", self.host]) + + p = subprocess.Popen(cmds, stdout=subprocess.DEVNULL) + with Listener((self.ip, self.port)) as listener: + conn = listener.accept() + p.communicate() + msg = json.loads(conn.recv()) + if "Error" in msg: + self.fail(msg["Error"]) + else: + self.assertGreaterEqual(msg["accuracy"], 0.8) + + def test_bert(self): + if not self.required_envs([self.sentence_dataset]): + self.skipTest("missing required envs") + cmds = [ + "python", + f"{self.executorch_root}/examples/qualcomm/oss_scripts/bert.py", + "--dataset", + self.sentence_dataset, + "--artifact", + self.artifact_dir, + "--build_folder", + self.build_folder, + "--device", + self.device, + "--model", + self.model, + "--ip", + self.ip, + "--port", + str(self.port), + ] + if self.host: + cmds.extend(["--host", self.host]) + + p = subprocess.Popen(cmds, stdout=subprocess.DEVNULL) + with Listener((self.ip, self.port)) as listener: + conn = listener.accept() + p.communicate() + msg = json.loads(conn.recv()) + if "Error" in msg: + self.fail(msg["Error"]) + else: + self.assertGreaterEqual(msg["accuracy"], 0.6) + def test_conv_former(self): if not self.required_envs([self.image_dataset]): self.skipTest("missing required envs") @@ -4033,6 +4101,40 @@ def test_dino_v2(self): self.assertGreaterEqual(msg["top_1"], 70) self.assertGreaterEqual(msg["top_5"], 85) + def test_distilbert(self): + if not self.required_envs([self.sentence_dataset]): + self.skipTest("missing required envs") + cmds = [ + "python", + f"{self.executorch_root}/examples/qualcomm/oss_scripts/distilbert.py", + "--dataset", + self.sentence_dataset, + "--artifact", + self.artifact_dir, + "--build_folder", + self.build_folder, + "--device", + self.device, + "--model", + self.model, + "--ip", + self.ip, + "--port", + str(self.port), + ] + if self.host: + cmds.extend(["--host", self.host]) + + p = subprocess.Popen(cmds, stdout=subprocess.DEVNULL) + with Listener((self.ip, self.port)) as listener: + conn = listener.accept() + p.communicate() + msg = json.loads(conn.recv()) + if "Error" in msg: + 
self.fail(msg["Error"]) + else: + self.assertGreaterEqual(msg["accuracy"], 0.45) + def test_dit(self): if not self.required_envs(): self.skipTest("missing required envs") @@ -4142,14 +4244,13 @@ def test_efficientSAM(self): else: self.assertGreaterEqual(msg["MIoU"], 0.55) - def test_swin_transformer(self): - if not self.required_envs([self.image_dataset]): + def test_esrgan(self): + if not self.required_envs(): self.skipTest("missing required envs") + cmds = [ "python", - f"{self.executorch_root}/examples/qualcomm/oss_scripts/swin_transformer.py", - "--dataset", - self.image_dataset, + f"{self.executorch_root}/examples/qualcomm/oss_scripts/esrgan.py", "--artifact", self.artifact_dir, "--build_folder", @@ -4158,6 +4259,9 @@ def test_swin_transformer(self): self.device, "--model", self.model, + "--default_dataset", + "--oss_repo", + self.oss_repo, "--ip", self.ip, "--port", @@ -4174,16 +4278,17 @@ def test_swin_transformer(self): if "Error" in msg: self.fail(msg["Error"]) else: - self.assertGreaterEqual(msg["top_1"], 60) - self.assertGreaterEqual(msg["top_5"], 80) + self.assertGreaterEqual(msg["PSNR"], 24) + self.assertGreaterEqual(msg["SSIM"], 0.8) - def test_esrgan(self): - if not self.required_envs(): + def test_eurobert(self): + if not self.required_envs([self.sentence_dataset]): self.skipTest("missing required envs") - cmds = [ "python", - f"{self.executorch_root}/examples/qualcomm/oss_scripts/esrgan.py", + f"{self.executorch_root}/examples/qualcomm/oss_scripts/eurobert.py", + "--dataset", + self.sentence_dataset, "--artifact", self.artifact_dir, "--build_folder", @@ -4192,9 +4297,6 @@ def test_esrgan(self): self.device, "--model", self.model, - "--default_dataset", - "--oss_repo", - self.oss_repo, "--ip", self.ip, "--port", @@ -4211,8 +4313,7 @@ def test_esrgan(self): if "Error" in msg: self.fail(msg["Error"]) else: - self.assertGreaterEqual(msg["PSNR"], 24) - self.assertGreaterEqual(msg["SSIM"], 0.8) + self.assertGreaterEqual(msg["accuracy"], 0.5) def test_fastvit(self): if not self.required_envs( @@ -4363,7 +4464,7 @@ def test_gMLP(self): self.fail(msg["Error"]) else: self.assertGreaterEqual(msg["top_1"], 60) - self.assertGreaterEqual(msg["top_5"], 90) + self.assertGreaterEqual(msg["top_5"], 85) @unittest.skip("Only outputs good accuracy in QNN 2.29") def test_mobilevit_v2(self): @@ -4654,6 +4755,41 @@ def test_ssd300_vgg16(self): else: self.assertGreaterEqual(msg["mAP"], 0.70) + def test_swin_transformer(self): + if not self.required_envs([self.image_dataset]): + self.skipTest("missing required envs") + cmds = [ + "python", + f"{self.executorch_root}/examples/qualcomm/oss_scripts/swin_transformer.py", + "--dataset", + self.image_dataset, + "--artifact", + self.artifact_dir, + "--build_folder", + self.build_folder, + "--device", + self.device, + "--model", + self.model, + "--ip", + self.ip, + "--port", + str(self.port), + ] + if self.host: + cmds.extend(["--host", self.host]) + + p = subprocess.Popen(cmds, stdout=subprocess.DEVNULL) + with Listener((self.ip, self.port)) as listener: + conn = listener.accept() + p.communicate() + msg = json.loads(conn.recv()) + if "Error" in msg: + self.fail(msg["Error"]) + else: + self.assertGreaterEqual(msg["top_1"], 60) + self.assertGreaterEqual(msg["top_5"], 80) + class TestExampleQaihubScript(TestQNN): def test_utils_export(self): diff --git a/examples/qualcomm/oss_scripts/albert.py b/examples/qualcomm/oss_scripts/albert.py new file mode 100644 index 00000000000..6af554655f1 --- /dev/null +++ b/examples/qualcomm/oss_scripts/albert.py @@ -0,0 
+1,162 @@
+# Copyright (c) Qualcomm Innovation Center, Inc.
+# All rights reserved
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import getpass
+import json
+import logging
+import os
+from multiprocessing.connection import Client
+
+import evaluate
+import numpy as np
+import torch
+from executorch.backends.qualcomm._passes.qnn_pass_manager import (
+    get_capture_program_passes,
+)
+from executorch.backends.qualcomm.quantizer.quantizer import QuantDtype
+
+from executorch.examples.qualcomm.utils import (
+    build_executorch_binary,
+    get_masked_language_model_dataset,
+    make_output_dir,
+    parse_skip_delegation_node,
+    setup_common_args_and_variables,
+    SimpleADB,
+)
+from transformers import AlbertConfig, AutoModelForMaskedLM, AutoTokenizer
+
+
+def main(args):
+    skip_node_id_set, skip_node_op_set = parse_skip_delegation_node(args)
+
+    os.makedirs(args.artifact, exist_ok=True)
+    data_size = 100
+
+    model_name = "albert/albert-base-v2"
+    tokenizer = AutoTokenizer.from_pretrained(model_name, hidden_act="gelu")
+
+    if args.ci:
+        random_ids = torch.randint(low=0, high=100, size=(1, 100), dtype=torch.int32)
+        attention_mask = torch.ones((1, 100), dtype=torch.float32)
+        inputs = [
+            (
+                random_ids,
+                attention_mask,
+            )
+        ]
+        logging.warning(
+            "This option is for CI to verify the export flow. It uses random input and will result in poor accuracy."
+        )
+    else:
+        inputs, targets, input_list = get_masked_language_model_dataset(
+            args.dataset, tokenizer, data_size
+        )
+
+    config = AlbertConfig.from_pretrained(model_name)
+    config.hidden_act = "gelu"
+    module = AutoModelForMaskedLM.from_pretrained(model_name, config=config).eval()
+    pte_filename = "albert_qnn_q16"
+
+    # lower to QNN
+    passes_job = get_capture_program_passes()
+    build_executorch_binary(
+        module,
+        inputs[0],
+        args.model,
+        f"{args.artifact}/{pte_filename}",
+        dataset=inputs,
+        skip_node_id_set=skip_node_id_set,
+        skip_node_op_set=skip_node_op_set,
+        quant_dtype=QuantDtype.use_16a16w,
+        passes_job=passes_job,
+        shared_buffer=args.shared_buffer,
+    )
+
+    if args.compile_only:
+        return
+
+    workspace = f"/data/local/tmp/{getpass.getuser()}/executorch/{pte_filename}"
+    pte_path = f"{args.artifact}/{pte_filename}.pte"
+
+    adb = SimpleADB(
+        qnn_sdk=os.getenv("QNN_SDK_ROOT"),
+        build_path=f"{args.build_folder}",
+        pte_path=pte_path,
+        workspace=workspace,
+        device_id=args.device,
+        host_id=args.host,
+        soc_model=args.model,
+    )
+    output_data_folder = f"{args.artifact}/outputs"
+    make_output_dir(output_data_folder)
+
+    # accuracy analysis
+    adb.push(inputs=inputs, input_list=input_list)
+    adb.execute()
+    adb.pull(output_path=args.artifact)
+    # since the original nn.Module does not perform well on this task either,
+    # we only measure the relative accuracy here
+    goldens, predictions, nominal_predictions = [], [], []
+    for i in range(len(inputs)):
+        indices = [i for i, x in enumerate(targets[i]) if x != -100]
+        goldens.extend(targets[i][indices].tolist())
+        nominal_prediction = module(*inputs[i])
+        nominal_predictions.extend(
+            nominal_prediction.logits.argmax(axis=-1)[0, indices].tolist()
+        )
+        prediction = (
+            np.fromfile(
+                os.path.join(output_data_folder, f"output_{i}_0.raw"), dtype=np.float32
+            )
+            .reshape([1, inputs[0][0].shape[1], -1])
+            .argmax(axis=-1)
+        )
+        predictions.extend(prediction[0, indices].tolist())
+
+    metric = evaluate.load("accuracy")
+    nominal_results = metric.compute(
+        predictions=nominal_predictions, references=goldens
+    )
+ device_results = metric.compute(predictions=predictions, references=goldens) + result = device_results["accuracy"] / nominal_results["accuracy"] + if args.ip and args.port != -1: + with Client((args.ip, args.port)) as conn: + conn.send(json.dumps({"accuracy": result})) + else: + print(f"accuracy: {device_results}") + print(f"accuracy with nn.Module as golden: {result}") + + +if __name__ == "__main__": + parser = setup_common_args_and_variables() + parser.add_argument( + "-a", + "--artifact", + help="path for storing generated artifacts and output by this example. Default ./albert", + default="./albert", + type=str, + ) + parser.add_argument( + "-d", + "--dataset", + help=( + "path to the validation text. " + "e.g. --dataset wikisent2.txt " + "for https://www.kaggle.com/datasets/mikeortman/wikipedia-sentences" + ), + type=str, + required=False, + ) + + args = parser.parse_args() + try: + main(args) + except Exception as e: + if args.ip and args.port != -1: + with Client((args.ip, args.port)) as conn: + conn.send(json.dumps({"Error": str(e)})) + else: + raise Exception(e) diff --git a/examples/qualcomm/oss_scripts/bert.py b/examples/qualcomm/oss_scripts/bert.py new file mode 100644 index 00000000000..96c7826d89c --- /dev/null +++ b/examples/qualcomm/oss_scripts/bert.py @@ -0,0 +1,149 @@ +# Copyright (c) Qualcomm Innovation Center, Inc. +# All rights reserved +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import getpass +import json +import logging +import os +from multiprocessing.connection import Client + +import evaluate +import numpy as np +import torch + +from executorch.backends.qualcomm._passes.qnn_pass_manager import ( + get_capture_program_passes, +) +from executorch.backends.qualcomm.quantizer.quantizer import QuantDtype + +from executorch.examples.qualcomm.utils import ( + build_executorch_binary, + get_masked_language_model_dataset, + make_output_dir, + parse_skip_delegation_node, + setup_common_args_and_variables, + SimpleADB, +) +from transformers import AutoModelForMaskedLM, AutoTokenizer + + +def main(args): + skip_node_id_set, skip_node_op_set = parse_skip_delegation_node(args) + + os.makedirs(args.artifact, exist_ok=True) + data_size = 100 + + tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased") + if args.ci: + random_ids = torch.randint(low=0, high=100, size=(1, 100), dtype=torch.int32) + attention_mask = torch.ones((1, 100), dtype=torch.float32) + inputs = [ + ( + random_ids, + attention_mask, + ) + ] + logging.warning( + "This option is for CI to verify the export flow. It uses random input and will result in poor accuracy." 
+ ) + else: + inputs, targets, input_list = get_masked_language_model_dataset( + args.dataset, tokenizer, data_size + ) + module = AutoModelForMaskedLM.from_pretrained( + "google-bert/bert-base-uncased" + ).eval() + pte_filename = "bert_qnn_q16" + + # lower to QNN + passes_job = get_capture_program_passes() + build_executorch_binary( + module, + inputs[0], + args.model, + f"{args.artifact}/{pte_filename}", + dataset=inputs, + skip_node_id_set=skip_node_id_set, + skip_node_op_set=skip_node_op_set, + quant_dtype=QuantDtype.use_16a8w, + passes_job=passes_job, + shared_buffer=args.shared_buffer, + ) + + if args.compile_only: + return + + workspace = f"/data/local/tmp/{getpass.getuser()}/executorch/{pte_filename}" + pte_path = f"{args.artifact}/{pte_filename}.pte" + + adb = SimpleADB( + qnn_sdk=os.getenv("QNN_SDK_ROOT"), + build_path=f"{args.build_folder}", + pte_path=pte_path, + workspace=workspace, + device_id=args.device, + host_id=args.host, + soc_model=args.model, + ) + output_data_folder = f"{args.artifact}/outputs" + make_output_dir(output_data_folder) + + # accuracy analysis + adb.push(inputs=inputs, input_list=input_list) + adb.execute() + adb.pull(output_path=args.artifact) + goldens, predictions = [], [] + for i in range(len(inputs)): + indices = [i for i, x in enumerate(targets[i]) if x != -100] + goldens.extend(targets[i][indices].tolist()) + prediction = ( + np.fromfile( + os.path.join(output_data_folder, f"output_{i}_0.raw"), dtype=np.float32 + ) + .reshape([1, inputs[0][0].shape[1], -1]) + .argmax(axis=-1) + ) + predictions.extend(prediction[0, indices].tolist()) + + metric = evaluate.load("accuracy") + results = metric.compute(predictions=predictions, references=goldens) + if args.ip and args.port != -1: + with Client((args.ip, args.port)) as conn: + conn.send(json.dumps({"accuracy": results["accuracy"]})) + else: + print(f"accuracy: {results['accuracy']}") + + +if __name__ == "__main__": + parser = setup_common_args_and_variables() + parser.add_argument( + "-a", + "--artifact", + help="path for storing generated artifacts and output by this example. Default ./bert", + default="./bert", + type=str, + ) + parser.add_argument( + "-d", + "--dataset", + help=( + "path to the validation text. " + "e.g. --dataset wikisent2.txt " + "for https://www.kaggle.com/datasets/mikeortman/wikipedia-sentences" + ), + type=str, + required=False, + ) + + args = parser.parse_args() + try: + main(args) + except Exception as e: + if args.ip and args.port != -1: + with Client((args.ip, args.port)) as conn: + conn.send(json.dumps({"Error": str(e)})) + else: + raise Exception(e) diff --git a/examples/qualcomm/oss_scripts/distilbert.py b/examples/qualcomm/oss_scripts/distilbert.py new file mode 100644 index 00000000000..2863a653200 --- /dev/null +++ b/examples/qualcomm/oss_scripts/distilbert.py @@ -0,0 +1,149 @@ +# Copyright (c) Qualcomm Innovation Center, Inc. +# All rights reserved +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +import getpass +import json +import logging +import os +from multiprocessing.connection import Client + +import evaluate +import numpy as np +import torch + +from executorch.backends.qualcomm._passes.qnn_pass_manager import ( + get_capture_program_passes, +) +from executorch.backends.qualcomm.quantizer.quantizer import QuantDtype + +from executorch.examples.qualcomm.utils import ( + build_executorch_binary, + get_masked_language_model_dataset, + make_output_dir, + parse_skip_delegation_node, + setup_common_args_and_variables, + SimpleADB, +) +from transformers import AutoModelForMaskedLM, AutoTokenizer + + +def main(args): + skip_node_id_set, skip_node_op_set = parse_skip_delegation_node(args) + + os.makedirs(args.artifact, exist_ok=True) + data_size = 100 + + tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased") + if args.ci: + random_ids = torch.randint(low=0, high=100, size=(1, 100), dtype=torch.int32) + attention_mask = torch.ones((1, 100), dtype=torch.float32) + inputs = [ + ( + random_ids, + attention_mask, + ) + ] + logging.warning( + "This option is for CI to verify the export flow. It uses random input and will result in poor accuracy." + ) + else: + inputs, targets, input_list = get_masked_language_model_dataset( + args.dataset, tokenizer, data_size + ) + module = AutoModelForMaskedLM.from_pretrained( + "distilbert/distilbert-base-uncased" + ).eval() + pte_filename = "distilbert_qnn_q16" + + # lower to QNN + passes_job = get_capture_program_passes() + build_executorch_binary( + module, + inputs[0], + args.model, + f"{args.artifact}/{pte_filename}", + dataset=inputs, + skip_node_id_set=skip_node_id_set, + skip_node_op_set=skip_node_op_set, + quant_dtype=QuantDtype.use_16a8w, + passes_job=passes_job, + shared_buffer=args.shared_buffer, + ) + + if args.compile_only: + return + + workspace = f"/data/local/tmp/{getpass.getuser()}/executorch/{pte_filename}" + pte_path = f"{args.artifact}/{pte_filename}.pte" + + adb = SimpleADB( + qnn_sdk=os.getenv("QNN_SDK_ROOT"), + build_path=f"{args.build_folder}", + pte_path=pte_path, + workspace=workspace, + device_id=args.device, + host_id=args.host, + soc_model=args.model, + ) + output_data_folder = f"{args.artifact}/outputs" + make_output_dir(output_data_folder) + + # accuracy analysis + adb.push(inputs=inputs, input_list=input_list) + adb.execute() + adb.pull(output_path=args.artifact) + goldens, predictions = [], [] + for i in range(len(inputs)): + indices = [i for i, x in enumerate(targets[i]) if x != -100] + goldens.extend(targets[i][indices].tolist()) + prediction = ( + np.fromfile( + os.path.join(output_data_folder, f"output_{i}_0.raw"), dtype=np.float32 + ) + .reshape([1, inputs[0][0].shape[1], -1]) + .argmax(axis=-1) + ) + predictions.extend(prediction[0, indices].tolist()) + + metric = evaluate.load("accuracy") + results = metric.compute(predictions=predictions, references=goldens) + if args.ip and args.port != -1: + with Client((args.ip, args.port)) as conn: + conn.send(json.dumps({"accuracy": results["accuracy"]})) + else: + print(f"accuracy: {results['accuracy']}") + + +if __name__ == "__main__": + parser = setup_common_args_and_variables() + parser.add_argument( + "-a", + "--artifact", + help="path for storing generated artifacts and output by this example. Default ./distilbert", + default="./distilbert", + type=str, + ) + parser.add_argument( + "-d", + "--dataset", + help=( + "path to the validation text. " + "e.g. 
--dataset wikisent2.txt " + "for https://www.kaggle.com/datasets/mikeortman/wikipedia-sentences" + ), + type=str, + required=False, + ) + + args = parser.parse_args() + try: + main(args) + except Exception as e: + if args.ip and args.port != -1: + with Client((args.ip, args.port)) as conn: + conn.send(json.dumps({"Error": str(e)})) + else: + raise Exception(e) diff --git a/examples/qualcomm/oss_scripts/eurobert.py b/examples/qualcomm/oss_scripts/eurobert.py new file mode 100644 index 00000000000..97e70428e01 --- /dev/null +++ b/examples/qualcomm/oss_scripts/eurobert.py @@ -0,0 +1,187 @@ +# Copyright (c) Qualcomm Innovation Center, Inc. +# All rights reserved +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import json +import logging +import os +from multiprocessing.connection import Client + +import evaluate +import numpy as np +import torch +import transformers +from executorch.backends.qualcomm._passes.qnn_pass_manager import ( + get_capture_program_passes, +) + +from executorch.backends.qualcomm.quantizer.custom_annotation import annotate_eurobert +from executorch.backends.qualcomm.quantizer.quantizer import QuantDtype + +from executorch.examples.qualcomm.utils import ( + build_executorch_binary, + get_masked_language_model_dataset, + make_output_dir, + make_quantizer, + parse_skip_delegation_node, + setup_common_args_and_variables, + SimpleADB, +) +from transformers import AutoConfig, AutoModelForMaskedLM, AutoTokenizer + +TRANSFORMERS_VERSION = "4.48.0" + + +def main(args): + assert ( + transformers.__version__ >= TRANSFORMERS_VERSION + ), f"Please ensure transformers version >= {TRANSFORMERS_VERSION}, current version is {transformers.__version__}" + + skip_node_id_set, skip_node_op_set = parse_skip_delegation_node(args) + + os.makedirs(args.artifact, exist_ok=True) + + if not args.compile_only and args.device is None: + raise RuntimeError( + "device serial is required if not compile only. " + "Please specify a device serial by -s/--device argument." + ) + + module_id = "EuroBERT/EuroBERT-210m" + tokenizer = AutoTokenizer.from_pretrained(module_id) + model = AutoModelForMaskedLM.from_pretrained( + module_id, trust_remote_code=True + ).eval() + config = AutoConfig.from_pretrained(module_id, trust_remote_code=True) + + def replace_rms_norm_with_native_rms_norm(module: torch.nn.Module): + for name, child in module.named_children(): + if child._get_name() == "EuroBertRMSNorm": + rms_norm = torch.nn.RMSNorm( + [config.hidden_size], eps=child.variance_epsilon + ) + rms_norm.weight = child.weight + setattr( + module, + name, + rms_norm, + ) + else: + replace_rms_norm_with_native_rms_norm(child) + return module + + replace_rms_norm_with_native_rms_norm(model) + + data_size = 100 + if args.ci: + random_ids = torch.randint(low=0, high=100, size=(1, 100), dtype=torch.int32) + attention_mask = torch.ones((1, 100), dtype=torch.float32) + inputs = [ + ( + random_ids, + attention_mask, + ) + ] + logging.warning( + "This option is for CI to verify the export flow. It uses random input and will result in poor accuracy." 
+ ) + else: + inputs, targets, input_list = get_masked_language_model_dataset( + args.dataset, tokenizer, data_size + ) + + pte_filename = "eurobert_qnn_q16" + + # lower to QNN + passes_job = get_capture_program_passes() + quantizer = make_quantizer( + quant_dtype=QuantDtype.use_16a16w, + ) + quantizer.add_custom_quant_annotations((annotate_eurobert,)) + with torch.no_grad(): + build_executorch_binary( + model, + inputs[0], + args.model, + f"{args.artifact}/{pte_filename}", + dataset=inputs, + skip_node_id_set=skip_node_id_set, + skip_node_op_set=skip_node_op_set, + custom_quantizer=quantizer, + passes_job=passes_job, + shared_buffer=args.shared_buffer, + ) + + if args.compile_only: + return + + adb = SimpleADB( + qnn_sdk=os.getenv("QNN_SDK_ROOT"), + build_path=f"{args.build_folder}", + pte_path=f"{args.artifact}/{pte_filename}.pte", + workspace=f"/data/local/tmp/executorch/{pte_filename}", + device_id=args.device, + host_id=args.host, + soc_model=args.model, + ) + output_data_folder = f"{args.artifact}/outputs" + make_output_dir(output_data_folder) + + # accuracy analysis + adb.push(inputs=inputs, input_list=input_list) + adb.execute() + adb.pull(output_path=args.artifact) + goldens, predictions = [], [] + for i in range(len(inputs)): + indices = [i for i, x in enumerate(targets[i]) if x != -100] + goldens.extend(targets[i][indices].tolist()) + + prediction = ( + np.fromfile( + os.path.join(output_data_folder, f"output_{i}_0.raw"), dtype=np.float32 + ) + .reshape([1, inputs[0][0].shape[1], -1]) + .argmax(axis=-1) + ) + predictions.extend(prediction[0, indices].tolist()) + metric = evaluate.load("accuracy") + results = metric.compute(predictions=predictions, references=goldens) + if args.ip and args.port != -1: + with Client((args.ip, args.port)) as conn: + conn.send(json.dumps({"accuracy": results["accuracy"]})) + else: + print(f"accuracy: {results['accuracy']}") + + +if __name__ == "__main__": + parser = setup_common_args_and_variables() + parser.add_argument( + "-a", + "--artifact", + help="path for storing generated artifacts and output by this example. Default ./eurobert", + default="./eurobert", + type=str, + ) + parser.add_argument( + "-d", + "--dataset", + help=( + "path to the validation text. " + "e.g. --dataset wikisent2.txt " + "for https://www.kaggle.com/datasets/mikeortman/wikipedia-sentences" + ), + type=str, + required=False, + ) + + args = parser.parse_args() + try: + main(args) + except Exception as e: + if args.ip and args.port != -1: + with Client((args.ip, args.port)) as conn: + conn.send(json.dumps({"Error": str(e)})) + else: + raise Exception(e)
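
Appendix (reader note, not part of the patch): the DecomposeWrapWithAutocast pass above inlines the submodule that torch.export wraps around autocast regions, by copying the submodule's nodes into the parent graph via graph.node_copy() with a remap from placeholders to call-site arguments. A minimal, self-contained torch.fx sketch of that same inlining idea, assuming toy Inner/Outer modules (a simplified illustration, not the pass itself):

import torch
import torch.fx

class Inner(torch.nn.Module):
    def forward(self, x):
        return x + 1

class Outer(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.inner = Inner()

    def forward(self, x):
        return self.inner(x) * 2

gm = torch.fx.symbolic_trace(Outer())
for node in list(gm.graph.nodes):
    if node.op == "call_module" and node.target == "inner":
        sub_gm = torch.fx.symbolic_trace(gm.get_submodule(node.target))
        remap = {}  # submodule node -> parent-graph node
        call_args = iter(node.args)
        with gm.graph.inserting_before(node):
            for sub_node in sub_gm.graph.nodes:
                if sub_node.op == "placeholder":
                    # placeholders map to the call-site arguments
                    remap[sub_node] = next(call_args)
                elif sub_node.op == "output":
                    # rewire users of the call to the inlined result
                    node.replace_all_uses_with(remap[sub_node.args[0]])
                else:
                    remap[sub_node] = gm.graph.node_copy(
                        sub_node, arg_transform=lambda n: remap[n]
                    )
        gm.graph.erase_node(node)

gm.graph.eliminate_dead_code()
gm.recompile()
print(gm(torch.tensor(3.0)))  # (3 + 1) * 2 = tensor(8.)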
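On the evaluation side, the albert/bert/distilbert/eurobert examples all score only the masked token positions: get_masked_language_model_dataset marks unmasked positions with the ignore index -100, and on-device logits are read back from raw float32 dumps (output_{i}_0.raw). A minimal sketch of that accuracy computation, with a made-up 4-token sample and random logits standing in for the device output:

import evaluate  # Hugging Face evaluate library, as used by the example scripts
import numpy as np
import torch

vocab_size = 30522  # assumption: bert-base-uncased vocabulary
targets = [torch.tensor([-100, 2054, -100, 3007])]  # -100 = position not masked
logits = [np.random.rand(1, 4, vocab_size).astype(np.float32)]  # stand-in for output_0_0.raw

goldens, predictions = [], []
for i, target in enumerate(targets):
    indices = [j for j, x in enumerate(target) if x != -100]  # masked positions only
    goldens.extend(target[indices].tolist())
    predictions.extend(logits[i].argmax(axis=-1)[0, indices].tolist())

metric = evaluate.load("accuracy")
print(metric.compute(predictions=predictions, references=goldens))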