diff --git a/backends/arm/arm_backend.py b/backends/arm/arm_backend.py
index c59eedc304c..93b45483906 100644
--- a/backends/arm/arm_backend.py
+++ b/backends/arm/arm_backend.py
@@ -13,7 +13,7 @@
 import logging
 import os
 
-from typing import final, List, Optional
+from typing import cast, final, List, Optional
 
 import serializer.tosa_serializer as ts
 from executorch.backends.arm.arm_vela import vela_compile
@@ -32,6 +32,7 @@
 from executorch.exir.backend.backend_details import BackendDetails, PreprocessResult
 from executorch.exir.backend.compile_spec_schema import CompileSpec
 from torch.export.exported_program import ExportedProgram
+from torch.fx import Node
 
 # TOSA backend debug functionality
 logger = logging.getLogger(__name__)
@@ -269,6 +270,7 @@ def preprocess(  # noqa: C901
         node_visitors = get_node_visitors(edge_program, tosa_spec)
         input_count = 0
         for node in graph_module.graph.nodes:
+            node = cast(Node, node)
             if node.op == "call_function":
                 process_call_function(node, tosa_graph, node_visitors, tosa_spec)
             elif node.op == "placeholder":
@@ -288,9 +290,6 @@ def preprocess(  # noqa: C901
                     "The rank of the input order is not equal to amount of input tensors"
                 )
 
-        # TODO: It would be awesome if this dump could somehow be done on top level and not here.
-        # Problem is that the desc.json has to be created on the tosa_graph object, which we can't
-        # access from top level.
         if artifact_path:
             tag = _get_first_delegation_tag(graph_module)
             dbg_tosa_dump(
@@ -311,6 +310,4 @@ def preprocess(  # noqa: C901
         else:
             raise RuntimeError(f"Unknown format {output_format}")
 
-        # Continueing from above. Can I put tosa_graph into this function?
-        # debug_handle_map = ...
         return PreprocessResult(processed_bytes=binary)
diff --git a/backends/arm/test/common.py b/backends/arm/test/common.py
index d755ffc8b15..8838cb72d6c 100644
--- a/backends/arm/test/common.py
+++ b/backends/arm/test/common.py
@@ -74,19 +74,15 @@ def get_tosa_compile_spec_unbuilt(
     the compile spec before calling .build() to finalize it.
     """
     if not custom_path:
-        intermediate_path = maybe_get_tosa_collate_path() or tempfile.mkdtemp(
-            prefix="arm_tosa_"
-        )
-    else:
-        intermediate_path = custom_path
+        custom_path = maybe_get_tosa_collate_path()
 
-    if not os.path.exists(intermediate_path):
-        os.makedirs(intermediate_path, exist_ok=True)
+    if custom_path is not None:
+        os.makedirs(custom_path, exist_ok=True)
 
     compile_spec_builder = (
         ArmCompileSpecBuilder()
         .tosa_compile_spec(tosa_version)
         .set_permute_memory_format(permute_memory_to_nhwc)
-        .dump_intermediate_artifacts_to(intermediate_path)
+        .dump_intermediate_artifacts_to(custom_path)
     )
 
     return compile_spec_builder
diff --git a/backends/arm/test/misc/test_debug_feats.py b/backends/arm/test/misc/test_debug_feats.py
index 3343ae748ca..f82b5afc3b8 100644
--- a/backends/arm/test/misc/test_debug_feats.py
+++ b/backends/arm/test/misc/test_debug_feats.py
@@ -111,7 +111,9 @@ def test_numerical_diff_prints(self):
                 model,
                 example_inputs=model.get_inputs(),
                 compile_spec=common.get_tosa_compile_spec(
-                    "TOSA-0.80.0+MI", permute_memory_to_nhwc=True
+                    "TOSA-0.80.0+MI",
+                    permute_memory_to_nhwc=True,
+                    custom_path=tempfile.mkdtemp("diff_print_test"),
                 ),
             )
             .export()
diff --git a/backends/arm/test/ops/test_cat.py b/backends/arm/test/ops/test_cat.py
index bf436a8c183..4cef019c36c 100644
--- a/backends/arm/test/ops/test_cat.py
+++ b/backends/arm/test/ops/test_cat.py
@@ -124,7 +124,7 @@ def test_cat_tosa_MI(self, operands: tuple[torch.Tensor, ...], dim: int):
     def test_cat_4d_tosa_MI(self):
         square = torch.ones((2, 2, 2, 2))
         for dim in range(-3, 3):
-            test_data = ((square, square), dim)
+            test_data = ((square, square.clone()), dim)
             self._test_cat_tosa_MI_pipeline(self.Cat(), test_data)
 
     @parameterized.expand(Cat.test_parameters)
diff --git a/backends/arm/test/ops/test_scalars.py b/backends/arm/test/ops/test_scalars.py
index 455b484b948..60c32a45579 100644
--- a/backends/arm/test/ops/test_scalars.py
+++ b/backends/arm/test/ops/test_scalars.py
@@ -157,7 +157,7 @@ def _test_add_tosa_BI_pipeline(self, module: torch.nn.Module, test_data: tuple):
     def test_MI(self, test_name: str, op: torch.nn.Module, x, y):
         expected_exception = None
         if any(token in test_name for token in ("Sub_int", "Sub__int")):
-            expected_exception = RuntimeError
+            expected_exception = ValueError
         elif test_name.endswith("_st"):
             expected_exception = AttributeError
 
diff --git a/backends/arm/test/ops/test_select.py b/backends/arm/test/ops/test_select.py
index 85bfc15d2dc..c7194833cc0 100644
--- a/backends/arm/test/ops/test_select.py
+++ b/backends/arm/test/ops/test_select.py
@@ -93,8 +93,6 @@ def _test_select_tosa_BI_pipeline(
             .check(["torch.ops.quantized_decomposed"])
            .to_edge()
            .partition()
-            .dump_artifact()
-            .dump_operator_distribution()
            .check_count({"torch.ops.higher_order.executorch_call_delegate": 1})
            .to_executorch()
            .run_method_and_compare_outputs(inputs=test_data)
diff --git a/backends/arm/test/runner_utils.py b/backends/arm/test/runner_utils.py
index a8a113cf931..0bd90e3216f 100644
--- a/backends/arm/test/runner_utils.py
+++ b/backends/arm/test/runner_utils.py
@@ -16,14 +16,16 @@
 import numpy as np
 import torch
+import tosa_reference_model
 
 from executorch.backends.arm.test.conftest import arm_test_options, is_option_enabled
 
 from torch.export import ExportedProgram
 from torch.fx.node import Node
+from tosa import TosaGraph
 
 logger = logging.getLogger(__name__)
-logger.setLevel(logging.WARNING)
+logger.setLevel(logging.CRITICAL)
 
 
 class QuantizationParams:
@@ -169,7 +171,7 @@ def __init__(
     ):
         self.intermediate_path = intermediate_path
         self.tosa_ref_model_path = tosa_ref_model_path or "tosa_reference_model"
-        assert os.path.exists(
+        assert self.intermediate_path is None or os.path.exists(
             self.intermediate_path
         ), f"TOSA artifact path don't exist! Path: {self.intermediate_path}"
@@ -332,7 +334,46 @@ def run_corstone(
         tosa_ref_output = np.fromfile(out_path_with_suffix, dtype=np.float32)
         output_shape = self.output_node.args[0][0].meta["val"].shape
         tosa_ref_output = torch.from_numpy(tosa_ref_output).reshape(output_shape)
-        return [tosa_ref_output]
+        return tosa_ref_output
+
+    def run_tosa_graph(
+        self, graph: TosaGraph, inputs: list[np.ndarray] | list[torch.Tensor]
+    ) -> torch.Tensor:
+        """Runs the TOSA reference model with inputs and returns the result."""
+        data_np = [
+            prep_data_for_save(
+                input, self.is_quantized, self.input_names[i], self.qp_input[i]
+            )
+            for i, input in enumerate(inputs)
+        ]
+        # tosa_profile: 0 = Base Inference, 1 = Main Inference, 2 = Main Training.
+        tosa_profile = 0 if self.is_quantized else 1
+        debug_mode = "ALL" if logger.level <= logging.DEBUG else None
+        outputs, status = tosa_reference_model.run(
+            graph,
+            data_np,
+            verbosity=_tosa_refmodel_loglevel(logger.level),
+            tosa_profile=tosa_profile,
+            initialize_variable_tensor_from_numpy=1,  # True
+            debug_mode=debug_mode,
+        )
+
+        assert (
+            status == tosa_reference_model.GraphStatus.TOSA_VALID
+        ), "Non-valid TOSA given to reference model."
+
+        outputs_torch = []
+        for output in outputs:
+            output = torch.from_numpy(output)
+            if self.is_quantized:
+                # Need to dequant back to FP32 for comparison with torch output
+                quant_param = self.qp_output
+                assert (
+                    quant_param is not None
+                ), "There are no quantization parameters, check output parameters"
+                output = (output.to(torch.float32) - quant_param.zp) * quant_param.scale
+            outputs_torch.append(output)
+        return tuple(outputs_torch)
 
     def run_tosa_ref_model(
         self,
@@ -417,21 +458,13 @@ def run_tosa_ref_model(
         assert (
             shutil.which(self.tosa_ref_model_path) is not None
         ), f"tosa_reference_model tool not found, did you run examples/arm/setup.sh? Path: {self.tosa_ref_model_path}"
-        loglevel_map = {
-            logging.INFO: "INFO",
-            logging.CRITICAL: "LOW",
-            logging.ERROR: "LOW",
-            logging.WARNING: "MED",
-            logging.DEBUG: "HIGH",
-            logging.NOTSET: "MED",
-        }
-        clamped_logging_level = max(min(logger.level // 10 * 10, 50), 0)
+
         cmd_ref_model = [
             self.tosa_ref_model_path,
             "--test_desc",
             desc_file_path,
             "-l",
-            loglevel_map[clamped_logging_level],
+            _tosa_refmodel_loglevel(logger.level),
         ]
         _run_cmd(cmd_ref_model)
@@ -467,7 +500,10 @@ def run_tosa_ref_model(
 
 def prep_data_for_save(
-    data, is_quantized: bool, input_name: str, quant_param: QuantizationParams
+    data: torch.Tensor,
+    is_quantized: bool,
+    input_name: str,
+    quant_param: QuantizationParams,
 ):
     data_np = np.array(data.detach(), order="C").astype(
         f"{data.dtype}".replace("torch.", "")
     )
@@ -576,7 +612,6 @@ def dbg_tosa_fb_to_json(tosa_fb: bytes) -> Dict:
     assert os.path.exists(
         tosa_schema_file
     ), f"tosa_schema_file: {tosa_schema_file} does not exist"
-    assert shutil.which("flatc") is not None
 
     cmd_flatc = [
         "flatc",
@@ -611,3 +646,19 @@ def dbg_tosa_fb_to_json(tosa_fb: bytes) -> Dict:
             pass
 
     return json_out
+
+
+def _tosa_refmodel_loglevel(loglevel: int) -> str:
+    """Converts a logging loglevel to a tosa_reference_model logging level,
+    returned as a string.
+    """
+    loglevel_map = {
+        logging.INFO: "INFO",
+        logging.CRITICAL: "LOW",
+        logging.ERROR: "LOW",
+        logging.WARNING: "MED",
+        logging.DEBUG: "HIGH",
+        logging.NOTSET: "MED",
+    }
+    clamped_logging_level = max(min(loglevel // 10 * 10, 50), 0)
+    return loglevel_map[clamped_logging_level]
diff --git a/backends/arm/test/tester/arm_tester.py b/backends/arm/test/tester/arm_tester.py
index 6784605bb48..4f9eae64be8 100644
--- a/backends/arm/test/tester/arm_tester.py
+++ b/backends/arm/test/tester/arm_tester.py
@@ -4,6 +4,7 @@
 # LICENSE file in the root directory of this source tree.
 
 import logging
+import tempfile
 
 from collections import Counter
 from pprint import pformat
@@ -35,7 +36,11 @@
 from executorch.backends.xnnpack.test.tester import Tester
 from executorch.devtools.backend_debug import get_delegation_info
 
-from executorch.exir import EdgeCompileConfig, ExecutorchProgramManager
+from executorch.exir import (
+    EdgeCompileConfig,
+    EdgeProgramManager,
+    ExecutorchProgramManager,
+)
 from executorch.exir.backend.compile_spec_schema import CompileSpec
 from executorch.exir.backend.partitioner import Partitioner
 from executorch.exir.lowered_backend_module import LoweredBackendModule
@@ -128,10 +133,15 @@ def __init__(
         super().__init__(dynamic_shapes)
         self.tosa_test_util = tosa_test_util
 
+    def run(self, artifact: EdgeProgramManager, inputs=None):
+        self.executorch_program = artifact.to_executorch(self.config)
+        if module := getattr(
+            artifact.exported_program().graph_module, "lowered_module_0", None
+        ):
+            self.buffer = module.processed_bytes
+
     def run_artifact(self, inputs):
-        tosa_output = self.tosa_test_util.run_tosa_ref_model(
-            inputs=inputs,
-        )
+        tosa_output = self.tosa_test_util.run_tosa_graph(self.buffer, inputs)
         return tosa_output
 
 
@@ -348,7 +358,7 @@ def run_method_and_compare_outputs(
             logger.info(f"Run #{run_iteration}, input shapes: {input_shape_str}")
 
             reference_output = reference_stage.run_artifact(reference_input)
-            test_output = tuple(test_stage.run_artifact(test_input))
+            test_output = test_stage.run_artifact(test_input)
             if (
                 is_nhwc
                 and test_stage == self.stages[self.stage_name(tester.ToExecutorch)]
@@ -515,6 +525,8 @@ def _compare_outputs(
             banner = "=" * 40 + "TOSA debug info" + "=" * 40
             logger.error(banner)
             path_to_tosa_files = self.runner_util.intermediate_path
+            if path_to_tosa_files is None:
+                path_to_tosa_files = tempfile.mkdtemp(prefix="executorch_result_dump_")
 
             export_stage = self.stages.get(self.stage_name(tester.Export), None)
             quantize_stage = self.stages.get(self.stage_name(tester.Quantize), None)
@@ -524,8 +536,8 @@ def _compare_outputs(
                 qp_output = _get_output_quantization_params(
                     export_stage.artifact, output_node
                 )
-                logger.error(f"{qp_input=}")
-                logger.error(f"{qp_output=}")
+                logger.error(f"Input QuantArgs: {qp_input}")
+                logger.error(f"Output QuantArgs: {qp_output}")
             logger.error(f"{path_to_tosa_files=}")
 
             import os
diff --git a/examples/arm/setup.sh b/examples/arm/setup.sh
index ebcc6d66f24..58efc46ae35 100755
--- a/examples/arm/setup.sh
+++ b/examples/arm/setup.sh
@@ -88,7 +88,7 @@ ethos_u_base_rev="24.08"
 
 # tosa reference model
 tosa_reference_model_url="https://review.mlplatform.org/tosa/reference_model"
-tosa_reference_model_rev="f9ea4ab7da19318fe36b1c34d68a3e40fd6e56c5"
+tosa_reference_model_rev="c5570b79e90c3a36ab8c4ddb8ee3fbc2cd3f7c38"
 
 # vela
 vela_repo_url="https://review.mlplatform.org/ml/ethos-u/ethos-u-vela"
@@ -223,64 +223,19 @@ function patch_repo() {
 }
 
 function setup_tosa_reference_model() {
-    # The debug flow on the host includes running on a reference implementation of TOSA
-    # This is useful primarily for debug of quantization accuracy, but also for internal
-    # errors for the early codebase
-    cd "${root_dir}"
-    if [[ ! -e reference_model ]]; then
-        git clone ${tosa_reference_model_url}
-        cd reference_model
-        git checkout ${tosa_reference_model_rev}
-        git submodule update --init --recursive
-        cd ..
-    fi
-    cd reference_model
-    mkdir -p build
-    cd build
-    cmake ..
-
-    # make use of half the cores for building
-    if [[ "${OS}" == "Linux" ]]; then
-        n=$(( $(nproc) / 2 ))
-    elif [[ "${OS}" == "Darwin" ]]; then
-        n=$(( $(sysctl -n hw.logicalcpu) / 2 ))
-    else
-        n=1
-    fi
-
-    if [[ "$n" -lt 1 ]]; then
-        n=1
-    fi
+
+    # reference_model's flatbuffers version clashes with Vela's.
+    # Go with Vela's since it is newer.
+    # Vela's flatbuffers requirement is expected to loosen; remove this workaround then. MLETORCH-565
+    pip install tosa-tools@git+${tosa_reference_model_url}@${tosa_reference_model_rev} --no-dependencies flatbuffers
 
-    make -j"${n}"
-    cd reference_model
-    tosa_bin_path=`pwd`
-    echo "export PATH=\${PATH}:${tosa_bin_path}" >> "${setup_path_script}"
 }
 
 function setup_vela() {
     #
     # Prepare the Vela compiler for AoT to Ethos-U compilation
     #
-    cd "${root_dir}"
-    if [[ ! -e ethos-u-vela ]]; then
-        git clone ${vela_repo_url}
-        repo_dir="${root_dir}/ethos-u-vela"
-        base_rev=${vela_rev}
-        patch_repo
-    fi
-    cd "${root_dir}/ethos-u-vela"
-
-    # different command for conda vs venv
-    VNV=$(python3 -c "import sys; print('venv') if (sys.prefix != sys.base_prefix) else print('not_venv')")
-    if [ ${VNV} == "venv" ]; then
-        pip install .
-    else
-        # if not venv, we need the site-path where the vela
-        vela_path=$(python -c "import site; print(site.USER_BASE+'/bin')")
-        echo "export PATH=\${PATH}:${vela_path}" >> ${setup_path_script}
-        pip install . --user
-    fi
+    pip install ethos-u-vela@git+${vela_repo_url}@${vela_rev}
 }
 
 ########
diff --git a/setup.py b/setup.py
index ff1afa89bd6..41170f09c9d 100644
--- a/setup.py
+++ b/setup.py
@@ -706,10 +706,6 @@ def get_ext_modules() -> List[Extension]:
         "executorch/devtools/bundled_program": "devtools/bundled_program",
         "executorch/runtime": "runtime",
         "executorch/util": "util",
-        # Note: This will install a top-level module called "serializer",
-        # which seems too generic and might conflict with other pip packages.
-        "serializer": "backends/arm/third-party/serialization_lib/python/serializer",
-        "tosa": "backends/arm/third-party/serialization_lib/python/tosa",
     },
     cmdclass={
         "build": CustomBuild,
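
For context when reviewing: with this change the overridden run() captures the delegate's processed_bytes (the serialized TOSA flatbuffer) from the lowered module, and run_artifact() hands it to RunnerUtil.run_tosa_graph(), which drives the tosa_reference_model Python bindings in-process instead of shelling out to the tosa_reference_model binary with a desc.json file. Below is a minimal sketch of a test that exercises this path; it is an illustration only, assuming ArmTester is importable from backends/arm/test/tester/arm_tester.py and that the helpers keep the signatures used elsewhere in this patch. The Add module and its inputs are made up for the example.

# Illustrative sketch only -- not part of the diff above.
import torch

from executorch.backends.arm.test import common
from executorch.backends.arm.test.tester.arm_tester import ArmTester


class Add(torch.nn.Module):  # hypothetical example module
    def forward(self, x, y):
        return x + y


example_inputs = (torch.ones(1, 3, 4, 4), torch.ones(1, 3, 4, 4))

(
    ArmTester(
        Add(),
        example_inputs=example_inputs,
        compile_spec=common.get_tosa_compile_spec(
            "TOSA-0.80.0+MI", permute_memory_to_nhwc=True
        ),
    )
    .export()
    .to_edge()
    .partition()
    .to_executorch()
    # With this patch, the comparison below feeds the captured TOSA flatbuffer
    # straight into tosa_reference_model.run() (via run_tosa_graph) rather than
    # invoking the tosa_reference_model executable.
    .run_method_and_compare_outputs(inputs=example_inputs)
)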