diff --git a/backends/arm/scripts/build_executorch.sh b/backends/arm/scripts/build_executorch.sh index f868d264f48..798aa627d65 100755 --- a/backends/arm/scripts/build_executorch.sh +++ b/backends/arm/scripts/build_executorch.sh @@ -16,18 +16,17 @@ et_root_dir=$(realpath ${et_root_dir}) toolchain_cmake=${script_dir}/../../../examples/arm/ethos-u-setup/arm-none-eabi-gcc.cmake toolchain_cmake=$(realpath ${toolchain_cmake}) - - et_build_root="${et_root_dir}/arm_test" build_type="Release" +build_devtools=false build_with_etdump=false - help() { echo "Usage: $(basename $0) [options]" echo "Options:" echo " --et_build_root= Build output root folder to use, defaults to ${et_build_root}" echo " --build_type= Build with Release, Debug or RelWithDebInfo, default is ${build_type}" + echo " --devtools Build Devtools libs" echo " --etdump Adds Devtools etdump support to track timing, etdump area will be base64 encoded in the log" exit 0 } @@ -37,6 +36,7 @@ for arg in "$@"; do -h|--help) help ;; --et_build_root=*) et_build_root="${arg#*=}";; --build_type=*) build_type="${arg#*=}";; + --devtools) build_devtools=true ;; --etdump) build_with_etdump=true ;; *) ;; @@ -44,25 +44,25 @@ for arg in "$@"; do done et_build_dir="${et_build_root}/cmake-out" + +# Used for the flatcc host executable if Devtools is used et_build_host_dir=${et_build_root}/cmake-out-host-tools set -x cd "${et_root_dir}" -build_with_etdump_flags="" if [ "$build_with_etdump" = true ] ; then ( set +x ; echo "--------------------------------------------------------------------------------" ; - echo "Build ExecuTorch Libraries host flatcc bin ${build_type} into ${et_build_host_dir} - ${et_build_host_dir}/bin/flatcc" ; + echo "Build ExecuTorch Libraries host flatcc bin ${build_type} into ${et_build_host_dir}/bin/flatcc" ; echo "--------------------------------------------------------------------------------" ) - # Build host flatcc bin # This is a way to work around that the flatcc executable get build for target (e.g. Arm) later # and get replaced. flatcc is a tool used on the host for etdump and BundleIO handling. # The way to solve this is to generate it once for the host, then copy it to ${et_build_host_dir}/bin # and later point that out with -DFLATCC_EXECUTABLE=${et_build_host_dir}/bin/flatcc later.
- mkdir -p ${et_build_host_dir} + cmake \ -DCMAKE_INSTALL_PREFIX=${et_build_host_dir} \ -DCMAKE_BUILD_TYPE=${build_type} \ @@ -79,18 +79,13 @@ if [ "$build_with_etdump" = true ] ; then -B"${et_build_host_dir}" \ "${et_root_dir}" - # Copy host flatcc excutable to it's saved when we build for target (Arm) later + # third-party/flatcc/bin/flatcc is already built in the cmake config step above, + # so no separate cmake build step is needed here + + # Copy the host flatcc executable so it's saved when we build for target (Arm) later + et_build_host_dir=$(realpath ${et_build_host_dir}) mkdir -p ${et_build_host_dir}/bin cp third-party/flatcc/bin/flatcc ${et_build_host_dir}/bin - - # Add DevTools flags use in the Target build below - build_with_etdump_flags="-DEXECUTORCH_BUILD_DEVTOOLS=ON \ - -DEXECUTORCH_ENABLE_EVENT_TRACER=ON \ - -DEXECUTORCH_SEPARATE_FLATCC_HOST_PROJECT=OFF \ - -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=OFF \ - -DFLATCC_ALLOW_WERROR=OFF \ - -DFLATCC_EXECUTABLE=${et_build_host_dir}/bin/flatcc " - echo "build_with_etdump_flags=$build_with_etdump_flags" fi ( set +x ; @@ -98,6 +93,25 @@ fi echo "Build ExecuTorch target libs ${build_type} into '${et_build_dir}'" ; echo "--------------------------------------------------------------------------------" ) +build_devtools_flags=" -DEXECUTORCH_BUILD_DEVTOOLS=OFF " +if [ "$build_devtools" = true ] ; then + build_devtools_flags=" -DEXECUTORCH_BUILD_DEVTOOLS=ON " +fi + +build_with_etdump_flags=" -DEXECUTORCH_ENABLE_EVENT_TRACER=OFF " +if [ "$build_with_etdump" = true ] ; then + # Add DevTools flags used in the Target build below + build_with_etdump_flags="-DEXECUTORCH_BUILD_DEVTOOLS=ON \ + -DEXECUTORCH_ENABLE_EVENT_TRACER=ON \ + -DEXECUTORCH_SEPARATE_FLATCC_HOST_PROJECT=OFF \ + -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=OFF \ + -DFLATCC_ALLOW_WERROR=OFF \ + -DFLATCC_EXECUTABLE=${et_build_host_dir}/bin/flatcc " +fi + +echo "Building with Devtools: ${build_devtools_flags} ${build_with_etdump_flags}" + + # Build cmake \ -DCMAKE_INSTALL_PREFIX=${et_build_dir} \ @@ -108,6 +122,7 @@ cmake \ -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \ -DEXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL=ON \ -DEXECUTORCH_ENABLE_LOGGING=ON \ + ${build_devtools_flags} \ ${build_with_etdump_flags} \ -DFLATC_EXECUTABLE="$(which flatc)" \ -B"${et_build_dir}" \ diff --git a/backends/arm/scripts/build_executorch_runner.sh b/backends/arm/scripts/build_executorch_runner.sh index afa8f27bdff..3e658928274 100755 --- a/backends/arm/scripts/build_executorch_runner.sh +++ b/backends/arm/scripts/build_executorch_runner.sh @@ -15,6 +15,7 @@ pte_file="" target="ethos-u55-128" build_type="Release" system_config="" +bundleio=false build_with_etdump=false extra_build_flags="" output_folder_set=false @@ -22,6 +23,9 @@ output_folder="." et_build_root="${et_root_dir}/arm_test" ethosu_tools_dir=${et_root_dir}/examples/arm/ethos-u-scratch +build_bundleio_flags=" -DET_BUNDLE_IO=OFF " +build_with_etdump_flags=" -DEXECUTORCH_ENABLE_EVENT_TRACER=OFF " + help() { echo "Usage: $(basename $0) [options]" echo "Options:" @@ -30,6 +34,7 @@ help() { echo " --build_type= Build with Release, Debug or RelWithDebInfo, default is ${build_type}" echo " --system_config= System configuration to select from the Vela configuration file (see vela.ini). Default: Ethos_U55_High_End_Embedded for EthosU55 targets, Ethos_U85_SYS_DRAM_Mid for EthosU85 targets." echo " NOTE: If given, this option must match the given target. This option also sets timing adapter values customized for specific hardware, see ./executor_runner/CMakeLists.txt."
+ echo " --bundleio Support both pte and BundleIO bpte using Devtools BundleIO with Input/RefOutput included" echo " --etdump Adds Devtools etdump support to track timing, etdump area will be base64 encoded in the log" echo " --extra_build_flags= Extra flags to pass to cmake like -DET_ARM_BAREMETAL_METHOD_ALLOCATOR_POOL_SIZE=60000 Default: none " echo " --output= Output folder Default: /_.pte" @@ -45,6 +50,7 @@ for arg in "$@"; do --target=*) target="${arg#*=}";; --build_type=*) build_type="${arg#*=}";; --system_config=*) system_config="${arg#*=}";; + --bundleio) bundleio=true ;; --etdump) build_with_etdump=true ;; --extra_build_flags=*) extra_build_flags="${arg#*=}";; --output=*) output_folder="${arg#*=}" ; output_folder_set=true ;; @@ -64,9 +70,8 @@ et_build_dir=${et_build_root}/cmake-out et_build_dir=$(realpath ${et_build_dir}) if [ "$output_folder_set" = false ] ; then - pte_folder=$(cd -- "$( dirname -- "${pte_file}" )" &> /dev/null && pwd) - pte_short_name=$(basename -- "${pte_file}" ".pte") - output_folder="$pte_folder/$pte_short_name" + # remove file ending + output_folder=${pte_file%.*} fi if [[ ${system_config} == "" ]] @@ -86,18 +91,21 @@ else target_cpu=cortex-m85 fi echo "--------------------------------------------------------------------------------" -echo "Build Arm Baremetal executor_runner for ${target} with ${pte_file} using ${system_config} to '${output_folder}/cmake-out'" +echo "Build Arm Baremetal executor_runner for ${target} with ${pte_file} using ${system_config} ${extra_build_flags} to '${output_folder}/cmake-out'" echo "--------------------------------------------------------------------------------" cd ${et_root_dir}/examples/arm/executor_runner -build_with_etdump_flags="" +if [ "$bundleio" = true ] ; then + build_bundleio_flags=" -DET_BUNDLE_IO=ON " +fi + if [ "$build_with_etdump" = true ] ; then - echo "Building with etdump e.g.
-DEXECUTORCH_ENABLE_EVENT_TRACER=ON" build_with_etdump_flags=" -DEXECUTORCH_ENABLE_EVENT_TRACER=ON " fi -mkdir -p "$output_folder" +echo "Building with BundleIO/etdump/extra flags: ${build_bundleio_flags} ${build_with_etdump_flags} ${extra_build_flags}" +mkdir -p "${output_folder}" cmake \ -DCMAKE_BUILD_TYPE=${build_type} \ @@ -105,9 +113,10 @@ cmake \ -DTARGET_CPU=${target_cpu} \ -DET_DIR_PATH:PATH=${et_root_dir} \ -DET_BUILD_DIR_PATH:PATH=${et_build_dir} \ - -DET_PTE_FILE_PATH:PATH="${pte_file}" \ + -DET_PTE_FILE_PATH:PATH="${pte_file}" \ -DETHOS_SDK_PATH:PATH=${ethos_u_root_dir} \ -DETHOSU_TARGET_NPU_CONFIG=${target} \ + ${build_bundleio_flags} \ ${build_with_etdump_flags} \ -DPYTHON_EXECUTABLE=$(which python3) \ -DSYSTEM_CONFIG=${system_config} \ diff --git a/backends/arm/scripts/run_fvp.sh b/backends/arm/scripts/run_fvp.sh index 568f07011f2..e0237a9c414 100755 --- a/backends/arm/scripts/run_fvp.sh +++ b/backends/arm/scripts/run_fvp.sh @@ -19,12 +19,14 @@ _setup_msg="please refer to ${et_root_dir}/examples/arm/setup.sh to properly ins elf_file="" target="ethos-u55-128" +timeout="240" help() { echo "Usage: $(basename $0) [options]" echo "Options:" echo " --elf= elf file to run" echo " --target= Target to build and run for Default: ${target}" + echo " --timeout= Maximum target runtime, used to detect hanging, might need to be higer on large models Default: ${timeout}" exit 0 } @@ -33,6 +35,7 @@ for arg in "$@"; do -h|--help) help ;; --elf=*) elf_file="${arg#*=}";; --target=*) target="${arg#*=}";; + --timeout=*) timeout="${arg#*=}";; *) ;; esac @@ -63,6 +66,7 @@ num_macs=$(echo ${target} | cut -d - -f 3) echo "--------------------------------------------------------------------------------" echo "Running ${elf_file} for ${target} run with FVP:${fvp_model} num_macs:${num_macs}" +echo "WARNING: Corstone FVP is not cycle accurate and should NOT be used to determine valid runtime" echo "--------------------------------------------------------------------------------" log_file=$(mktemp) @@ -75,7 +79,7 @@ if [[ ${target} == *"ethos-u55"* ]]; then -C mps3_board.uart0.out_file='-' \ -C mps3_board.uart0.shutdown_on_eot=1 \ -a "${elf_file}" \ - --timelimit 220 2>&1 | tee ${log_file} || true # seconds + --timelimit ${timeout} 2>&1 | tee ${log_file} || true # seconds echo "[${BASH_SOURCE[0]}] Simulation complete, $?" elif [[ ${target} == *"ethos-u85"* ]]; then ${fvp_model} \ @@ -86,7 +90,7 @@ elif [[ ${target} == *"ethos-u85"* ]]; then -C mps4_board.uart0.out_file='-' \ -C mps4_board.uart0.shutdown_on_eot=1 \ -a "${elf_file}" \ - --timelimit 220 2>&1 | tee ${log_file} || true # seconds + --timelimit ${timeout} 2>&1 | tee ${log_file} || true # seconds echo "[${BASH_SOURCE[0]}] Simulation complete, $?" 
else echo "Running ${elf_file} for ${target} is not supported" diff --git a/backends/arm/test/test_arm_baremetal.sh b/backends/arm/test/test_arm_baremetal.sh index 6c2784501b0..90b34241f3d 100755 --- a/backends/arm/test/test_arm_baremetal.sh +++ b/backends/arm/test/test_arm_baremetal.sh @@ -92,18 +92,18 @@ test_run_ethosu_fvp() { # End to End model tests using run.sh # TOSA quantized echo "${TEST_SUITE_NAME}: Test ethos-u target TOSA" - examples/arm/run.sh --target=TOSA --model_name=add - examples/arm/run.sh --target=TOSA --model_name=mul + examples/arm/run.sh --et_build_root=arm_test/test_run --target=TOSA --model_name=add + examples/arm/run.sh --et_build_root=arm_test/test_run --target=TOSA --model_name=mul # Ethos-U55 echo "${TEST_SUITE_NAME}: Test ethos-u target Ethos-U55" - examples/arm/run.sh --target=ethos-u55-128 --model_name=add - examples/arm/run.sh --target=ethos-u55-128 --model_name=mul + examples/arm/run.sh --et_build_root=arm_test/test_run --target=ethos-u55-128 --model_name=add + examples/arm/run.sh --et_build_root=arm_test/test_run --target=ethos-u55-128 --model_name=mul # Ethos-U85 echo "${TEST_SUITE_NAME}: Test ethos-u target Ethos-U85" - examples/arm/run.sh --target=ethos-u85-128 --model_name=add - examples/arm/run.sh --target=ethos-u85-128 --model_name=mul + examples/arm/run.sh --et_build_root=arm_test/test_run --target=ethos-u85-128 --model_name=add + examples/arm/run.sh --et_build_root=arm_test/test_run --target=ethos-u85-128 --model_name=mul echo "${TEST_SUITE_NAME}: PASS" } @@ -113,26 +113,26 @@ test_models_ethosu_fvp() { # End to End model tests using model_test.py source examples/arm/ethos-u-scratch/setup_path.sh # Build common libs once - python3 backends/arm/test/test_model.py --build_libs + python3 backends/arm/test/test_model.py --test_output=arm_test/test_model --build_libs # TOSA quantized echo "${TEST_SUITE_NAME}: Test ethos-u target TOSA" - python3 backends/arm/test/test_model.py --target=TOSA --model=mv2 - python3 backends/arm/test/test_model.py --target=TOSA --model=mv3 - python3 backends/arm/test/test_model.py --target=TOSA --model=lstm - python3 backends/arm/test/test_model.py --target=TOSA --model=edsr + python3 backends/arm/test/test_model.py --test_output=arm_test/test_model --target=TOSA --model=mv2 + python3 backends/arm/test/test_model.py --test_output=arm_test/test_model --target=TOSA --model=mv3 + python3 backends/arm/test/test_model.py --test_output=arm_test/test_model --target=TOSA --model=lstm + python3 backends/arm/test/test_model.py --test_output=arm_test/test_model --target=TOSA --model=edsr # Ethos-U55 echo "${TEST_SUITE_NAME}: Test ethos-u target Ethos-U55" - python3 backends/arm/test/test_model.py --target=ethos-u55-128 --model=mv2 - python3 backends/arm/test/test_model.py --target=ethos-u55-64 --model=mv3 - python3 backends/arm/test/test_model.py --target=ethos-u55-256 --model=lstm + python3 backends/arm/test/test_model.py --test_output=arm_test/test_model --target=ethos-u55-128 --model=mv2 --extra_flags="-DET_ATOL=1.20 -DET_RTOL=1.20" + python3 backends/arm/test/test_model.py --test_output=arm_test/test_model --target=ethos-u55-64 --model=mv3 --extra_flags="-DET_ATOL=5.00 -DET_RTOL=5.00" + python3 backends/arm/test/test_model.py --test_output=arm_test/test_model --target=ethos-u55-256 --model=lstm --extra_flags="-DET_ATOL=0.02 -DET_RTOL=0.02" # Ethos-U85 echo "${TEST_SUITE_NAME}: Test ethos-u target Ethos-U85" - python3 backends/arm/test/test_model.py --target=ethos-u85-256 --model=mv2 - python3 backends/arm/test/test_model.py 
--target=ethos-u85-1024 --model=mv3 - python3 backends/arm/test/test_model.py --target=ethos-u85-128 --model=lstm + python3 backends/arm/test/test_model.py --test_output=arm_test/test_model --target=ethos-u85-256 --model=mv2 --extra_flags="-DET_ATOL=1.20 -DET_RTOL=1.20" + python3 backends/arm/test/test_model.py --test_output=arm_test/test_model --target=ethos-u85-1024 --model=mv3 --extra_flags="-DET_ATOL=5.00 -DET_RTOL=5.00" + python3 backends/arm/test/test_model.py --test_output=arm_test/test_model --target=ethos-u85-128 --model=lstm --extra_flags="-DET_ATOL=0.02 -DET_RTOL=0.02" echo "${TEST_SUITE_NAME}: PASS" } @@ -146,4 +146,4 @@ test_full_ethosu_fvp() { # All End to End model tests -${TEST_SUITE} \ No newline at end of file +${TEST_SUITE} diff --git a/backends/arm/test/test_model.py b/backends/arm/test/test_model.py index 990b9e5f70b..b94a5f65256 100755 --- a/backends/arm/test/test_model.py +++ b/backends/arm/test/test_model.py @@ -56,7 +56,12 @@ def get_args(): default=False, help="Don't save temporary files during compilation", ) - + parser.add_argument( + "--extra_flags", + required=False, + default=None, + help="Extra cmake flags to pass when building the executor_runner", + ) args = parser.parse_args() if args.model and "ethos-u" in args.target and args.system_config is None: @@ -95,6 +100,8 @@ def build_libs(et_build_root: str, script_path: str): os.path.join(script_path, "build_executorch.sh"), f"--et_build_root={et_build_root}", "--build_type=Release", + "--devtools", + "--etdump", ] ) run_external_cmd( @@ -148,6 +155,7 @@ def build_pte( "examples.arm.aot_arm_compiler", "--delegate", "--quantize", + "--bundleio", intermediate, f"--model_name={model_name}", f"--target={target}", @@ -158,7 +166,7 @@ ] ) - pte_file = os.path.join(output, f"{model_name}_arm_delegate_{args.target}.pte") + pte_file = os.path.join(output, f"{model_name}_arm_delegate_{args.target}.bpte") return pte_file @@ -168,17 +176,26 @@ def build_ethosu_runtime( pte_file: str, target: str, system_config: str, + extra_flags: str, elf_build_path: str, ): + + extra_build_flag = "" + if extra_flags: + extra_build_flag = f"--extra_build_flags={extra_flags}" + run_external_cmd( [ "bash", os.path.join(script_path, "build_executorch_runner.sh"), f"--et_build_root={et_build_root}", f"--pte={pte_file}", + "--bundleio", + "--etdump", f"--target={target}", "--build_type=Release", f"--system_config={system_config}", + extra_build_flag, f"--output={elf_build_path}", ] ) @@ -239,6 +256,7 @@ def run_elf_with_fvp(script_path: str, elf_file: str, target: str): pte_file, args.target, args.system_config, + args.extra_flags, elf_build_path, ) print(f"ELF file created: {elf_file} ") diff --git a/examples/arm/aot_arm_compiler.py b/examples/arm/aot_arm_compiler.py index f7f2105b99c..1f224983d4e 100644 --- a/examples/arm/aot_arm_compiler.py +++ b/examples/arm/aot_arm_compiler.py @@ -13,9 +13,10 @@ import os from pathlib import Path -from typing import Any, Dict, Optional, Tuple +from typing import Any, Dict, List, Optional, Tuple import torch +from examples.devtools.scripts.export_bundled_program import save_bundled_program from executorch.backends.arm.arm_backend import ( ArmCompileSpecBuilder, get_tosa_spec, @@ -36,6 +37,8 @@ MobileNetV2Evaluator, ) from executorch.devtools.backend_debug import get_delegation_info +from executorch.devtools.bundled_program.config import MethodTestCase, MethodTestSuite + from executorch.exir import ( EdgeCompileConfig, ExecutorchBackendConfig, @@ -56,27 +59,50 @@
logging.basicConfig(level=logging.WARNING, format=FORMAT) -def get_model_and_inputs_from_name(model_name: str) -> Tuple[torch.nn.Module, Any]: +def get_model_and_inputs_from_name( + model_name: str, model_input: str | None +) -> Tuple[torch.nn.Module, Any]: """Given the name of an example pytorch model, return it and example inputs. Raises RuntimeError if there is no example model corresponding to the given name. """ + example_inputs = None + if model_input is not None: + logging.info(f"Load model input from {model_input}") + if model_input.endswith(".pt"): + example_inputs = torch.load(model_input, weights_only=False) + else: + raise RuntimeError( + f"Model input data '{model_input}' is not a valid name. Use --model_input .pt e.g. saved with torch.save()" + ) + # Case 1: Model is defined in this file if model_name in models.keys(): + logging.info(f"Internal model {model_name}") model = models[model_name]() - example_inputs = models[model_name].example_input + if example_inputs is None: + example_inputs = models[model_name].example_input # Case 2: Model is defined in examples/models/ elif model_name in MODEL_NAME_TO_MODEL.keys(): logging.warning( "Using a model from examples/models not all of these are currently supported" ) - model, example_inputs, _, _ = EagerModelFactory.create_model( + logging.info( + f"Load {model_name} -> {MODEL_NAME_TO_MODEL[model_name]} from examples/models" + ) + + model, tmp_example_inputs, _, _ = EagerModelFactory.create_model( *MODEL_NAME_TO_MODEL[model_name] ) + if example_inputs is None: + example_inputs = tmp_example_inputs # Case 3: Model is in an external python file loaded as a module. # ModelUnderTest should be a torch.nn.module instance # ModelInputs should be a tuple of inputs to the forward function elif model_name.endswith(".py"): + logging.info( + f"Load model file {model_name} Variable ModelUnderTest= ModelInputs=" + ) import importlib.util # load model's module and add it @@ -84,13 +110,22 @@ def get_model_and_inputs_from_name(model_name: str) -> Tuple[torch.nn.Module, An module = importlib.util.module_from_spec(spec) spec.loader.exec_module(module) model = module.ModelUnderTest - example_inputs = module.ModelInputs - + if example_inputs is None: + example_inputs = module.ModelInputs + # Case 4: Model is in an saved model file torch.save(model) + elif model_name.endswith(".pth") or model_name.endswith(".pt"): + logging.info(f"Load model file {model_name}") + model = torch.load(model_name, weights_only=False) + if example_inputs is None: + raise RuntimeError( + f"Model '{model_name}' requires input data specify --model_input .pt" + ) else: raise RuntimeError( f"Model '{model_name}' is not a valid name. Use --help for a list of available models." ) - + logging.debug(f"Loaded model: {model}") + logging.debug(f"Loaded input: {example_inputs}") return model, example_inputs @@ -107,7 +142,7 @@ def quantize( logging.debug(f"Original model: {model}") quantizer = None if is_ethosu(compile_specs): - quantizer = EthosUQuantizer(compile_spec) + quantizer = EthosUQuantizer(compile_specs) elif is_tosa(compile_specs): quantizer = TOSAQuantizer(get_tosa_spec(compile_specs)) else: @@ -365,13 +400,19 @@ def dump_delegation_info(edge, intermediate_files_folder: Optional[str] = None): file.write(delegation_info_string) -def get_args(): # noqa C901 +def get_args(): parser = argparse.ArgumentParser() parser.add_argument( "-m", "--model_name", required=True, - help=f"Provide model name. 
Valid ones: {set(list(models.keys())+list(MODEL_NAME_TO_MODEL.keys()))}", + help=f"Model file .py/.pth/.pt, builtin model or a model from examples/models. Valid names: {set(list(models.keys())+list(MODEL_NAME_TO_MODEL.keys()))}", + ) + parser.add_argument( + "--model_input", + required=False, + default=None, + help="Provide model input .pt file, or python variable name", ) parser.add_argument( "-d", @@ -381,6 +422,13 @@ def get_args(): # noqa C901 default=False, help="Flag for producing ArmBackend delegated model", ) + parser.add_argument( + "--bundleio", + action="store_true", + required=False, + default=False, + help="Flag for producing BundleIO bpte file with input/output test/ref data.", + ) parser.add_argument( "-t", "--target", @@ -436,7 +484,7 @@ def get_args(): # noqa C901 "--output", action="store", required=False, - help="Location for outputs, if not the default of cwd.", + help="Filename (if .pte or .bpte is used) or a folder for outputs, if not specified the default is to place files in cwd.", ) parser.add_argument( "--system_config", @@ -468,10 +516,6 @@ def get_args(): # noqa C901 + "This is required for running quantized models with unquantized input." ) - if args.quantize and not args.delegate: - logging.error("--delegate must be set when using --quanitze flag.") - exit(1) - # if we have custom ops, register them before processing the model if args.so_library is not None: logging.info(f"Loading custom ops from {args.so_library}") @@ -503,12 +547,136 @@ def get_args(): # noqa C901 return args -if __name__ == "__main__": +def save_bpte_program(exec_prog, original_model: torch.nn.Module, output_name: str): + # Construct MethodTestSuite for Each Method + + # Generate Test Suites + method_names = [ + method.name for method in exec_prog.executorch_program.execution_plan + ] + + program_inputs = {m_name: [example_inputs] for m_name in method_names} + + method_test_suites: List[MethodTestSuite] = [] + for m_name in method_names: + method_inputs = program_inputs[m_name] + + # To create a bundled program, we first create every test cases from input. We leverage eager model + # to generate expected output for each test input, and use MethodTestCase to hold the information of + # each test case. We gather all MethodTestCase for same method into one MethodTestSuite, and generate + # bundled program by all MethodTestSuites. 
+ method_test_cases: List[MethodTestCase] = [] + + if args.intermediates: + # Save model.pth + intermediates_path = Path(args.intermediates) + model_path = os.path.join(intermediates_path, "model.pth") + try: + torch.save(original_model, model_path) + except: + logging.warning(f"Could not torch.save(model, {model_path})") + method_index = 0 + for method_input in method_inputs: + output_ref = original_model(*method_input) + + logging.debug(f"input_{method_index}: {method_input}") + logging.debug(f"output_ref_{method_index}: {output_ref}") + + if args.intermediates: + # Save model input and referece output + input_path = os.path.join( + intermediates_path, f"input_{method_index}.pt" + ) + try: + torch.save(method_input, input_path) + except: + logging.warning( + f"Could not torch.save(input_{method_index}, {input_path})" + ) + refoutput_path = os.path.join( + intermediates_path, f"output_ref_{method_index}.pt" + ) + try: + torch.save(output_ref, refoutput_path) + except: + logging.warning( + f"Could not torch.save(output_ref_{method_index}, {refoutput_path})" + ) + + method_test_cases.append( + MethodTestCase( + inputs=method_input, + expected_outputs=output_ref, + ) + ) + + method_index = method_index + 1 + + method_test_suites.append( + MethodTestSuite( + method_name=m_name, + test_cases=method_test_cases, + ) + ) + + # Generate BundledProgram + save_bundled_program(exec_prog, method_test_suites, output_name) + + +def to_edge_TOSA_delegate( + args, + model: torch.nn.Module, +): + model_int8 = None + # As we can target multiple output encodings, one must + # be specified. + compile_spec = get_compile_spec( + args.target, + args.intermediates, + args.system_config, + args.memory_mode, + ) + if args.quantize: + model = quantize( + model, + args.model_name, + compile_spec, + example_inputs, + args.evaluate, + args.evaluate_config, + ) + model_int8 = model + # Wrap quantized model back into an exported_program + exported_program = torch.export.export_for_training(model, example_inputs) + + if args.intermediates: + os.makedirs(args.intermediates, exist_ok=True) + + if is_ethosu(compile_spec): + partitioner = EthosUPartitioner(compile_spec) + elif is_tosa(compile_spec): + partitioner = TOSAPartitioner(compile_spec) + else: + raise RuntimeError(f"Unhandled compile spec: {compile_spec}") + + edge = to_edge_transform_and_lower( + exported_program, + partitioner=[partitioner], + compile_config=EdgeCompileConfig( + _check_ir_validity=False, + ), + ) + return model_int8, edge + + +if __name__ == "__main__": # noqa: C901 args = get_args() # Pick model from one of the supported lists - model, example_inputs = get_model_and_inputs_from_name(args.model_name) - model = model.eval() + original_model, example_inputs = get_model_and_inputs_from_name( + args.model_name, args.model_input + ) + model = original_model.eval() # export_for_training under the assumption we quantize, the exported form also works # in to_edge if we don't quantize @@ -519,44 +687,7 @@ def get_args(): # noqa C901 # Quantize if required model_int8 = None if args.delegate: - # As we can target multiple output encodings, one must - # be specified. 
- compile_spec = get_compile_spec( - args.target, - args.intermediates, - args.system_config, - args.memory_mode, - ) - if args.quantize: - model = quantize( - model, - args.model_name, - compile_spec, - example_inputs, - args.evaluate, - args.evaluate_config, - ) - model_int8 = model - # Wrap quantized model back into an exported_program - exported_program = torch.export.export_for_training(model, example_inputs) - - if args.intermediates: - os.makedirs(args.intermediates, exist_ok=True) - - if is_ethosu(compile_spec): - partitioner = EthosUPartitioner(compile_spec) - elif is_tosa(compile_spec): - partitioner = TOSAPartitioner(compile_spec) - else: - raise RuntimeError(f"Unhandled compile spec: {compile_spec}") - - edge = to_edge_transform_and_lower( - exported_program, - partitioner=[partitioner], - compile_config=EdgeCompileConfig( - _check_ir_validity=False, - ), - ) + model_int8, edge = to_edge_TOSA_delegate(args, model) else: edge = to_edge_transform_and_lower( exported_program, @@ -587,11 +718,33 @@ def get_args(): # noqa C901 else f"_arm_{args.target}" ) + if args.bundleio: + output_name = f"{output_name}.bpte" + else: + output_name = f"{output_name}.pte" + if args.output is not None: - output_name = os.path.join(args.output, output_name) + if args.output.endswith(".pte") or args.output.endswith(".bpte"): + # --output is a pte or bundle pte filename use it as output name + if args.bundleio and not args.output.endswith(".bpte"): + raise RuntimeError( + f"--bundleio expects a .bpte file ending to --output and not .pte {args.output}" + ) + if not args.bundleio and not args.output.endswith(".pte"): + raise RuntimeError( + f"When not using --bundleio a .bpte file should not be use as --output {args.output}" + ) + output_name = args.output + else: + # --output is a folder + output_name = os.path.join(args.output, output_name) - save_pte_program(exec_prog, output_name) - print(f"PTE file saved as {output_name}.pte") + if args.bundleio: + save_bpte_program(exec_prog, original_model, output_name) + print(f"Bundle PTE file saved as {output_name}") + else: + save_pte_program(exec_prog, output_name) + print(f"PTE file saved as {output_name}") if args.evaluate: evaluate_model( diff --git a/examples/arm/executor_runner/CMakeLists.txt b/examples/arm/executor_runner/CMakeLists.txt index d43a7047080..11891e2fb93 100644 --- a/examples/arm/executor_runner/CMakeLists.txt +++ b/examples/arm/executor_runner/CMakeLists.txt @@ -9,11 +9,14 @@ project(arm_executor_runner) option(SEMIHOSTING "Enable semihosting" OFF) option(ET_ARM_BAREMETAL_METHOD_ALLOCATOR_POOL_SIZE "Set ET_ARM_BAREMETAL_METHOD_ALLOCATOR_POOL_SIZE to specify memory alloction pool size" OFF) option(ET_ARM_BAREMETAL_TEMP_ALLOCATOR_POOL_SIZE "Set ET_ARM_BAREMETAL_TEMP_ALLOCATOR_POOL_SIZE to specify temp alloction pool size" OFF) +option(ET_BUNDLE_IO "Set to compile in BundleIO support" OFF) +option(ET_ATOL "Set atol to use for BundleIO testing" OFF) +option(ET_RTOL "Set rtol to use for BundleIO testing" OFF) if(NOT DEFINED ET_PTE_FILE_PATH AND NOT ${SEMIHOSTING}) message( FATAL_ERROR - "ET_PTE_FILE_PATH must specify a model .pte, for bare metal systems the " + "ET_PTE_FILE_PATH must specify a model .pte or .bpte, for bare metal systems the " "model is built into the binary." 
) endif() @@ -373,6 +376,18 @@ if(EXECUTORCH_ENABLE_EVENT_TRACER) ) endif() +if(ET_BUNDLE_IO) + add_library(bundled_program STATIC IMPORTED) + set_property( + TARGET bundled_program + PROPERTY IMPORTED_LOCATION + "${ET_BUILD_DIR_PATH}/lib/libbundled_program.a" + ) + list(APPEND arm_executor_runner_link + bundled_program + ) +endif() + # Need whole-archive to ensure C++ ctor's are called - this may be wasteful for # bin size as we link in a number of other symbols target_link_libraries( @@ -402,6 +417,18 @@ if(ET_ARM_BAREMETAL_TEMP_ALLOCATOR_POOL_SIZE) target_compile_definitions(arm_executor_runner PUBLIC ET_ARM_BAREMETAL_TEMP_ALLOCATOR_POOL_SIZE=${ET_ARM_BAREMETAL_TEMP_ALLOCATOR_POOL_SIZE}) endif() +if(ET_BUNDLE_IO) + target_compile_definitions(arm_executor_runner PUBLIC -DET_BUNDLE_IO) +endif() + +if(ET_ATOL) + target_compile_definitions(arm_executor_runner PUBLIC ET_ATOL=${ET_ATOL}) +endif() + +if(ET_RTOL) + target_compile_definitions(arm_executor_runner PUBLIC ET_RTOL=${ET_RTOL}) +endif() + # Fixup compilation of retarget.c if(SEMIHOSTING) # Remove this when MLBEDSW-8910 is closed. diff --git a/examples/arm/executor_runner/arm_executor_runner.cpp b/examples/arm/executor_runner/arm_executor_runner.cpp index 2d08f733eba..48237acdf22 100644 --- a/examples/arm/executor_runner/arm_executor_runner.cpp +++ b/examples/arm/executor_runner/arm_executor_runner.cpp @@ -1,17 +1,12 @@ /* Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. - * Copyright 2023-2024 Arm Limited and/or its affiliates. + * Copyright 2023-2025 Arm Limited and/or its affiliates. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #include -#include -#include -#include -#include - #include #include #include @@ -19,8 +14,17 @@ #include #include #include +#include +#include +#include +#include #include "arm_perf_monitor.h" + +#if defined(ET_BUNDLE_IO) +#include +#endif + #if defined(ET_EVENT_TRACER_ENABLED) #include #if !defined(SEMIHOSTING) @@ -102,6 +106,24 @@ unsigned char __attribute__(( section("input_data_sec"), aligned(16))) method_allocation_pool[method_allocation_pool_size]; +#if defined(ET_BUNDLE_IO) + +const size_t testset_idx = 0; // BundleIO test indexes to test if used + +#if defined(ET_ATOL) +const float et_atol = ET_ATOL; +#else +const float et_atol = 0.01; +#endif + +#if defined(ET_RTOL) +const float et_rtol = ET_RTOL; +#else +const float et_rtol = 0.01; +#endif + +#endif + /** * The temp_allocation_pool is used for allocating temporary data during kernel * or delegate execution. This will be reset after each kernel or delegate call. @@ -409,15 +431,41 @@ int main(int argc, const char* argv[]) { } } #endif - ET_LOG(Info, "Model in %p %c", model_pte, model_pte[0]); - auto loader = BufferDataLoader(model_pte, pte_size); - ET_LOG(Info, "Model PTE file loaded. Size: %lu bytes.", pte_size); + ET_LOG( + Info, "PTE in %p %c Size: %lu bytes", model_pte, model_pte[0], pte_size); + + // Find the offset to the embedded Program. 
+ const void* program_data = model_pte; + size_t program_data_len = pte_size; + +#if defined(ET_BUNDLE_IO) + bool bundle_io = executorch::bundled_program::is_bundled_program( + reinterpret_cast(model_pte), pte_size); + if (bundle_io) { + // BundleIO bpte is provided, dig out the actual model from the data area + Error status = executorch::bundled_program::get_program_data( + reinterpret_cast(model_pte), + pte_size, + &program_data, + &program_data_len); + + ET_CHECK_MSG( + status == Error::Ok, + "get_program_data() from bundle PTE failed: 0x%x", + (unsigned int)status); + } +#endif + auto loader = BufferDataLoader(program_data, program_data_len); + ET_LOG(Info, "PTE Model data loaded. Size: %lu bytes.", program_data_len); + + // Parse the program file. This is immutable, and can also be reused + // between multiple execution invocations across multiple threads. Result program = Program::load(&loader); if (!program.ok()) { ET_LOG( Info, "Program loading failed @ 0x%p: 0x%" PRIx32, - model_pte, + program_data, program.error()); } @@ -483,6 +531,7 @@ int main(int argc, const char* argv[]) { executorch::runtime::EventTracer* event_tracer_ptr = nullptr; #if defined(ET_EVENT_TRACER_ENABLED) + ET_LOG(Info, "Setting up ETDump"); torch::executor::ETDumpGen etdump_gen = torch::executor::ETDumpGen(); event_tracer_ptr = &etdump_gen; #endif @@ -499,21 +548,75 @@ int main(int argc, const char* argv[]) { } size_t method_loaded_memsize = method_allocator.used_size() - method_loaded_membase; - ET_LOG(Info, "Method loaded."); + ET_LOG(Info, "Method '%s' loaded.", method_name); ET_LOG(Info, "Preparing inputs..."); size_t input_membase = method_allocator.used_size(); - auto inputs = - ::prepare_input_tensors(*method, method_allocator, input_buffers); - - if (!inputs.ok()) { - ET_LOG( - Info, - "Preparing inputs tensors for method %s failed with status 0x%" PRIx32, - method_name, - inputs.error()); +#if defined(ET_BUNDLE_IO) + if (bundle_io) { + // Get inputs from bundled IO ".bpte" data + // Useful for testing + ET_LOG(Info, "Input testset[%d] from bundled bpte", testset_idx); + Error status = executorch::bundled_program::load_bundled_input( + *method, model_pte, testset_idx); + ET_CHECK_MSG( + status == Error::Ok, + "load_bundled_input failed with status 0x%" PRIx32, + status); + } else +#endif + { + // Here you would add code to get input from your Hardware + // Get inputs from SEMIHOSTING or fake it with a lot of "1" + // Use "static" to force to compiler to remove this when it goes out of + // scope + static auto prepared_inputs = + ::prepare_input_tensors(*method, method_allocator, input_buffers); + + if (!prepared_inputs.ok()) { + ET_LOG( + Info, + "Preparing inputs tensors for method %s failed with status 0x%" PRIx32, + method_name, + prepared_inputs.error()); + } } +#ifdef DUMP_INPUT + { + std::vector inputs(method->inputs_size()); + ET_LOG(Info, "%zu inputs: ", inputs.size()); + Error status = method->get_inputs(inputs.data(), inputs.size()); + ET_CHECK(status == Error::Ok); + + for (int i = 0; i < inputs.size(); ++i) { + Tensor t = inputs[i].toTensor(); + // The output might be collected and parsed so printf() is used instead + // of ET_LOG() here + for (int j = 0; j < inputs[i].toTensor().numel(); ++j) { + if (t.scalar_type() == ScalarType::Int) { + printf( + "Input[%d][%d]: (int) %d\n", + i, + j, + inputs[i].toTensor().const_data_ptr()[j]); + } else if (t.scalar_type() == ScalarType::Float) { + printf( + "Input[%d][%d]: (float) %f\n", + i, + j, + inputs[i].toTensor().const_data_ptr()[j]); + } else 
if (t.scalar_type() == ScalarType::Char) { + printf( + "Input[%d][%d]: (char) %d\n", + i, + j, + inputs[i].toTensor().const_data_ptr()[j]); + } + } + } + } +#endif size_t input_memsize = method_allocator.used_size() - input_membase; ET_LOG(Info, "Input prepared."); @@ -524,7 +627,8 @@ int main(int argc, const char* argv[]) { StopMeasurements(); size_t executor_memsize = method_allocator.used_size() - executor_membase; - ET_LOG(Info, "model_pte_loaded_size: %lu bytes.", pte_size); + ET_LOG(Info, "model_pte_program_size: %lu bytes.", program_data_len); + ET_LOG(Info, "model_pte_loaded_size: %lu bytes.", pte_size); #if defined(SEMIHOSTING) if (input_file_allocator.size() > 0) { ET_LOG( @@ -575,50 +679,34 @@ int main(int argc, const char* argv[]) { ET_LOG(Info, "%zu outputs: ", outputs.size()); status = method->get_outputs(outputs.data(), outputs.size()); ET_CHECK(status == Error::Ok); + for (int i = 0; i < outputs.size(); ++i) { Tensor t = outputs[i].toTensor(); #if !defined(SEMIHOSTING) +#if !defined(ET_BUNDLE_IO) // The output might be collected and parsed so printf() is used instead // of ET_LOG() here for (int j = 0; j < outputs[i].toTensor().numel(); ++j) { if (t.scalar_type() == ScalarType::Int) { printf( - "Output[%d][%d]: %d\n", + "Output[%d][%d]: (int) %d\n", i, j, outputs[i].toTensor().const_data_ptr()[j]); } else if (t.scalar_type() == ScalarType::Float) { printf( - "Output[%d][%d]: %f\n", + "Output[%d][%d]: (float) %f\n", i, j, outputs[i].toTensor().const_data_ptr()[j]); } else if (t.scalar_type() == ScalarType::Char) { printf( - "Output[%d][%d]: %d\n", + "Output[%d][%d]: (char) %d\n", i, j, outputs[i].toTensor().const_data_ptr()[j]); } } -#if defined(ET_EVENT_TRACER_ENABLED) - ETDumpResult result = etdump_gen.get_etdump_data(); - if (result.buf != nullptr && result.size > 0) { - // On a device with no file system we can't just write it out - // to the file-system so we base64 encode it and dump it on the log. - int mode = 0; - size_t len = result.size; - size_t encoded_len = base64_encoded_size(result.size, mode); - uint8_t* encoded_buf = reinterpret_cast( - method_allocator.allocate(encoded_len + 1)); - int ret = base64_encode( - encoded_buf, (uint8_t*)result.buf, &encoded_len, &len, mode); - encoded_buf[encoded_len] = 0x00; // Ensure null termination - ET_LOG(Info, "Writing etdump.bin [base64]"); - printf( - "#---\nbase64 -i -d <<<\"\\\n%s\\\n\" >etdump.bin\npython3 -m devtools.inspector.inspector_cli --etdump_path etdump.bin --source_time_scale cycles --target_time_scale cycles\n#---\n", - encoded_buf); - } #endif #else char out_filename[255]; @@ -631,21 +719,66 @@ int main(int argc, const char* argv[]) { outputs[i].toTensor().nbytes(), out_file); fclose(out_file); -#if defined(ET_EVENT_TRACER_ENABLED) - etdump_result result = etdump_gen.get_etdump_data(); - if (result.buf != nullptr && result.size > 0) { - // On a device with a file system we can just write it out - // to the file-system. - char etdump_filename = "etdump.bin"; - ET_LOG(Info, "Writing etdump to file: %s", etdump_filename); - FILE* f = fopen(etdump_filename, "w+"); - fwrite((uint8_t*)result.buf, 1, result.size, f); - fclose(f); - free(result.buf); - } #endif + } + +#if defined(ET_BUNDLE_IO) + if (bundle_io) { + // Verify the result. 
+ status = executorch::bundled_program::verify_method_outputs( + *method, model_pte, testset_idx, et_rtol, et_atol); + if (status == Error::Ok) { + ET_LOG(Info, "Model output match expected BundleIO bpte ref data."); + ET_LOG(Info, "TEST: BundleIO index[%d] Test_result: PASS", testset_idx); + } else { + ET_LOG( + Error, + "Model output don't match expected BundleIO bpte ref data. rtol=%f atol=%f", + et_rtol, + et_atol); + ET_LOG(Error, "TEST: BundleIO index[%d] Test_result: FAIL", testset_idx); + } + ET_CHECK_MSG( + status == Error::Ok, + "Bundle verification failed with status 0x%" PRIx32, + status); + } #endif + +#if defined(ET_EVENT_TRACER_ENABLED) +#if !defined(SEMIHOSTING) + ETDumpResult result = etdump_gen.get_etdump_data(); + if (result.buf != nullptr && result.size > 0) { + // On a device with no file system we can't just write it out + // to the file-system so we base64 encode it and dump it on the log. + int mode = 0; + size_t len = result.size; + size_t encoded_len = base64_encoded_size(result.size, mode); + uint8_t* encoded_buf = + reinterpret_cast(method_allocator.allocate(encoded_len + 1)); + int ret = base64_encode( + encoded_buf, (uint8_t*)result.buf, &encoded_len, &len, mode); + encoded_buf[encoded_len] = 0x00; // Ensure null termination + ET_LOG(Info, "Writing etdump.bin [base64]"); + printf( + "#---\nbase64 -i -d <<<\"\\\n%s\\\n\" >etdump.bin\npython3 -m devtools.inspector.inspector_cli --etdump_path etdump.bin --source_time_scale cycles --target_time_scale cycles\n#---\n", + encoded_buf); + } +#else + etdump_result result = etdump_gen.get_etdump_data(); + if (result.buf != nullptr && result.size > 0) { + // On a device with a file system we can just write it out + // to the file-system. + char etdump_filename = "etdump.bin"; + ET_LOG(Info, "Writing etdump to file: %s", etdump_filename); + FILE* f = fopen(etdump_filename, "w+"); + fwrite((uint8_t*)result.buf, 1, result.size, f); + fclose(f); + free(result.buf); } +#endif +#endif + out: ET_LOG(Info, "Program complete, exiting."); #if defined(SEMIHOSTING) diff --git a/examples/arm/run.sh b/examples/arm/run.sh index ce92312b652..5f1e3764de2 100755 --- a/examples/arm/run.sh +++ b/examples/arm/run.sh @@ -18,11 +18,14 @@ et_root_dir=$(realpath ${et_root_dir}) model_name="" +model_input_set=false +model_input="" aot_arm_compiler_flags="--delegate --quantize" portable_kernels="aten::_softmax.out" target="ethos-u55-128" output_folder_set=false output_folder="." +bundleio=false build_with_etdump=false build_type="Release" extra_build_flags="" @@ -35,11 +38,13 @@ ethos_u_scratch_dir=${script_dir}/ethos-u-scratch function help() { echo "Usage: $(basename $0) [options]" echo "Options:" - echo " --model_name= Model to run, can be a builtin, examples/models or a filename Default to all builtin models" + echo " --model_name= Model file .py/.pth/.pt, builtin model or a model from examples/models. Passed to aot_arm_compiler" + echo " --model_input= Provide model input .pt file to override the input in the model file. 
Passed to aot_arm_compiler" echo " --aot_arm_compiler_flags= Only used if --model_name is used Default: ${aot_arm_compiler_flags}" echo " --portable_kernels= Comma separated list of portable (non delagated) kernels to include Default: ${portable_kernels}" echo " --target= Target to build and run for Default: ${target}" echo " --output= Target build output folder Default: ${output_folder}" + echo " --bundleio Create a Bundled pte using Devtools BundleIO with Input/RefOutput included" echo " --etdump Adds Devtools etdump support to track timing, etdump area will be base64 encoded in the log" echo " --build_type= Build with Release, Debug or RelWithDebInfo, default is ${build_type}" echo " --extra_build_flags= Extra flags to pass to cmake like -DET_ARM_BAREMETAL_METHOD_ALLOCATOR_POOL_SIZE=60000 Default: none " @@ -56,10 +61,12 @@ for arg in "$@"; do case $arg in -h|--help) help ;; --model_name=*) model_name="${arg#*=}";; + --model_input=*) model_input="${arg#*=}" ; model_input_set=true ;; --aot_arm_compiler_flags=*) aot_arm_compiler_flags="${arg#*=}";; --portable_kernels=*) portable_kernels="${arg#*=}";; --target=*) target="${arg#*=}";; --output=*) output_folder="${arg#*=}" ; output_folder_set=true ;; + --bundleio) bundleio=true ;; --etdump) build_with_etdump=true ;; --build_type=*) build_type="${arg#*=}";; --extra_build_flags=*) extra_build_flags="${arg#*=}";; @@ -121,13 +128,21 @@ hash arm-none-eabi-gcc \ # Build executorch libraries cd $et_root_dir +devtools_flag="" +bundleio_flag="" +et_dump_flag="" if [ "$build_with_etdump" = true ] ; then + devtools_flag="--devtools --etdump" et_dump_flag="--etdump" -else - et_dump_flag="" fi -backends/arm/scripts/build_executorch.sh --et_build_root="${et_build_root}" --build_type=$build_type $et_dump_flag +if [ "$bundleio" = true ] ; then + devtools_flag="--devtools --etdump" + bundleio_flag="--bundleio" + et_dump_flag="--etdump" +fi + +backends/arm/scripts/build_executorch.sh --et_build_root="${et_build_root}" --build_type=$build_type $devtools_flag backends/arm/scripts/build_portable_kernels.sh --et_build_root="${et_build_root}" --build_type=$build_type --portable_kernels=$portable_kernels # Build a lib quantized_ops_aot_lib @@ -157,12 +172,21 @@ for i in "${!test_model[@]}"; do echo "--------------------------------------------------------------------------------" cd $et_root_dir - model_short_name=$(basename -- "${model}" ".py") - model_filename=${model_short_name}_arm_${target}.pte + # Remove path and file extension to get model_short_name + ext=${model##*.} + model_short_name=$(basename -- "${model}" .$ext) + model_filename=${model_short_name}_arm_${target} if [[ "${model_compiler_flags}" == *"--delegate"* ]]; then # Name aligned with default aot_arm_compiler output - model_filename=${model_short_name}_arm_delegate_${target}.pte + model_filename=${model_short_name}_arm_delegate_${target} + fi + elf_folder=${model_filename} + + if [ "$bundleio" = true ] ; then + model_filename=${model_filename}.bpte + else + model_filename=${model_filename}.pte fi if [ "$output_folder_set" = false ] ; then @@ -170,15 +194,19 @@ fi output_folder=$(realpath ${output_folder}) - mkdir -p ${output_folder} - pte_file=$(realpath -m ${output_folder}/${model_filename}) + pte_file="${output_folder}/${model_filename}" - rm -f "${pte_file}" + mkdir -p ${output_folder} - ARM_AOT_CMD="python3 -m examples.arm.aot_arm_compiler --model_name=${model} --target=${target} ${model_compiler_flags} --intermediate=${output_folder} --output=${output_folder}
--so_library=$SO_LIB --system_config=${system_config} --memory_mode=${memory_mode}" + # Remove old pte files + rm -f "${output_folder}/${model_filename}" + + ARM_AOT_CMD="python3 -m examples.arm.aot_arm_compiler --model_name=${model} --target=${target} ${model_compiler_flags} --intermediate=${output_folder} --output=${pte_file} --so_library=$SO_LIB --system_config=${system_config} --memory_mode=${memory_mode} $bundleio_flag" echo "CALL ${ARM_AOT_CMD}" >&2 ${ARM_AOT_CMD} 1>&2 + pte_file=$(realpath ${pte_file}) + [[ -f ${pte_file} ]] || { >&2 echo "Failed to generate a pte file - ${pte_file}"; exit 1; } echo "pte_data_size: $(wc -c ${pte_file})" echo "pte_file: ${pte_file}" @@ -188,10 +216,11 @@ for i in "${!test_model[@]}"; do else set -x # Rebuild the application as the pte is imported as a header/c array - backends/arm/scripts/build_executorch_runner.sh "--pte=${pte_file}" --build_type=$build_type --target=$target --system_config=$system_config $et_dump_flag --extra_build_flags="$extra_build_flags" --ethosu_tools_dir="$ethos_u_scratch_dir" --output="${output_folder}" + backends/arm/scripts/build_executorch_runner.sh --et_build_root="${et_build_root}" --pte="${pte_file}" --build_type=${build_type} --target=${target} --system_config=${system_config} ${bundleio_flag} ${et_dump_flag} --extra_build_flags="${extra_build_flags}" --ethosu_tools_dir="${ethos_u_scratch_dir}" if [ "$build_only" = false ] ; then # Execute the executor_runner on FVP Simulator - backends/arm/scripts/run_fvp.sh --elf=${output_folder}/cmake-out/arm_executor_runner --target=$target + elf_file="${output_folder}/${elf_folder}/cmake-out/arm_executor_runner" + backends/arm/scripts/run_fvp.sh --elf=${elf_file} --target=$target fi set +x fi
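
Example usage, a minimal sketch assuming the Arm FVP/toolchain environment from examples/arm/setup.sh is already installed and the commands are run from the ExecuTorch repo root; the ELF path is illustrative only and depends on --output and --et_build_root:

# Compile MobileNetV2 into a BundleIO .bpte with reference input/output, build the
# bare-metal runner with Devtools/etdump support, and relax the BundleIO tolerances
# via the new ET_ATOL/ET_RTOL CMake options.
examples/arm/run.sh --model_name=mv2 --target=ethos-u55-128 --bundleio --etdump \
    --extra_build_flags="-DET_ATOL=1.20 -DET_RTOL=1.20"

# Re-run an already built runner on the Corstone FVP with a longer hang-detection timeout.
backends/arm/scripts/run_fvp.sh \
    --elf=arm_test/mv2_arm_delegate_ethos-u55-128/cmake-out/arm_executor_runner \
    --target=ethos-u55-128 --timeout=600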