diff --git a/.ci/scripts/test_model.sh b/.ci/scripts/test_model.sh
index bc9bbb8bae0..1333f481866 100755
--- a/.ci/scripts/test_model.sh
+++ b/.ci/scripts/test_model.sh
@@ -232,21 +232,24 @@ test_model_with_qnn() {
 # @param should_test If true, build and test the model using the coreml_executor_runner.
 test_model_with_coreml() {
   local should_test="$1"
+  local test_with_pybindings="${2:-false}"  # optional; default keeps one-argument callers working
+  local dtype="${3:-float16}"  # optional; float16 was the previously hard-coded precision
 
   if [[ "${BUILD_TOOL}" != "cmake" ]]; then
     echo "coreml only supports cmake."
     exit 1
   fi
 
-  DTYPE=float16
+  RUN_WITH_PYBINDINGS=""  # expanded unquoted below so an empty value adds no argument
+  if [[ "${test_with_pybindings}" == true ]]; then
+    echo "Running with pybindings"
+    export RUN_WITH_PYBINDINGS="--run_with_pybindings"
+  fi
 
-  "${PYTHON_EXECUTABLE}" -m examples.apple.coreml.scripts.export --model_name="${MODEL_NAME}" --compute_precision "${DTYPE}" --use_partitioner
+  "${PYTHON_EXECUTABLE}" -m examples.apple.coreml.scripts.export --model_name="${MODEL_NAME}" --compute_precision "${dtype}" --use_partitioner ${RUN_WITH_PYBINDINGS}
   EXPORTED_MODEL=$(find "." -type f -name "${MODEL_NAME}*.pte" -print -quit)
 
   if [ -n "$EXPORTED_MODEL" ]; then
-    EXPORTED_MODEL_WITH_DTYPE="${EXPORTED_MODEL%.pte}_${DTYPE}.pte"
-    mv "$EXPORTED_MODEL" "$EXPORTED_MODEL_WITH_DTYPE"
-    EXPORTED_MODEL="$EXPORTED_MODEL_WITH_DTYPE"
     echo "OK exported model: $EXPORTED_MODEL"
   else
     echo "[error] failed to export model: no .pte file found"
@@ -303,7 +306,15 @@ elif [[ "${BACKEND}" == *"coreml"* ]]; then
   if [[ "${BACKEND}" == *"test"* ]]; then
     should_test_coreml=true
   fi
-  test_model_with_coreml "${should_test_coreml}"
+  test_with_pybindings=false
+  if [[ "${BACKEND}" == *"pybind"* ]]; then
+    test_with_pybindings=true
+  fi
+  dtype=float16
+  if [[ "${BACKEND}" == *"float32"* ]]; then
+    dtype=float32
+  fi
+  test_model_with_coreml "${should_test_coreml}" "${test_with_pybindings}" "${dtype}"
   if [[ $? -eq 0 ]]; then
     prepare_artifacts_upload
   fi
diff --git a/.github/workflows/trunk.yml b/.github/workflows/trunk.yml
index d7205514a68..8857029d96b 100644
--- a/.github/workflows/trunk.yml
+++ b/.github/workflows/trunk.yml
@@ -18,8 +18,8 @@ concurrency:
   cancel-in-progress: true
 
 jobs:
-  test-models-macos:
-    name: test-models-macos
+  test-models-macos-cpu:
+    name: test-models-macos-cpu
     uses: pytorch/test-infra/.github/workflows/macos_job.yml@main
     strategy:
       matrix:
@@ -568,10 +568,12 @@ jobs:
         PYTHON_EXECUTABLE=python bash .ci/scripts/build-qnn-sdk.sh
         PYTHON_EXECUTABLE=python bash .ci/scripts/test_model.sh ${{ matrix.model }} "cmake" "qnn"
 
-  test-apple-model:
-    name: test-apple-model
+  test-models-macos-coreml:
+    name: test-models-macos-coreml
     uses: pytorch/test-infra/.github/workflows/macos_job.yml@main
     strategy:
+      matrix:
+        model: [dl3, edsr, efficient_sam, emformer_join, emformer_transcribe, ic3, ic4, mobilebert, mv2, mv3, resnet50, vit, w2l]
       fail-fast: false
     with:
       runner: macos-m1-stable
@@ -580,7 +582,23 @@ jobs:
       ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
       timeout: 90
       script: |
+        MODEL_NAME=${{ matrix.model }}
         BUILD_TOOL=cmake
+        BACKEND="coreml-pybind"
+
+
+        # Set model specific overrides
+        if [[ "${MODEL_NAME}" == "mobilebert" ]]; then
+          # See https://github.com/pytorch/executorch/issues/12907
+          # mobilebert produces NaN output in FP16 and a high MSE in FP32, so the runtime test is disabled for now
+          BACKEND="coreml"
+        fi
+
+        if [[ "${MODEL_NAME}" == "efficient_sam" ]]; then
+          # See https://github.com/pytorch/executorch/issues/12906
+          # efficient_sam fails to run on CoreML
+          BACKEND="coreml"
+        fi
 
         bash .ci/scripts/setup-conda.sh
 
@@ -589,13 +607,28 @@ jobs:
         PYTHON_EXECUTABLE=python ${CONDA_RUN} bash backends/apple/coreml/scripts/install_requirements.sh
         echo "Finishing installing coreml."
 
-        # Build and test coreml model
-        MODELS=(mv3 ic4 resnet50 edsr mobilebert w2l)
-        for MODEL_NAME in "${MODELS[@]}"; do
-          echo "::group::Exporting coreml model: $MODEL_NAME"
-          PYTHON_EXECUTABLE=python ${CONDA_RUN} bash .ci/scripts/test_model.sh "${MODEL_NAME}" "${BUILD_TOOL}" "coreml"
-          echo "::endgroup::"
+        PYTHON_EXECUTABLE=python ${CONDA_RUN} bash .ci/scripts/test_model.sh "${MODEL_NAME}" "${BUILD_TOOL}" "${BACKEND}"
+
+  test-models-macos-mps:
+    name: test-models-macos-mps
+    uses: pytorch/test-infra/.github/workflows/macos_job.yml@main
+    strategy:
+      fail-fast: false
+    with:
+      runner: macos-m1-stable
+      python-version: '3.11'
+      submodules: 'recursive'
+      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+      timeout: 90
+      script: |
+        BUILD_TOOL=cmake
+        bash .ci/scripts/setup-conda.sh
+
+        # Set up macOS dependencies, since there is currently no Docker support on macOS
+        PYTHON_EXECUTABLE=python ${CONDA_RUN} bash .ci/scripts/setup-macos.sh --build-tool "${BUILD_TOOL}"
 
+        # Build and test mps model
+        for MODEL_NAME in mv3 ic4 resnet50 edsr mobilebert w2l; do
           echo "::group::Exporting mps model: $MODEL_NAME"
           PYTHON_EXECUTABLE=python ${CONDA_RUN} bash .ci/scripts/test_model.sh "${MODEL_NAME}" "${BUILD_TOOL}" "mps"
           echo "::endgroup::"
diff --git a/backends/apple/coreml/scripts/generate_test_models.sh b/backends/apple/coreml/scripts/generate_test_models.sh
index 001ba362393..6a73d697379 100755
--- a/backends/apple/coreml/scripts/generate_test_models.sh
+++ b/backends/apple/coreml/scripts/generate_test_models.sh
@@ -22,7 +22,7 @@ cd "$EXECUTORCH_ROOT_PATH"
 MODELS=("add" "add_mul" "mul" "mv3")
 for MODEL in "${MODELS[@]}"
 do
-  echo "Executorch: Generating $MODEL model" 
+  echo "Executorch: Generating $MODEL model"
   # TODO: Don't use the script in examples directory.
   python3 -m examples.apple.coreml.scripts.export --model_name "$MODEL" --save_processed_bytes
   mv -f "$MODEL""_coreml_all.pte" "$COREML_DIR_PATH/runtime/test/models"
@@ -36,7 +36,7 @@ COMPILE_MODELS=("add_mul")
 echo "Executorch: Generating compiled model"
 for MODEL in "${COMPILE_MODELS[@]}"
 do
-  echo "Executorch: Generating compiled $MODEL model" 
+  echo "Executorch: Generating compiled $MODEL model"
   python3 -m examples.apple.coreml.scripts.export --model_name "$MODEL" --compile
   mv -f "$MODEL""_compiled_coreml_all.pte" "$COREML_DIR_PATH/runtime/test/models"
 done
diff --git a/examples/apple/coreml/scripts/export.py b/examples/apple/coreml/scripts/export.py
index b9acc3b8fb9..0b5f64d13c2 100644
--- a/examples/apple/coreml/scripts/export.py
+++ b/examples/apple/coreml/scripts/export.py
@@ -3,6 +3,7 @@
 # Please refer to the license found in the LICENSE file in the root directory of the source tree.
 
 import argparse
+import collections
 import copy
 
 import pathlib
@@ -23,8 +24,7 @@
 from executorch.exir import to_edge
 
 from executorch.exir.backend.backend_api import to_backend
-
-from torch.export import export
+from executorch.extension.export_util.utils import save_pte_program
 
 REPO_ROOT = pathlib.Path(__file__).resolve().parent.parent.parent.parent.parent
 EXAMPLES_DIR = REPO_ROOT / "examples"
@@ -41,7 +41,16 @@
 )
 
 
-def parse_args() -> argparse.ArgumentParser:
+def is_fbcode():  # NOTE(review): assumes OSS torch builds always set git_version — confirm
+    return not hasattr(torch.version, "git_version")
+
+
+_CAN_RUN_WITH_PYBINDINGS = (sys.platform == "darwin") and not is_fbcode()
+if _CAN_RUN_WITH_PYBINDINGS:
+    from executorch.runtime import Runtime
+
+
+def parse_args() -> argparse.Namespace:
     parser = argparse.ArgumentParser()
 
     parser.add_argument(
@@ -82,9 +91,12 @@ def parse_args() -> argparse.ArgumentParser:
         required=False,
         default=False,
     )
+    parser.add_argument(
+        "--run_with_pybindings",
+        action=argparse.BooleanOptionalAction,
+    )
 
     args = parser.parse_args()
-    # pyre-fixme[7]: Expected `ArgumentParser` but got `Namespace`.
     return args
 
 
@@ -95,7 +107,8 @@ def partition_module_to_coreml(module):
 def lower_module_to_coreml(module, compile_specs, example_inputs):
     module = module.eval()
     edge = to_edge(
-        export(module, example_inputs, strict=True), compile_config=_EDGE_COMPILE_CONFIG
+        torch.export.export(module, example_inputs, strict=True),
+        compile_config=_EDGE_COMPILE_CONFIG,
     )
     # All of the subsequent calls on the edge_dialect_graph generated above (such as delegation or
     # to_executorch()) are done in place and the graph is also modified in place. For debugging purposes
@@ -115,24 +128,23 @@ def lower_module_to_coreml(module, compile_specs, example_inputs):
 def export_lowered_module_to_executorch_program(lowered_module, example_inputs):
     lowered_module(*example_inputs)
     exec_prog = to_edge(
-        export(lowered_module, example_inputs, strict=True),
+        torch.export.export(lowered_module, example_inputs, strict=True),
         compile_config=_EDGE_COMPILE_CONFIG,
     ).to_executorch(config=exir.ExecutorchBackendConfig(extract_delegate_segments=True))
 
     return exec_prog
 
 
-def save_executorch_program(exec_prog, model_name, compute_unit):
-    buffer = exec_prog.buffer
-    filename = f"{model_name}_coreml_{compute_unit}.pte"
-    print(f"Saving exported program to {filename}")
-    with open(filename, "wb") as file:
-        file.write(buffer)
-    return
+def get_pte_base_name(args: argparse.Namespace) -> str:
+    """Return the output file stem: <model_name>[_compiled]_coreml_<compute_unit>."""
+    # The "_compiled" marker is appended only when args.compile is truthy.
+    compiled_marker = "_compiled" if args.compile else ""
+    stem = f"{args.model_name}{compiled_marker}_coreml_{args.compute_unit}"
+    return stem
 
 
-def save_processed_bytes(processed_bytes, model_name, compute_unit):
-    filename = f"{model_name}_coreml_{compute_unit}.bin"
+def save_processed_bytes(processed_bytes, base_name: str):
+    filename = f"{base_name}.bin"
     print(f"Saving processed bytes to {filename}")
     with open(filename, "wb") as file:
         file.write(processed_bytes)
@@ -154,6 +166,37 @@ def generate_compile_specs_from_args(args):
     )
 
 
+def run_with_pybindings(executorch_program, eager_reference, example_inputs, precision):
+    """Run "forward" through the Python runtime and compare it with eager output."""
+    if not _CAN_RUN_WITH_PYBINDINGS:
+        raise RuntimeError("Cannot run with pybindings on this platform.")
+
+    dtype = {"float32": torch.float32, "float16": torch.float16}[precision]
+
+    runtime = Runtime.get()
+    program = runtime.load_program(executorch_program.buffer)
+    method = program.load_method("forward")
+    # Method.execute() takes all inputs as one sequence, so do not unpack them.
+    et_outputs = method.execute(example_inputs)[0]
+    eager_outputs = eager_reference(*example_inputs)
+    if isinstance(eager_outputs, collections.OrderedDict):
+        eager_outputs = eager_outputs["out"]
+    if isinstance(eager_outputs, (list, tuple)):  # tuple form also works before 3.10
+        eager_outputs = eager_outputs[0]
+
+    # sqrt(mean(err ** 2)) is the root mean square error (RMSE), not the MSE.
+    rmse = ((et_outputs - eager_outputs) ** 2).mean().sqrt()
+    print(f"Root mean square error: {rmse}")
+    assert rmse < 0.1, "Root mean square error is too high."
+
+    if dtype == torch.float32:
+        assert torch.allclose(
+            et_outputs, eager_outputs, atol=1e-02, rtol=1e-02
+        ), f"""Outputs do not match eager reference:
+        \tet_outputs (first 5)={et_outputs.reshape(-1)[0:5]}
+        \teager_outputs (first 5)={eager_outputs.reshape(-1)[0:5]}"""
+
+
 def main():
     args = parse_args()
 
@@ -170,49 +213,65 @@ def main():
             f"Valid compute units are {valid_compute_units}."
         )
 
-    model, example_inputs, _, dynamic_shapes = EagerModelFactory.create_model(
-        *MODEL_NAME_TO_MODEL[args.model_name]
+    model, example_args, example_kwargs, dynamic_shapes = (
+        EagerModelFactory.create_model(*MODEL_NAME_TO_MODEL[args.model_name])
     )
     if not args.dynamic_shapes:
         dynamic_shapes = None
 
     compile_specs = generate_compile_specs_from_args(args)
-    lowered_module = None
-
+    pte_base_name = get_pte_base_name(args)
     if args.use_partitioner:
-        model.eval()
-        exir_program_aten = torch.export.export(
-            model, example_inputs, dynamic_shapes=dynamic_shapes, strict=True
-        )
-
-        edge_program_manager = exir.to_edge(exir_program_aten)
-        edge_copy = copy.deepcopy(edge_program_manager)
-        partitioner = CoreMLPartitioner(
-            skip_ops_for_coreml_delegation=None, compile_specs=compile_specs
+        model = model.eval()
+        assert not args.generate_etrecord, "ETRecord is not supported with partitioner"
+        ep = torch.export.export(
+            model,
+            args=example_args,
+            kwargs=example_kwargs,
+            dynamic_shapes=dynamic_shapes,
         )
-        delegated_program_manager = edge_program_manager.to_backend(partitioner)
-        exec_program = delegated_program_manager.to_executorch(
-            config=exir.ExecutorchBackendConfig(extract_delegate_segments=True)
+        print(ep)
+        delegated_program = exir.to_edge_transform_and_lower(
+            ep,
+            partitioner=[CoreMLPartitioner(compile_specs=compile_specs)],
         )
+        exec_program = delegated_program.to_executorch()
+        save_pte_program(exec_program, pte_base_name)
+        if args.run_with_pybindings:
+            run_with_pybindings(
+                executorch_program=exec_program,
+                eager_reference=model,
+                example_inputs=example_args,
+                precision=args.compute_precision,
+            )
     else:
         lowered_module, edge_copy = lower_module_to_coreml(
             module=model,
-            example_inputs=example_inputs,
+            example_inputs=example_args,
             compile_specs=compile_specs,
         )
         exec_program = export_lowered_module_to_executorch_program(
             lowered_module,
-            example_inputs,
-        )
-
-    model_name = f"{args.model_name}_compiled" if args.compile else args.model_name
-    save_executorch_program(exec_program, model_name, args.compute_unit)
-    generate_etrecord(f"{args.model_name}_coreml_etrecord.bin", edge_copy, exec_program)
-
-    if args.save_processed_bytes and lowered_module is not None:
-        save_processed_bytes(
-            lowered_module.processed_bytes, args.model_name, args.compute_unit
+            example_args,
         )
+        save_pte_program(exec_program, pte_base_name)
+        if args.generate_etrecord:
+            generate_etrecord(
+                f"{args.model_name}_coreml_etrecord.bin", edge_copy, exec_program
+            )
+
+        if args.save_processed_bytes:
+            save_processed_bytes(
+                lowered_module.processed_bytes,
+                pte_base_name,
+            )
+        if args.run_with_pybindings:
+            run_with_pybindings(
+                executorch_program=exec_program,
+                eager_reference=model,
+                example_inputs=example_args,
+                precision=args.compute_precision,
+            )
 
 
 if __name__ == "__main__":