diff --git a/.github/scripts/extract_benchmark_results.py b/.github/scripts/extract_benchmark_results.py index ba6142a4826..77c73eab0b4 100755 --- a/.github/scripts/extract_benchmark_results.py +++ b/.github/scripts/extract_benchmark_results.py @@ -86,36 +86,6 @@ def parse_args() -> Any: action=ValidateDir, help="the directory to keep the benchmark results", ) - parser.add_argument( - "--repo", - type=str, - required=True, - help="which GitHub repo this workflow run belongs to", - ) - parser.add_argument( - "--head-branch", - type=str, - required=True, - help="the head branch that runs", - ) - parser.add_argument( - "--workflow-name", - type=str, - required=True, - help="the name of the benchmark workflow", - ) - parser.add_argument( - "--workflow-run-id", - type=int, - required=True, - help="the id of the benchmark workflow", - ) - parser.add_argument( - "--workflow-run-attempt", - type=int, - required=True, - help="which retry of the workflow this is", - ) parser.add_argument( "--benchmark-configs", type=str, @@ -153,9 +123,10 @@ def extract_android_benchmark_results( # This is to handle the case where there is no benchmark results warning(f"Fail to load the benchmark results from {artifact_s3_url}") return [] + return [] -def initialize_ios_metadata(test_name: str) -> Dict[str, any]: +def initialize_ios_metadata(test_name: str) -> Dict[str, Any]: """ Extract the benchmark metadata from the test name, for example: test_forward_llama2_pte_iOS_17_2_1_iPhone15_4 @@ -364,14 +335,7 @@ def transform( app_type: str, benchmark_results: List, benchmark_config: Dict[str, str], - repo: str, - head_branch: str, - workflow_name: str, - workflow_run_id: int, - workflow_run_attempt: int, job_name: str, - job_id: int, - schema_version: str, ) -> List: """ Transform the benchmark results into the format writable into the benchmark database @@ -381,87 +345,51 @@ def transform( for r in benchmark_results: r["deviceInfo"]["device"] = job_name - if schema_version == "v2": - # TODO (huydhn): Clean up this branch after ExecuTorch dashboard migrates to v3 - return [ - { - # GH-info to identify where the benchmark is run - "repo": repo, - "head_branch": head_branch, - "workflow_id": workflow_run_id, - "run_attempt": workflow_run_attempt, - "job_id": job_id, - # The model - "name": f"{r['benchmarkModel']['name']} {r['benchmarkModel'].get('backend', '')}".strip(), - "dtype": ( - r["benchmarkModel"]["quantization"] - if r["benchmarkModel"]["quantization"] - else "unknown" - ), - # The metric value - "metric": r["metric"], - "actual": r["actualValue"], - "target": r["targetValue"], - # The device - "device": r["deviceInfo"]["device"], - "arch": r["deviceInfo"].get("os", ""), - # Not used here, just set it to something unique here - "filename": workflow_name, - "test_name": app_type, - "runner": job_name, - } - for r in benchmark_results - ] - elif schema_version == "v3": - v3_benchmark_results = [] - # From https://github.com/pytorch/pytorch/wiki/How-to-integrate-with-PyTorch-OSS-benchmark-database - return [ - { - "benchmark": { - "name": "ExecuTorch", - "mode": "inference", - "extra_info": { - "app_type": app_type, - # Just keep a copy of the benchmark config here - "benchmark_config": json.dumps(benchmark_config), - }, - }, - "model": { - "name": benchmark_config.get("model", r["benchmarkModel"]["name"]), - "type": "OSS model", - "backend": benchmark_config.get( - "config", r["benchmarkModel"].get("backend", "") - ), + # From 
https://github.com/pytorch/pytorch/wiki/How-to-integrate-with-PyTorch-OSS-benchmark-database + return [ + { + "benchmark": { + "name": "ExecuTorch", + "mode": "inference", + "extra_info": { + "app_type": app_type, + # Just keep a copy of the benchmark config here + "benchmark_config": json.dumps(benchmark_config), }, - "metric": { - "name": r["metric"], - "benchmark_values": [r["actualValue"]], - "target_value": r["targetValue"], - "extra_info": { - "method": r.get("method", ""), - }, + }, + "model": { + "name": benchmark_config.get("model", r["benchmarkModel"]["name"]), + "type": "OSS model", + "backend": benchmark_config.get( + "config", r["benchmarkModel"].get("backend", "") + ), + }, + "metric": { + "name": r["metric"], + "benchmark_values": [r["actualValue"]], + "target_value": r["targetValue"], + "extra_info": { + "method": r.get("method", ""), }, - "runners": [ - { - "name": r["deviceInfo"]["device"], - "type": r["deviceInfo"]["os"], - "avail_mem_in_gb": r["deviceInfo"].get("availMem", ""), - "total_mem_in_gb": r["deviceInfo"].get("totalMem", ""), - } - ], - } - for r in benchmark_results - ] + }, + "runners": [ + { + "name": r["deviceInfo"]["device"], + "type": r["deviceInfo"]["os"], + "avail_mem_in_gb": r["deviceInfo"].get("availMem", ""), + "total_mem_in_gb": r["deviceInfo"].get("totalMem", ""), + } + ], + } + for r in benchmark_results + ] def main() -> None: args = parse_args() # Across all devices, keeping both schemas for now until ExecuTorch dashboard migrates to v3 - all_benchmark_results = { - "v2": [], - "v3": [], - } + all_benchmark_results = [] benchmark_config = {} with open(args.artifacts) as f: @@ -482,7 +410,7 @@ def main() -> None: benchmark_config = read_benchmark_config( artifact_s3_url, args.benchmark_configs ) - + benchmark_results = [] if app_type == "ANDROID_APP": benchmark_results = extract_android_benchmark_results( job_name, artifact_type, artifact_s3_url @@ -494,32 +422,17 @@ def main() -> None: ) if benchmark_results: - for schema in all_benchmark_results.keys(): - results = transform( - app_type, - benchmark_results, - benchmark_config, - args.repo, - args.head_branch, - args.workflow_name, - args.workflow_run_id, - args.workflow_run_attempt, - job_name, - extract_job_id(args.artifacts), - schema, - ) - all_benchmark_results[schema].extend(results) - - for schema in all_benchmark_results.keys(): - if not all_benchmark_results.get(schema): - continue - - output_dir = os.path.join(args.output_dir, schema) - os.makedirs(output_dir, exist_ok=True) + results = transform( + app_type, benchmark_results, benchmark_config, job_name + ) + all_benchmark_results.extend(results) + # add v3 in case we have higher version of schema + output_dir = os.path.join(args.output_dir, "v3") + os.makedirs(output_dir, exist_ok=True) output_file = os.path.basename(args.artifacts) with open(f"{output_dir}/{output_file}", "w") as f: - json.dump(all_benchmark_results[schema], f) + json.dump(all_benchmark_results, f) if __name__ == "__main__": diff --git a/.github/workflows/_android.yml b/.github/workflows/_android.yml index 82e49d6672e..7061eb72aa3 100644 --- a/.github/workflows/_android.yml +++ b/.github/workflows/_android.yml @@ -30,6 +30,7 @@ jobs: # Build LLM Demo for Android bash build/build_android_library.sh ${ARTIFACTS_DIR_NAME} + bash build/build_android_instrumentation.sh # Running Android emulator directly on the runner and not using Docker run-emulator: diff --git a/.github/workflows/android-perf.yml b/.github/workflows/android-perf.yml index f21ed849d03..fbd2cae24e0 
100644 --- a/.github/workflows/android-perf.yml +++ b/.github/workflows/android-perf.yml @@ -462,29 +462,14 @@ jobs: ${CONDA_RUN} python .github/scripts/extract_benchmark_results.py \ --artifacts "${ARTIFACTS_BY_JOB}" \ --output-dir benchmark-results \ - --repo ${{ github.repository }} \ - --head-branch ${{ github.head_ref || github.ref_name }} \ - --workflow-name "${{ github.workflow }}" \ - --workflow-run-id ${{ github.run_id }} \ - --workflow-run-attempt ${{ github.run_attempt }} \ --benchmark-configs benchmark-configs done - for SCHEMA in v2 v3; do - for BENCHMARK_RESULTS in benchmark-results/"${SCHEMA}"/*.json; do - cat "${BENCHMARK_RESULTS}" - echo - done + for BENCHMARK_RESULTS in benchmark-results/v3/*.json; do + cat "${BENCHMARK_RESULTS}" + echo done - # TODO (huydhn): Remove v2 schema once the benchmark dashboard finishes the migration - - name: Upload the benchmark results (v2) - uses: pytorch/test-infra/.github/actions/upload-benchmark-results@main - with: - benchmark-results-dir: benchmark-results/v2 - dry-run: false - schema-version: v2 - - name: Upload the benchmark results (v3) uses: pytorch/test-infra/.github/actions/upload-benchmark-results@main with: diff --git a/.github/workflows/apple-perf.yml b/.github/workflows/apple-perf.yml index 83778d36c1b..1cf7e67f007 100644 --- a/.github/workflows/apple-perf.yml +++ b/.github/workflows/apple-perf.yml @@ -521,29 +521,14 @@ jobs: ${CONDA_RUN} python .github/scripts/extract_benchmark_results.py \ --artifacts "${ARTIFACTS_BY_JOB}" \ --output-dir benchmark-results \ - --repo ${{ github.repository }} \ - --head-branch ${{ github.head_ref || github.ref_name }} \ - --workflow-name "${{ github.workflow }}" \ - --workflow-run-id ${{ github.run_id }} \ - --workflow-run-attempt ${{ github.run_attempt }} \ --benchmark-configs benchmark-configs done - for SCHEMA in v2 v3; do - for BENCHMARK_RESULTS in benchmark-results/"${SCHEMA}"/*.json; do - cat "${BENCHMARK_RESULTS}" - echo - done + for BENCHMARK_RESULTS in benchmark-results/v3/*.json; do + cat "${BENCHMARK_RESULTS}" + echo done - # TODO (huydhn): Remove v2 schema once the benchmark dashboard finishes the migration - - name: Upload the benchmark results (v2) - uses: pytorch/test-infra/.github/actions/upload-benchmark-results@main - with: - benchmark-results-dir: benchmark-results/v2 - dry-run: false - schema-version: v2 - - name: Upload the benchmark results (v3) uses: pytorch/test-infra/.github/actions/upload-benchmark-results@main with: diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index c066a7045aa..75d1db2cd36 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml @@ -38,8 +38,8 @@ jobs: # Build and test ExecuTorch with the add model on portable backend. 
PYTHON_EXECUTABLE=python bash .ci/scripts/test_model.sh "add" "${BUILD_TOOL}" "portable" - test-models-linux: - name: test-models-linux + test-models-linux-basic: + name: test-models-linux-basic uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main permissions: id-token: write @@ -49,29 +49,94 @@ jobs: model: [mv3, vit] backend: [portable, xnnpack-quantization-delegation] build-tool: [cmake, buck2] + runner: [linux.2xlarge, linux.arm64.2xlarge] + docker-image: [executorch-ubuntu-22.04-clang12, executorch-ubuntu-22.04-gcc11-aarch64] + # Excluding specific runner + docker image combinations that don't make sense: + # - Excluding the ARM64 gcc image on the x86 runner (linux.2xlarge) + # - Excluding the x86 clang image on the ARM64 runner (linux.arm64.2xlarge) + exclude: + - runner: linux.2xlarge + docker-image: executorch-ubuntu-22.04-gcc11-aarch64 + - runner: linux.arm64.2xlarge + docker-image: executorch-ubuntu-22.04-clang12 + # TODO: Need to figure out why buck2 doesnt work on Graviton instances. + - runner: linux.arm64.2xlarge + build-tool: buck2 + fail-fast: false + with: + runner: ${{ matrix.runner }} + docker-image: ${{ matrix.docker-image }} + submodules: 'true' + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + timeout: 90 + script: | + # The generic Linux job chooses to use base env, not the one setup by the image + CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]") + conda activate "${CONDA_ENV}" + + MODEL_NAME=${{ matrix.model }} + BUILD_TOOL=${{ matrix.build-tool }} + BACKEND=${{ matrix.backend }} + + PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh --build-tool "${BUILD_TOOL}" + # Build and test ExecuTorch + PYTHON_EXECUTABLE=python bash .ci/scripts/test_model.sh "${MODEL_NAME}" "${BUILD_TOOL}" "${BACKEND}" + + test-models-linux: + name: test-models-linux + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main + permissions: + id-token: write + contents: read + strategy: + matrix: + model: [linear, add, add_mul, ic3, mv2, resnet18, resnet50, mobilebert, emformer_transcribe] + backend: [portable, xnnpack-quantization-delegation] runner: [linux.2xlarge] + include: + - model: ic4 + backend: portable + runner: linux.4xlarge.memory + - model: ic4 + backend: xnnpack-quantization-delegation + runner: linux.4xlarge.memory + - model: emformer_join + backend: portable + runner: linux.4xlarge.memory + - model: emformer_join + backend: xnnpack-quantization-delegation + runner: linux.4xlarge.memory + - model: phi-4-mini + backend: portable + runner: linux.4xlarge.memory + - model: llama3_2_vision_encoder + backend: portable + runner: linux.4xlarge.memory + - model: w2l + backend: portable + runner: linux.4xlarge.memory fail-fast: false with: runner: ${{ matrix.runner }} docker-image: executorch-ubuntu-22.04-clang12 submodules: 'true' ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} - timeout: ${{ matrix.timeout }} + timeout: 90 script: | # The generic Linux job chooses to use base env, not the one setup by the image CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]") conda activate "${CONDA_ENV}" MODEL_NAME=${{ matrix.model }} - BUILD_TOOL=${{ matrix.build-tool }} + BUILD_TOOL=cmake BACKEND=${{ matrix.backend }} - DEMO_BACKEND_DELEGATION=${{ matrix.demo_backend_delegation }} PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh --build-tool "${BUILD_TOOL}" # Build and test ExecuTorch - PYTHON_EXECUTABLE=python bash .ci/scripts/test_model.sh 
"${MODEL_NAME}" "${BUILD_TOOL}" "${BACKEND}" "${DEMO_BACKEND_DELEGATION}" + PYTHON_EXECUTABLE=python bash .ci/scripts/test_model.sh "${MODEL_NAME}" "${BUILD_TOOL}" "${BACKEND}" test-llama-runner-linux: + # Test Both linux x86 and linux aarch64 name: test-llama-runner-linux uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main permissions: @@ -80,21 +145,29 @@ jobs: strategy: matrix: dtype: [fp32] - mode: [portable, xnnpack+custom, xnnpack+custom+qe,xnnpack+custom+quantize_kv,xnnpack+quantize_kv] + mode: [xnnpack+custom+qe,xnnpack+custom+quantize_kv,xnnpack+quantize_kv] + runner: [linux.2xlarge, linux.arm64.2xlarge] + docker-image: [executorch-ubuntu-22.04-clang12, executorch-ubuntu-22.04-gcc11-aarch64] include: - - dtype: bf16 - mode: portable - dtype: bf16 mode: custom + runner: linux.2xlarge + docker-image: executorch-ubuntu-22.04-clang12 + # Excluding specific runner + docker image combinations that don't make sense: + # - Excluding the ARM64 gcc image on the x86 runner (linux.2xlarge) + # - Excluding the x86 clang image on the ARM64 runner (linux.arm64.2xlarge) + exclude: + - runner: linux.2xlarge + docker-image: executorch-ubuntu-22.04-gcc11-aarch64 + - runner: linux.arm64.2xlarge + docker-image: executorch-ubuntu-22.04-clang12 fail-fast: false with: - runner: linux.2xlarge - docker-image: executorch-ubuntu-22.04-clang12 + runner: ${{ matrix.runner }} + docker-image: ${{ matrix.docker-image }} submodules: 'true' ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} timeout: 900 - upload-artifact: android-models - upload-artifact-to-s3: true script: | # The generic Linux job chooses to use base env, not the one setup by the image CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]") diff --git a/.github/workflows/trunk.yml b/.github/workflows/trunk.yml index dd73e7321ee..e907e8215c9 100644 --- a/.github/workflows/trunk.yml +++ b/.github/workflows/trunk.yml @@ -23,10 +23,11 @@ jobs: uses: pytorch/test-infra/.github/workflows/macos_job.yml@main strategy: matrix: - model: [add, add_mul, emformer_join, emformer_transcribe, ic3, ic4, linear, llama2, mobilebert, mv2, mv3, resnet18, resnet50, vit, w2l] - backend: [portable, xnnpack-quantization-delegation] - build-tool: [cmake] - runner: [macos-m1-stable] + # Mac runners are expensive and limited, and non reliable. + # Do some basic testing for macos jobs, and rely mostly on + # test-models-linux-aarch64 job instead. 
+ model: [emformer_join, ic4, llama2, mobilebert, mv3, resnet50, vit, w2l] + backend: [xnnpack-quantization-delegation] include: - model: efficient_sam backend: portable @@ -34,34 +35,25 @@ jobs: backend: portable - model: llama3_2_vision_encoder backend: portable - - model: lstm - backend: portable - - model: mul - backend: portable - - model: phi-4-mini - backend: portable - - model: qwen2_5 - backend: portable - - model: softmax + - model: mv3 backend: portable fail-fast: false with: - runner: ${{ matrix.runner }} + runner: macos-m1-stable python-version: '3.11' submodules: 'true' ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} timeout: 90 script: | MODEL_NAME=${{ matrix.model }} - BUILD_TOOL=${{ matrix.build-tool }} + BUILD_TOOL=cmake BACKEND=${{ matrix.backend }} - DEMO_BACKEND_DELEGATION=${{ matrix.demo_backend_delegation }} bash .ci/scripts/setup-conda.sh # Setup MacOS dependencies as there is no Docker support on MacOS atm PYTHON_EXECUTABLE=python ${CONDA_RUN} bash .ci/scripts/setup-macos.sh --build-tool "${BUILD_TOOL}" # Build and test executorch - PYTHON_EXECUTABLE=python ${CONDA_RUN} bash .ci/scripts/test_model.sh "${MODEL_NAME}" "${BUILD_TOOL}" "${BACKEND}" "${DEMO_BACKEND_DELEGATION}" + PYTHON_EXECUTABLE=python ${CONDA_RUN} bash .ci/scripts/test_model.sh "${MODEL_NAME}" "${BUILD_TOOL}" "${BACKEND}" test-models-linux-aarch64: name: test-models-linux-aarch64 @@ -73,10 +65,22 @@ jobs: matrix: model: [linear, add, add_mul, ic3, ic4, mv2, mv3, resnet18, resnet50, vit, w2l, mobilebert, emformer_join, emformer_transcribe] backend: [portable, xnnpack-quantization-delegation] - runner: [linux.arm64.2xlarge] + include: + - model: lstm + backend: portable + - model: mul + backend: portable + - model: softmax + backend: portable + - model: phi-4-mini + backend: portable + - model: qwen2_5 + backend: portable + - model: llama3_2_vision_encoder + backend: portable fail-fast: false with: - runner: ${{ matrix.runner }} + runner: linux.arm64.2xlarge docker-image: executorch-ubuntu-22.04-gcc11-aarch64 submodules: 'true' ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} @@ -284,18 +288,72 @@ jobs: # Test ANE llama ${CONDA_RUN} sh .ci/scripts/test_ane_static_llama.sh - test-llama-runner-macos: - name: test-llama-runner-mac - uses: pytorch/test-infra/.github/workflows/macos_job.yml@main + test-llama-runner-linux: + # Test Both linux x86 and linux aarch64 + name: test-llama-runner-linux + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main + permissions: + id-token: write + contents: read strategy: matrix: dtype: [fp32] - mode: [portable, xnnpack+kv+custom, mps, coreml, xnnpack+custom+quantize_kv] + mode: [portable, xnnpack+custom] + runner: [linux.2xlarge, linux.arm64.2xlarge] + docker-image: [executorch-ubuntu-22.04-clang12, executorch-ubuntu-22.04-gcc11-aarch64] include: - dtype: bf16 mode: portable + runner: linux.2xlarge + docker-image: executorch-ubuntu-22.04-clang12 + - dtype: bf16 + mode: portable + runner: linux.arm64.2xlarge + docker-image: executorch-ubuntu-22.04-gcc11-aarch64 - dtype: bf16 mode: custom + runner: linux.arm64.2xlarge + docker-image: executorch-ubuntu-22.04-gcc11-aarch64 + # Excluding specific runner + docker image combinations that don't make sense: + # - Excluding the ARM64 gcc image on the x86 runner (linux.2xlarge) + # - Excluding the x86 clang image on the ARM64 runner (linux.arm64.2xlarge) + exclude: + - runner: linux.2xlarge + docker-image: 
executorch-ubuntu-22.04-gcc11-aarch64 + - runner: linux.arm64.2xlarge + docker-image: executorch-ubuntu-22.04-clang12 + fail-fast: false + with: + runner: ${{ matrix.runner }} + docker-image: ${{ matrix.docker-image }} + submodules: 'true' + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + timeout: 900 + script: | + # The generic Linux job chooses to use base env, not the one setup by the image + CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]") + conda activate "${CONDA_ENV}" + + DTYPE=${{ matrix.dtype }} + BUILD_TOOL="cmake" + MODE=${{ matrix.mode }} + ARTIFACTS_DIR_NAME="artifacts-to-be-uploaded/${DTYPE}-${MODE}" + ARTIFACTS_DIR_NAME="${ARTIFACTS_DIR_NAME/+/-}" + + # Setup executorch + PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh --build-tool "${BUILD_TOOL}" + # Install requirements for export_llama + PYTHON_EXECUTABLE=python bash examples/models/llama/install_requirements.sh + # Test llama2 + PYTHON_EXECUTABLE=python bash .ci/scripts/test_llama.sh -model stories110M -build_tool "${BUILD_TOOL}" -dtype "${DTYPE}" -mode "${MODE}" -upload "${ARTIFACTS_DIR_NAME}" + + test-llama-runner-macos: + name: test-llama-runner-mac + uses: pytorch/test-infra/.github/workflows/macos_job.yml@main + strategy: + matrix: + dtype: [fp32] + mode: [mps, coreml, xnnpack+custom+quantize_kv] fail-fast: false with: runner: macos-m1-stable diff --git a/CMakeLists.txt b/CMakeLists.txt index fabf667cbe1..3385bfb6d39 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -474,6 +474,17 @@ cmake_dependent_option( "NOT FLATC_EXECUTABLE;EXECUTORCH_BUILD_HOST_TARGETS" OFF ) + +set(FLATBUFFERS_BUILD_FLATC OFF CACHE BOOL "") +set(FLATBUFFERS_BUILD_FLATHASH OFF CACHE BOOL "") +set(FLATBUFFERS_BUILD_FLATLIB OFF CACHE BOOL "") +set(FLATBUFFERS_BUILD_TESTS OFF CACHE BOOL "") +set(FLATBUFFERS_INSTALL OFF CACHE BOOL "") +# exir lets users set the alignment of tensor data embedded in the flatbuffer, +# and some users need an alignment larger than the default, which is typically +# 32. +set(FLATBUFFERS_MAX_ALIGNMENT 1024) + if(EXECUTORCH_BUILD_FLATC) if(FLATC_EXECUTABLE) # We could ignore this, but it could lead to confusion about which `flatc` @@ -482,41 +493,50 @@ if(EXECUTORCH_BUILD_FLATC) FATAL_ERROR "May not set both EXECUTORCH_BUILD_FLATC and FLATC_EXECUTABLE" ) endif() - set(FLATC_EXECUTABLE flatc) - set(FLATBUFFERS_BUILD_FLATC - ON - CACHE BOOL "" - ) - set(FLATBUFFERS_BUILD_FLATHASH - OFF - CACHE BOOL "" - ) - set(FLATBUFFERS_BUILD_FLATLIB - OFF - CACHE BOOL "" - ) - set(FLATBUFFERS_BUILD_TESTS - OFF - CACHE BOOL "" - ) - set(FLATBUFFERS_INSTALL - OFF - CACHE BOOL "" - ) - add_subdirectory(third-party/flatbuffers) - # exir lets users set the alignment of tensor data embedded in the flatbuffer, - # and some users need an alignment larger than the default, which is typically - # 32. - target_compile_definitions(flatc PRIVATE FLATBUFFERS_MAX_ALIGNMENT=1024) + # Build flatc for the *host* to generate files as part of the build step. 
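When cross-compiling, the generated-schema rules still need a `flatc` that runs on the build machine, which is why the comment above moves the build into a host-targeted external project: the `ExternalProject_Add` block that follows configures and builds flatbuffers in its own tree, independent of the device toolchain. Roughly, it is the scripted equivalent of the following sketch — illustrative only, with a hypothetical build directory:

```
# Hand-rolled version of what the ExternalProject below does for flatc.
SRC=third-party/flatbuffers
BUILD=cmake-out/host-flatbuffers   # hypothetical host build dir

cmake -S "${SRC}" -B "${BUILD}" \
  -DFLATBUFFERS_BUILD_FLATC=ON \
  -DFLATBUFFERS_BUILD_FLATHASH=OFF \
  -DFLATBUFFERS_BUILD_FLATLIB=OFF \
  -DFLATBUFFERS_BUILD_TESTS=OFF \
  -DFLATBUFFERS_INSTALL=OFF \
  -DCMAKE_BUILD_TYPE=Release \
  -DCMAKE_CXX_FLAGS="-DFLATBUFFERS_MAX_ALIGNMENT=1024"
cmake --build "${BUILD}" --target flatc

"${BUILD}/flatc" --version   # the binary the schema-generation rules depend on
```

The `DEPENDS flatc` edits to the backend CMake files later in this diff hook their generated-header custom commands onto exactly this target.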
+ include(ExternalProject) + ExternalProject_Add( + flatbuffers + PREFIX ${CMAKE_CURRENT_BINARY_DIR}/third-party/flatbuffers + BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR}/third-party/flatbuffers + SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/third-party/flatbuffers + CMAKE_ARGS -DFLATBUFFERS_BUILD_FLATC=ON + -DFLATBUFFERS_BUILD_FLATHASH=${FLATBUFFERS_BUILD_FLATHASH} + -DFLATBUFFERS_BUILD_FLATLIB=${FLATBUFFERS_BUILD_FLATLIB} + -DFLATBUFFERS_BUILD_TESTS=${FLATBUFFERS_BUILD_TESTS} + -DFLATBUFFERS_INSTALL=${FLATBUFFERS_INSTALL} + -DCMAKE_BUILD_TYPE=Release + -DCMAKE_CXX_FLAGS="-DFLATBUFFERS_MAX_ALIGNMENT=${FLATBUFFERS_MAX_ALIGNMENT}" + INSTALL_COMMAND "" + BUILD_BYPRODUCTS /flatc + ) + ExternalProject_Get_Property(flatbuffers BINARY_DIR) + set(FLATC_EXECUTABLE ${BINARY_DIR}/flatc) + set(FLATC_EXECUTABLE_BUILT_FROM_SOURCE YES) endif() + if(NOT FLATC_EXECUTABLE) message( - FATAL_ERROR - "FLATC_EXECUTABLE must be set when EXECUTORCH_BUILD_FLATC is disabled. " - "Note that EXECUTORCH_BUILD_FLATC may be disabled implicitly when " - "cross-compiling or when EXECUTORCH_BUILD_HOST_TARGETS is disabled." + WARNING "FLATC_EXECUTABLE not specified, looking for flatc" ) + find_program(FLATC_EXECUTABLE flatc) + + if(NOT FLATC_EXECUTABLE) + message( + FATAL_ERROR + "FLATC_EXECUTABLE must be set when EXECUTORCH_BUILD_FLATC is disabled. " + "Note that EXECUTORCH_BUILD_FLATC may be disabled implicitly when " + "cross-compiling or when EXECUTORCH_BUILD_HOST_TARGETS is disabled." + ) + endif() +endif() + +add_executable(flatc IMPORTED GLOBAL) +set_target_properties(flatc PROPERTIES IMPORTED_LOCATION ${FLATC_EXECUTABLE}) + +if(FLATC_EXECUTABLE_BUILT_FROM_SOURCE) + add_dependencies(flatc flatbuffers) endif() # diff --git a/backends/apple/coreml/TARGETS b/backends/apple/coreml/TARGETS index df1165dd74e..a8802e99b56 100644 --- a/backends/apple/coreml/TARGETS +++ b/backends/apple/coreml/TARGETS @@ -5,6 +5,14 @@ load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") oncall("executorch") +# TODO: this is a placeholder to support internal fbcode build. We should add the coreml backend target properly. +runtime.python_library( + name = "coreml", + visibility = [ + "@EXECUTORCH_CLIENTS", + ], +) + runtime.python_library( name = "backend", srcs = glob([ diff --git a/backends/apple/mps/CMakeLists.txt b/backends/apple/mps/CMakeLists.txt index 96aa007563b..4bd4077a0f6 100644 --- a/backends/apple/mps/CMakeLists.txt +++ b/backends/apple/mps/CMakeLists.txt @@ -22,10 +22,6 @@ if(NOT PYTHON_EXECUTABLE) resolve_python_executable() endif() -if(NOT FLATC_EXECUTABLE) - set(FLATC_EXECUTABLE flatc) -endif() - set(_common_compile_options -Wno-deprecated-declarations) set(_common_include_directories ${EXECUTORCH_ROOT}/..) @@ -50,6 +46,7 @@ add_custom_command( "${_mps_schema__include_dir}/executorch/backends/apple/mps" ${_mps_schema__srcs} WORKING_DIRECTORY ${EXECUTORCH_ROOT} + DEPENDS flatc COMMENT "Generating mps_schema headers" VERBATIM ) diff --git a/backends/cadence/CMakeLists.txt b/backends/cadence/CMakeLists.txt index 65dd5430588..a96d38e5141 100644 --- a/backends/cadence/CMakeLists.txt +++ b/backends/cadence/CMakeLists.txt @@ -28,7 +28,7 @@ set(_common_include_directories ${EXECUTORCH_ROOT}/.. 
add_compile_definitions(C10_USING_CUSTOM_GENERATED_MACROS) if(EXECUTORCH_CADENCE_CPU_RUNNER) - include(${EXECUTORCH_ROOT}/build/Codegen.cmake) + include(${EXECUTORCH_ROOT}/scripts/build/Codegen.cmake) if(NOT PYTHON_EXECUTABLE) resolve_python_executable() diff --git a/backends/cadence/fusion_g3/operators/CMakeLists.txt b/backends/cadence/fusion_g3/operators/CMakeLists.txt index f39614ee4f3..b3dbcae4135 100644 --- a/backends/cadence/fusion_g3/operators/CMakeLists.txt +++ b/backends/cadence/fusion_g3/operators/CMakeLists.txt @@ -12,7 +12,7 @@ if(NOT CMAKE_CXX_STANDARD) endif() include(${EXECUTORCH_ROOT}/build/Utils.cmake) -include(${EXECUTORCH_ROOT}/build/Codegen.cmake) +include(${EXECUTORCH_ROOT}/scripts/build/Codegen.cmake) if(NOT PYTHON_EXECUTABLE) resolve_python_executable() diff --git a/backends/cadence/hifi/operators/CMakeLists.txt b/backends/cadence/hifi/operators/CMakeLists.txt index 28b3c3b6aca..36469dc92e7 100644 --- a/backends/cadence/hifi/operators/CMakeLists.txt +++ b/backends/cadence/hifi/operators/CMakeLists.txt @@ -12,7 +12,7 @@ if(NOT CMAKE_CXX_STANDARD) endif() include(${EXECUTORCH_ROOT}/build/Utils.cmake) -include(${EXECUTORCH_ROOT}/build/Codegen.cmake) +include(${EXECUTORCH_ROOT}/scripts/build/Codegen.cmake) if(NOT PYTHON_EXECUTABLE) resolve_python_executable() diff --git a/backends/cadence/reference/operators/CMakeLists.txt b/backends/cadence/reference/operators/CMakeLists.txt index 69a104277fd..7d213a12813 100644 --- a/backends/cadence/reference/operators/CMakeLists.txt +++ b/backends/cadence/reference/operators/CMakeLists.txt @@ -12,7 +12,7 @@ if(NOT CMAKE_CXX_STANDARD) endif() include(${EXECUTORCH_ROOT}/build/Utils.cmake) -include(${EXECUTORCH_ROOT}/build/Codegen.cmake) +include(${EXECUTORCH_ROOT}/scripts/build/Codegen.cmake) if(NOT PYTHON_EXECUTABLE) resolve_python_executable() diff --git a/backends/qualcomm/CMakeLists.txt b/backends/qualcomm/CMakeLists.txt index 560cac176b3..f5adc84f903 100644 --- a/backends/qualcomm/CMakeLists.txt +++ b/backends/qualcomm/CMakeLists.txt @@ -39,16 +39,13 @@ if(${ANDROID}) find_library(android_log log) endif() -if(NOT FLATC_EXECUTABLE) - set(FLATC_EXECUTABLE flatc) -endif() - set(qcir_schema_include_dir ${CMAKE_CURRENT_LIST_DIR}/aot/ir) set(qcir_schema_output ${qcir_schema_include_dir}/qcir_generated.h) add_custom_command( OUTPUT qcir_schema_output COMMAND ${FLATC_EXECUTABLE} --cpp --cpp-std c++11 --scoped-enums -o ${qcir_schema_include_dir} ${qcir_schema_include_dir}/qcir.fbs + DEPENDS flatc COMMENT "Generating qualcomm ir schema headers" VERBATIM ) @@ -100,6 +97,7 @@ add_custom_command( "${_qnn_schema__include_dir}/executorch/backends/qualcomm" ${_qnn_schema__srcs} WORKING_DIRECTORY ${EXECUTORCH_SOURCE_DIR} + DEPENDS flatc COMMENT "Generating qnn_schema headers" VERBATIM ) diff --git a/backends/vulkan/CMakeLists.txt b/backends/vulkan/CMakeLists.txt index fca34fdf6a4..db90bdc7c29 100644 --- a/backends/vulkan/CMakeLists.txt +++ b/backends/vulkan/CMakeLists.txt @@ -28,10 +28,6 @@ if(NOT PYTHON_EXECUTABLE) set(PYTHON_EXECUTABLE python3) endif() -if(NOT FLATC_EXECUTABLE) - set(FLATC_EXECUTABLE flatc) -endif() - # Include this file to access target_link_options_shared_lib This is required to # provide access to target_link_options_shared_lib which allows libraries to be # linked with the --whole-archive flag. 
This is required for libraries that @@ -92,6 +88,7 @@ add_custom_command( ${FLATC_EXECUTABLE} --cpp --cpp-std c++11 --scoped-enums -o "${SCHEMA_INCLUDE_DIR}/executorch/backends/vulkan/serialization/" ${_vulkan_schema__srcs} WORKING_DIRECTORY ${EXECUTORCH_ROOT} + DEPENDS flatc COMMENT "Generating vulkan_schema headers" VERBATIM ) diff --git a/backends/vulkan/runtime/vk_api/QueryPool.cpp b/backends/vulkan/runtime/vk_api/QueryPool.cpp index b029cea7081..2f6d433b887 100644 --- a/backends/vulkan/runtime/vk_api/QueryPool.cpp +++ b/backends/vulkan/runtime/vk_api/QueryPool.cpp @@ -185,19 +185,20 @@ std::vector QueryPool::get_shader_timestamp_data() { std::vector shader_result; for (ShaderDuration& entry : shader_durations_) { shader_result.push_back(ShaderResult{ - .kernel_name = entry.kernel_name, - .dispatch_id = entry.dispatch_id, - .start_time_ns = entry.start_time_ns, - .end_time_ns = entry.end_time_ns, - .metadata = ShaderMetadata{ - .global_workgroup_size = - {entry.global_workgroup_size.width, - entry.global_workgroup_size.height, - entry.global_workgroup_size.depth}, - .local_workgroup_size = - {entry.local_workgroup_size.width, - entry.local_workgroup_size.height, - entry.local_workgroup_size.depth}, + /* .kernel_name = */ entry.kernel_name, + /* .dispatch_id = */ entry.dispatch_id, + /* .start_time_ns = */ entry.start_time_ns, + /* .end_time_ns = */ entry.end_time_ns, + /* .metadata = */ + ShaderMetadata{ + /* .global_workgroup_size = */ + {entry.global_workgroup_size.width, + entry.global_workgroup_size.height, + entry.global_workgroup_size.depth}, + /* .local_workgroup_size = */ + {entry.local_workgroup_size.width, + entry.local_workgroup_size.height, + entry.local_workgroup_size.depth}, }}); } return shader_result; diff --git a/backends/xnnpack/CMakeLists.txt b/backends/xnnpack/CMakeLists.txt index a453b16aa58..8b3bf3d91c1 100644 --- a/backends/xnnpack/CMakeLists.txt +++ b/backends/xnnpack/CMakeLists.txt @@ -18,10 +18,6 @@ if(NOT CMAKE_CXX_STANDARD) set(CMAKE_CXX_STANDARD 17) endif() -if(NOT FLATC_EXECUTABLE) - set(FLATC_EXECUTABLE flatc) -endif() - # Source root directory for executorch. if(NOT EXECUTORCH_ROOT) set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../..) @@ -82,6 +78,7 @@ add_custom_command( ${_xnnpack_schema__srcs} COMMAND mv ${_xnnpack_flatbuffer__outputs} ${_xnnpack_schema__outputs} WORKING_DIRECTORY ${EXECUTORCH_ROOT} + DEPENDS flatc COMMENT "Generating xnnpack_schema headers" VERBATIM ) diff --git a/build/build_android_instrumentation.sh b/build/build_android_instrumentation.sh new file mode 100644 index 00000000000..91bf03691b0 --- /dev/null +++ b/build/build_android_instrumentation.sh @@ -0,0 +1,40 @@ +#!/bin/bash +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
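The script body below was lifted out of `build_android_library.sh`, whose hunks later in this diff delete the same `gradlew` and artifact-collection steps. Outside CI, the intended invocation is roughly the following sketch; the artifacts directory name is an assumption, and the SDK path is the default the script itself falls back to:

```
# Build the instrumentation-test APKs and stage them for upload.
# ARTIFACTS_DIR_NAME is optional; when unset, APKs stay in the gradle
# output directories under extension/android_test/build/outputs/apk/.
export ANDROID_SDK=/opt/android/sdk            # default used by the script
ARTIFACTS_DIR_NAME=artifacts-to-be-uploaded \
  bash build/build_android_instrumentation.sh
```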
+ +set -ex + +if [[ -z "${PYTHON_EXECUTABLE:-}" ]]; then + PYTHON_EXECUTABLE=python3 +fi +which "${PYTHON_EXECUTABLE}" + +build_android_test() { + pushd extension/android_test + ANDROID_HOME="${ANDROID_SDK:-/opt/android/sdk}" ./gradlew testDebugUnitTest + ANDROID_HOME="${ANDROID_SDK:-/opt/android/sdk}" ./gradlew build assembleAndroidTest + popd +} + +collect_artifacts_to_be_uploaded() { + ARTIFACTS_DIR_NAME="$1" + # Collect Java library test + JAVA_LIBRARY_TEST_DIR="${ARTIFACTS_DIR_NAME}/library_test_dir" + mkdir -p "${JAVA_LIBRARY_TEST_DIR}" + cp extension/android_test/build/outputs/apk/debug/*.apk "${JAVA_LIBRARY_TEST_DIR}" + cp extension/android_test/build/outputs/apk/androidTest/debug/*.apk "${JAVA_LIBRARY_TEST_DIR}" +} + +main() { + build_android_test + if [ -n "$ARTIFACTS_DIR_NAME" ]; then + collect_artifacts_to_be_uploaded ${ARTIFACTS_DIR_NAME} + fi +} + +if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then + main "$@" +fi diff --git a/build/build_android_library.sh b/build/build_android_library.sh index 01ea86bf830..32b2210a54e 100644 --- a/build/build_android_library.sh +++ b/build/build_android_library.sh @@ -149,11 +149,6 @@ build_android_demo_apps() { pushd extension/benchmark/android/benchmark ANDROID_HOME="${ANDROID_SDK:-/opt/android/sdk}" ./gradlew build assembleAndroidTest popd - - pushd extension/android_test - ANDROID_HOME="${ANDROID_SDK:-/opt/android/sdk}" ./gradlew testDebugUnitTest - ANDROID_HOME="${ANDROID_SDK:-/opt/android/sdk}" ./gradlew build assembleAndroidTest - popd } collect_artifacts_to_be_uploaded() { @@ -172,11 +167,6 @@ collect_artifacts_to_be_uploaded() { mkdir -p "${MINIBENCH_APP_DIR}" cp extension/benchmark/android/benchmark/app/build/outputs/apk/debug/*.apk "${MINIBENCH_APP_DIR}" cp extension/benchmark/android/benchmark/app/build/outputs/apk/androidTest/debug/*.apk "${MINIBENCH_APP_DIR}" - # Collect Java library test - JAVA_LIBRARY_TEST_DIR="${ARTIFACTS_DIR_NAME}/library_test_dir" - mkdir -p "${JAVA_LIBRARY_TEST_DIR}" - cp extension/android_test/build/outputs/apk/debug/*.apk "${JAVA_LIBRARY_TEST_DIR}" - cp extension/android_test/build/outputs/apk/androidTest/debug/*.apk "${JAVA_LIBRARY_TEST_DIR}" } main() { diff --git a/configurations/CMakeLists.txt b/configurations/CMakeLists.txt index cf304d92523..5279ffa566c 100644 --- a/configurations/CMakeLists.txt +++ b/configurations/CMakeLists.txt @@ -25,7 +25,7 @@ endif() set(_common_compile_options -Wno-deprecated-declarations) include(${EXECUTORCH_ROOT}/build/Utils.cmake) -include(${EXECUTORCH_ROOT}/build/Codegen.cmake) +include(${EXECUTORCH_ROOT}/scripts/build/Codegen.cmake) if(EXECUTORCH_BUILD_KERNELS_OPTIMIZED) # Merge optimized and portable definitions, taking optimized where available. diff --git a/devtools/CMakeLists.txt b/devtools/CMakeLists.txt index 3f3a836c12b..abd33bac886 100644 --- a/devtools/CMakeLists.txt +++ b/devtools/CMakeLists.txt @@ -38,10 +38,6 @@ if(NOT PYTHON_EXECUTABLE) resolve_python_executable() endif() -if(NOT FLATC_EXECUTABLE) - set(FLATC_EXECUTABLE flatc) -endif() - # Paths to headers generated from the .fbs files. 
set(_etdump_schemas # etdump_schema_flatcc.fbs scalar_type.fbs) @@ -205,7 +201,7 @@ add_custom_command( "${_bundled_schema__include_dir}/executorch/devtools/bundled_program/schema" ${_bundled_program_schema__srcs} WORKING_DIRECTORY ${CMAKE_BINARY_DIR}/devtools - DEPENDS ${FLATC_EXECUTABLE} ${_bundled_program_schema__srcs} + DEPENDS flatc ${_bundled_program_schema__srcs} COMMENT "Generating bundled_program headers" VERBATIM ) diff --git a/docs/source/kernel-library-selective-build.md b/docs/source/kernel-library-selective-build.md index 1a7562942e0..fe2d53c8be8 100644 --- a/docs/source/kernel-library-selective-build.md +++ b/docs/source/kernel-library-selective-build.md @@ -36,7 +36,7 @@ The basic flow looks like this: ## APIs -We expose a CMake macro `[gen_selected_ops](https://github.com/pytorch/executorch/blob/main/build/Codegen.cmake#L12)`, to allow users specifying op info: +We expose a CMake macro `[gen_selected_ops](https://github.com/pytorch/executorch/blob/main/scripts/build/Codegen.cmake#L12)`, to allow users specifying op info: ``` gen_selected_ops( diff --git a/examples/apple/mps/CMakeLists.txt b/examples/apple/mps/CMakeLists.txt index 319d8159ced..d3bf0fb0321 100644 --- a/examples/apple/mps/CMakeLists.txt +++ b/examples/apple/mps/CMakeLists.txt @@ -18,10 +18,6 @@ if(NOT CMAKE_CXX_STANDARD) set(CMAKE_CXX_STANDARD 17) endif() -if(NOT FLATC_EXECUTABLE) - set(FLATC_EXECUTABLE flatc) -endif() - # Source root directory for executorch. if(NOT EXECUTORCH_ROOT) set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../..) @@ -63,7 +59,7 @@ if(NOT CMAKE_TOOLCHAIN_FILE MATCHES ".*(iOS|ios\.toolchain)\.cmake$") # portable_ops_lib include(${EXECUTORCH_ROOT}/build/Utils.cmake) - include(${EXECUTORCH_ROOT}/build/Codegen.cmake) + include(${EXECUTORCH_ROOT}/scripts/build/Codegen.cmake) gen_selected_ops(LIB_NAME "mps_portable_ops_lib" INCLUDE_ALL_OPS "ON") generate_bindings_for_kernels( LIB_NAME "mps_portable_ops_lib" FUNCTIONS_YAML diff --git a/examples/arm/CMakeLists.txt b/examples/arm/CMakeLists.txt index 0c754beaaaf..2f8055ce5e9 100644 --- a/examples/arm/CMakeLists.txt +++ b/examples/arm/CMakeLists.txt @@ -36,7 +36,7 @@ find_package(executorch CONFIG REQUIRED HINTS ${CMAKE_INSTALL_PREFIX}) target_include_directories(executorch INTERFACE ${_common_include_directories}) include(${EXECUTORCH_ROOT}/build/Utils.cmake) -include(${EXECUTORCH_ROOT}/build/Codegen.cmake) +include(${EXECUTORCH_ROOT}/scripts/build/Codegen.cmake) # Generate C++ bindings to register kernels into both PyTorch (for AOT) and # Executorch (for runtime). 
Here select all ops in functions.yaml diff --git a/examples/devtools/CMakeLists.txt b/examples/devtools/CMakeLists.txt index 7ed5232ba41..9319135f8e9 100644 --- a/examples/devtools/CMakeLists.txt +++ b/examples/devtools/CMakeLists.txt @@ -23,7 +23,7 @@ if(NOT EXECUTORCH_ROOT) endif() include(${EXECUTORCH_ROOT}/build/Utils.cmake) -include(${EXECUTORCH_ROOT}/build/Codegen.cmake) +include(${EXECUTORCH_ROOT}/scripts/build/Codegen.cmake) if(NOT PYTHON_EXECUTABLE) resolve_python_executable() diff --git a/examples/mediatek/CMakeLists.txt b/examples/mediatek/CMakeLists.txt index 826a2c17fa0..d8e2a5bf667 100644 --- a/examples/mediatek/CMakeLists.txt +++ b/examples/mediatek/CMakeLists.txt @@ -20,7 +20,7 @@ if(NOT EXECUTORCH_ROOT) endif() include(${EXECUTORCH_ROOT}/build/Utils.cmake) -include(${EXECUTORCH_ROOT}/build/Codegen.cmake) +include(${EXECUTORCH_ROOT}/scripts/build/Codegen.cmake) if(NOT PYTHON_EXECUTABLE) resolve_python_executable() diff --git a/examples/models/llama/runner/CMakeLists.txt b/examples/models/llama/runner/CMakeLists.txt index b707f385f33..919bc356551 100644 --- a/examples/models/llama/runner/CMakeLists.txt +++ b/examples/models/llama/runner/CMakeLists.txt @@ -21,7 +21,7 @@ if(NOT EXECUTORCH_ROOT) endif() include(${EXECUTORCH_ROOT}/build/Utils.cmake) -include(${EXECUTORCH_ROOT}/build/Codegen.cmake) +include(${EXECUTORCH_ROOT}/scripts/build/Codegen.cmake) # # The `__srcs` lists are defined by including ${EXECUTORCH_SRCS_FILE}. diff --git a/examples/models/llava/runner/CMakeLists.txt b/examples/models/llava/runner/CMakeLists.txt index 2d0c30a620e..7bad4a827ae 100644 --- a/examples/models/llava/runner/CMakeLists.txt +++ b/examples/models/llava/runner/CMakeLists.txt @@ -21,7 +21,7 @@ if(NOT EXECUTORCH_ROOT) endif() include(${EXECUTORCH_ROOT}/build/Utils.cmake) -include(${EXECUTORCH_ROOT}/build/Codegen.cmake) +include(${EXECUTORCH_ROOT}/scripts/build/Codegen.cmake) # Let files say "include ". set(_common_include_directories ${EXECUTORCH_ROOT}/..) diff --git a/examples/models/moshi/mimi/install_requirements.sh b/examples/models/moshi/mimi/install_requirements.sh new file mode 100755 index 00000000000..2de86466130 --- /dev/null +++ b/examples/models/moshi/mimi/install_requirements.sh @@ -0,0 +1,15 @@ +#!/bin/bash +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
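The body below pulls in `moshi` and `bitsandbytes`, then reuses the llama requirements script for the torchao dependency. Together with the new `test_mimi.py` added right after it, a local run would look something like this sketch; the tests download weights from Hugging Face unless `MIMI_WEIGHT` points at a local checkpoint:

```
bash examples/models/moshi/mimi/install_requirements.sh

# test_mimi.py honors two optional overrides (see its setUpClass):
#   MIMI_WEIGHT - local checkpoint path (default: fetched via hf_hub_download)
#   HF_REPO     - alternate Hugging Face repo (default: loaders.DEFAULT_REPO)
python examples/models/moshi/mimi/test_mimi.py
```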
+ +set -x + +pip install -U moshi +pip install bitsandbytes +# Run llama2/install requirements for torchao deps +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) + +bash "$SCRIPT_DIR"/../llama/install_requirements.sh diff --git a/examples/models/moshi/mimi/test_mimi.py b/examples/models/moshi/mimi/test_mimi.py new file mode 100644 index 00000000000..54b6b0d33ad --- /dev/null +++ b/examples/models/moshi/mimi/test_mimi.py @@ -0,0 +1,156 @@ +import io +import os +import random +import unittest + +import numpy as np +import requests +import torch +import torch.nn as nn +import torchaudio + +from huggingface_hub import hf_hub_download +from moshi.models import loaders +from torch.export import export, ExportedProgram + + +def read_mp3_from_url(url): + response = requests.get(url) + response.raise_for_status() # Ensure request is successful + audio_stream = io.BytesIO(response.content) + waveform, sample_rate = torchaudio.load(audio_stream, format="mp3") + return waveform.numpy(), sample_rate + + +class TestMimiModel(unittest.TestCase): + @classmethod + def setUpClass(cls): + """Setup once for all tests: Load model and prepare test data.""" + + # Get environment variables (if set), otherwise use default values + mimi_weight = os.getenv("MIMI_WEIGHT", None) + hf_repo = os.getenv("HF_REPO", loaders.DEFAULT_REPO) + device = "cuda" if torch.cuda.device_count() else "cpu" + + def seed_all(seed): + torch.manual_seed(seed) + if torch.cuda.is_available(): + torch.cuda.manual_seed(seed) + torch.cuda.manual_seed_all(seed) + random.seed(seed) + np.random.seed(seed) + torch.backends.cudnn.deterministic = True + torch.backends.cudnn.benchmark = False + + seed_all(42424242) + + if mimi_weight is None: + mimi_weight = hf_hub_download(hf_repo, loaders.MIMI_NAME) + cls.mimi = loaders.get_mimi(mimi_weight, device) + cls.device = device + cls.sample_pcm, cls.sample_sr = read_mp3_from_url( + "https://huggingface.co/lmz/moshi-swift/resolve/main/bria-24khz.mp3" + ) + + def test_mp3_loading(self): + """Ensure MP3 file loads correctly.""" + self.assertIsInstance(self.sample_pcm, np.ndarray) + self.assertGreater(self.sample_sr, 0) + + def test_encoding(self): + """Ensure encoding produces expected tensor shape.""" + pcm_chunk_size = int(self.mimi.sample_rate / self.mimi.frame_rate) + sample_pcm = torch.tensor(self.sample_pcm, device=self.device) + sample_pcm = sample_pcm[None] + chunk = sample_pcm[..., 0:pcm_chunk_size] + encoded = self.mimi.encode(chunk) + self.assertIsInstance(encoded, torch.Tensor) + self.assertGreater(encoded.shape[-1], 0) + + def test_decoding(self): + """Ensure decoding produces expected output.""" + pcm_chunk_size = int(self.mimi.sample_rate / self.mimi.frame_rate) + sample_pcm = torch.tensor(self.sample_pcm, device=self.device)[None] + chunk = sample_pcm[..., 0:pcm_chunk_size] + encoded = self.mimi.encode(chunk) + decoded = self.mimi.decode(encoded) + self.assertIsInstance(decoded, torch.Tensor) + + def test_streaming_encoding_decoding(self): + """Test streaming encoding and decoding consistency.""" + pcm_chunk_size = int(self.mimi.sample_rate / self.mimi.frame_rate) + sample_rate = self.mimi.sample_rate + max_duration_sec = 10.0 + max_duration_len = int(sample_rate * max_duration_sec) + + sample_pcm = torch.tensor(self.sample_pcm, device=self.device) + if sample_pcm.shape[-1] > max_duration_len: + sample_pcm = sample_pcm[..., :max_duration_len] + sample_pcm = sample_pcm[None].to(device=self.device) + + all_codes = [] + for start_idx in range(0, 
sample_pcm.shape[-1], pcm_chunk_size): + end_idx = min(sample_pcm.shape[-1], start_idx + pcm_chunk_size) + chunk = sample_pcm[..., start_idx:end_idx] + codes = self.mimi.encode(chunk) + if codes.shape[-1]: + all_codes.append(codes) + + all_codes_th = torch.cat(all_codes, dim=-1) + + all_pcms = [] + with self.mimi.streaming(1): + for i in range(all_codes_th.shape[-1]): + codes = all_codes_th[..., i : i + 1] + pcm = self.mimi.decode(codes) + all_pcms.append(pcm) + all_pcms = torch.cat(all_pcms, dim=-1) + + pcm_ref = self.mimi.decode(all_codes_th) + self.assertTrue(torch.allclose(pcm_ref, all_pcms, atol=1e-5)) + + def test_exported_decoding(self): + """Ensure exported decoding model is consistent with reference output.""" + + class MimiDecode(nn.Module): + def __init__(self, mimi: nn.Module): + super().__init__() + self.mimi_model = mimi + + def forward(self, x): + return self.mimi_model.decode(x) + + sample_pcm = torch.tensor(self.sample_pcm, device=self.device)[None] + pcm_chunk_size = int(self.mimi.sample_rate / self.mimi.frame_rate) + chunk = sample_pcm[..., 0:pcm_chunk_size] + input = self.mimi.encode(chunk) + + mimi_decode = MimiDecode(self.mimi) + ref_decode_output = mimi_decode(input) + exported_decode: ExportedProgram = export(mimi_decode, (input,), strict=False) + ep_decode_output = exported_decode.module()(input) + self.assertTrue(torch.allclose(ep_decode_output, ref_decode_output, atol=1e-6)) + + def test_exported_encoding(self): + """Ensure exported encoding model is consistent with reference output.""" + + class MimiEncode(nn.Module): + def __init__(self, mimi: nn.Module): + super().__init__() + self.mimi_model = mimi + + def forward(self, x): + return self.mimi_model.encode(x) + + mimi_encode = MimiEncode(self.mimi) + chunk = torch.tensor(self.sample_pcm, device=self.device)[None][ + ..., 0 : int(self.mimi.sample_rate / self.mimi.frame_rate) + ] + ref_encode_output = mimi_encode(chunk) + exported_encode = export(mimi_encode, (chunk,), strict=False) + ep_encode_output = exported_encode.module()(chunk) + self.assertTrue(torch.allclose(ep_encode_output, ref_encode_output, atol=1e-6)) + + +if __name__ == "__main__": + unittest.main() diff --git a/examples/portable/custom_ops/CMakeLists.txt b/examples/portable/custom_ops/CMakeLists.txt index 02736cca964..9d165d342d0 100644 --- a/examples/portable/custom_ops/CMakeLists.txt +++ b/examples/portable/custom_ops/CMakeLists.txt @@ -28,7 +28,7 @@ if(NOT EXECUTORCH_ROOT) endif() include(${EXECUTORCH_ROOT}/build/Utils.cmake) -include(${EXECUTORCH_ROOT}/build/Codegen.cmake) +include(${EXECUTORCH_ROOT}/scripts/build/Codegen.cmake) if(NOT PYTHON_EXECUTABLE) resolve_python_executable() diff --git a/examples/qualcomm/CMakeLists.txt b/examples/qualcomm/CMakeLists.txt index 0a46c061b64..c8946f63a6b 100644 --- a/examples/qualcomm/CMakeLists.txt +++ b/examples/qualcomm/CMakeLists.txt @@ -16,7 +16,7 @@ if(NOT EXECUTORCH_ROOT) endif() include(${EXECUTORCH_ROOT}/build/Utils.cmake) -include(${EXECUTORCH_ROOT}/build/Codegen.cmake) +include(${EXECUTORCH_ROOT}/scripts/build/Codegen.cmake) if(NOT PYTHON_EXECUTABLE) resolve_python_executable() diff --git a/examples/selective_build/CMakeLists.txt b/examples/selective_build/CMakeLists.txt index c2ce3f09e7a..6647f0a62b4 100644 --- a/examples/selective_build/CMakeLists.txt +++ b/examples/selective_build/CMakeLists.txt @@ -22,7 +22,7 @@ set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../..) 
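This is the last of roughly a dozen hunks applying the same one-line move of `Codegen.cmake` from `build/` to `scripts/build/` (cadence, configurations, the selective-build docs link, mps, arm, devtools, mediatek, the llama and llava runners, custom_ops, qualcomm, and here); note that the `build/Utils.cmake` includes are not moved in this diff. A quick way to confirm no include was missed — illustrative only, run from the repo root:

```
# Any hit here is a stale include still pointing at the old location.
grep -rn 'EXECUTORCH_ROOT}/build/Codegen.cmake' --include='CMakeLists.txt' .
```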
set(TORCH_ROOT ${EXECUTORCH_ROOT}/third-party/pytorch) include(${EXECUTORCH_ROOT}/build/Utils.cmake) -include(${EXECUTORCH_ROOT}/build/Codegen.cmake) +include(${EXECUTORCH_ROOT}/scripts/build/Codegen.cmake) if(NOT PYTHON_EXECUTABLE) resolve_python_executable() diff --git a/extension/apple/ExecutorchRuntimeBridge/ExecutorchRuntimeBridge/Exported/Data/ExecutorchRuntimeTensorValue.h b/extension/apple/ExecutorchRuntimeBridge/ExecutorchRuntimeBridge/Exported/Data/ExecutorchRuntimeTensorValue.h new file mode 100644 index 00000000000..c0c7dfbc49f --- /dev/null +++ b/extension/apple/ExecutorchRuntimeBridge/ExecutorchRuntimeBridge/Exported/Data/ExecutorchRuntimeTensorValue.h @@ -0,0 +1,27 @@ +// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. + +#ifdef __cplusplus + #import + #import +#endif +#import + +NS_ASSUME_NONNULL_BEGIN + +@interface ExecutorchRuntimeTensorValue : NSObject + +- (instancetype)init NS_UNAVAILABLE; ++ (instancetype)new NS_UNAVAILABLE; + +- (instancetype)initWithFloatArray:(NSArray *)floatArray shape:(NSArray *)sizes NS_SWIFT_NAME(init(floatArray:shape:)); + +#ifdef __cplusplus +- (nullable instancetype)initWithTensor:(torch::executor::Tensor)tensor error:(NSError * _Nullable * _Nullable)error; +- (instancetype)initWithData:(std::vector)floatData + shape:(std::vector)shape NS_DESIGNATED_INITIALIZER; +- (torch::executor::Tensor)backedValue; +#endif + +@end + +NS_ASSUME_NONNULL_END diff --git a/extension/apple/ExecutorchRuntimeBridge/ExecutorchRuntimeBridge/Exported/Data/ExecutorchRuntimeTensorValue.mm b/extension/apple/ExecutorchRuntimeBridge/ExecutorchRuntimeBridge/Exported/Data/ExecutorchRuntimeTensorValue.mm new file mode 100644 index 00000000000..933bbe99e57 --- /dev/null +++ b/extension/apple/ExecutorchRuntimeBridge/ExecutorchRuntimeBridge/Exported/Data/ExecutorchRuntimeTensorValue.mm @@ -0,0 +1,100 @@ +// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. + +#import "ExecutorchRuntimeTensorValue.h" + +#import + +#import + +using torch::executor::TensorImpl; +using torch::executor::ScalarType; + +@implementation ExecutorchRuntimeTensorValue +{ + std::unique_ptr _tensor; + // TensorImpl DOES NOT take ownership. + // This float vector is what keeps the data in memory. 
+ std::vector _floatData; + std::vector _shape; +} + +- (instancetype)initWithData:(std::vector)floatData + shape:(std::vector)shape +{ + if (self = [super init]) { + _floatData.assign(floatData.begin(), floatData.end()); + _shape.assign(shape.begin(), shape.end()); + _tensor = std::make_unique(ScalarType::Float, std::size(_shape), _shape.data(), _floatData.data()); + } + return self; +} + +- (instancetype)initWithFloatArray:(NSArray *)floatArray shape:(NSArray *)shape +{ + std::vector floatVector; + std::vector shapeVector; + + floatVector.reserve(floatArray.count); + for (int i = 0; i < floatArray.count; i++) { + floatVector.push_back([floatArray[i] floatValue]); + } + shapeVector.reserve(shape.count); + for (int i = 0; i < shape.count; i++) { + shapeVector.push_back([shape[i] intValue]); + } + + return [self initWithData:floatVector shape:shapeVector]; +} + +- (nullable instancetype)initWithTensor:(torch::executor::Tensor)tensor error:(NSError * _Nullable * _Nullable)error +{ + if (tensor.scalar_type() != ScalarType::Float) { + if (error) { + *error = [ModelRuntimeValueErrorFactory invalidType:[NSString stringWithFormat:@"torch::executor::ScalarType::%hhd", tensor.scalar_type()] expectedType:@"torch::executor::ScalarType::Float"]; + } + return nil; + } + + std::vector floatVector; + std::vector shapeVector; + shapeVector.assign(tensor.sizes().begin(), tensor.sizes().end()); + floatVector.assign(tensor.const_data_ptr(), tensor.const_data_ptr() + tensor.numel()); + return [self initWithData:floatVector shape:shapeVector]; +} + +- (nullable ModelRuntimeTensorValueBridgingTuple *)floatRepresentationAndReturnError:(NSError * _Nullable * _Nullable)error +{ + if (_tensor->scalar_type() == torch::executor::ScalarType::Float) { + const auto *tensorPtr = _tensor->data(); + const auto sizes = _tensor->sizes(); + std::vector tensorVec(tensorPtr, tensorPtr + _tensor->numel()); + std::vector tensorSizes(sizes.begin(), sizes.end()); + + NSMutableArray *floatArray = [[NSMutableArray alloc] initWithCapacity:tensorVec.size()]; + for (float &i : tensorVec) { + [floatArray addObject:@(i)]; + } + + NSMutableArray *sizesArray = [[NSMutableArray alloc] initWithCapacity:tensorSizes.size()]; + for (int &tensorSize : tensorSizes) { + [sizesArray addObject:@(tensorSize)]; + } + + return [[ModelRuntimeTensorValueBridgingTuple alloc] initWithFloatArray:floatArray shape:sizesArray]; + } + + if (error) { + *error = [ModelRuntimeValueErrorFactory + invalidType:[NSString stringWithFormat:@"torch::executor::ScalarType::%hhd", _tensor->scalar_type()] + expectedType:@"torch::executor::ScalarType::Float"]; + } + + return nil; +} + +- (torch::executor::Tensor)backedValue +{ + return torch::executor::Tensor(_tensor.get()); +} + +@end diff --git a/extension/apple/ExecutorchRuntimeBridge/ExecutorchRuntimeBridge/Exported/Data/ExecutorchRuntimeValue.h b/extension/apple/ExecutorchRuntimeBridge/ExecutorchRuntimeBridge/Exported/Data/ExecutorchRuntimeValue.h new file mode 100644 index 00000000000..591511b2b11 --- /dev/null +++ b/extension/apple/ExecutorchRuntimeBridge/ExecutorchRuntimeBridge/Exported/Data/ExecutorchRuntimeValue.h @@ -0,0 +1,28 @@ +// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. 
+ +#ifdef __cplusplus + #import + #import +#endif + +#import + +#import "ExecutorchRuntimeTensorValue.h" + +NS_ASSUME_NONNULL_BEGIN + +@interface ExecutorchRuntimeValue : NSObject + +- (instancetype)init NS_UNAVAILABLE; ++ (instancetype)new NS_UNAVAILABLE; + +- (instancetype)initWithTensor:(ExecutorchRuntimeTensorValue *)tensorValue; + +#ifdef __cplusplus +- (instancetype)initWithEValue:(torch::executor::EValue)value NS_DESIGNATED_INITIALIZER; +- (torch::executor::EValue)getBackedValue; +#endif + +@end + +NS_ASSUME_NONNULL_END diff --git a/extension/apple/ExecutorchRuntimeBridge/ExecutorchRuntimeBridge/Exported/Data/ExecutorchRuntimeValue.mm b/extension/apple/ExecutorchRuntimeBridge/ExecutorchRuntimeBridge/Exported/Data/ExecutorchRuntimeValue.mm new file mode 100644 index 00000000000..f8fb8c4a419 --- /dev/null +++ b/extension/apple/ExecutorchRuntimeBridge/ExecutorchRuntimeBridge/Exported/Data/ExecutorchRuntimeValue.mm @@ -0,0 +1,73 @@ +// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. + +#import "ExecutorchRuntimeValue.h" + +#import +#import + +#import "ExecutorchRuntimeTensorValue.h" + +using torch::executor::EValue; + +@implementation ExecutorchRuntimeValue +{ + EValue _value; + // IMPORTANT + // Tensor value keeps a reference to the original tensor value. However, the value that is wrapped by LiteInterpreterRuntimeTensorValue DOES NOT TAKE OWNERSHIP OF THE RAW DATA! + // This means once the wrapper is deallocated, the tensor value will be deallocated as well. + // This reference here is to keep the tensor value alive until the runtime is deallocated. + ExecutorchRuntimeTensorValue *_tensorValue; +} + +- (instancetype)initWithEValue:(EValue)value +{ + if (self = [super init]) { + _value = value; + } + return self; +} + +- (instancetype)initWithTensor:(ExecutorchRuntimeTensorValue *)tensorValue +{ + if (self = [self initWithEValue:EValue([tensorValue backedValue])]) { + _tensorValue = tensorValue; + } + return self; +} + +- (nullable NSString *)stringValueAndReturnError:(NSError * _Nullable * _Nullable)error +{ + if (error) { + *error = [ModelRuntimeValueErrorFactory unsupportedType:@"ExecutorchRuntimeValue doesn't support strings"]; + } + return nil; +} + +- (nullable id)tensorValueAndReturnError:(NSError * _Nullable * _Nullable)error +{ + if (_value.isTensor()) { + return [[ExecutorchRuntimeTensorValue alloc] initWithTensor:_value.toTensor() error:error]; + } + + if (error) { + *error = [ModelRuntimeValueErrorFactory + invalidType:[NSString stringWithFormat:@"Tag::%d", _value.tag] + expectedType:@"Tag::Tensor"]; + } + return nil; +} + +- (EValue)getBackedValue +{ + return _value; +} + +- (NSArray> *)arrayValueAndReturnError:(NSError * _Nullable * _Nullable)error +{ + if (error) { + *error = [ModelRuntimeValueErrorFactory unsupportedType:@"EValue doesn't support arrays"]; + } + return nil; +} + +@end diff --git a/extension/apple/ExecutorchRuntimeBridge/ExecutorchRuntimeBridge/Exported/ExecutorchRuntimeEngine.h b/extension/apple/ExecutorchRuntimeBridge/ExecutorchRuntimeBridge/Exported/ExecutorchRuntimeEngine.h new file mode 100644 index 00000000000..be965c87a6f --- /dev/null +++ b/extension/apple/ExecutorchRuntimeBridge/ExecutorchRuntimeBridge/Exported/ExecutorchRuntimeEngine.h @@ -0,0 +1,23 @@ +// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. 
+ +#import + +#import "ExecutorchRuntimeValue.h" + +NS_ASSUME_NONNULL_BEGIN + +@interface ExecutorchRuntimeEngine : NSObject + +- (nonnull instancetype)init NS_UNAVAILABLE; ++ (nonnull instancetype)new NS_UNAVAILABLE; + +- (nullable instancetype)initWithModelPath:(NSString *)modelPath + modelMethodName:(NSString *)modelMethodName + error:(NSError * _Nullable * _Nullable)error NS_DESIGNATED_INITIALIZER; + +- (nullable NSArray *)infer:(NSArray *)input + error:(NSError * _Nullable * _Nullable)error NS_SWIFT_NAME(infer(input:)); + +@end + +NS_ASSUME_NONNULL_END diff --git a/extension/apple/ExecutorchRuntimeBridge/ExecutorchRuntimeBridge/Exported/ExecutorchRuntimeEngine.mm b/extension/apple/ExecutorchRuntimeBridge/ExecutorchRuntimeBridge/Exported/ExecutorchRuntimeEngine.mm new file mode 100644 index 00000000000..45a527bd1c0 --- /dev/null +++ b/extension/apple/ExecutorchRuntimeBridge/ExecutorchRuntimeBridge/Exported/ExecutorchRuntimeEngine.mm @@ -0,0 +1,107 @@ +// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. + +#import "ExecutorchRuntimeEngine.h" + +#import +#import + +#import + +static int kInitFailed = 0; +static int kInferenceFailed = 1; + +static auto NSStringToString(NSString *string) -> std::string +{ + const char *cStr = [string cStringUsingEncoding:NSUTF8StringEncoding]; + if (cStr) { + return cStr; + } + + NSData *data = [string dataUsingEncoding:NSUTF8StringEncoding allowLossyConversion:NO]; + return {reinterpret_cast([data bytes]), [data length]}; +} + +static auto StringToNSString(const std::string &string) -> NSString * +{ + CFStringRef cfString = CFStringCreateWithBytes( + kCFAllocatorDefault, + reinterpret_cast(string.c_str()), + string.size(), + kCFStringEncodingUTF8, + false + ); + return (__bridge_transfer NSString *)cfString; +} + +@implementation ExecutorchRuntimeEngine +{ + NSString *_modelPath; + NSString *_modelMethodName; + std::unique_ptr _module; +} + +- (instancetype)initWithModelPath:(NSString *)modelPath + modelMethodName:(NSString *)modelMethodName + error:(NSError * _Nullable * _Nullable)error +{ + if (self = [super init]) { + _modelPath = modelPath; + _modelMethodName = modelMethodName; + try { + _module = std::make_unique(NSStringToString(modelPath)); + const auto e = _module->load_method(NSStringToString(modelMethodName)); + if (e != executorch::runtime::Error::Ok) { + if (error) { + *error = [NSError errorWithDomain:@"ExecutorchRuntimeEngine" + code:kInitFailed + userInfo:@{NSDebugDescriptionErrorKey : StringToNSString(std::to_string(static_cast(e)))}]; + } + return nil; + } + } catch (...) 
+      if (error) {
+        *error = [NSError errorWithDomain:@"ExecutorchRuntimeEngine"
+                                     code:kInitFailed
+                                 userInfo:@{NSDebugDescriptionErrorKey : @"Unknown error"}];
+      }
+      return nil;
+    }
+  }
+  return self;
+}
+
+- (nullable NSArray<ExecutorchRuntimeValue *> *)infer:(NSArray<ExecutorchRuntimeValue *> *)input
+                                                error:(NSError * _Nullable * _Nullable)error
+{
+  try {
+    std::vector<torch::executor::EValue> inputEValues;
+    inputEValues.reserve(input.count);
+    for (ExecutorchRuntimeValue *inputValue in input) {
+      inputEValues.push_back([inputValue getBackedValue]);
+    }
+    const auto result = _module->execute(NSStringToString(_modelMethodName), inputEValues);
+    if (!result.ok()) {
+      const auto executorchError = static_cast<int32_t>(result.error());
+      if (error) {
+        *error = [NSError errorWithDomain:@"ExecutorchRuntimeEngine"
+                                     code:kInferenceFailed
+                                 userInfo:@{NSDebugDescriptionErrorKey : StringToNSString(std::to_string(executorchError))}];
+      }
+      return nil;
+    }
+    NSMutableArray<ExecutorchRuntimeValue *> *const resultValues = [NSMutableArray new];
+    for (const auto &evalue : result.get()) {
+      [resultValues addObject:[[ExecutorchRuntimeValue alloc] initWithEValue:evalue]];
+    }
+    return resultValues;
+  } catch (...) {
+    if (error) {
+      *error = [NSError errorWithDomain:@"ExecutorchRuntimeEngine"
+                                   code:kInferenceFailed
+                               userInfo:@{NSDebugDescriptionErrorKey : @"Unknown error"}];
+    }
+    return nil;
+  }
+}
+
+@end
diff --git a/extension/apple/ExecutorchRuntimeBridge/ExecutorchRuntimeBridge/__tests__/ExecutorchRuntimeEngineTests.mm b/extension/apple/ExecutorchRuntimeBridge/ExecutorchRuntimeBridge/__tests__/ExecutorchRuntimeEngineTests.mm
new file mode 100644
index 00000000000..de59902dfca
--- /dev/null
+++ b/extension/apple/ExecutorchRuntimeBridge/ExecutorchRuntimeBridge/__tests__/ExecutorchRuntimeEngineTests.mm
@@ -0,0 +1,61 @@
+// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
+
+#import <XCTest/XCTest.h>
+
+#import <ExecutorchRuntimeBridge/ExecutorchRuntimeEngine.h>
+
+NS_ASSUME_NONNULL_BEGIN
+
+@interface ExecutorchRuntimeEngineTests : XCTestCase
+@end
+
+@implementation ExecutorchRuntimeEngineTests
+
+- (void)testInvalidModel
+{
+  NSString *const modelPath = @"invalid_model_path";
+
+  NSError *runtimeInitError = nil;
+  ExecutorchRuntimeEngine *const engine = [[ExecutorchRuntimeEngine alloc] initWithModelPath:modelPath modelMethodName:@"forward" error:&runtimeInitError];
+  XCTAssertNil(engine);
+  XCTAssertNotNil(runtimeInitError);
+
+  XCTAssertEqual(runtimeInitError.code, 0);
+  // 34 is the ExecuTorch error code for AccessFailed.
+  XCTAssertEqualObjects(runtimeInitError.userInfo[NSDebugDescriptionErrorKey], @"34");
+}
+
+- (void)testValidModel
+{
+  NSBundle *const bundle = [NSBundle bundleForClass:[self class]];
+  // This is a simple model that adds two tensors.
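+  // A fixture like add.pte is typically produced with the ExecuTorch export
+  // flow (roughly: torch.export() a module whose forward returns x + y, then
+  // lower it with executorch.exir.to_edge(...).to_executorch() and write the
+  // buffer to "add.pte"). The exact export script is assumed here and is not
+  // part of this diff.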
+  NSString *const modelPath = [bundle pathForResource:@"add" ofType:@"pte"];
+  NSError *runtimeInitError = nil;
+  ExecutorchRuntimeEngine *const engine = [[ExecutorchRuntimeEngine alloc] initWithModelPath:modelPath modelMethodName:@"forward" error:&runtimeInitError];
+  XCTAssertNotNil(engine);
+  XCTAssertNil(runtimeInitError);
+
+  ExecutorchRuntimeTensorValue *inputTensor = [[ExecutorchRuntimeTensorValue alloc] initWithFloatArray:@[@2.0] shape:@[@1]];
+  ExecutorchRuntimeValue *inputValue = [[ExecutorchRuntimeValue alloc] initWithTensor:inputTensor];
+
+  NSError *inferenceError = nil;
+  const auto output = [engine infer:@[inputValue, inputValue] error:&inferenceError];
+  XCTAssertNil(inferenceError);
+
+  XCTAssertEqual(output.count, 1);
+  NSError *tensorValueError = nil;
+  NSError *floatRepresentationError = nil;
+  const auto resultTensorValue = [[output.firstObject tensorValueAndReturnError:&tensorValueError]
+                                  floatRepresentationAndReturnError:&floatRepresentationError];
+
+  XCTAssertNil(tensorValueError);
+  XCTAssertNil(floatRepresentationError);
+  XCTAssertEqual(resultTensorValue.floatArray.count, 1);
+  XCTAssertEqual(resultTensorValue.shape.count, 1);
+  XCTAssertEqual(resultTensorValue.floatArray.firstObject.floatValue, 4.0);
+  XCTAssertEqual(resultTensorValue.shape.firstObject.integerValue, 1);
+}
+
+@end
+
+NS_ASSUME_NONNULL_END
diff --git a/extension/apple/ExecutorchRuntimeBridge/ExecutorchRuntimeBridge/__tests__/ExecutorchRuntimeValueTests.mm b/extension/apple/ExecutorchRuntimeBridge/ExecutorchRuntimeBridge/__tests__/ExecutorchRuntimeValueTests.mm
new file mode 100644
index 00000000000..742cfb8d40d
--- /dev/null
+++ b/extension/apple/ExecutorchRuntimeBridge/ExecutorchRuntimeBridge/__tests__/ExecutorchRuntimeValueTests.mm
@@ -0,0 +1,67 @@
+// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
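+//
+// Tests for the EValue-backed value wrappers: string and array accessors are
+// expected to fail with descriptive errors, and float tensor data should
+// round-trip through ExecutorchRuntimeTensorValue unchanged.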
+
+#import <XCTest/XCTest.h>
+
+#import <vector>
+#import <executorch/runtime/core/evalue.h>
+#import <ExecutorchRuntimeBridge/ExecutorchRuntimeValue.h>
+
+using torch::executor::EValue;
+using torch::executor::TensorImpl;
+using torch::executor::ScalarType;
+
+@interface ExecutorchRuntimeValueTests : XCTestCase
+@end
+
+@implementation ExecutorchRuntimeValueTests
+
+- (void)testStringValueWithError
+{
+  ExecutorchRuntimeValue *value = [[ExecutorchRuntimeValue alloc] initWithEValue:EValue((int64_t)1)];
+  XCTAssertNil([value stringValueAndReturnError:nil]);
+  NSError *error = nil;
+  XCTAssertNil([value stringValueAndReturnError:&error]);
+  XCTAssertNotNil(error);
+  XCTAssertEqualObjects([error description], @"Unsupported type: ExecutorchRuntimeValue doesn't support strings");
+}
+
+- (void)testTensorValue
+{
+  NSMutableArray<NSNumber *> *data = [NSMutableArray new];
+  for (int i = 0; i < 10; i++) {
+    [data addObject:@(i + 0.5f)];
+  }
+
+  NSArray<NSNumber *> *shape = @[@(10)];
+
+  ExecutorchRuntimeTensorValue *tensorValue = [[ExecutorchRuntimeTensorValue alloc] initWithFloatArray:data shape:shape];
+
+  const auto tuple = [tensorValue floatRepresentationAndReturnError:nil];
+  XCTAssertEqualObjects(tuple.floatArray, data);
+  XCTAssertEqualObjects(tuple.shape, shape);
+}
+
+- (void)testTensorValueWithFloatArrayWithError
+{
+  std::vector<int32_t> data = {1, 2, 3};
+  std::vector<int32_t> shape = {3};
+  TensorImpl tensorImpl(ScalarType::Int, std::size(shape), shape.data(), data.data());
+  torch::executor::Tensor tensor(&tensorImpl);
+
+  XCTAssertNil([[ExecutorchRuntimeTensorValue alloc] initWithTensor:tensor error:nil]);
+  NSError *error = nil;
+  XCTAssertNil([[ExecutorchRuntimeTensorValue alloc] initWithTensor:tensor error:&error]);
+  XCTAssertNotNil(error);
+  XCTAssertEqualObjects([error description], @"Invalid type: torch::executor::ScalarType::3, expected torch::executor::ScalarType::Float");
+}
+
+- (void)testTensorValueWithError
+{
+  ExecutorchRuntimeValue *value = [[ExecutorchRuntimeValue alloc] initWithEValue:EValue((int64_t)1)];
+  XCTAssertNil([value tensorValueAndReturnError:nil]);
+  NSError *error = nil;
+  XCTAssertNil([value tensorValueAndReturnError:&error]);
+  XCTAssertNotNil(error);
+  XCTAssertEqualObjects([error description], @"Invalid type: Tag::4, expected Tag::Tensor");
+}
+
+@end
diff --git a/extension/apple/ExecutorchRuntimeValueSupport/ExecutorchRuntimeValueSupport/ExecutorchRuntimeValueSupport.swift b/extension/apple/ExecutorchRuntimeValueSupport/ExecutorchRuntimeValueSupport/ExecutorchRuntimeValueSupport.swift
new file mode 100644
index 00000000000..3fa2f590d85
--- /dev/null
+++ b/extension/apple/ExecutorchRuntimeValueSupport/ExecutorchRuntimeValueSupport/ExecutorchRuntimeValueSupport.swift
@@ -0,0 +1,39 @@
+// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
+
+@_implementationOnly import ExecutorchRuntimeBridge
+import Foundation
+import ModelRunnerDataKit
+
+public struct ExecutorchRuntimeValueSupport {
+
+  public init() {}
+}
+
+extension ExecutorchRuntimeValueSupport: ModelRuntimeValueFactory {
+
+  public func createString(value: String) throws -> ModelRuntimeValue {
+    throw ModelRuntimeValueError.unsupportedType(String(describing: String.self))
+  }
+
+  public func createTensor(value: ModelRuntimeTensorValue) throws -> ModelRuntimeValue {
+    guard let tensorValue = value.innerValue as? ExecutorchRuntimeTensorValue else {
+      throw ModelRuntimeValueError.invalidType(
+        String(describing: ExecutorchRuntimeTensorValue.self),
+        String(describing: type(of: value.innerValue))
+      )
+    }
+    return ModelRuntimeValue(innerValue: ExecutorchRuntimeValue(tensor: tensorValue))
+  }
+}
+
+extension ExecutorchRuntimeValueSupport: ModelRuntimeTensorValueFactory {
+
+  public func createFloatTensor(value: [Float], shape: [Int]) -> ModelRuntimeTensorValue {
+    ModelRuntimeTensorValue(
+      innerValue: ExecutorchRuntimeTensorValue(
+        floatArray: value.compactMap { NSNumber(value: $0) },
+        shape: shape.compactMap { NSNumber(value: $0) }
+      )
+    )
+  }
+}
diff --git a/extension/apple/ExecutorchRuntimeValueSupport/ExecutorchRuntimeValueSupport/__tests__/ExecutorchRuntimeValueSupportTests.swift b/extension/apple/ExecutorchRuntimeValueSupport/ExecutorchRuntimeValueSupport/__tests__/ExecutorchRuntimeValueSupportTests.swift
new file mode 100644
index 00000000000..474dc798a42
--- /dev/null
+++ b/extension/apple/ExecutorchRuntimeValueSupport/ExecutorchRuntimeValueSupport/__tests__/ExecutorchRuntimeValueSupportTests.swift
@@ -0,0 +1,42 @@
+// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
+
+@testable import ExecutorchRuntimeValueSupport
+import XCTest
+
+public extension String {
+
+  /// Returns a random string.
+  /// Useful in tests to ensure that production code can't accidentally pass
+  /// by hard-coding the same value the test happens to use.
+  static func random() -> String {
+    UUID().uuidString
+  }
+}
+
+public extension Float {
+  static func randomPositive() -> Float {
+    .random(in: 1...Float.greatestFiniteMagnitude)
+  }
+}
+
+class ExecutorchRuntimeValueSupportTests: XCTestCase {
+
+  func testTensorValue() throws {
+    let factory = ExecutorchRuntimeValueSupport()
+    let size = 100
+    let data = (1...size).map { _ in Float.randomPositive() }
+    let shape = [size]
+
+    let sut = try XCTUnwrap(try? factory.createTensor(value: factory.createFloatTensor(value: data, shape: shape)))
+
+    XCTAssertEqual(try? sut.tensorValue().floatRepresentation().floatArray, data)
+    XCTAssertEqual(try? sut.tensorValue().floatRepresentation().shape, shape)
+  }
+
+  func testCreateStringsThrows() {
+    let factory = ExecutorchRuntimeValueSupport()
+    let value: String = .random()
+
+    XCTAssertThrowsError(try factory.createString(value: value))
+  }
+}
diff --git a/extension/apple/ModelRunnerDataKit/ModelRunnerDataKit/ModelRuntime.swift b/extension/apple/ModelRunnerDataKit/ModelRunnerDataKit/ModelRuntime.swift
new file mode 100644
index 00000000000..0c0da69996c
--- /dev/null
+++ b/extension/apple/ModelRunnerDataKit/ModelRunnerDataKit/ModelRuntime.swift
@@ -0,0 +1,14 @@
+// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
+
+import Foundation
+
+public enum ModelRuntimeError: Error {
+  case unsupportedInputType
+}
+
+public protocol ModelRuntime {
+  func infer(input: [ModelRuntimeValue]) throws -> [ModelRuntimeValue]
+
+  func getModelValueFactory() -> ModelRuntimeValueFactory
+  func getModelTensorFactory() -> ModelRuntimeTensorValueFactory
+}
diff --git a/extension/apple/ModelRunnerDataKit/ModelRunnerDataKit/ModelRuntimeValueError.swift b/extension/apple/ModelRunnerDataKit/ModelRunnerDataKit/ModelRuntimeValueError.swift
new file mode 100644
index 00000000000..c5af8e02d62
--- /dev/null
+++ b/extension/apple/ModelRunnerDataKit/ModelRunnerDataKit/ModelRuntimeValueError.swift
@@ -0,0 +1,27 @@
+// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
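+//
+// Swift error type surfaced by the bridging layer. The @objc factory below
+// re-exposes these cases to Objective-C, which is how the runtime .mm files
+// above construct them.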
+
+import Foundation
+
+public enum ModelRuntimeValueError: Error, CustomStringConvertible {
+  case unsupportedType(String)
+  case invalidType(String, String)
+
+  public var description: String {
+    switch self {
+    case .unsupportedType(let type):
+      return "Unsupported type: \(type)"
+    case .invalidType(let expectedType, let type):
+      return "Invalid type: \(type), expected \(expectedType)"
+    }
+  }
+}
+
+@objc public class ModelRuntimeValueErrorFactory: NSObject {
+  @objc public class func unsupportedType(_ type: String) -> Error {
+    return ModelRuntimeValueError.unsupportedType(type)
+  }
+
+  @objc public class func invalidType(_ actualType: String, expectedType: String) -> Error {
+    return ModelRuntimeValueError.invalidType(expectedType, actualType)
+  }
+}
diff --git a/extension/apple/ModelRunnerDataKit/ModelRunnerDataKit/Tensor/ModelRuntimeTensorValue.swift b/extension/apple/ModelRunnerDataKit/ModelRunnerDataKit/Tensor/ModelRuntimeTensorValue.swift
new file mode 100644
index 00000000000..46c8066f2bd
--- /dev/null
+++ b/extension/apple/ModelRunnerDataKit/ModelRunnerDataKit/Tensor/ModelRuntimeTensorValue.swift
@@ -0,0 +1,17 @@
+// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
+
+import Foundation
+
+public class ModelRuntimeTensorValue {
+  public let innerValue: ModelRuntimeTensorValueBridging
+  public init(innerValue: ModelRuntimeTensorValueBridging) {
+    self.innerValue = innerValue
+  }
+
+  public func floatRepresentation() throws -> (floatArray: [Float], shape: [Int]) {
+    let value = try innerValue.floatRepresentation()
+    let data = value.floatArray
+    let shape = value.shape
+    return (data.compactMap { $0.floatValue }, shape.compactMap { $0.intValue })
+  }
+}
diff --git a/extension/apple/ModelRunnerDataKit/ModelRunnerDataKit/Tensor/ModelRuntimeTensorValueBridging.swift b/extension/apple/ModelRunnerDataKit/ModelRunnerDataKit/Tensor/ModelRuntimeTensorValueBridging.swift
new file mode 100644
index 00000000000..6328565fc02
--- /dev/null
+++ b/extension/apple/ModelRunnerDataKit/ModelRunnerDataKit/Tensor/ModelRuntimeTensorValueBridging.swift
@@ -0,0 +1,16 @@
+// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
+
+import Foundation
+
+public class ModelRuntimeTensorValueBridgingTuple: NSObject {
+  @objc public let floatArray: [NSNumber]
+  @objc public let shape: [NSNumber]
+  @objc public init(floatArray: [NSNumber], shape: [NSNumber]) {
+    self.floatArray = floatArray
+    self.shape = shape
+  }
+}
+
+@objc public protocol ModelRuntimeTensorValueBridging {
+  func floatRepresentation() throws -> ModelRuntimeTensorValueBridgingTuple
+}
diff --git a/extension/apple/ModelRunnerDataKit/ModelRunnerDataKit/Tensor/ModelRuntimeTensorValueFactory.swift b/extension/apple/ModelRunnerDataKit/ModelRunnerDataKit/Tensor/ModelRuntimeTensorValueFactory.swift
new file mode 100644
index 00000000000..5565a807e8b
--- /dev/null
+++ b/extension/apple/ModelRunnerDataKit/ModelRunnerDataKit/Tensor/ModelRuntimeTensorValueFactory.swift
@@ -0,0 +1,7 @@
+// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
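+//
+// Factory protocol so callers can construct tensors without naming the
+// concrete runtime type; in this diff, ExecutorchRuntimeValueSupport is the
+// implementation.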
+
+import Foundation
+
+public protocol ModelRuntimeTensorValueFactory {
+  func createFloatTensor(value: [Float], shape: [Int]) -> ModelRuntimeTensorValue
+}
diff --git a/extension/apple/ModelRunnerDataKit/ModelRunnerDataKit/Value/ModelRuntimeValue.swift b/extension/apple/ModelRunnerDataKit/ModelRunnerDataKit/Value/ModelRuntimeValue.swift
new file mode 100644
index 00000000000..c27c17ec2f2
--- /dev/null
+++ b/extension/apple/ModelRunnerDataKit/ModelRunnerDataKit/Value/ModelRuntimeValue.swift
@@ -0,0 +1,22 @@
+// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
+
+import Foundation
+
+public class ModelRuntimeValue {
+  public let value: ModelRuntimeValueBridging
+  public init(innerValue: ModelRuntimeValueBridging) {
+    self.value = innerValue
+  }
+
+  public func stringValue() throws -> String {
+    return try value.stringValue()
+  }
+
+  public func tensorValue() throws -> ModelRuntimeTensorValue {
+    return try ModelRuntimeTensorValue(innerValue: value.tensorValue())
+  }
+
+  public func arrayValue() throws -> [ModelRuntimeValue] {
+    return try value.arrayValue().map { ModelRuntimeValue(innerValue: $0) }
+  }
+}
diff --git a/extension/apple/ModelRunnerDataKit/ModelRunnerDataKit/Value/ModelRuntimeValueBridging.swift b/extension/apple/ModelRunnerDataKit/ModelRunnerDataKit/Value/ModelRuntimeValueBridging.swift
new file mode 100644
index 00000000000..3eb4c532f4e
--- /dev/null
+++ b/extension/apple/ModelRunnerDataKit/ModelRunnerDataKit/Value/ModelRuntimeValueBridging.swift
@@ -0,0 +1,9 @@
+// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
+
+import Foundation
+
+@objc public protocol ModelRuntimeValueBridging {
+  func stringValue() throws -> String
+  func tensorValue() throws -> ModelRuntimeTensorValueBridging
+  func arrayValue() throws -> [ModelRuntimeValueBridging]
+}
diff --git a/extension/apple/ModelRunnerDataKit/ModelRunnerDataKit/Value/ModelRuntimeValueFactory.swift b/extension/apple/ModelRunnerDataKit/ModelRunnerDataKit/Value/ModelRuntimeValueFactory.swift
new file mode 100644
index 00000000000..40e5ea74267
--- /dev/null
+++ b/extension/apple/ModelRunnerDataKit/ModelRunnerDataKit/Value/ModelRuntimeValueFactory.swift
@@ -0,0 +1,8 @@
+// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
+
+import Foundation
+
+public protocol ModelRuntimeValueFactory {
+  func createString(value: String) throws -> ModelRuntimeValue
+  func createTensor(value: ModelRuntimeTensorValue) throws -> ModelRuntimeValue
+}
diff --git a/extension/flat_tensor/serialize/CMakeLists.txt b/extension/flat_tensor/serialize/CMakeLists.txt
index f1278c804db..d1ae797f8b3 100644
--- a/extension/flat_tensor/serialize/CMakeLists.txt
+++ b/extension/flat_tensor/serialize/CMakeLists.txt
@@ -9,10 +9,6 @@
 # cmake-format -i CMakeLists.txt
 # ~~~
 
-if(NOT FLATC_EXECUTABLE)
-  set(FLATC_EXECUTABLE flatc)
-endif()
-
 # The include directory that will contain the generated schema headers.
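+# NOTE: FLATC_EXECUTABLE and FLATBUFFERS_MAX_ALIGNMENT are now expected to be
+# defined by the enclosing build before this file is processed; the local
+# flatc fallback above was removed in favor of depending on the flatc target
+# directly.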
 set(_flat_tensor_schema__include_dir "${CMAKE_BINARY_DIR}/extension/flat_tensor/include")
 set(_flat_tensor_schema__output_dir "${_flat_tensor_schema__include_dir}/executorch/extension/flat_tensor/serialize")
@@ -37,7 +33,7 @@ function(generate_flat_tensor_schema _schema_srcs _schema_name)
     ${FLATC_EXECUTABLE} --cpp --cpp-std c++11 --gen-mutable --scoped-enums -o
     "${_flat_tensor_schema__output_dir}" ${_schema_srcs}
     WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
-    DEPENDS ${FLATC_EXECUTABLE} ${_schema_srcs}
+    DEPENDS flatc ${_schema_srcs}
     COMMENT "Generating ${_schema_name} headers"
     VERBATIM
   )
@@ -49,7 +45,7 @@ function(generate_flat_tensor_schema _schema_srcs _schema_name)
 # and some users need an alignment larger than the default, which is typically
 # 32.
   target_compile_definitions(
-    ${_schema_name} INTERFACE FLATBUFFERS_MAX_ALIGNMENT=1024
+    ${_schema_name} INTERFACE FLATBUFFERS_MAX_ALIGNMENT=${FLATBUFFERS_MAX_ALIGNMENT}
   )
 
   target_include_directories(
diff --git a/extension/llm/custom_ops/CMakeLists.txt b/extension/llm/custom_ops/CMakeLists.txt
index eeb118d4344..6dec5d136ea 100644
--- a/extension/llm/custom_ops/CMakeLists.txt
+++ b/extension/llm/custom_ops/CMakeLists.txt
@@ -23,7 +23,7 @@ endif()
 set(_common_compile_options -Wno-deprecated-declarations -fPIC)
 
 include(${EXECUTORCH_ROOT}/build/Utils.cmake)
-include(${EXECUTORCH_ROOT}/build/Codegen.cmake)
+include(${EXECUTORCH_ROOT}/scripts/build/Codegen.cmake)
 
 #
 # The `__srcs` lists are defined by including ${EXECUTORCH_SRCS_FILE}.
diff --git a/extension/llm/runner/CMakeLists.txt b/extension/llm/runner/CMakeLists.txt
index a9245768b9d..7adb980d224 100644
--- a/extension/llm/runner/CMakeLists.txt
+++ b/extension/llm/runner/CMakeLists.txt
@@ -21,7 +21,7 @@ if(NOT EXECUTORCH_ROOT)
 endif()
 
 include(${EXECUTORCH_ROOT}/build/Utils.cmake)
-include(${EXECUTORCH_ROOT}/build/Codegen.cmake)
+include(${EXECUTORCH_ROOT}/scripts/build/Codegen.cmake)
 
 #
 # The `__srcs` lists are defined by including ${EXECUTORCH_SRCS_FILE}.
diff --git a/extension/parallel/TARGETS b/extension/parallel/TARGETS
new file mode 100644
index 00000000000..2341af9282f
--- /dev/null
+++ b/extension/parallel/TARGETS
@@ -0,0 +1,8 @@
+# Any targets that should be shared between fbcode and xplat must be defined in
+# targets.bzl. This file can contain fbcode-only targets.
+
+load(":targets.bzl", "define_common_targets")
+
+oncall("executorch")
+
+define_common_targets()
diff --git a/extension/parallel/targets.bzl b/extension/parallel/targets.bzl
new file mode 100644
index 00000000000..dbfb3ff160c
--- /dev/null
+++ b/extension/parallel/targets.bzl
@@ -0,0 +1,22 @@
+load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "get_aten_mode_options", "runtime")
+
+def define_common_targets():
+    """Defines targets that should be shared between fbcode and xplat.
+
+    The directory containing this targets.bzl file should also contain both
+    TARGETS and BUCK files that call this function.
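+
+    The :thread_parallel target defined below re-exports thread_parallel.h
+    and forwards to the runtime's thread_parallel_interface library.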
+ """ + + runtime.cxx_library( + name = "thread_parallel", + exported_headers = [ + "thread_parallel.h", + ], + visibility = [ + "//executorch/...", + "@EXECUTORCH_CLIENTS", + ], + deps = [ + "//executorch/runtime/kernel:thread_parallel_interface", + ], + ) diff --git a/install_requirements.py b/install_requirements.py index 06dfbd9e9a6..9353dad180e 100644 --- a/install_requirements.py +++ b/install_requirements.py @@ -67,7 +67,7 @@ def python_is_compatible(): # NOTE: If a newly-fetched version of the executorch repo changes the value of # NIGHTLY_VERSION, you should re-run this script to install the necessary # package versions. -NIGHTLY_VERSION = "dev20250301" +NIGHTLY_VERSION = "dev20250311" def install_requirements(use_pytorch_nightly): diff --git a/kernels/optimized/CMakeLists.txt b/kernels/optimized/CMakeLists.txt index 23e26bfa72b..6ed55c73e28 100644 --- a/kernels/optimized/CMakeLists.txt +++ b/kernels/optimized/CMakeLists.txt @@ -34,7 +34,7 @@ list(APPEND _common_compile_options -DET_BUILD_WITH_BLAS) # compiling for avx2 for now punting this to come back include(${EXECUTORCH_ROOT}/build/Utils.cmake) -include(${EXECUTORCH_ROOT}/build/Codegen.cmake) +include(${EXECUTORCH_ROOT}/scripts/build/Codegen.cmake) if(NOT PYTHON_EXECUTABLE) resolve_python_executable() diff --git a/kernels/portable/CMakeLists.txt b/kernels/portable/CMakeLists.txt index e15970329c1..5072723296c 100644 --- a/kernels/portable/CMakeLists.txt +++ b/kernels/portable/CMakeLists.txt @@ -24,7 +24,7 @@ endif() set(_common_compile_options -Wno-deprecated-declarations) include(${EXECUTORCH_ROOT}/build/Utils.cmake) -include(${EXECUTORCH_ROOT}/build/Codegen.cmake) +include(${EXECUTORCH_ROOT}/scripts/build/Codegen.cmake) if(NOT PYTHON_EXECUTABLE) resolve_python_executable() diff --git a/kernels/portable/cpu/util/targets.bzl b/kernels/portable/cpu/util/targets.bzl index bf2fe042a93..95fd1734d8e 100644 --- a/kernels/portable/cpu/util/targets.bzl +++ b/kernels/portable/cpu/util/targets.bzl @@ -32,6 +32,7 @@ def define_common_targets(): "//executorch/kernels/portable/cpu/util:slice_util", "//executorch/kernels/portable/cpu/util:elementwise_util", "//executorch/kernels/portable/cpu/util:upsample_util", + "//executorch/runtime/kernel:thread_parallel_interface", ], visibility = ["//executorch/...", "@EXECUTORCH_CLIENTS"], ) diff --git a/kernels/quantized/CMakeLists.txt b/kernels/quantized/CMakeLists.txt index 6b01ba4fc27..c3c4c161b5f 100644 --- a/kernels/quantized/CMakeLists.txt +++ b/kernels/quantized/CMakeLists.txt @@ -27,7 +27,7 @@ endif() set(_common_compile_options -Wno-deprecated-declarations) include(${EXECUTORCH_ROOT}/build/Utils.cmake) -include(${EXECUTORCH_ROOT}/build/Codegen.cmake) +include(${EXECUTORCH_ROOT}/scripts/build/Codegen.cmake) if(NOT PYTHON_EXECUTABLE) resolve_python_executable() diff --git a/schema/CMakeLists.txt b/schema/CMakeLists.txt index 64f8821da1e..484363acdf5 100644 --- a/schema/CMakeLists.txt +++ b/schema/CMakeLists.txt @@ -9,10 +9,6 @@ # cmake-format -i CMakeLists.txt # ~~~ -if(NOT FLATC_EXECUTABLE) - set(FLATC_EXECUTABLE flatc) -endif() - # The include directory that will contain the generated schema headers. 
 set(_program_schema__include_dir "${CMAKE_BINARY_DIR}/schema/include")
 set(_program_schema__output_dir "${_program_schema__include_dir}/executorch/schema")
@@ -37,7 +33,7 @@ function(generate_program_schema _schema_srcs _schema_name)
     ${FLATC_EXECUTABLE} --cpp --cpp-std c++11 --gen-mutable --scoped-enums -o
     "${_program_schema__output_dir}" ${_schema_srcs}
     WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
-    DEPENDS ${FLATC_EXECUTABLE} ${_schema_srcs}
+    DEPENDS flatc ${_schema_srcs}
     COMMENT "Generating ${_schema_name} headers"
     VERBATIM
   )
@@ -49,7 +45,7 @@ function(generate_program_schema _schema_srcs _schema_name)
 # and some users need an alignment larger than the default, which is typically
 # 32.
   target_compile_definitions(
-    ${_schema_name} INTERFACE FLATBUFFERS_MAX_ALIGNMENT=1024
+    ${_schema_name} INTERFACE FLATBUFFERS_MAX_ALIGNMENT=${FLATBUFFERS_MAX_ALIGNMENT}
   )
 
   target_include_directories(
diff --git a/build/Codegen.cmake b/scripts/build/Codegen.cmake
similarity index 100%
rename from build/Codegen.cmake
rename to scripts/build/Codegen.cmake
diff --git a/setup.py b/setup.py
index 28251cdf0df..32ec94708af 100644
--- a/setup.py
+++ b/setup.py
@@ -652,10 +652,6 @@ def run(self):
 
         build_args = [f"-j{self.parallel}"]
 
-        # TODO(dbort): Try to manage these targets and the cmake args from the
-        # extension entries themselves instead of hard-coding them here.
-        build_args += ["--target", "flatc"]
-
         if ShouldBuild.pybindings():
             cmake_args += [
                 "-DEXECUTORCH_BUILD_PYBIND=ON",