diff --git a/.github/workflows/android-perf.yml b/.github/workflows/android-perf.yml index f21ed849d03..8c0ba752259 100644 --- a/.github/workflows/android-perf.yml +++ b/.github/workflows/android-perf.yml @@ -96,63 +96,6 @@ jobs: PYTHONPATH="${PWD}" python .ci/scripts/gather_benchmark_configs.py $ARGS - prepare-test-specs: - runs-on: linux.2xlarge - needs: set-parameters - strategy: - matrix: ${{ fromJson(needs.set-parameters.outputs.benchmark_configs) }} - fail-fast: false - steps: - - uses: actions/checkout@v3 - - - name: Prepare the spec - id: prepare - shell: bash - env: - BENCHMARK_CONFIG: ${{ toJSON(matrix) }} - working-directory: extension/benchmark/android/benchmark - run: | - set -eux - - # The model will be exported in the next step to this S3 path - MODEL_PATH="https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifacts/${{ matrix.model }}_${{ matrix.config }}/model.zip" - # We could write a script to properly use jinja here, but there is only one variable, - # so let's just sed it - sed -i -e 's,{{ model_path }},'"${MODEL_PATH}"',g' android-llm-device-farm-test-spec.yml.j2 - - BENCHMARK_CONFIG_ID=$(echo "${{ matrix.model }}_${{ matrix.config }}" | sed -e 's/[^A-Za-z0-9._-]/_/g') - # The config for this benchmark runs, we save it in the test spec so that it can be fetched - # later by the upload script - sed -i -e 's,{{ benchmark_config_id }},'"${BENCHMARK_CONFIG_ID}"',g' android-llm-device-farm-test-spec.yml.j2 - - cp android-llm-device-farm-test-spec.yml.j2 android-llm-device-farm-test-spec.yml - # Just print the test spec for debugging - cat android-llm-device-farm-test-spec.yml - - # Save the benchmark configs so that we can use it later in the dashboard - echo "${BENCHMARK_CONFIG}" > "${BENCHMARK_CONFIG_ID}.json" - echo "benchmark-config-id=${BENCHMARK_CONFIG_ID}" >> $GITHUB_OUTPUT - - - name: Upload the spec - uses: seemethere/upload-artifact-s3@v5 - with: - s3-bucket: gha-artifacts - s3-prefix: | - ${{ github.repository }}/${{ github.run_id }}/artifacts/${{ matrix.model }}_${{ matrix.config }} - retention-days: 1 - if-no-files-found: error - path: extension/benchmark/android/benchmark/android-llm-device-farm-test-spec.yml - - - name: Update the benchmark configs - uses: seemethere/upload-artifact-s3@v5 - with: - s3-bucket: gha-artifacts - s3-prefix: | - ${{ github.repository }}/${{ github.run_id }}/artifacts/benchmark-configs/ - retention-days: 1 - if-no-files-found: error - path: extension/benchmark/android/benchmark/${{ steps.prepare.outputs.benchmark-config-id }}.json - export-models: name: export-models uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main @@ -335,6 +278,69 @@ jobs: fi echo "::endgroup::" + prepare-test-specs: + runs-on: linux.2xlarge + needs: + - set-parameters + - export-models + strategy: + matrix: ${{ fromJson(needs.set-parameters.outputs.benchmark_configs) }} + fail-fast: false + steps: + - uses: actions/checkout@v3 + + - name: Prepare the spec + id: prepare + shell: bash + env: + BENCHMARK_CONFIG: ${{ toJSON(matrix) }} + working-directory: extension/benchmark/android/benchmark + run: | + set -eux + + # The model will be exported in the next step to this S3 path + MODEL_PATH="https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifacts/${{ matrix.model }}_${{ matrix.config }}/model.zip" + + # Check if the model artifact exists, fail this step skip generating test-spec. 
+ curl -s --head -f ${MODEL_PATH} + + # We could write a script to properly use jinja here, but there is only one variable, + # so let's just sed it + sed -i -e 's,{{ model_path }},'"${MODEL_PATH}"',g' android-llm-device-farm-test-spec.yml.j2 + + BENCHMARK_CONFIG_ID=$(echo "${{ matrix.model }}_${{ matrix.config }}" | sed -e 's/[^A-Za-z0-9._-]/_/g') + # The config for this benchmark runs, we save it in the test spec so that it can be fetched + # later by the upload script + sed -i -e 's,{{ benchmark_config_id }},'"${BENCHMARK_CONFIG_ID}"',g' android-llm-device-farm-test-spec.yml.j2 + + cp android-llm-device-farm-test-spec.yml.j2 android-llm-device-farm-test-spec.yml + # Just print the test spec for debugging + cat android-llm-device-farm-test-spec.yml + + # Save the benchmark configs so that we can use it later in the dashboard + echo "${BENCHMARK_CONFIG}" > "${BENCHMARK_CONFIG_ID}.json" + echo "benchmark-config-id=${BENCHMARK_CONFIG_ID}" >> $GITHUB_OUTPUT + + - name: Upload the spec + uses: seemethere/upload-artifact-s3@v5 + with: + s3-bucket: gha-artifacts + s3-prefix: | + ${{ github.repository }}/${{ github.run_id }}/artifacts/${{ matrix.model }}_${{ matrix.config }} + retention-days: 1 + if-no-files-found: error + path: extension/benchmark/android/benchmark/android-llm-device-farm-test-spec.yml + + - name: Update the benchmark configs + uses: seemethere/upload-artifact-s3@v5 + with: + s3-bucket: gha-artifacts + s3-prefix: | + ${{ github.repository }}/${{ github.run_id }}/artifacts/benchmark-configs/ + retention-days: 1 + if-no-files-found: error + path: extension/benchmark/android/benchmark/${{ steps.prepare.outputs.benchmark-config-id }}.json + build-benchmark-app: name: build-benchmark-app uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main diff --git a/.github/workflows/apple-perf.yml b/.github/workflows/apple-perf.yml index 44aa645d16d..cc7f85e9386 100644 --- a/.github/workflows/apple-perf.yml +++ b/.github/workflows/apple-perf.yml @@ -98,63 +98,6 @@ jobs: echo "benchmark_configs is: ${{ steps.set-parameters.outputs.benchmark_configs }}" - prepare-test-specs: - runs-on: linux.2xlarge - needs: set-parameters - strategy: - matrix: ${{ fromJson(needs.set-parameters.outputs.benchmark_configs) }} - fail-fast: false - steps: - - uses: actions/checkout@v3 - - - name: Prepare the spec - id: prepare - shell: bash - env: - BENCHMARK_CONFIG: ${{ toJSON(matrix) }} - working-directory: extension/benchmark/apple/Benchmark - run: | - set -eux - - # The model will be exported in the next step to this S3 path - MODEL_PATH="https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifacts/${{ matrix.model }}_${{ matrix.config }}/model.zip" - # We could write a script to properly use jinja here, but there is only one variable, - # so let's just sed it - sed -i -e 's,{{ model_path }},'"${MODEL_PATH}"',g' default-ios-device-farm-appium-test-spec.yml.j2 - - BENCHMARK_CONFIG_ID=$(echo "${{ matrix.model }}_${{ matrix.config }}" | sed -e 's/[^A-Za-z0-9._-]/_/g') - # The config for this benchmark runs, we save it in the test spec so that it can be fetched - # later by the upload script - sed -i -e 's,{{ benchmark_config_id }},'"${BENCHMARK_CONFIG_ID}"',g' default-ios-device-farm-appium-test-spec.yml.j2 - - cp default-ios-device-farm-appium-test-spec.yml.j2 default-ios-device-farm-appium-test-spec.yml - # Just print the test spec for debugging - cat default-ios-device-farm-appium-test-spec.yml - - # Save the benchmark configs so that we can use it later in the 
dashboard - echo "${BENCHMARK_CONFIG}" > "${BENCHMARK_CONFIG_ID}.json" - echo "benchmark-config-id=${BENCHMARK_CONFIG_ID}" >> $GITHUB_OUTPUT - - - name: Upload the spec - uses: seemethere/upload-artifact-s3@v5 - with: - s3-bucket: gha-artifacts - s3-prefix: | - ${{ github.repository }}/${{ github.run_id }}/artifacts/${{ matrix.model }}_${{ matrix.config }} - retention-days: 1 - if-no-files-found: error - path: extension/benchmark/apple/Benchmark/default-ios-device-farm-appium-test-spec.yml - - - name: Update the benchmark configs - uses: seemethere/upload-artifact-s3@v5 - with: - s3-bucket: gha-artifacts - s3-prefix: | - ${{ github.repository }}/${{ github.run_id }}/artifacts/benchmark-configs/ - retention-days: 1 - if-no-files-found: error - path: extension/benchmark/apple/Benchmark/${{ steps.prepare.outputs.benchmark-config-id }}.json - export-models: name: export-models uses: pytorch/test-infra/.github/workflows/macos_job.yml@main @@ -344,6 +287,68 @@ jobs: fi echo "::endgroup::" + prepare-test-specs: + runs-on: linux.2xlarge + needs: + - set-parameters + - export-models + strategy: + matrix: ${{ fromJson(needs.set-parameters.outputs.benchmark_configs) }} + fail-fast: false + steps: + - uses: actions/checkout@v3 + + - name: Prepare the spec + id: prepare + shell: bash + env: + BENCHMARK_CONFIG: ${{ toJSON(matrix) }} + working-directory: extension/benchmark/apple/Benchmark + run: | + set -eux + + # The model will be exported in the next step to this S3 path + MODEL_PATH="https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifacts/${{ matrix.model }}_${{ matrix.config }}/model.zip" + # Check if the model artifact exists, fail this step skip generating test-spec. + curl -s --head -f ${MODEL_PATH} + # We could write a script to properly use jinja here, but there is only one variable, + # so let's just sed it + sed -i -e 's,{{ model_path }},'"${MODEL_PATH}"',g' default-ios-device-farm-appium-test-spec.yml.j2 + + BENCHMARK_CONFIG_ID=$(echo "${{ matrix.model }}_${{ matrix.config }}" | sed -e 's/[^A-Za-z0-9._-]/_/g') + # The config for this benchmark runs, we save it in the test spec so that it can be fetched + # later by the upload script + sed -i -e 's,{{ benchmark_config_id }},'"${BENCHMARK_CONFIG_ID}"',g' default-ios-device-farm-appium-test-spec.yml.j2 + + cp default-ios-device-farm-appium-test-spec.yml.j2 default-ios-device-farm-appium-test-spec.yml + # Just print the test spec for debugging + cat default-ios-device-farm-appium-test-spec.yml + + # Save the benchmark configs so that we can use it later in the dashboard + echo "${BENCHMARK_CONFIG}" > "${BENCHMARK_CONFIG_ID}.json" + echo "benchmark-config-id=${BENCHMARK_CONFIG_ID}" >> $GITHUB_OUTPUT + + - name: Upload the spec + uses: seemethere/upload-artifact-s3@v5 + with: + s3-bucket: gha-artifacts + s3-prefix: | + ${{ github.repository }}/${{ github.run_id }}/artifacts/${{ matrix.model }}_${{ matrix.config }} + retention-days: 1 + if-no-files-found: error + path: extension/benchmark/apple/Benchmark/default-ios-device-farm-appium-test-spec.yml + + - name: Update the benchmark configs + uses: seemethere/upload-artifact-s3@v5 + with: + s3-bucket: gha-artifacts + s3-prefix: | + ${{ github.repository }}/${{ github.run_id }}/artifacts/benchmark-configs/ + retention-days: 1 + if-no-files-found: error + path: extension/benchmark/apple/Benchmark/${{ steps.prepare.outputs.benchmark-config-id }}.json + + build-benchmark-app: name: build-benchmark-app uses: pytorch/test-infra/.github/workflows/macos_job.yml@main 
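Both workflow diffs above make `prepare-test-specs` depend on `export-models` and probe the exported artifact with `curl -s --head -f "${MODEL_PATH}"`, so the device-farm test spec is only generated when the model zip actually exists in S3. As a rough sketch only (not code from this PR; the helper name, imports, and example URL are illustrative assumptions), the same HEAD-request check could be written in Python like this:

```
# Hypothetical sketch of the artifact-existence check the workflows now perform
# with `curl -s --head -f` before templating the device-farm test spec.
import urllib.error
import urllib.request


def artifact_exists(url: str) -> bool:
    """Return True if an HTTP HEAD request for `url` succeeds with a 2xx status."""
    request = urllib.request.Request(url, method="HEAD")
    try:
        with urllib.request.urlopen(request) as response:
            return 200 <= response.status < 300
    except urllib.error.URLError:
        # S3 typically answers 403/404 when export-models never uploaded the zip.
        return False


# Example with a made-up URL mirroring the MODEL_PATH layout used above:
# if not artifact_exists("https://gha-artifacts.s3.amazonaws.com/org/repo/123/artifacts/llama_xnnpack/model.zip"):
#     raise SystemExit("model artifact missing; skipping test-spec generation")
```

Like curl's `-f` flag, treating any non-2xx response as "missing" makes the job fail fast instead of producing a spec that points at a nonexistent artifact.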
diff --git a/CMakeLists.txt b/CMakeLists.txt index 6bdcda2f19c..de941663a88 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -248,14 +248,15 @@ cmake_dependent_option( "NOT EXECUTORCH_BUILD_ARM_BAREMETAL" OFF ) -if(EXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR) +if(EXECUTORCH_BUILD_EXTENSION_TRAINING) set(EXECUTORCH_BUILD_EXTENSION_DATA_LOADER ON) + set(EXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR ON) + set(EXECUTORCH_BUILD_EXTENSION_MODULE ON) + set(EXECUTORCH_BUILD_EXTENSION_TENSOR ON) endif() -if(EXECUTORCH_BUILD_EXTENSION_TRAINING) - set(EXECUTORCH_BUILD_EXTENSION_TENSOR ON) +if(EXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR) set(EXECUTORCH_BUILD_EXTENSION_DATA_LOADER ON) - set(EXECUTORCH_BUILD_EXTENSION_MODULE ON) endif() if(EXECUTORCH_BUILD_EXTENSION_MODULE) diff --git a/backends/apple/coreml/TARGETS b/backends/apple/coreml/TARGETS index d77e33679ab..df1165dd74e 100644 --- a/backends/apple/coreml/TARGETS +++ b/backends/apple/coreml/TARGETS @@ -14,10 +14,10 @@ runtime.python_library( "@EXECUTORCH_CLIENTS", ], deps = [ + "fbsource//third-party/pypi/coremltools:coremltools", ":executorchcoreml", "//executorch/exir/backend:backend_details", "//executorch/exir/backend:compile_spec_schema", - "fbsource//third-party/pypi/coremltools:coremltools", ], ) @@ -30,13 +30,13 @@ runtime.python_library( "@EXECUTORCH_CLIENTS", ], deps = [ + "fbsource//third-party/pypi/coremltools:coremltools", ":backend", "//caffe2:torch", "//executorch/exir:lib", "//executorch/exir/backend:compile_spec_schema", "//executorch/exir/backend:partitioner", "//executorch/exir/backend:utils", - "fbsource//third-party/pypi/coremltools:coremltools", ], ) @@ -64,25 +64,23 @@ runtime.cxx_python_extension( headers = glob([ "runtime/inmemoryfs/**/*.hpp", ]), + base_module = "", + compiler_flags = [ + "-std=c++17", + ], preprocessor_flags = [ "-Iexecutorch/backends/apple/coreml/runtime/util", ], types = [ "executorchcoreml.pyi", ], - compiler_flags = [ - "-std=c++17", - ], - base_module = "", visibility = [ "//executorch/examples/apple/coreml/...", "@EXECUTORCH_CLIENTS", ], - external_deps = [ - "pybind11", - ], deps = [ "fbsource//third-party/nlohmann-json:nlohmann-json", + "fbsource//third-party/pybind11:pybind11", ], ) @@ -92,10 +90,10 @@ runtime.python_test( "test/*.py", ]), deps = [ + "fbsource//third-party/pypi/pytest:pytest", ":partitioner", ":quantizer", "//caffe2:torch", "//pytorch/vision:torchvision", - "fbsource//third-party/pypi/pytest:pytest", ], ) diff --git a/backends/arm/tosa_mapping.py b/backends/arm/tosa_mapping.py index 9a8b6b2c35d..292d6209bb3 100644 --- a/backends/arm/tosa_mapping.py +++ b/backends/arm/tosa_mapping.py @@ -107,7 +107,10 @@ def __init__(self, argument: Any) -> None: if isinstance(argument, (int, float)): self.__process_number(argument) return + if isinstance(argument, torch.dtype): + # Dtype is parsed from fake tensor + return - RuntimeError( + raise RuntimeError( f"Unhandled node input argument: {argument}, of type {type(argument)}" ) diff --git a/backends/qualcomm/aot/python/targets.bzl b/backends/qualcomm/aot/python/targets.bzl index e1f5a6a8fc5..f29c02aa593 100644 --- a/backends/qualcomm/aot/python/targets.bzl +++ b/backends/qualcomm/aot/python/targets.bzl @@ -33,10 +33,10 @@ def define_common_targets(): "//executorch/backends/qualcomm:schema", "//executorch/backends/qualcomm/aot/ir:qcir_utils", "//executorch/backends/qualcomm/runtime:runtime", + "fbsource//third-party/pybind11:pybind11", "fbsource//third-party/qualcomm/qnn/qnn-{0}:api".format(get_qnn_library_verision()), ], external_deps = [ - "pybind11", 
"libtorch_python", ], use_static_deps = True, @@ -66,10 +66,10 @@ def define_common_targets(): "//executorch/backends/qualcomm:schema", "//executorch/backends/qualcomm/aot/ir:qcir_utils", "//executorch/backends/qualcomm/runtime:runtime", + "fbsource//third-party/pybind11:pybind11", "fbsource//third-party/qualcomm/qnn/qnn-{0}:api".format(get_qnn_library_verision()), ], external_deps = [ - "pybind11", "libtorch_python", ], use_static_deps = True, @@ -93,9 +93,7 @@ def define_common_targets(): "//executorch/backends/qualcomm:schema", "//executorch/backends/qualcomm/aot/ir:qcir_utils", "//executorch/backends/qualcomm/runtime:runtime", + "fbsource//third-party/pybind11:pybind11", "fbsource//third-party/qualcomm/qnn/qnn-{0}:api".format(get_qnn_library_verision()), ], - external_deps = [ - "pybind11", - ], ) diff --git a/backends/xnnpack/partition/config/gemm_configs.py b/backends/xnnpack/partition/config/gemm_configs.py index 872ba355c70..8712c2709ac 100644 --- a/backends/xnnpack/partition/config/gemm_configs.py +++ b/backends/xnnpack/partition/config/gemm_configs.py @@ -21,6 +21,7 @@ is_dynamic_qdq, is_per_channel, is_per_channel_group, + is_per_tensor, is_qparam, is_quant, ) @@ -66,8 +67,6 @@ def check_constraints(self, node: torch.fx.Node, ep: ExportedProgram) -> bool: return False is_valid, _ = self.get_deps(node, ep) - if not is_valid: - why(node, "Failed to get valid dependent nodes.") return is_valid def get_node_and_deps( @@ -97,9 +96,9 @@ def _detect_precision(self, node: torch.fx.Node) -> ConfigPrecisionType: def _overwrite_precision(self, node: torch.fx.Node): precision = self._detect_precision(node) if precision not in self.enabled_precision_types: - # detected precision is not enabled, lets try to partition it as fp32 + # detected precision is not enabled, try to partition it as fp32 if self.enabled_precision_types == [ConfigPrecisionType.FP32]: - # if only fp32 is enabled, then we can still partition fp32 gemms + # when only fp32 is enabled, then we can still partition fp32 gemms # even with in a quantized graph if precision in [ ConfigPrecisionType.STATIC_QUANT, @@ -108,6 +107,7 @@ def _overwrite_precision(self, node: torch.fx.Node): precision = ConfigPrecisionType.FP32 logging.info(f"Overwriting precision, partitioning {node} as FP32") return True, precision + return False, precision def get_deps( @@ -123,6 +123,7 @@ def get_deps( precision = self._detect_precision(node) if precision not in self.supported_precision_types(): # detected precision but it is either disabled or not supported + why(node, f"Unsupported precision type {precision}") return (False, []) _, precision = self._overwrite_precision(node) valid_bias, bias_deps = self._get_bias_deps(node, ep, precision) @@ -143,7 +144,8 @@ def _get_weight_deps( # First find the weight weight_node = get_input_node(node, self.weight_idx) if not is_param_node(ep, weight_node): - return (False, []) # weight must be a static param + why(node, "Expected weight to be a static param") + return (False, []) gemm_deps.append(weight_node) return (True, gemm_deps) @@ -151,19 +153,33 @@ def _get_weight_deps( # Quantized Weight deps dequant_node = get_input_node(node, self.weight_idx) if not is_dequant(dequant_node): + why(node, "Expected weight to have a dequantized node") return False, [] gemm_deps.append(dequant_node) weight = get_input_node(dequant_node, 0) if not is_param_node(ep, weight): + why(node, "Expected weight to be a static param") return False, [] gemm_deps.append(weight) + if ( + is_per_tensor(dequant_node) + and precision == 
ConfigPrecisionType.DYNAMIC_QUANT + ): + why( + node, + "XNNPACK does not support per tensor quantized weights for dynamic quantization of activations", + ) + return False, [] + if is_per_channel(dequant_node) or is_per_channel_group(dequant_node): if len(dequant_node.all_input_nodes) < 2: # Expected channel quantized to have scale/zp nodes + why(node, "Expected channel quantized to have scale/zp nodes") return False, [] gemm_deps.extend(dequant_node.all_input_nodes[1:3]) + return (True, gemm_deps) def _get_output_deps( @@ -174,7 +190,7 @@ def _get_output_deps( # Look for fused activations and tail end quant node node_users = list(node.users.keys()) if len(node_users) != 1: - # Expect quantized node to have a single output (fused act or dequant) + why(node, "Expected quantized node to have a single output") return False, [] # Check if the quantized pattern has a fused activation @@ -190,6 +206,7 @@ def _get_output_deps( if not is_quant(n_output): # Expected gemm_node --> fused_act (optional) --> dequant + why(node, "Expected output node to have a dequantized node") return (False, []) gemm_deps.append(n_output) elif precision == ConfigPrecisionType.FP32: @@ -210,8 +227,11 @@ def _get_bias_deps( self, node: torch.fx.Node, ep: ExportedProgram, precision: ConfigPrecisionType ) -> Tuple[bool, List[torch.fx.Node]]: gemm_deps = [] - if precision == ConfigPrecisionType.FP32 and self.force_fp32_dynamic_linear: - # if force force_fp32_dynamic_linear is enabled, then we + if ( + precision == ConfigPrecisionType.FP32 + and self.force_non_static_weights_for_f32_linear + ): + # if force_non_static_weights_for_f32_linear is enabled, then we # do not partition the weight node return (True, gemm_deps) @@ -219,7 +239,8 @@ def _get_bias_deps( bias_node = get_input_node(node, self.bias_idx) if bias_node: if not is_param_node(ep, bias_node): - return (False, []) # bias node must be a static param + why(node, "Expected bias to be a static param") + return (False, []) gemm_deps.append(bias_node) return (True, gemm_deps) @@ -233,7 +254,7 @@ def _get_act_deps( else: dq_input = get_input_node(node, self.act_idx) if not is_dequant(dq_input): - # Expected static quant input to be dequant node + why(node, "Expected act input to be dequant node") return False, [] gemm_deps.append(dq_input) if precision == ConfigPrecisionType.STATIC_QUANT: @@ -243,6 +264,7 @@ def _get_act_deps( # q input node q_input = get_input_node(dq_input, 0) if not is_quant(q_input): + why(node, "Expected dequant input to be quant node") return (False, []) gemm_deps.append(q_input) @@ -250,20 +272,20 @@ def _get_act_deps( if is_affine_qdq(q_input): q_input_args = extract_qdq_affine_op_args_for_decomposed_ops(q_input) if not (is_node(q_input_args[1]) and is_node(q_input_args[2])): - # expected to find getitem node from choose qparam + why(node, "expected to find getitem node from choose qparam") return (False, []) getitem1 = q_input_args[1] getitem2 = q_input_args[2] if not (is_getitem(getitem1) and is_getitem(getitem2)): - # expected getitem node from choose qparam + why(node, "expected getitem node from choose qparam") return (False, []) gemm_deps.extend([getitem1, getitem2]) choose_qparam = get_input_node(getitem1, 0) if not is_qparam(choose_qparam): - # expected to find choose_qparam node + why(node, "expected to find choose_qparam node") return (False, []) gemm_deps.append(choose_qparam) return (True, gemm_deps) @@ -287,8 +309,11 @@ def get_original_aten(self) -> Optional[torch._ops.OpOverload]: def _get_weight_deps( self, node: torch.fx.Node, 
ep: ExportedProgram, precision: ConfigPrecisionType ) -> Tuple[bool, List[torch.fx.Node]]: - if precision == ConfigPrecisionType.FP32 and self.force_fp32_dynamic_linear: - # if force fp32_dynamic_linear is enabled, then we + if ( + precision == ConfigPrecisionType.FP32 + and self.force_non_static_weights_for_f32_linear + ): + # if force_non_static_weights_for_f32_linear is enabled, then we # do not partition the weight node return (True, []) @@ -394,9 +419,11 @@ def __init__(self, **kwargs): def _get_weight_deps( self, node: torch.fx.Node, ep: ExportedProgram, precision: ConfigPrecisionType ) -> Tuple[bool, List[torch.fx.Node]]: - # TODO(maxren, T210537195): - if precision == ConfigPrecisionType.FP32 and self.force_fp32_dynamic_linear: - # if force fp32_dynamic_linear is on and we detected this as fp32, then we + if ( + precision == ConfigPrecisionType.FP32 + and self.force_non_static_weights_for_f32_linear + ): + # if force_non_static_weights_for_f32_linear is on and we detected this as fp32, then we # do not partition the weight node return (True, []) @@ -471,6 +498,7 @@ def find_partition_args(input_node): # there can only be a single output node in partition or len(src_partition.output_nodes) != 1 ): + why(node, "invalid source partition") return (False, []) # map addmm's args to the source partition linear's inputs and users @@ -482,11 +510,11 @@ def find_partition_args(input_node): node.args = old_args node.users = old_users - # When using force_fp32_dynamic_linear, we want to get_deps to overwrite the source partition nodes. + # When using force_non_static_weights_for_f32_linear, we want to get_deps to overwrite the source partition nodes. # Else we want to be greedy. ret_deps = ( list(set(deps) & set(src_partition.nodes)) - if self.force_fp32_dynamic_linear + if self.force_non_static_weights_for_f32_linear else list(set(deps) | set(src_partition.nodes)) ) @@ -512,8 +540,11 @@ def __init__(self, **kwargs): def _get_weight_deps( self, node: torch.fx.Node, ep: ExportedProgram, precision: ConfigPrecisionType ) -> Tuple[bool, List[torch.fx.Node]]: - if precision == ConfigPrecisionType.FP32 and self.force_fp32_dynamic_linear: - # if force fp32_dynamic_linear is on and we detected this as fp32, then we + if ( + precision == ConfigPrecisionType.FP32 + and self.force_non_static_weights_for_f32_linear + ): + # if force_non_static_weights_for_f32_linear is on and we detected this as fp32, then we # do not partition the weight node return (True, []) diff --git a/backends/xnnpack/partition/config/xnnpack_config.py b/backends/xnnpack/partition/config/xnnpack_config.py index d261416a76f..20018610fce 100644 --- a/backends/xnnpack/partition/config/xnnpack_config.py +++ b/backends/xnnpack/partition/config/xnnpack_config.py @@ -41,7 +41,9 @@ def __init__(self, **kwargs): super().__init__() self.enabled_precision_types = self.supported_precision_types() # Flag used in GEMMConfig() - self.force_fp32_dynamic_linear = kwargs.get("force_fp32_dynamic_linear", False) + self.force_non_static_weights_for_f32_linear = kwargs.get( + "force_non_static_weights_for_f32_linear", False + ) def get_partition( self, node: torch.fx.Node, ep: ExportedProgram diff --git a/backends/xnnpack/test/ops/test_linear.py b/backends/xnnpack/test/ops/test_linear.py index 30bb4f0aba2..690a1109a17 100644 --- a/backends/xnnpack/test/ops/test_linear.py +++ b/backends/xnnpack/test/ops/test_linear.py @@ -539,6 +539,66 @@ def _test_qd8_per_channel_linear(self, dtype: torch.dtype = torch.float): uses_bias=uses_bias, ) + def 
_test_qd8_linear_per_tensor_unsupported(self, dtype: torch.dtype = torch.float): + for uses_bias in (False, True): + module = BaseLinear( + in_size=8, + input_channels=13, + output_channels=17, + dtype=dtype, + use_bias=uses_bias, + ) + inputs = module.get_inputs() + dynamic_shapes = ({1: torch.export.Dim("batch", max=100)},) + + quant_config = get_symmetric_quantization_config( + is_per_channel=False, + is_dynamic=True, + ) + + for legacy_partitioner in (True, False): + for per_op_mode in (True, False): + # Every combination should fail to partition Linear or [add]mm. + DynamicallyQuantizedPartitioner = XnnpackPartitioner( + config_precisions=ConfigPrecisionType.DYNAMIC_QUANT, + per_op_mode=per_op_mode, + ) + + tester = Tester(module, inputs, dynamic_shapes=dynamic_shapes) + tester.quantize(Quantize(quantization_config=quant_config)) + tester.export() + + if legacy_partitioner: + tester.to_edge() + tester.partition( + Partition(DynamicallyQuantizedPartitioner) + ).dump_artifact() + # should have [add]mm node + if uses_bias: + tester.check( + [ + "executorch_exir_dialects_edge__ops_aten_addmm_default", + ] + ) + else: + tester.check( + [ + "executorch_exir_dialects_edge__ops_aten_mm_default", + ] + ) + else: + tester.to_edge_transform_and_lower( + ToEdgeTransformAndLower([DynamicallyQuantizedPartitioner]) + ).dump_artifact() + # should not have a delegate node + tester.check_not( + [ + "torch.ops.higher_order.executorch_call_delegate", + ] + ) + # No need to run the model, since it should fail to partition. + return + def _test_qd8_per_channel_4w_linear(self, dtype: torch.dtype = torch.float): qconfig = self._get_4b_dqconfig() input_channels = [2, 63] @@ -697,10 +757,24 @@ def test_qs8_linear(self): def test_qd8_f16_per_channel_linear(self): self._test_qd8_per_channel_linear(dtype=torch.half) + def test_qd8_f16_per_tensor_linear(self): + """ + XNNPACK doesn't support per_tensor quantized weights for dynamic quantized linear op. + This test is to verify that we can't lower per_tensor quantized weights to per_channel quantized weights. + """ + self._test_qd8_linear_per_tensor_unsupported(dtype=torch.half) + # Tests for q[dp]8-f32-qc8w def test_qd8_f32_per_channel_linear(self): self._test_qd8_per_channel_linear(dtype=torch.float) + def test_qd8_f32_per_tensor_linear(self): + """ + XNNPACK doesn't support per_tensor quantized weights for dynamic quantized linear op. + This test is to verify that we can't lower per_tensor quantized weights to per_channel quantized weights. 
+ """ + self._test_qd8_linear_per_tensor_unsupported(dtype=torch.float) + # Tests for q[dp]8-f16-qc4w def test_linear_qd8_f16_per_channel_int4(self): self._test_qd8_per_channel_4w_linear(dtype=torch.half) @@ -874,7 +948,7 @@ def test_linear_qd8_as_fp32(self): }, ) - def test_linear_fp32_with_force_as_mm(self): + def test_linear_with_force_non_static_weights_for_f32_linear(self): def check_signature( signature: ExportGraphSignature, force_flag: bool, @@ -907,7 +981,7 @@ def check_signature( inputs = module.get_inputs() tester = Tester(module, inputs).export() partitioner = XnnpackPartitioner( - force_fp32_dynamic_linear=force_flag + force_non_static_weights_for_f32_linear=force_flag ) if legacy_mode: tester.to_edge() diff --git a/backends/xnnpack/test/ops/test_lstm.py b/backends/xnnpack/test/ops/test_lstm.py index be209082b37..6c174b16f33 100644 --- a/backends/xnnpack/test/ops/test_lstm.py +++ b/backends/xnnpack/test/ops/test_lstm.py @@ -43,18 +43,20 @@ def test_fp32_lstm(self): .run_method_and_compare_outputs() ) - def test_fp32_lstm_force_dynamic_linear(self): + def test_lstm_with_force_non_static_weights_for_f32_linear(self): ( Tester(self.LSTMLinear(32, 32, 10), (torch.rand(1, 32, 32),)) .export() .to_edge_transform_and_lower( ToEdgeTransformAndLower( - partitioners=[XnnpackPartitioner(force_fp32_dynamic_linear=True)] + partitioners=[ + XnnpackPartitioner(force_non_static_weights_for_f32_linear=True) + ] ) ) .check_not(["executorch_exir_dialects_edge__ops_aten_addmm_default"]) # Weights are supplied as input to linears - # Biases are not owned by delegates when force_fp32_dynamic_linear is set + # Biases are not owned by delegates when force_non_static_weights_for_f32_linear is set .check(["p_lstm_weight_hh_l0", "p_lstm_weight_ih_l0", "p_lstm_bias"]) .to_executorch() .serialize() diff --git a/backends/xnnpack/utils/quant_utils.py b/backends/xnnpack/utils/quant_utils.py index 7c035757a6f..49c5a963161 100644 --- a/backends/xnnpack/utils/quant_utils.py +++ b/backends/xnnpack/utils/quant_utils.py @@ -89,6 +89,15 @@ def is_per_channel(node: torch.fx.Node) -> bool: return is_per_channel or is_affine_per_channel_group +def is_per_tensor(node: torch.fx.Node) -> bool: + if not (is_quant(node) or is_dequant(node)): + return False + + is_per_tensor = "per_tensor" in node.target.__name__ # pyre-ignore + + return is_per_tensor and not (is_per_channel(node)) + + def is_affine_qdq(node: torch.fx.Node) -> bool: if not (is_quant(node) or is_dequant(node)): return False diff --git a/devtools/etdump/etdump_flatcc.cpp b/devtools/etdump/etdump_flatcc.cpp index a5242c8ed4b..8c20bb4ad89 100644 --- a/devtools/etdump/etdump_flatcc.cpp +++ b/devtools/etdump/etdump_flatcc.cpp @@ -503,7 +503,7 @@ void ETDumpGen::set_debug_buffer(Span<uint8_t> buffer) { Result<BufferDataSink> bds_ret = BufferDataSink::create(buffer); ET_CHECK_MSG( bds_ret.ok(), - "Failed to write tensor with error 0x%" PRIx32, + "Failed to create data sink from debug buffer with error 0x%" PRIx32, static_cast<uint32_t>(bds_ret.error())); buffer_data_sink_ = std::move(bds_ret.get()); diff --git a/docs/source/using-executorch-building-from-source.md b/docs/source/using-executorch-building-from-source.md index eae7fbabf57..8196c7d39df 100644 --- a/docs/source/using-executorch-building-from-source.md +++ b/docs/source/using-executorch-building-from-source.md @@ -80,6 +80,14 @@ portability details. ./install_executorch.sh --pybind off ``` + For development, install the package in `--editable` mode, which allows you to modify Python source code and see changes reflected immediately.
+ ``` + ./install_executorch.sh --editable [--pybind xnnpack] + + # Or you can directly do the following if dependencies are already installed. + pip install -e . + ``` + > **_NOTE:_** Cleaning the build system > > When fetching a new version of the upstream repo (via `git fetch` or `git diff --git a/examples/models/checkpoint.py b/examples/models/checkpoint.py index ee3fb560429..c84a689b951 100644 --- a/examples/models/checkpoint.py +++ b/examples/models/checkpoint.py @@ -64,7 +64,7 @@ def get_checkpoint_dtype(checkpoint: Dict[str, Any]) -> Optional[str]: mismatched_dtypes = [ (key, value.dtype) for key, value in checkpoint.items() - if value.dtype != dtype + if hasattr(value, "dtype") and value.dtype != dtype ] if len(mismatched_dtypes) > 0: print( diff --git a/examples/models/llama/runner/generation.py b/examples/models/llama/runner/generation.py index 3e9ceb34af5..4ba645ffd87 100644 --- a/examples/models/llama/runner/generation.py +++ b/examples/models/llama/runner/generation.py @@ -4,6 +4,7 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +import time from abc import ABC, abstractmethod from typing import List, Optional @@ -97,6 +98,7 @@ def generate( # noqa: C901 pos_base: int = 0, ) -> List[int]: # Prefill + prefill_start = time.time() logits = self.forward( tokens=torch.tensor([prompt_tokens], dtype=torch.long, device=self.device), input_pos=( @@ -105,11 +107,13 @@ def generate( # noqa: C901 else None ), ) + prefill_time = time.time() - prefill_start current_token = next_token(logits, temperature, top_p) print(f"{self.tokenizer.decode_token(current_token)}", end="", flush=True) tokens = prompt_tokens + [current_token] + generate_start = time.time() while len(tokens) < max_seq_len: if self.use_kv_cache: logits = self.forward( @@ -140,6 +144,10 @@ def generate( # noqa: C901 print(f"{self.tokenizer.decode_token(current_token)}", end="", flush=True) print("\n") + generate_time = time.time() - generate_start + print(f"Prefill time: {prefill_time}") + print(f"Generation tok/s: {len(tokens) / generate_time}") + return tokens if echo else tokens[len(prompt_tokens) :] def text_completion( diff --git a/examples/qualcomm/oss_scripts/llama/model/static_llama.py b/examples/qualcomm/oss_scripts/llama/model/static_llama.py index 40044db7428..ea8e2f5d319 100755 --- a/examples/qualcomm/oss_scripts/llama/model/static_llama.py +++ b/examples/qualcomm/oss_scripts/llama/model/static_llama.py @@ -461,7 +461,7 @@ def get_metadata(self): "get_bos_id": 1, "get_eos_id": 2, "get_dim": self.dim, - "get_head_dim": self.dim // self.n_heads, + "get_head_dim": self.head_dim, "get_max_batch_size": self.max_batch_size, "get_max_seq_len": self.max_seq_len, "get_n_bos": 1, diff --git a/exir/_serialize/_named_data_store.py b/exir/_serialize/_named_data_store.py index 999913a4bb0..2c2d975937e 100644 --- a/exir/_serialize/_named_data_store.py +++ b/exir/_serialize/_named_data_store.py @@ -181,3 +181,30 @@ def get_named_data_store_output(self) -> NamedDataStoreOutput: # Clean up empty maps inside self.external_data self.external_data = {k: v for k, v in self.external_data.items() if len(v) > 0} return NamedDataStoreOutput(self.buffers, self.pte_data, self.external_data) + + def merge_named_data_store(self, other: NamedDataStoreOutput) -> None: + """ + Merge another NamedDataStore into this one. + Args: + other (NamedDataStore): the other NamedDataStore to merge. 
+ Raises: + ValueError: when the key exists in both stores, and corresponding + data is different between them. + """ + # Merge the pte_data. + for key, buffer_idx in other.pte_data.items(): + self.add_named_data( + key, + other.buffers[buffer_idx].buffer, + other.buffers[buffer_idx].alignment, + ) + + # Merge the external_data. + for filename, key_to_buffer_idx in other.external_data.items(): + for key, buffer_idx in key_to_buffer_idx.items(): + self.add_named_data( + key, + other.buffers[buffer_idx].buffer, + other.buffers[buffer_idx].alignment, + external_tag=filename, + ) diff --git a/exir/_serialize/test/test_named_data_store.py b/exir/_serialize/test/test_named_data_store.py index d5355f6d7bf..ffe6f2ddce7 100644 --- a/exir/_serialize/test/test_named_data_store.py +++ b/exir/_serialize/test/test_named_data_store.py @@ -83,3 +83,62 @@ def test_add_duplicate_key_fail(self) -> None: self.assertEqual(len(output.pte_data), 1) self.assertEqual(output.pte_data["key"], 0) self.assertEqual(len(output.external_data), 0) + + def test_merge(self) -> None: + store1 = NamedDataStore() + store1.add_named_data("key1", b"data1", None, None) + store1.add_named_data("key2", b"data2", 16, "file1") + + # Check items in the store1. + output = store1.get_named_data_store_output() + self.assertEqual(len(output.buffers), 2) + self.assertEqual(len(output.pte_data), 1) + self.assertEqual(len(output.external_data), 1) + self.assertEqual(len(output.external_data["file1"]), 1) + + store2 = NamedDataStore() + store2.add_named_data("key1", b"data1", None, None) + store2.add_named_data("key3", b"data3", None, None) + store2.add_named_data("key4", b"data4", 16, "file1") + store2.add_named_data("key5", b"data5", 16, "file2") + + # Check items in store2. + output2 = store2.get_named_data_store_output() + self.assertEqual(len(output2.buffers), 4) + self.assertEqual(len(output2.pte_data), 2) + self.assertEqual(len(output2.external_data), 2) + self.assertEqual(len(output2.external_data["file1"]), 1) + self.assertEqual(len(output2.external_data["file2"]), 1) + + # Merge store2 into store1. + store1.merge_named_data_store(output2) + + # Check items in store2 are merged into store1. + output = store1.get_named_data_store_output() + # key1, data1 exist in both store1 and store2, so we only have one copy of it. + self.assertEqual(len(output.buffers), 5) + self.assertEqual(len(output.pte_data), 2) + self.assertEqual(len(output.external_data), 2) + self.assertEqual(len(output.external_data["file1"]), 2) + self.assertEqual(len(output.external_data["file2"]), 1) + + def test_merge_duplicate_error(self) -> None: + store1 = NamedDataStore() + store1.add_named_data("key1", b"data1", None, None) + + # Check items in the store1. + output = store1.get_named_data_store_output() + self.assertEqual(len(output.buffers), 1) + self.assertEqual(len(output.pte_data), 1) + + store2 = NamedDataStore() + store2.add_named_data("key1", b"data2", None, None) + + # Check items in store2. + output2 = store2.get_named_data_store_output() + self.assertEqual(len(output2.buffers), 1) + self.assertEqual(len(output2.pte_data), 1) + + # Merge store2 into store1 raises error as key1 is already in store1 + # with different data. + self.assertRaises(ValueError, store1.merge_named_data_store, output2) diff --git a/exir/backend/backend_api.py b/exir/backend/backend_api.py index 966cae5f022..519f184871a 100644 --- a/exir/backend/backend_api.py +++ b/exir/backend/backend_api.py @@ -1,5 +1,6 @@ # Copyright (c) Meta Platforms, Inc. and affiliates. 
# All rights reserved. +# Copyright 2025 Arm Limited and/or its affiliates. # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. @@ -56,9 +57,9 @@ def to_backend( ) -> LoweredBackendModule: def to_backend( - graph_module: torch.fx.GraphModule, - partitioner: Type[TPartitioner], - ) -> torch.fx.GraphModule + edge_program: ExportedProgram, + partitioner: Partitioner, + ) -> ExportedProgram: """ pass diff --git a/exir/program/_program.py b/exir/program/_program.py index 5a9c101a06a..8295907d090 100644 --- a/exir/program/_program.py +++ b/exir/program/_program.py @@ -978,6 +978,18 @@ def _remove_invalid_ops_for_not_decompose( ) -> List[torch._ops.OpOverload]: # To address https://github.com/pytorch/executorch/issues/8781 def keep(op): + # Explicit allow list + allow_list = [] + try: + # Ops in torch.ops.quant are not always loaded, so we use try/except + # Aliases output, but we need to allow it for XNNPACK + allow_list.append(torch.ops.quant.choose_qparams_affine.default) + except: + pass + + if op in allow_list: + return True + schema = op._schema native_schema = _pybind_schema_to_native_schema(schema) if native_schema.is_mutable: diff --git a/exir/verification/TARGETS b/exir/verification/TARGETS index 8ee9e5546e3..092b48658df 100644 --- a/exir/verification/TARGETS +++ b/exir/verification/TARGETS @@ -10,13 +10,11 @@ cpp_python_extension( "bindings.cpp", ], deps = [ + "fbsource//third-party/pybind11:pybind11", "//caffe2:torch-cpp-cpu", "//caffe2:torch_extension", "//caffe2/c10:c10", ], - external_deps = [ - "pybind11", - ], ) python_library( diff --git a/extension/llm/export/builder.py b/extension/llm/export/builder.py index 47ad30e9390..eb8dd462378 100644 --- a/extension/llm/export/builder.py +++ b/extension/llm/export/builder.py @@ -448,6 +448,8 @@ def to_edge_transform_and_lower( compile_config=edge_config, constant_methods=self.metadata, ) + if self.verbose: + logging.info(f"Exported graph:\n{self.edge_manager.exported_program()}") return self def to_executorch( diff --git a/extension/pybindings/TARGETS b/extension/pybindings/TARGETS index 17ccbb2477c..2e77127bf56 100644 --- a/extension/pybindings/TARGETS +++ b/extension/pybindings/TARGETS @@ -70,5 +70,8 @@ runtime.python_library( "//executorch/runtime/...", "@EXECUTORCH_CLIENTS", ], - deps = [":_portable_lib"], + deps = [ + ":_portable_lib", + "//executorch/exir:_warnings", + ], ) diff --git a/extension/pytree/TARGETS b/extension/pytree/TARGETS index 400a5b9504c..005c5c9c2d7 100644 --- a/extension/pytree/TARGETS +++ b/extension/pytree/TARGETS @@ -16,11 +16,9 @@ cpp_python_extension( ], base_module = "executorch.extension.pytree", deps = [ + "fbsource//third-party/pybind11:pybind11", ":pytree", ], - external_deps = [ - "pybind11", - ], ) cpp_python_extension( @@ -30,11 +28,9 @@ cpp_python_extension( ], base_module = "executorch.extension.pytree", deps = [ + "fbsource//third-party/pybind11:pybind11", ":pytree", ], - external_deps = [ - "pybind11", - ], ) python_library( diff --git a/extension/threadpool/cpuinfo_utils.cpp b/extension/threadpool/cpuinfo_utils.cpp index 5dc3fa7fae5..21862fbd4aa 100644 --- a/extension/threadpool/cpuinfo_utils.cpp +++ b/extension/threadpool/cpuinfo_utils.cpp @@ -6,6 +6,7 @@ * LICENSE file in the root directory of this source tree. 
*/ +#include #include #include @@ -84,7 +85,7 @@ bool populate_available_cpu_mids() { cpu_midrs->resize(num_possible_cores); const std::string kMidrFilePathPrefix = "/sys/devices/system/cpu/cpu"; const std::string kMidrFilePathSuffix = "/regs/identification/midr_el1"; - for (int32_t i = 0; i < num_possible_cores; ++i) { + for (const auto i : c10::irange(num_possible_cores)) { std::string midr_file_path = kMidrFilePathPrefix + std::to_string(i) + kMidrFilePathSuffix; ET_LOG(Info, "Reading file %s", midr_file_path.c_str()); @@ -115,7 +116,7 @@ uint32_t _get_num_performant_cores() { ET_LOG(Info, "CPU info and manual query on # of cpus dont match."); return 0; } - for (int32_t i = 0; i < cpu_midrs->size(); ++i) { + for (const auto i : c10::irange(cpu_midrs->size())) { uint32_t masked_midr = (*cpu_midrs)[i] & RIVISION_MASK; switch (masked_midr) { case CPUINFO_ARM_MIDR_CORTEX_A520: @@ -148,7 +149,7 @@ uint32_t get_num_performant_cores() { uint32_t num_possible_cores = cpuinfo_get_processors_count(); uint32_t num_non_performant_core = 0; if (uarch_count > 1) { - for (int32_t i = 0; i < uarch_count; ++i) { + for (const auto i : c10::irange(uarch_count)) { const struct cpuinfo_uarch_info* uarch_info = cpuinfo_get_uarch(i); if (is_non_performant_core(uarch_info)) { num_non_performant_core += uarch_info->processor_count; diff --git a/extension/threadpool/targets.bzl b/extension/threadpool/targets.bzl index 4a7185ce972..8bb0398b385 100644 --- a/extension/threadpool/targets.bzl +++ b/extension/threadpool/targets.bzl @@ -23,6 +23,7 @@ def define_common_targets(): srcs = _THREADPOOL_SRCS, deps = [ "//executorch/runtime/core:core", + "//executorch/runtime/core/portable_type/c10/c10:c10", ], exported_headers = _THREADPOOL_HEADERS, exported_deps = [ diff --git a/extension/training/CMakeLists.txt b/extension/training/CMakeLists.txt index e50bb3c71eb..97e75955837 100644 --- a/extension/training/CMakeLists.txt +++ b/extension/training/CMakeLists.txt @@ -26,7 +26,7 @@ target_include_directories( target_include_directories(extension_training PUBLIC ${EXECUTORCH_ROOT}/..) target_compile_options(extension_training PUBLIC ${_common_compile_options}) target_link_libraries(extension_training executorch_core - extension_data_loader extension_module extension_tensor) + extension_data_loader extension_module extension_tensor extension_flat_tensor) list(TRANSFORM _train_xor__srcs PREPEND "${EXECUTORCH_ROOT}/") diff --git a/extension/training/examples/XOR/export_model.py b/extension/training/examples/XOR/export_model.py index bfbe0ce2138..98e04f09a2f 100644 --- a/extension/training/examples/XOR/export_model.py +++ b/extension/training/examples/XOR/export_model.py @@ -11,14 +11,14 @@ import os import torch -from executorch.exir import to_edge +from executorch.exir import ExecutorchBackendConfig, to_edge from executorch.extension.training.examples.XOR.model import Net, TrainingNet from torch.export import export from torch.export.experimental import _export_forward_backward -def _export_model(): +def _export_model(external_mutable_weights: bool = False): net = TrainingNet(Net()) x = torch.randn(1, 2) @@ -30,7 +30,11 @@ def _export_model(): # Lower the graph to edge dialect. ep = to_edge(ep) # Lower the graph to executorch. 
- ep = ep.to_executorch() + ep = ep.to_executorch( + config=ExecutorchBackendConfig( + external_mutable_weights=external_mutable_weights + ) + ) return ep @@ -44,19 +48,27 @@ def main() -> None: "--outdir", type=str, required=True, - help="Path to the directory to write xor.pte files to", + help="Path to the directory to write xor.pte and xor.ptd files to", + ) + parser.add_argument( + "--external", + action="store_true", + help="Export the model with external weights", ) args = parser.parse_args() - ep = _export_model() + ep = _export_model(args.external) # Write out the .pte file. os.makedirs(args.outdir, exist_ok=True) outfile = os.path.join(args.outdir, "xor.pte") with open(outfile, "wb") as fp: - fp.write( - ep.buffer, - ) + ep.write_to_file(fp) + + if args.external: + # current infra doesnt easily allow renaming this file, so just hackily do it here. + ep._tensor_data["xor"] = ep._tensor_data.pop("_default_external_constant") + ep.write_tensor_data_to_file(args.outdir) if __name__ == "__main__": diff --git a/extension/training/examples/XOR/train.cpp b/extension/training/examples/XOR/train.cpp index 746daebbf1b..af1c37a6a50 100644 --- a/extension/training/examples/XOR/train.cpp +++ b/extension/training/examples/XOR/train.cpp @@ -23,12 +23,18 @@ using executorch::extension::training::optimizer::SGDOptions; using executorch::runtime::Error; using executorch::runtime::Result; DEFINE_string(model_path, "xor.pte", "Model serialized in flatbuffer format."); +DEFINE_string(ptd_path, "", "Model weights serialized in flatbuffer format."); int main(int argc, char** argv) { gflags::ParseCommandLineFlags(&argc, &argv, true); - if (argc != 1) { + if (argc == 0) { + ET_LOG(Error, "Please provide a model path."); + return 1; + } else if (argc > 2) { std::string msg = "Extra commandline args: "; - for (int i = 1 /* skip argv[0] (program name) */; i < argc; i++) { + for (int i = 2 /* skip argv[0] (pte path) and argv[1] (ptd path) */; + i < argc; + i++) { msg += argv[i]; } ET_LOG(Error, "%s", msg.c_str()); @@ -46,7 +52,21 @@ int main(int argc, char** argv) { auto loader = std::make_unique( std::move(loader_res.get())); - auto mod = executorch::extension::training::TrainingModule(std::move(loader)); + std::unique_ptr ptd_loader = nullptr; + if (!FLAGS_ptd_path.empty()) { + executorch::runtime::Result + ptd_loader_res = + executorch::extension::FileDataLoader::from(FLAGS_ptd_path.c_str()); + if (ptd_loader_res.error() != Error::Ok) { + ET_LOG(Error, "Failed to open ptd file: %s", FLAGS_ptd_path.c_str()); + return 1; + } + ptd_loader = std::make_unique( + std::move(ptd_loader_res.get())); + } + + auto mod = executorch::extension::training::TrainingModule( + std::move(loader), nullptr, nullptr, nullptr, std::move(ptd_loader)); // Create full data set of input and labels. 
std::vector(param_res.error())); return 1; } @@ -112,5 +135,6 @@ int main(int argc, char** argv) { std::string(param.first.data()), param.second}); } - executorch::extension::flat_tensor::save_ptd("xor.ptd", param_map, 16); + executorch::extension::flat_tensor::save_ptd( + "trained_xor.ptd", param_map, 16); } diff --git a/extension/training/pybindings/TARGETS b/extension/training/pybindings/TARGETS index 6aa11ea6726..19b54961493 100644 --- a/extension/training/pybindings/TARGETS +++ b/extension/training/pybindings/TARGETS @@ -17,13 +17,11 @@ runtime.cxx_python_extension( types = ["_training_lib.pyi"], visibility = ["//executorch/extension/training/..."], deps = [ + "fbsource//third-party/pybind11:pybind11", "//executorch/extension/aten_util:aten_bridge", "//executorch/extension/training/optimizer:sgd", ], - external_deps = [ - "pybind11", - "libtorch_python", - ], + external_deps = ["libtorch_python"], ) runtime.python_library( diff --git a/kernels/portable/cpu/op__to_dim_order_copy.cpp b/kernels/portable/cpu/op__to_dim_order_copy.cpp index efb74e3a01f..40ce86e8fdc 100644 --- a/kernels/portable/cpu/op__to_dim_order_copy.cpp +++ b/kernels/portable/cpu/op__to_dim_order_copy.cpp @@ -6,6 +6,8 @@ * LICENSE file in the root directory of this source tree. */ +#include + #include #include #include @@ -41,7 +43,7 @@ int64_t coordinateToIndexWithDimOrder( dim_order_to_stride_nocheck( sizes.data(), dim_order.data(), sizes.size(), strides); - for (size_t i = 0; i < self.dim(); ++i) { + for (const auto i : c10::irange(self.dim())) { index += cur_indices[i] * strides[i]; } return index; @@ -59,7 +61,7 @@ void _to_dim_order_copy_impl(const Tensor& self, Tensor& out) { for (ssize_t i = 0; i < self.numel(); i++) { // Update the current indices. for (ssize_t j = self.dim() - 1; j >= 0; j--) { - if (coordinate[j] + 1 < self.size(j)) { + if (coordinate[j] + 1 < static_cast(self.size(j))) { coordinate[j]++; break; } else { diff --git a/kernels/portable/cpu/op_amax.cpp b/kernels/portable/cpu/op_amax.cpp index 9f879179ec6..d36f416c7b4 100644 --- a/kernels/portable/cpu/op_amax.cpp +++ b/kernels/portable/cpu/op_amax.cpp @@ -6,6 +6,7 @@ * LICENSE file in the root directory of this source tree. */ +#include #include #include @@ -44,7 +45,7 @@ Tensor& amax_out( ET_SWITCH_REALHBBF16_TYPES(in.scalar_type(), ctx, "amax.out", CTYPE, [&]() { CTYPE* out_data = out.mutable_data_ptr(); - for (size_t out_ix = 0; out_ix < out.numel(); ++out_ix) { + for (const auto out_ix : c10::irange(out.numel())) { out_data[out_ix] = reduce_over_dim_list( [](CTYPE v, CTYPE max_v) { return std::isnan(v) || v > max_v ? v : max_v; diff --git a/kernels/portable/cpu/op_amin.cpp b/kernels/portable/cpu/op_amin.cpp index 4f6f3ce52e5..7c4c8186e59 100644 --- a/kernels/portable/cpu/op_amin.cpp +++ b/kernels/portable/cpu/op_amin.cpp @@ -5,7 +5,7 @@ * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ - +#include #include #include @@ -44,7 +44,7 @@ Tensor& amin_out( ET_SWITCH_REALHBBF16_TYPES(in.scalar_type(), ctx, "amin.out", CTYPE, [&]() { CTYPE* out_data = out.mutable_data_ptr(); - for (size_t out_ix = 0; out_ix < out.numel(); ++out_ix) { + for (const auto out_ix : c10::irange(out.numel())) { out_data[out_ix] = reduce_over_dim_list( [](CTYPE v, CTYPE min_v) { return std::isnan(v) || v < min_v ? 
v : min_v; diff --git a/kernels/portable/cpu/op_argmax.cpp b/kernels/portable/cpu/op_argmax.cpp index 5eb656d5b76..39ad0171d5d 100644 --- a/kernels/portable/cpu/op_argmax.cpp +++ b/kernels/portable/cpu/op_argmax.cpp @@ -6,6 +6,7 @@ * LICENSE file in the root directory of this source tree. */ +#include #include #include @@ -46,7 +47,7 @@ Tensor& argmax_out( ET_SWITCH_REALHBF16_TYPES(in.scalar_type(), ctx, "argmax.out", CTYPE, [&] { long* out_data = out.mutable_data_ptr(); - for (size_t out_ix = 0; out_ix < out.numel(); ++out_ix) { + for (const auto out_ix : c10::irange(out.numel())) { std::tuple acc = reduce_over_dim( [](CTYPE v, long ix, CTYPE acc_val, long acc_ix) { if (!std::isnan(acc_val) && (std::isnan(v) || v > acc_val)) { diff --git a/kernels/portable/cpu/op_argmin.cpp b/kernels/portable/cpu/op_argmin.cpp index 1c4a2572ea8..8148efa6264 100644 --- a/kernels/portable/cpu/op_argmin.cpp +++ b/kernels/portable/cpu/op_argmin.cpp @@ -6,6 +6,7 @@ * LICENSE file in the root directory of this source tree. */ +#include #include #include @@ -46,7 +47,7 @@ Tensor& argmin_out( ET_SWITCH_REALHBF16_TYPES(in.scalar_type(), ctx, "argmin.out", CTYPE, [&] { long* out_data = out.mutable_data_ptr(); - for (size_t out_ix = 0; out_ix < out.numel(); ++out_ix) { + for (const auto out_ix : c10::irange(out.numel())) { std::tuple acc = reduce_over_dim( [](CTYPE v, long ix, CTYPE acc_val, long acc_ix) { if (!std::isnan(acc_val) && (std::isnan(v) || v < acc_val)) { diff --git a/kernels/portable/cpu/op_expand_copy.cpp b/kernels/portable/cpu/op_expand_copy.cpp index f1a7bfbf1fb..6c8685dd867 100644 --- a/kernels/portable/cpu/op_expand_copy.cpp +++ b/kernels/portable/cpu/op_expand_copy.cpp @@ -96,7 +96,8 @@ Tensor& expand_copy_out( ET_KERNEL_CHECK( ctx, - repeat_tensor(self, {repeats, repeats_size}, out) == Error::Ok, + repeat_tensor(self, makeArrayRef(repeats, repeats_size), out) == + Error::Ok, InvalidArgument, out); diff --git a/kernels/portable/cpu/util/activation_ops_util.cpp b/kernels/portable/cpu/util/activation_ops_util.cpp index fe26d4fda04..abde15f8740 100644 --- a/kernels/portable/cpu/util/activation_ops_util.cpp +++ b/kernels/portable/cpu/util/activation_ops_util.cpp @@ -31,7 +31,7 @@ bool check_glu_args(const Tensor& in, int64_t dim, Tensor& out) { ET_LOG_AND_RETURN_IF_FALSE(tensor_is_floating_type(in)); const size_t non_negative_dim = dim < 0 ? dim + in.dim() : dim; - const size_t dim_size = in.size(non_negative_dim); + const ssize_t dim_size = in.size(non_negative_dim); ET_CHECK_OR_RETURN_FALSE( dim_size % 2 == 0, diff --git a/kernels/portable/cpu/util/broadcast_util.cpp b/kernels/portable/cpu/util/broadcast_util.cpp index d8569d23c2f..381e07cbe30 100644 --- a/kernels/portable/cpu/util/broadcast_util.cpp +++ b/kernels/portable/cpu/util/broadcast_util.cpp @@ -6,6 +6,7 @@ * LICENSE file in the root directory of this source tree. */ +#include #include #include #include @@ -274,7 +275,7 @@ void delinearize_index( size_t* out_indexes, const size_t out_indexes_len) { ET_CHECK(shape.size() <= out_indexes_len); - for (auto i = 0; i < shape.size(); ++i) { + for (size_t i = 0; i < shape.size(); ++i) { auto dim = shape.size() - 1 - i; auto dim_size = shape[dim]; out_indexes[dim] = linear_index % dim_size; @@ -304,7 +305,8 @@ size_t linearize_access_indexes( size_t linear_index = 0; for (size_t i = 0; i < indexes_broadcast_from.size(); ++i) { // If this dimension is broadcasted, add zero to the linear address. 
- if (indexes_broadcast_from[i] >= broadcast_from_shape[i]) { + if (indexes_broadcast_from[i] >= + static_cast(broadcast_from_shape[i])) { ET_CHECK_MSG( broadcast_from_shape[i] == 1, "Expected dim size == 1 if broadcasted, but actual dim size is %zu", diff --git a/kernels/portable/cpu/util/copy_ops_util.h b/kernels/portable/cpu/util/copy_ops_util.h index 8efd6057dba..e7399ae0956 100644 --- a/kernels/portable/cpu/util/copy_ops_util.h +++ b/kernels/portable/cpu/util/copy_ops_util.h @@ -7,6 +7,7 @@ */ #pragma once +#include #include @@ -26,8 +27,8 @@ void _as_strided_copy( ArrayRef stride, int64_t dim) { // the last dimension, copy data - if (dim == size.size() - 1) { - for (size_t i = 0; i < size.at(dim); ++i) { + if (dim == static_cast(size.size()) - 1) { + for (const auto i : c10::irange(size.at(dim))) { output_data[i] = *input_data; input_data += stride.at(dim); } @@ -35,7 +36,7 @@ void _as_strided_copy( } size_t trailing_dims = getTrailingDims(out, dim); // recursively set data for the next dimension - for (size_t i = 0; i < size.at(dim); ++i) { + for ([[maybe_unused]] const auto i : c10::irange(size.at(dim))) { _as_strided_copy( input_data, output_data, out, size, stride, dim + 1); input_data += stride.at(dim); diff --git a/kernels/portable/cpu/util/functional_util.h b/kernels/portable/cpu/util/functional_util.h index cdf90813772..609a1a26fa5 100644 --- a/kernels/portable/cpu/util/functional_util.h +++ b/kernels/portable/cpu/util/functional_util.h @@ -8,6 +8,8 @@ #pragma once +#include + #include #include @@ -30,7 +32,7 @@ inline CTYPE apply_unary_reduce_fn( const int64_t size, const int64_t stride = 1) { CTYPE acc_val = data_in[0]; - for (size_t i = 1; i < size; i++) { + for (const auto i : c10::irange(1, size)) { acc_val = reduce_fun(data_in[i * stride], acc_val); } return acc_val; @@ -51,7 +53,7 @@ inline void apply_unary_map_fn( CTYPE_OUT* const data_out, const int64_t size, const int64_t stride = 1) { - for (size_t i = 0; i < size; i++) { + for (const auto i : c10::irange(size)) { data_out[i * stride] = map_fun(data_in[i * stride]); } } @@ -77,7 +79,7 @@ inline CTYPE_OUT apply_unary_map_reduce_fn( const int64_t size, const int64_t stride = 1) { CTYPE_OUT acc_val = map_fun(data_in[0]); - for (size_t i = 1; i < size; ++i) { + for (const auto i : c10::irange(1, size)) { acc_val = reduce_fun(map_fun(data_in[i * stride]), acc_val); } return acc_val; diff --git a/kernels/portable/cpu/util/reduce_util.cpp b/kernels/portable/cpu/util/reduce_util.cpp index 2902cbfc138..09ba508a31d 100644 --- a/kernels/portable/cpu/util/reduce_util.cpp +++ b/kernels/portable/cpu/util/reduce_util.cpp @@ -48,8 +48,7 @@ ET_NODISCARD bool check_dim_list_is_valid( } const size_t non_neg_d = _normalize_non_neg_d(d, in.dim()); - ET_LOG_AND_RETURN_IF_FALSE( - non_neg_d < kTensorDimensionLimit && non_neg_d >= 0); + ET_LOG_AND_RETURN_IF_FALSE(non_neg_d < kTensorDimensionLimit); ET_CHECK_OR_RETURN_FALSE( dim_exist[non_neg_d] == false, @@ -86,7 +85,7 @@ size_t get_reduced_dim_product( } size_t dim_product = 1; if (!dim.has_value()) { - for (size_t i = 0; i < in.dim(); ++i) { + for (size_t i = 0; i < static_cast(in.dim()); ++i) { dim_product *= in.size(i); } return dim_product; @@ -108,7 +107,7 @@ size_t get_reduced_dim_product( size_t dim_product = 1; const size_t in_dim = in.dim(); if (!dim_list.has_value() || dim_list.value().size() == 0) { - for (size_t i = 0; i < in.dim(); ++i) { + for (size_t i = 0; i < static_cast(in.dim()); ++i) { dim_product *= in.size(i); } return dim_product; @@ -136,7 +135,7 @@ size_t 
get_out_numel( ET_CHECK_VALID_DIM(dim_val, in.dim()); } const size_t non_neg_dim = _normalize_non_neg_d(dim_val, in.dim()); - for (size_t d = 0; d < in.dim(); ++d) { + for (size_t d = 0; d < static_cast(in.dim()); ++d) { if (d != non_neg_dim) { out_numel *= in.size(d); } @@ -155,7 +154,7 @@ size_t get_out_numel( dim_list) { size_t out_numel = 1; if (dim_list.has_value() && dim_list.value().size() != 0) { - for (size_t d = 0; d < in.dim(); ++d) { + for (size_t d = 0; d < static_cast(in.dim()); ++d) { if (!check_dim_in_dim_list(d, in.dim(), dim_list.value())) { out_numel *= in.size(d); } @@ -234,7 +233,7 @@ size_t compute_reduced_out_size( if (dim.has_value()) { const auto dim_val = dim.value(); const size_t non_neg_dim = _normalize_non_neg_d(dim_val, in_dim); - for (ssize_t i = 0; i < non_neg_dim; ++i) { + for (size_t i = 0; i < non_neg_dim; ++i) { sizes_arr[i] = in.size(i); } if (keepdim) { @@ -250,7 +249,7 @@ size_t compute_reduced_out_size( } } else { if (keepdim) { - for (size_t i = 0; i < in_dim; ++i) { + for (size_t i = 0; i < static_cast(in_dim); ++i) { sizes_arr[i] = 1; } } else { @@ -266,7 +265,9 @@ size_t compute_reduced_out_size( dim_list, bool keepdim, executorch::aten::SizesType* sizes_arr) { - const auto in_dim = in.dim(); + // check_dim_in_dim_list and later comparisons + // expect in_dim to be size_t, so cast it here + const size_t in_dim = static_cast(in.dim()); size_t out_dim = in_dim; if (dim_list.has_value() && dim_list.value().size() != 0) { diff --git a/kernels/portable/cpu/util/reduce_util.h b/kernels/portable/cpu/util/reduce_util.h index 25a2c0b44c4..35cfdfbaa72 100644 --- a/kernels/portable/cpu/util/reduce_util.h +++ b/kernels/portable/cpu/util/reduce_util.h @@ -50,7 +50,7 @@ void apply_on_flat_ix_with_dim_mask_and_base( const size_t start, const size_t end) { // Compute innermost dim from dim list - size_t inner_dim = in.dim() - 1; + int64_t inner_dim = in.dim() - 1; while (!dim_mask[inner_dim]) { inner_dim--; } @@ -58,7 +58,7 @@ void apply_on_flat_ix_with_dim_mask_and_base( // Initialize array of indices per dimension. This array is used to maintain // the per-dimension index of the element in `in` that is being reduced over // Only the dims that are in the dim list are relevant. - size_t dim_index[kTensorDimensionLimit]; + int64_t dim_index[kTensorDimensionLimit]; for (int64_t d = 0; d < in.dim(); d++) { dim_index[d] = 0; } diff --git a/kernels/portable/cpu/util/repeat_util.cpp b/kernels/portable/cpu/util/repeat_util.cpp index 925fda9f793..be7231cb621 100644 --- a/kernels/portable/cpu/util/repeat_util.cpp +++ b/kernels/portable/cpu/util/repeat_util.cpp @@ -8,6 +8,7 @@ #include +#include #include #include #include @@ -26,7 +27,7 @@ bool check_repeat_args( Tensor& out) { // Ensure the self tensors list is non-empty. ET_CHECK_OR_RETURN_FALSE( - repeats.size() >= self.dim(), + static_cast(repeats.size()) >= self.dim(), "Number of dimensions of repeat dims can not be smaller than number of dimensions of tensor"); // Repeat arrayref shall not contain negative element. @@ -39,7 +40,7 @@ bool check_repeat_args( /// Check if out.size() is legal. ET_CHECK_OR_RETURN_FALSE( - out.dim() == repeats.size(), + static_cast(out.dim()) == repeats.size(), "The dimension of out shall equal size of repeats, but now is %zd and %zd", out.dim(), repeats.size()); @@ -48,7 +49,7 @@ bool check_repeat_args( // kTensorDimensionLimit. 
Only check out tensor because the number of // dimension of out tensor shall have more than or equal to self tensor ET_CHECK_OR_RETURN_FALSE( - out.dim() <= kTensorDimensionLimit, + static_cast(out.dim()) <= kTensorDimensionLimit, "The dimension of input and output should not be larger than %zd", kTensorDimensionLimit); @@ -58,7 +59,7 @@ bool check_repeat_args( // repeats, and called it reformat_self_size. We then make point-to-point mul // of reformat_self_size and repeats. The result should equal out.size(). size_t reformat_self_size[kTensorDimensionLimit]; - for (size_t i = 0; i < out.dim() - self.dim(); i++) { + for (ssize_t i = 0; i < out.dim() - self.dim(); i++) { reformat_self_size[i] = 1; } @@ -131,7 +132,7 @@ void repeat_internal( // The increment along index of slot array to reach the next possible valid // value. int64_t incr[kTensorDimensionLimit]; - for (size_t i = 0; i < self_dim; i++) { + for (size_t i = 0; i < static_cast(self_dim); i++) { incr[i] = self_size[i]; } @@ -141,7 +142,7 @@ void repeat_internal( // than self). size_t index = self_dim - 1; size_t start = out.dim() - self_dim; - while (slots[0] != out.size(start)) { + while (slots[0] != static_cast(out.size(start))) { // Compute the offset (from origin) in the out tensor where this self // data will be copied to. size_t offset = compute_access_offset(slots, strides, self_dim); @@ -151,7 +152,7 @@ void repeat_internal( slots[index] += incr[index]; // If we have reached the limit in the innermost dimension, successively // increment the slot index of outer dimensions. - while (slots[index] == out.size(start + index)) { + while (slots[index] == static_cast(out.size(start + index))) { if (index == 0) { break; } @@ -227,7 +228,7 @@ Error repeat_tensor( // so we reset the upper bound of innermost dim to 1. 'in_incr' indicates // the size (in bytes) of the self data. 
int64_t limits[kTensorDimensionLimit]; - for (size_t i = 0; i < self_dim; i++) { + for (ssize_t i = 0; i < self_dim; i++) { limits[i] = self_size[i]; } diff --git a/kernels/portable/cpu/util/targets.bzl b/kernels/portable/cpu/util/targets.bzl index eef765d5eec..2b22687274f 100644 --- a/kernels/portable/cpu/util/targets.bzl +++ b/kernels/portable/cpu/util/targets.bzl @@ -61,7 +61,6 @@ def define_common_targets(): "//executorch/runtime/core/exec_aten/util:scalar_type_util", "//executorch/runtime/core/exec_aten/util:tensor_util", ], - compiler_flags = ["-Wno-missing-prototypes"], visibility = ["//executorch/kernels/portable/cpu/..."], ) @@ -71,7 +70,6 @@ def define_common_targets(): exported_headers = [ "broadcast_util.h", ], - compiler_flags = ["-Wno-missing-prototypes"], deps = [ ":repeat_util", "//executorch/runtime/kernel:kernel_includes", diff --git a/kernels/prim_ops/et_view.cpp b/kernels/prim_ops/et_view.cpp index 0f041dae00f..7f66bca1725 100644 --- a/kernels/prim_ops/et_view.cpp +++ b/kernels/prim_ops/et_view.cpp @@ -32,7 +32,8 @@ bool get_view_target_size( executorch::aten::ArrayRef size, int64_t dim, executorch::aten::SizesType* out_size) { - ET_LOG_AND_RETURN_IF_FALSE(size.size() == dim); + ET_LOG_AND_RETURN_IF_FALSE( + dim >= 0 && size.size() == static_cast(dim)); int minus1_dim = -1; int n_zero = 0; int64_t numel_without_minus_1 = 1; diff --git a/runtime/core/data_loader.h b/runtime/core/data_loader.h index 45fd1bc8189..3dda5516908 100644 --- a/runtime/core/data_loader.h +++ b/runtime/core/data_loader.h @@ -69,12 +69,12 @@ class DataLoader { SegmentInfo() = default; explicit SegmentInfo( - Type segment_type, - size_t segment_index = 0, - const char* descriptor = nullptr) - : segment_type(segment_type), - segment_index(segment_index), - descriptor(descriptor) {} + Type segment_type_, + size_t segment_index_ = 0, + const char* descriptor_ = nullptr) + : segment_type(segment_type_), + segment_index(segment_index_), + descriptor(descriptor_) {} }; virtual ~DataLoader() = default; diff --git a/runtime/core/exec_aten/util/dim_order_util.h b/runtime/core/exec_aten/util/dim_order_util.h index 7a31db9d6ad..07b3d5c2a97 100644 --- a/runtime/core/exec_aten/util/dim_order_util.h +++ b/runtime/core/exec_aten/util/dim_order_util.h @@ -23,8 +23,8 @@ namespace runtime { namespace { template bool validate_dim_order(const DimOrderType* dim_order, const size_t dims) { - for (int32_t i = 0; i < dims; ++i) { - if (dim_order[i] >= dims) { + for (size_t i = 0; i < dims; ++i) { + if (dim_order[i] >= static_cast(dims)) { return false; } } @@ -43,8 +43,8 @@ template inline bool is_contiguous_dim_order( const DimOrderType* dim_order, const size_t dims) { - for (int i = 0; i < dims; ++i) { - if (dim_order[i] != i) { + for (size_t i = 0; i < dims; ++i) { + if (dim_order[i] != static_cast(i)) { return false; } } @@ -66,7 +66,7 @@ bool is_channels_last_dim_order( return false; } // 4-dim tensor is interpreted as NCHW, 5-dim tensor is interpreted as NCHWD - size_t channels_dim = 1; + DimOrderType channels_dim = 1; // Last value in the dim order should be the channels dim if (dim_order[dims - 1] != channels_dim) { return false; @@ -75,8 +75,8 @@ bool is_channels_last_dim_order( if (dim_order[0] != 0) { return false; } - int d = 1; - while (d < dims - 1) { + DimOrderType d = 1; + while (d < static_cast(dims) - 1) { if (dim_order[d] != d + 1) { return false; } @@ -163,8 +163,8 @@ struct StrideDimOrder { StridesType stride; DimOrderType dim_order; - StrideDimOrder(StridesType stride, DimOrderType dim_order) - : 
stride(stride), dim_order(dim_order) {} + StrideDimOrder(StridesType stride_, DimOrderType dim_order_) + : stride(stride_), dim_order(dim_order_) {} StrideDimOrder() = default; bool operator>(const StrideDimOrder& other) const { // descending order diff --git a/runtime/core/exec_aten/util/tensor_shape_to_c_string.cpp b/runtime/core/exec_aten/util/tensor_shape_to_c_string.cpp index cfd416285c5..02155a4d9b4 100644 --- a/runtime/core/exec_aten/util/tensor_shape_to_c_string.cpp +++ b/runtime/core/exec_aten/util/tensor_shape_to_c_string.cpp @@ -30,7 +30,9 @@ std::array tensor_shape_to_c_string_impl( } *p++ = '('; for (const auto elem : shape) { - if (elem < 0 || elem > internal::kMaximumPrintableTensorShapeElement) { + if (elem < 0 || + static_cast(elem) > + internal::kMaximumPrintableTensorShapeElement) { static_assert( internal::kMaximumPrintableTensorShapeElement > 99999, "must have room for error string!"); diff --git a/runtime/core/exec_aten/util/tensor_util.h b/runtime/core/exec_aten/util/tensor_util.h index eb5ce10b6f3..fcc08ebf98d 100644 --- a/runtime/core/exec_aten/util/tensor_util.h +++ b/runtime/core/exec_aten/util/tensor_util.h @@ -584,7 +584,7 @@ inline bool tensors_have_same_dtype( inline bool tensor_is_rank(executorch::aten::Tensor t, size_t rank) { ET_CHECK_OR_RETURN_FALSE( - t.dim() == rank, + static_cast(t.dim()) == rank, "Expected tensor.dim() to be %zu, but got %zu", static_cast(rank), static_cast(t.dim())); @@ -596,7 +596,7 @@ inline bool tensor_has_rank_greater_or_equal_to( executorch::aten::Tensor t, size_t rank) { ET_CHECK_OR_RETURN_FALSE( - t.dim() >= rank, + static_cast(t.dim()) >= rank, "Expected tensor.dim() to be >= %zu, but got %zu", static_cast(rank), static_cast(t.dim())); @@ -608,7 +608,7 @@ inline bool tensor_has_rank_smaller_or_equal_to( executorch::aten::Tensor t, size_t rank) { ET_CHECK_OR_RETURN_FALSE( - t.dim() <= rank, + static_cast(t.dim()) <= rank, "Expected tensor.dim() to be <= %zu, but got %zu", static_cast(rank), static_cast(t.dim())); @@ -665,12 +665,12 @@ inline bool tensors_have_same_size_at_dims( executorch::aten::Tensor b, size_t dim_b) { ET_CHECK_OR_RETURN_FALSE( - dim_a < a.dim(), + dim_a < static_cast(a.dim()), "Cannot retrieve dim %zu from tensor with dim %zu", static_cast(dim_a), static_cast(a.dim())); ET_CHECK_OR_RETURN_FALSE( - dim_b < b.dim(), + dim_b < static_cast(b.dim()), "Cannot retrieve dim %zu from tensor with dim %zu", static_cast(dim_b), static_cast(b.dim())); @@ -702,7 +702,9 @@ inline bool tensors_have_same_shape( static_cast(b.numel()), static_cast(a.dim()), static_cast(b.dim())); - for (size_t d = 0; d < ET_MIN2(a.dim(), b.dim()); ++d) { + // Using [[maybe_unused]] as ET_LOG may not trigger based on verbosity + for ([[maybe_unused]] const auto d : + c10::irange(ET_MIN2(a.dim(), b.dim()))) { ET_LOG( Error, " size(%zu): (%zu, %zu)", @@ -739,7 +741,8 @@ inline bool tensors_have_same_shape( static_cast(a.dim()), static_cast(b.dim()), static_cast(c.dim())); - for (size_t d = 0; d < ET_MIN3(a.dim(), b.dim(), c.dim()); ++d) { + for ([[maybe_unused]] const auto d : + c10::irange(ET_MIN3(a.dim(), b.dim(), c.dim()))) { ET_LOG( Error, " size(%zu): (%zu, %zu, %zu)", @@ -779,7 +782,8 @@ inline bool tensor_has_expected_size( static_cast(expected_sizes.size())); size_t a_dim = static_cast(a.dim()); size_t expected_dim = static_cast(expected_sizes.size()); - for (size_t d = 0; d < ET_MIN2(a_dim, expected_dim); ++d) { + for ([[maybe_unused]] const auto d : + c10::irange(ET_MIN2(a_dim, expected_dim))) { ET_LOG( Error, " size(%zu): (%zu, 
%zu)", @@ -802,7 +806,8 @@ inline bool tensors_have_same_strides( ET_TENSOR_CHECK_PREFIX__ ": dim=(%zu, %zu)", static_cast(a.dim()), static_cast(b.dim())); - for (size_t d = 0; d < ET_MIN2(a.dim(), b.dim()); ++d) { + for ([[maybe_unused]] const auto d : + c10::irange(ET_MIN2(a.dim(), b.dim()))) { ET_LOG( Error, " stride(%zu): (%zu, %zu)", @@ -827,7 +832,8 @@ inline bool tensors_have_same_strides( static_cast(a.dim()), static_cast(b.dim()), static_cast(c.dim())); - for (size_t d = 0; d < ET_MIN3(a.dim(), b.dim(), c.dim()); ++d) { + for ([[maybe_unused]] const auto d : + c10::irange(ET_MIN3(a.dim(), b.dim(), c.dim()))) { ET_LOG( Error, " stride(%zu): (%zu, %zu, %zu)", @@ -894,7 +900,7 @@ inline size_t getLeadingDims( dim, ssize_t(tensor.dim())); size_t dims = 1; - for (size_t i = 0; i < dim; ++i) { + for (const auto i : c10::irange(dim)) { dims *= static_cast(tensor.size(i)); } return dims; @@ -911,7 +917,7 @@ inline size_t getTrailingDims( dim, ssize_t(tensor.dim())); size_t dims = 1; - for (size_t i = dim + 1; i < tensor.dim(); ++i) { + for (size_t i = dim + 1; i < static_cast(tensor.dim()); ++i) { dims *= static_cast(tensor.size(i)); } return dims; @@ -984,7 +990,7 @@ inline void indexToCoordinate( const executorch::aten::Tensor& tensor, size_t index, size_t* coordinate) { - ET_CHECK(index < tensor.numel()); + ET_CHECK(index < static_cast(tensor.numel())); for (auto i = 0; i < tensor.dim(); ++i) { auto dim = tensor.dim() - 1 - i; size_t dim_size = tensor.size(dim); diff --git a/runtime/core/exec_aten/util/tensor_util_portable.cpp b/runtime/core/exec_aten/util/tensor_util_portable.cpp index c1cbcfb6064..e4aa875aed4 100644 --- a/runtime/core/exec_aten/util/tensor_util_portable.cpp +++ b/runtime/core/exec_aten/util/tensor_util_portable.cpp @@ -8,6 +8,7 @@ #include +#include #include #include @@ -41,11 +42,11 @@ Error get_dim_order( bool tensor_has_valid_dim_order(torch::executor::Tensor t) { if (!validate_dim_order(t.dim_order().data(), t.dim_order().size())) { ET_LOG(Error, "Tensor dim order is not valid:"); - for (size_t d = 0; d < t.dim(); ++d) { + for (size_t d = 0; d < static_cast(t.dim()); ++d) { ET_LOG( Error, " dim_order(%zu): %zu", - static_cast(d), + d, static_cast(t.dim_order()[d])); } return false; @@ -62,11 +63,11 @@ bool tensor_is_default_or_channels_last_dim_order(torch::executor::Tensor t) { ET_LOG( Error, "Expected tensor to have default or channels last dim order, but got"); - for (size_t d = 0; d < t.dim(); ++d) { + for (size_t d = 0; d < static_cast(t.dim()); ++d) { ET_LOG( Error, " dim_order(%zu): %zu", - static_cast(d), + d, static_cast(t.dim_order()[d])); } } @@ -79,11 +80,11 @@ bool tensor_is_default_dim_order(torch::executor::Tensor t) { if (!ret_val) { ET_LOG(Error, "Expected tensor to have default dim order, but got"); - for (size_t d = 0; d < t.dim(); ++d) { + for (size_t d = 0; d < static_cast(t.dim()); ++d) { ET_LOG( Error, " dim_order(%zu): %zu", - static_cast(d), + d, static_cast(t.dim_order()[d])); } } @@ -96,11 +97,11 @@ bool tensor_is_channels_last_dim_order(torch::executor::Tensor t) { if (!ret_val) { ET_LOG(Error, "Expected tensor to have channels last dim order, but got"); - for (size_t d = 0; d < t.dim(); ++d) { + for (size_t d = 0; d < static_cast(t.dim()); ++d) { ET_LOG( Error, " dim_order(%zu): %zu", - static_cast(d), + d, static_cast(t.dim_order()[d])); } } diff --git a/runtime/core/portable_type/c10/c10/util/irange.h b/runtime/core/portable_type/c10/c10/util/irange.h index 3249bdfa5cf..81104d9568f 100644 --- 
a/runtime/core/portable_type/c10/c10/util/irange.h +++ b/runtime/core/portable_type/c10/c10/util/irange.h @@ -24,7 +24,7 @@ struct integer_iterator { using pointer = I*; using reference = I&; - explicit constexpr integer_iterator(I value) : value(value) {} + explicit constexpr integer_iterator(I value_) : value(value_) {} constexpr I operator*() const { return value; diff --git a/runtime/core/portable_type/tensor_impl.cpp b/runtime/core/portable_type/tensor_impl.cpp index 6366a8eac28..ede5a3d4101 100644 --- a/runtime/core/portable_type/tensor_impl.cpp +++ b/runtime/core/portable_type/tensor_impl.cpp @@ -35,8 +35,8 @@ ssize_t compute_numel(const TensorImpl::SizesType* sizes, ssize_t dim) { for (const auto i : c10::irange(dim)) { ET_CHECK_MSG( sizes[i] >= 0, - "Size must be non-negative, got %d at dimension %zd", - sizes[i], + "Size must be non-negative, got %zd at dimension %zd", + static_cast(sizes[i]), i); numel *= sizes[i]; } @@ -76,7 +76,7 @@ ssize_t TensorImpl::element_size() const { Error TensorImpl::internal_resize_contiguous(ArrayRef new_sizes) { ET_CHECK_OR_RETURN_ERROR( - new_sizes.size() == dim_, + static_cast(new_sizes.size()) == dim_, NotSupported, "Attempted to change the tensor rank which is immutable: old=%zu, new=%zu", dim_, @@ -120,7 +120,7 @@ Error TensorImpl::internal_resize_contiguous(ArrayRef new_sizes) { const auto new_numel = compute_numel(new_sizes.data(), dim_); ET_CHECK_OR_RETURN_ERROR( - new_numel <= numel_bound_, + static_cast(new_numel) <= numel_bound_, NotSupported, "Attempted to resize a bounded tensor with a maximum capacity of %zu elements to %zu elements.", numel_bound_, diff --git a/runtime/core/tensor_layout.cpp b/runtime/core/tensor_layout.cpp index f0fac442e20..2b862e6dc14 100644 --- a/runtime/core/tensor_layout.cpp +++ b/runtime/core/tensor_layout.cpp @@ -20,7 +20,7 @@ Result calculate_nbytes( const Span& sizes, const executorch::aten::ScalarType& scalar_type) { ssize_t n = 1; - for (ssize_t i = 0; i < sizes.size(); i++) { + for (const auto i : c10::irange(sizes.size())) { if (sizes[i] < 0) { return Error::InvalidArgument; } diff --git a/runtime/executor/method.cpp b/runtime/executor/method.cpp index 0857bc1c976..7da7bafd3e5 100644 --- a/runtime/executor/method.cpp +++ b/runtime/executor/method.cpp @@ -8,6 +8,7 @@ #include +#include #include #include // @donotremove #include @@ -239,10 +240,10 @@ Result gen_instruction_arguments( for (size_t i = 0; i < num_args; ++i) { int32_t arg_idx = arg_idxs[i]; ET_CHECK_OR_RETURN_ERROR( - arg_idx < num_values, + static_cast(arg_idx) < num_values, InvalidProgram, - "Arg index %d >= %" ET_PRIsize_t, - arg_idx, + "Arg index %zd >= %" ET_PRIsize_t, + static_cast(arg_idx), num_values); arg_list[i] = &values[arg_idx]; } @@ -270,7 +271,7 @@ Result parse_cond_value(const EValue& cond_value) { static_cast(cond_val.scalar_type())); const bool* cond_data = cond_val.const_data_ptr(); - for (size_t i = 0; i < cond_val.numel(); i++) { + for (size_t i = 0; i < static_cast(cond_val.numel()); i++) { if (!cond_data[i]) { return false; } @@ -481,7 +482,7 @@ Error Method::parse_values(const NamedDataMap* named_data_map) { for (size_t j = 0; j < items->size(); j++) { auto value_index = items->Get(j); ET_CHECK_OR_RETURN_ERROR( - value_index >= 0 && value_index < n_value, + value_index >= 0 && static_cast(value_index) < n_value, InvalidProgram, "Invalid value index %" PRId64 " for IntList %" ET_PRIsize_t " index %" ET_PRIsize_t, @@ -644,7 +645,7 @@ Error populate_operator_name( has_overload ? 
op->overload()->c_str() : ""); ET_CHECK_OR_RETURN_ERROR(cx >= 0, Internal, "snprintf failed: %d", cx); ET_CHECK_OR_RETURN_ERROR( - cx < operator_name_size, + static_cast(cx) < operator_name_size, Internal, "Operator name %s%s%s with length %d " "truncated to %" ET_PRIsize_t " due to internal buffer limit.", @@ -672,7 +673,8 @@ Error Method::resolve_operator( char operator_name[kTempBufferSizeForName]; const auto ops = serialization_plan_->operators(); ET_CHECK_OR_RETURN_ERROR( - ops != nullptr && op_index < ops->size(), + ops != nullptr && + static_cast(op_index) < ops->size(), InvalidProgram, "Op index %" PRIu32 " out of range", op_index); @@ -721,7 +723,11 @@ Error Method::resolve_operator( Result op_function = get_op_function_from_registry(operator_name, {meta, count}); if (!op_function.ok()) { - ET_LOG(Error, "Missing operator: [%d] %s", op_index, operator_name); + ET_LOG( + Error, + "Missing operator: [%zd] %s", + static_cast(op_index), + operator_name); return op_function.error(); } kernels[kernel_index] = op_function.get(); @@ -923,10 +929,10 @@ Error Method::init( instr_args) ->cond_value_index(); ET_CHECK_OR_RETURN_ERROR( - index >= 0 && index < n_value_, + index >= 0 && static_cast(index) < n_value_, InvalidProgram, - "Index %d negative or >= %" ET_PRIsize_t, - index, + "Index %zd negative or >= %" ET_PRIsize_t, + static_cast(index), n_value_); chain_instruction_arg_lists[instr_idx] = InstructionArgs(); } break; @@ -944,9 +950,9 @@ Error Method::init( ET_CHECK_OR_RETURN_ERROR( num_instructions_missing_op == 0, OperatorMissing, - "There are %d instructions don't have corresponding operator registered. " + "There are %zu instructions don't have corresponding operator registered. " "See logs for details", - num_instructions_missing_op); + static_cast(num_instructions_missing_op)); if (delayed_error != Error::Ok) { return delayed_error; } @@ -1315,7 +1321,7 @@ Error Method::execute_instruction() { auto delegate_idx = instruction->instr_args_as_DelegateCall()->delegate_index(); ET_CHECK_OR_RETURN_ERROR( - delegate_idx < n_delegate_, + static_cast(delegate_idx) < n_delegate_, Internal, "DELEGATE_CALL index %" PRIu32 " >= num delegates %" ET_PRIsize_t " at instruction %" ET_PRIsize_t, @@ -1609,18 +1615,18 @@ Method::~Method() { // Destroy the values. It's necessary in ATen mode, where the refcount of // Tensors needs to be decremented properly. if (values_ != nullptr) { - for (int i = 0; i < n_value_; ++i) { + for (size_t i = 0; i < n_value_; ++i) { values_[i].~EValue(); } } // Free any resources associated with delegate backends. if (delegates_ != nullptr) { - for (int i = 0; i < n_delegate_; i++) { + for (size_t i = 0; i < n_delegate_; i++) { delegates_[i].~BackendDelegate(); } } // Free resources associated with external constants. - for (int i = 0; i < n_external_constants_; i++) { + for (const auto i : c10::irange(n_external_constants_)) { external_constants_[i].buffer.~FreeableBuffer(); } // All other fields are trivially destructible. diff --git a/runtime/executor/method_meta.cpp b/runtime/executor/method_meta.cpp index bcc2390d2bd..651a815c335 100644 --- a/runtime/executor/method_meta.cpp +++ b/runtime/executor/method_meta.cpp @@ -56,7 +56,7 @@ size_t calculate_nbytes( Span sizes, executorch::aten::ScalarType scalar_type) { ssize_t n = 1; - for (ssize_t i = 0; i < sizes.size(); i++) { + for (size_t i = 0; i < sizes.size(); i++) { n *= sizes[i]; } // Use the full namespace to disambiguate from c10::elementSize. 
@@ -110,7 +110,7 @@ size_t MethodMeta::num_inputs() const { Result MethodMeta::input_tag(size_t index) const { auto num_inputs = this->num_inputs(); ET_CHECK_OR_RETURN_ERROR( - index >= 0 && index < num_inputs, + index < num_inputs, InvalidArgument, "index %zu out of range. num_inputs: %zu", index, @@ -118,10 +118,10 @@ Result MethodMeta::input_tag(size_t index) const { auto input_index = s_plan_->inputs()->Get(index); size_t num_values = s_plan_->values()->size(); ET_CHECK_OR_RETURN_ERROR( - input_index >= 0 && input_index < num_values, + input_index >= 0 && static_cast(input_index) < num_values, InvalidProgram, - "internal value index %d out of range [0,%zu) for input %zu", - input_index, + "internal value index %zd out of range [0,%zu) for input %zu", + static_cast(input_index), num_values, index); auto serialization_value = s_plan_->values()->Get(input_index); @@ -160,7 +160,7 @@ size_t MethodMeta::num_outputs() const { Result MethodMeta::output_tag(size_t index) const { auto num_outputs = this->num_outputs(); ET_CHECK_OR_RETURN_ERROR( - index >= 0 && index < num_outputs, + index < num_outputs, InvalidArgument, "index %zu out of range. num_outputs: %zu", index, @@ -168,10 +168,10 @@ Result MethodMeta::output_tag(size_t index) const { auto output_index = s_plan_->outputs()->Get(index); size_t num_values = s_plan_->values()->size(); ET_CHECK_OR_RETURN_ERROR( - output_index >= 0 && output_index < num_values, + output_index >= 0 && static_cast(output_index) < num_values, InvalidProgram, - "internal value index %d out of range [0,%zu) for output %zu", - output_index, + "internal value index %zd out of range [0,%zu) for output %zu", + static_cast(output_index), num_values, index); auto serialization_value = s_plan_->values()->Get(output_index); @@ -218,7 +218,7 @@ size_t MethodMeta::num_memory_planned_buffers() const { Result MethodMeta::memory_planned_buffer_size(size_t index) const { auto num_buffers = this->num_memory_planned_buffers(); ET_CHECK_OR_RETURN_ERROR( - index >= 0 && index < num_buffers, + index < num_buffers, InvalidArgument, "index %zu out of range. num_buffers: %zu", index, diff --git a/runtime/executor/program.cpp b/runtime/executor/program.cpp index 964b8c8bdac..67f1edd4df3 100644 --- a/runtime/executor/program.cpp +++ b/runtime/executor/program.cpp @@ -163,10 +163,10 @@ Result get_execution_plan( ET_CHECK_OR_RETURN_ERROR( constant_buffer == nullptr || constant_buffer->size() == 0, InvalidProgram, - "constant_buffer contains %u items, " - "constant_segment.offsets contains %u items. Only one should be used.", - constant_buffer->size(), - constant_segment->offsets()->size()); + "constant_buffer contains %zu items, " + "constant_segment.offsets contains %zu items. 
Only one should be used.", + static_cast(constant_buffer->size()), + static_cast(constant_segment->offsets()->size())); const auto* segments = flatbuffer_program->segments(); ET_CHECK_OR_RETURN_ERROR( segments != nullptr, InvalidProgram, "No segments in program"); @@ -176,9 +176,9 @@ Result get_execution_plan( ET_CHECK_OR_RETURN_ERROR( constant_segment->segment_index() < segments->size(), InvalidProgram, - "Constant segment index %d invalid for program segments range %d", - constant_segment->segment_index(), - segments->size()); + "Constant segment index %zu invalid for program segments range %zu", + static_cast(constant_segment->segment_index()), + static_cast(segments->size())); const executorch_flatbuffer::DataSegment* data_segment = segments->Get(constant_segment->segment_index()); @@ -347,8 +347,8 @@ Result Program::get_constant_buffer_data( ET_CHECK_OR_RETURN_ERROR( storage_size <= nbytes, InvalidArgument, - "Constant buffer size %u larger than allocated nbytes %zu", - storage_size, + "Constant buffer size %zu larger than allocated nbytes %zu", + static_cast(constant_buffer[buffer_index]->storage()->size()), nbytes); return storage->data(); @@ -479,8 +479,8 @@ Error Program::load_mutable_subsegment_into( if (segment_offsets->segment_index() >= num_segments) { ET_LOG( Error, - "Segment index %u out of range (>= %zu)", - segment_offsets->segment_index(), + "Segment index %zu out of range (>= %zu)", + static_cast(segment_offsets->segment_index()), num_segments); return Error::NotFound; } diff --git a/runtime/executor/targets.bzl b/runtime/executor/targets.bzl index c5d07448a06..8993c5dc473 100644 --- a/runtime/executor/targets.bzl +++ b/runtime/executor/targets.bzl @@ -74,6 +74,10 @@ def define_common_targets(): "program.h", "tensor_parser.h", ], + compiler_flags = select({ + "ovr_config//os:windows": [], + "DEFAULT" :["-Wno-error=deprecated-declarations"] + }), preprocessor_flags = _program_preprocessor_flags(), exported_deps = [ ":memory_manager", diff --git a/runtime/executor/tensor_parser.h b/runtime/executor/tensor_parser.h index cfd711713ac..362f0b11e20 100644 --- a/runtime/executor/tensor_parser.h +++ b/runtime/executor/tensor_parser.h @@ -91,7 +91,7 @@ parseListOptionalType( evalp_list[output_idx] = nullptr; } else { ET_CHECK_OR_RETURN_ERROR( - index >= 0 && index < values_len, + index >= 0 && static_cast(index) < values_len, InvalidProgram, "Invalid value index %" PRId32 " for ListOptional", index); diff --git a/runtime/executor/tensor_parser_exec_aten.cpp b/runtime/executor/tensor_parser_exec_aten.cpp index de809ee09cc..002c7366be6 100644 --- a/runtime/executor/tensor_parser_exec_aten.cpp +++ b/runtime/executor/tensor_parser_exec_aten.cpp @@ -64,7 +64,8 @@ ET_NODISCARD Result getMemPlannedPtr( "size_t cannot hold memory offset 0x%08" PRIx32 ".%08" PRIx32, memory_offset_high, memory_offset_low); - memory_offset |= static_cast(memory_offset_high) << 32; + memory_offset |= static_cast(memory_offset_high) + << (sizeof(size_t) - sizeof(uint32_t)); } return allocator->get_offset_address(memory_id, memory_offset, nbytes); } @@ -94,7 +95,7 @@ ET_NODISCARD Result> parseTensorList( size_t output_idx = 0; for (int32_t tensor_index : *tensor_indices) { ET_CHECK_OR_RETURN_ERROR( - tensor_index >= 0 && tensor_index < values_len, + tensor_index >= 0 && static_cast(tensor_index) < values_len, InvalidProgram, "Invalid value index %" PRId32 " for TensorList", tensor_index); @@ -123,7 +124,9 @@ ET_NODISCARD Error validateTensorLayout( static_cast(expected_layout.scalar_type())); int dim = 
s_tensor->sizes()->size(); ET_CHECK_OR_RETURN_ERROR( - dim == expected_layout.sizes().size(), + dim >= 0, InvalidExternalData, "Dim is negative: %d", dim) + ET_CHECK_OR_RETURN_ERROR( + static_cast(dim) == expected_layout.sizes().size(), InvalidExternalData, "Dim mismatch. Expected %d, got %zu.", dim, @@ -150,7 +153,7 @@ ET_NODISCARD Error validateTensorLayout( // Check if key exists in entries. If it does, return a pointer to the entry // otherwise return a nullptr. NamedData* get_data_by_key(const char* key, Span entries) { - for (int i = 0; i < entries.size(); i++) { + for (const auto i : c10::irange(entries.size())) { if (strcmp(key, entries[i].key) == 0) { return &entries[i]; } diff --git a/runtime/executor/tensor_parser_portable.cpp b/runtime/executor/tensor_parser_portable.cpp index b72fedc5eee..4b424b29f5c 100644 --- a/runtime/executor/tensor_parser_portable.cpp +++ b/runtime/executor/tensor_parser_portable.cpp @@ -107,12 +107,12 @@ Result parseTensor( // detect bad positive values, but we can reject negative values, which would // otherwise panic in the TensorImpl ctor. dim_order_to_stride() will validate // dim_order. - for (int i = 0; i < dim; i++) { + for (flatbuffers::uoffset_t i = 0; i < dim; i++) { ET_CHECK_OR_RETURN_ERROR( sizes[i] >= 0, InvalidProgram, - "Negative size[%d] %" PRId32, - i, + "Negative size[%zu] %" PRId32, + static_cast(i), sizes[i]); } diff --git a/runtime/kernel/operator_registry.cpp b/runtime/kernel/operator_registry.cpp index b51c2567f0a..85705e5b3fd 100644 --- a/runtime/kernel/operator_registry.cpp +++ b/runtime/kernel/operator_registry.cpp @@ -79,7 +79,7 @@ Error register_kernels_internal(const Span kernels) { for (const auto& kernel : kernels) { // Linear search. This is fine if the number of kernels is small. - for (int32_t i = 0; i < num_registered_kernels; i++) { + for (size_t i = 0; i < num_registered_kernels; i++) { Kernel k = registered_kernels[i]; if (strcmp(kernel.name_, k.name_) == 0 && kernel.kernel_key_ == k.kernel_key_) { @@ -188,7 +188,7 @@ Error make_kernel_key_string( buf_size -= 1; // Add dim order. - for (int j = 0; j < meta.dim_order_.size(); j++) { + for (size_t j = 0; j < meta.dim_order_.size(); j++) { n = copy_char_as_number_to_buf((int)meta.dim_order_[j], buf, buf_size); if (n < 0) { return Error::InvalidArgument; diff --git a/runtime/kernel/operator_registry.h b/runtime/kernel/operator_registry.h index 82815852e6f..8e1eaca9981 100644 --- a/runtime/kernel/operator_registry.h +++ b/runtime/kernel/operator_registry.h @@ -33,7 +33,7 @@ #define ET_LOG_TENSOR_META(meta_list) \ for (const auto& meta : meta_list) { \ ET_LOG(Error, "dtype: %d | dim order: [", int(meta.dtype_)); \ - for (int i = 0; i < meta.dim_order_.size(); i++) { \ + for (size_t i = 0; i < meta.dim_order_.size(); i++) { \ ET_LOG(Error, "%d,", static_cast(meta.dim_order_[i])); \ } \ ET_LOG(Error, "]"); \ @@ -74,7 +74,7 @@ struct TensorMeta { if (dim_order_.size() != other.dim_order_.size()) { return false; } - for (int i = 0; i < dim_order_.size(); i++) { + for (size_t i = 0; i < dim_order_.size(); i++) { if (dim_order_[i] != other.dim_order_[i]) { return false; } diff --git a/runtime/platform/log.cpp b/runtime/platform/log.cpp index c1ad6ddcc0d..6529c73b238 100644 --- a/runtime/platform/log.cpp +++ b/runtime/platform/log.cpp @@ -92,8 +92,7 @@ void vlogf( } buf[kMaxLogMessageLength - 1] = 0; - et_pal_log_level_t pal_level = - (int(level) >= 0 && level < LogLevel::NumLevels) + et_pal_log_level_t pal_level = (level < LogLevel::NumLevels) ? 
kLevelToPal[size_t(level)] : et_pal_log_level_t::kUnknown; diff --git a/runtime/platform/log.h b/runtime/platform/log.h index 9ad234b2520..72ea8528442 100644 --- a/runtime/platform/log.h +++ b/runtime/platform/log.h @@ -33,6 +33,15 @@ #define ET_LOG_ENABLED 1 #endif // !defined(ET_LOG_ENABLED) +// Even though it is supposed to be "portable" some toolchains +// do not define, so providing a definition here +#ifndef PRIu64 +#define PRIu64 "llu" +#endif +#ifndef PRId64 +#define PRId64 "lld" +#endif + namespace executorch { namespace runtime { diff --git a/runtime/platform/profiler.cpp b/runtime/platform/profiler.cpp index 2f514286aa1..21f68963c78 100644 --- a/runtime/platform/profiler.cpp +++ b/runtime/platform/profiler.cpp @@ -129,7 +129,8 @@ void track_allocation(int32_t id, uint32_t size) { uint32_t track_allocator(const char* name) { ET_CHECK_MSG( prof_header->allocator_entries < MEM_PROFILE_MAX_ALLOCATORS, - "Out of allocator tracking space, %d is needed. Increase MEM_PROFILE_MAX_ALLOCATORS and re-compile", + "Out of allocator tracking space, %" PRIu32 + " is needed. Increase MEM_PROFILE_MAX_ALLOCATORS and re-compile", prof_header->allocator_entries); size_t str_len = strlen(name); size_t num_allocators = prof_header->allocator_entries; @@ -151,7 +152,8 @@ void profiling_create_block(const char* name) { num_blocks += 1; ET_CHECK_MSG( num_blocks <= MAX_PROFILE_BLOCKS, - "Only %d blocks are supported and they've all been used up but %d is used. Increment MAX_PROFILE_BLOCKS and re-run", + "Only %d blocks are supported and they've all been used up but %" PRIu32 + " is used. Increment MAX_PROFILE_BLOCKS and re-run", MAX_PROFILE_BLOCKS, num_blocks); } diff --git a/schema/extended_header.cpp b/schema/extended_header.cpp index fdc463207ba..3236b040c49 100644 --- a/schema/extended_header.cpp +++ b/schema/extended_header.cpp @@ -14,8 +14,6 @@ #include #include -#pragma clang diagnostic ignored "-Wdeprecated" - namespace executorch { namespace runtime { diff --git a/test/build_size_test.sh b/test/build_size_test.sh index 823b399fe34..09c0188ff9b 100644 --- a/test/build_size_test.sh +++ b/test/build_size_test.sh @@ -11,9 +11,8 @@ set -e # shellcheck source=/dev/null source "$(dirname "${BASH_SOURCE[0]}")/../.ci/scripts/utils.sh" -# TODO(#8149): Remove -Wno-sign-compare # TODO(#8357): Remove -Wno-int-in-bool-context -COMMON_CXXFLAGS="-fno-exceptions -fno-rtti -Wall -Werror -Wno-sign-compare -Wno-unknown-pragmas -Wno-int-in-bool-context" +COMMON_CXXFLAGS="-fno-exceptions -fno-rtti -Wall -Werror -Wno-int-in-bool-context" cmake_install_executorch_lib() { echo "Installing libexecutorch.a"
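
The two patterns this patch applies throughout the kernels and runtime are (1) casting signed tensor metadata such as dim() and numel() before comparing it against size_t counters, and (2) replacing hand-written index loops with c10::irange so the counter type follows the loop bound. The following sketch is not part of the patch; it only illustrates the idea under stated assumptions. The <c10/util/irange.h> header is the one vendored in this repo, and TensorLike is a hypothetical stand-in for executorch::aten::Tensor used purely to keep the example self-contained.

#include <c10/util/irange.h>

#include <cstddef>
#include <cstdint>
#include <vector>

// Hypothetical stand-in for the Tensor API: dim()/size() return signed values,
// which is what triggers -Werror=sign-compare against size_t loop counters.
struct TensorLike {
  std::vector<int64_t> sizes;
  int64_t dim() const {
    return static_cast<int64_t>(sizes.size());
  }
  int64_t size(int64_t d) const {
    return sizes[static_cast<size_t>(d)];
  }
};

// Pattern 1: keep the size_t counter but cast the signed bound explicitly,
// mirroring the `static_cast<...>(t.dim())` edits in this diff.
size_t numel_with_cast(const TensorLike& t) {
  size_t n = 1;
  for (size_t d = 0; d < static_cast<size_t>(t.dim()); ++d) {
    n *= static_cast<size_t>(t.size(static_cast<int64_t>(d)));
  }
  return n;
}

// Pattern 2: let c10::irange deduce the counter type from the bound, so no
// cast is needed at the comparison and the loop reads like the patched code.
size_t numel_with_irange(const TensorLike& t) {
  size_t n = 1;
  for (const auto d : c10::irange(t.dim())) {
    n *= static_cast<size_t>(t.size(d));
  }
  return n;
}

Either form compiles cleanly with -Wall -Werror and no -Wno-sign-compare, which is what allows the build_size_test.sh change above to drop that suppression flag.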