diff --git a/.ci/docker/ci_commit_pins/pytorch.txt b/.ci/docker/ci_commit_pins/pytorch.txt index b291722c3f0..21a0ea5d478 100644 --- a/.ci/docker/ci_commit_pins/pytorch.txt +++ b/.ci/docker/ci_commit_pins/pytorch.txt @@ -1 +1 @@ -00e3eea170ce5db8ea9c62ce5e48f13886cd6d20 +d1b87e26e5c4343f5b56bb1e6f89b479b389bfac diff --git a/.ci/docker/requirements-ci.txt b/.ci/docker/requirements-ci.txt index c33cc533c02..52b2fd40060 100644 --- a/.ci/docker/requirements-ci.txt +++ b/.ci/docker/requirements-ci.txt @@ -1,5 +1,5 @@ mpmath==1.3.0 -numpy==1.21.3; python_version == '3.10' +numpy==1.22.0; python_version == '3.10' numpy==1.23.2; python_version == '3.11' numpy; python_version >= '3.12' PyYAML==6.0.1 diff --git a/.ci/scripts/build-qnn-sdk.sh b/.ci/scripts/build-qnn-sdk.sh index 2492b1fd3d6..deeaed34ac3 100644 --- a/.ci/scripts/build-qnn-sdk.sh +++ b/.ci/scripts/build-qnn-sdk.sh @@ -27,7 +27,7 @@ set_up_aot() { -DCMAKE_INSTALL_PREFIX=$PWD \ -DEXECUTORCH_BUILD_QNN=ON \ -DQNN_SDK_ROOT=${QNN_SDK_ROOT} \ - -DEXECUTORCH_BUILD_SDK=ON \ + -DEXECUTORCH_BUILD_DEVTOOLS=ON \ -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ -DEXECUTORCH_ENABLE_EVENT_TRACER=ON \ diff --git a/.ci/scripts/test_llama.sh b/.ci/scripts/test_llama.sh index 5721b7fd607..a912d565a3e 100644 --- a/.ci/scripts/test_llama.sh +++ b/.ci/scripts/test_llama.sh @@ -11,7 +11,7 @@ source "$(dirname "${BASH_SOURCE[0]}")/utils.sh" MODEL_NAME=$1 # stories110M BUILD_TOOL=$2 # buck2 or cmake -DTYPE=$3 # fp16 or fp32 +DTYPE=$3 # fp16, bf16, or fp32 MODE=${4:-"xnnpack+custom"} # portable or xnnpack+custom or xnnpack+custom+qe UPLOAD_DIR=${5:-} if [[ $# -lt 4 ]]; then # Assuming 4 mandatory args @@ -29,7 +29,7 @@ if [[ -z "${BUILD_TOOL:-}" ]]; then fi if [[ -z "${DTYPE:-}" ]]; then - echo "Missing dtype, choose fp16 or fp32, exiting..." + echo "Missing dtype, choose fp16, bf16, or fp32, exiting..." exit 1 fi @@ -174,6 +174,8 @@ fi EXPORTED_MODEL_NAME="llama2" if [[ "${DTYPE}" == "fp16" ]]; then EXPORTED_MODEL_NAME="${EXPORTED_MODEL_NAME}_h" +elif [[ "${DTYPE}" == "bf16" ]]; then + EXPORTED_MODEL_NAME="${EXPORTED_MODEL_NAME}_bf" elif [[ "${DTYPE}" == "fp32" ]]; then : else @@ -186,7 +188,7 @@ EXPORTED_MODEL_NAME="${EXPORTED_MODEL_NAME}.pte" echo "Exporting ${EXPORTED_MODEL_NAME}" EXPORT_ARGS="-c ${CHECKPOINT_FILE_NAME} -p ${PARAMS} -d ${DTYPE} -n ${EXPORTED_MODEL_NAME} -kv" if [[ "${XNNPACK}" == "ON" ]]; then - EXPORT_ARGS="${EXPORT_ARGS} -X -qmode 8da4w -G 128" + EXPORT_ARGS="${EXPORT_ARGS} -X --xnnpack-extended-ops -qmode 8da4w -G 128" fi if [[ "${CUSTOM}" == "ON" ]]; then EXPORT_ARGS="${EXPORT_ARGS} --use_sdpa_with_kv_cache" @@ -211,7 +213,7 @@ echo "Creating tokenizer.bin" $PYTHON_EXECUTABLE -m extension.llm.tokenizer.tokenizer -t tokenizer.model -o tokenizer.bin -RUNTIME_ARGS="--model_path=${EXPORTED_MODEL_NAME} --tokenizer_path=tokenizer.bin --prompt=Once --temperature=0 --seq_len=10" +RUNTIME_ARGS="--model_path=${EXPORTED_MODEL_NAME} --tokenizer_path=tokenizer.bin --prompt=Once --temperature=0 --seq_len=10 --warmup=1" # Check build tool. 
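For clarity, the dtype branch added above only picks a filename suffix before export: fp16 appends _h, bf16 appends _bf, and fp32 keeps the base name. A minimal Python sketch of that mapping; the helper name and dict are illustrative, not part of the CI script, which does this in bash:

# Illustrative sketch of the dtype -> exported-model-name mapping used by
# .ci/scripts/test_llama.sh; the helper name and structure are hypothetical.
SUFFIX_BY_DTYPE = {
    "fp16": "_h",
    "bf16": "_bf",
    "fp32": "",  # fp32 keeps the plain name
}

def exported_model_name(base: str, dtype: str) -> str:
    """Return e.g. 'llama2_bf.pte' for dtype='bf16'."""
    try:
        suffix = SUFFIX_BY_DTYPE[dtype]
    except KeyError:
        raise ValueError(f"Unknown dtype, choose fp16, bf16, or fp32: {dtype}")
    return f"{base}{suffix}.pte"

if __name__ == "__main__":
    assert exported_model_name("llama2", "bf16") == "llama2_bf.pte"
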
echo "Running ${EXPORTED_MODEL_NAME} in portable mode" if [[ "${BUILD_TOOL}" == "buck2" ]]; then diff --git a/.ci/scripts/test_model.sh b/.ci/scripts/test_model.sh index 0b8574573fb..f558a508c93 100755 --- a/.ci/scripts/test_model.sh +++ b/.ci/scripts/test_model.sh @@ -50,13 +50,13 @@ prepare_artifacts_upload() { build_cmake_executor_runner() { echo "Building executor_runner" - (rm -rf ${CMAKE_OUTPUT_DIR} \ - && mkdir ${CMAKE_OUTPUT_DIR} \ - && cd ${CMAKE_OUTPUT_DIR} \ - && retry cmake -DCMAKE_BUILD_TYPE=Release \ - -DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" ..) + rm -rf ${CMAKE_OUTPUT_DIR} + cmake -DCMAKE_BUILD_TYPE=Debug \ + -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ + -DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" \ + -B${CMAKE_OUTPUT_DIR} . - cmake --build ${CMAKE_OUTPUT_DIR} -j4 + cmake --build ${CMAKE_OUTPUT_DIR} -j4 --config Debug } run_portable_executor_runner() { @@ -64,9 +64,7 @@ run_portable_executor_runner() { if [[ "${BUILD_TOOL}" == "buck2" ]]; then buck2 run //examples/portable/executor_runner:executor_runner -- --model_path "./${MODEL_NAME}.pte" elif [[ "${BUILD_TOOL}" == "cmake" ]]; then - if [[ ! -f ${CMAKE_OUTPUT_DIR}/executor_runner ]]; then - build_cmake_executor_runner - fi + build_cmake_executor_runner ./${CMAKE_OUTPUT_DIR}/executor_runner --model_path "./${MODEL_NAME}.pte" else echo "Invalid build tool ${BUILD_TOOL}. Only buck2 and cmake are supported atm" @@ -176,6 +174,7 @@ test_model_with_qnn() { fi # Use SM8450 for S22, SM8550 for S23, and SM8560 for S24 + # TODO(guangyang): Make QNN chipset matches the target device QNN_CHIPSET=SM8450 "${PYTHON_EXECUTABLE}" -m examples.qualcomm.scripts.${EXPORT_SCRIPT} -b ${CMAKE_OUTPUT_DIR} -m ${QNN_CHIPSET} --compile_only diff --git a/.ci/scripts/test_phi_3_mini.sh b/.ci/scripts/test_phi_3_mini.sh new file mode 100644 index 00000000000..40767013e23 --- /dev/null +++ b/.ci/scripts/test_phi_3_mini.sh @@ -0,0 +1,116 @@ +#!/bin/bash +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +set -exu + +BUILD_TYPE=${1:-Debug} +BUILD_DIR=${3:-cmake-out} +MODEL_DIR=examples/models/phi-3-mini + +echo "Building with BUILD_TYPE: $BUILD_TYPE, BUILD_DIR: $BUILD_DIR" + +if [[ -z "${PYTHON_EXECUTABLE:-}" ]]; then + PYTHON_EXECUTABLE=python3 +fi + +# Number of processes for a parallel build +NPROC=8 +if hash nproc &> /dev/null; then NPROC=$(nproc); fi + +cmake_install_executorch_libraries() { + cmake -DPYTHON_EXECUTABLE=python \ + -DCMAKE_INSTALL_PREFIX=${BUILD_DIR} \ + -DEXECUTORCH_ENABLE_LOGGING=1 \ + -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \ + -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ + -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ + -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ + -DEXECUTORCH_BUILD_XNNPACK=ON \ + -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \ + -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ + -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \ + -B${BUILD_DIR} . 
+ + cmake --build ${BUILD_DIR} -j${NPROC} --target install --config ${BUILD_TYPE} +} + +cmake_build_phi_3_mini() { + cmake -DPYTHON_EXECUTABLE=$PYTHON_EXECUTABLE \ + -DCMAKE_INSTALL_PREFIX=${BUILD_DIR} \ + -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \ + -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \ + -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ + -DEXECUTORCH_BUILD_XNNPACK=ON \ + -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \ + -B${BUILD_DIR}/${MODEL_DIR} \ + ${MODEL_DIR} + + cmake --build ${BUILD_DIR}/${MODEL_DIR} -j${NPROC} --config ${BUILD_TYPE} +} + +# Download and convert tokenizer.model +prepare_tokenizer() { + echo "Downloading and converting tokenizer.model" + wget -O tokenizer.model "https://huggingface.co/microsoft/Phi-3-mini-128k-instruct/resolve/main/tokenizer.model?download=true" + $PYTHON_EXECUTABLE -m executorch.extension.llm.tokenizer.tokenizer -t tokenizer.model -o tokenizer.bin +} + +# Export phi-3-mini model to pte +export_phi_3_mini () { + echo "Exporting phi-3-mini. This will take a few minutes" + $PYTHON_EXECUTABLE -m executorch.examples.models.phi-3-mini.export_phi-3-mini -c "4k" -s 128 -o phi-3-mini.pte +} + +run_and_verify() { + NOW=$(date +"%H:%M:%S") + echo "Starting to run phi-3-mini runner at ${NOW}" + if [[ ! -f "phi-3-mini.pte" ]]; then + echo "Export failed. Abort" + exit 1 + fi + if [[ ! -f "tokenizer.bin" ]]; then + echo "tokenizer.bin is missing." + exit 1 + fi + + ${BUILD_DIR}/${MODEL_DIR}/phi_3_mini_runner \ + --model_path=phi-3-mini.pte \ + --tokenizer_path=tokenizer.bin \ + --seq_len=128 \ + --temperature=0 \ + --prompt="<|system|> +You are a helpful assistant.<|end|> +<|user|> +What is the capital of France?<|end|> +<|assistant|>" > result.txt + + # verify result.txt + RESULT=$(cat result.txt) + EXPECTED_RESULT="The capital of France is Paris." + if [[ "${RESULT}" == *"${EXPECTED_RESULT}"* ]]; then + echo "Expected result prefix: ${EXPECTED_RESULT}" + echo "Actual result: ${RESULT}" + echo "Success" + exit 0 + else + echo "Expected result prefix: ${EXPECTED_RESULT}" + echo "Actual result: ${RESULT}" + echo "Failure; results not the same" + exit 1 + fi +} + +# Step 1. Build ExecuTorch and phi-3-mini runner +cmake_install_executorch_libraries +cmake_build_phi_3_mini + +# Step 2. Export the tokenizer and model +prepare_tokenizer +export_phi_3_mini + +# Step 3. Run and verify result +run_and_verify diff --git a/.github/scripts/extract_benchmark_results.py b/.github/scripts/extract_benchmark_results.py new file mode 100755 index 00000000000..3c917bc1cf3 --- /dev/null +++ b/.github/scripts/extract_benchmark_results.py @@ -0,0 +1,250 @@ +#!/usr/bin/env python3 +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
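The run_and_verify function in the phi-3-mini script above declares success when the runner output contains the expected answer. A rough Python equivalent of that substring check, assuming a hypothetical runner path and a simplified prompt; the CI itself uses the bash version:

# Sketch of the pass/fail check performed by run_and_verify in
# .ci/scripts/test_phi_3_mini.sh; the binary path and prompt are assumptions.
import subprocess

def run_and_verify(runner: str, expected: str = "The capital of France is Paris.") -> bool:
    completed = subprocess.run(
        [
            runner,
            "--model_path=phi-3-mini.pte",
            "--tokenizer_path=tokenizer.bin",
            "--seq_len=128",
            "--temperature=0",
            "--prompt=<|user|>\nWhat is the capital of France?<|end|>\n<|assistant|>",
        ],
        capture_output=True,
        text=True,
        check=True,
    )
    # Mirror the bash substring test: success iff the expected answer appears.
    return expected in completed.stdout

if __name__ == "__main__":
    print(run_and_verify("cmake-out/examples/models/phi-3-mini/phi_3_mini_runner"))
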
+ +import json +import logging +import os +import re +import time +import zipfile +from argparse import Action, ArgumentParser, Namespace +from io import BytesIO +from logging import info, warning +from typing import Any, List, Optional +from urllib import error, request + + +logging.basicConfig(level=logging.INFO) + + +BENCHMARK_RESULTS_FILENAME = "benchmark_results.json" +ARTIFACTS_FILENAME_REGEX = re.compile(r"(android|ios)-artifacts-(?P<job_id>\d+).json") + + +class ValidateArtifacts(Action): + def __call__( + self, + parser: ArgumentParser, + namespace: Namespace, + values: Any, + option_string: Optional[str] = None, + ) -> None: + if os.path.isfile(values) and values.endswith(".json"): + setattr(namespace, self.dest, values) + return + + parser.error(f"{values} is not a valid JSON file (*.json)") + + +class ValidateOutputDir(Action): + def __call__( + self, + parser: ArgumentParser, + namespace: Namespace, + values: Any, + option_string: Optional[str] = None, + ) -> None: + if os.path.isdir(values): + setattr(namespace, self.dest, values) + return + + parser.error(f"{values} is not a valid directory") + + +def parse_args() -> Any: + from argparse import ArgumentParser + + parser = ArgumentParser("extract benchmark results from AWS Device Farm artifacts") + parser.add_argument( + "--artifacts", + type=str, + required=True, + action=ValidateArtifacts, + help="the list of artifacts from AWS in JSON format", + ) + parser.add_argument( + "--output-dir", + type=str, + required=True, + action=ValidateOutputDir, + help="the directory to keep the benchmark results", + ) + parser.add_argument( + "--repo", + type=str, + required=True, + help="which GitHub repo this workflow run belongs to", + ) + parser.add_argument( + "--head-branch", + type=str, + required=True, + help="the head branch that runs", + ) + parser.add_argument( + "--workflow-name", + type=str, + required=True, + help="the name of the benchmark workflow", + ) + parser.add_argument( + "--workflow-run-id", + type=int, + required=True, + help="the id of the benchmark workflow", + ) + parser.add_argument( + "--workflow-run-attempt", + type=int, + required=True, + help="which retry of the workflow this is", + ) + + return parser.parse_args() + + +def extract_android_benchmark_results( + job_name: str, artifact_type: str, artifact_s3_url: str +) -> List: + """ + The benchmark results from Android have already been stored in the CUSTOMER_ARTIFACT + artifact, so we just need to fetch it here. + + Return the list of benchmark results.
+ """ + if artifact_type != "CUSTOMER_ARTIFACT": + return [] + + try: + with request.urlopen(artifact_s3_url) as data: + with zipfile.ZipFile(BytesIO(data.read())) as customer_artifact: + for name in customer_artifact.namelist(): + if BENCHMARK_RESULTS_FILENAME in name: + return json.loads(customer_artifact.read(name)) + + except error.HTTPError: + warning(f"Fail to {artifact_type} {artifact_s3_url}") + return [] + except json.decoder.JSONDecodeError: + # This is to handle the case where there is no benchmark results + warning(f"Fail to load the benchmark results from {artifact_s3_url}") + return [] + + +def extract_job_id(artifacts_filename: str) -> int: + """ + Extract the job id from the artifacts filename + """ + m = ARTIFACTS_FILENAME_REGEX.match(os.path.basename(artifacts_filename)) + if not m: + return 0 + return int(m.group("job_id")) + + +def transform( + app_type: str, + benchmark_results: List, + repo: str, + head_branch: str, + workflow_name: str, + workflow_run_id: int, + workflow_run_attempt: int, + job_name: str, + job_id: int, +) -> List: + """ + Transform the benchmark results into the format writable into the benchmark database + """ + # Overwrite the device name here with the job name as it has more information about + # the device, i.e. Samsung Galaxy S22 5G instead of just Samsung + for r in benchmark_results: + r["deviceInfo"]["device"] = job_name + + # TODO (huydhn): This is the current schema of the database oss_ci_benchmark_v2, + # and I'm trying to fit ET benchmark results into it, which is kind of awkward. + # However, the schema is going to be updated soon + return [ + { + # GH-info to identify where the benchmark is run + "repo": repo, + "head_branch": head_branch, + "workflow_id": workflow_run_id, + "run_attempt": workflow_run_attempt, + "job_id": job_id, + # The model + "name": f"{r['benchmarkModel']['name']} {r['benchmarkModel'].get('backend', '')}".strip(), + "dtype": ( + r["benchmarkModel"]["quantization"] + if r["benchmarkModel"]["quantization"] + else "unknown" + ), + # The metric value + "metric": r["metric"], + "actual": r["actualValue"], + "target": r["targetValue"], + # The device + "device": r["deviceInfo"]["device"], + "arch": r["deviceInfo"].get("os", ""), + # Not used here, just set it to something unique here + "filename": workflow_name, + "test_name": app_type, + "runner": job_name, + } + for r in benchmark_results + ] + + +def main() -> None: + args = parse_args() + + # Across all devices + all_benchmark_results = [] + + with open(args.artifacts) as f: + for artifact in json.load(f): + app_type = artifact.get("app_type", "") + # We expect this to be set to either ANDROID_APP or IOS_APP + if not app_type or app_type not in ["ANDROID_APP", "IOS_APP"]: + info( + f"App type {app_type} is not recognized in artifact {json.dumps(artifact)}" + ) + continue + + job_name = artifact["job_name"] + artifact_type = artifact["type"] + artifact_s3_url = artifact["s3_url"] + + if app_type == "ANDROID_APP": + benchmark_results = extract_android_benchmark_results( + job_name, artifact_type, artifact_s3_url + ) + if benchmark_results: + benchmark_results = transform( + app_type, + benchmark_results, + args.repo, + args.head_branch, + args.workflow_name, + args.workflow_run_id, + args.workflow_run_attempt, + job_name, + extract_job_id(args.artifacts), + ) + all_benchmark_results.extend(benchmark_results) + + if app_type == "IOS_APP": + # TODO (huydhn): Implement the logic for iOS next + pass + + if all_benchmark_results: + output_file = 
os.path.basename(args.artifacts) + with open(f"{args.output_dir}/{output_file}", "w") as f: + json.dump(all_benchmark_results, f) + + +if __name__ == "__main__": + main() diff --git a/.github/workflows/android-perf.yml b/.github/workflows/android-perf.yml index 4045d6f99ef..7d50a441024 100644 --- a/.github/workflows/android-perf.yml +++ b/.github/workflows/android-perf.yml @@ -105,6 +105,7 @@ jobs: # Mapping devices to their corresponding device-pool-arn declare -A DEVICE_POOL_ARNS DEVICE_POOL_ARNS[samsung_galaxy_s22]="arn:aws:devicefarm:us-west-2:308535385114:devicepool:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/e59f866a-30aa-4aa1-87b7-4510e5820dfa" + DEVICE_POOL_ARNS[samsung_galaxy_s24]="arn:aws:devicefarm:us-west-2:308535385114:devicepool:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/98f8788c-2e25-4a3c-8bb2-0d1e8897c0db" # Resolve device names with their corresponding ARNs if [[ ! $(echo "$DEVICES" | jq empty 2>/dev/null) ]]; then @@ -139,6 +140,7 @@ jobs: submodules: 'true' timeout: 60 upload-artifact: android-models + upload-artifact-to-s3: true script: | # The generic Linux job chooses to use base env, not the one setup by the image echo "::group::Setting up dev environment" @@ -174,43 +176,10 @@ jobs: fi echo "::endgroup::" - # Upload models to S3. The artifacts are needed not only by the device farm but also TorchChat - upload-models: - needs: export-models - runs-on: linux.2xlarge - if: always() # Continue this job regardless of previous job outcome - steps: - - name: Download the models from GitHub - uses: actions/download-artifact@v3 - with: - # The name here needs to match the name of the upload-artifact parameter - name: android-models - path: ${{ runner.temp }}/artifacts/ - - - name: Verify the models - shell: bash - working-directory: ${{ runner.temp }}/artifacts/ - run: | - ls -lah ./ - - - name: Upload the models to S3 - uses: seemethere/upload-artifact-s3@v5 - with: - s3-bucket: gha-artifacts - s3-prefix: | - ${{ github.repository }}/${{ github.run_id }}/artifact - retention-days: 1 - if-no-files-found: ignore - path: ${{ runner.temp }}/artifacts/ - - build-llm-demo: - name: build-llm-demo + build-benchmark-app: + name: build-benchmark-app uses: pytorch/test-infra/.github/workflows/linux_job.yml@main needs: set-parameters - strategy: - matrix: - delegate: ${{ fromJson(needs.set-parameters.outputs.delegates) }} - fail-fast: false with: runner: linux.2xlarge docker-image: executorch-ubuntu-22.04-clang12-android @@ -218,6 +187,7 @@ jobs: ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} timeout: 90 upload-artifact: android-apps + upload-artifact-to-s3: true script: | set -eux @@ -227,43 +197,11 @@ jobs: PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh cmake export ARTIFACTS_DIR_NAME=artifacts-to-be-uploaded - if [[ ${{ matrix.delegate }} == "qnn" ]]; then - PYTHON_EXECUTABLE=python bash .ci/scripts/setup-qnn-deps.sh - PYTHON_EXECUTABLE=python bash .ci/scripts/build-qnn-sdk.sh - fi + PYTHON_EXECUTABLE=python bash .ci/scripts/setup-qnn-deps.sh + PYTHON_EXECUTABLE=python bash .ci/scripts/build-qnn-sdk.sh - # TODO: This needs to be replaced with a generic loader .apk - # Build LLM Demo for Android export ANDROID_ABIS="arm64-v8a" - bash build/build_android_llm_demo.sh ${ARTIFACTS_DIR_NAME} - - # Upload artifacts to S3. 
The artifacts are needed not only by the device farm but also TorchChat - upload-android-apps: - needs: build-llm-demo - runs-on: linux.2xlarge - steps: - - name: Download the apps from GitHub - uses: actions/download-artifact@v3 - with: - # The name here needs to match the name of the upload-artifact parameter - name: android-apps - path: ${{ runner.temp }}/artifacts/ - - - name: Verify the apps - shell: bash - working-directory: ${{ runner.temp }}/artifacts/ - run: | - ls -lah ./ - - - name: Upload the apps to S3 - uses: seemethere/upload-artifact-s3@v5 - with: - s3-bucket: gha-artifacts - s3-prefix: | - ${{ github.repository }}/${{ github.run_id }}/artifact - retention-days: 14 - if-no-files-found: ignore - path: ${{ runner.temp }}/artifacts/ + PYTHON_EXECUTABLE=python EXECUTORCH_BUILD_QNN=ON QNN_SDK_ROOT=/tmp/qnn/2.25.0.240728 bash build/build_android_llm_demo.sh ${ARTIFACTS_DIR_NAME} # Let's see how expensive this job is, we might want to tone it down by running it periodically benchmark-on-device: @@ -273,8 +211,8 @@ jobs: uses: pytorch/test-infra/.github/workflows/mobile_job.yml@main needs: - set-parameters - - upload-models - - upload-android-apps + - build-benchmark-app + - export-models strategy: matrix: model: ${{ fromJson(needs.set-parameters.outputs.models) }} @@ -282,19 +220,92 @@ jobs: device: ${{ fromJson(needs.set-parameters.outputs.devices) }} fail-fast: false with: + # Due to scheduling a job may be pushed beyond the default 60m threshold + timeout: 120 device-type: android runner: linux.2xlarge test-infra-ref: '' # This is the ARN of ExecuTorch project on AWS project-arn: arn:aws:devicefarm:us-west-2:308535385114:project:02a2cf0f-6d9b-45ee-ba1a-a086587469e6 device-pool-arn: ${{ matrix.device }} - # Uploaded to S3 from the previous job, the name of the app comes from the project itself. - # Unlike models there are limited numbers of build flavor for apps, and the model controls whether it should build with bpe/tiktoken tokenizer. - # It's okay to build all possible apps with all possible flavors in job "build-llm-demo". However, in this job, once a model is given, there is only - # one app+flavor that could load and run the model. 
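The device-farm inputs below are plain S3 URLs built from the repository, the run id, and the model/delegate pair, with the path segment renamed from artifact to artifacts to match the new upload location. A small sketch of how such URLs are composed, using the bucket and layout from this workflow; the helper itself is illustrative:

# Sketch of the S3 URL layout consumed by the benchmark-on-device job.
# The bucket and path segments come from the workflow; the helper is illustrative.
S3_PREFIX = "https://gha-artifacts.s3.amazonaws.com"

def artifact_url(repo: str, run_id: int, *segments: str) -> str:
    return "/".join([S3_PREFIX, repo, str(run_id), "artifacts", *segments])

if __name__ == "__main__":
    repo, run_id = "pytorch/executorch", 123456789
    print(artifact_url(repo, run_id, "minibench", "app-debug.apk"))
    print(artifact_url(repo, run_id, "stories110M_xnnpack", "model.zip"))
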
- android-app-archive: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifact/minibench/app-debug.apk - android-test-archive: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifact/minibench/app-debug-androidTest.apk + android-app-archive: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifacts/minibench/app-debug.apk + android-test-archive: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifacts/minibench/app-debug-androidTest.apk # NB: Need to set the default spec here so that it works for periodic too test-spec: ${{ inputs.test_spec || 'https://ossci-android.s3.amazonaws.com/executorch/android-llm-device-farm-test-spec.yml' }} # Uploaded to S3 from the previous job - extra-data: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifact/${{ matrix.model }}_${{ matrix.delegate }}/model.zip + extra-data: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifacts/${{ matrix.model }}_${{ matrix.delegate }}/model.zip + + upload-benchmark-results: + needs: + - benchmark-on-device + if: always() + runs-on: linux.2xlarge + environment: upload-benchmark-results + permissions: + id-token: write + contents: read + steps: + - uses: actions/checkout@v3 + with: + submodules: false + + - name: Authenticate with AWS + uses: aws-actions/configure-aws-credentials@v4 + with: + role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_upload-benchmark-results + # The max duration enforced by the server side + role-duration-seconds: 18000 + aws-region: us-east-1 + + - name: Setup conda + uses: pytorch/test-infra/.github/actions/setup-miniconda@main + with: + python-version: '3.10' + + - name: Download the list of artifacts from S3 + env: + ARTIFACTS_S3_DIR: s3://gha-artifacts/device_farm/${{ github.run_id }}/${{ github.run_attempt }}/artifacts/ + shell: bash + run: | + set -eux + ${CONDA_RUN} python -mpip install awscli==1.32.18 + + mkdir -p artifacts + pushd artifacts + ${CONDA_RUN} aws s3 sync "${ARTIFACTS_S3_DIR}" . 
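Each synced file is named like android-artifacts-<job_id>.json (per the regex in extract_benchmark_results.py) and is handed to that script in the next step. As a rough illustration of what the script does with one Device Farm record, with invented sample values:

# Illustrative only: parse the job id from an artifacts filename and shape one
# benchmark record the way transform() in extract_benchmark_results.py does.
# The sample record values are invented for the example.
import re

ARTIFACTS_FILENAME_REGEX = re.compile(r"(android|ios)-artifacts-(?P<job_id>\d+).json")

def job_id_of(filename: str) -> int:
    m = ARTIFACTS_FILENAME_REGEX.match(filename)
    return int(m.group("job_id")) if m else 0

record = {
    "benchmarkModel": {"name": "stories110M", "backend": "xnnpack", "quantization": "8da4w"},
    "metric": "token_per_sec",
    "actualValue": 42.0,
    "targetValue": 50.0,
    "deviceInfo": {"device": "Samsung Galaxy S22 5G", "os": "Android 13"},
}

row = {
    "name": f"{record['benchmarkModel']['name']} {record['benchmarkModel'].get('backend', '')}".strip(),
    "dtype": record["benchmarkModel"]["quantization"] or "unknown",
    "metric": record["metric"],
    "actual": record["actualValue"],
    "target": record["targetValue"],
    "device": record["deviceInfo"]["device"],
    "arch": record["deviceInfo"].get("os", ""),
    "job_id": job_id_of("android-artifacts-12345.json"),
}
print(row)
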
+ popd + + ls -lah artifacts + + - name: Extract the benchmark results JSON + shell: bash + run: | + set -eux + + mkdir -p benchmark-results + + for ARTIFACTS_BY_JOB in artifacts/*.json; do + [ -f "${ARTIFACTS_BY_JOB}" ] || break + echo "${ARTIFACTS_BY_JOB}" + ${CONDA_RUN} python .github/scripts/extract_benchmark_results.py \ + --artifacts "${ARTIFACTS_BY_JOB}" \ + --output-dir benchmark-results \ + --repo ${{ github.repository }} \ + --head-branch ${{ github.head_ref || github.ref_name }} \ + --workflow-name ${{ github.workflow }} \ + --workflow-run-id ${{ github.run_id }} \ + --workflow-run-attempt ${{ github.run_attempt }} + done + + ls -lah benchmark-results + + for BENCHMARK_RESULTS in benchmark-results/*.json; do + cat "${BENCHMARK_RESULTS}" + echo + done + + - name: Upload the benchmark results + uses: pytorch/test-infra/.github/actions/upload-benchmark-results@main + with: + benchmark-results-dir: 'benchmark-results' + dry-run: false diff --git a/.github/workflows/android-release-artifacts.yml b/.github/workflows/android-release-artifacts.yml new file mode 100644 index 00000000000..39838858486 --- /dev/null +++ b/.github/workflows/android-release-artifacts.yml @@ -0,0 +1,66 @@ +name: Android Release Artifacts + +on: + workflow_dispatch: + inputs: + version: + description: Version name to be uploaded for AAR release + required: false + type: string + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +jobs: + build-aar: + name: build-aar + uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + with: + runner: linux.2xlarge + docker-image: executorch-ubuntu-22.04-clang12-android + submodules: 'true' + ref: ${{ github.sha }} + timeout: 90 + upload-artifact: android-apps + upload-artifact-to-s3: true + script: | + set -eux + + # The generic Linux job chooses to use base env, not the one setup by the image + CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]") + conda activate "${CONDA_ENV}" + PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh buck2 + export ARTIFACTS_DIR_NAME=artifacts-to-be-uploaded + + # Build LLM Demo for Android + bash build/build_android_llm_demo.sh ${ARTIFACTS_DIR_NAME} + + shasum -a 256 "${ARTIFACTS_DIR_NAME}/llm_demo/executorch.aar" + + upload-release-aar: + name: upload-release-aar + needs: build-aar + runs-on: ubuntu-22.04 + timeout-minutes: 10 + permissions: + id-token: write + contents: read + steps: + - name: configure aws credentials + uses: aws-actions/configure-aws-credentials@v1.7.0 + with: + role-to-assume: arn:aws:iam::308535385114:role/gha_executorch_upload-frameworks-android + aws-region: us-east-1 + - name: Upload AAR RC to AWS S3 + shell: bash + run: | + wget https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifacts/llm_demo/executorch.aar + shasum -a 256 executorch.aar > executorch.aar.sha256sums + + pip install awscli==1.32.18 + AWS_CMD="aws s3 cp" + VERSION="${{ inputs.version }}" + VERSION_NAME="${VERSION:-temp_snapshot}" + ${AWS_CMD} executorch.aar s3://ossci-android/executorch/release/${VERSION_NAME}/executorch.aar --acl public-read + ${AWS_CMD} executorch.aar.sha256sums s3://ossci-android/executorch/release/${VERSION_NAME}/executorch.aar.sha256sums --acl public-read diff --git a/.github/workflows/android.yml b/.github/workflows/android.yml index 6ed558d3ad2..54e9dbb7619 100644 --- a/.github/workflows/android.yml +++ b/.github/workflows/android.yml @@ -33,6 +33,7 @@ jobs: ref: ${{ github.event_name == 'pull_request' && 
github.event.pull_request.head.sha || github.sha }} timeout: 90 upload-artifact: android-apps + upload-artifact-to-s3: true script: | set -eux @@ -45,38 +46,6 @@ jobs: # Build LLM Demo for Android bash build/build_android_llm_demo.sh ${ARTIFACTS_DIR_NAME} - # Upload artifacts to S3. The artifacts are needed not only by the device farm but also TorchChat - upload-artifacts: - needs: build-llm-demo - runs-on: linux.2xlarge - steps: - - name: Download the artifacts from GitHub - uses: actions/download-artifact@v3 - with: - # The name here needs to match the name of the upload-artifact parameter - name: android-apps - path: ${{ runner.temp }}/artifacts/ - - - name: Verify the artifacts - shell: bash - working-directory: ${{ runner.temp }}/artifacts/ - run: | - ls -lah ./ - - - name: Upload the artifacts to S3 - uses: seemethere/upload-artifact-s3@v5 - with: - s3-bucket: gha-artifacts - s3-prefix: | - ${{ github.repository }}/${{ github.run_id }}/artifact - # NOTE: Consume stale artifacts won't make sense for benchmarking as the goal is always to - # benchmark models as fresh as possible. I'm okay to keep the 14 retention-days for now - # for TorchChat until we have a periodic job can publish it more often. Ideally I want to - # reduce it to <= 2 day, meaning the benchmark job will run daily. - retention-days: 14 - if-no-files-found: ignore - path: ${{ runner.temp }}/artifacts/ - # Running Android emulator directly on the runner and not using Docker run-emulator: needs: build-llm-demo @@ -141,29 +110,3 @@ jobs: emulator-options: -no-snapshot-save -no-window -gpu swiftshader_indirect -noaudio -no-boot-anim -camera-back none # This is to make sure that the job doesn't fail flakily emulator-boot-timeout: 900 - - # Let's see how expensive this job is, we might want to tone it down by running it periodically - test-llama-app: - # Only PR from ExecuTorch itself has permission to access AWS, forked PRs will fail to - # authenticate with the cloud service - if: ${{ !github.event.pull_request.head.repo.fork }} - needs: upload-artifacts - permissions: - id-token: write - contents: read - uses: pytorch/test-infra/.github/workflows/mobile_job.yml@main - with: - device-type: android - runner: linux.2xlarge - test-infra-ref: '' - # This is the ARN of ExecuTorch project on AWS - project-arn: arn:aws:devicefarm:us-west-2:308535385114:project:02a2cf0f-6d9b-45ee-ba1a-a086587469e6 - # This is the custom Android device pool that only includes Samsung Galaxy S2x - device-pool-arn: arn:aws:devicefarm:us-west-2:308535385114:devicepool:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/e59f866a-30aa-4aa1-87b7-4510e5820dfa - # Uploaded to S3 from the previous job, the name of the app comes from the project itself - android-app-archive: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifact/llm_demo/app-debug.apk - android-test-archive: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifact/llm_demo/app-debug-androidTest.apk - test-spec: https://ossci-android.s3.amazonaws.com/executorch/android-llm-device-farm-test-spec.yml - # Among the input, this is the biggest file, so it is cached on AWS to make the test faster. Note that the file is deleted by AWS after 30 - # days and the job will automatically re-upload the file when that happens. 
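Related, the Android AAR release workflow added earlier in this diff checksums the artifact and uploads it under a versioned S3 prefix, falling back to temp_snapshot when no version input is given. A short sketch of that naming logic, with hashlib standing in for the shasum CLI and placeholder file names:

# Sketch of the release naming used by android-release-artifacts.yml:
# a sha256 checksum file plus a versioned S3 key, defaulting to "temp_snapshot".
import hashlib
from pathlib import Path
from typing import Optional

def sha256sums_line(path: str) -> str:
    digest = hashlib.sha256(Path(path).read_bytes()).hexdigest()
    return f"{digest}  {Path(path).name}\n"

def release_key(version: Optional[str]) -> str:
    version_name = version or "temp_snapshot"
    return f"executorch/release/{version_name}/executorch.aar"

if __name__ == "__main__":
    demo = Path("executorch.aar")  # placeholder file just for the demo
    demo.write_bytes(b"not a real AAR, just demo bytes")
    print(sha256sums_line(str(demo)), end="")
    print(release_key(None))     # executorch/release/temp_snapshot/executorch.aar
    print(release_key("0.4.0"))  # executorch/release/0.4.0/executorch.aar
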
- extra-data: https://ossci-assets.s3.amazonaws.com/executorch-android-llama2-7b-0717.zip diff --git a/.github/workflows/apple-perf.yml b/.github/workflows/apple-perf.yml index e214e33ac1c..463c2cc662a 100644 --- a/.github/workflows/apple-perf.yml +++ b/.github/workflows/apple-perf.yml @@ -74,9 +74,9 @@ jobs: # Separate default values from the workflow dispatch. To ensure defaults are accessible # during scheduled runs and to provide flexibility for different defaults between # on-demand and periodic benchmarking. - CRON_DEFAULT_MODELS: "stories110M" + CRON_DEFAULT_MODELS: "stories110M,mv3,mv2,ic4,ic3,resnet50,edsr,mobilebert,w2l" CRON_DEFAULT_DEVICES: "apple_iphone_15" - CRON_DEFAULT_DELEGATES: "xnnpack" + CRON_DEFAULT_DELEGATES: "xnnpack,coreml" run: | set -ex MODELS="${{ inputs.models }}" @@ -124,11 +124,13 @@ jobs: delegate: ${{ fromJson(needs.set-parameters.outputs.delegates) }} fail-fast: false with: - runner: macos-latest-xlarge + # NB: Need to use our AWS MacOS runner to upload large models to S3 + runner: macos-m1-stable python-version: '3.11' submodules: 'true' timeout: 60 upload-artifact: ios-models + upload-artifact-to-s3: true script: | set -eux @@ -176,34 +178,6 @@ jobs: fi echo "::endgroup::" - upload-models: - needs: export-models - runs-on: linux.2xlarge - if: always() # Continue this job regardless of previous job outcome - steps: - - name: Download the models from GitHub - uses: actions/download-artifact@v3 - with: - # The name here needs to match the name of the upload-artifact parameter - name: ios-models - path: ${{ runner.temp }}/artifacts/ - - - name: Verify the models - shell: bash - working-directory: ${{ runner.temp }}/artifacts/ - run: | - ls -lah ./ - - - name: Upload the models to S3 - uses: seemethere/upload-artifact-s3@v5 - with: - s3-bucket: gha-artifacts - s3-prefix: | - ${{ github.repository }}/${{ github.run_id }}/artifact - retention-days: 1 - if-no-files-found: ignore - path: ${{ runner.temp }}/artifacts/ - build-benchmark-app: name: build-benchmark-app uses: pytorch/test-infra/.github/workflows/macos_job.yml@main @@ -297,7 +271,7 @@ jobs: with: s3-bucket: gha-artifacts s3-prefix: | - ${{ github.repository }}/${{ github.run_id }}/artifact + ${{ github.repository }}/${{ github.run_id }}/artifacts retention-days: 14 if-no-files-found: ignore path: ${{ runner.temp }}/artifacts/ @@ -306,7 +280,7 @@ jobs: needs: - set-parameters - upload-benchmark-app - - upload-models + - export-models permissions: id-token: write contents: read @@ -318,6 +292,8 @@ jobs: device: ${{ fromJson(needs.set-parameters.outputs.devices) }} fail-fast: false with: + # Due to scheduling a job may be pushed beyond the default 60m threshold + timeout: 120 device-type: ios # For iOS testing, the runner just needs to call AWS Device Farm, so there is no need to run this on macOS runner: linux.2xlarge @@ -326,7 +302,7 @@ jobs: project-arn: arn:aws:devicefarm:us-west-2:308535385114:project:02a2cf0f-6d9b-45ee-ba1a-a086587469e6 device-pool-arn: ${{ matrix.device }} # Uploaded to S3 from the previous job - ios-ipa-archive: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifact/Benchmark.ipa - ios-xctestrun-zip: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifact/Benchmark.xctestrun.zip + ios-ipa-archive: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifacts/Benchmark.ipa + ios-xctestrun-zip: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id 
}}/artifacts/Benchmark.xctestrun.zip test-spec: ${{ inputs.test_spec || 'https://ossci-ios.s3.amazonaws.com/executorch/default-ios-device-farm-appium-test-spec.yml' }} - extra-data: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifact/${{ matrix.model }}_${{ matrix.delegate }}/model.zip + extra-data: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifacts/${{ matrix.model }}_${{ matrix.delegate }}/model.zip diff --git a/.github/workflows/apple.yml b/.github/workflows/apple.yml index 2224c2d5159..89f59068d0c 100644 --- a/.github/workflows/apple.yml +++ b/.github/workflows/apple.yml @@ -13,21 +13,13 @@ on: - install_requirements.sh - backends/apple/** - build/build_apple_frameworks.sh + - build/build_apple_llm_demo.sh - build/create_frameworks.sh - build/test_ios_ci.sh - examples/demo-apps/apple_ios/** - extension/apple/** - extension/module/** workflow_dispatch: - # TODO (huydhn): This is used to validate the test spec. Eventually, we need a proper - # perf benchmark workflow like android-perf. This can be cleaned up once that workflow - # is ready - workflow_call: - inputs: - test_spec: - description: The test spec to drive the test on AWS devices - required: false - type: string concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }} @@ -116,7 +108,7 @@ jobs: # Uploaded to S3 from the previous job ios-ipa-archive: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifact/ExecuTorchDemo.ipa ios-xctestrun-zip: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifact/ExecuTorchDemo.xctestrun.zip - test-spec: ${{ inputs.test_spec || 'https://ossci-ios.s3.amazonaws.com/executorch/default-ios-device-farm-appium-test-spec.yml' }} + test-spec: https://ossci-ios.s3.amazonaws.com/executorch/default-ios-device-farm-appium-test-spec.yml build-frameworks-ios: name: build-frameworks-ios @@ -224,3 +216,70 @@ jobs: shasum -a 256 "${FILENAME}" ${AWS_CMD} "${FILENAME}" s3://ossci-ios/executorch/ --acl public-read done + + build-benchmark-app: + name: build-benchmark-app + uses: pytorch/test-infra/.github/workflows/macos_job.yml@main + secrets: inherit + with: + runner: macos-latest-xlarge + python-version: '3.11' + submodules: 'true' + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + upload-artifact: ios-apps + secrets-env: BUILD_CERTIFICATE_BASE64 EXECUTORCH_BENCHMARK_BUILD_PROVISION_PROFILE_BASE64 KEYCHAIN_PASSWORD + timeout: 90 + script: | + set -eux + + echo "::group::Setting up CI environment" + .ci/scripts/setup-conda.sh + + BUILD_TOOL=cmake + # Setup MacOS dependencies as there is no Docker support on MacOS atm + GITHUB_RUNNER=1 PYTHON_EXECUTABLE=python ${CONDA_RUN} --no-capture-output \ + .ci/scripts/setup-macos.sh "${BUILD_TOOL}" + export ARTIFACTS_DIR_NAME=artifacts-to-be-uploaded + + # Setup Apple certificate for iOS development + BUILD_PROVISION_PROFILE_BASE64="${SECRET_EXECUTORCH_BENCHMARK_BUILD_PROVISION_PROFILE_BASE64}" \ + BUILD_CERTIFICATE_BASE64="${SECRET_BUILD_CERTIFICATE_BASE64}" \ + KEYCHAIN_PASSWORD="${SECRET_KEYCHAIN_PASSWORD}" \ + .ci/scripts/setup-ios.sh + + # Install CoreML Backend Requirements + PYTHON_EXECUTABLE=python ${CONDA_RUN} --no-capture-output \ + backends/apple/coreml/scripts/install_requirements.sh + + # Install MPS Backend Requirements + 
PYTHON_EXECUTABLE=python ${CONDA_RUN} --no-capture-output \ + backends/apple/mps/install_requirements.sh + echo "::endgroup::" + + echo "::group::Build ExecuTorch iOS frameworks" + FRAMEWORKS=( + "executorch" + "backend_coreml" + "backend_mps" + "backend_xnnpack" + "kernels_custom" + "kernels_optimized" + "kernels_portable" + "kernels_quantized" + ) + + # Build Release iOS Frameworks + PYTHON_EXECUTABLE=python ${CONDA_RUN} --no-capture-output \ + build/build_apple_frameworks.sh --coreml --custom --mps --optimized --portable --quantized --xnnpack + + mkdir -p extension/apple/Benchmark/Frameworks + for FRAMEWORK in "${FRAMEWORKS[@]}"; do ( + cp -r "cmake-out/${FRAMEWORK}.xcframework" extension/apple/Benchmark/Frameworks/ + ) done + echo "::endgroup::" + + echo "::group::Build ExecuTorch benchmark app" + mkdir -p extension/apple/Benchmark/Models + ${CONDA_RUN} --no-capture-output \ + build/build_apple_llm_demo.sh ${ARTIFACTS_DIR_NAME} + echo "::endgroup::" diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index ca13d9bbd22..f7d2b627bc5 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml @@ -91,6 +91,13 @@ jobs: dtype: [fp32] build-tool: [buck2, cmake] mode: [portable, xnnpack+custom, xnnpack+custom+qe] + include: + - dtype: bf16 + build-tool: cmake + mode: portable + - dtype: bf16 + build-tool: buck2 + mode: portable fail-fast: false with: runner: linux.2xlarge @@ -407,3 +414,30 @@ jobs: PYTHON_EXECUTABLE=python bash examples/models/llama2/install_requirements.sh # Test llama2 PYTHON_EXECUTABLE=python bash .ci/scripts/test_llama.sh stories110M "${BUILD_TOOL}" "${DTYPE}" "${MODE}" + + test-phi-3-mini-runner-linux: + name: test-phi-3-mini-runner-linux + uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + strategy: + fail-fast: false + with: + runner: linux.24xlarge + docker-image: executorch-ubuntu-22.04-clang12 + submodules: 'true' + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + timeout: 90 + script: | + # The generic Linux job chooses to use base env, not the one setup by the image + CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]") + conda activate "${CONDA_ENV}" + + PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh "cmake" + + # install pybind + bash install_requirements.sh --pybind xnnpack + + # install phi-3-mini requirements + bash examples/models/phi-3-mini/install_requirements.sh + + # run e2e (export, tokenizer and runner) + PYTHON_EXECUTABLE=python bash .ci/scripts/test_phi_3_mini.sh diff --git a/.github/workflows/trunk.yml b/.github/workflows/trunk.yml index c1a0d175d04..d7130561fa6 100644 --- a/.github/workflows/trunk.yml +++ b/.github/workflows/trunk.yml @@ -223,8 +223,10 @@ jobs: strategy: matrix: dtype: [fp32] - build-tool: [buck2, cmake] mode: [portable, xnnpack+kv+custom, mps, coreml] + include: + - dtype: bf16 + mode: portable fail-fast: false with: runner: macos-m1-stable @@ -235,25 +237,12 @@ jobs: script: | DTYPE=${{ matrix.dtype }} - BUILD_TOOL=${{ matrix.build-tool }} MODE=${{ matrix.mode }} - if [[ "${BUILD_TOOL}" == "buck2" ]]; then - # TODO: Will add more modes that don't support buck2 - if [[ "${MODE}" == "mps" ]]; then - echo "mps doesn't support buck2." - exit 0 - fi - if [[ "${MODE}" == "coreml" ]]; then - echo "coreml doesn't support buck2." 
- exit 0 - fi - fi - bash .ci/scripts/setup-conda.sh # Setup executorch - PYTHON_EXECUTABLE=python ${CONDA_RUN} bash .ci/scripts/setup-macos.sh "${BUILD_TOOL}" + PYTHON_EXECUTABLE=python ${CONDA_RUN} bash .ci/scripts/setup-macos.sh cmake if [[ "${MODE}" == "mps" ]]; then # Install mps delegate @@ -268,7 +257,7 @@ jobs: # Install requirements for export_llama PYTHON_EXECUTABLE=python ${CONDA_RUN} bash examples/models/llama2/install_requirements.sh # Test llama2 - PYTHON_EXECUTABLE=python ${CONDA_RUN} bash .ci/scripts/test_llama.sh stories110M "${BUILD_TOOL}" "${DTYPE}" "${MODE}" + PYTHON_EXECUTABLE=python ${CONDA_RUN} bash .ci/scripts/test_llama.sh stories110M cmake "${DTYPE}" "${MODE}" # # TODO(jackzhxng): Runner consistently runs out of memory before test finishes. Try to find a more powerful runner. # test-llava-runner-macos: @@ -351,3 +340,97 @@ jobs: PYTHON_EXECUTABLE=python ${CONDA_RUN} bash .ci/scripts/test_model.sh "${MODEL_NAME}" "${BUILD_TOOL}" "${BACKEND}" echo "::endgroup::" done + + test-huggingface-transformers: + name: test-huggingface-transformers + uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + secrets: inherit + strategy: + matrix: + hf_model_repo: [google/gemma-2b] + fail-fast: false + with: + secrets-env: EXECUTORCH_HF_TOKEN + runner: linux.12xlarge + docker-image: executorch-ubuntu-22.04-clang12 + submodules: 'true' + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + timeout: 90 + script: | + echo "::group::Set up ExecuTorch" + # The generic Linux job chooses to use base env, not the one setup by the image + CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]") + conda activate "${CONDA_ENV}" + PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh cmake + + echo "Installing libexecutorch.a, libextension_module.so, libportable_ops_lib.a" + rm -rf cmake-out + cmake \ + -DCMAKE_INSTALL_PREFIX=cmake-out \ + -DCMAKE_BUILD_TYPE=Release \ + -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ + -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ + -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ + -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \ + -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ + -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \ + -DEXECUTORCH_BUILD_XNNPACK=ON \ + -DPYTHON_EXECUTABLE=python \ + -Bcmake-out . + cmake --build cmake-out -j9 --target install --config Release + + echo "Build llama runner" + dir="examples/models/llama2" + cmake \ + -DCMAKE_INSTALL_PREFIX=cmake-out \ + -DCMAKE_BUILD_TYPE=Release \ + -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \ + -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ + -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \ + -DEXECUTORCH_BUILD_XNNPACK=ON \ + -DPYTHON_EXECUTABLE=python \ + -Bcmake-out/${dir} \ + ${dir} + cmake --build cmake-out/${dir} -j9 --config Release + echo "::endgroup::" + + echo "::group::Set up HuggingFace Dependencies" + if [ -z "$SECRET_EXECUTORCH_HF_TOKEN" ]; then + echo "::error::SECRET_EXECUTORCH_HF_TOKEN is empty. For security reason secrets won't be accessible on forked PRs. Please make sure you submit a non-forked PR." 
+ exit 1 + fi + pip install -U "huggingface_hub[cli]" + huggingface-cli login --token $SECRET_EXECUTORCH_HF_TOKEN + pip install accelerate sentencepiece + # TODO(guangyang): Switch to use released transformers library after all required patches are included + pip install "git+https://github.com/huggingface/transformers.git@6cc4dfe3f1e8d421c6d6351388e06e9b123cbfe1" + pip list + echo "::endgroup::" + + echo "::group::Export to ExecuTorch" + TOKENIZER_FILE=tokenizer.model + TOKENIZER_BIN_FILE=tokenizer.bin + ET_MODEL_NAME=et_model + # Fetch the file using a Python one-liner + DOWNLOADED_TOKENIZER_FILE_PATH=$(python -c " + from huggingface_hub import hf_hub_download + # Download the file from the Hugging Face Hub + downloaded_path = hf_hub_download( + repo_id='${{ matrix.hf_model_repo }}', + filename='${TOKENIZER_FILE}' + ) + print(downloaded_path) + ") + if [ -f "$DOWNLOADED_TOKENIZER_FILE_PATH" ]; then + echo "${TOKENIZER_FILE} downloaded successfully at: $DOWNLOADED_TOKENIZER_FILE_PATH" + python -m extension.llm.tokenizer.tokenizer -t $DOWNLOADED_TOKENIZER_FILE_PATH -o ./${TOKENIZER_BIN_FILE} + ls ./tokenizer.bin + else + echo "Failed to download ${TOKENIZER_FILE} from ${{ matrix.hf_model_repo }}." + exit 1 + fi + + python -m extension.export_util.export_hf_model -hfm=${{ matrix.hf_model_repo }} -o ${ET_MODEL_NAME} + + cmake-out/examples/models/llama2/llama_main --model_path=${ET_MODEL_NAME}.pte --tokenizer_path=${TOKENIZER_BIN_FILE} --prompt="My name is" + echo "::endgroup::" diff --git a/.github/workflows/upload-android-test-specs.yml b/.github/workflows/upload-android-test-specs.yml index dd6bcca4309..e9b1054080c 100644 --- a/.github/workflows/upload-android-test-specs.yml +++ b/.github/workflows/upload-android-test-specs.yml @@ -13,7 +13,9 @@ on: - extension/android/benchmark/android-llm-device-farm-test-spec.yml concurrency: - group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }} + # NB: This concurency group needs to be different than the one used in android-perf, otherwise + # GH complains about concurrency deadlock + group: android-spec-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }} cancel-in-progress: true jobs: @@ -27,7 +29,7 @@ jobs: with: s3-bucket: gha-artifacts s3-prefix: | - ${{ github.repository }}/${{ github.run_id }}/artifact + ${{ github.repository }}/${{ github.run_id }}/artifacts retention-days: 1 if-no-files-found: error path: extension/android/benchmark/android-llm-device-farm-test-spec.yml @@ -43,7 +45,7 @@ jobs: models: stories110M devices: samsung_galaxy_s22 delegates: xnnpack - test_spec: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifact/android-llm-device-farm-test-spec.yml + test_spec: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifacts/android-llm-device-farm-test-spec.yml upload-android-test-spec: needs: validate-android-test-spec diff --git a/.github/workflows/upload-apple-test-specs.yml b/.github/workflows/upload-apple-test-specs.yml index f5db9a04a60..06d20ef2beb 100644 --- a/.github/workflows/upload-apple-test-specs.yml +++ b/.github/workflows/upload-apple-test-specs.yml @@ -13,7 +13,9 @@ on: - 
examples/demo-apps/apple_ios/default-ios-device-farm-appium-test-spec.yml concurrency: - group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }} + # NB: This concurency group needs to be different than the one used in apple-perf, otherwise + # GH complains about concurrency deadlock + group: apple-spec-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }} cancel-in-progress: true jobs: @@ -27,22 +29,24 @@ jobs: with: s3-bucket: gha-artifacts s3-prefix: | - ${{ github.repository }}/${{ github.run_id }}/artifact + ${{ github.repository }}/${{ github.run_id }}/artifacts retention-days: 1 if-no-files-found: error path: examples/demo-apps/apple_ios/default-ios-device-farm-appium-test-spec.yml - # TODO (huydhn): An example on how to validate the test spec using the iOS demo app, but we need a proper - # perf benchmark workflow like android-perf validate-apple-test-spec: needs: upload-apple-test-spec-for-validation - uses: ./.github/workflows/apple.yml + uses: ./.github/workflows/apple-perf.yml secrets: inherit permissions: id-token: write contents: read with: - test_spec: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifact/default-ios-device-farm-appium-test-spec.yml + # Just use a small model here with a minimal amount of configuration to test the spec + models: stories110M + devices: apple_iphone_15 + delegates: xnnpack + test_spec: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifacts/default-ios-device-farm-appium-test-spec.yml upload-apple-test-spec: needs: validate-apple-test-spec diff --git a/.gitignore b/.gitignore index bd3528a4c4b..176edf9300b 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,7 @@ .hypothesis buck-out/ -cmake-out/ +cmake-out* +.DS_Store cmake-android-out/ cmake-out-android/ cmake-ios-out/ diff --git a/.lintrunner.toml b/.lintrunner.toml index c28512c5986..7aa15d65638 100644 --- a/.lintrunner.toml +++ b/.lintrunner.toml @@ -74,6 +74,9 @@ exclude_patterns = [ # NB: Objective-C is not supported 'examples/apple/**', 'examples/demo-apps/apple_ios/**', + # File contains @generated + 'extension/llm/custom_ops/spinquant/fast_hadamard_transform_special.h', + 'extension/llm/custom_ops/spinquant/test/fast_hadamard_transform_special_unstrided_cpu.h', ] command = [ 'python', @@ -177,6 +180,9 @@ exclude_patterns = [ '**/*.bat', '**/*.jpg', '**/*.jar', + # File contains @generated + 'extension/llm/custom_ops/spinquant/fast_hadamard_transform_special.h', + 'extension/llm/custom_ops/spinquant/test/fast_hadamard_transform_special_unstrided_cpu.h', ] command = [ 'python', diff --git a/CMakeLists.txt b/CMakeLists.txt index add38ec56e0..020cd2cb2f0 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -183,6 +183,8 @@ option(EXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL "Build the Runner Util extension" option(EXECUTORCH_BUILD_EXTENSION_TENSOR "Build the Tensor extension" OFF) +option(EXECUTORCH_BUILD_EXTENSION_TRAINING "Build the training extension" OFF) + option(EXECUTORCH_BUILD_GTESTS "Build googletest based test binaries" OFF) option(EXECUTORCH_BUILD_MPS "Build the MPS backend" OFF) @@ -197,7 +199,7 @@ option(EXECUTORCH_BUILD_KERNELS_OPTIMIZED "Build the optimized kernels" OFF) 
option(EXECUTORCH_BUILD_KERNELS_QUANTIZED "Build the quantized kernels" OFF) -option(EXECUTORCH_BUILD_SDK "Build the ExecuTorch Developer Tools") +option(EXECUTORCH_BUILD_DEVTOOLS "Build the ExecuTorch Developer Tools") option(EXECUTORCH_BUILD_SIZE_TEST "Build the size test" OFF) @@ -456,7 +458,7 @@ endif() add_subdirectory(schema) # -# executorch_no_prim_ops: Minimal runtime library +# executorch_core: Minimal runtime library # # The bare-minimum runtime library, supporting the Program and Method # interfaces. Does not contain any operators, including primitive ops. Does not @@ -464,14 +466,14 @@ add_subdirectory(schema) # # Remove any PAL-definition files from the sources. -list(FILTER _executorch_no_prim_ops__srcs EXCLUDE REGEX +list(FILTER _executorch_core__srcs EXCLUDE REGEX "runtime/platform/default/[^/]*.cpp$" ) # Add the source file that maps to the requested default PAL implementation. if(EXECUTORCH_PAL_DEFAULT MATCHES "^(posix|minimal)$") message(STATUS "executorch: Using PAL default '${EXECUTORCH_PAL_DEFAULT}'") - list(APPEND _executorch_no_prim_ops__srcs + list(APPEND _executorch_core__srcs "runtime/platform/default/${EXECUTORCH_PAL_DEFAULT}.cpp" ) else() @@ -481,45 +483,49 @@ else() ) endif() -add_library(executorch_no_prim_ops ${_executorch_no_prim_ops__srcs}) -target_link_libraries(executorch_no_prim_ops PRIVATE program_schema) +add_library(executorch_core ${_executorch_core__srcs}) + +# Legacy name alias. +add_library(executorch_no_prim_ops ALIAS executorch_core) + +target_link_libraries(executorch_core PRIVATE program_schema) if(EXECUTORCH_USE_DL) # Check if dl exists for this toolchain and only then link it. find_library(DL_LIBRARY_EXISTS NAMES dl) # Check if the library was found if(DL_LIBRARY_EXISTS) - target_link_libraries(executorch_no_prim_ops PRIVATE dl) # For dladdr() + target_link_libraries(executorch_core PRIVATE dl) # For dladdr() endif() endif() target_include_directories( - executorch_no_prim_ops PUBLIC ${_common_include_directories} + executorch_core PUBLIC ${_common_include_directories} ) -target_compile_options(executorch_no_prim_ops PUBLIC ${_common_compile_options}) +target_compile_options(executorch_core PUBLIC ${_common_compile_options}) if(MAX_KERNEL_NUM) target_compile_definitions( - executorch_no_prim_ops PRIVATE MAX_KERNEL_NUM=${MAX_KERNEL_NUM} + executorch_core PRIVATE MAX_KERNEL_NUM=${MAX_KERNEL_NUM} ) endif() if(EXECUTORCH_BUILD_PYBIND AND APPLE) # shared version add_library( - executorch_no_prim_ops_shared SHARED ${_executorch_no_prim_ops__srcs} + executorch_core_shared SHARED ${_executorch_core__srcs} ) - target_link_libraries(executorch_no_prim_ops_shared PRIVATE program_schema) + target_link_libraries(executorch_core_shared PRIVATE program_schema) if(DL_LIBRARY_EXISTS) # For dladdr() - target_link_libraries(executorch_no_prim_ops_shared PRIVATE dl) + target_link_libraries(executorch_core_shared PRIVATE dl) endif() target_include_directories( - executorch_no_prim_ops_shared PUBLIC ${_common_include_directories} + executorch_core_shared PUBLIC ${_common_include_directories} ) target_compile_options( - executorch_no_prim_ops_shared PUBLIC ${_common_compile_options} + executorch_core_shared PUBLIC ${_common_compile_options} ) if(MAX_KERNEL_NUM) target_compile_definitions( - executorch_no_prim_ops_shared PRIVATE MAX_KERNEL_NUM=${MAX_KERNEL_NUM} + executorch_core_shared PRIVATE MAX_KERNEL_NUM=${MAX_KERNEL_NUM} ) endif() endif() @@ -532,7 +538,7 @@ endif() # any backends. 
# add_library(executorch ${_executorch__srcs}) -target_link_libraries(executorch PRIVATE executorch_no_prim_ops) +target_link_libraries(executorch PRIVATE executorch_core) target_include_directories(executorch PUBLIC ${_common_include_directories}) target_compile_options(executorch PUBLIC ${_common_compile_options}) target_link_options_shared_lib(executorch) @@ -568,7 +574,7 @@ endif() # Install `executorch` library as well as `executorch-config.cmake` under # ${CMAKE_INSTALL_PREFIX}/ install( - TARGETS executorch executorch_no_prim_ops + TARGETS executorch executorch_core DESTINATION lib INCLUDES DESTINATION ${_common_include_directories} @@ -616,7 +622,7 @@ if(EXECUTORCH_BUILD_XNNPACK) add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/xnnpack) endif() -if(EXECUTORCH_BUILD_SDK) +if(EXECUTORCH_BUILD_DEVTOOLS) set(EXECUTORCH_BUILD_EXTENSION_DATA_LOADER ON CACHE BOOL "EXECUTORCH_BUILD_EXTENSION_DATA_LOADER" FORCE @@ -636,6 +642,10 @@ if(EXECUTORCH_BUILD_EXTENSION_MODULE) add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/module) endif() +if(EXECUTORCH_BUILD_EXTENSION_TRAINING) + add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/training) +endif() + if(EXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL) add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/runner_util) endif() @@ -658,7 +668,7 @@ if(EXECUTORCH_BUILD_PYBIND) add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/data_loader) endif() - if(NOT EXECUTORCH_BUILD_SDK) + if(NOT EXECUTORCH_BUILD_DEVTOOLS) add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/devtools) endif() @@ -674,11 +684,16 @@ if(EXECUTORCH_BUILD_PYBIND) etdump executorch extension_data_loader - portable_ops_lib util torch ) + if(EXECUTORCH_BUILD_KERNELS_OPTIMIZED) + list(APPEND _dep_libs optimized_native_cpu_ops_lib) + else() + list(APPEND _dep_libs portable_ops_lib) + endif() + if(EXECUTORCH_BUILD_COREML) list(APPEND _dep_libs coremldelegate) endif() @@ -712,7 +727,7 @@ if(EXECUTORCH_BUILD_PYBIND) util PUBLIC ${_common_include_directories} ${TORCH_INCLUDE_DIRS} ) target_compile_options(util PUBLIC ${_pybind_compile_options}) - target_link_libraries(util PRIVATE torch c10 executorch) + target_link_libraries(util PRIVATE torch c10 executorch extension_tensor) # pybind portable_lib pybind11_add_module(portable_lib SHARED extension/pybindings/pybindings.cpp) diff --git a/README.md b/README.md index 6368c873f62..e9ab0773a11 100644 --- a/README.md +++ b/README.md @@ -22,6 +22,11 @@ please visit our documentation website [for the latest release](https://pytorch. Check out the [Getting Started](https://pytorch.org/executorch/stable/getting-started-setup.html#quick-setup-colab-jupyter-notebook-prototype) page for a quick spin. +Check out the examples of [Llama](./examples/models/llama2/README.md), [Llava](./examples/models/llava/README.md) and [other models](./examples/README.md) running on edge devices using ExecuTorch. + + +**[UPDATE - 09/25]** We have added support for running [Llama 3.2 1B/3B](./examples/models/llama2/README.md) models via ExecuTorch. + ## Feedback We welcome any feedback, suggestions, and bug reports from the community to help diff --git a/backends/apple/coreml/CMakeLists.txt b/backends/apple/coreml/CMakeLists.txt index 7f927284cdd..59f7f473ffe 100644 --- a/backends/apple/coreml/CMakeLists.txt +++ b/backends/apple/coreml/CMakeLists.txt @@ -13,7 +13,7 @@ if(NOT EXECUTORCH_ROOT) set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../..) 
endif() -if(EXECUTORCH_BUILD_SDK) +if(EXECUTORCH_BUILD_DEVTOOLS) # protobuf requires frtti set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -frtti") endif() @@ -134,9 +134,9 @@ target_include_directories( coremldelegate PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/runtime/util ) target_include_directories(coremldelegate PRIVATE ${EXECUTORCH_ROOT}/..) -target_link_libraries(coremldelegate PRIVATE executorch_no_prim_ops) +target_link_libraries(coremldelegate PRIVATE executorch_core) -if(EXECUTORCH_BUILD_SDK) +if(EXECUTORCH_BUILD_DEVTOOLS) target_sources(coremldelegate PRIVATE ${SDK_SOURCES} ${PROTOBUF_SOURCES}) target_include_directories( coremldelegate @@ -159,7 +159,7 @@ find_library(SQLITE_LIBRARY sqlite3) target_link_libraries( coremldelegate - PRIVATE executorch_no_prim_ops ${ACCELERATE_FRAMEWORK} ${COREML_FRAMEWORK} + PRIVATE executorch_core ${ACCELERATE_FRAMEWORK} ${COREML_FRAMEWORK} ${FOUNDATION_FRAMEWORK} ${SQLITE_LIBRARY} ) @@ -174,9 +174,9 @@ endif() target_compile_options(coremldelegate PRIVATE "-fobjc-arc") target_compile_options(coremldelegate PRIVATE "-fno-exceptions") -if(EXECUTORCH_BUILD_SDK) +if(EXECUTORCH_BUILD_DEVTOOLS) target_compile_options( - executorch_no_prim_ops PUBLIC -DET_EVENT_TRACER_ENABLED + executorch_core PUBLIC -DET_EVENT_TRACER_ENABLED ) target_compile_options(coremldelegate PRIVATE "-frtti") target_compile_options(libprotobuf-lite PRIVATE "-frtti") diff --git a/backends/apple/coreml/runtime/delegate/ETCoreMLModel.h b/backends/apple/coreml/runtime/delegate/ETCoreMLModel.h index 0bbd1132e9f..58026593462 100644 --- a/backends/apple/coreml/runtime/delegate/ETCoreMLModel.h +++ b/backends/apple/coreml/runtime/delegate/ETCoreMLModel.h @@ -8,10 +8,6 @@ #import #import -#if !defined(MODEL_STATE_IS_SUPPORTED) && __has_include() -#define MODEL_STATE_IS_SUPPORTED 1 -#endif - NS_ASSUME_NONNULL_BEGIN @class ETCoreMLAsset; @@ -45,7 +41,7 @@ __attribute__((objc_subclassing_restricted)) @property (strong, readonly, nonatomic) MLModel* mlModel; /// The model state. -@property (strong, readonly, nonatomic) id state API_AVAILABLE(macos(15.0), ios(18.0), tvos(18.0), watchos(11.0)); +@property (strong, readonly, nonatomic, nullable) id state; /// The asset from which the model is loaded. 
@property (strong, readonly, nonatomic) ETCoreMLAsset* asset; diff --git a/backends/apple/coreml/runtime/delegate/ETCoreMLModel.mm b/backends/apple/coreml/runtime/delegate/ETCoreMLModel.mm index 250d5cd951a..6b39ae5f920 100644 --- a/backends/apple/coreml/runtime/delegate/ETCoreMLModel.mm +++ b/backends/apple/coreml/runtime/delegate/ETCoreMLModel.mm @@ -157,6 +157,19 @@ size_t get_number_of_bytes(MLMultiArrayDataType data_type) { return get_multi_array_constraints_by_name(description.outputDescriptionsByName); } +#if MODEL_STATE_IS_SUPPORTED +API_AVAILABLE(macos(15.0), ios(18.0), tvos(18.0), watchos(11.0)) +void reset_state_for_feature_name(NSString *feature_name, MLState *state) { + [state getMultiArrayForStateNamed:feature_name handler:^(MLMultiArray *buffer) { + [buffer getMutableBytesWithHandler:^(void *mutableBytes, NSInteger size, NSArray * __unused strides) { + uint8_t *start = reinterpret_cast(mutableBytes); + uint8_t *end = start + size; + std::fill(start, end, uint8_t(0)); + }]; + }]; +} +#endif + } #pragma mark - ETCoreMLModel @@ -282,7 +295,6 @@ MultiArray buffer(mutableBytes, MultiArray::MemoryLayout(to_multiarray_data_type - (nullable id)predictionFromFeatures:(id)input options:(MLPredictionOptions *)options error:(NSError **)error { - #if MODEL_STATE_IS_SUPPORTED if (@available(macOS 15.0, iOS 18.0, tvOS 18.0, watchOS 11.0, *)) { if (self.state != nil) { @@ -294,21 +306,16 @@ MultiArray buffer(mutableBytes, MultiArray::MemoryLayout(to_multiarray_data_type } #endif - return [self.mlModel predictionFromFeatures:input - options:options - error:error]; + id result = [self.mlModel predictionFromFeatures:input + options:options + error:error]; + + return result; } - (BOOL)prewarmAndReturnError:(NSError* __autoreleasing*)error { - BOOL prewarm = YES; -#if MODEL_STATE_IS_SUPPORTED - if (@available(macOS 15.0, iOS 18.0, tvOS 18.0, watchOS 11.0, *)) { - prewarm = (self.mlModel.modelDescription.stateDescriptionsByName.count == 0); - } -#endif - NSError *localError = nil; - BOOL result = prewarm ? [self.mlModel prewarmAndReturnError:&localError] : NO; + BOOL result = [self.mlModel prewarmUsingState:self.state error:error]; if (!result) { ETCoreMLLogError(localError, "%@: Failed to prewarm model with identifier = %@", @@ -316,6 +323,16 @@ - (BOOL)prewarmAndReturnError:(NSError* __autoreleasing*)error { self.identifier); } +#if MODEL_STATE_IS_SUPPORTED + if (@available(macOS 15.0, iOS 18.0, tvOS 18.0, watchOS 11.0, *)) { + NSDictionary *stateDescriptions = self.mlModel.modelDescription.stateDescriptionsByName; + [stateDescriptions enumerateKeysAndObjectsUsingBlock:^(NSString *featureName, MLFeatureDescription * __unused obj, BOOL * __unused stop) { + reset_state_for_feature_name(featureName, (MLState *) self.state); + }]; + } +#endif + + if (error) { *error = localError; } diff --git a/backends/apple/coreml/runtime/delegate/ETCoreMLModelManager.mm b/backends/apple/coreml/runtime/delegate/ETCoreMLModelManager.mm index 2a5c3ed6961..cd0fbc86f99 100644 --- a/backends/apple/coreml/runtime/delegate/ETCoreMLModelManager.mm +++ b/backends/apple/coreml/runtime/delegate/ETCoreMLModelManager.mm @@ -669,16 +669,15 @@ - (void)addPrewarmedAsset:(ETCoreMLAsset *)asset { error:&localError]; // Try without output backings. 
if (!modelOutputs && predictionOptions.outputBackings.count > 0) { - localError = nil; executor.ignoreOutputBackings = YES; + localError = nil; + modelOutputs = [executor executeModelWithInputs:inputFeatures + predictionOptions:predictionOptions + loggingOptions:loggingOptions + eventLogger:eventLogger + error:&localError]; } - - modelOutputs = [executor executeModelWithInputs:inputFeatures - predictionOptions:predictionOptions - loggingOptions:loggingOptions - eventLogger:eventLogger - error:&localError]; - + if (error) { *error = localError; } diff --git a/backends/apple/coreml/runtime/delegate/MLModel_Prewarm.h b/backends/apple/coreml/runtime/delegate/MLModel_Prewarm.h index c066608b893..6caf99507dc 100644 --- a/backends/apple/coreml/runtime/delegate/MLModel_Prewarm.h +++ b/backends/apple/coreml/runtime/delegate/MLModel_Prewarm.h @@ -8,6 +8,9 @@ #import +#if !defined(MODEL_STATE_IS_SUPPORTED) && __has_include() +#define MODEL_STATE_IS_SUPPORTED 1 +#endif NS_ASSUME_NONNULL_BEGIN @@ -15,9 +18,10 @@ NS_ASSUME_NONNULL_BEGIN /// Pre-warms the model by running a prediction with zeroed-out inputs. /// +/// @param state The model state. /// @param error On failure, error is filled with the failure information. /// @retval `YES` if the prediction succeeded otherwise `NO`. -- (BOOL)prewarmAndReturnError:(NSError* __autoreleasing*)error; +- (BOOL)prewarmUsingState:(nullable id)state error:(NSError* __autoreleasing*)error; @end diff --git a/backends/apple/coreml/runtime/delegate/MLModel_Prewarm.mm b/backends/apple/coreml/runtime/delegate/MLModel_Prewarm.mm index 97d0400796f..d6f59666cf0 100644 --- a/backends/apple/coreml/runtime/delegate/MLModel_Prewarm.mm +++ b/backends/apple/coreml/runtime/delegate/MLModel_Prewarm.mm @@ -71,16 +71,28 @@ + (MLMultiArray *)zeroedMultiArrayWithShape:(NSArray *)shape @implementation MLModel (Prewarm) -- (BOOL)prewarmAndReturnError:(NSError * __autoreleasing *)error { +- (BOOL)prewarmUsingState:(nullable id)state error:(NSError * __autoreleasing *)error { @autoreleasepool { id inputs = ::get_zeroed_inputs(self, error); if (!inputs) { return NO; } - id outputs = [self predictionFromFeatures:inputs error:error]; + + id outputs = nil; + if (state != nil) { +#if MODEL_STATE_IS_SUPPORTED + if (@available(macOS 15.0, iOS 18.0, tvOS 18.0, watchOS 11.0, *)) { + outputs = [self predictionFromFeatures:inputs usingState:(MLState *)state error:error]; + return outputs != nil; + } +#endif + } + + outputs = [self predictionFromFeatures:inputs error:error]; return outputs != nil; } } + @end diff --git a/backends/apple/coreml/runtime/test/CoreMLBackendDelegateTests.mm b/backends/apple/coreml/runtime/test/CoreMLBackendDelegateTests.mm index d7218905fc2..691d4d726ed 100644 --- a/backends/apple/coreml/runtime/test/CoreMLBackendDelegateTests.mm +++ b/backends/apple/coreml/runtime/test/CoreMLBackendDelegateTests.mm @@ -13,6 +13,8 @@ #import #import +#import "MLModel_Prewarm.h" + static constexpr size_t kRuntimeMemorySize = 50 * 1024U * 1024U; // 50 MB using namespace torch::executor; @@ -184,20 +186,28 @@ - (void)executeModelAtURL:(NSURL *)modelURL nLoads:(NSUInteger)nLoads nExecution - (void)testAddProgramExecute { NSURL *modelURL = [[self class] bundledResourceWithName:@"add_coreml_all" extension:@"pte"]; XCTAssertNotNil(modelURL); - [self executeModelAtURL:modelURL nLoads:5 nExecutions:2]; + [self executeModelAtURL:modelURL nLoads:1 nExecutions:2]; } - (void)testMulProgramExecute { NSURL *modelURL = [[self class] bundledResourceWithName:@"mul_coreml_all" extension:@"pte"]; 
XCTAssertNotNil(modelURL); - [self executeModelAtURL:modelURL nLoads:5 nExecutions:2]; + [self executeModelAtURL:modelURL nLoads:1 nExecutions:2]; } - (void)testMV3ProgramExecute { NSURL *modelURL = [[self class] bundledResourceWithName:@"mv3_coreml_all" extension:@"pte"]; XCTAssertNotNil(modelURL); - [self executeModelAtURL:modelURL nLoads:5 nExecutions:2]; + [self executeModelAtURL:modelURL nLoads:1 nExecutions:2]; +} + +#if MODEL_STATE_IS_SUPPORTED +- (void)testStateProgramExecute { + NSURL *modelURL = [[self class] bundledResourceWithName:@"state_coreml_all" extension:@"pte"]; + XCTAssertNotNil(modelURL); + [self executeModelAtURL:modelURL nLoads:1 nExecutions:2]; } +#endif - (void)executeMultipleModelsConcurrently:(NSArray *)modelURLs nLoads:(NSUInteger)nLoads diff --git a/backends/apple/coreml/runtime/test/export_stateful_model.py b/backends/apple/coreml/runtime/test/export_stateful_model.py new file mode 100644 index 00000000000..61d1a93980f --- /dev/null +++ b/backends/apple/coreml/runtime/test/export_stateful_model.py @@ -0,0 +1,77 @@ +# Copyright © 2024 Apple Inc. All rights reserved. +# +# Please refer to the license found in the LICENSE file in the root directory of the source tree. + +import os +from pathlib import Path + +import coremltools as ct +import executorch.exir as exir + +import torch + +from executorch.backends.apple.coreml.compiler import CoreMLBackend +from executorch.backends.apple.coreml.partition import CoreMLPartitioner +from torch.export import export + + +class StatefulModel(torch.nn.Module): + def __init__( + self, + embedding_dim: int, + max_seq_len: int, + ): + super().__init__() + self.register_buffer( + "cache", torch.zeros((max_seq_len, embedding_dim), dtype=torch.float32) + ) + + def forward( + self, + q: torch.Tensor, + k_val: torch.Tensor, + input_pos: torch.Tensor, + ): + q_T = q.transpose(0, 1) + k = torch.ops.aten.index_put_(self.cache, [input_pos, None], k_val) + attn = k.mm(q_T) + return attn + + +def main() -> None: + embedding_dim = 3 + max_seq_len = 2 + model = StatefulModel(embedding_dim=embedding_dim, max_seq_len=max_seq_len) + example_inputs = ( + torch.randn((1, embedding_dim)), + torch.randn((1, embedding_dim)), + torch.tensor([0]), + ) + exported_model = export(model, example_inputs) + edge_program_manager = exir.to_edge(exported_model) + compile_specs = CoreMLBackend.generate_compile_specs( + compute_precision=ct.precision.FLOAT16, + compute_unit=ct.ComputeUnit.ALL, + minimum_deployment_target=ct.target.iOS18, + ) + + partitioner = CoreMLPartitioner( + skip_ops_for_coreml_delegation=None, + compile_specs=compile_specs, + ) + + delegated_program_manager = edge_program_manager.to_backend(partitioner) + exec_program = delegated_program_manager.to_executorch( + config=exir.ExecutorchBackendConfig(extract_delegate_segments=True) + ) + + buffer = exec_program.buffer + models_dir = Path(os.path.dirname(os.path.realpath(__file__))) / "models" + models_dir.mkdir(parents=False, exist_ok=True) + file_path = models_dir / "state_coreml_all.pte" + with open(file_path.resolve(), "wb") as file: + file.write(buffer) + + +if __name__ == "__main__": + main() # pragma: no cover diff --git a/backends/apple/coreml/runtime/workspace/executorchcoreml.xcodeproj/project.pbxproj b/backends/apple/coreml/runtime/workspace/executorchcoreml.xcodeproj/project.pbxproj index c347c56db03..1cb29d7c962 100644 --- a/backends/apple/coreml/runtime/workspace/executorchcoreml.xcodeproj/project.pbxproj +++ 
b/backends/apple/coreml/runtime/workspace/executorchcoreml.xcodeproj/project.pbxproj @@ -7,6 +7,7 @@ objects = { /* Begin PBXBuildFile section */ + 8307EB8A2C9262060011AE6D /* state_coreml_all.pte in Resources */ = {isa = PBXBuildFile; fileRef = 8307EB892C9262060011AE6D /* state_coreml_all.pte */; }; 83BB78A02C65DA7300274ED7 /* ETCoreMLModelDebugInfo.mm in Sources */ = {isa = PBXBuildFile; fileRef = 83BB789F2C65DA7300274ED7 /* ETCoreMLModelDebugInfo.mm */; }; 83BB78BF2C66AAAE00274ED7 /* add_mul_coreml_all.bin in Resources */ = {isa = PBXBuildFile; fileRef = 83BB78BD2C66AAAE00274ED7 /* add_mul_coreml_all.bin */; }; 83BB78C02C66AAAE00274ED7 /* add_mul_coreml_all.pte in Resources */ = {isa = PBXBuildFile; fileRef = 83BB78BE2C66AAAE00274ED7 /* add_mul_coreml_all.pte */; }; @@ -104,7 +105,7 @@ C9E7D7962AB3F9BF00CCAE5D /* KeyValueStoreTests.mm in Sources */ = {isa = PBXBuildFile; fileRef = C9E7D78E2AB3F9BF00CCAE5D /* KeyValueStoreTests.mm */; }; C9E7D7A22AB3FBB200CCAE5D /* CoreMLBackendDelegateTests.mm in Sources */ = {isa = PBXBuildFile; fileRef = C9E7D7A12AB3FBB200CCAE5D /* CoreMLBackendDelegateTests.mm */; }; C9EC7E1B2BC73B3200A6B166 /* MultiArrayTests.mm in Sources */ = {isa = PBXBuildFile; fileRef = C9EC7E1A2BC73B3200A6B166 /* MultiArrayTests.mm */; }; - F24817E52BC655E100E80D98 /* libexecutorch_no_prim_ops.a in Frameworks */ = {isa = PBXBuildFile; fileRef = F24817E42BC655E100E80D98 /* libexecutorch_no_prim_ops.a */; }; + F24817E52BC655E100E80D98 /* libexecutorch_core.a in Frameworks */ = {isa = PBXBuildFile; fileRef = F24817E42BC655E100E80D98 /* libexecutorch_core.a */; }; /* End PBXBuildFile section */ /* Begin PBXCopyFilesBuildPhase section */ @@ -120,6 +121,7 @@ /* End PBXCopyFilesBuildPhase section */ /* Begin PBXFileReference section */ + 8307EB892C9262060011AE6D /* state_coreml_all.pte */ = {isa = PBXFileReference; lastKnownFileType = file; name = state_coreml_all.pte; path = ../test/models/state_coreml_all.pte; sourceTree = ""; }; 83BB789E2C65DA7300274ED7 /* ETCoreMLModelDebugInfo.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; name = ETCoreMLModelDebugInfo.h; path = ../sdk/ETCoreMLModelDebugInfo.h; sourceTree = ""; }; 83BB789F2C65DA7300274ED7 /* ETCoreMLModelDebugInfo.mm */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.objcpp; name = ETCoreMLModelDebugInfo.mm; path = ../sdk/ETCoreMLModelDebugInfo.mm; sourceTree = ""; }; 83BB78BD2C66AAAE00274ED7 /* add_mul_coreml_all.bin */ = {isa = PBXFileReference; lastKnownFileType = archive.macbinary; name = add_mul_coreml_all.bin; path = ../test/models/add_mul_coreml_all.bin; sourceTree = ""; }; @@ -308,7 +310,7 @@ C9EA3FE52B73EF6300B7D7BD /* Accelerate.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = Accelerate.framework; path = System/Library/Frameworks/Accelerate.framework; sourceTree = SDKROOT; }; C9EC7E092BC662A300A6B166 /* objc_array_util.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; name = objc_array_util.h; path = ../util/objc_array_util.h; sourceTree = ""; }; C9EC7E1A2BC73B3200A6B166 /* MultiArrayTests.mm */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.objcpp; name = MultiArrayTests.mm; path = ../test/MultiArrayTests.mm; sourceTree = ""; }; - F24817E42BC655E100E80D98 /* libexecutorch_no_prim_ops.a */ = {isa = PBXFileReference; lastKnownFileType = archive.ar; name = libexecutorch_no_prim_ops.a; path = ../libraries/libexecutorch_no_prim_ops.a; sourceTree = ""; }; + F24817E42BC655E100E80D98 /* libexecutorch_core.a */ = {isa = 
PBXFileReference; lastKnownFileType = archive.ar; name = libexecutorch_core.a; path = ../libraries/libexecutorch_core.a; sourceTree = ""; }; /* End PBXFileReference section */ /* Begin PBXFrameworksBuildPhase section */ @@ -317,7 +319,7 @@ buildActionMask = 2147483647; files = ( C94D510F2ABDF87500AF47FD /* Accelerate.framework in Frameworks */, - F24817E52BC655E100E80D98 /* libexecutorch_no_prim_ops.a in Frameworks */, + F24817E52BC655E100E80D98 /* libexecutorch_core.a in Frameworks */, C94D510E2ABDF86800AF47FD /* libsqlite3.tbd in Frameworks */, C94D50D92ABD7B2400AF47FD /* CoreML.framework in Frameworks */, C99883862B95AD7D000953A3 /* libprotobuf-lite.a in Frameworks */, @@ -538,7 +540,7 @@ C96560942AABFDCE005F8126 /* libsqlite3.tbd */, C96560922AABF992005F8126 /* CoreML.framework */, C96560902AABF982005F8126 /* Accelerate.framework */, - F24817E42BC655E100E80D98 /* libexecutorch_no_prim_ops.a */, + F24817E42BC655E100E80D98 /* libexecutorch_core.a */, C965608D2AABF72A005F8126 /* libexecutorch.a */, ); name = "Recovered References"; @@ -607,6 +609,7 @@ C98551982AD2542D009143F9 /* mv3_coreml_all.pte */, 83BB78BD2C66AAAE00274ED7 /* add_mul_coreml_all.bin */, 83BB78BE2C66AAAE00274ED7 /* add_mul_coreml_all.pte */, + 8307EB892C9262060011AE6D /* state_coreml_all.pte */, ); name = models; sourceTree = ""; @@ -677,6 +680,7 @@ C985519E2AD2542D009143F9 /* mv3_coreml_all.pte in Resources */, C98551A02AD2542D009143F9 /* add_coreml_all.bin in Resources */, C98551A22AD2542D009143F9 /* mul_coreml_all.pte in Resources */, + 8307EB8A2C9262060011AE6D /* state_coreml_all.pte in Resources */, C98551A32AD2542D009143F9 /* add_coreml_all.pte in Resources */, ); runOnlyForDeploymentPostprocessing = 0; diff --git a/backends/apple/coreml/scripts/build_tests.sh b/backends/apple/coreml/scripts/build_tests.sh index 911c6cd4e10..5fbde6ac66f 100755 --- a/backends/apple/coreml/scripts/build_tests.sh +++ b/backends/apple/coreml/scripts/build_tests.sh @@ -59,7 +59,7 @@ cmake --build "$CMAKE_PROTOBUF_BUILD_DIR_PATH" -j9 -t libprotobuf-lite echo "ExecuTorch: Copying libraries" mkdir "$LIBRARIES_DIR_PATH" cp -f "$CMAKE_EXECUTORCH_BUILD_DIR_PATH/libexecutorch.a" "$LIBRARIES_DIR_PATH" -cp -f "$CMAKE_EXECUTORCH_BUILD_DIR_PATH/libexecutorch_no_prim_ops.a" "$LIBRARIES_DIR_PATH" +cp -f "$CMAKE_EXECUTORCH_BUILD_DIR_PATH/libexecutorch_core.a" "$LIBRARIES_DIR_PATH" cp -f "$CMAKE_PROTOBUF_BUILD_DIR_PATH/libprotobuf-lite.a" "$LIBRARIES_DIR_PATH" #Copy ExecuTorch headers diff --git a/backends/apple/coreml/scripts/generate_test_models.sh b/backends/apple/coreml/scripts/generate_test_models.sh index bbe9809ff8d..0c1822aa828 100755 --- a/backends/apple/coreml/scripts/generate_test_models.sh +++ b/backends/apple/coreml/scripts/generate_test_models.sh @@ -17,14 +17,17 @@ cd "$EXECUTORCH_ROOT_PATH" mkdir "$COREML_DIR_PATH/runtime/test/models/" #Generate models -echo "Executorch: Generating test models" cd "$EXECUTORCH_ROOT_PATH" MODELS=("add" "add_mul" "mul" "mv3") for MODEL in "${MODELS[@]}" do + echo "Executorch: Generating $MODEL model" # TODO: Don't use the script in examples directory. 
python3 -m examples.apple.coreml.scripts.export --model_name "$MODEL" --save_processed_bytes mv -f "$MODEL""_coreml_all.pte" "$COREML_DIR_PATH/runtime/test/models" mv -f "$MODEL""_coreml_all.bin" "$COREML_DIR_PATH/runtime/test/models" done + +echo "Executorch: Generating stateful model" +python3 "$SCRIPT_DIR_PATH/../runtime/test/export_stateful_model.py" diff --git a/backends/apple/coreml/scripts/install_requirements.sh b/backends/apple/coreml/scripts/install_requirements.sh index b6c9a073e08..b3ea0d77ca0 100755 --- a/backends/apple/coreml/scripts/install_requirements.sh +++ b/backends/apple/coreml/scripts/install_requirements.sh @@ -24,7 +24,7 @@ rm -rf "$COREML_DIR_PATH/third-party" mkdir "$COREML_DIR_PATH/third-party" echo "${green}ExecuTorch: Cloning coremltools." -git clone --depth 1 --branch 8.0b2 "https://github.com/apple/coremltools.git" $COREMLTOOLS_DIR_PATH +git clone --depth 1 --branch 8.0 "https://github.com/apple/coremltools.git" $COREMLTOOLS_DIR_PATH cd $COREMLTOOLS_DIR_PATH STATUS=$? diff --git a/backends/apple/mps/CMakeLists.txt b/backends/apple/mps/CMakeLists.txt index f47139a0000..96aa007563b 100644 --- a/backends/apple/mps/CMakeLists.txt +++ b/backends/apple/mps/CMakeLists.txt @@ -75,7 +75,7 @@ target_link_libraries( mpsdelegate PRIVATE bundled_program mps_schema - executorch_no_prim_ops + executorch_core ${FOUNDATION_FRAMEWORK} ${METAL_FRAMEWORK} ${MPS_FRAMEWORK} diff --git a/backends/apple/mps/runtime/operations/OperationUtils.mm b/backends/apple/mps/runtime/operations/OperationUtils.mm index c3c5c93362a..2336868863d 100644 --- a/backends/apple/mps/runtime/operations/OperationUtils.mm +++ b/backends/apple/mps/runtime/operations/OperationUtils.mm @@ -31,8 +31,13 @@ return MPSDataTypeFloat32; case DataType::mps_data_type_int8: return MPSDataTypeInt8; - case DataType::mps_data_type_int4: - return MPSDataTypeInt4; + case DataType::mps_data_type_int4: { + if (@available(macOS 15.0, iOS 18.0, tvOS 18.0, *)) { + return MPSDataTypeInt4; + } else { + return ((MPSDataType)(MPSDataTypeSignedBit | 4)); + } + } case DataType::mps_data_type_int16: return MPSDataTypeInt16; case DataType::mps_data_type_int32: diff --git a/backends/apple/mps/runtime/operations/QuantDequant.mm b/backends/apple/mps/runtime/operations/QuantDequant.mm index 7818bab2565..c37282f79a1 100644 --- a/backends/apple/mps/runtime/operations/QuantDequant.mm +++ b/backends/apple/mps/runtime/operations/QuantDequant.mm @@ -30,17 +30,19 @@ MPSGraphTensor* inputTensor = getMPSGraphTensor(graphNode->input1_id()); MPSGraphTensor* scalesTensor = getMPSGraphTensor(graphNode->scales_id()); - - MPSGraphTensor *zpTensor = [_mpsGraph constantWithScalar:0 + if (@available(macOS 15.0, iOS 18.0, tvOS 18.0, *)) { + MPSGraphTensor *zpTensor = [_mpsGraph constantWithScalar:0 dataType:MPSDataTypeInt4]; + MPSGraphTensor *wDqTensor = [_mpsGraph dequantizeTensor:inputTensor + scaleTensor:scalesTensor + zeroPointTensor:zpTensor + dataType:MPSDataTypeFloat16 + name:nil]; + _idToMPSGraphTensor[graphNode->output_id()] = wDqTensor; + } else { + _idToMPSGraphTensor[graphNode->output_id()] = nil; + } - MPSGraphTensor *wDqTensor = [_mpsGraph dequantizeTensor:inputTensor - scaleTensor:scalesTensor - zeroPointTensor:zpTensor - dataType:MPSDataTypeFloat16 - name:nil]; - - _idToMPSGraphTensor[graphNode->output_id()] = wDqTensor; return Error::Ok; } diff --git a/backends/apple/mps/setup.md b/backends/apple/mps/setup.md index c8fdfeb98e4..7cd4c240a43 100644 --- a/backends/apple/mps/setup.md +++ b/backends/apple/mps/setup.md @@ -111,12 +111,12 @@ 
python3 -m examples.apple.mps.scripts.mps_example --model_name="mv3" --no-use_fp ``` ### Profiling: -1. [Optional] Generate an [ETRecord](./sdk-etrecord.rst) while you're exporting your model. +1. [Optional] Generate an [ETRecord](./etrecord.rst) while you're exporting your model. ```bash cd executorch python3 -m examples.apple.mps.scripts.mps_example --model_name="mv3" --generate_etrecord -b ``` -2. Run your Program on the ExecuTorch runtime and generate an [ETDump](./sdk-etdump.md). +2. Run your Program on the ExecuTorch runtime and generate an [ETDump](./etdump.md). ``` ./cmake-out/examples/apple/mps/mps_executor_runner --model_path mv3_mps_bundled_fp16.pte --bundled_program --dump-outputs ``` diff --git a/backends/arm/README.md b/backends/arm/README.md index 375259c62ab..6f4642f8d44 100644 --- a/backends/arm/README.md +++ b/backends/arm/README.md @@ -9,7 +9,7 @@ The expected flow is: * torch.nn.module -> TOSA -> command_stream for fully AoT flows e.g. embedded. * torch.nn.module -> TOSA for flows supporting a JiT compilation step. -Current backend support is being developed for TOSA to Ethos(TM)-U55/65 via the +Current backend support is being developed for TOSA to Ethos(TM)-U55/65/85 via the ethos-u-vela compilation stack. which follows the fully AoT flow. ## Layout diff --git a/backends/arm/arm_backend.py b/backends/arm/arm_backend.py index 7803cf84950..6d391b170a5 100644 --- a/backends/arm/arm_backend.py +++ b/backends/arm/arm_backend.py @@ -52,11 +52,11 @@ def __init__(self): def ethosu_compile_spec( self, config: str, - system_config: Optional[str] = None, - memory_mode: Optional[str] = None, + system_config: str, + memory_mode: str, extra_flags: Optional[str] = None, config_ini: Optional[str] = "Arm/vela.ini", - ): + ) -> "ArmCompileSpecBuilder": """ Generate compile spec for Ethos-U NPU @@ -86,7 +86,7 @@ def ethosu_compile_spec( return self - def tosa_compile_spec(self): + def tosa_compile_spec(self) -> "ArmCompileSpecBuilder": """ Generate compile spec for TOSA flatbuffer output """ @@ -96,14 +96,18 @@ def tosa_compile_spec(self): self.output_format = "tosa" return self - def dump_intermediate_artifacts_to(self, output_path: str): + def dump_intermediate_artifacts_to( + self, output_path: str + ) -> "ArmCompileSpecBuilder": """ Sets a path for dumping intermediate results during such as tosa and pte. """ self.path_for_intermediates = output_path return self - def set_permute_memory_format(self, set_nhwc_permutation: bool = True): + def set_permute_memory_format( + self, set_nhwc_permutation: bool = True + ) -> "ArmCompileSpecBuilder": """ Permute to channel last in compiler and runtime. Compilation and runtime will convert rank 4 inputs to channel last for each sub-graph. @@ -111,7 +115,7 @@ def set_permute_memory_format(self, set_nhwc_permutation: bool = True): self.permute_nhwc = set_nhwc_permutation return self - def set_quantize_io(self, quantize_io: bool = False): + def set_quantize_io(self, quantize_io: bool = False) -> "ArmCompileSpecBuilder": """ Quantization of inputs and dequantization of outputs for cases where whole graph is quantized and method signature is not of quantized type. 
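For reference, a minimal sketch of how the `ArmCompileSpecBuilder` chain above composes once every setter returns `self`. The Ethos-U55 argument values mirror the defaults added in `backends/arm/test/common.py` further down in this diff; the artifact path is a made-up placeholder.

```python
# Illustrative sketch only, not part of the patch. The Ethos-U55 strings mirror
# backends/arm/test/common.py; the artifact path is a placeholder.
from executorch.backends.arm.arm_backend import ArmCompileSpecBuilder

compile_spec = (
    ArmCompileSpecBuilder()
    .ethosu_compile_spec(
        "ethos-u55-128",
        system_config="Ethos_U55_High_End_Embedded",
        memory_mode="Shared_Sram",
    )
    .set_permute_memory_format(True)
    .set_quantize_io(False)
    .dump_intermediate_artifacts_to("/tmp/arm_intermediates")  # placeholder path
    .build()  # returns List[CompileSpec]
)
```

Note that `system_config` and `memory_mode` become required arguments here, presumably so callers state the memory topology explicitly for each Ethos-U target instead of relying on implicit defaults.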
@@ -119,7 +123,7 @@ def set_quantize_io(self, quantize_io: bool = False): self.quantize_io = quantize_io return self - def build(self): + def build(self) -> List[CompileSpec]: """ Generate a list of compile spec objects from the builder """ @@ -168,6 +172,17 @@ def get_intermediate_path(compile_spec: List[CompileSpec]) -> Optional[str]: return None +def _get_first_delegation_tag(graph_module) -> str | None: + """Get the first delegation tag from the graph_module or return None.""" + for node in graph_module.graph.nodes: + tag = node.meta.get("delegation_tag") + if tag: + return tag + + logger.debug("No delegation tag found in partition.") + return None + + @final class ArmBackend(BackendDetails): @staticmethod @@ -202,7 +217,7 @@ def preprocess( # noqa: C901 # const data directly. Path created and data written only in debug builds. tosa_graph = ts.TosaSerializer(artifact_path) graph_module = ArmPassManager().transform_to_backend_pipeline( - graph_module=edge_program.graph_module, compile_spec=compile_spec + exported_program=edge_program, compile_spec=compile_spec ) node_visitors = get_node_visitors(edge_program) @@ -222,8 +237,13 @@ def preprocess( # noqa: C901 # TODO: It would be awesome if this dump could somehow be done on top level and not here. # Problem is that the desc.json has to be created on the tosa_graph object, which we can't # access from top level. - if artifact_path is not None: - dbg_tosa_dump(tosa_graph, artifact_path) + if artifact_path: + tag = _get_first_delegation_tag(graph_module) + dbg_tosa_dump( + tosa_graph, + artifact_path, + suffix="{}".format(f"_{tag}" if tag else ""), + ) # Serialize and return the program. While we have always produced TOSA # output as an intermediate, some flows compile to device binaries in diff --git a/backends/arm/arm_partitioner.py b/backends/arm/arm_partitioner.py index 6b57c3d9658..d02c149ce32 100644 --- a/backends/arm/arm_partitioner.py +++ b/backends/arm/arm_partitioner.py @@ -58,6 +58,7 @@ def is_node_supported(self, submodules, node: torch.fx.Node) -> bool: exir_ops.edge.aten.mm.default, exir_ops.edge.aten.repeat.default, exir_ops.edge.aten.relu.default, + exir_ops.edge.aten.rsqrt.default, exir_ops.edge.aten._softmax.default, exir_ops.edge.aten.slice_copy.Tensor, exir_ops.edge.aten.sub.Tensor, @@ -65,6 +66,7 @@ def is_node_supported(self, submodules, node: torch.fx.Node) -> bool: exir_ops.edge.aten.clone.default, exir_ops.edge.aten.mean.dim, exir_ops.edge.aten.unsqueeze_copy.default, + exir_ops.edge.aten.squeeze_copy.dims, operator.getitem, exir_ops.edge.quantized_decomposed.quantize_per_tensor.default, exir_ops.edge.quantized_decomposed.dequantize_per_tensor.default, diff --git a/backends/arm/arm_vela.py b/backends/arm/arm_vela.py index d491437ded3..01bb8bd55e5 100644 --- a/backends/arm/arm_vela.py +++ b/backends/arm/arm_vela.py @@ -50,7 +50,12 @@ def vela_compile(tosa_graph, args: List[str]): args.append(tosa_path) vela.main(" ".join(args).split(" ")) - np_path = os.path.join(output_dir, "out_sg0_vela.npz") + if any("ethos-u85" in arg for arg in args) or any( + "debug-force-regor" in arg for arg in args + ): + np_path = os.path.join(tmpdir, "output", "out_vela.npz") + else: + np_path = os.path.join(tmpdir, "output", "out_sg0_vela.npz") blocks = b"" with np.load(np_path, allow_pickle=False) as data: diff --git a/backends/arm/operators/__init__.py b/backends/arm/operators/__init__.py index 7b94bfa837d..529ad2bbe2c 100644 --- a/backends/arm/operators/__init__.py +++ b/backends/arm/operators/__init__.py @@ -28,9 +28,11 @@ op_quant, 
op_relu, op_repeat, + op_rsqrt, op_sigmoid, op_slice, op_softmax, + op_squeeze, op_sub, op_unsqueeze, op_view, diff --git a/backends/arm/operators/op_placeholder.py b/backends/arm/operators/op_placeholder.py index b5dcf3f9873..2618c9e71d3 100644 --- a/backends/arm/operators/op_placeholder.py +++ b/backends/arm/operators/op_placeholder.py @@ -28,6 +28,13 @@ def process_inputs( tosa_graph: ts.TosaSerializer, ): """Serialize an input node""" + # inputs need to be in default dim_order (contiguous memory format) + meta = node.meta["val"] + if meta.dim_order() != tuple(range(meta.dim())): + raise RuntimeError( + f"Arm backend only supports contiguous memory format for inputs. " + f"Expected dim_order: {tuple(range(meta.dim()))}, but got: {meta.dim_order()} for node {node.name}" + ) inputs = [TosaArg(node)] input_shape = inputs[0].shape input_dim_order = inputs[0].dim_order diff --git a/backends/arm/operators/op_rsqrt.py b/backends/arm/operators/op_rsqrt.py new file mode 100644 index 00000000000..9225c7d938f --- /dev/null +++ b/backends/arm/operators/op_rsqrt.py @@ -0,0 +1,72 @@ +# Copyright 2024 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# pyre-unsafe +from typing import List + +import numpy as np +import serializer.tosa_serializer as ts +import torch +from executorch.backends.arm.operators.node_visitor import ( + NodeVisitor, + register_node_visitor, +) +from executorch.backends.arm.tosa_mapping import TosaArg +from executorch.backends.arm.tosa_quant_utils import ( + dequantize_value, + get_quant_node_args, + QuantArgs, + quantize_value, +) +from serializer.tosa_serializer import TosaOp + + +@register_node_visitor +class RsqrtVisitor(NodeVisitor): + target = "aten.rsqrt.default" + + def define_node( + self, + node: torch.fx.Node, + tosa_graph: ts.TosaSerializer, + inputs: List[TosaArg], + output: TosaArg, + is_quant_node: bool, + ) -> None: + if is_quant_node: + # Assume quantized input is 8 bit. + # Create attribute for 8 bit table lookup. + input_node = node.all_input_nodes[0] + in_quantargs = get_quant_node_args(input_node) + output_node = list(node.users)[0] + out_quantargs = get_quant_node_args(output_node) + table = rsqrt_table_8bit(in_quantargs, out_quantargs) + table_attr = ts.TosaSerializerAttribute() + table_attr.TableAttribute(table) + tosa_graph.addOperator( + TosaOp.Op().TABLE, [inputs[0].name], [output.name], table_attr + ) + else: + tosa_graph.addOperator(TosaOp.Op().RSQRT, [inputs[0].name], [output.name]) + + +def rsqrt_table_8bit(in_quantargs: QuantArgs, out_quantargs: QuantArgs): + """ + Returns a table mapping 256 entries to rqsrt([qmin,qmax]) + Reference: https://www.mlplatform.org/tosa/tosa_spec.html#_rsqrt + """ + + def rqsrt(x): + # Convert quantized input to floating point rqsrt input space. + v = dequantize_value(x, in_quantargs) + # Compute rqsrt. + v = 1 / np.sqrt(v) + # Convert rqsrt output back to quantized space. + return quantize_value(v, out_quantargs) + + return [ + rqsrt(x) + for x in np.linspace(in_quantargs.qmin, in_quantargs.qmax, 256, dtype=np.int8) + ] diff --git a/backends/arm/operators/op_squeeze.py b/backends/arm/operators/op_squeeze.py new file mode 100644 index 00000000000..0429d214ff8 --- /dev/null +++ b/backends/arm/operators/op_squeeze.py @@ -0,0 +1,44 @@ +# Copyright 2024 Arm Limited and/or its affiliates. 
+# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# pyre-unsafe +from typing import List + +import serializer.tosa_serializer as ts +import torch +from executorch.backends.arm.operators.node_visitor import ( + NodeVisitor, + register_node_visitor, +) +from executorch.backends.arm.tosa_mapping import TosaArg +from executorch.backends.arm.tosa_utils import tosa_shape +from serializer.tosa_serializer import TosaOp + + +@register_node_visitor +class SqueezeVisitor(NodeVisitor): + target = "aten.squeeze_copy.dims" + + def define_node( + self, + node: torch.fx.Node, + tosa_graph: ts.TosaSerializer, + inputs: List[TosaArg], + output: TosaArg, + is_quant_node: bool, + ) -> None: + shape = inputs[0].shape + rank = len(shape) + # In some cases, e.g. torch.randn((1, 5, 1, 5)).squeeze(), + # dims == [0, 1, 2, 3] even though all dims cannot be squeezed. + # We need to verify that shape[dim] == 1 before squeezing the dim. + dims = [dim % rank for dim in inputs[1].special if shape[dim] == 1] + new_shape = [shape[i] for i in range(rank) if i not in dims] + new_shape = tosa_shape(new_shape, output.dim_order) + attr = ts.TosaSerializerAttribute() + attr.ReshapeAttribute(new_shape) + tosa_graph.addOperator( + TosaOp.Op().RESHAPE, [inputs[0].name], [output.name], attr + ) diff --git a/backends/arm/passes/annotate_channels_last_dim_order_pass.py b/backends/arm/passes/annotate_channels_last_dim_order_pass.py index a5b657af49f..222c0a7cb36 100644 --- a/backends/arm/passes/annotate_channels_last_dim_order_pass.py +++ b/backends/arm/passes/annotate_channels_last_dim_order_pass.py @@ -34,7 +34,9 @@ def is_weight_node_for_depthwise_conv2d(self, node: torch.fx.Node): prev_node = node.args[0] if cast(torch.fx.Node, prev_node).op != "placeholder": return False - return is_consumer_node_depthwise_conv2d(node) + if is_consumer_node_depthwise_conv2d(node): + consumer_node = list(node.users)[0] + return consumer_node.args[1] == node elif node.op == "placeholder": # node is an input, weight or bias node consumer_node = list(node.users)[0] diff --git a/backends/arm/passes/arm_pass_manager.py b/backends/arm/passes/arm_pass_manager.py index 75ef551171e..03fbb38d04b 100644 --- a/backends/arm/passes/arm_pass_manager.py +++ b/backends/arm/passes/arm_pass_manager.py @@ -22,6 +22,7 @@ ) from executorch.backends.arm.passes.remove_clone_pass import RemoveClonePass from executorch.backends.arm.passes.size_adjust_conv2d_pass import SizeAdjustConv2DPass +from executorch.exir import ExportedProgram from executorch.exir.backend.compile_spec_schema import CompileSpec from executorch.exir.pass_manager import PassManager @@ -32,7 +33,7 @@ def _transform(self, graph_module: torch.fx.GraphModule): return self(graph_module).graph_module def transform_to_backend_pipeline( - self, graph_module: torch.fx.GraphModule, compile_spec: list[CompileSpec] + self, exported_program: ExportedProgram, compile_spec: list[CompileSpec] ): """Apply passes before transforming program to backend""" self.add_pass(SizeAdjustConv2DPass()) @@ -46,4 +47,4 @@ def transform_to_backend_pipeline( if memory_format == "nhwc": self.add_pass(AnnotateChannelsLastDimOrder()) - return self._transform(graph_module) + return self._transform(exported_program.graph_module) diff --git a/backends/arm/quantizer/arm_quantizer.py b/backends/arm/quantizer/arm_quantizer.py index 853fd47c29c..7daf52e103f 100644 --- a/backends/arm/quantizer/arm_quantizer.py +++ b/backends/arm/quantizer/arm_quantizer.py @@ 
-271,6 +271,7 @@ class ArmQuantizer(Quantizer): "mm", "cat", "one_to_one", + "generic", ] def __init__(self) -> None: diff --git a/backends/arm/quantizer/arm_quantizer_utils.py b/backends/arm/quantizer/arm_quantizer_utils.py index fe9c5e34e6b..49da003ef6d 100644 --- a/backends/arm/quantizer/arm_quantizer_utils.py +++ b/backends/arm/quantizer/arm_quantizer_utils.py @@ -146,7 +146,10 @@ def is_share_obs_or_fq_op(op: Callable) -> bool: torch.ops.aten.permute.default, torch.ops.aten.permute_copy.default, torch.ops.aten.squeeze.dim, + torch.ops.aten.squeeze.dims, + torch.ops.aten.squeeze.default, torch.ops.aten.squeeze_copy.dim, + torch.ops.aten.unsqueeze.default, # TODO: remove? torch.ops.aten.adaptive_avg_pool2d.default, torch.ops.aten.view_copy.default, diff --git a/backends/arm/quantizer/quantization_annotation/__init__.py b/backends/arm/quantizer/quantization_annotation/__init__.py index f7219201dec..594911075f7 100644 --- a/backends/arm/quantizer/quantization_annotation/__init__.py +++ b/backends/arm/quantizer/quantization_annotation/__init__.py @@ -53,6 +53,7 @@ def decorator(annotator: AnnotatorType): add_annotator, cat_annotator, conv_annotator, + generic_annotator, linear_annotator, max_pool2d_annotator, mm_annotator, diff --git a/backends/arm/quantizer/quantization_annotation/generic_annotator.py b/backends/arm/quantizer/quantization_annotation/generic_annotator.py new file mode 100644 index 00000000000..a4909916935 --- /dev/null +++ b/backends/arm/quantizer/quantization_annotation/generic_annotator.py @@ -0,0 +1,79 @@ +# Copyright 2024 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# pyre-unsafe + +from typing import Callable, List, Optional + +import torch +import torch.fx +from executorch.backends.arm.quantizer import arm_quantizer_utils +from executorch.backends.arm.quantizer.quantization_annotation import register_annotator +from executorch.backends.arm.quantizer.quantization_config import QuantizationConfig +from torch.ao.quantization.quantizer import SharedQuantizationSpec +from torch.ao.quantization.quantizer.utils import ( + _annotate_input_qspec_map, + _annotate_output_qspec, +) +from torch.fx import Node + + +_SUPPORTED_OPS = [ + # DATA LAYOUT OPS + torch.ops.aten.squeeze.default, + torch.ops.aten.squeeze_copy.default, + torch.ops.aten.unsqueeze.default, + torch.ops.aten.unsqueeze_copy.default, + torch.ops.aten.reshape.default, + # Disabling these as there seems to be an issue with support for complex + # datatypes in torch: + # torch.ops.aten.view_as_complex.default, + # torch.ops.aten.view_as_complex_copy.default, + # torch.ops.aten.view_as_real.default, + # torch.ops.aten.view_as_real_copy.default, + torch.ops.aten.view_copy.default, + torch.ops.aten.slice.Tensor, + torch.ops.aten.slice_copy.Tensor, + # 'concat' should be handled separately as it has a sequence of inputs and + # makes the implementation unnecessary complicated. 
+ # torch.ops.aten.concat.default, + torch.ops.aten.transpose.Dimname, + torch.ops.aten.transpose.int, + torch.ops.aten.transpose_copy.int, + torch.ops.aten.tile.default, + torch.ops.aten.flip.default, +] + + +@register_annotator("generic") +def _annotate_generic( + gm: torch.fx.GraphModule, + quantization_config: QuantizationConfig, + filter_fn: Optional[Callable[[Node], bool]] = None, +) -> Optional[List[List[Node]]]: + """Propagate qspecs to generic ops like unsqueeze, reshape etc.""" + annotated_partitions = [] + + for node in gm.graph.nodes: + if node.op != "call_function" or node.target not in _SUPPORTED_OPS: + continue + if filter_fn and not filter_fn(node): + continue + if arm_quantizer_utils.is_annotated(node): + continue + + input_node = node.args[0] + + # Using a non-shared quantization spec here as a SharedQuantizationSpec + # can lead to a recursion. + _annotate_input_qspec_map( + node, input_node, quantization_config.get_input_act_qspec() + ) + _annotate_output_qspec(node, SharedQuantizationSpec((input_node, node))) + + arm_quantizer_utils.mark_nodes_as_annotated([node]) + annotated_partitions.append([node]) + + return annotated_partitions diff --git a/backends/arm/quantizer/quantization_annotation/one_to_one_annotator.py b/backends/arm/quantizer/quantization_annotation/one_to_one_annotator.py index 8d507c11ef3..3a189c0d8f1 100644 --- a/backends/arm/quantizer/quantization_annotation/one_to_one_annotator.py +++ b/backends/arm/quantizer/quantization_annotation/one_to_one_annotator.py @@ -35,7 +35,11 @@ def _annotate_one_to_one( Typical ops are ops implemented with a lookup table. """ annotated_partitions = [] - one_to_one_ops = (torch.ops.aten.exp.default, torch.ops.aten.log.default) + one_to_one_ops = { + torch.ops.aten.exp.default, + torch.ops.aten.log.default, + torch.ops.aten.rsqrt.default, + } for node in gm.graph.nodes: if node.op != "call_function" or node.target not in one_to_one_ops: continue diff --git a/backends/arm/runtime/ArmBackendEthosU.cpp b/backends/arm/runtime/ArmBackendEthosU.cpp index 26ffb0b9700..b0452fb9e7b 100644 --- a/backends/arm/runtime/ArmBackendEthosU.cpp +++ b/backends/arm/runtime/ArmBackendEthosU.cpp @@ -15,16 +15,31 @@ #include -#include "executorch/backends/arm/runtime/VelaBinStream.h" -#include "executorch/runtime/backend/interface.h" -#include "executorch/runtime/core/error.h" -#include "executorch/runtime/core/evalue.h" -#include "executorch/runtime/core/exec_aten/util/scalar_type_util.h" +#include +#include +#include +#include +#include +#include using namespace std; -namespace torch { -namespace executor { +using executorch::aten::ScalarType; +using executorch::runtime::ArrayRef; +using executorch::runtime::Backend; +using executorch::runtime::BackendExecutionContext; +using executorch::runtime::BackendInitContext; +using executorch::runtime::CompileSpec; +using executorch::runtime::DelegateHandle; +using executorch::runtime::Error; +using executorch::runtime::EValue; +using executorch::runtime::FreeableBuffer; +using executorch::runtime::MemoryAllocator; +using executorch::runtime::Result; + +namespace executorch { +namespace backends { +namespace arm { typedef struct { FreeableBuffer* processed; @@ -141,7 +156,16 @@ class ArmBackend final : public ::executorch::runtime::BackendInterface { Error, "Input %d expected Integer (4 byte) or Char (1 byte) integer inputs, got ScalarType id %s", i, - toString(tensor_in.scalar_type())); + executorch::runtime::toString(tensor_in.scalar_type())); + return Error::InvalidProgram; + } + supported = 
executorch::runtime::is_contiguous_dim_order( + tensor_in.dim_order().data(), tensor_in.dim()); + if (!supported) { + ET_LOG( + Error, + "Input %d expected contiguous dim_order, but got non-contiguous dim_order", + i); return Error::InvalidProgram; } @@ -258,7 +282,7 @@ class ArmBackend final : public ::executorch::runtime::BackendInterface { private: Error check_requires_permute( int index, - const exec_aten::Tensor tensor, + const executorch::aten::Tensor tensor, VelaIO* io, bool permuted_io_flag, bool* is_permuted) const { @@ -281,18 +305,27 @@ class ArmBackend final : public ::executorch::runtime::BackendInterface { } } if (!permuted_shape) { - // Error check matching shapes in the general case + // Check the number of elements in each tensor match + int tensor_count = 1; + int io_count = 1; + for (int i = 0; i < tensor.dim(); i++) { - if (tensor.size(i) != io->shape[i]) { - ET_LOG(Error, "Tensor input/output %d mismatched shape", index); - ET_LOG( - Error, - "dimension %d mismatch, %zd != %d", - index, - tensor.size(i), - io->shape[i]); - return Error::InvalidProgram; - } + tensor_count = tensor_count * tensor.size(i); + } + + // The VelaIO type has a shape of fixed size 4 + for (int i = 0; i < 4; i++) { + io_count = io_count * io->shape[i]; + } + + if (tensor_count != io_count) { + ET_LOG(Error, "Input tensor sizes do not match"); + ET_LOG( + Error, + "Program expects %d elements but got %d", + io_count, + tensor_count); + return Error::InvalidProgram; } } *is_permuted = permuted_shape; @@ -324,5 +357,6 @@ Backend backend_id{"ArmBackend", &backend}; static auto registered = register_backend(backend_id); } // namespace -} // namespace executor -} // namespace torch +} // namespace arm +} // namespace backends +} // namespace executorch diff --git a/backends/arm/runtime/VelaBinStream.cpp b/backends/arm/runtime/VelaBinStream.cpp index e2badbbd9fd..a26fe9f23e2 100644 --- a/backends/arm/runtime/VelaBinStream.cpp +++ b/backends/arm/runtime/VelaBinStream.cpp @@ -10,10 +10,15 @@ * as that function emits this format and the two need to align. */ +#include + #include -#include "executorch/backends/arm/runtime/VelaBinStream.h" -#include "executorch/runtime/core/error.h" +#include + +namespace executorch { +namespace backends { +namespace arm { // get next mul of 16 ptr, return n if already aligned static uintptr_t next_mul_16(uintptr_t n) { @@ -91,3 +96,7 @@ bool vela_bin_read(const char* data, VelaHandles* handles, int size) { // We've fallen off the end without finding vela_end_stream return false; } + +} // namespace arm +} // namespace backends +} // namespace executorch diff --git a/backends/arm/runtime/VelaBinStream.h b/backends/arm/runtime/VelaBinStream.h index e946078f5a4..04b8b2ada00 100644 --- a/backends/arm/runtime/VelaBinStream.h +++ b/backends/arm/runtime/VelaBinStream.h @@ -18,6 +18,10 @@ #include #include +namespace executorch { +namespace backends { +namespace arm { + // Standard block name size const uint32_t kVelaBlockNameLength = 16; @@ -67,3 +71,7 @@ bool vela_bin_read(const char* data, VelaHandles* handles, int size); * on the Ethos-U. 
*/ bool vela_bin_validate(const char* data, int size); + +} // namespace arm +} // namespace backends +} // namespace executorch diff --git a/backends/arm/test/common.py b/backends/arm/test/common.py index f85fd1f2dac..2ae86b1d1eb 100644 --- a/backends/arm/test/common.py +++ b/backends/arm/test/common.py @@ -4,9 +4,12 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +import logging import os +import platform import shutil import subprocess +import sys import tempfile import pytest @@ -14,6 +17,7 @@ import torch from executorch.backends.arm.arm_backend import ArmCompileSpecBuilder +from executorch.exir.backend.compile_spec_schema import CompileSpec _enabled_options: list[str] = [] @@ -36,6 +40,7 @@ def pytest_configure(config): "Tests are run with --arm_run_corstone300 but corstone300 FVP is not installed." ) _enabled_options.append("corstone300") + logging.basicConfig(level=logging.INFO, stream=sys.stdout) def pytest_collection_modifyitems(config, items): @@ -53,11 +58,17 @@ def pytest_collection_modifyitems(config, items): def load_libquantized_ops_aot_lib(): + so_ext = { + "Darwin": "dylib", + "Linux": "so", + "Windows": "dll", + }.get(platform.system(), None) + find_lib_cmd = [ "find", "cmake-out-aot-lib", "-name", - "libquantized_ops_aot_lib.so", + f"libquantized_ops_aot_lib.{so_ext}", ] res = subprocess.run(find_lib_cmd, capture_output=True) if res.returncode == 0: @@ -85,7 +96,32 @@ def is_option_enabled(option: str, fail_if_not_enabled: bool = False) -> bool: return False -def get_tosa_compile_spec(permute_memory_to_nhwc=True, custom_path=None): +def maybe_get_tosa_collate_path() -> str | None: + """ + Checks the environment variable TOSA_TESTCASES_BASE_PATH and returns the + path to the where to store the current tests if it is set. + """ + tosa_test_base = os.environ.get("TOSA_TESTCASES_BASE_PATH") + if tosa_test_base: + current_test = os.environ.get("PYTEST_CURRENT_TEST") + #'backends/arm/test/ops/test_mean_dim.py::TestMeanDim::test_meandim_tosa_BI_0_zeros (call)' + test_class = current_test.split("::")[1] + test_name = current_test.split("::")[-1].split(" ")[0] + if "BI" in test_name: + tosa_test_base = os.path.join(tosa_test_base, "tosa-bi") + elif "MI" in test_name: + tosa_test_base = os.path.join(tosa_test_base, "tosa-mi") + else: + tosa_test_base = os.path.join(tosa_test_base, "other") + + return os.path.join(tosa_test_base, test_class, test_name) + + return None + + +def get_tosa_compile_spec( + permute_memory_to_nhwc=True, custom_path=None +) -> list[CompileSpec]: """ Default compile spec for TOSA tests. """ @@ -98,7 +134,13 @@ def get_tosa_compile_spec_unbuilt( """Get the ArmCompileSpecBuilder for the default TOSA tests, to modify the compile spec before calling .build() to finalize it. """ - intermediate_path = custom_path or tempfile.mkdtemp(prefix="arm_tosa_") + if not custom_path: + intermediate_path = maybe_get_tosa_collate_path() or tempfile.mkdtemp( + prefix="arm_tosa_" + ) + else: + intermediate_path = custom_path + if not os.path.exists(intermediate_path): os.makedirs(intermediate_path, exist_ok=True) compile_spec_builder = ( @@ -112,8 +154,8 @@ def get_tosa_compile_spec_unbuilt( def get_u55_compile_spec( - permute_memory_to_nhwc=False, quantize_io=False, custom_path=None -): + permute_memory_to_nhwc=True, quantize_io=False, custom_path=None +) -> list[CompileSpec]: """ Default compile spec for Ethos-U55 tests. 
""" @@ -122,10 +164,21 @@ def get_u55_compile_spec( ).build() +def get_u85_compile_spec( + permute_memory_to_nhwc=True, quantize_io=False, custom_path=None +) -> list[CompileSpec]: + """ + Default compile spec for Ethos-U85 tests. + """ + return get_u85_compile_spec_unbuilt( + permute_memory_to_nhwc, quantize_io=quantize_io, custom_path=custom_path + ).build() + + def get_u55_compile_spec_unbuilt( - permute_memory_to_nhwc=False, quantize_io=False, custom_path=None + permute_memory_to_nhwc=True, quantize_io=False, custom_path=None ) -> ArmCompileSpecBuilder: - """Get the ArmCompileSpecBuilder for the default TOSA tests, to modify + """Get the ArmCompileSpecBuilder for the Ethos-U55 tests, to modify the compile spec before calling .build() to finalize it. """ artifact_path = custom_path or tempfile.mkdtemp(prefix="arm_u55_") @@ -137,7 +190,29 @@ def get_u55_compile_spec_unbuilt( "ethos-u55-128", system_config="Ethos_U55_High_End_Embedded", memory_mode="Shared_Sram", - extra_flags=None, + extra_flags="--debug-force-regor --output-format=raw", + ) + .set_quantize_io(is_option_enabled("quantize_io") or quantize_io) + .set_permute_memory_format(permute_memory_to_nhwc) + .dump_intermediate_artifacts_to(artifact_path) + ) + return compile_spec + + +def get_u85_compile_spec_unbuilt( + permute_memory_to_nhwc=True, quantize_io=False, custom_path=None +) -> list[CompileSpec]: + """Get the ArmCompileSpecBuilder for the Ethos-U85 tests, to modify + the compile spec before calling .build() to finalize it. + """ + artifact_path = custom_path or tempfile.mkdtemp(prefix="arm_u85_") + compile_spec = ( + ArmCompileSpecBuilder() + .ethosu_compile_spec( + "ethos-u85-128", + system_config="Ethos_U85_SYS_DRAM_Mid", + memory_mode="Shared_Sram", + extra_flags="--output-format=raw", ) .set_quantize_io(is_option_enabled("quantize_io") or quantize_io) .set_permute_memory_format(permute_memory_to_nhwc) diff --git a/backends/arm/test/misc/test_debug_feats.py b/backends/arm/test/misc/test_debug_feats.py index aa9703f9eba..c4f47daa08c 100644 --- a/backends/arm/test/misc/test_debug_feats.py +++ b/backends/arm/test/misc/test_debug_feats.py @@ -6,6 +6,7 @@ import logging import os +import shutil import tempfile import unittest @@ -126,8 +127,62 @@ def test_numerical_diff_prints(self): self.fail() -class TestDumpOperatorsAndDtypes(unittest.TestCase): - def test_dump_ops_and_dtypes(self): +def test_dump_ops_and_dtypes(): + model = Linear(20, 30) + ( + ArmTester( + model, + example_inputs=model.get_inputs(), + compile_spec=common.get_tosa_compile_spec(), + ) + .quantize() + .dump_dtype_distribution() + .dump_operator_distribution() + .export() + .dump_dtype_distribution() + .dump_operator_distribution() + .to_edge() + .dump_dtype_distribution() + .dump_operator_distribution() + .partition() + .dump_dtype_distribution() + .dump_operator_distribution() + ) + # Just test that there are no execptions. 
+ + +def test_dump_ops_and_dtypes_parseable(): + model = Linear(20, 30) + ( + ArmTester( + model, + example_inputs=model.get_inputs(), + compile_spec=common.get_tosa_compile_spec(), + ) + .quantize() + .dump_dtype_distribution(print_table=False) + .dump_operator_distribution(print_table=False) + .export() + .dump_dtype_distribution(print_table=False) + .dump_operator_distribution(print_table=False) + .to_edge() + .dump_dtype_distribution(print_table=False) + .dump_operator_distribution(print_table=False) + .partition() + .dump_dtype_distribution(print_table=False) + .dump_operator_distribution(print_table=False) + ) + # Just test that there are no execptions. + + +class TestCollateTosaTests(unittest.TestCase): + """Tests the collation of TOSA tests through setting the environment variable TOSA_TESTCASE_BASE_PATH.""" + + def test_collate_tosa_BI_tests(self): + # Set the environment variable to trigger the collation of TOSA tests + os.environ["TOSA_TESTCASES_BASE_PATH"] = "test_collate_tosa_tests" + # Clear out the directory + model = Linear(20, 30) ( ArmTester( @@ -136,16 +191,59 @@ def test_dump_ops_and_dtypes(self): compile_spec=common.get_tosa_compile_spec(), ) .quantize() - .dump_dtype_distribution() - .dump_operator_distribution() .export() - .dump_dtype_distribution() - .dump_operator_distribution() .to_edge() - .dump_dtype_distribution() - .dump_operator_distribution() .partition() - .dump_dtype_distribution() - .dump_operator_distribution() + .to_executorch() + ) + # test that the output directory is created and contains the expected files + assert os.path.exists( + "test_collate_tosa_tests/tosa-bi/TestCollateTosaTests/test_collate_tosa_BI_tests" + ) + assert os.path.exists( + "test_collate_tosa_tests/tosa-bi/TestCollateTosaTests/test_collate_tosa_BI_tests/output_tag8.tosa" + ) + assert os.path.exists( + "test_collate_tosa_tests/tosa-bi/TestCollateTosaTests/test_collate_tosa_BI_tests/desc_tag8.json" + ) + + os.environ.pop("TOSA_TESTCASES_BASE_PATH") + shutil.rmtree("test_collate_tosa_tests", ignore_errors=True) + + +def test_dump_tosa_ops(caplog): + caplog.set_level(logging.INFO) + model = Linear(20, 30) + ( + ArmTester( + model, + example_inputs=model.get_inputs(), + compile_spec=common.get_tosa_compile_spec(), ) - # Just test that there are no execeptions. + .quantize() + .export() + .to_edge() + .partition() + .dump_operator_distribution() + ) + assert "TOSA operators:" in caplog.text + + +def test_fail_dump_tosa_ops(caplog): + caplog.set_level(logging.INFO) + + class Add(torch.nn.Module): + def forward(self, x): + return x + x + + model = Add() + compile_spec = common.get_u55_compile_spec() + ( + ArmTester(model, example_inputs=(torch.ones(5),), compile_spec=compile_spec) + .quantize() + .export() + .to_edge() + .partition() + .dump_operator_distribution() + ) + assert "Can not get operator distribution for Vela command stream." in caplog.text diff --git a/backends/arm/test/misc/test_dim_order_guards.py b/backends/arm/test/misc/test_dim_order_guards.py new file mode 100644 index 00000000000..8bad1493b1c --- /dev/null +++ b/backends/arm/test/misc/test_dim_order_guards.py @@ -0,0 +1,58 @@ +# Copyright 2024 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
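A worked example of the path derivation in `maybe_get_tosa_collate_path()` from `backends/arm/test/common.py`, assuming the environment values used by the `TestCollateTosaTests` case above; the asserted output paths in that test follow directly from this derivation.

```python
# Worked example (assumed env values) of the collation-path derivation used by
# TestCollateTosaTests above; mirrors maybe_get_tosa_collate_path().
import os

base = "test_collate_tosa_tests"  # TOSA_TESTCASES_BASE_PATH
current = (
    "backends/arm/test/misc/test_debug_feats.py"
    "::TestCollateTosaTests::test_collate_tosa_BI_tests (call)"
)  # PYTEST_CURRENT_TEST

test_class = current.split("::")[1]                    # 'TestCollateTosaTests'
test_name = current.split("::")[-1].split(" ")[0]      # 'test_collate_tosa_BI_tests'
flavor = "tosa-bi" if "BI" in test_name else "tosa-mi" if "MI" in test_name else "other"

print(os.path.join(base, flavor, test_class, test_name))
# test_collate_tosa_tests/tosa-bi/TestCollateTosaTests/test_collate_tosa_BI_tests
```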
+ +import unittest + +import pytest + +import torch +from executorch.backends.arm.test import common + +from executorch.backends.arm.test.tester.arm_tester import ArmTester + + +class Conv2D(torch.nn.Module): + + def __init__(self): + super().__init__() + self.conv2d = torch.nn.Conv2d(in_channels=2, out_channels=3, kernel_size=(3, 3)) + + def forward(self, x): + return self.conv2d(x.to(memory_format=torch.channels_last)) + + def get_inputs(self): + return (torch.randn(1, 2, 20, 20),) + + +class TestDimOrderGuards(unittest.TestCase): + + def test_tosa_MI_pipeline(self): + module = Conv2D() + tester = ( + ArmTester( + module, + example_inputs=module.get_inputs(), + compile_spec=common.get_tosa_compile_spec(), + ) + .export() + .to_edge() + ) + with pytest.raises(RuntimeError): + tester.partition() + + def test_tosa_BI_pipeline(self): + module = Conv2D() + tester = ( + ArmTester( + module, + example_inputs=module.get_inputs(), + compile_spec=common.get_tosa_compile_spec(), + ) + .quantize() + .export() + .to_edge() + ) + with pytest.raises(RuntimeError): + tester.partition() diff --git a/backends/arm/test/models/test_mobilenet_v2_arm.py b/backends/arm/test/models/test_mobilenet_v2_arm.py index 248153a5180..a50e2732f15 100644 --- a/backends/arm/test/models/test_mobilenet_v2_arm.py +++ b/backends/arm/test/models/test_mobilenet_v2_arm.py @@ -84,7 +84,7 @@ def test_mv2_tosa_BI(self): ) def test_mv2_u55_BI(self): - ( + tester = ( ArmTester( self.mv2, example_inputs=self.model_inputs, @@ -96,4 +96,29 @@ def test_mv2_u55_BI(self): .check(list(self.operators_after_quantization)) .partition() .to_executorch() + .serialize() ) + if common.is_option_enabled("corstone300"): + tester.run_method_and_compare_outputs( + atol=1.0, qtol=1, inputs=self.model_inputs, target_board="corstone-300" + ) + + def test_mv2_u85_BI(self): + tester = ( + ArmTester( + self.mv2, + example_inputs=self.model_inputs, + compile_spec=common.get_u85_compile_spec(permute_memory_to_nhwc=True), + ) + .quantize() + .export() + .to_edge(config=self._edge_compile_config) + .check(list(self.operators_after_quantization)) + .partition() + .to_executorch() + .serialize() + ) + if common.is_option_enabled("corstone300"): + tester.run_method_and_compare_outputs( + atol=1.0, qtol=1, inputs=self.model_inputs, target_board="corstone-320" + ) diff --git a/backends/arm/test/ops/test_add.py b/backends/arm/test/ops/test_add.py index 63023327f79..e3eeb187da3 100644 --- a/backends/arm/test/ops/test_add.py +++ b/backends/arm/test/ops/test_add.py @@ -13,6 +13,7 @@ from executorch.backends.arm.test import common from executorch.backends.arm.test.tester.arm_tester import ArmTester from executorch.exir import EdgeCompileConfig +from executorch.exir.backend.compile_spec_schema import CompileSpec from parameterized import parameterized @@ -92,16 +93,17 @@ def _test_add_tosa_BI_pipeline( .run_method_and_compare_outputs(inputs=test_data, qtol=1) ) - def _test_add_u55_BI_pipeline( + def _test_add_ethos_BI_pipeline( self, module: torch.nn.Module, + compile_spec: CompileSpec, test_data: Tuple[torch.Tensor], ): tester = ( ArmTester( module, example_inputs=test_data, - compile_spec=common.get_u55_compile_spec(permute_memory_to_nhwc=True), + compile_spec=compile_spec, ) .quantize() .export() @@ -114,8 +116,7 @@ def _test_add_u55_BI_pipeline( .serialize() ) - if common.is_option_enabled("corstone300"): - tester.run_method_and_compare_outputs(qtol=1, inputs=test_data) + return tester @parameterized.expand(Add.test_parameters) def test_add_tosa_MI(self, test_data: 
torch.Tensor): @@ -130,7 +131,28 @@ def test_add_tosa_BI(self, test_data: torch.Tensor): @parameterized.expand(Add.test_parameters) def test_add_u55_BI(self, test_data: torch.Tensor): test_data = (test_data,) - self._test_add_u55_BI_pipeline(self.Add(), test_data) + tester = self._test_add_ethos_BI_pipeline( + self.Add(), + common.get_u55_compile_spec(permute_memory_to_nhwc=True), + test_data, + ) + if common.is_option_enabled("corstone300"): + tester.run_method_and_compare_outputs( + qtol=1, inputs=test_data, target_board="corstone-300" + ) + + @parameterized.expand(Add.test_parameters) + def test_add_u85_BI(self, test_data: torch.Tensor): + test_data = (test_data,) + tester = self._test_add_ethos_BI_pipeline( + self.Add(), + common.get_u85_compile_spec(permute_memory_to_nhwc=True), + test_data, + ) + if common.is_option_enabled("corstone300"): + tester.run_method_and_compare_outputs( + qtol=1, inputs=test_data, target_board="corstone-320" + ) @parameterized.expand(Add2.test_parameters) def test_add2_tosa_MI(self, operand1: torch.Tensor, operand2: torch.Tensor): @@ -145,4 +167,21 @@ def test_add2_tosa_BI(self, operand1: torch.Tensor, operand2: torch.Tensor): @parameterized.expand(Add2.test_parameters) def test_add2_u55_BI(self, operand1: torch.Tensor, operand2: torch.Tensor): test_data = (operand1, operand2) - self._test_add_u55_BI_pipeline(self.Add2(), test_data) + tester = self._test_add_ethos_BI_pipeline( + self.Add2(), common.get_u55_compile_spec(), test_data + ) + if common.is_option_enabled("corstone300"): + tester.run_method_and_compare_outputs( + qtol=1, inputs=test_data, target_board="corstone-300" + ) + + @parameterized.expand(Add2.test_parameters) + def test_add2_u85_BI(self, operand1: torch.Tensor, operand2: torch.Tensor): + test_data = (operand1, operand2) + tester = self._test_add_ethos_BI_pipeline( + self.Add2(), common.get_u85_compile_spec(), test_data + ) + if common.is_option_enabled("corstone300"): + tester.run_method_and_compare_outputs( + qtol=1, inputs=test_data, target_board="corstone-320" + ) diff --git a/backends/arm/test/ops/test_avg_pool.py b/backends/arm/test/ops/test_avg_pool.py index 32a0e5555a3..6c14420dbcf 100644 --- a/backends/arm/test/ops/test_avg_pool.py +++ b/backends/arm/test/ops/test_avg_pool.py @@ -13,6 +13,7 @@ import torch from executorch.backends.arm.test import common from executorch.backends.arm.test.tester.arm_tester import ArmTester +from executorch.exir.backend.backend_details import CompileSpec from parameterized import parameterized logger = logging.getLogger(__name__) @@ -86,14 +87,17 @@ def _test_avgpool2d_tosa_BI_pipeline( .run_method_and_compare_outputs(inputs=test_data, qtol=1) ) - def _test_avgpool2d_tosa_u55_BI_pipeline( - self, module: torch.nn.Module, test_data: Tuple[torch.tensor] + def _test_avgpool2d_tosa_ethos_BI_pipeline( + self, + module: torch.nn.Module, + compile_spec: CompileSpec, + test_data: Tuple[torch.tensor], ): ( ArmTester( module, example_inputs=test_data, - compile_spec=common.get_u55_compile_spec(permute_memory_to_nhwc=True), + compile_spec=compile_spec, ) .quantize() .export() @@ -141,6 +145,22 @@ def test_avgpool2d_tosa_u55_BI( test_data: torch.Tensor, model_params: int | Tuple[int, int], ): - self._test_avgpool2d_tosa_u55_BI_pipeline( - self.AvgPool2d(*model_params), (test_data,) + self._test_avgpool2d_tosa_ethos_BI_pipeline( + self.AvgPool2d(*model_params), + common.get_u55_compile_spec(permute_memory_to_nhwc=True), + (test_data,), + ) + + @parameterized.expand(test_data_suite) + @unittest.expectedFailure + 
def test_avgpool2d_tosa_u85_BI( + self, + test_name: str, + test_data: torch.Tensor, + model_params: int | Tuple[int, int], + ): + self._test_avgpool2d_tosa_ethos_BI_pipeline( + self.AvgPool2d(*model_params), + common.get_u85_compile_spec(permute_memory_to_nhwc=True), + (test_data,), ) diff --git a/backends/arm/test/ops/test_bmm.py b/backends/arm/test/ops/test_bmm.py index 30f45261247..e4e6abb7bb3 100644 --- a/backends/arm/test/ops/test_bmm.py +++ b/backends/arm/test/ops/test_bmm.py @@ -11,6 +11,7 @@ import torch from executorch.backends.arm.test import common from executorch.backends.arm.test.tester.arm_tester import ArmTester +from executorch.exir.backend.compile_spec_schema import CompileSpec from parameterized import parameterized torch.manual_seed(1) @@ -83,14 +84,17 @@ def _test_bmm_tosa_BI_pipeline( .run_method_and_compare_outputs(inputs=test_data) ) - def _test_bmm_u55_BI_pipeline( - self, module: torch.nn.Module, test_data: Tuple[torch.Tensor, ...] + def _test_bmm_ethosu_BI_pipeline( + self, + module: torch.nn.Module, + compile_spec: CompileSpec, + test_data: Tuple[torch.Tensor, ...], ): ( ArmTester( module, example_inputs=test_data, - compile_spec=common.get_u55_compile_spec(), + compile_spec=compile_spec, ) .quantize() .export() @@ -132,4 +136,13 @@ def test_bmm_u55_BI(self, operand1: torch.Tensor, operand2: torch.Tensor): @unittest.expectedFailure def test_bmm_single_input_u55_BI(self, operand1: torch.Tensor): test_data = (operand1,) - self._test_bmm_u55_BI_pipeline(self.BMMSingleInput(), test_data) + self._test_bmm_ethosu_BI_pipeline( + self.BMMSingleInput(), common.get_u55_compile_spec(), test_data + ) + + @parameterized.expand(BMMSingleInput.test_parameters) + def test_bmm_single_input_u85_BI(self, operand1: torch.Tensor): + test_data = (operand1,) + self._test_bmm_ethosu_BI_pipeline( + self.BMMSingleInput(), common.get_u85_compile_spec(), test_data + ) diff --git a/backends/arm/test/ops/test_cat.py b/backends/arm/test/ops/test_cat.py index f677aa5590c..9723ba0f0c0 100644 --- a/backends/arm/test/ops/test_cat.py +++ b/backends/arm/test/ops/test_cat.py @@ -13,6 +13,7 @@ from executorch.backends.arm.test import common from executorch.backends.arm.test.tester.arm_tester import ArmTester +from executorch.exir.backend.compile_spec_schema import CompileSpec from parameterized import parameterized @@ -89,14 +90,17 @@ def _test_cat_tosa_BI_pipeline( .run_method_and_compare_outputs(inputs=test_data, qtol=1) ) - def _test_cat_u55_BI_pipeline( - self, module: torch.nn.Module, test_data: Tuple[tuple[torch.Tensor, ...], int] + def _test_cat_ethosu_BI_pipeline( + self, + module: torch.nn.Module, + compile_spec: CompileSpec, + test_data: Tuple[tuple[torch.Tensor, ...], int], ): ( ArmTester( module, example_inputs=test_data, - compile_spec=common.get_u55_compile_spec(), + compile_spec=compile_spec, ) .quantize() .export() @@ -128,4 +132,13 @@ def test_cat_tosa_BI(self, operands: tuple[torch.Tensor, ...], dim: int): @parameterized.expand(Cat.test_parameters) def test_cat_u55_BI(self, operands: tuple[torch.Tensor, ...], dim: int): test_data = (operands, dim) - self._test_cat_u55_BI_pipeline(self.Cat(), test_data) + self._test_cat_ethosu_BI_pipeline( + self.Cat(), common.get_u55_compile_spec(), test_data + ) + + @parameterized.expand(Cat.test_parameters) + def test_cat_u85_BI(self, operands: tuple[torch.Tensor, ...], dim: int): + test_data = (operands, dim) + self._test_cat_ethosu_BI_pipeline( + self.Cat(), common.get_u85_compile_spec(), test_data + ) diff --git 
a/backends/arm/test/ops/test_clone.py b/backends/arm/test/ops/test_clone.py index 8386283f24e..9852c5c4520 100644 --- a/backends/arm/test/ops/test_clone.py +++ b/backends/arm/test/ops/test_clone.py @@ -21,6 +21,8 @@ from executorch.backends.arm.test.tester.arm_tester import ArmTester from executorch.backends.xnnpack.test.tester.tester import Quantize + +from executorch.exir.backend.compile_spec_schema import CompileSpec from parameterized import parameterized @@ -76,16 +78,15 @@ def _test_clone_tosa_BI_pipeline( .run_method_and_compare_outputs(inputs=test_data, qtol=1) ) - def _test_clone_tosa_u55_pipeline( - self, module: torch.nn.Module, test_data: Tuple[torch.Tensor] + def _test_clone_tosa_ethos_pipeline( + self, + compile_spec: list[CompileSpec], + module: torch.nn.Module, + test_data: Tuple[torch.Tensor], ): quantizer = ArmQuantizer().set_io(get_symmetric_quantization_config()) ( - ArmTester( - module, - example_inputs=test_data, - compile_spec=common.get_u55_compile_spec(), - ) + ArmTester(module, example_inputs=test_data, compile_spec=compile_spec) .quantize(Quantize(quantizer, get_symmetric_quantization_config())) .export() .check_count({"torch.ops.aten.clone.default": 1}) @@ -95,6 +96,20 @@ def _test_clone_tosa_u55_pipeline( .to_executorch() ) + def _test_clone_tosa_u55_pipeline( + self, module: torch.nn.Module, test_data: Tuple[torch.Tensor] + ): + self._test_clone_tosa_ethos_pipeline( + common.get_u55_compile_spec(), module, test_data + ) + + def _test_clone_tosa_u85_pipeline( + self, module: torch.nn.Module, test_data: Tuple[torch.Tensor] + ): + self._test_clone_tosa_ethos_pipeline( + common.get_u85_compile_spec(), module, test_data + ) + @parameterized.expand(Clone.test_parameters) def test_clone_tosa_MI(self, test_tensor: torch.Tensor): self._test_clone_tosa_MI_pipeline(self.Clone(), (test_tensor,)) @@ -106,3 +121,7 @@ def test_clone_tosa_BI(self, test_tensor: torch.Tensor): @parameterized.expand(Clone.test_parameters) def test_clone_u55_BI(self, test_tensor: torch.Tensor): self._test_clone_tosa_u55_pipeline(self.Clone(), (test_tensor,)) + + @parameterized.expand(Clone.test_parameters) + def test_clone_u85_BI(self, test_tensor: torch.Tensor): + self._test_clone_tosa_u85_pipeline(self.Clone(), (test_tensor,)) diff --git a/backends/arm/test/ops/test_conv.py b/backends/arm/test/ops/test_conv.py index 82748799533..decf790ce51 100644 --- a/backends/arm/test/ops/test_conv.py +++ b/backends/arm/test/ops/test_conv.py @@ -12,6 +12,7 @@ from executorch.backends.arm.test import common from executorch.backends.arm.test.tester.arm_tester import ArmTester +from executorch.exir.backend.compile_spec_schema import CompileSpec from parameterized import parameterized @@ -241,19 +242,6 @@ def forward(self, x): ("two_conv2d", two_conv2d), ] -# Expected fails on Ethos-U55/U65. This is a known limitation. -# Check: https://review.mlplatform.org/plugins/gitiles/ml/ethos-u/ethos-u-vela/+/refs/heads/main/SUPPORTED_OPS.md -# IFM Tensor batch size must be 1 - [FULLY_CONNECTED, RESHAPE, SHAPE, SLICE, SOFTMAX, SPLIT, SPLIT_V, SQUEEZE, STRIDED_SLICE, UNPACK] -testsuite_u55 = testsuite.copy() -testsuite_u55.remove(("2x2_3x2x40x40_nobias", conv2d_2x2_3x2x40x40_nobias)) -testsuite_u55.remove(("5x5_3x2x128x128_st1", conv2d_5x5_3x2x128x128_st1)) - -# Fails when enabling CompileSpec.set_quantize_io(True). MLETORCH-191. 
-testsuite_u55.remove(("2x2_1x1x14x13_st2_needs_adjust_pass", conv2d_2x2_1x1x14x13_st2)) -testsuite_u55.remove( - ("conv2d_5x5_1x3x14x15_st3_pd1_needs_adjust_pass", conv2d_5x5_1x3x14x15_st3_pd1) -) - class TestConv2D(unittest.TestCase): """Tests Conv2D, both single ops and multiple Convolutions in series.""" @@ -297,14 +285,17 @@ def _test_conv2d_tosa_BI_pipeline( .run_method_and_compare_outputs(inputs=test_data, qtol=1) ) - def _test_conv2d_u55_BI_pipeline( - self, module: torch.nn.Module, test_data: Tuple[torch.Tensor] + def _test_conv2d_ethosu_BI_pipeline( + self, + compile_spec: CompileSpec, + module: torch.nn.Module, + test_data: Tuple[torch.Tensor], ): ( ArmTester( module, example_inputs=test_data, - compile_spec=common.get_u55_compile_spec(permute_memory_to_nhwc=True), + compile_spec=compile_spec, ) .quantize() .export() @@ -323,6 +314,18 @@ def test_conv2d_tosa_MI(self, test_name, model): def test_conv2d_tosa_BI(self, test_name, model): self._test_conv2d_tosa_BI_pipeline(model, model.get_inputs()) - @parameterized.expand(testsuite_u55) + @parameterized.expand(testsuite) def test_conv2d_u55_BI(self, test_name, model): - self._test_conv2d_u55_BI_pipeline(model, model.get_inputs()) + self._test_conv2d_ethosu_BI_pipeline( + common.get_u55_compile_spec(permute_memory_to_nhwc=True), + model, + model.get_inputs(), + ) + + @parameterized.expand(testsuite) + def test_conv2d_u85_BI(self, test_name, model): + self._test_conv2d_ethosu_BI_pipeline( + common.get_u85_compile_spec(permute_memory_to_nhwc=True), + model, + model.get_inputs(), + ) diff --git a/backends/arm/test/ops/test_conv_combos.py b/backends/arm/test/ops/test_conv_combos.py index 31051ef8f7d..9bc12c5166a 100644 --- a/backends/arm/test/ops/test_conv_combos.py +++ b/backends/arm/test/ops/test_conv_combos.py @@ -9,9 +9,12 @@ from typing import Tuple +import pytest + import torch from executorch.backends.arm.test import common from executorch.backends.arm.test.tester.arm_tester import ArmTester +from executorch.exir.backend.backend_details import CompileSpec from parameterized import parameterized logger = logging.getLogger(__name__) @@ -199,14 +202,17 @@ def _test_conv_combo_tosa_BI_pipeline( ) ) - def _test_conv_combo_u55_BI_pipeline( - self, module: torch.nn.Module, test_data: Tuple[torch.Tensor] + def _test_conv_combo_ethos_BI_pipeline( + self, + module: torch.nn.Module, + compile_spec: CompileSpec, + test_data: Tuple[torch.Tensor], ): ( ArmTester( module, example_inputs=test_data, - compile_spec=common.get_u55_compile_spec(permute_memory_to_nhwc=True), + compile_spec=compile_spec, ) .quantize() .export() @@ -230,7 +236,19 @@ def test_conv_meandim_tosa_BI(self): def test_conv_meandim_u55_BI(self): model = ComboConv2dMeandim() - self._test_conv_combo_u55_BI_pipeline(model, model.get_inputs()) + self._test_conv_combo_ethos_BI_pipeline( + model, + common.get_u55_compile_spec(permute_memory_to_nhwc=True), + model.get_inputs(), + ) + + def test_conv_meandim_u85_BI(self): + model = ComboConv2dMeandim() + self._test_conv_combo_ethos_BI_pipeline( + model, + common.get_u85_compile_spec(permute_memory_to_nhwc=True), + model.get_inputs(), + ) ############################## ## Conv + batch norm + relu ## @@ -245,7 +263,17 @@ def test_conv_batchnorm_relu6_tosa_BI(self): def test_conv_batchnorm_relu6_u55_BI(self): model = ComboConvBatchnormRelu6() - self._test_conv_combo_u55_BI_pipeline(model, model.get_inputs()) + self._test_conv_combo_ethos_BI_pipeline( + model, common.get_u55_compile_spec(), model.get_inputs() + ) + + def 
test_conv_batchnorm_relu_u85_BI(self): + model = ComboConvBatchnormRelu6() + self._test_conv_combo_ethos_BI_pipeline( + model, + common.get_u85_compile_spec(), + model.get_inputs(), + ) ################## ## Conv + ReLU6 ## @@ -266,7 +294,17 @@ def test_conv_relu6_tosa_BI(self, test_data: torch.Tensor): def test_conv_relu6_u55_BI(self, test_data: torch.Tensor): model = ComboConvRelu6() test_data = (test_data,) - self._test_conv_combo_u55_BI_pipeline(model, test_data) + self._test_conv_combo_ethos_BI_pipeline( + model, common.get_u55_compile_spec(permute_memory_to_nhwc=True), test_data + ) + + @parameterized.expand(ComboConvRelu6.test_data) + def test_conv_relu6_u85_BI(self, test_data: torch.Tensor): + model = ComboConvRelu6() + test_data = (test_data,) + self._test_conv_combo_ethos_BI_pipeline( + model, common.get_u85_compile_spec(permute_memory_to_nhwc=True), test_data + ) ############################### ## Block bottleneck residual ## @@ -275,10 +313,24 @@ def test_block_bottleneck_residual_tosa_MI(self): model = ComboBlockBottleneckResidual() self._test_conv_combo_tosa_MI_pipeline(model, model.get_inputs()) + # TODO: Investigate flakyness (MLTORCH-307) + @pytest.mark.flaky(reruns=3) def test_block_bottleneck_residual_tosa_BI(self): model = ComboBlockBottleneckResidual() self._test_conv_combo_tosa_BI_pipeline(model, model.get_inputs()) def test_block_bottleneck_residual_u55_BI(self): model = ComboBlockBottleneckResidual() - self._test_conv_combo_u55_BI_pipeline(model, model.get_inputs()) + self._test_conv_combo_ethos_BI_pipeline( + model, + common.get_u55_compile_spec(permute_memory_to_nhwc=True), + model.get_inputs(), + ) + + def test_block_bottleneck_residual_u85_BI(self): + model = ComboBlockBottleneckResidual() + self._test_conv_combo_ethos_BI_pipeline( + model, + common.get_u85_compile_spec(permute_memory_to_nhwc=True), + model.get_inputs(), + ) diff --git a/backends/arm/test/ops/test_depthwise_conv.py b/backends/arm/test/ops/test_depthwise_conv.py index 9b3f79e6a11..a63066bee68 100644 --- a/backends/arm/test/ops/test_depthwise_conv.py +++ b/backends/arm/test/ops/test_depthwise_conv.py @@ -16,6 +16,7 @@ from executorch.backends.arm.test.ops.test_conv import Conv2d from executorch.backends.arm.test.tester.arm_tester import ArmTester +from executorch.exir.backend.backend_details import CompileSpec from parameterized import parameterized logger = logging.getLogger(__name__) @@ -112,24 +113,6 @@ ("two_dw_conv2d", two_dw_conv2d), ] -# Expected fails on Ethos-U55/U65. This is a known limitation. -# Check: https://review.mlplatform.org/plugins/gitiles/ml/ethos-u/ethos-u-vela/+/refs/heads/main/SUPPORTED_OPS.md -# For depth multipliers > 1, IFM channels must be 1 and OFM channels must be -# equal to the depth multiplier -# and -# depthwise_multiplier = out_channels / in_channels -testsuite_u55 = testsuite.copy() -testsuite_u55.remove(("2x2_1x6x4x4_gp6_st1", dw_conv2d_2x2_1x6x4x4_gp6_st1)) -testsuite_u55.remove(("3x3_1x4x256x256_gp4_st1", dw_conv2d_3x3_1x4x256x256_gp4_st1)) -testsuite_u55.remove(("3x3_2x8x198x198_gp8_st3", dw_conv2d_3x3_2x8x198x198_gp8_st3)) -testsuite_u55.remove( - ("3x3_1x4x256x256_gp4_nobias", dw_conv2d_3x3_1x4x256x256_gp4_nobias) -) -testsuite_u55.remove(("two_dw_conv2d", two_dw_conv2d)) - -# Fails when enabling CompileSpec.set_quantize_io(True). MLETORCH-191. -testsuite_u55.remove(("3x3_1x3x256x256_gp3_st1", dw_conv2d_3x3_1x3x256x256_gp3_st1)) - class TestDepthwiseConv2D(unittest.TestCase): """Tests Conv2D where groups == in_channels and out_channels = K * in_channels. 
This @@ -172,14 +155,17 @@ def _test_dw_conv2d_tosa_BI_pipeline( .run_method_and_compare_outputs(inputs=test_data, qtol=1) ) - def _test_dw_conv2d_u55_BI_pipeline( - self, module: torch.nn.Module, test_data: Tuple[torch.Tensor] + def _test_dw_conv2d_ethos_BI_pipeline( + self, + module: torch.nn.Module, + compile_spec: CompileSpec, + test_data: Tuple[torch.Tensor], ): ( ArmTester( module, example_inputs=test_data, - compile_spec=common.get_u55_compile_spec(permute_memory_to_nhwc=True), + compile_spec=compile_spec, ) .quantize() .export() @@ -191,16 +177,35 @@ def _test_dw_conv2d_u55_BI_pipeline( ) @parameterized.expand(testsuite) - def test_dw_conv2d_tosa_MI(self, test_name, model): + def test_dw_conv2d_tosa_MI(self, test_name: str, model: torch.nn.Module): self._test_dw_conv2d_tosa_MI_pipeline(model, model.get_inputs()) # TODO: Investigate flakyness (MLTORCH-307) @parameterized.expand(testsuite) @pytest.mark.flaky(reruns=3) - def test_dw_conv2d_tosa_BI(self, test_name, model): + def test_dw_conv2d_tosa_BI(self, test_name: str, model: torch.nn.Module): self._test_dw_conv2d_tosa_BI_pipeline(model, model.get_inputs()) - @parameterized.expand(testsuite_u55, skip_on_empty=True) - @unittest.expectedFailure - def test_dw_conv2d_u55_BI(self, test_name, model): - self._test_dw_conv2d_u55_BI_pipeline(model, model.get_inputs()) + @parameterized.expand(testsuite, skip_on_empty=True) + def test_dw_conv2d_u55_BI( + self, test_name: str, model: torch.nn.Module, set_quantize_io: bool = False + ): + self._test_dw_conv2d_ethos_BI_pipeline( + model, + common.get_u55_compile_spec( + permute_memory_to_nhwc=True, quantize_io=set_quantize_io + ), + model.get_inputs(), + ) + + @parameterized.expand(testsuite) + def test_dw_conv2d_u85_BI( + self, test_name: str, model: torch.nn.Module, set_quantize_io: bool = False + ): + self._test_dw_conv2d_ethos_BI_pipeline( + model, + common.get_u85_compile_spec( + permute_memory_to_nhwc=True, quantize_io=set_quantize_io + ), + model.get_inputs(), + ) diff --git a/backends/arm/test/ops/test_exp.py b/backends/arm/test/ops/test_exp.py index 79020ade25c..6e85d8fe49b 100644 --- a/backends/arm/test/ops/test_exp.py +++ b/backends/arm/test/ops/test_exp.py @@ -12,6 +12,7 @@ import torch from executorch.backends.arm.test import common from executorch.backends.arm.test.tester.arm_tester import ArmTester +from executorch.exir.backend.backend_details import CompileSpec from parameterized import parameterized test_data_suite = [ @@ -71,8 +72,11 @@ def _test_exp_tosa_BI_pipeline(self, module: torch.nn.Module, test_data: Tuple): .run_method_and_compare_outputs(inputs=test_data) ) - def _test_exp_tosa_u55_BI_pipeline( - self, module: torch.nn.Module, test_data: Tuple[torch.tensor] + def _test_exp_ethosu_BI_pipeline( + self, + compile_spec: CompileSpec, + module: torch.nn.Module, + test_data: Tuple[torch.tensor], ): ( ArmTester( @@ -103,8 +107,14 @@ def test_exp_tosa_MI( def test_exp_tosa_BI(self, test_name: str, test_data: torch.Tensor): self._test_exp_tosa_BI_pipeline(self.Exp(), (test_data,)) - # Fails due to Vela diff from Tosa spec, expected to work with Regor. 
@parameterized.expand(test_data_suite) - @unittest.expectedFailure def test_exp_tosa_u55_BI(self, test_name: str, test_data: torch.Tensor): - self._test_exp_tosa_u55_BI_pipeline(self.Exp(), (test_data,)) + self._test_exp_ethosu_BI_pipeline( + common.get_u55_compile_spec(), self.Exp(), (test_data,) + ) + + @parameterized.expand(test_data_suite) + def test_exp_tosa_u85_BI(self, test_name: str, test_data: torch.Tensor): + self._test_exp_ethosu_BI_pipeline( + common.get_u85_compile_spec(), self.Exp(), (test_data,) + ) diff --git a/backends/arm/test/ops/test_expand.py b/backends/arm/test/ops/test_expand.py index 66c081a544c..e9bbea9a5e5 100644 --- a/backends/arm/test/ops/test_expand.py +++ b/backends/arm/test/ops/test_expand.py @@ -76,7 +76,9 @@ def _test_expand_tosa_BI_pipeline(self, module: torch.nn.Module, test_data: Tupl .run_method_and_compare_outputs(inputs=test_data, qtol=1) ) - def _test_expand_tosa_u55_pipeline(self, module: torch.nn.Module, test_data: Tuple): + def _test_expand_ethosu_BI_pipeline( + self, module: torch.nn.Module, test_data: Tuple + ): quantizer = ArmQuantizer().set_io(get_symmetric_quantization_config()) ( ArmTester( @@ -104,6 +106,15 @@ def test_expand_tosa_BI(self, test_input, multiples): # Expected failure since tosa.TILE is unsupported by Vela. @parameterized.expand(Expand.test_parameters) - @unittest.expectedFailure + @unittest.expectedFailure # TODO: MLBEDSW-9386 def test_expand_u55_BI(self, test_input, multiples): - self._test_expand_tosa_u55_pipeline(self.Expand(), (test_input, multiples)) + self._test_expand_ethosu_BI_pipeline( + self.Expand(), common.get_u55_compile_spec(), (test_input, multiples) + ) + + @parameterized.expand(Expand.test_parameters) + @unittest.expectedFailure # TODO: MLBEDSW-9386 + def test_expand_u85_BI(self, test_input, multiples): + self._test_expand_ethosu_BI_pipeline( + self.Expand(), common.get_u85_compile_spec(), (test_input, multiples) + ) diff --git a/backends/arm/test/ops/test_full.py b/backends/arm/test/ops/test_full.py index 1be7f59ab8f..2722edef328 100644 --- a/backends/arm/test/ops/test_full.py +++ b/backends/arm/test/ops/test_full.py @@ -15,6 +15,7 @@ import torch from executorch.backends.arm.test import common from executorch.backends.arm.test.tester.arm_tester import ArmTester +from executorch.exir.backend.compile_spec_schema import CompileSpec from parameterized import parameterized @@ -93,13 +94,11 @@ def _test_full_tosa_BI_pipeline( .run_method_and_compare_outputs(inputs=test_data) ) - def _test_full_tosa_u55_pipeline(self, module: torch.nn.Module, test_data: Tuple): + def _test_full_tosa_ethos_pipeline( + self, compile_spec: list[CompileSpec], module: torch.nn.Module, test_data: Tuple + ): ( - ArmTester( - module, - example_inputs=test_data, - compile_spec=common.get_u55_compile_spec(), - ) + ArmTester(module, example_inputs=test_data, compile_spec=compile_spec) .quantize() .export() .check_count({"torch.ops.aten.full.default": 1}) @@ -110,6 +109,16 @@ def _test_full_tosa_u55_pipeline(self, module: torch.nn.Module, test_data: Tuple .to_executorch() ) + def _test_full_tosa_u55_pipeline(self, module: torch.nn.Module, test_data: Tuple): + self._test_full_tosa_ethos_pipeline( + common.get_u55_compile_spec(), module, test_data + ) + + def _test_full_tosa_u85_pipeline(self, module: torch.nn.Module, test_data: Tuple): + self._test_full_tosa_ethos_pipeline( + common.get_u85_compile_spec(), module, test_data + ) + def test_only_full_tosa_MI(self): self._test_full_tosa_MI_pipeline(self.Full(), ()) @@ -138,6 +147,13 @@ def 
test_full_u55_BI(self, test_tensor: Tuple): test_tensor, ) + @parameterized.expand(AddVariableFull.test_parameters) + def test_full_u85_BI(self, test_tensor: Tuple): + self._test_full_tosa_u85_pipeline( + self.AddVariableFull(), + test_tensor, + ) + # This fails since full outputs int64 by default if 'fill_value' is integer, which our backend doesn't support. @unittest.expectedFailure def test_integer_value(self): diff --git a/backends/arm/test/ops/test_linear.py b/backends/arm/test/ops/test_linear.py index 6fdbb2127e0..3f68ab0251a 100644 --- a/backends/arm/test/ops/test_linear.py +++ b/backends/arm/test/ops/test_linear.py @@ -15,6 +15,7 @@ from executorch.backends.arm.test.tester.arm_tester import ArmTester from executorch.exir import EdgeCompileConfig +from executorch.exir.backend.compile_spec_schema import CompileSpec from parameterized import parameterized logger = logging.getLogger(__name__) @@ -153,14 +154,17 @@ def _test_linear_tosa_BI_pipeline( .run_method_and_compare_outputs(inputs=test_data, qtol=True) ) - def _test_linear_tosa_u55_BI_pipeline( - self, module: torch.nn.Module, test_data: Tuple[torch.Tensor] - ): + def _test_linear_tosa_ethosu_BI_pipeline( + self, + module: torch.nn.Module, + compile_spec: CompileSpec, + test_data: Tuple[torch.Tensor], + ) -> ArmTester: tester = ( ArmTester( module, example_inputs=test_data, - compile_spec=common.get_u55_compile_spec(permute_memory_to_nhwc=False), + compile_spec=compile_spec, ) .quantize() .export() @@ -172,9 +176,7 @@ def _test_linear_tosa_u55_BI_pipeline( .to_executorch() .serialize() ) - - if common.is_option_enabled("corstone300"): - tester.run_method_and_compare_outputs(qtol=1, inputs=test_data) + return tester @parameterized.expand(test_data_suite_rank1 + test_data_suite_rank4) def test_linear_tosa_MI( @@ -215,10 +217,32 @@ def test_linear_tosa_u55_BI( ): in_features = test_data.shape[-1] test_data = (test_data,) - self._test_linear_tosa_u55_BI_pipeline( + tester = self._test_linear_tosa_ethosu_BI_pipeline( + self.Linear( + in_features=in_features, + out_features=out_features, + ), + common.get_u55_compile_spec(permute_memory_to_nhwc=False), + test_data, + ) + + if common.is_option_enabled("corstone300"): + tester.run_method_and_compare_outputs(qtol=1, inputs=test_data) + + @parameterized.expand(test_data_suite_rank1) + def test_linear_tosa_u85_BI( + self, + test_name: str, + test_data: torch.Tensor, + out_features: int, + ): + in_features = test_data.shape[-1] + test_data = (test_data,) + self._test_linear_tosa_ethosu_BI_pipeline( self.Linear( in_features=in_features, out_features=out_features, ), + common.get_u85_compile_spec(permute_memory_to_nhwc=False), test_data, ) diff --git a/backends/arm/test/ops/test_log.py b/backends/arm/test/ops/test_log.py index 80bc17c987f..269b7be25f5 100644 --- a/backends/arm/test/ops/test_log.py +++ b/backends/arm/test/ops/test_log.py @@ -12,6 +12,7 @@ import torch from executorch.backends.arm.test import common from executorch.backends.arm.test.tester.arm_tester import ArmTester +from executorch.exir.backend.backend_details import CompileSpec from parameterized import parameterized test_data_suite = [ @@ -71,14 +72,17 @@ def _test_log_tosa_BI_pipeline(self, module: torch.nn.Module, test_data: Tuple): .run_method_and_compare_outputs(inputs=test_data) ) - def _test_log_tosa_u55_BI_pipeline( - self, module: torch.nn.Module, test_data: Tuple[torch.tensor] + def _test_log_ethosu_BI_pipeline( + self, + compile_spec: CompileSpec, + module: torch.nn.Module, + test_data: Tuple[torch.tensor], ): ( 
ArmTester( module, example_inputs=test_data, - compile_spec=common.get_u55_compile_spec(), + compile_spec=compile_spec, ) .quantize() .export() @@ -103,8 +107,14 @@ def test_log_tosa_MI( def test_log_tosa_BI(self, test_name: str, test_data: torch.Tensor): self._test_log_tosa_BI_pipeline(self.Log(), (test_data,)) - # Fails due to Vela diff from Tosa spec, logected to work with Regor. @parameterized.expand(test_data_suite) - @unittest.expectedFailure def test_log_tosa_u55_BI(self, test_name: str, test_data: torch.Tensor): - self._test_log_tosa_u55_BI_pipeline(self.Log(), (test_data,)) + self._test_log_ethosu_BI_pipeline( + common.get_u55_compile_spec(), self.Log(), (test_data,) + ) + + @parameterized.expand(test_data_suite) + def test_log_tosa_u85_BI(self, test_name: str, test_data: torch.Tensor): + self._test_log_ethosu_BI_pipeline( + common.get_u85_compile_spec(), self.Log(), (test_data,) + ) diff --git a/backends/arm/test/ops/test_mean_dim.py b/backends/arm/test/ops/test_mean_dim.py index e48d749c194..0653e84e704 100644 --- a/backends/arm/test/ops/test_mean_dim.py +++ b/backends/arm/test/ops/test_mean_dim.py @@ -13,6 +13,7 @@ import torch from executorch.backends.arm.test import common from executorch.backends.arm.test.tester.arm_tester import ArmTester +from executorch.exir.backend.backend_details import CompileSpec from parameterized import parameterized logger = logging.getLogger(__name__) @@ -91,14 +92,17 @@ def _test_meandim_tosa_BI_pipeline( .run_method_and_compare_outputs(inputs=test_data, qtol=1) ) - def _test_meandim_tosa_u55_BI_pipeline( - self, module: torch.nn.Module, test_data: Tuple[torch.tensor] + def _test_meandim_tosa_ethosu_BI_pipeline( + self, + module: torch.nn.Module, + compile_spec: CompileSpec, + test_data: Tuple[torch.tensor], ): ( ArmTester( module, example_inputs=test_data, - compile_spec=common.get_u55_compile_spec(), + compile_spec=compile_spec, ) .quantize() .export() @@ -138,4 +142,20 @@ def test_meandim_tosa_u55_BI( test_name: str, test_data: torch.Tensor, ): - self._test_meandim_tosa_u55_BI_pipeline(self.MeanDim(), (test_data,)) + self._test_meandim_tosa_ethosu_BI_pipeline( + self.MeanDim(), + common.get_u55_compile_spec(), + (test_data,), + ) + + @parameterized.expand(test_data_suite) + def test_meandim_tosa_u85_BI( + self, + test_name: str, + test_data: torch.Tensor, + ): + self._test_meandim_tosa_ethosu_BI_pipeline( + self.MeanDim(), + common.get_u85_compile_spec(), + (test_data,), + ) diff --git a/backends/arm/test/ops/test_mm.py b/backends/arm/test/ops/test_mm.py index 9a9b3ef579b..4271496eaa9 100644 --- a/backends/arm/test/ops/test_mm.py +++ b/backends/arm/test/ops/test_mm.py @@ -12,6 +12,7 @@ import torch from executorch.backends.arm.test import common from executorch.backends.arm.test.tester.arm_tester import ArmTester +from executorch.exir.backend.backend_details import CompileSpec from parameterized import parameterized logger = logging.getLogger(__name__) @@ -87,14 +88,17 @@ def _test_mm_tosa_BI_pipeline( .run_method_and_compare_outputs(inputs=test_data) ) - def _test_mm_u55_BI_pipeline( - self, module: torch.nn.Module, test_data: Tuple[torch.Tensor] + def _test_mm_ethosu_BI_pipeline( + self, + compile_spec: CompileSpec, + module: torch.nn.Module, + test_data: Tuple[torch.Tensor], ): ( ArmTester( module, example_inputs=test_data, - compile_spec=common.get_u55_compile_spec(), + compile_spec=compile_spec, ) .quantize() .export() @@ -131,11 +135,29 @@ def test_mm_single_input_tosa_BI(self, operand1: torch.Tensor): @unittest.expectedFailure def 
test_mm_u55_BI(self, operand1: torch.Tensor, operand2: torch.Tensor): test_data = (operand1, operand2) - self._test_mm_u55_BI_pipeline(self.MM(), test_data) + self._test_mm_ethosu_BI_pipeline( + common.get_u55_compile_spec(), self.MM(), test_data + ) # Expected to fail with error: Warning, unsupported fusing of TOSA Rescale previous operator is of type: Memcpy @parameterized.expand(MMSingleInput.test_parameters) @unittest.expectedFailure def test_mm_single_input_u55_BI(self, operand1: torch.Tensor): test_data = (operand1,) - self._test_mm_u55_BI_pipeline(self.MMSingleInput(), test_data) + self._test_mm_ethosu_BI_pipeline( + common.get_u55_compile_spec(), self.MMSingleInput(), test_data + ) + + @parameterized.expand(MM.test_parameters) + def test_mm_u85_BI(self, operand1: torch.Tensor, operand2: torch.Tensor): + test_data = (operand1, operand2) + self._test_mm_ethosu_BI_pipeline( + common.get_u85_compile_spec(), self.MM(), test_data + ) + + @parameterized.expand(MMSingleInput.test_parameters) + def test_mm_single_input_u85_BI(self, operand1: torch.Tensor): + test_data = (operand1,) + self._test_mm_ethosu_BI_pipeline( + common.get_u85_compile_spec(), self.MMSingleInput(), test_data + ) diff --git a/backends/arm/test/ops/test_mul.py b/backends/arm/test/ops/test_mul.py index dee8b62f1b2..a1c2dba5fed 100644 --- a/backends/arm/test/ops/test_mul.py +++ b/backends/arm/test/ops/test_mul.py @@ -10,6 +10,7 @@ import torch from executorch.backends.arm.test import common from executorch.backends.arm.test.tester.arm_tester import ArmTester +from executorch.exir.backend.backend_details import CompileSpec from parameterized import parameterized test_data_sute = [ @@ -101,14 +102,17 @@ def _test_mul_tosa_BI_pipeline( .run_method_and_compare_outputs(inputs=test_data, qtol=1.0) ) - def _test_mul_u55_BI_pipeline( - self, module: torch.nn.Module, test_data: tuple[torch.Tensor, torch.Tensor] + def _test_mul_ethosu_BI_pipeline( + self, + compile_spec: CompileSpec, + module: torch.nn.Module, + test_data: tuple[torch.Tensor, torch.Tensor], ): ( ArmTester( module, example_inputs=test_data, - compile_spec=common.get_u55_compile_spec(permute_memory_to_nhwc=True), + compile_spec=compile_spec, ) .quantize() .export() @@ -141,9 +145,7 @@ def test_mul_tosa_BI( test_data = (input_, other_) self._test_mul_tosa_BI_pipeline(self.Mul(), test_data) - # Expected to fail since RESCALE cannot be fused with MUL in Vela. 
@parameterized.expand(test_data_sute) - @unittest.expectedFailure def test_mul_u55_BI( self, test_name: str, @@ -151,4 +153,18 @@ def test_mul_u55_BI( other_: torch.Tensor, ): test_data = (input_, other_) - self._test_mul_u55_BI_pipeline(self.Mul(), test_data) + self._test_mul_ethosu_BI_pipeline( + common.get_u55_compile_spec(), self.Mul(), test_data + ) + + @parameterized.expand(test_data_sute) + def test_mul_u85_BI( + self, + test_name: str, + input_: torch.Tensor, + other_: torch.Tensor, + ): + test_data = (input_, other_) + self._test_mul_ethosu_BI_pipeline( + common.get_u85_compile_spec(), self.Mul(), test_data + ) diff --git a/backends/arm/test/ops/test_relu.py b/backends/arm/test/ops/test_relu.py index d2ca8540f4c..effbccc74d5 100644 --- a/backends/arm/test/ops/test_relu.py +++ b/backends/arm/test/ops/test_relu.py @@ -17,6 +17,7 @@ from executorch.backends.arm.test import common from executorch.backends.arm.test.tester.arm_tester import ArmTester from executorch.backends.xnnpack.test.tester.tester import Quantize +from executorch.exir.backend.backend_details import CompileSpec from parameterized import parameterized @@ -82,15 +83,18 @@ def _test_relu_tosa_BI_pipeline( .run_method_and_compare_outputs(inputs=test_data) ) - def _test_relu_tosa_u55_BI_pipeline( - self, module: torch.nn.Module, test_data: Tuple[torch.tensor] + def _test_relu_ethosu_BI_pipeline( + self, + compile_spec: CompileSpec, + module: torch.nn.Module, + test_data: Tuple[torch.tensor], ): quantizer = ArmQuantizer().set_io(get_symmetric_quantization_config()) ( ArmTester( module, example_inputs=test_data, - compile_spec=common.get_u55_compile_spec(), + compile_spec=compile_spec, ) .quantize(Quantize(quantizer, get_symmetric_quantization_config())) .export() @@ -116,5 +120,13 @@ def test_relu_tosa_BI(self, test_name: str, test_data: torch.Tensor): self._test_relu_tosa_BI_pipeline(self.Relu(), (test_data,)) @parameterized.expand(test_data_suite) - def test_relu_tosa_u55_BI(self, test_name: str, test_data: torch.Tensor): - self._test_relu_tosa_u55_BI_pipeline(self.Relu(), (test_data,)) + def test_relu_u55_BI(self, test_name: str, test_data: torch.Tensor): + self._test_relu_ethosu_BI_pipeline( + common.get_u55_compile_spec(), self.Relu(), (test_data,) + ) + + @parameterized.expand(test_data_suite) + def test_relu_u85_BI(self, test_name: str, test_data: torch.Tensor): + self._test_relu_ethosu_BI_pipeline( + common.get_u85_compile_spec(), self.Relu(), (test_data,) + ) diff --git a/backends/arm/test/ops/test_repeat.py b/backends/arm/test/ops/test_repeat.py index a6fad033456..542f0d6256b 100644 --- a/backends/arm/test/ops/test_repeat.py +++ b/backends/arm/test/ops/test_repeat.py @@ -21,6 +21,7 @@ from executorch.backends.arm.test.tester.arm_tester import ArmTester from executorch.backends.xnnpack.test.tester.tester import Quantize +from executorch.exir.backend.backend_details import CompileSpec from parameterized import parameterized @@ -77,13 +78,15 @@ def _test_repeat_tosa_BI_pipeline(self, module: torch.nn.Module, test_data: Tupl .run_method_and_compare_outputs(inputs=test_data, qtol=1) ) - def _test_repeat_tosa_u55_pipeline(self, module: torch.nn.Module, test_data: Tuple): + def _test_repeat_ethosu_pipeline( + self, compile_spec: CompileSpec, module: torch.nn.Module, test_data: Tuple + ): quantizer = ArmQuantizer().set_io(get_symmetric_quantization_config()) ( ArmTester( module, example_inputs=test_data, - compile_spec=common.get_u55_compile_spec(), + compile_spec=compile_spec, ) .quantize(Quantize(quantizer, 
get_symmetric_quantization_config())) .export() @@ -103,8 +106,16 @@ def test_repeat_tosa_MI(self, test_input, multiples): def test_repeat_tosa_BI(self, test_input, multiples): self._test_repeat_tosa_BI_pipeline(self.Repeat(), (test_input, multiples)) - # Expected failure since tosa.TILE is unsupported by Vela. @parameterized.expand(Repeat.test_parameters) - @unittest.expectedFailure + @unittest.expectedFailure # TODO: MLBEDSW-9386 def test_repeat_u55_BI(self, test_input, multiples): - self._test_repeat_tosa_u55_pipeline(self.Repeat(), (test_input, multiples)) + self._test_repeat_ethosu_pipeline( + common.get_u55_compile_spec(), self.Repeat(), (test_input, multiples) + ) + + @parameterized.expand(Repeat.test_parameters) + @unittest.expectedFailure # TODO: MLBEDSW-9386 + def test_repeat_u85_BI(self, test_input, multiples): + self._test_repeat_ethosu_pipeline( + common.get_u85_compile_spec(), self.Repeat(), (test_input, multiples) + ) diff --git a/backends/arm/test/ops/test_rsqrt.py b/backends/arm/test/ops/test_rsqrt.py new file mode 100644 index 00000000000..2ccb7ec9916 --- /dev/null +++ b/backends/arm/test/ops/test_rsqrt.py @@ -0,0 +1,107 @@ +# Copyright 2024 Arm Limited and/or its affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. +# +# Tests the rsqrt op. +# + +import unittest + +import torch +from executorch.backends.arm.test import common +from executorch.backends.arm.test.tester.arm_tester import ArmTester +from executorch.exir.backend.compile_spec_schema import CompileSpec +from parameterized import parameterized + + +class TestRsqrt(unittest.TestCase): + class Rsqrt(torch.nn.Module): + test_parameters = [ + (torch.ones(1, 10, 10, 10),), + (torch.rand(1, 10, 10, 10),), + (torch.rand(1, 5, 10, 20),), + (torch.rand(5, 10, 20),), + ] + + def forward(self, x: torch.Tensor): + return x.rsqrt() + + def _test_rsqrt_tosa_MI_pipeline( + self, module: torch.nn.Module, test_data: tuple[torch.Tensor] + ): + ( + ArmTester( + module, + example_inputs=test_data, + compile_spec=common.get_tosa_compile_spec(), + ) + .export() + .check_count({"torch.ops.aten.rsqrt.default": 1}) + .to_edge() + .partition() + .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) + .to_executorch() + .run_method_and_compare_outputs(inputs=test_data) + ) + + def _test_rsqrt_tosa_BI_pipeline( + self, module: torch.nn.Module, test_data: tuple[torch.Tensor] + ): + ( + ArmTester( + module, + example_inputs=test_data, + compile_spec=common.get_tosa_compile_spec(), + ) + .quantize() + .export() + .check_count({"torch.ops.aten.rsqrt.default": 1}) + .to_edge() + .partition() + .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) + .to_executorch() + .run_method_and_compare_outputs(inputs=test_data) + ) + + def _test_rsqrt_ethosu_BI_pipeline( + self, + compile_spec: CompileSpec, + module: torch.nn.Module, + test_data: tuple[torch.Tensor], + ): + ( + ArmTester( + module, + example_inputs=test_data, + compile_spec=compile_spec, + ) + .quantize() + .export() + .check_count({"torch.ops.aten.rsqrt.default": 1}) + .to_edge() + .partition() + .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) + .to_executorch() + ) + + @parameterized.expand(Rsqrt.test_parameters) + def test_rsqrt_tosa_MI(self, test_tensor: torch.Tensor): + self._test_rsqrt_tosa_MI_pipeline(self.Rsqrt(), (test_tensor,)) + + @parameterized.expand(Rsqrt.test_parameters) + def test_rsqrt_tosa_BI(self, 
test_tensor: torch.Tensor): + self._test_rsqrt_tosa_BI_pipeline(self.Rsqrt(), (test_tensor,)) + + @parameterized.expand(Rsqrt.test_parameters) + def test_rsqrt_u55_BI(self, test_tensor: torch.Tensor): + self._test_rsqrt_ethosu_BI_pipeline( + common.get_u55_compile_spec(), self.Rsqrt(), (test_tensor,) + ) + + @parameterized.expand(Rsqrt.test_parameters) + def test_rsqrt_u85_BI(self, test_tensor: torch.Tensor): + self._test_rsqrt_ethosu_BI_pipeline( + common.get_u85_compile_spec(), self.Rsqrt(), (test_tensor,) + ) diff --git a/backends/arm/test/ops/test_sigmoid.py b/backends/arm/test/ops/test_sigmoid.py index 7a0435689f4..ddc29a66a4b 100644 --- a/backends/arm/test/ops/test_sigmoid.py +++ b/backends/arm/test/ops/test_sigmoid.py @@ -13,6 +13,7 @@ import torch from executorch.backends.arm.test import common from executorch.backends.arm.test.tester.arm_tester import ArmTester +from executorch.exir.backend.compile_spec_schema import CompileSpec from parameterized import parameterized logger = logging.getLogger(__name__) @@ -102,14 +103,17 @@ def _test_sigmoid_tosa_BI_pipeline(self, module: torch.nn.Module, test_data: Tup .run_method_and_compare_outputs(inputs=test_data) ) - def _test_sigmoid_tosa_u55_BI_pipeline( - self, module: torch.nn.Module, test_data: Tuple[torch.tensor] + def _test_sigmoid_tosa_ethos_BI_pipeline( + self, + compile_spec: list[CompileSpec], + module: torch.nn.Module, + test_data: Tuple[torch.tensor], ): ( ArmTester( module, example_inputs=test_data, - compile_spec=common.get_u55_compile_spec(), + compile_spec=compile_spec, ) .quantize() .export() @@ -122,6 +126,20 @@ def _test_sigmoid_tosa_u55_BI_pipeline( .to_executorch() ) + def _test_sigmoid_tosa_u55_BI_pipeline( + self, module: torch.nn.Module, test_data: Tuple[torch.tensor] + ): + self._test_sigmoid_tosa_ethos_BI_pipeline( + common.get_u55_compile_spec(), module, test_data + ) + + def _test_sigmoid_tosa_u85_BI_pipeline( + self, module: torch.nn.Module, test_data: Tuple[torch.tensor] + ): + self._test_sigmoid_tosa_ethos_BI_pipeline( + common.get_u85_compile_spec(), module, test_data + ) + @parameterized.expand(test_data_suite) def test_sigmoid_tosa_MI( self, @@ -134,19 +152,41 @@ def test_sigmoid_tosa_MI( def test_sigmoid_tosa_BI(self, test_name: str, test_data: torch.Tensor): self._test_sigmoid_tosa_BI_pipeline(self.Sigmoid(), (test_data,)) + def test_add_sigmoid_tosa_MI(self): + self._test_sigmoid_tosa_MI_pipeline(self.AddSigmoid(), (test_data_suite[0][1],)) + + @unittest.skip( + reason="Started to fail when PyTorch 2.5->2.6 https://github.com/pytorch/executorch/issues/5832" + ) def test_add_sigmoid_tosa_BI(self): self._test_sigmoid_tosa_BI_pipeline(self.AddSigmoid(), (test_data_suite[0][1],)) + def test_sigmoid_add_tosa_MI(self): + self._test_sigmoid_tosa_MI_pipeline(self.SigmoidAdd(), (test_data_suite[0][1],)) + + @unittest.skip( + reason="Started to fail when PyTorch 2.5->2.6 https://github.com/pytorch/executorch/issues/5832" + ) def test_sigmoid_add_tosa_BI(self): self._test_sigmoid_tosa_BI_pipeline(self.SigmoidAdd(), (test_data_suite[0][1],)) + def test_sigmoid_add_sigmoid_tosa_MI(self): + self._test_sigmoid_tosa_MI_pipeline( + self.SigmoidAddSigmoid(), (test_data_suite[4][1], test_data_suite[3][1]) + ) + + @unittest.skip( + reason="Started to fail when PyTorch 2.5->2.6 https://github.com/pytorch/executorch/issues/5832" + ) def test_sigmoid_add_sigmoid_tosa_BI(self): self._test_sigmoid_tosa_BI_pipeline( self.SigmoidAddSigmoid(), (test_data_suite[4][1], test_data_suite[3][1]) ) - # Fails due to Vela diff from
Tosa spec, expected to work with Regor. @parameterized.expand(test_data_suite) - @unittest.expectedFailure def test_sigmoid_tosa_u55_BI(self, test_name: str, test_data: torch.Tensor): self._test_sigmoid_tosa_u55_BI_pipeline(self.Sigmoid(), (test_data,)) + + @parameterized.expand(test_data_suite) + def test_sigmoid_tosa_u85_BI(self, test_name: str, test_data: torch.Tensor): + self._test_sigmoid_tosa_u85_BI_pipeline(self.Sigmoid(), (test_data,)) diff --git a/backends/arm/test/ops/test_slice.py b/backends/arm/test/ops/test_slice.py index 14874df156e..ca026c7f420 100644 --- a/backends/arm/test/ops/test_slice.py +++ b/backends/arm/test/ops/test_slice.py @@ -15,6 +15,7 @@ from executorch.backends.arm.test import common from executorch.backends.arm.test.tester.arm_tester import ArmTester from executorch.backends.xnnpack.test.tester.tester import Quantize +from executorch.exir.backend.compile_spec_schema import CompileSpec from parameterized import parameterized @@ -77,8 +78,11 @@ def _test_slice_tosa_BI_pipeline( .run_method_and_compare_outputs(inputs=test_data, qtol=1) ) - def _test_slice_u55_BI_pipeline( - self, module: torch.nn.Module, test_data: Tuple[torch.Tensor] + def _test_slice_ethos_BI_pipeline( + self, + compile_spec: list[CompileSpec], + module: torch.nn.Module, + test_data: Tuple[torch.Tensor], ): quantizer = ArmQuantizer().set_io(get_symmetric_quantization_config()) ( @@ -96,6 +100,20 @@ def _test_slice_u55_BI_pipeline( .to_executorch() ) + def _test_slice_u55_BI_pipeline( + self, module: torch.nn.Module, test_data: Tuple[torch.Tensor] + ): + self._test_slice_ethos_BI_pipeline( + common.get_u55_compile_spec(), module, test_data + ) + + def _test_slice_u85_BI_pipeline( + self, module: torch.nn.Module, test_data: Tuple[torch.Tensor] + ): + self._test_slice_ethos_BI_pipeline( + common.get_u85_compile_spec(), module, test_data + ) + @parameterized.expand(Slice.test_tensors) def test_slice_tosa_MI(self, tensor): self._test_slice_tosa_MI_pipeline(self.Slice(), (tensor,)) @@ -108,9 +126,10 @@ def test_slice_nchw_tosa_BI(self, test_tensor: torch.Tensor): def test_slice_nhwc_tosa_BI(self, test_tensor: torch.Tensor): self._test_slice_tosa_BI_pipeline(self.Slice(), (test_tensor,), True) - # Fails during Vela compilation when trying to use a Tuple as a Named tuple, - # Could be Vela Issue, wait until Regor. 
@parameterized.expand(Slice.test_tensors) - @unittest.expectedFailure def test_slice_u55_BI(self, test_tensor: torch.Tensor): self._test_slice_u55_BI_pipeline(self.Slice(), (test_tensor,)) + + @parameterized.expand(Slice.test_tensors) + def test_slice_u85_BI(self, test_tensor: torch.Tensor): + self._test_slice_u85_BI_pipeline(self.Slice(), (test_tensor,)) diff --git a/backends/arm/test/ops/test_softmax.py b/backends/arm/test/ops/test_softmax.py index 20da65b687f..a7d25d266de 100644 --- a/backends/arm/test/ops/test_softmax.py +++ b/backends/arm/test/ops/test_softmax.py @@ -12,6 +12,7 @@ import torch from executorch.backends.arm.test import common from executorch.backends.arm.test.tester.arm_tester import ArmTester +from executorch.exir.backend.compile_spec_schema import CompileSpec from parameterized import parameterized @@ -80,14 +81,17 @@ def _test_softmax_tosa_BI_pipeline( .run_method_and_compare_outputs(inputs=test_data, qtol=1) ) - def _test_softmax_tosa_u55_BI_pipeline( - self, module: torch.nn.Module, test_data: Tuple[torch.tensor] + def _test_softmax_tosa_ethos_BI_pipeline( + self, + compile_spec: list[CompileSpec], + module: torch.nn.Module, + test_data: Tuple[torch.tensor], ): ( ArmTester( module, example_inputs=test_data, - compile_spec=common.get_u55_compile_spec(), + compile_spec=compile_spec, ) .quantize() .export() @@ -100,6 +104,20 @@ def _test_softmax_tosa_u55_BI_pipeline( .to_executorch() ) + def _test_softmax_tosa_u55_BI_pipeline( + self, module: torch.nn.Module, test_data: Tuple[torch.tensor] + ): + self._test_softmax_tosa_ethos_BI_pipeline( + common.get_u55_compile_spec(), module, test_data + ) + + def _test_softmax_tosa_u85_BI_pipeline( + self, module: torch.nn.Module, test_data: Tuple[torch.tensor] + ): + self._test_softmax_tosa_ethos_BI_pipeline( + common.get_u85_compile_spec(), module, test_data + ) + @parameterized.expand(test_data_suite) def test_softmax_tosa_MI( self, @@ -132,3 +150,13 @@ def test_softmax_tosa_u55_BI( dim: int, ): self._test_softmax_tosa_u55_BI_pipeline(self.Softmax(dim=dim), (test_data,)) + + @parameterized.expand(test_data_suite) + @unittest.expectedFailure + def test_softmax_tosa_u85_BI( + self, + test_name: str, + test_data: torch.Tensor, + dim: int, + ): + self._test_softmax_tosa_u85_BI_pipeline(self.Softmax(dim=dim), (test_data,)) diff --git a/backends/arm/test/ops/test_split.py b/backends/arm/test/ops/test_split.py index bc998179c0c..3f6edc0c2b8 100644 --- a/backends/arm/test/ops/test_split.py +++ b/backends/arm/test/ops/test_split.py @@ -14,6 +14,7 @@ from executorch.backends.arm.test import common from executorch.backends.arm.test.tester.arm_tester import ArmTester from executorch.backends.xnnpack.test.tester.tester import Quantize +from executorch.exir.backend.compile_spec_schema import CompileSpec from parameterized import parameterized test_data_t = tuple[torch.Tensor, int | list[int], int] @@ -94,15 +95,15 @@ def _test_split_tosa_BI_pipeline( .run_method_and_compare_outputs(inputs=test_data, qtol=1) ) - def _test_split_u55_BI_pipeline( - self, module: torch.nn.Module, test_data: test_data_t + def _test_split_ethosu_BI_pipeline( + self, compile_spec: CompileSpec, module: torch.nn.Module, test_data: test_data_t ): quantizer = ArmQuantizer().set_io(get_symmetric_quantization_config()) ( ArmTester( module, example_inputs=test_data, - compile_spec=common.get_u55_compile_spec(), + compile_spec=compile_spec, ) .quantize(Quantize(quantizer, get_symmetric_quantization_config())) .export() @@ -123,17 +124,44 @@ def 
test_split_with_sizes_tosa_MI(self, test_data: test_data_t): self._test_split_tosa_MI_pipeline(self.SplitWithSizes(), test_data) @parameterized.expand(Split.test_data) - def test_split_n_out_tosa_MI(self, test_data: test_data_t): + def test_split_one_out_tosa_MI(self, test_data: test_data_t): self._test_split_tosa_MI_pipeline(self.SplitSingleOut(), test_data) + + @parameterized.expand(Split.test_data) + def test_split_two_out_tosa_MI(self, test_data: test_data_t): self._test_split_tosa_MI_pipeline(self.SplitTwoOut(), test_data) @parameterized.expand(Split.test_data) def test_split_tosa_BI(self, test_data: test_data_t): self._test_split_tosa_BI_pipeline(self.Split(), test_data) - # Fails during Vela compilation when trying to use a Tuple as a Named tuple, - # Could be Vela Issue, wait until Regor. - @parameterized.expand(Split.test_data) - @unittest.expectedFailure + @parameterized.expand( + [Split.test_data[0], Split.test_data[1], Split.test_data[2], Split.test_data[4]] + ) def test_split_u55_BI(self, test_data: test_data_t): - self._test_split_u55_BI_pipeline(self.Split(), test_data) + self._test_split_ethosu_BI_pipeline( + common.get_u55_compile_spec(), self.Split(), test_data + ) + + # TODO MLETORCH-350 + @parameterized.expand([Split.test_data[3], Split.test_data[5]]) + @unittest.expectedFailure + def test_split_u55_BI_skip(self, test_data: test_data_t): + self._test_split_ethosu_BI_pipeline( + common.get_u55_compile_spec(), self.Split(), test_data + ) + + @parameterized.expand( + [Split.test_data[0], Split.test_data[1], Split.test_data[2], Split.test_data[4]] + ) + def test_split_u85_BI(self, test_data: test_data_t): + self._test_split_ethosu_BI_pipeline( + common.get_u85_compile_spec(), self.Split(), test_data + ) + + @parameterized.expand([Split.test_data[3], Split.test_data[5]]) + @unittest.expectedFailure + def test_split_u85_BI_skip(self, test_data: test_data_t): + self._test_split_ethosu_BI_pipeline( + common.get_u85_compile_spec(), self.Split(), test_data + ) diff --git a/backends/arm/test/ops/test_squeeze.py b/backends/arm/test/ops/test_squeeze.py new file mode 100644 index 00000000000..4fe420708a2 --- /dev/null +++ b/backends/arm/test/ops/test_squeeze.py @@ -0,0 +1,221 @@ +# Copyright 2024 Arm Limited and/or its affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# +# Tests the squeeze op which squeezes a given dimension with size 1 into a lower ranked tensor. 
+# + +import unittest +from typing import Optional, Tuple + +import torch + +from executorch.backends.arm.quantizer.arm_quantizer import ( + ArmQuantizer, + get_symmetric_quantization_config, +) +from executorch.backends.arm.test import common +from executorch.backends.arm.test.tester.arm_tester import ArmTester + +from executorch.backends.xnnpack.test.tester.tester import Quantize +from executorch.exir.backend.compile_spec_schema import CompileSpec +from parameterized import parameterized + + +class TestSqueeze(unittest.TestCase): + class SqueezeDim(torch.nn.Module): + test_parameters: list[tuple[torch.Tensor, int]] = [ + (torch.randn(1, 1, 5), -2), + (torch.randn(1, 2, 3, 1), 3), + (torch.randn(1, 5, 1, 5), -2), + ] + + def forward(self, x: torch.Tensor, dim: int): + return x.squeeze(dim) + + class SqueezeDims(torch.nn.Module): + test_parameters: list[tuple[torch.Tensor, tuple[int]]] = [ + (torch.randn(1, 5, 5, 1), (0, -1)), + (torch.randn(1, 5, 1, 5), (0, -2)), + ] + + def forward(self, x: torch.Tensor, dims: tuple[int]): + return x.squeeze(dims) + + class Squeeze(torch.nn.Module): + test_parameters: list[tuple[torch.Tensor]] = [ + (torch.randn(1, 5, 5, 1),), + (torch.randn(1, 5, 1, 5),), + ] + + def forward(self, x: torch.Tensor): + return x.squeeze() + + def _test_squeeze_tosa_MI_pipeline( + self, + module: torch.nn.Module, + test_data: Tuple[torch.Tensor, Optional[tuple[int]]], + export_target: str, + ): + ( + ArmTester( + module, + example_inputs=test_data, + compile_spec=common.get_tosa_compile_spec(permute_memory_to_nhwc=False), + ) + .export() + .check_count({export_target: 1}) + .to_edge() + .partition() + .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) + .to_executorch() + .run_method_and_compare_outputs(inputs=test_data) + ) + + def _test_squeeze_tosa_BI_pipeline( + self, + module: torch.nn.Module, + test_data: Tuple[torch.Tensor, Optional[tuple[int]]], + export_target: str, + ): + quantizer = ArmQuantizer().set_io(get_symmetric_quantization_config()) + ( + ArmTester( + module, + example_inputs=test_data, + compile_spec=common.get_tosa_compile_spec(permute_memory_to_nhwc=False), + ) + .quantize(Quantize(quantizer, get_symmetric_quantization_config())) + .export() + .check_count({export_target: 1}) + .to_edge() + .partition() + .dump_artifact() + .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) + .to_executorch() + .run_method_and_compare_outputs(inputs=test_data, qtol=1) + ) + + def _test_squeeze_ethosu_BI_pipeline( + self, + compile_spec: CompileSpec, + module: torch.nn.Module, + test_data: Tuple[torch.Tensor, Optional[tuple[int]]], + export_target: str, + ): + quantizer = ArmQuantizer().set_io(get_symmetric_quantization_config()) + ( + ArmTester(module, example_inputs=test_data, compile_spec=compile_spec) + .quantize(Quantize(quantizer, get_symmetric_quantization_config())) + .export() + .check_count({export_target: 1}) + .to_edge() + .partition() + .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) + .to_executorch() + ) + + @parameterized.expand(Squeeze.test_parameters) + def test_squeeze_tosa_MI( + self, + test_tensor: torch.Tensor, + ): + self._test_squeeze_tosa_MI_pipeline( + self.Squeeze(), (test_tensor,), "torch.ops.aten.squeeze.default" + ) + + @parameterized.expand(Squeeze.test_parameters) + def test_squeeze_tosa_BI( + self, + test_tensor: torch.Tensor, + ): + self._test_squeeze_tosa_BI_pipeline( + self.Squeeze(), (test_tensor,), "torch.ops.aten.squeeze.default" + ) + + 
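+    # The Ethos-U55/U85 variants below only verify that the graph lowers, partitions
+    # and compiles to an ExecuTorch program; outputs are not executed or compared here.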
@parameterized.expand(Squeeze.test_parameters) + def test_squeeze_u55_BI( + self, + test_tensor: torch.Tensor, + ): + self._test_squeeze_ethosu_BI_pipeline( + common.get_u55_compile_spec(permute_memory_to_nhwc=False), + self.Squeeze(), + (test_tensor,), + "torch.ops.aten.squeeze.default", + ) + + @parameterized.expand(Squeeze.test_parameters) + def test_squeeze_u85_BI( + self, + test_tensor: torch.Tensor, + ): + self._test_squeeze_ethosu_BI_pipeline( + common.get_u85_compile_spec(permute_memory_to_nhwc=False), + self.Squeeze(), + (test_tensor,), + "torch.ops.aten.squeeze.default", + ) + + @parameterized.expand(SqueezeDim.test_parameters) + def test_squeeze_dim_tosa_MI(self, test_tensor: torch.Tensor, dim: int): + self._test_squeeze_tosa_MI_pipeline( + self.SqueezeDim(), (test_tensor, dim), "torch.ops.aten.squeeze.dim" + ) + + @parameterized.expand(SqueezeDim.test_parameters) + def test_squeeze_dim_tosa_BI(self, test_tensor: torch.Tensor, dim: int): + self._test_squeeze_tosa_BI_pipeline( + self.SqueezeDim(), (test_tensor, dim), "torch.ops.aten.squeeze.dim" + ) + + @parameterized.expand(SqueezeDim.test_parameters) + def test_squeeze_dim_u55_BI(self, test_tensor: torch.Tensor, dim: int): + self._test_squeeze_ethosu_BI_pipeline( + common.get_u55_compile_spec(permute_memory_to_nhwc=False), + self.SqueezeDim(), + (test_tensor, dim), + "torch.ops.aten.squeeze.dim", + ) + + @parameterized.expand(SqueezeDim.test_parameters) + def test_squeeze_dim_u85_BI(self, test_tensor: torch.Tensor, dim: int): + self._test_squeeze_ethosu_BI_pipeline( + common.get_u85_compile_spec(permute_memory_to_nhwc=False), + self.SqueezeDim(), + (test_tensor, dim), + "torch.ops.aten.squeeze.dim", + ) + + @parameterized.expand(SqueezeDims.test_parameters) + def test_squeeze_dims_tosa_MI(self, test_tensor: torch.Tensor, dims: tuple[int]): + self._test_squeeze_tosa_MI_pipeline( + self.SqueezeDims(), (test_tensor, dims), "torch.ops.aten.squeeze.dims" + ) + + @parameterized.expand(SqueezeDims.test_parameters) + def test_squeeze_dims_tosa_BI(self, test_tensor: torch.Tensor, dims: tuple[int]): + self._test_squeeze_tosa_BI_pipeline( + self.SqueezeDims(), (test_tensor, dims), "torch.ops.aten.squeeze.dims" + ) + + @parameterized.expand(SqueezeDims.test_parameters) + def test_squeeze_dims_u55_BI(self, test_tensor: torch.Tensor, dims: tuple[int]): + self._test_squeeze_ethosu_BI_pipeline( + common.get_u55_compile_spec(permute_memory_to_nhwc=False), + self.SqueezeDims(), + (test_tensor, dims), + "torch.ops.aten.squeeze.dims", + ) + + @parameterized.expand(SqueezeDims.test_parameters) + def test_squeeze_dims_u85_BI(self, test_tensor: torch.Tensor, dims: tuple[int]): + self._test_squeeze_ethosu_BI_pipeline( + common.get_u85_compile_spec(permute_memory_to_nhwc=False), + self.SqueezeDims(), + (test_tensor, dims), + "torch.ops.aten.squeeze.dims", + ) diff --git a/backends/arm/test/ops/test_sub.py b/backends/arm/test/ops/test_sub.py index 2ae7c3ab36f..e80c0436989 100644 --- a/backends/arm/test/ops/test_sub.py +++ b/backends/arm/test/ops/test_sub.py @@ -13,6 +13,7 @@ from executorch.backends.arm.test import common from executorch.backends.arm.test.tester.arm_tester import ArmTester +from executorch.exir.backend.compile_spec_schema import CompileSpec from parameterized import parameterized @@ -75,14 +76,17 @@ def _test_sub_tosa_BI_pipeline( .run_method_and_compare_outputs(inputs=test_data, qtol=1) ) - def _test_sub_u55_BI_pipeline( - self, module: torch.nn.Module, test_data: Tuple[torch.Tensor] + def _test_sub_ethosu_BI_pipeline( + self, + 
compile_spec: list[CompileSpec], + module: torch.nn.Module, + test_data: Tuple[torch.Tensor], ): ( ArmTester( module, example_inputs=test_data, - compile_spec=common.get_u55_compile_spec(), + compile_spec=compile_spec, ) .quantize() .export() @@ -104,14 +108,40 @@ def test_sub_tosa_BI(self, test_data: torch.Tensor): test_data = (test_data,) self._test_sub_tosa_BI_pipeline(self.Sub(), test_data) - # Expected to fail since RESCALE cannot be fused with SUB in Vela. @parameterized.expand(Sub.test_parameters) - @unittest.expectedFailure def test_sub_u55_BI(self, test_data: torch.Tensor): test_data = (test_data,) - self._test_sub_u55_BI_pipeline(self.Sub(), test_data) + self._test_sub_ethosu_BI_pipeline( + common.get_u55_compile_spec(), self.Sub(), test_data + ) + + @parameterized.expand(Sub.test_parameters) + def test_sub_u85_BI(self, test_data: torch.Tensor): + test_data = (test_data,) + self._test_sub_ethosu_BI_pipeline( + common.get_u85_compile_spec(), self.Sub(), test_data + ) @parameterized.expand(Sub2.test_parameters) def test_sub2_tosa_MI(self, operand1: torch.Tensor, operand2: torch.Tensor): test_data = (operand1, operand2) self._test_sub_tosa_MI_pipeline(self.Sub2(), test_data) + + @parameterized.expand(Sub2.test_parameters) + def test_sub2_tosa_BI(self, operand1: torch.Tensor, operand2: torch.Tensor): + test_data = (operand1, operand2) + self._test_sub_tosa_BI_pipeline(self.Sub2(), test_data) + + @parameterized.expand(Sub2.test_parameters) + def test_sub2_u55_BI(self, operand1: torch.Tensor, operand2: torch.Tensor): + test_data = (operand1, operand2) + self._test_sub_ethosu_BI_pipeline( + common.get_u55_compile_spec(), self.Sub2(), test_data + ) + + @parameterized.expand(Sub2.test_parameters) + def test_sub2_u85_BI(self, operand1: torch.Tensor, operand2: torch.Tensor): + test_data = (operand1, operand2) + self._test_sub_ethosu_BI_pipeline( + common.get_u85_compile_spec(), self.Sub2(), test_data + ) diff --git a/backends/arm/test/ops/test_unsqueeze.py b/backends/arm/test/ops/test_unsqueeze.py index 6da6a196c07..8431efa2717 100644 --- a/backends/arm/test/ops/test_unsqueeze.py +++ b/backends/arm/test/ops/test_unsqueeze.py @@ -21,13 +21,14 @@ from executorch.backends.arm.test.tester.arm_tester import ArmTester from executorch.backends.xnnpack.test.tester.tester import Quantize +from executorch.exir.backend.compile_spec_schema import CompileSpec from parameterized import parameterized class TestSimpleUnsqueeze(unittest.TestCase): class Unsqueeze(torch.nn.Module): - shapes: list[int | Sequence[int]] = [5, (5, 5), (5, 5), (5, 5, 5)] - test_parameters: list[tuple[torch.Tensor]] = [(torch.ones(n),) for n in shapes] + shapes: list[int | Sequence[int]] = [5, (5, 5), (5, 5), (5, 4, 3)] + test_parameters: list[tuple[torch.Tensor]] = [(torch.randn(n),) for n in shapes] def forward(self, x: torch.Tensor, dim): return x.unsqueeze(dim) @@ -39,7 +40,7 @@ def _test_unsqueeze_tosa_MI_pipeline( ArmTester( module, example_inputs=test_data, - compile_spec=common.get_tosa_compile_spec(), + compile_spec=common.get_tosa_compile_spec(permute_memory_to_nhwc=False), ) .export() .check_count({"torch.ops.aten.unsqueeze.default": 1}) @@ -58,7 +59,7 @@ def _test_unsqueeze_tosa_BI_pipeline( ArmTester( module, example_inputs=test_data, - compile_spec=common.get_tosa_compile_spec(), + compile_spec=common.get_tosa_compile_spec(permute_memory_to_nhwc=False), ) .quantize(Quantize(quantizer, get_symmetric_quantization_config())) .export() @@ -70,15 +71,18 @@ def _test_unsqueeze_tosa_BI_pipeline( 
.run_method_and_compare_outputs(inputs=test_data, qtol=1) ) - def _test_unsqueeze_tosa_u55_pipeline( - self, module: torch.nn.Module, test_data: Tuple[torch.Tensor, int] + def _test_unsqueeze_ethosu_BI_pipeline( + self, + compile_spec: CompileSpec, + module: torch.nn.Module, + test_data: Tuple[torch.Tensor, int], ): quantizer = ArmQuantizer().set_io(get_symmetric_quantization_config()) ( ArmTester( module, example_inputs=test_data, - compile_spec=common.get_u55_compile_spec(), + compile_spec=compile_spec, ) .quantize(Quantize(quantizer, get_symmetric_quantization_config())) .export() @@ -100,4 +104,16 @@ def test_unsqueeze_tosa_BI(self, test_tensor: torch.Tensor): @parameterized.expand(Unsqueeze.test_parameters) def test_unsqueeze_u55_BI(self, test_tensor: torch.Tensor): - self._test_unsqueeze_tosa_u55_pipeline(self.Unsqueeze(), (test_tensor, 0)) + self._test_unsqueeze_ethosu_BI_pipeline( + common.get_u55_compile_spec(permute_memory_to_nhwc=False), + self.Unsqueeze(), + (test_tensor, 0), + ) + + @parameterized.expand(Unsqueeze.test_parameters) + def test_unsqueeze_u85_BI(self, test_tensor: torch.Tensor): + self._test_unsqueeze_ethosu_BI_pipeline( + common.get_u85_compile_spec(permute_memory_to_nhwc=False), + self.Unsqueeze(), + (test_tensor, 0), + ) diff --git a/backends/arm/test/ops/test_view.py b/backends/arm/test/ops/test_view.py index 1f51261bf7a..53025c0ac08 100644 --- a/backends/arm/test/ops/test_view.py +++ b/backends/arm/test/ops/test_view.py @@ -21,6 +21,7 @@ from executorch.backends.arm.test.tester.arm_tester import ArmTester from executorch.backends.xnnpack.test.tester.tester import Quantize +from executorch.exir.backend.compile_spec_schema import CompileSpec from parameterized import parameterized @@ -73,8 +74,11 @@ def _test_view_tosa_BI_pipeline( .run_method_and_compare_outputs(inputs=test_data, qtol=1) ) - def _test_view_u55_BI_pipeline( - self, module: torch.nn.Module, test_data: Tuple[torch.Tensor] + def _test_view_ethos_BI_pipeline( + self, + compile_spec: list[CompileSpec], + module: torch.nn.Module, + test_data: Tuple[torch.Tensor], ): quantizer = ArmQuantizer().set_io(get_symmetric_quantization_config()) ( @@ -92,6 +96,20 @@ def _test_view_u55_BI_pipeline( .to_executorch() ) + def _test_view_u55_BI_pipeline( + self, module: torch.nn.Module, test_data: Tuple[torch.Tensor] + ): + self._test_view_ethos_BI_pipeline( + common.get_u55_compile_spec(), module, test_data + ) + + def _test_view_u85_BI_pipeline( + self, module: torch.nn.Module, test_data: Tuple[torch.Tensor] + ): + self._test_view_ethos_BI_pipeline( + common.get_u85_compile_spec(), module, test_data + ) + @parameterized.expand(View.test_parameters) def test_view_tosa_MI(self, test_tensor: torch.Tensor): self._test_view_tosa_MI_pipeline(self.View(), (test_tensor,)) @@ -103,3 +121,7 @@ def test_view_tosa_BI(self, test_tensor: torch.Tensor): @parameterized.expand(View.test_parameters) def test_view_u55_BI(self, test_tensor: torch.Tensor): self._test_view_u55_BI_pipeline(self.View(), (test_tensor,)) + + @parameterized.expand(View.test_parameters) + def test_view_u85_BI(self, test_tensor: torch.Tensor): + self._test_view_u85_BI_pipeline(self.View(), (test_tensor,)) diff --git a/backends/arm/test/quantizer/test_generic_annotater.py b/backends/arm/test/quantizer/test_generic_annotater.py new file mode 100644 index 00000000000..b859757df4b --- /dev/null +++ b/backends/arm/test/quantizer/test_generic_annotater.py @@ -0,0 +1,86 @@ +# Copyright 2024 Arm Limited and/or its affiliates. +# All rights reserved. 
+# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. +import itertools +import unittest + +import torch +from executorch.backends.arm.quantizer.arm_quantizer_utils import is_annotated +from executorch.backends.arm.test import common +from executorch.backends.arm.test.tester.arm_tester import ArmTester +from torch.fx.passes.utils.source_matcher_utils import get_source_partitions + + +class SingleOpModel(torch.nn.Module): + def __init__(self, op, example_input, **op_kwargs) -> None: + super().__init__() + self.op = op + self._example_input = example_input + self.op_kwargs = op_kwargs + + def forward(self, x): + return self.op(x, **self.op_kwargs) + + def example_inputs(self): + return self._example_input + + +class TestGenericAnnotator(unittest.TestCase): + def check_annotation(self, model): + tester = ArmTester( + model, model.example_inputs(), common.get_tosa_compile_spec() + ) + quant_model = tester.quantize().get_artifact() + partitions = get_source_partitions(quant_model.graph, [model.op]) + partitions = list(itertools.chain.from_iterable(partitions.values())) + + assert len(partitions) == 1 + partition = partitions[0] + assert all(is_annotated(node) for node in partition.nodes) + + def test_squeeze(self): + self.check_annotation(SingleOpModel(torch.squeeze, (torch.rand(8, 8, 1),))) + self.check_annotation(SingleOpModel(torch.squeeze_copy, (torch.rand(8, 8, 1),))) + + def test_unsqueeze(self): + self.check_annotation( + SingleOpModel(torch.unsqueeze, (torch.rand(8, 8),), dim=0) + ) + self.check_annotation( + SingleOpModel(torch.unsqueeze_copy, (torch.rand(8, 8),), dim=0) + ) + + def test_reshape(self): + self.check_annotation( + SingleOpModel(torch.reshape, (torch.randn(8, 8),), shape=(64,)), + ) + + def test_view(self): + self.check_annotation( + SingleOpModel(torch.view_copy, (torch.randn(4, 4),), size=(2, 8)), + ) + + def test_slice(self): + self.check_annotation( + SingleOpModel(torch.slice_copy, (torch.randn(3, 4),)), + ) + + def test_transpose(self): + self.check_annotation( + SingleOpModel(torch.transpose, (torch.randn(2, 3),), dim0=0, dim1=1), + ) + self.check_annotation( + SingleOpModel(torch.transpose_copy, (torch.randn(2, 3),), dim0=0, dim1=1), + ) + + def test_tile(self): + self.check_annotation( + SingleOpModel(torch.tile, (torch.randn(4, 4),), dims=(2,)), + ) + + def test_flip(self): + self.check_annotation( + SingleOpModel(torch.flip, (torch.randn(2, 4),), dims=(0, 1)), + ) diff --git a/backends/arm/test/runner_utils.py b/backends/arm/test/runner_utils.py index c8259c38d1e..0a0143e14c6 100644 --- a/backends/arm/test/runner_utils.py +++ b/backends/arm/test/runner_utils.py @@ -6,10 +6,12 @@ import json import logging import os +import re import shutil import subprocess import tempfile +from pathlib import Path from typing import Dict, List, Optional, Tuple import numpy as np @@ -65,7 +67,7 @@ def _get_input_names(program: ExportedProgram) -> list[str]: def _get_input_quantization_params( - program: ExportedProgram, input_names: list[str] + program: ExportedProgram, ) -> list[QuantizationParams]: """ Get input QuantizationParams in a program, maximum one per input to the program. 
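# A minimal sketch of the annotation check performed by TestGenericAnnotator.check_annotation
# above: quantize a single-op module, locate the op's source partition, and verify every node
# in it carries quantization metadata. The helper name is illustrative, and the meta key below
# assumes the PT2E convention that arm_quantizer_utils.is_annotated relies on.
import itertools

import torch
from torch.fx.passes.utils.source_matcher_utils import get_source_partitions


def all_nodes_annotated(graph_module: torch.fx.GraphModule, op) -> bool:
    # A single partition is expected for a single-op model; chain them to be safe.
    partitions = itertools.chain.from_iterable(
        get_source_partitions(graph_module.graph, [op]).values()
    )
    nodes = itertools.chain.from_iterable(p.nodes for p in partitions)
    # PT2E quantizers attach their annotation under node.meta["quantization_annotation"].
    return all("quantization_annotation" in node.meta for node in nodes)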
@@ -78,6 +80,7 @@ def _get_input_quantization_params( """ quant_params = [] + input_names = _get_input_names(program) num_inputs = len(input_names) for node in program.graph.nodes: if ( @@ -174,19 +177,29 @@ def __init__( self.qp_input: list[QuantizationParams] = None self.qp_output: QuantizationParams = None self.timeout = 120 + self.target_board: str = None self._has_init_run = False - def init_run(self, exported_program: ExportedProgram, is_quantized: bool): - self.input_names = _get_input_names(exported_program) + def init_run( + self, + exported_program: ExportedProgram, + edge_program: ExportedProgram, + is_quantized: bool, + target_board: str, + ): + + if target_board not in ["corstone-300", "corstone-320"]: + raise RuntimeError(f"Unknown target board: {target_board}") + + self.input_names = _get_input_names(edge_program) self.output_node = _get_output_node(exported_program) self.output_name = self.output_node.name self.is_quantized = is_quantized + self.target_board = target_board if is_quantized: - self.qp_input = _get_input_quantization_params( - exported_program, self.input_names - ) + self.qp_input = _get_input_quantization_params(exported_program) self.qp_output = _get_output_quantization_params( exported_program, self.output_node ) @@ -199,7 +212,7 @@ def init_run(self, exported_program: ExportedProgram, is_quantized: bool): def set_timeout(self, timeout: int): self.timeout = timeout - def run_corstone300( + def run_corstone( self, inputs: Tuple[torch.Tensor], ) -> list[torch.Tensor]: @@ -224,7 +237,9 @@ def run_corstone300( os.path.join(self.intermediate_path, f"{name}.bin"), ) elf_path = os.path.join( - "cmake-out", "arm_semihosting_executor_runner", "arm_executor_runner" + "cmake-out", + f"arm_semihosting_executor_runner_{self.target_board}", + "arm_executor_runner", ) assert os.path.exists( elf_path @@ -234,39 +249,76 @@ def run_corstone300( for input_path in input_paths: cmd_line += f" -i {input_path}" - command_args = [ - "FVP_Corstone_SSE-300_Ethos-U55", - "-C", - "ethosu.num_macs=128", - "-C", - "mps3_board.visualisation.disable-visualisation=1", - "-C", - "mps3_board.telnetterminal0.start_telnet=0", - "-C", - "mps3_board.uart0.out_file='-'", - "-C", - "cpu0.CFGITCMSZ=11", - "-C", - "cpu0.semihosting-enable=1", - "-C", - "cpu0.semihosting-stack_base=0", - "-C", - "cpu0.semihosting-heap_limit=0", - "-C", - f"cpu0.semihosting-cmd_line='{cmd_line}'", - "-a", - elf_path, - "--timelimit", - f"{self.timeout}", - ] - result = _run_cmd(command_args, check=False) + command_args = { + "corstone-300": [ + "FVP_Corstone_SSE-300_Ethos-U55", + "-C", + "ethosu.num_macs=128", + "-C", + "mps3_board.visualisation.disable-visualisation=1", + "-C", + "mps3_board.telnetterminal0.start_telnet=0", + "-C", + "mps3_board.uart0.out_file='-'", + "-C", + "cpu0.CFGITCMSZ=11", + "-C", + "cpu0.semihosting-enable=1", + "-C", + "cpu0.semihosting-stack_base=0", + "-C", + "cpu0.semihosting-heap_limit=0", + "-C", + f"cpu0.semihosting-cmd_line='{cmd_line}'", + "-a", + elf_path, + "--timelimit", + f"{self.timeout}", + ], + "corstone-320": [ + "FVP_Corstone_SSE-320", + "-C", + "mps4_board.subsystem.ethosu.num_macs=128", + "-C", + "mps4_board.visualisation.disable-visualisation=1", + "-C", + "mps4_board.telnetterminal0.start_telnet=0", + "-C", + "mps4_board.uart0.out_file='-'", + "-C", + "mps4_board.uart0.unbuffered_output=1", + "-C", + "mps4_board.uart0.shutdown_on_eot=1", + "-C", + "mps4_board.subsystem.cpu0.semihosting-enable=1", + "-C", + "mps4_board.subsystem.cpu0.semihosting-stack_base=0", + "-C", + 
"mps4_board.subsystem.cpu0.semihosting-heap_limit=0", + "-C", + f"mps4_board.subsystem.cpu0.semihosting-cmd_line='{cmd_line}'", + "-a", + elf_path, + "--timelimit", + f"{self.timeout}", + ], + } + + result = _run_cmd(command_args[self.target_board], check=False) + if result.returncode != 0: + raise RuntimeError( + f"Failed to run {command_args[self.target_board]}\nError: {result.stderr.decode()}" + ) result_stdout = result.stdout.decode() - if "Hard fault" in result_stdout or len(result.stderr) > 0: + + error_regex = r"(^[EF][: ].*$)|(^.*Hard fault.*$)|(^.*Assertion.*$)" + + # Check for errors in the output + # regex to check for error or fault messages in stdout from FVP + if re.compile(error_regex, re.MULTILINE).search(result_stdout): raise RuntimeError( - f"Corstone simulation failed, log: \n {result_stdout}\n{result.stderr.decode()}" + f"Corstone simulation failed:\ncmd: {command_args[self.target_board]}\n, log: \n {result_stdout}\n{result.stderr.decode()}" ) - elif "E [" in result_stdout: - logger.error(result_stdout) tosa_ref_output = np.fromfile(out_path_with_suffix, dtype=np.float32) output_shape = self.output_node.args[0][0].meta["val"].shape @@ -325,7 +377,18 @@ def run_tosa_ref_model( self._has_init_run ), "RunnerUtil needs to be initialized using init_run() before running tosa reference." - desc_file_path = os.path.join(self.intermediate_path, "desc.json") + all_desc_file_paths = [ + str(path) for path in Path(self.intermediate_path).glob("desc*.json") + ] + assert ( + all_desc_file_paths + ), f"No TOSA description file found in '{self.intermediate_path}'." + if len(all_desc_file_paths) != 1: + raise NotImplementedError( + "Graphs with more than one partition are currently not supported." + ) + + desc_file_path = all_desc_file_paths[0] assert os.path.exists( desc_file_path ), f"desc_file_path: {desc_file_path} does not exist" @@ -391,11 +454,11 @@ def run_tosa_ref_model( def prep_data_for_save( data, is_quantized: bool, input_name: str, quant_param: QuantizationParams ): - data_np = data.detach().numpy().astype(np.float32) + data_np = np.array(data.detach(), order="C").astype(np.float32) if is_quantized: assert ( - quant_param.node_name == input_name + quant_param.node_name in input_name ), "These quantization params do not match the input tensor name" data_np = ( ((data_np / np.float32(quant_param.scale)) + quant_param.zp) @@ -488,7 +551,10 @@ def dbg_tosa_fb_to_json(tosa_fb: bytes) -> Dict: with open(tosa_input_file, "wb") as f: f.write(tosa_fb) - tosa_schema_file = "./backends/arm/third-party/serialization_lib/schema/tosa.fbs" + arm_backend_path = os.path.realpath(os.path.dirname(__file__) + "/..") + tosa_schema_file = os.path.join( + arm_backend_path, "third-party/serialization_lib/schema/tosa.fbs" + ) assert os.path.exists( tosa_schema_file ), f"tosa_schema_file: {tosa_schema_file} does not exist" diff --git a/backends/arm/test/setup_testing.sh b/backends/arm/test/setup_testing.sh index 683eee7a007..5625ae212f2 100755 --- a/backends/arm/test/setup_testing.sh +++ b/backends/arm/test/setup_testing.sh @@ -13,17 +13,30 @@ ethos_u_root_dir=${et_root_dir}/examples/arm/ethos-u-scratch/ethos-u toolchain_cmake=${et_root_dir}/examples/arm/ethos-u-setup/arm-none-eabi-gcc.cmake et_build_dir=${et_root_dir}/cmake-out -build_test_dir=${et_build_dir}/arm_semihosting_executor_runner +build_root_test_dir=${et_build_dir}/arm_semihosting_executor_runner fvp_model=FVP_Corstone_SSE-300_Ethos-U55 # Build Arm Baremetal executor_runner in semihosting mode. 
# Put in backends/arm/test/res to be used by unit tests. function build_semihosting_executorch_runner() { + target_board=$1 + build_test_dir=${build_root_test_dir}_${target_board} + echo "[${FUNCNAME[0]}] Configuring ${target_board}" + if [[ ${target_board} == "corstone-300" ]]; then + local target_cpu=cortex-m55 + elif [[ ${target_board} == "corstone-320" ]]; then + local target_cpu=cortex-m85 + else + echo "[${FUNCNAME[0]}] ERROR: Invalid target_board specified!" + exit 1 + fi cd ${et_root_dir}/examples/arm/executor_runner pwd mkdir -p ${build_test_dir} cmake -DCMAKE_TOOLCHAIN_FILE=${toolchain_cmake} \ - -DTARGET_CPU=cortex-m55 \ + -DCMAKE_BUILD_TYPE=RelWithDebInfo \ + -DTARGET_CPU=${target_cpu} \ + -DTARGET_BOARD=${target_board} \ -DSEMIHOSTING=ON \ -DCMAKE_RUNTIME_OUTPUT_DIRECTORY=${build_test_dir} \ -B ${build_test_dir} \ @@ -40,4 +53,6 @@ function build_semihosting_executorch_runner() { find ${build_test_dir} -name "arm_executor_runner" } -build_semihosting_executorch_runner \ No newline at end of file +build_semihosting_executorch_runner corstone-300 + +build_semihosting_executorch_runner corstone-320 diff --git a/backends/arm/test/tester/arm_tester.py b/backends/arm/test/tester/arm_tester.py index 98fac29144c..eb52f4b2070 100644 --- a/backends/arm/test/tester/arm_tester.py +++ b/backends/arm/test/tester/arm_tester.py @@ -7,13 +7,14 @@ from collections import Counter from pprint import pformat -from typing import Any, List, Literal, Optional, Tuple, Union +from typing import Any, Iterable, List, Literal, Optional, Tuple, Union import executorch.backends.xnnpack.test.tester.tester as tester import numpy as np +import serializer.tosa_serializer as ts -import torch +import torch.fx from executorch.backends.arm.arm_backend import get_intermediate_path, is_permute_memory from executorch.backends.arm.arm_partitioner import ArmPartitioner @@ -23,42 +24,71 @@ ) from executorch.backends.arm.test.runner_utils import ( - _get_input_names, _get_input_quantization_params, _get_output_node, _get_output_quantization_params, dbg_tosa_fb_to_json, RunnerUtil, ) +from executorch.backends.arm.tosa_mapping import extract_tensor_meta from executorch.backends.xnnpack.test.tester import Tester +from executorch.devtools.backend_debug import get_delegation_info from executorch.exir import EdgeCompileConfig from executorch.exir.backend.compile_spec_schema import CompileSpec + +from executorch.exir.lowered_backend_module import LoweredBackendModule +from tabulate import tabulate +from torch.export.graph_signature import ExportGraphSignature, InputSpec, OutputSpec from torch.fx import Graph logger = logging.getLogger(__name__) -logger.setLevel(logging.INFO) class Partition(tester.Partition): def dump_artifact(self, path_to_dump: Optional[str]): super().dump_artifact(path_to_dump) - to_print = None - for spec in self.graph_module.lowered_module_0.compile_specs: - if spec.key == "output_format": - if spec.value == b"tosa": - tosa_fb = self.graph_module.lowered_module_0.processed_bytes + output = "Formated Graph Signature:\n" + output += _format_export_graph_signature( + self.artifact.exported_program().graph_signature + ) + + def get_output_format(lowered_module) -> str | None: + for spec in lowered_module.compile_specs: + if spec.key == "output_format": + return spec.value.decode() + return None + + for node in self.graph_module.graph.nodes: + if node.op == "get_attr" and node.name.startswith("lowered_module_"): + lowered_module = getattr(self.graph_module, node.name) + assert isinstance( + lowered_module, 
LoweredBackendModule + ), f"Attribute {node.name} must be of type LoweredBackendModule." + + output_format = get_output_format(lowered_module) + if output_format == "tosa": + tosa_fb = lowered_module.processed_bytes to_print = dbg_tosa_fb_to_json(tosa_fb) to_print = pformat(to_print, compact=True, indent=1) - to_print = f"\n TOSA deserialized: \n{to_print}" - elif spec.value == b"vela": - vela_cmd_stream = self.graph_module.lowered_module_0.processed_bytes - to_print = str(vela_cmd_stream) - to_print = f"\n Vela command stream: \n{to_print}" - break - assert to_print is not None, "No TOSA nor Vela compile spec found" - _dump_str(to_print, path_to_dump) + output += f"\nTOSA deserialized {node.name}: \n{to_print}\n" + elif output_format == "vela": + vela_cmd_stream = lowered_module.processed_bytes + output += ( + f"\nVela command stream {node.name}: \n{vela_cmd_stream}\n" + ) + else: + logger.warning( + f"No TOSA nor Vela compile spec found in compile specs of {node.name}." + ) + continue + + if not output: + logger.warning("No output to print generated from artifact.") + return + + _dump_str(output, path_to_dump) class Serialize(tester.Serialize): @@ -68,7 +98,7 @@ def __init__(self, runner_util: RunnerUtil, timeout: int = 1): self.runner.set_timeout(timeout) def run_artifact(self, inputs): - return self.runner.run_corstone300(inputs) + return self.runner.run_corstone(inputs) def dump_artifact(self, path_to_dump: Optional[str]): if not path_to_dump: @@ -196,6 +226,7 @@ def run_method_and_compare_outputs( self, inputs: Optional[Tuple[torch.Tensor]] = None, stage: Optional[str] = None, + target_board: Optional[str] = "corstone-300", num_runs=1, atol=1e-03, rtol=1e-03, @@ -219,14 +250,22 @@ def run_method_and_compare_outputs( self.runner_util is not None ), "self.tosa_test_util is not initialized, cannot use run_method()" assert ( - self.stages[self.stage_name(tester.Export)] is not None - ), "To compare outputs, at least the Export stage needs to be run." + self.stages[self.stage_name(tester.ToEdge)] is not None + ), "To compare outputs, at least the ToEdge stage needs to be run." stage = stage or self.cur test_stage = self.stages[stage] is_quantized = self.stages[self.stage_name(tester.Quantize)] is not None + + exported_program = self.stages[self.stage_name(tester.Export)].artifact + edge_program = self.stages[ + self.stage_name(tester.ToEdge) + ].artifact.exported_program() self.runner_util.init_run( - self.stages[self.stage_name(tester.Export)].artifact, is_quantized + exported_program, + edge_program, + is_quantized, + target_board, ) if is_quantized: @@ -236,7 +275,9 @@ def run_method_and_compare_outputs( reference_stage = self.stages[self.stage_name(InitialModel)] quantization_scale = None - print(f"Comparing Stage {test_stage} with Stage {reference_stage}") + logger.info( + f"Comparing Stage '{self.stage_name(test_stage)}' with Stage '{self.stage_name(reference_stage)}'" + ) is_nhwc = is_permute_memory(self.compile_spec) # Loop inputs and compare reference stage with the compared stage. 
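# The runner now dispatches on a target_board argument ("corstone-300" or "corstone-320").
# A small sketch that keeps the per-board values from this patch in one table: the FVP
# binary names come from run_corstone() and the CPUs from setup_testing.sh. The helper
# name is illustrative.
FVP_TARGETS = {
    "corstone-300": {"fvp": "FVP_Corstone_SSE-300_Ethos-U55", "cpu": "cortex-m55"},
    "corstone-320": {"fvp": "FVP_Corstone_SSE-320", "cpu": "cortex-m85"},
}


def resolve_target(board: str) -> dict:
    # Mirrors the validation in RunnerUtil.init_run(): unknown boards fail early.
    if board not in FVP_TARGETS:
        raise RuntimeError(f"Unknown target board: {board}")
    return FVP_TARGETS[board]


assert resolve_target("corstone-320")["cpu"] == "cortex-m85"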
@@ -262,7 +303,8 @@ def run_method_and_compare_outputs( generated_input.shape if hasattr(generated_input, "shape") else (1,) for generated_input in reference_input ] - print(f"Run {run_iteration} with input shapes: {input_shapes}") + input_shape_str = ", ".join([str(list(i)) for i in input_shapes]) + logger.info(f"Run #{run_iteration}, input shapes: {input_shape_str}") reference_output = reference_stage.run_artifact(reference_input) test_output = tuple(test_stage.run_artifact(test_input)) @@ -298,30 +340,84 @@ def get_graph(self, stage: str | None = None) -> Graph: return graph def dump_operator_distribution( - self, path_to_dump: Optional[str] = None - ) -> ArmQuantizer: - """Dump a dictionary with {operator: operator count} for the operators in the - graph of the current stage. + self, path_to_dump: Optional[str] = None, print_table: bool = True + ): + """Dump the distribution of operators in the current stage. + In the partition stage, additional information is included such as the number of + delegates and the distribution of TOSA operators. + Set parameter print_table to False to dump in a parseable format. + Returns self for daisy-chaining. """ - graph = self.get_graph(self.cur) - op_dist = _get_operator_distribution(graph) - to_print = self.cur + " operators: " + _format_dict(op_dist) + "\n" + line = "#" * 10 + to_print = f"{line} {self.cur.capitalize()} Operator Distribution {line}\n" + + if self.cur == self.stage_name(tester.Partition) and print_table: + graph_module = self.get_artifact().exported_program().graph_module + if print_table: + delegation_info = get_delegation_info(graph_module) + op_dist = delegation_info.get_operator_delegation_dataframe() + else: + op_dist = dict(_get_operator_distribution(graph_module.graph)) + to_print += _format_dict(op_dist, print_table) + to_print += "\n" + _get_tosa_operator_distribution( + graph_module, print_table + ) + to_print += "\n" + to_print += delegation_info.get_summary() + else: + graph = self.get_graph(self.cur) + op_dist = dict(_get_operator_distribution(graph)) + if print_table: + op_dist = { + "Operator": list(op_dist), + "Count": [op_dist[key] for key in op_dist], + } + to_print += _format_dict(op_dist, print_table) + "\n" + _dump_str(to_print, path_to_dump) + return self def dump_dtype_distribution( - self, path_to_dump: Optional[str] = None - ) -> ArmQuantizer: - """Dump a dictionary with {dtype: dtype count} for the dtypes of the nodes in the - graph of the current stage. + self, path_to_dump: Optional[str] = None, print_table: bool = True + ): + """Dump a the distributions of dtypes of nodes and placeholders in the current stage. + Set parameter print_table to False to dump in a parseable format. Returns self for daisy-chaining. 
""" + + line = "#" * 10 + to_print = ( + f"{line} {self.cur.capitalize()} Placeholder Dtype Distribution {line}\n" + ) + graph = self.get_graph(self.cur) - op_dist = _get_dtype_distribution(graph) - to_print = self.cur + " placeholder data types: " + _format_dict(op_dist) + "\n" + dtype_dist_placeholders, dtype_dirst_tensors = _get_dtype_distribution(graph) + all_dtypes = set(dtype_dist_placeholders.keys()) | set( + dtype_dirst_tensors.keys() + ) + if print_table: + dtype_dist = { + "Dtype": all_dtypes, + "Placeholder Count": [ + ( + dtype_dist_placeholders[key] + if key in dtype_dist_placeholders + else 0 + ) + for key in all_dtypes + ], + "Tensor Count": [ + (dtype_dirst_tensors[key] if key in dtype_dirst_tensors else 0) + for key in all_dtypes + ], + } + else: + dtype_dist = dict(dtype_dist_placeholders + dtype_dirst_tensors) + to_print += _format_dict(dtype_dist, print_table) + "\n" _dump_str(to_print, path_to_dump) return self @@ -373,11 +469,8 @@ def _compare_outputs( export_stage = self.stages.get(self.stage_name(tester.Export), None) quantize_stage = self.stages.get(self.stage_name(tester.Quantize), None) if export_stage is not None and quantize_stage is not None: - input_names = _get_input_names(export_stage.artifact) output_node = _get_output_node(export_stage.artifact) - qp_input = _get_input_quantization_params( - export_stage.artifact, input_names - ) + qp_input = _get_input_quantization_params(export_stage.artifact) qp_output = _get_output_quantization_params( export_stage.artifact, output_node ) @@ -399,17 +492,20 @@ def _compare_outputs( raise e -def _get_dtype_distribution(graph: Graph) -> dict: - """Counts the occurences of placeholder data types in a graph. - The result is a dict {'data type':'number of placeholders'} +def _get_dtype_distribution(graph: Graph) -> tuple[dict, dict]: + """Counts the occurences of placeholder and call_function dtypes in a graph. + The result is a tuple of Counters (placeholder_distribution, call_function_distribution) """ - return Counter( - [ - node.meta["val"].dtype - for node in list(graph.nodes) - if node.op == "placeholder" - ] - ) + placeholder_dtypes = [] + call_function_dtypes = [] + for node in graph.nodes: + if node.op == "placeholder": + placeholder_dtypes.append(str(node.meta["val"].dtype)) + if node.op == "call_function": + if "val" in node.meta: + dtype, _, _ = extract_tensor_meta(node.meta) + call_function_dtypes.append(ts.DTypeNames[dtype]) + return Counter(placeholder_dtypes), Counter(call_function_dtypes) def _get_operator_distribution(graph: Graph) -> dict[str, int]: @@ -421,13 +517,71 @@ def _get_operator_distribution(graph: Graph) -> dict[str, int]: ) +def _format_export_graph_signature(signature: ExportGraphSignature) -> str: + def specs_dict(specs: list[InputSpec | OutputSpec], title: str): + _dict: dict[str, list] = {title: [], "arg": [], "kind": [], "target": []} + for i, spec in enumerate(specs): + _dict[title].append(i) + _dict["arg"].append(spec.arg) + _dict["kind"].append(spec.kind) + _dict["target"].append(spec.target if spec.target else "-") + return _dict + + input_dict = specs_dict(signature.input_specs, "Inputs") + output_dict = specs_dict(signature.output_specs, "Outputs") + + return f"{_format_dict(input_dict)}\n{_format_dict(output_dict)}" + + +def _get_tosa_operator_distribution( + graph_module: torch.fx.GraphModule, print_table=False +) -> str: + """Counts the occurences of operator names of all lowered modules containing + a TOSA flatbuffer. 
+ The result is a string with the operator distribution or an error message. + """ + op_list = [] + id = 0 + while lowered_module := getattr(graph_module, f"lowered_module_{id}", None): + for spec in lowered_module.compile_specs: + if spec.key != "output_format": + continue + if spec.value == b"tosa": + tosa_fb = lowered_module.processed_bytes + tosa_json = dbg_tosa_fb_to_json(tosa_fb) + for region in tosa_json["regions"]: + for block in region["blocks"]: + op_list.extend( + [operator["op"] for operator in block["operators"]] + ) + break + elif spec.value == b"vela": + return "Can not get operator distribution for Vela command stream." + else: + return f"Unknown output format '{spec.value}'." + id += 1 + if id == 0: + return "No delegate with name 'lowered_module_0 found in graph module." + op_dist = dict(Counter(op_list)) + op_dist = { + "Operator": list(op_dist.keys()), + "Count": [item[1] for item in op_dist.items()], + } + return "TOSA operators:\n" + _format_dict(dict(op_dist), print_table) + + def _dump_str(to_print: str, path_to_dump: Optional[str] = None): if path_to_dump: with open(path_to_dump, "a") as fp: fp.write(to_print) else: - print(to_print) + logger.info(to_print) -def _format_dict(to_print: dict) -> str: - return pformat(to_print, compact=True, indent=1) +def _format_dict(to_print: dict, print_table: bool = True) -> str: + if isinstance(list(to_print.items())[0], Iterable) and print_table: + return tabulate( + to_print, headers="keys", tablefmt="fancy_grid", maxcolwidths=35 + ) + else: + return pformat(to_print, compact=True, indent=1) diff --git a/backends/arm/tosa_utils.py b/backends/arm/tosa_utils.py index aee8aae8df3..cfafac16760 100644 --- a/backends/arm/tosa_utils.py +++ b/backends/arm/tosa_utils.py @@ -50,10 +50,10 @@ def dbg_node(node): # Output TOSA flatbuffer and test harness file -def dbg_tosa_dump(tosa_graph, path): - filename = "output.tosa" +def dbg_tosa_dump(tosa_graph: ts.TosaSerializer, path: str, suffix: str = ""): + filename = f"output{suffix}.tosa" - logger.info(f"Emitting debug output to {path}") + logger.info(f"Emitting debug output to: {path=}, {suffix=}") os.makedirs(path, exist_ok=True) @@ -65,7 +65,7 @@ def dbg_tosa_dump(tosa_graph, path): f.write(fb) assert os.path.exists(filepath_tosa_fb), "Failed to write TOSA flatbuffer" - filepath_desc_json = os.path.join(path, "desc.json") + filepath_desc_json = os.path.join(path, f"desc{suffix}.json") with open(filepath_desc_json, "w") as f: f.write(js) assert os.path.exists(filepath_desc_json), "Failed to write TOSA JSON" @@ -76,7 +76,7 @@ def dbg_fail(node, tosa_graph, path): logger.warn("Internal error due to poorly handled node:") dbg_node(node) logger.warn(f"Debug output captured in '{path}'.") - raise RuntimeError("TOSA Internal Error on node, enable logging for further info") + raise RuntimeError("TOSA Internal Error on node, enable logging for further info.") # Helper function to match TOSA's broadcasting rank requirement diff --git a/backends/cadence/CMakeLists.txt b/backends/cadence/CMakeLists.txt index d786142f085..773dd0e52d5 100644 --- a/backends/cadence/CMakeLists.txt +++ b/backends/cadence/CMakeLists.txt @@ -20,7 +20,6 @@ if(NOT EXECUTORCH_ROOT) endif() include(${EXECUTORCH_ROOT}/build/Utils.cmake) -include(${EXECUTORCH_ROOT}/build/Codegen.cmake) # Let files say "include ". set(_common_include_directories ${EXECUTORCH_ROOT}/..) 
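# _format_dict() in the arm_tester changes above renders dict-of-columns data with the
# third-party tabulate package when print_table is set. A minimal usage sketch with
# made-up TOSA operator counts, assuming tabulate is installed:
from collections import Counter

from tabulate import tabulate

op_dist = Counter(["CONV2D", "RESCALE", "RESCALE", "FULLY_CONNECTED"])
table = {"Operator": list(op_dist.keys()), "Count": list(op_dist.values())}
print(tabulate(table, headers="keys", tablefmt="fancy_grid"))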
@@ -30,54 +29,6 @@ if(EXECUTORCH_NNLIB_OPT) set(TARGET_DIR hifi) add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/${TARGET_DIR}/third-party/nnlib) endif() -set(CMAKE_EXPORT_COMPILE_COMMANDS ON) - -# Source root directory for executorch. -if(NOT EXECUTORCH_ROOT) - set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../..) -endif() - -if(NOT PYTHON_EXECUTABLE) - resolve_python_executable() -endif() - -set(_common_compile_options -Wno-deprecated-declarations -fPIC) - -# Find prebuilt libraries. executorch package should contain portable_ops_lib, -# etdump, bundled_program. -find_package(executorch CONFIG REQUIRED) -target_link_options_shared_lib(executorch) -target_link_options_shared_lib(portable_ops_lib) - -target_include_directories(executorch INTERFACE ${_common_include_directories}) - -find_package( - gflags REQUIRED PATHS ${CMAKE_CURRENT_BINARY_DIR}/../../third-party -) - -add_executable(cadence_runner cadence_runner/cadence_runner.cpp) -target_compile_options(executorch INTERFACE -DET_EVENT_TRACER_ENABLED) add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/${TARGET_DIR}/operators) add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/${TARGET_DIR}/kernels) - -target_include_directories( - etdump INTERFACE ${CMAKE_CURRENT_BINARY_DIR}/../../sdk/include - ${EXECUTORCH_ROOT}/third-party/flatcc/include -) - -target_include_directories( - cadence_runner PUBLIC ${ROOT_DIR}/.. ${CMAKE_BINARY_DIR} - ${_common_include_directories} -) - -target_link_libraries( - cadence_runner - executorch - gflags - etdump - extension_data_loader - bundled_program - cadence_ops_lib - flatccrt -) diff --git a/backends/cadence/aot/TARGETS b/backends/cadence/aot/TARGETS index 08093efe317..ae60c299f2c 100644 --- a/backends/cadence/aot/TARGETS +++ b/backends/cadence/aot/TARGETS @@ -22,6 +22,7 @@ python_library( deps = [ "fbsource//third-party/pypi/tabulate:tabulate", "//caffe2:torch", + "//executorch/exir:lib", "//executorch/exir:memory", "//executorch/exir/dialects:lib", "//executorch/exir/dialects/edge:lib", diff --git a/backends/cadence/aot/compiler.py b/backends/cadence/aot/compiler.py index e1494f8d20d..5b151a3b6a4 100644 --- a/backends/cadence/aot/compiler.py +++ b/backends/cadence/aot/compiler.py @@ -30,13 +30,14 @@ ) from executorch.backends.transforms.remove_clone_ops import RemoveCloneOpsTransform from executorch.exir import EdgeCompileConfig, EdgeProgramManager, to_edge -from torch._export import capture_pre_autograd_graph from torch.ao.quantization.pt2e.export_utils import model_is_exported from torch.ao.quantization.quantize_pt2e import convert_pt2e, prepare_pt2e from torch.export import export from torch.export.exported_program import ExportedProgram +from .utils import print_ops_info + # Note: this is not meant as a primary API since it can create inconsistencies # if the quantizer here is different from the quantizer used to convert. It is @@ -58,7 +59,7 @@ def convert_pt2( """ # Export with dynamo - model_gm = capture_pre_autograd_graph(model, inputs) + model_gm = torch.export.export_for_training(model, inputs).module() if model_gm_has_SDPA(model_gm): # pyre-fixme[6] # Decompose SDPA @@ -194,16 +195,17 @@ def export_to_edge( # Export the model and lower it to an EdgeProgramManager (in edge IR), and -# apply passes specific to Cadence DSP execution. +# apply passes specific to Cadence DSP execution. Return both to print the +# differences. 
def export_to_cadence( model: torch.nn.Module, inputs: tuple[object, ...], dump_graphs: bool = False, ) -> EdgeProgramManager: - edge_program_manager = export_to_edge(model, inputs) + edge_prog_manager = export_to_edge(model, inputs) # Run a couple required passes for quant/dequant ops - cadence_program_manager = edge_program_manager.transform( + cadence_prog_manager = edge_prog_manager.transform( [ InitializePipeline(), RemoveZeroSizedCatArgsPass(), @@ -217,4 +219,10 @@ def export_to_cadence( ] ) - return cadence_program_manager + # Print some information to terminal + print_ops_info( + edge_prog_manager.exported_program().graph_module, + cadence_prog_manager.exported_program().graph_module, + ) + + return cadence_prog_manager diff --git a/backends/cadence/aot/export_example.py b/backends/cadence/aot/export_example.py index f7920f0b8fb..10433016e38 100644 --- a/backends/cadence/aot/export_example.py +++ b/backends/cadence/aot/export_example.py @@ -10,14 +10,12 @@ import tempfile from executorch.backends.cadence.aot.ops_registrations import * # noqa -import os from typing import Any, Tuple from executorch.backends.cadence.aot.compiler import ( convert_pt2, export_to_cadence, - export_to_edge, - quantize_pt2, + fuse_pt2, ) from executorch.backends.cadence.aot.quantizer.quantizer import CadenceQuantizer from executorch.backends.cadence.runtime import runtime @@ -25,46 +23,13 @@ from executorch.exir import ExecutorchProgramManager from torch import nn -from .utils import print_ops_info +from .utils import save_bpte_program, save_pte_program FORMAT = "[%(levelname)s %(asctime)s %(filename)s:%(lineno)s] %(message)s" logging.basicConfig(level=logging.INFO, format=FORMAT) -def _save_pte_program( - prog: ExecutorchProgramManager, model_name: str, output_dir: str = "" -) -> None: - if model_name.endswith(".pte"): - filename = model_name - else: - filename = os.path.join(output_dir, f"{model_name}.pte") - - try: - with open(filename, "wb") as file: - prog.write_to_file(file) - logging.info(f"Saved exported program to {filename}") - except Exception as e: - logging.error(f"Error while saving to {filename}: {e}") - - -def _save_bpte_program( - buffer: bytes, - model_name: str, - output_dir: str = "", -) -> None: - if model_name.endswith(".bpte"): - filename = model_name - else: - filename = os.path.join(output_dir, f"{model_name}.bpte") - try: - with open(filename, "wb") as f: - f.write(buffer) - logging.info(f"Saved exported program to {filename}") - except Exception as e: - logging.error(f"Error while saving to {output_dir}: {e}") - - def export_model( model: nn.Module, example_inputs: Tuple[Any, ...], @@ -74,32 +39,28 @@ def export_model( working_dir = tempfile.mkdtemp(dir="/tmp") logging.debug(f"Created work directory {working_dir}") - # convert the model (also called in quantize_pt2) - converted_model = convert_pt2(model, example_inputs, CadenceQuantizer()) + # Instantiate the quantizer + quantizer = CadenceQuantizer() - # Get reference outputs from quantized_model - ref_outputs = converted_model(*example_inputs) + # Convert the model + converted_model = convert_pt2(model, example_inputs, quantizer) - # Quantize the model - quantized_model = quantize_pt2(model, example_inputs) + # Get reference outputs from converted model + ref_outputs = converted_model(*example_inputs) - # Get edge program (also called in export_to_cadence) - edge_prog_manager = export_to_edge(quantized_model, example_inputs) + # Quantize the model (note: quantizer needs to be the same as + # the one used in convert_pt2) + 
quantized_model = fuse_pt2(converted_model, quantizer) # Get edge program after Cadence specific passes cadence_prog_manager = export_to_cadence(quantized_model, example_inputs) + # Get executorch program after Cadence specific passes exec_prog: ExecutorchProgramManager = cadence_prog_manager.to_executorch() logging.info("Final exported graph:\n") exec_prog.exported_program().graph_module.graph.print_tabular() - # Print some information to terminal - print_ops_info( - edge_prog_manager.exported_program().graph_module, - cadence_prog_manager.exported_program().graph_module, - ) - forward_test_data = BundledProgramManager.bundled_program_test_data_gen( method="forward", inputs=example_inputs, expected_outputs=ref_outputs ) @@ -110,9 +71,9 @@ def export_model( forward_test_data, ) # Save the program as pte (default name is CadenceDemoModel.pte) - _save_pte_program(exec_prog, file_name, working_dir) + save_pte_program(exec_prog, file_name, working_dir) # Save the program as btpe (default name is CadenceDemoModel.bpte) - _save_bpte_program(buffer, file_name, working_dir) + save_bpte_program(buffer, file_name, working_dir) logging.debug( f"Executorch bundled program buffer saved to {file_name} is {len(buffer)} total bytes" diff --git a/backends/cadence/aot/utils.py b/backends/cadence/aot/utils.py index b710f7d4e57..9e32f3472da 100644 --- a/backends/cadence/aot/utils.py +++ b/backends/cadence/aot/utils.py @@ -8,10 +8,12 @@ import logging import operator +import os from typing import Dict, List, Tuple import torch -from executorch.exir import memory + +from executorch.exir import ExecutorchProgramManager, memory from executorch.exir.dialects._ops import ops as exir_ops from executorch.exir.dialects.edge._ops import EdgeOpOverload, EdgeOpOverloadPacket from tabulate import tabulate @@ -104,11 +106,11 @@ def get_ops_count(graph_module: torch.fx.GraphModule) -> Dict[str, int]: ): continue # If the op is already present, increment the count - if get_edge_overload_packet(node.target).__name__ in freq: - freq[get_edge_overload_packet(node.target).__name__] += 1 + if node.target._name in freq: + freq[node.target._name] += 1 # else, add a new entry else: - freq[get_edge_overload_packet(node.target).__name__] = 1 + freq[node.target._name] = 1 return freq @@ -185,3 +187,36 @@ def model_gm_has_SDPA(model_gm: torch.fx.GraphModule) -> bool: if node.target == torch.ops.aten.scaled_dot_product_attention.default: return True return False + + +def save_pte_program( + prog: ExecutorchProgramManager, model_name: str, output_dir: str = "" +) -> None: + if model_name.endswith(".pte"): + filename = model_name + else: + filename = os.path.join(output_dir, f"{model_name}.pte") + + try: + with open(filename, "wb") as file: + prog.write_to_file(file) + logging.info(f"Saved exported program to {filename}") + except Exception as e: + logging.error(f"Error while saving to {filename}: {e}") + + +def save_bpte_program( + buffer: bytes, + model_name: str, + output_dir: str = "", +) -> None: + if model_name.endswith(".bpte"): + filename = model_name + else: + filename = os.path.join(output_dir, f"{model_name}.bpte") + try: + with open(filename, "wb") as f: + f.write(buffer) + logging.info(f"Saved exported program to {filename}") + except Exception as e: + logging.error(f"Error while saving to {output_dir}: {e}") diff --git a/backends/cadence/build_cadence_xtensa.sh b/backends/cadence/build_cadence_xtensa.sh new file mode 100644 index 00000000000..eebd0707d19 --- /dev/null +++ b/backends/cadence/build_cadence_xtensa.sh @@ -0,0 
+1,87 @@ +#!/bin/bash +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +set -euo pipefail + +unset CMAKE_PREFIX_PATH +git submodule sync +git submodule update --init +./install_requirements.sh + +rm -rf cmake-out + +STEPWISE_BUILD=false + +if $STEPWISE_BUILD; then + echo "Building ExecuTorch" + cmake -DCMAKE_INSTALL_PREFIX=cmake-out \ + -DCMAKE_TOOLCHAIN_FILE=./backends/cadence/cadence.cmake \ + -DCMAKE_BUILD_TYPE=Release \ + -DEXECUTORCH_ENABLE_EVENT_TRACER=OFF \ + -DEXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL=ON \ + -DEXECUTORCH_BUILD_EXECUTOR_RUNNER=OFF \ + -DEXECUTORCH_BUILD_PTHREADPOOL=OFF \ + -DEXECUTORCH_BUILD_CPUINFO=OFF \ + -DEXECUTORCH_ENABLE_LOGGING=ON \ + -DEXECUTORCH_USE_DL=OFF \ + -DEXECUTORCH_BUILD_CADENCE=OFF \ + -DFLATC_EXECUTABLE="$(which flatc)" \ + -Bcmake-out . + + echo "Building any Cadence-specific binaries on top" + cmake -DBUCK2="$BUCK" \ + -DCMAKE_TOOLCHAIN_FILE=./backends/cadence/cadence.cmake \ + -DCMAKE_INSTALL_PREFIX=cmake-out \ + -DCMAKE_BUILD_TYPE=Release \ + -DEXECUTORCH_BUILD_HOST_TARGETS=ON \ + -DEXECUTORCH_BUILD_EXECUTOR_RUNNER=ON \ + -DEXECUTORCH_BUILD_PTHREADPOOL=OFF \ + -DEXECUTORCH_BUILD_CADENCE=ON \ + -DFLATC_EXECUTABLE="$(which flatc)" \ + -DEXECUTORCH_ENABLE_LOGGING=ON \ + -DEXECUTORCH_ENABLE_PROGRAM_VERIFICATION=ON \ + -DEXECUTORCH_USE_DL=OFF \ + -DBUILD_EXECUTORCH_PORTABLE_OPS=ON \ + -DEXECUTORCH_BUILD_KERNELS_CUSTOM=OFF \ + -DPYTHON_EXECUTABLE=python3 \ + -DEXECUTORCH_NNLIB_OPT=ON \ + -DEXECUTORCH_BUILD_GFLAGS=ON \ + -DHAVE_FNMATCH_H=OFF \ + -Bcmake-out/backends/cadence \ + backends/cadence + cmake --build cmake-out/backends/cadence -j16 +else + echo "Building Cadence toolchain with ExecuTorch packages" + cmake_prefix_path="${PWD}/cmake-out/lib/cmake/ExecuTorch;${PWD}/cmake-out/third-party/gflags" + cmake -DBUCK2="$BUCK" \ + -DCMAKE_PREFIX_PATH="${cmake_prefix_path}" \ + -DCMAKE_TOOLCHAIN_FILE=./backends/cadence/cadence.cmake \ + -DCMAKE_INSTALL_PREFIX=cmake-out \ + -DCMAKE_BUILD_TYPE=Release \ + -DEXECUTORCH_BUILD_HOST_TARGETS=ON \ + -DEXECUTORCH_BUILD_EXECUTOR_RUNNER=ON \ + -DEXECUTORCH_BUILD_PTHREADPOOL=OFF \ + -DEXECUTORCH_BUILD_CPUINFO=OFF \ + -DEXECUTORCH_BUILD_FLATC=OFF \ + -DEXECUTORCH_BUILD_CADENCE=ON \ + -DFLATC_EXECUTABLE="$(which flatc)" \ + -DEXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL=ON \ + -DEXECUTORCH_ENABLE_LOGGING=ON \ + -DEXECUTORCH_ENABLE_PROGRAM_VERIFICATION=ON \ + -DEXECUTORCH_USE_DL=OFF \ + -DBUILD_EXECUTORCH_PORTABLE_OPS=ON \ + -DEXECUTORCH_BUILD_KERNELS_CUSTOM=OFF \ + -DPYTHON_EXECUTABLE=python3 \ + -DEXECUTORCH_NNLIB_OPT=ON \ + -DHAVE_FNMATCH_H=OFF \ + -Bcmake-out + cmake --build cmake-out --target install --config Release -j16 +fi + +echo "Run simple model to verify cmake build" +python3 -m examples.portable.scripts.export --model_name="add" +xt-run --turbo cmake-out/executor_runner --model_path=add.pte diff --git a/backends/cadence/cadence_runner/CMakeLists.txt b/backends/cadence/cadence_runner/CMakeLists.txt new file mode 100644 index 00000000000..17bdc855149 --- /dev/null +++ b/backends/cadence/cadence_runner/CMakeLists.txt @@ -0,0 +1,74 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# Set the minimum required version of CMake for this project. 
+cmake_minimum_required(VERSION 3.10) + +if(NOT CMAKE_CXX_STANDARD) + set(CMAKE_CXX_STANDARD 17) +endif() + +# Set the project name. +project(cadence_backend) + +# Source root directory for executorch. +if(NOT EXECUTORCH_ROOT) + set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../..) +endif() + +include(${EXECUTORCH_ROOT}/build/Utils.cmake) +include(${EXECUTORCH_ROOT}/build/Codegen.cmake) + +# Let files say "include ". +set(_common_include_directories ${EXECUTORCH_ROOT}/..) +set(TARGET_DIR reference) + +set(CMAKE_EXPORT_COMPILE_COMMANDS ON) + +if(NOT PYTHON_EXECUTABLE) + resolve_python_executable() +endif() + +# Find prebuilt libraries. executorch package should contain portable_ops_lib, +# etdump, bundled_program. +find_package(executorch CONFIG REQUIRED) +target_link_options_shared_lib(executorch) +target_link_options_shared_lib(portable_ops_lib) + +target_include_directories(executorch INTERFACE ${_common_include_directories}) + +find_package( + gflags REQUIRED PATHS ${CMAKE_CURRENT_BINARY_DIR}/../../../third-party +) + +add_executable(cadence_runner + ${EXECUTORCH_ROOT}/examples/devtools/example_runner/example_runner.cpp +) +target_compile_options(executorch INTERFACE -DET_EVENT_TRACER_ENABLED) + +add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/${TARGET_DIR}/operators) +add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/${TARGET_DIR}/kernels) + +target_include_directories( + etdump INTERFACE ${CMAKE_CURRENT_BINARY_DIR}/../../../devtools/include + ${EXECUTORCH_ROOT}/third-party/flatcc/include +) + +target_include_directories( + cadence_runner PUBLIC ${ROOT_DIR}/../.. ${CMAKE_BINARY_DIR} + ${_common_include_directories} +) + +target_link_libraries( + cadence_runner + executorch + gflags + etdump + extension_data_loader + bundled_program + cadence_ops_lib + flatccrt +) diff --git a/backends/cadence/build_cadence_runner.sh b/backends/cadence/cadence_runner/build_cadence_runner.sh similarity index 85% rename from backends/cadence/build_cadence_runner.sh rename to backends/cadence/cadence_runner/build_cadence_runner.sh index 51f363f8de4..40b4eb37de2 100755 --- a/backends/cadence/build_cadence_runner.sh +++ b/backends/cadence/cadence_runner/build_cadence_runner.sh @@ -12,7 +12,7 @@ set -euo pipefail SCRIPT_DIR="$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )" readonly SCRIPT_DIR -readonly EXECUTORCH_ROOT="${SCRIPT_DIR}/../.." +readonly EXECUTORCH_ROOT="${SCRIPT_DIR}/../../.." # Allow overriding the number of build jobs. Default to 9. export CMAKE_BUILD_PARALLEL_LEVEL="${CMAKE_BUILD_PARALLEL_LEVEL:-9}" @@ -23,7 +23,7 @@ main() { rm -rf cmake-out cmake -DCMAKE_INSTALL_PREFIX=cmake-out \ -DCMAKE_BUILD_TYPE=Release \ - -DEXECUTORCH_BUILD_SDK=ON \ + -DEXECUTORCH_BUILD_DEVTOOLS=ON \ -DEXECUTORCH_ENABLE_EVENT_TRACER=ON \ -DPYTHON_EXECUTABLE=python3 \ -DEXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL=ON \ @@ -32,8 +32,9 @@ main() { -DEXECUTORCH_BUILD_PTHREADPOOL=OFF \ -DEXECUTORCH_BUILD_CPUINFO=OFF \ -DEXECUTORCH_ENABLE_LOGGING=ON \ - -Bcmake-out . - cmake --build cmake-out --target install --config Release + -DEXECUTORCH_NNLIB_OPT=OFF \ + -Bcmake-out + cmake --build cmake-out --target install --config Release -j16 local example_dir=backends/cadence local build_dir="cmake-out/${example_dir}" @@ -43,7 +44,7 @@ main() { -DCMAKE_BUILD_TYPE=Release \ -B"${build_dir}" \ "${example_dir}" - cmake --build "${build_dir}" --config Release + cmake --build "${build_dir}" --config Release -j16 local runner="${PWD}/${build_dir}/cadence_runner" if [[ ! 
-f "${runner}" ]]; then diff --git a/backends/cadence/cadence_runner/cadence_runner.cpp b/backends/cadence/cadence_runner/cadence_runner.cpp deleted file mode 100644 index a269ed5a8e8..00000000000 --- a/backends/cadence/cadence_runner/cadence_runner.cpp +++ /dev/null @@ -1,298 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -/** - * @file - * - * This tool can run ExecuTorch model files that only use operators that - * are covered by the portable kernels, with possible delegate to the - * test_backend_compiler_lib. - * - * It sets all input tensor data to ones, and assumes that the outputs are - * all fp32 tensors. - */ - -#include -#include - -#include - -#include -#include -#include -#include -#include -#include -#include - -static uint8_t method_allocator_pool[4 * 1024U * 1024U]; // 4MB - -DEFINE_string( - bundled_program_path, - "CadenceDemoModel.bpte", - "Model serialized in flatbuffer format."); - -DEFINE_int32( - testset_idx, - 0, - "Index of bundled verification set to be run " - "by bundled model for verification"); - -DEFINE_string( - etdump_path, - "etdump.etdp", - "If etdump generation is enabled an etdump will be written out to this path"); - -DEFINE_bool( - output_verification, - false, - "Comapre the model output to the reference outputs present in the BundledProgram."); - -DEFINE_bool( - print_output, - false, - "Print the output of the ET model to stdout, if needs."); - -DEFINE_bool(dump_outputs, true, "Dump outputs to etdump file"); - -DEFINE_bool( - dump_intermediate_outputs, - false, - "Dump intermediate outputs to etdump file."); - -DEFINE_string( - debug_output_path, - "debug_output.bin", - "Path to dump debug outputs to."); - -DEFINE_int32( - debug_buffer_size, - 262144, // 256 KB - "Size of the debug buffer in bytes to allocate for intermediate outputs and program outputs logging."); - -using namespace torch::executor; - -std::vector load_file_or_die(const char* path) { - std::ifstream file(path, std::ios::binary | std::ios::ate); - const size_t nbytes = file.tellg(); - file.seekg(0, std::ios::beg); - auto file_data = std::vector(nbytes); - ET_CHECK_MSG( - file.read(reinterpret_cast(file_data.data()), nbytes), - "Could not load contents of file '%s'", - path); - return file_data; -} - -int main(int argc, char** argv) { - runtime_init(); - - gflags::ParseCommandLineFlags(&argc, &argv, true); - if (argc != 1) { - std::string msg = "Extra commandline args:"; - for (int i = 1 /* skip argv[0] (program name) */; i < argc; i++) { - msg += std::string(" ") + argv[i]; - } - ET_LOG(Error, "%s", msg.c_str()); - return 1; - } - - // Read in the entire file. - const char* bundled_program_path = FLAGS_bundled_program_path.c_str(); - std::vector file_data = load_file_or_die(bundled_program_path); - - // Find the offset to the embedded Program. - const void* program_data; - size_t program_data_len; - Error status = torch::executor::bundled_program::GetProgramData( - reinterpret_cast(file_data.data()), - file_data.size(), - &program_data, - &program_data_len); - ET_CHECK_MSG( - status == Error::Ok, - "GetProgramData() failed on file '%s': 0x%x", - bundled_program_path, - (unsigned int)status); - - auto buffer_data_loader = - util::BufferDataLoader(program_data, program_data_len); - - // Parse the program file. 
This is immutable, and can also be reused - // between multiple execution invocations across multiple threads. - Result program = Program::load(&buffer_data_loader); - if (!program.ok()) { - ET_LOG(Error, "Failed to parse model file %s", bundled_program_path); - return 1; - } - ET_LOG(Info, "Model file %s is loaded.", bundled_program_path); - - // Use the first method in the program. - const char* method_name = nullptr; - { - const auto method_name_result = program->get_method_name(0); - ET_CHECK_MSG(method_name_result.ok(), "Program has no methods"); - method_name = *method_name_result; - } - ET_LOG(Info, "Running method %s", method_name); - - // MethodMeta describes the memory requirements of the method. - Result method_meta = program->method_meta(method_name); - ET_CHECK_MSG( - method_meta.ok(), - "Failed to get method_meta for %s: 0x%x", - method_name, - (unsigned int)method_meta.error()); - - // - // The runtime does not use malloc/new; it allocates all memory using the - // MemoryManger provided by the client. Clients are responsible for allocating - // the memory ahead of time, or providing MemoryAllocator subclasses that can - // do it dynamically. - // - - // The method allocator is used to allocate all dynamic C++ metadata/objects - // used to represent the loaded method. This allocator is only used during - // loading a method of the program, which will return an error if there was - // not enough memory. - // - // The amount of memory required depends on the loaded method and the runtime - // code itself. The amount of memory here is usually determined by running the - // method and seeing how much memory is actually used, though it's possible to - // subclass MemoryAllocator so that it calls malloc() under the hood (see - // MallocMemoryAllocator). - // - // In this example we use a statically allocated memory pool. - MemoryAllocator method_allocator{ - MemoryAllocator(sizeof(method_allocator_pool), method_allocator_pool)}; - - // The memory-planned buffers will back the mutable tensors used by the - // method. The sizes of these buffers were determined ahead of time during the - // memory-planning pasees. - // - // Each buffer typically corresponds to a different hardware memory bank. Most - // mobile environments will only have a single buffer. Some embedded - // environments may have more than one for, e.g., slow/large DRAM and - // fast/small SRAM, or for memory associated with particular cores. - std::vector> planned_buffers; // Owns the memory - std::vector> planned_spans; // Passed to the allocator - size_t num_memory_planned_buffers = method_meta->num_memory_planned_buffers(); - for (size_t id = 0; id < num_memory_planned_buffers; ++id) { - // .get() will always succeed because id < num_memory_planned_buffers. - size_t buffer_size = - static_cast(method_meta->memory_planned_buffer_size(id).get()); - ET_LOG(Info, "Setting up planned buffer %zu, size %zu.", id, buffer_size); - planned_buffers.push_back(std::make_unique(buffer_size)); - planned_spans.push_back({planned_buffers.back().get(), buffer_size}); - } - HierarchicalAllocator planned_memory( - {planned_spans.data(), planned_spans.size()}); - - // Assemble all of the allocators into the MemoryManager that the Executor - // will use. - MemoryManager memory_manager(&method_allocator, &planned_memory); - - // - // Load the method from the program, using the provided allocators. 
Running - // the method can mutate the memory-planned buffers, so the method should only - // be used by a single thread at at time, but it can be reused. - // - torch::executor::ETDumpGen etdump_gen = torch::executor::ETDumpGen(); - Result method = - program->load_method(method_name, &memory_manager, &etdump_gen); - ET_CHECK_MSG( - method.ok(), - "Loading of method %s failed with status 0x%" PRIx32, - method_name, - method.error()); - ET_LOG(Info, "Method loaded."); - - void* debug_buffer = malloc(FLAGS_debug_buffer_size); - if (FLAGS_dump_intermediate_outputs) { - Span buffer((uint8_t*)debug_buffer, FLAGS_debug_buffer_size); - etdump_gen.set_debug_buffer(buffer); - etdump_gen.set_event_tracer_debug_level( - EventTracerDebugLogLevel::kIntermediateOutputs); - } else if (FLAGS_dump_outputs) { - Span buffer((uint8_t*)debug_buffer, FLAGS_debug_buffer_size); - etdump_gen.set_debug_buffer(buffer); - etdump_gen.set_event_tracer_debug_level( - EventTracerDebugLogLevel::kProgramOutputs); - } - // Use the inputs embedded in the bundled program. - status = torch::executor::bundled_program::LoadBundledInput( - *method, file_data.data(), FLAGS_testset_idx); - ET_CHECK_MSG( - status == Error::Ok, - "LoadBundledInput failed with status 0x%" PRIx32, - status); - - ET_LOG(Info, "Inputs prepared."); - - // Run the model. - status = method->execute(); - ET_CHECK_MSG( - status == Error::Ok, - "Execution of method %s failed with status 0x%" PRIx32, - method_name, - status); - ET_LOG(Info, "Model executed successfully."); - - // Print the outputs. - if (FLAGS_print_output) { - std::vector outputs(method->outputs_size()); - status = method->get_outputs(outputs.data(), outputs.size()); - ET_CHECK(status == Error::Ok); - for (EValue& output : outputs) { - // TODO(T159700776): This assumes that all outputs are fp32 tensors. Add - // support for other EValues and Tensor dtypes, and print tensors in a - // more readable way. - auto output_tensor = output.toTensor(); - auto data_output = output_tensor.const_data_ptr(); - for (size_t j = 0; j < output_tensor.numel(); ++j) { - ET_LOG(Info, "%f", data_output[j]); - } - } - } - - // Dump the etdump data containing profiling/debugging data to the specified - // file. - etdump_result result = etdump_gen.get_etdump_data(); - if (result.buf != nullptr && result.size > 0) { - FILE* f = fopen(FLAGS_etdump_path.c_str(), "w+"); - fwrite((uint8_t*)result.buf, 1, result.size, f); - fclose(f); - free(result.buf); - } - - if (FLAGS_output_verification) { - // Verify the outputs. 
- status = - torch::executor::bundled_program::VerifyResultWithBundledExpectedOutput( - *method, - file_data.data(), - FLAGS_testset_idx, - 1e-3, // rtol - 1e-5 // atol - ); - ET_CHECK_MSG( - status == Error::Ok, - "Bundle verification failed with status 0x%" PRIx32, - status); - ET_LOG(Info, "Model verified successfully."); - } - - if (FLAGS_dump_outputs || FLAGS_dump_intermediate_outputs) { - FILE* f = fopen(FLAGS_debug_output_path.c_str(), "w+"); - fwrite((uint8_t*)debug_buffer, 1, FLAGS_debug_buffer_size, f); - fclose(f); - } - free(debug_buffer); - - return 0; -} diff --git a/backends/cadence/cadence_runner/targets.bzl b/backends/cadence/cadence_runner/targets.bzl deleted file mode 100644 index b59a98cd75a..00000000000 --- a/backends/cadence/cadence_runner/targets.bzl +++ /dev/null @@ -1,29 +0,0 @@ -load("@fbsource//tools/build_defs:fb_native_wrapper.bzl", "fb_native") -load("@fbsource//tools/build_defs:fb_xplat_cxx_binary.bzl", "fb_xplat_cxx_binary") -load("@fbsource//tools/build_defs:platform_defs.bzl", "CXX") - -def define_common_targets(): - fb_native.export_file( - name = "cadence_runner.cpp", - src = "cadence_runner.cpp", - visibility = [ - "PUBLIC", - ], - ) - - fb_xplat_cxx_binary( - name = "cadence_runner", - srcs = ["cadence_runner.cpp"], - headers = [], - platforms = CXX, - visibility = ["PUBLIC"], - deps = [ - "fbsource//arvr/third-party/gflags:gflags", - "fbsource//xplat/executorch/devtools/etdump:etdump_flatcc", - "fbsource//xplat/executorch/devtools/bundled_program:runtime", - "fbsource//xplat/executorch/extension/data_loader:file_data_loader", - "fbsource//xplat/executorch/extension/data_loader:buffer_data_loader", - "fbsource//xplat/executorch/kernels/portable:generated_lib", - "fbsource//xplat/executorch/runtime/executor:program", - ], - ) diff --git a/backends/cadence/hifi/kernels/CMakeLists.txt b/backends/cadence/hifi/kernels/CMakeLists.txt index 15d1a4ddd52..d03bb1c01ef 100644 --- a/backends/cadence/hifi/kernels/CMakeLists.txt +++ b/backends/cadence/hifi/kernels/CMakeLists.txt @@ -10,6 +10,8 @@ add_library( kernels.cpp ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/matmul_asym8uxasym8u_asym8u.cpp ) +# Let files say "include ". +set(_common_include_directories ${EXECUTORCH_ROOT}/..) target_include_directories( cadence_kernels @@ -19,6 +21,7 @@ target_include_directories( ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/nnlib-hifi4/xa_nnlib/include/nnlib ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/nnlib-hifi4/xa_nnlib/include ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/nnlib-hifi4/xa_nnlib/algo/ndsp/hifi4/include/ + ${_common_include_directories} ) target_link_libraries(cadence_kernels PRIVATE xa_nnlib) diff --git a/backends/cadence/hifi/kernels/TARGETS b/backends/cadence/hifi/kernels/TARGETS new file mode 100644 index 00000000000..67f2bab681a --- /dev/null +++ b/backends/cadence/hifi/kernels/TARGETS @@ -0,0 +1,5 @@ +load("targets.bzl", "define_common_targets") + +oncall("odai_jarvis") + +define_common_targets() diff --git a/backends/cadence/hifi/kernels/kernels.cpp b/backends/cadence/hifi/kernels/kernels.cpp index 5a2d58d2e2f..4d9183e4cc2 100644 --- a/backends/cadence/hifi/kernels/kernels.cpp +++ b/backends/cadence/hifi/kernels/kernels.cpp @@ -6,9 +6,9 @@ * LICENSE file in the root directory of this source tree. 
*/ -#include -#include "xa_nnlib_common.h" -#include "xa_nnlib_common_macros.h" +#include +#include +#include namespace impl { namespace HiFi { diff --git a/backends/cadence/hifi/kernels/kernels.h b/backends/cadence/hifi/kernels/kernels.h index 789c8942a85..b5659824615 100644 --- a/backends/cadence/hifi/kernels/kernels.h +++ b/backends/cadence/hifi/kernels/kernels.h @@ -8,12 +8,9 @@ #pragma once -#include "inttypes.h" -#include "stddef.h" -#include "xa_type_def.h" - -/* For NNLIB APIs */ -#include "xa_nnlib_kernels_api.h" +#include +#include +#include namespace impl { namespace HiFi { diff --git a/backends/cadence/hifi/kernels/targets.bzl b/backends/cadence/hifi/kernels/targets.bzl new file mode 100644 index 00000000000..acdc39dd16d --- /dev/null +++ b/backends/cadence/hifi/kernels/targets.bzl @@ -0,0 +1,18 @@ +load("@fbsource//tools/build_defs:platform_defs.bzl", "CXX") +load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") + +def define_common_targets(): + runtime.cxx_library( + name = "kernels", + srcs = ["kernels.cpp"], + exported_headers = [ + "kernels.h", + ], + visibility = [ + "//executorch/backends/cadence/...", + ], + exported_deps = [ + "fbsource//third-party/nnlib-hifi4/xa_nnlib:libxa_nnlib_common", + ], + platforms = CXX, + ) diff --git a/backends/cadence/hifi/operators/CMakeLists.txt b/backends/cadence/hifi/operators/CMakeLists.txt index 8da6169cda1..78413ef312e 100644 --- a/backends/cadence/hifi/operators/CMakeLists.txt +++ b/backends/cadence/hifi/operators/CMakeLists.txt @@ -28,6 +28,7 @@ set(_aten_ops__srcs "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/matmul_ops_util.cpp" "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/reduce_util.cpp" "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/repeat_util.cpp" + "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/slice_util.cpp" "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_add.cpp" "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_bmm.cpp" "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_cat.cpp" diff --git a/backends/cadence/hifi/operators/TARGETS b/backends/cadence/hifi/operators/TARGETS new file mode 100644 index 00000000000..67f2bab681a --- /dev/null +++ b/backends/cadence/hifi/operators/TARGETS @@ -0,0 +1,5 @@ +load("targets.bzl", "define_common_targets") + +oncall("odai_jarvis") + +define_common_targets() diff --git a/backends/cadence/hifi/operators/dequantize_per_tensor.cpp b/backends/cadence/hifi/operators/dequantize_per_tensor.cpp index 37eaecbe19d..79645f5381d 100644 --- a/backends/cadence/hifi/operators/dequantize_per_tensor.cpp +++ b/backends/cadence/hifi/operators/dequantize_per_tensor.cpp @@ -6,8 +6,9 @@ * LICENSE file in the root directory of this source tree. */ -#include +#include #include +#include namespace impl { namespace HiFi { diff --git a/backends/cadence/hifi/operators/quantize_per_tensor.cpp b/backends/cadence/hifi/operators/quantize_per_tensor.cpp index 6e74fb4f3ce..e280f6bcffd 100644 --- a/backends/cadence/hifi/operators/quantize_per_tensor.cpp +++ b/backends/cadence/hifi/operators/quantize_per_tensor.cpp @@ -6,8 +6,9 @@ * LICENSE file in the root directory of this source tree. 
*/ -#include +#include #include +#include namespace impl { namespace HiFi { diff --git a/backends/cadence/hifi/operators/quantized_layer_norm.cpp b/backends/cadence/hifi/operators/quantized_layer_norm.cpp index 930ce12dea9..3974d6ee5e9 100644 --- a/backends/cadence/hifi/operators/quantized_layer_norm.cpp +++ b/backends/cadence/hifi/operators/quantized_layer_norm.cpp @@ -6,9 +6,8 @@ * LICENSE file in the root directory of this source tree. */ -#include +#include #include - #include #include #include @@ -76,9 +75,11 @@ void quantized_layer_norm_( for (size_t j = 0; j < last_dim; ++j) { // Since X is quantized, we dequantize it, compute fp32 result, and // quantize the result to an int8/uint8 value. - float val = kernels::dequantize(x[j], input_scale, input_zero_point); + float val = impl::HiFi::kernels::dequantize( + x[j], input_scale, input_zero_point); val = (val - mean) * inv_std * weight_data[j] + bias_data[j]; - y[j] = kernels::quantize(val, output_inv_scale, output_zero_point); + y[j] = impl::HiFi::kernels::quantize( + val, output_inv_scale, output_zero_point); } } } diff --git a/backends/cadence/hifi/operators/quantized_linear_out.cpp b/backends/cadence/hifi/operators/quantized_linear_out.cpp index 0a254cb5f7d..fb186abbb14 100644 --- a/backends/cadence/hifi/operators/quantized_linear_out.cpp +++ b/backends/cadence/hifi/operators/quantized_linear_out.cpp @@ -6,8 +6,7 @@ * LICENSE file in the root directory of this source tree. */ -#include - +#include #include #include #include diff --git a/backends/cadence/hifi/operators/targets.bzl b/backends/cadence/hifi/operators/targets.bzl new file mode 100644 index 00000000000..c7b24d790f0 --- /dev/null +++ b/backends/cadence/hifi/operators/targets.bzl @@ -0,0 +1,30 @@ +load("@fbsource//tools/build_defs:platform_defs.bzl", "CXX") +load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") + +def define_common_targets(): + """Defines targets that should be shared between fbcode and xplat. + + The directory containing this targets.bzl file should also contain both + TARGETS and BUCK files that call this function. + """ + + # Define build targets for all operators registered in the tables above. + + runtime.cxx_library( + name = "cadence_hifi_ops", + srcs = glob([ + "*.cpp", + ]), + platforms = CXX, + deps = [ + "//executorch/kernels/portable/cpu/util:broadcast_util", + "//executorch/runtime/kernel:kernel_includes", + "//executorch/kernels/portable/cpu:scalar_utils", + "fbsource//third-party/nnlib-hifi4/xa_nnlib:libxa_nnlib", + "fbsource//third-party/nnlib-hifi4/xa_nnlib:libxa_nnlib_common", + "//executorch/backends/cadence/hifi/kernels:kernels", + ], + visibility = [ + "//executorch/backends/cadence/...", + ], + ) diff --git a/backends/cadence/reference/kernels/CMakeLists.txt b/backends/cadence/reference/kernels/CMakeLists.txt index fba66e9b27a..07394cbe834 100644 --- a/backends/cadence/reference/kernels/CMakeLists.txt +++ b/backends/cadence/reference/kernels/CMakeLists.txt @@ -7,4 +7,9 @@ # lint_cmake: -linelength add_library(cadence_kernels kernels.cpp) -target_include_directories(cadence_kernels PUBLIC .) +# Let files say "include ". +set(_common_include_directories ${EXECUTORCH_ROOT}/..) + +target_include_directories(cadence_kernels PUBLIC . 
+ ${_common_include_directories} +) diff --git a/backends/cadence/reference/kernels/kernels.cpp b/backends/cadence/reference/kernels/kernels.cpp index ae3e1bad2db..49e5e88d8ea 100644 --- a/backends/cadence/reference/kernels/kernels.cpp +++ b/backends/cadence/reference/kernels/kernels.cpp @@ -9,6 +9,7 @@ #include #include #include +#include #include namespace impl { @@ -17,8 +18,7 @@ namespace kernels { // Quantize a fp32 value to an int8_t/uint8_t value template -__attribute__((always_inline)) T -quantize(const float x, float scale, int32_t zero_point) { +T quantize(const float x, float scale, int32_t zero_point) { constexpr float min_val = std::numeric_limits::min(); constexpr float max_val = std::numeric_limits::max(); float tmp = roundf(x * scale + zero_point); @@ -40,8 +40,7 @@ void quantize( // Dequantize an int8_t/uint8_t value to an fp32 value template -__attribute__((always_inline)) float -dequantize(const T x, float scale, int32_t zero_point) { +float dequantize(const T x, float scale, int32_t zero_point) { return scale * (x - zero_point); } @@ -60,9 +59,8 @@ void dequantize( // explicit template instantiation -#define typed_quantize_val(dtype) \ - template __attribute__((always_inline)) dtype quantize( \ - const float x, float inv_scale, int32_t zero_point); +#define typed_quantize_val(dtype) \ + template dtype quantize(const float x, float inv_scale, int32_t zero_point); typed_quantize_val(int8_t); typed_quantize_val(uint8_t); typed_quantize_val(int16_t); @@ -82,9 +80,8 @@ typed_quantize_vec(int16_t); typed_quantize_vec(int32_t); #undef typed_quantize_vec -#define typed_dequantize_val(dtype) \ - template __attribute__((always_inline)) float dequantize( \ - const dtype x, float scale, int32_t zero_point); +#define typed_dequantize_val(dtype) \ + template float dequantize(const dtype x, float scale, int32_t zero_point); typed_dequantize_val(int8_t); typed_dequantize_val(uint8_t); typed_dequantize_val(int16_t); diff --git a/backends/cadence/reference/operators/CMakeLists.txt b/backends/cadence/reference/operators/CMakeLists.txt index 605c43ef715..c814a58fe8d 100644 --- a/backends/cadence/reference/operators/CMakeLists.txt +++ b/backends/cadence/reference/operators/CMakeLists.txt @@ -32,6 +32,7 @@ set(_aten_ops__srcs "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/matmul_ops_util.cpp" "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/reduce_util.cpp" "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/repeat_util.cpp" + "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/slice_util.cpp" "${EXECUTORCH_ROOT}/kernels/portable/cpu/pattern/unary_ufunc_realhb_to_floath.cpp" "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_bmm.cpp" "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_cat.cpp" diff --git a/backends/cadence/reference/operators/quantized_conv_out.cpp b/backends/cadence/reference/operators/quantized_conv_out.cpp index 47234a7cd95..59574aeca6a 100644 --- a/backends/cadence/reference/operators/quantized_conv_out.cpp +++ b/backends/cadence/reference/operators/quantized_conv_out.cpp @@ -9,8 +9,6 @@ #include #include -#include -#include namespace impl { namespace reference { diff --git a/backends/cadence/reference/operators/quantized_layer_norm.cpp b/backends/cadence/reference/operators/quantized_layer_norm.cpp index a2dd644a976..574bcef1b22 100644 --- a/backends/cadence/reference/operators/quantized_layer_norm.cpp +++ b/backends/cadence/reference/operators/quantized_layer_norm.cpp @@ -9,11 +9,10 @@ #include #include -#include #include -#include -using Tensor = exec_aten::Tensor; +using executorch::aten::Tensor; 
+using executorch::runtime::getLeadingDims; using executorch::runtime::KernelRuntimeContext; namespace impl { diff --git a/backends/cadence/reference/operators/quantized_linear_out.cpp b/backends/cadence/reference/operators/quantized_linear_out.cpp index 300158d8e5e..c85e3a59603 100644 --- a/backends/cadence/reference/operators/quantized_linear_out.cpp +++ b/backends/cadence/reference/operators/quantized_linear_out.cpp @@ -13,7 +13,8 @@ namespace impl { namespace reference { namespace native { -using Tensor = exec_aten::Tensor; +using executorch::aten::Tensor; +using executorch::runtime::getLeadingDims; using executorch::runtime::KernelRuntimeContext; void quantized_linear_out( diff --git a/backends/cadence/reference/operators/quantized_matmul_out.cpp b/backends/cadence/reference/operators/quantized_matmul_out.cpp index b381a8ee394..b0a9393cd01 100644 --- a/backends/cadence/reference/operators/quantized_matmul_out.cpp +++ b/backends/cadence/reference/operators/quantized_matmul_out.cpp @@ -13,7 +13,8 @@ namespace impl { namespace reference { namespace native { -using Tensor = exec_aten::Tensor; +using executorch::aten::Tensor; +using executorch::runtime::getLeadingDims; using executorch::runtime::KernelRuntimeContext; // The quantized matmul. The quantized matmul accumulates in a wider register, diff --git a/backends/cadence/runtime/executor.py b/backends/cadence/runtime/executor.py index d07b1b6a52e..6b173a97809 100644 --- a/backends/cadence/runtime/executor.py +++ b/backends/cadence/runtime/executor.py @@ -106,7 +106,9 @@ def __init__( working_dir: str = "", ): self.working_dir = working_dir - self.executor_builder = "./backends/cadence/build_cadence_runner.sh" + self.executor_builder = ( + "./backends/cadence/cadence_runner/build_cadence_runner.sh" + ) self.execute_runner = "./cmake-out/backends/cadence/cadence_runner" self.bundled_program_path: str = "CadenceDemoModel.bpte" diff --git a/backends/cadence/runtime/executor_main.sh b/backends/cadence/runtime/executor_main.sh index c850ab8b4a9..7d6cba09b87 100644 --- a/backends/cadence/runtime/executor_main.sh +++ b/backends/cadence/runtime/executor_main.sh @@ -5,7 +5,7 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. -# Test the end-to-end flow of building sdk_example_runner and use it to run +# Test the end-to-end flow of building devtools/example_runner and use it to run # an actual model. @@ -14,21 +14,21 @@ set -e # shellcheck source=/dev/null source "$(dirname "${BASH_SOURCE[0]}")/../../.ci/scripts/utils.sh" -cmake_install_executorch_sdk_lib() { +cmake_install_executorch_devtools_lib() { echo "Installing libexecutorch.a, libportable_kernels.a, libetdump.a, libbundled_program.a" rm -rf cmake-out retry cmake -DCMAKE_INSTALL_PREFIX=cmake-out \ -DCMAKE_BUILD_TYPE=Release \ - -DEXECUTORCH_BUILD_SDK=ON \ + -DEXECUTORCH_BUILD_DEVTOOLS=ON \ -DEXECUTORCH_ENABLE_EVENT_TRACER=ON \ -DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" \ -Bcmake-out . 
cmake --build cmake-out -j9 --target install --config Release } -test_cmake_sdk_example_runner() { - local example_dir=examples/sdk +test_cmake_devtools_example_runner() { + local example_dir=examples/devtools local build_dir=cmake-out/${example_dir} CMAKE_PREFIX_PATH="${PWD}/cmake-out/lib/cmake/ExecuTorch;${PWD}/cmake-out/third-party/gflags" rm -rf ${build_dir} @@ -42,8 +42,8 @@ test_cmake_sdk_example_runner() { echo "Building ${example_dir}" cmake --build ${build_dir} -j9 --config Release - echo 'Running sdk_example_runner' - ${build_dir}/sdk_example_runner --bundled_program_path="./CadenceDemoModel.bpte" + echo 'Running devtools/example_runner' + ${build_dir}/example_runner --bundled_program_path="./CadenceDemoModel.bpte" } if [[ -z $PYTHON_EXECUTABLE ]]; @@ -56,5 +56,5 @@ then BUCK=buck2 fi -cmake_install_executorch_sdk_lib -test_cmake_sdk_example_runner +cmake_install_executorch_devtools_lib +test_cmake_devtools_example_runner diff --git a/backends/mediatek/CMakeLists.txt b/backends/mediatek/CMakeLists.txt index 9a9c82d90a9..45f4bcda7fe 100644 --- a/backends/mediatek/CMakeLists.txt +++ b/backends/mediatek/CMakeLists.txt @@ -28,7 +28,7 @@ add_library(neuron_backend SHARED) target_compile_options(neuron_backend PRIVATE "-frtti" "-fexceptions") target_link_libraries(neuron_backend PRIVATE - executorch_no_prim_ops + executorch_core android log ${NEURON_BUFFER_ALLOCATOR_LIB} diff --git a/examples/mediatek/requirements.txt b/backends/mediatek/requirements.txt similarity index 100% rename from examples/mediatek/requirements.txt rename to backends/mediatek/requirements.txt diff --git a/backends/mediatek/scripts/README.md b/backends/mediatek/scripts/README.md index 76d0c5ad5fb..0550fb23d51 100644 --- a/backends/mediatek/scripts/README.md +++ b/backends/mediatek/scripts/README.md @@ -10,41 +10,60 @@ Before you begin, ensure you have the following prerequisites installed and conf - **Download Buck2**: Obtain Buck2 from the official [releases page](https://github.com/facebook/buck2/releases/tag/2024-02-01). - **Add to PATH**: Extract the downloaded file and add the directory to your system's `$PATH` environment variable. -```bash -export PATH=:$PATH -``` + ```bash + export PATH=:$PATH + ``` ### 2. Android NDK -- **Download Android NDK**: Acquire the Android NDK from the [Android developer site](https://developer.android.com/ndk/downloads). +- **Download Android NDK**: Acquire the Android NDK version 26.3.11579264 from the [Android developer site](https://developer.android.com/ndk/downloads). - **Set NDK Path**: Ensure that the `$ANDROID_NDK` environment variable is set to the path where the NDK is located. -```bash -export ANDROID_NDK= -``` + ```bash + export ANDROID_NDK= + ``` ### 3. MediaTek ExercuTorch Libraries -Download the following libraries from MediaTek's NeuroPilot portal (link to be added): +Download [NeuroPilot Express SDK](https://neuropilot.mediatek.com/resources/public/npexpress/en/docs/npexpress) from MediaTek's NeuroPilot portal: - `libneuronusdk_adapter.mtk.so`: This universal SDK contains the implementation required for executing target-dependent code on the MediaTek chip. - `libneuron_buffer_allocator.so`: This utility library is designed for allocating DMA buffers necessary for model inference. -```bash -export NEURON_BUFFER_ALLOCATOR_LIB= -``` +- `mtk_converter-8.8.0.dev20240723+public.d1467db9-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl`: This library preprocess the model into a MediaTek representation. 
+- `mtk_neuron-8.2.2-py3-none-linux_x86_64.whl`: This library converts the model to binaries. ## Setup -Follow the steps below to set up your build environment: +Follow the steps below to setup your build environment: + +1. **Setup ExercuTorch Environment**: Refer to the [Setting up ExercuTorch](https://pytorch.org/executorch/stable/getting-started-setup) guide for detailed instructions on setting up the ExercuTorch environment. + +2. **Setup MediaTek Backend Environment** +- Install the dependent libs. Ensure that you are inside backends/mediatek/ directory + ```bash + pip3 install -r requirements.txt + ``` +- Install the two .whl downloaded from NeuroPilot Portal + ```bash + pip3 install mtk_neuron-8.2.2-py3-none-linux_x86_64.whl + pip3 install mtk_converter-8.8.0.dev20240723+public.d1467db9-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl + ``` +- Set evironment variables for building backend + ```bash + export NEURON_BUFFER_ALLOCATOR_LIB= + ``` -1. **ExercuTorch Official Tutorial**: Refer to the [Setting up ExercuTorch](https://pytorch.org/executorch/stable/getting-started-setup) guide for detailed instructions on setting up the ExercuTorch environment. +## Build -2. **Build Script**: Once the prerequisites are in place, run the `mtk_build.sh` script to start the build process. +1. **Build MediaTek Backend**: Once the prerequisites are in place, run the `mtk_build.sh` script to start the build process, MediaTek backend will be built under `cmake-android-out/backends/` as `libneuron_backend.so` ```bash ./mtk_build.sh ``` -3. **Push MediaTek universal SDK to the device**: push libneuronusdk_adapter.mtk.so to the phone and export it to the `$LD_LIBRARY_PATH` environment variable before executing ExercuTorch with MediaTek backend. + +## Run + +1. **Push MediaTek universal SDK and MediaTek backend to the device**: push `libneuronusdk_adapter.mtk.so` and `libneuron_backend.so` to the phone and export it to the `$LD_LIBRARY_PATH` environment variable before executing ExercuTorch with MediaTek backend. ```bash - export LD_LIBRARY_PATH=:$LD_LIBRARY_PATH + export LD_LIBRARY_PATH=::$LD_LIBRARY_PATH ``` diff --git a/backends/qualcomm/CMakeLists.txt b/backends/qualcomm/CMakeLists.txt index 8c62b025bcd..a73b4ba85da 100644 --- a/backends/qualcomm/CMakeLists.txt +++ b/backends/qualcomm/CMakeLists.txt @@ -181,7 +181,10 @@ target_link_libraries( ) target_link_libraries( qnn_executorch_backend PRIVATE qnn_executorch_header qnn_schema qnn_manager - executorch_no_prim_ops qcir_utils + executorch_core qcir_utils extension_tensor +) +set_target_properties( + qnn_executorch_backend PROPERTIES LINK_FLAGS "-Wl,-rpath='$ORIGIN'" ) target_link_libraries(utils PRIVATE qnn_executorch_logging) target_link_libraries( @@ -243,6 +246,7 @@ if(${CMAKE_SYSTEM_PROCESSOR} MATCHES "x86_64") qnn_executorch_header executorch qcir_utils + extension_tensor ) target_link_libraries( PyQnnWrapperAdaptor PRIVATE pybind11::module pybind11::lto wrappers diff --git a/backends/qualcomm/README.md b/backends/qualcomm/README.md index 3c0fdd8f987..be7cd427d6e 100644 --- a/backends/qualcomm/README.md +++ b/backends/qualcomm/README.md @@ -73,3 +73,67 @@ examples/qualcomm Please see this [README.md](../../examples/qualcomm/README.md). Further, an example build script is provided as [build.sh](scripts/build.sh). + +## Issues +If you want to address the problem encountered, it would be great to have reproduction information for indicating maintainers. Please also follow the [policy](../../CONTRIBUTING.md#issues) to emit issues. 
+
+## Pull Requests
+PRs are always welcome to help improve the codebase in a comprehensive manner. Before submitting changes, please make sure to:
+
+- **Check the Coding Style**:
+ Make sure your code follows the [style guides](../../CONTRIBUTING.md#coding-style) and passes the [lint checks](../../CONTRIBUTING.md#lintrunner). + +- **Add Unit Tests**:
+ Following is an example of adding test case after [creating new operator builder](builders/README.md), please navigate to `backends/qualcomm/tests` folder and put minimum example module in `model.py`. e.g.: + ```python + class IndexPut(torch.nn.Module): + ... + + # please insert implementation in alphabetical order + class LayerNorm(torch.nn.Module): + def __init__(self): + super().__init__() + self.layer_norm = torch.nn.LayerNorm([768], eps=1e-6) + + def forward(self, x): + return self.layer_norm(x) + + + class LeakyReLUDefault(torch.nn.Module): + ... + ``` + Also extend sections `TestQNNFloatingPointOperator`, `TestQNNQuantizedOperator` in `test_qnn_delegate.py`. e.g.: + ```python + class TestQNNQuantizedOperator(TestQNN): + def test_qnn_backend_interpolate_nearest_2d(self): + ... + + # please insert it implementation alphabetical order + def test_qnn_backend_layer_norm(self): + module = LayerNorm() # noqa: F405 + sample_input = (torch.randn(196, 768),) + module = self.get_qdq_module(module, sample_input) + self.lower_module_and_test_output(module, sample_input) + + def test_qnn_backend_leaky_relu(self): + ... + ``` + +- **Verify Unit Test Results**:
+  ```bash
+  cd $PATH_TO_EXECUTORCH
+  # example usage of running a unit test
+  python backends/qualcomm/tests/test_qnn_delegate.py -k TestQNNQuantizedOperator.test_qnn_backend_layer_norm -s $DEVICE_SERIAL -m SM8650 -b build-android/ -a $PATH_TO_TEST_ARTIFACTS
+  ```
+  The test graph is expected to have one delegated node, with only placeholder / output nodes left. Check the execution report for more information.
+
+- **Code Reviews**:
+ Please ping authors in Qualcomm AI Engine Direct related PRs for reviewing, possible candidates are listed below: + - [chiwwang](https://github.com/chiwwang) + - [shewu-quic](https://github.com/shewu-quic) + - [chunit-quic](https://github.com/chunit-quic) + - [winskuo-quic](https://github.com/winskuo-quic) + - [chuntl](https://github.com/chuntl) + - [haowhsu-quic](https://github.com/haowhsu-quic) + +Thanks again for your contribution! diff --git a/backends/qualcomm/TARGETS b/backends/qualcomm/TARGETS index 0a42614a385..6a720a62479 100644 --- a/backends/qualcomm/TARGETS +++ b/backends/qualcomm/TARGETS @@ -1,5 +1,18 @@ +load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") load(":targets.bzl", "define_common_targets") oncall("executorch") define_common_targets() + +runtime.python_library( + name = "preprocess", + srcs = ["qnn_preprocess.py"], + visibility = [ + "//executorch/backends/qualcomm/...", + "@EXECUTORCH_CLIENTS", + ], + deps = [ + "//executorch/backends/qualcomm/passes:passes", + ], +) diff --git a/backends/qualcomm/aot/python/TARGETS b/backends/qualcomm/aot/python/TARGETS new file mode 100644 index 00000000000..0a42614a385 --- /dev/null +++ b/backends/qualcomm/aot/python/TARGETS @@ -0,0 +1,5 @@ +load(":targets.bzl", "define_common_targets") + +oncall("executorch") + +define_common_targets() diff --git a/backends/qualcomm/aot/python/targets.bzl b/backends/qualcomm/aot/python/targets.bzl new file mode 100644 index 00000000000..b16acfc4905 --- /dev/null +++ b/backends/qualcomm/aot/python/targets.bzl @@ -0,0 +1,100 @@ +load( + "@fbsource//tools/build_defs:default_platform_defs.bzl", + "ANDROID", +) +load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") + +PYTHON_MODULE_NAME = "PyQnnManagerAdaptor" + +def define_common_targets(): + """Defines targets that should be shared between fbcode and xplat. + The directory containing this targets.bzl file should also contain both + TARGETS and BUCK files that call this function. 
+ """ + + runtime.cxx_python_extension( + name = "PyQnnManagerAdaptor", + srcs = [ + "PyQnnManagerAdaptor.cpp", + ], + headers = [ + "PyQnnManagerAdaptor.h", + ], + base_module = "executorch.backends.qualcomm.python", + preprocessor_flags = [ + "-DEXECUTORCH_PYTHON_MODULE_NAME={}".format(PYTHON_MODULE_NAME), + ], + deps = [ + "//executorch/runtime/core:core", + "//executorch/backends/qualcomm/aot/python:python_lib", + "//executorch/backends/qualcomm/aot/wrappers:wrappers", + "//executorch/backends/qualcomm/runtime:logging", + "//executorch/backends/qualcomm:schema", + "//executorch/backends/qualcomm/aot/ir:qcir_utils", + "//executorch/backends/qualcomm/runtime:runtime", + "fbsource//third-party/qualcomm/qnn:api", + ], + external_deps = [ + "pybind11", + "libtorch_python", + ], + use_static_deps = True, + visibility = [ + "//executorch/backends/qualcomm/...", + ], + ) + + + runtime.cxx_python_extension( + name = "PyQnnWrapperAdaptor", + srcs = [ + "PyQnnWrapperAdaptor.cpp", + ], + headers = [ + "PyQnnWrapperAdaptor.h", + ], + base_module = "executorch.backends.qualcomm.python", + preprocessor_flags = [ + "-DEXECUTORCH_PYTHON_MODULE_NAME={}".format(PYTHON_MODULE_NAME), + ], + deps = [ + "//executorch/runtime/core:core", + "//executorch/backends/qualcomm/aot/python:python_lib", + "//executorch/backends/qualcomm/aot/wrappers:wrappers", + "//executorch/backends/qualcomm/runtime:logging", + "//executorch/backends/qualcomm:schema", + "//executorch/backends/qualcomm/aot/ir:qcir_utils", + "//executorch/backends/qualcomm/runtime:runtime", + "fbsource//third-party/qualcomm/qnn:api", + ], + external_deps = [ + "pybind11", + "libtorch_python", + ], + use_static_deps = True, + visibility = [ + "//executorch/backends/qualcomm/...", + ], + ) + + runtime.cxx_library( + name = "python_lib", + srcs = glob([ + "*.cpp", + ]), + exported_headers = glob([ + "*.h", + ]), + visibility = ["@EXECUTORCH_CLIENTS"], + deps = [ + "//executorch/backends/qualcomm/aot/wrappers:wrappers", + "//executorch/backends/qualcomm/runtime:logging", + "//executorch/backends/qualcomm:schema", + "//executorch/backends/qualcomm/aot/ir:qcir_utils", + "//executorch/backends/qualcomm/runtime:runtime", + "fbsource//third-party/qualcomm/qnn:api", + ], + external_deps = [ + "pybind11", + ], + ) diff --git a/backends/qualcomm/aot/wrappers/TensorWrapper.cpp b/backends/qualcomm/aot/wrappers/TensorWrapper.cpp index 9d80fd735aa..b6beafb40cf 100644 --- a/backends/qualcomm/aot/wrappers/TensorWrapper.cpp +++ b/backends/qualcomm/aot/wrappers/TensorWrapper.cpp @@ -91,7 +91,9 @@ TensorWrapper::TensorWrapper( if (data != nullptr) { QNN_VER_PTR(tensor_)->clientBuf.dataSize = bytes; - if (copy_data) { + if (tensor_type != QNN_TENSOR_TYPE_STATIC) { + QNN_VER_PTR(tensor_)->clientBuf.data = nullptr; + } else if (copy_data) { owned_data_ = std::make_unique(bytes); const char* src_data = static_cast(data); std::memcpy(owned_data_.get(), src_data, bytes); diff --git a/backends/qualcomm/builders/README.md b/backends/qualcomm/builders/README.md new file mode 100644 index 00000000000..a81df0d6def --- /dev/null +++ b/backends/qualcomm/builders/README.md @@ -0,0 +1,361 @@ +# Contribution for More Operators +Thank you for contributing to Qualcomm AI Engine Direct delegate for ExecuTorch. Reading and following these guidelines will help you quickly get the essentials of implementing operator builder to unblock yourself and land pull requests more efficiently. 
+
+## Sections
+* [References](#references)
+* [Getting Started](#getting-started)
+  * [Identify Unsupported Operator](#identify-unsupported-operator)
+  * [Check Operator Spec](#check-operator-spec)
+  * [Implementation](#implementation)
+  * [Quantizer Annotation](#quantizer-annotation)
+* [Issues](#issues)
+* [Pull Requests](#pull-requests)
+
+## References
+### Qualcomm AI Engine Direct
+- [Operator Definitions](https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-50/MasterOpDef.html)
+- [Supported Operators in Backends](https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-50/operations.html#backend-supplements)
+
+### PyTorch
+- [torch.nn Operator Definitions](https://pytorch.org/docs/stable/nn.html)
+- [torch.nn.functional Operator Definitions](https://pytorch.org/docs/stable/nn.functional.html)
+- [ATen Operator Definitions](https://github.com/pytorch/pytorch/tree/main/aten/src/ATen/native)
+
+## Getting Started
+### Identify Unsupported Operator
+Suppose we are enabling the following model:
+```python
+class MyModel(torch.nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.layer_norm = torch.nn.LayerNorm([768], eps=1e-6)
+        self.linear = torch.nn.Linear(768, 100)
+
+    def forward(self, x):
+        return self.linear(self.layer_norm(x))
+```
+When we try to lower it with the Qualcomm backend:
+```python
+from executorch.examples.qualcomm.utils import build_executorch_binary
+
+build_executorch_binary(
+    model=MyModel(),
+    inputs=(torch.randn(200, 768),),
+    soc_model="SM8650",
+    file_name="my_model",
+    dataset=None,
+)
+```
+Assuming there is no `torch.nn.LayerNorm` support, you should see the following error log:
+```bash
+File "/executorch/backends/qualcomm/partition/qnn_partitioner.py", line 77, in is_node_supported
+    op_wrapper = self.node_visitors[node.target.__name__].define_node(
+KeyError: 'aten.native_layer_norm.default'
+```
+This log comes straight to the point: there is no suitable conversion for delegating the torch operator to Qualcomm AI Engine Direct. Here `node_visitors` is a dictionary that maps an operator target name to its implementation callback. The goal of this tutorial is to help you register the missing one.
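+
+For intuition, the following is a minimal sketch (not the actual partitioner code) of the lookup that raises the `KeyError` above; `node_visitors` stands for the dictionary just described, and `nodes_to_wrappers` is the tensor memo introduced later in this tutorial:
+```python
+# Simplified view of how a builder is resolved for a graph node.
+# A missing key means no operator builder has been registered for this target.
+target_name = node.target.__name__
+if target_name not in node_visitors:
+    raise KeyError(target_name)  # e.g. 'aten.native_layer_norm.default'
+op_wrapper = node_visitors[target_name].define_node(node, nodes_to_wrappers)
+```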
+The very first step is to locate which operator type we are going to support. Sometimes the target name of an operator might be obscure; the following snippet can help you trace it back through its call stack:
+```python
+from executorch.backends.qualcomm.utils.utils import capture_program
+
+prog = capture_program(MyModel(), (torch.randn(200, 768),))
+for node in prog.exported_program.graph.nodes:
+    if node.op == "call_function" and node.target.__name__ == 'aten.native_layer_norm.default':
+        print(node.meta["source_fn_stack"])
+```
+It will provide a hint about the source PyTorch layer that the missing operator maps to:
+```bash
+[('l__self___layer_norm', <class 'torch.nn.modules.normalization.LayerNorm'>)]
+```
+
+### Check Operator Spec
+- **Qualcomm AI Engine Direct**:
+ You could collect information of `LayerNorm`'s IO via documents mentioned in [Qualcomm AI Engine Direct Manual](#qualcomm-ai-engine-direct): + * inputs + - in[0] - input activation / required + - in[1] - gamma / optional + - in[2] - beta / optional + * parameters + - "epsilon" / optional + - "axes" / required + * outputs + - out[0] - output activation / required + + The required tensors must be provided for no default values were given inside QNN runtime, The order of IOs (`input activation`, `gamma`, `beta`) matters compared to parameters (`epsilon`, `axes`) who are recognized by literal value: + ```c + typedef struct { + /// A human-readable name for the operation instance. + const char* name; + /// The name of the operation package to which this operation's type belongs. + const char* packageName; + /// The name of operation type (e.g. Conv2D). + const char* typeName; + /// The number of static parameters provided in the params array. + uint32_t numOfParams; + /// Array of operation parameters. + Qnn_Param_t* params; + /// The number of input tensors. + uint32_t numOfInputs; + /// Array of input tensors. + Qnn_Tensor_t* inputTensors; + /// The number of output tensors. + uint32_t numOfOutputs; + /// Array of output tensors. + Qnn_Tensor_t* outputTensors; + } Qnn_OpConfigV1_t; + ``` + This is a data structure used to check operator validity in QNN SDK. Inside validation process, tensors are retrieved sequentially and passed through a series of spec examinations while parameters are matched by their names: + ```c + typedef struct { + /// Parameter type: scalar or tensor + Qnn_ParamType_t paramType; + /// Name of the parameter + const char* name; + + union UNNAMED { + /// Scalar parameter specification + Qnn_Scalar_t scalarParam; + /// Tensor parameter specification; tensors referred to must be STATIC. + Qnn_Tensor_t tensorParam; + }; + } Qnn_Param_t; + ``` + The name value equals to the parameter name described in [Operator Definitions](https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-50/MasterOpDef.html), there are `epsilon`, `axes` for `LayerNorm` case.
+
+  If you find it hard to correlate the missing operator with the documentation, this [table](https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-50/SupportedOps.html) might be helpful for searching. In some cases an exact match may not exist; consider seeking a mathematically equivalent approach, or notify a maintainer for further analysis.
+
+- **PyTorch**:
+ We could also read the IO spec from [function declaration](https://github.com/pytorch/pytorch/blob/main/aten/src/ATen/native/layer_norm.cpp) mentioned in [PyTorch Documentation](#pytorch): + * inputs + - in[0] - input activation / required + - in[1] - normalized_shape / required + - in[2] - weight_opt / optional + - in[3] - bias_opt / optional + - in[4] - eps / required + + Through comparing the [equation](https://pytorch.org/docs/stable/generated/torch.nn.LayerNorm.html), we could sort out the relevance of arguments (`gamma` / `beta` / `epsilon`) inside Qualcomm manual to PyTorch (`weight_opt` / `bias_opt` / `eps`). The unmatched parameter `axes` will have more discussions in the [implementation](#implementation) part. + +### Implementation +Let's start with adding new definition in `qnn_constant.py` for `LayerNorm` operator. +```python +@dataclass(init=False, frozen=True) +class OpHardSwish: + ... + +# please insert it in alphabetically order +@dataclass(init=False, frozen=True) +class OpLayerNorm: + op_name: str = "LayerNorm" + param_epsilon = "epsilon" + param_axes = "axes" + + +@dataclass(init=False, frozen=True) +class OpLogSoftmax: + ... +``` +The conventions are: +- op_name: string describing the operator +- params_xxx: string for consumed parameters + +The content should have exact match with literal values mentioned in [Qualcomm AI Engine Direct Manual](#qualcomm-ai-engine-direct) or `QnnOpDef.h` under `$QNN_SDK_ROOT/include/QNN/`: +```c +#define QNN_OP_LAYER_NORM "LayerNorm" +#define QNN_OP_LAYER_NORM_PARAM_EPSILON "epsilon" +#define QNN_OP_LAYER_NORM_PARAM_AXES "axes" +``` + +Next, create a new file with name in snake case format (e.g. `op_layer_norm.py`) and import required modules (please check comments for getting the ideas of usage): +```python +# pybind interface for invoking QNN APIs +import executorch.backends.qualcomm.python.PyQnnWrapperAdaptor as PyQnnWrapper +# tensors or other numerics will be shipped in numpy format +import numpy as np +import torch +# common keywords of Qualcomm backend +from executorch.backends.qualcomm.utils.constants import QCOM_DATA +# op builder will inherit NodeVisitor and have its own implementation +# register_node_visitor for book-keeping the dictionary of target name v.s. callback +from .node_visitor import NodeVisitor, register_node_visitor +# the definitions required to build operator in QNN +from .qnn_constants import OpLayerNorm, QNN_OP_PACKAGE_NAME_QTI_AISW +# utility to get parameter value when creating tensor in QNN +from .utils import get_parameter +``` +Start with function declaration as: +```python +@register_node_visitor +class LayerNormVisitor(NodeVisitor): + target = ["aten.native_layer_norm.default"] + + def __init__(self, *args) -> None: + super().__init__(*args) + + def define_node( + self, + node: torch.fx.Node, + nodes_to_wrappers: Dict[torch.fx.Node, PyQnnWrapper.TensorWrapper], + ) -> PyQnnWrapper.PyQnnOpWrapper: +``` +It's mandatory to have `target` member in list form, since there would have multiple targets map to the same implementation. e.g. `aten.leaky_relu.default`, `aten.prelu.default` have similar equations but only differ in negative slope.
+The `nodes_to_wrappers` argument is a dictionary that maintains the relationship between a graph node and its output tensor. It acts as a memo so that tensor objects are not re-created for nodes that have already been traversed.
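+
+As a rough mental model only (the actual bookkeeping happens inside `define_tensor`), the memo behaves roughly like the sketch below; `make_tensor_wrapper` is a hypothetical helper used purely for illustration:
+```python
+def get_or_create_wrapper(node, nodes_to_wrappers):
+    # Reuse the wrapper if this node has already been visited.
+    if node in nodes_to_wrappers:
+        return nodes_to_wrappers[node]
+    # Otherwise create one and remember it for later lookups.
+    wrapper = make_tensor_wrapper(node)  # hypothetical helper, not a real API
+    nodes_to_wrappers[node] = wrapper
+    return wrapper
+```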
+ +Now, we can start to fill in function body step by step: +1. Define input activation tensors: + ```python + input_node = node.args[0] + input_tensor = self.get_tensor(input_node, node) + input_tensor_wrapper = self.define_tensor( + input_node, + input_tensor, + PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_NATIVE, + nodes_to_wrappers, + is_input_tensor=True, + ) + ``` + Through the information in [Check Operator Spec](#check-operator-spec) section, we could easily extract the desired nodes.
+   The `get_tensor` method is responsible for retrieving the torch tensor in the correct axis order if the `layout_transform` pass has been applied.
+   The `define_tensor` method generates the tensor object for the QNN API and memoizes it in the aforementioned `nodes_to_wrappers`.
+ And yet, there are arguments worth for addressing more: + - **node**: current graph node + - **tensor**: torch tensor emitted by node + - **tensor_type**: type compatible with QNN SDK, oftenly use `QNN_TENSOR_TYPE_NATIVE` for intermediate outputs and `QNN_TENSOR_TYPE_STATIC` for constant parameters + - **nodes_to_wrappers**: dictionary of graph node and its output tensor (note: the tensor here is not a torch tensor but a wrapped object for QNN) + - **is_input_tensor**: flag to tell if current tensor is input activation or parameter, which is important for fixed point mixed-precision to work properly + - **node_name**: (optional) tensor name for user to specify + - **wrapper_idx**: (optional) defaults to zero if node is not a tuple, otherwise it acts as an indexer to output tensors. e.g. when slicing input tensor into multiple outputs, `wrapper_idx` is necessary for getting correct wrapped tensor object + +2. Define input gamma / beta tensors: + ```python + weight_node = node.args[2] + weight_tensor = get_parameter(weight_node, self.edge_program) + weight_tensor_wrapper = self.define_tensor( + weight_node, + weight_tensor, + PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_STATIC, + nodes_to_wrappers, + is_input_tensor=False, + ) + + bias_node = node.args[3] + bias_tensor = get_parameter(bias_node, self.edge_program) + bias_tensor_wrapper = self.define_tensor( + bias_node, + bias_tensor, + PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_STATIC, + nodes_to_wrappers, + is_input_tensor=False, + ) + ``` + The logic should be similar and straightforward. Please carefully set arguments `tensor_type`, `is_input_tensor` according to tensors' property. + +3. Define parameters: + ```python + normalized_shapes = node.args[1] + if len(normalized_shapes) != 1: + print("QNN only supports normalized output with rank 1") + return + + axes = [len(input_tensor.shape) - 1] + axes_shape = [len(axes)] + epsilon = node.args[4] + ``` + Here you can see the constraint introduced by Qualcomm AI Engine Direct. Unlike PyTorch's LayerNorm operator, QNN can only normalize input into 1-D tensor. Therefore we will have log to remind user and return the program directly, this gesture will be considered as validation failure in partitioner and will fallback this operator to CPU.
+ When passing tensor type parameters via pybind interface, it's also required to ship extra information like tensor shape in list form. e.g. `axes_shape = [len(axes)]`. More details will be provided in coming steps. + +4. Define output tensor: + ```python + output_tensor = self.get_tensor(node, node, 0) + output_tensor_wrapper = self.define_tensor( + node, + output_tensor, + PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_NATIVE, + nodes_to_wrappers, + is_input_tensor=False, + ) + ``` + Althought the input / output activations might map to the graph IOs (a.k.a. user inputs / outputs) with corresponding type `QNN_TENSOR_TYPE_APP_READ` / `QNN_TENSOR_TYPE_APP_WRITE`. Users are still expected to have `QNN_TENSOR_TYPE_NATIVE` for all nodes' IOs and leave the detection logic handled inside `define_tensor` method. + +5. Generate operator object in QNN graph: + ```python + layer_norm_op = PyQnnWrapper.PyQnnOpWrapper( + node.name, + QNN_OP_PACKAGE_NAME_QTI_AISW, + OpLayerNorm.op_name, + ) + ``` + +6. Pass IO tensors to operator object: + ```python + layer_norm_op.AddInputTensors( + [input_tensor_wrapper, weight_tensor_wrapper, bias_tensor_wrapper] + ) + layer_norm_op.AddOutputTensors([output_tensor_wrapper]) + ``` + The IO tensor objects created before are gathered up and shipped to operator object. + +7. Pass parameters to operator object: + ```python + layer_norm_op.AddScalarParam( + OpLayerNorm.param_epsilon, + PyQnnWrapper.Qnn_DataType_t.QNN_DATATYPE_FLOAT_32, + {QCOM_DATA: np.float32(epsilon)}, + ) + layer_norm_op.AddTensorParam( + OpLayerNorm.param_axes, + PyQnnWrapper.Qnn_DataType_t.QNN_DATATYPE_UINT_32, + len(axis_shape), + axis_shape, + np.array(axis, dtype=np.uint32), + True, + ) + ``` + By checking the `Shape` property of parameter in [Qualcomm AI Engine Direct Manual](#qualcomm-ai-engine-direct), it should be clear which API to be used. e.g.: + - "epsilon" > __Shape__: scalar + - "axes" > __Shape__: 1D of shape[M] + + The function signature of AddScalarParam is: + - **name**: string maps to the operator name in Qualcomm AI Engine Direct manual + - **data_type**: type compatible with QNN SDK, e.g. `QNN_DATATYPE_FLOAT_32`, `QNN_DATATYPE_UINT_32`, etc. + - **attr**: dictionary for shipping data, currently only `QCOM_DATA` key is used + + The function signature of AddTensorParam is: + - **name**: string maps to the operator name in Qualcomm AI Engine Direct manual + - **data_type**: type compatible with QNN SDK, e.g. `QNN_DATATYPE_FLOAT_32`, `QNN_DATATYPE_UINT_32`, etc. + - **rank**: dimensions of tensor + - **dims**: shape of tensor + - **data**: tesnor data + - **copy_data**: user should specify to True for constant parameters + +8. Last, return operator object for partitioner to conduct validation: + ```python + return layer_norm_op + ``` + Also update the `__init__.py` for `register_node_visitor` to work properly: + ```python + from . import ( + ... + op_index_put, + # please insert codes in alphabetical order + op_layer_norm, + op_linear, + ... + ) + + __all__ = [ + ... + op_index_put, + # please insert codes in alphabetical order + op_layer_norm, + op_linear, + ... + ] + ``` + +### Quantizer Annotation +The operator now should be functional for Qualcomm backends. For operator to work in fixed-precision, we should also make `QnnQuantizer` to correctly insert observers for recording calibrated encodings. Please read more on the [Quantization Annotation Tutorial](../quantizer//README.md). + +## Issues +Please refer to the [issue section](../README.md#issues) for more information. 
+ +## Pull Requests +Please refer to the [PR section](../README.md#pull-requests) for more information. diff --git a/backends/qualcomm/builders/TARGETS b/backends/qualcomm/builders/TARGETS new file mode 100644 index 00000000000..0a42614a385 --- /dev/null +++ b/backends/qualcomm/builders/TARGETS @@ -0,0 +1,5 @@ +load(":targets.bzl", "define_common_targets") + +oncall("executorch") + +define_common_targets() diff --git a/backends/qualcomm/builders/op_avg_pool2d.py b/backends/qualcomm/builders/op_avg_pool2d.py index 3e10a1918dc..2f7e773b4fb 100644 --- a/backends/qualcomm/builders/op_avg_pool2d.py +++ b/backends/qualcomm/builders/op_avg_pool2d.py @@ -51,8 +51,8 @@ def define_node( filter_size = filter_size + filter_size filter_size_shape = [len(filter_size)] - # stride info - stride = cast(List[int], node.args[2]) + # stride info - default to kernel_size if not given + stride = cast(List[int], node.args[2]) if len(node.args) > 2 else filter_size if len(stride) == 1: stride = stride + stride stride_shape = [len(stride)] diff --git a/backends/qualcomm/builders/op_batch_norm.py b/backends/qualcomm/builders/op_batch_norm.py index 6b2e9ab91d8..9ca299e7432 100644 --- a/backends/qualcomm/builders/op_batch_norm.py +++ b/backends/qualcomm/builders/op_batch_norm.py @@ -26,12 +26,13 @@ class BatchNorm(NodeVisitor): def __init__(self, *args) -> None: super().__init__(*args) - def update_encoding(self, node: torch.fx.Node, tensor: torch.Tensor): + def update_encoding(self, node: torch.fx.Node, tensor: torch.Tensor, eps): if isinstance(tensor, torch._subclasses.FakeTensor): return if quant_attrs := node.meta.get(QCOM_QUANT_ATTRS): - diff = max(abs(tensor.max()), abs(tensor.min())) + # scale value equals to zero will cause failure in HTP + diff = max(abs(tensor.max()), abs(tensor.min())) + eps quant_attrs[QCOM_SCALE] = diff / quant_attrs[QCOM_QUANT_MAX] def define_node( @@ -42,7 +43,7 @@ def define_node( input_node = node.args[0] input_tensor = self.get_tensor(input_node, node) - mean_node, var_node, eps = node.args[3], node.args[4], 1e-5 + mean_node, var_node, eps = node.args[3], node.args[4], 1e-9 mean_tensor = get_parameter(mean_node, self.edge_program) var_tensor = get_parameter(var_node, self.edge_program) @@ -61,7 +62,7 @@ def define_node( amount = (filter_tensor * mean_tensor) / torch.sqrt(var_tensor + eps) bias_tensor = bias_tensor - amount - self.update_encoding(bias_node, bias_tensor) + self.update_encoding(bias_node, bias_tensor, eps) bias_tensor_wrapper = self.define_tensor( bias_node, bias_tensor, @@ -71,7 +72,7 @@ def define_node( ) filter_tensor = filter_tensor / torch.sqrt(var_tensor + eps) - self.update_encoding(filter_node, filter_tensor) + self.update_encoding(filter_node, filter_tensor, eps) filter_tensor_wrapper = self.define_tensor( filter_node, filter_tensor, diff --git a/backends/qualcomm/builders/op_conv2d.py b/backends/qualcomm/builders/op_conv2d.py index 4b58edbac63..b6e70c374e0 100644 --- a/backends/qualcomm/builders/op_conv2d.py +++ b/backends/qualcomm/builders/op_conv2d.py @@ -18,6 +18,7 @@ OpDepthWiseConv2d, OpExpandDims, OpReshape, + OpTransposeConv2d, QNN_OP_PACKAGE_NAME_QTI_AISW, ) from .utils import get_parameter @@ -42,6 +43,9 @@ def _add_conv_op_parameter( padding_shape, dilation, dilation_shape, + output_padding=None, + output_padding_shape=None, + transpose_conv=False, groups=None, ) -> PyQnnWrapper.PyQnnOpWrapper: """ @@ -68,14 +72,26 @@ def _add_conv_op_parameter( ), True, ) - conv_op.AddTensorParam( - OP.param_dilation, - 
PyQnnWrapper.Qnn_DataType_t.QNN_DATATYPE_UINT_32, - len(dilation_shape), - dilation_shape, - np.array(dilation, dtype=np.uint32), - True, - ) + + if transpose_conv: + conv_op.AddTensorParam( + OP.param_output_padding, + PyQnnWrapper.Qnn_DataType_t.QNN_DATATYPE_UINT_32, + len(output_padding_shape), + output_padding_shape, + np.array(output_padding, dtype=np.uint32), + True, + ) + else: + conv_op.AddTensorParam( + OP.param_dilation, + PyQnnWrapper.Qnn_DataType_t.QNN_DATATYPE_UINT_32, + len(dilation_shape), + dilation_shape, + np.array(dilation, dtype=np.uint32), + True, + ) + if groups is not None: conv_op.AddScalarParam( OP.param_group, @@ -94,6 +110,11 @@ def _define_conv1d( Conv1D is a special case for convolutional operation. QNN does not support Conv1D, therefore, we need to cast from input -> Conv1d -> output to input -> unsqueeze -> Conv2d -> squeeze -> output. """ + transpose_conv = cast(bool, node.args[6]) + if transpose_conv: + print("ConvTranspose1d is not yet supported") + return + op_wrapper_list = [] # op_wrapper to return unsqueeze_input_node = node.args[0] input_quant_encoding, input_quant_configs = self.get_quant_encoding_conf( @@ -239,9 +260,9 @@ def define_node( node: torch.fx.Node, nodes_to_wrappers: Dict[str, PyQnnWrapper.TensorWrapper], ) -> PyQnnWrapper.PyQnnOpWrapper: - if get_parameter(node.args[1], self.edge_program).dim() == 3: return self._define_conv1d(node, nodes_to_wrappers) + input_node = node.args[0] input_tensor = self.get_tensor(input_node, node) input_tensor_wrapper = self.define_tensor( @@ -254,8 +275,9 @@ def define_node( filter_node = node.args[1] filter_tensor = get_parameter(filter_node, self.edge_program) - # weight of pytorch OIHW, yet QNN is HWIO - filter_axis_order = (2, 3, 1, 0) + # weight of pytorch OIHW(conv2d) | IOHW(conv_transpose2d), yet QNN is HWIO + is_transpose_conv = cast(bool, node.args[6]) + filter_axis_order = (2, 3, 0, 1) if is_transpose_conv else (2, 3, 1, 0) filter_tensor = filter_tensor.permute(dims=filter_axis_order).contiguous() filter_tensor_wrapper = self.define_tensor( filter_node, @@ -291,6 +313,7 @@ def define_node( stride = cast(List[int], node.args[3]) padding = cast(List[int], node.args[4]) dilation = cast(List[int], node.args[5]) + output_padding = cast(List[int], node.args[7]) groups = cast(int, node.args[8]) # Qnn filter tensor is (H, W, Cin, Cout) @@ -308,57 +331,38 @@ def define_node( if len(padding) == 1: padding = padding + padding - # args[6] = transposed - if cast(bool, node.args[6]): - print("Currently, No support for transposed convolution") - return - - # args[7] = output padding - if not all(out_pad == 0 for out_pad in cast(List[int], node.args[7])): - print("QNN does not support output padding") - return - stride_shape = [len(stride)] padding_shape = [2, 2] dilation_shape = [len(dilation)] + output_padding_shape = [len(output_padding)] if is_depthwise_conv: - conv_op = PyQnnWrapper.PyQnnOpWrapper( - node.name, - QNN_OP_PACKAGE_NAME_QTI_AISW, - OpDepthWiseConv2d.op_name, - ) - conv_op = self._add_conv_op_parameter( - OpDepthWiseConv2d, - conv_op, - conv_input_tensors, - conv_output_tensors, - stride, - stride_shape, - padding, - padding_shape, - dilation, - dilation_shape, - ) - + op_class = OpDepthWiseConv2d + elif is_transpose_conv: + op_class = OpTransposeConv2d else: - conv_op = PyQnnWrapper.PyQnnOpWrapper( - node.name, - QNN_OP_PACKAGE_NAME_QTI_AISW, - OpConv2d.op_name, - ) - conv_op = self._add_conv_op_parameter( - OpConv2d, - conv_op, - conv_input_tensors, - conv_output_tensors, - stride, - 
stride_shape, - padding, - padding_shape, - dilation, - dilation_shape, - groups, - ) + op_class = OpConv2d + + conv_op = PyQnnWrapper.PyQnnOpWrapper( + node.name, + QNN_OP_PACKAGE_NAME_QTI_AISW, + op_class.op_name, + ) + conv_op = self._add_conv_op_parameter( + op_class, + conv_op, + conv_input_tensors, + conv_output_tensors, + stride, + stride_shape, + padding, + padding_shape, + dilation, + dilation_shape, + output_padding, + output_padding_shape, + is_transpose_conv, + None if is_depthwise_conv else groups, + ) return conv_op diff --git a/backends/qualcomm/builders/qnn_constants.py b/backends/qualcomm/builders/qnn_constants.py index 8ac702f2ad5..9c589c76784 100644 --- a/backends/qualcomm/builders/qnn_constants.py +++ b/backends/qualcomm/builders/qnn_constants.py @@ -356,3 +356,12 @@ class OpTile: class OpTranspose: op_name: str = "Transpose" param_perm: str = "perm" + + +@dataclass(init=False, frozen=True) +class OpTransposeConv2d: + op_name: str = "TransposeConv2d" + param_stride: str = "stride" + param_pad_amount: str = "pad_amount" + param_group: str = "group" + param_output_padding: str = "output_padding" diff --git a/backends/qualcomm/builders/targets.bzl b/backends/qualcomm/builders/targets.bzl new file mode 100644 index 00000000000..39159e56cd8 --- /dev/null +++ b/backends/qualcomm/builders/targets.bzl @@ -0,0 +1,25 @@ +load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") + +def define_common_targets(): + """Defines targets that should be shared between fbcode and xplat. + The directory containing this targets.bzl file should also contain both + TARGETS and BUCK files that call this function. + """ + + runtime.python_library( + name = "builders", + srcs = glob([ + "*.py", + ]), + visibility = [ + "@EXECUTORCH_CLIENTS", + ], + deps = [ + "//executorch/exir/backend:backend_details", + "//executorch/exir/backend:compile_spec_schema", + "//executorch/backends/qualcomm/aot/python:PyQnnWrapperAdaptor", + "//executorch/backends/qualcomm/aot/python:PyQnnManagerAdaptor", + "//executorch/backends/qualcomm/utils:utils", + "//executorch/exir:lib", + ], + ) diff --git a/backends/qualcomm/partition/TARGETS b/backends/qualcomm/partition/TARGETS new file mode 100644 index 00000000000..0a42614a385 --- /dev/null +++ b/backends/qualcomm/partition/TARGETS @@ -0,0 +1,5 @@ +load(":targets.bzl", "define_common_targets") + +oncall("executorch") + +define_common_targets() diff --git a/backends/qualcomm/partition/targets.bzl b/backends/qualcomm/partition/targets.bzl new file mode 100644 index 00000000000..72248a6cebb --- /dev/null +++ b/backends/qualcomm/partition/targets.bzl @@ -0,0 +1,24 @@ +load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") + +def define_common_targets(): + """Defines targets that should be shared between fbcode and xplat. + The directory containing this targets.bzl file should also contain both + TARGETS and BUCK files that call this function. 
+ """ + + runtime.python_library( + name = "partition", + srcs = glob([ + "*.py", + ]), + visibility = [ + "@EXECUTORCH_CLIENTS", + ], + deps = [ + "//executorch/exir/backend:backend_details", + "//executorch/exir/backend:compile_spec_schema", + "//executorch/backends/qualcomm/builders:builders", + "//executorch/backends/qualcomm:preprocess", + "//executorch/exir/backend/canonical_partitioners:canonical_partitioner_lib", + ], + ) diff --git a/backends/qualcomm/passes/TARGETS b/backends/qualcomm/passes/TARGETS new file mode 100644 index 00000000000..a824ca9f6e5 --- /dev/null +++ b/backends/qualcomm/passes/TARGETS @@ -0,0 +1,18 @@ +load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") + +oncall("executorch") + +runtime.python_library( + name = "passes", + srcs = glob([ + "*.py", + ]), + visibility = [ + "@EXECUTORCH_CLIENTS", + ], + deps = [ + "//executorch/backends/transforms:addmm_mm_to_linear", + "//executorch/exir/backend:backend_details", + "//executorch/exir/backend:compile_spec_schema", + ], +) diff --git a/backends/qualcomm/passes/convert_to_linear.py b/backends/qualcomm/passes/convert_to_linear.py index 8de89f8f408..e7c4e8f9a92 100644 --- a/backends/qualcomm/passes/convert_to_linear.py +++ b/backends/qualcomm/passes/convert_to_linear.py @@ -109,17 +109,13 @@ def _convert_to_linear( # Since QNN has no keep dims for linear op, we will need to add squeeze and unsqueeze around linear node # TODO: Find a more general conditional statement. - if ( - fn_node.target == self.add - and linear_node.meta["val"].dim() == 3 - and linear_node.meta["val"].shape[0] == 1 - ): - squeeze_dim = linear_node.meta["val"].shape[1:] - linear_node.meta["val"] = torch.squeeze(linear_node.meta["val"], 0) + linear_output = linear_node.meta["val"] + if linear_output.dim() == 3 and linear_output.shape[0] == 1: with gm.graph.inserting_after(input_node): input_users = list(input_node.users.keys()) - squeeze_dim = linear_node.meta["val"].shape - squeeze_view_copy_node = gm.graph.create_node( + input_tensor = input_node.meta["val"] + squeeze_dim = input_tensor.shape[-2:] + squeeze_node = gm.graph.create_node( "call_function", self.view_copy, ( @@ -127,14 +123,19 @@ def _convert_to_linear( squeeze_dim, ), ) - squeeze_view_copy_node.meta = linear_node.meta + # meta needs to be copied elementwisely for fake-tensor + # to be updated correctly and not affect meta of input_node + for k, v in input_node.meta.items(): + squeeze_node.meta[k] = v + squeeze_node.meta["val"] = input_tensor.reshape(squeeze_dim) for user in input_users: if user == linear_node: - user.replace_input_with(input_node, squeeze_view_copy_node) - with gm.graph.inserting_after(output): + user.replace_input_with(input_node, squeeze_node) + + with gm.graph.inserting_after(linear_node): output_users = list(linear_node.users.keys()) - unsqueeze_dim = output.args[0].meta["val"].shape - unsqueeze_view_copy_node = gm.graph.create_node( + unsqueeze_dim = linear_output.shape + unsqueeze_node = gm.graph.create_node( "call_function", self.view_copy, ( @@ -142,16 +143,16 @@ def _convert_to_linear( unsqueeze_dim, ), ) - unsqueeze_view_copy_node.meta = output.args[0].meta + # meta needs to be copied elementwisely for fake-tensor + # to be updated correctly and not affect meta of unsqueeze_node + for k, v in linear_node.meta.items(): + unsqueeze_node.meta[k] = v + # update linear node's shape + linear_node.meta["val"] = linear_output.reshape( + linear_output.shape[-2:] + ) for user in output_users: - user.replace_input_with(linear_node, 
unsqueeze_view_copy_node) - if QCOM_QUANT_ATTRS in linear_node.meta: - squeeze_view_copy_node.meta[QCOM_QUANT_ATTRS] = linear_node.meta[ - QCOM_QUANT_ATTRS - ] - unsqueeze_view_copy_node.meta[QCOM_QUANT_ATTRS] = linear_node.meta[ - QCOM_QUANT_ATTRS - ] + user.replace_input_with(linear_node, unsqueeze_node) def _extract_mm_ops(self, partitioned_nodes: List[edge_op]) -> List[torch.fx.Node]: mm_node = [n for n in partitioned_nodes if n.target == self.mm][0] diff --git a/backends/qualcomm/passes/expand_broadcast_tensor_shape.py b/backends/qualcomm/passes/expand_broadcast_tensor_shape.py new file mode 100644 index 00000000000..277fc9c6ce8 --- /dev/null +++ b/backends/qualcomm/passes/expand_broadcast_tensor_shape.py @@ -0,0 +1,58 @@ +# Copyright (c) Qualcomm Innovation Center, Inc. +# All rights reserved +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import torch +from executorch.exir.dialects._ops import ops as exir_ops +from executorch.exir.pass_base import ExportPass, PassResult +from executorch.exir.passes import dead_code_elimination_pass + + +class ExpandBroadcastTensorShape(ExportPass): + """ + Make tensors have same rank for layout-transform to work properly. + """ + + def __init__(self): + super(ExpandBroadcastTensorShape, self).__init__() + self.broadcast_op_targets = [ + exir_ops.edge.aten.add.Tensor, + exir_ops.edge.aten.sub.Tensor, + exir_ops.edge.aten.mul.Tensor, + exir_ops.edge.aten.div.Tensor, + ] + + def traverse_broadcast_node(self, graph_module: torch.fx.GraphModule): + for node in graph_module.graph.nodes: + if node.target in self.broadcast_op_targets: + for arg in node.args: + input_rank = len(arg.meta["val"].shape) + output_rank = len(node.meta["val"].shape) + if input_rank != output_rank: + with graph_module.graph.inserting_after(arg): + new_rank = [1] * (output_rank - input_rank) + list( + arg.meta["val"].shape + ) + users = list(arg.users.keys()) + reshape_node = graph_module.graph.create_node( + "call_function", + exir_ops.edge.aten.view_copy.default, + (arg, tuple(new_rank)), + ) + # meta needs to be copied elementwisely for fake-tensor + # to be updated correctly and not affect meta of arg + for k, v in arg.meta.items(): + reshape_node.meta[k] = v + reshape_node.meta["val"] = reshape_node.meta["val"].reshape( + new_rank + ) + for user in users: + user.replace_input_with(arg, reshape_node) + + def call(self, graph_module: torch.fx.GraphModule): + self.traverse_broadcast_node(graph_module) + graph_module.recompile() + dead_code_elimination_pass(graph_module) + return PassResult(graph_module, True) diff --git a/backends/qualcomm/quantizer/README.md b/backends/qualcomm/quantizer/README.md new file mode 100644 index 00000000000..6870ecc76ac --- /dev/null +++ b/backends/qualcomm/quantizer/README.md @@ -0,0 +1,189 @@ +# Contribution for Operator Annotation +Thank you for contributing to Qualcomm AI Engine Direct delegate for ExecuTorch. Reading and following these guidelines will help you quickly get the essentials of annotating an operator in `QnnQuantizer` to unblock yourself and land pull requests more efficiently. 
+
+## Sections
+* [References](#references)
+* [Getting Started](#getting-started)
+* [Issues](#issues)
+* [Pull Requests](#pull-requests)
+
+## References
+### Qualcomm AI Engine Direct
+- [Operator Definitions for HTP](https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-50/HtpOpDefSupplement.html)
+
+### PyTorch
+- [ATen Operator Definitions](https://github.com/pytorch/pytorch/tree/main/aten/src/ATen/native)
+
+## Getting Started
+Before extending an operator for quantization annotation, please make sure its operator builder is already well-implemented (learn more in this [tutorial](../builders/README.md)).
+### Behavior of Annotation
+To conduct PTQ on a floating point graph, observers must be inserted after each graph node. The observed numeric ranges are then processed by different algorithms to produce the `scale` and `offset` statistics that represent the data in fixed point.
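+For intuition, a min/max observer turns an observed range into `scale` / `offset` roughly as in the sketch below. This is only an illustrative example (the helper `affine_qparams` is made up for this README); the real observers handle more cases such as symmetric schemes, per-channel axes, and epsilon clamping:
+```python
+def affine_qparams(x_min: float, x_max: float, quant_min: int = 0, quant_max: int = 255):
+    # keep zero representable so zero-valued inputs (e.g. padding) stay exact
+    x_min, x_max = min(x_min, 0.0), max(x_max, 0.0)
+    scale = (x_max - x_min) / (quant_max - quant_min)
+    offset = quant_min - round(x_min / scale)
+    return scale, int(offset)
+
+# an activation observed in [-0.5, 1.5], quantized to 8-bit unsigned
+print(affine_qparams(-0.5, 1.5))  # -> (0.00784..., 64)
+```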

+**The stages can be illustrated as follows**:
+- Floating point `nn.Module` after `torch.export.export`
+  ```mermaid
+  flowchart TB
+      input & kernel & bias --> id1(convolution) --> output
+  ```
+
+- Inserting observers to inspect numeric ranges
+  ```mermaid
+  flowchart TB
+      input --> id2(input_act_obs) --> id1(convolution) --> id3(output_act_obs) --> output
+      kernel --> id4(weight_obs) --> id1(convolution)
+      bias --> id5(bias_obs) --> id1(convolution)
+  ```
+
+- Cascading QDQ pairs after the encodings have been determined
+  ```mermaid
+  flowchart TB
+      input --> id2(Q_i) --> id3(DQ_i) --> id1(convolution) --> id4(Q_o) --> id5(DQ_o) --> output
+      kernel --> id6(Q_k) --> id7(DQ_k) --> id1(convolution)
+      bias --> id8(Q_b) --> id9(DQ_b) --> id1(convolution)
+  ```
+The Qualcomm backend consumes the generated encodings and lowers the operators in fixed precision. This tutorial guides you through the details of inserting observers and some useful utilities.
+
+### Register Annotation via Operator Type
+Let's start by hooking a callback for a designated operator target:
+```python
+def register_annotator(ops: List[OpOverload]):
+    def decorator(annotator: Callable):
+        for op in ops:
+            OP_ANNOTATOR[op] = annotator
+
+    return decorator
+```
+The `register_annotator` decorator provides a convenient way to attach your own annotation logic; it takes a list of operator types as its input argument.
For example, the torch activation functions have `copy` and `in-place` implementations that differ only in naming (an extra `_` postfix) and map to the same [Core ATen](https://pytorch.org/docs/stable/torch.compiler_ir.html) operators after `to_edge`:
+```python
+@register_annotator([torch.ops.aten.relu.default, torch.ops.aten.relu_.default])
+```
+Here `torch.ops.aten.relu.default` / `torch.ops.aten.relu_.default` are the `copy` / `in-place` versions, and both are ultimately converted into `torch.ops.aten.relu.default`.
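+To see how the registered callbacks are consumed, the quantizer essentially walks the exported graph and dispatches on each node's target. The sketch below is a simplification (the helper names `annotate_graph` and `get_quant_config` are illustrative, not the exact `QnnQuantizer` implementation):
+```python
+def annotate_graph(graph_module, get_quant_config):
+    # dispatch every call_function node to its registered annotator, if any
+    for node in graph_module.graph.nodes:
+        if node.op != "call_function":
+            continue
+        annotator = OP_ANNOTATOR.get(node.target)
+        quant_config = get_quant_config(node.target)
+        if annotator is not None and quant_config is not None:
+            annotator(node, quant_config)
+```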

+
+The function signature is defined as follows, with two arguments:
+```python
+def annotate_xxx(node: Node, quantization_config: QuantizationConfig) -> None:
+```
+- __node__: the graph node to be observed
+- __quantization_config__: a data structure describing the quantization configurations for IO activation / weight / bias
+
+### Example of Conv2d Annotation
+Conv2d accepts up to three input tensors: `input activation`, `kernel`, and `bias`. There are constraints imposed by the [Qualcomm AI Engine Direct Manual](https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-50/HtpOpDefSupplement.html#conv2d).
+Taking 8-bit fixed point as an example:
+- __weight__: must be symmetrically quantized if a per-channel observer is applied
+- __bias__: must have `QNN_DATATYPE_SFIXED_POINT_32` and be symmetrically quantized with the expected encoding `scales = weight.scales * input.scale`, `offset = 0` if a per-channel observer is applied.
+
+Let's look at the simplified per-channel quantization configuration used in `QnnQuantizer`:
+```python
+def ptq_per_channel_quant_config(
+    act_dtype=torch.uint8, weight_dtype=torch.int8
+) -> QuantizationConfig:
+    ...
+    act_quantization_spec = QuantizationSpec(
+        dtype=act_dtype,
+        quant_min=torch.iinfo(act_dtype).min,
+        quant_max=torch.iinfo(act_dtype).max,
+        qscheme=torch.per_tensor_affine,
+        observer_or_fake_quant_ctr=MinMaxObserver.with_args(**extra_args),
+    )
+
+    weight_quantization_spec = QuantizationSpec(
+        dtype=torch.int8,
+        quant_min=torch.iinfo(weight_dtype).min + 1,
+        quant_max=torch.iinfo(weight_dtype).max,
+        qscheme=torch.per_channel_symmetric,
+        ch_axis=0,
+        observer_or_fake_quant_ctr=PerChannelMinMaxObserver.with_args(**extra_args),
+    )
+
+    bias_quantization_spec = _derived_bias_quant_spec
+
+    quantization_config = QuantizationConfig(
+        input_activation=act_quantization_spec,
+        output_activation=act_quantization_spec,
+        weight=weight_quantization_spec,
+        bias=bias_quantization_spec,
+    )
+
+    return quantization_config
+```
+Here we choose `torch.uint8` + `MinMaxObserver` for better coverage of the IO activations, and apply rules to `weight` with `PerChannelMinMaxObserver` and to `bias` with `_derived_bias_quant_spec` (a callable that calculates the encoding in the desired way) to meet the aforementioned constraints. The well-defined `quantization_config` will then be shipped to the callback for annotation.
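+Note that the bias constraint above means the bias encoding is derived from the already-chosen input and weight encodings instead of being observed on its own. A minimal sketch of that derivation (illustrative only, not the actual `_derived_bias_quant_spec` implementation):
+```python
+import torch
+
+def derive_bias_qparams(input_scale: float, weight_scales: torch.Tensor):
+    # per-channel bias scale = input_scale * weight_scale, zero offset, stored as int32
+    bias_scales = input_scale * weight_scales
+    bias_zero_points = torch.zeros_like(bias_scales, dtype=torch.int32)
+    return bias_scales, bias_zero_points
+```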
+
+Now, we can start to fill in the function body:
+- Register the annotator
+  ```python
+  @register_annotator(
+      [
+          torch.ops.aten.conv2d.default,
+          torch.ops.aten.conv1d.default,
+          torch.ops.aten.conv_transpose2d.input,
+      ]
+  )
+  def annotate_conv2d(node: Node, quantization_config: QuantizationConfig) -> None:
+  ```
+  Multiple targets are expected to meet the same annotation criteria; registering them in one annotator is encouraged for code reuse.
+
+- Define the map of input quantization specs
+  ```python
+  if _is_annotated([node]):
+      return
+
+  input_qspec_map = {}
+
+  # annotate input activation
+  input_act = node.args[0]
+  input_spec = quantization_config.input_activation
+  input_qspec_map[input_act] = input_spec
+
+  # annotate kernel
+  kernel = node.args[1]
+  input_qspec_map[kernel] = quantization_config.weight
+
+  # annotate bias
+  if len(node.args) > 2:
+      bias = node.args[2]
+      input_qspec_map[bias] = quantization_config.bias(node)
+  ```
+  We first check whether the current graph node has already been annotated. If not, an `input_qspec_map` dictionary required by the PyTorch framework is declared to provide the mapping between graph nodes and their configurations.
+  The parameters' order can be found [here](https://github.com/pytorch/pytorch/blob/main/aten/src/ATen/native/Convolution.cpp), as mentioned in [ATen Operator Definitions](#pytorch). Since the bias node is optional, the implementation invokes `_derived_bias_quant_spec` to calculate the per-channel bias encoding only if it exists.
+
+- Update the node's meta with the framework-compatible data structure
+  ```python
+  node.meta[QUANT_ANNOTATION_KEY] = QuantizationAnnotation(
+      input_qspec_map=input_qspec_map,
+      output_qspec=quantization_config.output_activation,
+      _annotated=True,
+  )
+  ```
+  After `input_qspec_map` has been processed, it must be stored in the node's meta under the special tag `QUANT_ANNOTATION_KEY` so that `convert_pt2e` can properly insert observers.
+
+### Common Annotators
+For operators without extra parameters to be observed, there are pre-defined annotation methods for convenience:
+- Single-in, single-out operators, e.g.:
+  ```python
+  @register_annotator([torch.ops.aten.relu.default, torch.ops.aten.relu_.default])
+  def annotate_relu(node: Node, quantization_config: QuantizationConfig) -> None:
+      annotate_single_in_single_out(node, quantization_config)
+  ```
+
+- Binary-in, single-out operators, e.g.:
+  ```python
+  @register_annotator([torch.ops.aten.add, torch.ops.aten.add.Tensor])
+  def annotate_add(node: Node, quantization_config: QuantizationConfig) -> None:
+      annotate_binary(node, quantization_config)
+  ```
+
+- Shared encodings between input / output, e.g.:
+ ```python + # For operators without arithmetical function, IOs are expected to own the same encodings. + @register_annotator([torch.ops.aten.transpose.int]) + def annotate_transpose(node: Node, quantization_config: QuantizationConfig) -> None: + annotate_in_out_obs_sharing_op(node, quantization_config) + if not _is_annotated([node]): + annotate_single_in_single_out(node, quantization_config) + ``` + This annotator only works for single-in-single-out scenario with node's input that has already been annotated. If not, we still need to invoke `annotate_single_in_single_out` again (this path should be less likely). + +## Issues +Please refer to the [issue section](../README.md#issues) for more information. + +## Pull Requests +Please refer to the [PR section](../README.md#pull-requests) for more information. diff --git a/backends/qualcomm/quantizer/TARGETS b/backends/qualcomm/quantizer/TARGETS new file mode 100644 index 00000000000..0a42614a385 --- /dev/null +++ b/backends/qualcomm/quantizer/TARGETS @@ -0,0 +1,5 @@ +load(":targets.bzl", "define_common_targets") + +oncall("executorch") + +define_common_targets() diff --git a/backends/qualcomm/quantizer/custom_annotation.py b/backends/qualcomm/quantizer/custom_annotation.py index 9cde50b9c70..881d24bbb5e 100644 --- a/backends/qualcomm/quantizer/custom_annotation.py +++ b/backends/qualcomm/quantizer/custom_annotation.py @@ -118,3 +118,29 @@ def annotate_matmul_input1(node: Node, quantization_config: QuantizationConfig): if "SDPA" in full_qualified_name: annotate_matmul(node, quantization_config_16a8w) annotate_matmul_input1(node.args[1], quantization_config_8a8w) + + +def custom_annotate_matmul_16a8w(gm: torch.fx.GraphModule): + """ + Annotate matmul op with 16a8w quantization config + """ + + def annotate_matmul(node: Node, quantization_config: QuantizationConfig): + input_qspec_map = {} + input_act = node.args[0] + input_spec = quantization_config.input_activation + input_qspec_map[input_act] = input_spec + input_act1 = node.args[1] + input_spec1 = quantization_config.weight + input_qspec_map[input_act1] = input_spec1 + node.meta[QUANT_ANNOTATION_KEY] = QuantizationAnnotation( + input_qspec_map=input_qspec_map, + output_qspec=quantization_config.output_activation, + _annotated=True, + ) + + # Annotate 16a8w for matmul op to get better performance + quantization_config_16a8w = get_16a8w_qnn_ptq_config() + for node in gm.graph.nodes: + if node.op == "call_function" and node.target == torch.ops.aten.matmul.default: + annotate_matmul(node, quantization_config_16a8w) diff --git a/backends/qualcomm/quantizer/quantizer.py b/backends/qualcomm/quantizer/quantizer.py index e27edf939c8..1d4b4c2f217 100644 --- a/backends/qualcomm/quantizer/quantizer.py +++ b/backends/qualcomm/quantizer/quantizer.py @@ -26,7 +26,7 @@ get_16a8w_qnn_ptq_config, get_default_16bit_qnn_ptq_config, get_default_8bit_qnn_ptq_config, - get_ptq_per_channel_weight_config, + get_ptq_per_channel_quant_config, OP_ANNOTATOR, QuantizationConfig, ) @@ -72,6 +72,7 @@ def __init__(self): "8bit_act": torch.int8, "16bit_act": torch.int16, } + self.per_channel_quant_config = None def _annotate(self, gm: GraphModule) -> None: for node in gm.graph.nodes: @@ -96,13 +97,17 @@ def _get_quant_config(self, op: str | OpOverload) -> Optional[QuantizationConfig return if op in self.use_per_channel_weight_quant_ops: - if op in self.bit16_quant_ops: - return get_ptq_per_channel_weight_config( - torch.uint16, self.per_channel_weight_dtype["16bit_act"] + if self.per_channel_quant_config is None: + if op in 
self.bit16_quant_ops: + return get_ptq_per_channel_quant_config( + act_dtype=torch.uint16, + weight_dtype=self.per_channel_weight_dtype["16bit_act"], + ) + return get_ptq_per_channel_quant_config( + act_dtype=torch.uint8, + weight_dtype=self.per_channel_weight_dtype["8bit_act"], ) - return get_ptq_per_channel_weight_config( - weight_dtype=self.per_channel_weight_dtype["8bit_act"] - ) + return self.per_channel_quant_config if op in self.bit8_quant_ops: return self.bit8_quant_config diff --git a/backends/qualcomm/quantizer/targets.bzl b/backends/qualcomm/quantizer/targets.bzl new file mode 100644 index 00000000000..a6689012b25 --- /dev/null +++ b/backends/qualcomm/quantizer/targets.bzl @@ -0,0 +1,20 @@ +load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") + +def define_common_targets(): + """Defines targets that should be shared between fbcode and xplat. + + The directory containing this targets.bzl file should also contain both + TARGETS and BUCK files that call this function. + """ + runtime.python_library( + name = "quantizer", + srcs = glob([ + "*.py", + ]), + visibility = [ + "@EXECUTORCH_CLIENTS", + ], + deps = [ + "//executorch/backends/transforms:decompose_sdpa", + ], + ) diff --git a/backends/qualcomm/quantizer/utils.py b/backends/qualcomm/quantizer/utils.py index d3ae1194acd..39da08a31f1 100644 --- a/backends/qualcomm/quantizer/utils.py +++ b/backends/qualcomm/quantizer/utils.py @@ -20,6 +20,7 @@ MinMaxObserver, MovingAverageMinMaxObserver, PerChannelMinMaxObserver, + UniformQuantizationObserverBase, ) from torch.ao.quantization.quantizer import ( @@ -35,6 +36,107 @@ from torch.fx import Node +class ParamObserver(UniformQuantizationObserverBase): + def __init__( + self, + ch_axis=0, + use_mse=True, + steps=100, + dtype=torch.int8, + qscheme=torch.per_channel_symmetric, + reduce_range=False, + quant_min=None, + quant_max=None, + factory_kwargs=None, + eps=torch.finfo(torch.float32).eps, # noqa: B008 + is_dynamic=False, + **kwargs, + ) -> None: + super().__init__( + dtype=dtype, + qscheme=qscheme, + reduce_range=reduce_range, + quant_min=quant_min, + quant_max=quant_max, + factory_kwargs=factory_kwargs, + eps=eps, + is_dynamic=is_dynamic, + **kwargs, + ) + + factory_kwargs = torch.nn.factory_kwargs(factory_kwargs) + self.register_buffer("min_val", torch.tensor(float("inf"), **factory_kwargs)) + self.register_buffer("max_val", torch.tensor(float("-inf"), **factory_kwargs)) + self.ch_axis = ch_axis + self.use_mse = use_mse + self.steps = steps + self.calibrated = False + + def to_ch_axis(self, x): + axis_order = list(range(len(x.size()))) + axis_order[self.ch_axis], axis_order[0] = 0, self.ch_axis + return torch.flatten(x.permute(axis_order), start_dim=1) + + def mse(self, pred, expect): + loss = (pred - expect).abs().pow(2) + return self.to_ch_axis(loss).mean(1) + + def cosine(self, pred, expect): + target = torch.ones(pred.shape[self.ch_axis]) + pred_n = self.to_ch_axis(pred).reshape(pred.shape[0], -1) + expect_n = self.to_ch_axis(expect).reshape(expect.shape[0], -1) + return torch.nn.CosineEmbeddingLoss()(pred_n, expect_n, target) + + def loss_fn(self, x, new_min, new_max): + scale, offset = self._calculate_qparams(new_min, new_max) + x_q = torch.fake_quantize_per_channel_affine( + x, + scale.data, + offset.data.int(), + self.ch_axis, + self.quant_min, + self.quant_max, + ) + return self.mse(x_q, x) if self.use_mse else self.cosine(x_q, x) + + def line_search(self, x): + x_min, x_max = torch.aminmax(self.to_ch_axis(x), dim=1) + x_range = torch.max(x_min.abs(), 
x_max) + optimal_loss = torch.zeros_like(x_min) + 1e9 + + # check which clip range could produce smallest loss + for i in range(1, self.steps + 1): + thres = x_range / self.steps * i + current_loss = self.loss_fn(x, -thres, thres) + x_min = torch.where(current_loss < optimal_loss, -thres, x_min) + x_max = torch.where(current_loss < optimal_loss, thres, x_max) + optimal_loss = torch.min(current_loss, optimal_loss) + + return x_min, x_max + + def forward(self, x_orig): + # since params are static, one calibration is enough + if not self.calibrated: + x = x_orig.detach().to(self.min_val.dtype) + self.min_val, self.max_val = self.line_search(x) + self.calibrated = True + + # return fake-quant result for saturating outliers + scale, zero_point = self._calculate_qparams(self.min_val, self.max_val) + return torch.fake_quantize_per_channel_affine( + x_orig, + scale.data, + zero_point.data.int(), + self.ch_axis, + self.quant_min, + self.quant_max, + ) + + @torch.jit.export + def calculate_qparams(self): + return self._calculate_qparams(self.min_val, self.max_val) + + @dataclass(eq=True, frozen=True) class QuantizationConfig: input_activation: Optional[QuantizationSpec] @@ -235,7 +337,7 @@ def get_default_16bit_qnn_ptq_config( return quantization_config -def get_ptq_per_channel_weight_config( +def get_ptq_per_channel_quant_config( act_dtype=torch.uint8, weight_dtype=torch.int8 ) -> QuantizationConfig: extra_args: Dict[str, Any] = {"eps": 2**-12} @@ -585,7 +687,7 @@ def annotate_prelu(node: Node, quantization_config: QuantizationConfig) -> None: annotate_single_in_single_out(node, quantization_config) -@register_annotator([torch.ops.aten.view.default]) +@register_annotator([torch.ops.aten.view.default, torch.ops.aten._unsafe_view.default]) def annotate_view(node: Node, quantization_config: QuantizationConfig) -> None: annotate_in_out_obs_sharing_op(node, quantization_config) if not _is_annotated([node]): @@ -941,7 +1043,13 @@ def annotate_bmm(node: Node, quantization_config: QuantizationConfig) -> None: node.meta["source_fn_stack"] = [(node, torch.bmm)] -@register_annotator([torch.ops.aten.conv2d.default, torch.ops.aten.conv1d.default]) +@register_annotator( + [ + torch.ops.aten.conv2d.default, + torch.ops.aten.conv1d.default, + torch.ops.aten.conv_transpose2d.input, + ] +) def annotate_conv2d(node: Node, quantization_config: QuantizationConfig) -> None: if _is_annotated([node]): return diff --git a/backends/qualcomm/runtime/QnnExecuTorch.h b/backends/qualcomm/runtime/QnnExecuTorch.h index 45525726ca7..dabd4cdde5f 100644 --- a/backends/qualcomm/runtime/QnnExecuTorch.h +++ b/backends/qualcomm/runtime/QnnExecuTorch.h @@ -44,7 +44,7 @@ struct CustomMemTensorInfo { size_t tensor_bytes; uint32_t* shape; uint32_t rank; - torch::executor::ScalarType dtype; + exec_aten::ScalarType dtype; }; /// Allocate specific tensors (usually graph inputs and outputs) on shared diff --git a/backends/qualcomm/runtime/QnnExecuTorchBackend.cpp b/backends/qualcomm/runtime/QnnExecuTorchBackend.cpp index 36512c4ff21..f5c9473411e 100644 --- a/backends/qualcomm/runtime/QnnExecuTorchBackend.cpp +++ b/backends/qualcomm/runtime/QnnExecuTorchBackend.cpp @@ -213,8 +213,10 @@ Error QnnExecuTorchBackend::execute( } ET_CHECK_OR_RETURN_ERROR( - qnn_manager->Execute(input_tensor_structs, output_tensor_structs) == - Error::Ok, + qnn_manager->Execute( + input_tensor_structs, + output_tensor_structs, + context.event_tracer()) == Error::Ok, Internal, "Fail to execute graph"); ET_CHECK_OR_RETURN_ERROR( diff --git 
a/backends/qualcomm/runtime/QnnManager.cpp b/backends/qualcomm/runtime/QnnManager.cpp index 38245ca7f96..f4275f0ab3d 100644 --- a/backends/qualcomm/runtime/QnnManager.cpp +++ b/backends/qualcomm/runtime/QnnManager.cpp @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include @@ -57,9 +58,7 @@ QnnManager::QnnManager( "backend_type: %s", EnumNameQnnExecuTorchBackendType(backend_type)); QNN_EXECUTORCH_LOG_INFO("graph_name: %s", options_->graph_name()->c_str()); QNN_EXECUTORCH_LOG_INFO("library_path: %s", library_path.c_str()); - QNN_EXECUTORCH_LOG_INFO( - "tensor_dump_output_path: %s", - options_->tensor_dump_output_path()->c_str()); + QNN_EXECUTORCH_LOG_INFO("dump intermediate outputs: %s", IsTensorDump()); QNN_EXECUTORCH_LOG_INFO( "log_level: %s", EnumNameQnnExecuTorchLogLevel(options_->log_level())); QNN_EXECUTORCH_LOG_INFO( @@ -366,7 +365,8 @@ Error QnnManager::AllocateTensor( Error QnnManager::Execute( const std::vector& input_tensor_structs, - std::vector& output_tensor_structs) { + std::vector& output_tensor_structs, + EventTracer* event_tracer) { Qnn_ErrorHandle_t error = QNN_SUCCESS; error = backend_params_ptr_->qnn_graph_ptr_->GraphExecute( @@ -377,30 +377,27 @@ Error QnnManager::Execute( "qnn_graph_execute failed. Error %d", QNN_GET_ERROR_CODE(error)); return Error::Internal; } - if (IsTensorDump()) { // TODO: Need to handle the graph which is partitioned. // Maybe we could use graph name. - std::string dir = options_->tensor_dump_output_path()->str() + "/Result/"; - CreateDirectory(dir); - QNN_EXECUTORCH_LOG_INFO("Dump tensor to the path: %s", dir.c_str()); for (std::size_t out_idx = 0; out_idx < output_tensor_structs.size(); ++out_idx) { const Qnn_Tensor_t& output_tensor = output_tensor_structs[out_idx]; - - std::string output_path = - dir + QNN_VER_PTR(output_tensor)->name + "_tensor.raw"; - - std::ofstream fout(output_path, std::ios::binary); - if (fout.fail()) { - QNN_EXECUTORCH_LOG_ERROR( - "Dump tensor name: %s Failed.", QNN_VER_PTR(output_tensor)->name); - return Error::Internal; - } - - fout.write( - static_cast(QNN_VER_PTR(output_tensor)->clientBuf.data), - QNN_VER_PTR(output_tensor)->clientBuf.dataSize); + std::vector sizes( + QNN_VER_PTR(output_tensor)->dimensions, + QNN_VER_PTR(output_tensor)->dimensions + + QNN_VER_PTR(output_tensor)->rank); + + auto dump_tensor = executorch::extension::from_blob( + QNN_VER_PTR(output_tensor)->clientBuf.data, + sizes, + qnn_dtype_to_scalar_type_[QNN_VER_PTR(output_tensor)->dataType]); + + torch::executor::event_tracer_log_output_delegate( + event_tracer, + QNN_VER_PTR(output_tensor)->name, + /*delegate_debug_id=*/static_cast(-1), + *dump_tensor); } } diff --git a/backends/qualcomm/runtime/QnnManager.h b/backends/qualcomm/runtime/QnnManager.h index 5190f6768b7..3d1cc3863aa 100644 --- a/backends/qualcomm/runtime/QnnManager.h +++ b/backends/qualcomm/runtime/QnnManager.h @@ -37,7 +37,8 @@ class QnnManager { Error Execute( const std::vector& input_tensor_structs, - std::vector& output_tensor_structs); + std::vector& output_tensor_structs, + EventTracer* event_tracer); Error ProfileExecuteData(EventTracer* event_tracer); @@ -52,7 +53,7 @@ class QnnManager { } bool IsTensorDump() { - return options_->tensor_dump_output_path()->size() > 0; + return options_->dump_intermediate_outputs(); } bool IsNodeSupportedByBackend( diff --git a/backends/qualcomm/runtime/SharedBuffer.cpp b/backends/qualcomm/runtime/SharedBuffer.cpp index 3fa62d09cdb..2b2a729835c 100644 --- a/backends/qualcomm/runtime/SharedBuffer.cpp +++ 
b/backends/qualcomm/runtime/SharedBuffer.cpp @@ -25,7 +25,7 @@ std::size_t std::hash::operator()( hash_val ^= info.shape[i]; } hash_val ^= std::hash()(info.rank); - hash_val ^= std::hash()(info.dtype); + hash_val ^= std::hash()(info.dtype); return hash_val; } diff --git a/backends/qualcomm/runtime/backends/QnnProfiler.cpp b/backends/qualcomm/runtime/backends/QnnProfiler.cpp index fa5829d23b8..ae336a800b6 100644 --- a/backends/qualcomm/runtime/backends/QnnProfiler.cpp +++ b/backends/qualcomm/runtime/backends/QnnProfiler.cpp @@ -7,7 +7,6 @@ */ #include -#include namespace torch { namespace executor { diff --git a/backends/qualcomm/runtime/targets.bzl b/backends/qualcomm/runtime/targets.bzl index 77449e95e2a..f3b868892fa 100644 --- a/backends/qualcomm/runtime/targets.bzl +++ b/backends/qualcomm/runtime/targets.bzl @@ -25,10 +25,10 @@ def define_common_targets(): deps = [ "fbsource//third-party/qualcomm/qnn:api", "//executorch/runtime/backend:interface", - "//executorch/runtime/core:core", ], exported_deps = [ "//executorch/backends/qualcomm:schema", + "//executorch/runtime/core:core", ], ) @@ -55,6 +55,9 @@ def define_common_targets(): link_whole = True, # needed for executorch/examples/models/llama2:main to register QnnBackend platforms = [ANDROID], visibility = ["@EXECUTORCH_CLIENTS"], + resources = { + "qnn_lib": "fbsource//third-party/qualcomm/qnn/qnn-2.25:qnn_offline_compile_libs", + }, deps = [ "fbsource//third-party/qualcomm/qnn:api", ":logging", @@ -63,5 +66,10 @@ def define_common_targets(): "//executorch/backends/qualcomm/aot/wrappers:wrappers", "//executorch/runtime/backend:interface", "//executorch/runtime/core:core", + "//executorch/extension/tensor:tensor", + ], + exported_deps = [ + "//executorch/runtime/core/exec_aten/util:scalar_type_util", + "//executorch/runtime/core:event_tracer", ], ) diff --git a/backends/qualcomm/scripts/build.sh b/backends/qualcomm/scripts/build.sh index 5f77a747404..ed77a873516 100755 --- a/backends/qualcomm/scripts/build.sh +++ b/backends/qualcomm/scripts/build.sh @@ -79,7 +79,7 @@ if [ "$BUILD_AARCH64" = true ]; then -DCMAKE_INSTALL_PREFIX=$BUILD_ROOT \ -DCMAKE_BUILD_TYPE=$BUILD_TYPE \ -DEXECUTORCH_BUILD_QNN=ON \ - -DEXECUTORCH_BUILD_SDK=ON \ + -DEXECUTORCH_BUILD_DEVTOOLS=ON \ -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ -DEXECUTORCH_ENABLE_EVENT_TRACER=ON \ @@ -123,7 +123,7 @@ if [ "$BUILD_X86_64" = true ]; then -DCMAKE_INSTALL_PREFIX=$BUILD_ROOT \ -DQNN_SDK_ROOT=${QNN_SDK_ROOT} \ -DEXECUTORCH_BUILD_QNN=ON \ - -DEXECUTORCH_BUILD_SDK=ON \ + -DEXECUTORCH_BUILD_DEVTOOLS=ON \ -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ -DEXECUTORCH_ENABLE_EVENT_TRACER=ON \ @@ -135,6 +135,8 @@ if [ "$BUILD_X86_64" = true ]; then rm -f $PRJ_ROOT/backends/qualcomm/python/* cp -fv $BUILD_ROOT/backends/qualcomm/Py* "$PRJ_ROOT/backends/qualcomm/python" + cp -fv "$PRJ_ROOT/schema/program.fbs" "$PRJ_ROOT/exir/_serialize/program.fbs" + cp -fv "$PRJ_ROOT/schema/scalar_type.fbs" "$PRJ_ROOT/exir/_serialize/scalar_type.fbs" EXAMPLE_ROOT=examples/qualcomm CMAKE_PREFIX_PATH="${BUILD_ROOT}/lib/cmake/ExecuTorch;${BUILD_ROOT}/third-party/gflags;" diff --git a/examples/sdk/sdk_example_runner/TARGETS b/backends/qualcomm/serialization/TARGETS similarity index 100% rename from examples/sdk/sdk_example_runner/TARGETS rename to backends/qualcomm/serialization/TARGETS diff --git a/backends/qualcomm/serialization/qnn_compile_spec_schema.py b/backends/qualcomm/serialization/qnn_compile_spec_schema.py index 
338f61997ea..09d910aba58 100644 --- a/backends/qualcomm/serialization/qnn_compile_spec_schema.py +++ b/backends/qualcomm/serialization/qnn_compile_spec_schema.py @@ -33,6 +33,7 @@ class QcomChipset(IntEnum): SM8450 = 36 # v69 SM8475 = 42 # v69 SM8550 = 43 # v73 + SSG2115P = 46 # v73 SM8650 = 57 # v75 @@ -47,6 +48,7 @@ class SocInfo: QcomChipset.SM8475: SocInfo(QcomChipset.SM8475, HtpInfo(HtpArch.V69, 8)), QcomChipset.SM8550: SocInfo(QcomChipset.SM8550, HtpInfo(HtpArch.V73, 8)), QcomChipset.SM8650: SocInfo(QcomChipset.SM8650, HtpInfo(HtpArch.V75, 8)), + QcomChipset.SSG2115P: SocInfo(QcomChipset.SSG2115P, HtpInfo(HtpArch.V73, 2)), } @@ -129,7 +131,7 @@ class QnnExecuTorchOptions: library_path: str = "" log_level: QnnExecuTorchLogLevel = QnnExecuTorchLogLevel.kLogOff online_prepare: bool = False - tensor_dump_output_path: str = "" + dump_intermediate_outputs: bool = False profile_level: QnnExecuTorchProfileLevel = QnnExecuTorchProfileLevel.kProfileOff shared_buffer: bool = False is_from_context_binary: bool = False diff --git a/backends/qualcomm/serialization/schema.fbs b/backends/qualcomm/serialization/schema.fbs index 4288c83b130..f2275377f7b 100644 --- a/backends/qualcomm/serialization/schema.fbs +++ b/backends/qualcomm/serialization/schema.fbs @@ -32,6 +32,7 @@ enum QcomChipset: int { SM8450 = 36, SM8475 = 42, SM8550 = 43, + SSG2115P = 46, SM8650 = 57, } @@ -164,15 +165,13 @@ table QnnExecuTorchOptions { /// Check if on-device graph construction. Default is false. online_prepare:bool; - /// Tensor dump output path. If a path is given, Delegate would write - /// outputs of each OP there. - /// In ALL cases, we don't recommend to set this option. - /// This option exist just for debugging some accuracy issues. - tensor_dump_output_path:string; + /// If tensor dump is enabled, all intermediate tensors output will be dumped. + /// This option exists for debugging accuracy issues. Default is off. + dump_intermediate_outputs:bool; /// Profiling level of the delegate and the backend. Default is off. profile_level:QnnExecuTorchProfileLevel; - + /// Enables usage of shared buffer between application and backend for graph I/O. shared_buffer:bool; diff --git a/backends/qualcomm/serialization/targets.bzl b/backends/qualcomm/serialization/targets.bzl new file mode 100644 index 00000000000..c3c571109e7 --- /dev/null +++ b/backends/qualcomm/serialization/targets.bzl @@ -0,0 +1,31 @@ +load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") +load("@fbcode_macros//build_defs:export_files.bzl", "export_file") + +def define_common_targets(): + """Defines targets that should be shared between fbcode and xplat. + The directory containing this targets.bzl file should also contain both + TARGETS and BUCK files that call this function. + """ + + export_file( + name = "qnn_schema", + src = "schema.fbs", + visibility = ["//executorch/backends/qualcomm/serialization/..."], + ) + + runtime.python_library( + name = "serialization", + srcs = glob([ + "*.py", + ]), + resources = { + ":qnn_schema": "schema.fbs", + }, + visibility = [ + "@EXECUTORCH_CLIENTS", + ], + deps = [ + "//executorch/exir/backend:backend_details", + "//executorch/exir/backend:compile_spec_schema", + ], + ) diff --git a/backends/qualcomm/targets.bzl b/backends/qualcomm/targets.bzl index 55fe390f6b0..1435d41f8db 100644 --- a/backends/qualcomm/targets.bzl +++ b/backends/qualcomm/targets.bzl @@ -4,7 +4,6 @@ load( ) load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") - # Construct the input and output file names. 
All input and output files rely on scalar_type file. SCHEMA_NAME = "schema" @@ -55,6 +54,7 @@ def define_common_targets(): [OUTPUT_SCHEMA_HEADER], OUTPUT_SCHEMA_HEADER, ) + # Header-only library target with the generate executorch program schema header. runtime.cxx_library( name = "schema", @@ -76,7 +76,6 @@ def define_common_targets(): platforms = [ANDROID], ) - runtime.cxx_library( name = "qnn_executorch_backend", srcs = [], diff --git a/backends/qualcomm/tests/models.py b/backends/qualcomm/tests/models.py index e448a219284..ee3d6cf93a7 100644 --- a/backends/qualcomm/tests/models.py +++ b/backends/qualcomm/tests/models.py @@ -361,6 +361,46 @@ def forward(self, x): return self.conv(x) +class ConvTranspose2dSingle(torch.nn.Module): + def __init__(self, bias=True): + super().__init__() + self.conv_transpose = torch.nn.ConvTranspose2d( + in_channels=1, + out_channels=3, + kernel_size=3, + stride=2, + padding=1, + bias=bias, + ) + + def forward(self, x): + return self.conv_transpose(x) + + +class Conv2dDownUpSample(torch.nn.Module): + def __init__(self, bias=True): + super().__init__() + self.conv = torch.nn.Conv2d( + in_channels=16, + out_channels=16, + kernel_size=3, + stride=2, + padding=1, + bias=bias, + ) + self.conv_transpose = torch.nn.ConvTranspose2d( + in_channels=16, + out_channels=16, + kernel_size=3, + stride=2, + padding=1, + bias=bias, + ) + + def forward(self, x): + return self.conv_transpose(self.conv(x)) + + class Conv2dSumReduceDim(torch.nn.Module): def __init__(self): super().__init__() diff --git a/backends/qualcomm/tests/test_qnn_delegate.py b/backends/qualcomm/tests/test_qnn_delegate.py index d17fce2b839..20233991fac 100644 --- a/backends/qualcomm/tests/test_qnn_delegate.py +++ b/backends/qualcomm/tests/test_qnn_delegate.py @@ -68,7 +68,7 @@ def setUp(self): debug=False, saver=False, online_prepare=TestQNN.online_prepare, - tensor_dump_output_path="", + dump_intermediate_outputs=TestQNN.dump_intermediate_outputs, profile=TestQNN.enable_profile, shared_buffer=TestQNN.shared_buffer, ) @@ -130,6 +130,16 @@ def test_qnn_backend_conv2d(self): with self.subTest(i=i): self.lower_module_and_test_output(module, sample_input) + def test_qnn_backend_conv_transpose2d(self): + modules = [ + ConvTranspose2dSingle(), # noqa: F405 + ConvTranspose2dSingle(bias=False), # noqa: F405 + ] + sample_input = (torch.randn([1, 1, 3, 3]),) + for i, module in enumerate(modules): + with self.subTest(i=i): + self.lower_module_and_test_output(module, sample_input) + def test_qnn_backend_element_wise_add(self): test_comb = [ { @@ -340,14 +350,12 @@ def test_qnn_backend_mean_dim(self): with self.subTest(i=i): self.lower_module_and_test_output(module, sample_input) - @unittest.skip("failed to lower in QNN 2.25") + @unittest.skip("failed to lower in QNN 2.26") def test_qnn_backend_mha(self): module = MultiheadAttention() # noqa: F405 sample_input = (torch.randn(1, 197, 96),) self.lower_module_and_test_output(module, sample_input) - # fp16 pad op might hit corner case in runtime - @unittest.expectedFailure def test_qnn_backend_pad(self): module = Pad() # noqa: F405 sample_input = (torch.randn([1, 8, 128]),) @@ -492,7 +500,7 @@ def setUp(self): debug=False, saver=False, online_prepare=TestQNN.online_prepare, - tensor_dump_output_path="", + dump_intermediate_outputs=TestQNN.dump_intermediate_outputs, profile=TestQNN.enable_profile, shared_buffer=TestQNN.shared_buffer, ) @@ -523,6 +531,11 @@ def test_qnn_backend_conv2d_cat(self): sample_input = (torch.randn(1, 3, 5, 5), torch.randn(1, 3, 5, 5)) 
self.lower_module_and_test_output(module, sample_input) + def test_qnn_backend_conv2d_down_up_sample(self): + module = Conv2dDownUpSample() # noqa: F405 + sample_input = (torch.randn(1, 16, 224, 224),) + self.lower_module_and_test_output(module, sample_input) + def test_qnn_backend_conv2d_max_pool2d(self): module = Conv2dMaxPool2d() # noqa: F405 sample_input = (torch.rand(1, 2, 14, 14),) @@ -606,7 +619,7 @@ def setUp(self): debug=False, saver=False, online_prepare=TestQNN.online_prepare, - tensor_dump_output_path="", + dump_intermediate_outputs=TestQNN.dump_intermediate_outputs, profile=TestQNN.enable_profile, shared_buffer=TestQNN.shared_buffer, ) @@ -631,6 +644,7 @@ def test_qnn_backend_16a4w_linear(self): ) self.lower_module_and_test_output(module, sample_input) + @unittest.skip("segfault happens in QNN 2.26") def test_qnn_backend_16a4w_per_channel_linear(self): module = Linear(use_bias=False) # noqa: F405 sample_input = (torch.randn([3, 4]),) @@ -714,6 +728,17 @@ def test_qnn_backend_conv2d(self): module = self.get_qdq_module(module, sample_input) self.lower_module_and_test_output(module, sample_input) + def test_qnn_backend_conv_transpose2d(self): + modules = [ + ConvTranspose2dSingle(), # noqa: F405 + ConvTranspose2dSingle(bias=False), # noqa: F405 + ] # noqa: F405 + sample_input = (torch.randn([1, 1, 3, 3]),) + for i, module in enumerate(modules): + with self.subTest(i=i): + module = self.get_qdq_module(module, sample_input) + self.lower_module_and_test_output(module, sample_input) + def test_qnn_backend_element_wise_add(self): test_comb = [ { @@ -1122,7 +1147,7 @@ def setUp(self): debug=False, saver=False, online_prepare=TestQNN.online_prepare, - tensor_dump_output_path="", + dump_intermediate_outputs=TestQNN.dump_intermediate_outputs, profile=TestQNN.enable_profile, shared_buffer=TestQNN.shared_buffer, ) @@ -1158,6 +1183,12 @@ def test_qnn_backend_conv2d_cat(self): module = self.get_qdq_module(module, sample_input) self.lower_module_and_test_output(module, sample_input) + def test_qnn_backend_conv2d_down_up_sample(self): + module = Conv2dDownUpSample() # noqa: F405 + sample_input = (torch.randn(1, 16, 224, 224),) + module = self.get_qdq_module(module, sample_input) + self.lower_module_and_test_output(module, sample_input) + def test_qnn_backend_conv2d_max_pool2d(self): module = Conv2dMaxPool2d() # noqa: F405 sample_input = (torch.rand(1, 2, 14, 14),) @@ -1288,6 +1319,22 @@ def setUp(self): saver=False, ) + def test_qnn_backend_dump_intermediate_outputs(self): + backend_options = generate_htp_compiler_spec(use_fp16=True) + TestQNN.compiler_specs = generate_qnn_executorch_compiler_spec( + soc_model=self.arch_table[TestQNN.model], + backend_options=backend_options, + dump_intermediate_outputs=True, + ) + module = Relu() # noqa: F405 + sample_input = (torch.randn([2, 5, 1, 3]),) + self.lower_module_and_test_output( + module, + sample_input, + expected_partitions=1, + expected_intermediate_events=3, + ) + def test_qnn_backend_skip_node_id(self): module = SimpleModel() # noqa: F405 sample_input = (torch.ones(1, 32, 28, 28), torch.ones(1, 32, 28, 28)) @@ -1399,7 +1446,6 @@ def test_qnn_backend_online_prepare(self): sample_input = (torch.ones(1, 32, 28, 28), torch.ones(1, 32, 28, 28)) self.lower_module_and_test_output(module, sample_input) - @unittest.skip("segfault happens in recent torch.export.export") def test_qnn_backend_context_direct(self): with tempfile.TemporaryDirectory() as tmp_dir: module = ContextBinaryExample() # noqa: F405 @@ -1443,6 +1489,23 @@ def setUp(self): 
saver=False, ) + def test_qnn_backend_dump_intermediate_outputs(self): + backend_options = generate_htp_compiler_spec(use_fp16=False) + TestQNN.compiler_specs = generate_qnn_executorch_compiler_spec( + soc_model=self.arch_table[TestQNN.model], + backend_options=backend_options, + dump_intermediate_outputs=True, + ) + module = Relu() # noqa: F405 + sample_input = (torch.randn([2, 5, 1, 3]),) + module = self.get_qdq_module(module, sample_input) + self.lower_module_and_test_output( + module, + sample_input, + expected_partitions=1, + expected_intermediate_events=5, + ) + def test_qnn_backend_skip_node_id_partitioner(self): module = SimpleModel() # noqa: F405 sample_input = (torch.ones(1, 32, 28, 28), torch.ones(1, 32, 28, 28)) @@ -1670,7 +1733,6 @@ def test_qnn_backend_online_prepare(self): module = self.get_qdq_module(module, sample_input) self.lower_module_and_test_output(module, sample_input) - @unittest.skip("segfault happens in recent torch.export.export") def test_qnn_backend_context_direct(self): with tempfile.TemporaryDirectory() as tmp_dir: module = ContextBinaryExample() # noqa: F405 @@ -1712,13 +1774,12 @@ def required_envs(self, conditions=None) -> bool: ] ) - def test_fbnet(self): + def test_dino_v2(self): if not self.required_envs([self.image_dataset]): self.skipTest("missing required envs") - cmds = [ "python", - f"{self.executorch_root}/examples/qualcomm/oss_scripts/fbnet.py", + f"{self.executorch_root}/examples/qualcomm/oss_scripts/dino_v2.py", "--dataset", self.image_dataset, "--artifact", @@ -1745,18 +1806,16 @@ def test_fbnet(self): if "Error" in msg: self.fail(msg["Error"]) else: - self.assertGreaterEqual(msg["top_1"], 60) - self.assertGreaterEqual(msg["top_5"], 90) + self.assertGreaterEqual(msg["top_1"], 70) + self.assertGreaterEqual(msg["top_5"], 85) - def test_gMLP(self): - if not self.required_envs([self.image_dataset]): + def test_esrgan(self): + if not self.required_envs(): self.skipTest("missing required envs") cmds = [ "python", - f"{self.executorch_root}/examples/qualcomm/oss_scripts/gMLP_image_classification.py", - "--dataset", - self.image_dataset, + f"{self.executorch_root}/examples/qualcomm/oss_scripts/esrgan.py", "--artifact", self.artifact_dir, "--build_folder", @@ -1765,6 +1824,9 @@ def test_gMLP(self): self.device, "--model", self.model, + "--default_dataset", + "--oss_repo", + self.oss_repo, "--ip", self.ip, "--port", @@ -1781,17 +1843,17 @@ def test_gMLP(self): if "Error" in msg: self.fail(msg["Error"]) else: - self.assertGreaterEqual(msg["top_1"], 60) - self.assertGreaterEqual(msg["top_5"], 90) + self.assertGreaterEqual(msg["PSNR"], 24) + self.assertGreaterEqual(msg["SSIM"], 0.8) - def test_regnet(self): - if not self.required_envs([self.image_dataset]): + def test_fastvit(self): + if not self.required_envs( + [self.image_dataset, self.pretrained_weight, self.oss_repo] + ): self.skipTest("missing required envs") - - weights = ["regnet_y_400mf", "regnet_x_400mf"] cmds = [ "python", - f"{self.executorch_root}/examples/qualcomm/oss_scripts/regnet.py", + f"{self.executorch_root}/examples/qualcomm/oss_scripts/fastvit.py", "--dataset", self.image_dataset, "--artifact", @@ -1802,6 +1864,10 @@ def test_regnet(self): self.device, "--model", self.model, + "--oss_repo", + self.oss_repo, + "--pretrained_weight", + self.pretrained_weight, "--ip", self.ip, "--port", @@ -1810,27 +1876,26 @@ def test_regnet(self): if self.host: cmds.extend(["--host", self.host]) - for weight in weights: - p = subprocess.Popen( - cmds + ["--weights", weight], stdout=subprocess.DEVNULL 
- ) - with Listener((self.ip, self.port)) as listener: - conn = listener.accept() - p.communicate() - msg = json.loads(conn.recv()) - if "Error" in msg: - self.fail(msg["Error"]) - else: - self.assertGreaterEqual(msg["top_1"], 60) - self.assertGreaterEqual(msg["top_5"], 85) + p = subprocess.Popen(cmds, stdout=subprocess.DEVNULL) + with Listener((self.ip, self.port)) as listener: + conn = listener.accept() + p.communicate() + msg = json.loads(conn.recv()) + if "Error" in msg: + self.fail(msg["Error"]) + else: + self.assertGreaterEqual(msg["top_1"], 60) + self.assertGreaterEqual(msg["top_5"], 80) - def test_ssd300_vgg16(self): - if not self.required_envs([self.pretrained_weight, self.oss_repo]): + def test_fbnet(self): + if not self.required_envs([self.image_dataset]): self.skipTest("missing required envs") cmds = [ "python", - f"{self.executorch_root}/examples/qualcomm/oss_scripts/ssd300_vgg16.py", + f"{self.executorch_root}/examples/qualcomm/oss_scripts/fbnet.py", + "--dataset", + self.image_dataset, "--artifact", self.artifact_dir, "--build_folder", @@ -1839,10 +1904,6 @@ def test_ssd300_vgg16(self): self.device, "--model", self.model, - "--oss_repo", - self.oss_repo, - "--pretrained_weight", - self.pretrained_weight, "--ip", self.ip, "--port", @@ -1859,14 +1920,16 @@ def test_ssd300_vgg16(self): if "Error" in msg: self.fail(msg["Error"]) else: - self.assertGreaterEqual(msg["mAP"], 0.70) + self.assertGreaterEqual(msg["top_1"], 60) + self.assertGreaterEqual(msg["top_5"], 90) - def test_dino_v2(self): + def test_gMLP(self): if not self.required_envs([self.image_dataset]): self.skipTest("missing required envs") + cmds = [ "python", - f"{self.executorch_root}/examples/qualcomm/oss_scripts/dino_v2.py", + f"{self.executorch_root}/examples/qualcomm/oss_scripts/gMLP_image_classification.py", "--dataset", self.image_dataset, "--artifact", @@ -1893,16 +1956,58 @@ def test_dino_v2(self): if "Error" in msg: self.fail(msg["Error"]) else: - self.assertGreaterEqual(msg["top_1"], 70) - self.assertGreaterEqual(msg["top_5"], 85) + self.assertGreaterEqual(msg["top_1"], 60) + self.assertGreaterEqual(msg["top_5"], 90) - def test_esrgan(self): - if not self.required_envs(): + def test_regnet(self): + if not self.required_envs([self.image_dataset]): self.skipTest("missing required envs") + weights = ["regnet_y_400mf", "regnet_x_400mf"] cmds = [ "python", - f"{self.executorch_root}/examples/qualcomm/oss_scripts/esrgan.py", + f"{self.executorch_root}/examples/qualcomm/oss_scripts/regnet.py", + "--dataset", + self.image_dataset, + "--artifact", + self.artifact_dir, + "--build_folder", + self.build_folder, + "--device", + self.device, + "--model", + self.model, + "--ip", + self.ip, + "--port", + str(self.port), + ] + if self.host: + cmds.extend(["--host", self.host]) + + for weight in weights: + p = subprocess.Popen( + cmds + ["--weights", weight], stdout=subprocess.DEVNULL + ) + with Listener((self.ip, self.port)) as listener: + conn = listener.accept() + p.communicate() + msg = json.loads(conn.recv()) + if "Error" in msg: + self.fail(msg["Error"]) + else: + self.assertGreaterEqual(msg["top_1"], 60) + self.assertGreaterEqual(msg["top_5"], 85) + + def test_squeezenet(self): + if not self.required_envs([self.image_dataset]): + self.skipTest("missing required envs") + + cmds = [ + "python", + f"{self.executorch_root}/examples/qualcomm/oss_scripts/squeezenet.py", + "--dataset", + self.image_dataset, "--artifact", self.artifact_dir, "--build_folder", @@ -1911,9 +2016,6 @@ def test_esrgan(self): self.device, "--model", 
self.model, - "--default_dataset", - "--oss_repo", - self.oss_repo, "--ip", self.ip, "--port", @@ -1930,18 +2032,16 @@ def test_esrgan(self): if "Error" in msg: self.fail(msg["Error"]) else: - self.assertGreaterEqual(msg["PSNR"], 24) - self.assertGreaterEqual(msg["SSIM"], 0.8) + self.assertGreaterEqual(msg["top_1"], 45) + self.assertGreaterEqual(msg["top_5"], 70) - def test_squeezenet(self): - if not self.required_envs([self.image_dataset]): + def test_ssd300_vgg16(self): + if not self.required_envs([self.pretrained_weight, self.oss_repo]): self.skipTest("missing required envs") cmds = [ "python", - f"{self.executorch_root}/examples/qualcomm/oss_scripts/squeezenet.py", - "--dataset", - self.image_dataset, + f"{self.executorch_root}/examples/qualcomm/oss_scripts/ssd300_vgg16.py", "--artifact", self.artifact_dir, "--build_folder", @@ -1950,6 +2050,10 @@ def test_squeezenet(self): self.device, "--model", self.model, + "--oss_repo", + self.oss_repo, + "--pretrained_weight", + self.pretrained_weight, "--ip", self.ip, "--port", @@ -1966,8 +2070,7 @@ def test_squeezenet(self): if "Error" in msg: self.fail(msg["Error"]) else: - self.assertGreaterEqual(msg["top_1"], 40) - self.assertGreaterEqual(msg["top_5"], 70) + self.assertGreaterEqual(msg["mAP"], 0.70) class TestExampleQaihubScript(TestQNN): @@ -2721,6 +2824,7 @@ def setup_environment(): TestQNN.oss_repo = args.oss_repo TestQNN.shared_buffer = args.shared_buffer TestQNN.enable_x86_64 = args.enable_x86_64 + TestQNN.dump_intermediate_outputs = args.dump_intermediate_outputs return sys.argv[:1] + ns_args diff --git a/backends/qualcomm/tests/utils.py b/backends/qualcomm/tests/utils.py index 0d9e1a69679..27f071ed823 100644 --- a/backends/qualcomm/tests/utils.py +++ b/backends/qualcomm/tests/utils.py @@ -27,8 +27,7 @@ QcomChipset, ) from executorch.backends.qualcomm.utils.utils import capture_program -from executorch.devtools import generate_etrecord -from executorch.devtools.inspector import Inspector +from executorch.devtools import generate_etrecord, Inspector from executorch.examples.qualcomm.utils import ( generate_inputs, make_output_dir, @@ -119,6 +118,7 @@ class TestQNN(unittest.TestCase): model: QcomChipset = None compiler_specs: List[CompileSpec] = None arch_table = { + "SSG2115P": QcomChipset.SSG2115P, "SM8650": QcomChipset.SM8650, "SM8550": QcomChipset.SM8550, "SM8475": QcomChipset.SM8475, @@ -181,13 +181,14 @@ def _save_model_and_expected_output( return input_list, ref_outputs, pte_fname - def verify_output( + def verify_output( # noqa: C901 self, module: torch.nn.Module, sample_inputs: Tuple[torch.Tensor], executorch_prog: ExecutorchProgram | LoweredBackendModule, etrecord_path: str = "etrecord.bin", expected_profile_events: int = -1, + expected_intermediate_events: int = -1, ): with tempfile.TemporaryDirectory() as tmp_dir: buffer = ( @@ -211,6 +212,7 @@ def verify_output( output_dir = f"{tmp_dir}/outputs" outputs = [] etdump_path = f"{tmp_dir}/etdump.etdp" + debug_output_path = f"{tmp_dir}/debug_output.bin" def post_process(): for i, f in enumerate(sorted(os.listdir(output_dir))): @@ -225,6 +227,16 @@ def validate_profile(): len(inspector.to_dataframe().index) == expected_profile_events ) + def validate_intermediate_tensor(): + inspector = Inspector( + etdump_path=etdump_path, debug_buffer_path=debug_output_path + ) + for event_block in inspector.event_blocks: + if event_block.name == "Execute": + self.assertTrue( + len(event_block.events) == expected_intermediate_events + ) + if self.enable_x86_64: generate_inputs(tmp_dir, 
"input_list.txt", [sample_inputs], input_list) make_output_dir(output_dir) @@ -251,6 +263,8 @@ def validate_profile(): "--output_folder_path", f"{output_dir}", ] + if expected_intermediate_events != -1: + cmd.append("--dump_intermediate_outputs") env = dict(os.environ) env["LD_LIBRARY_PATH"] = f"{qnn_sdk}/lib/{target}/:{build_folder}/lib" @@ -277,6 +291,9 @@ def validate_profile(): # Verify the etdump if expected_profile_events != -1: validate_profile() + + if expected_intermediate_events != -1: + validate_intermediate_tensor() else: adb = SimpleADB( qnn_sdk=os.getenv("QNN_SDK_ROOT"), @@ -287,6 +304,9 @@ def validate_profile(): host_id=self.host, soc_model=self.model, error_only=self.error_only, + dump_intermediate_outputs=( + True if expected_intermediate_events != -1 else False + ), ) adb.push(inputs=[sample_inputs], input_list=input_list) adb.execute() @@ -296,12 +316,20 @@ def validate_profile(): if expected_profile_events != -1: adb.pull_etdump(etdump_path, callback=validate_profile) + if expected_intermediate_events != -1: + adb.pull_debug_output( + etdump_path, + debug_output_path, + callback=validate_intermediate_tensor, + ) + def lower_module_and_test_output( self, module: torch.nn.Module, sample_inputs: Tuple[torch.Tensor], expected_partitions: int = 1, expected_profile_events: int = -1, + expected_intermediate_events: int = -1, assert_output_equal: bool = True, skip_node_id_set: set = None, skip_node_op_set: set = None, @@ -325,7 +353,6 @@ def lower_module_and_test_output( # Therefore, won't want to pre-allocate # by memory manager in runtime. memory_planning_pass=MemoryPlanningPass( - memory_planning_algo="greedy", alloc_graph_input=not self.shared_buffer, alloc_graph_output=not self.shared_buffer, ), @@ -346,11 +373,19 @@ def lower_module_and_test_output( etrecord_path = "etrecord.bin" if self.enable_profile: generate_etrecord(etrecord_path, edge_copy, exec_prog) - # Check numerics - if assert_output_equal or expected_profile_events != -1: + if ( + assert_output_equal + or expected_profile_events != -1 + or expected_intermediate_events != -1 + ): self.verify_output( - module, sample_inputs, exec_prog, etrecord_path, expected_profile_events + module, + sample_inputs, + exec_prog, + etrecord_path, + expected_profile_events, + expected_intermediate_events, ) def get_qdq_module( diff --git a/backends/cadence/cadence_runner/TARGETS b/backends/qualcomm/utils/TARGETS similarity index 90% rename from backends/cadence/cadence_runner/TARGETS rename to backends/qualcomm/utils/TARGETS index 21f36a9baea..2341af9282f 100644 --- a/backends/cadence/cadence_runner/TARGETS +++ b/backends/qualcomm/utils/TARGETS @@ -3,6 +3,6 @@ load(":targets.bzl", "define_common_targets") -oncall("odai_jarvis") +oncall("executorch") define_common_targets() diff --git a/backends/qualcomm/utils/constants.py b/backends/qualcomm/utils/constants.py index 9875c9f5afb..7a0be8b0703 100644 --- a/backends/qualcomm/utils/constants.py +++ b/backends/qualcomm/utils/constants.py @@ -25,6 +25,7 @@ QCOM_SCALE_OFFSET = "scale_offset" QCOM_ZERO_POINT = "zero_point" QCOM_ZERO_POINTS = "zero_points" +QCOM_PASS_EXPAND_BROADCAST_SHAPE = "expand_broadcast_shape" # constants in backends/qualcomm/tests QCOM_ANNOTATION = "annotation" diff --git a/backends/qualcomm/utils/targets.bzl b/backends/qualcomm/utils/targets.bzl new file mode 100644 index 00000000000..c76ef7f1906 --- /dev/null +++ b/backends/qualcomm/utils/targets.bzl @@ -0,0 +1,22 @@ +load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") + +def 
define_common_targets(): + """Defines targets that should be shared between fbcode and xplat. + The directory containing this targets.bzl file should also contain both + TARGETS and BUCK files that call this function. + """ + + runtime.python_library( + name = "utils", + srcs = glob([ + "*.py", + ]), + visibility = [ + "@EXECUTORCH_CLIENTS", + ], + deps = [ + "//executorch/exir/backend:backend_details", + "//executorch/exir/backend:compile_spec_schema", + "//executorch/backends/qualcomm/serialization:serialization", + ], + ) diff --git a/backends/qualcomm/utils/utils.py b/backends/qualcomm/utils/utils.py index 2a954f90d24..357619e3863 100644 --- a/backends/qualcomm/utils/utils.py +++ b/backends/qualcomm/utils/utils.py @@ -5,8 +5,9 @@ # LICENSE file in the root directory of this source tree. import operator +import warnings from collections import OrderedDict -from typing import Callable, Dict, List, Tuple +from typing import Callable, Dict, List, Set, Tuple import executorch.backends.qualcomm.python.PyQnnManagerAdaptor as PyQnnManagerAdaptor @@ -33,6 +34,9 @@ ) from executorch.backends.qualcomm.passes.convert_prelu import ConvertPReLU from executorch.backends.qualcomm.passes.convert_to_linear import ConvertToLinear +from executorch.backends.qualcomm.passes.expand_broadcast_tensor_shape import ( + ExpandBroadcastTensorShape, +) from executorch.backends.qualcomm.passes.fold_qdq import FoldQDQ from executorch.backends.qualcomm.passes.i64_to_i32 import I64toI32 from executorch.backends.qualcomm.passes.layout_transform import LayoutTransform @@ -60,7 +64,10 @@ convert_to_flatbuffer, convert_to_option, ) -from executorch.backends.qualcomm.utils.constants import QCOM_QNN_COMPILE_SPEC +from executorch.backends.qualcomm.utils.constants import ( + QCOM_PASS_EXPAND_BROADCAST_SHAPE, + QCOM_QNN_COMPILE_SPEC, +) from executorch.exir import ExirExportedProgram from executorch.exir.backend.compile_spec_schema import CompileSpec @@ -267,7 +274,10 @@ def get_decomp_table() -> Dict[torch._ops.OperatorBase, Callable]: return source_decompositions -def _transform(edge_program: ExportedProgram) -> None: +def _transform( + edge_program: ExportedProgram, custom_pass_config: Set[str] = None +) -> None: + custom_pass_config = custom_pass_config or {} # currently ExirExportedProgram.transform does not accept # changes of input number which was caused by FoldQDQ # apply passes one by one here to avoid IR capture failure @@ -284,6 +294,10 @@ def _transform(edge_program: ExportedProgram) -> None: AnnotateAndQuantScalar(edge_program)(graph_module) AnnotateDecomposed(edge_program)(graph_module) FoldQDQ()(graph_module) + # This pass is not necessary for networks without layout-sensitive ops; + # enabling it by default would introduce overhead from extra view_copy nodes. + if QCOM_PASS_EXPAND_BROADCAST_SHAPE in custom_pass_config: + ExpandBroadcastTensorShape()(graph_module) LayoutTransform(edge_program)(graph_module) ReplaceIndexPutInput(edge_program)(graph_module) @@ -298,6 +312,7 @@ def _transform(edge_program: ExportedProgram) -> None: def capture_program( module: torch.nn.Module, inputs: Tuple[torch.Tensor], + custom_pass_config: Set[str] = None, ) -> exir.ExirExportedProgram: ep = torch.export.export(module, inputs) decomposed_ep = ep.run_decompositions(get_decomp_table()) @@ -309,7 +324,7 @@ def capture_program( core_ep = ExirExportedProgram(decomposed_ep, False) core_ep.transform(ConvertBinaryOpsWithScalar()) edge_ep = core_ep.to_edge(qnn_edge_config()) - _transform(edge_ep.exported_program) +
_transform(edge_ep.exported_program, custom_pass_config) return edge_ep @@ -734,7 +749,7 @@ def generate_qnn_executorch_compiler_spec( debug: bool = False, saver: bool = False, online_prepare: bool = False, - tensor_dump_output_path: str = "", + dump_intermediate_outputs: bool = False, profile: bool = False, shared_buffer: bool = False, is_from_context_binary: bool = False, @@ -756,10 +771,8 @@ saver: Instead of compiling the model, run QNN Saver. Please check documents of Qualcomm AI Engine Direct SDK. This feature is usually for debugging purpose. - tensor_dump_output_path: If a path is given, Delegate would write - outputs of each OP there in runtime. In ALL cases, - we don't recommend to set this option. This option exist just - for debugging some accuracy issues. + dump_intermediate_outputs: If enabled, all intermediate tensor outputs will be dumped. + This option exists for debugging accuracy issues. profile: Enable profile the performance of per operator. Note that for now only support kProfileDetailed to profile the performance of each operator with cycle unit. @@ -777,6 +790,13 @@ if soc_model not in _supported_soc_models: raise ValueError(f"unknown SoC model for QNN: {soc_model}") + if profile and dump_intermediate_outputs: + warnings.warn( + "It is not recommended to turn on both profiling and dump_intermediate_outputs at the same time" + ", because dump_intermediate_outputs will cause a performance drop.", + stacklevel=1, + ) + qnn_executorch_options = QnnExecuTorchOptions( _soc_info_table[soc_model], backend_options ) @@ -787,12 +807,11 @@ else QnnExecuTorchLogLevel.kLogLevelWarn ) + qnn_executorch_options.dump_intermediate_outputs = dump_intermediate_outputs + if saver: qnn_executorch_options.library_path = "libQnnSaver.so" - if len(tensor_dump_output_path.strip()) != 0: - qnn_executorch_options.tensor_dump_output_path = tensor_dump_output_path - if profile: qnn_executorch_options.profile_level = ( QnnExecuTorchProfileLevel.kProfileDetailed diff --git a/backends/transforms/TARGETS b/backends/transforms/TARGETS index df50e45f099..0a42614a385 100644 --- a/backends/transforms/TARGETS +++ b/backends/transforms/TARGETS @@ -1,197 +1,5 @@ -load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") +load(":targets.bzl", "define_common_targets") oncall("executorch") -runtime.python_library( - name = "lib", - srcs = [ - "__init__.py", - ], - visibility = [ - "//executorch/backends/...", - ], - deps = [ - ":addmm_mm_to_linear", - ], -) - -runtime.python_library( - name = "addmm_mm_to_linear", - srcs = ["addmm_mm_to_linear.py"], - visibility = [ - "//executorch/backends/...", - ], - deps = [ - "//caffe2:torch", - "//executorch/exir:pass_base", - "//executorch/exir:sym_util", - "//executorch/exir/dialects:lib", - ], -) - -runtime.python_library( - name = "decompose_sdpa", - srcs = ["decompose_sdpa.py"], - visibility = [ - "//executorch/backends/...", - "@EXECUTORCH_CLIENTS", - ], - deps = [ - "//caffe2:torch", - "//executorch/exir:pass_base", - ], -) - -runtime.python_library( - name = "fuse_batch_norm_with_conv", - srcs = ["fuse_batch_norm_with_conv.py"], - visibility = [ - "//executorch/backends/...", - ], - deps = [ - ":utils", - "//caffe2:torch", - "//executorch/exir:pass_base", - "//executorch/exir:sym_util", - "//executorch/exir/dialects:lib", - ], -) - -runtime.python_library( - name = "fuse_conv_with_clamp", - srcs =
["fuse_conv_with_clamp.py"], - visibility = [ - "//executorch/backends/...", - ], - deps = [ - ":utils", - "//caffe2:torch", - "//executorch/backends/vulkan/passes:custom_ops_defs", - "//executorch/exir:pass_base", - "//executorch/exir:sym_util", - "//executorch/exir/dialects:lib", - ], -) - -runtime.python_library( - name = "fuse_dequant_linear", - srcs = ["fuse_dequant_linear.py"], - visibility = [ - "//executorch/backends/...", - ], - deps = [ - ":utils", - "//caffe2:torch", - "//executorch/exir:pass_base", - "//executorch/exir:sym_util", - "//executorch/exir/dialects:lib", - ], -) - -runtime.python_library( - name = "view_copy_to_squeeze_unsqueeze", - srcs = ["view_copy_to_squeeze_unsqueeze.py"], - visibility = [ - "//executorch/backends/...", - ], - deps = [ - ":utils", - "//caffe2:torch", - "//executorch/exir:pass_base", - "//executorch/exir/dialects:lib", - ], -) - -runtime.python_library( - name = "fuse_view_copy", - srcs = ["fuse_view_copy.py"], - visibility = [ - "//executorch/backends/...", - ], - deps = [ - "//caffe2:torch", - "//executorch/exir:pass_base", - "//executorch/exir/dialects:lib", - ], -) - -runtime.python_library( - name = "remove_clone_ops", - srcs = ["remove_clone_ops.py"], - visibility = [ - "//executorch/backends/...", - ], - deps = [ - "//caffe2:torch", - "//executorch/exir:pass_base", - "//executorch/exir/dialects:lib", - ], -) - -runtime.python_library( - name = "mean_to_sum_div", - srcs = ["mean_to_sum_div.py"], - visibility = [ - "//executorch/backends/...", - ], - deps = [ - "//caffe2:torch", - "//executorch/exir:pass_base", - "//executorch/exir:sym_util", - "//executorch/exir/dialects:lib", - ], -) - -runtime.python_library( - name = "utils", - srcs = ["utils.py"], - deps = [ - "//caffe2:torch", - "//executorch/exir:lib", - "//executorch/exir:pass_manager", - "//executorch/exir/backend/canonical_partitioners:canonical_partitioner_lib", - "//executorch/exir/dialects:lib", - "//pytorch/ao:torchao", # @manual - ], -) - -runtime.python_library( - name = "duplicate_dynamic_quant_chain", - srcs = ["duplicate_dynamic_quant_chain.py"], - visibility = [ - "//executorch/backends/...", - "//executorch/examples/...", - "//executorch/extension/llm/...", - "@EXECUTORCH_CLIENTS", - ], - deps = [ - "//caffe2:torch", - ], -) - -runtime.python_library( - name = "convert_dtype_pass", - srcs = [ - "convert_dtype_pass.py", - ], - visibility = [ - "//executorch/backends/...", - ], - deps = [ - "//caffe2:torch", - "//executorch/exir:pass_base", - ], -) - -runtime.python_test( - name = "test_duplicate_dynamic_quant_chain", - srcs = [ - "test/test_duplicate_dynamic_quant_chain.py", - ], - deps = [ - "fbsource//third-party/pypi/expecttest:expecttest", # @manual - ":duplicate_dynamic_quant_chain", - "//caffe2:torch", - "//executorch/exir:lib", - ], -) +define_common_targets() diff --git a/backends/transforms/targets.bzl b/backends/transforms/targets.bzl new file mode 100644 index 00000000000..458a2d71bb5 --- /dev/null +++ b/backends/transforms/targets.bzl @@ -0,0 +1,201 @@ +load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") + +def define_common_targets(): + """Defines targets that should be shared between fbcode and xplat. + The directory containing this targets.bzl file should also contain both + TARGETS and BUCK files that call this function. 
+ """ + + runtime.python_library( + name = "lib", + srcs = [ + "__init__.py", + ], + visibility = [ + "//executorch/backends/...", + ], + deps = [ + ":addmm_mm_to_linear", + ], + ) + + runtime.python_library( + name = "addmm_mm_to_linear", + srcs = ["addmm_mm_to_linear.py"], + visibility = [ + "//executorch/backends/...", + ], + deps = [ + "//caffe2:torch", + "//executorch/exir:pass_base", + "//executorch/exir:sym_util", + "//executorch/exir/dialects:lib", + ], + ) + + runtime.python_library( + name = "decompose_sdpa", + srcs = ["decompose_sdpa.py"], + visibility = [ + "//executorch/backends/...", + "@EXECUTORCH_CLIENTS", + ], + deps = [ + "//caffe2:torch", + "//executorch/exir:pass_base", + ], + ) + + runtime.python_library( + name = "fuse_batch_norm_with_conv", + srcs = ["fuse_batch_norm_with_conv.py"], + visibility = [ + "//executorch/backends/...", + ], + deps = [ + ":utils", + "//caffe2:torch", + "//executorch/exir:pass_base", + "//executorch/exir:sym_util", + "//executorch/exir/dialects:lib", + ], + ) + + runtime.python_library( + name = "fuse_conv_with_clamp", + srcs = ["fuse_conv_with_clamp.py"], + visibility = [ + "//executorch/backends/...", + ], + deps = [ + ":utils", + "//caffe2:torch", + "//executorch/backends/vulkan/passes:custom_ops_defs", + "//executorch/exir:pass_base", + "//executorch/exir:sym_util", + "//executorch/exir/dialects:lib", + ], + ) + + runtime.python_library( + name = "fuse_dequant_linear", + srcs = ["fuse_dequant_linear.py"], + visibility = [ + "//executorch/backends/...", + ], + deps = [ + ":utils", + "//caffe2:torch", + "//executorch/exir:pass_base", + "//executorch/exir:sym_util", + "//executorch/exir/dialects:lib", + ], + ) + + runtime.python_library( + name = "view_copy_to_squeeze_unsqueeze", + srcs = ["view_copy_to_squeeze_unsqueeze.py"], + visibility = [ + "//executorch/backends/...", + ], + deps = [ + ":utils", + "//caffe2:torch", + "//executorch/exir:pass_base", + "//executorch/exir/dialects:lib", + ], + ) + + runtime.python_library( + name = "fuse_view_copy", + srcs = ["fuse_view_copy.py"], + visibility = [ + "//executorch/backends/...", + ], + deps = [ + "//caffe2:torch", + "//executorch/exir:pass_base", + "//executorch/exir/dialects:lib", + ], + ) + + runtime.python_library( + name = "remove_clone_ops", + srcs = ["remove_clone_ops.py"], + visibility = [ + "//executorch/backends/...", + ], + deps = [ + "//caffe2:torch", + "//executorch/exir:pass_base", + "//executorch/exir/dialects:lib", + ], + ) + + runtime.python_library( + name = "mean_to_sum_div", + srcs = ["mean_to_sum_div.py"], + visibility = [ + "//executorch/backends/...", + ], + deps = [ + "//caffe2:torch", + "//executorch/exir:pass_base", + "//executorch/exir:sym_util", + "//executorch/exir/dialects:lib", + ], + ) + + runtime.python_library( + name = "utils", + srcs = ["utils.py"], + deps = [ + "//caffe2:torch", + "//executorch/exir:lib", + "//executorch/exir:pass_manager", + "//executorch/exir/backend/canonical_partitioners:canonical_partitioner_lib", + "//executorch/exir/dialects:lib", + "//pytorch/ao:torchao", # @manual + ], + ) + + runtime.python_library( + name = "duplicate_dynamic_quant_chain", + srcs = ["duplicate_dynamic_quant_chain.py"], + visibility = [ + "//executorch/backends/...", + "//executorch/examples/...", + "//executorch/extension/llm/...", + "@EXECUTORCH_CLIENTS", + ], + deps = [ + "//caffe2:torch", + ], + ) + + runtime.python_library( + name = "convert_dtype_pass", + srcs = [ + "convert_dtype_pass.py", + ], + visibility = [ + "//executorch/backends/...", + ], + deps 
= [ + "//caffe2:torch", + "//executorch/exir:pass_base", + ], + ) + + runtime.python_test( + name = "test_duplicate_dynamic_quant_chain", + srcs = [ + "test/test_duplicate_dynamic_quant_chain.py", + ], + deps = [ + "fbsource//third-party/pypi/expecttest:expecttest", # @manual + ":duplicate_dynamic_quant_chain", + "//caffe2:torch", + "//executorch/exir:lib", + ], + ) diff --git a/backends/vulkan/README.md b/backends/vulkan/README.md index bc5a674970f..013833fc837 100644 --- a/backends/vulkan/README.md +++ b/backends/vulkan/README.md @@ -137,12 +137,12 @@ compile the Vulkan Compute Library's GLSL compute shaders. The Vulkan Delegate libraries can be built by setting `-DEXECUTORCH_BUILD_VULKAN=ON` when building with CMake. -First, make sure that you have the Android NDK installed - Android NDK r25c is +First, make sure that you have the Android NDK installed - Android NDK 26.3.11579264 is recommended. The Android SDK should also be installed so that you have access to `adb`. ```shell -# Recommended version is Android NDK r25c. +# Recommended version is Android NDK 26.3.11579264. export ANDROID_NDK= # Select an appropriate Android ABI export ANDROID_ABI=arm64-v8a diff --git a/backends/vulkan/docs/android_demo.md b/backends/vulkan/docs/android_demo.md index 8570859ed34..0ddf885bb64 100644 --- a/backends/vulkan/docs/android_demo.md +++ b/backends/vulkan/docs/android_demo.md @@ -28,7 +28,7 @@ Tutorial in order to install the specified versions of the Android NDK and the Android SDK. ```shell -# Recommended version is Android NDK r25c. +# Recommended version is Android NDK 26.3.11579264. export ANDROID_NDK= # Select an appropriate Android ABI export ANDROID_ABI=arm64-v8a diff --git a/backends/vulkan/partitioner/supported_ops.py b/backends/vulkan/partitioner/supported_ops.py index ca7ce72caed..054b6947517 100644 --- a/backends/vulkan/partitioner/supported_ops.py +++ b/backends/vulkan/partitioner/supported_ops.py @@ -47,7 +47,8 @@ def __contains__(self, op): operator.getitem, ] -BINARY_OPS = [ +SUPPORTS_DYNAMIC_SHAPE = [ + # Binary broadcasting exir_ops.edge.aten.add.Tensor, exir_ops.edge.aten.sub.Tensor, exir_ops.edge.aten.minimum.default, @@ -55,9 +56,7 @@ def __contains__(self, op): exir_ops.edge.aten.div.Tensor, exir_ops.edge.aten.div.Tensor_mode, exir_ops.edge.aten.pow.Tensor_Tensor, -] - -UNARY_OPS = [ + # Unary elementwise exir_ops.edge.aten.abs.default, exir_ops.edge.aten.clamp.default, exir_ops.edge.aten.cos.default, @@ -71,60 +70,47 @@ def __contains__(self, op): exir_ops.edge.aten.sin.default, exir_ops.edge.aten.sqrt.default, exir_ops.edge.aten.tanh.default, -] - -MATMUL_OPS = [ + # Matrix Multiplication exir_ops.edge.aten.bmm.default, exir_ops.edge.aten.mm.default, exir_ops.edge.aten.addmm.default, exir_ops.edge.aten.linear.default, -] - -POOLING_OPS = [ + # Reduction + exir_ops.edge.aten._log_softmax.default, + exir_ops.edge.aten._softmax.default, + # 2D Pooling exir_ops.edge.aten.avg_pool2d.default, exir_ops.edge.aten.max_pool2d_with_indices.default, -] - -CONVOLUTION_OPS = [ + # Convolution exir_ops.edge.aten.convolution.default, exir_ops.edge.et_vk.conv_with_clamp.default, ] -REDUCTION_OPS = [ +NO_DYNAMIC_SHAPE = [ + # Reduction exir_ops.edge.aten.mean.dim, exir_ops.edge.aten.sum.dim_IntList, - exir_ops.edge.aten._log_softmax.default, - exir_ops.edge.aten._softmax.default, -] - -NORMALIZATION_OPS = [ + # Normalization exir_ops.edge.aten._native_batch_norm_legit_no_training.default, exir_ops.edge.aten.native_layer_norm.default, -] - -SHAPE_MANIPULATION_OPS = [ + # Shape Manipulation 
exir_ops.edge.aten.squeeze_copy.dims, exir_ops.edge.aten.unsqueeze_copy.default, exir_ops.edge.aten.view_copy.default, exir_ops.edge.aten.permute_copy.default, exir_ops.edge.aten.t_copy.default, -] - -INDEXING_OPS = [ + # Indexing and lookup exir_ops.edge.aten.embedding.default, + exir_ops.edge.aten.flip.default, exir_ops.edge.aten.index_select.default, exir_ops.edge.aten.select_copy.int, exir_ops.edge.aten.slice_copy.Tensor, -] - -ORCHESTRATION_OPS = [ + # Tensor combination exir_ops.edge.aten.cat.default, exir_ops.edge.aten.split_with_sizes_copy.default, exir_ops.edge.aten.split.Tensor, exir_ops.edge.aten.repeat.default, -] - -CREATION_OPS = [ + # Tensor creation exir_ops.edge.aten.arange.start_step, exir_ops.edge.aten.clone.default, exir_ops.edge.aten.constant_pad_nd.default, @@ -139,39 +125,20 @@ def __contains__(self, op): ] -def register_prim_ops(ops: OpList): - for op in PRIM_OPS: - ops[op].supports_texture = True - ops[op].supports_buffer = True - ops[op].supports_dynamic_shape = True +def enumerate_supported_ops(): + ops = OpList() + # Register in order of least to most capabilities -def register_no_dynamic_shape_ops(ops: OpList): - for op in [ - *REDUCTION_OPS, - *NORMALIZATION_OPS, - *SHAPE_MANIPULATION_OPS, - *INDEXING_OPS, - *ORCHESTRATION_OPS, - *CREATION_OPS, - ]: + for op in NO_DYNAMIC_SHAPE: ops[op].supports_dynamic_shape = False - -def register_dynamic_shape_ops(ops: OpList): - for op in [ - *BINARY_OPS, - *UNARY_OPS, - *MATMUL_OPS, - *POOLING_OPS, - *CONVOLUTION_OPS, - ]: + for op in SUPPORTS_DYNAMIC_SHAPE: ops[op].supports_dynamic_shape = True + for op in PRIM_OPS: + ops[op].supports_texture = True + ops[op].supports_buffer = True + ops[op].supports_dynamic_shape = True -def enumerate_supported_ops(): - ops = OpList() - register_prim_ops(ops) - register_no_dynamic_shape_ops(ops) - register_dynamic_shape_ops(ops) return ops diff --git a/backends/vulkan/runtime/VulkanBackend.cpp b/backends/vulkan/runtime/VulkanBackend.cpp index 7ed9469f77f..8c0c0f511f2 100644 --- a/backends/vulkan/runtime/VulkanBackend.cpp +++ b/backends/vulkan/runtime/VulkanBackend.cpp @@ -30,11 +30,23 @@ #include #include -namespace torch { -namespace executor { +namespace executorch { +namespace backends { namespace vulkan { namespace { +using executorch::runtime::ArrayRef; +using executorch::runtime::Backend; +using executorch::runtime::BackendExecutionContext; +using executorch::runtime::BackendInitContext; +using executorch::runtime::CompileSpec; +using executorch::runtime::DelegateHandle; +using executorch::runtime::Error; +using executorch::runtime::EValue; +using executorch::runtime::FreeableBuffer; +using executorch::runtime::kTensorDimensionLimit; +using executorch::runtime::Result; + using namespace vkcompute; // Flatbuffer types @@ -357,7 +369,7 @@ class GraphBuilder { bool maybe_resize_input( ComputeGraph* graph, const size_t input_i, - exec_aten::Tensor& et_tensor) { + executorch::aten::Tensor& et_tensor) { ValueRef in_tensor_ref = graph->inputs()[input_i].value; vTensorPtr in_tensor = graph->get_tensor(in_tensor_ref); @@ -392,17 +404,18 @@ bool maybe_resize_input( void maybe_resize_output( ComputeGraph* graph, const size_t output_i, - exec_aten::Tensor& et_tensor) { + executorch::aten::Tensor& et_tensor) { ValueRef out_tensor_ref = graph->outputs()[output_i].value; vTensorPtr out_tensor = graph->get_tensor(out_tensor_ref); - exec_aten::SizesType new_output_size[kTensorDimensionLimit]; + executorch::aten::SizesType new_output_size[kTensorDimensionLimit]; size_t ndim = 
out_tensor->sizes().size(); for (int i = 0; i < ndim; ++i) { new_output_size[i] = out_tensor->sizes()[i]; } - exec_aten::ArrayRef output_size{new_output_size, ndim}; + executorch::aten::ArrayRef output_size{ + new_output_size, ndim}; Error err = resize_tensor(et_tensor, output_size); ET_CHECK_MSG(err == Error::Ok, "Failed to resize output tensor."); @@ -555,5 +568,5 @@ static auto success_with_compiler = register_backend(backend); } // namespace } // namespace vulkan -} // namespace executor -} // namespace torch +} // namespace backends +} // namespace executorch diff --git a/backends/vulkan/runtime/VulkanDelegateHeader.cpp b/backends/vulkan/runtime/VulkanDelegateHeader.cpp index a9a9fa849a7..81fd0bbc953 100644 --- a/backends/vulkan/runtime/VulkanDelegateHeader.cpp +++ b/backends/vulkan/runtime/VulkanDelegateHeader.cpp @@ -15,10 +15,13 @@ #pragma clang diagnostic ignored "-Wdeprecated" -namespace torch { -namespace executor { +namespace executorch { +namespace backends { namespace vulkan { +using executorch::runtime::Error; +using executorch::runtime::Result; + namespace { struct ByteSlice { @@ -101,5 +104,5 @@ Result VulkanDelegateHeader::parse(const void* data) { } } // namespace vulkan -} // namespace executor -} // namespace torch +} // namespace backends +} // namespace executorch diff --git a/backends/vulkan/runtime/VulkanDelegateHeader.h b/backends/vulkan/runtime/VulkanDelegateHeader.h index c5e8859743a..0fc163bbe3c 100644 --- a/backends/vulkan/runtime/VulkanDelegateHeader.h +++ b/backends/vulkan/runtime/VulkanDelegateHeader.h @@ -10,8 +10,8 @@ #include -namespace torch { -namespace executor { +namespace executorch { +namespace backends { namespace vulkan { // Byte decoding utilities @@ -22,7 +22,8 @@ uint32_t getUInt16LE(const uint8_t* data); struct VulkanDelegateHeader { bool is_valid() const; - static Result parse(const void* data); + static executorch::runtime::Result parse( + const void* data); uint32_t header_size; uint32_t flatbuffer_offset; @@ -32,5 +33,5 @@ struct VulkanDelegateHeader { }; } // namespace vulkan -} // namespace executor -} // namespace torch +} // namespace backends +} // namespace executorch diff --git a/backends/vulkan/runtime/api/containers/ParamsBuffer.h b/backends/vulkan/runtime/api/containers/ParamsBuffer.h index df8d7946d6e..fed7c8fa729 100644 --- a/backends/vulkan/runtime/api/containers/ParamsBuffer.h +++ b/backends/vulkan/runtime/api/containers/ParamsBuffer.h @@ -56,12 +56,29 @@ class ParamsBuffer final { } // Fill the uniform buffer with data in block { - vkapi::MemoryMap mapping(vulkan_buffer_, vkapi::MemoryAccessType::WRITE); + vkapi::MemoryMap mapping(vulkan_buffer_, vkapi::kWrite); Block* data_ptr = mapping.template data(); *data_ptr = block; } } + + template + T read() const { + T val; + if (sizeof(val) != nbytes_) { + VK_THROW( + "Attempted to store value from ParamsBuffer to type of different size"); + } + // Read value from uniform buffer and store in val + { + vkapi::MemoryMap mapping(vulkan_buffer_, vkapi::kRead); + T* data_ptr = mapping.template data(); + + val = *data_ptr; + } + return val; + } }; } // namespace api diff --git a/backends/vulkan/runtime/api/containers/StagingBuffer.h b/backends/vulkan/runtime/api/containers/StagingBuffer.h index 66c607e178c..6f67ae8a64a 100644 --- a/backends/vulkan/runtime/api/containers/StagingBuffer.h +++ b/backends/vulkan/runtime/api/containers/StagingBuffer.h @@ -27,6 +27,8 @@ class StagingBuffer final { size_t nbytes_; vkapi::VulkanBuffer vulkan_buffer_; + void* mapped_data_; + public: 
StagingBuffer( Context* context_p, @@ -37,7 +39,8 @@ class StagingBuffer final { numel_(numel), nbytes_(element_size(dtype_) * numel_), vulkan_buffer_( - context_p_->adapter_ptr()->vma().create_staging_buffer(nbytes_)) {} + context_p_->adapter_ptr()->vma().create_staging_buffer(nbytes_)), + mapped_data_(nullptr) {} StagingBuffer(const StagingBuffer&) = delete; StagingBuffer& operator=(const StagingBuffer&) = delete; @@ -58,7 +61,10 @@ class StagingBuffer final { } inline void* data() { - return vulkan_buffer_.allocation_info().pMappedData; + if (!mapped_data_) { + mapped_data_ = vulkan_buffer_.allocation_info().pMappedData; + } + return mapped_data_; } inline size_t numel() { diff --git a/backends/vulkan/runtime/api/containers/Tensor.cpp b/backends/vulkan/runtime/api/containers/Tensor.cpp index dc507f91626..75e70c77c43 100644 --- a/backends/vulkan/runtime/api/containers/Tensor.cpp +++ b/backends/vulkan/runtime/api/containers/Tensor.cpp @@ -13,33 +13,16 @@ namespace vkcompute { namespace api { -/* - * Given the strides of a buffer-backed tensor, estimate the equivalent memory - * layout enum value by identifying the fastest moving dimension. - */ -utils::GPUMemoryLayout estimate_memory_layout( - const std::vector& dim_order) { - int64_t fastest_dim_whcn = dim_order.size() - 1 - dim_order.back(); - if (fastest_dim_whcn >= 0 && fastest_dim_whcn < 3) { - return utils::GPUMemoryLayout(fastest_dim_whcn); - } - - // TODO(ssjia) find a way to gracefully recover from this case by i.e. adding - // a UNKOWN GPUMemoryLayout. This is not high priority though because we don't - // expect this to ever come up in practice. - VK_THROW("No compatible GPUMemoryLayout value"); -} - std::vector calculate_dim_order( const size_t ndim, - const utils::GPUMemoryLayout memory_layout) { + const int32_t packed_dim) { // Special case for zero dim tensors if (ndim == 0) { return {0}; } std::vector dim_order(ndim); - int64_t last_dim = - ndim - utils::to_packed_dim_nchw_offset(memory_layout); + // Explicitly convert ndim to signed to prevent underflow + int64_t last_dim = int64_t(ndim) - 1 - packed_dim; int64_t cur_dim = 0; for (int d = 0; d < ndim; ++d) { @@ -89,11 +72,11 @@ std::vector calculate_strides( * tensor. Thus the axis mapping can be considered to be in WHCN dimension * order. * - * The last value `axis_mapping.at(3)` indicates the WHCN index of the tensor + * The last value `axis_map.at(3)` indicates the WHCN index of the tensor * dimension along which batches will be concatenated. This dimension can be * referred to as the "inner dimension" To determine which image texture axis is * used for the concatenation, a double lookup will need to be performed - * (axis_mapping.at(axis_mapping.at(3))). + * (axis_map.at(axis_map.at(3))). * * The reason for strucuring axis mapping this way is because for the batch dim, * two things need to be easily derived: @@ -107,7 +90,7 @@ std::vector calculate_strides( * * The axis mapping allows for permuted views of texture-backed tensors. */ -std::vector default_axis_mapping() { +std::vector default_axis_map() { // Currently, all compute shaders have an assumption that the channels dim is // used to combine with the batch dim of a tensor. 
However, once dim mapping // is integrated into the tensor indexing logic for each compute shader, we @@ -149,7 +132,7 @@ std::vector unsqueeze_strides( std::vector calculate_padded_sizes( const std::vector& sizes, - const utils::GPUMemoryLayout memory_layout) { + const int32_t packed_dim) { int64_t ndim = sizes.size(); if (ndim == 0) { ndim = 1; @@ -163,8 +146,7 @@ std::vector calculate_padded_sizes( } // Pad the packed dim to the next multiple of 4. - const int64_t dim_offset = - utils::to_packed_dim_nchw_offset(memory_layout); + const int64_t dim_offset = packed_dim + 1; const int64_t padded_dim_size = utils::val_at(-dim_offset, sizes); padded_sizes.at(ndim_up4 - dim_offset) = utils::align_up_4(padded_dim_size); @@ -173,44 +155,240 @@ std::vector calculate_padded_sizes( utils::uvec3 calculate_image_extents( const std::vector& padded_sizes, - const std::vector& axis_mapping, - const utils::GPUMemoryLayout memory_layout) { + const std::vector& axis_map, + const int32_t packed_dim) { VK_CHECK_COND(padded_sizes.size() == 4); - VK_CHECK_COND(axis_mapping.size() == 4); + VK_CHECK_COND(axis_map.size() == 4); utils::uvec3 extents({1, 1, 1}); - // First three elements of axis_mapping indicate which (X,Y,Z) image axis the + // First three elements of axis_map indicate which (X,Y,Z) image axis the // width, height, and channels dim of the tensor maps to. for (int whcn_dim = 0; whcn_dim < 3; ++whcn_dim) { - const int64_t axis = axis_mapping.at(whcn_dim); + const int64_t axis = axis_map.at(whcn_dim); const int64_t dim = padded_sizes.size() - 1 - whcn_dim; extents[axis] = utils::safe_downcast(padded_sizes.at(dim)); } - // axis_mapping[3] indicates the WHCN index of the dimension used for batch + // axis_map[3] indicates the WHCN index of the dimension used for batch // concatenation. Thus a double lookup is required to determine the image axis // used for batch concatenation. - const int64_t concatted_whcn_dim = axis_mapping.at(3); - const int64_t batch_axis = axis_mapping.at(concatted_whcn_dim); + const int64_t concatted_whcn_dim = axis_map.at(3); + const int64_t batch_axis = axis_map.at(concatted_whcn_dim); // Multiply the extents of the batch axis by the batch size. 
extents[batch_axis] *= padded_sizes.at(0); - switch (memory_layout) { - case utils::kWidthPacked: - VK_CHECK_COND(extents[0] % 4 == 0); - extents[0] /= 4; + VK_CHECK_COND(extents[axis_map.at(packed_dim)] % 4 == 0); + extents[axis_map.at(packed_dim)] /= 4; + return extents; +} + +// +// vTensorStorage +// + +vkapi::VulkanImage allocate_image( + Context* const context_ptr, + utils::uvec3& image_extents, + const utils::StorageType storage_type, + const VkFormat image_format, + const bool allocate_memory) { + vkapi::Adapter* adapter_ptr = context_ptr->adapter_ptr(); + + vkapi::ImageSampler::Properties sampler_props{ + VK_FILTER_NEAREST, + VK_SAMPLER_MIPMAP_MODE_NEAREST, + VK_SAMPLER_ADDRESS_MODE_REPEAT, + VK_BORDER_COLOR_FLOAT_TRANSPARENT_BLACK, + }; + + VkImageType image_type = VK_IMAGE_TYPE_3D; + VkImageViewType image_view_type; + + switch (storage_type) { + case utils::kTexture3D: + image_type = VK_IMAGE_TYPE_3D; + image_view_type = VK_IMAGE_VIEW_TYPE_3D; break; - case utils::kHeightPacked: - VK_CHECK_COND(extents[1] % 4 == 0); - extents[1] /= 4; + case utils::kTexture2D: + image_type = VK_IMAGE_TYPE_2D; + image_view_type = VK_IMAGE_VIEW_TYPE_2D; break; - case utils::kChannelsPacked: - VK_CHECK_COND(extents[2] % 4 == 0); - extents[2] /= 4; + default: + // Return an empty VulkanImage by default + return vkapi::VulkanImage(); + } + + VkSampler sampler = adapter_ptr->sampler_cache().retrieve(sampler_props); + + return adapter_ptr->vma().create_image( + vkapi::create_extent3d(image_extents), + image_format, + image_type, + image_view_type, + sampler_props, + sampler, + /*allow_transfer = */ true, + /*allocate_memory = */ allocate_memory); +} + +vkapi::VulkanBuffer allocate_buffer( + Context* const context_ptr, + const int64_t numel, + const utils::StorageType storage_type, + const vkapi::ScalarType dtype, + const bool allocate_memory) { + vkapi::Adapter* adapter_ptr = context_ptr->adapter_ptr(); + + switch (storage_type) { + case utils::kBuffer: break; + default: + // Return an empty VulkanBuffer if Buffer storage is not used + return vkapi::VulkanBuffer(); } - return extents; + return adapter_ptr->vma().create_storage_buffer( + element_size(dtype) * numel, allocate_memory); +} + +vTensorStorage::vTensorStorage( + Context* const context, + const utils::StorageType storage_type, + const std::vector& axis_map, + const int32_t packed_dim, + const std::vector& padded_sizes, + const vkapi::ScalarType dtype, + const bool allocate_memory) + : context_(context), + storage_type_{storage_type}, + image_extents_( + calculate_image_extents(padded_sizes, axis_map, packed_dim)), + buffer_length_{utils::multiply_integers(padded_sizes)}, + buffer_offset_{0}, + image_(allocate_image( + context_, + image_extents_, + storage_type_, + to_vkformat(dtype), + allocate_memory)), + buffer_(allocate_buffer( + context_, + buffer_length_, + storage_type_, + dtype, + allocate_memory)), + last_access_{}, + has_copies_{false} {} + +vTensorStorage::vTensorStorage( + vTensorStorage& other, + const int64_t buffer_offset) + : context_(other.context_), + storage_type_{other.storage_type_}, + image_extents_(other.image_extents_), + buffer_length_{other.buffer_length_}, + buffer_offset_{buffer_offset}, + image_(other.image_), + buffer_(other.buffer_, buffer_offset), + last_access_{other.last_access_}, + has_copies_{false} { + other.has_copies_ = true; +} + +vTensorStorage::~vTensorStorage() { + flush(); +} + +void vTensorStorage::flush() { + if (image_) { + context_->register_image_cleanup(image_); + } else if (buffer_) { + 
context_->register_buffer_cleanup(buffer_); + } + last_access_ = {}; +} + +void vTensorStorage::transition( + vkapi::PipelineBarrier& pipeline_barrier, + const vkapi::PipelineStageFlags cur_stage, + const vkapi::MemoryAccessFlags cur_access) { + // Get last stage access + vkapi::PipelineStageFlags prev_stage = last_access_.stage; + vkapi::MemoryAccessFlags prev_access = last_access_.access; + + // If the underlying resource is a copy of another tensor's resource the + // last_access may not be accurate, since the original storage may have been + // written to as part of the original tensor. Likewise, if the underlying + // resource has copies, then the resource may have been updated as part of the + // view tensors. + // + // If the resource is a copy, or has copies of it, then cowardly assume that + // it has previously been written to as part of a compute shader before the + // current access event so that the appropriate memory barriers may be + // inserted. + if (is_copy() || has_copies_) { + prev_stage = vkapi::PipelineStage::COMPUTE; + prev_access = vkapi::kWrite; + } + + const bool prev_written = (prev_access & vkapi::MemoryAccessType::WRITE) != 0; + + VkImageLayout cur_layout = VK_IMAGE_LAYOUT_UNDEFINED; + VkImageLayout new_layout = VK_IMAGE_LAYOUT_UNDEFINED; + bool layout_changed = false; + if (image_) { + cur_layout = image_.layout(); + new_layout = vkapi::vk_layout(cur_stage, cur_access); + + layout_changed = cur_layout != new_layout; + } + + if (prev_written || layout_changed) { + VkPipelineStageFlags src_stage = vkapi::vk_stage(prev_stage); + if (0u == src_stage) { + src_stage = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT; + } + VkPipelineStageFlags dst_stage = vkapi::vk_stage(cur_stage); + if (0u == dst_stage) { + dst_stage = VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT; + } + + pipeline_barrier.stage.src |= src_stage; + pipeline_barrier.stage.dst |= dst_stage; + + if (image_) { + pipeline_barrier.images.emplace_back( + vkapi::vk_access(prev_stage, prev_access), + vkapi::vk_access(cur_stage, cur_access), + cur_layout, + new_layout, + image_); + + image_.set_layout(new_layout); + } else if (buffer_) { + pipeline_barrier.buffers.emplace_back( + vkapi::vk_access(prev_stage, prev_access), + vkapi::vk_access(cur_stage, cur_access), + buffer_); + } + } + + last_access_.stage = cur_stage; + last_access_.access = cur_access; +} + +bool vTensorStorage::is_copy() const { + if (storage_type_ == utils::kBuffer) { + return buffer_.is_copy(); + } + return image_.is_copy(); +} + +bool vTensorStorage::is_copy_of(const vTensorStorage& other) const { + if (storage_type_ == utils::kBuffer) { + return buffer_.is_copy_of(other.buffer_); + } + return image_.is_copy_of(other.image_); } // @@ -225,29 +403,29 @@ vTensor::vTensor( const utils::GPUMemoryLayout memory_layout, const bool allocate_memory) : dtype_(dtype), - memory_layout_(memory_layout), // Calculate tensor metadata sizes_(sizes.begin(), sizes.end()), - dim_order_(calculate_dim_order(sizes_.size(), memory_layout_)), - axis_mapping_(default_axis_mapping()), + packed_dim_(utils::to_packed_dim(memory_layout)), + dim_order_(calculate_dim_order(sizes_.size(), packed_dim_)), + axis_map_(default_axis_map()), strides_(calculate_strides(sizes, dim_order_)), numel_(utils::multiply_integers(sizes_)), - padded_sizes_{calculate_padded_sizes(sizes, memory_layout_)}, + padded_sizes_{calculate_padded_sizes(sizes, packed_dim_)}, unsqueezed_strides_{unsqueeze_strides(strides_, numel_)}, padded_numel_(utils::multiply_integers(padded_sizes_)), - texture_limits_{{0, 0, 0}}, + 
logical_limits_{{0, 0, 0}}, // Utility Uniform Buffers that can be passed to shaders as arguments sizes_uniform_(), strides_uniform_(), numel_uniform_(), - axis_mapping_uniform_(), - texture_limits_uniform_(), + axis_map_uniform_(), + logical_limits_uniform_(), // Construct Tensor storage storage_( context, storage_type, - memory_layout_, - axis_mapping_, + axis_map_, + packed_dim_, padded_sizes_, dtype_, allocate_memory) { @@ -255,10 +433,7 @@ vTensor::vTensor( dim_order_is_valid(dim_order_), "computed dim order is invalid"); if (storage_type != utils::kBuffer) { - texture_limits_.limits = utils::ivec3{ - utils::safe_downcast(storage_.image_extents_[0]), - utils::safe_downcast(storage_.image_extents_[1]), - utils::safe_downcast(storage_.image_extents_[2])}; + set_logical_limits(storage_.image_extents_); } if (dtype == vkapi::kHalf) { @@ -269,13 +444,14 @@ vTensor::vTensor( } } -vTensor::vTensor(const vTensor& other) +// NOLINTNEXTLINE +vTensor::vTensor(vTensor& other) : dtype_(other.dtype_), - memory_layout_(other.memory_layout_), // Copy tensor size metadata sizes_(other.sizes_.begin(), other.sizes_.end()), + packed_dim_{other.packed_dim_}, dim_order_(other.dim_order_.begin(), other.dim_order_.end()), - axis_mapping_(other.axis_mapping_.begin(), other.axis_mapping_.end()), + axis_map_(other.axis_map_.begin(), other.axis_map_.end()), strides_(other.strides_.begin(), other.strides_.end()), numel_(other.numel_), padded_sizes_{other.padded_sizes_.begin(), other.padded_sizes_.end()}, @@ -283,39 +459,39 @@ vTensor::vTensor(const vTensor& other) other.unsqueezed_strides_.begin(), other.unsqueezed_strides_.end()}, padded_numel_(other.padded_numel_), - texture_limits_{other.texture_limits_}, + logical_limits_{other.logical_limits_}, // Empty initialize Utility Uniform Buffers sizes_uniform_(), strides_uniform_(), numel_uniform_(), - axis_mapping_uniform_(), - texture_limits_uniform_(), + axis_map_uniform_(), + logical_limits_uniform_(), // Copy Tensor storage storage_(other.storage_) {} vTensor::vTensor( - const vTensor& other, + vTensor& other, const std::vector& sizes, const std::vector& dim_order, const int64_t offset_numel) : dtype_(other.dtype_), - memory_layout_(estimate_memory_layout(dim_order)), // Copy tensor size metadata sizes_(sizes.begin(), sizes.end()), + packed_dim_(other.packed_dim_), dim_order_(dim_order.begin(), dim_order.end()), - axis_mapping_(default_axis_mapping()), + axis_map_(default_axis_map()), strides_(calculate_strides(sizes_, dim_order_)), numel_(utils::multiply_integers(sizes_)), - padded_sizes_{calculate_padded_sizes(sizes, memory_layout_)}, + padded_sizes_{calculate_padded_sizes(sizes, packed_dim_)}, unsqueezed_strides_{unsqueeze_strides(strides_, numel_)}, padded_numel_(utils::multiply_integers(padded_sizes_)), - texture_limits_{{0, 0, 0}}, + logical_limits_(other.logical_limits_), // Empty initialize Utility Uniform Buffers sizes_uniform_(), strides_uniform_(), numel_uniform_(), - axis_mapping_uniform_(), - texture_limits_uniform_(), + axis_map_uniform_(), + logical_limits_uniform_(), // Copy Tensor storage storage_(other.storage_, vkapi::element_size(dtype_) * offset_numel) { VK_CHECK_COND( @@ -356,12 +532,23 @@ vkapi::VulkanBuffer& vTensor::buffer( return storage_.buffer_; } -utils::uvec3 vTensor::mapped_extents() const { - utils::uvec3 m_extents; - m_extents[0] = storage_.image_extents_[axis_mapping_.at(0)]; - m_extents[1] = storage_.image_extents_[axis_mapping_.at(1)]; - m_extents[2] = storage_.image_extents_[axis_mapping_.at(2)]; - return m_extents; +void 
vTensor::set_logical_limits(const utils::uvec3& image_extents) { + logical_limits_.limits[0] = image_extents[axis_map_.at(0)]; + logical_limits_.limits[1] = image_extents[axis_map_.at(1)]; + logical_limits_.limits[2] = image_extents[axis_map_.at(2)]; +} + +utils::GPUMemoryLayout vTensor::estimate_memory_layout() const { + switch (packed_dim_) { + case WHCN::kWidthDim: + return utils::kWidthPacked; + case WHCN::kHeightDim: + return utils::kHeightPacked; + case WHCN::kChannelsDim: + return utils::kChannelsPacked; + default: + VK_THROW("Invalid packed dim"); + } } const vkapi::BufferBindInfo vTensor::sizes_ubo() { @@ -380,19 +567,19 @@ const vkapi::BufferBindInfo vTensor::strides_ubo() { return vkapi::BufferBindInfo(strides_uniform_.buffer()); } -const vkapi::BufferBindInfo vTensor::axis_mapping_ubo() { - if (!axis_mapping_uniform_.buffer()) { - axis_mapping_uniform_ = - ParamsBuffer(storage_.context_, utils::make_ivec4(axis_mapping_)); +const vkapi::BufferBindInfo vTensor::axis_map_ubo() { + if (!axis_map_uniform_.buffer()) { + axis_map_uniform_ = + ParamsBuffer(storage_.context_, utils::make_ivec4(axis_map_)); } - return vkapi::BufferBindInfo(axis_mapping_uniform_.buffer()); + return vkapi::BufferBindInfo(axis_map_uniform_.buffer()); } -const vkapi::BufferBindInfo vTensor::texture_limits_ubo() { - if (!texture_limits_uniform_.buffer()) { - texture_limits_uniform_ = ParamsBuffer(storage_.context_, texture_limits_); +const vkapi::BufferBindInfo vTensor::logical_limits_ubo() { + if (!logical_limits_uniform_.buffer()) { + logical_limits_uniform_ = ParamsBuffer(storage_.context_, logical_limits_); } - return vkapi::BufferBindInfo(texture_limits_uniform_.buffer()); + return vkapi::BufferBindInfo(logical_limits_uniform_.buffer()); } const vkapi::BufferBindInfo vTensor::numel_ubo() { @@ -415,17 +602,6 @@ size_t vTensor::staging_buffer_numel() const { return padded_numel_; } -VmaAllocationCreateInfo vTensor::get_allocation_create_info() const { - switch (storage_type()) { - case utils::kBuffer: - return storage_.buffer_.allocation_create_info(); - case utils::kTexture2D: - case utils::kTexture3D: - return storage_.image_.allocation_create_info(); - } - return {}; -} - VkMemoryRequirements vTensor::get_memory_requirements() const { switch (storage_type()) { case utils::kBuffer: @@ -451,27 +627,16 @@ void vTensor::bind_allocation(const vkapi::Allocation& allocation) { void vTensor::update_metadata() { strides_ = calculate_strides(sizes_, dim_order_); - // Only update the memory layout for buffer-backed tensors. Strides are - // meaningless for texture-backed tensors and do not impact the memory layout. - if (storage_type() == utils::kBuffer) { - memory_layout_ = estimate_memory_layout(dim_order_); - } numel_ = utils::multiply_integers(sizes_); - padded_sizes_ = calculate_padded_sizes(sizes_, memory_layout_); + padded_sizes_ = calculate_padded_sizes(sizes_, packed_dim_); unsqueezed_strides_ = unsqueeze_strides(strides_, numel_); padded_numel_ = utils::multiply_integers(padded_sizes_); - // Calculate the extents of the image texture that would have been required - // for a tensor of the new sizes. - utils::uvec3 virtual_extents = - calculate_image_extents(padded_sizes_, axis_mapping_, memory_layout_); - - // Update the texture limits to reflect the new virtual extents. 
- texture_limits_.limits = utils::ivec3{ - utils::safe_downcast(virtual_extents[0]), - utils::safe_downcast(virtual_extents[1]), - utils::safe_downcast(virtual_extents[2])}; + // Calculate the image extents that would have been used to allocate a texture + // with the current sizes, and use that to set the logical limits. + set_logical_limits( + calculate_image_extents(padded_sizes_, axis_map_, packed_dim_)); if (sizes_uniform_.buffer()) { sizes_uniform_.update(utils::make_whcn_ivec4(sizes_)); @@ -482,11 +647,11 @@ void vTensor::update_metadata() { if (numel_uniform_.buffer()) { numel_uniform_.update(numel_); } - if (axis_mapping_uniform_.buffer()) { - axis_mapping_uniform_.update(utils::make_ivec4(axis_mapping_)); + if (axis_map_uniform_.buffer()) { + axis_map_uniform_.update(utils::make_ivec4(axis_map_)); } - if (texture_limits_uniform_.buffer()) { - texture_limits_uniform_.update(texture_limits_); + if (logical_limits_uniform_.buffer()) { + logical_limits_uniform_.update(logical_limits_); } } @@ -495,11 +660,13 @@ void vTensor::check_sizes(const std::vector& sizes) const { // For texture storage check that the current texture is large enough for // the new sizes of the tensor. utils::uvec3 virtual_extents = - calculate_image_extents(padded_sizes_, axis_mapping_, memory_layout_); + calculate_image_extents(padded_sizes_, axis_map_, packed_dim_); - bool valid_resize = virtual_extents[0] <= image_extents()[0]; - valid_resize = valid_resize && virtual_extents[1] <= image_extents()[1]; - valid_resize = valid_resize && virtual_extents[2] <= image_extents()[2]; + bool valid_resize = virtual_extents[0] <= storage_.image_extents_[0]; + valid_resize = + valid_resize && virtual_extents[1] <= storage_.image_extents_[1]; + valid_resize = + valid_resize && virtual_extents[2] <= storage_.image_extents_[2]; VK_CHECK_COND( valid_resize, @@ -531,6 +698,14 @@ void vTensor::virtual_reconfigure( update_metadata(); } +void vTensor::virtual_clone(const vTensor& other) { + VK_CHECK_COND(is_view_of(other)); + sizes_ = other.sizes_; + dim_order_ = other.dim_order_; + axis_map_ = other.axis_map_; + packed_dim_ = other.packed_dim_; +} + void vTensor::virtual_resize(const std::vector& new_sizes) { VK_CHECK_COND( new_sizes.size() == dim_order_.size(), @@ -541,231 +716,51 @@ void vTensor::virtual_resize(const std::vector& new_sizes) { update_metadata(); } -void vTensor::reallocate(const std::vector& new_sizes) { - sizes_ = new_sizes; - update_metadata(); - storage_.discard_and_reallocate( - calculate_padded_sizes(new_sizes, memory_layout_), - axis_mapping_, - memory_layout_, - dtype_); -} - -// -// vTensorStorage -// - -vkapi::VulkanImage allocate_image( - Context* const context_ptr, - utils::uvec3& image_extents, - const utils::StorageType storage_type, - const VkFormat image_format, - const bool allocate_memory) { - vkapi::Adapter* adapter_ptr = context_ptr->adapter_ptr(); - - vkapi::ImageSampler::Properties sampler_props{ - VK_FILTER_NEAREST, - VK_SAMPLER_MIPMAP_MODE_NEAREST, - VK_SAMPLER_ADDRESS_MODE_REPEAT, - VK_BORDER_COLOR_FLOAT_TRANSPARENT_BLACK, - }; - - VkImageType image_type = VK_IMAGE_TYPE_3D; - VkImageViewType image_view_type; - - switch (storage_type) { - case utils::kTexture3D: - image_type = VK_IMAGE_TYPE_3D; - image_view_type = VK_IMAGE_VIEW_TYPE_3D; - break; - case utils::kTexture2D: - image_type = VK_IMAGE_TYPE_2D; - image_view_type = VK_IMAGE_VIEW_TYPE_2D; - break; - default: - // Return an empty VulkanImage by default - return vkapi::VulkanImage(); - } - - VkSampler sampler =
adapter_ptr->sampler_cache().retrieve(sampler_props); - - return adapter_ptr->vma().create_image( - vkapi::create_extent3d(image_extents), - image_format, - image_type, - image_view_type, - sampler_props, - sampler, - /*allow_transfer = */ true, - /*allocate_memory = */ allocate_memory); -} - -vkapi::VulkanBuffer allocate_buffer( - Context* const context_ptr, - const int64_t numel, - const utils::StorageType storage_type, - const vkapi::ScalarType dtype, - const bool allocate_memory) { - vkapi::Adapter* adapter_ptr = context_ptr->adapter_ptr(); - - switch (storage_type) { - case utils::kBuffer: - break; - default: - // Return an empty VulkanBuffer if Buffer storage is not used - return vkapi::VulkanBuffer(); - } - - return adapter_ptr->vma().create_storage_buffer( - element_size(dtype) * numel, allocate_memory); -} - -vTensorStorage::vTensorStorage( - Context* const context, - const utils::StorageType storage_type, - const utils::GPUMemoryLayout gpu_memory_layout, - const std::vector& axis_mapping, - const std::vector& padded_sizes, - const vkapi::ScalarType dtype, - const bool allocate_memory) - : context_(context), - storage_type_{storage_type}, - image_extents_(calculate_image_extents( - padded_sizes, - axis_mapping, - gpu_memory_layout)), - buffer_length_{utils::multiply_integers(padded_sizes)}, - buffer_offset_{0}, - image_(allocate_image( - context_, - image_extents_, - storage_type_, - to_vkformat(dtype), - allocate_memory)), - buffer_(allocate_buffer( - context_, - buffer_length_, - storage_type_, - dtype, - allocate_memory)), - last_access_{} {} - -vTensorStorage::vTensorStorage( - const vTensorStorage& other, - const int64_t buffer_offset) - : context_(other.context_), - storage_type_{other.storage_type_}, - image_extents_(other.image_extents_), - buffer_length_{other.buffer_length_}, - buffer_offset_{buffer_offset}, - image_(), - buffer_(other.buffer_, buffer_offset), - last_access_{other.last_access_} { - if (other.storage_type_ != utils::kBuffer) { - VK_THROW("Tensors with texture storage cannot be copied!"); - } -} - -vTensorStorage::~vTensorStorage() { - flush(); -} - -void vTensorStorage::flush() { - if (image_) { - context_->register_image_cleanup(image_); - } else if (buffer_) { - context_->register_buffer_cleanup(buffer_); +/* + * Transposing the dim order is a bit unintuitive. dim0 and dim1 have swapped + * their "identities", so we need to swap the values of dim0 and dim1 wherever + * they appear in the dim order vector. Compare this to just swapping the + * elements at dim0 and dim1 in the `sizes` vectors. 
+ */ +void transpose_dim_order_inplace( + std::vector& dim_order, + const int64_t dim0, + const int64_t dim1) { + for (int i = 0; i < dim_order.size(); ++i) { + if (dim_order[i] == dim0) { + dim_order[i] = dim1; + } else if (dim_order[i] == dim1) { + dim_order[i] = dim0; + } } - last_access_ = {}; } -void vTensorStorage::transition( - vkapi::PipelineBarrier& pipeline_barrier, - const vkapi::PipelineStageFlags cur_stage, - const vkapi::MemoryAccessFlags cur_access) { - // Get last stage access - vkapi::PipelineStageFlags prev_stage = last_access_.stage; - vkapi::MemoryAccessFlags prev_access = last_access_.access; - - const bool prev_written = (prev_access & vkapi::MemoryAccessType::WRITE) != 0; +void vTensor::virtual_transpose(const int64_t dim0, const int64_t dim1) { + std::iter_swap(sizes_.begin() + dim0, sizes_.begin() + dim1); - VkImageLayout cur_layout = VK_IMAGE_LAYOUT_UNDEFINED; - VkImageLayout new_layout = VK_IMAGE_LAYOUT_UNDEFINED; - bool layout_changed = false; - if (image_) { - cur_layout = image_.layout(); - new_layout = vkapi::vk_layout(cur_stage, cur_access); - - layout_changed = cur_layout != new_layout; + const int dim0_whcn = sizes_.size() - 1 - dim0; + const int dim1_whcn = sizes_.size() - 1 - dim1; + if (packed_dim_ == dim0_whcn) { + packed_dim_ = dim1_whcn; + } else if (packed_dim_ == dim1_whcn) { + packed_dim_ = dim0_whcn; } - if (prev_written || layout_changed) { - VkPipelineStageFlags src_stage = vkapi::vk_stage(prev_stage); - if (0u == src_stage) { - src_stage = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT; - } - VkPipelineStageFlags dst_stage = vkapi::vk_stage(cur_stage); - if (0u == dst_stage) { - dst_stage = VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT; - } - - pipeline_barrier.stage.src |= src_stage; - pipeline_barrier.stage.dst |= dst_stage; - - if (image_) { - pipeline_barrier.images.emplace_back( - vkapi::vk_access(prev_stage, prev_access), - vkapi::vk_access(cur_stage, cur_access), - cur_layout, - new_layout, - image_); - - image_.set_layout(new_layout); - } else if (buffer_) { - pipeline_barrier.buffers.emplace_back( - vkapi::vk_access(prev_stage, prev_access), - vkapi::vk_access(cur_stage, cur_access), - buffer_); + if (storage_type() == utils::kBuffer) { + transpose_dim_order_inplace(dim_order_, dim0, dim1); + } else { + // Cannot transpose batch dimension for texture storage + VK_CHECK_COND(dim0_whcn < 3 && dim1_whcn < 3); + std::iter_swap( + axis_map_.begin() + dim0_whcn, axis_map_.begin() + dim1_whcn); + // Update the "identity" of the concatted dimension + if (axis_map_.at(3) == dim0_whcn) { + axis_map_.at(3) = dim1_whcn; + } else if (axis_map_.at(3) == dim1_whcn) { + axis_map_.at(3) = dim0_whcn; } } - - last_access_.stage = cur_stage; - last_access_.access = cur_access; -} - -bool vTensorStorage::is_copy_of(const vTensorStorage& other) const { - if (storage_type_ != other.storage_type_) { - return false; - } - // Copies are only enabled for buffer storage at the moment - if (storage_type_ != utils::kBuffer) { - return false; - } - return buffer_.is_copy_of(other.buffer_); -} - -void vTensorStorage::discard_and_reallocate( - const std::vector& padded_sizes, - const std::vector& axis_mapping, - const utils::GPUMemoryLayout gpu_memory_layout, - const vkapi::ScalarType dtype) { - const bool image_owns_memory = image_.owns_memory(); - const bool buffer_owns_memory = buffer_.owns_memory(); - - flush(); - - image_extents_ = - calculate_image_extents(padded_sizes, axis_mapping, gpu_memory_layout); - image_ = allocate_image( - context_, - image_extents_, - storage_type_, - 
to_vkformat(dtype), - image_owns_memory); - - buffer_length_ = utils::multiply_integers(padded_sizes); - buffer_ = allocate_buffer( - context_, buffer_length_, storage_type_, dtype, buffer_owns_memory); + update_metadata(); } } // namespace api diff --git a/backends/vulkan/runtime/api/containers/Tensor.h b/backends/vulkan/runtime/api/containers/Tensor.h index 31052b351de..a5a5083d029 100644 --- a/backends/vulkan/runtime/api/containers/Tensor.h +++ b/backends/vulkan/runtime/api/containers/Tensor.h @@ -26,7 +26,7 @@ namespace api { */ std::vector calculate_dim_order( const size_t ndim, - const utils::GPUMemoryLayout memory_layout); + const int32_t packed_dim); /* * Given the sizes of a tensor and the dim order of the tensor (both in NCHW) @@ -57,15 +57,15 @@ std::vector unsqueeze_strides( */ std::vector calculate_padded_sizes( const std::vector& sizes, - const utils::GPUMemoryLayout memory_layout); + const int32_t packed_dim); /* * Calculate the image extents required of a texture backed tensor. */ utils::uvec3 calculate_image_extents( const std::vector& padded_sizes, - const std::vector& axis_mapping, - const utils::GPUMemoryLayout memory_layout); + const std::vector& axis_map, + const int32_t packed_dim); struct LastAccess { vkapi::PipelineStageFlags stage; @@ -89,8 +89,8 @@ class vTensorStorage final { vTensorStorage( Context* context, const utils::StorageType storage_type, - const utils::GPUMemoryLayout gpu_memory_layout, - const std::vector& axis_mapping, + const std::vector& axis_map, + const int32_t packed_dim, const std::vector& padded_sizes, const vkapi::ScalarType dtype, const bool allocate_memory = true); @@ -104,7 +104,7 @@ class vTensorStorage final { * because this behaviour is unsafe, since the original tensor may be * destroyed before the copy is destroyed. */ - vTensorStorage(const vTensorStorage& other, const int64_t buffer_offset = 0); + vTensorStorage(vTensorStorage& other, const int64_t buffer_offset = 0); public: // To discourage creating copies, the assignment operator is still deleted. @@ -134,6 +134,8 @@ class vTensorStorage final { // Last Access - used to insert memory barriers LastAccess last_access_; + // Indicates whether copies of this vTensorStorage have been made + bool has_copies_; private: // Registers underlying memory for cleanup @@ -153,16 +155,15 @@ class vTensorStorage final { return image_.format(); } + /* + * Check if the underlying resource is a copy of another resource + */ + bool is_copy() const; + /* * Used for checking if this vTensorStorage is a copy of another instance */ bool is_copy_of(const vTensorStorage& other) const; - - void discard_and_reallocate( - const std::vector& padded_sizes, - const std::vector& axis_mapping, - const utils::GPUMemoryLayout gpu_memory_layout, - const vkapi::ScalarType dtype); }; class vTensor final { @@ -182,6 +183,8 @@ class vTensor final { const utils::GPUMemoryLayout memory_layout = utils::kChannelsPacked, const bool allocate_memory = true); + vTensor(const vTensor& other) = delete; + /* * This constructor allows for the creation of a vTensor that references the * same buffer resource of another vTensor, with the same sizes and strides @@ -191,7 +194,7 @@ class vTensor final { * Once created, the sizes and strides of the aliased vTensor can be changed * using the `virtual_reconfigure` member function. */ - vTensor(const vTensor& other); + vTensor(vTensor& other); /* * This constructor allows for the creation of a vTensor that references the @@ -208,7 +211,7 @@ class vTensor final { * buffer. 
*/ vTensor( - const vTensor& other, + vTensor& other, const std::vector& sizes, const std::vector& dim_order, const int64_t offset_numel = 0); @@ -227,13 +230,14 @@ class vTensor final { // Whether the tensor has elements of type float, int, etc. vkapi::ScalarType dtype_; - // Describes which dimension is "tightly packed". For texture backed tensors, - // this describes which dimension is packed along a texel. For buffer backed - // tensors, this describes which dimension has a stride of 1 (i.e. is last in - // the dim order). - utils::GPUMemoryLayout memory_layout_; // sizes of the tensor in NCHW dimension order std::vector sizes_; + // Describes which dimension is "tightly packed" using WHCN index (i.e. 0 for + // width, 1 for height, etc.). For texture backed tensors, this describes + // which dimension is packed along a texel. For buffer backed tensors, this + // describes which dimension has a stride of 1 (i.e. is last in the dim + // order). + int32_t packed_dim_; /* * "Layout" metadata. These describe with further detail how tensor data is @@ -252,9 +256,9 @@ class vTensor final { // Describes which axis of an image texture each dimension of the tensor maps // to. The axis mapping allows texture based tensors to be permuted and // transposed without modifying the underlying texture storage. For a more in - // depth explanation of axis mapping, see the `default_axis_mapping()` + // depth explanation of axis mapping, see the `default_axis_map()` // function. - std::vector axis_mapping_; + std::vector axis_map_; /* * The below can be consider "layout" metadata as well, but are derived from @@ -282,10 +286,8 @@ class vTensor final { // Contains the number of elements in the tensor according to the padded // sizes. size_t padded_numel_; - // Contains the "virtual" texture extents of the tensor. See the - // texture_limits() function for more context. Note that the texture limits - // are only relevant for texture storage, and not for buffer storage. - TextureLimits texture_limits_; + // See the comments documenting logical_limits() for more context. + TextureLimits logical_limits_; /* * Utility GPU buffers that can be passed to shaders in order to convey tensor @@ -299,8 +301,8 @@ class vTensor final { ParamsBuffer sizes_uniform_; ParamsBuffer strides_uniform_; ParamsBuffer numel_uniform_; - ParamsBuffer axis_mapping_uniform_; - ParamsBuffer texture_limits_uniform_; + ParamsBuffer axis_map_uniform_; + ParamsBuffer logical_limits_uniform_; vTensorStorage storage_; @@ -347,24 +349,30 @@ class vTensor final { return storage_.storage_type_ == utils::kBuffer; } - /* - * Returns the raw image extents of the underlying image texture used to store - * the tensor's data. Note that due to axis mapping, the X, Y, and Z extents - * may not correspond to the width, height, or channels dimension of the - * tensor. - */ - inline const utils::uvec3& image_extents() const { - return storage_.image_extents_; - } + private: + void set_logical_limits(const utils::uvec3& image_extents); + public: /* - * Returns the image extents of the underlying image texture, but re-ordered - * such that the first element is the extent of the axis used to represent the - * tensor's width dimension, the second element is the extent of the axis used - * to represent the tensor's height dimension, and the third element is the - * extent of the axis used to represent the tensor's channels dimension. 
+ * The logical limits of the tensor are derived from the image extents of the + * image texture used to store the tensor, but with two key differences. + * + * First, the image extents are permuted according to the axis map. This + * makes it so that the first element of the logical limit is the limit of the + * texture axis corresponding to the width dimension of the tensor, the next + * element is the limit of the texture axis corresponding to the height + * dimension and the last element is the limit of the texture axis that + * corresponds to the channels dimension of the tensor. + * + * Second, the logical limits may use smaller extents than the actual image + * extents of the image texture. This is due to dynamic shape; if the tensor's + * `virtual_resize()` function is called, then the logical limits will reflect + * the extents that would be needed to support a tensor with the updated sizes + * instead of the original sizes. */ - utils::uvec3 mapped_extents() const; + inline const utils::ivec3& logical_limits() const { + return logical_limits_.limits; + } /* * Extract an `vkapi::ScalarType` from the TensorOptions member @@ -373,12 +381,26 @@ class vTensor final { return dtype_; } - inline utils::GPUMemoryLayout gpu_memory_layout() const { - return memory_layout_; + /* + * Provide a "best guess" of a memory layout that can be used to construct a + * tensor with similar layout metadata (i.e. strides, axis_map, etc.) as this + * tensor. In some scenarios, the exact layout of the tensor may not be able + * to be replicated due to calling `virtual_*()` functions after construction; + * however, this function will provide a memory layout that will produce the + * same `packed_dim_` as this tensor. + */ + utils::GPUMemoryLayout estimate_memory_layout() const; + + inline int32_t packed_dim() const { + return packed_dim_; } - inline int32_t packed_dim_whcn_idx() const { - return static_cast(memory_layout_); + /* + * Returns the WHCN index of the dimension that is used to concatenate batches + * as an int32_t. + */ + inline int32_t concat_dim() const { + return utils::safe_downcast(axis_map_.at(3)); } inline const std::vector& sizes() const { @@ -397,6 +419,10 @@ class vTensor final { return dim_order_; } + inline const std::vector& axis_map() const { + return axis_map_; + } + inline const std::vector& strides() const { return strides_; } @@ -424,26 +450,19 @@ class vTensor final { * Returns a GPU buffer containing the texture axis mapping for each dimension * of the tensor, in WHCN dimension order. */ - const vkapi::BufferBindInfo axis_mapping_ubo(); + const vkapi::BufferBindInfo axis_map_ubo(); /* - * Returns a GPU buffer containing the virtual image extents of the tensor. - * Since a tensor can be resized with the virtual_resize() function, this - * GPU buffer contains the image extents of the tensor calculated using the - * virtual_resize() function. This allows shaders to exit early if they are - * working outside the limits of the texture. + * Returns a GPU buffer containing the logical limits of the tensor. See the + * comments for logical_limits() for more context. */ - const vkapi::BufferBindInfo texture_limits_ubo(); + const vkapi::BufferBindInfo logical_limits_ubo(); /* * Returns the number of elements in the buffer used to store the tensor. 
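Concretely, the logical limits can be thought of as the raw image extents permuted through the axis map. The sketch below only illustrates the relationship described in the comment above; the function name and standalone types are assumptions, not the actual member definitions.

#include <array>
#include <cstdint>
#include <vector>

// Illustrative: limits[i] is the extent of the texture axis that tensor dim i
// (in WHC order) maps to, where axis_map[i] gives that texture axis.
std::array<int32_t, 3> logical_limits_sketch(
    const std::array<uint32_t, 3>& image_extents, // raw (x, y, z) extents
    const std::vector<int64_t>& axis_map) {
  std::array<int32_t, 3> limits{};
  for (int whcn_dim = 0; whcn_dim < 3; ++whcn_dim) {
    limits[whcn_dim] =
        static_cast<int32_t>(image_extents[axis_map.at(whcn_dim)]);
  }
  return limits;
}

With an identity axis map (e.g. {0, 1, 2, 2}) this is a no-op permutation, so the logical limits only diverge from the raw extents after operations such as a virtual transpose or a virtual resize.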
*/ const vkapi::BufferBindInfo numel_ubo(); - inline const utils::ivec3 texture_limits() const { - return texture_limits_.limits; - } - inline size_t numel() const { return numel_; } @@ -501,11 +520,19 @@ class vTensor final { * * This function can only be used for buffer-backed tensors, since texture * backed buffers cannot change dimensionality or memory layout. + * + * TODO(ssjia): delete this API. prefer functions such as virtual_transpose + * instead. */ void virtual_reconfigure( const std::vector& new_sizes, const std::vector& new_dim_order); + /* + * Set all metadata of this tensor to match the metadata of another tensor. + */ + void virtual_clone(const vTensor& other); + /* * Perform a virtual resize of the vTensor by modifying the size metadata that * gets used in compute shaders. This allows the shader to treat the @@ -515,10 +542,9 @@ class vTensor final { void virtual_resize(const std::vector& new_sizes); /* - * Discard the underlying VkImage or VkBuffer and re-allocate based on new - * tensor sizes + * Transpose the tensor in-place by updating its metadata. */ - void reallocate(const std::vector& new_sizes); + void virtual_transpose(const int64_t dim0, const int64_t dim1); /* * Check if this vTensor instance is a view of another vTensor instance diff --git a/backends/vulkan/runtime/graph/ComputeGraph.cpp b/backends/vulkan/runtime/graph/ComputeGraph.cpp index 8a9ec370f6d..011d62c0ea4 100644 --- a/backends/vulkan/runtime/graph/ComputeGraph.cpp +++ b/backends/vulkan/runtime/graph/ComputeGraph.cpp @@ -198,6 +198,32 @@ std::vector ComputeGraph::sizes_of(const ValueRef idx) const { VK_THROW("Could not get sizes of value with type ", val.type()); } +int64_t ComputeGraph::dim_of(const ValueRef idx) const { + const Value& val = values_.at(idx); + if (val.isTensor()) { + return val.toConstTensor().dim(); + } else if (val.isTensorRef()) { + return val.toConstTensorRef().sizes.size(); + } + VK_THROW("Could not get dim of value with type ", val.type()); +} + +std::vector ComputeGraph::dim_order_of(const ValueRef idx) const { + const Value& val = values_.at(idx); + if (val.isTensor()) { + return val.toConstTensor().dim_order(); + } + VK_THROW("Could not get dim order of value with type ", val.type()); +} + +std::vector ComputeGraph::strides_of(const ValueRef idx) const { + const Value& val = values_.at(idx); + if (val.isTensor()) { + return val.toConstTensor().strides(); + } + VK_THROW("Could not get strides of value with type ", val.type()); +} + vkapi::ScalarType ComputeGraph::dtype_of(const ValueRef idx) const { const Value& val = values_.at(idx); if (val.isTensor()) { @@ -274,6 +300,11 @@ ValueRef ComputeGraph::add_tensor_view(const ValueRef vref) { const vTensorPtr t = get_tensor(vref); ValueRef idx(static_cast(values_.size())); values_.emplace_back(api::vTensor(*t)); + for (SharedObject& sobj : shared_objects_) { + if (sobj.has_user(vref)) { + sobj.add_user(this, idx); + } + } return idx; } @@ -285,6 +316,11 @@ ValueRef ComputeGraph::add_tensor_view( const vTensorPtr t = get_tensor(vref); ValueRef idx(static_cast(values_.size())); values_.emplace_back(api::vTensor(*t, sizes, strides, offset_numel)); + for (SharedObject& sobj : shared_objects_) { + if (sobj.has_user(vref)) { + sobj.add_user(this, idx); + } + } return idx; } @@ -390,6 +426,10 @@ void ComputeGraph::set_symint(const ValueRef idx, const int32_t val) { get_symint(idx)->set(val); } +int32_t ComputeGraph::read_symint(const ValueRef idx) { + return get_symint(idx)->get(); +} + SharedObject& 
ComputeGraph::get_shared_object(const int64_t idx) { if (idx >= shared_objects_.size()) { shared_objects_.resize(static_cast(idx + 1)); @@ -428,7 +468,7 @@ utils::uvec3 ComputeGraph::create_global_wg_size(const ValueRef idx) { if (is_buffer_storage(idx)) { return {uint32_t(numel_of(idx)), 1u, 1u}; } - return image_extents_of(idx); + return logical_limits_of(idx); } utils::uvec3 ComputeGraph::create_local_wg_size( diff --git a/backends/vulkan/runtime/graph/ComputeGraph.h b/backends/vulkan/runtime/graph/ComputeGraph.h index 46787955336..57cc5316612 100644 --- a/backends/vulkan/runtime/graph/ComputeGraph.h +++ b/backends/vulkan/runtime/graph/ComputeGraph.h @@ -254,7 +254,7 @@ class ComputeGraph final { #undef GET_AND_CHECK_VAL_AS_TYPE_FNS inline bool val_is_none(const ValueRef idx) { - return values_.at(idx).isNone(); + return idx == kDummyValueRef ? true : values_.at(idx).isNone(); } inline TypeTag get_val_type(const ValueRef idx) { @@ -282,14 +282,16 @@ class ComputeGraph final { VK_THROW("Could not get sizes of value with type ", val.type()); } - vkapi::ScalarType dtype_of(const ValueRef idx) const; + int64_t dim_of(const ValueRef idx) const; - inline utils::uvec3 image_extents_of(const ValueRef idx) const { - return values_.at(idx).toConstTensor().image_extents(); - } + std::vector dim_order_of(const ValueRef idx) const; + + std::vector strides_of(const ValueRef idx) const; + + vkapi::ScalarType dtype_of(const ValueRef idx) const; - inline utils::uvec3 mapped_extents_of(const ValueRef idx) const { - return values_.at(idx).toConstTensor().mapped_extents(); + inline const utils::ivec3& logical_limits_of(const ValueRef idx) const { + return values_.at(idx).toConstTensor().logical_limits(); } inline int32_t numel_of(const ValueRef idx) const { @@ -311,12 +313,17 @@ class ComputeGraph final { .is_view_of(values_.at(base).toConstTensor()); } - inline utils::GPUMemoryLayout memory_layout_of(const ValueRef idx) const { - return values_.at(idx).toConstTensor().gpu_memory_layout(); + inline utils::GPUMemoryLayout estimate_memory_layout_of( + const ValueRef idx) const { + return values_.at(idx).toConstTensor().estimate_memory_layout(); } - inline int32_t packed_dim_whcn_idx_of(const ValueRef idx) const { - return values_.at(idx).toConstTensor().packed_dim_whcn_idx(); + inline int32_t packed_dim_of(const ValueRef idx) const { + return values_.at(idx).toConstTensor().packed_dim(); + } + + inline int32_t concat_dim_of(const ValueRef idx) const { + return values_.at(idx).toConstTensor().concat_dim(); } inline vkapi::BufferBindInfo sizes_ubo(const ValueRef idx) { @@ -331,12 +338,12 @@ class ComputeGraph final { return values_.at(idx).toTensor().numel_ubo(); } - inline vkapi::BufferBindInfo axis_mapping_ubo(const ValueRef idx) { - return values_.at(idx).toTensor().axis_mapping_ubo(); + inline vkapi::BufferBindInfo axis_map_ubo(const ValueRef idx) { + return values_.at(idx).toTensor().axis_map_ubo(); } - inline vkapi::BufferBindInfo texture_limits_ubo(const ValueRef idx) { - return values_.at(idx).toTensor().texture_limits_ubo(); + inline vkapi::BufferBindInfo logical_limits_ubo(const ValueRef idx) { + return values_.at(idx).toTensor().logical_limits_ubo(); } // @@ -371,6 +378,19 @@ class ComputeGraph final { return values_.at(idx).toString(); } + template < + typename T, + typename std::enable_if< + std::is_integral::value && std::is_signed::value, + int>::type = 0> + T extract_whcn_dim(const ValueRef idx, const int64_t ndim) { + T dim = extract_scalar(idx); + // Normalize dim to account for negative 
indexing + dim = (dim % ndim + ndim) % ndim; + // Assume original value is NCHW ordering, obtain the WHCN ordering + return ndim - 1 - dim; + } + // // Utility functions // @@ -533,6 +553,8 @@ class ComputeGraph final { void set_symint(const ValueRef idx, const int32_t val); + int32_t read_symint(const ValueRef idx); + /* * Convenience function to add an input tensor along with its staging buffer */ diff --git a/backends/vulkan/runtime/graph/containers/SharedObject.cpp b/backends/vulkan/runtime/graph/containers/SharedObject.cpp index 0d8b77a5b74..10ddd6f2ca3 100644 --- a/backends/vulkan/runtime/graph/containers/SharedObject.cpp +++ b/backends/vulkan/runtime/graph/containers/SharedObject.cpp @@ -12,13 +12,14 @@ namespace vkcompute { +bool SharedObject::has_user(const ValueRef idx) const { + return std::find(users.begin(), users.end(), idx) != users.end(); +} + void SharedObject::add_user(ComputeGraph* const graph, const ValueRef idx) { vTensorPtr t = graph->get_tensor(idx); - // // Aggregate Memory Requirements - // - const VkMemoryRequirements mem_reqs = t->get_memory_requirements(); aggregate_memory_requirements.size = std::max(mem_reqs.size, aggregate_memory_requirements.size); @@ -26,27 +27,6 @@ void SharedObject::add_user(ComputeGraph* const graph, const ValueRef idx) { std::max(mem_reqs.alignment, aggregate_memory_requirements.alignment); aggregate_memory_requirements.memoryTypeBits |= mem_reqs.memoryTypeBits; - // - // Aggregate Allocation Create Info - // - - const VmaAllocationCreateInfo create_info = t->get_allocation_create_info(); - // Clear out CREATE_STRATEGY bit flags in case of conflict - VmaAllocationCreateFlags clear_mask = ~VMA_ALLOCATION_CREATE_STRATEGY_MASK; - VmaAllocationCreateFlags create_flags = create_info.flags & clear_mask; - // Use the default allocation strategy - aggregate_create_info.flags = - create_flags | vkapi::DEFAULT_ALLOCATION_STRATEGY; - - // Set the usage flag if it is currently not set - if (aggregate_create_info.usage == VMA_MEMORY_USAGE_UNKNOWN) { - aggregate_create_info.usage = create_info.usage; - } - // Otherwise check that there is no conflict regarding usage - VK_CHECK_COND(aggregate_create_info.usage == create_info.usage); - aggregate_create_info.requiredFlags |= create_info.requiredFlags; - aggregate_create_info.preferredFlags |= create_info.preferredFlags; - users.emplace_back(idx); } @@ -54,8 +34,12 @@ void SharedObject::allocate(ComputeGraph* const graph) { if (aggregate_memory_requirements.size == 0) { return; } + + VmaAllocationCreateInfo alloc_create_info = + graph->context()->adapter_ptr()->vma().gpuonly_resource_create_info(); + allocation = graph->context()->adapter_ptr()->vma().create_allocation( - aggregate_memory_requirements, aggregate_create_info); + aggregate_memory_requirements, alloc_create_info); } void SharedObject::bind_users(ComputeGraph* const graph) { diff --git a/backends/vulkan/runtime/graph/containers/SharedObject.h b/backends/vulkan/runtime/graph/containers/SharedObject.h index 37e80257f46..f9b16e6c202 100644 --- a/backends/vulkan/runtime/graph/containers/SharedObject.h +++ b/backends/vulkan/runtime/graph/containers/SharedObject.h @@ -28,10 +28,10 @@ struct SharedObject { explicit SharedObject() = default; VkMemoryRequirements aggregate_memory_requirements; - VmaAllocationCreateInfo aggregate_create_info; std::vector users; vkapi::Allocation allocation; + bool has_user(const ValueRef idx) const; void add_user(ComputeGraph* const graph, const ValueRef idx); void allocate(ComputeGraph* const graph); void 
bind_users(ComputeGraph* const graph); diff --git a/backends/vulkan/runtime/graph/containers/SymInt.cpp b/backends/vulkan/runtime/graph/containers/SymInt.cpp index c91db84b787..a59a2d40141 100644 --- a/backends/vulkan/runtime/graph/containers/SymInt.cpp +++ b/backends/vulkan/runtime/graph/containers/SymInt.cpp @@ -17,6 +17,10 @@ void SymInt::set(const int32_t val) { gpu_buffer.update(val); } +int32_t SymInt::get() { + return gpu_buffer.read(); +} + void SymInt::operator=(const int32_t val) { gpu_buffer.update(val); } diff --git a/backends/vulkan/runtime/graph/containers/SymInt.h b/backends/vulkan/runtime/graph/containers/SymInt.h index 0c9fbee5fe2..bd361aabe5a 100644 --- a/backends/vulkan/runtime/graph/containers/SymInt.h +++ b/backends/vulkan/runtime/graph/containers/SymInt.h @@ -35,6 +35,8 @@ struct SymInt final { void set(const int32_t val); + int32_t get(); + void operator=(const int32_t val); }; diff --git a/backends/vulkan/runtime/graph/ops/ExecuteNode.h b/backends/vulkan/runtime/graph/ops/ExecuteNode.h index dece9ddb50d..92efda30229 100644 --- a/backends/vulkan/runtime/graph/ops/ExecuteNode.h +++ b/backends/vulkan/runtime/graph/ops/ExecuteNode.h @@ -21,16 +21,16 @@ class ComputeGraph; * access permission. */ struct ArgGroup { - ArgGroup(const ValueRef ref, const vkapi::MemoryAccessType access) + ArgGroup(const ValueRef ref, const vkapi::MemoryAccessFlags access) : refs{ref}, access(access) {} ArgGroup( const std::vector& refs, - const vkapi::MemoryAccessType access) + const vkapi::MemoryAccessFlags access) : refs(refs), access(access) {} const std::vector refs; - const vkapi::MemoryAccessType access; + const vkapi::MemoryAccessFlags access; }; /* diff --git a/backends/vulkan/runtime/graph/ops/glsl/activations.h b/backends/vulkan/runtime/graph/ops/glsl/activations.h index c5ee3b20855..94c9e1274de 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/activations.h +++ b/backends/vulkan/runtime/graph/ops/glsl/activations.h @@ -18,7 +18,7 @@ float hardswish(float x) { vec4 hardswish(vec4 tex) { return vec4( - hardswish(tex.x), hardswish(tex.y), hardswish(tex.z), hardswish(tex.z)); + hardswish(tex.x), hardswish(tex.y), hardswish(tex.z), hardswish(tex.w)); } float hardshrink(float x, float lambda, float neg_lambda) { @@ -30,3 +30,15 @@ vec4 hardshrink(vec4 tex, float lambda, float neg_lambda) { (vec4(greaterThan(tex, vec4(lambda))) + vec4(lessThan(tex, vec4(neg_lambda)))); } + +float hardsigmoid(float x) { + return mix(float(x >= 0.0), x / 6 + 0.5, float(abs(x) <= 3.0)); +} + +vec4 hardsigmoid(vec4 tex) { + return vec4( + hardsigmoid(tex.x), + hardsigmoid(tex.y), + hardsigmoid(tex.z), + hardsigmoid(tex.w)); +} diff --git a/backends/vulkan/runtime/graph/ops/glsl/addmm_naive.glsl b/backends/vulkan/runtime/graph/ops/glsl/addmm_naive.glsl deleted file mode 100644 index dbc87eb7944..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/addmm_naive.glsl +++ /dev/null @@ -1,79 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
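The hardsigmoid helper added above implements the usual piecewise definition: 0 for x <= -3, 1 for x >= 3, and x / 6 + 0.5 in between; the mix() call selects between the saturated value and the linear segment based on |x| <= 3. A small host-side check of that equivalence (illustrative only, not part of the test suite):

#include <algorithm>
#include <cassert>
#include <cmath>
#include <initializer_list>

// Reference definition: clamp(x / 6 + 0.5, 0, 1).
float hardsigmoid_ref(float x) {
  return std::min(std::max(x / 6.0f + 0.5f, 0.0f), 1.0f);
}

// Mirrors the mix()-based formulation from the shader.
float hardsigmoid_mix(float x) {
  const float saturated = (x >= 0.0f) ? 1.0f : 0.0f;
  const float linear = x / 6.0f + 0.5f;
  return (std::fabs(x) <= 3.0f) ? linear : saturated;
}

int main() {
  for (float x : {-4.0f, -3.0f, -1.5f, 0.0f, 2.0f, 3.0f, 5.0f}) {
    assert(std::fabs(hardsigmoid_ref(x) - hardsigmoid_mix(x)) < 1e-6f);
  }
  return 0;
}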
- */ - -#version 450 core - -#define PRECISION ${PRECISION} - -$if MAT2_IS_TRANSPOSED: - #define MAT2_IS_TRANSPOSED - -#include "indexing_utils.h" -#include "matmul.h" - -layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly image3D im_out; -layout(set = 0, binding = 1) uniform PRECISION ${SAMPLER_T[NDIM][DTYPE]} im_mat1; -layout(set = 0, binding = 2) uniform PRECISION ${SAMPLER_T[NDIM][DTYPE]} im_mat2; -layout(set = 0, binding = 3) uniform PRECISION ${SAMPLER_T[NDIM][DTYPE]} im_self; - -layout(set = 0, binding = 4) uniform PRECISION restrict OutLimits { - ivec3 out_limits; -}; - -layout(set = 0, binding = 5) uniform PRECISION restrict InSizes { - ivec4 in_sizes; -}; - -layout(set = 0, binding = 6) uniform PRECISION restrict SelfSizes { - ivec3 self_sizes; -}; - -layout(set = 0, binding = 7) uniform PRECISION restrict AddmmParams { - float alpha; - float beta; -}; - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -void main() { - const ivec3 pos = ivec3(gl_GlobalInvocationID); - - if (any(greaterThanEqual(pos, out_limits))) { - return; - } - - vec4 texel = vec4(0); - - $if MAT1_PACKING == "W_packed": - $if MAT2_PACKING == "H_packed": - ivec3 mat2_pos = ivec3(pos.x * 4, 0, pos.z); - texel = matmul_naive_W_packed_H_packed( - im_mat1, - im_mat2, - pos, - in_sizes[0]); - $elif MAT2_PACKING == "W_packed": - texel = matmul_naive_W_packed_W_packed( - im_mat1, - im_mat2, - pos, - in_sizes[0]); - $else: - $raise Exception("Unsupported value for MAT2_PACKING") - $else: - $raise Exception("Unsupported value combo for MAT1_PACKING and MAT2_PACKING") - - vec4 self_texel = get_texel_W_packed( - im_self, - pos, - self_sizes.x == 1, - self_sizes.y == 1); - - texel = beta * self_texel + alpha * texel; - imageStore(im_out, pos, texel); -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/addmm_naive_texture3d.glsl b/backends/vulkan/runtime/graph/ops/glsl/addmm_naive_texture3d.glsl new file mode 100644 index 00000000000..3d9bf885df6 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/addmm_naive_texture3d.glsl @@ -0,0 +1,174 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#version 450 core + +#define PRECISION ${PRECISION} + +$if MAT2_IS_TRANSPOSED: + #define MAT2_IS_TRANSPOSED + +$if HAS_BIAS: + #define HAS_BIAS + +#include "indexing_utils.h" + +${layout_declare_tensor(B, "w", "out_tensor", DTYPE, "texture3d")} +${layout_declare_tensor(B, "r", "mat1_tensor", DTYPE, "texture3d")} +${layout_declare_tensor(B, "r", "mat2_tensor", DTYPE, "texture3d")} +$if HAS_BIAS: + ${layout_declare_tensor(B, "r", "bias_tensor", DTYPE, "texture3d")} +${layout_declare_ubo(B, "ivec4", "out_sizes")} +${layout_declare_ubo(B, "ivec3", "out_limits")} +${layout_declare_ubo(B, "ivec4", "out_axis_map")} +${layout_declare_ubo(B, "ivec4", "mat1_sizes")} +${layout_declare_ubo(B, "ivec4", "mat1_axis_map")} +${layout_declare_ubo(B, "ivec4", "mat2_sizes")} +${layout_declare_ubo(B, "ivec4", "mat2_axis_map")} +$if HAS_BIAS: + ${layout_declare_ubo(B, "ivec4", "bias_sizes")} + ${layout_declare_ubo(B, "ivec4", "bias_axis_map")} + ${layout_declare_ubo(B, "float", "alpha", "float", "beta")} + +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; + +layout(constant_id = 3) const int out_packed_dim = C_DIM; +layout(constant_id = 4) const int mat1_packed_dim = W_DIM; +layout(constant_id = 5) const int mat2_packed_dim = H_DIM; +layout(constant_id = 6) const int bias_packed_dim = W_DIM; + +#ifdef HAS_BIAS +vec4 get_bias_texel_W_packed(ivec3 logical_pos) { + ivec3 bias_pos = ivec3(0); + if (bias_sizes.y == 1) { + bias_pos[bias_axis_map.y] = 0; + } else { + bias_pos[bias_axis_map.y] = logical_pos.y; + } + if (bias_sizes.x == 1) { + bias_pos[bias_axis_map.x] = 0; + vec4 bias_texel = texelFetch(bias_tensor, bias_pos, 0); + // Only the first value is valid, the rest is 0 padding + return vec4(bias_texel.x); + } else { + bias_pos[bias_axis_map.x] = logical_pos.x; + } + + return texelFetch(bias_tensor, bias_pos, 0); +} +#endif // HAS_BIAS + +vec4 matmul_naive_k_dim_packed(const ivec3 out_lpos) { + ivec3 mat1_pos; + mat1_pos[mat1_axis_map.x] = 0; + mat1_pos[mat1_axis_map.y] = out_lpos.y; + mat1_pos[mat1_axis_map.z] = out_lpos.z; +#ifdef MAT2_IS_TRANSPOSED + const int mat2_k_axis = mat2_axis_map.x; + const int mat2_row_axis = mat2_axis_map.y; +#else + const int mat2_k_axis = mat2_axis_map.y; + const int mat2_row_axis = mat2_axis_map.x; +#endif // MAT2_IS_TRANSPOSED + + vec4 texel = vec4(0); + const int K = divup4(mat1_sizes.x); + + for (int i = 0; i < K; ++i) { + const vec4 mat1_tex = texelFetch(mat1_tensor, mat1_pos, 0); + + vec4 sums; + for (int r = 0; r < 4; ++r) { + // On-demand construction of mat2_pos appears to provide the lowest + // latency. Surprisingly, this doesn't translate to mat1_pos. 
+ ivec3 mat2_pos = ivec3(0); + mat2_pos[mat2_k_axis] = i; + mat2_pos[mat2_row_axis] = out_lpos.x * 4 + r; +#ifndef MAT2_IS_TRANSPOSED + mat2_pos[mat2_axis_map.z] = out_lpos.z; +#endif // MAT2_IS_TRANSPOSED + sums[r] = dot(mat1_tex, texelFetch(mat2_tensor, mat2_pos, 0)); + } + + texel += sums; + + mat1_pos[mat1_axis_map.x]++; + } + + return texel; +} + +vec4 matmul_naive_k_dim_packed_row_dim_packed(const ivec3 out_lpos) { + ivec3 mat1_pos; + mat1_pos[mat1_axis_map.x] = 0; + mat1_pos[mat1_axis_map.y] = out_lpos.y; + mat1_pos[mat1_axis_map.z] = out_lpos.z; + + ivec3 mat2_pos; + mat2_pos[mat2_axis_map.x] = out_lpos.x; + mat2_pos[mat2_axis_map.y] = 0; + mat2_pos[mat2_axis_map.z] = out_lpos.z; + + ivec3 mat2_pos_offset = ivec3(0); + mat2_pos_offset[mat2_axis_map.y] = 1; + + const int mat2_y_axis = mat2_axis_map.y; + + vec4 texel = vec4(0); + const int K = divup4(mat1_sizes.x); + + for (int i = 0; + i < K; + ++i, mat1_pos[mat1_axis_map.x]++, mat2_pos[mat2_axis_map.y]+=4) { + const vec4 mat1_tex = texelFetch(mat1_tensor, mat1_pos, 0); + + for (int r = 0; r < 4; ++r) { + // On-demand construction of mat2_pos appears to provide the lowest + // latency. Surprisingly, this doesn't translate to mat1_pos. + ivec3 mat2_pos = ivec3(0); + mat2_pos[mat2_axis_map.x] = out_lpos.x; + mat2_pos[mat2_axis_map.y] = 4 * i + r; + mat2_pos[mat2_axis_map.z] = out_lpos.z; + + vec4 mat1_comp_vec = vec4(mat1_tex[r]); + texel = fma(mat1_comp_vec, texelFetch(mat2_tensor, mat2_pos, 0), texel); + } + } + + return texel; +} + +void main() { + const ivec3 out_lpos = ivec3(gl_GlobalInvocationID); + if (any(greaterThanEqual(out_lpos, out_limits))) { + return; + } + + vec4 texel = vec4(0); + +#ifdef MAT2_IS_TRANSPOSED + if (mat2_packed_dim == W_DIM) { + texel = matmul_naive_k_dim_packed(out_lpos); + } else { + texel = matmul_naive_k_dim_packed_row_dim_packed(out_lpos); + } +#else + if (mat2_packed_dim == W_DIM) { + texel = matmul_naive_k_dim_packed_row_dim_packed(out_lpos); + } else { + texel = matmul_naive_k_dim_packed(out_lpos); + } +#endif // MAT2_IS_TRANSPOSED + +#ifdef HAS_BIAS + vec4 bias_texel = get_bias_texel_W_packed(out_lpos); + texel = beta * bias_texel + alpha * texel; +#endif // HAS_BIAS + + write_texel_lpos(out_tensor, out_lpos, texel, out_axis_map); +} diff --git a/backends/vulkan/runtime/graph/ops/glsl/addmm_naive.yaml b/backends/vulkan/runtime/graph/ops/glsl/addmm_naive_texture3d.yaml similarity index 61% rename from backends/vulkan/runtime/graph/ops/glsl/addmm_naive.yaml rename to backends/vulkan/runtime/graph/ops/glsl/addmm_naive_texture3d.yaml index 48db85cb56e..33b617eed13 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/addmm_naive.yaml +++ b/backends/vulkan/runtime/graph/ops/glsl/addmm_naive_texture3d.yaml @@ -4,21 +4,21 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
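The main() of addmm_naive_texture3d.glsl above picks between the two naive kernels based on whether mat2 is transposed and which of its dims is packed. The host-side restatement below summarizes that selection; the enum and function names are illustrative, and W_DIM == 0 follows the WHCN convention used by the shaders.

enum class NaiveKernel { kKDimPacked, kKDimPackedRowDimPacked };

NaiveKernel select_naive_kernel(bool mat2_is_transposed, int mat2_packed_dim) {
  constexpr int W_DIM = 0;
  if (mat2_is_transposed) {
    // Transposed mat2: the reduction (K) dim runs along its width, so a
    // width-packed mat2 uses the K-dim-packed path.
    return mat2_packed_dim == W_DIM ? NaiveKernel::kKDimPacked
                                    : NaiveKernel::kKDimPackedRowDimPacked;
  }
  // Non-transposed, width-packed mat2: K runs along its height, so use the
  // row-dim-packed variant instead.
  return mat2_packed_dim == W_DIM ? NaiveKernel::kKDimPackedRowDimPacked
                                  : NaiveKernel::kKDimPacked;
}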
-addmm_naive: +addmm_naive_texture3d: parameter_names_with_default_values: DTYPE: float - NDIM: 3 - MAT1_PACKING: W_packed - MAT2_PACKING: H_packed MAT2_IS_TRANSPOSED: false + HAS_BIAS: true generate_variant_forall: DTYPE: - VALUE: float - VALUE: half shader_variants: - - NAME: addmm_naive_W_packed_H_packed - - NAME: addmm_naive_W_packed_W_packed - MAT2_PACKING: W_packed - - NAME: linear_naive_W_packed_W_packed - MAT2_PACKING: W_packed + - NAME: addmm_naive_texture3d + - NAME: matmul_naive_texture3d + HAS_BIAS: false + - NAME: linear_naive_texture3d MAT2_IS_TRANSPOSED: true + - NAME: matmul_transposed_naive_texture3d + MAT2_IS_TRANSPOSED: true + HAS_BIAS: false diff --git a/backends/vulkan/runtime/graph/ops/glsl/addmm_optimized.glsl b/backends/vulkan/runtime/graph/ops/glsl/addmm_optimized.glsl index 6e964c745e3..ad794d6db49 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/addmm_optimized.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/addmm_optimized.glsl @@ -27,14 +27,14 @@ ${layout_declare_tensor(B, "r", "mat2_tensor", DTYPE, "texture3d")} $if HAS_BIAS: ${layout_declare_tensor(B, "r", "bias_tensor", DTYPE, "texture3d")} ${layout_declare_ubo(B, "ivec4", "out_sizes")} -${layout_declare_ubo(B, "ivec4", "out_axis_mapping")} +${layout_declare_ubo(B, "ivec4", "out_axis_map")} ${layout_declare_ubo(B, "ivec4", "mat1_sizes")} -${layout_declare_ubo(B, "ivec4", "mat1_axis_mapping")} +${layout_declare_ubo(B, "ivec4", "mat1_axis_map")} ${layout_declare_ubo(B, "ivec4", "mat2_sizes")} -${layout_declare_ubo(B, "ivec4", "mat2_axis_mapping")} +${layout_declare_ubo(B, "ivec4", "mat2_axis_map")} $if HAS_BIAS: ${layout_declare_ubo(B, "ivec4", "bias_sizes")} - ${layout_declare_ubo(B, "ivec4", "bias_axis_mapping")} + ${layout_declare_ubo(B, "ivec4", "bias_axis_map")} ${layout_declare_ubo(B, "float", "alpha", "float", "beta")} layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; @@ -68,10 +68,10 @@ struct FloatMatrix_3d { vec4 get_texel_C_packed(const ivec2 idx) { ivec3 bias_pos = ivec3(0); if (bias_sizes.x > 1) { - bias_pos[bias_axis_mapping.x] = idx.x; + bias_pos[bias_axis_map.x] = idx.x; } if (bias_sizes.y > 1) { - bias_pos[bias_axis_mapping.y] = idx.y; + bias_pos[bias_axis_map.y] = idx.y; } return texelFetch(bias_tensor, bias_pos, 0); @@ -95,11 +95,11 @@ FloatMatrix matmul_partial(const ivec4 out_idx_tl) { vec4 mat2_tensor_partial_load[FOUR]; #ifdef MAT2_IS_TRANSPOSED - const int mat2_k_axis = mat2_axis_mapping.x; - const int mat2_row_axis = mat2_axis_mapping.y; + const int mat2_k_axis = mat2_axis_map.x; + const int mat2_row_axis = mat2_axis_map.y; #else - const int mat2_k_axis = mat2_axis_mapping.y; - const int mat2_row_axis = mat2_axis_mapping.x; + const int mat2_k_axis = mat2_axis_map.y; + const int mat2_row_axis = mat2_axis_map.x; #endif // MAT2_IS_TRANSPOSED #ifdef BATCH_MODE @@ -113,10 +113,10 @@ FloatMatrix matmul_partial(const ivec4 out_idx_tl) { // read and cache (4 x TILE_ROWS) tile of mat1 for (int r = 0; r < TILE_ROWS; r++) { ivec3 mat1_pos = ivec3(0); - mat1_pos[mat1_axis_mapping.x] = k_div4; - mat1_pos[mat1_axis_mapping.y] = out_idx_tl.y + r; + mat1_pos[mat1_axis_map.x] = k_div4; + mat1_pos[mat1_axis_map.y] = out_idx_tl.y + r; #ifdef BATCH_MODE - mat1_pos[mat1_axis_mapping.z] = out_idx_tl.z + batch_idx; + mat1_pos[mat1_axis_map.z] = out_idx_tl.z + batch_idx; #endif // BATCH_MODE mat1_tensor_partial_load[r] = texelFetch(mat1_tensor, mat1_pos, 0); @@ -128,7 +128,7 @@ FloatMatrix matmul_partial(const ivec4 out_idx_tl) { mat2_pos[mat2_k_axis] = k_div4; 
mat2_pos[mat2_row_axis] = out_idx_tl.x + r; #if defined(BATCH_MODE) && !defined(MAT2_IS_TRANSPOSED) - mat2_pos[mat2_axis_mapping.z] = out_idx_tl.z + batch_idx; + mat2_pos[mat2_axis_map.z] = out_idx_tl.z + batch_idx; #endif // BATCH_MODE mat2_tensor_partial_load[r] = texelFetch(mat2_tensor, mat2_pos, 0); @@ -158,22 +158,22 @@ FloatMatrix matmul_partial(const ivec4 out_idx_tl) { // void write_results_C_packed(const ivec4 out_idx_tl, FloatMatrix results) { - ivec3 out_pos = to_texture_pos( - out_idx_tl, out_sizes, out_axis_mapping, out_packed_dim); + ivec3 out_pos = tidx_to_pos( + out_idx_tl, out_sizes, out_axis_map, out_packed_dim); for (int tile_c = 0; tile_c < TILE_ROWS; - tile_c++, out_pos[out_axis_mapping.y]++) { - out_pos[out_axis_mapping.x] = out_idx_tl.x; + tile_c++, out_pos[out_axis_map.y]++) { + out_pos[out_axis_map.x] = out_idx_tl.x; for (int tile_r = 0; tile_r < FOUR; - tile_r++, out_pos[out_axis_mapping.x]++) { + tile_r++, out_pos[out_axis_map.x]++) { #ifdef HAS_BIAS ivec2 bias_idx; - bias_idx[bias_axis_mapping.x] = out_pos[out_axis_mapping.x]; - bias_idx[bias_axis_mapping.y] = out_pos[out_axis_mapping.y]; + bias_idx[bias_axis_map.x] = out_pos[out_axis_map.x]; + bias_idx[bias_axis_map.y] = out_pos[out_axis_map.y]; float bias_val = get_texel_C_packed(bias_idx).x; #ifdef BATCH_MODE vec4 bias_texel = vec4(bias_val); diff --git a/backends/vulkan/runtime/graph/ops/glsl/binary_op.glsl b/backends/vulkan/runtime/graph/ops/glsl/binary_op.glsl index ec7e1da296c..3103c92aea1 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/binary_op.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/binary_op.glsl @@ -19,38 +19,43 @@ layout(std430) buffer; -${layout_declare_tensor(0, "w", "t_out", DTYPE, STORAGE)} -${layout_declare_tensor(1, "r", "t_in", DTYPE, STORAGE)} -${layout_declare_tensor(2, "r", "t_other", DTYPE, STORAGE)} -${layout_declare_ubo(3, "ivec4", "out_sizes")} -${layout_declare_ubo(4, "ivec4", "in_sizes")} -${layout_declare_ubo(5, "ivec4", "other_sizes")} -${layout_declare_ubo(6, "ivec2", "broadcast_params")} -${layout_declare_ubo(7, "float", "alpha")} +${layout_declare_tensor(B, "w", "t_out", DTYPE, STORAGE)} +${layout_declare_tensor(B, "r", "t_in", DTYPE, STORAGE)} +${layout_declare_tensor(B, "r", "t_other", DTYPE, STORAGE)} +${layout_declare_ubo(B, "ivec4", "out_sizes")} +${layout_declare_ubo(B, "ivec4", "out_axis_map")} +${layout_declare_ubo(B, "ivec4", "in_sizes")} +${layout_declare_ubo(B, "ivec4", "in_axis_map")} +${layout_declare_ubo(B, "ivec4", "other_sizes")} +${layout_declare_ubo(B, "ivec4", "other_axis_map")} +${layout_declare_ubo(B, "ivec2", "broadcast_params")} +${layout_declare_ubo(B, "float", "alpha")} layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; layout(constant_id = 3) const int packed_dim = C_DIM; void main() { - const ivec3 pos = ivec3(gl_GlobalInvocationID); - const ivec4 idx = to_tensor_idx(pos, out_sizes, packed_dim); + const ivec3 lpos = ivec3(gl_GlobalInvocationID); + const ivec4 tidx = lpos_to_tidx(lpos, out_sizes, out_axis_map.w, packed_dim); - if (any(greaterThanEqual(idx, out_sizes))) { + if (any(greaterThanEqual(tidx, out_sizes))) { return; } - ivec4 in_idx = broadcast_indices(idx, in_sizes); - VEC4_T in_texel = VEC4_T(texelFetch( + // broadcast on logical sizes + ivec4 in_idx = broadcast_indices(tidx, in_sizes); + VEC4_T in_texel = VEC4_T(load_texel( t_in, - to_texture_pos(in_idx, in_sizes, packed_dim), - 0)); + // read axis mapped texel + tidx_to_pos(in_idx, in_sizes, in_axis_map, packed_dim))); - ivec4 other_idx = 
broadcast_indices(idx, other_sizes); - VEC4_T other_texel = VEC4_T(texelFetch( + // broadcast on logical sizes + ivec4 other_idx = broadcast_indices(tidx, other_sizes); + VEC4_T other_texel = VEC4_T(load_texel( t_other, - to_texture_pos(other_idx, other_sizes, packed_dim), - 0)); + // read axis mapped texel + tidx_to_pos(other_idx, other_sizes, other_axis_map, packed_dim))); // Check boolean broadcast flags; we use ivec2 instead of bvec2 for alignment. if (broadcast_params.x > 0) { @@ -60,5 +65,9 @@ void main() { other_texel = other_texel.xxxx; } - imageStore(t_out, pos, VEC4_T(op(in_texel, other_texel, alpha))); + write_texel_lpos( + t_out, + lpos, + VEC4_T(op(in_texel, other_texel, alpha)), + out_axis_map); } diff --git a/backends/vulkan/runtime/graph/ops/glsl/buffer_to_nchw.glsl b/backends/vulkan/runtime/graph/ops/glsl/buffer_to_nchw.glsl index 58796879e85..201b4d17262 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/buffer_to_nchw.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/buffer_to_nchw.glsl @@ -23,13 +23,13 @@ layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; layout(constant_id = 3) const int UNUSED_packed_dim = W_DIM; void main() { - int out_id = int(gl_GlobalInvocationID.x); - if (out_id >= numel) { + int nchwi = int(gl_GlobalInvocationID.x); + if (nchwi >= numel) { return; } - ivec4 t_in_idx = from_nchw_buffer_i(out_id, in_sizes); - const int in_id = to_buffer_id(t_in_idx, in_strides); + ivec4 in_tidx = nchwi_to_tidx(nchwi, in_sizes); + const int in_bufi = tidx_to_bufi(in_tidx, in_strides); - nchw_buf[out_id] = t_in[in_id]; + nchw_buf[nchwi] = t_in[in_bufi]; } diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv1d.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv1d.glsl index 35f8e25fc25..fe6d7ba7a96 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/conv1d.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/conv1d.glsl @@ -18,32 +18,22 @@ layout(std430) buffer; -layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly ${IMAGE_T[NDIM][DTYPE]} image_out; -layout(set = 0, binding = 1) uniform PRECISION sampler3D image_in; -layout(set = 0, binding = 2) uniform PRECISION sampler3D kernel_in; -layout(set = 0, binding = 3) uniform PRECISION sampler3D bias_in; - -layout(set = 0, binding = 4) uniform PRECISION restrict OutLimits { - ivec3 out_limits; -}; - -layout(set = 0, binding = 5) uniform PRECISION restrict InSizes { - ivec4 in_sizes; -}; - -layout(set = 0, binding = 6) uniform PRECISION restrict Params { - int kernel_size; - int stride; - int padding; - int dilation; - int in_group_size; - int out_group_size; -}; - -layout(set = 0, binding = 7) uniform PRECISION restrict OutputParams { - float out_min; - float out_max; -}; +${layout_declare_tensor(B, "w", "t_out", DTYPE, STORAGE)} +${layout_declare_tensor(B, "r", "t_in", DTYPE, STORAGE)} +${layout_declare_tensor(B, "r", "kernel_in", DTYPE, STORAGE)} +${layout_declare_tensor(B, "r", "bias_in", DTYPE, STORAGE)} + +${layout_declare_ubo(B, "ivec3", "out_limits")} +${layout_declare_ubo(B, "ivec4", "in_sizes")} + +${layout_declare_ubo(B, "ivec4", "out_axis_map")} +${layout_declare_ubo(B, "ivec4", "in_axis_map")} +${layout_declare_ubo(B, "ivec4", "kernel_axis_map")} +${layout_declare_ubo(B, "ivec4", "bias_axis_map")} + +${layout_declare_ubo(B,"int", "kernel_size", "int", "stride", "int", "padding", "int", "dilation", "int", "in_group_size", "int", "out_group_size")} + +${layout_declare_ubo(B, "float", "out_min", "float", "out_max")} layout(local_size_x_id = 0, 
local_size_y_id = 1, local_size_z_id = 2) in; @@ -67,9 +57,9 @@ layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; // shader invocations, where each invocation computes 1 result. But that // performs worse. void main() { - const ivec3 pos = ivec3(gl_GlobalInvocationID); + const ivec3 lpos = ivec3(gl_GlobalInvocationID); - if (any(greaterThanEqual(pos, out_limits))) { + if (any(greaterThanEqual(lpos, out_limits))) { return; } @@ -78,8 +68,8 @@ void main() { // "out_c" is the output's channel index where we write our result. // Across shader invocations, this is the only value that varies. - int out_c = pos.y; - vec4 bias = texelFetch(bias_in, ivec3(out_c, 0, 0), 0); + int out_c = lpos.y; + VEC4_T bias = load_texel_lpos(bias_in, ivec3(out_c, 0, 0), bias_axis_map); // "in_c" tracks the input's channel start index. // We iterate over the input group that corresponds to the output group. @@ -98,7 +88,7 @@ void main() { int out_l = 0; for (int in_l = l_start; in_l < l_end; in_l += stride, ++out_l) { - vec4 sum = vec4(0); + VEC4_T sum = VEC4_T(0); for (int in_c = c_start; in_c < c_end; ++in_c) { // "k" tracks the kernel's index for our input-kernel computation. @@ -107,25 +97,25 @@ void main() { for (int k = 0; k < kernel_size; k += 4) { // Since the weight tensor is width-packed, which is along the length // dimension, we can batch-read four elements at a time. - const ivec3 w_pos = ivec3(k / 4, in_c % in_group_size, out_c); - const vec4 weight = texelFetch(kernel_in, w_pos, 0); + const ivec3 w_lpos = ivec3(k / 4, in_c % in_group_size, out_c); + const VEC4_T weight = load_texel_lpos(kernel_in, w_lpos, kernel_axis_map); - const ivec3 in_pos_0 = ivec3(in_l + k * dilation, in_c, n / 4); - sum = fma(weight.xxxx, texelFetch(image_in, in_pos_0, 0), sum); + ivec3 in_pos = lpos_to_pos(ivec3(in_l + k * dilation, in_c, n / 4), in_axis_map); + sum = fma(weight.xxxx, load_texel(t_in, in_pos), sum); - const ivec3 in_pos_1 = ivec3(in_l + (k+1) * dilation, in_c, n / 4); - sum = fma(weight.yyyy, texelFetch(image_in, in_pos_1, 0), sum); + in_pos[in_axis_map.x] += dilation; + sum = fma(weight.yyyy, load_texel(t_in, in_pos), sum); - const ivec3 in_pos_2 = ivec3(in_l + (k+2) * dilation, in_c, n / 4); - sum = fma(weight.zzzz, texelFetch(image_in, in_pos_2, 0), sum); + in_pos[in_axis_map.x] += dilation; + sum = fma(weight.zzzz, load_texel(t_in, in_pos), sum); - const ivec3 in_pos_3 = ivec3(in_l + (k+3) * dilation, in_c, n / 4); - sum = fma(weight.wwww, texelFetch(image_in, in_pos_3, 0), sum); + in_pos[in_axis_map.x] += dilation; + sum = fma(weight.wwww, load_texel(t_in, in_pos), sum); } } - ivec3 out_pos = ivec3(out_l, out_c, n / 4); - imageStore(image_out, out_pos, op(sum + bias.x, out_min, out_max)); + const ivec3 out_lpos = ivec3(out_l, out_c, n / 4); + write_texel_lpos(t_out, out_lpos, op(sum + bias.x, out_min, out_max), out_axis_map); } } } diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv1d.yaml b/backends/vulkan/runtime/graph/ops/glsl/conv1d.yaml index 16c4172510c..2266649d2b9 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/conv1d.yaml +++ b/backends/vulkan/runtime/graph/ops/glsl/conv1d.yaml @@ -7,9 +7,8 @@ conv1d: parameter_names_with_default_values: OPERATOR: X - NDIM: 3 DTYPE: float - PACKING: C_packed + STORAGE: texture3d generate_variant_forall: DTYPE: - VALUE: half diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_prepack_weights.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_prepack_weights.glsl index 18202e4a51f..49ce76423d5 100644 --- 
a/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_prepack_weights.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_prepack_weights.glsl @@ -53,7 +53,7 @@ void main() { } // Map tensor_idx to normal buffer_i - const ivec4 p0 = get_texel_nchw_buffer_ixs(idx, sizes, packed_dim); + const ivec4 p0 = tidx_to_nchwi(idx, sizes, packed_dim); // Compute modified tensor_idx by inverting the CPU function const int N = original_sizes.w; diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_prepack_weights.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv2d_prepack_weights.glsl index 493a614ee81..4e8bff94947 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/conv2d_prepack_weights.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/conv2d_prepack_weights.glsl @@ -53,7 +53,7 @@ void main() { } // Map tensor_idx to normal buffer_i - const ivec4 p0 = get_texel_nchw_buffer_ixs(idx, sizes, packed_dim); + const ivec4 p0 = tidx_to_nchwi(idx, sizes, packed_dim); // Compute modified tensor_idx by inverting the CPU function const int N = original_sizes.w; diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv_transpose2d_prepack_weights.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv_transpose2d_prepack_weights.glsl index d2978ffe7e6..df8589e737f 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/conv_transpose2d_prepack_weights.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/conv_transpose2d_prepack_weights.glsl @@ -53,7 +53,7 @@ void main() { } // Map tensor_idx to normal buffer_i - const ivec4 p0 = get_texel_nchw_buffer_ixs(idx, sizes, packed_dim); + const ivec4 p0 = tidx_to_nchwi(idx, sizes, packed_dim); // Compute modified tensor_idx by inverting the CPU function const int N = original_sizes.w; diff --git a/backends/vulkan/runtime/graph/ops/glsl/copy_channel_offset.glsl b/backends/vulkan/runtime/graph/ops/glsl/copy_channel_offset.glsl index 361a182d6b0..f02049dc2a8 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/copy_channel_offset.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/copy_channel_offset.glsl @@ -20,20 +20,19 @@ ${layout_declare_tensor(0, "w", "t_out", DTYPE, STORAGE)} ${layout_declare_tensor(1, "r", "existing_out", DTYPE, STORAGE)} ${layout_declare_tensor(2, "r", "t_in", DTYPE, STORAGE)} -layout(set = 0, binding = 3) uniform PRECISION restrict CopyArgs { - ivec4 out_sizes; - ivec4 in_sizes; +${layout_declare_ubo(3, "ivec4", "out_sizes")} +${layout_declare_ubo(4, "ivec4", "out_axis_map")} +${layout_declare_ubo(5, "ivec4", "in_sizes")} +${layout_declare_ubo(6, "ivec4", "in_axis_map")} +layout(set = 0, binding = 7) uniform PRECISION restrict CopyArgs { + // Operates on (x, y, z) logical extents. + ivec3 range; // Analogus to range variable in copy. It defines the # of channel being // copied. int channel_range; - int src_channel_offset; - int dst_channel_offset; - int unused; - // Operates on (x, y, z) extents. - ivec3 range; - int unused1; ivec3 dst_offset; - int unused2; + int dst_channel_offset; + int src_channel_offset; }; layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; @@ -43,36 +42,36 @@ layout(constant_id = 3) const int packed_dim = C_DIM; void main() { // Note: Unlike other shaders, the range is often not equal to the destination // texture extent. 
- const ivec3 pos = ivec3(gl_GlobalInvocationID); - if (any(greaterThanEqual(pos, range))) { + const ivec3 lpos = ivec3(gl_GlobalInvocationID); + if (any(greaterThanEqual(lpos, range))) { return; } - const ivec3 out_pos = pos + dst_offset; + const ivec3 out_lpos = lpos + dst_offset; - const ivec4 out_whcn = to_tensor_idx(out_pos, out_sizes, packed_dim); + const ivec4 out_tidx = lpos_to_tidx(out_lpos, out_sizes, out_axis_map.w, packed_dim); // First read the existing values to make sure the boundary values stay. - VEC4_T v = VEC4_T(texelFetch(existing_out, out_pos, 0)); + VEC4_T v = load_texel_lpos(existing_out, out_lpos, out_axis_map); + ivec4 in_tidx = out_tidx; for (int i=0; i<4; i++) { - ivec4 in_whcn = out_whcn; - in_whcn.z = out_whcn.z - dst_channel_offset + i; + in_tidx[packed_dim] = out_tidx[packed_dim] - dst_channel_offset + i; // Handle the partial update for begining of channel in an existing tensor. // If the source channel index is below zero or exceeds the range, we skip // updating the element to avoid overwriting existing data. - if ((in_whcn.z < 0) || (in_whcn.z >= channel_range)) { + if ((in_tidx[packed_dim] < 0) || (in_tidx[packed_dim] >= channel_range)) { continue; } // Readjust for the source offset. - in_whcn.z = in_whcn.z + src_channel_offset; + in_tidx[packed_dim] += src_channel_offset; - ivec4 in_elem_pos = to_texture_elem_pos(in_whcn, in_sizes, packed_dim); - v[i] = VEC4_T(texelFetch(t_in, in_elem_pos.xyz, 0))[in_elem_pos.w]; + ivec4 in_posi = tidx_to_posi(in_tidx, in_sizes, in_axis_map, packed_dim); + v[i] = load_texel(t_in, in_posi.xyz)[in_posi.w]; } - imageStore(t_out, out_pos, v); + write_texel_lpos(t_out, out_lpos, v, out_axis_map); } diff --git a/backends/vulkan/runtime/graph/ops/glsl/copy_offset.glsl b/backends/vulkan/runtime/graph/ops/glsl/copy_offset.glsl index 50ddb92c349..d709578b1c9 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/copy_offset.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/copy_offset.glsl @@ -10,19 +10,16 @@ #define PRECISION ${PRECISION} +#include "indexing_utils.h" + layout(std430) buffer; -${layout_declare_tensor(0, "w", "t_out", DTYPE, STORAGE)} -${layout_declare_tensor(1, "r", "t_in", DTYPE, STORAGE)} +${layout_declare_tensor(B, "w", "t_out", DTYPE, STORAGE)} +${layout_declare_tensor(B, "r", "t_in", DTYPE, STORAGE)} -layout(set = 0, binding = 2) uniform PRECISION restrict CopyArgs { - ivec3 range; - int unused0; - ivec3 src_offset; - int unused1; - ivec3 dst_offset; - int unused2; -}; +${layout_declare_ubo(B, "ivec3", "range", "ivec3", "src_offset", "ivec3", "dst_offset")} +${layout_declare_ubo(B, "ivec4", "out_axis_map")} +${layout_declare_ubo(B, "ivec4", "in_axis_map")} layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; @@ -36,5 +33,9 @@ void main() { return; } - imageStore(t_out, out_pos, texelFetch(t_in, in_pos, 0)); + write_texel_lpos( + t_out, + out_pos, + load_texel_lpos(t_in, in_pos, in_axis_map), + out_axis_map); } diff --git a/backends/vulkan/runtime/graph/ops/glsl/embedding.glsl b/backends/vulkan/runtime/graph/ops/glsl/embedding.glsl index 3adffe99bdb..0a3eeee257f 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/embedding.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/embedding.glsl @@ -16,34 +16,36 @@ layout(std430) buffer; #include "indexing_utils.h" -${layout_declare_tensor(0, "w", "t_out", DTYPE, STORAGE)} -${layout_declare_tensor(1, "r", "t_in", "int", STORAGE)} -${layout_declare_tensor(2, "r", "t_weight", DTYPE, STORAGE)} -${layout_declare_ubo(3, "ivec4", "sizes")} 
+${layout_declare_tensor(B, "w", "t_out", DTYPE, STORAGE)} +${layout_declare_tensor(B, "r", "t_in", "int", STORAGE)} +${layout_declare_tensor(B, "r", "t_weight", DTYPE, STORAGE)} +${layout_declare_ubo(B, "ivec4", "sizes")} +${layout_declare_ubo(B, "ivec4", "out_axis_map")} +${layout_declare_ubo(B, "ivec4", "in_axis_map")} +${layout_declare_ubo(B, "ivec4", "weight_axis_map")} layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; layout(constant_id = 3) const int packed_dim = C_DIM; void main() { - const ivec3 out_pos = ivec3(gl_GlobalInvocationID); - - if (pos_out_of_bounds(out_pos, sizes, packed_dim)) { + const ivec3 out_lpos = ivec3(gl_GlobalInvocationID); + const ivec4 out_tidx = lpos_to_tidx(out_lpos, sizes, out_axis_map.w, packed_dim); + if (any(greaterThanEqual(out_tidx, sizes))) { return; } - - const ivec4 out_idx = to_tensor_idx(out_pos, sizes, packed_dim); VEC4_T out_texel; // Consider optimizing via W-packing format for t_in and t_weight. for (int i = 0; i < 4; ++i) { // Read input tensor for embedding index. - const ivec3 in_pos = ivec3(out_pos.y, out_idx.z * 4 + i, out_idx.w / 4); - const int in_texel_elem = texelFetch(t_in, in_pos, 0)[out_idx.w % 4]; + const ivec3 in_lpos = ivec3(out_tidx.y, out_tidx.z * 4 + i, out_tidx.w / 4); + const int in_texel_elem = load_texel_lpos(t_in, in_lpos, in_axis_map)[out_tidx.w % 4]; // Read weight tensor for embedding. - out_texel[i] = texelFetch(t_weight, ivec3(out_pos.x, in_texel_elem, 0), 0).x; + const ivec3 weight_lpos = ivec3(out_tidx.x, in_texel_elem, 0); + out_texel[i] = load_texel_lpos(t_weight, weight_lpos, weight_axis_map).x; } - imageStore(t_out, out_pos, out_texel); + write_texel_lpos(t_out, out_lpos, out_texel, out_axis_map); } diff --git a/backends/vulkan/runtime/graph/ops/glsl/flip.glsl b/backends/vulkan/runtime/graph/ops/glsl/flip.glsl new file mode 100644 index 00000000000..2291d1b6e4f --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/flip.glsl @@ -0,0 +1,78 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
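Setting aside the texel packing and axis-map bookkeeping, the rewritten embedding shader still computes a standard embedding gather. A plain reference of the logical operation (layout-free, illustrative only):

#include <cstdint>
#include <vector>

// out row i is the weight row selected by indices[i]:
// out[i][d] = weight[indices[i]][d]
std::vector<float> embedding_ref(
    const std::vector<int32_t>& indices,             // flattened index tensor
    const std::vector<std::vector<float>>& weight) { // [num_embeddings][dim]
  std::vector<float> out;
  out.reserve(indices.size() * weight.at(0).size());
  for (int32_t idx : indices) {
    const std::vector<float>& row = weight.at(static_cast<size_t>(idx));
    out.insert(out.end(), row.begin(), row.end());
  }
  return out;
}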
+ */ + +#version 450 core + +#define PRECISION ${PRECISION} + +#define VEC4_T ${texel_load_type(DTYPE, STORAGE)} + +#include "indexing_utils.h" + +layout(std430) buffer; + +${layout_declare_tensor(B, "w", "t_out", DTYPE, STORAGE)} +${layout_declare_tensor(B, "r", "t_in", DTYPE, STORAGE)} +${layout_declare_ubo(B, "ivec3", "out_limits")} +${layout_declare_ubo(B, "ivec4", "out_sizes")} +${layout_declare_ubo(B, "ivec4", "dims")} + +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; + +void main() { + const ivec3 pos = ivec3(gl_GlobalInvocationID); + + if (any(greaterThanEqual(pos, out_limits))) { + return; + } + + VEC4_T out_texel = VEC4_T(0); + uint src_x = pos.x; + uint src_y = pos.y; + uint src_z = pos.z; + + int flattened_channels = int(ceil(out_sizes.z / 4.0)); + + // Width + if (dims.x == 1) { + src_x = out_sizes.x - 1 - pos.x; + } + // Height + if (dims.y == 1) { + src_y = out_sizes.y - 1 - pos.y; + } + // Batch + if (dims.w == 1) { + uint n = pos.z / flattened_channels; + uint src_n = out_sizes.w - 1 - n; + uint c4 = pos.z - n * flattened_channels; + src_z = src_n * flattened_channels + c4; + } + + uint prev_src_z = src_z; + for (int p = 0; p < 4; ++p) { + uint src_p = p; + + // Channel + if (dims.z == 1) { + uint nc = (pos.z / flattened_channels) * flattened_channels; + uint c4 = pos.z - nc; + uint c = c4 * 4 + p; + uint src_c = out_sizes.z - 1 - c; + + src_z = (dims.w == 1) + ? prev_src_z - c4 + src_c / 4 // Batch and Channel + : nc + src_c / 4; // Channel only + src_p = src_c % 4; + } + + VEC4_T in_texel = VEC4_T(texelFetch(t_in, ivec3(src_x, src_y, src_z), 0)); + out_texel[p] = in_texel[src_p]; + } + imageStore(t_out, pos, out_texel); +} diff --git a/backends/vulkan/runtime/graph/ops/glsl/flip.yaml b/backends/vulkan/runtime/graph/ops/glsl/flip.yaml new file mode 100644 index 00000000000..646fd05e420 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/flip.yaml @@ -0,0 +1,13 @@ +flip: + parameter_names_with_default_values: + DTYPE: float + STORAGE: texture3d + generate_variant_forall: + DTYPE: + - VALUE: half + - VALUE: float + - VALUE: int + - VALUE: int8 + - VALUE: uint8 + shader_variants: + - NAME: flip diff --git a/backends/vulkan/runtime/graph/ops/glsl/image_to_nchw.glsl b/backends/vulkan/runtime/graph/ops/glsl/image_to_nchw.glsl index 8f113bd2cc2..be3901799f8 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/image_to_nchw.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/image_to_nchw.glsl @@ -24,14 +24,14 @@ layout(std430) buffer; ${layout_declare_buffer(B, "w", "nchw_out", DTYPE)} ${layout_declare_tensor(B, "r", "t_in", DTYPE, STORAGE)} ${layout_declare_ubo(B, "ivec4", "sizes")} -${layout_declare_ubo(B, "ivec4", "axis_mapping")} +${layout_declare_ubo(B, "ivec4", "axis_map")} layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; layout(constant_id = 3) const int packed_dim = C_DIM; void write_out_texel(VEC4_T texel, ivec4 tensor_idx) { - const ivec4 buf_indices = get_texel_nchw_buffer_ixs( + const ivec4 buf_indices = tidx_to_nchwi( tensor_idx, sizes, packed_dim); @@ -51,13 +51,13 @@ void write_out_texel(VEC4_T texel, ivec4 tensor_idx) { } void main() { - const ivec3 pos = ivec3(gl_GlobalInvocationID); - const ivec4 tensor_idx = to_tensor_idx(pos, sizes, axis_mapping, packed_dim); + const ivec3 lpos = ivec3(gl_GlobalInvocationID); + const ivec4 tidx = lpos_to_tidx(lpos, sizes, axis_map.w, packed_dim); - if (any(greaterThanEqual(tensor_idx, sizes))) { + if (any(greaterThanEqual(tidx, sizes))) { return; } - const VEC4_T intex = 
load_texel(t_in, pos); - write_out_texel(intex, tensor_idx); + const VEC4_T intex = load_texel(t_in, lpos_to_pos(lpos, axis_map)); + write_out_texel(intex, tidx); } diff --git a/backends/vulkan/runtime/graph/ops/glsl/index_select_channel.glsl b/backends/vulkan/runtime/graph/ops/glsl/index_select_channel.glsl index ba60000f3d4..76ec540838c 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/index_select_channel.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/index_select_channel.glsl @@ -34,18 +34,18 @@ void main() { } const ivec4 idx = to_tensor_idx(out_pos, out_sizes, packed_dim); - const ivec4 buffer_ixs = get_texel_nchw_buffer_ixs(idx, out_sizes, packed_dim); + const ivec4 buffer_ixs = tidx_to_nchwi(idx, out_sizes, packed_dim); VEC4_T out_texel; for (int i = 0; i < 4; ++i) { - const ivec4 out_idx = from_nchw_buffer_i(buffer_ixs[i], out_sizes); - int out_channel = out_idx.z; + const ivec4 out_tidx = nchwi_to_tidx(buffer_ixs[i], out_sizes); + int out_channel = out_tidx.z; int in_channel = texelFetch(t_idx, ivec3(out_channel, 0, 0), 0).x; - ivec4 in_idx = out_idx; - in_idx.z = in_channel; + ivec4 in_tidx = out_tidx; + in_tidx.z = in_channel; - ivec4 in_elem_pos = to_texture_elem_pos(in_idx, in_sizes, packed_dim); + ivec4 in_elem_pos = to_texture_elem_pos(in_tidx, in_sizes, packed_dim); VEC4_T in_texel = texelFetch(t_in, in_elem_pos.xyz, 0); diff --git a/backends/vulkan/runtime/graph/ops/glsl/indexing_utils.h b/backends/vulkan/runtime/graph/ops/glsl/indexing_utils.h index 9dc06bd8552..da4374a2b9f 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/indexing_utils.h +++ b/backends/vulkan/runtime/graph/ops/glsl/indexing_utils.h @@ -9,32 +9,44 @@ #ifndef INDEXING_UTILS_H #define INDEXING_UTILS_H -// Width Dim Index, assuming (W, H, C, N) order +/* + * The functions defined in this header file use the following shorthand to + * represent tensor related data structures. + * + * tidx - ivec4 tensor indices, listed in WHCN order. + * + * pos - ivec3 texel position, used to fetch from an image texture via the + * texelFetch(image, pos, lod) GLSL function. + * posi - ivec4 texel element position. It is the same as pos, except with an + * additional component of the index of an element within the texel. + * lpos - ivec3 logical position, listed in WHC order. This is a permutation of + * texture position based on a tensor's axis_map. lpos.x is the position + * component that corresponds to the tensor's width dimension, lpos.y is + * the position component that corresponds to the tensor's height dim, + * and so on. + * + * bufi - int index into a GPU buffer that backs a tensor. + * nchwi - int index into a staging buffer for a tensor. The data in the + * staging buffer is stored in contiguous data layout, irrespective of + * the tensor's strides. + */ + +// Width Dim Index, assuming WHCN order #define W_DIM 0 -// Height, assuming (W, H, C, N) order +// Height, assuming WHCN order #define H_DIM 1 -// Channels, assuming (W, H, C, N) order +// Channels, assuming WHCN order #define C_DIM 2 /* - * Describes which texture axis the "batches" dimension runs along in a 4D - * texture. - * - * Currently it is set to 2 since we represent batches by concatenating along - * the channels dim, which has index 2 in (W, H, C, N) order and maps to the - * depth dimension of a texture, which also corresponds to index 2 in (x, y, z) - * order. 
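Several of the shaders above resolve a logical position into a physical texture position through the axis map (load_texel_lpos / lpos_to_pos). A host-side sketch of that permutation, mirroring the glossary above with illustrative standalone types:

#include <array>

// pos[axis_map[i]] = lpos[i]: the logical W/H/C position components are
// scattered onto the texture axes named by the axis map.
std::array<int, 3> lpos_to_pos_sketch(
    const std::array<int, 3>& lpos,
    const std::array<int, 4>& axis_map) {
  std::array<int, 3> pos{0, 0, 0};
  for (int i = 0; i < 3; ++i) {
    pos[axis_map[i]] = lpos[i];
  }
  return pos;
}

With an identity axis map (e.g. {0, 1, 2, 2}), lpos and pos coincide; a transposed tensor simply carries a permuted axis map, so the same shader code addresses the unchanged underlying texture correctly.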
+ * Fast division by 4 using bit shifting */ -#define BATCH_AXIS 2 - -// -// Basic Indexing Utility Macros and Functions -// +#define div4(x) (x >> 2) /* * Divides input and rounds up to 4 */ -#define divup4(x) ((x + 3) / 4) +#define divup4(x) ((x + 3) >> 2) /* * Aligns input to the next multiple of 4 @@ -42,8 +54,13 @@ #define alignup4(x) ((x + 3) & -4) /* - * Input: (W, H, C, N) strides of a tensor - * Returns: the WHCN index of the fastest moving dimension + * Fast modulo by 4 using bit masking + */ +#define mod4(x) (x & 3) + +/* + * Find the packed dimension of a tensor given its strides. The packed dimension + * is the "fastest moving" dimension which will have a stride of 1. */ int find_packed_dim(const ivec4 strides) { int packed_dim = 0; @@ -56,83 +73,179 @@ int find_packed_dim(const ivec4 strides) { return packed_dim; } -// -// (w, h, c, n) Tensor Index <-> Contiguous Buffer Index Conversion -// - /* - * Input: (w, h, c, n) tensor index, (W, H, C, N) sizes of a tensor, which dim - * is packed along a texel - * Output: A ivec4 containing the buffer indices corresponding to each texel - * element. + * Get the staging buffer indices that contain the data of the texel that + * corresponds to the provided tensor index. Since the texel have 4 elements, + * 4 buffer indices will be retrieved. */ -ivec4 get_texel_nchw_buffer_ixs(ivec4 idx, ivec4 sizes, int packed_dim) { +ivec4 tidx_to_nchwi(const ivec4 tidx, const ivec4 sizes, const int packed_dim) { ivec4 strides = ivec4(1, sizes.x, sizes.x * sizes.y, sizes.x * sizes.y * sizes.z); - int base_i = idx.x * strides.x + idx.y * strides.y + idx.z * strides.z + - idx.w * strides.w; + int base_i = tidx.x * strides.x + tidx.y * strides.y + tidx.z * strides.z + + tidx.w * strides.w; return base_i + ivec4(0, 1, 2, 3) * strides[packed_dim]; } -/* - * Input: Index into a tensor's data buffer, (W, H, C, N) sizes of a tensor - * Returns: The WCHN index of the tensor that corresponds to the specified - * buffer index, assuming the buffer has contiguous memory layout - */ -ivec4 from_nchw_buffer_i(int buf_i, ivec4 sizes) { +ivec4 nchwi_to_tidx(const int nchwi, const ivec4 sizes) { return ivec4( - buf_i % sizes.x, - (buf_i / (sizes.x)) % sizes.y, - (buf_i / (sizes.x * sizes.y)) % sizes.z, - (buf_i / (sizes.x * sizes.y * sizes.z))); + nchwi % sizes.x, + (nchwi / (sizes.x)) % sizes.y, + (nchwi / (sizes.x * sizes.y)) % sizes.z, + (nchwi / (sizes.x * sizes.y * sizes.z))); } -int to_nchw_buffer_i(const ivec4 tensor_idx, const ivec4 sizes) { - return tensor_idx.w * sizes.x * sizes.y * sizes.z + - tensor_idx.z * sizes.x * sizes.y + tensor_idx.y * sizes.x + tensor_idx.x; +int tidx_to_nchwi(const ivec4 tidx, const ivec4 sizes) { + return tidx.w * sizes.x * sizes.y * sizes.z + tidx.z * sizes.x * sizes.y + + tidx.y * sizes.x + tidx.x; } -/* - * Input: Texel buffer index, (W, H, C, N) strides of a tensor, which dim is - * packed along a texel - * Returns: The (w, h, c, n) tensor index corresponding to the buffer element - */ -ivec4 to_tensor_idx(int buffer_id, const ivec4 strides, const int packed_dim) { +// TODO(ssjia): make this function use dim order so that it can work with any +// dim order. Currently it assumes that the dim order is contiguous, except for +// the packed dim. 
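// Illustrative sketch (hypothetical helper, not referenced by any shader): a
// quick sanity check of the tidx_to_nchwi / nchwi_to_tidx helpers above,
// assuming a contiguous tensor with sizes (W=3, H=2, C=5, N=1), i.e. WHCN
// strides of (1, 3, 6, 30).
bool example_nchwi_tidx_roundtrip() {
  const ivec4 sizes = ivec4(3, 2, 5, 1);
  const ivec4 tidx = ivec4(1, 1, 2, 0);
  // 1*1 + 1*3 + 2*6 + 0*30 = 16
  const int nchwi = tidx_to_nchwi(tidx, sizes);
  return nchwi == 16 && nchwi_to_tidx(nchwi, sizes) == tidx;
}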
+ivec4 bufi_to_tidx(int bufi, const ivec4 strides, const int packed_dim) { ivec4 idx; for (int i = 3; i >= 0; i--) { if (i != packed_dim) { - idx[i] = buffer_id / strides[i]; - buffer_id %= strides[i]; + idx[i] = bufi / strides[i]; + bufi %= strides[i]; } } - idx[packed_dim] = buffer_id; + idx[packed_dim] = bufi; return idx; } -/* - * Input: Texel buffer index, (W, H, C, N) strides of a tensor - * Returns: The (w, h, c, n) tensor index corresponding to the buffer element - * - * This is a convenience overload of the above function. If the packed dim is - * not known, it can be found by finding the first dimension with a stride of 1. - * However, this process adds some overhead, so if performance is a concern then - * the above function should be used instead so that the packed dim is provided. - */ -ivec4 to_tensor_idx(int buffer_id, const ivec4 strides) { +// Convenience overload of the above function, which will determine the packed +// dim from the strides automatically so it doesn't have to be passed in as a +// function argument. +ivec4 bufi_to_tidx(const int bufi, const ivec4 strides) { int packed_dim = find_packed_dim(strides); - return to_tensor_idx(buffer_id, strides, packed_dim); + return bufi_to_tidx(bufi, strides, packed_dim); +} + +int tidx_to_bufi(const ivec4 tidx, ivec4 strides) { + return tidx.x * strides.x + tidx.y * strides.y + tidx.z * strides.z + + tidx.w * strides.w; +} + +ivec4 lpos_to_tidx( + ivec3 lpos, + ivec4 sizes, + const int batch_inner_dim, + const int packed_dim) { + // Align packed dim to next multiple of 4 to account for texel padding + sizes[packed_dim] = alignup4(sizes[packed_dim]); + // Moving 1 texel along the packed dim traverses 4 tensor elements + lpos[packed_dim] *= 4; + + ivec4 tidx = ivec4(lpos, 0); + + if (sizes.w > 1) { + tidx.w = tidx[batch_inner_dim] / sizes[batch_inner_dim]; + tidx[batch_inner_dim] %= sizes[batch_inner_dim]; + } + return tidx; } +ivec3 tidx_to_lpos( + ivec4 tidx, + ivec4 sizes, + const int batch_inner_dim, + const int packed_dim) { + // Align packed dim to next multiple of 4 to account for texel padding + sizes[packed_dim] = alignup4(sizes[packed_dim]); + + ivec3 lpos = tidx.xyz; + + // Adjust batch inner dim by batch index if needed + if (sizes.w > 1) { + lpos[batch_inner_dim] += tidx.w * sizes[batch_inner_dim]; + } + // Fast division by 4, since moving 1 texel along the packed dim traverses 4 + // tensor elements. + lpos[packed_dim] >>= 2; + return lpos; +} + +ivec3 tidx_to_pos( + ivec4 tidx, + ivec4 sizes, + const ivec4 axis_map, + const int packed_dim) { + // Align packed dim to next multiple of 4 to account for texel padding + sizes[packed_dim] = alignup4(sizes[packed_dim]); + + ivec3 pos; + for (int dim = 0; dim < 3; ++dim) { + pos[axis_map[dim]] = tidx[dim]; + } + + // Adjust batch inner dim by batch index if needed + if (sizes.w > 1) { + pos[axis_map[axis_map.w]] += tidx.w * sizes[axis_map.w]; + } + // Fast division by 4, since moving 1 texel along the packed dim traverses 4 + // tensor elements. 
+ pos[axis_map[packed_dim]] >>= 2; + return pos; +} + +ivec4 tidx_to_posi( + ivec4 tidx, + ivec4 sizes, + const ivec4 axis_map, + const int packed_dim) { + return ivec4( + tidx_to_pos(tidx, sizes, axis_map, packed_dim), tidx[packed_dim] % 4); +} + +ivec3 lpos_to_pos(const ivec3 lpos, const ivec4 axis_map) { + ivec3 pos; + pos[axis_map.x] = lpos.x; + pos[axis_map.y] = lpos.y; + pos[axis_map.z] = lpos.z; + return pos; +} + +#ifdef USING_BUFFER +#define load_texel(buf, idx) buf[idx] +#elif defined(USING_TEXTURE2D) +#define load_texel(im, pos) texelFetch(im, pos.xy, 0) +#else // defined(USING_TEXTURE3D) +#define load_texel(im, pos) texelFetch(im, pos, 0) +#define load_texel_lpos(im, lpos, axis_map) \ + texelFetch(im, lpos_to_pos(lpos, axis_map), 0) +#endif + +#ifdef USING_BUFFER +#define write_texel(buf, idx, texel) buf[idx] = texel +#elif defined(USING_TEXTURE2D) +#define write_texel(im, pos, texel) imageStore(im, pos.xy, texel) +#else // defined(USING_TEXTURE3D) +#define write_texel(im, pos, texel) imageStore(im, pos, texel) +#define write_texel_lpos(im, lpos, texel, axis_map) \ + imageStore(im, lpos_to_pos(lpos, axis_map), texel) +#endif + +/************************ + * Deprecated Functions * + ************************/ + +// The below functions and macros are in the process of being deprecated in +// favor of newer indexing functions that account for axis mapping and have more +// explicit function names and more updated terminology. + /* - * Input: (w, h, c, n) tensor index, (W, H, C, N) strides of the tensor buffer - * Returns: the buffer index corresponding to the specified tensor index + * Describes which texture axis the "batches" dimension runs along in a 4D + * texture. + * + * Currently it is set to 2 since we represent batches by concatenating along + * the channels dim, which has index 2 in (W, H, C, N) order and maps to the + * depth dimension of a texture, which also corresponds to index 2 in (x, y, z) + * order. */ -int to_buffer_id(const ivec4 tensor_idx, ivec4 strides) { - return tensor_idx.x * strides.x + tensor_idx.y * strides.y + - tensor_idx.z * strides.z + tensor_idx.w * strides.w; -} +#define BATCH_AXIS 2 // // (w, h, c, n) Tensor Index <-> (x, y, z) Texture Position Conversion @@ -183,42 +296,6 @@ ivec4 to_tensor_idx(ivec3 pos, ivec4 sizes, int packed_dim) { return tensor_idx; } -/* - * Derive (w,h,c,n) tensor indices from (x,y,z) texture position using axis - * mapping. - */ -ivec4 to_tensor_idx( - ivec3 pos, - ivec4 sizes, - const ivec4 axis_mapping, - const int packed_dim) { - // Align packed dim to next multiple of 4 to account for texel padding - sizes[packed_dim] = alignup4(sizes[packed_dim]); - - // Packed dim contains 4 elements per texel, so moving 1 unit traverses 4 - // elements in the tensor. - pos[axis_mapping[packed_dim]] *= 4; - - ivec4 tensor_idx; - for (int dim = 0; dim < 3; ++dim) { - tensor_idx[dim] = pos[axis_mapping[dim]]; - } - - // Early return if batch is 1. Batch index will be 0. - if (sizes.w == 1) { - tensor_idx.w = 0; - return tensor_idx; - } - - // Else, adjust the dim that's concatenated with batch. Note that the axis - // mapping for the batch dim indicates WHCN dim index of the dim that it is - // concatenated with, not a texture axis. 
- tensor_idx.w = tensor_idx[axis_mapping[3]] / sizes[axis_mapping[3]]; - tensor_idx[axis_mapping[3]] %= sizes[axis_mapping[3]]; - - return tensor_idx; -} - /* * Input: (w, h, c, n) tensor index, (W, H, C, N) sizes of a tensor, which dim * is packed along a texel @@ -235,34 +312,6 @@ ivec3 to_texture_pos(ivec4 idx, ivec4 sizes, int packed_dim) { return pos; } -/* - * Derive (x,y,z) texture position from (w,h,c,n) tensor indices using axis - * mapping. - */ -ivec3 to_texture_pos( - const ivec4 idx, - ivec4 sizes, - const ivec4 axis_mapping, - const int packed_dim) { - // Align packed dim to next multiple of 4 to account for texel padding - sizes[packed_dim] = alignup4(sizes[packed_dim]); - - ivec3 pos; - for (int dim = 0; dim < 3; ++dim) { - pos[axis_mapping[dim]] = idx[dim]; - } - - // Adjust batch dim if needed - if (sizes.w > 1) { - pos[axis_mapping[axis_mapping[3]]] += idx.w * sizes.w; - } - - // Adjust packed dim. Moving 1 texel unit along the packed dim traverses 4 - // tensor elements in that dim. - pos[axis_mapping[packed_dim]] /= 4; - return pos; -} - /* * Input: (w, h, c, n) tensor index, (W, H, C, N) sizes of the tensor, which dim * is packed along a texel @@ -282,55 +331,6 @@ ivec4 to_texture_elem_pos(ivec4 idx, ivec4 sizes, int packed_dim) { return pos; } -/* - * Derive (x,y,z,i) texel element position from the (w,h,c,n) tensor index using - * the axis mapping. - */ -ivec4 to_texture_elem_pos( - const ivec4 idx, - ivec4 sizes, - const ivec4 axis_mapping, - const int packed_dim) { - // Align packed dim to next multiple of 4 to account for texel padding - sizes[packed_dim] = alignup4(sizes[packed_dim]); - - ivec4 pos; - for (int dim = 0; dim < 3; ++dim) { - pos[axis_mapping[dim]] = idx[dim]; - } - - // Adjust batch dim if needed - if (sizes.w > 1) { - pos[axis_mapping[axis_mapping[3]]] += idx.w * sizes.w; - } - - // Adjust packed dim. Moving 1 texel unit along the packed dim traverses 4 - // tensor elements in that dim. 
- pos[axis_mapping[packed_dim]] /= 4; - pos.w = idx[packed_dim] % 4; - return pos; -} - -// -// Texel Access and Storage -// - -#ifdef USING_BUFFER -#define load_texel(buf, idx) buf[idx] -#elif defined(USING_TEXTURE2D) -#define load_texel(im, pos) texelFetch(im, pos.xy, 0) -#else // defined(USING_TEXTURE3D) -#define load_texel(im, pos) texelFetch(im, pos, 0) -#endif - -#ifdef USING_BUFFER -#define write_texel(buf, idx, texel) buf[idx] = texel -#elif defined(USING_TEXTURE2D) -#define write_texel(im, pos, texel) imageStore(im, pos.xy, texel) -#else // defined(USING_TEXTURE3D) -#define write_texel(im, pos, texel) imageStore(im, pos, texel) -#endif - // // Miscellaneous Utility Functions and Macros // diff --git a/backends/vulkan/runtime/graph/ops/glsl/int8_image_to_nchw_noint8.glsl b/backends/vulkan/runtime/graph/ops/glsl/int8_image_to_nchw_noint8.glsl index 3ef984bfc95..f7133dd0452 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/int8_image_to_nchw_noint8.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/int8_image_to_nchw_noint8.glsl @@ -19,7 +19,7 @@ layout(std430) buffer; ${layout_declare_buffer(B, "w", "nchw_out", "int")} ${layout_declare_tensor(B, "r", "t_in", "int8", "texture3d")} ${layout_declare_ubo(B, "ivec4", "tensor_sizes")} -${layout_declare_ubo(B, "ivec4", "axis_mapping")} +${layout_declare_ubo(B, "ivec4", "axis_map")} ${layout_declare_ubo(B, "int", "out_numel")} layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; @@ -41,9 +41,9 @@ void main() { int in_buf_idx = 4 * out_buf_idx; [[unroll]] for (int i = 0; i < 4; ++i) { - const ivec4 tensor_idx = from_nchw_buffer_i(in_buf_idx, tensor_sizes); + const ivec4 tidx = nchwi_to_tidx(in_buf_idx, tensor_sizes); const ivec4 texture_pos = to_texture_elem_pos( - tensor_idx, tensor_sizes, packed_dim); + tidx, tensor_sizes, packed_dim); values[i] = load_texel(t_in, texture_pos.xyz)[texture_pos.w]; in_buf_idx++; } diff --git a/backends/vulkan/runtime/graph/ops/glsl/kv_cache_update.glsl b/backends/vulkan/runtime/graph/ops/glsl/kv_cache_update.glsl new file mode 100644 index 00000000000..8028362c3e5 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/kv_cache_update.glsl @@ -0,0 +1,80 @@ +#version 450 core + +#define PRECISION ${PRECISION} + +#define T ${buffer_scalar_type(DTYPE)} + +${define_active_storage_type(STORAGE)} +${define_required_extensions(DTYPE)} + +layout(std430) buffer; + +#include "indexing_utils.h" + +${layout_declare_tensor(B, "w", "cache", DTYPE, STORAGE)} +${layout_declare_tensor(B, "r", "projected", DTYPE, STORAGE)} +$if STORAGE == "buffer": + ${layout_declare_ubo(B, "int", "projected_numel")} + ${layout_declare_ubo(B, "ivec4", "cache_strides")} + ${layout_declare_ubo(B, "int", "input_pos")} +$else: + ${layout_declare_ubo(B, "ivec3", "projected_limits")} + ${layout_declare_ubo(B, "int", "input_pos")} + + +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; + +/* + * t_cache will have sizes of (max_batch_size, max_seq_len, n_heads, head_dim). + * t_projected will have sizes of (batch_size, seq_len, n_heads, head_dim). + * + * The cache update inserts the values of t_projected into t_cache at the index + * specified by input_pos at the seq_len dimension. It is equivalent to calling + + * t_cache = t_cache.slice_scatter( + * t_projected, dim=1, start=input_pos, end=input_pos+seq_len) + * + * Note that this shader is implemented assuming that max_batch_size is 1. 
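 *
 * As a concrete illustration (hypothetical sizes): with a cache of size
 * (1, 128, 8, 64), a projection of size (1, 2, 8, 64), and input_pos = 10,
 * the update overwrites rows 10 and 11 along the cache's seq_len dimension,
 * i.e. the equivalent of t_cache[:, 10:12, :, :] = t_projected.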
+ */ + +#ifdef USING_BUFFER + +/*************************** + ** Buffer Implementation ** + ***************************/ + +void main() { + int projected_bufi = int(gl_GlobalInvocationID.x); + // Bump cache index forward by input_pos elements along the seq_len dimension. + // cache_strides contains the strides of the cache tensor. + int cache_bufi = input_pos * cache_strides.z + projected_bufi; + if (projected_bufi >= projected_numel) { + return; + } + cache[cache_bufi] = projected[projected_bufi]; +} + +#else + +/**************************** + ** Texture Implementation ** + ****************************/ + +// Note that this shader assumes the that tensors are width packed, i.e. +// packed_dim = 0 +void main() { + const ivec3 projected_pos = ivec3(gl_GlobalInvocationID); + + if (any(greaterThanEqual(projected_pos, projected_limits))) { + return; + } + + const ivec3 cache_pos = ivec3( + projected_pos.x, + projected_pos.y, + projected_pos.z + input_pos); + + write_texel(cache, cache_pos, load_texel(projected, projected_pos)); +} + +#endif // USING_BUFFER diff --git a/backends/vulkan/runtime/graph/ops/glsl/softmax_batch_height_width.yaml b/backends/vulkan/runtime/graph/ops/glsl/kv_cache_update.yaml similarity index 63% rename from backends/vulkan/runtime/graph/ops/glsl/softmax_batch_height_width.yaml rename to backends/vulkan/runtime/graph/ops/glsl/kv_cache_update.yaml index eba512ca484..e2a96234465 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/softmax_batch_height_width.yaml +++ b/backends/vulkan/runtime/graph/ops/glsl/kv_cache_update.yaml @@ -4,18 +4,16 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. -softmax_batch_height_width: +kv_cache_update: parameter_names_with_default_values: - OPERATOR1: exp(X) - OPERATOR2: X / Y - NDIM: 3 DTYPE: float + STORAGE: buffer generate_variant_forall: + STORAGE: + - VALUE: buffer + - VALUE: texture3d DTYPE: - VALUE: half - VALUE: float shader_variants: - - NAME: softmax_batch_height_width - - NAME: log_softmax_batch_height_width - OPERATOR1: X - OPERATOR2: X - log(Y) + - NAME: kv_cache_update diff --git a/backends/vulkan/runtime/graph/ops/glsl/matmul.h b/backends/vulkan/runtime/graph/ops/glsl/matmul.h deleted file mode 100644 index 620f1fd0e6e..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/matmul.h +++ /dev/null @@ -1,283 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -// To convince the SPIR-V compiler to unroll the loops optimally, need this -// macro -#define FOUR 4 - -#ifdef TILE_ROW_2 -#define TILE_ROWS 2 -#else -#define TILE_ROWS 4 -#endif - -// we avoid mat4 and vec4 usage here as they compile to much less efficient -// SPIR-V -struct FloatMatrix_2d { - float data[TILE_ROWS][FOUR]; -}; - -struct FloatMatrix_3d { - float data[TILE_ROWS][FOUR][FOUR]; -}; - -#ifdef MAT2_IS_TRANSPOSED -vec4 matmul_naive_W_packed_W_packed( -#else -vec4 matmul_naive_W_packed_H_packed( -#endif - const sampler3D im_mat1, - const sampler3D im_mat2, - const ivec3 out_pos, - const int width) { - ivec3 mat1_pos = ivec3(0, out_pos.y, out_pos.z); -#ifdef MAT2_IS_TRANSPOSED - ivec3 mat2_pos = ivec3(0, out_pos.x * 4, 0); -#else - ivec3 mat2_pos = ivec3(out_pos.x * 4, 0, out_pos.z); -#endif - - vec4 texel = vec4(0); - const int K = (width + 3) / 4; - - for (int i = 0; i < K; ++i) { - const vec4 mat1_tex = texelFetch(im_mat1, mat1_pos, 0); -#ifdef MAT2_IS_TRANSPOSED - const vec4 sums = vec4( - dot(mat1_tex, texelFetch(im_mat2, mat2_pos, 0)), - dot(mat1_tex, texelFetch(im_mat2, mat2_pos + ivec3(0, 1, 0), 0)), - dot(mat1_tex, texelFetch(im_mat2, mat2_pos + ivec3(0, 2, 0), 0)), - dot(mat1_tex, texelFetch(im_mat2, mat2_pos + ivec3(0, 3, 0), 0))); -#else - const vec4 sums = vec4( - dot(mat1_tex, texelFetch(im_mat2, mat2_pos, 0)), - dot(mat1_tex, texelFetch(im_mat2, mat2_pos + ivec3(1, 0, 0), 0)), - dot(mat1_tex, texelFetch(im_mat2, mat2_pos + ivec3(2, 0, 0), 0)), - dot(mat1_tex, texelFetch(im_mat2, mat2_pos + ivec3(3, 0, 0), 0))); -#endif - - texel += sums; - - mat1_pos.x++; -#ifdef MAT2_IS_TRANSPOSED - mat2_pos.x++; -#else - mat2_pos.y++; -#endif - } - - return texel; -} - -#ifdef MAT2_IS_TRANSPOSED -vec4 matmul_naive_W_packed_H_packed( -#else -vec4 matmul_naive_W_packed_W_packed( -#endif - const sampler3D im_mat1, - const sampler3D im_mat2, - const ivec3 out_pos, - const int width) { - ivec3 mat1_pos = ivec3(0, out_pos.y, out_pos.z); - ivec3 mat2_pos = ivec3(out_pos.x, 0, out_pos.z); - - vec4 texel = vec4(0); - int K = divup4(width); - - for (int i = 0; i < K; ++i) { - vec4 mat1_tex = texelFetch(im_mat1, mat1_pos, 0); - texel = fma(mat1_tex.xxxx, texelFetch(im_mat2, mat2_pos, 0), texel); - mat2_pos.y++; - texel = fma(mat1_tex.yyyy, texelFetch(im_mat2, mat2_pos, 0), texel); - mat2_pos.y++; - texel = fma(mat1_tex.zzzz, texelFetch(im_mat2, mat2_pos, 0), texel); - mat2_pos.y++; - texel = fma(mat1_tex.wwww, texelFetch(im_mat2, mat2_pos, 0), texel); - mat2_pos.y++; - - mat1_pos.x++; - } - - return texel; -} - -// get texel from self tensor (width_packed) in addmm -vec4 get_texel_W_packed( - sampler3D im_self, - const ivec3 pos, - const bool broadcast_at_width, - const bool broadcast_at_height) { - vec4 self_texel; - // self is of shape {1} - if (broadcast_at_width && broadcast_at_height) { - self_texel = texelFetch(im_self, ivec3(0, 0, 0), 0).xxxx; - } - // self is of shape {*, 1} - else if (broadcast_at_width) { - self_texel = texelFetch(im_self, ivec3(0, pos.y, 0), 0).xxxx; - } - // self is of shape {1, *} - else if (broadcast_at_height) { - self_texel = texelFetch(im_self, ivec3(pos.x, 0, 0), 0); - } else { - self_texel = texelFetch(im_self, ivec3(pos.x, pos.y, 0), 0); - } - - return self_texel; -} - -// get texel from self tensor (channel_packed) in addmm -vec4 get_texel_C_packed( - sampler3D im_self, - const ivec3 pos, - const bool broadcast_at_width, - const bool broadcast_at_height) { - vec4 self_texel; - // self is of shape {1} - if (broadcast_at_width && 
broadcast_at_height) { - self_texel = texelFetch(im_self, ivec3(0, 0, 0), 0); - } - // self is of shape {*, 1} - else if (broadcast_at_width) { - self_texel = texelFetch(im_self, ivec3(0, pos.y, 0), 0); - } - // self is of shape {1, *} - else if (broadcast_at_height) { - self_texel = texelFetch(im_self, ivec3(pos.x, 0, 0), 0); - } else { - self_texel = texelFetch(im_self, ivec3(pos.x, pos.y, 0), 0); - } - - return self_texel; -} - -FloatMatrix_2d matmul_partial_2d( - sampler3D im_mat1, - sampler3D im_mat2, - const ivec3 pos, - const int batch_size, - const int K_texel_len) { - FloatMatrix_2d results; - for (int i = 0; i < TILE_ROWS; i++) { - for (int j = 0; j < FOUR; j++) { - results.data[i][j] = 0.0f; - } - } - vec4 im_mat1_partial_load[TILE_ROWS]; - vec4 im_mat2_partial_load[FOUR]; - - for (int mat1_x = 0; mat1_x < K_texel_len; mat1_x++) { - for (int offset = 0; offset < TILE_ROWS; offset++) { - // read and cache 2x4 (or 4x4) tile of im_mat1 - const int mat1_y = (TILE_ROWS * pos.y) + offset; - const ivec3 mat1_pos = ivec3(mat1_x, mat1_y, 0); - im_mat1_partial_load[offset] = texelFetch(im_mat1, mat1_pos, 0); - // read and cache 4x4 tile of im_mat2 -#ifdef MAT2_IS_TRANSPOSED - const int mat2_y = (FOUR * pos.x) + offset; - const ivec3 mat2_pos = ivec3(mat1_x, mat2_y, 0); - im_mat2_partial_load[offset] = texelFetch(im_mat2, mat2_pos, 0); -#else - const int mat2_x = (FOUR * pos.x) + offset; - const ivec3 mat2_pos = ivec3(mat2_x, mat1_x, 0); - im_mat2_partial_load[offset] = texelFetch(im_mat2, mat2_pos, 0); -#endif - } - -#ifdef TILE_ROW_2 -// column 3 and 4 of im_mat2 -#ifdef MAT2_IS_TRANSPOSED - im_mat2_partial_load[2] = - texelFetch(im_mat2, ivec3(mat1_x, (FOUR * pos.x) + 2, 0), 0); - im_mat2_partial_load[3] = - texelFetch(im_mat2, ivec3(mat1_x, (FOUR * pos.x) + 3, 0), 0); -#else - im_mat2_partial_load[2] = - texelFetch(im_mat2, ivec3((FOUR * pos.x) + 2, mat1_x, 0), 0); - im_mat2_partial_load[3] = - texelFetch(im_mat2, ivec3((FOUR * pos.x) + 3, mat1_x, 0), 0); -#endif -#endif - - // perform partial dot products and add partial result to results - for (int out_row = 0; out_row < TILE_ROWS; out_row++) { - for (int out_col = 0; out_col < FOUR; out_col++) { - results.data[out_row][out_col] += - dot(im_mat1_partial_load[out_row], im_mat2_partial_load[out_col]); - } - } - } - return results; -} - -FloatMatrix_3d matmul_partial_3d( - sampler3D im_mat1, - sampler3D im_mat2, - const ivec3 pos, - const int batch_size, - const int K_texel_len) { - FloatMatrix_3d results; - for (int i = 0; i < TILE_ROWS; i++) { - for (int j = 0; j < FOUR; j++) { - for (int k = 0; k < FOUR; k++) { - results.data[i][j][k] = 0.0f; - } - } - } - vec4 im_mat1_partial_load[TILE_ROWS]; - vec4 im_mat2_partial_load[FOUR]; - - for (int batch_idx = 0; batch_idx < FOUR; batch_idx++) { - if (FOUR * pos.z + batch_idx >= batch_size) { - break; - } - int mat_z = FOUR * pos.z + batch_idx; - for (int mat1_x = 0; mat1_x < K_texel_len; mat1_x++) { - for (int offset = 0; offset < TILE_ROWS; offset++) { - // read and cache 2x4 (or 4x4) tile of im_mat1 - const int mat1_y = (TILE_ROWS * pos.y) + offset; - const ivec3 mat1_pos = ivec3(mat1_x, mat1_y, mat_z); - im_mat1_partial_load[offset] = texelFetch(im_mat1, mat1_pos, 0); - // read and cache 4x4 tile of im_mat2 -#ifdef MAT2_IS_TRANSPOSED - const int mat2_y = (FOUR * pos.x) + offset; - const ivec3 mat2_pos = ivec3(mat1_x, mat2_y, 0); - im_mat2_partial_load[offset] = texelFetch(im_mat2, mat2_pos, 0); -#else - const int mat2_x = (FOUR * pos.x) + offset; - const ivec3 mat2_pos = ivec3(mat2_x, 
mat1_x, mat_z); - im_mat2_partial_load[offset] = texelFetch(im_mat2, mat2_pos, 0); -#endif - } - -#ifdef TILE_ROW_2 -// column 3, and 4 of im_mat2 -#ifdef MAT2_IS_TRANSPOSED - im_mat2_partial_load[2] = - texelFetch(im_mat2, ivec3(mat1_x, (FOUR * pos.x) + 2, 0), 0); - im_mat2_partial_load[3] = - texelFetch(im_mat2, ivec3(mat1_x, (FOUR * pos.x) + 3, 0), 0); -#else - im_mat2_partial_load[2] = - texelFetch(im_mat2, ivec3((FOUR * pos.x) + 2, mat1_x, mat_z), 0); - im_mat2_partial_load[3] = - texelFetch(im_mat2, ivec3((FOUR * pos.x) + 3, mat1_x, mat_z), 0); -#endif -#endif - - // perform partial dot products and add partial result to results - for (int out_row = 0; out_row < TILE_ROWS; out_row++) { - for (int out_col = 0; out_col < FOUR; out_col++) { - results.data[out_row][out_col][batch_idx] += - dot(im_mat1_partial_load[out_row], im_mat2_partial_load[out_col]); - } - } - } - } - return results; -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/matmul_naive_buffer.glsl b/backends/vulkan/runtime/graph/ops/glsl/matmul_naive_buffer.glsl index 25a6a742779..e4064eed2fa 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/matmul_naive_buffer.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/matmul_naive_buffer.glsl @@ -32,29 +32,29 @@ ${layout_declare_ubo(9, "int", "out_numel")} layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; void main() { - const ivec4 out_idx = ivec4( + const ivec4 out_bufix = ivec4( gl_GlobalInvocationID.x, gl_GlobalInvocationID.y, gl_GlobalInvocationID.z % out_sizes.z, gl_GlobalInvocationID.z / out_sizes.z); - if (any(greaterThanEqual(out_idx, out_sizes))) { + if (any(greaterThanEqual(out_bufix, out_sizes))) { return; } - int mat1_id = to_buffer_id( - ivec4(0, out_idx.y, out_idx.z, out_idx.w), mat1_strides); - int mat2_id = to_buffer_id( - ivec4(out_idx.x, 0, out_idx.z, out_idx.w), mat2_strides); + int mat1_bufi = tidx_to_bufi( + ivec4(0, out_bufix.y, out_bufix.z, out_bufix.w), mat1_strides); + int mat2_bufi = tidx_to_bufi( + ivec4(out_bufix.x, 0, out_bufix.z, out_bufix.w), mat2_strides); T sum = T(0.0); for (int i = 0; i < mat1_sizes.x; ++i) { - sum += t_mat1[mat1_id] * t_mat2[mat2_id]; + sum += t_mat1[mat1_bufi] * t_mat2[mat2_bufi]; - mat1_id += mat1_strides.x; - mat2_id += mat2_strides.y; + mat1_bufi += mat1_strides.x; + mat2_bufi += mat2_strides.y; } - const int out_id = to_buffer_id(out_idx, out_strides); - t_out[out_id] = T(sum); + const int out_bufi = tidx_to_bufi(out_bufix, out_strides); + t_out[out_bufi] = T(sum); } diff --git a/backends/vulkan/runtime/graph/ops/glsl/matmul_naive_texture3d.glsl b/backends/vulkan/runtime/graph/ops/glsl/matmul_naive_texture3d.glsl deleted file mode 100644 index 7225f2c64a0..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/matmul_naive_texture3d.glsl +++ /dev/null @@ -1,55 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#version 450 core - -#define PRECISION ${PRECISION} - -$if MAT2_IS_TRANSPOSED: - #define MAT2_IS_TRANSPOSED - -#include "indexing_utils.h" -#include "matmul.h" - -${layout_declare_tensor(0, "w", "im_out", DTYPE, "texture3d")} -${layout_declare_tensor(1, "r", "im_mat1", DTYPE, "texture3d")} -${layout_declare_tensor(2, "r", "im_mat2", DTYPE, "texture3d")} -${layout_declare_ubo(3, "ivec3", "out_limits")} -${layout_declare_ubo(4, "ivec4", "in_sizes")} - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -void main() { - const ivec3 pos = ivec3(gl_GlobalInvocationID); - - if (any(greaterThanEqual(pos, out_limits))) { - return; - } - - vec4 texel = vec4(0); - - $if MAT1_PACKING == "W_packed": - $if MAT2_PACKING == "H_packed": - texel = matmul_naive_W_packed_H_packed( - im_mat1, - im_mat2, - pos, - in_sizes[0]); - $elif MAT2_PACKING == "W_packed": - texel = matmul_naive_W_packed_W_packed( - im_mat1, - im_mat2, - pos, - in_sizes[0]); - $else: - $raise Exception("Unsupported value for MAT2_PACKING") - $else: - $raise Exception("Unsupported value combo for MAT1_PACKING and MAT2_PACKING") - - imageStore(im_out, pos, texel); -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/matmul_naive_texture3d.yaml b/backends/vulkan/runtime/graph/ops/glsl/matmul_naive_texture3d.yaml deleted file mode 100644 index bb1eed494a5..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/matmul_naive_texture3d.yaml +++ /dev/null @@ -1,24 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -matmul_naive_texture3d: - parameter_names_with_default_values: - DTYPE: float - STORAGE: texture3d - MAT1_PACKING: W_packed - MAT2_PACKING: H_packed - MAT2_IS_TRANSPOSED: false - generate_variant_forall: - DTYPE: - - VALUE: float - - VALUE: half - shader_variants: - - NAME: matmul_naive_texture3d_W_packed_H_packed - - NAME: matmul_naive_texture3d_W_packed_W_packed - MAT2_PACKING: W_packed - - NAME: matmul_transposed_naive_texture3d_W_packed_W_packed - MAT2_PACKING: W_packed - MAT2_IS_TRANSPOSED: true diff --git a/backends/vulkan/runtime/graph/ops/glsl/nchw_to_buffer.glsl b/backends/vulkan/runtime/graph/ops/glsl/nchw_to_buffer.glsl index d861972f935..ea4e0d300cc 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/nchw_to_buffer.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/nchw_to_buffer.glsl @@ -23,13 +23,13 @@ layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; layout(constant_id = 3) const int UNUSED_packed_dim = W_DIM; void main() { - int out_id = int(gl_GlobalInvocationID.x); - if (out_id >= numel) { + int out_bufi = int(gl_GlobalInvocationID.x); + if (out_bufi >= numel) { return; } - ivec4 out_idx = to_tensor_idx(out_id, out_strides); - const int in_id = to_nchw_buffer_i(out_idx, out_sizes); + ivec4 out_tidx = bufi_to_tidx(out_bufi, out_strides); + const int in_nchwi = tidx_to_nchwi(out_tidx, out_sizes); - t_out[out_id] = nchw_in[in_id]; + t_out[out_bufi] = nchw_in[in_nchwi]; } diff --git a/backends/vulkan/runtime/graph/ops/glsl/nchw_to_image.glsl b/backends/vulkan/runtime/graph/ops/glsl/nchw_to_image.glsl index 04b6a26cc44..b86a59fc234 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/nchw_to_image.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/nchw_to_image.glsl @@ -24,40 +24,40 @@ layout(std430) buffer; ${layout_declare_tensor(B, "w", "t_out", DTYPE, STORAGE)} ${layout_declare_buffer(B, "r", 
"nchw_in", DTYPE)} ${layout_declare_ubo(B, "ivec4", "sizes")} -${layout_declare_ubo(B, "ivec4", "axis_mapping")} +${layout_declare_ubo(B, "ivec4", "axis_map")} layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; layout(constant_id = 3) const int packed_dim = C_DIM; -VEC4_T read_texel(ivec4 tensor_idx) { - const ivec4 buf_indices = get_texel_nchw_buffer_ixs( - tensor_idx, +VEC4_T read_texel(ivec4 tidx) { + const ivec4 buf_indices = tidx_to_nchwi( + tidx, sizes, packed_dim); VEC4_T texel = VEC4_T(0); - if (tensor_idx[packed_dim] < sizes[packed_dim]) { + if (tidx[packed_dim] < sizes[packed_dim]) { texel.x = SCALAR_T(nchw_in[buf_indices.x]); } - if (tensor_idx[packed_dim] + 1 < sizes[packed_dim]) { + if (tidx[packed_dim] + 1 < sizes[packed_dim]) { texel.y = SCALAR_T(nchw_in[buf_indices.y]); } - if (tensor_idx[packed_dim] + 2 < sizes[packed_dim]) { + if (tidx[packed_dim] + 2 < sizes[packed_dim]) { texel.z = SCALAR_T(nchw_in[buf_indices.z]); } - if (tensor_idx[packed_dim] + 3 < sizes[packed_dim]) { + if (tidx[packed_dim] + 3 < sizes[packed_dim]) { texel.w = SCALAR_T(nchw_in[buf_indices.w]); } return texel; } void main() { - const ivec3 pos = ivec3(gl_GlobalInvocationID); - const ivec4 tensor_idx = to_tensor_idx(pos, sizes, axis_mapping, packed_dim); - if (any(greaterThanEqual(tensor_idx, sizes))) { + const ivec3 lpos = ivec3(gl_GlobalInvocationID); + const ivec4 tidx = lpos_to_tidx(lpos, sizes, axis_map.w, packed_dim); + if (any(greaterThanEqual(tidx, sizes))) { return; } - write_texel(t_out, pos, read_texel(tensor_idx)); + write_texel(t_out, lpos_to_pos(lpos, axis_map), read_texel(tidx)); } diff --git a/backends/vulkan/runtime/graph/ops/glsl/nchw_to_int8_image_noint8.glsl b/backends/vulkan/runtime/graph/ops/glsl/nchw_to_int8_image_noint8.glsl index 813a174d2a5..f3a3370f3ba 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/nchw_to_int8_image_noint8.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/nchw_to_int8_image_noint8.glsl @@ -19,7 +19,7 @@ layout(std430) buffer; ${layout_declare_tensor(B, "w", "t_out", "int8", "texture3d")} ${layout_declare_buffer(B, "r", "nchw_in", "int")} ${layout_declare_ubo(B, "ivec4", "sizes")} -${layout_declare_ubo(B, "ivec4", "axis_mapping")} +${layout_declare_ubo(B, "ivec4", "axis_map")} layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; @@ -35,9 +35,9 @@ int extend_sign(int x) { return x; } -ivec4 read_texel(ivec4 tensor_idx) { - const ivec4 buf_indices = get_texel_nchw_buffer_ixs( - tensor_idx, sizes, packed_dim); +ivec4 read_texel(ivec4 tidx) { + const ivec4 buf_indices = tidx_to_nchwi( + tidx, sizes, packed_dim); int shift = (1 << 8) - 1; ivec4 masks; @@ -52,7 +52,7 @@ ivec4 read_texel(ivec4 tensor_idx) { ivec4 out_tex = ivec4(0); [[unroll]] for (int i = 0; i < 4; ++i) { - if (tensor_idx[packed_dim] + i < sizes[packed_dim]) { + if (tidx[packed_dim] + i < sizes[packed_dim]) { int in_texel = nchw_in[buf_indices[i] / 4]; int extracted_val = (in_texel & masks[i]) >> (8 * (buf_indices[i] % 4)); extracted_val = extend_sign(extracted_val); @@ -64,12 +64,12 @@ ivec4 read_texel(ivec4 tensor_idx) { } void main() { - const ivec3 pos = ivec3(gl_GlobalInvocationID); - const ivec4 tensor_idx = to_tensor_idx(pos, sizes, axis_mapping, packed_dim); + const ivec3 lpos = ivec3(gl_GlobalInvocationID); + const ivec4 tidx = lpos_to_tidx(lpos, sizes, axis_map.w, packed_dim); - if (any(greaterThanEqual(tensor_idx, sizes))) { + if (any(greaterThanEqual(tidx, sizes))) { return; } - write_texel(t_out, pos, read_texel(tensor_idx)); + 
write_texel(t_out, lpos_to_pos(lpos, axis_map), read_texel(tidx)); } diff --git a/backends/vulkan/runtime/graph/ops/glsl/q_4w_linear.glsl b/backends/vulkan/runtime/graph/ops/glsl/q_4w_linear.glsl index 751d513d59d..de42f9ed996 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/q_4w_linear.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/q_4w_linear.glsl @@ -26,13 +26,14 @@ layout(std430) buffer; ${layout_declare_tensor(0, "w", "t_out", DTYPE, STORAGE)} ${layout_declare_tensor(1, "r", "t_mat1", DTYPE, STORAGE)} -${layout_declare_tensor(2, "r", "t_mat2", "int8", STORAGE)} +${layout_declare_tensor(2, "r", "t_mat2", "int8", "buffer")} ${layout_declare_tensor(3, "r", "t_scales_and_zeros", DTYPE, STORAGE)} $if STORAGE == "texture3d": ${layout_declare_ubo(4, "ivec4", "out_sizes")} ${layout_declare_ubo(5, "ivec4", "mat1_sizes")} - ${layout_declare_ubo(6, "ivec4", "scales_strides")} + ${layout_declare_ubo(6, "ivec4", "mat2_strides")} + ${layout_declare_ubo(7, "ivec4", "scales_strides")} $else: ${layout_declare_ubo(4, "ivec4", "out_sizes")} ${layout_declare_ubo(5, "ivec4", "out_strides")} @@ -64,9 +65,9 @@ void main() { float rc = 0.0; int k = 0; + const uint k_block = (K + group_size - 1) / group_size; #ifdef USING_BUFFER - const uint k_block = (K + group_size - 1) / group_size; ivec4 mat1_pos = ivec4(0, m, out_pos.z, out_pos.w); ivec4 mat2_pos = ivec4(0, n, out_pos.z, out_pos.w); ivec4 scale_pos = ivec4(0, n, 0, out_pos.w); @@ -74,69 +75,57 @@ void main() { for (int kb = 0; kb < k_block; kb++) { scale_pos.x = kb; - const int scale_id = to_buffer_id(scale_pos, scales_strides); - const float scale = float(t_scales_and_zeros[scale_id]); + const int scale_bufi = tidx_to_bufi(scale_pos, scales_strides); + const float scale = float(t_scales_and_zeros[scale_bufi]); zero_pos.x = kb; - const int zero_id = to_buffer_id(zero_pos, scales_strides); - const float zero = float(t_scales_and_zeros[zero_id]) - scale * 8.0; + const int zero_bufi = tidx_to_bufi(zero_pos, scales_strides); + const float zero = float(t_scales_and_zeros[zero_bufi]) - scale * 8.0; for(uint idx = 0; idx < group_size && k < K; idx++, k++) { mat1_pos.x = k; - const int mat1_id = to_buffer_id(mat1_pos, mat1_strides); - const float mat1_val = float(t_mat1[mat1_id]); + const int mat1_bufi = tidx_to_bufi(mat1_pos, mat1_strides); + const float mat1_val = float(t_mat1[mat1_bufi]); mat2_pos.x = k / 2; - const int mat2_id = to_buffer_id(mat2_pos, mat2_strides); + const int mat2_bufi = tidx_to_bufi(mat2_pos, mat2_strides); // Bitwise op treats sign bit from int8 as a value bit instead, // since there is no uint8_t datatype - uint mat2_val = (t_mat2[mat2_id] & 0xFF); + uint mat2_val = (t_mat2[mat2_bufi] & 0xFF); mat2_val = (k & 1) == 0 ? 
mat2_val & mask : (mat2_val >> 4); rc += mat1_val * (scale * float(mat2_val) + zero); } } - const int out_id = to_buffer_id(out_pos, out_strides); - t_out[out_id] = FLOAT_T(rc); + const int out_bufi = tidx_to_bufi(out_pos, out_strides); + t_out[out_bufi] = FLOAT_T(rc); #else // Using texture - const uint texel_group_size = group_size / FOUR; - const uint k_block = (K + texel_group_size - 1) / texel_group_size; ivec3 mat1_pos = ivec3(0, m, out_pos.z); - ivec3 mat2_pos = ivec3(0, n, out_pos.z); - ivec3 scale_pos = ivec3(0, n, 0); - ivec3 zero_pos = ivec3(0, n, 1); + ivec4 mat2_pos = ivec4(0, n, out_pos.z, out_pos.w); + ivec3 scale_zero_pos = ivec3(0, n, 0); + uint K_texel = K / FOUR; for (int kb = 0; kb < k_block; kb++) { - const int texel_kb = kb / FOUR; - const int kb_offset = kb % FOUR; - - scale_pos.x = texel_kb; - const VEC4_T scale_texel = load_texel(t_scales_and_zeros, scale_pos); - const float scale = float(scale_texel[kb_offset]); + scale_zero_pos.x = kb; + const vec4 scale_zero = load_texel(t_scales_and_zeros, scale_zero_pos); + const float scale = scale_zero.x; + const float zero = scale_zero.y - scale * 8.0; - zero_pos.x = texel_kb; - const VEC4_T zero_texel = load_texel(t_scales_and_zeros, zero_pos); - const float zero = float(zero_texel[kb_offset]) - scale * 8.0; - - for(uint idx = 0; idx < texel_group_size && k < K; idx++, k++) { + for(uint idx = 0; idx < group_size && k < K_texel; idx += FOUR, k++) { mat1_pos.x = k; const VEC4_T mat1_tex = load_texel(t_mat1, mat1_pos); - mat2_pos.x = k / 2; - const i8vec4 mat2_tex = i8vec4(load_texel(t_mat2, mat2_pos)); + mat2_pos.x = k * 2; // k * FOUR / 2 + const int mat2_id = tidx_to_bufi(mat2_pos, mat2_strides); - // Every two texels of mat1 correspond to one texel of mat2 - // Even mat1 indeces correspond to first half of mat2 texel and - // odd indeces correspond to second half - const int mat2_offset = (k & 1) == 0 ? 0 : 2; - for (int texel_idx = 0; texel_idx < FOUR; texel_idx++){ + for (int texel_pos = 0; texel_pos < FOUR; texel_pos++) { // Bitwise op treats sign bit from int8 as a value bit instead, // since there is no uint8_t datatype - uint mat2_val = (mat2_tex[mat2_offset + texel_idx / 2] & 0xFF); - mat2_val = (texel_idx & 1) == 0 ? mat2_val & mask : (mat2_val >> 4); - rc += mat1_tex[texel_idx] * (scale * float(mat2_val) + zero); + uint mat2_val = (t_mat2[mat2_id + texel_pos / 2] & 0xFF); + mat2_val = (texel_pos & 1) == 0 ? 
mat2_val & mask : (mat2_val >> 4); + rc += mat1_tex[texel_pos] * (scale * float(mat2_val) + zero); } } } diff --git a/backends/vulkan/runtime/graph/ops/glsl/q_8w_linear.glsl b/backends/vulkan/runtime/graph/ops/glsl/q_8w_linear.glsl index 7557a7b0c3d..a72df89b634 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/q_8w_linear.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/q_8w_linear.glsl @@ -49,14 +49,14 @@ layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; #ifdef USING_BUFFER void main() { - const int t_id = int(gl_GlobalInvocationID.x); - if (t_id >= out_numel) { + const int out_bufi = int(gl_GlobalInvocationID.x); + if (out_bufi >= out_numel) { return; } - const ivec4 out_idx = to_tensor_idx(t_id, out_strides, 0); + const ivec4 out_tidx = bufi_to_tidx(out_bufi, out_strides, 0); - t_out[t_id] = q_8w_linear(out_idx, mat1_sizes.x); + t_out[out_bufi] = q_8w_linear(out_tidx, mat1_sizes.x); } #else // USING_TEXTURE diff --git a/backends/vulkan/runtime/graph/ops/glsl/repeat_interleave.glsl b/backends/vulkan/runtime/graph/ops/glsl/repeat_interleave.glsl new file mode 100644 index 00000000000..3ade1f10cba --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/repeat_interleave.glsl @@ -0,0 +1,47 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#version 450 core + +#define PRECISION ${PRECISION} + +#define VEC4_T ${texel_load_type(DTYPE, STORAGE)} + +${define_required_extensions(DTYPE)} + +layout(std430) buffer; + +${layout_declare_tensor(B, "w", "tout", DTYPE, STORAGE)} +${layout_declare_tensor(B, "r", "tin", DTYPE, STORAGE)} +${layout_declare_ubo(B, "ivec3", "tin_limits")} +${layout_declare_ubo(B, "ivec4", "tin_axis_map")} +${layout_declare_ubo(B, "ivec4", "tout_axis_map")} + +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; + +layout(constant_id = 3) const int nrepeats = 1; +layout(constant_id = 4) const int repeat_dim = 1; + +#include "indexing_utils.h" + +void main() { + const ivec3 tin_lpos = ivec3(gl_GlobalInvocationID); + + if (any(greaterThanEqual(tin_lpos, tin_limits))) { + return; + } + + const VEC4_T intex = load_texel_lpos(tin, tin_lpos, tin_axis_map); + + ivec3 tout_lpos = tin_lpos; + tout_lpos[repeat_dim] *= nrepeats; + + for (int i = 0; i < nrepeats; ++i, tout_lpos[repeat_dim]++) { + write_texel_lpos(tout, tout_lpos, intex, tout_axis_map); + } +} diff --git a/backends/vulkan/runtime/graph/ops/glsl/repeat_interleave.yaml b/backends/vulkan/runtime/graph/ops/glsl/repeat_interleave.yaml new file mode 100644 index 00000000000..5c284a580c9 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/repeat_interleave.yaml @@ -0,0 +1,10 @@ +repeat_interleave: + parameter_names_with_default_values: + DTYPE: float + STORAGE: texture3d + generate_variant_forall: + DTYPE: + - VALUE: half + - VALUE: float + shader_variants: + - NAME: repeat_interleave diff --git a/backends/vulkan/runtime/graph/ops/glsl/sdpa_attn_weight_scale_and_mask.glsl b/backends/vulkan/runtime/graph/ops/glsl/sdpa_attn_weight_scale_and_mask.glsl new file mode 100644 index 00000000000..1e854bf7f85 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/sdpa_attn_weight_scale_and_mask.glsl @@ -0,0 +1,120 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#version 450 core + +#define PRECISION ${PRECISION} + +#define T ${buffer_scalar_type(DTYPE)} + +${define_active_storage_type(STORAGE)} +${define_required_extensions(DTYPE)} + +#extension GL_EXT_control_flow_attributes : require + +layout(std430) buffer; + +${layout_declare_tensor(B, "rw", "attn_weight", DTYPE, STORAGE)} + +$if STORAGE == "buffer": + ${layout_declare_ubo(B, "ivec4", "attn_weight_sizes")} + ${layout_declare_ubo(B, "ivec4", "attn_weight_strides")} +$else: + ${layout_declare_ubo(B, "ivec3", "attn_weight_limits")} + +${layout_declare_ubo(B, "int", "input_pos")} +${layout_declare_ubo(B, "float", "scale")} + + +#include "indexing_utils.h" + +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; + +// Negative infinity is represented by having sign bit be 1, all exponent bits +// be 1, all mantissa bits be 0. +#define NEGATIVE_INF_BITS 0xFF800000 +const float negative_infinity = NEGATIVE_INF_BITS; + +#ifdef USING_BUFFER + +/* + * This implementations applies a scale and mask to the attention weight tensor + * of an SDPA block. The sizes of the attention weight is + * (batch_size, n_heads, seq_len, input_pos + seq_len) + * Conceptually the weights represent the relationship between each token in the + * sequence with each token preceding it. + * + * The scale applied is 1.0 / sqrt(head_dim_length) + * + * The mask applied is a bit more complicated. Imagine you create a square + * matrix of size (input_pos + seq_len, input_pos + seq_len), and then set the + * lower triangular section of the matrix to -inf. Then, slice the matrix along + * the row dimension starting from input_pos to input_pos + seq_len. You end up + * with a partial mask with size (seq_len, input_pos + seq_len). This is the + * mask that is applied to the attention weight. + * + * In the shader, instead of generating the mask, the index of the elment is + * inspected to determine if it would have been masked. Given an element at + * tensor index (n, c, h, w), it would be masked if w < h + input_pos. + */ + +/*************************** + ** Buffer Implementation ** + ***************************/ + +void main() { + const ivec4 attn_weight_idx = ivec4( + gl_GlobalInvocationID.x, + gl_GlobalInvocationID.y, + gl_GlobalInvocationID.z, + 0); + + if (any(greaterThanEqual(attn_weight_idx, attn_weight_sizes))) { + return; + } + + const T scale_conv = T(scale); + + const int attn_weight_id = tidx_to_bufi(attn_weight_idx, attn_weight_strides); + if (attn_weight_idx.x <= attn_weight_idx.y + input_pos) { + attn_weight[attn_weight_id] = attn_weight[attn_weight_id] * scale_conv; + } else { + attn_weight[attn_weight_id] = T(negative_infinity); + } +} + +#else + +/**************************** + ** Texture Implementation ** + ****************************/ + +/* + * This implementation assumes that the attention weight is width packed, i.e. + * the packed dim of the attn_weight is 0. 
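 *
 * As a worked example (hypothetical values): with input_pos = 4, the texel at
 * attn_weight_pos = (1, 2, 0) covers w = 4..7 for h = 2. A lane is masked when
 * w > h + input_pos = 6, so only the lane with w = 7 is set to -inf while the
 * other three lanes keep their scaled values.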
+ */ +void main() { + const ivec3 attn_weight_pos = ivec3(gl_GlobalInvocationID); + + if (any(greaterThanEqual(attn_weight_pos, attn_weight_limits))) { + return; + } + + vec4 outtex = imageLoad(attn_weight, attn_weight_pos) * scale; + + // Mask out the upper triangular of attn_weight to -inf + [[unroll]] for (int i = 0; i < 4; ++i) { + if (attn_weight_pos.x * 4 + i > attn_weight_pos.y + input_pos) { + outtex[i] = negative_infinity; + } + } + + write_texel(attn_weight, attn_weight_pos, outtex); +} + +#endif // USING_BUFFER diff --git a/backends/vulkan/runtime/graph/ops/glsl/sdpa_attn_weight_scale_and_mask.yaml b/backends/vulkan/runtime/graph/ops/glsl/sdpa_attn_weight_scale_and_mask.yaml new file mode 100644 index 00000000000..ca8806fe000 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/sdpa_attn_weight_scale_and_mask.yaml @@ -0,0 +1,13 @@ +sdpa_attn_weight_scale_and_mask: + parameter_names_with_default_values: + DTYPE: float + STORAGE: buffer + generate_variant_forall: + STORAGE: + - VALUE: buffer + - VALUE: texture3d + DTYPE: + - VALUE: half + - VALUE: float + shader_variants: + - NAME: sdpa_attn_weight_scale_and_mask diff --git a/backends/vulkan/runtime/graph/ops/glsl/slice_channel.glsl b/backends/vulkan/runtime/graph/ops/glsl/slice_channel.glsl index d1562d65762..45e6c3358e8 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/slice_channel.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/slice_channel.glsl @@ -43,11 +43,11 @@ void main() { // we calculate the source whcn-coordinate amended with offset-ed channel // value. Then we calculate the actual texture position from the // whcn-coordinate. - const ivec4 buf_indices = get_texel_nchw_buffer_ixs(idx, out_sizes, packed_dim); + const ivec4 buf_indices = tidx_to_nchwi(idx, out_sizes, packed_dim); vec4 outex; for (int i=0;i<4;i++) { - ivec4 user_coor = from_nchw_buffer_i(buf_indices[i], out_sizes); + ivec4 user_coor = nchwi_to_tidx(buf_indices[i], out_sizes); int in_channel = user_coor.z; diff --git a/backends/vulkan/runtime/graph/ops/glsl/softmax.glsl b/backends/vulkan/runtime/graph/ops/glsl/softmax.glsl new file mode 100644 index 00000000000..d35492bc367 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/softmax.glsl @@ -0,0 +1,267 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#version 450 core + +#define PRECISION ${PRECISION} + +#define op1(X) ${OPERATOR1} + +#define op2(X, Y) ${OPERATOR2} + +${define_active_storage_type(STORAGE)} + +#extension GL_EXT_control_flow_attributes : require + +layout(std430) buffer; + +${layout_declare_tensor(B, "w", "tout", DTYPE, STORAGE)} +${layout_declare_tensor(B, "r", "tin", DTYPE, STORAGE)} + +${layout_declare_ubo(B, "ivec3", "tout_limits")} +${layout_declare_ubo(B, "ivec4", "tin_sizes")} + +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; + +layout(constant_id = 3) const int packed_dim = 0; +layout(constant_id = 4) const int reduce_dim = 0; +layout(constant_id = 5) const int group_dim = 1; + +// A more verbose name would be NWORKERS_PER_GROUP. This describes the number of +// threads that will co-operate to compute one reduction output. There may be +// multiple groups computing distinct reduction outputs within one work group. 
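// For example, with NWORKERS = 4 and a hypothetical 4x4x1 local work group
// (16 threads), the work group contains 4 reduction groups of 4 workers each;
// every worker writes its partial max/sum into its own slot of the shared
// array below, and the 4 slots of a group are then combined into one result.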
+#define NWORKERS 4 + +// Sets an upper limit on the total size of a work group based on how many +// elements are allocated in the shared memory array below. Each thread in the +// work group will write into its assigned element in the shared array. +#define MAX_NTHREADS 16 + +shared vec4 shared_vecs[MAX_NTHREADS]; + +#include "indexing_utils.h" + +int tid_to_smi(const ivec2 tid) { + return tid.x + tid.y * NWORKERS; +} + +/* + * The shaders below compute softmax for a tensor. Softmax is an interesting mix + * between a reduction operator and a unary elementwise operator, defined as + * exp(x) / (sum of exp(x)). The general flow of the computation is: + * + * First, find the maximum element along the reduction dim. The maximum element + * is used to preserve numerical stability, since division of exponents is + * translation invariant. + * + * Next, compute the sum of exp(x - max_element) along the reduction dim. + * + * Finally, for each element along the reduction dim, we compute the output as + * exp(x - max_element) / sum_of_exponents. + * + * The shaders below also utilize shared memory to have multiple threads help + * compute the max and sum reduction operations. A total of NGROUPS x NWORKERS + * threads are launched. Each group works on a unique reduction "row", and + * within a group NWORKERS threads co-operate to compute the max and sum of one + * "row". Each worker in the group is responsible for computing a partial output + * of the "row" and uploading it to shared memory; the overall reduction output + * can then be determined by aggregating the partial outputs stored in shared + * memory. + * + * As a caveat, this shader does not currently support cases where `batch` > 1 + * and the reduce dim happens to also be the batch concatenation dim. To support + * this, there will need to be additional logic to set the starting value of + * `scan_pos[reduce_dim]`. Since this is not expected to be a common use-case, + * supporting this case is left as an exercise for when it is required. + * + * As a final note, log softmax is supported with this shader as well since via + * the op1 and op2 macro definitions. See the corresponding YAML file for more + * details. + */ + +/* + * Computes softmax where the reduction dim is orthogonal to the packed dim. + * This case is simpler because each element of a texel belongs to a separate + * reduction dim, meaning we don't have to perform reduction along a texel. 
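 *
 * For instance (hypothetical configuration): with a width-packed input
 * (packed_dim = 0) and softmax taken over the height dim (reduce_dim = 1),
 * the four lanes of a texel hold four different W values while the reduction
 * walks down H. Each lane is an independent softmax, so whole texels can be
 * max()-ed and summed without ever combining lanes.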
+ */ +void softmax_nonpacked_dim(const ivec2 tid, ivec3 scan_pos) { + // shared memory index of this thread + const int smi = tid_to_smi(tid); + // used to iterate over all shared memory in the group + int group_i; + + scan_pos[reduce_dim] = tid.x; + vec4 max_elements = load_texel(tin, scan_pos); + // This thread computes a partial maximum + for (int i = tid.x; i < tin_sizes[reduce_dim]; + i += NWORKERS, scan_pos[reduce_dim] += NWORKERS) { + max_elements = max(max_elements, load_texel(tin, scan_pos)); + } + shared_vecs[smi] = max_elements; + barrier(); + // Iterate over the partial maximums to obtain the overall maximum + group_i = tid.y * NWORKERS; + max_elements = shared_vecs[group_i++]; + for (int i = 1; i < NWORKERS; ++i, group_i++) { + max_elements = max(max_elements, shared_vecs[group_i]); + } + + scan_pos[reduce_dim] = tid.x; + vec4 denominators = vec4(0); + // Compute partial sum + for (int i = tid.x; i < tin_sizes[reduce_dim]; + i += NWORKERS, scan_pos[reduce_dim] += NWORKERS) { + denominators += exp(load_texel(tin, scan_pos) - max_elements); + } + shared_vecs[smi] = denominators; + barrier(); + // Iterate over the partial sums to obtain the overall sum + group_i = tid.y * NWORKERS; + denominators = shared_vecs[group_i++]; + for (int i = 1; i < NWORKERS; ++i, group_i++) { + denominators += shared_vecs[group_i]; + } + + // Determine if there are any padding elements in the final texel of the + // packed dimension + const int nspill = mod4(tin_sizes[packed_dim]); + // Detect if this thread is working on the final texels of the packed + // dimension, which may have padding elements + const bool is_last_texel = + scan_pos[packed_dim] == (tout_limits[packed_dim] - 1); + + scan_pos[reduce_dim] = tid.x; + for (int i = tid.x; i < tin_sizes[reduce_dim]; + i += NWORKERS, scan_pos[reduce_dim] += NWORKERS) { + const vec4 numerators = op1(load_texel(tin, scan_pos) - max_elements); + vec4 outtex = op2(numerators, denominators); + // For the last texel in the packed dim, make sure that the padding elements + // are explicitly set to 0. Otherwise, they may influence computations later + // down the line. + if (is_last_texel && nspill > 0) { + [[unroll]] for (int i = nspill; i < 4; ++i) { + outtex[i] = 0; + } + } + write_texel(tout, scan_pos, outtex); + } +} + +/* + * Compute softmax where the reduction dim is also the packed dim. This case is + * complex because the reduction needs to occur over the individual texels. + * Therefore, in this algorithm each element of the accumulator texels are + * themselves partial outputs. Special care has to be taken to ignore padding + * elements in texels (which occur when the size of the packed dim is not a + * multiple of 4) so that they do not influence the output of reduction. 
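 *
 * For instance (hypothetical configuration): with reduce_dim == packed_dim == 0
 * and a width of 10, a row is stored as texels {0..3}, {4..7}, {8, 9, pad, pad}.
 * The per-lane partial maxima and sums must finally be collapsed across the
 * four lanes, and the two padding lanes of the last texel are processed
 * element-by-element so that they never contribute to the result.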
+ */ +void softmax_packed_dim(const ivec2 tid, ivec3 scan_pos) { + // shared memory index of this thread + const int smi = tid_to_smi(tid); + // used to iterate over all shared memory in the group + int group_i; + + const int nspill = mod4(tin_sizes[packed_dim]); + const int reduce_len = tin_sizes[packed_dim] - nspill; + + scan_pos[reduce_dim] = tid.x; + vec4 max_elements = vec4(load_texel(tin, scan_pos).x); + for (int i = tid.x * 4; i < reduce_len; + i += NWORKERS * 4, scan_pos[reduce_dim] += NWORKERS) { + max_elements = max(max_elements, load_texel(tin, scan_pos)); + } + // For the last texel in the dim, if there are padding elements then each + // element of the texel needs to be processed individually such that the + // padding elements are ignored + if (scan_pos[reduce_dim] == tout_limits[reduce_dim] - 1 && nspill > 0) { + const vec4 intex = load_texel(tin, scan_pos); + for (int i = 0; i < nspill; ++i) { + max_elements.x = max(intex[i], max_elements.x); + } + } + shared_vecs[smi] = max_elements; + barrier(); + // Iterate over the partial maximums to obtain the overall maximum + group_i = tid.y * NWORKERS; + max_elements = shared_vecs[group_i++]; + for (int i = 1; i < NWORKERS; ++i, group_i++) { + max_elements = max(max_elements, shared_vecs[group_i]); + } + // Each element of the texel is itself a partial maximum; iterate over the + // texel to find the actual maximum + float max_element = max_elements.x; + [[unroll]] for (int i = 1; i < 4; ++i) { + max_element = max(max_elements[i], max_element); + } + + scan_pos[reduce_dim] = tid.x; + vec4 denominators = vec4(0); + for (int i = tid.x * 4; i < reduce_len; + i += NWORKERS * 4, scan_pos[reduce_dim] += NWORKERS) { + denominators += exp(load_texel(tin, scan_pos) - max_element); + } + // For the last texel in the dim, if there are padding elements then each + // element of the texel needs to be processed individually such that the + // padding elements are ignored + if (nspill > 0 && scan_pos[reduce_dim] == tout_limits[reduce_dim] - 1) { + const vec4 intex = load_texel(tin, scan_pos); + for (int i = 0; i < nspill; ++i) { + denominators.x += exp(intex[i] - max_element); + } + } + shared_vecs[smi] = denominators; + barrier(); + // Iterate over the partial sums to obtain the overall sum + group_i = tid.y * NWORKERS; + denominators = shared_vecs[group_i++]; + for (int i = 1; i < NWORKERS; ++i, group_i++) { + denominators += shared_vecs[group_i]; + } + // Reduce over the accumulated texel to find the overall sum + float denominator = 0; + [[unroll]] for (int i = 0; i < 4; ++i) { + denominator += denominators[i]; + } + + scan_pos[reduce_dim] = tid.x; + for (int i = tid.x * 4; i < reduce_len; + i += NWORKERS * 4, scan_pos[reduce_dim] += NWORKERS) { + const vec4 numerators = op1(load_texel(tin, scan_pos) - max_element); + write_texel(tout, scan_pos, op2(numerators, denominator)); + } + // For the last texel in the dim, if there are padding elements then the + // padding elements need to be set to 0 explicitly, otherwise they may + // influence subsequent operations. 
+ if (nspill > 0 && scan_pos[reduce_dim] == tout_limits[reduce_dim] - 1) { + const vec4 numerator = op1(load_texel(tin, scan_pos) - max_element); + vec4 outtex = op2(numerator, denominator); + [[unroll]] for (int i = nspill; i < 4; ++i) { + outtex[i] = 0; + } + write_texel(tout, scan_pos, outtex); + } +} + +void main() { + ivec3 scan_pos = ivec3(gl_GlobalInvocationID); + scan_pos[reduce_dim] = 0; + + const ivec2 tid = ivec2( + gl_LocalInvocationID[reduce_dim], + gl_LocalInvocationID[group_dim]); + + if (any(greaterThanEqual(scan_pos, tout_limits))) { + return; + } + + if (reduce_dim != packed_dim) { + softmax_nonpacked_dim(tid, scan_pos); + } else { + softmax_packed_dim(tid, scan_pos); + } +} diff --git a/backends/vulkan/runtime/graph/ops/glsl/softmax.h b/backends/vulkan/runtime/graph/ops/glsl/softmax.h deleted file mode 100644 index b10e27bba7e..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/softmax.h +++ /dev/null @@ -1,43 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -// The following two are helper functions to implement `softmax` -// `early_exit` is the global workgroup position-based condition for unnecessary -// invocations to exit. -ivec4 get_early_exit(ivec4 sizes, int in_dim, int compute_dim) { - ivec4 early_exit = { - sizes.x, // w - sizes.y, // h - divup4(sizes.z) * sizes.w, // divup4(c) * n - 0 // zero pad - }; - if (in_dim == 4 && compute_dim == 1) { - return early_exit; - } else if (in_dim == 4 && compute_dim == 0) { - early_exit[2] = divup4(sizes.z); - return early_exit; - } else { - early_exit[in_dim - compute_dim - 1] = 1; - return early_exit; - } -} - -// `input_dim_stride` is the stride to include elements along the softmax -// dimension calculation. -ivec4 get_input_dim_stride(int in_dim, int compute_dim, int in_channel) { - ivec4 input_dim_stride = ivec4(0); - if (in_dim == 4 && compute_dim == 1) { - return input_dim_stride; - } else if (in_dim == 4 && compute_dim == 0) { - input_dim_stride[2] = divup4(in_channel); - return input_dim_stride; - } else { - input_dim_stride[in_dim - compute_dim - 1] = 1; - return input_dim_stride; - } -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/softmax_channel.yaml b/backends/vulkan/runtime/graph/ops/glsl/softmax.yaml similarity index 83% rename from backends/vulkan/runtime/graph/ops/glsl/softmax_channel.yaml rename to backends/vulkan/runtime/graph/ops/glsl/softmax.yaml index b9afc6467ca..d50bbb85f33 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/softmax_channel.yaml +++ b/backends/vulkan/runtime/graph/ops/glsl/softmax.yaml @@ -4,18 +4,18 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
-softmax_channel: +softmax: parameter_names_with_default_values: OPERATOR1: exp(X) OPERATOR2: X / Y - NDIM: 3 DTYPE: float + STORAGE: texture3d generate_variant_forall: DTYPE: - VALUE: half - VALUE: float shader_variants: - - NAME: softmax_channel - - NAME: log_softmax_channel + - NAME: softmax + - NAME: log_softmax OPERATOR1: X OPERATOR2: X - log(Y) diff --git a/backends/vulkan/runtime/graph/ops/glsl/softmax_batch_height_width.glsl b/backends/vulkan/runtime/graph/ops/glsl/softmax_batch_height_width.glsl deleted file mode 100644 index 49412b24c6e..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/softmax_batch_height_width.glsl +++ /dev/null @@ -1,84 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#version 450 core - -#define PRECISION ${PRECISION} - -#define op1(X) ${OPERATOR1} - -#define op2(X, Y) ${OPERATOR2} - -#include "indexing_utils.h" -#include "softmax.h" - -layout(std430) buffer; - -layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly ${IMAGE_T[NDIM][DTYPE]} image_out; -layout(set = 0, binding = 1) uniform PRECISION sampler3D image_in; - -layout(set = 0, binding = 2) uniform PRECISION restrict Extents { - ivec3 extents; -}; - -layout(set = 0, binding = 3) uniform PRECISION restrict Sizes { - ivec4 sizes; -}; - -layout(set = 0, binding = 4) uniform PRECISION restrict Params { - // x in_dim - // y softmax_dim - ivec2 dims; -}; - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -/* - * This shader can compute softmax along batch, height, and width. - */ - -void main() { - const ivec3 pos = ivec3(gl_GlobalInvocationID); - - // `early_exit` is the global workgroup position-based condition for unnecessary invocations to exit. - ivec4 early_exit = get_early_exit(sizes, dims.x, dims.y); - - if (!all(lessThan(pos, early_exit.xyz))) { - return; - } - - // `input_dim_stride` is the stride to include elements along the softmax dimension calculation. - ivec4 input_dim_stride = get_input_dim_stride(dims.x, dims.y, sizes.z); - - // Calculate the denominator for the whole dimension. - // For numerical stability to avoid floating point overflow, - // we leverage the translation invariance of the softmax function, - // subtracting every element along input_dim_stride by - // the maximum element along input_dim_stride. - // find the maximum element - vec4 max_element = texelFetch(image_in, pos, 0); - ivec3 cand_pos = pos + input_dim_stride.xyz; - while (all(lessThan(cand_pos, extents.xyz))) { - max_element = max(texelFetch(image_in, cand_pos, 0), max_element); - cand_pos += input_dim_stride.xyz; - } - // Calculate the denominator along the direction of input_dim_stride. - cand_pos = pos; - vec4 denominator = vec4(0, 0, 0, 0); - while (all(lessThan(cand_pos, extents.xyz))) { - denominator += exp(texelFetch(image_in, cand_pos, 0) - max_element); - cand_pos += input_dim_stride.xyz; - } - // Calculate every final element along the direction of input_dim_stride. 
- cand_pos = pos; - while (all(lessThan(cand_pos, extents.xyz))) { - const vec4 numerator = op1(texelFetch(image_in, cand_pos, 0) - max_element); - imageStore(image_out, cand_pos, op2(numerator, denominator)); - cand_pos += input_dim_stride.xyz; - } -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/softmax_channel.glsl b/backends/vulkan/runtime/graph/ops/glsl/softmax_channel.glsl deleted file mode 100644 index 7922cb041d1..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/softmax_channel.glsl +++ /dev/null @@ -1,104 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#version 450 core - -#define PRECISION ${PRECISION} - -#define op1(X) ${OPERATOR1} - -#define op2(X, Y) ${OPERATOR2} - -#include "indexing_utils.h" -#include "softmax.h" - -layout(std430) buffer; - -layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly ${IMAGE_T[NDIM][DTYPE]} image_out; -layout(set = 0, binding = 1) uniform PRECISION sampler3D image_in; - - -layout(set = 0, binding = 2) uniform PRECISION restrict Extents { - ivec3 extents; -}; - -layout(set = 0, binding = 3) uniform PRECISION restrict Sizes { - ivec4 sizes; -}; - -layout(set = 0, binding = 4) uniform PRECISION restrict Params { - // x in_dim - // y softmax_dim - ivec2 dims; -}; - - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -void main() { - const ivec3 pos = ivec3(gl_GlobalInvocationID); - - // `early_exit` is the global workgroup position-based condition for unnecessary invocations to exit. - ivec4 early_exit = get_early_exit(sizes, dims.x, dims.y); - - // how "wide" a batch is in terms of z. Only have one invocation per batch, - // as one batch width has elements from every channel in-memory. - if (!all(lessThan(pos, early_exit.xyz))) { - return; - } - - const int b_stride = int(ceil(sizes.z / 4.0)); - const ivec3 src_pos = ivec3(pos.x, pos.y, pos.z * b_stride); - // tail case, padded zeros in memory if tensor's channel dim % 4 != 0 - uint tail_case_size = sizes.z % 4; - if (tail_case_size == 0) { - tail_case_size = 4; - } - // Calculate the denominator for the whole dimension. - // For numerical stability to avoid floating point overflow, - // we leverage the translation invariance of the softmax function, - // subtracting every element along channel by the maximum element along - // channel. find the maximum element - float max_element = texelFetch(image_in, src_pos, 0)[0]; - for (int c = 0; c < b_stride - 1; c++) { - const vec4 c_texel = - texelFetch(image_in, ivec3(src_pos.x, src_pos.y, src_pos.z + c), 0); - for (int t = 0; t < 4; t++) { - if (c_texel[t] > max_element) { - max_element = c_texel[t]; - } - } - } - vec4 c_texel = texelFetch( - image_in, ivec3(src_pos.x, src_pos.y, src_pos.z + b_stride - 1), 0); - for (int t = 0; t < tail_case_size; t++) { - if (c_texel[t] > max_element) { - max_element = c_texel[t]; - } - } - // Calculate the denominator. 
- float denominator = 0; - for (int c = 0; c < b_stride - 1; c++) { - const vec4 c_texel = - texelFetch(image_in, ivec3(src_pos.x, src_pos.y, src_pos.z + c), 0); - for (int t = 0; t < 4; t++) { - denominator += exp(c_texel[t] - max_element); - } - } - c_texel = texelFetch( - image_in, ivec3(src_pos.x, src_pos.y, src_pos.z + b_stride - 1), 0); - for (int t = 0; t < tail_case_size; t++) { - denominator += exp(c_texel[t] - max_element); - } - // Calculate every final channel element. - for (int c = 0; c < b_stride; c++) { - const ivec3 dst_pos = ivec3(src_pos.x, src_pos.y, src_pos.z + c); - const vec4 numerator = op1(texelFetch(image_in, dst_pos, 0) - max_element); - imageStore(image_out, dst_pos, op2(numerator, denominator)); - } -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/unary_op.yaml b/backends/vulkan/runtime/graph/ops/glsl/unary_op.yaml index eb05b10b108..2b9f0032f41 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/unary_op.yaml +++ b/backends/vulkan/runtime/graph/ops/glsl/unary_op.yaml @@ -38,3 +38,5 @@ unary_op: OPERATOR: hardshrink(X, A, B) - NAME: hardswish OPERATOR: hardswish(X) + - NAME: hardsigmoid + OPERATOR: hardsigmoid(X) diff --git a/backends/vulkan/runtime/graph/ops/glsl/view.glsl b/backends/vulkan/runtime/graph/ops/glsl/view.glsl index 0b0f587d1d5..8d45e65b396 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/view.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/view.glsl @@ -39,13 +39,13 @@ void main() { // Assume there is a virtual continous buffer in nchw format. From the output // pos, we first calculate the index in the virual buffer, and then calculate // the input position from the indx. - const ivec4 buf_indices = get_texel_nchw_buffer_ixs(out_tensor_idx, out_sizes, out_packed_dim); + const ivec4 buf_indices = tidx_to_nchwi(out_tensor_idx, out_sizes, out_packed_dim); VEC4_T value = VEC4_T(0); // Need to look up the 4 values in the output texel separately. 
for (int i = 0 ; i < 4; i++) { if (out_tensor_idx[out_packed_dim]++ < out_sizes[out_packed_dim]) { - ivec4 user_coor = from_nchw_buffer_i(buf_indices[i], in_sizes); + ivec4 user_coor = nchwi_to_tidx(buf_indices[i], in_sizes); ivec4 in_pos_elem = to_texture_elem_pos(user_coor, in_sizes, in_packed_dim); VEC4_T intex = texelFetch(t_in, in_pos_elem.xyz, 0); value[i] = intex[in_pos_elem.w]; diff --git a/backends/vulkan/runtime/graph/ops/impl/BatchNorm.cpp b/backends/vulkan/runtime/graph/ops/impl/BatchNorm.cpp index 8e346bd2088..eb0f1f99a2f 100644 --- a/backends/vulkan/runtime/graph/ops/impl/BatchNorm.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/BatchNorm.cpp @@ -88,7 +88,7 @@ void add_native_batch_norm_node( {{out_ref, vkapi::MemoryAccessType::WRITE}, {{in_ref, arg_weight, arg_bias, arg_mean, arg_var}, vkapi::MemoryAccessType::READ}}, - {t_out->texture_limits_ubo(), + {t_out->logical_limits_ubo(), graph.create_params_buffer(epsilon), graph.create_params_buffer(num_texel_per_batch)})); } diff --git a/backends/vulkan/runtime/graph/ops/impl/BinaryOp.cpp b/backends/vulkan/runtime/graph/ops/impl/BinaryOp.cpp index 6bab8d19111..3ae67489af9 100644 --- a/backends/vulkan/runtime/graph/ops/impl/BinaryOp.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/BinaryOp.cpp @@ -21,7 +21,7 @@ void check_binary_op_args( const api::vTensor& self, const api::vTensor& other, const api::vTensor& out) { - VK_CHECK_COND(check_same_memory_layout(self, other, out)); + VK_CHECK_COND(check_same_packed_dim(self, other, out)); std::vector broadcasted_sizes = calculate_broadcasted_output_size(self, other); VK_CHECK_COND(out.sizes() == broadcasted_sizes); @@ -53,7 +53,7 @@ void add_binary_op_node( const std::string& op_name) { ValueRef arg1 = prepack_if_tensor_ref(graph, in1); ValueRef arg2 = - prepack_if_tensor_ref(graph, in2, graph.memory_layout_of(arg1)); + prepack_if_tensor_ref(graph, in2, graph.estimate_memory_layout_of(arg1)); vTensorPtr t_in1 = graph.get_tensor(arg1); vTensorPtr t_in2 = graph.get_tensor(arg2); @@ -85,12 +85,15 @@ void add_binary_op_node( {{arg1, arg2}, vkapi::MemoryAccessType::READ}}, // Shader params buffers {t_out->sizes_ubo(), + t_out->axis_map_ubo(), t_in1->sizes_ubo(), + t_in1->axis_map_ubo(), t_in2->sizes_ubo(), + t_in2->axis_map_ubo(), graph.create_params_buffer(broadcast_params), graph.create_params_buffer(alpha_val)}, // Specialization Constants - {SV(t_out->packed_dim_whcn_idx())}, + {SV(t_out->packed_dim())}, // Resizing Logic resize_binary_op_node, {})); diff --git a/backends/vulkan/runtime/graph/ops/impl/Cat.cpp b/backends/vulkan/runtime/graph/ops/impl/Cat.cpp index cd947091bc1..d5cfd5f4505 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Cat.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Cat.cpp @@ -25,7 +25,7 @@ void add_cat_default_node( for (ValueRef input_ref : *input_list) { vTensorPtr t_in = graph.get_tensor(input_ref); - VK_CHECK_COND(check_memory_layout_is(*t_in, utils::kChannelsPacked)); + VK_CHECK_COND(check_packed_dim_is(*t_in, WHCN::kChannelsDim)); } int64_t dim = graph.extract_scalar(dim_ref); @@ -40,7 +40,7 @@ void add_cat_default_node( for (ValueRef input_ref : *input_list) { vTensorPtr t_in = graph.get_tensor(input_ref); - utils::ivec3 range = t_in->texture_limits(); + utils::ivec3 range = t_in->logical_limits(); add_copy_offset_node( graph, input_ref, range, src_offset, dst_offset, out); dst_offset[0] += range[0]; @@ -52,7 +52,7 @@ void add_cat_default_node( for (ValueRef input_ref : *input_list) { vTensorPtr t_in = graph.get_tensor(input_ref); - utils::ivec3 range = 
t_in->texture_limits(); + utils::ivec3 range = t_in->logical_limits(); add_copy_offset_node( graph, input_ref, range, src_offset, dst_offset, out); dst_offset[1] += range[1]; @@ -63,7 +63,7 @@ void add_cat_default_node( for (ValueRef input_ref : *input_list) { vTensorPtr t_in = graph.get_tensor(input_ref); - utils::ivec3 range = t_in->texture_limits(); + utils::ivec3 range = t_in->logical_limits(); add_copy_offset_node( graph, input_ref, range, src_offset, dst_offset, out); dst_offset[2] += range[2]; diff --git a/backends/vulkan/runtime/graph/ops/impl/Clone.cpp b/backends/vulkan/runtime/graph/ops/impl/Clone.cpp index cef751bc7c8..946a0c9f407 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Clone.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Clone.cpp @@ -32,7 +32,7 @@ void add_clone_node( graph.create_local_wg_size(out), {{out, vkapi::MemoryAccessType::WRITE}, {in, vkapi::MemoryAccessType::READ}}, - {t_out->texture_limits_ubo()})); + {t_out->logical_limits_ubo()})); } void clone(ComputeGraph& graph, const std::vector& args) { diff --git a/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp b/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp index dcdd2dccfa0..360193fb17f 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp @@ -106,9 +106,9 @@ ValueRef prepack_biases( graph.create_local_wg_size(v), vref, v, - {t->sizes_ubo(), t->axis_mapping_ubo()}, + {t->sizes_ubo(), t->axis_map_ubo()}, // Specialization constants - {SV(t->packed_dim_whcn_idx())})); + {SV(t->packed_dim())})); return v; } @@ -216,14 +216,14 @@ ValueRef prepack_weights( graph.create_params_buffer( utils::make_ivec4(original_sizes, /*reverse = */ true))}, // Specialization constants - {SV(t->packed_dim_whcn_idx())})); + {SV(t->packed_dim())})); return v; } void check_conv_args(const api::vTensor& in, const api::vTensor& out) { - VK_CHECK_COND(check_memory_layout_is(in, utils::kChannelsPacked)); - VK_CHECK_COND(check_memory_layout_is(out, utils::kChannelsPacked)); + VK_CHECK_COND(check_packed_dim_is(in, WHCN::kChannelsDim)); + VK_CHECK_COND(check_packed_dim_is(out, WHCN::kChannelsDim)); } struct Conv2dParams final { @@ -291,7 +291,7 @@ utils::uvec3 create_conv2d_global_wg_size( const Conv2dMethod method, const ValueRef out) { if (method == Conv2dMethod::Pointwise) { - const utils::uvec3 image_extents = graph.image_extents_of(out); + const utils::uvec3 image_extents = graph.logical_limits_of(out); return { utils::div_up(image_extents[0u], 2u), utils::div_up(image_extents[1u], 2u), @@ -376,7 +376,7 @@ void add_conv2d_node( {{arg_in, arg_weight, arg_bias}, vkapi::MemoryAccessType::READ}}, // Shader params buffers { - t_out->texture_limits_ubo(), + t_out->logical_limits_ubo(), t_in->sizes_ubo(), graph.create_params_buffer(kernel_params), graph.create_params_buffer(extra_params), @@ -444,7 +444,7 @@ void add_conv1d_node( int32_t out_group_size = static_cast(out_channels / groups_val); utils::uvec3 global_size = {1, static_cast(out_channels), 1}; - utils::uvec3 local_size = {1, 1, 1}; + utils::uvec3 local_size = {1, 64, 1}; Kernel1dParams kernel_params = { kernel_size, @@ -474,8 +474,12 @@ void add_conv1d_node( {{arg_in, arg_weight, arg_bias}, vkapi::MemoryAccessType::READ}}, // Shader params buffers { - t_out->texture_limits_ubo(), + t_out->logical_limits_ubo(), t_in->sizes_ubo(), + t_out->axis_map_ubo(), + t_in->axis_map_ubo(), + t_weight->axis_map_ubo(), + t_bias->axis_map_ubo(), graph.create_params_buffer(kernel_params), 
graph.create_params_buffer(out_params), }, diff --git a/backends/vulkan/runtime/graph/ops/impl/Copy.cpp b/backends/vulkan/runtime/graph/ops/impl/Copy.cpp index b15844e1409..c836a53d043 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Copy.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Copy.cpp @@ -33,19 +33,13 @@ void add_copy_offset_node( add_dtype_suffix(kernel_name, *t_out); const struct Block final { - ivec3 range; - int32_t unused0; - ivec3 src_offset; - int32_t unused1; - ivec3 dst_offset; - int32_t unused2; + alignas(16) ivec3 range; + alignas(16) ivec3 src_offset; + alignas(16) ivec3 dst_offset; } offset_params{ range, - 0, src_offset, - 0, dst_offset, - 0, }; auto shader = VK_KERNEL_FROM_STR(kernel_name); @@ -61,7 +55,11 @@ void add_copy_offset_node( {in, vkapi::MemoryAccessType::READ}, }, // Parameter buffers - {graph.create_params_buffer(offset_params)}, + { + graph.create_params_buffer(offset_params), + t_out->axis_map_ubo(), + t_in->axis_map_ubo(), + }, // Specialization Constants {})); } @@ -80,8 +78,8 @@ void add_copy_channel_offset_node( std::vector in_sizes = t_in->sizes(); std::vector out_sizes = t_out->sizes(); - VK_CHECK_COND(check_memory_layout_is(*t_in, utils::kChannelsPacked)); - VK_CHECK_COND(check_memory_layout_is(*t_out, utils::kChannelsPacked)); + VK_CHECK_COND(check_packed_dim_is(*t_in, WHCN::kChannelsDim)); + VK_CHECK_COND(check_packed_dim_is(*t_out, WHCN::kChannelsDim)); // NOTE: This function should be able to support 1d and 2d tensors when // range=1, src_offset=dst_offset=1. @@ -141,28 +139,17 @@ void add_copy_channel_offset_node( uvec3 local_size = adaptive_work_group_size(global_size); const struct Block final { - utils::ivec4 out_sizes; - utils::ivec4 in_sizes; - int32_t channel_range; - int32_t src_channel_offset; - int32_t dst_channel_offset; - int32_t unused; ivec3 range; - int32_t unused1; + int32_t channel_range; ivec3 dst_offset; - int32_t unused2; - + int32_t dst_channel_offset; + int32_t src_channel_offset; } channel_offset_params{ - utils::make_whcn_ivec4(out_sizes), - utils::make_whcn_ivec4(in_sizes), - channel_range, - src_channel_offset, - dst_channel_offset, - 0, utils::make_ivec3(global_size), - 0, + channel_range, dst_offset, - 0, + dst_channel_offset, + src_channel_offset, }; auto shader = VK_KERNEL_FROM_STR(kernel_name); @@ -179,7 +166,13 @@ void add_copy_channel_offset_node( {in, vkapi::MemoryAccessType::READ}, }, // Parameter buffers - {graph.create_params_buffer(channel_offset_params)}, + { + t_out->sizes_ubo(), + t_out->axis_map_ubo(), + t_in->sizes_ubo(), + t_in->axis_map_ubo(), + graph.create_params_buffer(channel_offset_params), + }, // Specialization Constants {})); } diff --git a/backends/vulkan/runtime/graph/ops/impl/Embedding.cpp b/backends/vulkan/runtime/graph/ops/impl/Embedding.cpp index be0b457b79c..2d733b4964c 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Embedding.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Embedding.cpp @@ -21,9 +21,9 @@ void check_embedding_args( const api::vTensor& weight, const api::vTensor& in, const api::vTensor& out) { - VK_CHECK_COND(check_memory_layout_is(weight, utils::kChannelsPacked)); - VK_CHECK_COND(check_memory_layout_is(in, utils::kChannelsPacked)); - VK_CHECK_COND(check_memory_layout_is(out, utils::kChannelsPacked)); + VK_CHECK_COND(check_packed_dim_is(weight, WHCN::kChannelsDim)); + VK_CHECK_COND(check_packed_dim_is(in, WHCN::kChannelsDim)); + VK_CHECK_COND(check_packed_dim_is(out, WHCN::kChannelsDim)); } void add_embedding_node( @@ -48,7 +48,12 @@ void 
add_embedding_node( graph.create_local_wg_size(out), {{out, vkapi::MemoryAccessType::WRITE}, {{in, weight}, vkapi::MemoryAccessType::READ}}, - {t_out->sizes_ubo()})); + { + t_out->sizes_ubo(), + t_out->axis_map_ubo(), + t_in->axis_map_ubo(), + t_weight->axis_map_ubo(), + })); } void embedding(ComputeGraph& graph, const std::vector& args) { diff --git a/backends/vulkan/runtime/graph/ops/impl/Flip.cpp b/backends/vulkan/runtime/graph/ops/impl/Flip.cpp new file mode 100644 index 00000000000..12f01708b05 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/impl/Flip.cpp @@ -0,0 +1,95 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include + +#include +#include +#include +#include + +namespace vkcompute { + +void check_flip_args(const api::vTensor& in, const api::vTensor& out) { + VK_CHECK_COND(check_packed_dim_is(in, WHCN::kChannelsDim)); + VK_CHECK_COND(check_packed_dim_is(out, WHCN::kChannelsDim)); +} + +void resize_flip_node( + ComputeGraph* graph, + const std::vector& args, + const std::vector& extra_args) { + (void)extra_args; + vTensorPtr out = graph->get_tensor(args[0].refs[0]); + vTensorPtr in = graph->get_tensor(args[1].refs[0]); + + out->virtual_resize(in->sizes()); +} + +utils::ivec4 create_whcn_bitmap( + const std::vector& list, + const int64_t ndim) { + std::vector bm(4, 0); + for (const auto e : list) { + auto x = (e % ndim + ndim) % ndim; // normalize + x = ndim - 1 - x; // reverse + bm.at(x) = 1; + } + return utils::make_ivec4(bm); +} + +void add_flip_node( + ComputeGraph& graph, + const ValueRef in, + const std::vector& dim_list, + const ValueRef out) { + vTensorPtr t_in = graph.get_tensor(in); + vTensorPtr t_out = graph.get_tensor(out); + check_flip_args(*t_in, *t_out); + + const auto dim_bitmap = create_whcn_bitmap(dim_list, t_in->dim()); + + std::string kernel_name("flip"); + kernel_name.reserve(kShaderNameReserve); + add_dtype_suffix(kernel_name, *t_out); + + graph.execute_nodes().emplace_back(new ExecuteNode( + graph, + VK_KERNEL_FROM_STR(kernel_name), + graph.create_global_wg_size(out), + graph.create_local_wg_size(out), + // Inputs and Outputs + { + {out, vkapi::kWrite}, + {in, vkapi::kRead}, + }, + // Parameter buffers + { + graph.logical_limits_ubo(out), + graph.sizes_ubo(out), + graph.create_params_buffer(dim_bitmap), + }, + // Specialization Constants + {}, + // Resizing Logic + resize_flip_node)); +} + +void flip(ComputeGraph& graph, const std::vector& args) { + ValueRef in = args[0]; + auto dims = graph.get_int_list(args[1]); + ValueRef out = args[2]; + + add_flip_node(graph, in, *dims, out); +} + +REGISTER_OPERATORS { + VK_REGISTER_OP(aten.flip.default, flip); +} + +} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/Full.cpp b/backends/vulkan/runtime/graph/ops/impl/Full.cpp index 157515e6e0a..34acb43c668 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Full.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Full.cpp @@ -54,7 +54,7 @@ void add_full_node( // Shader params buffers {t_out->sizes_ubo(), graph.create_params_buffer(fill_value_val)}, // Specialization Constants - {SV(t_out->packed_dim_whcn_idx())}, + {SV(t_out->packed_dim())}, // Resizing Logic resize_full_node, {size_or_in})); diff --git a/backends/vulkan/runtime/graph/ops/impl/IndexSelect.cpp b/backends/vulkan/runtime/graph/ops/impl/IndexSelect.cpp index 7b4e45262c0..d9a0cdedd79 100644 --- 
a/backends/vulkan/runtime/graph/ops/impl/IndexSelect.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/IndexSelect.cpp @@ -21,9 +21,9 @@ void check_index_select_args( const api::vTensor& in, const api::vTensor& idx, const api::vTensor& out) { - VK_CHECK_COND(check_memory_layout_is(in, utils::kChannelsPacked)); - VK_CHECK_COND(check_memory_layout_is(idx, utils::kChannelsPacked)); - VK_CHECK_COND(check_memory_layout_is(out, utils::kChannelsPacked)); + VK_CHECK_COND(check_packed_dim_is(in, WHCN::kChannelsDim)); + VK_CHECK_COND(check_packed_dim_is(idx, WHCN::kChannelsDim)); + VK_CHECK_COND(check_packed_dim_is(out, WHCN::kChannelsDim)); } void add_index_select_channel_node( diff --git a/backends/vulkan/runtime/graph/ops/impl/Linear.cpp b/backends/vulkan/runtime/graph/ops/impl/Linear.cpp index 14c814b084a..b96b8840026 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Linear.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Linear.cpp @@ -36,7 +36,7 @@ void check_addmm_args( VK_CHECK_COND(mat1_sizes.size() == 2 || mat1_sizes.size() == 3); VK_CHECK_COND(mat1_sizes.size() == mat2_sizes.size()); - VK_CHECK_COND(graph.memory_layout_of(mat1) == graph.memory_layout_of(out)); + VK_CHECK_COND(graph.packed_dim_of(mat1) == graph.packed_dim_of(out)); VK_CHECK_COND(utils::val_at(-1, mat1_sizes) == utils::val_at(-2, mat2_sizes)); @@ -100,27 +100,36 @@ void add_addmm_naive_node( std::string kernel_name = graph.get_bool(mat2_is_transposed) ? "linear_naive" : "addmm_naive"; kernel_name.reserve(kShaderNameReserve); - add_memory_layout_suffix(kernel_name, graph.memory_layout_of(mat1)); - add_memory_layout_suffix(kernel_name, graph.memory_layout_of(mat2)); + add_storage_type_suffix(kernel_name, graph.storage_type_of(out)); add_dtype_suffix(kernel_name, graph.dtype_of(out)); + utils::uvec3 global_wg_size = graph.logical_limits_of(out); graph.execute_nodes().emplace_back(new ExecuteNode( graph, VK_KERNEL_FROM_STR(kernel_name), - graph.create_global_wg_size(out), - graph.create_local_wg_size(out), + global_wg_size, + graph.create_local_wg_size(global_wg_size), // Inputs and Outputs {{out, vkapi::MemoryAccessType::WRITE}, {{mat1, mat2, self}, vkapi::MemoryAccessType::READ}}, // Shader params buffers { - graph.texture_limits_ubo(out), + graph.sizes_ubo(out), + graph.logical_limits_ubo(out), + graph.axis_map_ubo(out), graph.sizes_ubo(mat1), + graph.axis_map_ubo(mat1), + graph.sizes_ubo(mat2), + graph.axis_map_ubo(mat2), graph.sizes_ubo(self), + graph.axis_map_ubo(self), graph.create_params_buffer(params), }, // Specialization Constants - {}, + {graph.packed_dim_of(out), + graph.packed_dim_of(mat1), + graph.packed_dim_of(mat2), + graph.packed_dim_of(self)}, // Resizing Logic resize_addmm_node, {mat2_is_transposed})); @@ -151,7 +160,7 @@ void add_addmm_optimized_node( ValueRef mat2_packed = mat2; const utils::GPUMemoryLayout mat2_layout = mat2_is_transposed_val ? utils::kWidthPacked : utils::kHeightPacked; - if (graph.memory_layout_of(mat2) != mat2_layout) { + if (graph.estimate_memory_layout_of(mat2) != mat2_layout) { mat2_packed = graph.add_tensor_like(mat2, mat2_layout); viewFn(graph, {mat2, graph.add_none(), mat2_packed}); } @@ -173,7 +182,7 @@ void add_addmm_optimized_node( add_dtype_suffix(kernel_name, graph.dtype_of(out)); - utils::uvec3 global_size; + utils::uvec3 global_size = graph.logical_limits_of(out); // Each thread computes a W=(2/4) x H=4 x C=(1/4) output tile. Therefore, the // total number of threads is W/(2 or 4) x H/4 x C/1. 
Since the out tensor is @@ -182,11 +191,11 @@ void add_addmm_optimized_node( // this identity can be used to compute the tensor index of the top left // element in the tile, which will be [W=x*(2 or 4), H=y*4, C=z*(1 or 4), N=0] if (mat1_sizes.at(mat1_dims - 2) < 8) { - // Use `mapped_extents` instead of `image_extents` because the workgroup + // Use `logical_extents` instead of `image_extents` because the workgroup // axes need to correspond to tensor dimensions. - global_size = utils::divup_vec(graph.mapped_extents_of(out), {4, 2, 1}); + global_size = utils::divup_vec(global_size, {4, 2, 1}); } else { - global_size = utils::divup_vec(graph.mapped_extents_of(out), {4, 4, 1}); + global_size = utils::divup_vec(global_size, {4, 4, 1}); } utils::uvec3 local_size = adaptive_work_group_size(global_size); @@ -201,17 +210,17 @@ void add_addmm_optimized_node( // Shader params buffers { graph.sizes_ubo(out), - graph.axis_mapping_ubo(out), + graph.axis_map_ubo(out), graph.sizes_ubo(mat1_W_packed), - graph.axis_mapping_ubo(mat1_W_packed), + graph.axis_map_ubo(mat1_W_packed), graph.sizes_ubo(mat2_packed), - graph.axis_mapping_ubo(mat2_packed), + graph.axis_map_ubo(mat2_packed), graph.sizes_ubo(self), - graph.axis_mapping_ubo(self), + graph.axis_map_ubo(self), graph.create_params_buffer(params), }, // Specialization Constants - {graph.packed_dim_whcn_idx_of(out)}, + {graph.packed_dim_of(out)}, // Resizing Logic resize_addmm_node, {mat2_is_transposed})); @@ -237,10 +246,10 @@ void add_addmm_node( } Params params = {alpha_val, beta_val}; - if (graph.memory_layout_of(mat1) == utils::kChannelsPacked) { + if (graph.packed_dim_of(mat1) == WHCN::kChannelsDim) { add_addmm_optimized_node( graph, self, mat1, mat2, beta, alpha, out, params, mat2_is_transposed); - } else if (graph.memory_layout_of(mat1) == utils::kWidthPacked) { + } else if (graph.packed_dim_of(mat1) == WHCN::kWidthDim) { add_addmm_naive_node( graph, self, mat1, mat2, beta, alpha, out, params, mat2_is_transposed); } else { diff --git a/backends/vulkan/runtime/graph/ops/impl/MatMul.cpp b/backends/vulkan/runtime/graph/ops/impl/MatMul.cpp index 07618239a65..5e34af78742 100644 --- a/backends/vulkan/runtime/graph/ops/impl/MatMul.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/MatMul.cpp @@ -29,7 +29,7 @@ void check_matmul_args( VK_CHECK_COND(mat1_sizes.size() == 2 || mat1_sizes.size() == 3); VK_CHECK_COND(mat1_sizes.size() == mat2_sizes.size()); - VK_CHECK_COND(graph.memory_layout_of(mat1) == graph.memory_layout_of(out)); + VK_CHECK_COND(graph.packed_dim_of(mat1) == graph.packed_dim_of(out)); VK_CHECK_COND(utils::val_at(-1, mat1_sizes) == utils::val_at(-2, mat2_sizes)); } @@ -48,16 +48,10 @@ void resize_matmul_node( const int out_rows = mat2_is_transposed ? 
utils::val_at(-2, mat2->sizes()) : utils::val_at(-1, mat2->sizes()); - std::vector new_out_sizes(3); - if (mat1->sizes().size() == 2) { - new_out_sizes.resize(2); - new_out_sizes.at(0) = out_cols; - new_out_sizes.at(1) = out_rows; - } else { - new_out_sizes.at(0) = mat1->sizes().at(0); - new_out_sizes.at(1) = out_cols; - new_out_sizes.at(2) = out_rows; - } + const int64_t out_dim = out->dim(); + std::vector new_out_sizes(mat1->sizes()); + new_out_sizes.at(out_dim - 1) = out_rows; + new_out_sizes.at(out_dim - 2) = out_cols; out->virtual_resize(new_out_sizes); } @@ -116,25 +110,31 @@ void add_matmul_naive_texture3d_node( : "matmul_naive"; kernel_name.reserve(kShaderNameReserve); add_storage_type_suffix(kernel_name, graph.storage_type_of(out)); - add_memory_layout_suffix(kernel_name, graph.memory_layout_of(mat1)); - add_memory_layout_suffix(kernel_name, graph.memory_layout_of(mat2)); add_dtype_suffix(kernel_name, graph.dtype_of(out)); + utils::uvec3 global_wg_size = graph.logical_limits_of(out); graph.execute_nodes().emplace_back(new ExecuteNode( graph, VK_KERNEL_FROM_STR(kernel_name), - graph.create_global_wg_size(out), - graph.create_local_wg_size(out), + global_wg_size, + graph.create_local_wg_size(global_wg_size), // Inputs and Outputs {{out, vkapi::MemoryAccessType::WRITE}, {{mat1, mat2}, vkapi::MemoryAccessType::READ}}, // Shader params buffers { - graph.texture_limits_ubo(out), + graph.sizes_ubo(out), + graph.logical_limits_ubo(out), + graph.axis_map_ubo(out), graph.sizes_ubo(mat1), + graph.axis_map_ubo(mat1), + graph.sizes_ubo(mat2), + graph.axis_map_ubo(mat2), }, // Specialization Constants - {}, + {graph.packed_dim_of(out), + graph.packed_dim_of(mat1), + graph.packed_dim_of(mat2)}, // Resizing Logic resize_matmul_node, {mat2_is_transposed})); @@ -159,7 +159,7 @@ void add_matmul_optimized_node( ValueRef mat2_packed = mat2; const utils::GPUMemoryLayout mat2_layout = mat2_is_transposed_val ? utils::kWidthPacked : utils::kHeightPacked; - if (graph.memory_layout_of(mat2) != mat2_layout) { + if (graph.estimate_memory_layout_of(mat2) != mat2_layout) { mat2_packed = graph.add_tensor_like(mat2, mat2_layout); viewFn(graph, {mat2, graph.add_none(), mat2_packed}); } @@ -187,13 +187,13 @@ void add_matmul_optimized_node( // thread is the (x, y, z) coordinate of the output tile it is computing, and // this identity can be used to compute the tensor index of the top left // element in the tile, which will be [W=x*(2 or 4), H=y*4, C=z*(1 or 4), N=0] - utils::uvec3 global_size; + utils::uvec3 global_size = graph.logical_limits_of(out); if (mat1_sizes.at(mat1_dims - 2) < 8) { - // Use `mapped_extents` instead of `image_extents` because the workgroup + // Use `logical_extents` instead of `image_extents` because the workgroup // axes need to correspond to tensor dimensions. 
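The divup_vec calls in the rest of this hunk compute that tile grid. A standalone sketch of the arithmetic (not part of the patch; the extents are assumed, and div_up only mirrors the utils::div_up helper used elsewhere in this backend):

```cpp
// With each thread producing a 4 (W) x 4 (H) x 1 (C) output tile, the
// dispatch grid is the ceiling division of the output extents by the tile
// shape. The extents below are assumed purely for illustration.
#include <array>
#include <cstdio>

constexpr unsigned div_up(unsigned n, unsigned d) {
  return (n + d - 1) / d;
}

int main() {
  const std::array<unsigned, 3> out_extents = {10, 7, 3}; // W, H, C texels
  const std::array<unsigned, 3> tile = {4, 4, 1};         // per-thread tile

  std::array<unsigned, 3> global_size;
  for (int i = 0; i < 3; ++i) {
    global_size[i] = div_up(out_extents[i], tile[i]);
  }
  // 10/4 -> 3, 7/4 -> 2, 3/1 -> 3: a 3 x 2 x 3 grid of threads covers the
  // whole output; edge tiles are assumed to be bounds-checked in the shader.
  std::printf("%u %u %u\n", global_size[0], global_size[1], global_size[2]);
  return 0;
}
```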
- global_size = utils::divup_vec(graph.mapped_extents_of(out), {4, 2, 1}); + global_size = utils::divup_vec(global_size, {4, 2, 1}); } else { - global_size = utils::divup_vec(graph.mapped_extents_of(out), {4, 4, 1}); + global_size = utils::divup_vec(global_size, {4, 4, 1}); } utils::uvec3 local_size = adaptive_work_group_size(global_size); @@ -209,14 +209,14 @@ void add_matmul_optimized_node( // Shader params buffers { graph.sizes_ubo(out), - graph.axis_mapping_ubo(out), + graph.axis_map_ubo(out), graph.sizes_ubo(mat1_W_packed), - graph.axis_mapping_ubo(mat1_W_packed), + graph.axis_map_ubo(mat1_W_packed), graph.sizes_ubo(mat2_packed), - graph.axis_mapping_ubo(mat2_packed), + graph.axis_map_ubo(mat2_packed), }, // Specialization Constants - {graph.packed_dim_whcn_idx_of(out)}, + {graph.packed_dim_of(out)}, // Resizing Logic resize_matmul_node, {mat2_is_transposed})); @@ -231,9 +231,9 @@ void add_matmul_node( if (graph.is_buffer_storage(out)) { add_matmul_naive_buffer_node( graph, mat1, mat2_data, out, mat2_is_transposed); - } else if (graph.memory_layout_of(mat1) == utils::kChannelsPacked) { + } else if (graph.packed_dim_of(mat1) == WHCN::kChannelsDim) { add_matmul_optimized_node(graph, mat1, mat2_data, out, mat2_is_transposed); - } else if (graph.memory_layout_of(mat1) == utils::kWidthPacked) { + } else if (graph.packed_dim_of(mat1) == WHCN::kWidthDim) { add_matmul_naive_texture3d_node( graph, mat1, mat2_data, out, mat2_is_transposed); } else { diff --git a/backends/vulkan/runtime/graph/ops/impl/NativeLayerNorm.cpp b/backends/vulkan/runtime/graph/ops/impl/NativeLayerNorm.cpp index 2b15d924706..553075fc4bb 100644 --- a/backends/vulkan/runtime/graph/ops/impl/NativeLayerNorm.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/NativeLayerNorm.cpp @@ -49,8 +49,8 @@ void resize_native_layer_norm_node( } void check_args(const api::vTensor& in, const api::vTensor& out) { - VK_CHECK_COND(check_memory_layout_is(in, utils::kChannelsPacked)); - VK_CHECK_COND(check_memory_layout_is(out, utils::kChannelsPacked)); + VK_CHECK_COND(check_packed_dim_is(in, WHCN::kChannelsDim)); + VK_CHECK_COND(check_packed_dim_is(out, WHCN::kChannelsDim)); } void add_native_layer_norm_node( @@ -76,10 +76,10 @@ void add_native_layer_norm_node( } ValueRef arg_in = prepack_if_tensor_ref(graph, in); - ValueRef arg_weight = - prepack_if_tensor_ref(graph, weight, graph.memory_layout_of(arg_in)); - ValueRef arg_bias = - prepack_if_tensor_ref(graph, bias, graph.memory_layout_of(arg_in)); + ValueRef arg_weight = prepack_if_tensor_ref( + graph, weight, graph.estimate_memory_layout_of(arg_in)); + ValueRef arg_bias = prepack_if_tensor_ref( + graph, bias, graph.estimate_memory_layout_of(arg_in)); const auto out_val = graph.get_value_list(out); vTensorPtr t_out = graph.get_tensor(out_val->at(0)); @@ -91,7 +91,7 @@ void add_native_layer_norm_node( std::vector in_sizes = t_input->sizes(); - utils::uvec3 global_size = t_mean->image_extents(); + utils::uvec3 global_size = t_mean->logical_limits(); utils::uvec3 local_size = adaptive_work_group_size(global_size); std::string kernel_name("native_layer_norm"); @@ -109,7 +109,7 @@ void add_native_layer_norm_node( vkapi::MemoryAccessType::WRITE}, {{arg_in, arg_weight, arg_bias}, vkapi::MemoryAccessType::READ}}, // Shader params buffers - {t_out->texture_limits_ubo(), + {t_out->logical_limits_ubo(), t_out->sizes_ubo(), graph.create_params_buffer(epsilon)}, // Specialization Constants diff --git a/backends/vulkan/runtime/graph/ops/impl/Permute.cpp b/backends/vulkan/runtime/graph/ops/impl/Permute.cpp 
index e78fca15a0a..e45a333123d 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Permute.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Permute.cpp @@ -28,8 +28,8 @@ void check_args( const api::vTensor& in, const std::vector& permute_dims, const api::vTensor& out) { - VK_CHECK_COND(check_memory_layout_is(in, utils::kChannelsPacked)); - VK_CHECK_COND(check_memory_layout_is(out, utils::kChannelsPacked)); + VK_CHECK_COND(check_packed_dim_is(in, WHCN::kChannelsDim)); + VK_CHECK_COND(check_packed_dim_is(out, WHCN::kChannelsDim)); // This implementation doesn't not requires the input tensor to have the same // dim size as the argument. The code will work as long as the input tensor's @@ -90,7 +90,7 @@ void add_permute_node( graph.create_local_wg_size(out), {{out, vkapi::MemoryAccessType::WRITE}, {in, vkapi::MemoryAccessType::READ}}, - {t_out->texture_limits_ubo(), + {t_out->logical_limits_ubo(), t_out->sizes_ubo(), graph.create_params_buffer(params)}, // Specialization Constants diff --git a/backends/vulkan/runtime/graph/ops/impl/Pool.cpp b/backends/vulkan/runtime/graph/ops/impl/Pool.cpp index 8b477d3a31a..ba8d971a1af 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Pool.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Pool.cpp @@ -18,8 +18,8 @@ namespace vkcompute { void check_pool2d_args(const api::vTensor& in, const api::vTensor& out) { - VK_CHECK_COND(check_memory_layout_is(in, utils::kChannelsPacked)); - VK_CHECK_COND(check_memory_layout_is(out, utils::kChannelsPacked)); + VK_CHECK_COND(check_packed_dim_is(in, WHCN::kChannelsDim)); + VK_CHECK_COND(check_packed_dim_is(out, WHCN::kChannelsDim)); } void resize_pool2d_node( @@ -79,7 +79,7 @@ void add_max_pool2d_node( check_pool2d_args(*t_in, *t_out); - utils::uvec3 global_size = t_out->image_extents(); + utils::uvec3 global_size = t_out->logical_limits(); utils::uvec3 local_size = adaptive_work_group_size(global_size); std::string kernel_name("max_pool2d"); @@ -103,7 +103,7 @@ void add_max_pool2d_node( {arg, vkapi::MemoryAccessType::READ}}, // Shader params buffers { - t_out->texture_limits_ubo(), + t_out->logical_limits_ubo(), t_in->sizes_ubo(), graph.create_params_buffer(kernel_params), }, @@ -155,7 +155,7 @@ void add_avg_pool2d_node( check_pool2d_args(*t_in, *t_out); - utils::uvec3 global_size = t_out->image_extents(); + utils::uvec3 global_size = t_out->logical_limits(); utils::uvec3 local_size = adaptive_work_group_size(global_size); std::string kernel_name("avg_pool2d"); @@ -176,7 +176,7 @@ void add_avg_pool2d_node( {{out, vkapi::MemoryAccessType::WRITE}, {arg, vkapi::MemoryAccessType::READ}}, // Shader params buffers - {t_out->texture_limits_ubo(), + {t_out->logical_limits_ubo(), t_in->sizes_ubo(), graph.create_params_buffer(kernel_params), graph.create_params_buffer(divisor_params)}, diff --git a/backends/vulkan/runtime/graph/ops/impl/QuantizedLinear.cpp b/backends/vulkan/runtime/graph/ops/impl/QuantizedLinear.cpp index 732643ef754..28bf6513957 100644 --- a/backends/vulkan/runtime/graph/ops/impl/QuantizedLinear.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/QuantizedLinear.cpp @@ -30,7 +30,7 @@ void check_qlinear_args( VK_CHECK_COND(qmat2_sizes.size() == 2); VK_CHECK_COND(scales_sizes.size() == 1); - VK_CHECK_COND(graph.memory_layout_of(mat1) == graph.memory_layout_of(out)); + VK_CHECK_COND(graph.packed_dim_of(mat1) == graph.packed_dim_of(out)); VK_CHECK_COND( utils::val_at(-1, mat1_sizes) == utils::val_at(-1, qmat2_sizes)); @@ -78,8 +78,8 @@ void add_q_8w_linear_node( std::string kernel_name = "q_8w_linear"; 
kernel_name.reserve(kShaderNameReserve); - add_memory_layout_suffix(kernel_name, graph.memory_layout_of(mat1)); - add_memory_layout_suffix(kernel_name, graph.memory_layout_of(q_mat2)); + add_packed_dim_suffix(kernel_name, graph.packed_dim_of(mat1)); + add_packed_dim_suffix(kernel_name, graph.packed_dim_of(q_mat2)); add_dtype_suffix(kernel_name, graph.dtype_of(out)); add_storage_type_suffix(kernel_name, graph.storage_type_of(out)); @@ -94,7 +94,7 @@ void add_q_8w_linear_node( graph.strides_ubo(q_mat2), graph.strides_ubo(scales)}); } else { - ubos.append({graph.texture_limits_ubo(out), graph.sizes_ubo(mat1)}); + ubos.append({graph.logical_limits_ubo(out), graph.sizes_ubo(mat1)}); } graph.execute_nodes().emplace_back(new ExecuteNode( diff --git a/backends/vulkan/runtime/graph/ops/impl/QuantizedMatMul.cpp b/backends/vulkan/runtime/graph/ops/impl/QuantizedMatMul.cpp index d478b7c253e..17bd62ad6ea 100644 --- a/backends/vulkan/runtime/graph/ops/impl/QuantizedMatMul.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/QuantizedMatMul.cpp @@ -30,15 +30,21 @@ void check_q_matmul_args( VK_CHECK_COND(mat1_sizes.size() == 2); VK_CHECK_COND(mat1_sizes.size() == mat2_sizes.size()); - VK_CHECK_COND(graph.memory_layout_of(mat1) == utils::kWidthPacked); - VK_CHECK_COND(graph.memory_layout_of(mat2_data) == utils::kWidthPacked); - VK_CHECK_COND( - graph.memory_layout_of(scales_and_zeros) == utils::kWidthPacked); + using namespace WHCN; + VK_CHECK_COND(graph.packed_dim_of(mat1) == kWidthDim); + VK_CHECK_COND(graph.packed_dim_of(mat2_data) == kWidthDim); + // VK_CHECK_COND(graph.packed_dim_of(scales_and_zeros) == kWidthDim); + + if (graph.storage_type_of(scales_and_zeros) == utils::kBuffer) { + VK_CHECK_COND(graph.packed_dim_of(scales_and_zeros) == kWidthDim); + } else { + VK_CHECK_COND(graph.packed_dim_of(scales_and_zeros) == kChannelsDim); + } if (graph.storage_type_of(out) == utils::kBuffer) { - VK_CHECK_COND(graph.memory_layout_of(out) == utils::kWidthPacked); + VK_CHECK_COND(graph.packed_dim_of(out) == kWidthDim); } else { - VK_CHECK_COND(graph.memory_layout_of(out) == utils::kChannelsPacked); + VK_CHECK_COND(graph.packed_dim_of(out) == kChannelsDim); } const int mat1_K = utils::val_at(-1, mat1_sizes); @@ -106,13 +112,8 @@ void add_q_matmul_node( const ValueRef out) { auto storage_type = graph.storage_type_of(out); - ValueRef mat2; - - if (storage_type == utils::kBuffer) { - mat2 = prepack_buffer_if_tensor_ref(graph, mat2_data, utils::kWidthPacked); - } else { - mat2 = prepack_if_tensor_ref(graph, mat2_data, utils::kWidthPacked); - } + ValueRef mat2 = + prepack_buffer_if_tensor_ref(graph, mat2_data, utils::kWidthPacked); ValueRef scales_and_zeros = prepack_if_tensor_ref(graph, scales_and_zeros_data, utils::kWidthPacked); @@ -135,6 +136,7 @@ void add_q_matmul_node( } else { ubos.append(graph.sizes_ubo(out)); ubos.append(graph.sizes_ubo(mat1)); + ubos.append(graph.strides_ubo(mat2)); ubos.append(graph.strides_ubo(scales_and_zeros)); } diff --git a/backends/vulkan/runtime/graph/ops/impl/Repeat.cpp b/backends/vulkan/runtime/graph/ops/impl/Repeat.cpp index 3ef80dc49c7..741b65a84f0 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Repeat.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Repeat.cpp @@ -23,8 +23,8 @@ void check_args( const api::vTensor& in, const std::vector& repeats, const api::vTensor& out) { - VK_CHECK_COND(check_memory_layout_is(in, utils::kChannelsPacked)); - VK_CHECK_COND(check_memory_layout_is(out, utils::kChannelsPacked)); + VK_CHECK_COND(check_packed_dim_is(in, WHCN::kChannelsDim)); + 
VK_CHECK_COND(check_packed_dim_is(out, WHCN::kChannelsDim)); int64_t in_dim = in.dim(); VK_CHECK_COND( @@ -108,7 +108,7 @@ void add_repeat_channel_node( // Parameter buffers {graph.create_params_buffer(repeat_channel_args)}, // Specialization Constants - {SV(t_out->packed_dim_whcn_idx())})); + {SV(t_out->packed_dim())})); } void add_repeat_node( @@ -130,7 +130,7 @@ void add_repeat_node( // After expanding a dimension, we will update the "running_range" since we // will need to copy the "expanded" area. - utils::ivec3 running_range = t_in->texture_limits(); + utils::ivec3 running_range = t_in->logical_limits(); const std::vector& in_sizes = t_in->sizes(); diff --git a/backends/vulkan/runtime/graph/ops/impl/RepeatInterleave.cpp b/backends/vulkan/runtime/graph/ops/impl/RepeatInterleave.cpp new file mode 100644 index 00000000000..5ff363c6665 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/impl/RepeatInterleave.cpp @@ -0,0 +1,94 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include + +#include + +#include +#include + +namespace vkcompute { + +void resize_repeat_interleave_node( + ComputeGraph* graph, + const std::vector& args, + const std::vector& extra_args) { + (void)extra_args; + vTensorPtr out = graph->get_tensor(args[0].refs[0]); + vTensorPtr in = graph->get_tensor(args[1].refs[0]); + + const int64_t nrepeats = graph->extract_scalar(extra_args[0]); + int64_t repeat_dim = graph->extract_scalar(extra_args[1]); + + std::vector new_sizes = in->sizes(); + repeat_dim = normalize(repeat_dim, new_sizes.size()); + new_sizes.at(repeat_dim) *= nrepeats; + + out->virtual_resize(new_sizes); +} + +void add_repeat_interleave_node( + ComputeGraph& graph, + const ValueRef in, + const ValueRef num_repeats, + const ValueRef dim, + const ValueRef out) { + const int32_t nrepeats = graph.extract_scalar(num_repeats); + const int32_t repeat_dim = + graph.extract_whcn_dim(dim, graph.dim_of(in)); + + VK_CHECK_COND(repeat_dim != graph.packed_dim_of(out)); + VK_CHECK_COND(repeat_dim != graph.packed_dim_of(in)); + + std::string kernel_name = "repeat_interleave"; + add_dtype_suffix(kernel_name, graph.dtype_of(out)); + + const utils::uvec3 global_wg_size = graph.logical_limits_of(in); + const utils::uvec3 local_wg_size = graph.create_local_wg_size(global_wg_size); + + graph.execute_nodes().emplace_back(new ExecuteNode( + graph, + // Shader + VK_KERNEL_FROM_STR(kernel_name), + // Workgroup sizes + global_wg_size, + local_wg_size, + // Inputs and Outputs + {{out, vkapi::MemoryAccessType::WRITE}, + {in, vkapi::MemoryAccessType::READ}}, + // Parameter buffers + {graph.logical_limits_ubo(in), + graph.axis_map_ubo(in), + graph.axis_map_ubo(out)}, + // Specialization Constants + {nrepeats, repeat_dim}, + // Resizing Logic + resize_repeat_interleave_node, + {num_repeats, dim})); +} + +void repeat_interleave(ComputeGraph& graph, const std::vector& args) { + int args_i = 0; + const ValueRef in = args[args_i++]; + const ValueRef num_repeats = args[args_i++]; + const ValueRef dim = args[args_i++]; + const ValueRef output_size = args[args_i++]; + const ValueRef out = args[args_i++]; + + // Output size is not used in the kernel + (void)output_size; + + add_repeat_interleave_node(graph, in, num_repeats, dim, out); +} + +REGISTER_OPERATORS { + VK_REGISTER_OP(aten.repeat_interleave.self_int, repeat_interleave); +} + +} // namespace vkcompute diff 
--git a/backends/vulkan/runtime/graph/ops/impl/RepeatInterleave.h b/backends/vulkan/runtime/graph/ops/impl/RepeatInterleave.h new file mode 100644 index 00000000000..f29a817e86e --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/impl/RepeatInterleave.h @@ -0,0 +1,24 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include + +#include + +namespace vkcompute { + +void add_repeat_interleave_node( + ComputeGraph& graph, + const ValueRef in, + const ValueRef num_repeats, + const ValueRef dim, + const ValueRef out); + +} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/SDPA.cpp b/backends/vulkan/runtime/graph/ops/impl/SDPA.cpp new file mode 100644 index 00000000000..38f0eea8e39 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/impl/SDPA.cpp @@ -0,0 +1,338 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include + +#include + +#include +#include +#include +#include +#include + +#include + +#include + +namespace vkcompute { + +void add_kv_cache_update_node( + ComputeGraph& graph, + const ValueRef input_pos_symint, + const ValueRef projected, + const ValueRef cache) { + std::string kernel_name("kv_cache_update"); + add_storage_type_suffix(kernel_name, graph.storage_type_of(projected)); + add_dtype_suffix(kernel_name, graph.dtype_of(projected)); + + utils::uvec3 global_size; + vkapi::ParamsBindList param_ubos; + + if (graph.is_buffer_storage(cache)) { + global_size = graph.create_global_wg_size(projected); + + param_ubos = { + graph.numel_ubo(projected), + graph.strides_ubo(cache), + graph.get_or_create_int_param_buffer(input_pos_symint)}; + } else { + global_size = graph.logical_limits_of(projected); + + param_ubos = { + graph.logical_limits_ubo(projected), + graph.get_or_create_int_param_buffer(input_pos_symint)}; + } + const utils::uvec3 local_size = graph.create_local_wg_size(global_size); + + graph.execute_nodes().emplace_back(new ExecuteNode( + graph, + VK_KERNEL_FROM_STR(kernel_name), + global_size, + local_size, + // Inputs and Outputs + {{cache, vkapi::kWrite}, {projected, vkapi::kRead}}, + // Shader param buffers + param_ubos, + // Specialization Constants + {}, + // Resizing Logic + nullptr, + {})); +} + +void add_attn_weight_scale_and_mask_node( + ComputeGraph& graph, + const ValueRef input_pos_symint, + const ValueRef q_projected, + const ValueRef attn_weight) { + std::string kernel_name("sdpa_attn_weight_scale_and_mask"); + add_storage_type_suffix(kernel_name, graph.storage_type_of(attn_weight)); + add_dtype_suffix(kernel_name, graph.dtype_of(attn_weight)); + + const int32_t head_dim_size = graph.size_at(-1, q_projected); + const float scale_val = 1.0f / std::sqrt(static_cast(head_dim_size)); + + utils::uvec3 global_size; + utils::uvec3 local_size; + vkapi::ParamsBindList param_ubos; + + if (graph.is_buffer_storage(attn_weight)) { + global_size = { + graph.size_at(-1, attn_weight), + graph.size_at(-2, attn_weight), + graph.size_at(-3, attn_weight), + }; + + param_ubos = { + graph.sizes_ubo(attn_weight), + graph.strides_ubo(attn_weight), + graph.create_params_buffer(scale_val)}; + } else { + global_size = graph.logical_limits_of(attn_weight); + + param_ubos = { + 
graph.logical_limits_ubo(attn_weight), + graph.get_or_create_int_param_buffer(input_pos_symint), + graph.create_params_buffer(scale_val)}; + } + + local_size = graph.create_local_wg_size(global_size); + + graph.execute_nodes().emplace_back(new ExecuteNode( + graph, + VK_KERNEL_FROM_STR(kernel_name), + global_size, + local_size, + // Inputs and Outputs + {{attn_weight, vkapi::kReadWrite}}, + // Shader param buffers + param_ubos, + // Specialization Constants + {}, + // Resizing Logic + nullptr, + {})); +} + +std::vector get_cache_slice_sizes( + ComputeGraph& graph, + ValueRef cache, + ValueRef input_pos_symint, + ValueRef q_projected) { + std::vector slice_sizes = graph.sizes_of(cache); + + // Cache slicing will always be in the channels dim + const int32_t input_pos_val = graph.read_symint(input_pos_symint); + const int64_t q_seq_len = graph.size_at(1, q_projected); + slice_sizes.at(1) = input_pos_val + q_seq_len; + return slice_sizes; +} + +void resize_cache_slice_view_node( + ComputeGraph* graph, + const std::vector& args, + const std::vector& extra_args) { + (void)args; + std::vector slice_sizes = get_cache_slice_sizes( + *graph, extra_args[0], extra_args[1], extra_args[2]); + + graph->get_tensor(extra_args[3])->virtual_resize(slice_sizes); +} + +void add_cache_slice_view_node( + ComputeGraph& graph, + ValueRef cache, + ValueRef input_pos_symint, + ValueRef q_projected, + ValueRef cache_sliced, + const int64_t max_seq_len) { + std::vector slice_sizes = + get_cache_slice_sizes(graph, cache, input_pos_symint, q_projected); + // Initialize the slice to the maximum possible size to start + slice_sizes.at(1) = max_seq_len; + + graph.get_tensor(cache_sliced)->virtual_resize(slice_sizes); + + graph.execute_nodes().emplace_back(new ExecuteNode( + resize_cache_slice_view_node, + {cache, input_pos_symint, q_projected, cache_sliced})); +} + +void resize_sdpa_out( + ComputeGraph* graph, + const std::vector& args, + const std::vector& extra_args) { + (void)args; + + int arg_idx = 0; + const ValueRef q_projected = extra_args[arg_idx++]; + const ValueRef out = extra_args[arg_idx++]; + graph->get_tensor(out)->virtual_resize(graph->sizes_of(q_projected)); +} + +void sdpa_with_kv_cache_impl( + ComputeGraph& graph, + const std::vector& args) { + int arg_idx = 0; + const ValueRef q_projected = args[arg_idx++]; + const ValueRef k_projected = args[arg_idx++]; + const ValueRef v_projected = args[arg_idx++]; + const ValueRef k_cache_data = args[arg_idx++]; + const ValueRef v_cache_data = args[arg_idx++]; + const ValueRef input_pos_symint = args[arg_idx++]; + const ValueRef sequence_len = args[arg_idx++]; + const ValueRef attn_mask = args[arg_idx++]; + const ValueRef dropout_p = args[arg_idx++]; + const ValueRef is_causal = args[arg_idx++]; + const ValueRef scale = args[arg_idx++]; + + // Output tensors + const ValueRef out = args[arg_idx++]; + + // Unused variables + (void)sequence_len; + + // Batches must be 1 + VK_CHECK_COND(graph.size_at(-4, q_projected) == 1); + VK_CHECK_COND(graph.size_at(-4, k_projected) == 1); + VK_CHECK_COND(graph.size_at(-4, v_projected) == 1); + // k and v projected must have the same shape + VK_CHECK_COND(graph.sizes_of(k_projected) == graph.sizes_of(v_projected)); + // head dim must match between tensors + VK_CHECK_COND( + graph.size_at(-1, q_projected) == + graph.size_at(-1, k_projected)); + // All tensors must have the packed dim be the width (head) dimension + VK_CHECK_COND(graph.packed_dim_of(q_projected) == WHCN::kWidthDim); + VK_CHECK_COND(graph.packed_dim_of(k_projected) == 
WHCN::kWidthDim); + VK_CHECK_COND(graph.packed_dim_of(v_projected) == WHCN::kWidthDim); + // Some variables are not supported yet + VK_CHECK_COND( + graph.val_is_none(dropout_p) || + graph.extract_scalar(dropout_p) == 0); + VK_CHECK_COND(graph.val_is_none(scale)); + // is_causal is assumed to be true in the current implementation. + VK_CHECK_COND( + graph.val_is_none(is_causal) || graph.extract_scalar(is_causal)); + VK_CHECK_COND(graph.val_is_none(attn_mask)); + + const ValueRef k_cache = + prepack_if_tensor_ref(graph, k_cache_data, utils::kWidthPacked); + const ValueRef v_cache = + prepack_if_tensor_ref(graph, v_cache_data, utils::kWidthPacked); + + const int32_t max_seq_len = graph.size_at(1, k_cache); + + add_kv_cache_update_node(graph, input_pos_symint, k_projected, k_cache); + add_kv_cache_update_node(graph, input_pos_symint, v_projected, v_cache); + + // Slice caches from 0 to input_pos + sequence_len + const ValueRef k_cache_sliced = graph.add_tensor_view(k_cache); + const ValueRef v_cache_sliced = graph.add_tensor_view(v_cache); + add_cache_slice_view_node( + graph, + k_cache, + input_pos_symint, + q_projected, + k_cache_sliced, + max_seq_len); + add_cache_slice_view_node( + graph, + v_cache, + input_pos_symint, + q_projected, + v_cache_sliced, + max_seq_len); + + // Scalar values for various dims + const ValueRef channels = graph.add_scalar(1); + const ValueRef height = graph.add_scalar(2); + const ValueRef width = graph.add_scalar(3); + + // Repeat interleave + const int64_t num_heads = graph.size_at(2, q_projected); + const int64_t num_kv_heads = graph.size_at(2, k_projected); + + const ValueRef num_repeats = + graph.add_scalar(num_heads / num_kv_heads); + + std::vector cache_slice_repeated_sizes(graph.sizes_of(q_projected)); + cache_slice_repeated_sizes.at(1) = max_seq_len; + + TmpTensor k_cache_sliced_repeated( + &graph, cache_slice_repeated_sizes, graph.dtype_of(k_cache_sliced)); + TmpTensor v_cache_sliced_repeated( + &graph, cache_slice_repeated_sizes, graph.dtype_of(v_cache_sliced)); + + add_repeat_interleave_node( + graph, k_cache_sliced, num_repeats, height, k_cache_sliced_repeated); + add_repeat_interleave_node( + graph, v_cache_sliced, num_repeats, height, v_cache_sliced_repeated); + + // Transpose sequence and head dims + const ValueRef q_transposed = graph.add_tensor_view(q_projected); + const ValueRef k_transposed = graph.add_tensor_view(k_cache_sliced_repeated); + const ValueRef v_transposed = graph.add_tensor_view(v_cache_sliced_repeated); + + add_transpose_view_node(graph, q_projected, channels, height, q_transposed); + add_transpose_view_node( + graph, k_cache_sliced_repeated, channels, height, k_transposed); + add_transpose_view_node( + graph, v_cache_sliced_repeated, channels, height, v_transposed); + + // Transpose K again to prepare for matmul + const ValueRef k_transposed_2 = graph.add_tensor_view(k_transposed); + add_transpose_view_node(graph, k_transposed, height, width, k_transposed_2); + + // Initialize attn_weight to the maximum possible size + std::vector attn_weight_full_sizes = graph.sizes_of(q_transposed); + attn_weight_full_sizes.at(2) = max_seq_len; + attn_weight_full_sizes.at(3) = max_seq_len; + TmpTensor attn_weight( + &graph, attn_weight_full_sizes, graph.dtype_of(q_transposed)); + + // Resize attn_weight to the correct dim + std::vector attn_weight_sizes = attn_weight_full_sizes; + attn_weight_sizes.at(2) = graph.size_at(2, q_transposed); + attn_weight_sizes.at(3) = graph.size_at(2, k_transposed); + 
graph.get_tensor(attn_weight)->virtual_resize(attn_weight_sizes); + + // Calculate attention weight, which is a matmul of Q and K + const ValueRef mat2_is_transposed = graph.add_scalar(false); + add_matmul_node( + graph, q_transposed, k_transposed_2, attn_weight, mat2_is_transposed); + + // Apply scale and mask to the attention weight + add_attn_weight_scale_and_mask_node( + graph, input_pos_symint, q_projected, attn_weight); + + TmpTensor attn_weight_softmax( + &graph, attn_weight_full_sizes, graph.dtype_of(q_transposed)); + graph.get_tensor(attn_weight_softmax)->virtual_resize(attn_weight_sizes); + add_softmax_node(graph, attn_weight, width, attn_weight_softmax, false); + + // Calculate final output + const ValueRef out_transposed = graph.add_tensor_view(out); + add_transpose_view_node(graph, out, channels, height, out_transposed); + add_matmul_node( + graph, + attn_weight_softmax, + v_transposed, + out_transposed, + mat2_is_transposed); + + graph.execute_nodes().emplace_back( + new ExecuteNode(resize_sdpa_out, {q_projected, out})); +} + +REGISTER_OPERATORS { + VK_REGISTER_OP(sdpa_with_kv_cache.default, sdpa_with_kv_cache_impl); +} + +} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/Select.cpp b/backends/vulkan/runtime/graph/ops/impl/Select.cpp index 351db0d192b..b2f2245f648 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Select.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Select.cpp @@ -22,8 +22,8 @@ void check_args( int64_t dim, int64_t index, const api::vTensor& t_out) { - VK_CHECK_COND(check_memory_layout_is(t_in, utils::kChannelsPacked)); - VK_CHECK_COND(check_memory_layout_is(t_out, utils::kChannelsPacked)); + VK_CHECK_COND(check_packed_dim_is(t_in, WHCN::kChannelsDim)); + VK_CHECK_COND(check_packed_dim_is(t_out, WHCN::kChannelsDim)); const int64_t in_dim = t_in.dim(); VK_CHECK_COND( @@ -112,7 +112,7 @@ void add_select_int_node( {{out, vkapi::MemoryAccessType::WRITE}, {in, vkapi::MemoryAccessType::READ}}, // Parameter buffers - {t_out->texture_limits_ubo(), + {t_out->logical_limits_ubo(), t_out->sizes_ubo(), // TODO: num_batches and num_texel_per_batch are provided by // t_out->sizes. 
Can change the following to reduce params diff --git a/backends/vulkan/runtime/graph/ops/impl/Slice.cpp b/backends/vulkan/runtime/graph/ops/impl/Slice.cpp index 8b323bafedd..c54994abd83 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Slice.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Slice.cpp @@ -10,6 +10,8 @@ #include +#include + #include #include #include @@ -31,7 +33,7 @@ inline int64_t normalize_idx( return normalize(index, max); } -void add_slice_tensor_out_node( +void add_slice_tensor_copy_node( ComputeGraph& graph, ValueRef in, ValueRef dim_ref, @@ -42,8 +44,8 @@ void add_slice_tensor_out_node( vTensorPtr t_in = graph.get_tensor(in); vTensorPtr t_out = graph.get_tensor(out); - VK_CHECK_COND(check_memory_layout_is(*t_in, utils::kChannelsPacked)); - VK_CHECK_COND(check_memory_layout_is(*t_out, utils::kChannelsPacked)); + VK_CHECK_COND(check_packed_dim_is(*t_in, WHCN::kChannelsDim)); + VK_CHECK_COND(check_packed_dim_is(*t_out, WHCN::kChannelsDim)); // Need normalize the dim int64_t dim = graph.extract_scalar(dim_ref); @@ -123,7 +125,7 @@ void add_slice_tensor_out_node( kernel_name.reserve(kShaderNameReserve); add_dtype_suffix(kernel_name, *t_out); - utils::uvec3 global_size = t_out->image_extents(); + utils::uvec3 global_size = t_out->logical_limits(); utils::uvec3 local_size = adaptive_work_group_size(global_size); const struct Block final { @@ -149,8 +151,126 @@ void add_slice_tensor_out_node( } } -void slice_tensor_out(ComputeGraph& graph, const std::vector& args) { - return add_slice_tensor_out_node( +std::vector get_slice_sizes( + ComputeGraph& graph, + ValueRef in_ref, + ValueRef dim_ref, + ValueRef opt_start_ref, + ValueRef opt_end_ref) { + const int64_t dim = graph.extract_scalar(dim_ref); + std::optional opt_start = + graph.extract_optional_scalar(opt_start_ref); + std::optional opt_end = + graph.extract_optional_scalar(opt_end_ref); + + int64_t dim_size = graph.size_at(dim, in_ref); + int64_t start = opt_start.value_or(0); + int64_t end = opt_end.value_or(dim_size); + + start = normalize_idx(start, dim_size, 0); + end = normalize_idx(end, dim_size, dim_size); + + std::vector new_out_sizes = graph.sizes_of(in_ref); + new_out_sizes.at(dim) = end - start; + + return new_out_sizes; +} + +void resize_slice_view_node( + ComputeGraph* graph, + const std::vector& args, + const std::vector& extra_args) { + (void)args; + vTensorPtr out = graph->get_tensor(extra_args[0]); + + std::vector new_out_sizes = get_slice_sizes( + *graph, + extra_args[1], // input + extra_args[2], // dim + extra_args[3], // optional start + extra_args[4]); // optional end + + out->virtual_resize(new_out_sizes); +} + +void check_slice_view_args( + ComputeGraph& graph, + ValueRef in_ref, + ValueRef dim_ref, + ValueRef opt_start_ref, + ValueRef opt_end_ref, + ValueRef opt_step_ref, + ValueRef out_ref) { + VK_CHECK_COND( + graph.val_is_view_of(out_ref, in_ref), + "output must be a view of the input"); + + const int64_t dim = graph.extract_scalar(dim_ref); + const int64_t dim_size = graph.size_at(dim, in_ref); + + int64_t start = + graph.extract_optional_scalar(opt_start_ref).value_or(0); + int64_t end = graph.extract_optional_scalar(opt_end_ref).value_or(0); + int64_t step = + graph.extract_optional_scalar(opt_step_ref).value_or(1); + + start = normalize_idx(start, dim_size, 0); + end = normalize_idx(end, dim_size, dim_size); + + // The start idx must be 0; this is to ensure that the start of the slice view + // does not have any offset with respect to the base buffer storage. 
If the + // offset is nonzero, then it will potentially change upon a resize; however + // the buffer offset of the view tensor will have been "locked in" when the + // descriptor for its buffer storage is bound to a compute shader. Therefore + // there is no way to update the offset of the view once it has been bound. + VK_CHECK_COND(start == 0, "start must be 0 for slice view"); + VK_CHECK_COND(step == 1, "step must be 1 for slice view"); + + VK_CHECK_COND( + end < dim_size, "end must be less than dim size for slice view"); + + // We must also check that all earlier dims in the dim order have a size of 1. + // This ensures that the slice view encompasses a contiguous memory region of + // the source tensor's memory buffer. + std::vector in_sizes = graph.sizes_of(in_ref); + std::vector in_dim_order = graph.dim_order_of(in_ref); + for (int i = 0; i < in_dim_order.size(); ++i) { + if (in_dim_order[i] == dim) { + break; + } + VK_CHECK_COND(in_sizes[in_dim_order[i]] == 1); + } +} + +void add_slice_view_node( + ComputeGraph& graph, + ValueRef in_ref, + ValueRef dim_ref, + ValueRef opt_start_ref, + ValueRef opt_end_ref, + ValueRef opt_step_ref, + ValueRef out_ref) { + check_slice_view_args( + graph, + in_ref, + dim_ref, + opt_start_ref, + opt_end_ref, + opt_step_ref, + out_ref); + + std::vector new_out_sizes = + get_slice_sizes(graph, in_ref, dim_ref, opt_start_ref, opt_end_ref); + + graph.get_tensor(out_ref)->virtual_resize(new_out_sizes); + + graph.execute_nodes().emplace_back(new ExecuteNode( + resize_slice_view_node, + {out_ref, in_ref, dim_ref, opt_start_ref, opt_end_ref, opt_step_ref})); +} + +void slice_tensor_copy(ComputeGraph& graph, const std::vector& args) { + return add_slice_tensor_copy_node( graph, args[0], args[1], // dim @@ -160,9 +280,36 @@ void slice_tensor_out(ComputeGraph& graph, const std::vector& args) { args[5]); } +void slice_tensor(ComputeGraph& graph, const std::vector& args) { + ValueRef in = args[0]; + ValueRef out = args[5]; + + // Special case if out is a view of in + if (graph.val_is_view_of(out, in)) { + add_slice_view_node( + graph, + in, + args[1], // dim + args[2], // optional start + args[3], // optional end + args[4], // step + out); + return; + } + + add_slice_tensor_copy_node( + graph, + in, + args[1], // dim + args[2], // optional start + args[3], // optional end + args[4], // step + out); +} + REGISTER_OPERATORS { - VK_REGISTER_OP(aten.slice_copy.Tensor, slice_tensor_out); - VK_REGISTER_OP(aten.slice.Tensor, slice_tensor_out); + VK_REGISTER_OP(aten.slice_copy.Tensor, slice_tensor_copy); + VK_REGISTER_OP(aten.slice.Tensor, slice_tensor); } } // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/Slice.h b/backends/vulkan/runtime/graph/ops/impl/Slice.h new file mode 100644 index 00000000000..220066ff1bb --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/impl/Slice.h @@ -0,0 +1,28 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#pragma once + +#include + +#include + +#include + +namespace vkcompute { + +void add_slice_view_node( + ComputeGraph& graph, + ValueRef in_ref, + ValueRef dim_ref, + ValueRef opt_start_ref, + ValueRef opt_end_ref, + ValueRef opt_step_ref, + ValueRef out_ref); + +} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/Softmax.cpp b/backends/vulkan/runtime/graph/ops/impl/Softmax.cpp index fa4d3df944f..d44ce3f6733 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Softmax.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Softmax.cpp @@ -35,40 +35,71 @@ void add_softmax_node( ValueRef dim, ValueRef out, bool log_softmax) { - ValueRef in_arg = prepack_if_tensor_ref(graph, in); - vTensorPtr t_in = graph.get_tensor(in_arg); - int64_t in_dim = t_in->dim(); - - int64_t softmax_dim = graph.extract_scalar(dim); - softmax_dim = normalize(softmax_dim, in_dim); - - vTensorPtr t_out = graph.get_tensor(out); + VK_CHECK_COND( + !graph.is_buffer_storage(in) && !graph.is_buffer_storage(out), + "Vulkan softmax only supports texture storage"); + + const int64_t ndim = graph.dim_of(in); + + int32_t reduce_dim = graph.extract_scalar(dim); + reduce_dim = normalize(reduce_dim, ndim); + reduce_dim = nchw_dim_to_whcn_dim(reduce_dim, ndim); + + // Check that the concat dim is not the reduction dim, if the tensor has a + // batch dim greater than 1. + if (graph.dim_of(in) == 4 && graph.size_at(0, in) > 1) { + VK_CHECK_COND( + graph.concat_dim_of(in) != reduce_dim, + "Softmax shader currently does not support concat dim == reduce dim"); + VK_CHECK_COND( + graph.concat_dim_of(out) != reduce_dim, + "Softmax shader currently does not support concat dim == reduce dim"); + } vkapi::ShaderInfo shader_descriptor; - std::string kernel_name = in_dim - softmax_dim == 3 - ? "softmax_channel" - : "softmax_batch_height_width"; + std::string kernel_name = "softmax"; kernel_name.reserve(kShaderNameReserve); - add_dtype_suffix(kernel_name, *t_out); + add_dtype_suffix(kernel_name, graph.dtype_of(out)); if (log_softmax) { kernel_name = "log_" + kernel_name; } + // This should match the value of MAX_NTHREADS in the softmax shader. 
+ constexpr uint32_t max_nthreads = 16; + + const uint32_t nworkers_per_group = 4; + const uint32_t ngroups = 4; + VK_CHECK_COND(nworkers_per_group * ngroups <= max_nthreads); + + utils::uvec3 global_wg_size = graph.logical_limits_of(out); + global_wg_size[reduce_dim] = 1; + + utils::uvec3 local_wg_size{1, 1, 1}; + local_wg_size[reduce_dim] = nworkers_per_group; + const int other_dim_1 = (reduce_dim + 1) % 3; + const int other_dim_2 = (reduce_dim + 2) % 3; + int32_t group_dim; + if (global_wg_size[other_dim_1] > global_wg_size[other_dim_2]) { + local_wg_size[other_dim_1] = ngroups; + group_dim = other_dim_1; + } else { + local_wg_size[other_dim_2] = ngroups; + group_dim = other_dim_2; + } + graph.execute_nodes().emplace_back(new ExecuteNode( graph, // shader_descriptor, VK_KERNEL_FROM_STR(kernel_name), - graph.create_global_wg_size(out), - graph.create_local_wg_size(out), + global_wg_size, + local_wg_size, // Inputs and Outputs {{out, vkapi::MemoryAccessType::WRITE}, - {in_arg, vkapi::MemoryAccessType::READ}}, + {in, vkapi::MemoryAccessType::READ}}, // Shader params buffers - {t_out->texture_limits_ubo(), - t_in->sizes_ubo(), - graph.create_params_buffer(utils::make_ivec2({in_dim, softmax_dim}))}, + {graph.logical_limits_ubo(out), graph.sizes_ubo(in)}, // Specialization Constants - {}, + {graph.packed_dim_of(out), reduce_dim, group_dim}, // Resizing Logic resize_softmax_node)); } diff --git a/backends/vulkan/runtime/graph/ops/impl/Softmax.h b/backends/vulkan/runtime/graph/ops/impl/Softmax.h new file mode 100644 index 00000000000..58fcfb93404 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/impl/Softmax.h @@ -0,0 +1,26 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include + +#include + +#include + +namespace vkcompute { + +void add_softmax_node( + ComputeGraph& graph, + ValueRef in, + ValueRef dim, + ValueRef out, + bool log_softmax); + +} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/Split.cpp b/backends/vulkan/runtime/graph/ops/impl/Split.cpp index e093ccf1b72..39039e51025 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Split.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Split.cpp @@ -25,7 +25,7 @@ void add_split_with_sizes_default_node( ValueRef out_list_ref) { vTensorPtr t_in = graph.get_tensor(in); - VK_CHECK_COND(check_memory_layout_is(*t_in, utils::kChannelsPacked)); + VK_CHECK_COND(check_packed_dim_is(*t_in, WHCN::kChannelsDim)); ValueListPtr out_list = graph.get_value_list(out_list_ref); @@ -38,7 +38,7 @@ void add_split_with_sizes_default_node( ValueRef out_ref = (*out_list)[split_idx]; vTensorPtr t_out = graph.get_tensor(out_ref); - VK_CHECK_COND(check_memory_layout_is(*t_out, utils::kChannelsPacked)); + VK_CHECK_COND(check_packed_dim_is(*t_out, WHCN::kChannelsDim)); VK_CHECK_COND(dim_at(*t_out, dim_index) == split_size); } @@ -50,7 +50,7 @@ void add_split_with_sizes_default_node( // Doesn't need to use split_size since we have already verified that the // output tensor's size matches with the split_size. 
vTensorPtr t_out = graph.get_tensor(out_ref); - utils::ivec3 range = t_out->texture_limits(); + utils::ivec3 range = t_out->logical_limits(); add_copy_offset_node(graph, in, range, src_offset, dst_offset, out_ref); src_offset[0] += range[0]; @@ -61,7 +61,7 @@ void add_split_with_sizes_default_node( for (ValueRef out_ref : *out_list) { vTensorPtr t_out = graph.get_tensor(out_ref); - utils::ivec3 range = t_out->texture_limits(); + utils::ivec3 range = t_out->logical_limits(); add_copy_offset_node(graph, in, range, src_offset, dst_offset, out_ref); src_offset[1] += range[1]; @@ -72,7 +72,7 @@ void add_split_with_sizes_default_node( for (ValueRef out_ref : *out_list) { vTensorPtr t_out = graph.get_tensor(out_ref); - utils::ivec3 range = t_out->texture_limits(); + utils::ivec3 range = t_out->logical_limits(); add_copy_offset_node(graph, in, range, src_offset, dst_offset, out_ref); src_offset[2] += range[2]; diff --git a/backends/vulkan/runtime/graph/ops/impl/Staging.cpp b/backends/vulkan/runtime/graph/ops/impl/Staging.cpp index 6a759e0fd2e..ef6e8347df8 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Staging.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Staging.cpp @@ -31,8 +31,7 @@ void add_staging_to_tensor_node( graph.strides_ubo(out_tensor), graph.numel_ubo(out_tensor)}); } else { - ubos.append( - {graph.sizes_ubo(out_tensor), graph.axis_mapping_ubo(out_tensor)}); + ubos.append({graph.sizes_ubo(out_tensor), graph.axis_map_ubo(out_tensor)}); } graph.execute_nodes().emplace_back(new ExecuteNode( @@ -46,7 +45,7 @@ void add_staging_to_tensor_node( // Parameter Buffers ubos, // Specialization Constants - {SV(graph.packed_dim_whcn_idx_of(out_tensor))}, + {SV(graph.packed_dim_of(out_tensor))}, // Resizing Logic nullptr, {})); @@ -70,8 +69,7 @@ void add_tensor_to_staging_node( graph.strides_ubo(in_tensor), graph.numel_ubo(in_tensor)}); } else { - ubos.append( - {graph.sizes_ubo(in_tensor), graph.axis_mapping_ubo(in_tensor)}); + ubos.append({graph.sizes_ubo(in_tensor), graph.axis_map_ubo(in_tensor)}); } // Normally, the image_to_nchw shader is structured so that each thread reads @@ -99,7 +97,7 @@ void add_tensor_to_staging_node( // Parameter Buffers ubos, // Specialization Constants - {SV(graph.packed_dim_whcn_idx_of(in_tensor))})); + {SV(graph.packed_dim_of(in_tensor))})); } ValueRef prepack( @@ -115,7 +113,7 @@ ValueRef prepack( if (graph.is_buffer_storage(v)) { ubos.append({graph.sizes_ubo(v), graph.strides_ubo(v), graph.numel_ubo(v)}); } else { - ubos.append({graph.sizes_ubo(v), graph.axis_mapping_ubo(v)}); + ubos.append({graph.sizes_ubo(v), graph.axis_map_ubo(v)}); } graph.prepack_nodes().emplace_back(new PrepackNode( @@ -129,7 +127,7 @@ ValueRef prepack( // Parameter Buffers ubos, // Specialization Constants - {SV(graph.packed_dim_whcn_idx_of(v))})); + {SV(graph.packed_dim_of(v))})); return v; } diff --git a/backends/vulkan/runtime/graph/ops/impl/Sum.cpp b/backends/vulkan/runtime/graph/ops/impl/Sum.cpp index b466f404ad1..c0ce9e4f2c4 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Sum.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Sum.cpp @@ -47,8 +47,8 @@ void resize_sum_node( } void check_sum_args(const api::vTensor& in, const api::vTensor& out) { - VK_CHECK_COND(check_memory_layout_is(in, utils::kChannelsPacked)); - VK_CHECK_COND(check_memory_layout_is(out, utils::kChannelsPacked)); + VK_CHECK_COND(check_packed_dim_is(in, WHCN::kChannelsDim)); + VK_CHECK_COND(check_packed_dim_is(out, WHCN::kChannelsDim)); } void add_sum_dim_node( @@ -85,7 +85,7 @@ void add_sum_dim_node( {{out, 
vkapi::MemoryAccessType::WRITE}, {arg, vkapi::MemoryAccessType::READ}}, // Shader params buffers - {t_out->texture_limits_ubo(), + {t_out->logical_limits_ubo(), graph.create_params_buffer(dim + 4 - in_dim), graph.create_params_buffer(dim_size), graph.create_params_buffer(int(ceil(channel / 4.0)))}, diff --git a/backends/vulkan/runtime/graph/ops/impl/Transpose.cpp b/backends/vulkan/runtime/graph/ops/impl/Transpose.cpp new file mode 100644 index 00000000000..8501d085bc8 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/impl/Transpose.cpp @@ -0,0 +1,87 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include + +#include + +#include + +#include + +#include + +namespace vkcompute { + +void resize_transpose_view_node( + ComputeGraph* graph, + const std::vector& args, + const std::vector& extra_args) { + (void)args; + vTensorPtr out = graph->get_tensor(extra_args[0]); + vTensorPtr in = graph->get_tensor(extra_args[1]); + + const int64_t dim0 = graph->extract_scalar(extra_args[2]); + const int64_t dim1 = graph->extract_scalar(extra_args[3]); + + std::vector new_sizes = in->sizes(); + // Transpose the resized input sizes + std::iter_swap(new_sizes.begin() + dim0, new_sizes.begin() + dim1); + out->virtual_resize(new_sizes); +} + +void check_transpose_view_args( + ComputeGraph& graph, + ValueRef in_ref, + const int64_t dim0, + const int64_t dim1, + ValueRef out_ref) { + VK_CHECK_COND( + graph.val_is_view_of(out_ref, in_ref), + "output tensor must be a view of the input tensor"); + + const int64_t in_ndim = graph.dim_of(in_ref); + VK_CHECK_COND( + dim0 >= 0 && dim0 < in_ndim, "dim0 is not in the range of [0, in_ndim)"); + VK_CHECK_COND( + dim1 >= 0 && dim1 < in_ndim, "dim1 is not in the range of [0, in_ndim)"); +} + +void add_transpose_view_node( + ComputeGraph& graph, + ValueRef input_ref, + ValueRef dim0_ref, + ValueRef dim1_ref, + ValueRef out_ref) { + const int64_t dim0 = graph.extract_scalar(dim0_ref); + const int64_t dim1 = graph.extract_scalar(dim1_ref); + + check_transpose_view_args(graph, input_ref, dim0, dim1, out_ref); + const vTensorPtr in = graph.get_tensor(input_ref); + graph.get_tensor(out_ref)->virtual_clone(*in); + graph.get_tensor(out_ref)->virtual_transpose(dim0, dim1); + + graph.execute_nodes().emplace_back(new ExecuteNode( + resize_transpose_view_node, {out_ref, input_ref, dim0_ref, dim1_ref})); +} + +void transpose(ComputeGraph& graph, const std::vector& args) { + const ValueRef out = args[3]; + return add_transpose_view_node( + graph, + args[0], // input + args[1], // dim0 + args[2], // dim1 + out); +} + +REGISTER_OPERATORS { + VK_REGISTER_OP(aten.transpose.int, transpose); +} + +} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/Transpose.h b/backends/vulkan/runtime/graph/ops/impl/Transpose.h new file mode 100644 index 00000000000..a4fc4029222 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/impl/Transpose.h @@ -0,0 +1,26 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#pragma once + +#include + +#include + +#include + +namespace vkcompute { + +void add_transpose_view_node( + ComputeGraph& graph, + ValueRef input_ref, + ValueRef dim0_ref, + ValueRef dim1_ref, + ValueRef out_ref); + +} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/UnaryOp.cpp b/backends/vulkan/runtime/graph/ops/impl/UnaryOp.cpp index 075c0bc923a..ea27183ead0 100644 --- a/backends/vulkan/runtime/graph/ops/impl/UnaryOp.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/UnaryOp.cpp @@ -46,7 +46,7 @@ void add_unary_op_node( if (graph.is_buffer_storage(out)) { ubos.append({graph.numel_ubo(out)}); } else { - ubos.append({graph.texture_limits_ubo(out)}); + ubos.append({graph.logical_limits_ubo(out)}); } ubos.append( {graph.create_params_buffer(min), graph.create_params_buffer(max)}); @@ -114,12 +114,6 @@ float get_val_or_inf(ComputeGraph& graph, const ValueRef& val, bool max) { "hardshrink"); \ } -#define DEFINE_HARDSWISH_FN(op_name) \ - void op_name(ComputeGraph& graph, const std::vector& args) { \ - return add_unary_op_node( \ - graph, args[0], kDummyFloat, kDummyFloat, args[1], #op_name); \ - } - void gelu(ComputeGraph& graph, const std::vector& args) { // args[1] is the `approximate` string // https://fburl.com/code/9omngmyo @@ -140,7 +134,8 @@ DEFINE_CLAMP_FN(clamp); DEFINE_CLAMP_FN(hardtanh); DEFINE_RELU_FN(relu); DEFINE_HARDSHRINK_FN(hardshrink); -DEFINE_HARDSWISH_FN(hardswish); +DEFINE_ACTIVATION_FN(hardswish); +DEFINE_ACTIVATION_FN(hardsigmoid); REGISTER_OPERATORS { VK_REGISTER_OP(aten.abs.default, abs); @@ -157,6 +152,7 @@ REGISTER_OPERATORS { VK_REGISTER_OP(aten.tanh.default, tanh); VK_REGISTER_OP(aten.hardshrink.default, hardshrink); VK_REGISTER_OP(aten.hardswish.default, hardswish); + VK_REGISTER_OP(aten.hardsigmoid.default, hardsigmoid); } } // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/Upsample.cpp b/backends/vulkan/runtime/graph/ops/impl/Upsample.cpp index 9183f2aea80..f7fe5282e02 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Upsample.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Upsample.cpp @@ -66,7 +66,7 @@ void add_upsample_nearest2d_node( ValueRef arg_in = prepack_if_tensor_ref(graph, in); vTensorPtr t_in = graph.get_tensor(in); - utils::uvec3 input_sizes = t_in->image_extents(); + utils::uvec3 input_sizes = t_in->logical_limits(); utils::ivec2 input_size = { utils::safe_downcast(input_sizes[0]), @@ -105,7 +105,7 @@ void add_upsample_nearest2d_node( {{out, vkapi::MemoryAccessType::WRITE}, {arg_in, vkapi::MemoryAccessType::READ}}, // Shader params buffers - {t_out->texture_limits_ubo(), + {t_out->logical_limits_ubo(), graph.create_params_buffer(input_size), graph.create_params_buffer(rev_scales)}, // Specialization Constants diff --git a/backends/vulkan/runtime/graph/ops/impl/View.cpp b/backends/vulkan/runtime/graph/ops/impl/View.cpp index 507dbdcf8b1..4832c16ab99 100644 --- a/backends/vulkan/runtime/graph/ops/impl/View.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/View.cpp @@ -76,7 +76,7 @@ void add_view_node( // Parameter Buffers {t_out->sizes_ubo(), t_in->sizes_ubo()}, // Specialization Constants - {SV(t_in->packed_dim_whcn_idx()), SV(t_out->packed_dim_whcn_idx())}, + {SV(t_in->packed_dim()), SV(t_out->packed_dim())}, // Resizing Logic resize_view_node, {sizes})); diff --git a/backends/vulkan/runtime/graph/ops/impl/utils/TensorUtils.cpp b/backends/vulkan/runtime/graph/ops/impl/utils/TensorUtils.cpp index 2737a86a1ab..9d010c794ec 100644 --- 
a/backends/vulkan/runtime/graph/ops/impl/utils/TensorUtils.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/utils/TensorUtils.cpp @@ -45,28 +45,26 @@ bool check_same_sizes_at( return utils::val_at(d1, t1.sizes()) == utils::val_at(d2, t2.sizes()); } -bool check_memory_layout_is( - const api::vTensor& t, - utils::GPUMemoryLayout layout) { - return t.gpu_memory_layout() == layout; +bool check_packed_dim_is(const api::vTensor& t, const int32_t packed_dim) { + return t.packed_dim() == packed_dim; } bool check_same_ndim(const api::vTensor& t1, const api::vTensor& t2) { return t1.sizes().size() == t2.sizes().size(); } -bool check_same_memory_layout(const api::vTensor& t1, const api::vTensor& t2) { - return t1.gpu_memory_layout() == t2.gpu_memory_layout(); +bool check_same_packed_dim(const api::vTensor& t1, const api::vTensor& t2) { + return t1.packed_dim() == t2.packed_dim(); } -bool check_same_memory_layout( +bool check_same_packed_dim( const api::vTensor& t1, const api::vTensor& t2, const api::vTensor& t3) { - if (t1.gpu_memory_layout() != t2.gpu_memory_layout()) { + if (t1.packed_dim() != t2.packed_dim()) { return false; } - return (t1.gpu_memory_layout() == t3.gpu_memory_layout()); + return (t1.packed_dim() == t3.packed_dim()); } // @@ -78,13 +76,15 @@ bool is_packed_dim_broadcasted( const api::vTensor& rcvr) { // We assume that the tensors are broadcastable. If values aren't equal at // some index, then the value of rcvr is 1 and hence should be broadcasted. - switch (sndr.gpu_memory_layout()) { - case utils::kChannelsPacked: + switch (sndr.packed_dim()) { + case WHCN::kChannelsDim: return utils::val_at(-3, sndr.sizes()) > utils::val_at(-3, rcvr.sizes()); - case utils::kHeightPacked: + case WHCN::kHeightDim: return utils::val_at(-2, sndr.sizes()) > utils::val_at(-2, rcvr.sizes()); - case utils::kWidthPacked: + case WHCN::kWidthDim: return utils::val_at(-1, sndr.sizes()) > utils::val_at(-1, rcvr.sizes()); + default: + VK_THROW("Invalid packed dim"); } } diff --git a/backends/vulkan/runtime/graph/ops/impl/utils/TensorUtils.h b/backends/vulkan/runtime/graph/ops/impl/utils/TensorUtils.h index 44155a7ce62..c9eeb0efe08 100644 --- a/backends/vulkan/runtime/graph/ops/impl/utils/TensorUtils.h +++ b/backends/vulkan/runtime/graph/ops/impl/utils/TensorUtils.h @@ -34,13 +34,11 @@ bool check_same_sizes_at( const api::vTensor& t2, int64_t d2); -bool check_memory_layout_is( - const api::vTensor& t, - utils::GPUMemoryLayout layout); +bool check_packed_dim_is(const api::vTensor& t, const int32_t packed_dim); -bool check_same_memory_layout(const api::vTensor& t1, const api::vTensor& t2); +bool check_same_packed_dim(const api::vTensor& t1, const api::vTensor& t2); -bool check_same_memory_layout( +bool check_same_packed_dim( const api::vTensor& t1, const api::vTensor& t2, const api::vTensor& t3); @@ -63,8 +61,22 @@ utils::uvec3 adaptive_work_group_size(const utils::uvec3& global_work_group); // Tensor dim utilities // -inline int64_t normalize(const int64_t dimension, const int64_t n) { - return (dimension % n + n) % n; +template < + typename T, + typename std::enable_if< + std::is_integral::value && std::is_signed::value, + int>::type = 0> +T normalize(const T& nchw_dim, const int64_t ndim) { + return (nchw_dim % ndim + ndim) % ndim; +} + +template < + typename T, + typename std::enable_if< + std::is_integral::value && std::is_signed::value, + int>::type = 0> +T nchw_dim_to_whcn_dim(const T& nchw_dim, const int64_t ndim) { + return ndim - 1 - nchw_dim; } } // namespace vkcompute diff --git 
a/backends/vulkan/runtime/graph/ops/utils/BindingUtils.cpp b/backends/vulkan/runtime/graph/ops/utils/BindingUtils.cpp index 2cfb34a052e..b3a72e27c43 100644 --- a/backends/vulkan/runtime/graph/ops/utils/BindingUtils.cpp +++ b/backends/vulkan/runtime/graph/ops/utils/BindingUtils.cpp @@ -13,7 +13,7 @@ namespace vkcompute { void bind_tensor_to_descriptor_set( api::vTensor& tensor, vkapi::PipelineBarrier& pipeline_barrier, - const vkapi::MemoryAccessType accessType, + const vkapi::MemoryAccessFlags accessType, vkapi::DescriptorSet& descriptor_set, const uint32_t idx) { if (tensor.buffer()) { diff --git a/backends/vulkan/runtime/graph/ops/utils/BindingUtils.h b/backends/vulkan/runtime/graph/ops/utils/BindingUtils.h index eed39a97979..671a18f7e91 100644 --- a/backends/vulkan/runtime/graph/ops/utils/BindingUtils.h +++ b/backends/vulkan/runtime/graph/ops/utils/BindingUtils.h @@ -19,7 +19,7 @@ namespace vkcompute { void bind_tensor_to_descriptor_set( api::vTensor& tensor, vkapi::PipelineBarrier& pipeline_barrier, - const vkapi::MemoryAccessType accessType, + const vkapi::MemoryAccessFlags accessType, vkapi::DescriptorSet& descriptor_set, const uint32_t idx); diff --git a/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.cpp b/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.cpp index 89f542de6fc..469c2ed8280 100644 --- a/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.cpp +++ b/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.cpp @@ -47,6 +47,10 @@ void add_dtype_suffix(std::string& kernel_name, const vkapi::ScalarType dtype) { case vkapi::kQInt8: kernel_name += "_int8"; break; + case vkapi::kByte: + case vkapi::kQUInt8: + kernel_name += "_uint8"; + break; default: break; } @@ -69,28 +73,26 @@ void add_ndim_suffix(std::string& kernel_name, const api::vTensor& tensor) { } } -void add_memory_layout_suffix( - std::string& kernel_name, - utils::GPUMemoryLayout layout) { - switch (layout) { - case utils::kChannelsPacked: - kernel_name += "_C_packed"; +void add_packed_dim_suffix(std::string& kernel_name, const int32_t packed_dim) { + switch (packed_dim) { + case WHCN::kWidthDim: + kernel_name += "_W_packed"; break; - case utils::kHeightPacked: + case WHCN::kHeightDim: kernel_name += "_H_packed"; break; - case utils::kWidthPacked: - kernel_name += "_W_packed"; + case WHCN::kChannelsDim: + kernel_name += "_C_packed"; break; default: - break; + VK_THROW("Invalid packed dim!"); } } -void add_memory_layout_suffix( +void add_packed_dim_suffix( std::string& kernel_name, const api::vTensor& tensor) { - return add_memory_layout_suffix(kernel_name, tensor.gpu_memory_layout()); + return add_packed_dim_suffix(kernel_name, tensor.packed_dim()); } } // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.h b/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.h index e8f4f0d229e..10084054964 100644 --- a/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.h +++ b/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.h @@ -29,10 +29,8 @@ void add_dtype_suffix(std::string& kernel_name, const api::vTensor& tensor); void add_ndim_suffix(std::string& kernel_name, const size_t ndim); void add_ndim_suffix(std::string& kernel_name, const api::vTensor& tensor); -void add_memory_layout_suffix( - std::string& kernel_name, - const utils::GPUMemoryLayout layout); -void add_memory_layout_suffix( +void add_packed_dim_suffix(std::string& kernel_name, const int32_t packed_dim); +void add_packed_dim_suffix( std::string& kernel_name, const api::vTensor& tensor); 
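The TensorUtils.h and ShaderNameUtils changes above, together with the WHCN constants introduced in StorageUtils.h below, replace the old GPUMemoryLayout-based checks with a single packed-dim index in WHCN order. As a minimal standalone sketch of that convention: the constants and the two dim helpers follow the hunks in this patch (the templates are specialized to int64_t here), while the free-standing packed_dim_suffix function, the main driver, and the "some_kernel" name are illustrative stand-ins for add_packed_dim_suffix and are not part of the patch.

#include <cassert>
#include <cstdint>
#include <iostream>
#include <string>

// Semantic names for WHCN dimension indices (mirrors StorageUtils.h below).
namespace WHCN {
constexpr int32_t kWidthDim = 0;
constexpr int32_t kHeightDim = 1;
constexpr int32_t kChannelsDim = 2;
} // namespace WHCN

// Wrap a possibly negative NCHW dim index into [0, ndim).
int64_t normalize(const int64_t nchw_dim, const int64_t ndim) {
  return (nchw_dim % ndim + ndim) % ndim;
}

// NCHW counts dims from the outermost; WHCN counts from the innermost.
int64_t nchw_dim_to_whcn_dim(const int64_t nchw_dim, const int64_t ndim) {
  return ndim - 1 - nchw_dim;
}

// Same suffix mapping as add_packed_dim_suffix in ShaderNameUtils.cpp.
std::string packed_dim_suffix(const int32_t packed_dim) {
  switch (packed_dim) {
    case WHCN::kWidthDim:
      return "_W_packed";
    case WHCN::kHeightDim:
      return "_H_packed";
    case WHCN::kChannelsDim:
      return "_C_packed";
    default:
      return "_invalid";
  }
}

int main() {
  // Reducing over NCHW dim 1 of a 4-D tensor maps to WHCN dim 2 (channels);
  // this is the same conversion the new softmax dispatch performs before
  // collapsing that axis of the global workgroup size to 1.
  const int64_t ndim = 4;
  const int64_t reduce_dim = nchw_dim_to_whcn_dim(normalize(1, ndim), ndim);
  assert(reduce_dim == WHCN::kChannelsDim);
  std::cout << "reduce_dim (WHCN) = " << reduce_dim << "\n";

  // For shaders specialized per packed dim, a channels-packed tensor picks up
  // the "_C_packed" variant; "some_kernel" is just a placeholder name.
  std::cout << "some_kernel" + packed_dim_suffix(WHCN::kChannelsDim) << "\n";
  return 0;
}

The same ndim - 1 - nchw_dim mapping is what lets operators keep accepting NCHW dim arguments at the graph level while shaders and specialization constants work purely in WHCN packed-dim indices.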
diff --git a/backends/vulkan/runtime/utils/StorageUtils.h b/backends/vulkan/runtime/utils/StorageUtils.h index 3cd60e25fd2..5ada8df8af7 100644 --- a/backends/vulkan/runtime/utils/StorageUtils.h +++ b/backends/vulkan/runtime/utils/StorageUtils.h @@ -8,7 +8,19 @@ #pragma once +#include + namespace vkcompute { + +// Convenience constexpr to attach semantic names to WHCN dimension index +namespace WHCN { + +constexpr int32_t kWidthDim = 0; +constexpr int32_t kHeightDim = 1; +constexpr int32_t kChannelsDim = 2; + +} // namespace WHCN + namespace utils { // @@ -36,20 +48,42 @@ static constexpr StorageType kTexture3D = StorageType::TEXTURE_3D; static constexpr StorageType kTexture2D = StorageType::TEXTURE_2D; /* - * The enum below is used to describe how tensor data is laid out when stored in - * GPU memory; specifically, it indicates how tensor data is packed along a - * texel (i.e. a vector of 4 scalar values). + * A tensor's memory layout is defined in one of two ways: + * + * 1. If it's a buffer backed tensor, the memory layout is defined by its + * `dim_order`, and by extension its `strides`. + * 2. If it's a texture backed tensor, the memory layout is defined by the + * combination of its `axis_map` and its `packed_dim`. * - * Each enum entry indicates which tensor dimension is packed along a texel, and - * it's value is set to the index of that dimension in WHCN dimension order. For - * instance, the width dimension corresponds to index 0, so the - * TENSOR_WIDTH_PACKED enum entry is set to 0. + * Providing explicit memory layout metadata upon tensor construction is not + * very convenient from an API perspective, so the `GPUMemoryLayout` serves as + * an abstraction that is used to determine how to initialize a tensor's layout + * metadata based on the developer's intent. A `GPUMemoryLayout` is provided to + * the constructor of `vTensor`, which will use it to determine how to set its + * `dim_order` if it's a buffer backed tensor, or how to set its `axis_map` and + * `packed_dim` if it's a texture backed tensor. * - * When interpreted as an integer, the enum value can be used as a dim index - * representing the packed dimension. This is used in shaders to resolve tensor - * indexing calculations. + * Note that GPUMemoryLayout is not stored as a tensor property, as it does not + * have any meaning after the vTensor is constructed. After construction, + * methods such as `virtual_transpose()` may be used to modify the tensor's + * layout metadata that cannot be represented by any `GPUMemoryLayout` entry. + * Nonetheless, a "best guess" of the closest memory layout can be produced via + * the `estimate_memory_layout()` API of `vTensor`. + * + * Currently, only 3 memory layouts are provided, but more will be added in the + * future that will enable different functionality such as minimizing texture + * memory footprint. */ enum class GPUMemoryLayout : uint8_t { + /* + * The below memory layouts will produce a `vTensor` with the following + * properties: + * + * 1. For buffer backed tensors, the `dim_order` will be the same as a + * contiguous dim order, but with the specified dim last in the dim order. + * 2. For texture backed tensors, the packed dim will be the specified dim. + * The axis map will be `{0, 1, 2, 2}`. 
+ */ TENSOR_WIDTH_PACKED = 0u, TENSOR_HEIGHT_PACKED = 1u, TENSOR_CHANNELS_PACKED = 2u, @@ -64,14 +98,35 @@ static constexpr GPUMemoryLayout kHeightPacked = static constexpr GPUMemoryLayout kChannelsPacked = GPUMemoryLayout::TENSOR_CHANNELS_PACKED; -/* - * Given a GPUMemoryLayout, return an offset that can be used to determine the - * index of the dimension that is packed along texels, assuming NCHW dimension - * order. The index of the packed dimension will be ndim - offset. - */ template -T to_packed_dim_nchw_offset(const GPUMemoryLayout layout) { - return static_cast(layout) + 1; +T to_packed_dim(const GPUMemoryLayout layout) { + switch (layout) { + case kWidthPacked: + return 0; + case kHeightPacked: + return 1; + case kChannelsPacked: + return 2; + }; + // Should be unreachable + return 0; +} + +inline std::ostream& operator<<( + std::ostream& os, + const GPUMemoryLayout layout) { + switch (layout) { + case kWidthPacked: + os << "TENSOR_WIDTH_PACKED"; + break; + case kHeightPacked: + os << "TENSOR_HEIGHT_PACKED"; + break; + case kChannelsPacked: + os << "TENSOR_CHANNELS_PACKED"; + break; + } + return os; } } // namespace utils diff --git a/backends/vulkan/runtime/utils/VecUtils.h b/backends/vulkan/runtime/utils/VecUtils.h index 55bb0f7d1b5..ad4434cf5af 100644 --- a/backends/vulkan/runtime/utils/VecUtils.h +++ b/backends/vulkan/runtime/utils/VecUtils.h @@ -238,6 +238,28 @@ struct vec final { // NOLINTNEXTLINE Type data[N]; + vec() = default; + + // Standard constructor with initializer list + vec(std::initializer_list values) { + VK_CHECK_COND(values.size() == N); + std::copy(values.begin(), values.end(), data); + } + + // Conversion constructor from an _integral_ vec type. Note that this is only + // defined if `OtherType` is an integral type to disallow implicit narrowing. 
+ template < + typename OtherType, + typename std::enable_if< + !std::is_same::value && + std::is_integral::value, + int>::type = 0> + /* implicit */ vec(const vec& other) { + for (int i = 0; i < N; ++i) { + data[i] = safe_downcast(other[i]); + } + } + const Type& operator[](const uint32_t& i) const { VK_CHECK_COND(i >= 0 && i < N, "Index out of bounds!"); return data[i]; diff --git a/backends/vulkan/runtime/vk_api/Adapter.cpp b/backends/vulkan/runtime/vk_api/Adapter.cpp index 1c07cfe2ee6..b2be214c6fe 100644 --- a/backends/vulkan/runtime/vk_api/Adapter.cpp +++ b/backends/vulkan/runtime/vk_api/Adapter.cpp @@ -70,6 +70,9 @@ VkDevice create_logical_device( #ifdef VK_KHR_portability_subset VK_KHR_PORTABILITY_SUBSET_EXTENSION_NAME, #endif /* VK_KHR_portability_subset */ +#ifdef VK_ANDROID_external_memory_android_hardware_buffer + VK_ANDROID_EXTERNAL_MEMORY_ANDROID_HARDWARE_BUFFER_EXTENSION_NAME, +#endif /* VK_ANDROID_external_memory_android_hardware_buffer */ VK_KHR_16BIT_STORAGE_EXTENSION_NAME, VK_KHR_8BIT_STORAGE_EXTENSION_NAME, VK_KHR_SHADER_FLOAT16_INT8_EXTENSION_NAME, diff --git a/backends/vulkan/runtime/vk_api/Command.cpp b/backends/vulkan/runtime/vk_api/Command.cpp index 713fd9917e7..f971a8f8358 100644 --- a/backends/vulkan/runtime/vk_api/Command.cpp +++ b/backends/vulkan/runtime/vk_api/Command.cpp @@ -247,7 +247,7 @@ CommandPool::CommandPool( } CommandPool::~CommandPool() { - if (VK_NULL_HANDLE == pool_) { + if (pool_ == VK_NULL_HANDLE) { return; } vkDestroyCommandPool(device_, pool_, nullptr); diff --git a/backends/vulkan/runtime/vk_api/Command.h b/backends/vulkan/runtime/vk_api/Command.h index f9da296751f..e78d410aec4 100644 --- a/backends/vulkan/runtime/vk_api/Command.h +++ b/backends/vulkan/runtime/vk_api/Command.h @@ -99,7 +99,7 @@ class CommandBuffer final { VkCommandBuffer get_submit_handle(const bool final_use = false); inline operator bool() const { - return VK_NULL_HANDLE != handle_; + return handle_ != VK_NULL_HANDLE; } }; diff --git a/backends/vulkan/runtime/vk_api/Descriptor.cpp b/backends/vulkan/runtime/vk_api/Descriptor.cpp index 03b01c3fa86..cd860e3dfb7 100644 --- a/backends/vulkan/runtime/vk_api/Descriptor.cpp +++ b/backends/vulkan/runtime/vk_api/Descriptor.cpp @@ -261,7 +261,7 @@ DescriptorPool::DescriptorPool( } DescriptorPool::~DescriptorPool() { - if (VK_NULL_HANDLE == pool_) { + if (pool_ == VK_NULL_HANDLE) { return; } vkDestroyDescriptorPool(device_, pool_, nullptr); diff --git a/backends/vulkan/runtime/vk_api/Device.cpp b/backends/vulkan/runtime/vk_api/Device.cpp index d6a204b89c8..46e534f09f3 100644 --- a/backends/vulkan/runtime/vk_api/Device.cpp +++ b/backends/vulkan/runtime/vk_api/Device.cpp @@ -84,7 +84,7 @@ PhysicalDevice::PhysicalDevice(VkPhysicalDevice physical_device_handle) DeviceHandle::DeviceHandle(VkDevice device) : handle(device) {} DeviceHandle::~DeviceHandle() { - if (VK_NULL_HANDLE == handle) { + if (handle == VK_NULL_HANDLE) { return; } vkDestroyDevice(handle, nullptr); diff --git a/backends/vulkan/runtime/vk_api/Fence.cpp b/backends/vulkan/runtime/vk_api/Fence.cpp index 6a1503c870e..d359990e634 100644 --- a/backends/vulkan/runtime/vk_api/Fence.cpp +++ b/backends/vulkan/runtime/vk_api/Fence.cpp @@ -44,7 +44,7 @@ VulkanFence& VulkanFence::operator=(VulkanFence&& other) noexcept { } VulkanFence::~VulkanFence() { - if (VK_NULL_HANDLE == handle_) { + if (handle_ == VK_NULL_HANDLE) { return; } vkDestroyFence(device_, handle_, nullptr); diff --git a/backends/vulkan/runtime/vk_api/Fence.h b/backends/vulkan/runtime/vk_api/Fence.h index 
46a58dab4df..52fa24de55b 100644 --- a/backends/vulkan/runtime/vk_api/Fence.h +++ b/backends/vulkan/runtime/vk_api/Fence.h @@ -62,7 +62,7 @@ class VulkanFence final { } operator bool() const { - return (VK_NULL_HANDLE != handle_); + return (handle_ != VK_NULL_HANDLE); } }; diff --git a/backends/vulkan/runtime/vk_api/Pipeline.cpp b/backends/vulkan/runtime/vk_api/Pipeline.cpp index 2c0e8668af9..5cc5a76c358 100644 --- a/backends/vulkan/runtime/vk_api/Pipeline.cpp +++ b/backends/vulkan/runtime/vk_api/Pipeline.cpp @@ -228,7 +228,7 @@ PipelineLayout::PipelineLayout(PipelineLayout&& other) noexcept } PipelineLayout::~PipelineLayout() { - if (VK_NULL_HANDLE == handle_) { + if (handle_ == VK_NULL_HANDLE) { return; } vkDestroyPipelineLayout(device_, handle_, nullptr); @@ -300,7 +300,7 @@ ComputePipeline::ComputePipeline(ComputePipeline&& other) noexcept } ComputePipeline::~ComputePipeline() { - if (VK_NULL_HANDLE == handle_) { + if (handle_ == VK_NULL_HANDLE) { return; } vkDestroyPipeline(device_, handle_, nullptr); @@ -402,7 +402,7 @@ ComputePipelineCache::ComputePipelineCache( ComputePipelineCache::~ComputePipelineCache() { purge(); - if (VK_NULL_HANDLE == pipeline_cache_) { + if (pipeline_cache_ == VK_NULL_HANDLE) { return; } diff --git a/backends/vulkan/runtime/vk_api/QueryPool.cpp b/backends/vulkan/runtime/vk_api/QueryPool.cpp index be11f7473e5..943911d19d0 100644 --- a/backends/vulkan/runtime/vk_api/QueryPool.cpp +++ b/backends/vulkan/runtime/vk_api/QueryPool.cpp @@ -30,7 +30,7 @@ constexpr int64_t kDefaultNsPerTick = 52; // lround(52.08f); } // namespace #define EARLY_RETURN_IF_UNINITIALIZED() \ - if (VK_NULL_HANDLE == querypool_) { \ + if (querypool_ == VK_NULL_HANDLE) { \ return; \ } @@ -178,7 +178,7 @@ std::string stringize(const VkExtent3D& extents) { } std::vector> QueryPool::get_shader_timestamp_data() { - if (VK_NULL_HANDLE == querypool_) { + if (querypool_ == VK_NULL_HANDLE) { return {}; } std::lock_guard lock(mutex_); diff --git a/backends/vulkan/runtime/vk_api/Runtime.cpp b/backends/vulkan/runtime/vk_api/Runtime.cpp index fc894ccecc2..e82f631ddb4 100644 --- a/backends/vulkan/runtime/vk_api/Runtime.cpp +++ b/backends/vulkan/runtime/vk_api/Runtime.cpp @@ -134,7 +134,7 @@ VkInstance create_instance(const RuntimeConfig& config) { std::vector create_physical_devices( VkInstance instance) { - if (VK_NULL_HANDLE == instance) { + if (instance == VK_NULL_HANDLE) { return std::vector(); } @@ -176,7 +176,7 @@ VKAPI_ATTR VkBool32 VKAPI_CALL debug_report_callback_fn( VkDebugReportCallbackEXT create_debug_report_callback( VkInstance instance, const RuntimeConfig config) { - if (VK_NULL_HANDLE == instance || !config.enable_validation_messages) { + if (instance == VK_NULL_HANDLE || !config.enable_validation_messages) { return VkDebugReportCallbackEXT{}; } @@ -296,7 +296,7 @@ Runtime::Runtime(const RuntimeConfig config) } Runtime::~Runtime() { - if (VK_NULL_HANDLE == instance_) { + if (instance_ == VK_NULL_HANDLE) { return; } diff --git a/backends/vulkan/runtime/vk_api/Shader.cpp b/backends/vulkan/runtime/vk_api/Shader.cpp index 960da4b35a4..29774e2f404 100644 --- a/backends/vulkan/runtime/vk_api/Shader.cpp +++ b/backends/vulkan/runtime/vk_api/Shader.cpp @@ -83,7 +83,7 @@ ShaderLayout::ShaderLayout(ShaderLayout&& other) noexcept } ShaderLayout::~ShaderLayout() { - if (VK_NULL_HANDLE == handle_) { + if (handle_ == VK_NULL_HANDLE) { return; } vkDestroyDescriptorSetLayout(device_, handle_, nullptr); @@ -128,7 +128,7 @@ ShaderModule::ShaderModule(ShaderModule&& other) noexcept } 
ShaderModule::~ShaderModule() { - if (VK_NULL_HANDLE == handle_) { + if (handle_ == VK_NULL_HANDLE) { return; } vkDestroyShaderModule(device_, handle_, nullptr); diff --git a/backends/vulkan/runtime/vk_api/memory/Allocation.cpp b/backends/vulkan/runtime/vk_api/memory/Allocation.cpp index d4e0fc9702e..fc2de39c811 100644 --- a/backends/vulkan/runtime/vk_api/memory/Allocation.cpp +++ b/backends/vulkan/runtime/vk_api/memory/Allocation.cpp @@ -26,58 +26,37 @@ namespace vkcompute { namespace vkapi { Allocation::Allocation() - : memory_requirements{}, - create_info{}, - allocator(VK_NULL_HANDLE), - allocation(VK_NULL_HANDLE), - allocation_info({}), - is_copy_(false) {} + : allocator(VK_NULL_HANDLE), allocation(VK_NULL_HANDLE), is_copy_(false) {} Allocation::Allocation( VmaAllocator vma_allocator, const VkMemoryRequirements& mem_props, const VmaAllocationCreateInfo& create_info) - : memory_requirements(mem_props), - create_info(create_info), - allocator(vma_allocator), - allocation(VK_NULL_HANDLE), - allocation_info({}), - is_copy_(false) { + : allocator(vma_allocator), allocation(VK_NULL_HANDLE), is_copy_(false) { VK_CHECK(vmaAllocateMemory( - allocator, &memory_requirements, &create_info, &allocation, nullptr)); + allocator, &mem_props, &create_info, &allocation, nullptr)); } Allocation::Allocation(const Allocation& other) noexcept - : memory_requirements(other.memory_requirements), - create_info(other.create_info), - allocator(other.allocator), + : allocator(other.allocator), allocation(other.allocation), - allocation_info(other.allocation_info), is_copy_(true) {} Allocation::Allocation(Allocation&& other) noexcept - : memory_requirements(other.memory_requirements), - create_info(other.create_info), - allocator(other.allocator), + : allocator(other.allocator), allocation(other.allocation), - allocation_info(other.allocation_info), is_copy_(other.is_copy_) { other.allocation = VK_NULL_HANDLE; - other.allocation_info = {}; } Allocation& Allocation::operator=(Allocation&& other) noexcept { VmaAllocation tmp_allocation = allocation; - memory_requirements = other.memory_requirements; - create_info = other.create_info; allocator = other.allocator; allocation = other.allocation; - allocation_info = other.allocation_info; is_copy_ = other.is_copy_; other.allocation = tmp_allocation; - other.allocation_info = {}; return *this; } @@ -86,7 +65,7 @@ Allocation::~Allocation() { // Do not destroy the VmaAllocation if this class instance is a copy of some // other class instance, since this means that this class instance does not // have ownership of the underlying resource. 
- if (VK_NULL_HANDLE != allocation && !is_copy_) { + if (allocation != VK_NULL_HANDLE && !is_copy_) { vmaFreeMemory(allocator, allocation); } } diff --git a/backends/vulkan/runtime/vk_api/memory/Allocation.h b/backends/vulkan/runtime/vk_api/memory/Allocation.h index 44e8277a35c..e56605e14b2 100644 --- a/backends/vulkan/runtime/vk_api/memory/Allocation.h +++ b/backends/vulkan/runtime/vk_api/memory/Allocation.h @@ -55,15 +55,10 @@ struct Allocation final { ~Allocation(); - VkMemoryRequirements memory_requirements; - // The properties this allocation was created with - VmaAllocationCreateInfo create_info; // The allocator object this was allocated from VmaAllocator allocator; // Handles to the allocated memory VmaAllocation allocation; - // Information about the allocated memory - VmaAllocationInfo allocation_info; private: // Indicates whether this class instance is a copy of another class instance, @@ -80,6 +75,7 @@ struct Allocation final { } friend class VulkanBuffer; + friend class VulkanImage; }; } // namespace vkapi diff --git a/backends/vulkan/runtime/vk_api/memory/Allocator.cpp b/backends/vulkan/runtime/vk_api/memory/Allocator.cpp index e814063fa90..16895730cbc 100644 --- a/backends/vulkan/runtime/vk_api/memory/Allocator.cpp +++ b/backends/vulkan/runtime/vk_api/memory/Allocator.cpp @@ -52,12 +52,19 @@ Allocator::Allocator(Allocator&& other) noexcept } Allocator::~Allocator() { - if (VK_NULL_HANDLE == allocator_) { + if (allocator_ == VK_NULL_HANDLE) { return; } vmaDestroyAllocator(allocator_); } +VmaAllocationCreateInfo Allocator::gpuonly_resource_create_info() { + VmaAllocationCreateInfo alloc_create_info = {}; + alloc_create_info.flags = DEFAULT_ALLOCATION_STRATEGY; + alloc_create_info.usage = VMA_MEMORY_USAGE_AUTO_PREFER_DEVICE; + return alloc_create_info; +} + Allocation Allocator::create_allocation( const VkMemoryRequirements& memory_requirements, const VmaAllocationCreateInfo& create_info) { @@ -103,9 +110,7 @@ VulkanImage Allocator::create_image( (VK_IMAGE_USAGE_TRANSFER_SRC_BIT | VK_IMAGE_USAGE_TRANSFER_DST_BIT); } - VmaAllocationCreateInfo alloc_create_info = {}; - alloc_create_info.flags = DEFAULT_ALLOCATION_STRATEGY; - alloc_create_info.usage = VMA_MEMORY_USAGE_AUTO_PREFER_DEVICE; + VmaAllocationCreateInfo alloc_create_info = gpuonly_resource_create_info(); const VulkanImage::ImageProperties image_props{ image_type, @@ -157,10 +162,7 @@ VulkanBuffer Allocator::create_storage_buffer( const bool allocate_memory) { const VkBufferUsageFlags buffer_usage = VK_BUFFER_USAGE_STORAGE_BUFFER_BIT; - VmaAllocationCreateInfo alloc_create_info = {}; - alloc_create_info.flags = DEFAULT_ALLOCATION_STRATEGY; - alloc_create_info.usage = VMA_MEMORY_USAGE_AUTO_PREFER_DEVICE; - + VmaAllocationCreateInfo alloc_create_info = gpuonly_resource_create_info(); return VulkanBuffer( allocator_, size, alloc_create_info, buffer_usage, allocate_memory); } diff --git a/backends/vulkan/runtime/vk_api/memory/Allocator.h b/backends/vulkan/runtime/vk_api/memory/Allocator.h index 7d02ffe54e3..56385eb54d7 100644 --- a/backends/vulkan/runtime/vk_api/memory/Allocator.h +++ b/backends/vulkan/runtime/vk_api/memory/Allocator.h @@ -48,6 +48,8 @@ class Allocator final { VmaAllocator allocator_; public: + VmaAllocationCreateInfo gpuonly_resource_create_info(); + Allocation create_allocation( const VkMemoryRequirements& memory_requirements, const VmaAllocationCreateInfo& create_info); diff --git a/backends/vulkan/runtime/vk_api/memory/Buffer.cpp b/backends/vulkan/runtime/vk_api/memory/Buffer.cpp index 
5a78dab764d..9fa3c2ac776 100644 --- a/backends/vulkan/runtime/vk_api/memory/Buffer.cpp +++ b/backends/vulkan/runtime/vk_api/memory/Buffer.cpp @@ -58,8 +58,6 @@ VulkanBuffer::VulkanBuffer( nullptr, // pQueueFamilyIndices }; - memory_.create_info = allocation_create_info; - if (allocate_memory) { VK_CHECK(vmaCreateBuffer( allocator_, @@ -67,7 +65,7 @@ VulkanBuffer::VulkanBuffer( &allocation_create_info, &handle_, &(memory_.allocation), - &(memory_.allocation_info))); + nullptr)); } else { VmaAllocatorInfo allocator_info{}; vmaGetAllocatorInfo(allocator_, &allocator_info); @@ -83,7 +81,7 @@ VulkanBuffer::VulkanBuffer( : buffer_properties_(other.buffer_properties_), allocator_(other.allocator_), memory_(other.memory_), - owns_memory_(other.owns_memory_), + owns_memory_(false), is_copy_(true), handle_(other.handle_) { // TODO: set the offset and range appropriately @@ -124,7 +122,7 @@ VulkanBuffer::~VulkanBuffer() { // Do not destroy the VkBuffer if this class instance is a copy of another // class instance, since this means that this class instance does not have // ownership of the underlying resource. - if (VK_NULL_HANDLE != handle_ && !is_copy_) { + if (handle_ != VK_NULL_HANDLE && !is_copy_) { if (owns_memory_) { vmaDestroyBuffer(allocator_, handle_, memory_.allocation); } else { @@ -137,6 +135,12 @@ VulkanBuffer::~VulkanBuffer() { } } +VmaAllocationInfo VulkanBuffer::allocation_info() const { + VmaAllocationInfo info; + vmaGetAllocationInfo(allocator_, memory_.allocation, &info); + return info; +} + VkMemoryRequirements VulkanBuffer::get_memory_requirements() const { VkMemoryRequirements memory_requirements; vkGetBufferMemoryRequirements(this->device(), handle_, &memory_requirements); diff --git a/backends/vulkan/runtime/vk_api/memory/Buffer.h b/backends/vulkan/runtime/vk_api/memory/Buffer.h index af32ffffa84..14722511f4f 100644 --- a/backends/vulkan/runtime/vk_api/memory/Buffer.h +++ b/backends/vulkan/runtime/vk_api/memory/Buffer.h @@ -35,6 +35,13 @@ enum MemoryAccessType : MemoryAccessFlags { WRITE = 1u << 1u, }; +static constexpr MemoryAccessFlags kReadWrite = + MemoryAccessType::WRITE | MemoryAccessType::READ; + +static constexpr MemoryAccessFlags kRead = MemoryAccessType::READ; + +static constexpr MemoryAccessFlags kWrite = MemoryAccessType::WRITE; + class VulkanBuffer final { public: struct BufferProperties final { @@ -114,13 +121,7 @@ class VulkanBuffer final { return memory_.allocation; } - inline VmaAllocationInfo allocation_info() const { - return memory_.allocation_info; - } - - inline VmaAllocationCreateInfo allocation_create_info() const { - return VmaAllocationCreateInfo(memory_.create_info); - } + VmaAllocationInfo allocation_info() const; inline VkBuffer handle() const { return handle_; @@ -160,7 +161,9 @@ class VulkanBuffer final { inline void bind_allocation(const Allocation& memory) { VK_CHECK_COND(!memory_, "Cannot bind an already bound allocation!"); - VK_CHECK(vmaBindBufferMemory(allocator_, memory.allocation, handle_)); + if (!is_copy_) { + VK_CHECK(vmaBindBufferMemory(allocator_, memory.allocation, handle_)); + } memory_.allocation = memory.allocation; } diff --git a/backends/vulkan/runtime/vk_api/memory/Image.cpp b/backends/vulkan/runtime/vk_api/memory/Image.cpp index 42352cfb7e7..ea3210c536b 100644 --- a/backends/vulkan/runtime/vk_api/memory/Image.cpp +++ b/backends/vulkan/runtime/vk_api/memory/Image.cpp @@ -57,7 +57,7 @@ ImageSampler::ImageSampler(ImageSampler&& other) noexcept } ImageSampler::~ImageSampler() { - if (VK_NULL_HANDLE == handle_) { + if (handle_ == 
VK_NULL_HANDLE) { return; } vkDestroySampler(device_, handle_, nullptr); @@ -98,6 +98,8 @@ VulkanImage::VulkanImage() allocator_(VK_NULL_HANDLE), memory_{}, owns_memory_(false), + owns_view_(false), + is_copy_(false), handles_{ VK_NULL_HANDLE, VK_NULL_HANDLE, @@ -120,6 +122,8 @@ VulkanImage::VulkanImage( allocator_(vma_allocator), memory_{}, owns_memory_{allocate_memory}, + owns_view_(false), + is_copy_(false), handles_{ VK_NULL_HANDLE, VK_NULL_HANDLE, @@ -157,8 +161,6 @@ VulkanImage::VulkanImage( layout_, // initialLayout }; - memory_.create_info = allocation_create_info; - if (allocate_memory) { VK_CHECK(vmaCreateImage( allocator_, @@ -168,6 +170,7 @@ VulkanImage::VulkanImage( &(memory_.allocation), nullptr)); // Only create the image view if the image has been bound to memory + owns_view_ = true; create_image_view(); } else { VK_CHECK(vkCreateImage( @@ -175,6 +178,18 @@ VulkanImage::VulkanImage( } } +VulkanImage::VulkanImage(const VulkanImage& other) noexcept + : image_properties_(other.image_properties_), + view_properties_(other.view_properties_), + sampler_properties_(other.sampler_properties_), + allocator_(other.allocator_), + memory_(other.memory_), + owns_memory_{false}, + owns_view_{false}, + is_copy_(true), + handles_(other.handles_), + layout_(other.layout_) {} + VulkanImage::VulkanImage(VulkanImage&& other) noexcept : image_properties_(other.image_properties_), view_properties_(other.view_properties_), @@ -182,6 +197,8 @@ VulkanImage::VulkanImage(VulkanImage&& other) noexcept allocator_(other.allocator_), memory_(std::move(other.memory_)), owns_memory_(other.owns_memory_), + owns_view_(other.owns_view_), + is_copy_(other.is_copy_), handles_(other.handles_), layout_(other.layout_) { other.handles_.image = VK_NULL_HANDLE; @@ -201,6 +218,7 @@ VulkanImage& VulkanImage::operator=(VulkanImage&& other) noexcept { allocator_ = other.allocator_; memory_ = std::move(other.memory_); owns_memory_ = other.owns_memory_; + is_copy_ = other.is_copy_; handles_ = other.handles_; layout_ = other.layout_; @@ -212,11 +230,18 @@ VulkanImage& VulkanImage::operator=(VulkanImage&& other) noexcept { } VulkanImage::~VulkanImage() { - if (VK_NULL_HANDLE != handles_.image_view) { + if (owns_view_ && handles_.image_view != VK_NULL_HANDLE) { vkDestroyImageView(this->device(), handles_.image_view, nullptr); } - if (VK_NULL_HANDLE != handles_.image) { + // Do not destroy any resources if this class instance is a copy of another + // class instance, since this means that this class instance does not have + // ownership of the underlying resource. + if (is_copy_) { + return; + } + + if (handles_.image != VK_NULL_HANDLE) { if (owns_memory_) { vmaDestroyImage(allocator_, handles_.image, memory_.allocation); } else { diff --git a/backends/vulkan/runtime/vk_api/memory/Image.h b/backends/vulkan/runtime/vk_api/memory/Image.h index 1e78f84a5c5..7f10301412f 100644 --- a/backends/vulkan/runtime/vk_api/memory/Image.h +++ b/backends/vulkan/runtime/vk_api/memory/Image.h @@ -22,6 +22,12 @@ #include namespace vkcompute { + +// Forward declare vTensor classes such that they can be set as friend classes +namespace api { +class vTensorStorage; +} // namespace api + namespace vkapi { class ImageSampler final { @@ -96,7 +102,23 @@ class VulkanImage final { VkSampler, const bool allocate_memory = true); - VulkanImage(const VulkanImage&) = delete; + protected: + /* + * The Copy constructor allows for creation of a class instance that are + * "aliases" of another class instance. 
The resulting class instance will not + * have ownership of the underlying VkImage. + * + * This behaviour is analogous to creating a copy of a pointer, thus it is + * unsafe, as the original class instance may be destroyed before the copy. + * This constructor is therefore marked protected so that it may be used + * only in situations where the lifetime of the original class instance is + * guaranteed to exceed, or at least be the same as, the lifetime of the + * copied class instance. + */ + VulkanImage(const VulkanImage& other) noexcept; + + public: + // To discourage creating copies, the assignment operator is still deleted. + VulkanImage& operator=(const VulkanImage&) = delete; VulkanImage(VulkanImage&&) noexcept; @@ -123,6 +145,12 @@ class VulkanImage final { Allocation memory_; // Indicates whether the underlying memory is owned by this resource bool owns_memory_; + // In some cases, a VulkanImage may be a copy of another VulkanImage but still + // own a unique view of the VkImage. + bool owns_view_; + // Indicates whether this VulkanImage was copied from another VulkanImage, + // and thus does not have ownership of the underlying VkImage + bool is_copy_; Handles handles_; // Layout VkImageLayout layout_; @@ -144,10 +172,6 @@ class VulkanImage final { return memory_.allocation; } - inline VmaAllocationCreateInfo allocation_create_info() const { - return VmaAllocationCreateInfo(memory_.create_info); - } - inline VkFormat format() const { return image_properties_.image_format; } @@ -193,20 +217,37 @@ class VulkanImage final { return owns_memory_; } + inline bool is_copy() const { + return is_copy_; + } + inline operator bool() const { return (handles_.image != VK_NULL_HANDLE); } + inline bool is_copy_of(const VulkanImage& other) const { + return (handles_.image == other.handles_.image) && is_copy_; + } + inline void bind_allocation(const Allocation& memory) { VK_CHECK_COND(!memory_, "Cannot bind an already bound allocation!"); - VK_CHECK(vmaBindImageMemory(allocator_, memory.allocation, handles_.image)); + // To avoid binding the same VkImage to a memory block multiple times, do + // not actually bind memory if this VulkanImage is a copy. Assume that the + // original VulkanImage is responsible for binding the image.
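For orientation, here is a minimal standalone sketch of the non-owning "copy as alias" pattern that the VulkanImage changes above implement: a copy shares the underlying handle, but only the original owns and eventually releases it, so the copy must not outlive the original. The names FakeHandle and AliasableResource are invented for this illustration and are not part of the ExecuTorch Vulkan API (the real copy constructor is also protected and exposed only to friend classes).

#include <cstdio>

// Stand-in for a GPU resource handle; purely illustrative.
struct FakeHandle {
  int id = 0;
};

class AliasableResource {
 public:
  // Owning constructor: pretend to allocate the underlying handle.
  explicit AliasableResource(int id)
      : handle_{id}, owns_handle_(true), is_copy_(false) {}

  // Copying produces a non-owning alias: the handle is shared, ownership is not.
  AliasableResource(const AliasableResource& other) noexcept
      : handle_(other.handle_), owns_handle_(false), is_copy_(true) {}

  AliasableResource& operator=(const AliasableResource&) = delete;

  ~AliasableResource() {
    // A copy never releases the handle; only the owning original does.
    if (is_copy_) {
      return;
    }
    if (owns_handle_) {
      std::printf("releasing handle %d\n", handle_.id);
    }
  }

  bool is_copy() const { return is_copy_; }

  bool is_copy_of(const AliasableResource& other) const {
    return handle_.id == other.handle_.id && is_copy_;
  }

 private:
  FakeHandle handle_;
  bool owns_handle_;
  bool is_copy_;
};

int main() {
  AliasableResource original(42);
  AliasableResource alias(original); // alias must not outlive original
  std::printf("alias.is_copy_of(original): %d\n",
              alias.is_copy_of(original) ? 1 : 0);
  return 0; // alias releases nothing; original frees the handle
}

The same rule motivates the guard in bind_allocation that follows: binding the VkImage to memory is the owning instance's job, so a copy skips the bind and only records the shared allocation.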
+ if (!is_copy_) { + VK_CHECK( + vmaBindImageMemory(allocator_, memory.allocation, handles_.image)); + } memory_.allocation = memory.allocation; // Only create the image view if the image has been bound to memory + owns_view_ = true; create_image_view(); } VkMemoryRequirements get_memory_requirements() const; + + friend class api::vTensorStorage; }; struct ImageMemoryBarrier final { diff --git a/backends/vulkan/runtime/vk_api/vk_api.h b/backends/vulkan/runtime/vk_api/vk_api.h index 6affd687dfe..e3fbf057f8b 100644 --- a/backends/vulkan/runtime/vk_api/vk_api.h +++ b/backends/vulkan/runtime/vk_api/vk_api.h @@ -10,6 +10,12 @@ #ifdef USE_VULKAN_WRAPPER #ifdef USE_VULKAN_VOLK +#ifdef VK_ANDROID_external_memory_android_hardware_buffer +#include +#include +#include +#endif /* VK_ANDROID_external_memory_android_hardware_buffer */ + #include #else #include diff --git a/backends/vulkan/targets.bzl b/backends/vulkan/targets.bzl index e8b232098be..f37534b089c 100644 --- a/backends/vulkan/targets.bzl +++ b/backends/vulkan/targets.bzl @@ -142,10 +142,18 @@ def define_common_targets(is_fbcode = False): VK_API_DEPS += [ "fbsource//third-party/volk:volk", ] + VK_API_DEPS += select({ + "DEFAULT": [], + "ovr_config//os:android": ["fbsource//third-party/toolchains:android"], + }) VK_API_PREPROCESSOR_FLAGS += [ "-DUSE_VULKAN_WRAPPER", "-DUSE_VULKAN_VOLK", ] + VK_API_PREPROCESSOR_FLAGS += select({ + "DEFAULT": [], + "ovr_config//os:android": ["-DVK_ANDROID_external_memory_android_hardware_buffer"], + }) else: VK_API_DEPS += [ "fbsource//third-party/swiftshader:swiftshader_vk_headers", diff --git a/backends/vulkan/test/glsl/scalar_add_texture.glsl b/backends/vulkan/test/glsl/scalar_add_texture.glsl index aa2b22c81f9..992907d0c25 100644 --- a/backends/vulkan/test/glsl/scalar_add_texture.glsl +++ b/backends/vulkan/test/glsl/scalar_add_texture.glsl @@ -13,7 +13,7 @@ layout(std430) buffer; ${layout_declare_tensor(0, "rw", "t_in", "float", "texture3d")} -${layout_declare_ubo(1, "uvec3", "extents")} +${layout_declare_ubo(1, "ivec3", "extents")} ${layout_declare_ubo(2, "int", "scalar")} layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; diff --git a/backends/vulkan/test/op_tests/cases.py b/backends/vulkan/test/op_tests/cases.py index 7f9f1842adf..0325d2552e4 100644 --- a/backends/vulkan/test/op_tests/cases.py +++ b/backends/vulkan/test/op_tests/cases.py @@ -8,7 +8,7 @@ from collections import namedtuple from typing import Callable -from executorch.backends.vulkan.test.op_tests.utils.codegen import VkTestSuite +from executorch.backends.vulkan.test.op_tests.utils.test_suite import VkTestSuite # Prime numbers dim sizes for testing @@ -49,6 +49,7 @@ def get_binary_elementwise_inputs(): ((S, S1, S2), (S, S1, S2)), ((S, S1, S2), (S, S1, 1), 2.0), ((S, S1, S2), (S, 1, S2), 2.0), + ((XS, S, S1, S2), (XS, S, 1, 1), 2.0), ] ) test_suite.layouts = [ @@ -465,8 +466,8 @@ def get_view_inputs(): return test_suite -@register_test_suite(["aten.slice.Tensor", "aten.slice_copy.Tensor"]) -def get_slice_inputs(): +@register_test_suite("aten.slice_copy.Tensor") +def get_slice_out_inputs(): Test = namedtuple("VkSliceTest", ["self", "dim", "start", "end", "step"]) Test.__new__.__defaults__ = (None, 0, None, None, 1) @@ -548,6 +549,64 @@ def get_slice_inputs(): return test_suite +def get_slice_view_inputs(): + Test = namedtuple("VkSliceTest", ["self", "dim", "start", "end", "step"]) + Test.__new__.__defaults__ = (None, 0, None, None, 1) + + # Slice by channel + test_cases = [ + Test(self=[1, 17, 1, 10], dim=1, start=0, 
end=4), + Test(self=[1, 17, 1, 10], dim=1, start=0, end=8), + Test(self=[1, 17, 3, 7], dim=1, start=0, end=12), + ] + + test_suite = VkTestSuite([tuple(tc) for tc in test_cases]) + + test_suite.dtypes = ["at::kFloat"] + test_suite.storage_types = ["utils::kBuffer", "utils::kTexture3D"] + test_suite.layouts = ["utils::kWidthPacked"] + test_suite.data_gen = "make_seq_tensor" + test_suite.is_view_op = True + + return test_suite + + +@register_test_suite(["aten.slice.Tensor"]) +def get_slice_inputs(): + texture_test_suite = get_slice_out_inputs() + texture_test_suite.test_name_suffix = "no_view" + + view_test_suite = get_slice_view_inputs() + view_test_suite.test_name_suffix = "view" + + return [view_test_suite, texture_test_suite] + + +@register_test_suite(["aten.transpose.int"]) +def get_transpose_inputs(): + Test = namedtuple("VkTransposeViewTest", ["self", "dim0", "dim1"]) + Test.__new__.__defaults__ = (None, 0, 1) + + test_cases = [ + Test(self=[M1, M2], dim0=0, dim1=1), + Test(self=[M1, S2, M], dim0=0, dim1=1), + Test(self=[M1, S2, M], dim0=0, dim1=2), + Test(self=[M1, S2, M], dim0=2, dim1=1), + Test(self=[S, M, S2, M2], dim0=3, dim1=2), + Test(self=[S, M, S2, M2], dim0=1, dim1=2), + Test(self=[S, M, S2, M2], dim0=3, dim1=1), + ] + + test_suite = VkTestSuite([tuple(tc) for tc in test_cases]) + + test_suite.dtypes = ["at::kFloat"] + test_suite.storage_types = ["utils::kBuffer", "utils::kTexture3D"] + test_suite.layouts = ["utils::kWidthPacked", "utils::kChannelsPacked"] + test_suite.data_gen = "make_seq_tensor" + test_suite.is_view_op = True + return test_suite + + @register_test_suite("aten.index_select.default") def get_index_select_inputs(): Test = namedtuple("VkIndexSelectTest", ["self", "dim", "index"]) @@ -688,6 +747,46 @@ def get_repeat_inputs(): return test_suite +@register_test_suite("aten.repeat_interleave.self_int") +def get_repeat_interleave_inputs(): + test_suite_W = VkTestSuite( + [ + ((4, 32, 256), 3, -2), + # Test repeat on each non-packed dim + ((16, 32, 64), 5, -2), + ((16, 32, 64), 5, -3), + # Test batched inputs + ((3, 5, 32, 64), 4, -2), + ((3, 5, 32, 64), 4, -3), + ] + ) + test_suite_W.layouts = [ + "utils::kWidthPacked", + ] + test_suite_W.data_gen = "make_seq_tensor" + test_suite_W.dtypes = ["at::kFloat"] + test_suite_W.test_name_suffix = "W_packed" + + test_suite_C = VkTestSuite( + [ + # Test repeat on each non-packed dim + ((32, 32, 16), 5, -1), + ((32, 32, 16), 5, -2), + # Test batched inputs + ((3, 16, 8, 64), 4, -1), + ((3, 16, 8, 64), 4, -2), + ] + ) + test_suite_C.layouts = [ + "utils::kChannelsPacked", + ] + test_suite_C.data_gen = "make_seq_tensor" + test_suite_C.dtypes = ["at::kFloat"] + test_suite_C.test_name_suffix = "C_packed" + + return [test_suite_W, test_suite_C] + + @register_test_suite("aten.cat.default") def get_cat_inputs(): # TensorList must be specified as list of tuples @@ -841,29 +940,30 @@ def get_split_tensor_inputs(): def get_softmax_inputs(): test_suite = VkTestSuite( [ - ((S1), 0, False), - ((S1), -1, False), - ((S, S1), 0, False), - ((S, S1), 1, False), - ((S, S1), -1, False), - ((S, S1), -2, False), + ((L), 0, False), + ((L), -1, False), + ((M, L), 0, False), + ((M, L), 1, False), + ((L, M), -1, False), + ((M, L), -2, False), ((S, S1, S2), 0, False), ((S, S1, S2), 1, False), ((S, S1, S2), 2, False), ((S, S1, S2), -1, False), ((S, S1, S2), -2, False), ((S, S1, S2), -3, False), - ((XS, S, S1, S2), 0, False), - ((XS, S, S1, S2), 1, False), - ((XS, S, S1, S2), 2, False), - ((XS, S, S1, S2), 3, False), - ((XS, S, S1, S2), -1, False), - 
((XS, S, S1, S2), -2, False), - ((XS, S, S1, S2), -3, False), - ((XS, S, S1, S2), -4, False), + ((1, S, S1, S2), 1, False), + ((1, S, S1, S2), 2, False), + ((1, S, S1, S2), 3, False), + ((1, S, S1, S2), -1, False), + ((1, S, S1, S2), -2, False), + ((1, S, S1, S2), -3, False), + # Test batches > 1 where the reduction dim is not the concat dim + ((S, S2, S1, 128), -1, False), ] ) test_suite.layouts = [ + "utils::kWidthPacked", "utils::kChannelsPacked", ] return test_suite @@ -878,6 +978,7 @@ def get_softmax_inputs(): "aten.neg.default", "aten.cos.default", "aten.hardswish.default", + "aten.hardsigmoid.default", ] ) def get_unary_ops_inputs(): @@ -1058,3 +1159,23 @@ def get_squeeze_copy_dim_inputs(): ] ) return test_suite + + +@register_test_suite("aten.flip.default") +def get_flip_inputs(): + Test = namedtuple("Flip", ["self", "dim"]) + Test.__new__.__defaults__ = (None, 0) + + test_cases = [ + Test(self=[9], dim=[0]), + Test(self=[9, 9], dim=[0, 1]), + Test(self=[9, 9, 9], dim=[0, 2]), + Test(self=[9, 9, 9], dim=[0, 1, 2]), + Test(self=[9, 9, 9, 9], dim=[0]), + Test(self=[9, 9, 9, 9], dim=[0, 2, 3]), + Test(self=[9, 9, 9, 9], dim=[1, 3]), + Test(self=[9, 9, 9, 9], dim=[0, 1, 2, 3]), + ] + + test_suite = VkTestSuite([tuple(tc) for tc in test_cases]) + return test_suite diff --git a/backends/vulkan/test/op_tests/generate_op_benchmarks.py b/backends/vulkan/test/op_tests/generate_op_benchmarks.py new file mode 100644 index 00000000000..7f286123df9 --- /dev/null +++ b/backends/vulkan/test/op_tests/generate_op_benchmarks.py @@ -0,0 +1,88 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import argparse +import os + +from typing import Dict + +from executorch.backends.vulkan.test.op_tests.cases import test_suites + +from executorch.backends.vulkan.test.op_tests.utils.gen_benchmark_vk import ( + VkBenchmarkFileGen, +) +from executorch.backends.vulkan.test.op_tests.utils.gen_computegraph import ( + ComputeGraphGen, +) +from executorch.backends.vulkan.test.op_tests.utils.test_suite import TestSuite +from torchgen import local + +from torchgen.gen import parse_native_yaml, ParsedYaml +from torchgen.model import DispatchKey, NativeFunction + + +def registry_name(f: NativeFunction) -> str: + name = str(f.namespace) + "." 
+ str(f.func.name) + if len(f.func.name.overload_name) == 0: + name += ".default" + return name + + +def construct_f_map(parsed_yaml: ParsedYaml) -> Dict[str, NativeFunction]: + f_map: Dict[str, NativeFunction] = {} + for f in parsed_yaml.native_functions: + f_map[registry_name(f)] = f + return f_map + + +def process_test_suites( + cpp_generator: VkBenchmarkFileGen, + f_map: Dict[str, NativeFunction], + test_suites: Dict[str, TestSuite], +) -> None: + for registry_name, op_test_suites in test_suites.items(): + f = f_map[registry_name] + if isinstance(op_test_suites, list): + for suite in op_test_suites: + cpp_generator.add_suite(registry_name, f, suite) + else: + cpp_generator.add_suite(registry_name, f, op_test_suites) + + +@local.parametrize( + use_const_ref_for_mutable_tensors=False, use_ilistref_for_tensor_lists=False +) +def generate_cpp( + native_functions_yaml_path: str, tags_path: str, output_dir: str +) -> None: + output_file = os.path.join(output_dir, "op_benchmarks.cpp") + cpp_generator = VkBenchmarkFileGen(output_file) + + parsed_yaml = parse_native_yaml(native_functions_yaml_path, tags_path) + f_map = construct_f_map(parsed_yaml) + + ComputeGraphGen.backend_key = parsed_yaml.backend_indices[DispatchKey.CPU] + + process_test_suites(cpp_generator, f_map, test_suites) + + with open(output_file, "w") as file: + file.write(cpp_generator.generate_cpp()) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "--aten-yaml-path", + help="path to native_functions.yaml file.", + ) + parser.add_argument( + "--tags-path", + help="Path to tags.yaml. Required by yaml parsing in gen_correctness_vk system.", + ) + + parser.add_argument("-o", "--output", help="Output directory", required=True) + args = parser.parse_args() + generate_cpp(args.aten_yaml_path, args.tags_path, args.output) diff --git a/backends/vulkan/test/op_tests/generate_op_tests.py b/backends/vulkan/test/op_tests/generate_op_correctness_tests.py similarity index 68% rename from backends/vulkan/test/op_tests/generate_op_tests.py rename to backends/vulkan/test/op_tests/generate_op_correctness_tests.py index 71047ac6f49..4e51e23940b 100644 --- a/backends/vulkan/test/op_tests/generate_op_tests.py +++ b/backends/vulkan/test/op_tests/generate_op_correctness_tests.py @@ -10,12 +10,14 @@ from typing import Dict from executorch.backends.vulkan.test.op_tests.cases import test_suites +from executorch.backends.vulkan.test.op_tests.utils.gen_computegraph import ( + ComputeGraphGen, +) -from executorch.backends.vulkan.test.op_tests.utils.codegen import VkCppTestFileGen -from executorch.backends.vulkan.test.op_tests.utils.codegen_base import ( - TestSuite, - TestSuiteGen, +from executorch.backends.vulkan.test.op_tests.utils.gen_correctness_vk import ( + VkCorrectnessTestFileGen, ) +from executorch.backends.vulkan.test.op_tests.utils.test_suite import TestSuite from torchgen import local from torchgen.gen import parse_native_yaml, ParsedYaml @@ -37,13 +39,17 @@ def construct_f_map(parsed_yaml: ParsedYaml) -> Dict[str, NativeFunction]: def process_test_suites( - cpp_generator: VkCppTestFileGen, + cpp_generator: VkCorrectnessTestFileGen, f_map: Dict[str, NativeFunction], test_suites: Dict[str, TestSuite], ) -> None: - for registry_name, op_test_suite in test_suites.items(): + for registry_name, op_test_suites in test_suites.items(): f = f_map[registry_name] - cpp_generator.add_suite(registry_name, f, op_test_suite) + if isinstance(op_test_suites, list): + for suite in op_test_suites: + 
cpp_generator.add_suite(registry_name, f, suite) + else: + cpp_generator.add_suite(registry_name, f, op_test_suites) @local.parametrize( @@ -53,12 +59,12 @@ def generate_cpp( native_functions_yaml_path: str, tags_path: str, output_dir: str ) -> None: output_file = os.path.join(output_dir, "op_tests.cpp") - cpp_generator = VkCppTestFileGen(output_file) + cpp_generator = VkCorrectnessTestFileGen(output_file) parsed_yaml = parse_native_yaml(native_functions_yaml_path, tags_path) f_map = construct_f_map(parsed_yaml) - TestSuiteGen.backend_key = parsed_yaml.backend_indices[DispatchKey.CPU] + ComputeGraphGen.backend_key = parsed_yaml.backend_indices[DispatchKey.CPU] process_test_suites(cpp_generator, f_map, test_suites) @@ -67,16 +73,14 @@ def generate_cpp( if __name__ == "__main__": - parser = argparse.ArgumentParser( - description="Generate a simple Hello World C++ program." - ) + parser = argparse.ArgumentParser() parser.add_argument( "--aten-yaml-path", help="path to native_functions.yaml file.", ) parser.add_argument( "--tags-path", - help="Path to tags.yaml. Required by yaml parsing in codegen system.", + help="Path to tags.yaml. Required by yaml parsing in gen_correctness_vk system.", ) parser.add_argument("-o", "--output", help="Output directory", required=True) args = parser.parse_args() diff --git a/backends/vulkan/test/op_tests/sdpa_test.cpp b/backends/vulkan/test/op_tests/sdpa_test.cpp new file mode 100644 index 00000000000..ba85fb3c91f --- /dev/null +++ b/backends/vulkan/test/op_tests/sdpa_test.cpp @@ -0,0 +1,515 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include + +#include + +#include +#include +#include + +#include +#include +#include + +#include +#include + +namespace torch { +namespace executor { +namespace native { + +// The below are copied from executorch/extension/llm/custom_ops/op_sdpa_aot.cpp +// They are needed because the original definitions are inaccessible due to +// being defined in an anonymous namespace. 
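The reason these wrappers have to be re-created here is the C++ linkage rule: a function defined in an anonymous namespace has internal linkage, so it cannot be declared and called from another translation unit such as this test file. The following is a minimal sketch of that rule with made-up names (add_impl, mylib::add); it is not the actual op_sdpa code, only an illustration of why a named-namespace forwarder is needed to expose an anonymous-namespace helper.

#include <cstdio>

namespace {
// Internal linkage: visible only inside this translation unit, which is the
// situation described in the comment above.
int add_impl(int a, int b) {
  return a + b;
}
} // namespace

namespace mylib {
// External linkage: other translation units can declare and call this, and it
// simply forwards to the hidden helper.
int add(int a, int b) {
  return add_impl(a, b);
}
} // namespace mylib

int main() {
  std::printf("%d\n", mylib::add(2, 3));
  return 0;
}

The copies below play the role of mylib::add: they forward to the publicly visible sdpa_with_kv_cache_out kernel with a locally constructed KernelRuntimeContext so the test can exercise it through ATen tensors.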
+ +Tensor& sdpa_with_kv_cache_out_no_context( + const Tensor& q_projected, + const Tensor& k_projected, + const Tensor& v_projected, + Tensor& key_cache, + Tensor& value_cache, + const int64_t start_pos, + const int64_t seq_len, + // @lint-ignore CLANGTIDY facebook-hte-ConstantArgumentPassByValue + // @lint-ignore CLANGTIDY facebook-hte-ParameterMightThrowOnCopy + const optional attn_mask, + const double dropout_p, + const bool is_causal, + // @lint-ignore CLANGTIDY facebook-hte-ParameterMightThrowOnCopy + const optional scale, + Tensor& output) { + executorch::runtime::KernelRuntimeContext context{}; + return torch::executor::native::sdpa_with_kv_cache_out( + context, + q_projected, + k_projected, + v_projected, + key_cache, + value_cache, + start_pos, + seq_len, + attn_mask, + dropout_p, + is_causal, + scale, + output); +} + +at::Tensor sdpa_with_kv_cache_aten( + const at::Tensor& q_projected, + const at::Tensor& k_projected, + const at::Tensor& v_projected, + at::Tensor& key_cache, + at::Tensor& value_cache, + const int64_t start_pos, + const int64_t seq_len, + // @lint-ignore CLANGTIDY facebook-hte-ConstantArgumentPassByValue + // @lint-ignore CLANGTIDY facebook-hte-ParameterMightThrowOnCopy + const std::optional attn_mask, + const double dropout_p, + const bool is_causal, + // @lint-ignore CLANGTIDY facebook-hte-ParameterMightThrowOnCopy + const std::optional scale) { + auto output = at::empty_like(q_projected); + WRAP_TO_ATEN(sdpa_with_kv_cache_out_no_context, 11) + (q_projected, + k_projected, + v_projected, + key_cache, + value_cache, + start_pos, + seq_len, + attn_mask, + dropout_p, + is_causal, + scale, + output); + return output; +} + +} // namespace native +} // namespace executor +} // namespace torch + +// +// Reference Implementation +// + +/* + * Converts a boolean mask to an additive mask. Values that are false are + * converted to -inf, and values that are true are converted to 0. + */ +at::Tensor convert_boolean_attn_mask( + const at::Tensor& attn_mask, + caffe2::TypeMeta dtype) { + // Convert boolean mask to additive mask; need to invert mask to indicate what + // to mask *out*. + if (attn_mask.dtype() == at::kBool) { + return at::where( + attn_mask.logical_not(), + -std::numeric_limits::infinity(), + at::scalar_tensor( + 0.0, at::TensorOptions().dtype(dtype).device(attn_mask.device()))); + } + // Otherwise, attn_mask represents an additive attention tensor + return attn_mask; +} + +/* + * Construct an attention mask for SDPA. + * 1. Construct a square matrix of ones with each dim equal to start_pos + + * seq_len + * 2. Keep the lower triangular elements as 1 and set the rest to 0 + * 3. Slice the mask to keep only seq_len rows starting from input_pos + * 4. 
Convert the mask to an additive mask + */ +at::Tensor construct_attention_mask( + const at::Tensor& q, + const at::Tensor& k_cache, + const int start_pos) { + const int max_seq_len = k_cache.size(1); + const int seq_len = q.size(1); + + const int length = start_pos + seq_len; + at::Tensor attn_mask_base = + at::ones({length, length}, q.options().dtype(at::kBool)).tril(); + + at::Tensor attn_mask_sliced = + at::slice(attn_mask_base, 0, start_pos, start_pos + seq_len); + + attn_mask_sliced = convert_boolean_attn_mask(attn_mask_sliced, q.dtype()); + return attn_mask_sliced; +} + +/* + * Reference implementation of SDPA + */ +at::Tensor sdpa_reference_impl( + const at::Tensor& q_projected, + const at::Tensor& k_projected, + const at::Tensor& v_projected, + at::Tensor& key_cache, + at::Tensor& value_cache, + const int64_t start_pos, + const int64_t seq_len, + const c10::optional __attn_mask_ignored, + const double dropout_p, + const bool is_causal, + const c10::optional scale) { + at::Tensor attn_mask = + construct_attention_mask(q_projected, key_cache, start_pos); + + // Cache update + at::Tensor key_cache_updated = at::slice_scatter( + key_cache, k_projected, 1, start_pos, start_pos + k_projected.size(1)); + at::Tensor value_cache_updated = at::slice_scatter( + value_cache, v_projected, 1, start_pos, start_pos + v_projected.size(1)); + + // Write back to input + key_cache = key_cache_updated; + value_cache = value_cache_updated; + + at::Tensor key_cache_sliced = + at::slice(key_cache_updated, 1, 0, start_pos + q_projected.size(1)); + + at::Tensor value_cache_sliced = + at::slice(value_cache_updated, 1, 0, start_pos + q_projected.size(1)); + + // Since n_heads may not be the same as n_kv_heads, the sliced k and v cache + // matrices need to be "expanded" to match + const int num_repeats = q_projected.size(2) / key_cache.size(2); + at::Tensor key_cache_sliced_repeated = + at::repeat_interleave(key_cache_sliced, num_repeats, 2); + at::Tensor value_cache_sliced_repeated = + at::repeat_interleave(value_cache_sliced, num_repeats, 2); + + at::Tensor q_transposed = q_projected.transpose(1, 2); + at::Tensor k_transposed = key_cache_sliced_repeated.transpose(1, 2); + at::Tensor v_transposed = value_cache_sliced_repeated.transpose(1, 2); + + at::Tensor k_transposed_2 = k_transposed.transpose(-2, -1); + at::Tensor attn_weight_prescale = at::matmul(q_transposed, k_transposed_2); + + float scale_factor = 1.0 / sqrt(q_transposed.size(-1)); + at::Tensor attn_weight = attn_weight_prescale * scale_factor + attn_mask; + + at::Tensor attn_weight_softmax = at::softmax(attn_weight, -1); + at::Tensor out = at::matmul(attn_weight_softmax, v_transposed); + + return out.transpose(1, 2); +} + +// +// Test functions +// + +void test_reference_sdpa( + const int start_input_pos, + const int sequence_len, + const int embedding_dim, + const int num_heads, + const int num_kv_heads, + const int batch_size, + const int max_seq_len, + at::ScalarType dtype = at::kFloat) { + const int head_dim = embedding_dim / num_heads; + + // K and V caches. 
Need an extra set for the reference implementation + + at::Tensor k_cache = at::zeros( + {batch_size, max_seq_len, num_kv_heads, head_dim}, + at::device(at::kCPU).dtype(dtype)); + at::Tensor v_cache = at::zeros_like(k_cache); + + at::Tensor k_cache_ref = at::zeros_like(k_cache); + at::Tensor v_cache_ref = at::zeros_like(v_cache); + + for (int input_pos = start_input_pos; input_pos + sequence_len < max_seq_len; + input_pos += sequence_len) { + at::Tensor q = at::rand( + {batch_size, sequence_len, num_heads, head_dim}, + at::device(at::kCPU).dtype(dtype)); + at::Tensor k = at::rand( + {batch_size, sequence_len, num_kv_heads, head_dim}, + at::device(at::kCPU).dtype(dtype)); + at::Tensor v = at::rand_like(k); + + at::Tensor reference_impl_out = sdpa_reference_impl( + q, k, v, k_cache, v_cache, input_pos, sequence_len, {}, 0.0, true, {}); + + at::Tensor reference_out = torch::executor::native::sdpa_with_kv_cache_aten( + q, + k, + v, + k_cache_ref, + v_cache_ref, + input_pos, + sequence_len, + {}, + 0.0, + true, + {}); + + ASSERT_TRUE(at::allclose(reference_impl_out, reference_out)); + } +} + +vkcompute::vkapi::ScalarType from_at_scalartype(c10::ScalarType at_scalartype) { + using namespace vkcompute; + switch (at_scalartype) { + case c10::kFloat: + return vkapi::kFloat; + case c10::kHalf: + return vkapi::kHalf; + case c10::kInt: + return vkapi::kInt; + case c10::kLong: + return vkapi::kInt; + case c10::kChar: + return vkapi::kChar; + default: + VK_THROW("Unsupported at::ScalarType!"); + } +} + +void test_vulkan_sdpa( + const int start_input_pos, + const int base_sequence_len, + const int embedding_dim, + const int num_heads, + const int num_kv_heads, + const int batch_size, + const int max_seq_len, + const bool dynamic_seq_len = true, + at::ScalarType dtype = at::kFloat) { + const int head_dim = embedding_dim / num_heads; + + const int init_seq_len = dynamic_seq_len ? 
max_seq_len : base_sequence_len; + // K and V caches + + at::Tensor k_cache = at::zeros( + {batch_size, max_seq_len, num_kv_heads, head_dim}, + at::device(at::kCPU).dtype(dtype)); + + at::Tensor v_cache = at::zeros_like(k_cache); + + // Reference input data + at::Tensor q = at::empty( + {batch_size, init_seq_len, num_heads, head_dim}, + at::device(at::kCPU).dtype(dtype)); + at::Tensor k = at::empty( + {batch_size, init_seq_len, num_kv_heads, head_dim}, + at::device(at::kCPU).dtype(dtype)); + at::Tensor v = at::empty_like(k); + + // Get reference output + at::Tensor out = at::empty_like(q); + + // Build Vulkan SDPA graph + using namespace vkcompute; + + GraphConfig config; + config.set_storage_type_override(utils::kTexture3D); + ComputeGraph graph(config); + + // "Data" variant for vulkan initialization + + at::Tensor k_cache_data = at::zeros_like(k_cache); + at::Tensor v_cache_data = at::zeros_like(v_cache); + +#define MAKE_TENSORREF_FOR(x) \ + ValueRef r_##x = graph.add_tensorref( \ + x.sizes().vec(), \ + from_at_scalartype(x.scalar_type()), \ + x.const_data_ptr()); + + MAKE_TENSORREF_FOR(k_cache_data); + MAKE_TENSORREF_FOR(v_cache_data); + +#define MAKE_INPUT_FOR(x) \ + IOValueRef r_##x = graph.add_input_tensor( \ + x.sizes().vec(), from_at_scalartype(x.scalar_type())); + + MAKE_INPUT_FOR(q); + MAKE_INPUT_FOR(k); + MAKE_INPUT_FOR(v); +#undef MAKE_INPUT_FOR + + const ValueRef r_input_pos_symint = graph.add_symint(start_input_pos); + const ValueRef r_out = graph.add_tensor( + out.sizes().vec(), from_at_scalartype(out.scalar_type())); + + VK_GET_OP_FN("sdpa_with_kv_cache.default") + (graph, + { + r_q.value, + r_k.value, + r_v.value, + r_k_cache_data, + r_v_cache_data, + r_input_pos_symint, + kDummyValueRef, // sequence_len + kDummyValueRef, // attn_mask + kDummyValueRef, // dropout_p + kDummyValueRef, // is_causal + kDummyValueRef, // scale + r_out, + }); + + ValueRef staging_out = graph.set_output_tensor(r_out); + + graph.prepare(); + graph.encode_prepack(); + graph.prepack(); + graph.encode_execute(); + + // + // Run model + // + +#define COPY_INPUT(x) \ + graph.copy_into_staging(r_##x.staging, x.const_data_ptr(), x.numel()); + +#define EXTRACT_TENSOR(x) \ + at::Tensor vk_##x = at::zeros_like(x).contiguous(); \ + graph.copy_from_staging( \ + staging_##x, vk_##x.mutable_data_ptr(), vk_##x.numel()); + + int seq_len = base_sequence_len; + for (int i = 0, input_pos = start_input_pos; + input_pos + seq_len < max_seq_len; + input_pos += seq_len, i++) { + q = at::rand( + {batch_size, seq_len, num_heads, head_dim}, + at::device(at::kCPU).dtype(dtype)); + k = at::rand( + {batch_size, seq_len, num_kv_heads, head_dim}, + at::device(at::kCPU).dtype(dtype)); + v = at::rand_like(k); + + at::Tensor reference_out = sdpa_reference_impl( + q, k, v, k_cache, v_cache, input_pos, seq_len, {}, 0.0, true, {}); + + graph.set_symint(r_input_pos_symint, input_pos); + graph.resize_input(0, q.sizes().vec()); + graph.resize_input(1, k.sizes().vec()); + graph.resize_input(2, v.sizes().vec()); + graph.propagate_resize(); + + // Run Vulkan SDPA + COPY_INPUT(q); + COPY_INPUT(k); + COPY_INPUT(v); + + graph.execute(); + + out = at::empty_like(q); + EXTRACT_TENSOR(out); + + const bool output_correct = at::allclose(reference_out, vk_out); + if (!output_correct) { + at::Tensor diffs = at::abs(reference_out - vk_out); + + std::cout << "Failed at input_pos " << input_pos << " with seq_len " + << seq_len << std::endl; + + std::cout << "Maximum difference: " << std::endl; + std::cout << at::max(diffs).item() << std::endl; + 
std::cout << "Found at index " << std::endl; + std::cout << at::argmax(diffs).item() << std::endl; + + std::cout << "Maximum value observed: " << std::endl; + std::cout << at::max(at::abs(at::cat({reference_out, vk_out}, -1))).item() + << std::endl; + } + ASSERT_TRUE(output_correct); + + if (dynamic_seq_len) { + seq_len = base_sequence_len + (i % 3); + } + } +} + +TEST(VulkanSDPATest, test_sdpa_op_small_params) { + const int starting_input_pos = 0; + const int base_sequence_len = 3; + const int embedding_dim = 18; + const int num_heads = 6; + const int num_kv_heads = 2; + const int batch_size = 1; + const int max_seq_len = 7; + + test_vulkan_sdpa( + starting_input_pos, + base_sequence_len, + embedding_dim, + num_heads, + num_kv_heads, + batch_size, + max_seq_len, + false); +} + +TEST(VulkanSDPATest, test_sdpa_op_small_params_dynamic) { + const int starting_input_pos = 0; + const int base_sequence_len = 3; + const int embedding_dim = 18; + const int num_heads = 6; + const int num_kv_heads = 2; + const int batch_size = 1; + const int max_seq_len = 12; + + test_vulkan_sdpa( + starting_input_pos, + base_sequence_len, + embedding_dim, + num_heads, + num_kv_heads, + batch_size, + max_seq_len); +} + +TEST(VulkanSDPATest, test_sdpa_op_llama3_params_dynamic) { + const int starting_input_pos = 0; + const int base_sequence_len = 3; + const int embedding_dim = 2048; + const int num_heads = 32; + const int num_kv_heads = 8; + const int batch_size = 1; + const int max_seq_len = 128; + + test_vulkan_sdpa( + starting_input_pos, + base_sequence_len, + embedding_dim, + num_heads, + num_kv_heads, + batch_size, + max_seq_len); +} + +TEST(VulkanSDPATest, test_reference_impl) { + const int starting_input_pos = 0; + const int base_sequence_len = 3; + const int embedding_dim = 2048; + const int num_heads = 32; + const int num_kv_heads = 8; + const int batch_size = 1; + const int max_seq_len = 128; + + test_reference_sdpa( + starting_input_pos, + base_sequence_len, + embedding_dim, + num_heads, + num_kv_heads, + batch_size, + max_seq_len); +} diff --git a/backends/vulkan/test/op_tests/targets.bzl b/backends/vulkan/test/op_tests/targets.bzl index 0cffb5d80be..3acf1debe50 100644 --- a/backends/vulkan/test/op_tests/targets.bzl +++ b/backends/vulkan/test/op_tests/targets.bzl @@ -8,9 +8,9 @@ def define_common_targets(is_fbcode = False): return runtime.python_library( - name = "generate_op_tests_lib", + name = "generate_op_correctness_tests_lib", srcs = native.glob(["utils/*.py"]) + [ - "generate_op_tests.py", + "generate_op_correctness_tests.py", "cases.py", ], base_module = "executorch.backends.vulkan.test.op_tests", @@ -20,24 +20,45 @@ def define_common_targets(is_fbcode = False): external_deps = ["torchgen"], ) + runtime.python_library( + name = "generate_op_benchmarks_lib", + srcs = native.glob(["utils/*.py"]) + [ + "generate_op_benchmarks.py", + "cases.py", + ], + base_module = "executorch.backends.vulkan.test.op_tests", + deps = [ + "fbsource//third-party/pypi/expecttest:expecttest", + ], + external_deps = ["torchgen"], + ) + + runtime.python_binary( + name = "generate_op_correctness_tests", + main_module = "executorch.backends.vulkan.test.op_tests.generate_op_correctness_tests", + deps = [ + ":generate_op_correctness_tests_lib", + ], + ) + runtime.python_binary( - name = "generate_op_tests", - main_module = "executorch.backends.vulkan.test.op_tests.generate_op_tests", + name = "generate_op_benchmarks", + main_module = "executorch.backends.vulkan.test.op_tests.generate_op_benchmarks", deps = [ - 
":generate_op_tests_lib", + ":generate_op_benchmarks_lib", ], ) aten_src_path = runtime.external_dep_location("aten-src-path") genrule_cmd = [ - "$(exe :generate_op_tests)", + "$(exe :generate_op_correctness_tests)", "--tags-path $(location {})/aten/src/ATen/native/tags.yaml".format(aten_src_path), "--aten-yaml-path $(location {})/aten/src/ATen/native/native_functions.yaml".format(aten_src_path), "-o $OUT", ] runtime.genrule( - name = "generated_op_tests_cpp", + name = "generated_op_correctness_tests_cpp", outs = { "op_tests.cpp": ["op_tests.cpp"], }, @@ -45,6 +66,22 @@ def define_common_targets(is_fbcode = False): default_outs = ["."], ) + benchmarks_genrule_cmd = [ + "$(exe :generate_op_benchmarks)", + "--tags-path $(location {})/aten/src/ATen/native/tags.yaml".format(aten_src_path), + "--aten-yaml-path $(location {})/aten/src/ATen/native/native_functions.yaml".format(aten_src_path), + "-o $OUT", + ] + + runtime.genrule( + name = "generated_op_benchmarks_cpp", + outs = { + "op_benchmarks.cpp": ["op_benchmarks.cpp"], + }, + cmd = " ".join(benchmarks_genrule_cmd), + default_outs = ["."], + ) + pt_operator_library( name = "all_aten_ops", check_decl = False, @@ -66,7 +103,7 @@ def define_common_targets(is_fbcode = False): runtime.cxx_binary( name = "compute_graph_op_tests_bin", srcs = [ - ":generated_op_tests_cpp[op_tests.cpp]", + ":generated_op_correctness_tests_cpp[op_tests.cpp]", ], define_static_target = False, deps = [ @@ -76,10 +113,62 @@ def define_common_targets(is_fbcode = False): ], ) + runtime.cxx_binary( + name = "compute_graph_op_benchmarks_bin", + srcs = [ + ":generated_op_benchmarks_cpp[op_benchmarks.cpp]", + ], + compiler_flags = [ + "-Wno-unused-variable", + ], + define_static_target = False, + deps = [ + "//third-party/benchmark:benchmark", + "//executorch/backends/vulkan:vulkan_graph_runtime", + ":all_aten_ops_lib", + ], + ) + runtime.cxx_test( name = "compute_graph_op_tests", srcs = [ - ":generated_op_tests_cpp[op_tests.cpp]", + ":generated_op_correctness_tests_cpp[op_tests.cpp]", + ], + contacts = ["oncall+ai_infra_mobile_platform@xmail.facebook.com"], + fbandroid_additional_loaded_sonames = [ + "torch-code-gen", + "vulkan_graph_runtime", + "vulkan_graph_runtime_shaderlib", + ], + platforms = [ANDROID], + use_instrumentation_test = True, + deps = [ + "//third-party/googletest:gtest_main", + "//executorch/backends/vulkan:vulkan_graph_runtime", + runtime.external_dep_location("libtorch"), + ], + ) + + runtime.cxx_binary( + name = "sdpa_test_bin", + srcs = [ + "sdpa_test.cpp", + ], + compiler_flags = [ + "-Wno-unused-variable", + ], + define_static_target = False, + deps = [ + "//third-party/googletest:gtest_main", + "//executorch/backends/vulkan:vulkan_graph_runtime", + "//executorch/extension/llm/custom_ops:custom_ops_aot_lib", + ], + ) + + runtime.cxx_test( + name = "sdpa_test", + srcs = [ + "sdpa_test.cpp", ], contacts = ["oncall+ai_infra_mobile_platform@xmail.facebook.com"], fbandroid_additional_loaded_sonames = [ @@ -92,6 +181,8 @@ def define_common_targets(is_fbcode = False): deps = [ "//third-party/googletest:gtest_main", "//executorch/backends/vulkan:vulkan_graph_runtime", + "//executorch/extension/llm/custom_ops:custom_ops_aot_lib", + "//executorch/extension/tensor:tensor", runtime.external_dep_location("libtorch"), ], ) diff --git a/backends/vulkan/test/op_tests/utils/aten_types.py b/backends/vulkan/test/op_tests/utils/aten_types.py new file mode 100644 index 00000000000..186f5afb78b --- /dev/null +++ b/backends/vulkan/test/op_tests/utils/aten_types.py @@ -0,0 
+1,30 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +#################### +## ATen C++ Types ## +#################### + +AT_INT_ARRAY_REF = "at::IntArrayRef" +AT_SCALAR = "at::Scalar" +AT_TENSOR = "at::Tensor" +AT_TENSOR_LIST = "at::TensorList" +BOOL = "bool" +DOUBLE = "double" +INT = "int64_t" +OPT_AT_DOUBLE_ARRAY_REF = "::std::optional>" +OPT_AT_INT_ARRAY_REF = "at::OptionalIntArrayRef" +OPT_AT_TENSOR = "::std::optional" +OPT_BOOL = "::std::optional" +OPT_INT64 = "::std::optional" +OPT_DEVICE = "::std::optional" +OPT_LAYOUT = "::std::optional" +OPT_MEMORY_FORMAT = "::std::optional" +OPT_SCALAR_TYPE = "::std::optional" +STRING = "c10::string_view" +TWO_TENSOR_TUPLE = "::std::tuple" +THREE_TENSOR_TUPLE = "::std::tuple" +TENSOR_VECTOR = "::std::vector" diff --git a/backends/vulkan/test/op_tests/utils/gen_benchmark_vk.py b/backends/vulkan/test/op_tests/utils/gen_benchmark_vk.py new file mode 100644 index 00000000000..fb42d982f67 --- /dev/null +++ b/backends/vulkan/test/op_tests/utils/gen_benchmark_vk.py @@ -0,0 +1,335 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import re + +from executorch.backends.vulkan.test.op_tests.utils.gen_computegraph import ( + ComputeGraphGen, +) +from executorch.backends.vulkan.test.op_tests.utils.gen_correctness_base import ( + CorrectnessTestGen, +) +from executorch.backends.vulkan.test.op_tests.utils.test_suite import TestSuite + +from torchgen.model import NativeFunction + +########################## +## Test Suite Generation ## +########################## + +benchmark_fixture_template = """ +class GeneratedOpBenchmark_{op_name} : public ::benchmark::Fixture {{ + protected: + ComputeGraph* graph; + at::ScalarType test_dtype = at::kFloat; + float rtol = {rtol}; + float atol = {atol}; + + {arg_valuerefs} + + void SetUp(::benchmark::State& state) override {{ + GraphConfig config; + config.descriptor_pool_safety_factor = 2.0; + test_dtype = at::ScalarType(state.range(0)); + const utils::StorageType storage_type = utils::StorageType(state.range(1)); + const utils::GPUMemoryLayout memory_layout = utils::GPUMemoryLayout(state.range(2)); + config.set_storage_type_override(storage_type); + config.set_memory_layout_override(memory_layout); + config.enable_querypool = true; + graph = new ComputeGraph(config); + }} + + void TearDown(::benchmark::State& state) override {{ + delete graph; + graph = nullptr; + }} + + {build_graph_fn} + {benchmark_fn} +}}; +""" + +benchmark_template = """ +BENCHMARK_DEFINE_F(GeneratedOpBenchmark_{op_name}, {case_name})(benchmark::State& state) {{ + {skips} + {create_ref_data} + {call_build_graph} + ShaderTimes shader_times; + for (auto _ : state) {{ + {call_benchmark} + graph->context()->querypool().extract_results(); + QueryPoolResults results = graph->context()->querypool().get_shader_timestamp_data(); + process_querypool_results(results, shader_times); + }} + register_shader_time_counters(state, shader_times); +}} + +BENCHMARK_REGISTER_F(GeneratedOpBenchmark_{op_name}, {case_name})->Threads(1)->ArgsProduct({combos}); +""" + + +class VkBenchmarkGen(CorrectnessTestGen): + def __init__(self, op_reg_name: str, f: NativeFunction, inputs: TestSuite): + super().__init__(f, inputs) + self.op_reg_name = 
op_reg_name + self.generator = ComputeGraphGen(self.op_reg_name, self.f, self.suite_def) + + def gen_call_benchmark(self, prepack=False) -> str: + test_str = f"benchmark_{self.op_name}(" + if prepack: + test_str = f"prepacked_benchmark_{self.op_name}(" + for binding in self.f_sig.arguments(): + arg = binding.argument + test_str += f"{arg.name}, " + test_str = test_str[:-2] + ");" + test_str = re.sub(r"^", " ", test_str, flags=re.M) + return test_str + + def gen_call_build_graph(self, prepack=False) -> str: + test_str = f"build_graph_{self.op_name}(" + if prepack: + test_str = f"prepacked_build_graph_{self.op_name}(" + for binding in self.f_sig.arguments(): + arg = binding.argument + test_str += f"{arg.name}, " + test_str = test_str[:-2] + ");" + test_str = re.sub(r"^", " ", test_str, flags=re.M) + return test_str + + def gen_combos(self, inputs) -> str: + dtypes_list = ", ".join(f"int({dtype})" for dtype in self.suite_def.dtypes) + storage_types_list = ", ".join( + f"int({storage_type})" for storage_type in self.suite_def.storage_types + ) + layouts_list = ", ".join(f"int({layout})" for layout in self.suite_def.layouts) + return f"{{ {{ {dtypes_list} }}, {{ {storage_types_list} }}, {{ {layouts_list} }} }}" + + def generate_benchmark_case(self, inputs, prepack=False) -> str: + return benchmark_template.format( + op_name=f"{self.op_name}", + case_name=self.gen_case_name(inputs, prepack), + skips=self.generator.gen_conditional_skips( + 'state.SkipWithError("unsupported type"); return;' + ), + create_ref_data=self.gen_create_ref_data(inputs), + call_build_graph=self.gen_call_build_graph(prepack), + call_benchmark=self.gen_call_benchmark(prepack), + combos=self.gen_combos(inputs), + ) + + def generate_benchmark(self) -> str: + benchmarks_cpp = "" + for inputs in self.suite_def.input_cases: + if not self.suite_def.requires_prepack: + benchmarks_cpp += self.generate_benchmark_case(inputs) + if self.suite_def.supports_prepack(): + benchmarks_cpp += self.generate_benchmark_case(inputs, prepack=True) + return benchmarks_cpp + + def generate_benchmark_fixture(self) -> str: + build_graph_fn = "" + benchmark_fn = "" + if not self.suite_def.requires_prepack: + build_graph_fn = self.generator.gen_build_graph_fn() + benchmark_fn = self.generator.gen_op_exec_graph_fn() + + prepacked_build_graph_fn = "" + prepacked_benchmark_fn = "" + if self.suite_def.supports_prepack(): + self.generator.should_prepack = True + prepacked_build_graph_fn = self.generator.gen_build_graph_fn() + build_graph_fn += "\n\n " + build_graph_fn += prepacked_build_graph_fn + prepacked_benchmark_fn = self.generator.gen_op_exec_graph_fn() + benchmark_fn += "\n\n " + benchmark_fn += prepacked_benchmark_fn + + return benchmark_fixture_template.format( + op_name=self.op_name, + build_graph_fn=build_graph_fn, + benchmark_fn=benchmark_fn, + rtol=self.suite_def.rtol, + arg_valuerefs=self.generator.gen_arg_valueref_decls(), + atol=self.suite_def.atol, + ) + + +########################## +## Test File Generation ## +########################## + +cpp_test_template = """ +#include +#include +#include + +#include +#include +#include + +using namespace vkcompute; +using TensorOptions = at::TensorOptions; + +vkapi::ScalarType from_at_scalartype(c10::ScalarType at_scalartype) {{ + switch (at_scalartype) {{ + case c10::kFloat: + return vkapi::kFloat; + case c10::kHalf: + return vkapi::kHalf; + case c10::kInt: + return vkapi::kInt; + case c10::kLong: + return vkapi::kInt; + case c10::kChar: + return vkapi::kChar; + default: + VK_THROW("Unsupported 
at::ScalarType!"); + }} +}} + +at::Tensor make_rand_tensor( + std::vector sizes, + at::ScalarType dtype = at::kFloat, + float low = 0.0, + float high = 1.0) {{ + if (high == 1.0 && low == 0.0) + return at::rand(sizes, at::device(at::kCPU).dtype(dtype)); + + if (dtype == at::kChar) + return at::randint(high, sizes, at::device(at::kCPU).dtype(dtype)); + + return at::rand(sizes, at::device(at::kCPU).dtype(dtype)) * (high - low) + low; +}} + +at::Tensor make_seq_tensor( + std::vector sizes, + at::ScalarType dtype = at::kFloat, + float low = 0.0, + float high = 1.0) {{ + (void)low; + (void)high; + + int64_t n = 1; + for (auto size: sizes) {{ + n *= size; + }} + + std::vector values(n); + for (int i=0;i indices) {{ + at::ScalarType dtype = at::kInt; + std::vector sizes = {{static_cast(indices.size())}}; + + // Clone as original data will be deallocated upon return. + return at::from_blob(indices.data(), sizes, dtype).detach().clone(); +}} + +at::Tensor make_index_tensor(std::vector> indices) {{ + at::ScalarType dtype = at::kInt; + std::vector sizes = {{ + static_cast(indices.size()), + static_cast(indices[0].size())}}; + + // Flatten indices as from_blob reads garbage otherwise. + std::vector acc; + for (auto& vec: indices) {{ + acc.insert(acc.end(), vec.begin(), vec.end()); + }} + + // Clone as original data will be deallocated upon return. + return at::from_blob(acc.data(), sizes, dtype).detach().clone(); +}} + +at::Tensor make_index_tensor(std::vector>> indices) {{ + at::ScalarType dtype = at::kInt; + std::vector sizes = {{ + static_cast(indices.size()), + static_cast(indices[0].size()), + static_cast(indices[0][0].size())}}; + + // Flatten indices as from_blob reads garbage otherwise. + std::vector acc; + for (auto& v: indices) {{ + for (auto& vv: v) {{ + acc.insert(acc.end(), vv.begin(), vv.end()); + }} + }} + + // Clone as original data will be deallocated upon return. 
+ return at::from_blob(acc.data(), sizes, dtype).detach().clone(); +}} + +using ShaderEntry = std::tuple; +using QueryPoolResults = std::vector; +using ShaderTimes = std::unordered_map>; + +void process_querypool_results( + QueryPoolResults& results, + ShaderTimes& shader_times) {{ + for (const ShaderEntry& entry : results) {{ + std::string kernel_name = std::get<0>(entry); + std::uint64_t start_ns = std::get<2>(entry); + std::uint64_t end_ns = std::get<3>(entry); + std::uint64_t duration_ns = end_ns - start_ns; + if (shader_times.find(kernel_name) == shader_times.end()) {{ + shader_times[kernel_name] = std::vector(); + }} + shader_times[kernel_name].emplace_back(duration_ns); + }} +}} + +void register_shader_time_counters( + benchmark::State& state, + ShaderTimes& shader_times) {{ + for (auto& times_list : shader_times) {{ + // Filter to_nchw and nchw_to shaders + if (times_list.first.find("to_nchw") != std::string::npos) {{ + continue; + }} + if (times_list.first.find("nchw_to") != std::string::npos) {{ + continue; + }} + + std::sort(times_list.second.begin(), times_list.second.end()); + uint64_t median_time; + median_time = times_list.second[times_list.second.size() / 2]; + state.counters[times_list.first + " median ns"] = median_time; + }} +}} + +{benchmark_fixtures} + +{def_benchmarks} +""" + + +class VkBenchmarkFileGen: + def __init__(self, out_path): + self.out_path = out_path + self.suites_gens = [] + + def add_suite(self, op_reg_name: str, f: NativeFunction, all_input_cases) -> None: + suites_gen = VkBenchmarkGen(op_reg_name, f, all_input_cases) + self.suites_gens.append(suites_gen) + + def generate_benchmarks_cpp(self) -> str: + return "\n".join([h.generate_benchmark() for h in self.suites_gens]) + + def generate_benchmark_fixtures(self) -> str: + return "\n".join([h.generate_benchmark_fixture() for h in self.suites_gens]) + + def generate_cpp(self) -> str: + return cpp_test_template.format( + benchmark_fixtures=self.generate_benchmark_fixtures(), + def_benchmarks=self.generate_benchmarks_cpp(), + ) diff --git a/backends/vulkan/test/op_tests/utils/codegen.py b/backends/vulkan/test/op_tests/utils/gen_computegraph.py similarity index 76% rename from backends/vulkan/test/op_tests/utils/codegen.py rename to backends/vulkan/test/op_tests/utils/gen_computegraph.py index 0bccf64458c..f6ee9c78a14 100644 --- a/backends/vulkan/test/op_tests/utils/codegen.py +++ b/backends/vulkan/test/op_tests/utils/gen_computegraph.py @@ -6,15 +6,14 @@ import re from dataclasses import dataclass -from typing import Any, List, Optional, Union +from typing import List, Optional, Union -from executorch.backends.vulkan.test.op_tests.utils.codegen_base import ( +from executorch.backends.vulkan.test.op_tests.utils.aten_types import ( AT_INT_ARRAY_REF, AT_SCALAR, AT_TENSOR, AT_TENSOR_LIST, BOOL, - CppTestFileGen, DOUBLE, INT, OPT_AT_DOUBLE_ARRAY_REF, @@ -28,37 +27,20 @@ OPT_SCALAR_TYPE, STRING, TENSOR_VECTOR, - TestSuite, - TestSuiteGen, THREE_TENSOR_TUPLE, TWO_TENSOR_TUPLE, ) +from executorch.backends.vulkan.test.op_tests.utils.test_suite import TestSuite from torchgen.api import cpp from torchgen.api.types import CppSignatureGroup - from torchgen.gen import generate_static_dispatch_backend_call, translate_args - from torchgen.gen_aoti_c_shim import gen_static_dispatch_backend_call_signature from torchgen.model import NativeFunction, Variant -################################## -## Custom Test Suite Definition ## -################################## - - -@dataclass -class VkTestSuite(TestSuite): - def 
__init__(self, input_cases: List[Any]): - super().__init__(input_cases) - self.storage_types: List[str] = ["utils::kTexture3D"] - self.layouts: List[str] = ["utils::kChannelsPacked"] - self.data_gen: str = "make_rand_tensor" - - -########################## -## Code Generator Class ## -########################## +################################### +## Compute Graph Code Generation ## +################################### @dataclass @@ -105,6 +87,8 @@ def vk_out(self): class ComputeGraphGen: + backend_key = None + def __init__(self, op_reg_name: str, f: NativeFunction, suite_def: TestSuite): self.op_reg_name = op_reg_name self.f = f @@ -230,7 +214,7 @@ def gen_decl(self, fn_name: str, ret_type: str = "void") -> str: def create_aten_fn_call(self) -> str: func_call = generate_static_dispatch_backend_call( - self.f_sig, self.f, TestSuiteGen.backend_key + self.f_sig, self.f, ComputeGraphGen.backend_key )[7:].replace("::cpu", "") return func_call @@ -244,11 +228,12 @@ def create_aten_method_call(self) -> str: func_call = f"ATEN_FN({self.f_sig.name()})({exprs});" return func_call - def create_out_src(self) -> str: + def create_out_src(self, include_declarations: bool = True) -> str: + cpp_type = self.out.cpp_type if include_declarations else "" if Variant.function in self.f.variants: - return f"{self.out.cpp_type} out = " + self.create_aten_fn_call() + "\n" + return f"{cpp_type} out = " + self.create_aten_fn_call() + "\n" else: - return f"{self.out.cpp_type} out = " + self.create_aten_method_call() + "\n" + return f"{cpp_type} out = " + self.create_aten_method_call() + "\n" ## Graph code generation utils @@ -258,7 +243,28 @@ def prepack_ref(self, ref: ValueRef) -> bool: else: return ref.supports_prepack and self.should_prepack - def create_value_for(self, ref: ValueRefList) -> str: # noqa: C901 + def create_value_decl_for(self, ref: ValueRefList) -> str: # noqa: C901 + if isinstance(ref, list): + ret_str = "" + for r in ref: + ret_str += self.create_value_decl_for(r) + return ret_str + + cpp_type = "IOValueRef" if (ref.is_in or ref.requires_prepack) else "ValueRef" + if ref.src_cpp_type == AT_TENSOR_LIST: + ret_str = f"std::vector {ref.name}_io_value_refs;\n" + ret_str += f"std::vector {ref.name}_value_refs;\n" + return ret_str + elif ref.src_cpp_type == TENSOR_VECTOR: + ret_str = f"std::vector {ref.io_value_list_name};\n" + ret_str += f"std::vector {ref.value_list_name};\n" + return ret_str + else: + return f"{cpp_type} {ref.name};\n" + + def create_value_for( # noqa: C901 + self, ref: ValueRefList, include_declarations: bool = True + ) -> str: if isinstance(ref, list): ret_str = "" for r in ref: @@ -266,11 +272,19 @@ def create_value_for(self, ref: ValueRefList) -> str: # noqa: C901 return ret_str prepack = self.prepack_ref(ref) + ref_is_view = self.suite_def.is_view_op and ref.is_out cpp_type = "IOValueRef" if (ref.is_in and not prepack) else "ValueRef" + if not include_declarations: + cpp_type = "" if ref.src_cpp_type == OPT_AT_TENSOR: ret_str = f"{cpp_type} {ref.name} = " + if prepack: + ret_str = "" + if include_declarations: + ret_str += f"IOValueRef {ref.name};\n" + ret_str += f"{ref.name}.value = " ret_str += f"!{ref.src_cpp_name}.has_value() ? " ret_str += f"{self.graph}{self.dot}add_none() : " if not prepack: @@ -307,11 +321,13 @@ def create_value_for(self, ref: ValueRefList) -> str: # noqa: C901 # each tensor, to facilate staging. On the other hand, we will # use the .value tensor to create a ValueList, which will be passed # to the corresponding ops. 
- ret_str = f"std::vector {ref.name}_io_value_refs;\n" - ret_str += f"std::vector {ref.name}_value_refs;\n" + ret_str = "" + if include_declarations: + ret_str += f"std::vector {ref.name}_io_value_refs;\n" + ret_str += f"std::vector {ref.name}_value_refs;\n" ret_str += f"for (int i=0; i < {ref.src_cpp_name}.size(); i++) {{\n" ret_str += ( - f" {cpp_type} io_value_ref = {self.graph}{self.dot}add_input_tensor(\n" + f" IOValueRef io_value_ref = {self.graph}{self.dot}add_input_tensor(\n" ) ret_str += f" {ref.src_cpp_name}[i].sizes().vec(),\n" ret_str += ( @@ -323,9 +339,11 @@ def create_value_for(self, ref: ValueRefList) -> str: # noqa: C901 ret_str += f"ValueRef {ref.name} = {self.graph}{self.dot}add_value_list(std::move({ref.name}_value_refs));\n" return ret_str elif ref.src_cpp_type == TENSOR_VECTOR: - ret_str = f""" -std::vector {ref.io_value_list_name}; -std::vector {ref.value_list_name}; + ret_str = "" + if include_declarations: + ret_str += f"std::vector {ref.io_value_list_name};\n" + ret_str += f"std::vector {ref.value_list_name};\n" + ret_str += f""" for (int i=0; i str: # noqa: C901 return ret_str ret_str = f"{cpp_type} {ref.name} = {self.graph}{self.dot}" - if ref.src_cpp_type == AT_TENSOR and not prepack: + if prepack: + ret_str = "" + if include_declarations: + ret_str = f"IOValueRef {ref.name};\n" + ret_str += f"{ref.name}.value = {self.graph}{self.dot}" + + if ref.src_cpp_type == AT_TENSOR and ref_is_view: + input_name = None + for _name, ref in self.refs.items(): + if ref.is_in and ref.src_cpp_type == AT_TENSOR: + input_name = ref.name + + assert input_name is not None + ret_str += f"add_tensor_view({input_name}.value);" + elif ref.src_cpp_type == AT_TENSOR and not prepack: ret_str += "add_input_tensor(" if ref.is_in else "add_tensor(" ret_str += f"{ref.src_cpp_name}.sizes().vec(), " ret_str += f"from_at_scalartype({ref.src_cpp_name}.scalar_type())); \n" @@ -390,14 +422,29 @@ def create_op_call(self) -> str: else: op_create_code += ( f"{ref.name}.value, " - if (ref.is_in and not self.prepack_ref(ref)) or ref.is_out + if ref.is_in or ref.requires_prepack or ref.is_out else f"{ref.name}, " ) + # op_create_code += f"{ref.name}, " op_create_code += "out_ref});\n" return op_create_code - def set_output(self, ref: ValueRefList) -> str: + def gen_output_staging_valueref_decl(self, ref: ValueRefList) -> str: + if isinstance(ref, list): + ret_str = "" + for r in ref[:-1]: + ret_str += self.gen_output_staging_valueref_decl(r) + return ret_str + elif ref.src_cpp_type == TENSOR_VECTOR: + assert ref.is_out + ret_str = "" + return ret_str + + assert ref.src_cpp_type == AT_TENSOR and ref.is_out + return f"ValueRef {ref.name}_staging;\n" + + def set_output(self, ref: ValueRefList, include_declarations: bool = True) -> str: if isinstance(ref, list): ret_str = "" for r in ref[:-1]: @@ -414,7 +461,8 @@ def set_output(self, ref: ValueRefList) -> str: return ret_str assert ref.src_cpp_type == AT_TENSOR and ref.is_out - ret_str = f"ValueRef {ref.name}_staging = {self.graph}{self.dot}" + cpptype = "ValueRef" if include_declarations else "" + ret_str = f"{cpptype} {ref.name}_staging = {self.graph}{self.dot}" ret_str += f"set_output_tensor({ref.name});\n" return ret_str @@ -532,15 +580,28 @@ def check_graph_out(self, ref: ValueRefList) -> str: ## Top level code generation - def gen_graph_build_code(self) -> str: - graph_build = self.create_out_src() + def gen_arg_valueref_decls(self) -> str: + ret_str = "" for aten_arg in self.args: - graph_build += self.create_value_for(self.refs[aten_arg.name]) + ref 
= self.refs[aten_arg.name] + ret_str += self.create_value_decl_for(ref) - graph_build += self.create_value_for(self.refs["out"]) + ret_str += self.create_value_decl_for(self.refs["out"]) + ret_str += f"{self.out.cpp_type} out;\n" + ret_str += self.gen_output_staging_valueref_decl(self.refs["out"]) + return ret_str + + def gen_graph_build_code(self, include_declarations: bool = True) -> str: + graph_build = self.create_out_src(include_declarations) + for aten_arg in self.args: + graph_build += self.create_value_for( + self.refs[aten_arg.name], include_declarations + ) + + graph_build += self.create_value_for(self.refs["out"], include_declarations) graph_build += self.create_op_call() - graph_build += self.set_output(self.refs["out"]) + graph_build += self.set_output(self.refs["out"], include_declarations) graph_build += f"{self.graph}{self.dot}prepare();\n" graph_build += f"{self.graph}{self.dot}encode_prepack();\n" @@ -550,7 +611,7 @@ def gen_graph_build_code(self) -> str: graph_build += "\n" return graph_build - def gen_graph_exec_code(self) -> str: + def gen_graph_exec_code(self, check_output=True) -> str: graph_exec = "" for aten_arg in self.args: ref = self.refs[aten_arg.name] @@ -563,26 +624,27 @@ def gen_graph_exec_code(self) -> str: graph_exec += self.declare_vk_out_for(self.refs["out"]) graph_exec += self.copy_from_staging(self.refs["out"]) - graph_exec += self.check_graph_out(self.refs["out"]) + if check_output: + graph_exec += self.check_graph_out(self.refs["out"]) graph_exec = re.sub(r"^", " ", graph_exec, flags=re.M) graph_exec = "{\n" + graph_exec + "\n}" return graph_exec - def gen_conditional_skips(self) -> str: + def gen_conditional_skips(self, skip_str: str = "GTEST_SKIP();") -> str: fp16_skip = f"if (!{self.graph}{self.dot}context()->adapter_ptr()->has_full_float16_buffers_support()) {{\n" - fp16_skip += " GTEST_SKIP();\n" + fp16_skip += f" {skip_str}\n" fp16_skip += "}" fp16_skip = re.sub(r"^", " ", fp16_skip, flags=re.M) + "\n" int8_skip = f"if (!{self.graph}{self.dot}context()->adapter_ptr()->has_full_int8_buffers_support()) {{\n" - int8_skip += " GTEST_SKIP();\n" + int8_skip += f" {skip_str};\n" int8_skip += "}\n" skips = "" - skips = "if (test_dtype == at::kHalf) {\n" + skips += "if (test_dtype == at::kHalf) {\n" skips += fp16_skip skips += "}\n" @@ -596,6 +658,9 @@ def gen_conditional_skips(self) -> str: def gen_op_check_fn(self) -> str: op_name = self.f.func.name.unambiguous_name() + if self.suite_def.test_name_suffix is not None: + op_name += "_" + self.suite_def.test_name_suffix + op_check_fn = self.gen_decl(f"check_{op_name}") + " {\n" if self.should_prepack: op_check_fn = self.gen_decl(f"prepacked_check_{op_name}") + " {\n" @@ -612,146 +677,36 @@ def gen_op_check_fn(self) -> str: return op_check_fn + def gen_build_graph_fn(self, include_declarations: bool = False) -> str: + op_name = self.f.func.name.unambiguous_name() + if self.suite_def.test_name_suffix is not None: + op_name += "_" + self.suite_def.test_name_suffix + op_build_graph_fn = self.gen_decl(f"build_graph_{op_name}") + " {\n" + if self.should_prepack: + op_build_graph_fn = ( + self.gen_decl(f"prepacked_build_graph_{op_name}") + " {\n" + ) -################################## -## Test Fixture Code Generation ## -################################## - -test_fixture_template = """ -class GeneratedOpsTest_{op_name} : public ::testing::TestWithParam< ::std::tuple> {{ - protected: - ComputeGraph* graph; - at::ScalarType test_dtype = at::kFloat; - float rtol = {rtol}; - float atol = {atol}; - - void SetUp() 
override {{ - GraphConfig config; - utils::StorageType default_storage_type; - utils::GPUMemoryLayout default_memory_layout; - std::tie(test_dtype, default_storage_type, default_memory_layout) = GetParam(); - config.set_storage_type_override(default_storage_type); - config.set_memory_layout_override(default_memory_layout); - graph = new ComputeGraph(config); - - if (test_dtype == at::kHalf) {{ - rtol = 1e-2; - atol = 1e-2; - }} - }} - - void TearDown() override {{ - delete graph; - graph = nullptr; - }} - - {check_fn} -}}; -""" - - -class VkTestSuiteGen(TestSuiteGen): - def __init__(self, op_reg_name: str, f: NativeFunction, inputs: VkTestSuite): - super().__init__(f, inputs) - self.op_reg_name = op_reg_name - self.generator = ComputeGraphGen(self.op_reg_name, self.f, self.suite_def) - - def generate_fixture_cpp(self) -> str: - check_fn = "" - if not self.suite_def.requires_prepack: - check_fn = self.generator.gen_op_check_fn() - - prepacked_check_fn = "" - if self.suite_def.supports_prepack(): - self.generator.should_prepack = True - prepacked_check_fn = self.generator.gen_op_check_fn() - check_fn += "\n\n " - check_fn += prepacked_check_fn - - return test_fixture_template.format( - op_name=self.op_name, - check_fn=check_fn, - rtol=self.suite_def.rtol, - atol=self.suite_def.atol, - ) + op_build_graph_fn_body = "" + op_build_graph_fn_body += self.gen_graph_build_code(include_declarations) - def gen_parameterization(self) -> str: - dtypes = self.suite_def.dtypes - storage_types = self.suite_def.storage_types - layouts = self.suite_def.layouts - - return f""" -INSTANTIATE_TEST_SUITE_P( - Combos_{self.op_name}, - GeneratedOpsTest_{self.op_name}, - ::testing::Combine( - ::testing::Values({', '.join(dtypes)}), - ::testing::Values({', '.join(storage_types)}), - ::testing::Values({', '.join(layouts)}))); - """ - - -############################## -## Test File Code Generation ## -############################### - -preamble_str = """ -#include -#include -#include - -#include - -using namespace vkcompute; -using TensorOptions = at::TensorOptions; - -vkapi::ScalarType from_at_scalartype(c10::ScalarType at_scalartype) { - switch (at_scalartype) { - case c10::kFloat: - return vkapi::kFloat; - case c10::kHalf: - return vkapi::kHalf; - case c10::kInt: - return vkapi::kInt; - case c10::kLong: - return vkapi::kInt; - case c10::kChar: - return vkapi::kChar; - default: - VK_THROW("Unsupported at::ScalarType!"); - } -} - -#ifdef USE_VULKAN_FP16_INFERENCE -bool check_close(at::Tensor& t1, at::Tensor& t2, float rtol=1e-2, float atol=1e-2) { -#else -bool check_close(at::Tensor& t1, at::Tensor& t2, float rtol=1e-5, float atol=1e-5) { -#endif - // Skip checking index tensors - if (t1.scalar_type() == at::kLong || t2.scalar_type() == at::kLong) { - return true; - } - bool is_close = at::allclose(t1, t2, rtol, atol); - if (!is_close && t1.numel() < 500) { - std::cout << "reference: " << std::endl; - print(t1, 150); - std::cout << std::endl; - std::cout << "vulkan: " << std::endl; - print(t2, 150); - std::cout << std::endl; - } - return is_close; -} -""" + op_build_graph_fn += op_build_graph_fn_body + op_build_graph_fn += "\n }" + return op_build_graph_fn + def gen_op_exec_graph_fn(self) -> str: + op_name = self.f.func.name.unambiguous_name() + if self.suite_def.test_name_suffix is not None: + op_name += "_" + self.suite_def.test_name_suffix + op_benchmark_fn = self.gen_decl(f"benchmark_{op_name}") + " {\n" + if self.should_prepack: + op_benchmark_fn = self.gen_decl(f"prepacked_benchmark_{op_name}") + " {\n" -class 
VkCppTestFileGen(CppTestFileGen): - def __init__(self, out_path: str): - super().__init__(out_path) + op_benchmark_fn_body = "" + op_benchmark_fn_body += self.gen_graph_exec_code(False) - def generate_preamble(self) -> str: - return preamble_str + op_benchmark_fn_body = re.sub(r"^", " ", op_benchmark_fn_body, flags=re.M) - def add_suite(self, op_reg_name: str, f: NativeFunction, all_input_cases) -> None: - suites_gen = VkTestSuiteGen(op_reg_name, f, all_input_cases) - self.suites_gens.append(suites_gen) + op_benchmark_fn += op_benchmark_fn_body + op_benchmark_fn += "\n }" + return op_benchmark_fn diff --git a/backends/vulkan/test/op_tests/utils/codegen_base.py b/backends/vulkan/test/op_tests/utils/gen_correctness_base.py similarity index 87% rename from backends/vulkan/test/op_tests/utils/codegen_base.py rename to backends/vulkan/test/op_tests/utils/gen_correctness_base.py index 5b3ca0908cf..def3508a8a7 100644 --- a/backends/vulkan/test/op_tests/utils/codegen_base.py +++ b/backends/vulkan/test/op_tests/utils/gen_correctness_base.py @@ -7,60 +7,31 @@ import re from typing import Any, List +from executorch.backends.vulkan.test.op_tests.utils.aten_types import ( + AT_INT_ARRAY_REF, + AT_SCALAR, + AT_TENSOR, + AT_TENSOR_LIST, + BOOL, + DOUBLE, + INT, + OPT_AT_DOUBLE_ARRAY_REF, + OPT_AT_INT_ARRAY_REF, + OPT_AT_TENSOR, + OPT_BOOL, + OPT_DEVICE, + OPT_INT64, + OPT_LAYOUT, + OPT_MEMORY_FORMAT, + OPT_SCALAR_TYPE, + STRING, +) +from executorch.backends.vulkan.test.op_tests.utils.test_suite import TestSuite + from torchgen.api import cpp from torchgen.api.types import CppSignatureGroup from torchgen.model import Argument, NativeFunction -######################## -## ATen code patterns ## -######################## - -AT_INT_ARRAY_REF = "at::IntArrayRef" -AT_SCALAR = "at::Scalar" -AT_TENSOR = "at::Tensor" -AT_TENSOR_LIST = "at::TensorList" -BOOL = "bool" -DOUBLE = "double" -INT = "int64_t" -OPT_AT_DOUBLE_ARRAY_REF = "::std::optional>" -OPT_AT_INT_ARRAY_REF = "at::OptionalIntArrayRef" -OPT_AT_TENSOR = "::std::optional" -OPT_BOOL = "::std::optional" -OPT_INT64 = "::std::optional" -OPT_DEVICE = "::std::optional" -OPT_LAYOUT = "::std::optional" -OPT_MEMORY_FORMAT = "::std::optional" -OPT_SCALAR_TYPE = "::std::optional" -STRING = "c10::string_view" -TWO_TENSOR_TUPLE = "::std::tuple" -THREE_TENSOR_TUPLE = "::std::tuple" -TENSOR_VECTOR = "::std::vector" - -########################### -## Test Suite definition ## -########################### - - -class TestSuite: - def __init__(self, input_cases: List[Any]): - self.input_cases: List[Any] = input_cases - self.prepacked_args: List[str] = [] - self.requires_prepack: bool = False - self.dtypes: List[str] = ["at::kFloat", "at::kHalf"] - - self.data_gen: str = "make_rand_tensor" - self.data_range = (0, 1) - - self.arg_dtype = {} - self.arg_data_range = {} - - self.atol: str = "1e-5" - self.rtol: str = "1e-5" - - def supports_prepack(self): - return len(self.prepacked_args) > 0 - - ########################## ## Test Suite Generation ## ########################## @@ -103,13 +74,13 @@ def get_or_return_default(arg: Argument, inputs: List[Any], i: int): return arg.default -class TestSuiteGen: - backend_key = None - +class CorrectnessTestGen: def __init__(self, f: NativeFunction, test_suite: TestSuite): self.f = f self.suite_def = test_suite self.op_name = f.func.name.unambiguous_name() + if test_suite.test_name_suffix is not None: + self.op_name += f"_{test_suite.test_name_suffix}" self.f_sig = CppSignatureGroup.from_native_function( self.f, method=False, 
fallback_binding=self.f.manual_cpp_binding @@ -377,7 +348,7 @@ def generate_suite_cpp(self) -> str: """ -class CppTestFileGen: +class CorrectnessTestFileGen: def __init__(self, out_path): self.out_path = out_path self.suites_gens = [] @@ -395,5 +366,5 @@ def generate_test_suites_cpp(self) -> str: return "\n".join([h.generate_suite_cpp() for h in self.suites_gens]) def add_suite(self, op_reg_name: str, f: NativeFunction, all_input_cases) -> None: - suites_gen = TestSuiteGen(f, all_input_cases) + suites_gen = CorrectnessTestGen(f, all_input_cases) self.suites_gens.append(suites_gen) diff --git a/backends/vulkan/test/op_tests/utils/gen_correctness_vk.py b/backends/vulkan/test/op_tests/utils/gen_correctness_vk.py new file mode 100644 index 00000000000..6c165a777db --- /dev/null +++ b/backends/vulkan/test/op_tests/utils/gen_correctness_vk.py @@ -0,0 +1,159 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +from executorch.backends.vulkan.test.op_tests.utils.gen_computegraph import ( + ComputeGraphGen, +) +from executorch.backends.vulkan.test.op_tests.utils.gen_correctness_base import ( + CorrectnessTestFileGen, + CorrectnessTestGen, +) +from executorch.backends.vulkan.test.op_tests.utils.test_suite import VkTestSuite + +from torchgen.model import NativeFunction + +################################## +## Test Fixture Code Generation ## +################################## + +test_fixture_template = """ +class GeneratedOpsTest_{op_name} : public ::testing::TestWithParam< ::std::tuple> {{ + protected: + ComputeGraph* graph; + at::ScalarType test_dtype = at::kFloat; + float rtol = {rtol}; + float atol = {atol}; + + void SetUp() override {{ + GraphConfig config; + utils::StorageType default_storage_type; + utils::GPUMemoryLayout default_memory_layout; + std::tie(test_dtype, default_storage_type, default_memory_layout) = GetParam(); + config.set_storage_type_override(default_storage_type); + config.set_memory_layout_override(default_memory_layout); + graph = new ComputeGraph(config); + + if (test_dtype == at::kHalf) {{ + rtol = 1e-2; + atol = 1e-2; + }} + }} + + void TearDown() override {{ + delete graph; + graph = nullptr; + }} + + {check_fn} +}}; +""" + + +class VkCorrectnessTestGen(CorrectnessTestGen): + def __init__(self, op_reg_name: str, f: NativeFunction, inputs: VkTestSuite): + super().__init__(f, inputs) + self.op_reg_name = op_reg_name + self.generator = ComputeGraphGen(self.op_reg_name, self.f, self.suite_def) + + def generate_fixture_cpp(self) -> str: + check_fn = "" + if not self.suite_def.requires_prepack: + check_fn = self.generator.gen_op_check_fn() + + prepacked_check_fn = "" + if self.suite_def.supports_prepack(): + self.generator.should_prepack = True + prepacked_check_fn = self.generator.gen_op_check_fn() + check_fn += "\n\n " + check_fn += prepacked_check_fn + + return test_fixture_template.format( + op_name=self.op_name, + check_fn=check_fn, + rtol=self.suite_def.rtol, + atol=self.suite_def.atol, + ) + + def gen_parameterization(self) -> str: + dtypes = self.suite_def.dtypes + storage_types = self.suite_def.storage_types + layouts = self.suite_def.layouts + + return f""" +INSTANTIATE_TEST_SUITE_P( + Combos_{self.op_name}, + GeneratedOpsTest_{self.op_name}, + ::testing::Combine( + ::testing::Values({', '.join(dtypes)}), + ::testing::Values({', '.join(storage_types)}), + ::testing::Values({', '.join(layouts)}))); + 
""" + + +############################## +## Test File Code Generation ## +############################### + +preamble_str = """ +#include +#include +#include + +#include + +using namespace vkcompute; +using TensorOptions = at::TensorOptions; + +vkapi::ScalarType from_at_scalartype(c10::ScalarType at_scalartype) { + switch (at_scalartype) { + case c10::kFloat: + return vkapi::kFloat; + case c10::kHalf: + return vkapi::kHalf; + case c10::kInt: + return vkapi::kInt; + case c10::kLong: + return vkapi::kInt; + case c10::kChar: + return vkapi::kChar; + default: + VK_THROW("Unsupported at::ScalarType!"); + } +} + +#ifdef USE_VULKAN_FP16_INFERENCE +bool check_close(at::Tensor& t1, at::Tensor& t2, float rtol=1e-2, float atol=1e-2) { +#else +bool check_close(at::Tensor& t1, at::Tensor& t2, float rtol=1e-5, float atol=1e-5) { +#endif + // Skip checking index tensors + if (t1.scalar_type() == at::kLong || t2.scalar_type() == at::kLong) { + return true; + } + bool is_close = at::allclose(t1, t2, rtol, atol); + if (!is_close && t1.numel() < 500) { + std::cout << "reference: " << std::endl; + print(t1, 150); + std::cout << std::endl; + std::cout << "vulkan: " << std::endl; + print(t2, 150); + std::cout << std::endl; + } + return is_close; +} +""" + + +class VkCorrectnessTestFileGen(CorrectnessTestFileGen): + def __init__(self, out_path: str): + super().__init__(out_path) + + def generate_preamble(self) -> str: + return preamble_str + + def add_suite(self, op_reg_name: str, f: NativeFunction, all_input_cases) -> None: + suites_gen = VkCorrectnessTestGen(op_reg_name, f, all_input_cases) + self.suites_gens.append(suites_gen) diff --git a/backends/vulkan/test/op_tests/utils/test_suite.py b/backends/vulkan/test/op_tests/utils/test_suite.py new file mode 100644 index 00000000000..dd01bdde3a4 --- /dev/null +++ b/backends/vulkan/test/op_tests/utils/test_suite.py @@ -0,0 +1,49 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +from dataclasses import dataclass +from typing import Any, List, Optional + +################################### +## Generic Test Suite definition ## +################################### + + +class TestSuite: + def __init__(self, input_cases: List[Any]): + self.input_cases: List[Any] = input_cases + self.prepacked_args: List[str] = [] + self.requires_prepack: bool = False + self.dtypes: List[str] = ["at::kFloat", "at::kHalf"] + + self.data_gen: str = "make_rand_tensor" + self.data_range = (0, 1) + + self.arg_dtype = {} + self.arg_data_range = {} + + self.atol: str = "1e-5" + self.rtol: str = "1e-5" + + self.is_view_op: bool = False + self.test_name_suffix: Optional[str] = None + + def supports_prepack(self): + return len(self.prepacked_args) > 0 + + +################################## +## Vulkan Test Suite Definition ## +################################## + + +@dataclass +class VkTestSuite(TestSuite): + def __init__(self, input_cases: List[Any]): + super().__init__(input_cases) + self.storage_types: List[str] = ["utils::kTexture3D"] + self.layouts: List[str] = ["utils::kChannelsPacked"] + self.data_gen: str = "make_rand_tensor" diff --git a/backends/vulkan/test/test_vulkan_delegate.py b/backends/vulkan/test/test_vulkan_delegate.py index d80809ec79f..d1303d14ebb 100644 --- a/backends/vulkan/test/test_vulkan_delegate.py +++ b/backends/vulkan/test/test_vulkan_delegate.py @@ -204,6 +204,16 @@ def forward(self, x, y, w): self.lower_module_and_test_output(add_module, sample_inputs) + sample_inputs = ( + torch.rand(size=(4, 5, 2, 3), dtype=torch.float32), + torch.rand(size=(4, 5, 2, 3), dtype=torch.float32), + torch.rand( + size=(2, 3), dtype=torch.float32 + ), # test broadcasting on packed dim + ) + + self.lower_module_and_test_output(add_module, sample_inputs) + def test_vulkan_backend_add_int(self): class AddIntModule(torch.nn.Module): def __init__(self): @@ -1377,6 +1387,9 @@ def forward(self, x): memory_layouts=[vk_graph_schema.VkMemoryLayout.TENSOR_CHANNELS_PACKED], ) + @unittest.skip( + "Softmax shader with shared memory does not work with swiftshader due to potential swiftshader bug" + ) def test_vulkan_backend_softmax(self): class SoftmaxModule(torch.nn.Module): def __init__(self): @@ -1396,6 +1409,9 @@ def forward(self, x): memory_layouts=[vk_graph_schema.VkMemoryLayout.TENSOR_CHANNELS_PACKED], ) + @unittest.skip( + "Softmax shader with shared memory does not work with swiftshader due to potential swiftshader bug" + ) def test_vulkan_backend_logsoftmax(self): class LogSoftmaxModule(torch.nn.Module): def __init__(self): @@ -1633,6 +1649,20 @@ def forward(self, x): memory_layouts=[vk_graph_schema.VkMemoryLayout.TENSOR_CHANNELS_PACKED], ) + def test_vulkan_backend_flip(self): + class FlipModule(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x): + return torch.flip(x, [0, 1, 2, 3]) + + self.lower_module_and_test_output( + FlipModule(), + (torch.arange(48).reshape(2, 3, 4, 2),), + memory_layouts=[vk_graph_schema.VkMemoryLayout.TENSOR_CHANNELS_PACKED], + ) + def test_vulkan_backend_conv_with_clamp(self): class ConvWithClampModule(torch.nn.Module): def __init__(self): diff --git a/backends/vulkan/test/utils/test_utils.cpp b/backends/vulkan/test/utils/test_utils.cpp index 4feaecced53..e4ada921226 100644 --- a/backends/vulkan/test/utils/test_utils.cpp +++ b/backends/vulkan/test/utils/test_utils.cpp @@ -8,13 +8,15 @@ #include -#include +#include #include #include #include +using namespace vkcompute; + // // Operator Recording Functions // @@ -68,15 +70,14 @@ 
void record_nchw_to_image_op( vkapi::VulkanBuffer& src_buffer, api::vTensor& v_dst) { vkapi::PipelineBarrier pipeline_barrier{}; - vkapi::SpecVarList specialization_constants = { - SV(v_dst.packed_dim_whcn_idx())}; + vkapi::SpecVarList specialization_constants = {SV(v_dst.packed_dim())}; context->submit_compute_job( get_nchw_to_tensor_shader( v_dst, context->adapter_ptr()->has_full_int8_buffers_support()), pipeline_barrier, - v_dst.image_extents(), - adaptive_work_group_size(v_dst.image_extents()), + v_dst.logical_limits(), + adaptive_work_group_size(v_dst.logical_limits()), specialization_constants, VK_NULL_HANDLE, 0, @@ -86,7 +87,7 @@ void record_nchw_to_image_op( vkapi::MemoryAccessType::WRITE), src_buffer, v_dst.sizes_ubo(), - v_dst.axis_mapping_ubo()); + v_dst.axis_map_ubo()); } void record_image_to_nchw_op( @@ -94,21 +95,20 @@ void record_image_to_nchw_op( api::vTensor& v_src, vkapi::VulkanBuffer& dst_buffer) { vkapi::PipelineBarrier pipeline_barrier{}; - vkapi::SpecVarList specialization_constants = { - SV(v_src.packed_dim_whcn_idx())}; + vkapi::SpecVarList specialization_constants = {SV(v_src.packed_dim())}; context->submit_compute_job( get_tensor_to_nchw_shader(v_src), pipeline_barrier, - v_src.image_extents(), - adaptive_work_group_size(v_src.image_extents()), + v_src.logical_limits(), + adaptive_work_group_size(v_src.logical_limits()), specialization_constants, VK_NULL_HANDLE, 0, dst_buffer, v_src.image(pipeline_barrier, vkapi::PipelineStage::COMPUTE), v_src.sizes_ubo(), - v_src.axis_mapping_ubo()); + v_src.axis_map_ubo()); } void record_int8_image_to_nchw_noint8_op( @@ -123,13 +123,13 @@ void record_int8_image_to_nchw_noint8_op( pipeline_barrier, global_wg_size, adaptive_work_group_size(global_wg_size), - {v_src.packed_dim_whcn_idx()}, + {v_src.packed_dim()}, VK_NULL_HANDLE, 0, dst_buffer.buffer(), v_src.image(pipeline_barrier, vkapi::PipelineStage::COMPUTE), v_src.sizes_ubo(), - v_src.axis_mapping_ubo(), + v_src.axis_map_ubo(), v_src.numel_ubo()); } @@ -158,8 +158,8 @@ void record_conv2d_prepack_weights_op( context->submit_compute_job( shader, pipeline_barrier, - v_dst.image_extents(), - adaptive_work_group_size(v_dst.image_extents()), + v_dst.logical_limits(), + adaptive_work_group_size(v_dst.logical_limits()), specialization_constants, VK_NULL_HANDLE, 0, @@ -186,8 +186,8 @@ void record_binary_op( context->submit_compute_job( VK_KERNEL_FROM_STR(kernel_name), pipeline_barrier, - v_dst.image_extents(), - adaptive_work_group_size(v_dst.image_extents()), + v_dst.logical_limits(), + adaptive_work_group_size(v_dst.logical_limits()), specialization_constants, VK_NULL_HANDLE, 0, @@ -314,6 +314,42 @@ void record_reference_matmul( mat2.strides_ubo()); } +void record_matmul_texture3d( + api::Context* context, + api::vTensor& out, + api::vTensor& mat1, + api::vTensor& mat2) { + std::string kernel_name = "matmul_naive"; + kernel_name.reserve(kShaderNameReserve); + add_storage_type_suffix(kernel_name, out.storage_type()); + add_dtype_suffix(kernel_name, out.dtype()); + + utils::uvec3 global_wg_size = out.logical_limits(); + + vkapi::PipelineBarrier pipeline_barrier{}; + api::context()->submit_compute_job( + VK_KERNEL_FROM_STR(kernel_name), + pipeline_barrier, + global_wg_size, + {8, 8, 1}, + {out.packed_dim(), mat1.packed_dim(), mat2.packed_dim()}, + VK_NULL_HANDLE, + 0, + out.image( + pipeline_barrier, + vkapi::PipelineStage::COMPUTE, + vkapi::MemoryAccessType::WRITE), + mat1.image(pipeline_barrier, vkapi::PipelineStage::COMPUTE), + mat2.image(pipeline_barrier, 
vkapi::PipelineStage::COMPUTE), + out.sizes_ubo(), + out.logical_limits_ubo(), + out.axis_map_ubo(), + mat1.sizes_ubo(), + mat1.axis_map_ubo(), + mat2.sizes_ubo(), + mat2.axis_map_ubo()); +} + // // Input & Output Utilities // @@ -322,7 +358,7 @@ void record_reference_matmul( _(uint8_t, Byte) \ _(int8_t, Char) \ _(int32_t, Int) \ - _(torch::executor::Half, Half) \ + _(executorch::aten::Half, Half) \ _(float, Float) \ _(int8_t, QInt8) @@ -457,8 +493,10 @@ void submit_to_gpu() { } vkapi::Allocation allocate_memory_for(const api::vTensor& vten) { + VmaAllocationCreateInfo alloc_create_info = + api::context()->adapter_ptr()->vma().gpuonly_resource_create_info(); return api::context()->adapter_ptr()->vma().create_allocation( - vten.get_memory_requirements(), vten.get_allocation_create_info()); + vten.get_memory_requirements(), alloc_create_info); } VmaTotalStatistics get_vma_stats() { diff --git a/backends/vulkan/test/utils/test_utils.h b/backends/vulkan/test/utils/test_utils.h index 25163e664bf..d9d83a9620f 100644 --- a/backends/vulkan/test/utils/test_utils.h +++ b/backends/vulkan/test/utils/test_utils.h @@ -16,11 +16,9 @@ #include #include -using namespace vkcompute; - #define CREATE_FLOAT_TEXTURE(sizes, allocate_memory) \ - api::vTensor( \ - api::context(), \ + vkcompute::api::vTensor( \ + vkcompute::api::context(), \ sizes, \ vkapi::kFloat, \ utils::StorageType::TEXTURE_3D, \ @@ -28,25 +26,29 @@ using namespace vkcompute; allocate_memory); #define CREATE_FLOAT_BUFFER(sizes, allocate_memory) \ - api::vTensor( \ - api::context(), \ + vkcompute::api::vTensor( \ + vkcompute::api::context(), \ sizes, \ vkapi::kFloat, \ utils::StorageType::BUFFER, \ utils::GPUMemoryLayout::TENSOR_WIDTH_PACKED, \ allocate_memory); -#define DEFINE_STAGING_BUFFER_AND_RECORD_TO_GPU_FOR(tensor) \ - api::StagingBuffer staging_buffer_##tensor( \ - api::context(), vkapi::kFloat, tensor.staging_buffer_numel()); \ - record_nchw_to_image_op( \ - api::context(), staging_buffer_##tensor.buffer(), tensor); - -#define DEFINE_STAGING_BUFFER_AND_RECORD_FROM_GPU_FOR(tensor) \ - api::StagingBuffer staging_buffer_##tensor( \ - api::context(), vkapi::kFloat, tensor.staging_buffer_numel()); \ - record_image_to_nchw_op( \ - api::context(), tensor, staging_buffer_##tensor.buffer()); +#define DEFINE_STAGING_BUFFER_AND_RECORD_TO_GPU_FOR(tensor) \ + vkcompute::api::StagingBuffer staging_buffer_##tensor( \ + vkcompute::api::context(), \ + vkapi::kFloat, \ + tensor.staging_buffer_numel()); \ + record_nchw_to_image_op( \ + vkcompute::api::context(), staging_buffer_##tensor.buffer(), tensor); + +#define DEFINE_STAGING_BUFFER_AND_RECORD_FROM_GPU_FOR(tensor) \ + vkcompute::api::StagingBuffer staging_buffer_##tensor( \ + vkcompute::api::context(), \ + vkapi::kFloat, \ + tensor.staging_buffer_numel()); \ + record_image_to_nchw_op( \ + vkcompute::api::context(), tensor, staging_buffer_##tensor.buffer()); #define CHECK_VALUE(data, idx, expected) \ do { \ @@ -63,70 +65,80 @@ using namespace vkcompute; // void record_nchw_to_buffer_op( - api::Context* const context, - vkapi::VulkanBuffer& src_buffer, - api::vTensor& v_dst); + vkcompute::api::Context* const context, + vkcompute::vkapi::VulkanBuffer& src_buffer, + vkcompute::api::vTensor& v_dst); void record_buffer_to_nchw_op( - api::Context* const context, - api::vTensor& v_src, - vkapi::VulkanBuffer& dst_buffer); + vkcompute::api::Context* const context, + vkcompute::api::vTensor& v_src, + vkcompute::vkapi::VulkanBuffer& dst_buffer); void record_nchw_to_image_op( - api::Context* const context, - 
vkapi::VulkanBuffer& src_buffer, - api::vTensor& v_dst); + vkcompute::api::Context* const context, + vkcompute::vkapi::VulkanBuffer& src_buffer, + vkcompute::api::vTensor& v_dst); void record_image_to_nchw_op( - api::Context* const context, - api::vTensor& v_src, - vkapi::VulkanBuffer& dst_buffer); + vkcompute::api::Context* const context, + vkcompute::api::vTensor& v_src, + vkcompute::vkapi::VulkanBuffer& dst_buffer); void record_int8_image_to_nchw_noint8_op( - api::Context* const context, - api::vTensor& v_src, - api::StagingBuffer& dst_buffer); + vkcompute::api::Context* const context, + vkcompute::api::vTensor& v_src, + vkcompute::api::StagingBuffer& dst_buffer); void record_conv2d_prepack_weights_op( - api::Context* const context, - vkapi::VulkanBuffer& src_buffer, - api::vTensor& v_dst, + vkcompute::api::Context* const context, + vkcompute::vkapi::VulkanBuffer& src_buffer, + vkcompute::api::vTensor& v_dst, const std::vector& original_sizes, const bool transposed); void record_binary_op( - api::Context* const context, + vkcompute::api::Context* const context, const std::string& op_name, - api::vTensor& v_in1, - api::vTensor& v_in2, - api::vTensor& v_dst); + vkcompute::api::vTensor& v_in1, + vkcompute::api::vTensor& v_in2, + vkcompute::api::vTensor& v_dst); void execute_and_check_add( - api::vTensor& a, - api::vTensor& b, - api::vTensor& c, + vkcompute::api::vTensor& a, + vkcompute::api::vTensor& b, + vkcompute::api::vTensor& c, float a_val, float b_val); -void record_index_fill_buffer(api::Context* const context, api::vTensor& v_ten); +void record_index_fill_buffer( + vkcompute::api::Context* const context, + vkcompute::api::vTensor& v_ten); void record_scalar_add_buffer( - api::Context* context, - api::vTensor& v_ten, + vkcompute::api::Context* context, + vkcompute::api::vTensor& v_ten, float offset); void record_reference_matmul( - api::Context* context, - api::vTensor& out, - api::vTensor& mat1, - api::vTensor& mat2); + vkcompute::api::Context* context, + vkcompute::api::vTensor& out, + vkcompute::api::vTensor& mat1, + vkcompute::api::vTensor& mat2); + +void record_matmul_texture3d( + vkcompute::api::Context* context, + vkcompute::api::vTensor& out, + vkcompute::api::vTensor& mat1, + vkcompute::api::vTensor& mat2); // // Input & Output Utilities // -inline void -fill_staging(api::StagingBuffer& staging, float val, int numel = -1) { +inline void fill_staging( + vkcompute::api::StagingBuffer& staging, + float val, + int numel = -1) { if (numel < 0) { numel = staging.numel(); } @@ -135,9 +147,9 @@ fill_staging(api::StagingBuffer& staging, float val, int numel = -1) { staging.copy_from(data.data(), sizeof(float) * numel); } -void fill_vtensor(api::vTensor& vten, std::vector& data); +void fill_vtensor(vkcompute::api::vTensor& vten, std::vector& data); -void fill_vtensor(api::vTensor& vten, float val, bool iota = false); +void fill_vtensor(vkcompute::api::vTensor& vten, float val, bool iota = false); std::vector create_random_float_buffer( const size_t numel, @@ -150,21 +162,23 @@ std::vector create_random_uint8_buffer( const uint8_t max = 255); void fill_vtensor( - ComputeGraph& graph, - const IOValueRef idx, + vkcompute::ComputeGraph& graph, + const vkcompute::IOValueRef idx, float val, bool iota = false); -void extract_vtensor(api::vTensor& vten, std::vector& data); +void extract_vtensor(vkcompute::api::vTensor& vten, std::vector& data); -inline std::vector extract_vtensor(api::vTensor& vten) { +inline std::vector extract_vtensor(vkcompute::api::vTensor& vten) { std::vector 
data_out(vten.staging_buffer_numel()); extract_vtensor(vten, data_out); return data_out; } -inline void -check_staging_buffer(api::StagingBuffer& staging, float val, int numel = -1) { +inline void check_staging_buffer( + vkcompute::api::StagingBuffer& staging, + float val, + int numel = -1) { if (numel < 0) { numel = staging.numel(); } @@ -177,21 +191,21 @@ check_staging_buffer(api::StagingBuffer& staging, float val, int numel = -1) { } inline int64_t get_buf_idx( - ComputeGraph& graph, - IOValueRef ref, + vkcompute::ComputeGraph& graph, + vkcompute::IOValueRef ref, const std::vector& tensor_coor) { - vTensorPtr vten_ptr = graph.get_tensor(ref.value); + vkcompute::vTensorPtr vten_ptr = graph.get_tensor(ref.value); const std::vector& sizes = vten_ptr->sizes(); - int64_t c = dim_at(sizes); - int64_t h = dim_at(sizes); - int64_t w = dim_at(sizes); + int64_t c = vkcompute::dim_at(sizes); + int64_t h = vkcompute::dim_at(sizes); + int64_t w = vkcompute::dim_at(sizes); - int64_t ni = dim_at(tensor_coor); - int64_t ci = dim_at(tensor_coor); - int64_t hi = dim_at(tensor_coor); - int64_t wi = dim_at(tensor_coor); + int64_t ni = vkcompute::dim_at(tensor_coor); + int64_t ci = vkcompute::dim_at(tensor_coor); + int64_t hi = vkcompute::dim_at(tensor_coor); + int64_t wi = vkcompute::dim_at(tensor_coor); return (ni * c * h * w + ci * h * w + hi * w + wi); } @@ -202,7 +216,8 @@ inline int64_t get_buf_idx( void submit_to_gpu(); -vkapi::Allocation allocate_memory_for(const api::vTensor& vten); +vkcompute::vkapi::Allocation allocate_memory_for( + const vkcompute::api::vTensor& vten); VmaTotalStatistics get_vma_stats(); @@ -213,7 +228,7 @@ size_t get_vma_allocation_count(); // void execute_graph_and_check_output( - ComputeGraph& graph, + vkcompute::ComputeGraph& graph, std::vector input_vals, std::vector expected_outputs); diff --git a/backends/vulkan/test/vulkan_compute_api_test.cpp b/backends/vulkan/test/vulkan_compute_api_test.cpp index 53d0c820f41..c0840d2864a 100644 --- a/backends/vulkan/test/vulkan_compute_api_test.cpp +++ b/backends/vulkan/test/vulkan_compute_api_test.cpp @@ -11,7 +11,7 @@ #include #include -#include +#include #include @@ -25,6 +25,7 @@ #include +using namespace vkcompute; using namespace vkcompute::api; std::vector @@ -179,27 +180,26 @@ TEST_F(VulkanComputeAPITest, empty_init_shader_info_test) { TEST_F(VulkanComputeAPITest, calculate_dim_order_test) { // ndim, GPUMemoryLayout, expected dim order pairs - std::vector>> - test_cases = { - {1, utils::kWidthPacked, {0}}, - {1, utils::kHeightPacked, {0}}, - {1, utils::kChannelsPacked, {0}}, - {2, utils::kWidthPacked, {0, 1}}, - {2, utils::kHeightPacked, {1, 0}}, - {2, utils::kChannelsPacked, {0, 1}}, - {3, utils::kWidthPacked, {0, 1, 2}}, - {3, utils::kHeightPacked, {0, 2, 1}}, - {3, utils::kChannelsPacked, {1, 2, 0}}, - {4, utils::kWidthPacked, {0, 1, 2, 3}}, - {4, utils::kHeightPacked, {0, 1, 3, 2}}, - {4, utils::kChannelsPacked, {0, 2, 3, 1}}, - }; + std::vector>> test_cases = { + {1, WHCN::kWidthDim, {0}}, + {1, WHCN::kHeightDim, {0}}, + {1, WHCN::kChannelsDim, {0}}, + {2, WHCN::kWidthDim, {0, 1}}, + {2, WHCN::kHeightDim, {1, 0}}, + {2, WHCN::kChannelsDim, {0, 1}}, + {3, WHCN::kWidthDim, {0, 1, 2}}, + {3, WHCN::kHeightDim, {0, 2, 1}}, + {3, WHCN::kChannelsDim, {1, 2, 0}}, + {4, WHCN::kWidthDim, {0, 1, 2, 3}}, + {4, WHCN::kHeightDim, {0, 1, 3, 2}}, + {4, WHCN::kChannelsDim, {0, 2, 3, 1}}, + }; for (const auto& test_case : test_cases) { const size_t& ndim = std::get<0>(test_case); - const utils::GPUMemoryLayout& layout = 
std::get<1>(test_case); + const int32_t packed_dim = std::get<1>(test_case); const auto& expected_dim_order = std::get<2>(test_case); - std::vector dim_order = calculate_dim_order(ndim, layout); + std::vector dim_order = calculate_dim_order(ndim, packed_dim); ASSERT_TRUE(dim_order == expected_dim_order); } @@ -221,8 +221,9 @@ TEST_F(VulkanComputeAPITest, calculate_tensor_strides_test) { for (const auto& layout : {utils::kWidthPacked, utils::kHeightPacked, utils::kChannelsPacked}) { { + const int32_t packed_dim = static_cast(layout); std::vector dim_order = - calculate_dim_order(sizes.size(), layout); + calculate_dim_order(sizes.size(), packed_dim); std::vector strides = calculate_strides(sizes, dim_order); std::vector ref_strides = get_reference_strides(sizes, layout); ASSERT_TRUE(strides == ref_strides); @@ -258,26 +259,141 @@ TEST_F(VulkanComputeAPITest, calculate_tensor_strides_test) { } } +TEST_F(VulkanComputeAPITest, virtual_transpose_test) { + std::vector sizes = {7, 9, 11, 13}; + // (dim0, dim1), new_sizes, new_dim_order, new_axis_map, new_packed_dim_idx + std::vector>> test_cases = { + {{2, 3}, {7, 9, 13, 11}, {0, 1, 3, 2}, {1, 0, 2, 2}, {1}}, + {{2, 1}, {7, 11, 9, 13}, {0, 2, 1, 3}, {0, 2, 1, 1}, {0}}, + {{1, 3}, {7, 13, 11, 9}, {0, 3, 2, 1}, {2, 1, 0, 0}, {2}}, + }; + + for (const auto& test_case : test_cases) { + const int dim0 = test_case.at(0).at(0); + const int dim1 = test_case.at(0).at(1); + + const auto& expected_sizes = test_case.at(1); + const auto& expected_dim_order = test_case.at(2); + const auto& expected_axis_map = test_case.at(3); + const int expected_packed_dim = test_case.at(4).at(0); + + { + vTensor a_buffer = vTensor( + context(), sizes, vkapi::kFloat, utils::kBuffer, utils::kWidthPacked); + + a_buffer.virtual_transpose(dim0, dim1); + EXPECT_TRUE(a_buffer.sizes() == expected_sizes); + EXPECT_TRUE(a_buffer.dim_order() == expected_dim_order); + } + + { + vTensor a_texture = vTensor( + context(), + sizes, + vkapi::kFloat, + utils::kTexture3D, + utils::kWidthPacked); + a_texture.virtual_transpose(dim0, dim1); + EXPECT_TRUE(a_texture.sizes() == expected_sizes); + EXPECT_TRUE(a_texture.axis_map() == expected_axis_map); + EXPECT_TRUE(a_texture.packed_dim() == expected_packed_dim); + } + } +} + +TEST_F(VulkanComputeAPITest, view_of_view_test) { + constexpr int N = 3; + constexpr int C = 5; + constexpr int H = 17; + constexpr int W = 19; + + std::vector sizes = {N, C, H, W}; + + vTensor t1 = vTensor( + context(), sizes, vkapi::kFloat, utils::kTexture3D, utils::kWidthPacked); + + vTensor t2 = vTensor(t1); + EXPECT_TRUE(t2.sizes() == sizes); + vTensor t3 = vTensor(t2); + EXPECT_TRUE(t2.sizes() == sizes); + + t2.virtual_transpose(1, 2); + std::vector expected_t2_sizes = {N, H, C, W}; + EXPECT_TRUE(t2.sizes() == expected_t2_sizes); + + // Because t3 was created before t2's metadata was updated, we need to first + // update t3's metadata to match t2's metadata. Then the transpose will yield + // the correct metadata. 
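  // Illustrative trace of the metadata through this test (a sketch that assumes
  // only what the comment above states: a view snapshots its parent's metadata
  // at construction and does not see later updates):
  //   t2.sizes() == {N, H, C, W}   after t2.virtual_transpose(1, 2)
  //   t3.sizes() == {N, C, H, W}   stale snapshot taken when t3 was built from t2
  // Transposing dims 2 and 3 of the stale t3 directly would yield {N, C, W, H}.
  // The virtual_clone(t2) call below first resyncs t3 to {N, H, C, W}, so the
  // subsequent virtual_transpose(2, 3) swaps C and W and produces the expected
  // {N, H, W, C} checked at the end of the test.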
+ t3.virtual_clone(t2); + t3.virtual_transpose(2, 3); + std::vector expected_t3_sizes = {N, H, W, C}; + EXPECT_TRUE(t3.sizes() == expected_t3_sizes); +} + +utils::ivec3 make_temp_ivec3(int x, int y, int z) { + return utils::ivec3{x, y, z}; +} + TEST_F(VulkanComputeAPITest, vec_test) { - utils::vec3 v3({1, 2, 3}); - ASSERT_TRUE(v3[0] == 1); - ASSERT_TRUE(v3[1] == 2); - ASSERT_TRUE(v3[2] == 3); - v3 = {4, 5, 6}; - ASSERT_TRUE(v3[0] == 4); - ASSERT_TRUE(v3[1] == 5); - ASSERT_TRUE(v3[2] == 6); - - utils::uvec4 uv4({4, 3, 2, 1}); - ASSERT_TRUE(uv4[0] == 4); - ASSERT_TRUE(uv4[1] == 3); - ASSERT_TRUE(uv4[2] == 2); - ASSERT_TRUE(uv4[3] == 1); - uv4 = {11, 13, 12, 88}; - ASSERT_TRUE(uv4[0] == 11); - ASSERT_TRUE(uv4[1] == 13); - ASSERT_TRUE(uv4[2] == 12); - ASSERT_TRUE(uv4[3] == 88); + { + utils::vec3 v3({1, 2, 3}); + ASSERT_TRUE(v3[0] == 1); + ASSERT_TRUE(v3[1] == 2); + ASSERT_TRUE(v3[2] == 3); + v3 = {4, 5, 6}; + ASSERT_TRUE(v3[0] == 4); + ASSERT_TRUE(v3[1] == 5); + ASSERT_TRUE(v3[2] == 6); + } + + { + utils::uvec4 uv4({4, 3, 2, 1}); + ASSERT_TRUE(uv4[0] == 4); + ASSERT_TRUE(uv4[1] == 3); + ASSERT_TRUE(uv4[2] == 2); + ASSERT_TRUE(uv4[3] == 1); + uv4 = {11, 13, 12, 88}; + ASSERT_TRUE(uv4[0] == 11); + ASSERT_TRUE(uv4[1] == 13); + ASSERT_TRUE(uv4[2] == 12); + ASSERT_TRUE(uv4[3] == 88); + } + + // Test copy from same type + { + utils::ivec3 v{5, 6, 8}; + utils::ivec3 v2 = v; + + ASSERT_TRUE(v2[0] == 5); + ASSERT_TRUE(v2[1] == 6); + ASSERT_TRUE(v2[2] == 8); + } + + // Test copy from different type + { + utils::uvec3 v{5, 6, 8}; + utils::ivec3 v2 = v; + + ASSERT_TRUE(v2[0] == 5); + ASSERT_TRUE(v2[1] == 6); + ASSERT_TRUE(v2[2] == 8); + } + + // Test construction from temporary vec + { + utils::uvec3 v{make_temp_ivec3(4, 5, 10)}; + ASSERT_TRUE(v[0] == 4); + ASSERT_TRUE(v[1] == 5); + ASSERT_TRUE(v[2] == 10); + } + + // Test initalization from temporary vec + { + utils::uvec3 v = make_temp_ivec3(4, 5, 10); + ASSERT_TRUE(v[0] == 4); + ASSERT_TRUE(v[1] == 5); + ASSERT_TRUE(v[2] == 10); + } } TEST_F(VulkanComputeAPITest, retrieve_custom_shader_test) { @@ -485,7 +601,7 @@ TEST_F(VulkanComputeAPITest, test_buffer_float16) { if (!context()->adapter_ptr()->has_full_float16_buffers_support()) { GTEST_SKIP(); } - test_storage_buffer_type(16); + test_storage_buffer_type(16); } TEST_F(VulkanComputeAPITest, test_buffer_int8) { @@ -567,7 +683,7 @@ TEST_F(VulkanComputeAPITest, buffer_tensor_sanity_check) { run_buffer_tensor_sanity_check(a); break; case vkapi::kHalf: - run_buffer_tensor_sanity_check(a); + run_buffer_tensor_sanity_check(a); break; case vkapi::kChar: run_buffer_tensor_sanity_check(a); @@ -604,26 +720,30 @@ TEST_F(VulkanComputeAPITest, texture_add_sanity_check) { } } -TEST_F(VulkanComputeAPITest, tensor_copy_test) { - std::vector sizes = {9, 9}; - std::vector strides = - get_reference_strides(sizes, utils::kWidthPacked); - std::vector dim_order = {0, 1}; +TEST_F(VulkanComputeAPITest, tensor_alias_test) { + for (utils::StorageType storage_type : {utils::kTexture3D, utils::kBuffer}) { + std::vector sizes = {9, 9}; - vTensor original = CREATE_FLOAT_BUFFER(sizes, /*allocate_memory=*/true); - vTensor copy = vTensor(original, sizes, dim_order); - EXPECT_TRUE(get_vma_allocation_count() == 1); - EXPECT_TRUE(copy.is_view_of(original)); + const size_t alloc_count_before = get_vma_allocation_count(); - // Fill original tensor with some data - fill_vtensor(original, 2.5f, true); + vTensor original = vTensor(context(), sizes, vkapi::kFloat, storage_type); - std::vector data_out(copy.staging_buffer_numel()); - // Extract 
the copy tensor; should contain the data of the original tensor - extract_vtensor(copy, data_out); + vTensor copy = vTensor(original); - for (size_t i = 0; i < data_out.size(); ++i) { - CHECK_VALUE(data_out, i, 2.5f + i); + // Two tensors but only one additional allocation. + EXPECT_TRUE(get_vma_allocation_count() == alloc_count_before + 1); + EXPECT_TRUE(copy.is_view_of(original)); + + // Fill original tensor with some data + fill_vtensor(original, 2.5f, true); + + std::vector data_out(copy.staging_buffer_numel()); + // Extract the copy tensor; should contain the data of the original tensor + extract_vtensor(copy, data_out); + + for (size_t i = 0; i < original.numel(); ++i) { + CHECK_VALUE(data_out, i, 2.5f + i); + } } } @@ -633,46 +753,58 @@ TEST_F(VulkanComputeAPITest, tensor_no_copy_transpose_test) { constexpr int N = 17; std::vector mat1_sizes = {M, K}; std::vector mat2_sizes = {N, K}; - std::vector mat2_t_sizes = {K, N}; std::vector out_sizes = {M, N}; - std::vector transposed_dim_order = {1, 0}; - - vTensor mat1 = CREATE_FLOAT_BUFFER(mat1_sizes, /*allocate_memory=*/true); - vTensor mat2 = CREATE_FLOAT_BUFFER(mat2_sizes, /*allocate_memory=*/true); - vTensor out = CREATE_FLOAT_BUFFER(out_sizes, /*allocate_memory=*/true); - - // Generate data - std::vector mat1_data = - create_random_float_buffer(mat1.staging_buffer_numel()); - std::vector mat2_data = - create_random_float_buffer(mat2.staging_buffer_numel()); - - // Create direct view and modify sizes and strides later - vTensor mat2_t = vTensor(mat2); - - std::vector mat2_t_data = transpose_matrix(mat2_data, N, K); - std::vector ref_out = - compute_reference_matmul(mat1_data, mat2_t_data, M, K, N); - - // Fill original tensor with some data - fill_vtensor(mat1, mat1_data); - fill_vtensor(mat2, mat2_data); - - record_reference_matmul(api::context(), out, mat1, mat2_t); - - // Update sizes and strides of mat2_t to be that of a transposed tensor - mat2_t.virtual_reconfigure(mat2_t_sizes, transposed_dim_order); - EXPECT_TRUE(mat2_t.gpu_memory_layout() == utils::kHeightPacked); - - std::vector data_out(out.staging_buffer_numel()); - // Extract the copy tensor; should contain the data of the original tensor - extract_vtensor(out, data_out); + for (const auto storage_type : {utils::kTexture3D, utils::kBuffer}) { + vTensor mat1 = vTensor( + context(), + mat1_sizes, + vkapi::kFloat, + storage_type, + utils::kWidthPacked); + vTensor mat2 = vTensor( + context(), + mat2_sizes, + vkapi::kFloat, + storage_type, + utils::kWidthPacked); + vTensor out = vTensor( + context(), out_sizes, vkapi::kFloat, storage_type, utils::kWidthPacked); + + // Generate data + std::vector mat1_data = + create_random_float_buffer(mat1.staging_buffer_numel()); + std::vector mat2_data = + create_random_float_buffer(mat2.staging_buffer_numel()); + + // Create direct view and modify sizes and strides later + vTensor mat2_t = vTensor(mat2); + // Update sizes and strides of mat2_t to be that of a transposed tensor + mat2_t.virtual_transpose(0, 1); + + EXPECT_TRUE(mat2_t.packed_dim() == WHCN::kHeightDim); + + std::vector mat2_t_data = transpose_matrix(mat2_data, N, K); + std::vector ref_out = + compute_reference_matmul(mat1_data, mat2_t_data, M, K, N); + + // Fill original tensor with some data + fill_vtensor(mat1, mat1_data); + fill_vtensor(mat2, mat2_data); + + if (storage_type == utils::kTexture3D) { + record_matmul_texture3d(context(), out, mat1, mat2_t); + } else { + record_reference_matmul(context(), out, mat1, mat2_t); + } - EXPECT_TRUE(data_out.size() == 
ref_out.size()); + std::vector data_out(out.staging_buffer_numel()); + // Extract the copy tensor; should contain the data of the original tensor + extract_vtensor(out, data_out); - for (size_t i = 0; i < data_out.size(); ++i) { - EXPECT_TRUE(check_close(data_out[i], ref_out[i])); + for (size_t i = 0; i < ref_out.size(); ++i) { + EXPECT_TRUE(check_close(data_out[i], ref_out[i])); + } } } @@ -882,64 +1014,6 @@ TEST_F(VulkanComputeAPITest, use_non_bound_textures_fails) { EXPECT_THROW(fill_vtensor(a, data_a), vkapi::Error); } -TEST_F(VulkanComputeAPITest, tensor_reallocation_test) { - std::vector sizes = {4, 4, 1}; - vTensor a = CREATE_FLOAT_TEXTURE(sizes, /*allocate_memory = */ true); - vTensor b = CREATE_FLOAT_TEXTURE(sizes, /*allocate_memory = */ true); - vTensor c = CREATE_FLOAT_TEXTURE(sizes, /*allocate_memory = */ true); - - execute_and_check_add(a, b, c, 3.0f, 5.0f); - - // Redo with new sizes - std::vector new_sizes = {4, 6, 3}; - a.reallocate(new_sizes); - b.reallocate(new_sizes); - c.reallocate(new_sizes); - - // Flush everything - context()->flush(); - - execute_and_check_add(a, b, c, 12.0f, 10.0f); -} - -TEST_F( - VulkanComputeAPITest, - tensor_reallocation_with_deferred_allocation_test) { - std::vector sizes = {8, 8, 8}; - vTensor a = CREATE_FLOAT_TEXTURE(sizes, /*allocate_memory = */ false); - vTensor b = CREATE_FLOAT_TEXTURE(sizes, /*allocate_memory = */ false); - vTensor c = CREATE_FLOAT_TEXTURE(sizes, /*allocate_memory = */ false); - - vkapi::Allocation a_mem = allocate_memory_for(a); - a.image().bind_allocation(a_mem); - vkapi::Allocation b_mem = allocate_memory_for(b); - b.image().bind_allocation(b_mem); - vkapi::Allocation c_mem = allocate_memory_for(c); - c.image().bind_allocation(c_mem); - - execute_and_check_add(a, b, c, 4.0f, 8.0f); - - std::vector> new_sizes_list = { - {4, 3, 5}, {4, 1, 7}, {8, 3, 2}, {8, 7, 2}}; - - for (auto& new_sizes : new_sizes_list) { - // Redo with new sizes - a.reallocate(new_sizes); - b.reallocate(new_sizes); - c.reallocate(new_sizes); - - // Flush everything - context()->flush(); - - a.image().bind_allocation(a_mem); - b.image().bind_allocation(b_mem); - c.image().bind_allocation(c_mem); - - execute_and_check_add( - a, b, c, float(new_sizes[1] + 4.5f), float(new_sizes[2] + 13.0f)); - } -} - TEST_F(VulkanComputeAPITest, texture_virtual_resize) { context()->set_cmd(/*reusable = */ true); std::vector sizes = {8, 12, 12}; @@ -1008,11 +1082,11 @@ TEST_F(VulkanComputeAPITest, print_object_sizes) { // can alert ourselves to any significant changes in the sizes of these // objects by checking the `sizeof()` the class against some loose thresholds. 
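  // A possible compile-time variant of the same guard (a sketch only, not
  // something this patch adds): static_assert fails the build rather than a
  // test run if one of these objects grows past the loose limits used below.
  //
  //   static_assert(sizeof(vTensor) < 1200, "vTensor grew unexpectedly");
  //   static_assert(sizeof(Value) < 1200, "Value grew unexpectedly");
  //   static_assert(sizeof(StagingBuffer) < 500, "StagingBuffer grew unexpectedly");
  //
  // Keeping the runtime EXPECT_TRUE form (as below) lets the test also report
  // the observed sizes per platform instead of hard-failing compilation on
  // platforms where padding differs.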
- // Current known size on 64 bit system: 1824 B - EXPECT_TRUE(sizeof(vTensor) < 2000); - // Current known size on 64 bit system: 1840 B - EXPECT_TRUE(sizeof(Value) < 2200); - // Current known size on 64 bit system: 240 B + // Current known size on 64 bit system: 1040 B + EXPECT_TRUE(sizeof(vTensor) < 1200); + // Current known size on 64 bit system: 1056 B + EXPECT_TRUE(sizeof(Value) < 1200); + // Current known size on 64 bit system: 120 B EXPECT_TRUE(sizeof(StagingBuffer) < 500); // Current known size on 64 bit system: 384 B EXPECT_TRUE(sizeof(ComputeGraph) < 500); @@ -1229,6 +1303,49 @@ TEST(VulkanComputeGraphTest, test_simple_graph_with_view) { } } +TEST(VulkanComputeGraphTest, test_graph_view_of_view) { + GraphConfig config; + config.set_storage_type_override(utils::kTexture3D); + ComputeGraph graph(config); + + constexpr int N = 3; + constexpr int C = 5; + constexpr int H = 17; + constexpr int W = 19; + + std::vector orig_sizes = {N, C, H, W}; + + // Test a common view of view usage pattern. In delegate execution, the values + // of the graph are created first; then operators are added. As a result, + // creating views of views is a bit tricky because metadata updates to a view + // does not update the metadata of the view's views. Nonetheless, view + // operators have an implicit assumption that the metadata of the output is + // equivalent to the metadata of the input. Therefore, view operators must + // account for unseen updates to the input view by first calling + // `virtual_clone()` to make the output equivalent to the input before. + // modifying metadata. + + ValueRef t1 = graph.add_tensor(orig_sizes, vkapi::kFloat); + ValueRef t2 = graph.add_tensor_view(t1); + ValueRef t3 = graph.add_tensor_view(t2); + + ValueRef channels = graph.add_scalar(1); + ValueRef height = graph.add_scalar(2); + ValueRef width = graph.add_scalar(3); + + auto opFn = VK_GET_OP_FN("aten.transpose.int"); + + opFn(graph, {t1, channels, height, t2}); + std::vector t2_sizes = graph.sizes_of(t2); + std::vector expected_t2_sizes = {N, H, C, W}; + EXPECT_TRUE(t2_sizes == expected_t2_sizes); + + opFn(graph, {t2, height, width, t3}); + std::vector t3_sizes = graph.sizes_of(t3); + std::vector expected_t3_sizes = {N, H, W, C}; + EXPECT_TRUE(t3_sizes == expected_t3_sizes); +} + TEST(VulkanComputeGraphTest, test_simple_graph) { GraphConfig config; ComputeGraph graph(config); @@ -1297,7 +1414,7 @@ TEST(VulkanComputeGraphTest, test_simple_graph_with_symint) { // Inputs and Outputs {{out.value, vkapi::MemoryAccessType::WRITE}}, // Shader params buffers - {graph.texture_limits_ubo(a.value), + {graph.logical_limits_ubo(a.value), graph.get_or_create_int_param_buffer(scalar)}, // Specialization Constants {}, @@ -1316,6 +1433,9 @@ TEST(VulkanComputeGraphTest, test_simple_graph_with_symint) { int scalar_val = i - 3.0f; graph.set_symint(scalar, scalar_val); + int32_t scalar_val_read = graph.read_symint(scalar); + EXPECT_TRUE(scalar_val_read == scalar_val); + float val_a = i + 2.0f; float val_out = val_a + scalar_val; @@ -1399,6 +1519,7 @@ TEST(VulkanComputeGraphTest, test_simple_prepacked_graph) { TEST(VulkanComputeGraphTest, test_simple_shared_objects_with_resize) { GraphConfig config; ComputeGraph graph(config); + size_t expected_vma_allocation_count = 0; std::vector size_big = {12, 64, 64}; std::vector size_small = {12, 64, 64}; @@ -1415,9 +1536,10 @@ TEST(VulkanComputeGraphTest, test_simple_shared_objects_with_resize) { /*shared_object_idx = */ 4); // +2: t.sizes_ubo() for each staging shader - // +2: t.axis_mapping_ubo() 
for each staging shader + // +2: t.axis_map_ubo() for each staging shader // +2: staging buffer for each input tensor - EXPECT_TRUE(get_vma_allocation_count() == 6); + expected_vma_allocation_count += 6; + EXPECT_EQ(get_vma_allocation_count(), expected_vma_allocation_count); ValueRef c = graph.add_tensor( size_big, @@ -1427,16 +1549,22 @@ TEST(VulkanComputeGraphTest, test_simple_shared_objects_with_resize) { auto addFn = VK_GET_OP_FN("aten.add.Tensor"); addFn(graph, {a.value, b.value, kDummyValueRef, c}); + // +2: alpha UBO, broadcast UBO for arithmetic shader + // +1: t.sizes_ubo() for arithmetic shader output c + // +1: t.axis_map_ubo() for arithmetic shader output c + expected_vma_allocation_count += 4; + EXPECT_EQ(get_vma_allocation_count(), expected_vma_allocation_count); + IOValueRef d = graph.add_input_tensor( size_small, vkapi::kFloat, /*shared_object_idx = */ 2); - // +2: alpha UBO, broadcast UBO for arithmetic shader // +1: t.sizes_ubo() uniform buffer for staging shader - // +1: t.axis_mapping_ubo() uniform buffer for staging shader + // +1: t.axis_map_ubo() uniform buffer for staging shader // +1: staging buffer for the input tensor - EXPECT_TRUE(get_vma_allocation_count() == 12); + expected_vma_allocation_count += 3; + EXPECT_EQ(get_vma_allocation_count(), expected_vma_allocation_count); ValueRef e = graph.add_tensor( size_big, @@ -1446,21 +1574,26 @@ TEST(VulkanComputeGraphTest, test_simple_shared_objects_with_resize) { auto mulFn = VK_GET_OP_FN("aten.mul.Tensor"); mulFn(graph, {c, d.value, e}); + // +2: alpha UBO, broadcast UBO for arithmetic shader + // +1: t.sizes_ubo() for arithmetic shader output e + // +1: t.axis_map_ubo() for arithmetic shader output e + expected_vma_allocation_count += 4; + EXPECT_EQ(get_vma_allocation_count(), expected_vma_allocation_count); + IOValueRef out = {}; out.value = e; out.staging = graph.set_output_tensor(out.value); - // +2: alpha UBO, broadcast UBO for arithmetic shader - // +1: t.sizes_ubo() for staging shader - // +1: t.axis_mapping_ubo() for staging shader - // +1 staging buffer for the input tensor - EXPECT_TRUE(get_vma_allocation_count() == 17); + // +1: staging buffer for the output tensor + expected_vma_allocation_count += 1; + EXPECT_EQ(get_vma_allocation_count(), expected_vma_allocation_count); graph.prepare(); graph.encode_execute(); // +3: shared memory allocations for tensors - EXPECT_TRUE(get_vma_allocation_count() == 20); + expected_vma_allocation_count += 3; + EXPECT_EQ(get_vma_allocation_count(), expected_vma_allocation_count); // Run graph @@ -2216,9 +2349,9 @@ void run_from_gpu_test( context()->submit_compute_job( VK_KERNEL_FROM_STR(kernel_name), pipeline_barrier, - vten.image_extents(), + vten.logical_limits(), {4, 4, 4}, - {vten.packed_dim_whcn_idx(), offset}, + {vten.packed_dim(), offset}, VK_NULL_HANDLE, 0, vten.image( @@ -2395,7 +2528,7 @@ TEST(VulkanToFromGPUShaderTest, round_trip_tests) { for (auto& sizes : to_test) { RUN_TESTS(float, vkapi::kFloat) - RUN_TESTS(torch::executor::Half, vkapi::kHalf) + RUN_TESTS(executorch::aten::Half, vkapi::kHalf) } for (auto& sizes : to_test_int8) { @@ -2874,7 +3007,7 @@ void test_int4pack_mm( int4mm_pack_weights(mat2_size, B_quant_data.data()); IOValueRef B_int4 = - graph.add_input_tensor(mat2_q_size, vkapi::kQInt8, storage_type); + graph.add_input_tensor(mat2_q_size, vkapi::kQInt8, utils::kBuffer); graph.copy_into_staging( B_int4.staging, B_int4_data.data(), B_int4_data.size()); @@ -2882,8 +3015,18 @@ void test_int4pack_mm( // Random scales and zeroes. 
Keep scales small to avoid overflow and zeroes in // int4 range - IOValueRef scales_and_zeros = - graph.add_input_tensor({2, N, k_groups}, vkapi::kFloat, storage_type); + IOValueRef scales_and_zeros; + + if (storage_type == utils::kBuffer) { + scales_and_zeros.value = graph.add_tensor( + {2, N, k_groups}, vkapi::kFloat, storage_type, utils::kWidthPacked); + } else { + scales_and_zeros.value = graph.add_tensor( + {2, N, k_groups}, vkapi::kFloat, storage_type, utils::kChannelsPacked); + } + + scales_and_zeros.staging = graph.set_input_tensor(scales_and_zeros.value); + std::vector s_data(graph.numel_of(scales_and_zeros.value)); const int zeros_stride = s_data.size() / 2; for (size_t i = 0; i < zeros_stride; i++) { @@ -2945,7 +3088,7 @@ void test_int4pack_mm( out_deq.staging, out_deq_data.data(), out_deq_data.size()); for (int i = 0; i < out_int4_data.size(); i++) { - CHECK_VALUE(out_int4_data, i, out_deq_data[i]); + EXPECT_TRUE(check_close(out_int4_data[i], out_deq_data[i])); } } @@ -2971,3 +3114,94 @@ TEST(VulkanComputeGraphOpsTest, int4pack_mm_test) { test_int4pack_mm({37, 256, 19}, 64, storage_type); } } + +void test_transpose_view_mm( + const int B, + const int M, + const int K, + const int N, + utils::StorageType storage_type) { + GraphConfig config; + config.set_storage_type_override(storage_type); + ComputeGraph graph(config); + + std::vector mat1_size = {M, K}; + std::vector mat2_t_size = {N, K}; + std::vector out_size = {M, N}; + + std::vector mat1_small_size = {M - 4, K - 3}; + std::vector mat2_t_small_size = {N - 1, K - 3}; + + if (B > 1) { + mat1_size.resize(3); + mat1_size = {B, M, K}; + mat2_t_size.resize(3); + mat2_t_size = {B, N, K}; + out_size.resize(3); + out_size = {B, M, N}; + + mat1_small_size.resize(3); + mat1_small_size = {B, M - 4, K - 3}; + mat2_t_small_size.resize(3); + mat2_t_small_size = {B, N - 1, K - 3}; + } + + // Build graph; use shared objects to test views of shared objects + + IOValueRef mat1 = + graph.add_input_tensor(mat1_size, vkapi::kFloat, utils::kWidthPacked, 0); + IOValueRef mat2_transpose = graph.add_input_tensor( + mat2_t_size, vkapi::kFloat, utils::kWidthPacked, 1); + + ValueRef mat2 = graph.add_tensor_view(mat2_transpose.value); + + ValueRef dim0; + ValueRef dim1; + + if (B > 1) { + dim0 = graph.add_scalar(1); + dim1 = graph.add_scalar(2); + } else { + dim0 = graph.add_scalar(0); + dim1 = graph.add_scalar(1); + } + + IOValueRef out; + out.value = graph.add_tensor(out_size, vkapi::kFloat, utils::kWidthPacked, 2); + + VK_GET_OP_FN("aten.transpose.int") + (graph, {mat2_transpose.value, dim0, dim1, mat2}); + VK_GET_OP_FN("aten.mm.default")(graph, {mat1.value, mat2, out.value}); + + out.staging = graph.set_output_tensor(out.value); + + graph.prepare(); + graph.encode_prepack(); + graph.prepack(); + graph.encode_execute(); + + for (int i = 1; i < 4; i++) { + float val_mat1 = i; + float val_mat2 = i + 1; + float val_out = K * (val_mat1 * val_mat2); + + // Try at full size + graph.resize_input(0, mat1_size); + graph.resize_input(1, mat2_t_size); + graph.propagate_resize(); + execute_graph_and_check_output(graph, {val_mat1, val_mat2}, {val_out}); + + // Try at reduced sizes + val_out = (K - 3) * (val_mat1 * val_mat2); + graph.resize_input(0, mat1_small_size); + graph.resize_input(1, mat2_t_small_size); + graph.propagate_resize(); + execute_graph_and_check_output(graph, {val_mat1, val_mat2}, {val_out}); + } +} + +TEST(VulkanComputeGraphOpsTest, test_transpose_with_mm) { + for (auto storage_type : {utils::kBuffer, utils::kTexture3D}) { + 
test_transpose_view_mm(2, 7, 17, 5, storage_type); + } +} diff --git a/backends/vulkan/vulkan_preprocess.py b/backends/vulkan/vulkan_preprocess.py index 1865c32acd7..7e85c25faee 100644 --- a/backends/vulkan/vulkan_preprocess.py +++ b/backends/vulkan/vulkan_preprocess.py @@ -57,7 +57,7 @@ def preprocess( # noqa: C901 MeanToSumDiv(), SpecPropPass(), ConstraintBasedSymShapeEvalPass(), - MemoryPlanningPass("greedy"), + MemoryPlanningPass(), ] new_gm = program.graph_module diff --git a/backends/xnnpack/CMakeLists.txt b/backends/xnnpack/CMakeLists.txt index 98870bf0e11..5d4eb2c7bbb 100644 --- a/backends/xnnpack/CMakeLists.txt +++ b/backends/xnnpack/CMakeLists.txt @@ -32,14 +32,20 @@ if(NOT PYTHON_EXECUTABLE) resolve_python_executable() endif() -# NB: Enabling this will serialize execution of delegate instances Keeping this -# OFF by default to maintain existing behavior, to be revisited. +# NB: Enabling this will serialize execution of delegate instances +# Keeping this OFF by default to maintain existing behavior, to be revisited. option(EXECUTORCH_XNNPACK_SHARED_WORKSPACE - "Enable workspace sharing across different delegate instances" OFF -) + "Enable workspace sharing across different delegate instances" ON) +# Keeping this OFF by default due to regressions in decode +# and model load with kleidi kernels +option(EXECUTORCH_XNNPACK_ENABLE_KLEIDI + "Enable workspace sharing across different delegate instances" OFF) if(EXECUTORCH_XNNPACK_SHARED_WORKSPACE) add_definitions(-DENABLE_XNNPACK_SHARED_WORKSPACE) endif() +if(EXECUTORCH_XNNPACK_ENABLE_KLEIDI) + add_definitions(-DENABLE_XNNPACK_KLEIDI) +endif() set(_common_include_directories ${EXECUTORCH_ROOT}/..) set(_common_compile_options -Wno-deprecated-declarations -fPIC) @@ -93,7 +99,7 @@ include(cmake/Dependencies.cmake) list(TRANSFORM _xnnpack_backend__srcs PREPEND "${EXECUTORCH_ROOT}/") add_library(xnnpack_backend STATIC ${_xnnpack_backend__srcs}) target_link_libraries( - xnnpack_backend PRIVATE ${xnnpack_third_party} executorch_no_prim_ops + xnnpack_backend PRIVATE ${xnnpack_third_party} executorch_core xnnpack_schema ) diff --git a/backends/xnnpack/TARGETS b/backends/xnnpack/TARGETS index 3d53606b6f7..4a3dfed7625 100644 --- a/backends/xnnpack/TARGETS +++ b/backends/xnnpack/TARGETS @@ -16,8 +16,8 @@ runtime.python_library( ], deps = [ "//executorch/backends/transforms:lib", + "//executorch/backends/xnnpack/_passes:xnnpack_passes", "//executorch/backends/xnnpack/operators:operators", - "//executorch/backends/xnnpack/passes:xnnpack_passes", "//executorch/backends/xnnpack/serialization:xnnpack_serializer", "//executorch/exir:graph_module", "//executorch/exir/backend:backend_details", diff --git a/backends/xnnpack/passes/TARGETS b/backends/xnnpack/_passes/TARGETS similarity index 100% rename from backends/xnnpack/passes/TARGETS rename to backends/xnnpack/_passes/TARGETS diff --git a/backends/xnnpack/passes/__init__.py b/backends/xnnpack/_passes/__init__.py similarity index 73% rename from backends/xnnpack/passes/__init__.py rename to backends/xnnpack/_passes/__init__.py index c3a85e4aa86..00e1ba03586 100644 --- a/backends/xnnpack/passes/__init__.py +++ b/backends/xnnpack/_passes/__init__.py @@ -6,23 +6,27 @@ from typing import List, Optional, Type -from executorch.backends.xnnpack.passes.channels_last_tagged_reshape_pass import ( +from executorch.backends.xnnpack._passes.channels_last_tagged_reshape_pass import ( ChannelsLastTaggedReshapePass, ) -from executorch.backends.xnnpack.passes.conv1d_unsqueeze_pass import Conv1dUnsqueezePass -from 
executorch.backends.xnnpack.passes.convert_to_linear import ConvertToLinearPass -from executorch.backends.xnnpack.passes.convert_to_sdpa import ConvertToSDPAPass -from executorch.backends.xnnpack.passes.convert_to_upsample_bilinear2d import ( +from executorch.backends.xnnpack._passes.conv1d_unsqueeze_pass import ( + Conv1dUnsqueezePass, +) +from executorch.backends.xnnpack._passes.convert_to_linear import ConvertToLinearPass +from executorch.backends.xnnpack._passes.convert_to_sdpa import ConvertToSDPAPass +from executorch.backends.xnnpack._passes.convert_to_upsample_bilinear2d import ( ConvertToUpsampleBilinear2d, ) -from executorch.backends.xnnpack.passes.fuse_activation_pass import FuseActivationPass -from executorch.backends.xnnpack.passes.fuse_batch_norm_with_conv import ( +from executorch.backends.xnnpack._passes.fuse_activation_pass import FuseActivationPass +from executorch.backends.xnnpack._passes.fuse_batch_norm_with_conv import ( FuseBatchNormWithConvPass, ) -from executorch.backends.xnnpack.passes.prelu_reshape_pass import PReLUReshapePass -from executorch.backends.xnnpack.passes.remove_getitem_op import RemoveGetItemPass -from executorch.backends.xnnpack.passes.tag_implicit_q_dq_pass import TagImplicitQDqPass -from executorch.backends.xnnpack.passes.xnnpack_pass import XNNPACKPass +from executorch.backends.xnnpack._passes.prelu_reshape_pass import PReLUReshapePass +from executorch.backends.xnnpack._passes.remove_getitem_op import RemoveGetItemPass +from executorch.backends.xnnpack._passes.tag_implicit_q_dq_pass import ( + TagImplicitQDqPass, +) +from executorch.backends.xnnpack._passes.xnnpack_pass import XNNPACKPass from executorch.exir.pass_base import ExportPass diff --git a/backends/xnnpack/passes/channels_last_tagged_reshape_pass.py b/backends/xnnpack/_passes/channels_last_tagged_reshape_pass.py similarity index 99% rename from backends/xnnpack/passes/channels_last_tagged_reshape_pass.py rename to backends/xnnpack/_passes/channels_last_tagged_reshape_pass.py index 692f1a9d145..89a44f303df 100644 --- a/backends/xnnpack/passes/channels_last_tagged_reshape_pass.py +++ b/backends/xnnpack/_passes/channels_last_tagged_reshape_pass.py @@ -7,7 +7,7 @@ from typing import Optional, Tuple import torch -from executorch.backends.xnnpack.passes.xnnpack_pass import XNNPACKPass +from executorch.backends.xnnpack._passes.xnnpack_pass import XNNPACKPass from executorch.backends.xnnpack.utils.utils import is_param_node from executorch.exir.dialects._ops import ops as exir_ops from executorch.exir.pass_base import PassResult diff --git a/backends/xnnpack/passes/conv1d_unsqueeze_pass.py b/backends/xnnpack/_passes/conv1d_unsqueeze_pass.py similarity index 99% rename from backends/xnnpack/passes/conv1d_unsqueeze_pass.py rename to backends/xnnpack/_passes/conv1d_unsqueeze_pass.py index 8c63d298c68..3173cab2746 100644 --- a/backends/xnnpack/passes/conv1d_unsqueeze_pass.py +++ b/backends/xnnpack/_passes/conv1d_unsqueeze_pass.py @@ -7,7 +7,7 @@ from typing import Optional import torch -from executorch.backends.xnnpack.passes.xnnpack_pass import XNNPACKPass +from executorch.backends.xnnpack._passes.xnnpack_pass import XNNPACKPass from executorch.backends.xnnpack.utils.quant_utils import is_dequant, is_quant from executorch.backends.xnnpack.utils.utils import get_param_tensor, is_param_node from executorch.exir.dialects._ops import ops as exir_ops diff --git a/backends/xnnpack/passes/convert_to_linear.py b/backends/xnnpack/_passes/convert_to_linear.py similarity index 100% rename from 
backends/xnnpack/passes/convert_to_linear.py rename to backends/xnnpack/_passes/convert_to_linear.py diff --git a/backends/xnnpack/passes/convert_to_sdpa.py b/backends/xnnpack/_passes/convert_to_sdpa.py similarity index 98% rename from backends/xnnpack/passes/convert_to_sdpa.py rename to backends/xnnpack/_passes/convert_to_sdpa.py index 97aca5491dd..c7982db750f 100644 --- a/backends/xnnpack/passes/convert_to_sdpa.py +++ b/backends/xnnpack/_passes/convert_to_sdpa.py @@ -9,9 +9,9 @@ import torch from executorch.backends.transforms import get_shape -from executorch.backends.xnnpack.partition.graphs import sdpa -from executorch.backends.xnnpack.passes.xnnpack_pass import XNNPACKPass +from executorch.backends.xnnpack._passes.xnnpack_pass import XNNPACKPass +from executorch.backends.xnnpack.partition.graphs import sdpa from executorch.exir.dialects._ops import ops as exir_ops from torch.fx.passes.infra.pass_base import PassResult diff --git a/backends/xnnpack/passes/convert_to_upsample_bilinear2d.py b/backends/xnnpack/_passes/convert_to_upsample_bilinear2d.py similarity index 97% rename from backends/xnnpack/passes/convert_to_upsample_bilinear2d.py rename to backends/xnnpack/_passes/convert_to_upsample_bilinear2d.py index 45956ee6f6f..47bff3b99eb 100644 --- a/backends/xnnpack/passes/convert_to_upsample_bilinear2d.py +++ b/backends/xnnpack/_passes/convert_to_upsample_bilinear2d.py @@ -5,8 +5,8 @@ # LICENSE file in the root directory of this source tree. import torch +from executorch.backends.xnnpack._passes.xnnpack_pass import XNNPACKPass from executorch.backends.xnnpack.partition.graphs import bilinear_2d -from executorch.backends.xnnpack.passes.xnnpack_pass import XNNPACKPass from executorch.backends.xnnpack.utils.utils import check_or_raise from executorch.exir.dialects._ops import ops as exir_ops from torch.fx.passes.infra.pass_base import PassResult diff --git a/backends/xnnpack/passes/fuse_activation_pass.py b/backends/xnnpack/_passes/fuse_activation_pass.py similarity index 97% rename from backends/xnnpack/passes/fuse_activation_pass.py rename to backends/xnnpack/_passes/fuse_activation_pass.py index 797b4bb6e24..289e2b03fdb 100644 --- a/backends/xnnpack/passes/fuse_activation_pass.py +++ b/backends/xnnpack/_passes/fuse_activation_pass.py @@ -7,7 +7,7 @@ import torch -from executorch.backends.xnnpack.passes.xnnpack_pass import XNNPACKPass +from executorch.backends.xnnpack._passes.xnnpack_pass import XNNPACKPass from executorch.backends.xnnpack.serialization.xnnpack_graph_schema import OutputMinMax from executorch.backends.xnnpack.utils.utils import check_or_raise diff --git a/backends/xnnpack/passes/fuse_batch_norm_with_conv.py b/backends/xnnpack/_passes/fuse_batch_norm_with_conv.py similarity index 98% rename from backends/xnnpack/passes/fuse_batch_norm_with_conv.py rename to backends/xnnpack/_passes/fuse_batch_norm_with_conv.py index a2d3606826f..8014edfb1c3 100644 --- a/backends/xnnpack/passes/fuse_batch_norm_with_conv.py +++ b/backends/xnnpack/_passes/fuse_batch_norm_with_conv.py @@ -8,7 +8,7 @@ import torch -from executorch.backends.xnnpack.passes.xnnpack_pass import XNNPACKPass +from executorch.backends.xnnpack._passes.xnnpack_pass import XNNPACKPass from executorch.backends.xnnpack.utils.utils import get_param_tensor, is_param_node from executorch.exir import ExportedProgram diff --git a/backends/xnnpack/passes/prelu_reshape_pass.py b/backends/xnnpack/_passes/prelu_reshape_pass.py similarity index 97% rename from backends/xnnpack/passes/prelu_reshape_pass.py rename to 
backends/xnnpack/_passes/prelu_reshape_pass.py index 86698af1874..fb7f9ac9823 100644 --- a/backends/xnnpack/passes/prelu_reshape_pass.py +++ b/backends/xnnpack/_passes/prelu_reshape_pass.py @@ -5,7 +5,7 @@ # LICENSE file in the root directory of this source tree. import torch -from executorch.backends.xnnpack.passes.xnnpack_pass import XNNPACKPass +from executorch.backends.xnnpack._passes.xnnpack_pass import XNNPACKPass from executorch.backends.xnnpack.utils.utils import ( check_or_raise, get_param_tensor, diff --git a/backends/xnnpack/passes/remove_getitem_op.py b/backends/xnnpack/_passes/remove_getitem_op.py similarity index 100% rename from backends/xnnpack/passes/remove_getitem_op.py rename to backends/xnnpack/_passes/remove_getitem_op.py diff --git a/backends/xnnpack/passes/tag_implicit_q_dq_pass.py b/backends/xnnpack/_passes/tag_implicit_q_dq_pass.py similarity index 99% rename from backends/xnnpack/passes/tag_implicit_q_dq_pass.py rename to backends/xnnpack/_passes/tag_implicit_q_dq_pass.py index ac6ccc9b89d..edbe9b44dcd 100644 --- a/backends/xnnpack/passes/tag_implicit_q_dq_pass.py +++ b/backends/xnnpack/_passes/tag_implicit_q_dq_pass.py @@ -7,11 +7,11 @@ from typing import cast, List, Optional import torch +from executorch.backends.xnnpack._passes.xnnpack_pass import XNNPACKPass from executorch.backends.xnnpack.partition.configs import ( SUPPORTED_IMPLICIT_Q_DQ_MODULES_SET, SUPPORTED_IMPLICIT_Q_DQ_OP_NAMES_SET, ) -from executorch.backends.xnnpack.passes.xnnpack_pass import XNNPACKPass from executorch.backends.xnnpack.utils.quant_utils import ( is_dequant, is_dynamic_qdq, diff --git a/backends/xnnpack/passes/xnnpack_pass.py b/backends/xnnpack/_passes/xnnpack_pass.py similarity index 100% rename from backends/xnnpack/passes/xnnpack_pass.py rename to backends/xnnpack/_passes/xnnpack_pass.py diff --git a/backends/xnnpack/cmake/Dependencies.cmake b/backends/xnnpack/cmake/Dependencies.cmake index b76c54bee60..fef63badf23 100644 --- a/backends/xnnpack/cmake/Dependencies.cmake +++ b/backends/xnnpack/cmake/Dependencies.cmake @@ -36,13 +36,39 @@ set(XNNPACK_ENABLE_AVXVNNI OFF CACHE BOOL "" ) -set(XNNPACK_ENABLE_KLEIDIAI + +if(EXECUTORCH_XNNPACK_ENABLE_KLEIDI) + set(XNNPACK_ENABLE_KLEIDIAI + ON + CACHE BOOL "" + ) +else() + set(XNNPACK_ENABLE_KLEIDIAI + OFF + CACHE BOOL "" + ) +endif() + + +set(XNNPACK_BUILD_ALL_MICROKERNELS OFF CACHE BOOL "" ) add_subdirectory("${XNNPACK_SOURCE_DIR}") include_directories(SYSTEM ${XNNPACK_INCLUDE_DIR}) list(APPEND xnnpack_third_party XNNPACK) +install(TARGETS microkernels-prod + LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} + ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} + PUBLIC_HEADER DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) + + +if(EXECUTORCH_XNNPACK_ENABLE_KLEIDI) + install(TARGETS kleidiai + LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} + ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} + PUBLIC_HEADER DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) +endif() # Revert PIC Flag to what it originally was set(CMAKE_POSITION_INDEPENDENT_CODE diff --git a/backends/xnnpack/operators/node_visitor.py b/backends/xnnpack/operators/node_visitor.py index 0b0eb7912aa..de48748f8f4 100644 --- a/backends/xnnpack/operators/node_visitor.py +++ b/backends/xnnpack/operators/node_visitor.py @@ -11,12 +11,12 @@ import torch from executorch.backends.transforms import get_shape -from executorch.backends.xnnpack.operators.quant_params import QuantParams - -from executorch.backends.xnnpack.passes.channels_last_tagged_reshape_pass import ( +from 
executorch.backends.xnnpack._passes.channels_last_tagged_reshape_pass import ( ChannelsLastTaggedReshapePass, ) +from executorch.backends.xnnpack.operators.quant_params import QuantParams + from executorch.backends.xnnpack.serialization.xnnpack_graph_schema import ( ConstantDataOffset, PerChannelGroupQuant, diff --git a/backends/xnnpack/operators/op_add.py b/backends/xnnpack/operators/op_add.py index b4a1dac0570..117e54e5cf0 100644 --- a/backends/xnnpack/operators/op_add.py +++ b/backends/xnnpack/operators/op_add.py @@ -7,12 +7,12 @@ from typing import Dict import torch +from executorch.backends.xnnpack._passes.fuse_activation_pass import FuseActivationPass from executorch.backends.xnnpack.operators.node_visitor import ( NodeVisitor, register_node_visitor, ) from executorch.backends.xnnpack.operators.quant_params import QuantParams -from executorch.backends.xnnpack.passes.fuse_activation_pass import FuseActivationPass from executorch.backends.xnnpack.serialization.xnnpack_graph_schema import ( XNNAdd, XNNGraph, diff --git a/backends/xnnpack/operators/op_conv2d.py b/backends/xnnpack/operators/op_conv2d.py index 28da4805749..62c30c010a1 100644 --- a/backends/xnnpack/operators/op_conv2d.py +++ b/backends/xnnpack/operators/op_conv2d.py @@ -8,12 +8,12 @@ import torch from executorch.backends.transforms import get_shape +from executorch.backends.xnnpack._passes.fuse_activation_pass import FuseActivationPass from executorch.backends.xnnpack.operators.node_visitor import ( NodeVisitor, register_node_visitor, ) from executorch.backends.xnnpack.operators.quant_params import QuantParams -from executorch.backends.xnnpack.passes.fuse_activation_pass import FuseActivationPass from executorch.backends.xnnpack.serialization.xnnpack_graph_schema import ( XNNConv2d, XNNDepthwiseConv2d, diff --git a/backends/xnnpack/operators/op_dequantize_per_tensor.py b/backends/xnnpack/operators/op_dequantize_per_tensor.py index e50498318d7..cea76b31057 100644 --- a/backends/xnnpack/operators/op_dequantize_per_tensor.py +++ b/backends/xnnpack/operators/op_dequantize_per_tensor.py @@ -7,12 +7,14 @@ from typing import Dict import torch +from executorch.backends.xnnpack._passes.tag_implicit_q_dq_pass import ( + TagImplicitQDqPass, +) from executorch.backends.xnnpack.operators.node_visitor import ( NodeVisitor, register_node_visitor, ) from executorch.backends.xnnpack.operators.quant_params import QuantParams -from executorch.backends.xnnpack.passes.tag_implicit_q_dq_pass import TagImplicitQDqPass from executorch.backends.xnnpack.serialization.xnnpack_graph_schema import ( XNNConvert, XNNGraph, diff --git a/backends/xnnpack/operators/op_linear.py b/backends/xnnpack/operators/op_linear.py index 7fb0de8228d..560f7d1a516 100644 --- a/backends/xnnpack/operators/op_linear.py +++ b/backends/xnnpack/operators/op_linear.py @@ -7,13 +7,13 @@ from typing import Dict import torch +from executorch.backends.xnnpack._passes.fuse_activation_pass import FuseActivationPass from executorch.backends.xnnpack.operators.node_visitor import ( get_input_node, NodeVisitor, register_node_visitor, ) from executorch.backends.xnnpack.operators.quant_params import QuantParams -from executorch.backends.xnnpack.passes.fuse_activation_pass import FuseActivationPass from executorch.backends.xnnpack.serialization.xnnpack_graph_schema import ( XNNFullyConnected, XNNGraph, diff --git a/backends/xnnpack/operators/op_permute.py b/backends/xnnpack/operators/op_permute.py index 0d2de5b1f17..4d62d457cd0 100644 --- a/backends/xnnpack/operators/op_permute.py +++ 
b/backends/xnnpack/operators/op_permute.py @@ -7,13 +7,13 @@ from typing import cast, Dict, List import torch +from executorch.backends.xnnpack._passes.channels_last_tagged_reshape_pass import ( + ChannelsLastTaggedReshapePass, +) from executorch.backends.xnnpack.operators.node_visitor import ( NodeVisitor, register_node_visitor, ) -from executorch.backends.xnnpack.passes.channels_last_tagged_reshape_pass import ( - ChannelsLastTaggedReshapePass, -) from executorch.backends.xnnpack.serialization.xnnpack_graph_schema import ( XNNGraph, XNNStaticTranspose, diff --git a/backends/xnnpack/operators/op_quantize_per_tensor.py b/backends/xnnpack/operators/op_quantize_per_tensor.py index 7b6845d9963..da15559410e 100644 --- a/backends/xnnpack/operators/op_quantize_per_tensor.py +++ b/backends/xnnpack/operators/op_quantize_per_tensor.py @@ -7,12 +7,14 @@ from typing import Dict import torch +from executorch.backends.xnnpack._passes.tag_implicit_q_dq_pass import ( + TagImplicitQDqPass, +) from executorch.backends.xnnpack.operators.node_visitor import ( NodeVisitor, register_node_visitor, ) from executorch.backends.xnnpack.operators.quant_params import QuantParams -from executorch.backends.xnnpack.passes.tag_implicit_q_dq_pass import TagImplicitQDqPass from executorch.backends.xnnpack.serialization.xnnpack_graph_schema import ( XNNConvert, XNNGraph, diff --git a/backends/xnnpack/operators/op_sub.py b/backends/xnnpack/operators/op_sub.py index 354c16fc3c8..7f8a6cc9623 100644 --- a/backends/xnnpack/operators/op_sub.py +++ b/backends/xnnpack/operators/op_sub.py @@ -7,12 +7,12 @@ from typing import Dict import torch +from executorch.backends.xnnpack._passes.fuse_activation_pass import FuseActivationPass from executorch.backends.xnnpack.operators.node_visitor import ( NodeVisitor, register_node_visitor, ) from executorch.backends.xnnpack.operators.quant_params import QuantParams -from executorch.backends.xnnpack.passes.fuse_activation_pass import FuseActivationPass from executorch.backends.xnnpack.serialization.xnnpack_graph_schema import ( XNNGraph, XNNSubtract, diff --git a/backends/xnnpack/operators/quant_params.py b/backends/xnnpack/operators/quant_params.py index 44908ac7fca..a2d26772ecc 100644 --- a/backends/xnnpack/operators/quant_params.py +++ b/backends/xnnpack/operators/quant_params.py @@ -9,7 +9,9 @@ from typing import cast, Optional, Union import torch -from executorch.backends.xnnpack.passes.tag_implicit_q_dq_pass import TagImplicitQDqPass +from executorch.backends.xnnpack._passes.tag_implicit_q_dq_pass import ( + TagImplicitQDqPass, +) from executorch.backends.xnnpack.utils.quant_utils import ( extract_qdq_affine_op_args_for_decomposed_ops, is_affine_qdq, diff --git a/backends/xnnpack/partition/config/gemm_configs.py b/backends/xnnpack/partition/config/gemm_configs.py index 54c07ad5abc..cbcb14899d4 100644 --- a/backends/xnnpack/partition/config/gemm_configs.py +++ b/backends/xnnpack/partition/config/gemm_configs.py @@ -52,8 +52,8 @@ class GEMMConfig(XNNPartitionerConfig): different ops """ - def __init__(self, weight_idx, bias_idx, act_idx, fused_acts): - super().__init__() + def __init__(self, weight_idx, bias_idx, act_idx, fused_acts, **kwargs): + super().__init__(**kwargs) self.weight_idx = weight_idx self.bias_idx = bias_idx self.act_idx = act_idx @@ -250,17 +250,28 @@ def _get_act_deps( class LinearConfig(GEMMConfig): target_name = "linear.default" - def __init__(self): + def __init__(self, **kwargs): super().__init__( weight_idx=1, bias_idx=2, act_idx=0, fused_acts=["relu.default", 
"hardtanh.default"], + **kwargs, ) def get_original_aten(self) -> Optional[torch._ops.OpOverload]: return torch.ops.aten.linear.default + def _get_weight_deps( + self, node: torch.fx.Node, ep: ExportedProgram, precision: ConfigPrecisionType + ) -> Tuple[bool, List[torch.fx.Node]]: + if precision == ConfigPrecisionType.FP32 and self.force_fp32_dynamic_linear: + # if force fp32_dynamic_linear is on and we detected this as fp32, then we + # do not partition the weight node + return (True, []) + + return super()._get_weight_deps(node, ep, precision) + def supported_precision_types(self): return [ ConfigPrecisionType.DYNAMIC_QUANT, @@ -272,12 +283,13 @@ def supported_precision_types(self): class ConvolutionConfig(GEMMConfig): target_name = "convolution.default" - def __init__(self): + def __init__(self, **kwargs): super().__init__( weight_idx=1, bias_idx=2, act_idx=0, fused_acts=["relu.default", "hardtanh.default"], + **kwargs, ) def check_constraints(self, node: torch.fx.Node, ep: ExportedProgram) -> bool: @@ -314,12 +326,13 @@ class AddmmConfig(GEMMConfig): target_name = "addmm.default" - def __init__(self): + def __init__(self, **kwargs): super().__init__( weight_idx=2, bias_idx=0, act_idx=1, fused_acts=["relu.default", "hardtanh.default"], + **kwargs, ) self.src_partitions = None self.linear_modules = [torch.nn.functional.linear, torch.nn.Linear] @@ -417,8 +430,8 @@ def supported_precision_types(self): class MMConfig(AddmmConfig): target_name = "mm.default" - def __init__(self): - super().__init__() + def __init__(self, **kwargs): + super().__init__(**kwargs) self.bias_idx = None self.weight_idx = 1 self.act_idx = 0 diff --git a/backends/xnnpack/partition/config/generic_node_configs.py b/backends/xnnpack/partition/config/generic_node_configs.py index 69defae0213..b95d7c5b89c 100644 --- a/backends/xnnpack/partition/config/generic_node_configs.py +++ b/backends/xnnpack/partition/config/generic_node_configs.py @@ -25,13 +25,13 @@ class GenericNodePartitionerConfig(XNNPartitionerConfig): - def __init__(self, fused_act: Optional[List[str]] = None): + def __init__(self, fused_act: Optional[List[str]] = None, **kwargs): """ fused_act is a list of node target names that can be fused with this node under quantization """ self.fused_acts = fused_act or [] - super().__init__() + super().__init__(**kwargs) def check_constraints(self, node: torch.fx.Node, ep: ExportedProgram) -> bool: return self.check_common_constraints(node, ep) @@ -98,8 +98,8 @@ def supported_precision_types(self) -> List[ConfigPrecisionType]: class AddConfig(GenericNodePartitionerConfig): target_name = "add.Tensor" - def __init__(self): - super().__init__(fused_act=["relu.default"]) + def __init__(self, **kwargs): + super().__init__(fused_act=["relu.default"], **kwargs) def supported_precision_types(self) -> List[ConfigPrecisionType]: return [ConfigPrecisionType.FP32, ConfigPrecisionType.STATIC_QUANT] diff --git a/backends/xnnpack/partition/config/node_configs.py b/backends/xnnpack/partition/config/node_configs.py index 2449d9d6440..23acfbfb8c4 100644 --- a/backends/xnnpack/partition/config/node_configs.py +++ b/backends/xnnpack/partition/config/node_configs.py @@ -9,13 +9,13 @@ from typing import List, Optional import torch +from executorch.backends.xnnpack._passes.fuse_batch_norm_with_conv import ( + FuseBatchNormWithConvPass, +) from executorch.backends.xnnpack.partition.config.xnnpack_config import ( ConfigPrecisionType, XNNPartitionerConfig, ) -from executorch.backends.xnnpack.passes.fuse_batch_norm_with_conv import ( - 
FuseBatchNormWithConvPass, -) from executorch.backends.xnnpack.utils.utils import is_param_node from executorch.exir.backend.canonical_partitioners.config_partitioner import ( format_target_name, diff --git a/backends/xnnpack/partition/config/xnnpack_config.py b/backends/xnnpack/partition/config/xnnpack_config.py index f39a651e198..d261416a76f 100644 --- a/backends/xnnpack/partition/config/xnnpack_config.py +++ b/backends/xnnpack/partition/config/xnnpack_config.py @@ -37,9 +37,11 @@ class XNNPartitionerConfig(PartitionerConfig): types they want to enable """ - def __init__(self): + def __init__(self, **kwargs): super().__init__() self.enabled_precision_types = self.supported_precision_types() + # Flag used in GEMMConfig() + self.force_fp32_dynamic_linear = kwargs.get("force_fp32_dynamic_linear", False) def get_partition( self, node: torch.fx.Node, ep: ExportedProgram diff --git a/backends/xnnpack/partition/xnnpack_partitioner.py b/backends/xnnpack/partition/xnnpack_partitioner.py index 9afbefebce3..700c7d1b753 100644 --- a/backends/xnnpack/partition/xnnpack_partitioner.py +++ b/backends/xnnpack/partition/xnnpack_partitioner.py @@ -36,6 +36,7 @@ def __init__( ] = None, per_op_mode=False, verbose: bool = False, + **kwargs, ): """ @verbose: if True, print out more information about the partitioner. @@ -55,7 +56,7 @@ def __init__( for config in configs_to_use: # Config Classes given to XnnpackPartitioner should no longer be abstract - initialized = config() # pyre-ignore + initialized = config(**kwargs) # pyre-ignore initialized.set_enabled_precision_types(config_precisions) initialized_configs.append(initialized) diff --git a/backends/xnnpack/runtime/XNNCompiler.cpp b/backends/xnnpack/runtime/XNNCompiler.cpp index 2145ea15199..1080da0beae 100644 --- a/backends/xnnpack/runtime/XNNCompiler.cpp +++ b/backends/xnnpack/runtime/XNNCompiler.cpp @@ -16,11 +16,15 @@ #pragma clang diagnostic ignored "-Wmissing-prototypes" #pragma clang diagnostic ignored "-Wglobal-constructors" -namespace torch { -namespace executor { +namespace executorch { +namespace backends { namespace xnnpack { namespace delegate { +using executorch::runtime::Error; +using executorch::runtime::MemoryAllocator; +using executorch::runtime::Result; + /* * Provide compile-time allocation. 
*/ @@ -630,7 +634,14 @@ Error defineConvertNode( subgraph_ptr, remapped_ids.at(graph_node->input_id()), remapped_ids.at(graph_node->output_id()), +#ifdef ENABLE_XNNPACK_KLEIDI + // This maps to XNNPACK's XNN_FLAG_MAYBE_PACK_FOR_QB4W_GEMM + // however this is not currently exposed at top level + // xnnpack.h Header + 0x00000100); +#else graph_node->flags()); +#endif ET_CHECK_OR_RETURN_ERROR( status == xnn_status_success, @@ -1776,14 +1787,14 @@ ET_NODISCARD Error XNNCompiler::compileModel( subgraph.get(), /*weight_cache=*/nullptr, // TODO - support weight cache workspace, - torch::executorch::threadpool::get_pthreadpool(), + ::executorch::extension::threadpool::get_pthreadpool(), runtime_flags, &runtime_ptr); #else status = xnn_create_runtime_v3( subgraph.get(), /*weight_cache=*/nullptr, // TODO - support weight cache - torch::executorch::threadpool::get_pthreadpool(), + ::executorch::extension::threadpool::get_pthreadpool(), runtime_flags, &runtime_ptr); #endif @@ -1804,5 +1815,5 @@ ET_NODISCARD Error XNNCompiler::compileModel( } // namespace delegate } // namespace xnnpack -} // namespace executor -} // namespace torch +} // namespace backends +} // namespace executorch diff --git a/backends/xnnpack/runtime/XNNCompiler.h b/backends/xnnpack/runtime/XNNCompiler.h index 94deda52635..e66cb791ecb 100644 --- a/backends/xnnpack/runtime/XNNCompiler.h +++ b/backends/xnnpack/runtime/XNNCompiler.h @@ -15,8 +15,8 @@ #include #include -namespace torch { -namespace executor { +namespace executorch { +namespace backends { namespace xnnpack { namespace delegate { @@ -25,15 +25,15 @@ class XNNCompiler { // Takes Flatbuffer Serialized XNNPACK Model and rebuilds the xnn-subgraph // returns an executor object that holds the xnn runtime object which we // can then use to set inputs and run inference using the xnn graph. 
- ET_NODISCARD static Error compileModel( + ET_NODISCARD static executorch::runtime::Error compileModel( const void* buffer_pointer, size_t num_bytes, XNNExecutor* executor, - MemoryAllocator* runtime_allocator, + executorch::runtime::MemoryAllocator* runtime_allocator, xnn_workspace_t workspace); }; } // namespace delegate } // namespace xnnpack -} // namespace executor -} // namespace torch +} // namespace backends +} // namespace executorch diff --git a/backends/xnnpack/runtime/XNNExecutor.cpp b/backends/xnnpack/runtime/XNNExecutor.cpp index 2ca1c7d4b2f..1ba549bb8d7 100644 --- a/backends/xnnpack/runtime/XNNExecutor.cpp +++ b/backends/xnnpack/runtime/XNNExecutor.cpp @@ -8,14 +8,19 @@ #include -namespace torch { -namespace executor { +namespace executorch { +namespace backends { namespace xnnpack { namespace delegate { -using Tensor = exec_aten::Tensor; -using ScalarType = exec_aten::ScalarType; -using SizesType = exec_aten::SizesType; +using executorch::aten::ScalarType; +using executorch::aten::SizesType; +using executorch::aten::Tensor; +using executorch::runtime::BackendExecutionContext; +using executorch::runtime::Error; +using executorch::runtime::EValue; +using executorch::runtime::is_contiguous_dim_order; +using executorch::runtime::kTensorDimensionLimit; /** * Initializes the XNNExecutor with the runtime and given number of @@ -204,7 +209,7 @@ ET_NODISCARD Error XNNExecutor::resize_outputs(EValue** args) const { expected_output_size[d] = static_cast(dims[d]); } - exec_aten::ArrayRef output_size{ + executorch::aten::ArrayRef output_size{ expected_output_size, static_cast(num_dim)}; ET_LOG(Debug, "Resizing output tensor to a new shape"); @@ -231,5 +236,5 @@ ET_NODISCARD Error XNNExecutor::resize_outputs(EValue** args) const { } // namespace delegate } // namespace xnnpack -} // namespace executor -} // namespace torch +} // namespace backends +} // namespace executorch diff --git a/backends/xnnpack/runtime/XNNExecutor.h b/backends/xnnpack/runtime/XNNExecutor.h index c35307cb912..68ee18609e3 100644 --- a/backends/xnnpack/runtime/XNNExecutor.h +++ b/backends/xnnpack/runtime/XNNExecutor.h @@ -19,8 +19,8 @@ #include #include -namespace torch { -namespace executor { +namespace executorch { +namespace backends { namespace xnnpack { namespace delegate { @@ -51,7 +51,7 @@ class XNNExecutor { * The input/output ids are expected to be sorted in order of their * flatbuffer id_outs */ - ET_NODISCARD Error initialize( + ET_NODISCARD executorch::runtime::Error initialize( xnn_runtime_t runtime, std::vector&& input_ids, std::vector&& output_ids); @@ -62,24 +62,27 @@ class XNNExecutor { * input shapes will be propagated through the runtime, and perform * any additional memory planning as needed */ - ET_NODISCARD Error prepare_args(EValue** args); + ET_NODISCARD executorch::runtime::Error prepare_args( + executorch::runtime::EValue** args); /** * Executes the graph using the args prepared at prepare_args(). 
*/ - ET_NODISCARD Error forward(BackendExecutionContext& context); + ET_NODISCARD executorch::runtime::Error forward( + executorch::runtime::BackendExecutionContext& context); /** * Prepares the outputs to be returned by the delegate * * Performs any post processing of outputs like tensor resizing */ - ET_NODISCARD Error resize_outputs(EValue** args) const; + ET_NODISCARD executorch::runtime::Error resize_outputs( + executorch::runtime::EValue** args) const; friend class XNNCompiler; }; } // namespace delegate } // namespace xnnpack -} // namespace executor -} // namespace torch +} // namespace backends +} // namespace executorch diff --git a/backends/xnnpack/runtime/XNNHeader.cpp b/backends/xnnpack/runtime/XNNHeader.cpp index 5904792ee43..9397948c55d 100644 --- a/backends/xnnpack/runtime/XNNHeader.cpp +++ b/backends/xnnpack/runtime/XNNHeader.cpp @@ -15,11 +15,14 @@ #pragma clang diagnostic ignored "-Wdeprecated" -namespace torch { -namespace executor { +namespace executorch { +namespace backends { namespace xnnpack { namespace delegate { +using executorch::runtime::Error; +using executorch::runtime::Result; + namespace { /// Interprets the 8 bytes at `data` as a little-endian uint64_t. uint64_t GetUInt64LE(const uint8_t* data) { @@ -73,5 +76,5 @@ constexpr char XNNHeader::kMagic[kMagicSize]; } // namespace delegate } // namespace xnnpack -} // namespace executor -} // namespace torch +} // namespace backends +} // namespace executorch diff --git a/backends/xnnpack/runtime/XNNHeader.h b/backends/xnnpack/runtime/XNNHeader.h index 66922a95f8a..f835a6f6428 100644 --- a/backends/xnnpack/runtime/XNNHeader.h +++ b/backends/xnnpack/runtime/XNNHeader.h @@ -10,8 +10,8 @@ #include -namespace torch { -namespace executor { +namespace executorch { +namespace backends { namespace xnnpack { namespace delegate { @@ -98,7 +98,9 @@ struct XNNHeader { * error if size was too short, if the header was not found, or if the * header appeared to be corrupt. */ - static Result Parse(const void* data, size_t size); + static executorch::runtime::Result Parse( + const void* data, + size_t size); /** * The offset in bytes to the beginning of the flatbuffer data. 
@@ -121,5 +123,5 @@ struct XNNHeader { } // namespace delegate } // namespace xnnpack -} // namespace executor -} // namespace torch +} // namespace backends +} // namespace executorch diff --git a/backends/xnnpack/runtime/XNNPACKBackend.cpp b/backends/xnnpack/runtime/XNNPACKBackend.cpp index c817c010e29..b4367e40c4c 100644 --- a/backends/xnnpack/runtime/XNNPACKBackend.cpp +++ b/backends/xnnpack/runtime/XNNPACKBackend.cpp @@ -17,8 +17,19 @@ #pragma clang diagnostic ignored "-Wglobal-constructors" -namespace torch { -namespace executor { +namespace executorch { +namespace backends { + +using executorch::runtime::ArrayRef; +using executorch::runtime::Backend; +using executorch::runtime::BackendExecutionContext; +using executorch::runtime::BackendInitContext; +using executorch::runtime::CompileSpec; +using executorch::runtime::DelegateHandle; +using executorch::runtime::Error; +using executorch::runtime::EValue; +using executorch::runtime::FreeableBuffer; +using executorch::runtime::Result; class XnnpackBackend final : public ::executorch::runtime::BackendInterface { public: @@ -145,5 +156,5 @@ Backend backend{"XnnpackBackend", &cls}; static auto success_with_compiler = register_backend(backend); } // namespace -} // namespace executor -} // namespace torch +} // namespace backends +} // namespace executorch diff --git a/backends/xnnpack/runtime/XNNStatus.h b/backends/xnnpack/runtime/XNNStatus.h index 7feaa2f89af..d6d9f2274e3 100644 --- a/backends/xnnpack/runtime/XNNStatus.h +++ b/backends/xnnpack/runtime/XNNStatus.h @@ -11,8 +11,8 @@ #include #include -namespace torch { -namespace executor { +namespace executorch { +namespace backends { namespace xnnpack { namespace delegate { @@ -34,5 +34,5 @@ inline const char* xnn_status_to_string(enum xnn_status type) { } // namespace delegate } // namespace xnnpack -} // namespace executor -} // namespace torch +} // namespace backends +} // namespace executorch diff --git a/backends/xnnpack/runtime/profiling/XNNProfiler.cpp b/backends/xnnpack/runtime/profiling/XNNProfiler.cpp index 1bde8a37e50..72614083c74 100644 --- a/backends/xnnpack/runtime/profiling/XNNProfiler.cpp +++ b/backends/xnnpack/runtime/profiling/XNNProfiler.cpp @@ -22,9 +22,13 @@ #include // NOLINTEND -namespace torch::executor::xnnpack::delegate::profiling { +namespace executorch::backends::xnnpack::delegate::profiling { + +using executorch::runtime::Error; +using executorch::runtime::EventTracer; #if defined(ET_EVENT_TRACER_ENABLED) || defined(ENABLE_XNNPACK_PROFILING) + XNNProfiler::XNNProfiler() : state_(XNNProfilerState::Uninitialized), run_count_(0) {} @@ -210,10 +214,10 @@ void XNNProfiler::submit_trace() { auto end_time = time + interval_ticks; - torch::executor::event_tracer_log_profiling_delegate( + executorch::runtime::event_tracer_log_profiling_delegate( event_tracer_, name_formatted.c_str(), - /*delegate_debug_id=*/static_cast(-1), + /*delegate_debug_id=*/static_cast(-1), time, end_time); @@ -246,4 +250,4 @@ Error XNNProfiler::end() { #endif -} // namespace torch::executor::xnnpack::delegate::profiling +} // namespace executorch::backends::xnnpack::delegate::profiling diff --git a/backends/xnnpack/runtime/profiling/XNNProfiler.h b/backends/xnnpack/runtime/profiling/XNNProfiler.h index 29e3633197c..2eaec0ad115 100644 --- a/backends/xnnpack/runtime/profiling/XNNProfiler.h +++ b/backends/xnnpack/runtime/profiling/XNNProfiler.h @@ -14,8 +14,8 @@ #include #include -namespace torch { -namespace executor { +namespace executorch { +namespace backends { namespace xnnpack { namespace 
delegate { namespace profiling { @@ -30,24 +30,25 @@ class XNNProfiler { * Initialize the profiler. This must be called after model is * compiled and before calling begin_execution. */ - Error initialize(xnn_runtime_t runtime); + executorch::runtime::Error initialize(xnn_runtime_t runtime); /** * Start a new profiling session. This is typically invoked * immediately before invoking the XNNPACK runtime as part * of a forward pass. */ - Error start(EventTracer* event_tracer); + executorch::runtime::Error start( + executorch::runtime::EventTracer* event_tracer); /** * End a profiling session. This is typically invoked immediately * after the XNNPACK runtime invocation completes. */ - Error end(); + executorch::runtime::Error end(); private: #if defined(ET_EVENT_TRACER_ENABLED) || defined(ENABLE_XNNPACK_PROFILING) - EventTracer* event_tracer_; + executorch::runtime::EventTracer* event_tracer_; xnn_runtime_t runtime_; XNNProfilerState state_; @@ -64,9 +65,9 @@ class XNNProfiler { std::vector op_timings_sum_; #endif - Error get_runtime_operator_names(); - Error get_runtime_num_operators(); - Error get_runtime_operator_timings(); + executorch::runtime::Error get_runtime_operator_names(); + executorch::runtime::Error get_runtime_num_operators(); + executorch::runtime::Error get_runtime_operator_timings(); void log_operator_timings(); @@ -80,5 +81,5 @@ class XNNProfiler { } // namespace profiling } // namespace delegate } // namespace xnnpack -} // namespace executor -} // namespace torch +} // namespace backends +} // namespace executorch diff --git a/backends/xnnpack/runtime/utils/utils.cpp b/backends/xnnpack/runtime/utils/utils.cpp index 1b88601c3f7..bbcb8bc071c 100644 --- a/backends/xnnpack/runtime/utils/utils.cpp +++ b/backends/xnnpack/runtime/utils/utils.cpp @@ -10,12 +10,14 @@ #include #include -namespace torch { -namespace executor { -namespace qnnpack_utils { +namespace executorch { +namespace backends { +namespace xnnpack { +namespace utils { -using Tensor = exec_aten::Tensor; -using ScalarType = exec_aten::ScalarType; +using executorch::aten::ScalarType; +using executorch::aten::Tensor; +using executorch::runtime::Error; constexpr float SMALL_SCALE_THRESHOLD = 6.1e-5f; @@ -222,6 +224,8 @@ void quantize_tensor_arm64_q8_wrapper( quantize_tensor_arm64_q8(in, out, N, scale, zero_point); } #endif -} // namespace qnnpack_utils -} // namespace executor -} // namespace torch + +} // namespace utils +} // namespace xnnpack +} // namespace backends +} // namespace executorch diff --git a/backends/xnnpack/runtime/utils/utils.h b/backends/xnnpack/runtime/utils/utils.h index c341b4c2d7d..2eb079f0b0c 100644 --- a/backends/xnnpack/runtime/utils/utils.h +++ b/backends/xnnpack/runtime/utils/utils.h @@ -19,16 +19,17 @@ #include #endif -namespace torch { -namespace executor { -namespace qnnpack_utils { +namespace executorch { +namespace backends { +namespace xnnpack { +namespace utils { struct QuantizationParams { double scale; int32_t zero_point; }; -Error ChooseQuantizationParams( +executorch::runtime::Error ChooseQuantizationParams( float min, float max, int32_t qmin, @@ -125,9 +126,9 @@ void quantize_tensor_arm64_q8_wrapper( #endif /* __aarch64__ */ template -Error QuantizePerTensor( - const exec_aten::Tensor& rtensor, - exec_aten::Tensor& qtensor, +executorch::runtime::Error QuantizePerTensor( + const executorch::aten::Tensor& rtensor, + executorch::aten::Tensor& qtensor, double scale, int zero_point) { const float* rdata = rtensor.const_data_ptr(); @@ -151,17 +152,18 @@ Error QuantizePerTensor( 
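The helpers being moved into the new utils namespace here (ChooseQuantizationParams, QuantizePerTensor, GetMinMax) implement ordinary per-tensor affine quantization. Below is a rough worked sketch of that arithmetic, assuming the usual min/max-based parameter selection (the Python helpers are illustrative only, not the C++ API):

def choose_quantization_params(min_val, max_val, qmin=0, qmax=255):
    # Map the observed float range [min_val, max_val] onto [qmin, qmax].
    scale = (max_val - min_val) / (qmax - qmin)
    zero_point = int(round(qmin - min_val / scale))
    return scale, min(max(zero_point, qmin), qmax)


def quantize(x, scale, zero_point, qmin=0, qmax=255):
    # Affine quantization of a single value, clamped to the quantized range.
    q = int(round(x / scale)) + zero_point
    return min(max(q, qmin), qmax)


# Same inputs as the ChooseQuantizationParams test further down:
# min = -128.0 * 10.0, max = +127.0 * 10.0 over the uint8 range [0, 255].
scale, zero_point = choose_quantization_params(-128.0 * 10.0, 127.0 * 10.0)
assert (scale, zero_point) == (10.0, 128)
assert quantize(42.0, scale, zero_point) == 132  # round(4.2) + 128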
qdata[i] = quantize_val(scale, zero_point, rdata[i]); } #endif /* __aarch64__ */ - return Error::Ok; + return executorch::runtime::Error::Ok; } -Error GenerateRequantizationScale( - const exec_aten::Tensor& weight_scales, +executorch::runtime::Error GenerateRequantizationScale( + const executorch::aten::Tensor& weight_scales, float input_scale, float output_scale, std::vector& requant_scales); -std::pair GetMinMax(const exec_aten::Tensor& ft); +std::pair GetMinMax(const executorch::aten::Tensor& ft); -} // namespace qnnpack_utils -} // namespace executor -} // namespace torch +} // namespace utils +} // namespace xnnpack +} // namespace backends +} // namespace executorch diff --git a/backends/xnnpack/targets.bzl b/backends/xnnpack/targets.bzl index dc8cd5917b3..1d959048032 100644 --- a/backends/xnnpack/targets.bzl +++ b/backends/xnnpack/targets.bzl @@ -1,6 +1,17 @@ load("@fbsource//xplat/executorch/backends/xnnpack/third-party:third_party_libs.bzl", "third_party_dep") load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") +def _get_preprocessor_flags(): + """ + Disable workspace sharing if the config option explicitly turns it off; + enable it otherwise. + """ + if native.read_config("executorch", "xnnpack_workspace_sharing", "1") == "0": + return [] + + # Enable if not disabled through config + return ["-DENABLE_XNNPACK_SHARED_WORKSPACE"] + def define_common_targets(): runtime.cxx_library( name = "dynamic_quant_utils", @@ -38,9 +49,9 @@ def define_common_targets(): preprocessor_flags = [ # Uncomment to enable per operator timings # "-DENABLE_XNNPACK_PROFILING", - # Uncomment to enable workspace sharing across delegates - # "-DENABLE_XNNPACK_SHARED_WORKSPACE" - ], + # Uncomment to enable KleidiAI kernels + # "-DENABLE_XNNPACK_KLEIDI" + ] + _get_preprocessor_flags(), exported_deps = [ "//executorch/runtime/backend:interface", ], diff --git a/backends/xnnpack/test/CMakeLists.txt b/backends/xnnpack/test/CMakeLists.txt index 02852871fe0..077b9635408 100644 --- a/backends/xnnpack/test/CMakeLists.txt +++ b/backends/xnnpack/test/CMakeLists.txt @@ -39,6 +39,7 @@ et_cxx_test( XNNPACK pthreadpool cpuinfo + microkernels-prod ) target_include_directories( backends_xnnpack_test diff --git a/backends/xnnpack/test/TARGETS b/backends/xnnpack/test/TARGETS index 629ac8275bc..11209e41bac 100644 --- a/backends/xnnpack/test/TARGETS +++ b/backends/xnnpack/test/TARGETS @@ -13,7 +13,7 @@ runtime.python_test( "test_xnnpack_utils_classes.py", ], deps = [ - "//executorch/backends/xnnpack/passes:xnnpack_passes", + "//executorch/backends/xnnpack/_passes:xnnpack_passes", "//executorch/backends/xnnpack/test/tester:tester", "//executorch/backends/xnnpack/utils:xnnpack_utils", "//executorch/exir:lib", diff --git a/backends/xnnpack/test/ops/lstm.py b/backends/xnnpack/test/ops/lstm.py new file mode 100644 index 00000000000..bfc6113c417 --- /dev/null +++ b/backends/xnnpack/test/ops/lstm.py @@ -0,0 +1,63 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree.
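The new test file that begins here (continued below) exercises the force_fp32_dynamic_linear flag that the partition-config changes above thread through XnnpackPartitioner via **kwargs. A minimal sketch of using the flag directly, assuming the usual torch.export plus to_edge_transform_and_lower flow; TinyLinear and the tensor sizes are made up for illustration:

import torch
from executorch.backends.xnnpack.partition.xnnpack_partitioner import XnnpackPartitioner
from executorch.exir import to_edge_transform_and_lower


class TinyLinear(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.linear = torch.nn.Linear(32, 10)

    def forward(self, x):
        return self.linear(x)


# Export eagerly, then lower with the flag set; fp32 linear weights are then
# left out of the delegate payload and supplied to the delegate at runtime
# instead, which is what the LSTM test below checks for.
exported = torch.export.export(TinyLinear().eval(), (torch.rand(1, 32),))
lowered = to_edge_transform_and_lower(
    exported,
    partitioner=[XnnpackPartitioner(force_fp32_dynamic_linear=True)],
).to_executorch()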
+ +import unittest + +import torch +from executorch.backends.xnnpack.partition.xnnpack_partitioner import XnnpackPartitioner + +from executorch.backends.xnnpack.test.tester import Tester +from executorch.backends.xnnpack.test.tester.tester import ToEdgeTransformAndLower + + +class TestLSTM(unittest.TestCase): + class LSTMLinear(torch.nn.Module): + def __init__(self, input_size, hidden_size, out_size): + super().__init__() + self.lstm = torch.nn.LSTM( + input_size=input_size, hidden_size=hidden_size, batch_first=True + ) + self.linear = torch.nn.Linear(hidden_size, hidden_size) + self.linear2 = torch.nn.Linear(hidden_size, out_size) + + def forward(self, x): + x, hs = self.lstm(x) + x = self.linear(x[:, -1, :]) + x = self.linear2(x) + return torch.nn.functional.log_softmax(x, dim=1) + + def test_fp32_lstm(self): + ( + Tester(self.LSTMLinear(32, 32, 10), (torch.rand(1, 32, 32),)) + .export() + .to_edge_transform_and_lower() + .check_not(["executorch_exir_dialects_edge__ops_aten_addmm_default"]) + .check_not( + ["p_lstm_weight", "p_lstm_bias"] + ) # These Should be Consumed by Delegate + .to_executorch() + .serialize() + .run_method_and_compare_outputs() + ) + + def test_fp32_lstm_force_dynamic_linear(self): + ( + Tester(self.LSTMLinear(32, 32, 10), (torch.rand(1, 32, 32),)) + .export() + .to_edge_transform_and_lower( + ToEdgeTransformAndLower( + partitioners=[XnnpackPartitioner(force_fp32_dynamic_linear=True)] + ) + ) + .check_not(["executorch_exir_dialects_edge__ops_aten_addmm_default"]) + # Weights are supplied as input to linears + .check(["p_lstm_weight_hh_l0", "p_lstm_weight_ih_l0"]) + # Biases are owned by delegates + .check_not(["p_lstm_bias"]) + .to_executorch() + .serialize() + .run_method_and_compare_outputs() + ) diff --git a/backends/xnnpack/test/passes/test_activation_fusion.py b/backends/xnnpack/test/passes/test_activation_fusion.py index a9a5ead36f6..a7964a3181c 100644 --- a/backends/xnnpack/test/passes/test_activation_fusion.py +++ b/backends/xnnpack/test/passes/test_activation_fusion.py @@ -7,8 +7,8 @@ import unittest import torch -from executorch.backends.xnnpack.passes.convert_to_linear import ConvertToLinearPass -from executorch.backends.xnnpack.passes.fuse_activation_pass import FuseActivationPass +from executorch.backends.xnnpack._passes.convert_to_linear import ConvertToLinearPass +from executorch.backends.xnnpack._passes.fuse_activation_pass import FuseActivationPass from executorch.backends.xnnpack.test.tester import RunPasses, Tester from executorch.exir.dialects._ops import ops as exir_ops diff --git a/backends/xnnpack/test/passes/test_batch_norm_fusion.py b/backends/xnnpack/test/passes/test_batch_norm_fusion.py index 06517c526c8..98e9547c47a 100644 --- a/backends/xnnpack/test/passes/test_batch_norm_fusion.py +++ b/backends/xnnpack/test/passes/test_batch_norm_fusion.py @@ -8,7 +8,7 @@ from typing import Tuple import torch -from executorch.backends.xnnpack.passes.fuse_batch_norm_with_conv import ( +from executorch.backends.xnnpack._passes.fuse_batch_norm_with_conv import ( FuseBatchNormWithConvPass, ) from executorch.backends.xnnpack.test.tester import RunPasses, Tester diff --git a/backends/xnnpack/test/passes/test_channels_last_tagged_reshape.py b/backends/xnnpack/test/passes/test_channels_last_tagged_reshape.py index 36e566abc36..fe781972e34 100644 --- a/backends/xnnpack/test/passes/test_channels_last_tagged_reshape.py +++ b/backends/xnnpack/test/passes/test_channels_last_tagged_reshape.py @@ -7,7 +7,7 @@ import unittest import torch -from 
executorch.backends.xnnpack.passes.channels_last_tagged_reshape_pass import ( +from executorch.backends.xnnpack._passes.channels_last_tagged_reshape_pass import ( ChannelsLastTaggedReshapePass, ) from executorch.backends.xnnpack.test.test_xnnpack_utils_classes import ( diff --git a/backends/xnnpack/test/passes/test_convert_to_linear.py b/backends/xnnpack/test/passes/test_convert_to_linear.py index 0fa80246fd6..a07f8cf61ce 100644 --- a/backends/xnnpack/test/passes/test_convert_to_linear.py +++ b/backends/xnnpack/test/passes/test_convert_to_linear.py @@ -7,7 +7,7 @@ import unittest import torch -from executorch.backends.xnnpack.passes.convert_to_linear import ConvertToLinearPass +from executorch.backends.xnnpack._passes.convert_to_linear import ConvertToLinearPass from executorch.backends.xnnpack.test.tester import RunPasses, Tester diff --git a/backends/xnnpack/test/passes/test_remove_get_item_pass.py b/backends/xnnpack/test/passes/test_remove_get_item_pass.py index fa68c403e38..2365c9bba0c 100644 --- a/backends/xnnpack/test/passes/test_remove_get_item_pass.py +++ b/backends/xnnpack/test/passes/test_remove_get_item_pass.py @@ -7,7 +7,7 @@ import unittest import torch -from executorch.backends.xnnpack.passes.remove_getitem_op import RemoveGetItemPass +from executorch.backends.xnnpack._passes.remove_getitem_op import RemoveGetItemPass from executorch.backends.xnnpack.test.tester import RunPasses, Tester diff --git a/backends/xnnpack/test/passes/test_tag_implicit_q_dq_pass.py b/backends/xnnpack/test/passes/test_tag_implicit_q_dq_pass.py index dc67a6582df..05d1ac9e8b6 100644 --- a/backends/xnnpack/test/passes/test_tag_implicit_q_dq_pass.py +++ b/backends/xnnpack/test/passes/test_tag_implicit_q_dq_pass.py @@ -7,7 +7,9 @@ import unittest import torch -from executorch.backends.xnnpack.passes.tag_implicit_q_dq_pass import TagImplicitQDqPass +from executorch.backends.xnnpack._passes.tag_implicit_q_dq_pass import ( + TagImplicitQDqPass, +) from executorch.backends.xnnpack.test.tester import RunPasses, Tester from executorch.exir.backend.canonical_partitioners.duplicate_dequant_node_pass import ( DuplicateDequantNodePass, diff --git a/backends/xnnpack/test/runtime/test_runtime_utils.cpp b/backends/xnnpack/test/runtime/test_runtime_utils.cpp index c057b32ecea..7116a6f2fe5 100644 --- a/backends/xnnpack/test/runtime/test_runtime_utils.cpp +++ b/backends/xnnpack/test/runtime/test_runtime_utils.cpp @@ -8,26 +8,27 @@ #include +#include #include #include #include #include #include -#include "executorch/backends/xnnpack/runtime/utils/utils.h" using namespace ::testing; -using exec_aten::ScalarType; -using exec_aten::Tensor; -using torch::executor::Error; -using torch::executor::testing::TensorFactory; +using executorch::aten::ScalarType; +using executorch::aten::Tensor; +using executorch::runtime::Error; +using executorch::runtime::testing::TensorFactory; +namespace utils = executorch::backends::xnnpack::utils; TEST(TestUtils, choose_quantization_params) { Error e; - torch::executor::qnnpack_utils::QuantizationParams qparams; + utils::QuantizationParams qparams; float min = -128.0 * 10.0; float max = +127.0 * 10.0; - e = torch::executor::qnnpack_utils::ChooseQuantizationParams( + e = utils::ChooseQuantizationParams( min, max, 0, 255, qparams, false, false, false); ASSERT_EQ(e, Error::Ok); ASSERT_EQ(qparams.zero_point, 128); @@ -35,12 +36,12 @@ TEST(TestUtils, choose_quantization_params) { } TEST(TestUtils, choose_quantization_params_fails) { - torch::executor::runtime_init(); + 
executorch::runtime::runtime_init(); Error e; - torch::executor::qnnpack_utils::QuantizationParams qparams; + utils::QuantizationParams qparams; float min = -128.0 * 10.0; float max = +127.0 * 10.0; - e = torch::executor::qnnpack_utils::ChooseQuantizationParams( + e = utils::ChooseQuantizationParams( max, min, 0, 255, qparams, false, false, false); ASSERT_EQ(e, Error::Internal); } @@ -58,9 +59,8 @@ TEST(TestUtils, quantize_per_tensor) { at_tensor, scale, zero_point, at::ScalarType::QUInt8); Tensor expected = tfo.zeros_like(output); at_expected = at_expected.contiguous(); - torch::util::alias_etensor_to_attensor(at_expected, expected); - Error e = torch::executor::qnnpack_utils::QuantizePerTensor( - input, output, scale, zero_point); + executorch::extension::alias_etensor_to_attensor(at_expected, expected); + Error e = utils::QuantizePerTensor(input, output, scale, zero_point); ASSERT_EQ(e, Error::Ok); EXPECT_TENSOR_EQ(output, expected); } @@ -71,7 +71,7 @@ TEST(TestUtils, generate_requantizeation_scale) { float input_scale = 2.0; float output_scale = 3.0; std::vector req_scales(15, 0); - Error e = torch::executor::qnnpack_utils::GenerateRequantizationScale( + Error e = utils::GenerateRequantizationScale( weight_scales, input_scale, output_scale, req_scales); ASSERT_EQ(e, Error::Ok); for (auto m : req_scales) { @@ -85,14 +85,14 @@ TEST(TestUtils, get_min_max) { float val = 4.12345; const Tensor ft = tf.full({3, 5}, val); - std::tie(min, max) = torch::executor::qnnpack_utils::GetMinMax(ft); + std::tie(min, max) = utils::GetMinMax(ft); EXPECT_FLOAT_EQ(min, val); EXPECT_FLOAT_EQ(max, val); const Tensor ft_min = tf.make( {2, 1}, {std::numeric_limits::min(), std::numeric_limits::max()}); - std::tie(min, max) = torch::executor::qnnpack_utils::GetMinMax(ft_min); + std::tie(min, max) = utils::GetMinMax(ft_min); EXPECT_FLOAT_EQ(min, std::numeric_limits::min()); EXPECT_FLOAT_EQ(max, std::numeric_limits::max()); @@ -100,12 +100,12 @@ TEST(TestUtils, get_min_max) { {2, 1}, {std::numeric_limits::lowest(), std::numeric_limits::max()}); - std::tie(min, max) = torch::executor::qnnpack_utils::GetMinMax(ft_lowest); + std::tie(min, max) = utils::GetMinMax(ft_lowest); EXPECT_FLOAT_EQ(min, std::numeric_limits::lowest()); EXPECT_FLOAT_EQ(max, std::numeric_limits::max()); const Tensor ft_random = tf.make({5, 1}, {-2.2, -1.1, 0, 1.1, 2.2}); - std::tie(min, max) = torch::executor::qnnpack_utils::GetMinMax(ft_random); + std::tie(min, max) = utils::GetMinMax(ft_random); EXPECT_FLOAT_EQ(min, -2.2); EXPECT_FLOAT_EQ(max, 2.2); } diff --git a/backends/xnnpack/test/runtime/test_xnnexecutor.cpp b/backends/xnnpack/test/runtime/test_xnnexecutor.cpp index 2d785008036..a5a26004b49 100644 --- a/backends/xnnpack/test/runtime/test_xnnexecutor.cpp +++ b/backends/xnnpack/test/runtime/test_xnnexecutor.cpp @@ -9,12 +9,12 @@ #include #include #include -#include +#include -using torch::executor::Error; -using torch::executor::EValue; -using torch::executor::testing::TensorFactory; -using torch::executor::xnnpack::delegate::XNNExecutor; +using executorch::backends::xnnpack::delegate::XNNExecutor; +using executorch::runtime::Error; +using executorch::runtime::EValue; +using executorch::runtime::testing::TensorFactory; TEST(XNNExecutorTest, ArgumentWithTooManyDimensions) { XNNExecutor executor; @@ -26,7 +26,7 @@ TEST(XNNExecutorTest, ArgumentWithTooManyDimensions) { std::unique_ptr auto_subgraph( subgraph, xnn_delete_subgraph); - auto input_id = XNN_INVALID_NODE_ID; + auto input_id = XNN_INVALID_VALUE_ID; std::vector dims = { 1, }; @@ 
-43,9 +43,9 @@ TEST(XNNExecutorTest, ArgumentWithTooManyDimensions) { /*external_id=*/0, /*flags=*/XNN_VALUE_FLAG_EXTERNAL_INPUT, &input_id)); - ASSERT_NE(input_id, XNN_INVALID_NODE_ID); + ASSERT_NE(input_id, XNN_INVALID_VALUE_ID); - auto output_id = XNN_INVALID_NODE_ID; + auto output_id = XNN_INVALID_VALUE_ID; ASSERT_EQ( xnn_status_success, xnn_define_quantized_tensor_value( @@ -59,7 +59,7 @@ TEST(XNNExecutorTest, ArgumentWithTooManyDimensions) { /*external_id=*/0, /*flags=*/XNN_VALUE_FLAG_EXTERNAL_OUTPUT, &output_id)); - ASSERT_NE(output_id, XNN_INVALID_NODE_ID); + ASSERT_NE(output_id, XNN_INVALID_VALUE_ID); ASSERT_EQ( xnn_status_success, @@ -76,7 +76,7 @@ TEST(XNNExecutorTest, ArgumentWithTooManyDimensions) { 1, }), Error::Ok); - TensorFactory tf; + TensorFactory tf; auto input_tensor = tf.make({1, 1, 1, 1, 1, 1, 1, 1, 1}, {42}); ASSERT_EQ(input_tensor.dim(), 9); auto output_tensor = tf.make( diff --git a/backends/xnnpack/test/targets.bzl b/backends/xnnpack/test/targets.bzl index 2afec1c9378..30ce970a842 100644 --- a/backends/xnnpack/test/targets.bzl +++ b/backends/xnnpack/test/targets.bzl @@ -24,6 +24,7 @@ def define_common_targets(): srcs = ["runtime/test_xnnexecutor.cpp"], deps = [ third_party_dep("XNNPACK"), + third_party_dep("FP16"), "//executorch/runtime/core/exec_aten/testing_util:tensor_util", "//executorch/runtime/core/exec_aten/util:scalar_type_util", "//executorch/backends/xnnpack:xnnpack_backend", diff --git a/backends/xnnpack/test/tester/tester.py b/backends/xnnpack/test/tester/tester.py index 7586c4f2313..c561f9f6617 100644 --- a/backends/xnnpack/test/tester/tester.py +++ b/backends/xnnpack/test/tester/tester.py @@ -14,8 +14,8 @@ from typing import Any, Callable, Dict, List, Optional, Tuple, Type, Union import torch +from executorch.backends.xnnpack._passes import XNNPACKPassManager from executorch.backends.xnnpack.partition.xnnpack_partitioner import XnnpackPartitioner -from executorch.backends.xnnpack.passes import XNNPACKPassManager from executorch.backends.xnnpack.utils.configs import get_xnnpack_edge_compile_config from executorch.exir import ( EdgeCompileConfig, diff --git a/backends/xnnpack/third-party/XNNPACK b/backends/xnnpack/third-party/XNNPACK index 87ee0b46b83..ad0e62d6981 160000 --- a/backends/xnnpack/third-party/XNNPACK +++ b/backends/xnnpack/third-party/XNNPACK @@ -1 +1 @@ -Subproject commit 87ee0b46b834f67bad9025d4a82ed5654f3403d3 +Subproject commit ad0e62d69815946be92134a56ed3ff688e2549e8 diff --git a/backends/xnnpack/third-party/cpuinfo b/backends/xnnpack/third-party/cpuinfo index 16bfc1622c6..fa1c679da8d 160000 --- a/backends/xnnpack/third-party/cpuinfo +++ b/backends/xnnpack/third-party/cpuinfo @@ -1 +1 @@ -Subproject commit 16bfc1622c6902d6f91d316ec54894910c620325 +Subproject commit fa1c679da8d19e1d87f20175ae1ec10995cd3dd3 diff --git a/backends/xnnpack/third-party/generate-xnnpack-wrappers.py b/backends/xnnpack/third-party/generate-xnnpack-wrappers.py deleted file mode 100644 index e9b23e4a784..00000000000 --- a/backends/xnnpack/third-party/generate-xnnpack-wrappers.py +++ /dev/null @@ -1,213 +0,0 @@ -#!/usr/bin/env python3 - -from __future__ import print_function -from pathlib import Path -import collections -import os -import sys -import logging - -BANNER = "Auto-generated by generate-wrappers.py script. 
Do not modify" -WRAPPER_SRC_NAMES = { - "PROD_SCALAR_MICROKERNEL_SRCS": None, - "PROD_FMA_MICROKERNEL_SRCS": "defined(__riscv) || defined(__riscv__)", - "PROD_ARMSIMD32_MICROKERNEL_SRCS": "defined(__arm__)", - "PROD_FP16ARITH_MICROKERNEL_SRCS": "defined(__arm__)", - "PROD_NEON_MICROKERNEL_SRCS": "defined(__arm__) || defined(__aarch64__)", - "PROD_NEONFP16_MICROKERNEL_SRCS": "defined(__arm__) || defined(__aarch64__)", - "PROD_NEONFMA_MICROKERNEL_SRCS": "defined(__arm__) || defined(__aarch64__)", - "PROD_NEON_AARCH64_MICROKERNEL_SRCS": "defined(__aarch64__)", - "PROD_NEONV8_MICROKERNEL_SRCS": "defined(__arm__) || defined(__aarch64__)", - "PROD_NEONFP16ARITH_MICROKERNEL_SRCS": "defined(__arm__) || defined(__aarch64__)", - "PROD_NEONFP16ARITH_AARCH64_MICROKERNEL_SRCS": "defined(__aarch64__)", - "PROD_NEONDOT_MICROKERNEL_SRCS": "defined(__arm__) || defined(__aarch64__)", - "PROD_NEONDOT_AARCH64_MICROKERNEL_SRCS": "defined(__aarch64__)", - "PROD_NEONDOTFP16ARITH_MICROKERNEL_SRCS": "defined(__arm__) || defined(__aarch64__)", - "PROD_NEONDOTFP16ARITH_AARCH64_MICROKERNEL_SRCS": "defined(__aarch64__)", - "PROD_NEONI8MM_MICROKERNEL_SRCS": "defined(__arm__) || defined(__aarch64__)", - "PROD_SSE_MICROKERNEL_SRCS": "defined(__i386__) || defined(__i686__) || defined(__x86_64__)", - "PROD_SSE2_MICROKERNEL_SRCS": "defined(__i386__) || defined(__i686__) || defined(__x86_64__)", - "PROD_SSSE3_MICROKERNEL_SRCS": "defined(__i386__) || defined(__i686__) || defined(__x86_64__)", - "PROD_SSE41_MICROKERNEL_SRCS": "defined(__i386__) || defined(__i686__) || defined(__x86_64__)", - "PROD_AVX_MICROKERNEL_SRCS": "defined(__i386__) || defined(__i686__) || defined(__x86_64__)", - "PROD_F16C_MICROKERNEL_SRCS": "defined(__i386__) || defined(__i686__) || defined(__x86_64__)", - "PROD_XOP_MICROKERNEL_SRCS": "defined(__i386__) || defined(__i686__) || defined(__x86_64__)", - "PROD_FMA3_MICROKERNEL_SRCS": "defined(__i386__) || defined(__i686__) || defined(__x86_64__)", - "PROD_AVX2_MICROKERNEL_SRCS": "defined(__i386__) || defined(__i686__) || defined(__x86_64__)", - "PROD_AVX512F_MICROKERNEL_SRCS": "defined(__i386__) || defined(__i686__) || defined(__x86_64__)", - "PROD_AVX512SKX_MICROKERNEL_SRCS": "defined(__i386__) || defined(__i686__) || defined(__x86_64__)", - "PROD_AVX512VBMI_MICROKERNEL_SRCS": "defined(__i386__) || defined(__i686__) || defined(__x86_64__)", - "PROD_AVX512VNNI_MICROKERNEL_SRCS": "defined(__i386__) || defined(__i686__) || defined(__x86_64__)", - "PROD_AVX512VNNIGFNI_MICROKERNEL_SRCS": "defined(__i386__) || defined(__i686__) || defined(__x86_64__)", - "PROD_RVV_MICROKERNEL_SRCS": "defined(__riscv) || defined(__riscv__)", - "PROD_AVXVNNI_MICROKERNEL_SRCS": "defined(__i386__) || defined(__i686__) || defined(__x86_64__)", - "AARCH32_ASM_MICROKERNEL_SRCS": "defined(__arm__)", - "AARCH64_ASM_MICROKERNEL_SRCS": "defined(__aarch64__)", - - # add non-prod microkernel sources here: -} - -SRC_NAMES = { - "OPERATOR_SRCS", - "SUBGRAPH_SRCS", - "LOGGING_SRCS", - "XNNPACK_SRCS", - "TABLE_SRCS", - "JIT_SRCS", - "PROD_SCALAR_MICROKERNEL_SRCS", - "PROD_FMA_MICROKERNEL_SRCS", - "PROD_ARMSIMD32_MICROKERNEL_SRCS", - "PROD_FP16ARITH_MICROKERNEL_SRCS", - "PROD_NEON_MICROKERNEL_SRCS", - "PROD_NEONFP16_MICROKERNEL_SRCS", - "PROD_NEONFMA_MICROKERNEL_SRCS", - "PROD_NEON_AARCH64_MICROKERNEL_SRCS", - "PROD_NEONV8_MICROKERNEL_SRCS", - "PROD_NEONFP16ARITH_MICROKERNEL_SRCS", - "PROD_NEONFP16ARITH_AARCH64_MICROKERNEL_SRCS", - "PROD_NEONDOT_MICROKERNEL_SRCS", - "PROD_NEONDOT_AARCH64_MICROKERNEL_SRCS", - 
"PROD_NEONDOTFP16ARITH_MICROKERNEL_SRCS", - "PROD_NEONDOTFP16ARITH_AARCH64_MICROKERNEL_SRCS", - "PROD_NEONI8MM_MICROKERNEL_SRCS", - "PROD_SSE_MICROKERNEL_SRCS", - "PROD_SSE2_MICROKERNEL_SRCS", - "PROD_SSSE3_MICROKERNEL_SRCS", - "PROD_SSE41_MICROKERNEL_SRCS", - "PROD_AVX_MICROKERNEL_SRCS", - "PROD_F16C_MICROKERNEL_SRCS", - "PROD_XOP_MICROKERNEL_SRCS", - "PROD_FMA3_MICROKERNEL_SRCS", - "PROD_AVX2_MICROKERNEL_SRCS", - "PROD_AVX512F_MICROKERNEL_SRCS", - "PROD_AVX512SKX_MICROKERNEL_SRCS", - "PROD_AVX512VBMI_MICROKERNEL_SRCS", - "PROD_AVX512VNNI_MICROKERNEL_SRCS", - "PROD_AVX512VNNIGFNI_MICROKERNEL_SRCS", - "PROD_RVV_MICROKERNEL_SRCS", - "PROD_AVXVNNI_MICROKERNEL_SRCS", - "AARCH32_ASM_MICROKERNEL_SRCS", - "AARCH64_ASM_MICROKERNEL_SRCS", - - # add non-prod microkernel sources here: -} - -def handle_singleline_parse(line): - start_index = line.find("(") - end_index = line.find(")") - line = line[start_index+1:end_index] - key_val = line.split(" ") - return key_val[0], [x[4:] for x in key_val[1:]] - -def update_sources(xnnpack_path, cmakefile = "XNNPACK/CMakeLists.txt"): - print(f"Updating sources from {cmakefile}") - sources = collections.defaultdict(list) - with open(os.path.join(xnnpack_path, cmakefile)) as cmake: - lines = cmake.readlines() - i = 0 - while i < len(lines): - line = lines[i] - - if lines[i].startswith("INCLUDE"): - file, _ = handle_singleline_parse(line) - if file.startswith("cmake/gen/"): - path = Path(xnnpack_path) / "XNNPACK" / file - local_sources = update_sources(xnnpack_path, path.absolute().as_posix()) - for k,v in local_sources.items(): - if k in sources: - sources[k] = sources[k] + local_sources[k] - else: - sources[k] = local_sources[k] - - if lines[i].startswith("SET") and "src/" in lines[i]: - name, val = handle_singleline_parse(line) - sources[name].extend(val) - i+=1 - continue - - if line.startswith("SET") and line.split('(')[1].strip(' \t\n\r') in set(WRAPPER_SRC_NAMES.keys()) | set(SRC_NAMES): - name = line.split('(')[1].strip(' \t\n\r') - i += 1 - while i < len(lines) and len(lines[i]) > 0 and ')' not in lines[i]: - # remove "src/" at the beginning, remove whitespaces and newline - value = lines[i].strip(' \t\n\r') - sources[name].append(value[4:]) - i += 1 - if i < len(lines) and len(lines[i]) > 4: - # remove "src/" at the beginning, possibly ')' at the end - value = lines[i].strip(' \t\n\r)') - sources[name].append(value[4:]) - else: - i += 1 - return sources - -def gen_wrappers(xnnpack_path): - xnnpack_sources = collections.defaultdict(list) - sources = update_sources(xnnpack_path) - - microkernels_sources = update_sources(xnnpack_path, "XNNPACK/cmake/gen/microkernels.cmake") - for key in microkernels_sources: - sources[key] = microkernels_sources[key] - - for name in WRAPPER_SRC_NAMES: - xnnpack_sources[WRAPPER_SRC_NAMES[name]].extend(sources[name]) - - for condition, filenames in xnnpack_sources.items(): - print(condition) - for filename in filenames: - filepath = os.path.join(xnnpack_path, "xnnpack_wrappers", filename) - - if not os.path.isdir(os.path.dirname(filepath)): - os.makedirs(os.path.dirname(filepath)) - with open(filepath, "w") as wrapper: - print("/* {} */".format(BANNER), file=wrapper) - print(file=wrapper) - - # Architecture- or platform-dependent preprocessor flags can be - # defined here. Note: platform_preprocessor_flags can't be used - # because they are ignored by arc focus & buck project. 
- - if condition is None: - print("#include <%s>" % filename, file=wrapper) - else: - # Include source file only if condition is satisfied - print("#if %s" % condition, file=wrapper) - print("#include <%s>" % filename, file=wrapper) - print("#endif /* %s */" % condition, file=wrapper) - - # update xnnpack_wrapper_defs.bzl file under the same folder - with open(os.path.join(os.path.dirname(__file__), "xnnpack_wrapper_defs.bzl"), 'w') as wrapper_defs: - print('"""', file=wrapper_defs) - print(BANNER, file=wrapper_defs) - print('"""', file=wrapper_defs) - for name in WRAPPER_SRC_NAMES: - print('\n' + name + ' = [', file=wrapper_defs) - for file_name in sources[name]: - print(' "xnnpack_wrappers/{}",'.format(file_name), file=wrapper_defs) - print(']', file=wrapper_defs) - - # update xnnpack_src_defs.bzl file under the same folder - with open(os.path.join(os.path.dirname(__file__), "xnnpack_src_defs.bzl"), 'w') as src_defs: - print('"""', file=src_defs) - print(BANNER, file=src_defs) - print('"""', file=src_defs) - for name in SRC_NAMES: - print('\n' + name + ' = [', file=src_defs) - for file_name in sources[name]: - print(' "XNNPACK/src/{}",'.format(file_name), file=src_defs) - print(']', file=src_defs) - - -def main(argv): - print("Generating wrappers...") - - if argv is None or len(argv) == 0: - gen_wrappers(".") - else: - gen_wrappers(argv[0]) - -# The first argument is the place where the "xnnpack_wrappers" folder will be created. -# Run it without arguments will generate "xnnpack_wrappers" in the current path. -# The two .bzl files will always be generated in the current path. -if __name__ == "__main__": - main(sys.argv[1:]) diff --git a/backends/xnnpack/third-party/xnnpack.buck.bzl b/backends/xnnpack/third-party/xnnpack.buck.bzl index 7f0a8ca6f21..d2068661fea 100644 --- a/backends/xnnpack/third-party/xnnpack.buck.bzl +++ b/backends/xnnpack/third-party/xnnpack.buck.bzl @@ -1,4 +1,3 @@ -load("//third-party:glob_defs.bzl", "subdir_glob") load( ":xnnpack_src_defs.bzl", "LOGGING_SRCS", @@ -6,51 +5,15 @@ load( "SUBGRAPH_SRCS", "TABLE_SRCS", "XNNPACK_SRCS", -) -load( - ":xnnpack_wrapper_defs.bzl", - "AARCH32_ASM_MICROKERNEL_SRCS", - "AARCH64_ASM_MICROKERNEL_SRCS", - "PROD_ARMSIMD32_MICROKERNEL_SRCS", - "PROD_AVX2_MICROKERNEL_SRCS", - "PROD_AVX512F_MICROKERNEL_SRCS", - "PROD_AVX512SKX_MICROKERNEL_SRCS", - "PROD_AVX512VBMI_MICROKERNEL_SRCS", - "PROD_AVX512VNNIGFNI_MICROKERNEL_SRCS", - "PROD_AVX512VNNI_MICROKERNEL_SRCS", - "PROD_AVXVNNI_MICROKERNEL_SRCS", - "PROD_AVX_MICROKERNEL_SRCS", - "PROD_F16C_MICROKERNEL_SRCS", - "PROD_FMA3_MICROKERNEL_SRCS", - "PROD_FP16ARITH_MICROKERNEL_SRCS", - "PROD_NEONDOTFP16ARITH_AARCH64_MICROKERNEL_SRCS", - "PROD_NEONDOTFP16ARITH_MICROKERNEL_SRCS", - "PROD_NEONDOT_AARCH64_MICROKERNEL_SRCS", - "PROD_NEONDOT_MICROKERNEL_SRCS", - "PROD_NEONFMA_MICROKERNEL_SRCS", - "PROD_NEONFP16ARITH_AARCH64_MICROKERNEL_SRCS", - "PROD_NEONFP16ARITH_MICROKERNEL_SRCS", - "PROD_NEONFP16_MICROKERNEL_SRCS", - "PROD_NEONI8MM_MICROKERNEL_SRCS", - "PROD_NEONV8_MICROKERNEL_SRCS", - "PROD_NEON_AARCH64_MICROKERNEL_SRCS", - "PROD_NEON_MICROKERNEL_SRCS", - "PROD_SCALAR_MICROKERNEL_SRCS", - "PROD_SSE2_MICROKERNEL_SRCS", - "PROD_SSE41_MICROKERNEL_SRCS", - "PROD_SSE_MICROKERNEL_SRCS", - "PROD_SSSE3_MICROKERNEL_SRCS", - "PROD_XOP_MICROKERNEL_SRCS", + "get_xnnpack_headers", + "prod_srcs_for_arch_wrapper", ) def define_xnnpack(): # @lint-ignore BUCKLINT: native and fb_native are explicitly forbidden in fbcode. 
native.cxx_library( name = "interface", - headers = subdir_glob([ - ("XNNPACK/src", "**/*.h"), - ("XNNPACK/include", "**/*.h"), - ]), + headers = get_xnnpack_headers(), header_namespace = "", exported_headers = { "xnnpack.h": "XNNPACK/include/xnnpack.h", @@ -79,11 +42,9 @@ def define_xnnpack(): "XNNPACK/src/mutex.c", "XNNPACK/src/normalization.c", "XNNPACK/src/operator-utils.c", - "XNNPACK/src/packing.c", + "XNNPACK/src/packing.cc", ], - headers = subdir_glob([ - ("XNNPACK/src", "**/*.h"), - ]), + headers = get_xnnpack_headers(), header_namespace = "", compiler_flags = [ "-Wno-error=missing-braces", # required since the SGX toolchain does not have this by default @@ -110,9 +71,7 @@ def define_xnnpack(): compiler_flags = [ "-Wno-error=missing-braces", # required since the SGX toolchain does not have this by default ], - headers = subdir_glob([ - ("XNNPACK/src", "**/*.h"), - ]), + headers = get_xnnpack_headers(), header_namespace = "", preferred_linkage = "static", preprocessor_flags = [ @@ -133,9 +92,7 @@ def define_xnnpack(): native.cxx_library( name = "tables", srcs = TABLE_SRCS, - headers = subdir_glob([ - ("XNNPACK/src", "**/*.h"), - ]), + headers = get_xnnpack_headers(), header_namespace = "", compiler_flags = [ "-Wno-error=missing-braces", # required since the SGX toolchain does not have this by default @@ -157,11 +114,8 @@ def define_xnnpack(): # @lint-ignore BUCKLINT: native and fb_native are explicitly forbidden in fbcode. native.cxx_library( name = "ukernels_scalar", - srcs = PROD_SCALAR_MICROKERNEL_SRCS, - headers = subdir_glob([ - ("XNNPACK/src", "**/*.h"), - ("XNNPACK/src", "**/*.c"), - ]), + srcs = prod_srcs_for_arch_wrapper("scalar"), + headers = get_xnnpack_headers(), header_namespace = "", compiler_flags = [ "-O3", @@ -193,12 +147,9 @@ def define_xnnpack(): name = "ukernels_armsimd32", srcs = select({ "DEFAULT": DEFAULT_DUMMY_SRC, - "ovr_config//cpu:arm32": PROD_ARMSIMD32_MICROKERNEL_SRCS, + "ovr_config//cpu:arm32": prod_srcs_for_arch_wrapper("armsimd32"), }), - headers = subdir_glob([ - ("XNNPACK/src", "**/*.h"), - ("XNNPACK/src", "**/*.c"), - ]), + headers = get_xnnpack_headers(), header_namespace = "", compiler_flags = [ "-Wno-error=missing-braces", # required since the SGX toolchain does not have this by default @@ -236,14 +187,11 @@ def define_xnnpack(): native.cxx_library( name = "ukernels_fp16arith", srcs = select({ - "DEFAULT": PROD_FP16ARITH_MICROKERNEL_SRCS, + "DEFAULT": prod_srcs_for_arch_wrapper("fp16arith"), "ovr_config//cpu:x86_32": DEFAULT_DUMMY_SRC, "ovr_config//cpu:x86_64": DEFAULT_DUMMY_SRC, }), - headers = subdir_glob([ - ("XNNPACK/src", "**/*.h"), - ("XNNPACK/src", "**/*.c"), - ]), + headers = get_xnnpack_headers(), header_namespace = "", compiler_flags = [ "-Wno-error=missing-braces", # required since the SGX toolchain does not have this by default @@ -272,14 +220,11 @@ def define_xnnpack(): native.cxx_library( name = "ukernels_sse", srcs = select({ - "DEFAULT": PROD_SSE_MICROKERNEL_SRCS, + "DEFAULT": prod_srcs_for_arch_wrapper("sse"), "ovr_config//cpu:arm32": DEFAULT_DUMMY_SRC, "ovr_config//cpu:arm64": DEFAULT_DUMMY_SRC, }), - headers = subdir_glob([ - ("XNNPACK/src", "**/*.h"), - ("XNNPACK/src", "**/*.c"), - ]), + headers = get_xnnpack_headers(), header_namespace = "", compiler_flags = [ "-O2", @@ -294,6 +239,7 @@ def define_xnnpack(): "-DXNN_LOG_LEVEL=0", ], exported_deps = [ + ":FP16", ":interface", ], ) @@ -304,14 +250,11 @@ def define_xnnpack(): native.cxx_library( name = "ukernels_sse2", srcs = select({ - "DEFAULT": PROD_SSE2_MICROKERNEL_SRCS, + 
"DEFAULT": prod_srcs_for_arch_wrapper("sse2"), "ovr_config//cpu:arm32": DEFAULT_DUMMY_SRC, "ovr_config//cpu:arm64": DEFAULT_DUMMY_SRC, }), - headers = subdir_glob([ - ("XNNPACK/src", "**/*.h"), - ("XNNPACK/src", "**/*.c"), - ]), + headers = get_xnnpack_headers(), header_namespace = "", compiler_flags = [ "-O2", @@ -337,14 +280,11 @@ def define_xnnpack(): native.cxx_library( name = "ukernels_ssse3", srcs = select({ - "DEFAULT": PROD_SSSE3_MICROKERNEL_SRCS, + "DEFAULT": prod_srcs_for_arch_wrapper("ssse3"), "ovr_config//cpu:arm32": DEFAULT_DUMMY_SRC, "ovr_config//cpu:arm64": DEFAULT_DUMMY_SRC, }), - headers = subdir_glob([ - ("XNNPACK/src", "**/*.h"), - ("XNNPACK/src", "**/*.c"), - ]), + headers = get_xnnpack_headers(), header_namespace = "", compiler_flags = [ "-O2", @@ -370,14 +310,11 @@ def define_xnnpack(): native.cxx_library( name = "ukernels_sse41", srcs = select({ - "DEFAULT": PROD_SSE41_MICROKERNEL_SRCS, + "DEFAULT": prod_srcs_for_arch_wrapper("sse41"), "ovr_config//cpu:arm32": DEFAULT_DUMMY_SRC, "ovr_config//cpu:arm64": DEFAULT_DUMMY_SRC, }), - headers = subdir_glob([ - ("XNNPACK/src", "**/*.h"), - ("XNNPACK/src", "**/*.c"), - ]), + headers = get_xnnpack_headers(), header_namespace = "", compiler_flags = [ "-O2", @@ -403,14 +340,11 @@ def define_xnnpack(): native.cxx_library( name = "ukernels_avx", srcs = select({ - "DEFAULT": PROD_AVX_MICROKERNEL_SRCS, + "DEFAULT": prod_srcs_for_arch_wrapper("avx"), "ovr_config//cpu:arm32": DEFAULT_DUMMY_SRC, "ovr_config//cpu:arm64": DEFAULT_DUMMY_SRC, }), - headers = subdir_glob([ - ("XNNPACK/src", "**/*.h"), - ("XNNPACK/src", "**/*.c"), - ]), + headers = get_xnnpack_headers(), header_namespace = "", compiler_flags = [ "-O2", @@ -425,38 +359,7 @@ def define_xnnpack(): "-DXNN_LOG_LEVEL=0", ], exported_deps = [ - ":interface", - ], - ) - - XOP_COMPILER_FLAGS = ["-mxop"] - - # @lint-ignore BUCKLINT: native and fb_native are explicitly forbidden in fbcode. 
- native.cxx_library( - name = "ukernels_xop", - srcs = select({ - "DEFAULT": PROD_XOP_MICROKERNEL_SRCS, - "ovr_config//cpu:arm32": DEFAULT_DUMMY_SRC, - "ovr_config//cpu:arm64": DEFAULT_DUMMY_SRC, - }), - headers = subdir_glob([ - ("XNNPACK/src", "**/*.h"), - ("XNNPACK/src", "**/*.c"), - ]), - header_namespace = "", - compiler_flags = [ - "-O2", - "-Wno-error=missing-braces", # required since the SGX toolchain does not have this by default - ] + select({ - "DEFAULT": XOP_COMPILER_FLAGS, - "ovr_config//cpu:arm32": [], - "ovr_config//cpu:arm64": [], - }), - preferred_linkage = "static", - preprocessor_flags = [ - "-DXNN_LOG_LEVEL=0", - ], - exported_deps = [ + ":FP16", ":interface", ], ) @@ -467,14 +370,11 @@ def define_xnnpack(): native.cxx_library( name = "ukernels_f16c", srcs = select({ - "DEFAULT": PROD_F16C_MICROKERNEL_SRCS, + "DEFAULT": prod_srcs_for_arch_wrapper("f16c"), "ovr_config//cpu:arm32": DEFAULT_DUMMY_SRC, "ovr_config//cpu:arm64": DEFAULT_DUMMY_SRC, }), - headers = subdir_glob([ - ("XNNPACK/src", "**/*.h"), - ("XNNPACK/src", "**/*.c"), - ]), + headers = get_xnnpack_headers(), header_namespace = "", compiler_flags = [ "-O2", @@ -489,6 +389,7 @@ def define_xnnpack(): "-DXNN_LOG_LEVEL=0", ], exported_deps = [ + ":FP16", ":interface", ], ) @@ -502,14 +403,11 @@ def define_xnnpack(): native.cxx_library( name = "ukernels_fma3", srcs = select({ - "DEFAULT": PROD_FMA3_MICROKERNEL_SRCS, + "DEFAULT": prod_srcs_for_arch_wrapper("fma3"), "ovr_config//cpu:arm32": DEFAULT_DUMMY_SRC, "ovr_config//cpu:arm64": DEFAULT_DUMMY_SRC, }), - headers = subdir_glob([ - ("XNNPACK/src", "**/*.h"), - ("XNNPACK/src", "**/*.c"), - ]), + headers = get_xnnpack_headers(), header_namespace = "", compiler_flags = [ "-O2", @@ -524,6 +422,7 @@ def define_xnnpack(): "-DXNN_LOG_LEVEL=0", ], exported_deps = [ + ":FP16", ":interface", ], ) @@ -538,14 +437,11 @@ def define_xnnpack(): native.cxx_library( name = "ukernels_avx2", srcs = select({ - "DEFAULT": PROD_AVX2_MICROKERNEL_SRCS, + "DEFAULT": prod_srcs_for_arch_wrapper("avx2"), "ovr_config//cpu:arm32": DEFAULT_DUMMY_SRC, "ovr_config//cpu:arm64": DEFAULT_DUMMY_SRC, }), - headers = subdir_glob([ - ("XNNPACK/src", "**/*.h"), - ("XNNPACK/src", "**/*.c"), - ]), + headers = get_xnnpack_headers(), header_namespace = "", compiler_flags = [ "-O2", @@ -560,6 +456,7 @@ def define_xnnpack(): "-DXNN_LOG_LEVEL=0", ], exported_deps = [ + ":FP16", ":interface", ], ) @@ -570,14 +467,11 @@ def define_xnnpack(): native.cxx_library( name = "ukernels_avx512", srcs = select({ - "DEFAULT": PROD_AVX512F_MICROKERNEL_SRCS, + "DEFAULT": prod_srcs_for_arch_wrapper("avx512f"), "ovr_config//cpu:arm32": DEFAULT_DUMMY_SRC, "ovr_config//cpu:arm64": DEFAULT_DUMMY_SRC, }), - headers = subdir_glob([ - ("XNNPACK/src", "**/*.h"), - ("XNNPACK/src", "**/*.c"), - ]), + headers = get_xnnpack_headers(), header_namespace = "", compiler_flags = [ "-O2", @@ -592,6 +486,7 @@ def define_xnnpack(): "-DXNN_LOG_LEVEL=0", ], exported_deps = [ + ":FP16", ":interface", ], ) @@ -608,14 +503,11 @@ def define_xnnpack(): native.cxx_library( name = "ukernels_avx512skx", srcs = select({ - "DEFAULT": PROD_AVX512SKX_MICROKERNEL_SRCS, + "DEFAULT": prod_srcs_for_arch_wrapper("avx512skx"), "ovr_config//cpu:arm32": DEFAULT_DUMMY_SRC, "ovr_config//cpu:arm64": DEFAULT_DUMMY_SRC, }), - headers = subdir_glob([ - ("XNNPACK/src", "**/*.h"), - ("XNNPACK/src", "**/*.c"), - ]), + headers = get_xnnpack_headers(), header_namespace = "", compiler_flags = [ "-O2", @@ -630,6 +522,7 @@ def define_xnnpack(): "-DXNN_LOG_LEVEL=0", ], exported_deps 
= [ + ":FP16", ":interface", ], ) @@ -645,13 +538,10 @@ def define_xnnpack(): name = "ukernels_asm", srcs = select({ "DEFAULT": DEFAULT_DUMMY_SRC, - "ovr_config//cpu:arm32": AARCH32_ASM_MICROKERNEL_SRCS, - "ovr_config//cpu:arm64": AARCH64_ASM_MICROKERNEL_SRCS, + "ovr_config//cpu:arm32": prod_srcs_for_arch_wrapper("aarch32"), + "ovr_config//cpu:arm64": prod_srcs_for_arch_wrapper("aarch64"), }), - headers = subdir_glob([ - ("XNNPACK/src", "xnnpack/assembly.h"), - ("XNNPACK/src", "**/*.S"), - ]), + headers = get_xnnpack_headers(), header_namespace = "", platform_compiler_flags = [ ( @@ -686,14 +576,11 @@ def define_xnnpack(): native.cxx_library( name = "ukernels_neon", srcs = select({ - "DEFAULT": PROD_NEON_MICROKERNEL_SRCS, + "DEFAULT": prod_srcs_for_arch_wrapper("neon"), "ovr_config//cpu:x86_32": DEFAULT_DUMMY_SRC, "ovr_config//cpu:x86_64": DEFAULT_DUMMY_SRC, }), - headers = subdir_glob([ - ("XNNPACK/src", "**/*.h"), - ("XNNPACK/src", "**/*.c"), - ]), + headers = get_xnnpack_headers(), header_namespace = "", compiler_flags = [ "-O2", @@ -720,14 +607,11 @@ def define_xnnpack(): native.cxx_library( name = "ukernels_avx512vbmi", srcs = select({ - "DEFAULT": PROD_AVX512VBMI_MICROKERNEL_SRCS, + "DEFAULT": prod_srcs_for_arch_wrapper("avx512vbmi"), "ovr_config//cpu:arm32": DEFAULT_DUMMY_SRC, "ovr_config//cpu:arm64": DEFAULT_DUMMY_SRC, }), - headers = subdir_glob([ - ("XNNPACK/src", "**/*.h"), - ("XNNPACK/src", "**/*.c"), - ]), + headers = get_xnnpack_headers(), header_namespace = "", compiler_flags = [ "-O2", @@ -742,25 +626,23 @@ def define_xnnpack(): "-DXNN_LOG_LEVEL=0", ], exported_deps = [ + ":FP16", ":interface", ], ) - NEON64_AARCH64_COMPILER_FLAGS = ["-mfpu=neon-vfpv6"] + NEON64_AARCH64_COMPILER_FLAGS = [] # @lint-ignore BUCKLINT: native and fb_native are explicitly forbidden in fbcode. native.cxx_library( name = "ukernels_neon_aarch64", srcs = select({ - "DEFAULT": PROD_NEON_AARCH64_MICROKERNEL_SRCS, + "DEFAULT": prod_srcs_for_arch_wrapper("neon_aarch64"), "ovr_config//cpu:arm32": DEFAULT_DUMMY_SRC, "ovr_config//cpu:x86_32": DEFAULT_DUMMY_SRC, "ovr_config//cpu:x86_64": DEFAULT_DUMMY_SRC, }), - headers = subdir_glob([ - ("XNNPACK/src", "**/*.h"), - ("XNNPACK/src", "**/*.c"), - ]), + headers = get_xnnpack_headers(), header_namespace = "", compiler_flags = [ "-O2", @@ -787,14 +669,11 @@ def define_xnnpack(): native.cxx_library( name = "ukernels_neon_fp16", srcs = select({ - "DEFAULT": PROD_NEONFP16_MICROKERNEL_SRCS, + "DEFAULT": prod_srcs_for_arch_wrapper("neonfp16"), "ovr_config//cpu:x86_32": DEFAULT_DUMMY_SRC, "ovr_config//cpu:x86_64": DEFAULT_DUMMY_SRC, }), - headers = subdir_glob([ - ("XNNPACK/src", "**/*.h"), - ("XNNPACK/src", "**/*.c"), - ]), + headers = get_xnnpack_headers(), header_namespace = "", compiler_flags = [ "-O2", @@ -811,13 +690,13 @@ def define_xnnpack(): ], exported_deps = [ ":interface", + ":FP16", ], ) NEON32_FMA_COMPILER_FLAGS = ["-mfpu=neon-vfp4"] NEON64_FMA_COMPILER_FLAGS = [ "-march=armv8-a", - "-mfpu=neon-fp-armv8", ] # @lint-ignore BUCKLINT: native and fb_native are explicitly forbidden in fbcode. 
@@ -825,15 +704,12 @@ def define_xnnpack(): name = "ukernels_neon_fma", srcs = select({ "DEFAULT": DEFAULT_DUMMY_SRC, - "ovr_config//cpu:arm32": PROD_NEONFMA_MICROKERNEL_SRCS, - "ovr_config//cpu:arm64": PROD_NEONFMA_MICROKERNEL_SRCS + PROD_NEON_AARCH64_MICROKERNEL_SRCS, + "ovr_config//cpu:arm32": prod_srcs_for_arch_wrapper("neonfma"), + "ovr_config//cpu:arm64": prod_srcs_for_arch_wrapper("neonfma") + prod_srcs_for_arch_wrapper("neonfma_aarch64"), "ovr_config//cpu:x86_32": DEFAULT_DUMMY_SRC, "ovr_config//cpu:x86_64": DEFAULT_DUMMY_SRC, }), - headers = subdir_glob([ - ("XNNPACK/src", "**/*.h"), - ("XNNPACK/src", "**/*.c"), - ]), + headers = get_xnnpack_headers(), header_namespace = "", compiler_flags = [ "-O2", @@ -857,7 +733,6 @@ def define_xnnpack(): NEON64_V8_COMPILER_FLAGS = [ "-march=armv8-a", - "-mfpu=neon-fp-armv8", ] NEON32_V8_COMPILER_FLAGS = [ @@ -870,14 +745,11 @@ def define_xnnpack(): native.cxx_library( name = "ukernels_neon_v8", srcs = select({ - "DEFAULT": PROD_NEONV8_MICROKERNEL_SRCS, + "DEFAULT": prod_srcs_for_arch_wrapper("neonv8"), "ovr_config//cpu:x86_32": DEFAULT_DUMMY_SRC, "ovr_config//cpu:x86_64": DEFAULT_DUMMY_SRC, }), - headers = subdir_glob([ - ("XNNPACK/src", "**/*.h"), - ("XNNPACK/src", "**/*.c"), - ]), + headers = get_xnnpack_headers(), header_namespace = "", compiler_flags = [ "-O2", @@ -910,13 +782,10 @@ def define_xnnpack(): name = "ukernels_neon_fp16arith", srcs = select({ "DEFAULT": DEFAULT_DUMMY_SRC, - "ovr_config//cpu:arm32": PROD_NEONFP16ARITH_MICROKERNEL_SRCS, - "ovr_config//cpu:arm64": PROD_NEONFP16ARITH_MICROKERNEL_SRCS + PROD_NEONFP16ARITH_AARCH64_MICROKERNEL_SRCS, + "ovr_config//cpu:arm32": prod_srcs_for_arch_wrapper("neonfp16arith"), + "ovr_config//cpu:arm64": prod_srcs_for_arch_wrapper("neonfp16arith") + prod_srcs_for_arch_wrapper("neonfp16arith_aarch64"), }), - headers = subdir_glob([ - ("XNNPACK/src", "**/*.h"), - ("XNNPACK/src", "**/*.c"), - ]), + headers = get_xnnpack_headers(), header_namespace = "", compiler_flags = [ "-O2", @@ -953,13 +822,10 @@ def define_xnnpack(): name = "ukernels_neondotfp16arith", srcs = select({ "DEFAULT": DEFAULT_DUMMY_SRC, - "ovr_config//cpu:arm32": PROD_NEONDOTFP16ARITH_MICROKERNEL_SRCS, - "ovr_config//cpu:arm64": PROD_NEONDOTFP16ARITH_AARCH64_MICROKERNEL_SRCS + PROD_NEONDOTFP16ARITH_MICROKERNEL_SRCS, + "ovr_config//cpu:arm32": prod_srcs_for_arch_wrapper("neondotfp16arith"), + "ovr_config//cpu:arm64": prod_srcs_for_arch_wrapper("neondotfp16arith") + prod_srcs_for_arch_wrapper("neondotfp16arith_aarch64"), }), - headers = subdir_glob([ - ("XNNPACK/src", "**/*.h"), - ("XNNPACK/src", "**/*.c"), - ]), + headers = get_xnnpack_headers(), header_namespace = "", compiler_flags = [ "-O2", @@ -991,15 +857,12 @@ def define_xnnpack(): name = "ukernels_neon_dot", srcs = select({ "DEFAULT": DEFAULT_DUMMY_SRC, - "ovr_config//cpu:arm32": PROD_NEONDOT_MICROKERNEL_SRCS, - "ovr_config//cpu:arm64": PROD_NEONDOT_MICROKERNEL_SRCS + PROD_NEONDOT_AARCH64_MICROKERNEL_SRCS, + "ovr_config//cpu:arm32": prod_srcs_for_arch_wrapper("neondot"), + "ovr_config//cpu:arm64": prod_srcs_for_arch_wrapper("neondot") + prod_srcs_for_arch_wrapper("neondot_aarch64"), "ovr_config//cpu:x86_32": DEFAULT_DUMMY_SRC, "ovr_config//cpu:x86_64": DEFAULT_DUMMY_SRC, }), - headers = subdir_glob([ - ("XNNPACK/src", "**/*.h"), - ("XNNPACK/src", "**/*.c"), - ]), + headers = get_xnnpack_headers(), header_namespace = "", compiler_flags = [ "-O2", @@ -1036,14 +899,11 @@ def define_xnnpack(): native.cxx_library( name = "ukernels_neon_i8mm", srcs = select({ - "DEFAULT": 
PROD_NEONI8MM_MICROKERNEL_SRCS, + "DEFAULT": prod_srcs_for_arch_wrapper("neoni8mm"), "ovr_config//cpu:x86_32": DEFAULT_DUMMY_SRC, "ovr_config//cpu:x86_64": DEFAULT_DUMMY_SRC, }), - headers = subdir_glob([ - ("XNNPACK/src", "**/*.h"), - ("XNNPACK/src", "**/*.c"), - ]), + headers = get_xnnpack_headers(), header_namespace = "", compiler_flags = [ "-O2", @@ -1077,14 +937,11 @@ def define_xnnpack(): native.cxx_library( name = "ukernels_avx512vnni", srcs = select({ - "DEFAULT": PROD_AVX512VNNI_MICROKERNEL_SRCS, + "DEFAULT": prod_srcs_for_arch_wrapper("avx512vnni"), "ovr_config//cpu:arm32": DEFAULT_DUMMY_SRC, "ovr_config//cpu:arm64": DEFAULT_DUMMY_SRC, }), - headers = subdir_glob([ - ("XNNPACK/src", "**/*.h"), - ("XNNPACK/src", "**/*.c"), - ]), + headers = get_xnnpack_headers(), header_namespace = "", compiler_flags = [ "-O2", @@ -1099,6 +956,7 @@ "-DXNN_LOG_LEVEL=0", ], exported_deps = [ + ":FP16", ":interface", ], ) @@ -1111,14 +969,11 @@ def define_xnnpack(): native.cxx_library( name = "ukernels_avx512vnnigfni", srcs = select({ - "DEFAULT": PROD_AVX512VNNIGFNI_MICROKERNEL_SRCS, + "DEFAULT": prod_srcs_for_arch_wrapper("avx512vnnigfni"), "ovr_config//cpu:arm32": DEFAULT_DUMMY_SRC, "ovr_config//cpu:arm64": DEFAULT_DUMMY_SRC, }), - headers = subdir_glob([ - ("XNNPACK/src", "**/*.h"), - ("XNNPACK/src", "**/*.c"), - ]), + headers = get_xnnpack_headers(), header_namespace = "", compiler_flags = [ "-O2", @@ -1133,6 +988,7 @@ "-DXNN_LOG_LEVEL=0", ], exported_deps = [ + ":FP16", ":interface", ], ) @@ -1148,14 +1004,11 @@ def define_xnnpack(): native.cxx_library( name = "ukernels_avxvnni", srcs = select({ - "DEFAULT": PROD_AVXVNNI_MICROKERNEL_SRCS, + "DEFAULT": prod_srcs_for_arch_wrapper("avxvnni"), "ovr_config//cpu:arm32": DEFAULT_DUMMY_SRC, "ovr_config//cpu:arm64": DEFAULT_DUMMY_SRC, }), - headers = subdir_glob([ - ("XNNPACK/src", "**/*.h"), - ("XNNPACK/src", "**/*.c"), - ]), + headers = get_xnnpack_headers(), header_namespace = "", compiler_flags = [ "-O2", @@ -1170,6 +1023,7 @@ "-DXNN_LOG_LEVEL=0", ], exported_deps = [ + ":FP16", ":interface", ], ) @@ -1192,7 +1046,6 @@ def define_xnnpack(): ":ukernels_sse2", ":ukernels_sse41", ":ukernels_ssse3", - ":ukernels_xop", ":ukernels_avx512vbmi", ":ukernels_avx512vnnigfni", ":ukernels_avx512vnni", @@ -1218,16 +1071,13 @@ def define_xnnpack(): native.cxx_library( name = "XNNPACK", srcs = XNNPACK_SRCS + LOGGING_SRCS + [ - "XNNPACK/src/amalgam/gen/scalar.c", + "XNNPACK/src/init.c", + "XNNPACK/src/params.c", "XNNPACK/src/configs/hardware-config.c", "XNNPACK/src/microparams-init.c", - "XNNPACK/src/operator-run.c", "XNNPACK/src/microkernel-utils.c", ], - headers = subdir_glob([ - ("XNNPACK/src", "**/*.h"), - ("XNNPACK/include", "**/*.h"), - ]), + headers = get_xnnpack_headers(), exported_headers = { "xnnpack.h": "XNNPACK/include/xnnpack.h", }, @@ -1238,12 +1088,6 @@ preferred_linkage = "static", preprocessor_flags = [ "-DXNN_LOG_LEVEL=0", - "-DXNN_NO_Q8_OPERATORS", - "-DXNN_NO_F16_OPERATORS", - "-DXNN_NO_NCHW_OPERATORS", - "-DXNN_NO_U8_OPERATORS", - "-DXNN_NO_X32_OPERATORS", - "-DXNN_NO_X8_OPERATORS", "-DXNN_ENABLE_MEMOPT", "-DXNN_ENABLE_SPARSE=0", "-DXNN_ENABLE_ASSEMBLY", @@ -1252,9 +1096,11 @@ "-DXNN_ENABLE_CPUINFO", # "-DXNN_ENABLE_DWCONV_MULTIPLASS=1", "-DXNN_ENABLE_ARM_I8MM=1", + "-DXNN_ENABLE_ARM_FP16_VECTOR=1", ], visibility = ["PUBLIC"], exported_deps = COMMON_XNNPACK_DEPS + [ + ":FP16", ":pthreadpool", ":interface", ":cpuinfo", diff --git
a/backends/xnnpack/third-party/xnnpack_src_defs.bzl b/backends/xnnpack/third-party/xnnpack_src_defs.bzl index d8ebe7c72bb..038b90acab0 100644 --- a/backends/xnnpack/third-party/xnnpack_src_defs.bzl +++ b/backends/xnnpack/third-party/xnnpack_src_defs.bzl @@ -1,607 +1,43 @@ -""" -Auto-generated by generate-wrappers.py script. Do not modify -""" - -PROD_NEONDOTFP16ARITH_AARCH64_MICROKERNEL_SRCS = [ - "XNNPACK/src/amalgam/gen/neondotfp16-aarch64.c", -] - -PROD_FP16ARITH_MICROKERNEL_SRCS = [ - "XNNPACK/src/amalgam/gen/fp16arith.c", -] - -PROD_SSE_MICROKERNEL_SRCS = [ - "XNNPACK/src/amalgam/gen/sse.c", -] - -PROD_FMA3_MICROKERNEL_SRCS = [ - "XNNPACK/src/amalgam/gen/fma3.c", -] - -PROD_SSE2_MICROKERNEL_SRCS = [ - "XNNPACK/src/amalgam/gen/sse2.c", -] - -PROD_NEONV8_MICROKERNEL_SRCS = [ - "XNNPACK/src/amalgam/gen/neonv8.c", -] - -PROD_AVX512SKX_MICROKERNEL_SRCS = [ - "XNNPACK/src/amalgam/gen/avx512skx.c", -] - -AARCH32_ASM_MICROKERNEL_SRCS = [ - "XNNPACK/src/cs16-bfly4/cs16-bfly4-samples1-asm-aarch32-neon-x1.S", - "XNNPACK/src/cs16-bfly4/cs16-bfly4-samples1-asm-aarch32-neon-x2.S", - "XNNPACK/src/cs16-bfly4/cs16-bfly4-samples1-asm-aarch32-neon-x4.S", - "XNNPACK/src/cs16-fftr/cs16-fftr-asm-aarch32-neon-x1.S", - "XNNPACK/src/cs16-fftr/cs16-fftr-asm-aarch32-neon-x4.S", - "XNNPACK/src/f32-gemm/gen/f32-gemm-1x8-minmax-asm-aarch32-neon-cortex-a53-prfm.S", - "XNNPACK/src/f32-gemm/gen/f32-gemm-1x8-minmax-asm-aarch32-neon-cortex-a53.S", - "XNNPACK/src/f32-gemm/gen/f32-gemm-4x4-asm-aarch32-vfp-ld64.S", - "XNNPACK/src/f32-gemm/gen/f32-gemm-4x4-minmax-asm-aarch32-vfp-ld64.S", - "XNNPACK/src/f32-gemm/gen/f32-gemm-4x8-minmax-asm-aarch32-neon-cortex-a7.S", - "XNNPACK/src/f32-gemm/gen/f32-gemm-4x8-minmax-asm-aarch32-neon-cortex-a53-prfm.S", - "XNNPACK/src/f32-gemm/gen/f32-gemm-4x8-minmax-asm-aarch32-neon-cortex-a53.S", - "XNNPACK/src/f32-gemm/gen/f32-gemm-4x8-minmax-asm-aarch32-neon-cortex-a55.S", - "XNNPACK/src/f32-gemm/gen/f32-gemm-4x8-minmax-asm-aarch32-neon-cortex-a75-prfm.S", - "XNNPACK/src/f32-gemm/gen/f32-gemm-4x8-minmax-asm-aarch32-neon-cortex-a75.S", - "XNNPACK/src/f32-gemm/gen/f32-gemm-4x8-minmax-asm-aarch32-neon-ld64.S", - "XNNPACK/src/f32-igemm/f32-igemm-4x8-minmax-asm-aarch32-neon-cortex-a55.S", - "XNNPACK/src/f32-igemm/gen/f32-igemm-1x8-minmax-asm-aarch32-neon-cortex-a53-prfm.S", - "XNNPACK/src/f32-igemm/gen/f32-igemm-1x8-minmax-asm-aarch32-neon-cortex-a53.S", - "XNNPACK/src/f32-igemm/gen/f32-igemm-4x8-minmax-asm-aarch32-neon-cortex-a7.S", - "XNNPACK/src/f32-igemm/gen/f32-igemm-4x8-minmax-asm-aarch32-neon-cortex-a53-prfm.S", - "XNNPACK/src/f32-igemm/gen/f32-igemm-4x8-minmax-asm-aarch32-neon-cortex-a53.S", - "XNNPACK/src/f32-igemm/gen/f32-igemm-4x8-minmax-asm-aarch32-neon-cortex-a75-prfm.S", - "XNNPACK/src/f32-igemm/gen/f32-igemm-4x8-minmax-asm-aarch32-neon-cortex-a75.S", - "XNNPACK/src/f32-igemm/gen/f32-igemm-4x8-minmax-asm-aarch32-neon-ld64.S", - "XNNPACK/src/qd8-f16-qc8w-gemm/gen/qd8-f16-qc8w-gemm-4x8c4-minmax-asm-aarch32-neondotfp16arith-cortex-a55.S", - "XNNPACK/src/qd8-f16-qc8w-igemm/gen/qd8-f16-qc8w-igemm-4x8c4-minmax-asm-aarch32-neondotfp16arith-cortex-a55.S", - "XNNPACK/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-4x8c4-minmax-asm-aarch32-neondot-cortex-a55.S", - "XNNPACK/src/qd8-f32-qc8w-igemm/gen/qd8-f32-qc8w-igemm-4x8c4-minmax-asm-aarch32-neondot-cortex-a55.S", - "XNNPACK/src/qs8-qc8w-dwconv/qs8-qc8w-dwconv-3p8c-minmax-fp32-asm-aarch32-neonv8-mla8-cortex-a35.S", - "XNNPACK/src/qs8-qc8w-dwconv/qs8-qc8w-dwconv-3p16c-minmax-fp32-asm-aarch32-neonv8-mla8-cortex-a35.S", - 
"XNNPACK/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-1x8-minmax-fp32-asm-aarch32-neon-mlal-lane-cortex-a7-prfm.S", - "XNNPACK/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-1x8-minmax-fp32-asm-aarch32-neon-mlal-lane-cortex-a7.S", - "XNNPACK/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-1x8-minmax-fp32-asm-aarch32-neonv8-mlal-lane-cortex-a35-prfm.S", - "XNNPACK/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-1x8-minmax-fp32-asm-aarch32-neonv8-mlal-lane-cortex-a35.S", - "XNNPACK/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-4x8-minmax-fp32-asm-aarch32-neon-mlal-lane-cortex-a7-prfm.S", - "XNNPACK/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-4x8-minmax-fp32-asm-aarch32-neon-mlal-lane-cortex-a7.S", - "XNNPACK/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-4x8-minmax-fp32-asm-aarch32-neon-mlal-lane-cortex-a53-prfm.S", - "XNNPACK/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-4x8-minmax-fp32-asm-aarch32-neon-mlal-lane-cortex-a53.S", - "XNNPACK/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-4x8-minmax-fp32-asm-aarch32-neon-mlal-lane-ld64-prfm.S", - "XNNPACK/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-4x8-minmax-fp32-asm-aarch32-neon-mlal-lane-ld64.S", - "XNNPACK/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-4x8-minmax-fp32-asm-aarch32-neonv8-mlal-lane-cortex-a35-prfm.S", - "XNNPACK/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-4x8-minmax-fp32-asm-aarch32-neonv8-mlal-lane-cortex-a35.S", - "XNNPACK/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-4x8-minmax-fp32-asm-aarch32-neonv8-mlal-lane-cortex-a53-prfm.S", - "XNNPACK/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-4x8-minmax-fp32-asm-aarch32-neonv8-mlal-lane-cortex-a53.S", - "XNNPACK/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-4x8-minmax-fp32-asm-aarch32-neonv8-mlal-lane-ld64-prfm.S", - "XNNPACK/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-4x8-minmax-fp32-asm-aarch32-neonv8-mlal-lane-ld64.S", - "XNNPACK/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-4x8c4-minmax-fp32-asm-aarch32-neondot-cortex-a55.S", - "XNNPACK/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-4x8c4-minmax-fp32-asm-aarch32-neondot-ld64.S", - "XNNPACK/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-1x8-minmax-fp32-asm-aarch32-neon-mlal-lane-cortex-a7-prfm.S", - "XNNPACK/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-1x8-minmax-fp32-asm-aarch32-neon-mlal-lane-cortex-a7.S", - "XNNPACK/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-1x8-minmax-fp32-asm-aarch32-neonv8-mlal-lane-cortex-a35-prfm.S", - "XNNPACK/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-1x8-minmax-fp32-asm-aarch32-neonv8-mlal-lane-cortex-a35.S", - "XNNPACK/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x8-minmax-fp32-asm-aarch32-neon-mlal-lane-cortex-a7-prfm.S", - "XNNPACK/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x8-minmax-fp32-asm-aarch32-neon-mlal-lane-cortex-a7.S", - "XNNPACK/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x8-minmax-fp32-asm-aarch32-neon-mlal-lane-cortex-a53-prfm.S", - "XNNPACK/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x8-minmax-fp32-asm-aarch32-neon-mlal-lane-cortex-a53.S", - "XNNPACK/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x8-minmax-fp32-asm-aarch32-neon-mlal-lane-ld64-prfm.S", - "XNNPACK/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x8-minmax-fp32-asm-aarch32-neon-mlal-lane-ld64.S", - "XNNPACK/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x8-minmax-fp32-asm-aarch32-neonv8-mlal-lane-cortex-a35-prfm.S", - "XNNPACK/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x8-minmax-fp32-asm-aarch32-neonv8-mlal-lane-cortex-a35.S", - "XNNPACK/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x8-minmax-fp32-asm-aarch32-neonv8-mlal-lane-cortex-a53-prfm.S", - "XNNPACK/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x8-minmax-fp32-asm-aarch32-neonv8-mlal-lane-cortex-a53.S", - "XNNPACK/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x8-minmax-fp32-asm-aarch32-neonv8-mlal-lane-ld64-prfm.S", - 
"XNNPACK/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x8-minmax-fp32-asm-aarch32-neonv8-mlal-lane-ld64.S", - "XNNPACK/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x8c4-minmax-fp32-asm-aarch32-neondot-cortex-a55.S", - "XNNPACK/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x8c4-minmax-fp32-asm-aarch32-neondot-ld64.S", - "XNNPACK/src/qs16-qs8-vcvt/qs16-qs8-vcvt-asm-aarch32-neon-u16.S", - "XNNPACK/src/qu8-gemm/gen/qu8-gemm-1x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-cortex-a7-prfm.S", - "XNNPACK/src/qu8-gemm/gen/qu8-gemm-1x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-cortex-a7.S", - "XNNPACK/src/qu8-gemm/gen/qu8-gemm-4x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-cortex-a7-prfm.S", - "XNNPACK/src/qu8-gemm/gen/qu8-gemm-4x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-cortex-a7.S", - "XNNPACK/src/qu8-gemm/gen/qu8-gemm-4x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-cortex-a53-prfm.S", - "XNNPACK/src/qu8-gemm/gen/qu8-gemm-4x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-cortex-a53.S", - "XNNPACK/src/qu8-gemm/gen/qu8-gemm-4x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-ld64-prfm.S", - "XNNPACK/src/qu8-gemm/gen/qu8-gemm-4x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-ld64.S", - "XNNPACK/src/qu8-igemm/gen/qu8-igemm-1x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-cortex-a7-prfm.S", - "XNNPACK/src/qu8-igemm/gen/qu8-igemm-1x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-cortex-a7.S", - "XNNPACK/src/qu8-igemm/gen/qu8-igemm-4x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-cortex-a7-prfm.S", - "XNNPACK/src/qu8-igemm/gen/qu8-igemm-4x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-cortex-a7.S", - "XNNPACK/src/qu8-igemm/gen/qu8-igemm-4x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-cortex-a53-prfm.S", - "XNNPACK/src/qu8-igemm/gen/qu8-igemm-4x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-cortex-a53.S", - "XNNPACK/src/qu8-igemm/gen/qu8-igemm-4x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-ld64-prfm.S", - "XNNPACK/src/qu8-igemm/gen/qu8-igemm-4x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-ld64.S", - "XNNPACK/src/u32-filterbank-accumulate/u32-filterbank-accumulate-asm-aarch32-arm-x1.S", - "XNNPACK/src/u32-filterbank-accumulate/u32-filterbank-accumulate-asm-aarch32-neon-x1.S", - "XNNPACK/src/u32-filterbank-accumulate/u32-filterbank-accumulate-asm-aarch32-neon-x2.S", -] - -PROD_FMA_MICROKERNEL_SRCS = [ - "XNNPACK/src/amalgam/gen/fma.c", -] - -PROD_NEON_AARCH64_MICROKERNEL_SRCS = [ - "XNNPACK/src/amalgam/gen/neon-aarch64.c", - "XNNPACK/src/amalgam/gen/neonfma-aarch64.c", -] - -PROD_SSSE3_MICROKERNEL_SRCS = [ - "XNNPACK/src/amalgam/gen/ssse3.c", -] - -TABLE_SRCS = [ - "XNNPACK/src/tables/exp2-k-over-64.c", - "XNNPACK/src/tables/exp2-k-over-2048.c", - "XNNPACK/src/tables/exp2minus-k-over-4.c", - "XNNPACK/src/tables/exp2minus-k-over-8.c", - "XNNPACK/src/tables/exp2minus-k-over-16.c", - "XNNPACK/src/tables/exp2minus-k-over-32.c", - "XNNPACK/src/tables/exp2minus-k-over-64.c", - "XNNPACK/src/tables/exp2minus-k-over-2048.c", - "XNNPACK/src/tables/vlog.c", -] - -PROD_AVX512VBMI_MICROKERNEL_SRCS = [ - "XNNPACK/src/amalgam/gen/avx512vbmi.c", -] - -PROD_AVX2_MICROKERNEL_SRCS = [ - "XNNPACK/src/amalgam/gen/avx2.c", -] - -PROD_NEONDOTFP16ARITH_MICROKERNEL_SRCS = [ - "XNNPACK/src/amalgam/gen/neondotfp16arith.c", -] - -PROD_AVXVNNI_MICROKERNEL_SRCS = [ - "XNNPACK/src/amalgam/gen/avxvnni.c", -] - -OPERATOR_SRCS = [ - "XNNPACK/src/operator-delete.c", - "XNNPACK/src/operators/argmax-pooling-nhwc.c", - "XNNPACK/src/operators/average-pooling-nhwc.c", - "XNNPACK/src/operators/batch-matrix-multiply-nc.c", - "XNNPACK/src/operators/binary-elementwise-nd.c", - "XNNPACK/src/operators/channel-shuffle-nc.c", - 
"XNNPACK/src/operators/constant-pad-nd.c", - "XNNPACK/src/operators/convolution-nchw.c", - "XNNPACK/src/operators/convolution-nhwc.c", - "XNNPACK/src/operators/deconvolution-nhwc.c", - "XNNPACK/src/operators/dynamic-fully-connected-nc.c", - "XNNPACK/src/operators/fully-connected-nc.c", - "XNNPACK/src/operators/global-average-pooling-ncw.c", - "XNNPACK/src/operators/global-average-pooling-nwc.c", - "XNNPACK/src/operators/lut-elementwise-nc.c", - "XNNPACK/src/operators/max-pooling-nhwc.c", - "XNNPACK/src/operators/prelu-nc.c", - "XNNPACK/src/operators/reduce-nd.c", - "XNNPACK/src/operators/resize-bilinear-nchw.c", - "XNNPACK/src/operators/resize-bilinear-nhwc.c", - "XNNPACK/src/operators/rope-nthc.c", - "XNNPACK/src/operators/scaled-dot-product-attention-nhtc.c", - "XNNPACK/src/operators/slice-nd.c", - "XNNPACK/src/operators/softmax-nc.c", - "XNNPACK/src/operators/transpose-nd.c", - "XNNPACK/src/operators/unary-elementwise-nc.c", - "XNNPACK/src/operators/unpooling-nhwc.c", -] - -PROD_NEONFP16ARITH_MICROKERNEL_SRCS = [ - "XNNPACK/src/amalgam/gen/neonfp16arith.c", -] - -PROD_F16C_MICROKERNEL_SRCS = [ - "XNNPACK/src/amalgam/gen/f16c.c", -] - -PROD_XOP_MICROKERNEL_SRCS = [ -] - -PROD_AVX512F_MICROKERNEL_SRCS = [ - "XNNPACK/src/amalgam/gen/avx512f.c", -] - -SUBGRAPH_SRCS = [ - "XNNPACK/src/memory-planner.c", - "XNNPACK/src/runtime.c", - "XNNPACK/src/subgraph.c", - "XNNPACK/src/subgraph/abs.c", - "XNNPACK/src/subgraph/add2.c", - "XNNPACK/src/subgraph/argmax-pooling-2d.c", - "XNNPACK/src/subgraph/average-pooling-2d.c", - "XNNPACK/src/subgraph/bankers-rounding.c", - "XNNPACK/src/subgraph/batch-matrix-multiply.c", - "XNNPACK/src/subgraph/ceiling.c", - "XNNPACK/src/subgraph/clamp.c", - "XNNPACK/src/subgraph/concatenate.c", - "XNNPACK/src/subgraph/convert.c", - "XNNPACK/src/subgraph/convolution-2d.c", - "XNNPACK/src/subgraph/copy.c", - "XNNPACK/src/subgraph/deconvolution-2d.c", - "XNNPACK/src/subgraph/depth-to-space-2d.c", - "XNNPACK/src/subgraph/depthwise-convolution-2d.c", - "XNNPACK/src/subgraph/divide.c", - "XNNPACK/src/subgraph/elu.c", - "XNNPACK/src/subgraph/even-split.c", - "XNNPACK/src/subgraph/floor.c", - "XNNPACK/src/subgraph/fully-connected-sparse.c", - "XNNPACK/src/subgraph/fully-connected.c", - "XNNPACK/src/subgraph/global-average-pooling.c", - "XNNPACK/src/subgraph/global-sum-pooling.c", - "XNNPACK/src/subgraph/hardswish.c", - "XNNPACK/src/subgraph/leaky-relu.c", - "XNNPACK/src/subgraph/max-pooling-2d.c", - "XNNPACK/src/subgraph/maximum2.c", - "XNNPACK/src/subgraph/minimum2.c", - "XNNPACK/src/subgraph/multiply2.c", - "XNNPACK/src/subgraph/negate.c", - "XNNPACK/src/subgraph/prelu.c", - "XNNPACK/src/subgraph/reshape-helpers.c", - "XNNPACK/src/subgraph/scaled-dot-product-attention.c", - "XNNPACK/src/subgraph/sigmoid.c", - "XNNPACK/src/subgraph/softmax.c", - "XNNPACK/src/subgraph/space-to-depth-2d.c", - "XNNPACK/src/subgraph/square-root.c", - "XNNPACK/src/subgraph/square.c", - "XNNPACK/src/subgraph/squared-difference.c", - "XNNPACK/src/subgraph/static-constant-pad.c", - "XNNPACK/src/subgraph/static-mean.c", - "XNNPACK/src/subgraph/static-reshape.c", - "XNNPACK/src/subgraph/static-resize-bilinear-2d.c", - "XNNPACK/src/subgraph/static-slice.c", - "XNNPACK/src/subgraph/static-transpose.c", - "XNNPACK/src/subgraph/subtract.c", - "XNNPACK/src/subgraph/tanh.c", - "XNNPACK/src/subgraph/unpooling-2d.c", - "XNNPACK/src/subgraph/validation.c", - "XNNPACK/src/tensor.c", -] - -PROD_RVV_MICROKERNEL_SRCS = [ - "XNNPACK/src/amalgam/gen/rvv.c", -] - -AARCH64_ASM_MICROKERNEL_SRCS = [ - 
"XNNPACK/src/f16-gemm/gen/f16-gemm-1x8-minmax-asm-aarch64-neonfp16arith-ld64.S", - "XNNPACK/src/f16-gemm/gen/f16-gemm-1x16-minmax-asm-aarch64-neonfp16arith-ld32.S", - "XNNPACK/src/f16-gemm/gen/f16-gemm-1x16-minmax-asm-aarch64-neonfp16arith-ld64.S", - "XNNPACK/src/f16-gemm/gen/f16-gemm-4x8-minmax-asm-aarch64-neonfp16arith-ld64.S", - "XNNPACK/src/f16-gemm/gen/f16-gemm-4x16-minmax-asm-aarch64-neonfp16arith-ld32.S", - "XNNPACK/src/f16-gemm/gen/f16-gemm-4x16-minmax-asm-aarch64-neonfp16arith-ld64.S", - "XNNPACK/src/f16-gemm/gen/f16-gemm-6x8-minmax-asm-aarch64-neonfp16arith-ld64.S", - "XNNPACK/src/f16-gemm/gen/f16-gemm-6x16-minmax-asm-aarch64-neonfp16arith-cortex-a55.S", - "XNNPACK/src/f16-gemm/gen/f16-gemm-6x16-minmax-asm-aarch64-neonfp16arith-cortex-a55r0.S", - "XNNPACK/src/f16-gemm/gen/f16-gemm-6x16-minmax-asm-aarch64-neonfp16arith-cortex-a75.S", - "XNNPACK/src/f16-gemm/gen/f16-gemm-6x16-minmax-asm-aarch64-neonfp16arith-ld32.S", - "XNNPACK/src/f16-gemm/gen/f16-gemm-6x16-minmax-asm-aarch64-neonfp16arith-ld64.S", - "XNNPACK/src/f16-gemm/gen/f16-gemm-8x8-minmax-asm-aarch64-neonfp16arith-ld64.S", - "XNNPACK/src/f16-gemm/gen/f16-gemminc-1x8-minmax-asm-aarch64-neonfp16arith-ld64.S", - "XNNPACK/src/f16-gemm/gen/f16-gemminc-1x16-minmax-asm-aarch64-neonfp16arith-ld32.S", - "XNNPACK/src/f16-gemm/gen/f16-gemminc-4x8-minmax-asm-aarch64-neonfp16arith-ld64.S", - "XNNPACK/src/f16-gemm/gen/f16-gemminc-4x16-minmax-asm-aarch64-neonfp16arith-ld32.S", - "XNNPACK/src/f16-gemm/gen/f16-gemminc-6x8-minmax-asm-aarch64-neonfp16arith-ld64.S", - "XNNPACK/src/f16-gemm/gen/f16-gemminc-6x16-minmax-asm-aarch64-neonfp16arith-cortex-a55.S", - "XNNPACK/src/f16-gemm/gen/f16-gemminc-6x16-minmax-asm-aarch64-neonfp16arith-cortex-a75.S", - "XNNPACK/src/f16-gemm/gen/f16-gemminc-6x16-minmax-asm-aarch64-neonfp16arith-ld32.S", - "XNNPACK/src/f16-gemm/gen/f16-gemminc-8x8-minmax-asm-aarch64-neonfp16arith-ld64.S", - "XNNPACK/src/f16-igemm/f16-igemm-1x16-minmax-asm-aarch64-neonfp16arith-ld32.S", - "XNNPACK/src/f16-igemm/f16-igemm-1x16-minmax-asm-aarch64-neonfp16arith-ld64.S", - "XNNPACK/src/f16-igemm/f16-igemm-4x16-minmax-asm-aarch64-neonfp16arith-ld32.S", - "XNNPACK/src/f16-igemm/f16-igemm-4x16-minmax-asm-aarch64-neonfp16arith-ld64.S", - "XNNPACK/src/f16-igemm/f16-igemm-6x16-minmax-asm-aarch64-neonfp16arith-cortex-a55.S", - "XNNPACK/src/f16-igemm/f16-igemm-6x16-minmax-asm-aarch64-neonfp16arith-cortex-a55r0.S", - "XNNPACK/src/f16-igemm/f16-igemm-6x16-minmax-asm-aarch64-neonfp16arith-cortex-a75.S", - "XNNPACK/src/f16-igemm/f16-igemm-6x16-minmax-asm-aarch64-neonfp16arith-ld32.S", - "XNNPACK/src/f16-igemm/f16-igemm-6x16-minmax-asm-aarch64-neonfp16arith-ld64.S", - "XNNPACK/src/f32-dwconv/f32-dwconv-9p4c-minmax-asm-aarch64-neonfma-cortex-a55.S", - "XNNPACK/src/f32-dwconv/f32-dwconv-9p4c-minmax-asm-aarch64-neonfma.S", - "XNNPACK/src/f32-gemm/gen/f32-gemm-1x8-minmax-asm-aarch64-neon-ld128-acc2-prfm.S", - "XNNPACK/src/f32-gemm/gen/f32-gemm-1x8-minmax-asm-aarch64-neon-ld128-acc2.S", - "XNNPACK/src/f32-gemm/gen/f32-gemm-1x8-minmax-asm-aarch64-neonfma-cortex-a53-prfm.S", - "XNNPACK/src/f32-gemm/gen/f32-gemm-1x8-minmax-asm-aarch64-neonfma-cortex-a53.S", - "XNNPACK/src/f32-gemm/gen/f32-gemm-1x8-minmax-asm-aarch64-neonfma-cortex-a75-prfm.S", - "XNNPACK/src/f32-gemm/gen/f32-gemm-1x8-minmax-asm-aarch64-neonfma-cortex-a75.S", - "XNNPACK/src/f32-gemm/gen/f32-gemm-1x8-minmax-asm-aarch64-neonfma-ld64-acc2-prfm.S", - "XNNPACK/src/f32-gemm/gen/f32-gemm-1x8-minmax-asm-aarch64-neonfma-ld64-acc2.S", - 
"XNNPACK/src/f32-gemm/gen/f32-gemm-1x8-minmax-asm-aarch64-neonfma-ld64-acc4-prfm.S", - "XNNPACK/src/f32-gemm/gen/f32-gemm-1x8-minmax-asm-aarch64-neonfma-ld64-acc4.S", - "XNNPACK/src/f32-gemm/gen/f32-gemm-1x8-minmax-asm-aarch64-neonfma-ld64-prfm.S", - "XNNPACK/src/f32-gemm/gen/f32-gemm-1x8-minmax-asm-aarch64-neonfma-ld64.S", - "XNNPACK/src/f32-gemm/gen/f32-gemm-1x8-minmax-asm-aarch64-neonfma-ld128-acc2-prfm.S", - "XNNPACK/src/f32-gemm/gen/f32-gemm-1x8-minmax-asm-aarch64-neonfma-ld128-acc2.S", - "XNNPACK/src/f32-gemm/gen/f32-gemm-1x8-minmax-asm-aarch64-neonfma-ld128-acc4-prfm.S", - "XNNPACK/src/f32-gemm/gen/f32-gemm-1x8-minmax-asm-aarch64-neonfma-ld128-acc4.S", - "XNNPACK/src/f32-gemm/gen/f32-gemm-1x8-minmax-asm-aarch64-neonfma-ld128-prfm.S", - "XNNPACK/src/f32-gemm/gen/f32-gemm-1x8-minmax-asm-aarch64-neonfma-ld128.S", - "XNNPACK/src/f32-gemm/gen/f32-gemm-1x12-minmax-asm-aarch64-neonfma-cortex-a53.S", - "XNNPACK/src/f32-gemm/gen/f32-gemm-4x1-minmax-asm-aarch64-neonfma-ld64.S", - "XNNPACK/src/f32-gemm/gen/f32-gemm-4x1-minmax-asm-aarch64-neonfma-ld128.S", - "XNNPACK/src/f32-gemm/gen/f32-gemm-4x2-minmax-asm-aarch64-neonfma-cortex-a75-prfm.S", - "XNNPACK/src/f32-gemm/gen/f32-gemm-4x2-minmax-asm-aarch64-neonfma-cortex-a75.S", - "XNNPACK/src/f32-gemm/gen/f32-gemm-4x2-minmax-asm-aarch64-neonfma-ld64.S", - "XNNPACK/src/f32-gemm/gen/f32-gemm-4x2-minmax-asm-aarch64-neonfma-ld128.S", - "XNNPACK/src/f32-gemm/gen/f32-gemm-4x8-minmax-asm-aarch64-neonfma-cortex-a53-prfm.S", - "XNNPACK/src/f32-gemm/gen/f32-gemm-4x8-minmax-asm-aarch64-neonfma-cortex-a53.S", - "XNNPACK/src/f32-gemm/gen/f32-gemm-4x8-minmax-asm-aarch64-neonfma-cortex-a55.S", - "XNNPACK/src/f32-gemm/gen/f32-gemm-4x8-minmax-asm-aarch64-neonfma-cortex-a75-prfm.S", - "XNNPACK/src/f32-gemm/gen/f32-gemm-4x8-minmax-asm-aarch64-neonfma-cortex-a75.S", - "XNNPACK/src/f32-gemm/gen/f32-gemm-4x8-minmax-asm-aarch64-neonfma-ld64.S", - "XNNPACK/src/f32-gemm/gen/f32-gemm-4x8-minmax-asm-aarch64-neonfma-ld128.S", - "XNNPACK/src/f32-gemm/gen/f32-gemm-4x12-minmax-asm-aarch64-neonfma-cortex-a53.S", - "XNNPACK/src/f32-gemm/gen/f32-gemm-5x8-minmax-asm-aarch64-neonfma-cortex-a75-prfm.S", - "XNNPACK/src/f32-gemm/gen/f32-gemm-5x8-minmax-asm-aarch64-neonfma-cortex-a75.S", - "XNNPACK/src/f32-gemm/gen/f32-gemm-6x8-minmax-asm-aarch64-neonfma-cortex-a53-prfm.S", - "XNNPACK/src/f32-gemm/gen/f32-gemm-6x8-minmax-asm-aarch64-neonfma-cortex-a53.S", - "XNNPACK/src/f32-gemm/gen/f32-gemm-6x8-minmax-asm-aarch64-neonfma-cortex-a55.S", - "XNNPACK/src/f32-gemm/gen/f32-gemm-6x8-minmax-asm-aarch64-neonfma-cortex-a73.S", - "XNNPACK/src/f32-gemm/gen/f32-gemm-6x8-minmax-asm-aarch64-neonfma-cortex-a75-prfm.S", - "XNNPACK/src/f32-gemm/gen/f32-gemm-6x8-minmax-asm-aarch64-neonfma-cortex-a75.S", - "XNNPACK/src/f32-gemm/gen/f32-gemm-6x8-minmax-asm-aarch64-neonfma-ld64.S", - "XNNPACK/src/f32-gemm/gen/f32-gemm-6x8-minmax-asm-aarch64-neonfma-ld128.S", - "XNNPACK/src/f32-gemm/gen/f32-gemm-goi-1x8-minmax-asm-aarch64-neonfma-ld128-prfm.S", - "XNNPACK/src/f32-gemm/gen/f32-gemm-goi-1x8-minmax-asm-aarch64-neonfma-ld128.S", - "XNNPACK/src/f32-gemm/gen/f32-gemm-goi-4x8-minmax-asm-aarch64-neonfma-ld128.S", - "XNNPACK/src/f32-gemminc/gen/f32-gemminc-1x8-minmax-asm-aarch64-neonfma-cortex-a53.S", - "XNNPACK/src/f32-gemminc/gen/f32-gemminc-1x8-minmax-asm-aarch64-neonfma-cortex-a75-prfm.S", - "XNNPACK/src/f32-gemminc/gen/f32-gemminc-1x8-minmax-asm-aarch64-neonfma-cortex-a75.S", - "XNNPACK/src/f32-gemminc/gen/f32-gemminc-1x8-minmax-asm-aarch64-neonfma-ld64.S", - 
"XNNPACK/src/f32-gemminc/gen/f32-gemminc-1x12-minmax-asm-aarch64-neonfma-cortex-a53.S", - "XNNPACK/src/f32-gemminc/gen/f32-gemminc-4x8-minmax-asm-aarch64-neonfma-cortex-a53.S", - "XNNPACK/src/f32-gemminc/gen/f32-gemminc-4x8-minmax-asm-aarch64-neonfma-cortex-a55.S", - "XNNPACK/src/f32-gemminc/gen/f32-gemminc-4x8-minmax-asm-aarch64-neonfma-cortex-a75-prfm.S", - "XNNPACK/src/f32-gemminc/gen/f32-gemminc-4x8-minmax-asm-aarch64-neonfma-cortex-a75.S", - "XNNPACK/src/f32-gemminc/gen/f32-gemminc-4x8-minmax-asm-aarch64-neonfma-ld64.S", - "XNNPACK/src/f32-gemminc/gen/f32-gemminc-4x8-minmax-asm-aarch64-neonfma-ld128.S", - "XNNPACK/src/f32-gemminc/gen/f32-gemminc-4x12-minmax-asm-aarch64-neonfma-cortex-a53.S", - "XNNPACK/src/f32-gemminc/gen/f32-gemminc-5x8-minmax-asm-aarch64-neonfma-cortex-a75-prfm.S", - "XNNPACK/src/f32-gemminc/gen/f32-gemminc-5x8-minmax-asm-aarch64-neonfma-cortex-a75.S", - "XNNPACK/src/f32-gemminc/gen/f32-gemminc-6x8-minmax-asm-aarch64-neonfma-cortex-a53.S", - "XNNPACK/src/f32-gemminc/gen/f32-gemminc-6x8-minmax-asm-aarch64-neonfma-cortex-a55.S", - "XNNPACK/src/f32-gemminc/gen/f32-gemminc-6x8-minmax-asm-aarch64-neonfma-cortex-a73.S", - "XNNPACK/src/f32-gemminc/gen/f32-gemminc-6x8-minmax-asm-aarch64-neonfma-cortex-a75-prfm.S", - "XNNPACK/src/f32-gemminc/gen/f32-gemminc-6x8-minmax-asm-aarch64-neonfma-cortex-a75.S", - "XNNPACK/src/f32-gemminc/gen/f32-gemminc-6x8-minmax-asm-aarch64-neonfma-ld64.S", - "XNNPACK/src/f32-gemminc/gen/f32-gemminc-6x8-minmax-asm-aarch64-neonfma-ld128.S", - "XNNPACK/src/f32-igemm/f32-igemm-1x12-minmax-asm-aarch64-neonfma-cortex-a53.S", - "XNNPACK/src/f32-igemm/f32-igemm-4x8-minmax-asm-aarch64-neonfma-cortex-a55.S", - "XNNPACK/src/f32-igemm/f32-igemm-4x12-minmax-asm-aarch64-neonfma-cortex-a53.S", - "XNNPACK/src/f32-igemm/f32-igemm-6x8-minmax-asm-aarch64-neonfma-cortex-a55.S", - "XNNPACK/src/f32-igemm/f32-igemm-6x8-minmax-asm-aarch64-neonfma-cortex-a73.S", - "XNNPACK/src/f32-igemm/gen/f32-igemm-1x8-minmax-asm-aarch64-neonfma-cortex-a53-prfm.S", - "XNNPACK/src/f32-igemm/gen/f32-igemm-1x8-minmax-asm-aarch64-neonfma-cortex-a53.S", - "XNNPACK/src/f32-igemm/gen/f32-igemm-1x8-minmax-asm-aarch64-neonfma-cortex-a75-prfm.S", - "XNNPACK/src/f32-igemm/gen/f32-igemm-1x8-minmax-asm-aarch64-neonfma-cortex-a75.S", - "XNNPACK/src/f32-igemm/gen/f32-igemm-1x8-minmax-asm-aarch64-neonfma-ld64-prfm.S", - "XNNPACK/src/f32-igemm/gen/f32-igemm-1x8-minmax-asm-aarch64-neonfma-ld64.S", - "XNNPACK/src/f32-igemm/gen/f32-igemm-4x2-minmax-asm-aarch64-neonfma-cortex-a75-prfm.S", - "XNNPACK/src/f32-igemm/gen/f32-igemm-4x2-minmax-asm-aarch64-neonfma-cortex-a75.S", - "XNNPACK/src/f32-igemm/gen/f32-igemm-4x2-minmax-asm-aarch64-neonfma-ld64.S", - "XNNPACK/src/f32-igemm/gen/f32-igemm-4x8-minmax-asm-aarch64-neonfma-cortex-a53-prfm.S", - "XNNPACK/src/f32-igemm/gen/f32-igemm-4x8-minmax-asm-aarch64-neonfma-cortex-a53.S", - "XNNPACK/src/f32-igemm/gen/f32-igemm-4x8-minmax-asm-aarch64-neonfma-cortex-a75-prfm.S", - "XNNPACK/src/f32-igemm/gen/f32-igemm-4x8-minmax-asm-aarch64-neonfma-cortex-a75.S", - "XNNPACK/src/f32-igemm/gen/f32-igemm-4x8-minmax-asm-aarch64-neonfma-ld64.S", - "XNNPACK/src/f32-igemm/gen/f32-igemm-4x8-minmax-asm-aarch64-neonfma-ld128.S", - "XNNPACK/src/f32-igemm/gen/f32-igemm-5x8-minmax-asm-aarch64-neonfma-cortex-a75-prfm.S", - "XNNPACK/src/f32-igemm/gen/f32-igemm-5x8-minmax-asm-aarch64-neonfma-cortex-a75.S", - "XNNPACK/src/f32-igemm/gen/f32-igemm-6x8-minmax-asm-aarch64-neonfma-cortex-a53-prfm.S", - "XNNPACK/src/f32-igemm/gen/f32-igemm-6x8-minmax-asm-aarch64-neonfma-cortex-a53.S", - 
"XNNPACK/src/f32-igemm/gen/f32-igemm-6x8-minmax-asm-aarch64-neonfma-cortex-a75-prfm.S", - "XNNPACK/src/f32-igemm/gen/f32-igemm-6x8-minmax-asm-aarch64-neonfma-cortex-a75.S", - "XNNPACK/src/f32-igemm/gen/f32-igemm-6x8-minmax-asm-aarch64-neonfma-ld64.S", - "XNNPACK/src/f32-igemm/gen/f32-igemm-6x8-minmax-asm-aarch64-neonfma-ld128.S", - "XNNPACK/src/f32-ppmm/gen/f32-ppmm-4x8-minmax-asm-aarch64-neonfma-cortex-a75-prfm.S", - "XNNPACK/src/f32-ppmm/gen/f32-ppmm-4x8-minmax-asm-aarch64-neonfma-cortex-a75.S", - "XNNPACK/src/f32-ppmm/gen/f32-ppmm-4x8-minmax-asm-aarch64-neonfma-ld128-prfm.S", - "XNNPACK/src/f32-ppmm/gen/f32-ppmm-4x8-minmax-asm-aarch64-neonfma-ld128.S", - "XNNPACK/src/f32-ppmm/gen/f32-ppmm-8x8-minmax-asm-aarch64-neonfma-cortex-a75-prfm.S", - "XNNPACK/src/f32-ppmm/gen/f32-ppmm-8x8-minmax-asm-aarch64-neonfma-cortex-a75.S", - "XNNPACK/src/f32-ppmm/gen/f32-ppmm-8x8-minmax-asm-aarch64-neonfma-ld128-prfm.S", - "XNNPACK/src/f32-ppmm/gen/f32-ppmm-8x8-minmax-asm-aarch64-neonfma-ld128.S", - "XNNPACK/src/f32-qc4w-gemm/gen/f32-qc4w-gemm-1x8-minmax-asm-aarch64-neon-ld128-acc2-prfm.S", - "XNNPACK/src/f32-qc4w-gemm/gen/f32-qc4w-gemm-1x8-minmax-asm-aarch64-neon-ld128-acc2.S", - "XNNPACK/src/f32-qc4w-gemm/gen/f32-qc4w-gemm-1x8-minmax-asm-aarch64-neonfma-ld64-acc2-prfm.S", - "XNNPACK/src/f32-qc4w-gemm/gen/f32-qc4w-gemm-1x8-minmax-asm-aarch64-neonfma-ld64-acc2.S", - "XNNPACK/src/f32-qc4w-gemm/gen/f32-qc4w-gemm-1x8-minmax-asm-aarch64-neonfma-ld64-acc4-prfm.S", - "XNNPACK/src/f32-qc4w-gemm/gen/f32-qc4w-gemm-1x8-minmax-asm-aarch64-neonfma-ld64-acc4.S", - "XNNPACK/src/f32-qc4w-gemm/gen/f32-qc4w-gemm-1x8-minmax-asm-aarch64-neonfma-ld64-prfm.S", - "XNNPACK/src/f32-qc4w-gemm/gen/f32-qc4w-gemm-1x8-minmax-asm-aarch64-neonfma-ld64.S", - "XNNPACK/src/f32-qc4w-gemm/gen/f32-qc4w-gemm-1x8-minmax-asm-aarch64-neonfma-ld128-acc2-prfm.S", - "XNNPACK/src/f32-qc4w-gemm/gen/f32-qc4w-gemm-1x8-minmax-asm-aarch64-neonfma-ld128-acc2.S", - "XNNPACK/src/f32-qc4w-gemm/gen/f32-qc4w-gemm-1x8-minmax-asm-aarch64-neonfma-ld128-acc4-prfm.S", - "XNNPACK/src/f32-qc4w-gemm/gen/f32-qc4w-gemm-1x8-minmax-asm-aarch64-neonfma-ld128-acc4.S", - "XNNPACK/src/f32-qc4w-gemm/gen/f32-qc4w-gemm-1x8-minmax-asm-aarch64-neonfma-ld128-prfm.S", - "XNNPACK/src/f32-qc4w-gemm/gen/f32-qc4w-gemm-1x8-minmax-asm-aarch64-neonfma-ld128.S", - "XNNPACK/src/f32-qc4w-gemm/gen/f32-qc4w-gemm-4x1-minmax-asm-aarch64-neonfma-ld64.S", - "XNNPACK/src/f32-qc4w-gemm/gen/f32-qc4w-gemm-4x1-minmax-asm-aarch64-neonfma-ld128.S", - "XNNPACK/src/f32-qc4w-gemm/gen/f32-qc4w-gemm-4x2-minmax-asm-aarch64-neonfma-ld64.S", - "XNNPACK/src/f32-qc4w-gemm/gen/f32-qc4w-gemm-4x2-minmax-asm-aarch64-neonfma-ld128.S", - "XNNPACK/src/f32-qc4w-gemm/gen/f32-qc4w-gemm-4x8-minmax-asm-aarch64-neonfma-ld64.S", - "XNNPACK/src/f32-qc4w-gemm/gen/f32-qc4w-gemm-4x8-minmax-asm-aarch64-neonfma-ld128.S", - "XNNPACK/src/f32-qc4w-gemm/gen/f32-qc4w-gemm-6x8-minmax-asm-aarch64-neonfma-ld64.S", - "XNNPACK/src/f32-qc4w-gemm/gen/f32-qc4w-gemm-6x8-minmax-asm-aarch64-neonfma-ld128.S", - "XNNPACK/src/f32-qc8w-gemm/gen/f32-qc8w-gemm-1x8-minmax-asm-aarch64-neon-ld128-acc2-prfm.S", - "XNNPACK/src/f32-qc8w-gemm/gen/f32-qc8w-gemm-1x8-minmax-asm-aarch64-neon-ld128-acc2.S", - "XNNPACK/src/f32-qc8w-gemm/gen/f32-qc8w-gemm-1x8-minmax-asm-aarch64-neonfma-ld64-acc2-prfm.S", - "XNNPACK/src/f32-qc8w-gemm/gen/f32-qc8w-gemm-1x8-minmax-asm-aarch64-neonfma-ld64-acc2.S", - "XNNPACK/src/f32-qc8w-gemm/gen/f32-qc8w-gemm-1x8-minmax-asm-aarch64-neonfma-ld64-acc4-prfm.S", - 
"XNNPACK/src/f32-qc8w-gemm/gen/f32-qc8w-gemm-1x8-minmax-asm-aarch64-neonfma-ld64-acc4.S", - "XNNPACK/src/f32-qc8w-gemm/gen/f32-qc8w-gemm-1x8-minmax-asm-aarch64-neonfma-ld64-prfm.S", - "XNNPACK/src/f32-qc8w-gemm/gen/f32-qc8w-gemm-1x8-minmax-asm-aarch64-neonfma-ld64.S", - "XNNPACK/src/f32-qc8w-gemm/gen/f32-qc8w-gemm-1x8-minmax-asm-aarch64-neonfma-ld128-acc2-prfm.S", - "XNNPACK/src/f32-qc8w-gemm/gen/f32-qc8w-gemm-1x8-minmax-asm-aarch64-neonfma-ld128-acc2.S", - "XNNPACK/src/f32-qc8w-gemm/gen/f32-qc8w-gemm-1x8-minmax-asm-aarch64-neonfma-ld128-acc4-prfm.S", - "XNNPACK/src/f32-qc8w-gemm/gen/f32-qc8w-gemm-1x8-minmax-asm-aarch64-neonfma-ld128-acc4.S", - "XNNPACK/src/f32-qc8w-gemm/gen/f32-qc8w-gemm-1x8-minmax-asm-aarch64-neonfma-ld128-prfm.S", - "XNNPACK/src/f32-qc8w-gemm/gen/f32-qc8w-gemm-1x8-minmax-asm-aarch64-neonfma-ld128.S", - "XNNPACK/src/f32-qc8w-gemm/gen/f32-qc8w-gemm-4x1-minmax-asm-aarch64-neonfma-ld64.S", - "XNNPACK/src/f32-qc8w-gemm/gen/f32-qc8w-gemm-4x1-minmax-asm-aarch64-neonfma-ld128.S", - "XNNPACK/src/f32-qc8w-gemm/gen/f32-qc8w-gemm-4x2-minmax-asm-aarch64-neonfma-ld64.S", - "XNNPACK/src/f32-qc8w-gemm/gen/f32-qc8w-gemm-4x2-minmax-asm-aarch64-neonfma-ld128.S", - "XNNPACK/src/f32-qc8w-gemm/gen/f32-qc8w-gemm-4x8-minmax-asm-aarch64-neonfma-ld64.S", - "XNNPACK/src/f32-qc8w-gemm/gen/f32-qc8w-gemm-4x8-minmax-asm-aarch64-neonfma-ld128.S", - "XNNPACK/src/f32-qc8w-gemm/gen/f32-qc8w-gemm-6x8-minmax-asm-aarch64-neonfma-ld64.S", - "XNNPACK/src/f32-qc8w-gemm/gen/f32-qc8w-gemm-6x8-minmax-asm-aarch64-neonfma-ld128.S", - "XNNPACK/src/qd8-f16-qc8w-gemm/gen/qd8-f16-qc8w-gemm-4x16c4-minmax-asm-aarch64-neondot-ld128.S", - "XNNPACK/src/qd8-f16-qc8w-gemm/gen/qd8-f16-qc8w-gemm-4x16c4-minmax-asm-aarch64-neondotfp16arith-cortex-a55.S", - "XNNPACK/src/qd8-f16-qc8w-igemm/gen/qd8-f16-qc8w-igemm-4x16c4-minmax-asm-aarch64-neondot-cortex-a55.S", - "XNNPACK/src/qd8-f16-qc8w-igemm/gen/qd8-f16-qc8w-igemm-4x16c4-minmax-asm-aarch64-neondot-ld128.S", - "XNNPACK/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-4x16c4-minmax-asm-aarch64-neondot-cortex-a55.S", - "XNNPACK/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-4x16c4-minmax-asm-aarch64-neondot-ld64.S", - "XNNPACK/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-4x16c4-minmax-asm-aarch64-neondot-ld128.S", - "XNNPACK/src/qd8-f32-qc8w-igemm/gen/qd8-f32-qc8w-igemm-4x16c4-minmax-asm-aarch64-neondot-cortex-a55.S", - "XNNPACK/src/qd8-f32-qc8w-igemm/gen/qd8-f32-qc8w-igemm-4x16c4-minmax-asm-aarch64-neondot-ld128.S", - "XNNPACK/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-1x8c8-minmax-fp32-asm-aarch64-neon-mlal-cortex-a53-prfm.S", - "XNNPACK/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-1x8c8-minmax-fp32-asm-aarch64-neon-mlal-cortex-a53.S", - "XNNPACK/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-1x8c8-minmax-fp32-asm-aarch64-neon-mlal-prfm.S", - "XNNPACK/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-1x8c8-minmax-fp32-asm-aarch64-neon-mlal.S", - "XNNPACK/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-1x16c4-minmax-fp32-asm-aarch64-neondot-ld32.S", - "XNNPACK/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-1x16c4-minmax-fp32-asm-aarch64-neondot-ld64.S", - "XNNPACK/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-2x8c8-minmax-fp32-asm-aarch64-neon-mlal-cortex-a53-prfm.S", - "XNNPACK/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-2x8c8-minmax-fp32-asm-aarch64-neon-mlal-cortex-a53.S", - "XNNPACK/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-2x8c8-minmax-fp32-asm-aarch64-neon-mlal-prfm.S", - "XNNPACK/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-2x8c8-minmax-fp32-asm-aarch64-neon-mlal.S", - "XNNPACK/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-2x8c8-minmax-fp32-asm-aarch64-neon-mull.S", - 
"XNNPACK/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-2x8c16-minmax-fp32-asm-aarch64-neon-mlal.S", - "XNNPACK/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-4x16-minmax-fp32-asm-aarch64-neon-mlal-lane-cortex-a53-prfm.S", - "XNNPACK/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-4x16-minmax-fp32-asm-aarch64-neon-mlal-lane-cortex-a53.S", - "XNNPACK/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-4x16-minmax-fp32-asm-aarch64-neon-mlal-lane-ld64-prfm.S", - "XNNPACK/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-4x16-minmax-fp32-asm-aarch64-neon-mlal-lane-ld64.S", - "XNNPACK/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-4x16c4-minmax-fp32-asm-aarch64-neondot-cortex-a55.S", - "XNNPACK/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-4x16c4-minmax-fp32-asm-aarch64-neondot-ld32.S", - "XNNPACK/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-4x16c4-minmax-fp32-asm-aarch64-neondot-ld64.S", - "XNNPACK/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-4x16c4-minmax-fp32-asm-aarch64-neondot-ld128.S", - "XNNPACK/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-1x8c8-minmax-fp32-asm-aarch64-neon-mlal-cortex-a53-prfm.S", - "XNNPACK/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-1x8c8-minmax-fp32-asm-aarch64-neon-mlal-cortex-a53.S", - "XNNPACK/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-1x8c8-minmax-fp32-asm-aarch64-neon-mlal-prfm.S", - "XNNPACK/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-1x8c8-minmax-fp32-asm-aarch64-neon-mlal.S", - "XNNPACK/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-2x8c8-minmax-fp32-asm-aarch64-neon-mlal-cortex-a53-prfm.S", - "XNNPACK/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-2x8c8-minmax-fp32-asm-aarch64-neon-mlal-cortex-a53.S", - "XNNPACK/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-2x8c8-minmax-fp32-asm-aarch64-neon-mlal-prfm.S", - "XNNPACK/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-2x8c8-minmax-fp32-asm-aarch64-neon-mlal.S", - "XNNPACK/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-2x8c16-minmax-fp32-asm-aarch64-neon-mlal.S", - "XNNPACK/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x16-minmax-fp32-asm-aarch64-neon-mlal-lane-cortex-a53-prfm.S", - "XNNPACK/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x16-minmax-fp32-asm-aarch64-neon-mlal-lane-cortex-a53.S", - "XNNPACK/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x16-minmax-fp32-asm-aarch64-neon-mlal-lane-ld64-prfm.S", - "XNNPACK/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x16-minmax-fp32-asm-aarch64-neon-mlal-lane-ld64.S", - "XNNPACK/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x16c4-minmax-fp32-asm-aarch64-neondot-cortex-a55.S", - "XNNPACK/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x16c4-minmax-fp32-asm-aarch64-neondot-ld64.S", - "XNNPACK/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x16c4-minmax-fp32-asm-aarch64-neondot-ld128.S", - "XNNPACK/src/qu8-gemm/gen/qu8-gemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-cortex-a53-prfm.S", - "XNNPACK/src/qu8-gemm/gen/qu8-gemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-cortex-a53.S", - "XNNPACK/src/qu8-gemm/gen/qu8-gemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-cortex-a75-prfm.S", - "XNNPACK/src/qu8-gemm/gen/qu8-gemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-cortex-a75.S", - "XNNPACK/src/qu8-gemm/gen/qu8-gemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-ld64-prfm.S", - "XNNPACK/src/qu8-gemm/gen/qu8-gemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-ld64.S", - "XNNPACK/src/qu8-igemm/gen/qu8-igemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-cortex-a53-prfm.S", - "XNNPACK/src/qu8-igemm/gen/qu8-igemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-cortex-a53.S", - "XNNPACK/src/qu8-igemm/gen/qu8-igemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-cortex-a75-prfm.S", - "XNNPACK/src/qu8-igemm/gen/qu8-igemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-cortex-a75.S", - 
"XNNPACK/src/qu8-igemm/gen/qu8-igemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-ld64-prfm.S", - "XNNPACK/src/qu8-igemm/gen/qu8-igemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-ld64.S", -] - -XNNPACK_SRCS = [ - "XNNPACK/src/configs/argmaxpool-config.c", - "XNNPACK/src/configs/avgpool-config.c", - "XNNPACK/src/configs/binary-elementwise-config.c", - "XNNPACK/src/configs/cmul-config.c", - "XNNPACK/src/configs/conv-hwc2chw-config.c", - "XNNPACK/src/configs/dwconv-config.c", - "XNNPACK/src/configs/dwconv2d-chw-config.c", - "XNNPACK/src/configs/experiments-config.c", - "XNNPACK/src/configs/gavgpool-config.c", - "XNNPACK/src/configs/gavgpool-cw-config.c", - "XNNPACK/src/configs/gemm-config.c", - "XNNPACK/src/configs/ibilinear-chw-config.c", - "XNNPACK/src/configs/ibilinear-config.c", - "XNNPACK/src/configs/lut32norm-config.c", - "XNNPACK/src/configs/maxpool-config.c", - "XNNPACK/src/configs/pavgpool-config.c", - "XNNPACK/src/configs/prelu-config.c", - "XNNPACK/src/configs/raddstoreexpminusmax-config.c", - "XNNPACK/src/configs/reduce-config.c", - "XNNPACK/src/configs/rmax-config.c", - "XNNPACK/src/configs/spmm-config.c", - "XNNPACK/src/configs/transpose-config.c", - "XNNPACK/src/configs/unary-elementwise-config.c", - "XNNPACK/src/configs/unpool-config.c", - "XNNPACK/src/configs/vmulcaddc-config.c", - "XNNPACK/src/configs/xx-fill-config.c", - "XNNPACK/src/configs/xx-pad-config.c", - "XNNPACK/src/configs/x8-lut-config.c", - "XNNPACK/src/configs/zip-config.c", - "XNNPACK/src/init.c", - "XNNPACK/src/params.c", -] - -JIT_SRCS = [ - "XNNPACK/src/jit/aarch32-assembler.cc", - "XNNPACK/src/jit/aarch64-assembler.cc", - "XNNPACK/src/jit/assembler.cc", -] - -PROD_NEONFMA_MICROKERNEL_SRCS = [ - "XNNPACK/src/amalgam/gen/neonfma.c", -] - -PROD_NEONFP16ARITH_AARCH64_MICROKERNEL_SRCS = [ - "XNNPACK/src/amalgam/gen/neonfp16arith-aarch64.c", -] - -PROD_NEONFP16_MICROKERNEL_SRCS = [ - "XNNPACK/src/amalgam/gen/neonfp16.c", -] - -PROD_AVX512VNNIGFNI_MICROKERNEL_SRCS = [ - "XNNPACK/src/amalgam/gen/avx512vnnigfni.c", -] - -PROD_NEONDOT_AARCH64_MICROKERNEL_SRCS = [ - "XNNPACK/src/amalgam/gen/neondot-aarch64.c", -] - -PROD_ARMSIMD32_MICROKERNEL_SRCS = [ - "XNNPACK/src/amalgam/gen/armsimd32.c", -] - -PROD_NEONDOT_MICROKERNEL_SRCS = [ - "XNNPACK/src/amalgam/gen/neondot.c", -] - -PROD_SCALAR_MICROKERNEL_SRCS = [ - "XNNPACK/src/amalgam/gen/scalar.c", -] - -PROD_SSE41_MICROKERNEL_SRCS = [ - "XNNPACK/src/amalgam/gen/sse41.c", -] - -PROD_NEONI8MM_MICROKERNEL_SRCS = [ - "XNNPACK/src/amalgam/gen/neoni8mm.c", -] - -LOGGING_SRCS = [ - "XNNPACK/src/enums/datatype-strings.c", - "XNNPACK/src/enums/microkernel-type.c", - "XNNPACK/src/enums/node-type.c", - "XNNPACK/src/enums/operator-type.c", - "XNNPACK/src/log.c", -] - -PROD_NEON_MICROKERNEL_SRCS = [ - "XNNPACK/src/amalgam/gen/neon.c", -] - -PROD_AVX512VNNI_MICROKERNEL_SRCS = [ - "XNNPACK/src/amalgam/gen/avx512vnni.c", -] - -PROD_AVX_MICROKERNEL_SRCS = [ - "XNNPACK/src/amalgam/gen/avx.c", -] +load( + "//backends/xnnpack/third-party/XNNPACK:build_srcs.bzl", + _LOGGING_SRCS = "LOGGING_SRCS", + _OPERATOR_SRCS = "OPERATOR_SRCS", + _SUBGRAPH_SRCS = "SUBGRAPH_SRCS", + _TABLE_SRCS = "TABLE_SRCS", + _XNNPACK_SRCS = "XNNPACK_SRCS", +) +load("//backends/xnnpack/third-party/XNNPACK/gen:microkernels.bzl", "prod_srcs_for_arch") +load("//third-party:glob_defs.bzl", "subdir_glob") + +def define_xnnpack_build_src(xnnpack_build_src): + return ["XNNPACK/{}".format(src) for src in xnnpack_build_src] + +def prod_srcs_for_arch_wrapper(arch): + prod_srcs = prod_srcs_for_arch(arch) + return 
define_xnnpack_build_src(prod_srcs) + +def get_xnnpack_headers(): + # XNNPACK Headers in the path containing xnnpack/ or configs/ + # do not contain the src/ path. However headers not in xnnpack/ or + # configs/ are prepended with the src/ path. This function helps us + # to correctly map all the header files to the correct name + src_headers = subdir_glob([ + ("XNNPACK/src", "**/*.h"), + ]) + fixed_headers = {} + for k, v in src_headers.items(): + new_key = k + if not k.startswith("xnnpack") and not k.startswith("configs"): + new_key = "src/{}".format(k) + fixed_headers[new_key] = v + include_headers = subdir_glob([ + ("XNNPACK/include", "*.h"), + ]) + + return fixed_headers | include_headers + +OPERATOR_SRCS = define_xnnpack_build_src(_OPERATOR_SRCS) +SUBGRAPH_SRCS = define_xnnpack_build_src(_SUBGRAPH_SRCS) +TABLE_SRCS = define_xnnpack_build_src(_TABLE_SRCS) +XNNPACK_SRCS = define_xnnpack_build_src(_XNNPACK_SRCS) +LOGGING_SRCS = define_xnnpack_build_src(_LOGGING_SRCS) diff --git a/backends/xnnpack/third-party/xnnpack_wrapper_defs.bzl b/backends/xnnpack/third-party/xnnpack_wrapper_defs.bzl deleted file mode 100644 index a9d4af95ccf..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrapper_defs.bzl +++ /dev/null @@ -1,461 +0,0 @@ -""" -Auto-generated by generate-wrappers.py script. Do not modify -""" - -PROD_SCALAR_MICROKERNEL_SRCS = [ - "xnnpack_wrappers/amalgam/gen/scalar.c", -] - -PROD_FMA_MICROKERNEL_SRCS = [ - "xnnpack_wrappers/amalgam/gen/fma.c", -] - -PROD_ARMSIMD32_MICROKERNEL_SRCS = [ - "xnnpack_wrappers/amalgam/gen/armsimd32.c", -] - -PROD_FP16ARITH_MICROKERNEL_SRCS = [ - "xnnpack_wrappers/amalgam/gen/fp16arith.c", -] - -PROD_NEON_MICROKERNEL_SRCS = [ - "xnnpack_wrappers/amalgam/gen/neon.c", -] - -PROD_NEONFP16_MICROKERNEL_SRCS = [ - "xnnpack_wrappers/amalgam/gen/neonfp16.c", -] - -PROD_NEONFMA_MICROKERNEL_SRCS = [ - "xnnpack_wrappers/amalgam/gen/neonfma.c", -] - -PROD_NEON_AARCH64_MICROKERNEL_SRCS = [ - "xnnpack_wrappers/amalgam/gen/neon-aarch64.c", - "xnnpack_wrappers/amalgam/gen/neonfma-aarch64.c", -] - -PROD_NEONV8_MICROKERNEL_SRCS = [ - "xnnpack_wrappers/amalgam/gen/neonv8.c", -] - -PROD_NEONFP16ARITH_MICROKERNEL_SRCS = [ - "xnnpack_wrappers/amalgam/gen/neonfp16arith.c", -] - -PROD_NEONFP16ARITH_AARCH64_MICROKERNEL_SRCS = [ - "xnnpack_wrappers/amalgam/gen/neonfp16arith-aarch64.c", -] - -PROD_NEONDOT_MICROKERNEL_SRCS = [ - "xnnpack_wrappers/amalgam/gen/neondot.c", -] - -PROD_NEONDOT_AARCH64_MICROKERNEL_SRCS = [ - "xnnpack_wrappers/amalgam/gen/neondot-aarch64.c", -] - -PROD_NEONDOTFP16ARITH_MICROKERNEL_SRCS = [ - "xnnpack_wrappers/amalgam/gen/neondotfp16arith.c", -] - -PROD_NEONDOTFP16ARITH_AARCH64_MICROKERNEL_SRCS = [ - "xnnpack_wrappers/amalgam/gen/neondotfp16-aarch64.c", -] - -PROD_NEONI8MM_MICROKERNEL_SRCS = [ - "xnnpack_wrappers/amalgam/gen/neoni8mm.c", -] - -PROD_SSE_MICROKERNEL_SRCS = [ - "xnnpack_wrappers/amalgam/gen/sse.c", -] - -PROD_SSE2_MICROKERNEL_SRCS = [ - "xnnpack_wrappers/amalgam/gen/sse2.c", -] - -PROD_SSSE3_MICROKERNEL_SRCS = [ - "xnnpack_wrappers/amalgam/gen/ssse3.c", -] - -PROD_SSE41_MICROKERNEL_SRCS = [ - "xnnpack_wrappers/amalgam/gen/sse41.c", -] - -PROD_AVX_MICROKERNEL_SRCS = [ - "xnnpack_wrappers/amalgam/gen/avx.c", -] - -PROD_F16C_MICROKERNEL_SRCS = [ - "xnnpack_wrappers/amalgam/gen/f16c.c", -] - -PROD_XOP_MICROKERNEL_SRCS = [ -] - -PROD_FMA3_MICROKERNEL_SRCS = [ - "xnnpack_wrappers/amalgam/gen/fma3.c", -] - -PROD_AVX2_MICROKERNEL_SRCS = [ - "xnnpack_wrappers/amalgam/gen/avx2.c", -] - -PROD_AVX512F_MICROKERNEL_SRCS = [ - 
"xnnpack_wrappers/amalgam/gen/avx512f.c", -] - -PROD_AVX512SKX_MICROKERNEL_SRCS = [ - "xnnpack_wrappers/amalgam/gen/avx512skx.c", -] - -PROD_AVX512VBMI_MICROKERNEL_SRCS = [ - "xnnpack_wrappers/amalgam/gen/avx512vbmi.c", -] - -PROD_AVX512VNNIGFNI_MICROKERNEL_SRCS = [ - "xnnpack_wrappers/amalgam/gen/avx512vnnigfni.c", -] - -PROD_AVX512VNNI_MICROKERNEL_SRCS = [ - "xnnpack_wrappers/amalgam/gen/avx512vnni.c", -] - -PROD_RVV_MICROKERNEL_SRCS = [ - "xnnpack_wrappers/amalgam/gen/rvv.c", -] - -PROD_AVXVNNI_MICROKERNEL_SRCS = [ - "xnnpack_wrappers/amalgam/gen/avxvnni.c", -] - -AARCH32_ASM_MICROKERNEL_SRCS = [ - "xnnpack_wrappers/cs16-bfly4/cs16-bfly4-samples1-asm-aarch32-neon-x1.S", - "xnnpack_wrappers/cs16-bfly4/cs16-bfly4-samples1-asm-aarch32-neon-x2.S", - "xnnpack_wrappers/cs16-bfly4/cs16-bfly4-samples1-asm-aarch32-neon-x4.S", - "xnnpack_wrappers/cs16-fftr/cs16-fftr-asm-aarch32-neon-x1.S", - "xnnpack_wrappers/cs16-fftr/cs16-fftr-asm-aarch32-neon-x4.S", - "xnnpack_wrappers/f32-gemm/gen/f32-gemm-1x8-minmax-asm-aarch32-neon-cortex-a53-prfm.S", - "xnnpack_wrappers/f32-gemm/gen/f32-gemm-1x8-minmax-asm-aarch32-neon-cortex-a53.S", - "xnnpack_wrappers/f32-gemm/gen/f32-gemm-4x4-asm-aarch32-vfp-ld64.S", - "xnnpack_wrappers/f32-gemm/gen/f32-gemm-4x4-minmax-asm-aarch32-vfp-ld64.S", - "xnnpack_wrappers/f32-gemm/gen/f32-gemm-4x8-minmax-asm-aarch32-neon-cortex-a7.S", - "xnnpack_wrappers/f32-gemm/gen/f32-gemm-4x8-minmax-asm-aarch32-neon-cortex-a53-prfm.S", - "xnnpack_wrappers/f32-gemm/gen/f32-gemm-4x8-minmax-asm-aarch32-neon-cortex-a53.S", - "xnnpack_wrappers/f32-gemm/gen/f32-gemm-4x8-minmax-asm-aarch32-neon-cortex-a55.S", - "xnnpack_wrappers/f32-gemm/gen/f32-gemm-4x8-minmax-asm-aarch32-neon-cortex-a75-prfm.S", - "xnnpack_wrappers/f32-gemm/gen/f32-gemm-4x8-minmax-asm-aarch32-neon-cortex-a75.S", - "xnnpack_wrappers/f32-gemm/gen/f32-gemm-4x8-minmax-asm-aarch32-neon-ld64.S", - "xnnpack_wrappers/f32-igemm/f32-igemm-4x8-minmax-asm-aarch32-neon-cortex-a55.S", - "xnnpack_wrappers/f32-igemm/gen/f32-igemm-1x8-minmax-asm-aarch32-neon-cortex-a53-prfm.S", - "xnnpack_wrappers/f32-igemm/gen/f32-igemm-1x8-minmax-asm-aarch32-neon-cortex-a53.S", - "xnnpack_wrappers/f32-igemm/gen/f32-igemm-4x8-minmax-asm-aarch32-neon-cortex-a7.S", - "xnnpack_wrappers/f32-igemm/gen/f32-igemm-4x8-minmax-asm-aarch32-neon-cortex-a53-prfm.S", - "xnnpack_wrappers/f32-igemm/gen/f32-igemm-4x8-minmax-asm-aarch32-neon-cortex-a53.S", - "xnnpack_wrappers/f32-igemm/gen/f32-igemm-4x8-minmax-asm-aarch32-neon-cortex-a75-prfm.S", - "xnnpack_wrappers/f32-igemm/gen/f32-igemm-4x8-minmax-asm-aarch32-neon-cortex-a75.S", - "xnnpack_wrappers/f32-igemm/gen/f32-igemm-4x8-minmax-asm-aarch32-neon-ld64.S", - "xnnpack_wrappers/qd8-f16-qc8w-gemm/gen/qd8-f16-qc8w-gemm-4x8c4-minmax-asm-aarch32-neondotfp16arith-cortex-a55.S", - "xnnpack_wrappers/qd8-f16-qc8w-igemm/gen/qd8-f16-qc8w-igemm-4x8c4-minmax-asm-aarch32-neondotfp16arith-cortex-a55.S", - "xnnpack_wrappers/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-4x8c4-minmax-asm-aarch32-neondot-cortex-a55.S", - "xnnpack_wrappers/qd8-f32-qc8w-igemm/gen/qd8-f32-qc8w-igemm-4x8c4-minmax-asm-aarch32-neondot-cortex-a55.S", - "xnnpack_wrappers/qs8-qc8w-dwconv/qs8-qc8w-dwconv-3p8c-minmax-fp32-asm-aarch32-neonv8-mla8-cortex-a35.S", - "xnnpack_wrappers/qs8-qc8w-dwconv/qs8-qc8w-dwconv-3p16c-minmax-fp32-asm-aarch32-neonv8-mla8-cortex-a35.S", - "xnnpack_wrappers/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-1x8-minmax-fp32-asm-aarch32-neon-mlal-lane-cortex-a7-prfm.S", - 
"xnnpack_wrappers/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-1x8-minmax-fp32-asm-aarch32-neon-mlal-lane-cortex-a7.S", - "xnnpack_wrappers/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-1x8-minmax-fp32-asm-aarch32-neonv8-mlal-lane-cortex-a35-prfm.S", - "xnnpack_wrappers/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-1x8-minmax-fp32-asm-aarch32-neonv8-mlal-lane-cortex-a35.S", - "xnnpack_wrappers/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-4x8-minmax-fp32-asm-aarch32-neon-mlal-lane-cortex-a7-prfm.S", - "xnnpack_wrappers/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-4x8-minmax-fp32-asm-aarch32-neon-mlal-lane-cortex-a7.S", - "xnnpack_wrappers/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-4x8-minmax-fp32-asm-aarch32-neon-mlal-lane-cortex-a53-prfm.S", - "xnnpack_wrappers/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-4x8-minmax-fp32-asm-aarch32-neon-mlal-lane-cortex-a53.S", - "xnnpack_wrappers/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-4x8-minmax-fp32-asm-aarch32-neon-mlal-lane-ld64-prfm.S", - "xnnpack_wrappers/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-4x8-minmax-fp32-asm-aarch32-neon-mlal-lane-ld64.S", - "xnnpack_wrappers/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-4x8-minmax-fp32-asm-aarch32-neonv8-mlal-lane-cortex-a35-prfm.S", - "xnnpack_wrappers/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-4x8-minmax-fp32-asm-aarch32-neonv8-mlal-lane-cortex-a35.S", - "xnnpack_wrappers/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-4x8-minmax-fp32-asm-aarch32-neonv8-mlal-lane-cortex-a53-prfm.S", - "xnnpack_wrappers/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-4x8-minmax-fp32-asm-aarch32-neonv8-mlal-lane-cortex-a53.S", - "xnnpack_wrappers/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-4x8-minmax-fp32-asm-aarch32-neonv8-mlal-lane-ld64-prfm.S", - "xnnpack_wrappers/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-4x8-minmax-fp32-asm-aarch32-neonv8-mlal-lane-ld64.S", - "xnnpack_wrappers/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-4x8c4-minmax-fp32-asm-aarch32-neondot-cortex-a55.S", - "xnnpack_wrappers/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-4x8c4-minmax-fp32-asm-aarch32-neondot-ld64.S", - "xnnpack_wrappers/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-1x8-minmax-fp32-asm-aarch32-neon-mlal-lane-cortex-a7-prfm.S", - "xnnpack_wrappers/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-1x8-minmax-fp32-asm-aarch32-neon-mlal-lane-cortex-a7.S", - "xnnpack_wrappers/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-1x8-minmax-fp32-asm-aarch32-neonv8-mlal-lane-cortex-a35-prfm.S", - "xnnpack_wrappers/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-1x8-minmax-fp32-asm-aarch32-neonv8-mlal-lane-cortex-a35.S", - "xnnpack_wrappers/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x8-minmax-fp32-asm-aarch32-neon-mlal-lane-cortex-a7-prfm.S", - "xnnpack_wrappers/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x8-minmax-fp32-asm-aarch32-neon-mlal-lane-cortex-a7.S", - "xnnpack_wrappers/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x8-minmax-fp32-asm-aarch32-neon-mlal-lane-cortex-a53-prfm.S", - "xnnpack_wrappers/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x8-minmax-fp32-asm-aarch32-neon-mlal-lane-cortex-a53.S", - "xnnpack_wrappers/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x8-minmax-fp32-asm-aarch32-neon-mlal-lane-ld64-prfm.S", - "xnnpack_wrappers/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x8-minmax-fp32-asm-aarch32-neon-mlal-lane-ld64.S", - "xnnpack_wrappers/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x8-minmax-fp32-asm-aarch32-neonv8-mlal-lane-cortex-a35-prfm.S", - "xnnpack_wrappers/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x8-minmax-fp32-asm-aarch32-neonv8-mlal-lane-cortex-a35.S", - "xnnpack_wrappers/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x8-minmax-fp32-asm-aarch32-neonv8-mlal-lane-cortex-a53-prfm.S", - "xnnpack_wrappers/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x8-minmax-fp32-asm-aarch32-neonv8-mlal-lane-cortex-a53.S", - 
"xnnpack_wrappers/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x8-minmax-fp32-asm-aarch32-neonv8-mlal-lane-ld64-prfm.S", - "xnnpack_wrappers/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x8-minmax-fp32-asm-aarch32-neonv8-mlal-lane-ld64.S", - "xnnpack_wrappers/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x8c4-minmax-fp32-asm-aarch32-neondot-cortex-a55.S", - "xnnpack_wrappers/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x8c4-minmax-fp32-asm-aarch32-neondot-ld64.S", - "xnnpack_wrappers/qs16-qs8-vcvt/qs16-qs8-vcvt-asm-aarch32-neon-u16.S", - "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-1x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-cortex-a7-prfm.S", - "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-1x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-cortex-a7.S", - "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-4x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-cortex-a7-prfm.S", - "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-4x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-cortex-a7.S", - "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-4x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-cortex-a53-prfm.S", - "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-4x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-cortex-a53.S", - "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-4x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-ld64-prfm.S", - "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-4x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-ld64.S", - "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-1x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-cortex-a7-prfm.S", - "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-1x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-cortex-a7.S", - "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-4x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-cortex-a7-prfm.S", - "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-4x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-cortex-a7.S", - "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-4x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-cortex-a53-prfm.S", - "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-4x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-cortex-a53.S", - "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-4x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-ld64-prfm.S", - "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-4x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-ld64.S", - "xnnpack_wrappers/u32-filterbank-accumulate/u32-filterbank-accumulate-asm-aarch32-arm-x1.S", - "xnnpack_wrappers/u32-filterbank-accumulate/u32-filterbank-accumulate-asm-aarch32-neon-x1.S", - "xnnpack_wrappers/u32-filterbank-accumulate/u32-filterbank-accumulate-asm-aarch32-neon-x2.S", -] - -AARCH64_ASM_MICROKERNEL_SRCS = [ - "xnnpack_wrappers/f16-gemm/gen/f16-gemm-1x8-minmax-asm-aarch64-neonfp16arith-ld64.S", - "xnnpack_wrappers/f16-gemm/gen/f16-gemm-1x16-minmax-asm-aarch64-neonfp16arith-ld32.S", - "xnnpack_wrappers/f16-gemm/gen/f16-gemm-1x16-minmax-asm-aarch64-neonfp16arith-ld64.S", - "xnnpack_wrappers/f16-gemm/gen/f16-gemm-4x8-minmax-asm-aarch64-neonfp16arith-ld64.S", - "xnnpack_wrappers/f16-gemm/gen/f16-gemm-4x16-minmax-asm-aarch64-neonfp16arith-ld32.S", - "xnnpack_wrappers/f16-gemm/gen/f16-gemm-4x16-minmax-asm-aarch64-neonfp16arith-ld64.S", - "xnnpack_wrappers/f16-gemm/gen/f16-gemm-6x8-minmax-asm-aarch64-neonfp16arith-ld64.S", - "xnnpack_wrappers/f16-gemm/gen/f16-gemm-6x16-minmax-asm-aarch64-neonfp16arith-cortex-a55.S", - "xnnpack_wrappers/f16-gemm/gen/f16-gemm-6x16-minmax-asm-aarch64-neonfp16arith-cortex-a55r0.S", - "xnnpack_wrappers/f16-gemm/gen/f16-gemm-6x16-minmax-asm-aarch64-neonfp16arith-cortex-a75.S", - "xnnpack_wrappers/f16-gemm/gen/f16-gemm-6x16-minmax-asm-aarch64-neonfp16arith-ld32.S", - "xnnpack_wrappers/f16-gemm/gen/f16-gemm-6x16-minmax-asm-aarch64-neonfp16arith-ld64.S", - 
"xnnpack_wrappers/f16-gemm/gen/f16-gemm-8x8-minmax-asm-aarch64-neonfp16arith-ld64.S", - "xnnpack_wrappers/f16-gemm/gen/f16-gemminc-1x8-minmax-asm-aarch64-neonfp16arith-ld64.S", - "xnnpack_wrappers/f16-gemm/gen/f16-gemminc-1x16-minmax-asm-aarch64-neonfp16arith-ld32.S", - "xnnpack_wrappers/f16-gemm/gen/f16-gemminc-4x8-minmax-asm-aarch64-neonfp16arith-ld64.S", - "xnnpack_wrappers/f16-gemm/gen/f16-gemminc-4x16-minmax-asm-aarch64-neonfp16arith-ld32.S", - "xnnpack_wrappers/f16-gemm/gen/f16-gemminc-6x8-minmax-asm-aarch64-neonfp16arith-ld64.S", - "xnnpack_wrappers/f16-gemm/gen/f16-gemminc-6x16-minmax-asm-aarch64-neonfp16arith-cortex-a55.S", - "xnnpack_wrappers/f16-gemm/gen/f16-gemminc-6x16-minmax-asm-aarch64-neonfp16arith-cortex-a75.S", - "xnnpack_wrappers/f16-gemm/gen/f16-gemminc-6x16-minmax-asm-aarch64-neonfp16arith-ld32.S", - "xnnpack_wrappers/f16-gemm/gen/f16-gemminc-8x8-minmax-asm-aarch64-neonfp16arith-ld64.S", - "xnnpack_wrappers/f16-igemm/f16-igemm-1x16-minmax-asm-aarch64-neonfp16arith-ld32.S", - "xnnpack_wrappers/f16-igemm/f16-igemm-1x16-minmax-asm-aarch64-neonfp16arith-ld64.S", - "xnnpack_wrappers/f16-igemm/f16-igemm-4x16-minmax-asm-aarch64-neonfp16arith-ld32.S", - "xnnpack_wrappers/f16-igemm/f16-igemm-4x16-minmax-asm-aarch64-neonfp16arith-ld64.S", - "xnnpack_wrappers/f16-igemm/f16-igemm-6x16-minmax-asm-aarch64-neonfp16arith-cortex-a55.S", - "xnnpack_wrappers/f16-igemm/f16-igemm-6x16-minmax-asm-aarch64-neonfp16arith-cortex-a55r0.S", - "xnnpack_wrappers/f16-igemm/f16-igemm-6x16-minmax-asm-aarch64-neonfp16arith-cortex-a75.S", - "xnnpack_wrappers/f16-igemm/f16-igemm-6x16-minmax-asm-aarch64-neonfp16arith-ld32.S", - "xnnpack_wrappers/f16-igemm/f16-igemm-6x16-minmax-asm-aarch64-neonfp16arith-ld64.S", - "xnnpack_wrappers/f32-dwconv/f32-dwconv-9p4c-minmax-asm-aarch64-neonfma-cortex-a55.S", - "xnnpack_wrappers/f32-dwconv/f32-dwconv-9p4c-minmax-asm-aarch64-neonfma.S", - "xnnpack_wrappers/f32-gemm/gen/f32-gemm-1x8-minmax-asm-aarch64-neon-ld128-acc2-prfm.S", - "xnnpack_wrappers/f32-gemm/gen/f32-gemm-1x8-minmax-asm-aarch64-neon-ld128-acc2.S", - "xnnpack_wrappers/f32-gemm/gen/f32-gemm-1x8-minmax-asm-aarch64-neonfma-cortex-a53-prfm.S", - "xnnpack_wrappers/f32-gemm/gen/f32-gemm-1x8-minmax-asm-aarch64-neonfma-cortex-a53.S", - "xnnpack_wrappers/f32-gemm/gen/f32-gemm-1x8-minmax-asm-aarch64-neonfma-cortex-a75-prfm.S", - "xnnpack_wrappers/f32-gemm/gen/f32-gemm-1x8-minmax-asm-aarch64-neonfma-cortex-a75.S", - "xnnpack_wrappers/f32-gemm/gen/f32-gemm-1x8-minmax-asm-aarch64-neonfma-ld64-acc2-prfm.S", - "xnnpack_wrappers/f32-gemm/gen/f32-gemm-1x8-minmax-asm-aarch64-neonfma-ld64-acc2.S", - "xnnpack_wrappers/f32-gemm/gen/f32-gemm-1x8-minmax-asm-aarch64-neonfma-ld64-acc4-prfm.S", - "xnnpack_wrappers/f32-gemm/gen/f32-gemm-1x8-minmax-asm-aarch64-neonfma-ld64-acc4.S", - "xnnpack_wrappers/f32-gemm/gen/f32-gemm-1x8-minmax-asm-aarch64-neonfma-ld64-prfm.S", - "xnnpack_wrappers/f32-gemm/gen/f32-gemm-1x8-minmax-asm-aarch64-neonfma-ld64.S", - "xnnpack_wrappers/f32-gemm/gen/f32-gemm-1x8-minmax-asm-aarch64-neonfma-ld128-acc2-prfm.S", - "xnnpack_wrappers/f32-gemm/gen/f32-gemm-1x8-minmax-asm-aarch64-neonfma-ld128-acc2.S", - "xnnpack_wrappers/f32-gemm/gen/f32-gemm-1x8-minmax-asm-aarch64-neonfma-ld128-acc4-prfm.S", - "xnnpack_wrappers/f32-gemm/gen/f32-gemm-1x8-minmax-asm-aarch64-neonfma-ld128-acc4.S", - "xnnpack_wrappers/f32-gemm/gen/f32-gemm-1x8-minmax-asm-aarch64-neonfma-ld128-prfm.S", - "xnnpack_wrappers/f32-gemm/gen/f32-gemm-1x8-minmax-asm-aarch64-neonfma-ld128.S", - 
"xnnpack_wrappers/f32-gemm/gen/f32-gemm-1x12-minmax-asm-aarch64-neonfma-cortex-a53.S", - "xnnpack_wrappers/f32-gemm/gen/f32-gemm-4x1-minmax-asm-aarch64-neonfma-ld64.S", - "xnnpack_wrappers/f32-gemm/gen/f32-gemm-4x1-minmax-asm-aarch64-neonfma-ld128.S", - "xnnpack_wrappers/f32-gemm/gen/f32-gemm-4x2-minmax-asm-aarch64-neonfma-cortex-a75-prfm.S", - "xnnpack_wrappers/f32-gemm/gen/f32-gemm-4x2-minmax-asm-aarch64-neonfma-cortex-a75.S", - "xnnpack_wrappers/f32-gemm/gen/f32-gemm-4x2-minmax-asm-aarch64-neonfma-ld64.S", - "xnnpack_wrappers/f32-gemm/gen/f32-gemm-4x2-minmax-asm-aarch64-neonfma-ld128.S", - "xnnpack_wrappers/f32-gemm/gen/f32-gemm-4x8-minmax-asm-aarch64-neonfma-cortex-a53-prfm.S", - "xnnpack_wrappers/f32-gemm/gen/f32-gemm-4x8-minmax-asm-aarch64-neonfma-cortex-a53.S", - "xnnpack_wrappers/f32-gemm/gen/f32-gemm-4x8-minmax-asm-aarch64-neonfma-cortex-a55.S", - "xnnpack_wrappers/f32-gemm/gen/f32-gemm-4x8-minmax-asm-aarch64-neonfma-cortex-a75-prfm.S", - "xnnpack_wrappers/f32-gemm/gen/f32-gemm-4x8-minmax-asm-aarch64-neonfma-cortex-a75.S", - "xnnpack_wrappers/f32-gemm/gen/f32-gemm-4x8-minmax-asm-aarch64-neonfma-ld64.S", - "xnnpack_wrappers/f32-gemm/gen/f32-gemm-4x8-minmax-asm-aarch64-neonfma-ld128.S", - "xnnpack_wrappers/f32-gemm/gen/f32-gemm-4x12-minmax-asm-aarch64-neonfma-cortex-a53.S", - "xnnpack_wrappers/f32-gemm/gen/f32-gemm-5x8-minmax-asm-aarch64-neonfma-cortex-a75-prfm.S", - "xnnpack_wrappers/f32-gemm/gen/f32-gemm-5x8-minmax-asm-aarch64-neonfma-cortex-a75.S", - "xnnpack_wrappers/f32-gemm/gen/f32-gemm-6x8-minmax-asm-aarch64-neonfma-cortex-a53-prfm.S", - "xnnpack_wrappers/f32-gemm/gen/f32-gemm-6x8-minmax-asm-aarch64-neonfma-cortex-a53.S", - "xnnpack_wrappers/f32-gemm/gen/f32-gemm-6x8-minmax-asm-aarch64-neonfma-cortex-a55.S", - "xnnpack_wrappers/f32-gemm/gen/f32-gemm-6x8-minmax-asm-aarch64-neonfma-cortex-a73.S", - "xnnpack_wrappers/f32-gemm/gen/f32-gemm-6x8-minmax-asm-aarch64-neonfma-cortex-a75-prfm.S", - "xnnpack_wrappers/f32-gemm/gen/f32-gemm-6x8-minmax-asm-aarch64-neonfma-cortex-a75.S", - "xnnpack_wrappers/f32-gemm/gen/f32-gemm-6x8-minmax-asm-aarch64-neonfma-ld64.S", - "xnnpack_wrappers/f32-gemm/gen/f32-gemm-6x8-minmax-asm-aarch64-neonfma-ld128.S", - "xnnpack_wrappers/f32-gemm/gen/f32-gemm-goi-1x8-minmax-asm-aarch64-neonfma-ld128-prfm.S", - "xnnpack_wrappers/f32-gemm/gen/f32-gemm-goi-1x8-minmax-asm-aarch64-neonfma-ld128.S", - "xnnpack_wrappers/f32-gemm/gen/f32-gemm-goi-4x8-minmax-asm-aarch64-neonfma-ld128.S", - "xnnpack_wrappers/f32-gemminc/gen/f32-gemminc-1x8-minmax-asm-aarch64-neonfma-cortex-a53.S", - "xnnpack_wrappers/f32-gemminc/gen/f32-gemminc-1x8-minmax-asm-aarch64-neonfma-cortex-a75-prfm.S", - "xnnpack_wrappers/f32-gemminc/gen/f32-gemminc-1x8-minmax-asm-aarch64-neonfma-cortex-a75.S", - "xnnpack_wrappers/f32-gemminc/gen/f32-gemminc-1x8-minmax-asm-aarch64-neonfma-ld64.S", - "xnnpack_wrappers/f32-gemminc/gen/f32-gemminc-1x12-minmax-asm-aarch64-neonfma-cortex-a53.S", - "xnnpack_wrappers/f32-gemminc/gen/f32-gemminc-4x8-minmax-asm-aarch64-neonfma-cortex-a53.S", - "xnnpack_wrappers/f32-gemminc/gen/f32-gemminc-4x8-minmax-asm-aarch64-neonfma-cortex-a55.S", - "xnnpack_wrappers/f32-gemminc/gen/f32-gemminc-4x8-minmax-asm-aarch64-neonfma-cortex-a75-prfm.S", - "xnnpack_wrappers/f32-gemminc/gen/f32-gemminc-4x8-minmax-asm-aarch64-neonfma-cortex-a75.S", - "xnnpack_wrappers/f32-gemminc/gen/f32-gemminc-4x8-minmax-asm-aarch64-neonfma-ld64.S", - "xnnpack_wrappers/f32-gemminc/gen/f32-gemminc-4x8-minmax-asm-aarch64-neonfma-ld128.S", - 
"xnnpack_wrappers/f32-gemminc/gen/f32-gemminc-4x12-minmax-asm-aarch64-neonfma-cortex-a53.S", - "xnnpack_wrappers/f32-gemminc/gen/f32-gemminc-5x8-minmax-asm-aarch64-neonfma-cortex-a75-prfm.S", - "xnnpack_wrappers/f32-gemminc/gen/f32-gemminc-5x8-minmax-asm-aarch64-neonfma-cortex-a75.S", - "xnnpack_wrappers/f32-gemminc/gen/f32-gemminc-6x8-minmax-asm-aarch64-neonfma-cortex-a53.S", - "xnnpack_wrappers/f32-gemminc/gen/f32-gemminc-6x8-minmax-asm-aarch64-neonfma-cortex-a55.S", - "xnnpack_wrappers/f32-gemminc/gen/f32-gemminc-6x8-minmax-asm-aarch64-neonfma-cortex-a73.S", - "xnnpack_wrappers/f32-gemminc/gen/f32-gemminc-6x8-minmax-asm-aarch64-neonfma-cortex-a75-prfm.S", - "xnnpack_wrappers/f32-gemminc/gen/f32-gemminc-6x8-minmax-asm-aarch64-neonfma-cortex-a75.S", - "xnnpack_wrappers/f32-gemminc/gen/f32-gemminc-6x8-minmax-asm-aarch64-neonfma-ld64.S", - "xnnpack_wrappers/f32-gemminc/gen/f32-gemminc-6x8-minmax-asm-aarch64-neonfma-ld128.S", - "xnnpack_wrappers/f32-igemm/f32-igemm-1x12-minmax-asm-aarch64-neonfma-cortex-a53.S", - "xnnpack_wrappers/f32-igemm/f32-igemm-4x8-minmax-asm-aarch64-neonfma-cortex-a55.S", - "xnnpack_wrappers/f32-igemm/f32-igemm-4x12-minmax-asm-aarch64-neonfma-cortex-a53.S", - "xnnpack_wrappers/f32-igemm/f32-igemm-6x8-minmax-asm-aarch64-neonfma-cortex-a55.S", - "xnnpack_wrappers/f32-igemm/f32-igemm-6x8-minmax-asm-aarch64-neonfma-cortex-a73.S", - "xnnpack_wrappers/f32-igemm/gen/f32-igemm-1x8-minmax-asm-aarch64-neonfma-cortex-a53-prfm.S", - "xnnpack_wrappers/f32-igemm/gen/f32-igemm-1x8-minmax-asm-aarch64-neonfma-cortex-a53.S", - "xnnpack_wrappers/f32-igemm/gen/f32-igemm-1x8-minmax-asm-aarch64-neonfma-cortex-a75-prfm.S", - "xnnpack_wrappers/f32-igemm/gen/f32-igemm-1x8-minmax-asm-aarch64-neonfma-cortex-a75.S", - "xnnpack_wrappers/f32-igemm/gen/f32-igemm-1x8-minmax-asm-aarch64-neonfma-ld64-prfm.S", - "xnnpack_wrappers/f32-igemm/gen/f32-igemm-1x8-minmax-asm-aarch64-neonfma-ld64.S", - "xnnpack_wrappers/f32-igemm/gen/f32-igemm-4x2-minmax-asm-aarch64-neonfma-cortex-a75-prfm.S", - "xnnpack_wrappers/f32-igemm/gen/f32-igemm-4x2-minmax-asm-aarch64-neonfma-cortex-a75.S", - "xnnpack_wrappers/f32-igemm/gen/f32-igemm-4x2-minmax-asm-aarch64-neonfma-ld64.S", - "xnnpack_wrappers/f32-igemm/gen/f32-igemm-4x8-minmax-asm-aarch64-neonfma-cortex-a53-prfm.S", - "xnnpack_wrappers/f32-igemm/gen/f32-igemm-4x8-minmax-asm-aarch64-neonfma-cortex-a53.S", - "xnnpack_wrappers/f32-igemm/gen/f32-igemm-4x8-minmax-asm-aarch64-neonfma-cortex-a75-prfm.S", - "xnnpack_wrappers/f32-igemm/gen/f32-igemm-4x8-minmax-asm-aarch64-neonfma-cortex-a75.S", - "xnnpack_wrappers/f32-igemm/gen/f32-igemm-4x8-minmax-asm-aarch64-neonfma-ld64.S", - "xnnpack_wrappers/f32-igemm/gen/f32-igemm-4x8-minmax-asm-aarch64-neonfma-ld128.S", - "xnnpack_wrappers/f32-igemm/gen/f32-igemm-5x8-minmax-asm-aarch64-neonfma-cortex-a75-prfm.S", - "xnnpack_wrappers/f32-igemm/gen/f32-igemm-5x8-minmax-asm-aarch64-neonfma-cortex-a75.S", - "xnnpack_wrappers/f32-igemm/gen/f32-igemm-6x8-minmax-asm-aarch64-neonfma-cortex-a53-prfm.S", - "xnnpack_wrappers/f32-igemm/gen/f32-igemm-6x8-minmax-asm-aarch64-neonfma-cortex-a53.S", - "xnnpack_wrappers/f32-igemm/gen/f32-igemm-6x8-minmax-asm-aarch64-neonfma-cortex-a75-prfm.S", - "xnnpack_wrappers/f32-igemm/gen/f32-igemm-6x8-minmax-asm-aarch64-neonfma-cortex-a75.S", - "xnnpack_wrappers/f32-igemm/gen/f32-igemm-6x8-minmax-asm-aarch64-neonfma-ld64.S", - "xnnpack_wrappers/f32-igemm/gen/f32-igemm-6x8-minmax-asm-aarch64-neonfma-ld128.S", - "xnnpack_wrappers/f32-ppmm/gen/f32-ppmm-4x8-minmax-asm-aarch64-neonfma-cortex-a75-prfm.S", - 
"xnnpack_wrappers/f32-ppmm/gen/f32-ppmm-4x8-minmax-asm-aarch64-neonfma-cortex-a75.S", - "xnnpack_wrappers/f32-ppmm/gen/f32-ppmm-4x8-minmax-asm-aarch64-neonfma-ld128-prfm.S", - "xnnpack_wrappers/f32-ppmm/gen/f32-ppmm-4x8-minmax-asm-aarch64-neonfma-ld128.S", - "xnnpack_wrappers/f32-ppmm/gen/f32-ppmm-8x8-minmax-asm-aarch64-neonfma-cortex-a75-prfm.S", - "xnnpack_wrappers/f32-ppmm/gen/f32-ppmm-8x8-minmax-asm-aarch64-neonfma-cortex-a75.S", - "xnnpack_wrappers/f32-ppmm/gen/f32-ppmm-8x8-minmax-asm-aarch64-neonfma-ld128-prfm.S", - "xnnpack_wrappers/f32-ppmm/gen/f32-ppmm-8x8-minmax-asm-aarch64-neonfma-ld128.S", - "xnnpack_wrappers/f32-qc4w-gemm/gen/f32-qc4w-gemm-1x8-minmax-asm-aarch64-neon-ld128-acc2-prfm.S", - "xnnpack_wrappers/f32-qc4w-gemm/gen/f32-qc4w-gemm-1x8-minmax-asm-aarch64-neon-ld128-acc2.S", - "xnnpack_wrappers/f32-qc4w-gemm/gen/f32-qc4w-gemm-1x8-minmax-asm-aarch64-neonfma-ld64-acc2-prfm.S", - "xnnpack_wrappers/f32-qc4w-gemm/gen/f32-qc4w-gemm-1x8-minmax-asm-aarch64-neonfma-ld64-acc2.S", - "xnnpack_wrappers/f32-qc4w-gemm/gen/f32-qc4w-gemm-1x8-minmax-asm-aarch64-neonfma-ld64-acc4-prfm.S", - "xnnpack_wrappers/f32-qc4w-gemm/gen/f32-qc4w-gemm-1x8-minmax-asm-aarch64-neonfma-ld64-acc4.S", - "xnnpack_wrappers/f32-qc4w-gemm/gen/f32-qc4w-gemm-1x8-minmax-asm-aarch64-neonfma-ld64-prfm.S", - "xnnpack_wrappers/f32-qc4w-gemm/gen/f32-qc4w-gemm-1x8-minmax-asm-aarch64-neonfma-ld64.S", - "xnnpack_wrappers/f32-qc4w-gemm/gen/f32-qc4w-gemm-1x8-minmax-asm-aarch64-neonfma-ld128-acc2-prfm.S", - "xnnpack_wrappers/f32-qc4w-gemm/gen/f32-qc4w-gemm-1x8-minmax-asm-aarch64-neonfma-ld128-acc2.S", - "xnnpack_wrappers/f32-qc4w-gemm/gen/f32-qc4w-gemm-1x8-minmax-asm-aarch64-neonfma-ld128-acc4-prfm.S", - "xnnpack_wrappers/f32-qc4w-gemm/gen/f32-qc4w-gemm-1x8-minmax-asm-aarch64-neonfma-ld128-acc4.S", - "xnnpack_wrappers/f32-qc4w-gemm/gen/f32-qc4w-gemm-1x8-minmax-asm-aarch64-neonfma-ld128-prfm.S", - "xnnpack_wrappers/f32-qc4w-gemm/gen/f32-qc4w-gemm-1x8-minmax-asm-aarch64-neonfma-ld128.S", - "xnnpack_wrappers/f32-qc4w-gemm/gen/f32-qc4w-gemm-4x1-minmax-asm-aarch64-neonfma-ld64.S", - "xnnpack_wrappers/f32-qc4w-gemm/gen/f32-qc4w-gemm-4x1-minmax-asm-aarch64-neonfma-ld128.S", - "xnnpack_wrappers/f32-qc4w-gemm/gen/f32-qc4w-gemm-4x2-minmax-asm-aarch64-neonfma-ld64.S", - "xnnpack_wrappers/f32-qc4w-gemm/gen/f32-qc4w-gemm-4x2-minmax-asm-aarch64-neonfma-ld128.S", - "xnnpack_wrappers/f32-qc4w-gemm/gen/f32-qc4w-gemm-4x8-minmax-asm-aarch64-neonfma-ld64.S", - "xnnpack_wrappers/f32-qc4w-gemm/gen/f32-qc4w-gemm-4x8-minmax-asm-aarch64-neonfma-ld128.S", - "xnnpack_wrappers/f32-qc4w-gemm/gen/f32-qc4w-gemm-6x8-minmax-asm-aarch64-neonfma-ld64.S", - "xnnpack_wrappers/f32-qc4w-gemm/gen/f32-qc4w-gemm-6x8-minmax-asm-aarch64-neonfma-ld128.S", - "xnnpack_wrappers/f32-qc8w-gemm/gen/f32-qc8w-gemm-1x8-minmax-asm-aarch64-neon-ld128-acc2-prfm.S", - "xnnpack_wrappers/f32-qc8w-gemm/gen/f32-qc8w-gemm-1x8-minmax-asm-aarch64-neon-ld128-acc2.S", - "xnnpack_wrappers/f32-qc8w-gemm/gen/f32-qc8w-gemm-1x8-minmax-asm-aarch64-neonfma-ld64-acc2-prfm.S", - "xnnpack_wrappers/f32-qc8w-gemm/gen/f32-qc8w-gemm-1x8-minmax-asm-aarch64-neonfma-ld64-acc2.S", - "xnnpack_wrappers/f32-qc8w-gemm/gen/f32-qc8w-gemm-1x8-minmax-asm-aarch64-neonfma-ld64-acc4-prfm.S", - "xnnpack_wrappers/f32-qc8w-gemm/gen/f32-qc8w-gemm-1x8-minmax-asm-aarch64-neonfma-ld64-acc4.S", - "xnnpack_wrappers/f32-qc8w-gemm/gen/f32-qc8w-gemm-1x8-minmax-asm-aarch64-neonfma-ld64-prfm.S", - "xnnpack_wrappers/f32-qc8w-gemm/gen/f32-qc8w-gemm-1x8-minmax-asm-aarch64-neonfma-ld64.S", - 
"xnnpack_wrappers/f32-qc8w-gemm/gen/f32-qc8w-gemm-1x8-minmax-asm-aarch64-neonfma-ld128-acc2-prfm.S", - "xnnpack_wrappers/f32-qc8w-gemm/gen/f32-qc8w-gemm-1x8-minmax-asm-aarch64-neonfma-ld128-acc2.S", - "xnnpack_wrappers/f32-qc8w-gemm/gen/f32-qc8w-gemm-1x8-minmax-asm-aarch64-neonfma-ld128-acc4-prfm.S", - "xnnpack_wrappers/f32-qc8w-gemm/gen/f32-qc8w-gemm-1x8-minmax-asm-aarch64-neonfma-ld128-acc4.S", - "xnnpack_wrappers/f32-qc8w-gemm/gen/f32-qc8w-gemm-1x8-minmax-asm-aarch64-neonfma-ld128-prfm.S", - "xnnpack_wrappers/f32-qc8w-gemm/gen/f32-qc8w-gemm-1x8-minmax-asm-aarch64-neonfma-ld128.S", - "xnnpack_wrappers/f32-qc8w-gemm/gen/f32-qc8w-gemm-4x1-minmax-asm-aarch64-neonfma-ld64.S", - "xnnpack_wrappers/f32-qc8w-gemm/gen/f32-qc8w-gemm-4x1-minmax-asm-aarch64-neonfma-ld128.S", - "xnnpack_wrappers/f32-qc8w-gemm/gen/f32-qc8w-gemm-4x2-minmax-asm-aarch64-neonfma-ld64.S", - "xnnpack_wrappers/f32-qc8w-gemm/gen/f32-qc8w-gemm-4x2-minmax-asm-aarch64-neonfma-ld128.S", - "xnnpack_wrappers/f32-qc8w-gemm/gen/f32-qc8w-gemm-4x8-minmax-asm-aarch64-neonfma-ld64.S", - "xnnpack_wrappers/f32-qc8w-gemm/gen/f32-qc8w-gemm-4x8-minmax-asm-aarch64-neonfma-ld128.S", - "xnnpack_wrappers/f32-qc8w-gemm/gen/f32-qc8w-gemm-6x8-minmax-asm-aarch64-neonfma-ld64.S", - "xnnpack_wrappers/f32-qc8w-gemm/gen/f32-qc8w-gemm-6x8-minmax-asm-aarch64-neonfma-ld128.S", - "xnnpack_wrappers/qd8-f16-qc8w-gemm/gen/qd8-f16-qc8w-gemm-4x16c4-minmax-asm-aarch64-neondot-ld128.S", - "xnnpack_wrappers/qd8-f16-qc8w-gemm/gen/qd8-f16-qc8w-gemm-4x16c4-minmax-asm-aarch64-neondotfp16arith-cortex-a55.S", - "xnnpack_wrappers/qd8-f16-qc8w-igemm/gen/qd8-f16-qc8w-igemm-4x16c4-minmax-asm-aarch64-neondot-cortex-a55.S", - "xnnpack_wrappers/qd8-f16-qc8w-igemm/gen/qd8-f16-qc8w-igemm-4x16c4-minmax-asm-aarch64-neondot-ld128.S", - "xnnpack_wrappers/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-4x16c4-minmax-asm-aarch64-neondot-cortex-a55.S", - "xnnpack_wrappers/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-4x16c4-minmax-asm-aarch64-neondot-ld64.S", - "xnnpack_wrappers/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-4x16c4-minmax-asm-aarch64-neondot-ld128.S", - "xnnpack_wrappers/qd8-f32-qc8w-igemm/gen/qd8-f32-qc8w-igemm-4x16c4-minmax-asm-aarch64-neondot-cortex-a55.S", - "xnnpack_wrappers/qd8-f32-qc8w-igemm/gen/qd8-f32-qc8w-igemm-4x16c4-minmax-asm-aarch64-neondot-ld128.S", - "xnnpack_wrappers/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-1x8c8-minmax-fp32-asm-aarch64-neon-mlal-cortex-a53-prfm.S", - "xnnpack_wrappers/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-1x8c8-minmax-fp32-asm-aarch64-neon-mlal-cortex-a53.S", - "xnnpack_wrappers/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-1x8c8-minmax-fp32-asm-aarch64-neon-mlal-prfm.S", - "xnnpack_wrappers/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-1x8c8-minmax-fp32-asm-aarch64-neon-mlal.S", - "xnnpack_wrappers/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-1x16c4-minmax-fp32-asm-aarch64-neondot-ld32.S", - "xnnpack_wrappers/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-1x16c4-minmax-fp32-asm-aarch64-neondot-ld64.S", - "xnnpack_wrappers/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-2x8c8-minmax-fp32-asm-aarch64-neon-mlal-cortex-a53-prfm.S", - "xnnpack_wrappers/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-2x8c8-minmax-fp32-asm-aarch64-neon-mlal-cortex-a53.S", - "xnnpack_wrappers/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-2x8c8-minmax-fp32-asm-aarch64-neon-mlal-prfm.S", - "xnnpack_wrappers/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-2x8c8-minmax-fp32-asm-aarch64-neon-mlal.S", - "xnnpack_wrappers/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-2x8c8-minmax-fp32-asm-aarch64-neon-mull.S", - "xnnpack_wrappers/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-2x8c16-minmax-fp32-asm-aarch64-neon-mlal.S", - 
"xnnpack_wrappers/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-4x16-minmax-fp32-asm-aarch64-neon-mlal-lane-cortex-a53-prfm.S", - "xnnpack_wrappers/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-4x16-minmax-fp32-asm-aarch64-neon-mlal-lane-cortex-a53.S", - "xnnpack_wrappers/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-4x16-minmax-fp32-asm-aarch64-neon-mlal-lane-ld64-prfm.S", - "xnnpack_wrappers/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-4x16-minmax-fp32-asm-aarch64-neon-mlal-lane-ld64.S", - "xnnpack_wrappers/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-4x16c4-minmax-fp32-asm-aarch64-neondot-cortex-a55.S", - "xnnpack_wrappers/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-4x16c4-minmax-fp32-asm-aarch64-neondot-ld32.S", - "xnnpack_wrappers/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-4x16c4-minmax-fp32-asm-aarch64-neondot-ld64.S", - "xnnpack_wrappers/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-4x16c4-minmax-fp32-asm-aarch64-neondot-ld128.S", - "xnnpack_wrappers/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-1x8c8-minmax-fp32-asm-aarch64-neon-mlal-cortex-a53-prfm.S", - "xnnpack_wrappers/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-1x8c8-minmax-fp32-asm-aarch64-neon-mlal-cortex-a53.S", - "xnnpack_wrappers/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-1x8c8-minmax-fp32-asm-aarch64-neon-mlal-prfm.S", - "xnnpack_wrappers/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-1x8c8-minmax-fp32-asm-aarch64-neon-mlal.S", - "xnnpack_wrappers/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-2x8c8-minmax-fp32-asm-aarch64-neon-mlal-cortex-a53-prfm.S", - "xnnpack_wrappers/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-2x8c8-minmax-fp32-asm-aarch64-neon-mlal-cortex-a53.S", - "xnnpack_wrappers/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-2x8c8-minmax-fp32-asm-aarch64-neon-mlal-prfm.S", - "xnnpack_wrappers/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-2x8c8-minmax-fp32-asm-aarch64-neon-mlal.S", - "xnnpack_wrappers/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-2x8c16-minmax-fp32-asm-aarch64-neon-mlal.S", - "xnnpack_wrappers/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x16-minmax-fp32-asm-aarch64-neon-mlal-lane-cortex-a53-prfm.S", - "xnnpack_wrappers/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x16-minmax-fp32-asm-aarch64-neon-mlal-lane-cortex-a53.S", - "xnnpack_wrappers/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x16-minmax-fp32-asm-aarch64-neon-mlal-lane-ld64-prfm.S", - "xnnpack_wrappers/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x16-minmax-fp32-asm-aarch64-neon-mlal-lane-ld64.S", - "xnnpack_wrappers/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x16c4-minmax-fp32-asm-aarch64-neondot-cortex-a55.S", - "xnnpack_wrappers/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x16c4-minmax-fp32-asm-aarch64-neondot-ld64.S", - "xnnpack_wrappers/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x16c4-minmax-fp32-asm-aarch64-neondot-ld128.S", - "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-cortex-a53-prfm.S", - "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-cortex-a53.S", - "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-cortex-a75-prfm.S", - "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-cortex-a75.S", - "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-ld64-prfm.S", - "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-ld64.S", - "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-cortex-a53-prfm.S", - "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-cortex-a53.S", - "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-cortex-a75-prfm.S", - 
"xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-cortex-a75.S", - "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-ld64-prfm.S", - "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-ld64.S", -] diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/amalgam/gen/armsimd32.c b/backends/xnnpack/third-party/xnnpack_wrappers/amalgam/gen/armsimd32.c deleted file mode 100644 index 62db44817d2..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/amalgam/gen/armsimd32.c +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__arm__) -#include -#endif /* defined(__arm__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/amalgam/gen/avx.c b/backends/xnnpack/third-party/xnnpack_wrappers/amalgam/gen/avx.c deleted file mode 100644 index d8c1e417a74..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/amalgam/gen/avx.c +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__i386__) || defined(__i686__) || defined(__x86_64__) -#include -#endif /* defined(__i386__) || defined(__i686__) || defined(__x86_64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/amalgam/gen/avx2.c b/backends/xnnpack/third-party/xnnpack_wrappers/amalgam/gen/avx2.c deleted file mode 100644 index 0759519af98..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/amalgam/gen/avx2.c +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__i386__) || defined(__i686__) || defined(__x86_64__) -#include -#endif /* defined(__i386__) || defined(__i686__) || defined(__x86_64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/amalgam/gen/avx512f.c b/backends/xnnpack/third-party/xnnpack_wrappers/amalgam/gen/avx512f.c deleted file mode 100644 index 672d0c4a490..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/amalgam/gen/avx512f.c +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__i386__) || defined(__i686__) || defined(__x86_64__) -#include -#endif /* defined(__i386__) || defined(__i686__) || defined(__x86_64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/amalgam/gen/avx512skx.c b/backends/xnnpack/third-party/xnnpack_wrappers/amalgam/gen/avx512skx.c deleted file mode 100644 index 10b570a3f3f..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/amalgam/gen/avx512skx.c +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__i386__) || defined(__i686__) || defined(__x86_64__) -#include -#endif /* defined(__i386__) || defined(__i686__) || defined(__x86_64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/amalgam/gen/avx512vbmi.c b/backends/xnnpack/third-party/xnnpack_wrappers/amalgam/gen/avx512vbmi.c deleted file mode 100644 index b3ac85dc9bb..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/amalgam/gen/avx512vbmi.c +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. 
Do not modify */ - -#if defined(__i386__) || defined(__i686__) || defined(__x86_64__) -#include -#endif /* defined(__i386__) || defined(__i686__) || defined(__x86_64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/amalgam/gen/avx512vnni.c b/backends/xnnpack/third-party/xnnpack_wrappers/amalgam/gen/avx512vnni.c deleted file mode 100644 index a8005d7c9fc..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/amalgam/gen/avx512vnni.c +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__i386__) || defined(__i686__) || defined(__x86_64__) -#include -#endif /* defined(__i386__) || defined(__i686__) || defined(__x86_64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/amalgam/gen/avx512vnnigfni.c b/backends/xnnpack/third-party/xnnpack_wrappers/amalgam/gen/avx512vnnigfni.c deleted file mode 100644 index c02b5a09ae0..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/amalgam/gen/avx512vnnigfni.c +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__i386__) || defined(__i686__) || defined(__x86_64__) -#include -#endif /* defined(__i386__) || defined(__i686__) || defined(__x86_64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/amalgam/gen/avxvnni.c b/backends/xnnpack/third-party/xnnpack_wrappers/amalgam/gen/avxvnni.c deleted file mode 100644 index d9221962349..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/amalgam/gen/avxvnni.c +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__i386__) || defined(__i686__) || defined(__x86_64__) -#include -#endif /* defined(__i386__) || defined(__i686__) || defined(__x86_64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/amalgam/gen/f16c.c b/backends/xnnpack/third-party/xnnpack_wrappers/amalgam/gen/f16c.c deleted file mode 100644 index dc0d944d978..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/amalgam/gen/f16c.c +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__i386__) || defined(__i686__) || defined(__x86_64__) -#include -#endif /* defined(__i386__) || defined(__i686__) || defined(__x86_64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/amalgam/gen/fma.c b/backends/xnnpack/third-party/xnnpack_wrappers/amalgam/gen/fma.c deleted file mode 100644 index 9fd97e1e99f..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/amalgam/gen/fma.c +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__riscv) || defined(__riscv__) -#include -#endif /* defined(__riscv) || defined(__riscv__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/amalgam/gen/fma3.c b/backends/xnnpack/third-party/xnnpack_wrappers/amalgam/gen/fma3.c deleted file mode 100644 index 71ed4c2ae8d..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/amalgam/gen/fma3.c +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. 
Do not modify */ - -#if defined(__i386__) || defined(__i686__) || defined(__x86_64__) -#include -#endif /* defined(__i386__) || defined(__i686__) || defined(__x86_64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/amalgam/gen/fp16arith.c b/backends/xnnpack/third-party/xnnpack_wrappers/amalgam/gen/fp16arith.c deleted file mode 100644 index 551d865ead2..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/amalgam/gen/fp16arith.c +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__arm__) -#include -#endif /* defined(__arm__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/amalgam/gen/neon-aarch64.c b/backends/xnnpack/third-party/xnnpack_wrappers/amalgam/gen/neon-aarch64.c deleted file mode 100644 index 82800dba4fa..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/amalgam/gen/neon-aarch64.c +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/amalgam/gen/neon.c b/backends/xnnpack/third-party/xnnpack_wrappers/amalgam/gen/neon.c deleted file mode 100644 index f48a3631cf0..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/amalgam/gen/neon.c +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__arm__) || defined(__aarch64__) -#include -#endif /* defined(__arm__) || defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/amalgam/gen/neondot-aarch64.c b/backends/xnnpack/third-party/xnnpack_wrappers/amalgam/gen/neondot-aarch64.c deleted file mode 100644 index 9d83d606f06..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/amalgam/gen/neondot-aarch64.c +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/amalgam/gen/neondot.c b/backends/xnnpack/third-party/xnnpack_wrappers/amalgam/gen/neondot.c deleted file mode 100644 index 8acde64fea8..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/amalgam/gen/neondot.c +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__arm__) || defined(__aarch64__) -#include -#endif /* defined(__arm__) || defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/amalgam/gen/neondotfp16-aarch64.c b/backends/xnnpack/third-party/xnnpack_wrappers/amalgam/gen/neondotfp16-aarch64.c deleted file mode 100644 index 56f6f0307ef..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/amalgam/gen/neondotfp16-aarch64.c +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/amalgam/gen/neondotfp16arith.c b/backends/xnnpack/third-party/xnnpack_wrappers/amalgam/gen/neondotfp16arith.c deleted file mode 100644 index 359b79e9795..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/amalgam/gen/neondotfp16arith.c +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. 
Do not modify */ - -#if defined(__arm__) || defined(__aarch64__) -#include -#endif /* defined(__arm__) || defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/amalgam/gen/neonfma-aarch64.c b/backends/xnnpack/third-party/xnnpack_wrappers/amalgam/gen/neonfma-aarch64.c deleted file mode 100644 index 351c55dfbe2..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/amalgam/gen/neonfma-aarch64.c +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/amalgam/gen/neonfma.c b/backends/xnnpack/third-party/xnnpack_wrappers/amalgam/gen/neonfma.c deleted file mode 100644 index 43c96f1031b..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/amalgam/gen/neonfma.c +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__arm__) || defined(__aarch64__) -#include -#endif /* defined(__arm__) || defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/amalgam/gen/neonfp16.c b/backends/xnnpack/third-party/xnnpack_wrappers/amalgam/gen/neonfp16.c deleted file mode 100644 index 83357d369f3..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/amalgam/gen/neonfp16.c +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__arm__) || defined(__aarch64__) -#include -#endif /* defined(__arm__) || defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/amalgam/gen/neonfp16arith-aarch64.c b/backends/xnnpack/third-party/xnnpack_wrappers/amalgam/gen/neonfp16arith-aarch64.c deleted file mode 100644 index fbd21f7c053..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/amalgam/gen/neonfp16arith-aarch64.c +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/amalgam/gen/neonfp16arith.c b/backends/xnnpack/third-party/xnnpack_wrappers/amalgam/gen/neonfp16arith.c deleted file mode 100644 index b5760084799..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/amalgam/gen/neonfp16arith.c +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__arm__) || defined(__aarch64__) -#include -#endif /* defined(__arm__) || defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/amalgam/gen/neoni8mm.c b/backends/xnnpack/third-party/xnnpack_wrappers/amalgam/gen/neoni8mm.c deleted file mode 100644 index 7daf0179ec5..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/amalgam/gen/neoni8mm.c +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__arm__) || defined(__aarch64__) -#include -#endif /* defined(__arm__) || defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/amalgam/gen/neonv8.c b/backends/xnnpack/third-party/xnnpack_wrappers/amalgam/gen/neonv8.c deleted file mode 100644 index 756b27b1104..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/amalgam/gen/neonv8.c +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. 
Do not modify */ - -#if defined(__arm__) || defined(__aarch64__) -#include -#endif /* defined(__arm__) || defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/amalgam/gen/rvv.c b/backends/xnnpack/third-party/xnnpack_wrappers/amalgam/gen/rvv.c deleted file mode 100644 index 612c2c23076..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/amalgam/gen/rvv.c +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__riscv) || defined(__riscv__) -#include -#endif /* defined(__riscv) || defined(__riscv__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/amalgam/gen/scalar.c b/backends/xnnpack/third-party/xnnpack_wrappers/amalgam/gen/scalar.c deleted file mode 100644 index dc5e2f4345d..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/amalgam/gen/scalar.c +++ /dev/null @@ -1,3 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#include diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/amalgam/gen/sse.c b/backends/xnnpack/third-party/xnnpack_wrappers/amalgam/gen/sse.c deleted file mode 100644 index dc4e5d28af4..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/amalgam/gen/sse.c +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__i386__) || defined(__i686__) || defined(__x86_64__) -#include -#endif /* defined(__i386__) || defined(__i686__) || defined(__x86_64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/amalgam/gen/sse2.c b/backends/xnnpack/third-party/xnnpack_wrappers/amalgam/gen/sse2.c deleted file mode 100644 index acca5bc1d46..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/amalgam/gen/sse2.c +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__i386__) || defined(__i686__) || defined(__x86_64__) -#include -#endif /* defined(__i386__) || defined(__i686__) || defined(__x86_64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/amalgam/gen/sse41.c b/backends/xnnpack/third-party/xnnpack_wrappers/amalgam/gen/sse41.c deleted file mode 100644 index 436f39a9a15..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/amalgam/gen/sse41.c +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__i386__) || defined(__i686__) || defined(__x86_64__) -#include -#endif /* defined(__i386__) || defined(__i686__) || defined(__x86_64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/amalgam/gen/ssse3.c b/backends/xnnpack/third-party/xnnpack_wrappers/amalgam/gen/ssse3.c deleted file mode 100644 index 95d2e4379e5..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/amalgam/gen/ssse3.c +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__i386__) || defined(__i686__) || defined(__x86_64__) -#include -#endif /* defined(__i386__) || defined(__i686__) || defined(__x86_64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/amalgam/gen/xop.c b/backends/xnnpack/third-party/xnnpack_wrappers/amalgam/gen/xop.c deleted file mode 100644 index 77c1592253a..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/amalgam/gen/xop.c +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. 
Do not modify */ - -#if defined(__i386__) || defined(__i686__) || defined(__x86_64__) -#include -#endif /* defined(__i386__) || defined(__i686__) || defined(__x86_64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/cs16-bfly4/cs16-bfly4-samples1-asm-aarch32-neon-x1.S b/backends/xnnpack/third-party/xnnpack_wrappers/cs16-bfly4/cs16-bfly4-samples1-asm-aarch32-neon-x1.S deleted file mode 100644 index 6fe004d839f..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/cs16-bfly4/cs16-bfly4-samples1-asm-aarch32-neon-x1.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__arm__) -#include -#endif /* defined(__arm__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/cs16-bfly4/cs16-bfly4-samples1-asm-aarch32-neon-x2.S b/backends/xnnpack/third-party/xnnpack_wrappers/cs16-bfly4/cs16-bfly4-samples1-asm-aarch32-neon-x2.S deleted file mode 100644 index 53744823784..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/cs16-bfly4/cs16-bfly4-samples1-asm-aarch32-neon-x2.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__arm__) -#include -#endif /* defined(__arm__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/cs16-bfly4/cs16-bfly4-samples1-asm-aarch32-neon-x4.S b/backends/xnnpack/third-party/xnnpack_wrappers/cs16-bfly4/cs16-bfly4-samples1-asm-aarch32-neon-x4.S deleted file mode 100644 index d5a594c666d..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/cs16-bfly4/cs16-bfly4-samples1-asm-aarch32-neon-x4.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__arm__) -#include -#endif /* defined(__arm__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/cs16-fftr/cs16-fftr-asm-aarch32-neon-x1.S b/backends/xnnpack/third-party/xnnpack_wrappers/cs16-fftr/cs16-fftr-asm-aarch32-neon-x1.S deleted file mode 100644 index ac2c282a3ac..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/cs16-fftr/cs16-fftr-asm-aarch32-neon-x1.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__arm__) -#include -#endif /* defined(__arm__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/cs16-fftr/cs16-fftr-asm-aarch32-neon-x4.S b/backends/xnnpack/third-party/xnnpack_wrappers/cs16-fftr/cs16-fftr-asm-aarch32-neon-x4.S deleted file mode 100644 index 33391c149b3..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/cs16-fftr/cs16-fftr-asm-aarch32-neon-x4.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__arm__) -#include -#endif /* defined(__arm__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f16-gemm/gen/f16-gemm-1x16-minmax-asm-aarch64-neonfp16arith-ld32.S b/backends/xnnpack/third-party/xnnpack_wrappers/f16-gemm/gen/f16-gemm-1x16-minmax-asm-aarch64-neonfp16arith-ld32.S deleted file mode 100644 index 92c24ce9649..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f16-gemm/gen/f16-gemm-1x16-minmax-asm-aarch64-neonfp16arith-ld32.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. 
Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f16-gemm/gen/f16-gemm-1x16-minmax-asm-aarch64-neonfp16arith-ld64.S b/backends/xnnpack/third-party/xnnpack_wrappers/f16-gemm/gen/f16-gemm-1x16-minmax-asm-aarch64-neonfp16arith-ld64.S deleted file mode 100644 index 9f0ed0c405a..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f16-gemm/gen/f16-gemm-1x16-minmax-asm-aarch64-neonfp16arith-ld64.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f16-gemm/gen/f16-gemm-1x8-minmax-asm-aarch64-neonfp16arith-ld64.S b/backends/xnnpack/third-party/xnnpack_wrappers/f16-gemm/gen/f16-gemm-1x8-minmax-asm-aarch64-neonfp16arith-ld64.S deleted file mode 100644 index 0c4c5163a57..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f16-gemm/gen/f16-gemm-1x8-minmax-asm-aarch64-neonfp16arith-ld64.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f16-gemm/gen/f16-gemm-4x16-minmax-asm-aarch64-neonfp16arith-ld32.S b/backends/xnnpack/third-party/xnnpack_wrappers/f16-gemm/gen/f16-gemm-4x16-minmax-asm-aarch64-neonfp16arith-ld32.S deleted file mode 100644 index 9f0944f7ece..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f16-gemm/gen/f16-gemm-4x16-minmax-asm-aarch64-neonfp16arith-ld32.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f16-gemm/gen/f16-gemm-4x16-minmax-asm-aarch64-neonfp16arith-ld64.S b/backends/xnnpack/third-party/xnnpack_wrappers/f16-gemm/gen/f16-gemm-4x16-minmax-asm-aarch64-neonfp16arith-ld64.S deleted file mode 100644 index 3ddaad0fdd9..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f16-gemm/gen/f16-gemm-4x16-minmax-asm-aarch64-neonfp16arith-ld64.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f16-gemm/gen/f16-gemm-4x8-minmax-asm-aarch64-neonfp16arith-ld64.S b/backends/xnnpack/third-party/xnnpack_wrappers/f16-gemm/gen/f16-gemm-4x8-minmax-asm-aarch64-neonfp16arith-ld64.S deleted file mode 100644 index ddc19830002..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f16-gemm/gen/f16-gemm-4x8-minmax-asm-aarch64-neonfp16arith-ld64.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f16-gemm/gen/f16-gemm-6x16-minmax-asm-aarch64-neonfp16arith-cortex-a55.S b/backends/xnnpack/third-party/xnnpack_wrappers/f16-gemm/gen/f16-gemm-6x16-minmax-asm-aarch64-neonfp16arith-cortex-a55.S deleted file mode 100644 index 1589528ebf5..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f16-gemm/gen/f16-gemm-6x16-minmax-asm-aarch64-neonfp16arith-cortex-a55.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. 
Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f16-gemm/gen/f16-gemm-6x16-minmax-asm-aarch64-neonfp16arith-cortex-a55r0.S b/backends/xnnpack/third-party/xnnpack_wrappers/f16-gemm/gen/f16-gemm-6x16-minmax-asm-aarch64-neonfp16arith-cortex-a55r0.S deleted file mode 100644 index 5904ddc2121..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f16-gemm/gen/f16-gemm-6x16-minmax-asm-aarch64-neonfp16arith-cortex-a55r0.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f16-gemm/gen/f16-gemm-6x16-minmax-asm-aarch64-neonfp16arith-cortex-a75.S b/backends/xnnpack/third-party/xnnpack_wrappers/f16-gemm/gen/f16-gemm-6x16-minmax-asm-aarch64-neonfp16arith-cortex-a75.S deleted file mode 100644 index 076db66a17c..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f16-gemm/gen/f16-gemm-6x16-minmax-asm-aarch64-neonfp16arith-cortex-a75.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f16-gemm/gen/f16-gemm-6x16-minmax-asm-aarch64-neonfp16arith-ld32.S b/backends/xnnpack/third-party/xnnpack_wrappers/f16-gemm/gen/f16-gemm-6x16-minmax-asm-aarch64-neonfp16arith-ld32.S deleted file mode 100644 index c3a561e618f..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f16-gemm/gen/f16-gemm-6x16-minmax-asm-aarch64-neonfp16arith-ld32.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f16-gemm/gen/f16-gemm-6x16-minmax-asm-aarch64-neonfp16arith-ld64.S b/backends/xnnpack/third-party/xnnpack_wrappers/f16-gemm/gen/f16-gemm-6x16-minmax-asm-aarch64-neonfp16arith-ld64.S deleted file mode 100644 index a65f69e1f75..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f16-gemm/gen/f16-gemm-6x16-minmax-asm-aarch64-neonfp16arith-ld64.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f16-gemm/gen/f16-gemm-6x8-minmax-asm-aarch64-neonfp16arith-ld64.S b/backends/xnnpack/third-party/xnnpack_wrappers/f16-gemm/gen/f16-gemm-6x8-minmax-asm-aarch64-neonfp16arith-ld64.S deleted file mode 100644 index d4c19845edb..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f16-gemm/gen/f16-gemm-6x8-minmax-asm-aarch64-neonfp16arith-ld64.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. 
Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f16-gemm/gen/f16-gemm-8x8-minmax-asm-aarch64-neonfp16arith-ld64.S b/backends/xnnpack/third-party/xnnpack_wrappers/f16-gemm/gen/f16-gemm-8x8-minmax-asm-aarch64-neonfp16arith-ld64.S deleted file mode 100644 index 47966f88a9c..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f16-gemm/gen/f16-gemm-8x8-minmax-asm-aarch64-neonfp16arith-ld64.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f16-gemm/gen/f16-gemminc-1x16-minmax-asm-aarch64-neonfp16arith-ld32.S b/backends/xnnpack/third-party/xnnpack_wrappers/f16-gemm/gen/f16-gemminc-1x16-minmax-asm-aarch64-neonfp16arith-ld32.S deleted file mode 100644 index 45d9b06b096..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f16-gemm/gen/f16-gemminc-1x16-minmax-asm-aarch64-neonfp16arith-ld32.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f16-gemm/gen/f16-gemminc-1x8-minmax-asm-aarch64-neonfp16arith-ld64.S b/backends/xnnpack/third-party/xnnpack_wrappers/f16-gemm/gen/f16-gemminc-1x8-minmax-asm-aarch64-neonfp16arith-ld64.S deleted file mode 100644 index 83eff5103e8..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f16-gemm/gen/f16-gemminc-1x8-minmax-asm-aarch64-neonfp16arith-ld64.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f16-gemm/gen/f16-gemminc-4x16-minmax-asm-aarch64-neonfp16arith-ld32.S b/backends/xnnpack/third-party/xnnpack_wrappers/f16-gemm/gen/f16-gemminc-4x16-minmax-asm-aarch64-neonfp16arith-ld32.S deleted file mode 100644 index d2f7538ff81..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f16-gemm/gen/f16-gemminc-4x16-minmax-asm-aarch64-neonfp16arith-ld32.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f16-gemm/gen/f16-gemminc-4x8-minmax-asm-aarch64-neonfp16arith-ld64.S b/backends/xnnpack/third-party/xnnpack_wrappers/f16-gemm/gen/f16-gemminc-4x8-minmax-asm-aarch64-neonfp16arith-ld64.S deleted file mode 100644 index 78688e936c2..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f16-gemm/gen/f16-gemminc-4x8-minmax-asm-aarch64-neonfp16arith-ld64.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. 
Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f16-gemm/gen/f16-gemminc-6x16-minmax-asm-aarch64-neonfp16arith-cortex-a55.S b/backends/xnnpack/third-party/xnnpack_wrappers/f16-gemm/gen/f16-gemminc-6x16-minmax-asm-aarch64-neonfp16arith-cortex-a55.S deleted file mode 100644 index 55313e07aca..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f16-gemm/gen/f16-gemminc-6x16-minmax-asm-aarch64-neonfp16arith-cortex-a55.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f16-gemm/gen/f16-gemminc-6x16-minmax-asm-aarch64-neonfp16arith-cortex-a75.S b/backends/xnnpack/third-party/xnnpack_wrappers/f16-gemm/gen/f16-gemminc-6x16-minmax-asm-aarch64-neonfp16arith-cortex-a75.S deleted file mode 100644 index 9e6df8b5816..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f16-gemm/gen/f16-gemminc-6x16-minmax-asm-aarch64-neonfp16arith-cortex-a75.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f16-gemm/gen/f16-gemminc-6x16-minmax-asm-aarch64-neonfp16arith-ld32.S b/backends/xnnpack/third-party/xnnpack_wrappers/f16-gemm/gen/f16-gemminc-6x16-minmax-asm-aarch64-neonfp16arith-ld32.S deleted file mode 100644 index ef9fc56f5ed..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f16-gemm/gen/f16-gemminc-6x16-minmax-asm-aarch64-neonfp16arith-ld32.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f16-gemm/gen/f16-gemminc-6x8-minmax-asm-aarch64-neonfp16arith-ld64.S b/backends/xnnpack/third-party/xnnpack_wrappers/f16-gemm/gen/f16-gemminc-6x8-minmax-asm-aarch64-neonfp16arith-ld64.S deleted file mode 100644 index 50563539d73..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f16-gemm/gen/f16-gemminc-6x8-minmax-asm-aarch64-neonfp16arith-ld64.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f16-gemm/gen/f16-gemminc-8x8-minmax-asm-aarch64-neonfp16arith-ld64.S b/backends/xnnpack/third-party/xnnpack_wrappers/f16-gemm/gen/f16-gemminc-8x8-minmax-asm-aarch64-neonfp16arith-ld64.S deleted file mode 100644 index 5ee04b0b6d3..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f16-gemm/gen/f16-gemminc-8x8-minmax-asm-aarch64-neonfp16arith-ld64.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. 
Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f16-igemm/f16-igemm-1x16-minmax-asm-aarch64-neonfp16arith-ld32.S b/backends/xnnpack/third-party/xnnpack_wrappers/f16-igemm/f16-igemm-1x16-minmax-asm-aarch64-neonfp16arith-ld32.S deleted file mode 100644 index 46560fd7864..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f16-igemm/f16-igemm-1x16-minmax-asm-aarch64-neonfp16arith-ld32.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f16-igemm/f16-igemm-1x16-minmax-asm-aarch64-neonfp16arith-ld64.S b/backends/xnnpack/third-party/xnnpack_wrappers/f16-igemm/f16-igemm-1x16-minmax-asm-aarch64-neonfp16arith-ld64.S deleted file mode 100644 index be26c7eaeb3..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f16-igemm/f16-igemm-1x16-minmax-asm-aarch64-neonfp16arith-ld64.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f16-igemm/f16-igemm-4x16-minmax-asm-aarch64-neonfp16arith-ld32.S b/backends/xnnpack/third-party/xnnpack_wrappers/f16-igemm/f16-igemm-4x16-minmax-asm-aarch64-neonfp16arith-ld32.S deleted file mode 100644 index d50fa9a8198..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f16-igemm/f16-igemm-4x16-minmax-asm-aarch64-neonfp16arith-ld32.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f16-igemm/f16-igemm-4x16-minmax-asm-aarch64-neonfp16arith-ld64.S b/backends/xnnpack/third-party/xnnpack_wrappers/f16-igemm/f16-igemm-4x16-minmax-asm-aarch64-neonfp16arith-ld64.S deleted file mode 100644 index 2dbeea99793..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f16-igemm/f16-igemm-4x16-minmax-asm-aarch64-neonfp16arith-ld64.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f16-igemm/f16-igemm-6x16-minmax-asm-aarch64-neonfp16arith-cortex-a55.S b/backends/xnnpack/third-party/xnnpack_wrappers/f16-igemm/f16-igemm-6x16-minmax-asm-aarch64-neonfp16arith-cortex-a55.S deleted file mode 100644 index dfff65b0044..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f16-igemm/f16-igemm-6x16-minmax-asm-aarch64-neonfp16arith-cortex-a55.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f16-igemm/f16-igemm-6x16-minmax-asm-aarch64-neonfp16arith-cortex-a55r0.S b/backends/xnnpack/third-party/xnnpack_wrappers/f16-igemm/f16-igemm-6x16-minmax-asm-aarch64-neonfp16arith-cortex-a55r0.S deleted file mode 100644 index ed504d232fb..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f16-igemm/f16-igemm-6x16-minmax-asm-aarch64-neonfp16arith-cortex-a55r0.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. 
Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f16-igemm/f16-igemm-6x16-minmax-asm-aarch64-neonfp16arith-cortex-a75.S b/backends/xnnpack/third-party/xnnpack_wrappers/f16-igemm/f16-igemm-6x16-minmax-asm-aarch64-neonfp16arith-cortex-a75.S deleted file mode 100644 index fb5b612f93a..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f16-igemm/f16-igemm-6x16-minmax-asm-aarch64-neonfp16arith-cortex-a75.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f16-igemm/f16-igemm-6x16-minmax-asm-aarch64-neonfp16arith-ld32.S b/backends/xnnpack/third-party/xnnpack_wrappers/f16-igemm/f16-igemm-6x16-minmax-asm-aarch64-neonfp16arith-ld32.S deleted file mode 100644 index 6bf8d002654..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f16-igemm/f16-igemm-6x16-minmax-asm-aarch64-neonfp16arith-ld32.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f16-igemm/f16-igemm-6x16-minmax-asm-aarch64-neonfp16arith-ld64.S b/backends/xnnpack/third-party/xnnpack_wrappers/f16-igemm/f16-igemm-6x16-minmax-asm-aarch64-neonfp16arith-ld64.S deleted file mode 100644 index 3d78676f932..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f16-igemm/f16-igemm-6x16-minmax-asm-aarch64-neonfp16arith-ld64.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f32-dwconv/f32-dwconv-9p4c-minmax-asm-aarch64-neonfma-cortex-a55.S b/backends/xnnpack/third-party/xnnpack_wrappers/f32-dwconv/f32-dwconv-9p4c-minmax-asm-aarch64-neonfma-cortex-a55.S deleted file mode 100644 index cc2194fabea..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f32-dwconv/f32-dwconv-9p4c-minmax-asm-aarch64-neonfma-cortex-a55.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f32-dwconv/f32-dwconv-9p4c-minmax-asm-aarch64-neonfma.S b/backends/xnnpack/third-party/xnnpack_wrappers/f32-dwconv/f32-dwconv-9p4c-minmax-asm-aarch64-neonfma.S deleted file mode 100644 index 7b6c7740d6b..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f32-dwconv/f32-dwconv-9p4c-minmax-asm-aarch64-neonfma.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemm/gen/f32-gemm-1x12-minmax-asm-aarch64-neonfma-cortex-a53.S b/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemm/gen/f32-gemm-1x12-minmax-asm-aarch64-neonfma-cortex-a53.S deleted file mode 100644 index ae3fccf9ecc..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemm/gen/f32-gemm-1x12-minmax-asm-aarch64-neonfma-cortex-a53.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. 
Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemm/gen/f32-gemm-1x8-minmax-asm-aarch32-neon-cortex-a53-prfm.S b/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemm/gen/f32-gemm-1x8-minmax-asm-aarch32-neon-cortex-a53-prfm.S deleted file mode 100644 index 5af4ff5e9be..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemm/gen/f32-gemm-1x8-minmax-asm-aarch32-neon-cortex-a53-prfm.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__arm__) -#include -#endif /* defined(__arm__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemm/gen/f32-gemm-1x8-minmax-asm-aarch32-neon-cortex-a53.S b/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemm/gen/f32-gemm-1x8-minmax-asm-aarch32-neon-cortex-a53.S deleted file mode 100644 index e44b3cb3457..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemm/gen/f32-gemm-1x8-minmax-asm-aarch32-neon-cortex-a53.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__arm__) -#include -#endif /* defined(__arm__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemm/gen/f32-gemm-1x8-minmax-asm-aarch64-neon-ld128-acc2-prfm.S b/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemm/gen/f32-gemm-1x8-minmax-asm-aarch64-neon-ld128-acc2-prfm.S deleted file mode 100644 index 9bff7e793e5..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemm/gen/f32-gemm-1x8-minmax-asm-aarch64-neon-ld128-acc2-prfm.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemm/gen/f32-gemm-1x8-minmax-asm-aarch64-neon-ld128-acc2.S b/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemm/gen/f32-gemm-1x8-minmax-asm-aarch64-neon-ld128-acc2.S deleted file mode 100644 index 35687e5a6ca..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemm/gen/f32-gemm-1x8-minmax-asm-aarch64-neon-ld128-acc2.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemm/gen/f32-gemm-1x8-minmax-asm-aarch64-neonfma-cortex-a53-prfm.S b/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemm/gen/f32-gemm-1x8-minmax-asm-aarch64-neonfma-cortex-a53-prfm.S deleted file mode 100644 index de2943f6fbf..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemm/gen/f32-gemm-1x8-minmax-asm-aarch64-neonfma-cortex-a53-prfm.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemm/gen/f32-gemm-1x8-minmax-asm-aarch64-neonfma-cortex-a53.S b/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemm/gen/f32-gemm-1x8-minmax-asm-aarch64-neonfma-cortex-a53.S deleted file mode 100644 index a69d095ed7c..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemm/gen/f32-gemm-1x8-minmax-asm-aarch64-neonfma-cortex-a53.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. 
Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemm/gen/f32-gemm-1x8-minmax-asm-aarch64-neonfma-cortex-a75-prfm.S b/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemm/gen/f32-gemm-1x8-minmax-asm-aarch64-neonfma-cortex-a75-prfm.S deleted file mode 100644 index affcc0839b7..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemm/gen/f32-gemm-1x8-minmax-asm-aarch64-neonfma-cortex-a75-prfm.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemm/gen/f32-gemm-1x8-minmax-asm-aarch64-neonfma-cortex-a75.S b/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemm/gen/f32-gemm-1x8-minmax-asm-aarch64-neonfma-cortex-a75.S deleted file mode 100644 index 78ee0ac66e6..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemm/gen/f32-gemm-1x8-minmax-asm-aarch64-neonfma-cortex-a75.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemm/gen/f32-gemm-1x8-minmax-asm-aarch64-neonfma-ld128-acc2-prfm.S b/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemm/gen/f32-gemm-1x8-minmax-asm-aarch64-neonfma-ld128-acc2-prfm.S deleted file mode 100644 index d56decf99d2..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemm/gen/f32-gemm-1x8-minmax-asm-aarch64-neonfma-ld128-acc2-prfm.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemm/gen/f32-gemm-1x8-minmax-asm-aarch64-neonfma-ld128-acc2.S b/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemm/gen/f32-gemm-1x8-minmax-asm-aarch64-neonfma-ld128-acc2.S deleted file mode 100644 index ae1255d8169..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemm/gen/f32-gemm-1x8-minmax-asm-aarch64-neonfma-ld128-acc2.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemm/gen/f32-gemm-1x8-minmax-asm-aarch64-neonfma-ld128-acc4-prfm.S b/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemm/gen/f32-gemm-1x8-minmax-asm-aarch64-neonfma-ld128-acc4-prfm.S deleted file mode 100644 index d859d79c344..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemm/gen/f32-gemm-1x8-minmax-asm-aarch64-neonfma-ld128-acc4-prfm.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemm/gen/f32-gemm-1x8-minmax-asm-aarch64-neonfma-ld128-acc4.S b/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemm/gen/f32-gemm-1x8-minmax-asm-aarch64-neonfma-ld128-acc4.S deleted file mode 100644 index f03c2a038a3..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemm/gen/f32-gemm-1x8-minmax-asm-aarch64-neonfma-ld128-acc4.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. 
Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemm/gen/f32-gemm-1x8-minmax-asm-aarch64-neonfma-ld128-prfm.S b/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemm/gen/f32-gemm-1x8-minmax-asm-aarch64-neonfma-ld128-prfm.S deleted file mode 100644 index f6b1147b82e..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemm/gen/f32-gemm-1x8-minmax-asm-aarch64-neonfma-ld128-prfm.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemm/gen/f32-gemm-1x8-minmax-asm-aarch64-neonfma-ld128.S b/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemm/gen/f32-gemm-1x8-minmax-asm-aarch64-neonfma-ld128.S deleted file mode 100644 index c0cb6c9e728..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemm/gen/f32-gemm-1x8-minmax-asm-aarch64-neonfma-ld128.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemm/gen/f32-gemm-1x8-minmax-asm-aarch64-neonfma-ld64-acc2-prfm.S b/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemm/gen/f32-gemm-1x8-minmax-asm-aarch64-neonfma-ld64-acc2-prfm.S deleted file mode 100644 index 3b840e13ad3..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemm/gen/f32-gemm-1x8-minmax-asm-aarch64-neonfma-ld64-acc2-prfm.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemm/gen/f32-gemm-1x8-minmax-asm-aarch64-neonfma-ld64-acc2.S b/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemm/gen/f32-gemm-1x8-minmax-asm-aarch64-neonfma-ld64-acc2.S deleted file mode 100644 index 76dde6f9818..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemm/gen/f32-gemm-1x8-minmax-asm-aarch64-neonfma-ld64-acc2.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemm/gen/f32-gemm-1x8-minmax-asm-aarch64-neonfma-ld64-acc4-prfm.S b/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemm/gen/f32-gemm-1x8-minmax-asm-aarch64-neonfma-ld64-acc4-prfm.S deleted file mode 100644 index a84a9373ef2..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemm/gen/f32-gemm-1x8-minmax-asm-aarch64-neonfma-ld64-acc4-prfm.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemm/gen/f32-gemm-1x8-minmax-asm-aarch64-neonfma-ld64-acc4.S b/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemm/gen/f32-gemm-1x8-minmax-asm-aarch64-neonfma-ld64-acc4.S deleted file mode 100644 index c70624c383e..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemm/gen/f32-gemm-1x8-minmax-asm-aarch64-neonfma-ld64-acc4.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. 
Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemm/gen/f32-gemm-1x8-minmax-asm-aarch64-neonfma-ld64-prfm.S b/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemm/gen/f32-gemm-1x8-minmax-asm-aarch64-neonfma-ld64-prfm.S deleted file mode 100644 index 6bb2c8abeb4..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemm/gen/f32-gemm-1x8-minmax-asm-aarch64-neonfma-ld64-prfm.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemm/gen/f32-gemm-1x8-minmax-asm-aarch64-neonfma-ld64.S b/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemm/gen/f32-gemm-1x8-minmax-asm-aarch64-neonfma-ld64.S deleted file mode 100644 index 6a5b2de57be..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemm/gen/f32-gemm-1x8-minmax-asm-aarch64-neonfma-ld64.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemm/gen/f32-gemm-4x1-minmax-asm-aarch64-neonfma-ld128.S b/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemm/gen/f32-gemm-4x1-minmax-asm-aarch64-neonfma-ld128.S deleted file mode 100644 index d1d69b3d62d..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemm/gen/f32-gemm-4x1-minmax-asm-aarch64-neonfma-ld128.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemm/gen/f32-gemm-4x1-minmax-asm-aarch64-neonfma-ld64.S b/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemm/gen/f32-gemm-4x1-minmax-asm-aarch64-neonfma-ld64.S deleted file mode 100644 index 18467988d26..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemm/gen/f32-gemm-4x1-minmax-asm-aarch64-neonfma-ld64.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemm/gen/f32-gemm-4x12-minmax-asm-aarch64-neonfma-cortex-a53.S b/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemm/gen/f32-gemm-4x12-minmax-asm-aarch64-neonfma-cortex-a53.S deleted file mode 100644 index 9b2179fec74..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemm/gen/f32-gemm-4x12-minmax-asm-aarch64-neonfma-cortex-a53.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemm/gen/f32-gemm-4x2-minmax-asm-aarch64-neonfma-cortex-a75-prfm.S b/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemm/gen/f32-gemm-4x2-minmax-asm-aarch64-neonfma-cortex-a75-prfm.S deleted file mode 100644 index 8c7a23b62ba..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemm/gen/f32-gemm-4x2-minmax-asm-aarch64-neonfma-cortex-a75-prfm.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. 
Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemm/gen/f32-gemm-4x2-minmax-asm-aarch64-neonfma-cortex-a75.S b/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemm/gen/f32-gemm-4x2-minmax-asm-aarch64-neonfma-cortex-a75.S deleted file mode 100644 index 66c70476b59..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemm/gen/f32-gemm-4x2-minmax-asm-aarch64-neonfma-cortex-a75.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemm/gen/f32-gemm-4x2-minmax-asm-aarch64-neonfma-ld128.S b/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemm/gen/f32-gemm-4x2-minmax-asm-aarch64-neonfma-ld128.S deleted file mode 100644 index c885f2f7bdb..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemm/gen/f32-gemm-4x2-minmax-asm-aarch64-neonfma-ld128.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemm/gen/f32-gemm-4x2-minmax-asm-aarch64-neonfma-ld64.S b/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemm/gen/f32-gemm-4x2-minmax-asm-aarch64-neonfma-ld64.S deleted file mode 100644 index 5cf5567d712..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemm/gen/f32-gemm-4x2-minmax-asm-aarch64-neonfma-ld64.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemm/gen/f32-gemm-4x4-asm-aarch32-vfp-ld64.S b/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemm/gen/f32-gemm-4x4-asm-aarch32-vfp-ld64.S deleted file mode 100644 index c211eac53b8..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemm/gen/f32-gemm-4x4-asm-aarch32-vfp-ld64.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__arm__) -#include -#endif /* defined(__arm__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemm/gen/f32-gemm-4x4-minmax-asm-aarch32-vfp-ld64.S b/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemm/gen/f32-gemm-4x4-minmax-asm-aarch32-vfp-ld64.S deleted file mode 100644 index db90cca787e..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemm/gen/f32-gemm-4x4-minmax-asm-aarch32-vfp-ld64.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__arm__) -#include -#endif /* defined(__arm__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemm/gen/f32-gemm-4x8-minmax-asm-aarch32-neon-cortex-a53-prfm.S b/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemm/gen/f32-gemm-4x8-minmax-asm-aarch32-neon-cortex-a53-prfm.S deleted file mode 100644 index a3201c58201..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemm/gen/f32-gemm-4x8-minmax-asm-aarch32-neon-cortex-a53-prfm.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. 
Do not modify */ - -#if defined(__arm__) -#include -#endif /* defined(__arm__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemm/gen/f32-gemm-4x8-minmax-asm-aarch32-neon-cortex-a53.S b/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemm/gen/f32-gemm-4x8-minmax-asm-aarch32-neon-cortex-a53.S deleted file mode 100644 index 680f9fb33d2..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemm/gen/f32-gemm-4x8-minmax-asm-aarch32-neon-cortex-a53.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__arm__) -#include -#endif /* defined(__arm__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemm/gen/f32-gemm-4x8-minmax-asm-aarch32-neon-cortex-a55.S b/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemm/gen/f32-gemm-4x8-minmax-asm-aarch32-neon-cortex-a55.S deleted file mode 100644 index 73a0c7df000..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemm/gen/f32-gemm-4x8-minmax-asm-aarch32-neon-cortex-a55.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__arm__) -#include -#endif /* defined(__arm__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemm/gen/f32-gemm-4x8-minmax-asm-aarch32-neon-cortex-a7.S b/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemm/gen/f32-gemm-4x8-minmax-asm-aarch32-neon-cortex-a7.S deleted file mode 100644 index ce746e110c9..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemm/gen/f32-gemm-4x8-minmax-asm-aarch32-neon-cortex-a7.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__arm__) -#include -#endif /* defined(__arm__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemm/gen/f32-gemm-4x8-minmax-asm-aarch32-neon-cortex-a75-prfm.S b/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemm/gen/f32-gemm-4x8-minmax-asm-aarch32-neon-cortex-a75-prfm.S deleted file mode 100644 index c42e92b459e..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemm/gen/f32-gemm-4x8-minmax-asm-aarch32-neon-cortex-a75-prfm.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__arm__) -#include -#endif /* defined(__arm__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemm/gen/f32-gemm-4x8-minmax-asm-aarch32-neon-cortex-a75.S b/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemm/gen/f32-gemm-4x8-minmax-asm-aarch32-neon-cortex-a75.S deleted file mode 100644 index b85c98e6047..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemm/gen/f32-gemm-4x8-minmax-asm-aarch32-neon-cortex-a75.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__arm__) -#include -#endif /* defined(__arm__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemm/gen/f32-gemm-4x8-minmax-asm-aarch32-neon-ld64.S b/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemm/gen/f32-gemm-4x8-minmax-asm-aarch32-neon-ld64.S deleted file mode 100644 index a5870edb01c..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemm/gen/f32-gemm-4x8-minmax-asm-aarch32-neon-ld64.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. 
Do not modify */ - -#if defined(__arm__) -#include -#endif /* defined(__arm__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemm/gen/f32-gemm-4x8-minmax-asm-aarch64-neonfma-cortex-a53-prfm.S b/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemm/gen/f32-gemm-4x8-minmax-asm-aarch64-neonfma-cortex-a53-prfm.S deleted file mode 100644 index 5308899f078..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemm/gen/f32-gemm-4x8-minmax-asm-aarch64-neonfma-cortex-a53-prfm.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemm/gen/f32-gemm-4x8-minmax-asm-aarch64-neonfma-cortex-a53.S b/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemm/gen/f32-gemm-4x8-minmax-asm-aarch64-neonfma-cortex-a53.S deleted file mode 100644 index e9d36d76133..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemm/gen/f32-gemm-4x8-minmax-asm-aarch64-neonfma-cortex-a53.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemm/gen/f32-gemm-4x8-minmax-asm-aarch64-neonfma-cortex-a55.S b/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemm/gen/f32-gemm-4x8-minmax-asm-aarch64-neonfma-cortex-a55.S deleted file mode 100644 index c2985c07bd7..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemm/gen/f32-gemm-4x8-minmax-asm-aarch64-neonfma-cortex-a55.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemm/gen/f32-gemm-4x8-minmax-asm-aarch64-neonfma-cortex-a75-prfm.S b/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemm/gen/f32-gemm-4x8-minmax-asm-aarch64-neonfma-cortex-a75-prfm.S deleted file mode 100644 index da27ac40299..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemm/gen/f32-gemm-4x8-minmax-asm-aarch64-neonfma-cortex-a75-prfm.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemm/gen/f32-gemm-4x8-minmax-asm-aarch64-neonfma-cortex-a75.S b/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemm/gen/f32-gemm-4x8-minmax-asm-aarch64-neonfma-cortex-a75.S deleted file mode 100644 index 2ec409afdac..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemm/gen/f32-gemm-4x8-minmax-asm-aarch64-neonfma-cortex-a75.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemm/gen/f32-gemm-4x8-minmax-asm-aarch64-neonfma-ld128.S b/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemm/gen/f32-gemm-4x8-minmax-asm-aarch64-neonfma-ld128.S deleted file mode 100644 index ae0d82e598f..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemm/gen/f32-gemm-4x8-minmax-asm-aarch64-neonfma-ld128.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. 
Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemm/gen/f32-gemm-4x8-minmax-asm-aarch64-neonfma-ld64.S b/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemm/gen/f32-gemm-4x8-minmax-asm-aarch64-neonfma-ld64.S deleted file mode 100644 index cd4aae17a19..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemm/gen/f32-gemm-4x8-minmax-asm-aarch64-neonfma-ld64.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemm/gen/f32-gemm-5x8-minmax-asm-aarch64-neonfma-cortex-a75-prfm.S b/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemm/gen/f32-gemm-5x8-minmax-asm-aarch64-neonfma-cortex-a75-prfm.S deleted file mode 100644 index 43fb97f6c25..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemm/gen/f32-gemm-5x8-minmax-asm-aarch64-neonfma-cortex-a75-prfm.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemm/gen/f32-gemm-5x8-minmax-asm-aarch64-neonfma-cortex-a75.S b/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemm/gen/f32-gemm-5x8-minmax-asm-aarch64-neonfma-cortex-a75.S deleted file mode 100644 index 4e76af2a8a8..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemm/gen/f32-gemm-5x8-minmax-asm-aarch64-neonfma-cortex-a75.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemm/gen/f32-gemm-6x8-minmax-asm-aarch64-neonfma-cortex-a53-prfm.S b/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemm/gen/f32-gemm-6x8-minmax-asm-aarch64-neonfma-cortex-a53-prfm.S deleted file mode 100644 index d3496226cf0..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemm/gen/f32-gemm-6x8-minmax-asm-aarch64-neonfma-cortex-a53-prfm.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemm/gen/f32-gemm-6x8-minmax-asm-aarch64-neonfma-cortex-a53.S b/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemm/gen/f32-gemm-6x8-minmax-asm-aarch64-neonfma-cortex-a53.S deleted file mode 100644 index 93ebb32392b..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemm/gen/f32-gemm-6x8-minmax-asm-aarch64-neonfma-cortex-a53.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemm/gen/f32-gemm-6x8-minmax-asm-aarch64-neonfma-cortex-a55.S b/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemm/gen/f32-gemm-6x8-minmax-asm-aarch64-neonfma-cortex-a55.S deleted file mode 100644 index 570a5bfee83..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemm/gen/f32-gemm-6x8-minmax-asm-aarch64-neonfma-cortex-a55.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. 
Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemm/gen/f32-gemm-6x8-minmax-asm-aarch64-neonfma-cortex-a73.S b/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemm/gen/f32-gemm-6x8-minmax-asm-aarch64-neonfma-cortex-a73.S deleted file mode 100644 index c6d7c30d63f..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemm/gen/f32-gemm-6x8-minmax-asm-aarch64-neonfma-cortex-a73.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemm/gen/f32-gemm-6x8-minmax-asm-aarch64-neonfma-cortex-a75-prfm.S b/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemm/gen/f32-gemm-6x8-minmax-asm-aarch64-neonfma-cortex-a75-prfm.S deleted file mode 100644 index 308c8ad0f70..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemm/gen/f32-gemm-6x8-minmax-asm-aarch64-neonfma-cortex-a75-prfm.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemm/gen/f32-gemm-6x8-minmax-asm-aarch64-neonfma-cortex-a75.S b/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemm/gen/f32-gemm-6x8-minmax-asm-aarch64-neonfma-cortex-a75.S deleted file mode 100644 index f2c2176ade4..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemm/gen/f32-gemm-6x8-minmax-asm-aarch64-neonfma-cortex-a75.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemm/gen/f32-gemm-6x8-minmax-asm-aarch64-neonfma-ld128.S b/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemm/gen/f32-gemm-6x8-minmax-asm-aarch64-neonfma-ld128.S deleted file mode 100644 index 54b418192bd..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemm/gen/f32-gemm-6x8-minmax-asm-aarch64-neonfma-ld128.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemm/gen/f32-gemm-6x8-minmax-asm-aarch64-neonfma-ld64.S b/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemm/gen/f32-gemm-6x8-minmax-asm-aarch64-neonfma-ld64.S deleted file mode 100644 index cdd0bb28904..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemm/gen/f32-gemm-6x8-minmax-asm-aarch64-neonfma-ld64.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemm/gen/f32-gemm-goi-1x8-minmax-asm-aarch64-neonfma-ld128-prfm.S b/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemm/gen/f32-gemm-goi-1x8-minmax-asm-aarch64-neonfma-ld128-prfm.S deleted file mode 100644 index 97eb07f90a8..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemm/gen/f32-gemm-goi-1x8-minmax-asm-aarch64-neonfma-ld128-prfm.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. 
Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemm/gen/f32-gemm-goi-1x8-minmax-asm-aarch64-neonfma-ld128.S b/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemm/gen/f32-gemm-goi-1x8-minmax-asm-aarch64-neonfma-ld128.S deleted file mode 100644 index 7ec4b5e8f28..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemm/gen/f32-gemm-goi-1x8-minmax-asm-aarch64-neonfma-ld128.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemm/gen/f32-gemm-goi-4x8-minmax-asm-aarch64-neonfma-ld128.S b/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemm/gen/f32-gemm-goi-4x8-minmax-asm-aarch64-neonfma-ld128.S deleted file mode 100644 index f8beacb9982..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemm/gen/f32-gemm-goi-4x8-minmax-asm-aarch64-neonfma-ld128.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemminc/gen/f32-gemminc-1x12-minmax-asm-aarch64-neonfma-cortex-a53.S b/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemminc/gen/f32-gemminc-1x12-minmax-asm-aarch64-neonfma-cortex-a53.S deleted file mode 100644 index ff82cd19abc..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemminc/gen/f32-gemminc-1x12-minmax-asm-aarch64-neonfma-cortex-a53.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemminc/gen/f32-gemminc-1x8-minmax-asm-aarch64-neonfma-cortex-a53.S b/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemminc/gen/f32-gemminc-1x8-minmax-asm-aarch64-neonfma-cortex-a53.S deleted file mode 100644 index 59c74c432d2..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemminc/gen/f32-gemminc-1x8-minmax-asm-aarch64-neonfma-cortex-a53.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemminc/gen/f32-gemminc-1x8-minmax-asm-aarch64-neonfma-cortex-a75-prfm.S b/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemminc/gen/f32-gemminc-1x8-minmax-asm-aarch64-neonfma-cortex-a75-prfm.S deleted file mode 100644 index 44b6ef72a0d..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemminc/gen/f32-gemminc-1x8-minmax-asm-aarch64-neonfma-cortex-a75-prfm.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. 
Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemminc/gen/f32-gemminc-1x8-minmax-asm-aarch64-neonfma-cortex-a75.S b/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemminc/gen/f32-gemminc-1x8-minmax-asm-aarch64-neonfma-cortex-a75.S deleted file mode 100644 index 9c49b8ada10..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemminc/gen/f32-gemminc-1x8-minmax-asm-aarch64-neonfma-cortex-a75.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemminc/gen/f32-gemminc-1x8-minmax-asm-aarch64-neonfma-ld64.S b/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemminc/gen/f32-gemminc-1x8-minmax-asm-aarch64-neonfma-ld64.S deleted file mode 100644 index cafc14bb4f0..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemminc/gen/f32-gemminc-1x8-minmax-asm-aarch64-neonfma-ld64.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemminc/gen/f32-gemminc-4x12-minmax-asm-aarch64-neonfma-cortex-a53.S b/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemminc/gen/f32-gemminc-4x12-minmax-asm-aarch64-neonfma-cortex-a53.S deleted file mode 100644 index d434d5673bc..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemminc/gen/f32-gemminc-4x12-minmax-asm-aarch64-neonfma-cortex-a53.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemminc/gen/f32-gemminc-4x8-minmax-asm-aarch64-neonfma-cortex-a53.S b/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemminc/gen/f32-gemminc-4x8-minmax-asm-aarch64-neonfma-cortex-a53.S deleted file mode 100644 index 3e7b62114d5..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemminc/gen/f32-gemminc-4x8-minmax-asm-aarch64-neonfma-cortex-a53.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemminc/gen/f32-gemminc-4x8-minmax-asm-aarch64-neonfma-cortex-a55.S b/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemminc/gen/f32-gemminc-4x8-minmax-asm-aarch64-neonfma-cortex-a55.S deleted file mode 100644 index 41d518192a6..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemminc/gen/f32-gemminc-4x8-minmax-asm-aarch64-neonfma-cortex-a55.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. 
Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemminc/gen/f32-gemminc-4x8-minmax-asm-aarch64-neonfma-cortex-a75-prfm.S b/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemminc/gen/f32-gemminc-4x8-minmax-asm-aarch64-neonfma-cortex-a75-prfm.S deleted file mode 100644 index 488d08a63fc..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemminc/gen/f32-gemminc-4x8-minmax-asm-aarch64-neonfma-cortex-a75-prfm.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemminc/gen/f32-gemminc-4x8-minmax-asm-aarch64-neonfma-cortex-a75.S b/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemminc/gen/f32-gemminc-4x8-minmax-asm-aarch64-neonfma-cortex-a75.S deleted file mode 100644 index 08b362f972e..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemminc/gen/f32-gemminc-4x8-minmax-asm-aarch64-neonfma-cortex-a75.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemminc/gen/f32-gemminc-4x8-minmax-asm-aarch64-neonfma-ld128.S b/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemminc/gen/f32-gemminc-4x8-minmax-asm-aarch64-neonfma-ld128.S deleted file mode 100644 index c5fa9fc71e9..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemminc/gen/f32-gemminc-4x8-minmax-asm-aarch64-neonfma-ld128.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemminc/gen/f32-gemminc-4x8-minmax-asm-aarch64-neonfma-ld64.S b/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemminc/gen/f32-gemminc-4x8-minmax-asm-aarch64-neonfma-ld64.S deleted file mode 100644 index 6638c6380e5..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemminc/gen/f32-gemminc-4x8-minmax-asm-aarch64-neonfma-ld64.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemminc/gen/f32-gemminc-5x8-minmax-asm-aarch64-neonfma-cortex-a75-prfm.S b/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemminc/gen/f32-gemminc-5x8-minmax-asm-aarch64-neonfma-cortex-a75-prfm.S deleted file mode 100644 index 534e9cb4e13..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemminc/gen/f32-gemminc-5x8-minmax-asm-aarch64-neonfma-cortex-a75-prfm.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. 
Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemminc/gen/f32-gemminc-5x8-minmax-asm-aarch64-neonfma-cortex-a75.S b/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemminc/gen/f32-gemminc-5x8-minmax-asm-aarch64-neonfma-cortex-a75.S deleted file mode 100644 index 71d0f2a79c1..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemminc/gen/f32-gemminc-5x8-minmax-asm-aarch64-neonfma-cortex-a75.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemminc/gen/f32-gemminc-6x8-minmax-asm-aarch64-neonfma-cortex-a53.S b/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemminc/gen/f32-gemminc-6x8-minmax-asm-aarch64-neonfma-cortex-a53.S deleted file mode 100644 index d3179046fce..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemminc/gen/f32-gemminc-6x8-minmax-asm-aarch64-neonfma-cortex-a53.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemminc/gen/f32-gemminc-6x8-minmax-asm-aarch64-neonfma-cortex-a55.S b/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemminc/gen/f32-gemminc-6x8-minmax-asm-aarch64-neonfma-cortex-a55.S deleted file mode 100644 index c1d1e7e0f3c..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemminc/gen/f32-gemminc-6x8-minmax-asm-aarch64-neonfma-cortex-a55.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemminc/gen/f32-gemminc-6x8-minmax-asm-aarch64-neonfma-cortex-a73.S b/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemminc/gen/f32-gemminc-6x8-minmax-asm-aarch64-neonfma-cortex-a73.S deleted file mode 100644 index de125877589..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemminc/gen/f32-gemminc-6x8-minmax-asm-aarch64-neonfma-cortex-a73.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemminc/gen/f32-gemminc-6x8-minmax-asm-aarch64-neonfma-cortex-a75-prfm.S b/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemminc/gen/f32-gemminc-6x8-minmax-asm-aarch64-neonfma-cortex-a75-prfm.S deleted file mode 100644 index 867be55ac83..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemminc/gen/f32-gemminc-6x8-minmax-asm-aarch64-neonfma-cortex-a75-prfm.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. 
Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemminc/gen/f32-gemminc-6x8-minmax-asm-aarch64-neonfma-cortex-a75.S b/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemminc/gen/f32-gemminc-6x8-minmax-asm-aarch64-neonfma-cortex-a75.S deleted file mode 100644 index f60b45bbaab..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemminc/gen/f32-gemminc-6x8-minmax-asm-aarch64-neonfma-cortex-a75.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemminc/gen/f32-gemminc-6x8-minmax-asm-aarch64-neonfma-ld128.S b/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemminc/gen/f32-gemminc-6x8-minmax-asm-aarch64-neonfma-ld128.S deleted file mode 100644 index 43233bdc11f..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemminc/gen/f32-gemminc-6x8-minmax-asm-aarch64-neonfma-ld128.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemminc/gen/f32-gemminc-6x8-minmax-asm-aarch64-neonfma-ld64.S b/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemminc/gen/f32-gemminc-6x8-minmax-asm-aarch64-neonfma-ld64.S deleted file mode 100644 index 094058df77b..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f32-gemminc/gen/f32-gemminc-6x8-minmax-asm-aarch64-neonfma-ld64.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f32-igemm/f32-igemm-1x12-minmax-asm-aarch64-neonfma-cortex-a53.S b/backends/xnnpack/third-party/xnnpack_wrappers/f32-igemm/f32-igemm-1x12-minmax-asm-aarch64-neonfma-cortex-a53.S deleted file mode 100644 index b1dd5dabbff..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f32-igemm/f32-igemm-1x12-minmax-asm-aarch64-neonfma-cortex-a53.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f32-igemm/f32-igemm-4x12-minmax-asm-aarch64-neonfma-cortex-a53.S b/backends/xnnpack/third-party/xnnpack_wrappers/f32-igemm/f32-igemm-4x12-minmax-asm-aarch64-neonfma-cortex-a53.S deleted file mode 100644 index 7ebcc940b1d..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f32-igemm/f32-igemm-4x12-minmax-asm-aarch64-neonfma-cortex-a53.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f32-igemm/f32-igemm-4x8-minmax-asm-aarch32-neon-cortex-a55.S b/backends/xnnpack/third-party/xnnpack_wrappers/f32-igemm/f32-igemm-4x8-minmax-asm-aarch32-neon-cortex-a55.S deleted file mode 100644 index 781a31359af..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f32-igemm/f32-igemm-4x8-minmax-asm-aarch32-neon-cortex-a55.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. 
Do not modify */ - -#if defined(__arm__) -#include -#endif /* defined(__arm__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f32-igemm/f32-igemm-4x8-minmax-asm-aarch64-neonfma-cortex-a55.S b/backends/xnnpack/third-party/xnnpack_wrappers/f32-igemm/f32-igemm-4x8-minmax-asm-aarch64-neonfma-cortex-a55.S deleted file mode 100644 index 2e4e327a582..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f32-igemm/f32-igemm-4x8-minmax-asm-aarch64-neonfma-cortex-a55.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f32-igemm/f32-igemm-6x8-minmax-asm-aarch64-neonfma-cortex-a55.S b/backends/xnnpack/third-party/xnnpack_wrappers/f32-igemm/f32-igemm-6x8-minmax-asm-aarch64-neonfma-cortex-a55.S deleted file mode 100644 index 6bc9acfc2cc..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f32-igemm/f32-igemm-6x8-minmax-asm-aarch64-neonfma-cortex-a55.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f32-igemm/f32-igemm-6x8-minmax-asm-aarch64-neonfma-cortex-a73.S b/backends/xnnpack/third-party/xnnpack_wrappers/f32-igemm/f32-igemm-6x8-minmax-asm-aarch64-neonfma-cortex-a73.S deleted file mode 100644 index fdb26129135..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f32-igemm/f32-igemm-6x8-minmax-asm-aarch64-neonfma-cortex-a73.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f32-igemm/gen/f32-igemm-1x8-minmax-asm-aarch32-neon-cortex-a53-prfm.S b/backends/xnnpack/third-party/xnnpack_wrappers/f32-igemm/gen/f32-igemm-1x8-minmax-asm-aarch32-neon-cortex-a53-prfm.S deleted file mode 100644 index 2ccc6077be9..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f32-igemm/gen/f32-igemm-1x8-minmax-asm-aarch32-neon-cortex-a53-prfm.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__arm__) -#include -#endif /* defined(__arm__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f32-igemm/gen/f32-igemm-1x8-minmax-asm-aarch32-neon-cortex-a53.S b/backends/xnnpack/third-party/xnnpack_wrappers/f32-igemm/gen/f32-igemm-1x8-minmax-asm-aarch32-neon-cortex-a53.S deleted file mode 100644 index e1296f4fa66..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f32-igemm/gen/f32-igemm-1x8-minmax-asm-aarch32-neon-cortex-a53.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__arm__) -#include -#endif /* defined(__arm__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f32-igemm/gen/f32-igemm-1x8-minmax-asm-aarch64-neonfma-cortex-a53-prfm.S b/backends/xnnpack/third-party/xnnpack_wrappers/f32-igemm/gen/f32-igemm-1x8-minmax-asm-aarch64-neonfma-cortex-a53-prfm.S deleted file mode 100644 index 926e2c5962e..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f32-igemm/gen/f32-igemm-1x8-minmax-asm-aarch64-neonfma-cortex-a53-prfm.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. 
Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f32-igemm/gen/f32-igemm-1x8-minmax-asm-aarch64-neonfma-cortex-a53.S b/backends/xnnpack/third-party/xnnpack_wrappers/f32-igemm/gen/f32-igemm-1x8-minmax-asm-aarch64-neonfma-cortex-a53.S deleted file mode 100644 index 5c14362cca1..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f32-igemm/gen/f32-igemm-1x8-minmax-asm-aarch64-neonfma-cortex-a53.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f32-igemm/gen/f32-igemm-1x8-minmax-asm-aarch64-neonfma-cortex-a75-prfm.S b/backends/xnnpack/third-party/xnnpack_wrappers/f32-igemm/gen/f32-igemm-1x8-minmax-asm-aarch64-neonfma-cortex-a75-prfm.S deleted file mode 100644 index 19b7688938d..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f32-igemm/gen/f32-igemm-1x8-minmax-asm-aarch64-neonfma-cortex-a75-prfm.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f32-igemm/gen/f32-igemm-1x8-minmax-asm-aarch64-neonfma-cortex-a75.S b/backends/xnnpack/third-party/xnnpack_wrappers/f32-igemm/gen/f32-igemm-1x8-minmax-asm-aarch64-neonfma-cortex-a75.S deleted file mode 100644 index 6f07bf2b866..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f32-igemm/gen/f32-igemm-1x8-minmax-asm-aarch64-neonfma-cortex-a75.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f32-igemm/gen/f32-igemm-1x8-minmax-asm-aarch64-neonfma-ld64-prfm.S b/backends/xnnpack/third-party/xnnpack_wrappers/f32-igemm/gen/f32-igemm-1x8-minmax-asm-aarch64-neonfma-ld64-prfm.S deleted file mode 100644 index 79c47d005a8..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f32-igemm/gen/f32-igemm-1x8-minmax-asm-aarch64-neonfma-ld64-prfm.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f32-igemm/gen/f32-igemm-1x8-minmax-asm-aarch64-neonfma-ld64.S b/backends/xnnpack/third-party/xnnpack_wrappers/f32-igemm/gen/f32-igemm-1x8-minmax-asm-aarch64-neonfma-ld64.S deleted file mode 100644 index 10c61e3f92e..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f32-igemm/gen/f32-igemm-1x8-minmax-asm-aarch64-neonfma-ld64.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f32-igemm/gen/f32-igemm-4x2-minmax-asm-aarch64-neonfma-cortex-a75-prfm.S b/backends/xnnpack/third-party/xnnpack_wrappers/f32-igemm/gen/f32-igemm-4x2-minmax-asm-aarch64-neonfma-cortex-a75-prfm.S deleted file mode 100644 index 8f10cf81d53..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f32-igemm/gen/f32-igemm-4x2-minmax-asm-aarch64-neonfma-cortex-a75-prfm.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. 
Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f32-igemm/gen/f32-igemm-4x2-minmax-asm-aarch64-neonfma-cortex-a75.S b/backends/xnnpack/third-party/xnnpack_wrappers/f32-igemm/gen/f32-igemm-4x2-minmax-asm-aarch64-neonfma-cortex-a75.S deleted file mode 100644 index ae1d5baa129..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f32-igemm/gen/f32-igemm-4x2-minmax-asm-aarch64-neonfma-cortex-a75.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f32-igemm/gen/f32-igemm-4x2-minmax-asm-aarch64-neonfma-ld64.S b/backends/xnnpack/third-party/xnnpack_wrappers/f32-igemm/gen/f32-igemm-4x2-minmax-asm-aarch64-neonfma-ld64.S deleted file mode 100644 index 8e50ad50d24..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f32-igemm/gen/f32-igemm-4x2-minmax-asm-aarch64-neonfma-ld64.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f32-igemm/gen/f32-igemm-4x8-minmax-asm-aarch32-neon-cortex-a53-prfm.S b/backends/xnnpack/third-party/xnnpack_wrappers/f32-igemm/gen/f32-igemm-4x8-minmax-asm-aarch32-neon-cortex-a53-prfm.S deleted file mode 100644 index 0d5fb1a8a65..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f32-igemm/gen/f32-igemm-4x8-minmax-asm-aarch32-neon-cortex-a53-prfm.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__arm__) -#include -#endif /* defined(__arm__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f32-igemm/gen/f32-igemm-4x8-minmax-asm-aarch32-neon-cortex-a53.S b/backends/xnnpack/third-party/xnnpack_wrappers/f32-igemm/gen/f32-igemm-4x8-minmax-asm-aarch32-neon-cortex-a53.S deleted file mode 100644 index 3366bcb2175..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f32-igemm/gen/f32-igemm-4x8-minmax-asm-aarch32-neon-cortex-a53.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__arm__) -#include -#endif /* defined(__arm__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f32-igemm/gen/f32-igemm-4x8-minmax-asm-aarch32-neon-cortex-a7.S b/backends/xnnpack/third-party/xnnpack_wrappers/f32-igemm/gen/f32-igemm-4x8-minmax-asm-aarch32-neon-cortex-a7.S deleted file mode 100644 index 50225989eb9..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f32-igemm/gen/f32-igemm-4x8-minmax-asm-aarch32-neon-cortex-a7.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__arm__) -#include -#endif /* defined(__arm__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f32-igemm/gen/f32-igemm-4x8-minmax-asm-aarch32-neon-cortex-a75-prfm.S b/backends/xnnpack/third-party/xnnpack_wrappers/f32-igemm/gen/f32-igemm-4x8-minmax-asm-aarch32-neon-cortex-a75-prfm.S deleted file mode 100644 index de5eff0c270..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f32-igemm/gen/f32-igemm-4x8-minmax-asm-aarch32-neon-cortex-a75-prfm.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. 
Do not modify */ - -#if defined(__arm__) -#include -#endif /* defined(__arm__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f32-igemm/gen/f32-igemm-4x8-minmax-asm-aarch32-neon-cortex-a75.S b/backends/xnnpack/third-party/xnnpack_wrappers/f32-igemm/gen/f32-igemm-4x8-minmax-asm-aarch32-neon-cortex-a75.S deleted file mode 100644 index 10c7df4fda1..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f32-igemm/gen/f32-igemm-4x8-minmax-asm-aarch32-neon-cortex-a75.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__arm__) -#include -#endif /* defined(__arm__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f32-igemm/gen/f32-igemm-4x8-minmax-asm-aarch32-neon-ld64.S b/backends/xnnpack/third-party/xnnpack_wrappers/f32-igemm/gen/f32-igemm-4x8-minmax-asm-aarch32-neon-ld64.S deleted file mode 100644 index 5c7d702890f..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f32-igemm/gen/f32-igemm-4x8-minmax-asm-aarch32-neon-ld64.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__arm__) -#include -#endif /* defined(__arm__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f32-igemm/gen/f32-igemm-4x8-minmax-asm-aarch64-neonfma-cortex-a53-prfm.S b/backends/xnnpack/third-party/xnnpack_wrappers/f32-igemm/gen/f32-igemm-4x8-minmax-asm-aarch64-neonfma-cortex-a53-prfm.S deleted file mode 100644 index 65280f453b3..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f32-igemm/gen/f32-igemm-4x8-minmax-asm-aarch64-neonfma-cortex-a53-prfm.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f32-igemm/gen/f32-igemm-4x8-minmax-asm-aarch64-neonfma-cortex-a53.S b/backends/xnnpack/third-party/xnnpack_wrappers/f32-igemm/gen/f32-igemm-4x8-minmax-asm-aarch64-neonfma-cortex-a53.S deleted file mode 100644 index fce127460ed..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f32-igemm/gen/f32-igemm-4x8-minmax-asm-aarch64-neonfma-cortex-a53.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f32-igemm/gen/f32-igemm-4x8-minmax-asm-aarch64-neonfma-cortex-a75-prfm.S b/backends/xnnpack/third-party/xnnpack_wrappers/f32-igemm/gen/f32-igemm-4x8-minmax-asm-aarch64-neonfma-cortex-a75-prfm.S deleted file mode 100644 index 5e6f67e247e..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f32-igemm/gen/f32-igemm-4x8-minmax-asm-aarch64-neonfma-cortex-a75-prfm.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f32-igemm/gen/f32-igemm-4x8-minmax-asm-aarch64-neonfma-cortex-a75.S b/backends/xnnpack/third-party/xnnpack_wrappers/f32-igemm/gen/f32-igemm-4x8-minmax-asm-aarch64-neonfma-cortex-a75.S deleted file mode 100644 index 47455b7895e..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f32-igemm/gen/f32-igemm-4x8-minmax-asm-aarch64-neonfma-cortex-a75.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. 
Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f32-igemm/gen/f32-igemm-4x8-minmax-asm-aarch64-neonfma-ld128.S b/backends/xnnpack/third-party/xnnpack_wrappers/f32-igemm/gen/f32-igemm-4x8-minmax-asm-aarch64-neonfma-ld128.S deleted file mode 100644 index b78c11f8cd6..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f32-igemm/gen/f32-igemm-4x8-minmax-asm-aarch64-neonfma-ld128.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f32-igemm/gen/f32-igemm-4x8-minmax-asm-aarch64-neonfma-ld64.S b/backends/xnnpack/third-party/xnnpack_wrappers/f32-igemm/gen/f32-igemm-4x8-minmax-asm-aarch64-neonfma-ld64.S deleted file mode 100644 index e7b9e96eb7b..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f32-igemm/gen/f32-igemm-4x8-minmax-asm-aarch64-neonfma-ld64.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f32-igemm/gen/f32-igemm-5x8-minmax-asm-aarch64-neonfma-cortex-a75-prfm.S b/backends/xnnpack/third-party/xnnpack_wrappers/f32-igemm/gen/f32-igemm-5x8-minmax-asm-aarch64-neonfma-cortex-a75-prfm.S deleted file mode 100644 index e534fa0cd55..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f32-igemm/gen/f32-igemm-5x8-minmax-asm-aarch64-neonfma-cortex-a75-prfm.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f32-igemm/gen/f32-igemm-5x8-minmax-asm-aarch64-neonfma-cortex-a75.S b/backends/xnnpack/third-party/xnnpack_wrappers/f32-igemm/gen/f32-igemm-5x8-minmax-asm-aarch64-neonfma-cortex-a75.S deleted file mode 100644 index 3973dd8ee34..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f32-igemm/gen/f32-igemm-5x8-minmax-asm-aarch64-neonfma-cortex-a75.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f32-igemm/gen/f32-igemm-6x8-minmax-asm-aarch64-neonfma-cortex-a53-prfm.S b/backends/xnnpack/third-party/xnnpack_wrappers/f32-igemm/gen/f32-igemm-6x8-minmax-asm-aarch64-neonfma-cortex-a53-prfm.S deleted file mode 100644 index 26ebf4d45c7..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f32-igemm/gen/f32-igemm-6x8-minmax-asm-aarch64-neonfma-cortex-a53-prfm.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f32-igemm/gen/f32-igemm-6x8-minmax-asm-aarch64-neonfma-cortex-a53.S b/backends/xnnpack/third-party/xnnpack_wrappers/f32-igemm/gen/f32-igemm-6x8-minmax-asm-aarch64-neonfma-cortex-a53.S deleted file mode 100644 index ac0a0d4a8b2..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f32-igemm/gen/f32-igemm-6x8-minmax-asm-aarch64-neonfma-cortex-a53.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. 
Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f32-igemm/gen/f32-igemm-6x8-minmax-asm-aarch64-neonfma-cortex-a75-prfm.S b/backends/xnnpack/third-party/xnnpack_wrappers/f32-igemm/gen/f32-igemm-6x8-minmax-asm-aarch64-neonfma-cortex-a75-prfm.S deleted file mode 100644 index 868331972dc..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f32-igemm/gen/f32-igemm-6x8-minmax-asm-aarch64-neonfma-cortex-a75-prfm.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f32-igemm/gen/f32-igemm-6x8-minmax-asm-aarch64-neonfma-cortex-a75.S b/backends/xnnpack/third-party/xnnpack_wrappers/f32-igemm/gen/f32-igemm-6x8-minmax-asm-aarch64-neonfma-cortex-a75.S deleted file mode 100644 index 5bee40c554a..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f32-igemm/gen/f32-igemm-6x8-minmax-asm-aarch64-neonfma-cortex-a75.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f32-igemm/gen/f32-igemm-6x8-minmax-asm-aarch64-neonfma-ld128.S b/backends/xnnpack/third-party/xnnpack_wrappers/f32-igemm/gen/f32-igemm-6x8-minmax-asm-aarch64-neonfma-ld128.S deleted file mode 100644 index 02db4ab458c..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f32-igemm/gen/f32-igemm-6x8-minmax-asm-aarch64-neonfma-ld128.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f32-igemm/gen/f32-igemm-6x8-minmax-asm-aarch64-neonfma-ld64.S b/backends/xnnpack/third-party/xnnpack_wrappers/f32-igemm/gen/f32-igemm-6x8-minmax-asm-aarch64-neonfma-ld64.S deleted file mode 100644 index 63fce157702..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f32-igemm/gen/f32-igemm-6x8-minmax-asm-aarch64-neonfma-ld64.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f32-ppmm/gen/f32-ppmm-4x8-minmax-asm-aarch64-neonfma-cortex-a75-prfm.S b/backends/xnnpack/third-party/xnnpack_wrappers/f32-ppmm/gen/f32-ppmm-4x8-minmax-asm-aarch64-neonfma-cortex-a75-prfm.S deleted file mode 100644 index 92b185246e6..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f32-ppmm/gen/f32-ppmm-4x8-minmax-asm-aarch64-neonfma-cortex-a75-prfm.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f32-ppmm/gen/f32-ppmm-4x8-minmax-asm-aarch64-neonfma-cortex-a75.S b/backends/xnnpack/third-party/xnnpack_wrappers/f32-ppmm/gen/f32-ppmm-4x8-minmax-asm-aarch64-neonfma-cortex-a75.S deleted file mode 100644 index 71869488aab..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f32-ppmm/gen/f32-ppmm-4x8-minmax-asm-aarch64-neonfma-cortex-a75.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. 
Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f32-ppmm/gen/f32-ppmm-4x8-minmax-asm-aarch64-neonfma-ld128-prfm.S b/backends/xnnpack/third-party/xnnpack_wrappers/f32-ppmm/gen/f32-ppmm-4x8-minmax-asm-aarch64-neonfma-ld128-prfm.S deleted file mode 100644 index aae4691631b..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f32-ppmm/gen/f32-ppmm-4x8-minmax-asm-aarch64-neonfma-ld128-prfm.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f32-ppmm/gen/f32-ppmm-4x8-minmax-asm-aarch64-neonfma-ld128.S b/backends/xnnpack/third-party/xnnpack_wrappers/f32-ppmm/gen/f32-ppmm-4x8-minmax-asm-aarch64-neonfma-ld128.S deleted file mode 100644 index 87564cbee60..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f32-ppmm/gen/f32-ppmm-4x8-minmax-asm-aarch64-neonfma-ld128.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f32-ppmm/gen/f32-ppmm-8x8-minmax-asm-aarch64-neonfma-cortex-a75-prfm.S b/backends/xnnpack/third-party/xnnpack_wrappers/f32-ppmm/gen/f32-ppmm-8x8-minmax-asm-aarch64-neonfma-cortex-a75-prfm.S deleted file mode 100644 index 137db6650d1..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f32-ppmm/gen/f32-ppmm-8x8-minmax-asm-aarch64-neonfma-cortex-a75-prfm.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f32-ppmm/gen/f32-ppmm-8x8-minmax-asm-aarch64-neonfma-cortex-a75.S b/backends/xnnpack/third-party/xnnpack_wrappers/f32-ppmm/gen/f32-ppmm-8x8-minmax-asm-aarch64-neonfma-cortex-a75.S deleted file mode 100644 index 39965b0623b..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f32-ppmm/gen/f32-ppmm-8x8-minmax-asm-aarch64-neonfma-cortex-a75.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f32-ppmm/gen/f32-ppmm-8x8-minmax-asm-aarch64-neonfma-ld128-prfm.S b/backends/xnnpack/third-party/xnnpack_wrappers/f32-ppmm/gen/f32-ppmm-8x8-minmax-asm-aarch64-neonfma-ld128-prfm.S deleted file mode 100644 index 3011083c3c2..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f32-ppmm/gen/f32-ppmm-8x8-minmax-asm-aarch64-neonfma-ld128-prfm.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f32-ppmm/gen/f32-ppmm-8x8-minmax-asm-aarch64-neonfma-ld128.S b/backends/xnnpack/third-party/xnnpack_wrappers/f32-ppmm/gen/f32-ppmm-8x8-minmax-asm-aarch64-neonfma-ld128.S deleted file mode 100644 index f58a249267a..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f32-ppmm/gen/f32-ppmm-8x8-minmax-asm-aarch64-neonfma-ld128.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. 
Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f32-qc4w-gemm/gen/f32-qc4w-gemm-1x8-minmax-asm-aarch64-neon-ld128-acc2-prfm.S b/backends/xnnpack/third-party/xnnpack_wrappers/f32-qc4w-gemm/gen/f32-qc4w-gemm-1x8-minmax-asm-aarch64-neon-ld128-acc2-prfm.S deleted file mode 100644 index 523da4bf02e..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f32-qc4w-gemm/gen/f32-qc4w-gemm-1x8-minmax-asm-aarch64-neon-ld128-acc2-prfm.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f32-qc4w-gemm/gen/f32-qc4w-gemm-1x8-minmax-asm-aarch64-neon-ld128-acc2.S b/backends/xnnpack/third-party/xnnpack_wrappers/f32-qc4w-gemm/gen/f32-qc4w-gemm-1x8-minmax-asm-aarch64-neon-ld128-acc2.S deleted file mode 100644 index e6d4c1c4820..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f32-qc4w-gemm/gen/f32-qc4w-gemm-1x8-minmax-asm-aarch64-neon-ld128-acc2.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f32-qc4w-gemm/gen/f32-qc4w-gemm-1x8-minmax-asm-aarch64-neonfma-ld128-acc2-prfm.S b/backends/xnnpack/third-party/xnnpack_wrappers/f32-qc4w-gemm/gen/f32-qc4w-gemm-1x8-minmax-asm-aarch64-neonfma-ld128-acc2-prfm.S deleted file mode 100644 index b83bd087c9e..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f32-qc4w-gemm/gen/f32-qc4w-gemm-1x8-minmax-asm-aarch64-neonfma-ld128-acc2-prfm.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f32-qc4w-gemm/gen/f32-qc4w-gemm-1x8-minmax-asm-aarch64-neonfma-ld128-acc2.S b/backends/xnnpack/third-party/xnnpack_wrappers/f32-qc4w-gemm/gen/f32-qc4w-gemm-1x8-minmax-asm-aarch64-neonfma-ld128-acc2.S deleted file mode 100644 index bd234e39037..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f32-qc4w-gemm/gen/f32-qc4w-gemm-1x8-minmax-asm-aarch64-neonfma-ld128-acc2.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f32-qc4w-gemm/gen/f32-qc4w-gemm-1x8-minmax-asm-aarch64-neonfma-ld128-acc4-prfm.S b/backends/xnnpack/third-party/xnnpack_wrappers/f32-qc4w-gemm/gen/f32-qc4w-gemm-1x8-minmax-asm-aarch64-neonfma-ld128-acc4-prfm.S deleted file mode 100644 index 22611bc5c82..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f32-qc4w-gemm/gen/f32-qc4w-gemm-1x8-minmax-asm-aarch64-neonfma-ld128-acc4-prfm.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. 
Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f32-qc4w-gemm/gen/f32-qc4w-gemm-1x8-minmax-asm-aarch64-neonfma-ld128-acc4.S b/backends/xnnpack/third-party/xnnpack_wrappers/f32-qc4w-gemm/gen/f32-qc4w-gemm-1x8-minmax-asm-aarch64-neonfma-ld128-acc4.S deleted file mode 100644 index 44723618fa4..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f32-qc4w-gemm/gen/f32-qc4w-gemm-1x8-minmax-asm-aarch64-neonfma-ld128-acc4.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f32-qc4w-gemm/gen/f32-qc4w-gemm-1x8-minmax-asm-aarch64-neonfma-ld128-prfm.S b/backends/xnnpack/third-party/xnnpack_wrappers/f32-qc4w-gemm/gen/f32-qc4w-gemm-1x8-minmax-asm-aarch64-neonfma-ld128-prfm.S deleted file mode 100644 index 432562c6eca..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f32-qc4w-gemm/gen/f32-qc4w-gemm-1x8-minmax-asm-aarch64-neonfma-ld128-prfm.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f32-qc4w-gemm/gen/f32-qc4w-gemm-1x8-minmax-asm-aarch64-neonfma-ld128.S b/backends/xnnpack/third-party/xnnpack_wrappers/f32-qc4w-gemm/gen/f32-qc4w-gemm-1x8-minmax-asm-aarch64-neonfma-ld128.S deleted file mode 100644 index 0712a1572f8..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f32-qc4w-gemm/gen/f32-qc4w-gemm-1x8-minmax-asm-aarch64-neonfma-ld128.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f32-qc4w-gemm/gen/f32-qc4w-gemm-1x8-minmax-asm-aarch64-neonfma-ld64-acc2-prfm.S b/backends/xnnpack/third-party/xnnpack_wrappers/f32-qc4w-gemm/gen/f32-qc4w-gemm-1x8-minmax-asm-aarch64-neonfma-ld64-acc2-prfm.S deleted file mode 100644 index f131c2e475a..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f32-qc4w-gemm/gen/f32-qc4w-gemm-1x8-minmax-asm-aarch64-neonfma-ld64-acc2-prfm.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f32-qc4w-gemm/gen/f32-qc4w-gemm-1x8-minmax-asm-aarch64-neonfma-ld64-acc2.S b/backends/xnnpack/third-party/xnnpack_wrappers/f32-qc4w-gemm/gen/f32-qc4w-gemm-1x8-minmax-asm-aarch64-neonfma-ld64-acc2.S deleted file mode 100644 index c42e3043e44..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f32-qc4w-gemm/gen/f32-qc4w-gemm-1x8-minmax-asm-aarch64-neonfma-ld64-acc2.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. 
Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f32-qc4w-gemm/gen/f32-qc4w-gemm-1x8-minmax-asm-aarch64-neonfma-ld64-acc4-prfm.S b/backends/xnnpack/third-party/xnnpack_wrappers/f32-qc4w-gemm/gen/f32-qc4w-gemm-1x8-minmax-asm-aarch64-neonfma-ld64-acc4-prfm.S deleted file mode 100644 index d5a1ea9458e..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f32-qc4w-gemm/gen/f32-qc4w-gemm-1x8-minmax-asm-aarch64-neonfma-ld64-acc4-prfm.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f32-qc4w-gemm/gen/f32-qc4w-gemm-1x8-minmax-asm-aarch64-neonfma-ld64-acc4.S b/backends/xnnpack/third-party/xnnpack_wrappers/f32-qc4w-gemm/gen/f32-qc4w-gemm-1x8-minmax-asm-aarch64-neonfma-ld64-acc4.S deleted file mode 100644 index 4047124ca93..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f32-qc4w-gemm/gen/f32-qc4w-gemm-1x8-minmax-asm-aarch64-neonfma-ld64-acc4.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f32-qc4w-gemm/gen/f32-qc4w-gemm-1x8-minmax-asm-aarch64-neonfma-ld64-prfm.S b/backends/xnnpack/third-party/xnnpack_wrappers/f32-qc4w-gemm/gen/f32-qc4w-gemm-1x8-minmax-asm-aarch64-neonfma-ld64-prfm.S deleted file mode 100644 index e79abd8b235..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f32-qc4w-gemm/gen/f32-qc4w-gemm-1x8-minmax-asm-aarch64-neonfma-ld64-prfm.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f32-qc4w-gemm/gen/f32-qc4w-gemm-1x8-minmax-asm-aarch64-neonfma-ld64.S b/backends/xnnpack/third-party/xnnpack_wrappers/f32-qc4w-gemm/gen/f32-qc4w-gemm-1x8-minmax-asm-aarch64-neonfma-ld64.S deleted file mode 100644 index 9a9f008918c..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f32-qc4w-gemm/gen/f32-qc4w-gemm-1x8-minmax-asm-aarch64-neonfma-ld64.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f32-qc4w-gemm/gen/f32-qc4w-gemm-4x1-minmax-asm-aarch64-neonfma-ld128.S b/backends/xnnpack/third-party/xnnpack_wrappers/f32-qc4w-gemm/gen/f32-qc4w-gemm-4x1-minmax-asm-aarch64-neonfma-ld128.S deleted file mode 100644 index ead4a4ce719..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f32-qc4w-gemm/gen/f32-qc4w-gemm-4x1-minmax-asm-aarch64-neonfma-ld128.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. 
Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f32-qc4w-gemm/gen/f32-qc4w-gemm-4x1-minmax-asm-aarch64-neonfma-ld64.S b/backends/xnnpack/third-party/xnnpack_wrappers/f32-qc4w-gemm/gen/f32-qc4w-gemm-4x1-minmax-asm-aarch64-neonfma-ld64.S deleted file mode 100644 index 34b9fd8f257..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f32-qc4w-gemm/gen/f32-qc4w-gemm-4x1-minmax-asm-aarch64-neonfma-ld64.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f32-qc4w-gemm/gen/f32-qc4w-gemm-4x2-minmax-asm-aarch64-neonfma-ld128.S b/backends/xnnpack/third-party/xnnpack_wrappers/f32-qc4w-gemm/gen/f32-qc4w-gemm-4x2-minmax-asm-aarch64-neonfma-ld128.S deleted file mode 100644 index a3d8f65c3de..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f32-qc4w-gemm/gen/f32-qc4w-gemm-4x2-minmax-asm-aarch64-neonfma-ld128.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f32-qc4w-gemm/gen/f32-qc4w-gemm-4x2-minmax-asm-aarch64-neonfma-ld64.S b/backends/xnnpack/third-party/xnnpack_wrappers/f32-qc4w-gemm/gen/f32-qc4w-gemm-4x2-minmax-asm-aarch64-neonfma-ld64.S deleted file mode 100644 index 30188e9bad6..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f32-qc4w-gemm/gen/f32-qc4w-gemm-4x2-minmax-asm-aarch64-neonfma-ld64.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f32-qc4w-gemm/gen/f32-qc4w-gemm-4x8-minmax-asm-aarch64-neonfma-ld128.S b/backends/xnnpack/third-party/xnnpack_wrappers/f32-qc4w-gemm/gen/f32-qc4w-gemm-4x8-minmax-asm-aarch64-neonfma-ld128.S deleted file mode 100644 index b2cd147a5c0..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f32-qc4w-gemm/gen/f32-qc4w-gemm-4x8-minmax-asm-aarch64-neonfma-ld128.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f32-qc4w-gemm/gen/f32-qc4w-gemm-4x8-minmax-asm-aarch64-neonfma-ld64.S b/backends/xnnpack/third-party/xnnpack_wrappers/f32-qc4w-gemm/gen/f32-qc4w-gemm-4x8-minmax-asm-aarch64-neonfma-ld64.S deleted file mode 100644 index 70ff2774dfa..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f32-qc4w-gemm/gen/f32-qc4w-gemm-4x8-minmax-asm-aarch64-neonfma-ld64.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. 
Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f32-qc4w-gemm/gen/f32-qc4w-gemm-6x8-minmax-asm-aarch64-neonfma-ld128.S b/backends/xnnpack/third-party/xnnpack_wrappers/f32-qc4w-gemm/gen/f32-qc4w-gemm-6x8-minmax-asm-aarch64-neonfma-ld128.S deleted file mode 100644 index 5135a4d0800..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f32-qc4w-gemm/gen/f32-qc4w-gemm-6x8-minmax-asm-aarch64-neonfma-ld128.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f32-qc4w-gemm/gen/f32-qc4w-gemm-6x8-minmax-asm-aarch64-neonfma-ld64.S b/backends/xnnpack/third-party/xnnpack_wrappers/f32-qc4w-gemm/gen/f32-qc4w-gemm-6x8-minmax-asm-aarch64-neonfma-ld64.S deleted file mode 100644 index df6d6ebdb17..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f32-qc4w-gemm/gen/f32-qc4w-gemm-6x8-minmax-asm-aarch64-neonfma-ld64.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f32-qc8w-gemm/gen/f32-qc8w-gemm-1x8-minmax-asm-aarch64-neon-ld128-acc2-prfm.S b/backends/xnnpack/third-party/xnnpack_wrappers/f32-qc8w-gemm/gen/f32-qc8w-gemm-1x8-minmax-asm-aarch64-neon-ld128-acc2-prfm.S deleted file mode 100644 index f1ccf939009..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f32-qc8w-gemm/gen/f32-qc8w-gemm-1x8-minmax-asm-aarch64-neon-ld128-acc2-prfm.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f32-qc8w-gemm/gen/f32-qc8w-gemm-1x8-minmax-asm-aarch64-neon-ld128-acc2.S b/backends/xnnpack/third-party/xnnpack_wrappers/f32-qc8w-gemm/gen/f32-qc8w-gemm-1x8-minmax-asm-aarch64-neon-ld128-acc2.S deleted file mode 100644 index 19faddd9e92..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f32-qc8w-gemm/gen/f32-qc8w-gemm-1x8-minmax-asm-aarch64-neon-ld128-acc2.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f32-qc8w-gemm/gen/f32-qc8w-gemm-1x8-minmax-asm-aarch64-neonfma-ld128-acc2-prfm.S b/backends/xnnpack/third-party/xnnpack_wrappers/f32-qc8w-gemm/gen/f32-qc8w-gemm-1x8-minmax-asm-aarch64-neonfma-ld128-acc2-prfm.S deleted file mode 100644 index 78592a160f0..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f32-qc8w-gemm/gen/f32-qc8w-gemm-1x8-minmax-asm-aarch64-neonfma-ld128-acc2-prfm.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. 
Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f32-qc8w-gemm/gen/f32-qc8w-gemm-1x8-minmax-asm-aarch64-neonfma-ld128-acc2.S b/backends/xnnpack/third-party/xnnpack_wrappers/f32-qc8w-gemm/gen/f32-qc8w-gemm-1x8-minmax-asm-aarch64-neonfma-ld128-acc2.S deleted file mode 100644 index 47b886941b1..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f32-qc8w-gemm/gen/f32-qc8w-gemm-1x8-minmax-asm-aarch64-neonfma-ld128-acc2.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f32-qc8w-gemm/gen/f32-qc8w-gemm-1x8-minmax-asm-aarch64-neonfma-ld128-acc4-prfm.S b/backends/xnnpack/third-party/xnnpack_wrappers/f32-qc8w-gemm/gen/f32-qc8w-gemm-1x8-minmax-asm-aarch64-neonfma-ld128-acc4-prfm.S deleted file mode 100644 index df69d4495c7..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f32-qc8w-gemm/gen/f32-qc8w-gemm-1x8-minmax-asm-aarch64-neonfma-ld128-acc4-prfm.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f32-qc8w-gemm/gen/f32-qc8w-gemm-1x8-minmax-asm-aarch64-neonfma-ld128-acc4.S b/backends/xnnpack/third-party/xnnpack_wrappers/f32-qc8w-gemm/gen/f32-qc8w-gemm-1x8-minmax-asm-aarch64-neonfma-ld128-acc4.S deleted file mode 100644 index b80099942c8..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f32-qc8w-gemm/gen/f32-qc8w-gemm-1x8-minmax-asm-aarch64-neonfma-ld128-acc4.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f32-qc8w-gemm/gen/f32-qc8w-gemm-1x8-minmax-asm-aarch64-neonfma-ld128-prfm.S b/backends/xnnpack/third-party/xnnpack_wrappers/f32-qc8w-gemm/gen/f32-qc8w-gemm-1x8-minmax-asm-aarch64-neonfma-ld128-prfm.S deleted file mode 100644 index 78aec3df25d..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f32-qc8w-gemm/gen/f32-qc8w-gemm-1x8-minmax-asm-aarch64-neonfma-ld128-prfm.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f32-qc8w-gemm/gen/f32-qc8w-gemm-1x8-minmax-asm-aarch64-neonfma-ld128.S b/backends/xnnpack/third-party/xnnpack_wrappers/f32-qc8w-gemm/gen/f32-qc8w-gemm-1x8-minmax-asm-aarch64-neonfma-ld128.S deleted file mode 100644 index 09d17d8ea41..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f32-qc8w-gemm/gen/f32-qc8w-gemm-1x8-minmax-asm-aarch64-neonfma-ld128.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. 
Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f32-qc8w-gemm/gen/f32-qc8w-gemm-1x8-minmax-asm-aarch64-neonfma-ld64-acc2-prfm.S b/backends/xnnpack/third-party/xnnpack_wrappers/f32-qc8w-gemm/gen/f32-qc8w-gemm-1x8-minmax-asm-aarch64-neonfma-ld64-acc2-prfm.S deleted file mode 100644 index 53ef3b2eeaf..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f32-qc8w-gemm/gen/f32-qc8w-gemm-1x8-minmax-asm-aarch64-neonfma-ld64-acc2-prfm.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f32-qc8w-gemm/gen/f32-qc8w-gemm-1x8-minmax-asm-aarch64-neonfma-ld64-acc2.S b/backends/xnnpack/third-party/xnnpack_wrappers/f32-qc8w-gemm/gen/f32-qc8w-gemm-1x8-minmax-asm-aarch64-neonfma-ld64-acc2.S deleted file mode 100644 index 58390f9af09..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f32-qc8w-gemm/gen/f32-qc8w-gemm-1x8-minmax-asm-aarch64-neonfma-ld64-acc2.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f32-qc8w-gemm/gen/f32-qc8w-gemm-1x8-minmax-asm-aarch64-neonfma-ld64-acc4-prfm.S b/backends/xnnpack/third-party/xnnpack_wrappers/f32-qc8w-gemm/gen/f32-qc8w-gemm-1x8-minmax-asm-aarch64-neonfma-ld64-acc4-prfm.S deleted file mode 100644 index e9a3a40bb95..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f32-qc8w-gemm/gen/f32-qc8w-gemm-1x8-minmax-asm-aarch64-neonfma-ld64-acc4-prfm.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f32-qc8w-gemm/gen/f32-qc8w-gemm-1x8-minmax-asm-aarch64-neonfma-ld64-acc4.S b/backends/xnnpack/third-party/xnnpack_wrappers/f32-qc8w-gemm/gen/f32-qc8w-gemm-1x8-minmax-asm-aarch64-neonfma-ld64-acc4.S deleted file mode 100644 index dd037c6afee..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f32-qc8w-gemm/gen/f32-qc8w-gemm-1x8-minmax-asm-aarch64-neonfma-ld64-acc4.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f32-qc8w-gemm/gen/f32-qc8w-gemm-1x8-minmax-asm-aarch64-neonfma-ld64-prfm.S b/backends/xnnpack/third-party/xnnpack_wrappers/f32-qc8w-gemm/gen/f32-qc8w-gemm-1x8-minmax-asm-aarch64-neonfma-ld64-prfm.S deleted file mode 100644 index e3a2856e563..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f32-qc8w-gemm/gen/f32-qc8w-gemm-1x8-minmax-asm-aarch64-neonfma-ld64-prfm.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. 
Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f32-qc8w-gemm/gen/f32-qc8w-gemm-1x8-minmax-asm-aarch64-neonfma-ld64.S b/backends/xnnpack/third-party/xnnpack_wrappers/f32-qc8w-gemm/gen/f32-qc8w-gemm-1x8-minmax-asm-aarch64-neonfma-ld64.S deleted file mode 100644 index ac70ffb0295..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f32-qc8w-gemm/gen/f32-qc8w-gemm-1x8-minmax-asm-aarch64-neonfma-ld64.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f32-qc8w-gemm/gen/f32-qc8w-gemm-4x1-minmax-asm-aarch64-neonfma-ld128.S b/backends/xnnpack/third-party/xnnpack_wrappers/f32-qc8w-gemm/gen/f32-qc8w-gemm-4x1-minmax-asm-aarch64-neonfma-ld128.S deleted file mode 100644 index c52094d4b38..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f32-qc8w-gemm/gen/f32-qc8w-gemm-4x1-minmax-asm-aarch64-neonfma-ld128.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f32-qc8w-gemm/gen/f32-qc8w-gemm-4x1-minmax-asm-aarch64-neonfma-ld64.S b/backends/xnnpack/third-party/xnnpack_wrappers/f32-qc8w-gemm/gen/f32-qc8w-gemm-4x1-minmax-asm-aarch64-neonfma-ld64.S deleted file mode 100644 index 8e924415842..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f32-qc8w-gemm/gen/f32-qc8w-gemm-4x1-minmax-asm-aarch64-neonfma-ld64.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f32-qc8w-gemm/gen/f32-qc8w-gemm-4x2-minmax-asm-aarch64-neonfma-ld128.S b/backends/xnnpack/third-party/xnnpack_wrappers/f32-qc8w-gemm/gen/f32-qc8w-gemm-4x2-minmax-asm-aarch64-neonfma-ld128.S deleted file mode 100644 index ca932d3eac6..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f32-qc8w-gemm/gen/f32-qc8w-gemm-4x2-minmax-asm-aarch64-neonfma-ld128.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f32-qc8w-gemm/gen/f32-qc8w-gemm-4x2-minmax-asm-aarch64-neonfma-ld64.S b/backends/xnnpack/third-party/xnnpack_wrappers/f32-qc8w-gemm/gen/f32-qc8w-gemm-4x2-minmax-asm-aarch64-neonfma-ld64.S deleted file mode 100644 index 4d7a94d2045..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f32-qc8w-gemm/gen/f32-qc8w-gemm-4x2-minmax-asm-aarch64-neonfma-ld64.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. 
Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f32-qc8w-gemm/gen/f32-qc8w-gemm-4x8-minmax-asm-aarch64-neonfma-ld128.S b/backends/xnnpack/third-party/xnnpack_wrappers/f32-qc8w-gemm/gen/f32-qc8w-gemm-4x8-minmax-asm-aarch64-neonfma-ld128.S deleted file mode 100644 index d48164a9657..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f32-qc8w-gemm/gen/f32-qc8w-gemm-4x8-minmax-asm-aarch64-neonfma-ld128.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f32-qc8w-gemm/gen/f32-qc8w-gemm-4x8-minmax-asm-aarch64-neonfma-ld64.S b/backends/xnnpack/third-party/xnnpack_wrappers/f32-qc8w-gemm/gen/f32-qc8w-gemm-4x8-minmax-asm-aarch64-neonfma-ld64.S deleted file mode 100644 index acd6b23bf3a..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f32-qc8w-gemm/gen/f32-qc8w-gemm-4x8-minmax-asm-aarch64-neonfma-ld64.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f32-qc8w-gemm/gen/f32-qc8w-gemm-6x8-minmax-asm-aarch64-neonfma-ld128.S b/backends/xnnpack/third-party/xnnpack_wrappers/f32-qc8w-gemm/gen/f32-qc8w-gemm-6x8-minmax-asm-aarch64-neonfma-ld128.S deleted file mode 100644 index 3d5ee462034..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f32-qc8w-gemm/gen/f32-qc8w-gemm-6x8-minmax-asm-aarch64-neonfma-ld128.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/f32-qc8w-gemm/gen/f32-qc8w-gemm-6x8-minmax-asm-aarch64-neonfma-ld64.S b/backends/xnnpack/third-party/xnnpack_wrappers/f32-qc8w-gemm/gen/f32-qc8w-gemm-6x8-minmax-asm-aarch64-neonfma-ld64.S deleted file mode 100644 index d6e3e493055..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/f32-qc8w-gemm/gen/f32-qc8w-gemm-6x8-minmax-asm-aarch64-neonfma-ld64.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/qd8-f16-qc8w-gemm/gen/qd8-f16-qc8w-gemm-4x16c4-minmax-asm-aarch64-neondot-ld128.S b/backends/xnnpack/third-party/xnnpack_wrappers/qd8-f16-qc8w-gemm/gen/qd8-f16-qc8w-gemm-4x16c4-minmax-asm-aarch64-neondot-ld128.S deleted file mode 100644 index 09d69d113ac..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/qd8-f16-qc8w-gemm/gen/qd8-f16-qc8w-gemm-4x16c4-minmax-asm-aarch64-neondot-ld128.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. 
Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/qd8-f16-qc8w-gemm/gen/qd8-f16-qc8w-gemm-4x16c4-minmax-asm-aarch64-neondotfp16arith-cortex-a55.S b/backends/xnnpack/third-party/xnnpack_wrappers/qd8-f16-qc8w-gemm/gen/qd8-f16-qc8w-gemm-4x16c4-minmax-asm-aarch64-neondotfp16arith-cortex-a55.S deleted file mode 100644 index 90629c3189d..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/qd8-f16-qc8w-gemm/gen/qd8-f16-qc8w-gemm-4x16c4-minmax-asm-aarch64-neondotfp16arith-cortex-a55.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/qd8-f16-qc8w-gemm/gen/qd8-f16-qc8w-gemm-4x8c4-minmax-asm-aarch32-neondotfp16arith-cortex-a55.S b/backends/xnnpack/third-party/xnnpack_wrappers/qd8-f16-qc8w-gemm/gen/qd8-f16-qc8w-gemm-4x8c4-minmax-asm-aarch32-neondotfp16arith-cortex-a55.S deleted file mode 100644 index 8edafd1f128..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/qd8-f16-qc8w-gemm/gen/qd8-f16-qc8w-gemm-4x8c4-minmax-asm-aarch32-neondotfp16arith-cortex-a55.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__arm__) -#include -#endif /* defined(__arm__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/qd8-f16-qc8w-igemm/gen/qd8-f16-qc8w-igemm-4x16c4-minmax-asm-aarch64-neondot-cortex-a55.S b/backends/xnnpack/third-party/xnnpack_wrappers/qd8-f16-qc8w-igemm/gen/qd8-f16-qc8w-igemm-4x16c4-minmax-asm-aarch64-neondot-cortex-a55.S deleted file mode 100644 index 54c754b5d86..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/qd8-f16-qc8w-igemm/gen/qd8-f16-qc8w-igemm-4x16c4-minmax-asm-aarch64-neondot-cortex-a55.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/qd8-f16-qc8w-igemm/gen/qd8-f16-qc8w-igemm-4x16c4-minmax-asm-aarch64-neondot-ld128.S b/backends/xnnpack/third-party/xnnpack_wrappers/qd8-f16-qc8w-igemm/gen/qd8-f16-qc8w-igemm-4x16c4-minmax-asm-aarch64-neondot-ld128.S deleted file mode 100644 index 41bee5072d7..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/qd8-f16-qc8w-igemm/gen/qd8-f16-qc8w-igemm-4x16c4-minmax-asm-aarch64-neondot-ld128.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/qd8-f16-qc8w-igemm/gen/qd8-f16-qc8w-igemm-4x8c4-minmax-asm-aarch32-neondotfp16arith-cortex-a55.S b/backends/xnnpack/third-party/xnnpack_wrappers/qd8-f16-qc8w-igemm/gen/qd8-f16-qc8w-igemm-4x8c4-minmax-asm-aarch32-neondotfp16arith-cortex-a55.S deleted file mode 100644 index db2eda8704d..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/qd8-f16-qc8w-igemm/gen/qd8-f16-qc8w-igemm-4x8c4-minmax-asm-aarch32-neondotfp16arith-cortex-a55.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. 
Do not modify */ - -#if defined(__arm__) -#include -#endif /* defined(__arm__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-4x16c4-minmax-asm-aarch64-neondot-cortex-a55.S b/backends/xnnpack/third-party/xnnpack_wrappers/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-4x16c4-minmax-asm-aarch64-neondot-cortex-a55.S deleted file mode 100644 index 97ca7f73291..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-4x16c4-minmax-asm-aarch64-neondot-cortex-a55.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-4x16c4-minmax-asm-aarch64-neondot-ld128.S b/backends/xnnpack/third-party/xnnpack_wrappers/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-4x16c4-minmax-asm-aarch64-neondot-ld128.S deleted file mode 100644 index 12a22c0cd9a..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-4x16c4-minmax-asm-aarch64-neondot-ld128.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-4x16c4-minmax-asm-aarch64-neondot-ld64.S b/backends/xnnpack/third-party/xnnpack_wrappers/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-4x16c4-minmax-asm-aarch64-neondot-ld64.S deleted file mode 100644 index 55786a523e6..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-4x16c4-minmax-asm-aarch64-neondot-ld64.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-4x8c4-minmax-asm-aarch32-neondot-cortex-a55.S b/backends/xnnpack/third-party/xnnpack_wrappers/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-4x8c4-minmax-asm-aarch32-neondot-cortex-a55.S deleted file mode 100644 index 8f9691d73a9..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-4x8c4-minmax-asm-aarch32-neondot-cortex-a55.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__arm__) -#include -#endif /* defined(__arm__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/qd8-f32-qc8w-igemm/gen/qd8-f32-qc8w-igemm-4x16c4-minmax-asm-aarch64-neondot-cortex-a55.S b/backends/xnnpack/third-party/xnnpack_wrappers/qd8-f32-qc8w-igemm/gen/qd8-f32-qc8w-igemm-4x16c4-minmax-asm-aarch64-neondot-cortex-a55.S deleted file mode 100644 index 21a69dfbc10..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/qd8-f32-qc8w-igemm/gen/qd8-f32-qc8w-igemm-4x16c4-minmax-asm-aarch64-neondot-cortex-a55.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. 
Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/qd8-f32-qc8w-igemm/gen/qd8-f32-qc8w-igemm-4x16c4-minmax-asm-aarch64-neondot-ld128.S b/backends/xnnpack/third-party/xnnpack_wrappers/qd8-f32-qc8w-igemm/gen/qd8-f32-qc8w-igemm-4x16c4-minmax-asm-aarch64-neondot-ld128.S deleted file mode 100644 index ea0ca0b0793..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/qd8-f32-qc8w-igemm/gen/qd8-f32-qc8w-igemm-4x16c4-minmax-asm-aarch64-neondot-ld128.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/qd8-f32-qc8w-igemm/gen/qd8-f32-qc8w-igemm-4x8c4-minmax-asm-aarch32-neondot-cortex-a55.S b/backends/xnnpack/third-party/xnnpack_wrappers/qd8-f32-qc8w-igemm/gen/qd8-f32-qc8w-igemm-4x8c4-minmax-asm-aarch32-neondot-cortex-a55.S deleted file mode 100644 index 21822b60441..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/qd8-f32-qc8w-igemm/gen/qd8-f32-qc8w-igemm-4x8c4-minmax-asm-aarch32-neondot-cortex-a55.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__arm__) -#include -#endif /* defined(__arm__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/qs16-qs8-vcvt/qs16-qs8-vcvt-asm-aarch32-neon-u16.S b/backends/xnnpack/third-party/xnnpack_wrappers/qs16-qs8-vcvt/qs16-qs8-vcvt-asm-aarch32-neon-u16.S deleted file mode 100644 index 81a9bea4394..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/qs16-qs8-vcvt/qs16-qs8-vcvt-asm-aarch32-neon-u16.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__arm__) -#include -#endif /* defined(__arm__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x16c4-minmax-fp32-asm-aarch64-neondot-ld32.S b/backends/xnnpack/third-party/xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x16c4-minmax-fp32-asm-aarch64-neondot-ld32.S deleted file mode 100644 index 33e6b4a3b43..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x16c4-minmax-fp32-asm-aarch64-neondot-ld32.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x16c4-minmax-fp32-asm-aarch64-neondot-ld64.S b/backends/xnnpack/third-party/xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x16c4-minmax-fp32-asm-aarch64-neondot-ld64.S deleted file mode 100644 index 03fee7c6c00..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x16c4-minmax-fp32-asm-aarch64-neondot-ld64.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. 
Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x16c4-minmax-rndnu-asm-aarch64-neondot-ld32.S b/backends/xnnpack/third-party/xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x16c4-minmax-rndnu-asm-aarch64-neondot-ld32.S deleted file mode 100644 index 275baaa89a6..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x16c4-minmax-rndnu-asm-aarch64-neondot-ld32.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x16c4-minmax-rndnu-asm-aarch64-neondot-ld64.S b/backends/xnnpack/third-party/xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x16c4-minmax-rndnu-asm-aarch64-neondot-ld64.S deleted file mode 100644 index 2c1816afde3..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x16c4-minmax-rndnu-asm-aarch64-neondot-ld64.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-cortex-a7-prfm.S b/backends/xnnpack/third-party/xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-cortex-a7-prfm.S deleted file mode 100644 index 11abd2eeca9..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-cortex-a7-prfm.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__arm__) -#include -#endif /* defined(__arm__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-cortex-a7.S b/backends/xnnpack/third-party/xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-cortex-a7.S deleted file mode 100644 index deaba92f1b3..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-cortex-a7.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__arm__) -#include -#endif /* defined(__arm__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x8c8-minmax-fp32-asm-aarch64-neon-mlal-cortex-a53-prfm.S b/backends/xnnpack/third-party/xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x8c8-minmax-fp32-asm-aarch64-neon-mlal-cortex-a53-prfm.S deleted file mode 100644 index ae70c1757ce..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x8c8-minmax-fp32-asm-aarch64-neon-mlal-cortex-a53-prfm.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. 
Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x8c8-minmax-fp32-asm-aarch64-neon-mlal-cortex-a53.S b/backends/xnnpack/third-party/xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x8c8-minmax-fp32-asm-aarch64-neon-mlal-cortex-a53.S deleted file mode 100644 index 38c8aa5dfed..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x8c8-minmax-fp32-asm-aarch64-neon-mlal-cortex-a53.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x8c8-minmax-fp32-asm-aarch64-neon-mlal-prfm.S b/backends/xnnpack/third-party/xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x8c8-minmax-fp32-asm-aarch64-neon-mlal-prfm.S deleted file mode 100644 index e7c0bbba78b..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x8c8-minmax-fp32-asm-aarch64-neon-mlal-prfm.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x8c8-minmax-fp32-asm-aarch64-neon-mlal.S b/backends/xnnpack/third-party/xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x8c8-minmax-fp32-asm-aarch64-neon-mlal.S deleted file mode 100644 index 3d930d4e1ac..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x8c8-minmax-fp32-asm-aarch64-neon-mlal.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x8c8-minmax-rndnu-asm-aarch64-neon-mlal-cortex-a53-prfm.S b/backends/xnnpack/third-party/xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x8c8-minmax-rndnu-asm-aarch64-neon-mlal-cortex-a53-prfm.S deleted file mode 100644 index 788bbb6259c..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x8c8-minmax-rndnu-asm-aarch64-neon-mlal-cortex-a53-prfm.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x8c8-minmax-rndnu-asm-aarch64-neon-mlal-cortex-a53.S b/backends/xnnpack/third-party/xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x8c8-minmax-rndnu-asm-aarch64-neon-mlal-cortex-a53.S deleted file mode 100644 index f17b968a612..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x8c8-minmax-rndnu-asm-aarch64-neon-mlal-cortex-a53.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. 
Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x8c8-minmax-rndnu-asm-aarch64-neon-mlal-prfm.S b/backends/xnnpack/third-party/xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x8c8-minmax-rndnu-asm-aarch64-neon-mlal-prfm.S deleted file mode 100644 index b73016d3691..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x8c8-minmax-rndnu-asm-aarch64-neon-mlal-prfm.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x8c8-minmax-rndnu-asm-aarch64-neon-mlal.S b/backends/xnnpack/third-party/xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x8c8-minmax-rndnu-asm-aarch64-neon-mlal.S deleted file mode 100644 index 436d51a367b..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x8c8-minmax-rndnu-asm-aarch64-neon-mlal.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-2x8c16-minmax-fp32-asm-aarch64-neon-mlal.S b/backends/xnnpack/third-party/xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-2x8c16-minmax-fp32-asm-aarch64-neon-mlal.S deleted file mode 100644 index 36e6cdc4017..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-2x8c16-minmax-fp32-asm-aarch64-neon-mlal.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-2x8c16-minmax-rndnu-asm-aarch64-neon-mlal.S b/backends/xnnpack/third-party/xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-2x8c16-minmax-rndnu-asm-aarch64-neon-mlal.S deleted file mode 100644 index 994843e4899..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-2x8c16-minmax-rndnu-asm-aarch64-neon-mlal.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-2x8c8-minmax-fp32-asm-aarch64-neon-mlal-cortex-a53-prfm.S b/backends/xnnpack/third-party/xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-2x8c8-minmax-fp32-asm-aarch64-neon-mlal-cortex-a53-prfm.S deleted file mode 100644 index 40972ad7bcd..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-2x8c8-minmax-fp32-asm-aarch64-neon-mlal-cortex-a53-prfm.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. 
Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-2x8c8-minmax-fp32-asm-aarch64-neon-mlal-cortex-a53.S b/backends/xnnpack/third-party/xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-2x8c8-minmax-fp32-asm-aarch64-neon-mlal-cortex-a53.S deleted file mode 100644 index ac7612ef0fb..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-2x8c8-minmax-fp32-asm-aarch64-neon-mlal-cortex-a53.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-2x8c8-minmax-fp32-asm-aarch64-neon-mlal-prfm.S b/backends/xnnpack/third-party/xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-2x8c8-minmax-fp32-asm-aarch64-neon-mlal-prfm.S deleted file mode 100644 index 0d1f3ddfd4c..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-2x8c8-minmax-fp32-asm-aarch64-neon-mlal-prfm.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-2x8c8-minmax-fp32-asm-aarch64-neon-mlal.S b/backends/xnnpack/third-party/xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-2x8c8-minmax-fp32-asm-aarch64-neon-mlal.S deleted file mode 100644 index a7dc320e0a8..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-2x8c8-minmax-fp32-asm-aarch64-neon-mlal.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-2x8c8-minmax-fp32-asm-aarch64-neon-mull.S b/backends/xnnpack/third-party/xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-2x8c8-minmax-fp32-asm-aarch64-neon-mull.S deleted file mode 100644 index 99a0927b92b..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-2x8c8-minmax-fp32-asm-aarch64-neon-mull.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-2x8c8-minmax-rndnu-asm-aarch64-neon-mlal-cortex-a53-prfm.S b/backends/xnnpack/third-party/xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-2x8c8-minmax-rndnu-asm-aarch64-neon-mlal-cortex-a53-prfm.S deleted file mode 100644 index 31b92ecc099..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-2x8c8-minmax-rndnu-asm-aarch64-neon-mlal-cortex-a53-prfm.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. 
Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-2x8c8-minmax-rndnu-asm-aarch64-neon-mlal-cortex-a53.S b/backends/xnnpack/third-party/xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-2x8c8-minmax-rndnu-asm-aarch64-neon-mlal-cortex-a53.S deleted file mode 100644 index 28df8e3d2e3..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-2x8c8-minmax-rndnu-asm-aarch64-neon-mlal-cortex-a53.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-2x8c8-minmax-rndnu-asm-aarch64-neon-mlal-prfm.S b/backends/xnnpack/third-party/xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-2x8c8-minmax-rndnu-asm-aarch64-neon-mlal-prfm.S deleted file mode 100644 index 230ac99c752..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-2x8c8-minmax-rndnu-asm-aarch64-neon-mlal-prfm.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-2x8c8-minmax-rndnu-asm-aarch64-neon-mlal.S b/backends/xnnpack/third-party/xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-2x8c8-minmax-rndnu-asm-aarch64-neon-mlal.S deleted file mode 100644 index 6bd8ba02d33..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-2x8c8-minmax-rndnu-asm-aarch64-neon-mlal.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-2x8c8-minmax-rndnu-asm-aarch64-neon-mull.S b/backends/xnnpack/third-party/xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-2x8c8-minmax-rndnu-asm-aarch64-neon-mull.S deleted file mode 100644 index ee48c88ccc1..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-2x8c8-minmax-rndnu-asm-aarch64-neon-mull.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x16-minmax-fp32-asm-aarch64-neon-mlal-lane-cortex-a53-prfm.S b/backends/xnnpack/third-party/xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x16-minmax-fp32-asm-aarch64-neon-mlal-lane-cortex-a53-prfm.S deleted file mode 100644 index 95c27257ace..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x16-minmax-fp32-asm-aarch64-neon-mlal-lane-cortex-a53-prfm.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. 
Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x16-minmax-fp32-asm-aarch64-neon-mlal-lane-cortex-a53.S b/backends/xnnpack/third-party/xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x16-minmax-fp32-asm-aarch64-neon-mlal-lane-cortex-a53.S deleted file mode 100644 index 84d0f8edf2e..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x16-minmax-fp32-asm-aarch64-neon-mlal-lane-cortex-a53.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x16-minmax-fp32-asm-aarch64-neon-mlal-lane-ld64-prfm.S b/backends/xnnpack/third-party/xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x16-minmax-fp32-asm-aarch64-neon-mlal-lane-ld64-prfm.S deleted file mode 100644 index 38357fe70ad..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x16-minmax-fp32-asm-aarch64-neon-mlal-lane-ld64-prfm.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x16-minmax-fp32-asm-aarch64-neon-mlal-lane-ld64.S b/backends/xnnpack/third-party/xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x16-minmax-fp32-asm-aarch64-neon-mlal-lane-ld64.S deleted file mode 100644 index 50702219fd6..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x16-minmax-fp32-asm-aarch64-neon-mlal-lane-ld64.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-cortex-a53-prfm.S b/backends/xnnpack/third-party/xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-cortex-a53-prfm.S deleted file mode 100644 index 54cd93a360f..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-cortex-a53-prfm.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-cortex-a53.S b/backends/xnnpack/third-party/xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-cortex-a53.S deleted file mode 100644 index ebe60235fb6..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-cortex-a53.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. 
Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-ld64-prfm.S b/backends/xnnpack/third-party/xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-ld64-prfm.S deleted file mode 100644 index 3476055355e..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-ld64-prfm.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-ld64.S b/backends/xnnpack/third-party/xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-ld64.S deleted file mode 100644 index 52f8cf22bed..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-ld64.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x16c4-minmax-fp32-asm-aarch64-neondot-cortex-a55.S b/backends/xnnpack/third-party/xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x16c4-minmax-fp32-asm-aarch64-neondot-cortex-a55.S deleted file mode 100644 index 7de93e80a2c..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x16c4-minmax-fp32-asm-aarch64-neondot-cortex-a55.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x16c4-minmax-fp32-asm-aarch64-neondot-ld128.S b/backends/xnnpack/third-party/xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x16c4-minmax-fp32-asm-aarch64-neondot-ld128.S deleted file mode 100644 index a6727b17027..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x16c4-minmax-fp32-asm-aarch64-neondot-ld128.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x16c4-minmax-fp32-asm-aarch64-neondot-ld32.S b/backends/xnnpack/third-party/xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x16c4-minmax-fp32-asm-aarch64-neondot-ld32.S deleted file mode 100644 index 76852aa974a..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x16c4-minmax-fp32-asm-aarch64-neondot-ld32.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. 
Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x16c4-minmax-fp32-asm-aarch64-neondot-ld64.S b/backends/xnnpack/third-party/xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x16c4-minmax-fp32-asm-aarch64-neondot-ld64.S deleted file mode 100644 index d1eaa186e79..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x16c4-minmax-fp32-asm-aarch64-neondot-ld64.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x16c4-minmax-rndnu-asm-aarch64-neondot-cortex-a55.S b/backends/xnnpack/third-party/xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x16c4-minmax-rndnu-asm-aarch64-neondot-cortex-a55.S deleted file mode 100644 index cd3dc4e3a90..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x16c4-minmax-rndnu-asm-aarch64-neondot-cortex-a55.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x16c4-minmax-rndnu-asm-aarch64-neondot-ld128.S b/backends/xnnpack/third-party/xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x16c4-minmax-rndnu-asm-aarch64-neondot-ld128.S deleted file mode 100644 index 5a6faeb391d..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x16c4-minmax-rndnu-asm-aarch64-neondot-ld128.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x16c4-minmax-rndnu-asm-aarch64-neondot-ld32.S b/backends/xnnpack/third-party/xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x16c4-minmax-rndnu-asm-aarch64-neondot-ld32.S deleted file mode 100644 index 67df08cd04d..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x16c4-minmax-rndnu-asm-aarch64-neondot-ld32.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x16c4-minmax-rndnu-asm-aarch64-neondot-ld64.S b/backends/xnnpack/third-party/xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x16c4-minmax-rndnu-asm-aarch64-neondot-ld64.S deleted file mode 100644 index 762c7693b39..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x16c4-minmax-rndnu-asm-aarch64-neondot-ld64.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. 
Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-cortex-a53-prfm.S b/backends/xnnpack/third-party/xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-cortex-a53-prfm.S deleted file mode 100644 index 021c7266e6f..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-cortex-a53-prfm.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__arm__) -#include -#endif /* defined(__arm__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-cortex-a53.S b/backends/xnnpack/third-party/xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-cortex-a53.S deleted file mode 100644 index 54d173b3587..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-cortex-a53.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__arm__) -#include -#endif /* defined(__arm__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-cortex-a7-prfm.S b/backends/xnnpack/third-party/xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-cortex-a7-prfm.S deleted file mode 100644 index a835b6fb563..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-cortex-a7-prfm.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__arm__) -#include -#endif /* defined(__arm__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-cortex-a7.S b/backends/xnnpack/third-party/xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-cortex-a7.S deleted file mode 100644 index 7594c9c0257..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-cortex-a7.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__arm__) -#include -#endif /* defined(__arm__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-ld64-prfm.S b/backends/xnnpack/third-party/xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-ld64-prfm.S deleted file mode 100644 index 923797aae8e..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-ld64-prfm.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. 
Do not modify */ - -#if defined(__arm__) -#include -#endif /* defined(__arm__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-ld64.S b/backends/xnnpack/third-party/xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-ld64.S deleted file mode 100644 index 5a726232631..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-ld64.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__arm__) -#include -#endif /* defined(__arm__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x8-minmax-rndnu-asm-aarch64-neon-mlal-lane-ld64-prfm.S b/backends/xnnpack/third-party/xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x8-minmax-rndnu-asm-aarch64-neon-mlal-lane-ld64-prfm.S deleted file mode 100644 index ca123f854f0..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x8-minmax-rndnu-asm-aarch64-neon-mlal-lane-ld64-prfm.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x8-minmax-rndnu-asm-aarch64-neon-mlal-lane-ld64.S b/backends/xnnpack/third-party/xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x8-minmax-rndnu-asm-aarch64-neon-mlal-lane-ld64.S deleted file mode 100644 index 1f677998ba4..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x8-minmax-rndnu-asm-aarch64-neon-mlal-lane-ld64.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x8c4-minmax-rndnu-asm-aarch32-neondot-cortex-a55.S b/backends/xnnpack/third-party/xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x8c4-minmax-rndnu-asm-aarch32-neondot-cortex-a55.S deleted file mode 100644 index eac3b7426ca..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x8c4-minmax-rndnu-asm-aarch32-neondot-cortex-a55.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__arm__) -#include -#endif /* defined(__arm__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x8c4-minmax-rndnu-asm-aarch32-neondot-ld64.S b/backends/xnnpack/third-party/xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x8c4-minmax-rndnu-asm-aarch32-neondot-ld64.S deleted file mode 100644 index f09166762a4..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x8c4-minmax-rndnu-asm-aarch32-neondot-ld64.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. 
Do not modify */ - -#if defined(__arm__) -#include -#endif /* defined(__arm__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-1x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-cortex-a7-prfm.S b/backends/xnnpack/third-party/xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-1x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-cortex-a7-prfm.S deleted file mode 100644 index 1a57dd5ffa5..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-1x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-cortex-a7-prfm.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__arm__) -#include -#endif /* defined(__arm__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-1x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-cortex-a7.S b/backends/xnnpack/third-party/xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-1x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-cortex-a7.S deleted file mode 100644 index 8bf83105b89..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-1x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-cortex-a7.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__arm__) -#include -#endif /* defined(__arm__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-1x8c8-minmax-fp32-asm-aarch64-neon-mlal-cortex-a53-prfm.S b/backends/xnnpack/third-party/xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-1x8c8-minmax-fp32-asm-aarch64-neon-mlal-cortex-a53-prfm.S deleted file mode 100644 index 4032da60ca8..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-1x8c8-minmax-fp32-asm-aarch64-neon-mlal-cortex-a53-prfm.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-1x8c8-minmax-fp32-asm-aarch64-neon-mlal-cortex-a53.S b/backends/xnnpack/third-party/xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-1x8c8-minmax-fp32-asm-aarch64-neon-mlal-cortex-a53.S deleted file mode 100644 index 6e2b925a7cb..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-1x8c8-minmax-fp32-asm-aarch64-neon-mlal-cortex-a53.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-1x8c8-minmax-fp32-asm-aarch64-neon-mlal-prfm.S b/backends/xnnpack/third-party/xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-1x8c8-minmax-fp32-asm-aarch64-neon-mlal-prfm.S deleted file mode 100644 index 6f66f130084..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-1x8c8-minmax-fp32-asm-aarch64-neon-mlal-prfm.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. 
Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-1x8c8-minmax-fp32-asm-aarch64-neon-mlal.S b/backends/xnnpack/third-party/xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-1x8c8-minmax-fp32-asm-aarch64-neon-mlal.S deleted file mode 100644 index a1e97aaa9e4..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-1x8c8-minmax-fp32-asm-aarch64-neon-mlal.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-1x8c8-minmax-rndnu-asm-aarch64-neon-mlal-cortex-a53-prfm.S b/backends/xnnpack/third-party/xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-1x8c8-minmax-rndnu-asm-aarch64-neon-mlal-cortex-a53-prfm.S deleted file mode 100644 index a9a2542fa71..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-1x8c8-minmax-rndnu-asm-aarch64-neon-mlal-cortex-a53-prfm.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-1x8c8-minmax-rndnu-asm-aarch64-neon-mlal-cortex-a53.S b/backends/xnnpack/third-party/xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-1x8c8-minmax-rndnu-asm-aarch64-neon-mlal-cortex-a53.S deleted file mode 100644 index 8b397990f73..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-1x8c8-minmax-rndnu-asm-aarch64-neon-mlal-cortex-a53.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-1x8c8-minmax-rndnu-asm-aarch64-neon-mlal-prfm.S b/backends/xnnpack/third-party/xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-1x8c8-minmax-rndnu-asm-aarch64-neon-mlal-prfm.S deleted file mode 100644 index af9c54c3fd1..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-1x8c8-minmax-rndnu-asm-aarch64-neon-mlal-prfm.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-1x8c8-minmax-rndnu-asm-aarch64-neon-mlal.S b/backends/xnnpack/third-party/xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-1x8c8-minmax-rndnu-asm-aarch64-neon-mlal.S deleted file mode 100644 index 035cca4bd75..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-1x8c8-minmax-rndnu-asm-aarch64-neon-mlal.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. 
Do not modify */
-
-#if defined(__aarch64__)
-#include
-#endif /* defined(__aarch64__) */
[The remaining hunks in this section delete auto-generated XNNPACK wrapper stubs under backends/xnnpack/third-party/xnnpack_wrappers/; each deleted file contained only the five-line "Auto-generated by generate-wrappers.py script. Do not modify" guard shown above (wrapped in an __aarch64__ or __arm__ check). The deleted files, in git summary form:]
 delete mode 100644 backends/xnnpack/third-party/xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-2x8c16-minmax-fp32-asm-aarch64-neon-mlal.S
 delete mode 100644 backends/xnnpack/third-party/xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-2x8c16-minmax-rndnu-asm-aarch64-neon-mlal.S
 delete mode 100644 backends/xnnpack/third-party/xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-2x8c8-minmax-fp32-asm-aarch64-neon-mlal-cortex-a53-prfm.S
 delete mode 100644 backends/xnnpack/third-party/xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-2x8c8-minmax-fp32-asm-aarch64-neon-mlal-cortex-a53.S
 delete mode 100644 backends/xnnpack/third-party/xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-2x8c8-minmax-fp32-asm-aarch64-neon-mlal-prfm.S
 delete mode 100644 backends/xnnpack/third-party/xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-2x8c8-minmax-fp32-asm-aarch64-neon-mlal.S
 delete mode 100644 backends/xnnpack/third-party/xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-2x8c8-minmax-rndnu-asm-aarch64-neon-mlal-cortex-a53-prfm.S
 delete mode 100644 backends/xnnpack/third-party/xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-2x8c8-minmax-rndnu-asm-aarch64-neon-mlal-cortex-a53.S
 delete mode 100644 backends/xnnpack/third-party/xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-2x8c8-minmax-rndnu-asm-aarch64-neon-mlal-prfm.S
 delete mode 100644 backends/xnnpack/third-party/xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-2x8c8-minmax-rndnu-asm-aarch64-neon-mlal.S
 delete mode 100644 backends/xnnpack/third-party/xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-4x16-minmax-fp32-asm-aarch64-neon-mlal-lane-cortex-a53-prfm.S
 delete mode 100644 backends/xnnpack/third-party/xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-4x16-minmax-fp32-asm-aarch64-neon-mlal-lane-cortex-a53.S
 delete mode 100644 backends/xnnpack/third-party/xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-4x16-minmax-fp32-asm-aarch64-neon-mlal-lane-ld64-prfm.S
 delete mode 100644 backends/xnnpack/third-party/xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-4x16-minmax-fp32-asm-aarch64-neon-mlal-lane-ld64.S
 delete mode 100644 backends/xnnpack/third-party/xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-cortex-a53-prfm.S
 delete mode 100644 backends/xnnpack/third-party/xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-cortex-a53.S
 delete mode 100644 backends/xnnpack/third-party/xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-ld64-prfm.S
 delete mode 100644 backends/xnnpack/third-party/xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-ld64.S
 delete mode 100644 backends/xnnpack/third-party/xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-4x16c4-minmax-fp32-asm-aarch64-neondot-cortex-a55.S
 delete mode 100644 backends/xnnpack/third-party/xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-4x16c4-minmax-fp32-asm-aarch64-neondot-ld128.S
 delete mode 100644 backends/xnnpack/third-party/xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-4x16c4-minmax-fp32-asm-aarch64-neondot-ld64.S
 delete mode 100644 backends/xnnpack/third-party/xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-4x16c4-minmax-rndnu-asm-aarch64-neondot-cortex-a55.S
 delete mode 100644 backends/xnnpack/third-party/xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-4x16c4-minmax-rndnu-asm-aarch64-neondot-ld128.S
 delete mode 100644 backends/xnnpack/third-party/xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-4x16c4-minmax-rndnu-asm-aarch64-neondot-ld64.S
 delete mode 100644 backends/xnnpack/third-party/xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-4x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-cortex-a53-prfm.S
 delete mode 100644 backends/xnnpack/third-party/xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-4x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-cortex-a53.S
 delete mode 100644 backends/xnnpack/third-party/xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-4x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-cortex-a7-prfm.S
 delete mode 100644 backends/xnnpack/third-party/xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-4x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-cortex-a7.S
 delete mode 100644 backends/xnnpack/third-party/xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-4x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-ld64-prfm.S
 delete mode 100644 backends/xnnpack/third-party/xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-4x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-ld64.S
 delete mode 100644 backends/xnnpack/third-party/xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-4x8-minmax-rndnu-asm-aarch64-neon-mlal-lane-ld64-prfm.S
 delete mode 100644 backends/xnnpack/third-party/xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-4x8-minmax-rndnu-asm-aarch64-neon-mlal-lane-ld64.S
 delete mode 100644 backends/xnnpack/third-party/xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-4x8c4-minmax-rndnu-asm-aarch32-neondot-cortex-a55.S
 delete mode 100644 backends/xnnpack/third-party/xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-4x8c4-minmax-rndnu-asm-aarch32-neondot-ld64.S
 delete mode 100644 backends/xnnpack/third-party/xnnpack_wrappers/qs8-qc8w-dwconv/qs8-qc8w-dwconv-3p16c-minmax-fp32-asm-aarch32-neonv8-mla8-cortex-a35.S
 delete mode 100644 backends/xnnpack/third-party/xnnpack_wrappers/qs8-qc8w-dwconv/qs8-qc8w-dwconv-3p8c-minmax-fp32-asm-aarch32-neonv8-mla8-cortex-a35.S
 delete mode 100644 backends/xnnpack/third-party/xnnpack_wrappers/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-1x16c4-minmax-fp32-asm-aarch64-neondot-ld32.S
 delete mode 100644 backends/xnnpack/third-party/xnnpack_wrappers/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-1x16c4-minmax-fp32-asm-aarch64-neondot-ld64.S
 delete mode 100644 backends/xnnpack/third-party/xnnpack_wrappers/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-1x8-minmax-fp32-asm-aarch32-neon-mlal-lane-cortex-a7-prfm.S
 delete mode 100644 backends/xnnpack/third-party/xnnpack_wrappers/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-1x8-minmax-fp32-asm-aarch32-neon-mlal-lane-cortex-a7.S
 delete mode 100644 backends/xnnpack/third-party/xnnpack_wrappers/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-1x8-minmax-fp32-asm-aarch32-neonv8-mlal-lane-cortex-a35-prfm.S
 delete mode 100644 backends/xnnpack/third-party/xnnpack_wrappers/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-1x8-minmax-fp32-asm-aarch32-neonv8-mlal-lane-cortex-a35.S
 delete mode 100644 backends/xnnpack/third-party/xnnpack_wrappers/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-1x8c8-minmax-fp32-asm-aarch64-neon-mlal-cortex-a53-prfm.S
 delete mode 100644 backends/xnnpack/third-party/xnnpack_wrappers/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-1x8c8-minmax-fp32-asm-aarch64-neon-mlal-cortex-a53.S
 delete mode 100644 backends/xnnpack/third-party/xnnpack_wrappers/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-1x8c8-minmax-fp32-asm-aarch64-neon-mlal-prfm.S
 delete mode 100644 backends/xnnpack/third-party/xnnpack_wrappers/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-1x8c8-minmax-fp32-asm-aarch64-neon-mlal.S
 delete mode 100644 backends/xnnpack/third-party/xnnpack_wrappers/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-2x8c16-minmax-fp32-asm-aarch64-neon-mlal.S
 delete mode 100644 backends/xnnpack/third-party/xnnpack_wrappers/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-2x8c8-minmax-fp32-asm-aarch64-neon-mlal-cortex-a53-prfm.S
 delete mode 100644 backends/xnnpack/third-party/xnnpack_wrappers/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-2x8c8-minmax-fp32-asm-aarch64-neon-mlal-cortex-a53.S
 delete mode 100644 backends/xnnpack/third-party/xnnpack_wrappers/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-2x8c8-minmax-fp32-asm-aarch64-neon-mlal-prfm.S
 delete mode 100644 backends/xnnpack/third-party/xnnpack_wrappers/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-2x8c8-minmax-fp32-asm-aarch64-neon-mlal.S
 delete mode 100644 backends/xnnpack/third-party/xnnpack_wrappers/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-2x8c8-minmax-fp32-asm-aarch64-neon-mull.S
 delete mode 100644 backends/xnnpack/third-party/xnnpack_wrappers/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-4x16-minmax-fp32-asm-aarch64-neon-mlal-lane-cortex-a53-prfm.S
 delete mode 100644 backends/xnnpack/third-party/xnnpack_wrappers/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-4x16-minmax-fp32-asm-aarch64-neon-mlal-lane-cortex-a53.S
 delete mode 100644 backends/xnnpack/third-party/xnnpack_wrappers/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-4x16-minmax-fp32-asm-aarch64-neon-mlal-lane-ld64-prfm.S
 delete mode 100644 backends/xnnpack/third-party/xnnpack_wrappers/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-4x16-minmax-fp32-asm-aarch64-neon-mlal-lane-ld64.S
 delete mode 100644 backends/xnnpack/third-party/xnnpack_wrappers/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-4x16c4-minmax-fp32-asm-aarch64-neondot-cortex-a55.S
 delete mode 100644 backends/xnnpack/third-party/xnnpack_wrappers/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-4x16c4-minmax-fp32-asm-aarch64-neondot-ld128.S
 delete mode 100644 backends/xnnpack/third-party/xnnpack_wrappers/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-4x16c4-minmax-fp32-asm-aarch64-neondot-ld32.S
 delete mode 100644 backends/xnnpack/third-party/xnnpack_wrappers/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-4x16c4-minmax-fp32-asm-aarch64-neondot-ld64.S
 delete mode 100644 backends/xnnpack/third-party/xnnpack_wrappers/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-4x8-minmax-fp32-asm-aarch32-neon-mlal-lane-cortex-a53-prfm.S
 delete mode 100644 backends/xnnpack/third-party/xnnpack_wrappers/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-4x8-minmax-fp32-asm-aarch32-neon-mlal-lane-cortex-a53.S
 delete mode 100644 backends/xnnpack/third-party/xnnpack_wrappers/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-4x8-minmax-fp32-asm-aarch32-neon-mlal-lane-cortex-a7-prfm.S
 delete mode 100644 backends/xnnpack/third-party/xnnpack_wrappers/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-4x8-minmax-fp32-asm-aarch32-neon-mlal-lane-cortex-a7.S
 delete mode 100644 backends/xnnpack/third-party/xnnpack_wrappers/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-4x8-minmax-fp32-asm-aarch32-neon-mlal-lane-ld64-prfm.S
 delete mode 100644 backends/xnnpack/third-party/xnnpack_wrappers/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-4x8-minmax-fp32-asm-aarch32-neon-mlal-lane-ld64.S
 delete mode 100644 backends/xnnpack/third-party/xnnpack_wrappers/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-4x8-minmax-fp32-asm-aarch32-neonv8-mlal-lane-cortex-a35-prfm.S
 delete mode 100644 backends/xnnpack/third-party/xnnpack_wrappers/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-4x8-minmax-fp32-asm-aarch32-neonv8-mlal-lane-cortex-a35.S
 delete mode 100644 backends/xnnpack/third-party/xnnpack_wrappers/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-4x8-minmax-fp32-asm-aarch32-neonv8-mlal-lane-cortex-a53-prfm.S
 delete mode 100644 backends/xnnpack/third-party/xnnpack_wrappers/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-4x8-minmax-fp32-asm-aarch32-neonv8-mlal-lane-cortex-a53.S
 delete mode 100644 backends/xnnpack/third-party/xnnpack_wrappers/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-4x8-minmax-fp32-asm-aarch32-neonv8-mlal-lane-ld64-prfm.S
 delete mode 100644 backends/xnnpack/third-party/xnnpack_wrappers/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-4x8-minmax-fp32-asm-aarch32-neonv8-mlal-lane-ld64.S
 delete mode 100644 backends/xnnpack/third-party/xnnpack_wrappers/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-4x8c4-minmax-fp32-asm-aarch32-neondot-cortex-a55.S
 delete mode 100644 backends/xnnpack/third-party/xnnpack_wrappers/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-4x8c4-minmax-fp32-asm-aarch32-neondot-ld64.S
 delete mode 100644 backends/xnnpack/third-party/xnnpack_wrappers/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-1x8-minmax-fp32-asm-aarch32-neon-mlal-lane-cortex-a7-prfm.S
 delete mode 100644 backends/xnnpack/third-party/xnnpack_wrappers/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-1x8-minmax-fp32-asm-aarch32-neon-mlal-lane-cortex-a7.S
 delete mode 100644 backends/xnnpack/third-party/xnnpack_wrappers/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-1x8-minmax-fp32-asm-aarch32-neonv8-mlal-lane-cortex-a35-prfm.S
 delete mode 100644 backends/xnnpack/third-party/xnnpack_wrappers/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-1x8-minmax-fp32-asm-aarch32-neonv8-mlal-lane-cortex-a35.S
 delete mode 100644 backends/xnnpack/third-party/xnnpack_wrappers/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-1x8c8-minmax-fp32-asm-aarch64-neon-mlal-cortex-a53-prfm.S
 delete mode 100644 backends/xnnpack/third-party/xnnpack_wrappers/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-1x8c8-minmax-fp32-asm-aarch64-neon-mlal-cortex-a53.S
 delete mode 100644 backends/xnnpack/third-party/xnnpack_wrappers/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-1x8c8-minmax-fp32-asm-aarch64-neon-mlal-prfm.S
 delete mode 100644 backends/xnnpack/third-party/xnnpack_wrappers/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-1x8c8-minmax-fp32-asm-aarch64-neon-mlal.S
 delete mode 100644 backends/xnnpack/third-party/xnnpack_wrappers/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-2x8c16-minmax-fp32-asm-aarch64-neon-mlal.S
 delete mode 100644 backends/xnnpack/third-party/xnnpack_wrappers/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-2x8c8-minmax-fp32-asm-aarch64-neon-mlal-cortex-a53-prfm.S
 delete mode 100644 backends/xnnpack/third-party/xnnpack_wrappers/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-2x8c8-minmax-fp32-asm-aarch64-neon-mlal-cortex-a53.S
 delete mode 100644 backends/xnnpack/third-party/xnnpack_wrappers/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-2x8c8-minmax-fp32-asm-aarch64-neon-mlal-prfm.S
 delete mode 100644 backends/xnnpack/third-party/xnnpack_wrappers/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-2x8c8-minmax-fp32-asm-aarch64-neon-mlal.S
 delete mode 100644 backends/xnnpack/third-party/xnnpack_wrappers/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x16-minmax-fp32-asm-aarch64-neon-mlal-lane-cortex-a53-prfm.S
 delete mode 100644 backends/xnnpack/third-party/xnnpack_wrappers/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x16-minmax-fp32-asm-aarch64-neon-mlal-lane-cortex-a53.S
 delete mode 100644 backends/xnnpack/third-party/xnnpack_wrappers/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x16-minmax-fp32-asm-aarch64-neon-mlal-lane-ld64-prfm.S
 delete mode 100644 backends/xnnpack/third-party/xnnpack_wrappers/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x16-minmax-fp32-asm-aarch64-neon-mlal-lane-ld64.S
 delete mode 100644 backends/xnnpack/third-party/xnnpack_wrappers/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x16c4-minmax-fp32-asm-aarch64-neondot-cortex-a55.S
 delete mode 100644 backends/xnnpack/third-party/xnnpack_wrappers/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x16c4-minmax-fp32-asm-aarch64-neondot-ld128.S
 delete mode 100644 backends/xnnpack/third-party/xnnpack_wrappers/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x16c4-minmax-fp32-asm-aarch64-neondot-ld64.S
 delete mode 100644 backends/xnnpack/third-party/xnnpack_wrappers/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x8-minmax-fp32-asm-aarch32-neon-mlal-lane-cortex-a53-prfm.S
 delete mode 100644 backends/xnnpack/third-party/xnnpack_wrappers/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x8-minmax-fp32-asm-aarch32-neon-mlal-lane-cortex-a53.S
 delete mode 100644 backends/xnnpack/third-party/xnnpack_wrappers/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x8-minmax-fp32-asm-aarch32-neon-mlal-lane-cortex-a7-prfm.S
 delete mode 100644 backends/xnnpack/third-party/xnnpack_wrappers/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x8-minmax-fp32-asm-aarch32-neon-mlal-lane-cortex-a7.S
 delete mode 100644 backends/xnnpack/third-party/xnnpack_wrappers/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x8-minmax-fp32-asm-aarch32-neon-mlal-lane-ld64-prfm.S
 delete mode 100644 backends/xnnpack/third-party/xnnpack_wrappers/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x8-minmax-fp32-asm-aarch32-neon-mlal-lane-ld64.S
 delete mode 100644 backends/xnnpack/third-party/xnnpack_wrappers/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x8-minmax-fp32-asm-aarch32-neonv8-mlal-lane-cortex-a35-prfm.S
 delete mode 100644 backends/xnnpack/third-party/xnnpack_wrappers/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x8-minmax-fp32-asm-aarch32-neonv8-mlal-lane-cortex-a35.S
 delete mode 100644 backends/xnnpack/third-party/xnnpack_wrappers/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x8-minmax-fp32-asm-aarch32-neonv8-mlal-lane-cortex-a53-prfm.S
 delete mode 100644 backends/xnnpack/third-party/xnnpack_wrappers/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x8-minmax-fp32-asm-aarch32-neonv8-mlal-lane-cortex-a53.S
 delete mode 100644 backends/xnnpack/third-party/xnnpack_wrappers/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x8-minmax-fp32-asm-aarch32-neonv8-mlal-lane-ld64-prfm.S
 delete mode 100644 backends/xnnpack/third-party/xnnpack_wrappers/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x8-minmax-fp32-asm-aarch32-neonv8-mlal-lane-ld64.S
 delete mode 100644 backends/xnnpack/third-party/xnnpack_wrappers/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x8c4-minmax-fp32-asm-aarch32-neondot-cortex-a55.S
 delete mode 100644 backends/xnnpack/third-party/xnnpack_wrappers/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x8c4-minmax-fp32-asm-aarch32-neondot-ld64.S
 delete mode 100644 backends/xnnpack/third-party/xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-1x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-cortex-a7-prfm.S
diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-1x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-cortex-a7.S b/backends/xnnpack/third-party/xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-1x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-cortex-a7.S
deleted file mode 100644
index d3322eadc12..00000000000
--- a/backends/xnnpack/third-party/xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-1x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-cortex-a7.S
+++ /dev/null
@@ -1,5 +0,0 @@
-/* Auto-generated by generate-wrappers.py script.
Do not modify */ - -#if defined(__arm__) -#include -#endif /* defined(__arm__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-cortex-a53-prfm.S b/backends/xnnpack/third-party/xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-cortex-a53-prfm.S deleted file mode 100644 index d94445207c2..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-cortex-a53-prfm.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-cortex-a53.S b/backends/xnnpack/third-party/xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-cortex-a53.S deleted file mode 100644 index 094b884c7d6..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-cortex-a53.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-cortex-a75-prfm.S b/backends/xnnpack/third-party/xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-cortex-a75-prfm.S deleted file mode 100644 index dcbf3982672..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-cortex-a75-prfm.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-cortex-a75.S b/backends/xnnpack/third-party/xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-cortex-a75.S deleted file mode 100644 index f72372c7cff..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-cortex-a75.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-ld64-prfm.S b/backends/xnnpack/third-party/xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-ld64-prfm.S deleted file mode 100644 index 9082bebb5ea..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-ld64-prfm.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. 
Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-ld64.S b/backends/xnnpack/third-party/xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-ld64.S deleted file mode 100644 index 5ee2a70478f..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-ld64.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-4x16c4-minmax-fp32-asm-aarch64-neondot-cortex-a55.S b/backends/xnnpack/third-party/xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-4x16c4-minmax-fp32-asm-aarch64-neondot-cortex-a55.S deleted file mode 100644 index 6b98c0af279..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-4x16c4-minmax-fp32-asm-aarch64-neondot-cortex-a55.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-4x16c4-minmax-fp32-asm-aarch64-neondot-ld128.S b/backends/xnnpack/third-party/xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-4x16c4-minmax-fp32-asm-aarch64-neondot-ld128.S deleted file mode 100644 index b9a0479a69e..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-4x16c4-minmax-fp32-asm-aarch64-neondot-ld128.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-4x16c4-minmax-rndnu-asm-aarch64-neondot-cortex-a55.S b/backends/xnnpack/third-party/xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-4x16c4-minmax-rndnu-asm-aarch64-neondot-cortex-a55.S deleted file mode 100644 index 4b9f5ec2916..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-4x16c4-minmax-rndnu-asm-aarch64-neondot-cortex-a55.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-4x16c4-minmax-rndnu-asm-aarch64-neondot-ld128.S b/backends/xnnpack/third-party/xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-4x16c4-minmax-rndnu-asm-aarch64-neondot-ld128.S deleted file mode 100644 index df4ff1bf30b..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-4x16c4-minmax-rndnu-asm-aarch64-neondot-ld128.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. 
Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-4x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-cortex-a53-prfm.S b/backends/xnnpack/third-party/xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-4x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-cortex-a53-prfm.S deleted file mode 100644 index 6cae13ae375..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-4x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-cortex-a53-prfm.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__arm__) -#include -#endif /* defined(__arm__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-4x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-cortex-a53.S b/backends/xnnpack/third-party/xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-4x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-cortex-a53.S deleted file mode 100644 index cb2b68d4cfb..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-4x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-cortex-a53.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__arm__) -#include -#endif /* defined(__arm__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-4x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-cortex-a7-prfm.S b/backends/xnnpack/third-party/xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-4x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-cortex-a7-prfm.S deleted file mode 100644 index 4d8f0fada83..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-4x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-cortex-a7-prfm.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__arm__) -#include -#endif /* defined(__arm__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-4x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-cortex-a7.S b/backends/xnnpack/third-party/xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-4x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-cortex-a7.S deleted file mode 100644 index 8884fbd0b7d..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-4x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-cortex-a7.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__arm__) -#include -#endif /* defined(__arm__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-4x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-ld64-prfm.S b/backends/xnnpack/third-party/xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-4x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-ld64-prfm.S deleted file mode 100644 index 535adaf05fc..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-4x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-ld64-prfm.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. 
Do not modify */ - -#if defined(__arm__) -#include -#endif /* defined(__arm__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-4x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-ld64.S b/backends/xnnpack/third-party/xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-4x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-ld64.S deleted file mode 100644 index db1d0f0c541..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-4x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-ld64.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__arm__) -#include -#endif /* defined(__arm__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-4x8c4-minmax-rndnu-asm-aarch64-neondot-cortex-a55.S b/backends/xnnpack/third-party/xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-4x8c4-minmax-rndnu-asm-aarch64-neondot-cortex-a55.S deleted file mode 100644 index 7d787acd0fe..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-4x8c4-minmax-rndnu-asm-aarch64-neondot-cortex-a55.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-4x8c4-minmax-rndnu-asm-aarch64-neondot-ld128.S b/backends/xnnpack/third-party/xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-4x8c4-minmax-rndnu-asm-aarch64-neondot-ld128.S deleted file mode 100644 index 476a9c281a6..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-4x8c4-minmax-rndnu-asm-aarch64-neondot-ld128.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-1x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-cortex-a7-prfm.S b/backends/xnnpack/third-party/xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-1x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-cortex-a7-prfm.S deleted file mode 100644 index 50aa1283341..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-1x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-cortex-a7-prfm.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__arm__) -#include -#endif /* defined(__arm__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-1x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-cortex-a7.S b/backends/xnnpack/third-party/xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-1x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-cortex-a7.S deleted file mode 100644 index 7023ccdab57..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-1x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-cortex-a7.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. 
Do not modify */ - -#if defined(__arm__) -#include -#endif /* defined(__arm__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-cortex-a53-prfm.S b/backends/xnnpack/third-party/xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-cortex-a53-prfm.S deleted file mode 100644 index 02530de8e97..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-cortex-a53-prfm.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-cortex-a53.S b/backends/xnnpack/third-party/xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-cortex-a53.S deleted file mode 100644 index 93e9dbc269f..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-cortex-a53.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-cortex-a75-prfm.S b/backends/xnnpack/third-party/xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-cortex-a75-prfm.S deleted file mode 100644 index 0c208870498..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-cortex-a75-prfm.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-cortex-a75.S b/backends/xnnpack/third-party/xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-cortex-a75.S deleted file mode 100644 index 12791167760..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-cortex-a75.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-ld64-prfm.S b/backends/xnnpack/third-party/xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-ld64-prfm.S deleted file mode 100644 index 983ab5b46f3..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-ld64-prfm.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. 
Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-ld64.S b/backends/xnnpack/third-party/xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-ld64.S deleted file mode 100644 index bf78942f8b3..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-ld64.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-4x16c4-minmax-fp32-asm-aarch64-neondot-cortex-a55.S b/backends/xnnpack/third-party/xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-4x16c4-minmax-fp32-asm-aarch64-neondot-cortex-a55.S deleted file mode 100644 index e34256d00ad..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-4x16c4-minmax-fp32-asm-aarch64-neondot-cortex-a55.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-4x16c4-minmax-fp32-asm-aarch64-neondot-ld128.S b/backends/xnnpack/third-party/xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-4x16c4-minmax-fp32-asm-aarch64-neondot-ld128.S deleted file mode 100644 index 6f3cb0604db..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-4x16c4-minmax-fp32-asm-aarch64-neondot-ld128.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-4x16c4-minmax-rndnu-asm-aarch64-neondot-cortex-a55.S b/backends/xnnpack/third-party/xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-4x16c4-minmax-rndnu-asm-aarch64-neondot-cortex-a55.S deleted file mode 100644 index ed634476963..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-4x16c4-minmax-rndnu-asm-aarch64-neondot-cortex-a55.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-4x16c4-minmax-rndnu-asm-aarch64-neondot-ld128.S b/backends/xnnpack/third-party/xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-4x16c4-minmax-rndnu-asm-aarch64-neondot-ld128.S deleted file mode 100644 index bba9d3f80fa..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-4x16c4-minmax-rndnu-asm-aarch64-neondot-ld128.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. 
Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-4x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-cortex-a53-prfm.S b/backends/xnnpack/third-party/xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-4x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-cortex-a53-prfm.S deleted file mode 100644 index bd5da12fca5..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-4x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-cortex-a53-prfm.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__arm__) -#include -#endif /* defined(__arm__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-4x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-cortex-a53.S b/backends/xnnpack/third-party/xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-4x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-cortex-a53.S deleted file mode 100644 index 6fd78eff2cb..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-4x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-cortex-a53.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__arm__) -#include -#endif /* defined(__arm__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-4x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-cortex-a7-prfm.S b/backends/xnnpack/third-party/xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-4x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-cortex-a7-prfm.S deleted file mode 100644 index 1d42c82e33e..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-4x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-cortex-a7-prfm.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__arm__) -#include -#endif /* defined(__arm__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-4x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-cortex-a7.S b/backends/xnnpack/third-party/xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-4x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-cortex-a7.S deleted file mode 100644 index fd8c77710d3..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-4x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-cortex-a7.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__arm__) -#include -#endif /* defined(__arm__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-4x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-ld64-prfm.S b/backends/xnnpack/third-party/xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-4x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-ld64-prfm.S deleted file mode 100644 index 070cb137832..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-4x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-ld64-prfm.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. 
Do not modify */ - -#if defined(__arm__) -#include -#endif /* defined(__arm__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-4x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-ld64.S b/backends/xnnpack/third-party/xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-4x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-ld64.S deleted file mode 100644 index c20918787fd..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-4x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-ld64.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__arm__) -#include -#endif /* defined(__arm__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-4x8c4-minmax-rndnu-asm-aarch64-neondot-cortex-a55.S b/backends/xnnpack/third-party/xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-4x8c4-minmax-rndnu-asm-aarch64-neondot-cortex-a55.S deleted file mode 100644 index 3da0c8fa047..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-4x8c4-minmax-rndnu-asm-aarch64-neondot-cortex-a55.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-4x8c4-minmax-rndnu-asm-aarch64-neondot-ld128.S b/backends/xnnpack/third-party/xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-4x8c4-minmax-rndnu-asm-aarch64-neondot-ld128.S deleted file mode 100644 index 65df8c05987..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-4x8c4-minmax-rndnu-asm-aarch64-neondot-ld128.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__aarch64__) -#include -#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/u32-filterbank-accumulate/u32-filterbank-accumulate-asm-aarch32-arm-x1.S b/backends/xnnpack/third-party/xnnpack_wrappers/u32-filterbank-accumulate/u32-filterbank-accumulate-asm-aarch32-arm-x1.S deleted file mode 100644 index 4cab64be090..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/u32-filterbank-accumulate/u32-filterbank-accumulate-asm-aarch32-arm-x1.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__arm__) -#include -#endif /* defined(__arm__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/u32-filterbank-accumulate/u32-filterbank-accumulate-asm-aarch32-neon-x1.S b/backends/xnnpack/third-party/xnnpack_wrappers/u32-filterbank-accumulate/u32-filterbank-accumulate-asm-aarch32-neon-x1.S deleted file mode 100644 index 00a1f70b75b..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/u32-filterbank-accumulate/u32-filterbank-accumulate-asm-aarch32-neon-x1.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. 
Do not modify */ - -#if defined(__arm__) -#include -#endif /* defined(__arm__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/u32-filterbank-accumulate/u32-filterbank-accumulate-asm-aarch32-neon-x2.S b/backends/xnnpack/third-party/xnnpack_wrappers/u32-filterbank-accumulate/u32-filterbank-accumulate-asm-aarch32-neon-x2.S deleted file mode 100644 index 1bcc881720e..00000000000 --- a/backends/xnnpack/third-party/xnnpack_wrappers/u32-filterbank-accumulate/u32-filterbank-accumulate-asm-aarch32-neon-x2.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Auto-generated by generate-wrappers.py script. Do not modify */ - -#if defined(__arm__) -#include -#endif /* defined(__arm__) */ diff --git a/backends/xnnpack/xnnpack_preprocess.py b/backends/xnnpack/xnnpack_preprocess.py index b7ee440c289..4548de4940a 100644 --- a/backends/xnnpack/xnnpack_preprocess.py +++ b/backends/xnnpack/xnnpack_preprocess.py @@ -9,11 +9,13 @@ from typing import Dict, final, List import torch -from executorch.backends.xnnpack.operators.node_visitor import get_node_visitors -from executorch.backends.xnnpack.passes import XNNPACKPassManager -from executorch.backends.xnnpack.passes.convert_to_linear import ConvertToLinearPass -from executorch.backends.xnnpack.passes.tag_implicit_q_dq_pass import TagImplicitQDqPass +from executorch.backends.xnnpack._passes import XNNPACKPassManager +from executorch.backends.xnnpack._passes.convert_to_linear import ConvertToLinearPass +from executorch.backends.xnnpack._passes.tag_implicit_q_dq_pass import ( + TagImplicitQDqPass, +) +from executorch.backends.xnnpack.operators.node_visitor import get_node_visitors from executorch.backends.xnnpack.serialization.xnnpack_graph_schema import ( ConstantDataOffset, diff --git a/build/Codegen.cmake b/build/Codegen.cmake index 381cd0958fd..435b3d24802 100644 --- a/build/Codegen.cmake +++ b/build/Codegen.cmake @@ -154,7 +154,7 @@ function(gen_custom_ops_aot_lib) if(TARGET portable_lib) target_link_libraries(${GEN_LIB_NAME} PRIVATE portable_lib) else() - target_link_libraries(${GEN_LIB_NAME} PRIVATE executorch_no_prim_ops) + target_link_libraries(${GEN_LIB_NAME} PRIVATE executorch_core) endif() endfunction() diff --git a/build/Utils.cmake b/build/Utils.cmake index bf04fa1b15c..246bd68c837 100644 --- a/build/Utils.cmake +++ b/build/Utils.cmake @@ -68,6 +68,9 @@ function(executorch_print_configuration_summary) message(STATUS " EXECUTORCH_BUILD_EXTENSION_TENSOR : " "${EXECUTORCH_BUILD_EXTENSION_TENSOR}" ) + message(STATUS " EXECUTORCH_BUILD_EXTENSION_TRAINING : " + "${EXECUTORCH_BUILD_EXTENSION_TRAINING}" + ) message( STATUS " EXECUTORCH_BUILD_FLATC : ${EXECUTORCH_BUILD_FLATC}" @@ -100,7 +103,7 @@ function(executorch_print_configuration_summary) "${EXECUTORCH_BUILD_KERNELS_QUANTIZED}" ) message( - STATUS " EXECUTORCH_BUILD_SDK : ${EXECUTORCH_BUILD_SDK}" + STATUS " EXECUTORCH_BUILD_DEVTOOLS : ${EXECUTORCH_BUILD_DEVTOOLS}" ) message( STATUS @@ -184,11 +187,20 @@ function(extract_sources sources_file) set(executorch_root ${CMAKE_CURRENT_SOURCE_DIR}) endif() + if(ANDROID_ABI) + if("${ANDROID_ABI}" STREQUAL "arm64-v8a") + set(target_platforms_arg "--target-platforms=shim//:android-arm64") + elseif("${ANDROID_ABI}" STREQUAL "x86_64") + set(target_platforms_arg "--target-platforms=shim//:android-x86_64") + else() + message(FATAL_ERROR "Unsupported ANDROID_ABI setting ${ANDROID_ABI}. 
Please add it here!") + endif() + endif() execute_process( COMMAND ${PYTHON_EXECUTABLE} ${executorch_root}/build/extract_sources.py --config=${executorch_root}/build/cmake_deps.toml --out=${sources_file} - --buck2=${BUCK2} + --buck2=${BUCK2} ${target_platforms_arg} OUTPUT_VARIABLE gen_srcs_output ERROR_VARIABLE gen_srcs_error RESULT_VARIABLE gen_srcs_exit_code @@ -235,15 +247,14 @@ function(resolve_buck2) OUTPUT_VARIABLE resolve_buck2_output ERROR_VARIABLE resolve_buck2_error RESULT_VARIABLE resolve_buck2_exit_code - WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} + WORKING_DIRECTORY ${executorch_root} OUTPUT_STRIP_TRAILING_WHITESPACE ) + # $BUCK2 is a copy of the var from the parent scope. This block will set + # $buck2 to the value we want to return. if(resolve_buck2_exit_code EQUAL 0) - set(BUCK2 - ${resolve_buck2_output} - PARENT_SCOPE - ) + set(buck2 ${resolve_buck2_output}) message(STATUS "Resolved buck2 as ${resolve_buck2_output}.") elseif(resolve_buck2_exit_code EQUAL 2) # Wrong buck version used. Stop here to ensure that the user sees the error. @@ -254,17 +265,22 @@ function(resolve_buck2) message(WARNING "${resolve_buck2_error}") if("${BUCK2}" STREQUAL "") - set(BUCK2 - "buck2" - PARENT_SCOPE - ) + set(buck2 "buck2") endif() endif() + # Update the var in the parent scope. Note that this does not modify our + # local $BUCK2 value. + set(BUCK2 "${buck2}" PARENT_SCOPE) + # The buck2 daemon can get stuck. Killing it can help. message(STATUS "Killing buck2 daemon") execute_process( - COMMAND "${BUCK2} kill" WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} + # Note that we need to use the local buck2 variable. BUCK2 is only set in + # the parent scope, and can still be empty in this scope. + COMMAND "${buck2} kill" + WORKING_DIRECTORY ${executorch_root} + COMMAND_ECHO STDOUT ) endfunction() diff --git a/build/build_android_llm_demo.sh b/build/build_android_llm_demo.sh index d8de4cfd94e..9b1100c6978 100644 --- a/build/build_android_llm_demo.sh +++ b/build/build_android_llm_demo.sh @@ -42,11 +42,11 @@ build_android_native_library() { -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \ -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \ - -DEXECUTORCH_BUILD_NEURON=ON \ - -DNEURON_BUFFER_ALLOCATOR_LIB="$NEURON_BUFFER_ALLOCATOR_LIB" \ + -DEXECUTORCH_BUILD_NEURON="${EXECUTORCH_BUILD_NEURON}" \ + -DNEURON_BUFFER_ALLOCATOR_LIB="${NEURON_BUFFER_ALLOCATOR_LIB}" \ -DEXECUTORCH_BUILD_QNN="${EXECUTORCH_BUILD_QNN}" \ -DQNN_SDK_ROOT="${QNN_SDK_ROOT}" \ - -DCMAKE_BUILD_TYPE=RelWithDebInfo \ + -DCMAKE_BUILD_TYPE=Release \ -B"${CMAKE_OUT}" if [ "$(uname)" == "Darwin" ]; then @@ -54,7 +54,7 @@ build_android_native_library() { else CMAKE_JOBS=$(( $(nproc) - 1 )) fi - cmake --build "${CMAKE_OUT}" -j "${CMAKE_JOBS}" --target install --config RelWithDebInfo + cmake --build "${CMAKE_OUT}" -j "${CMAKE_JOBS}" --target install --config Release cmake extension/android \ -DCMAKE_TOOLCHAIN_FILE=${ANDROID_NDK}/build/cmake/android.toolchain.cmake \ @@ -67,19 +67,15 @@ build_android_native_library() { -DNEURON_BUFFER_ALLOCATOR_LIB="$NEURON_BUFFER_ALLOCATOR_LIB" \ -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \ -DEXECUTORCH_BUILD_LLAMA_JNI=ON \ - -DCMAKE_BUILD_TYPE=RelWithDebInfo \ + -DCMAKE_BUILD_TYPE=Release \ -B"${CMAKE_OUT}"/extension/android - cmake --build "${CMAKE_OUT}"/extension/android -j "${CMAKE_JOBS}" --config RelWithDebInfo + cmake --build "${CMAKE_OUT}"/extension/android -j "${CMAKE_JOBS}" --config Release # Copy artifacts to ABI specific directory mkdir -p "${BUILD_AAR_DIR}/jni/${ANDROID_ABI}" cp 
"${CMAKE_OUT}"/extension/android/*.so "${BUILD_AAR_DIR}/jni/${ANDROID_ABI}/" - cp "${CMAKE_OUT}"/backends/mediatek/libneuron_backend.so ${BUILD_AAR_DIR}/jni/${ANDROID_ABI}/ - cp /Users/cmodi/Documents/ai/clean/executorch/backends/mediatek/libneuron_buffer_allocator.so ${BUILD_AAR_DIR}/jni/${ANDROID_ABI}/ - cp /Users/cmodi/Documents/ai/clean/executorch/backends/mediatek/libneuronusdk_adapter.mtk.so ${BUILD_AAR_DIR}/jni/${ANDROID_ABI}/ - # Copy QNN related so library if [ -n "$QNN_SDK_ROOT" ] && [ "$ANDROID_ABI" == "arm64-v8a" ]; then cp "${CMAKE_OUT}"/lib/libqnn_executorch_backend.so "${BUILD_AAR_DIR}/jni/${ANDROID_ABI}/" @@ -92,6 +88,13 @@ build_android_native_library() { cp "${QNN_SDK_ROOT}"/lib/hexagon-v73/unsigned/libQnnHtpV73Skel.so "${BUILD_AAR_DIR}/jni/${ANDROID_ABI}/" cp "${QNN_SDK_ROOT}"/lib/hexagon-v75/unsigned/libQnnHtpV75Skel.so "${BUILD_AAR_DIR}/jni/${ANDROID_ABI}/" fi + + # Copy MTK related so library + if [ -n "$NEURON_BUFFER_ALLOCATOR_LIB" ] && [ "$ANDROID_ABI" == "arm64-v8a" ]; then + cp "${CMAKE_OUT}"/backends/mediatek/libneuron_backend.so ${BUILD_AAR_DIR}/jni/${ANDROID_ABI}/ + cp "$NEURON_BUFFER_ALLOCATOR_LIB"/libneuron_buffer_allocator.so ${BUILD_AAR_DIR}/jni/${ANDROID_ABI}/ + cp "$NEURON_BUFFER_ALLOCATOR_LIB"/libneuronusdk_adapter.mtk.so ${BUILD_AAR_DIR}/jni/${ANDROID_ABI}/ + fi } build_aar() { @@ -104,8 +107,8 @@ build_aar() { # between Java and JNI find jni -type f -name "libexecutorch_jni.so" -exec bash -c 'mv "$1" "${1/_jni/}"' bash {} \; # Zip all necessary files into the AAR file - zip -r executorch.aar libs jni/*/libexecutorch.so jni/*/libqnn*.so jni/*/libQnn*.so AndroidManifest.xml - zip -r executorch-llama.aar libs jni/*/libexecutorch.so jni/*/libqnn*.so jni/*/libQnn*.so jni/*/libneuron_backend.so jni/*/libneuron_buffer_allocator.so jni/*/libneuronusdk_adapter.mtk.so AndroidManifest.xml + zip -r executorch.aar libs jni/*/libexecutorch.so jni/*/libqnn*.so jni/*/libQnn*.so jni/*/libneuron_backend.so jni/*/libneuron_buffer_allocator.so jni/*/libneuronusdk_adapter.mtk.so AndroidManifest.xml + cp executorch.aar executorch-llama.aar popd } @@ -151,7 +154,6 @@ BUILD_AAR_DIR="$(mktemp -d)" export BUILD_AAR_DIR if [ -z "$ANDROID_ABIS" ]; then ANDROID_ABIS=("arm64-v8a" "x86_64") - ANDROID_ABIS=("arm64-v8a") fi export ANDROID_ABIS diff --git a/build/build_apple_frameworks.sh b/build/build_apple_frameworks.sh index 348111e2b4c..acd98583b8c 100755 --- a/build/build_apple_frameworks.sh +++ b/build/build_apple_frameworks.sh @@ -29,7 +29,7 @@ PLATFORM_TARGET=("17.0" "17.0" "10.15") FRAMEWORK_EXECUTORCH="executorch:\ libexecutorch.a,\ -libexecutorch_no_prim_ops.a,\ +libexecutorch_core.a,\ libextension_apple.a,\ libextension_data_loader.a,\ libextension_module.a,\ @@ -49,6 +49,7 @@ libXNNPACK.a,\ libcpuinfo.a,\ libpthreadpool.a,\ libxnnpack_backend.a,\ +libmicrokernels-prod.a,\ :" FRAMEWORK_KERNELS_CUSTOM="kernels_custom:\ @@ -57,7 +58,7 @@ libcustom_ops.a,\ FRAMEWORK_KERNELS_OPTIMIZED="kernels_optimized:\ liboptimized_kernels.a,\ -liboptimized_ops_lib.a,\ +liboptimized_native_cpu_ops_lib.a,\ :" FRAMEWORK_KERNELS_PORTABLE="kernels_portable:\ diff --git a/build/cmake_deps.toml b/build/cmake_deps.toml index 1430ea3a9ef..47bcf0ce4bc 100644 --- a/build/cmake_deps.toml +++ b/build/cmake_deps.toml @@ -20,14 +20,14 @@ buck_targets = [ "//runtime/executor:program", ] deps = [ - "executorch_no_prim_ops", + "executorch_core", ] filters = [ ".cpp$", ] -[targets.executorch_no_prim_ops] +[targets.executorch_core] buck_targets = [ "//runtime/executor:program_no_prim_ops", ] @@ -55,7 +55,7 @@ 
excludes = [ ] deps = [ "executorch", - "executorch_no_prim_ops", + "executorch_core", ] [targets.optimized_kernels] @@ -72,7 +72,8 @@ excludes = [ ] deps = [ "executorch", - "executorch_no_prim_ops", + "executorch_core", + "extension_threadpool", "portable_kernels", ] @@ -90,7 +91,7 @@ excludes = [ ] deps = [ "executorch", - "executorch_no_prim_ops", + "executorch_core", "portable_kernels", ] @@ -112,7 +113,7 @@ filters = [ excludes = [ ] deps = [ - "executorch_no_prim_ops", + "executorch_core", "executorch", ] @@ -126,7 +127,7 @@ filters = [ excludes = [ ] deps = [ - "executorch_no_prim_ops", + "executorch_core", "executorch", "portable_kernels", ] @@ -143,7 +144,7 @@ filters = [ ".cpp$", ] deps = [ - "executorch_no_prim_ops", + "executorch_core", "executorch", ] @@ -156,7 +157,7 @@ filters = [ ] deps = [ "executorch", - "executorch_no_prim_ops", + "executorch_core", "extension_data_loader", ] @@ -169,7 +170,7 @@ filters = [ ] deps = [ "executorch", - "executorch_no_prim_ops", + "executorch_core", ] [targets.extension_llm_runner] @@ -181,7 +182,7 @@ filters = [ ] deps = [ "executorch", - "executorch_no_prim_ops", + "executorch_core", "extension_module", "extension_runner_util", ] @@ -195,7 +196,47 @@ filters = [ ] deps = [ "executorch", - "executorch_no_prim_ops", + "executorch_core", +] + +[targets.extension_threadpool] +buck_targets = [ + "//extension/threadpool:threadpool", +] +filters = [ + ".cpp$", +] +deps = [ + "executorch", + "executorch_core", +] + +[targets.extension_training] +buck_targets = [ + "//extension/training/module:training_module", + "//extension/training/optimizer:sgd", +] +filters = [ + ".cpp$", +] +deps = [ + "executorch_core", +] + +[targets.train_xor] +buck_targets = [ + "//extension/training/examples/XOR:train_xor", +] +filters = [ + ".cpp$", +] +excludes = [ + "^codegen", +] +deps = [ + "executorch", + "executorch_core", + "portable_kernels", ] # ---------------------------------- extension end ---------------------------------- # ---------------------------------- binary start ---------------------------------- @@ -212,7 +253,7 @@ excludes = [ ] deps = [ "executorch", - "executorch_no_prim_ops", + "executorch_core", "portable_kernels", "quantized_kernels", ] @@ -228,7 +269,7 @@ excludes = [ "^codegen", ] deps = [ - "executorch_no_prim_ops", + "executorch_core", "executorch", ] # ---------------------------------- binary end ---------------------------------- @@ -245,7 +286,7 @@ excludes = [ ] deps = [ "executorch", - "executorch_no_prim_ops", + "executorch_core", "portable_kernels", ] @@ -258,7 +299,7 @@ filters = [ ] deps = [ "executorch", - "executorch_no_prim_ops", + "executorch_core", ] [targets.mps_schema] @@ -284,7 +325,7 @@ excludes = [ ] deps = [ "executorch", - "executorch_no_prim_ops", + "executorch_core", "xnnpack_backend", "portable_kernels", ] @@ -298,7 +339,7 @@ filters = [ ] deps = [ "executorch", - "executorch_no_prim_ops", + "executorch_core", ] [targets.xnnpack_schema] @@ -324,15 +365,19 @@ buck_targets = [ "//extension/llm/custom_ops:custom_ops", ] filters = [ - ".cpp$", + # Second clause is to pick up fht_neon.c/fht_avx.c from FFHT. TODO: + # remove filters and patch extract_sources.py's Buck query to fetch + # srcs; presumably filters is here to remove .h files. 
+ "(.cpp$)|(fht.*\\.c$)", ] excludes = [ "^codegen", ] deps = [ "executorch", - "executorch_no_prim_ops", + "executorch_core", "optimized_kernels", + "extension_threadpool", "xnnpack_backend", ] @@ -349,7 +394,7 @@ excludes = [ deps = [ "custom_ops", "executorch", - "executorch_no_prim_ops", + "executorch_core", "extension_data_loader", "extension_module", "portable_kernels", diff --git a/build/executorch-config.cmake b/build/executorch-config.cmake index c40f214133a..dc695bc846f 100644 --- a/build/executorch-config.cmake +++ b/build/executorch-config.cmake @@ -9,81 +9,118 @@ # is: # # find_package(executorch REQUIRED) - +# ------- +# +# Finds the ExecuTorch library +# +# This will define the following variables: +# +# EXECUTORCH_FOUND -- True if the system has the Torch library +# EXECUTORCH_INCLUDE_DIRS -- The include directories for torch +# EXECUTORCH_LIBRARIES -- Libraries to link against +# cmake_minimum_required(VERSION 3.19) -set(_root "${CMAKE_CURRENT_LIST_DIR}/../..") -set(required_lib_list executorch executorch_no_prim_ops portable_kernels) -foreach(lib ${required_lib_list}) - set(lib_var "LIB_${lib}") - add_library(${lib} STATIC IMPORTED) - find_library( - ${lib_var} ${lib} - HINTS "${_root}" - CMAKE_FIND_ROOT_PATH_BOTH +# Find prebuilt libportable_lib.so. If found, assuming current file is inside +# a pip package: +# /executorch/executorch-config.cmake. +# If not found, assuming current file is inside cmake-out: +# /cmake/ExecuTorch/executorch-config.cmake +find_library(_portable_lib_LIBRARY _portable_lib.so PATHS "${CMAKE_CURRENT_LIST_DIR}/extension/pybindings/") +message(WARNING "${CMAKE_CURRENT_LIST_DIR}/extension/pybindings/") +set(EXECUTORCH_LIBRARIES) +if(_portable_lib_LIBRARY) + # Assuming current file is /executorch/executorch-config.cmake + message(WARNING "portable library is found") + list(APPEND EXECUTORCH_LIBRARIES _portable_lib) + add_library(_portable_lib STATIC IMPORTED) + set(EXECUTORCH_INCLUDE_DIRS ${CMAKE_CURRENT_LIST_DIR}/include) + set_target_properties(_portable_lib PROPERTIES + IMPORTED_LOCATION "${_portable_lib_LIBRARY}" + INTERFACE_INCLUDE_DIRECTORIES "${EXECUTORCH_INCLUDE_DIRS}" + CXX_STANDARD 17 ) - set_target_properties(${lib} PROPERTIES IMPORTED_LOCATION "${${lib_var}}") - target_include_directories(${lib} INTERFACE ${_root}) -endforeach() +else() + # Assuming current file is /cmake/ExecuTorch/executorch-config.cmake + message(WARNING "portable library is not found") + set(_root "${CMAKE_CURRENT_LIST_DIR}/../..") + set(required_lib_list executorch executorch_core portable_kernels) + foreach(lib ${required_lib_list}) + set(lib_var "LIB_${lib}") + add_library(${lib} STATIC IMPORTED) + find_library( + ${lib_var} ${lib} + HINTS "${_root}" + CMAKE_FIND_ROOT_PATH_BOTH + ) + set_target_properties(${lib} PROPERTIES IMPORTED_LOCATION "${${lib_var}}") + target_include_directories(${lib} INTERFACE ${_root}) + endforeach() -target_link_libraries(executorch INTERFACE executorch_no_prim_ops) + target_link_libraries(executorch INTERFACE executorch_core) -if(CMAKE_BUILD_TYPE MATCHES "Debug") - set(FLATCCRT_LIB flatccrt_d) -else() - set(FLATCCRT_LIB flatccrt) -endif() + if(CMAKE_BUILD_TYPE MATCHES "Debug") + set(FLATCCRT_LIB flatccrt_d) + else() + set(FLATCCRT_LIB flatccrt) + endif() -set(lib_list - etdump - bundled_program - extension_data_loader - ${FLATCCRT_LIB} - coremldelegate - mpsdelegate - neuron_backend - qnn_executorch_backend - portable_ops_lib - extension_module - extension_module_static - extension_runner_util - extension_tensor - extension_threadpool 
- xnnpack_backend - XNNPACK - cpuinfo - pthreadpool - vulkan_backend - optimized_kernels - cpublas - eigen_blas - optimized_ops_lib - optimized_native_cpu_ops_lib - quantized_kernels - quantized_ops_lib - quantized_ops_aot_lib -) -foreach(lib ${lib_list}) - # Name of the variable which stores result of the find_library search - set(lib_var "LIB_${lib}") - find_library( - ${lib_var} ${lib} - HINTS "${_root}" - CMAKE_FIND_ROOT_PATH_BOTH + set(lib_list + etdump + bundled_program + extension_data_loader + ${FLATCCRT_LIB} + coremldelegate + mpsdelegate + neuron_backend + qnn_executorch_backend + portable_ops_lib + extension_module + extension_module_static + extension_runner_util + extension_tensor + extension_threadpool + extension_training + xnnpack_backend + # Start XNNPACK Lib Deps + XNNPACK + microkernels-prod + kleidiai + # End XNNPACK Lib Deps + cpuinfo + pthreadpool + vulkan_backend + optimized_kernels + cpublas + eigen_blas + optimized_ops_lib + optimized_native_cpu_ops_lib + quantized_kernels + quantized_ops_lib + quantized_ops_aot_lib ) - if(NOT ${lib_var}) - message("${lib} library is not found. - If needed rebuild with the proper options in CMakeLists.txt" + foreach(lib ${lib_list}) + # Name of the variable which stores result of the find_library search + set(lib_var "LIB_${lib}") + find_library( + ${lib_var} ${lib} + HINTS "${_root}" + CMAKE_FIND_ROOT_PATH_BOTH ) - else() - if("${lib}" STREQUAL "extension_module" AND (NOT CMAKE_TOOLCHAIN_IOS)) - add_library(${lib} SHARED IMPORTED) + if(NOT ${lib_var}) + message("${lib} library is not found. + If needed rebuild with the proper options in CMakeLists.txt" + ) else() - # Building a share library on iOS requires code signing, so it's easier to - # keep all libs as static when CMAKE_TOOLCHAIN_IOS is used - add_library(${lib} STATIC IMPORTED) + if("${lib}" STREQUAL "extension_module" AND (NOT CMAKE_TOOLCHAIN_IOS)) + add_library(${lib} SHARED IMPORTED) + else() + # Building a share library on iOS requires code signing, so it's easier to + # keep all libs as static when CMAKE_TOOLCHAIN_IOS is used + add_library(${lib} STATIC IMPORTED) + endif() + set_target_properties(${lib} PROPERTIES IMPORTED_LOCATION "${${lib_var}}") + target_include_directories(${lib} INTERFACE ${_root}) endif() - set_target_properties(${lib} PROPERTIES IMPORTED_LOCATION "${${lib_var}}") - target_include_directories(${lib} INTERFACE ${_root}) - endif() -endforeach() + endforeach() +endif() diff --git a/build/extract_sources.py b/build/extract_sources.py index ce8b3de9812..5004fe0c508 100755 --- a/build/extract_sources.py +++ b/build/extract_sources.py @@ -11,7 +11,7 @@ import re from enum import Enum -from typing import Any, Optional, Sequence +from typing import Any, List, Optional, Sequence from buck_util import Buck2Runner @@ -96,7 +96,12 @@ def __init__( else: self._config[k] = v - def get_sources(self, graph: "Graph", runner: Buck2Runner) -> frozenset[str]: + def get_sources( + self, graph: "Graph", runner: Buck2Runner, buck_args: Optional[List[str]] + ) -> frozenset[str]: + if buck_args is None: + buck_args = [] + if self._state == Target._InitState.READY: return self._sources # Detect cycles. @@ -113,7 +118,7 @@ def get_sources(self, graph: "Graph", runner: Buck2Runner) -> frozenset[str]: ) # Get the complete list of source files that this target depends on. - sources: set[str] = set(runner.run(["cquery", query])) + sources: set[str] = set(runner.run(["cquery", query] + buck_args)) # Keep entries that match all of the filters. 
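As a rough, self-contained sketch of the flow these extract_sources.py changes set up (the query string and file paths below are illustrative placeholders; the platform label and the filter pattern are the ones that appear elsewhere in this patch):

    import re
    from typing import List, Optional

    def make_buck_args(target_platforms: Optional[str]) -> List[str]:
        # Mirrors main() in extract_sources.py: an optional --target-platforms
        # value becomes extra arguments appended to every `buck2 cquery` run.
        return ["--target-platforms", target_platforms] if target_platforms else []

    # e.g. the label Utils.cmake passes when ANDROID_ABI is arm64-v8a
    buck_args = make_buck_args("shim//:android-arm64")
    query = "inputs(deps('//runtime/executor:program'))"  # placeholder query string
    cmd = ["cquery", query] + buck_args  # what get_sources() hands to the Buck2 runner

    # The paths buck2 returns are then kept only if they match every configured
    # filter, e.g. the new custom_ops pattern from cmake_deps.toml:
    filters = [re.compile(r"(.cpp$)|(fht.*\.c$)")]
    candidates = [
        "extension/llm/custom_ops/op_sdpa.cpp",  # kept: .cpp source (illustrative path)
        "third-party/FFHT/fht_neon.c",           # kept: fht*.c from FFHT (illustrative path)
        "extension/llm/custom_ops/op_sdpa.h",    # dropped: header
    ]
    kept = [f for f in candidates if all(p.search(f) for p in filters)]
    print(cmd, kept)

This is only a sketch of the intended behavior, not the implementation itself; the actual query construction and filter application live in get_sources() below.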
filters = [re.compile(p) for p in self._config.get("filters", [])] @@ -128,7 +133,9 @@ def get_sources(self, graph: "Graph", runner: Buck2Runner) -> frozenset[str]: # its deps. Remove entries that are already covered by the transitive # set of dependencies. for dep in self._config.get("deps", []): - sources.difference_update(graph.by_name[dep].get_sources(graph, runner)) + sources.difference_update( + graph.by_name[dep].get_sources(graph, runner, buck_args) + ) self._sources = frozenset(sources) self._state = Target._InitState.READY @@ -173,6 +180,9 @@ def parse_args() -> argparse.Namespace: metavar="file", help="Path to the file to generate.", ) + parser.add_argument( + "--target-platforms", help="--target-platforms to pass to buck cquery, if any." + ) return parser.parse_args() @@ -199,8 +209,12 @@ def main(): # Run the queries and get the lists of source files. target_to_srcs: dict[str, list[str]] = {} runner: Buck2Runner = Buck2Runner(args.buck2) + buck_args = [] + if args.target_platforms: + buck_args = ["--target-platforms"] + buck_args.append(args.target_platforms) for name, target in graph.by_name.items(): - target_to_srcs[name] = sorted(target.get_sources(graph, runner)) + target_to_srcs[name] = sorted(target.get_sources(graph, runner, buck_args)) # Generate the requested format. output: bytes diff --git a/build/packaging/env_var_script_linux.sh b/build/packaging/env_var_script_linux.sh index 6379dee6b5a..24f2fcb3c72 100644 --- a/build/packaging/env_var_script_linux.sh +++ b/build/packaging/env_var_script_linux.sh @@ -11,6 +11,10 @@ # Enable pybindings so that users can execute ExecuTorch programs from python. export EXECUTORCH_BUILD_PYBIND=1 +# Override extension suffix to be ".so", skipping package info such as +# "cpython-311-x86_64-linux-gnu" +export SETUPTOOLS_EXT_SUFFIX=".so" + # Ensure that CMAKE_ARGS is defined before referencing it. Defaults to empty # if not defined. export CMAKE_ARGS="${CMAKE_ARGS:-}" diff --git a/build/packaging/env_var_script_m1.sh b/build/packaging/env_var_script_m1.sh index da1192455f6..b9e3e6cf543 100644 --- a/build/packaging/env_var_script_m1.sh +++ b/build/packaging/env_var_script_m1.sh @@ -11,6 +11,10 @@ # Enable pybindings so that users can execute ExecuTorch programs from python. export EXECUTORCH_BUILD_PYBIND=1 +# Override extension suffix to be ".so", skipping package info such as +# "cpython-311-darwin" +export SETUPTOOLS_EXT_SUFFIX=".so" + # Ensure that CMAKE_ARGS is defined before referencing it. Defaults to empty # if not defined. 
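For context on the SETUPTOOLS_EXT_SUFFIX override exported above, a minimal sketch of what it changes, assuming a setuptools version that honors that environment variable (presumably the goal is a predictable _portable_lib.so name, which matches what executorch-config.cmake now searches for):

    import os
    import sysconfig

    # Default suffix baked into the interpreter, e.g. ".cpython-311-darwin.so"
    # or ".cpython-311-x86_64-linux-gnu.so" depending on the platform.
    default_suffix = sysconfig.get_config_var("EXT_SUFFIX")

    # With the override exported by these packaging scripts, the built
    # extension is expected to use a bare ".so" suffix instead.
    suffix = os.environ.get("SETUPTOOLS_EXT_SUFFIX", default_suffix)

    print(f"_portable_lib{suffix}")  # e.g. "_portable_lib.so" when overridden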
export CMAKE_ARGS="${CMAKE_ARGS:-}" diff --git a/devtools/bundled_program/schema/test/TARGETS b/devtools/bundled_program/schema/test/TARGETS index ca98d32a478..6c0b5ff7260 100644 --- a/devtools/bundled_program/schema/test/TARGETS +++ b/devtools/bundled_program/schema/test/TARGETS @@ -1,5 +1,3 @@ -# @noautodeps - load("@fbcode_macros//build_defs:python_unittest.bzl", "python_unittest") oncall("executorch") diff --git a/devtools/bundled_program/serialize/test/TARGETS b/devtools/bundled_program/serialize/test/TARGETS index dd92f63f2dd..4c6bfbe3d5e 100644 --- a/devtools/bundled_program/serialize/test/TARGETS +++ b/devtools/bundled_program/serialize/test/TARGETS @@ -1,5 +1,3 @@ -# @noautodeps - load("@fbcode_macros//build_defs:python_unittest.bzl", "python_unittest") oncall("executorch") diff --git a/devtools/inspector/_inspector.py b/devtools/inspector/_inspector.py index 0539d4f5e4b..3691cd0234d 100644 --- a/devtools/inspector/_inspector.py +++ b/devtools/inspector/_inspector.py @@ -967,6 +967,7 @@ class Inspector: def __init__( self, etdump_path: Optional[str] = None, + etdump_data: Optional[bytes] = None, etrecord: Optional[Union[ETRecord, str]] = None, source_time_scale: TimeScale = TimeScale.NS, target_time_scale: TimeScale = TimeScale.MS, @@ -980,11 +981,12 @@ def __init__( enable_module_hierarchy: bool = False, ) -> None: r""" - Initialize an `Inspector` instance with the underlying `EventBlock`\ s populated with data from the provided ETDump path + Initialize an `Inspector` instance with the underlying `EventBlock`\ s populated with data from the provided ETDump path or binary, and optional ETRecord path. Args: - etdump_path: Path to the ETDump file. + etdump_path: Path to the ETDump file. Either this parameter or etdump_data should be provided. + etdump_data: ETDump binary. Either this parameter or etdump_path should be provided. etrecord: Optional ETRecord object or path to the ETRecord file. source_time_scale: The time scale of the performance data retrieved from the runtime. The default time hook implentation in the runtime returns NS. target_time_scale: The target time scale to which the users want their performance data converted to. Defaults to MS. @@ -1025,8 +1027,13 @@ def __init__( else: raise TypeError("Unsupported ETRecord type") + if (etdump_path is None) == (etdump_data is None): + raise ValueError( + "Expecting exactly one of etdump_path or etdump_data to be specified." 
+ ) + # Create EventBlocks from ETDump - etdump = gen_etdump_object(etdump_path=etdump_path) + etdump = gen_etdump_object(etdump_path=etdump_path, etdump_data=etdump_data) if debug_buffer_path is not None: with open(debug_buffer_path, "rb") as f: output_buffer = f.read() diff --git a/devtools/inspector/_inspector_utils.py b/devtools/inspector/_inspector_utils.py index 5f04e2d0413..a2989c224e1 100644 --- a/devtools/inspector/_inspector_utils.py +++ b/devtools/inspector/_inspector_utils.py @@ -279,13 +279,20 @@ def _extract_debug_handles(graph: OperatorGraph): return debug_handle_to_op_node_map -def gen_etdump_object(etdump_path: Optional[str] = None) -> ETDumpFlatCC: +def gen_etdump_object( + etdump_path: Optional[str] = None, etdump_data: Optional[bytes] = None +) -> ETDumpFlatCC: # Gen event blocks from etdump - if etdump_path is None: - raise ValueError("Etdump_path must be specified.") - with open(etdump_path, "rb") as buff: - etdump = deserialize_from_etdump_flatcc(buff.read()) - return etdump + if etdump_data is None and etdump_path is not None: + with open(etdump_path, "rb") as buff: + etdump_data = buff.read() + + if etdump_data is None: + raise ValueError( + "Unable to get ETDump data. One and only one of etdump_path and etdump_data must be specified." + ) + + return deserialize_from_etdump_flatcc(etdump_data) def plot_metric(result: List[float], metric_name: str): diff --git a/devtools/inspector/tests/inspector_test.py b/devtools/inspector/tests/inspector_test.py index 34c96eef534..4b3f8075d8e 100644 --- a/devtools/inspector/tests/inspector_test.py +++ b/devtools/inspector/tests/inspector_test.py @@ -86,7 +86,9 @@ def test_inspector_constructor(self): # Assert that expected functions are called mock_parse_etrecord.assert_called_once_with(etrecord_path=ETRECORD_PATH) - mock_gen_etdump.assert_called_once_with(etdump_path=ETDUMP_PATH) + mock_gen_etdump.assert_called_once_with( + etdump_path=ETDUMP_PATH, etdump_data=None + ) mock_gen_from_etdump.assert_called_once() # Because we mocked parse_etrecord() to return None, this method shouldn't be called mock_gen_graphs_from_etrecord.assert_not_called() diff --git a/devtools/targets.bzl b/devtools/targets.bzl index 38c2e6e820e..17d9e89cad3 100644 --- a/devtools/targets.bzl +++ b/devtools/targets.bzl @@ -4,5 +4,5 @@ def build_sdk(): def get_sdk_flags(): sdk_flags = [] if build_sdk(): - sdk_flags += ["-DEXECUTORCH_BUILD_SDK"] + sdk_flags += ["-DEXECUTORCH_BUILD_DEVTOOLS"] return sdk_flags diff --git a/docs/source/_static/img/benchmark-infra.png b/docs/source/_static/img/benchmark-infra.png new file mode 100644 index 00000000000..a5d30774257 Binary files /dev/null and b/docs/source/_static/img/benchmark-infra.png differ diff --git a/docs/source/_static/img/chat.png b/docs/source/_static/img/chat.png new file mode 100644 index 00000000000..e7ed934519d Binary files /dev/null and b/docs/source/_static/img/chat.png differ diff --git a/docs/source/_static/img/chat_response.png b/docs/source/_static/img/chat_response.png new file mode 100644 index 00000000000..714265276fe Binary files /dev/null and b/docs/source/_static/img/chat_response.png differ diff --git a/docs/source/_static/img/ios_benchmark_app.png b/docs/source/_static/img/ios_benchmark_app.png new file mode 100644 index 00000000000..e1892bba1df Binary files /dev/null and b/docs/source/_static/img/ios_benchmark_app.png differ diff --git a/docs/source/_static/img/ios_benchmark_app_macos.png b/docs/source/_static/img/ios_benchmark_app_macos.png new file mode 100644 index 
00000000000..c2776c09574 Binary files /dev/null and b/docs/source/_static/img/ios_benchmark_app_macos.png differ diff --git a/docs/source/_static/img/ios_benchmark_app_macos_signing.png b/docs/source/_static/img/ios_benchmark_app_macos_signing.png new file mode 100644 index 00000000000..faf081d2cf2 Binary files /dev/null and b/docs/source/_static/img/ios_benchmark_app_macos_signing.png differ diff --git a/docs/source/_static/img/ios_benchmark_app_signing.png b/docs/source/_static/img/ios_benchmark_app_signing.png new file mode 100644 index 00000000000..5f1763d6dd0 Binary files /dev/null and b/docs/source/_static/img/ios_benchmark_app_signing.png differ diff --git a/docs/source/_static/img/ios_benchmark_app_test_forward.png b/docs/source/_static/img/ios_benchmark_app_test_forward.png new file mode 100644 index 00000000000..b4f508cc294 Binary files /dev/null and b/docs/source/_static/img/ios_benchmark_app_test_forward.png differ diff --git a/docs/source/_static/img/ios_benchmark_app_test_generate.png b/docs/source/_static/img/ios_benchmark_app_test_generate.png new file mode 100644 index 00000000000..f0dd1d02eac Binary files /dev/null and b/docs/source/_static/img/ios_benchmark_app_test_generate.png differ diff --git a/docs/source/_static/img/ios_benchmark_app_test_load.png b/docs/source/_static/img/ios_benchmark_app_test_load.png new file mode 100644 index 00000000000..0f4c651c8f9 Binary files /dev/null and b/docs/source/_static/img/ios_benchmark_app_test_load.png differ diff --git a/docs/source/_static/img/ios_benchmark_app_tests.png b/docs/source/_static/img/ios_benchmark_app_tests.png new file mode 100644 index 00000000000..42e25ff2859 Binary files /dev/null and b/docs/source/_static/img/ios_benchmark_app_tests.png differ diff --git a/docs/source/_static/img/ios_demo_app.jpg b/docs/source/_static/img/ios_demo_app.jpg new file mode 100644 index 00000000000..076508d0e0d Binary files /dev/null and b/docs/source/_static/img/ios_demo_app.jpg differ diff --git a/docs/source/_static/img/ios_demo_app_choosing_package.png b/docs/source/_static/img/ios_demo_app_choosing_package.png new file mode 100644 index 00000000000..20599d7ea80 Binary files /dev/null and b/docs/source/_static/img/ios_demo_app_choosing_package.png differ diff --git a/docs/source/_static/img/ios_demo_app_llava.jpg b/docs/source/_static/img/ios_demo_app_llava.jpg new file mode 100644 index 00000000000..316d68b71bd Binary files /dev/null and b/docs/source/_static/img/ios_demo_app_llava.jpg differ diff --git a/docs/source/_static/img/ios_demo_app_mps.jpg b/docs/source/_static/img/ios_demo_app_mps.jpg new file mode 100644 index 00000000000..58114f869c6 Binary files /dev/null and b/docs/source/_static/img/ios_demo_app_mps.jpg differ diff --git a/docs/source/_static/img/ios_demo_app_swift_pm.png b/docs/source/_static/img/ios_demo_app_swift_pm.png new file mode 100644 index 00000000000..19e7a6726e1 Binary files /dev/null and b/docs/source/_static/img/ios_demo_app_swift_pm.png differ diff --git a/docs/source/_static/img/llava_example.png b/docs/source/_static/img/llava_example.png new file mode 100644 index 00000000000..ccac335ee65 Binary files /dev/null and b/docs/source/_static/img/llava_example.png differ diff --git a/docs/source/_static/img/load_complete_and_start_prompt.png b/docs/source/_static/img/load_complete_and_start_prompt.png new file mode 100644 index 00000000000..43d81f10d00 Binary files /dev/null and b/docs/source/_static/img/load_complete_and_start_prompt.png differ diff --git a/docs/source/_static/img/logs.png 
b/docs/source/_static/img/logs.png new file mode 100644 index 00000000000..e35227a1c0c Binary files /dev/null and b/docs/source/_static/img/logs.png differ diff --git a/docs/source/_static/img/memory_planning_inspection.png b/docs/source/_static/img/memory_planning_inspection.png new file mode 100644 index 00000000000..80a00669e76 Binary files /dev/null and b/docs/source/_static/img/memory_planning_inspection.png differ diff --git a/docs/source/_static/img/mtk_changes_to_shell_file.png b/docs/source/_static/img/mtk_changes_to_shell_file.png new file mode 100644 index 00000000000..7fa4e461863 Binary files /dev/null and b/docs/source/_static/img/mtk_changes_to_shell_file.png differ diff --git a/docs/source/_static/img/mtk_output.png b/docs/source/_static/img/mtk_output.png new file mode 100644 index 00000000000..e41d54c3561 Binary files /dev/null and b/docs/source/_static/img/mtk_output.png differ diff --git a/docs/source/_static/img/opening_the_app_details.png b/docs/source/_static/img/opening_the_app_details.png new file mode 100644 index 00000000000..60494ecc69d Binary files /dev/null and b/docs/source/_static/img/opening_the_app_details.png differ diff --git a/docs/source/_static/img/settings_menu.png b/docs/source/_static/img/settings_menu.png new file mode 100644 index 00000000000..028e6b55cd7 Binary files /dev/null and b/docs/source/_static/img/settings_menu.png differ diff --git a/docs/source/apple-runtime.md b/docs/source/apple-runtime.md index 2378ddc2bd2..023903db3b2 100644 --- a/docs/source/apple-runtime.md +++ b/docs/source/apple-runtime.md @@ -19,6 +19,19 @@ Link your binary with the ExecuTorch runtime and any backends or kernels used by ## Integration +### Setup + +#### CMake + +Building the Xcode project requires CMake. Installing via homebrew does not +typically work; instead, install the packaged application and commandline tools +globally: + +1. Download the macOS `.dmg` installer from https://cmake.org/download +2. Open the `.dmg` +3. Drag the CMake app to the `/Applications` folder +4. In a terminal, install the command line tools: `sudo /Applications/CMake.app/Contents/bin/cmake-gui --install` + ### Swift Package Manager The prebuilt ExecuTorch runtime, backend, and kernels are available as a [Swift PM](https://www.swift.org/documentation/package-manager/) package. diff --git a/docs/source/build-run-coreml.md b/docs/source/build-run-coreml.md index 52755773eed..9751dc066f2 100644 --- a/docs/source/build-run-coreml.md +++ b/docs/source/build-run-coreml.md @@ -87,7 +87,7 @@ cd executorch Note that profiling is supported on [macOS](https://developer.apple.com/macos) >= 14.4. -1. [Optional] Generate an [ETRecord](./sdk-etrecord.rst) when exporting your model. +1. [Optional] Generate an [ETRecord](./etrecord.rst) when exporting your model. ```bash cd executorch @@ -100,7 +100,7 @@ python3 -m examples.apple.coreml.scripts.export --model_name mv3 --generate_etre # Builds `coreml_executor_runner`. ./examples/apple/coreml/scripts/build_executor_runner.sh ``` -3. Run and generate an [ETDump](./sdk-etdump.md). +3. Run and generate an [ETDump](./etdump.md). ```bash cd executorch @@ -108,7 +108,7 @@ cd executorch ./coreml_executor_runner --model_path mv3_coreml_all.pte --profile_model --etdump_path etdump.etdp ``` -4. 
Create an instance of the [Inspector API](./sdk-inspector.rst) by passing in the [ETDump](./sdk-etdump.md) you have sourced from the runtime along with the optionally generated [ETRecord](./sdk-etrecord.rst) from step 1 or execute the following command in your terminal to display the profiling data table. +4. Create an instance of the [Inspector API](./model-inspector.rst) by passing in the [ETDump](./etdump.md) you have sourced from the runtime along with the optionally generated [ETRecord](./etrecord.rst) from step 1 or execute the following command in your terminal to display the profiling data table. ```bash python examples/apple/coreml/scripts/inspector_cli.py --etdump_path etdump.etdp --etrecord_path mv3_coreml.bin ``` diff --git a/docs/source/build-run-qualcomm-ai-engine-direct-backend.md b/docs/source/build-run-qualcomm-ai-engine-direct-backend.md index 230f007d3fc..5e43a63c760 100644 --- a/docs/source/build-run-qualcomm-ai-engine-direct-backend.md +++ b/docs/source/build-run-qualcomm-ai-engine-direct-backend.md @@ -59,9 +59,7 @@ This example is verified with SM8550 and SM8450. - Click the "Get Software" button to download a version of QNN SDK. - However, at the moment of updating this tutorial, the above website doesn't provide QNN SDK newer than 2.22.6. - The below is public links to download various QNN versions. Hope they can be publicly discoverable soon. - - [QNN 2.25.0](https://softwarecenter.qualcomm.com/api/download/software/qualcomm_neural_processing_sdk/v2.25.0.240728.zip) - - [QNN 2.24.0](https://softwarecenter.qualcomm.com/api/download/software/qualcomm_neural_processing_sdk/v2.24.0.240626.zip) - - [QNN 2.23.0](https://softwarecenter.qualcomm.com/api/download/software/qualcomm_neural_processing_sdk/v2.23.0.24.06.24.zip) + - [QNN 2.26.0](https://softwarecenter.qualcomm.com/api/download/software/qualcomm_neural_processing_sdk/v2.26.0.240828.zip) The directory with installed Qualcomm AI Engine Direct SDK looks like: ``` @@ -134,7 +132,7 @@ cmake .. \ -DCMAKE_INSTALL_PREFIX=$PWD \ -DEXECUTORCH_BUILD_QNN=ON \ -DQNN_SDK_ROOT=${QNN_SDK_ROOT} \ - -DEXECUTORCH_BUILD_SDK=ON \ + -DEXECUTORCH_BUILD_DEVTOOLS=ON \ -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ -DEXECUTORCH_ENABLE_EVENT_TRACER=ON \ @@ -149,6 +147,10 @@ cmake --build $PWD --target "PyQnnManagerAdaptor" "PyQnnWrapperAdaptor" -j$(npro # The filename might vary depending on your Python and host version. cp -f backends/qualcomm/PyQnnManagerAdaptor.cpython-310-x86_64-linux-gnu.so $EXECUTORCH_ROOT/backends/qualcomm/python cp -f backends/qualcomm/PyQnnWrapperAdaptor.cpython-310-x86_64-linux-gnu.so $EXECUTORCH_ROOT/backends/qualcomm/python + +# Workaround for fbs files in exir/_serialize +cp $EXECUTORCH_ROOT/schema/program.fbs $EXECUTORCH_ROOT/exir/_serialize/program.fbs +cp $EXECUTORCH_ROOT/schema/scalar_type.fbs $EXECUTORCH_ROOT/exir/_serialize/scalar_type.fbs ``` ### Runtime: @@ -166,7 +168,7 @@ cmake .. \ -DCMAKE_INSTALL_PREFIX=$PWD \ -DEXECUTORCH_BUILD_QNN=ON \ -DQNN_SDK_ROOT=$QNN_SDK_ROOT \ - -DEXECUTORCH_BUILD_SDK=ON \ + -DEXECUTORCH_BUILD_DEVTOOLS=ON \ -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ -DEXECUTORCH_ENABLE_EVENT_TRACER=ON \ @@ -207,9 +209,6 @@ We use deeplab-v3-resnet101 as an example in this tutorial. 
Run below commands t
```bash
cd $EXECUTORCH_ROOT
-# Workaround for fbs files in exir/_serialize
-cp schema/program.fbs exir/_serialize/program.fbs
-cp schema/scalar_type.fbs exir/_serialize/scalar_type.fbs
python -m examples.qualcomm.scripts.deeplab_v3 -b build-android -m SM8550 --compile_only --download
```
@@ -356,7 +355,7 @@ Please refer to `$EXECUTORCH_ROOT/examples/qualcomm/scripts/` and `EXECUTORCH_RO
 ## What is coming?

- - [llama2 and llama3](https://github.com/pytorch/executorch/pull/4030). Note that at the moment of writing, we still suffer from the quantization issue in llama2-7B and llama3-8B cases. Only storiesllama works well.
+ - Improve the performance for llama3-8B-Instruct and support batch prefill.
  - We will support pre-compiled binaries from [Qualcomm AI Hub](https://aihub.qualcomm.com/).

 ## FAQ
diff --git a/docs/source/bundled-io.md b/docs/source/bundled-io.md
new file mode 100644
index 00000000000..74c86cb8cc7
--- /dev/null
+++ b/docs/source/bundled-io.md
@@ -0,0 +1,554 @@
+# Bundled Program -- a Tool for ExecuTorch Model Validation
+
+## Introduction
+`BundledProgram` is a wrapper around the core ExecuTorch program designed to help users wrap test cases with the model they deploy. `BundledProgram` is not necessarily a core part of the program and is not needed for its execution, but it is particularly important for various other use cases, such as model correctness evaluation, including e2e testing during the model bring-up process.
+
+Overall, the procedure can be broken into two stages, and in each stage we support:
+
+* **Emit stage**: Bundling the test I/O cases along with the ExecuTorch program, serializing into flatbuffer.
+* **Runtime stage**: Accessing, executing, and verifying the bundled test cases during runtime.
+
+## Emit stage
+This stage mainly focuses on the creation of a `BundledProgram` and dumping it out to the disk as a flatbuffer file. The main procedure is as follows:
+1. Create a model and emit its ExecuTorch program.
+2. Construct a `List[MethodTestSuite]` to record all test cases that need to be bundled.
+3. Generate `BundledProgram` by using the emitted model and `List[MethodTestSuite]`.
+4. Serialize the `BundledProgram` and dump it out to the disk.
+
+### Step 1: Create a Model and Emit its ExecuTorch Program.
+
+An ExecuTorch program can be emitted from the user's model by using ExecuTorch APIs. Follow the [Generate Sample ExecuTorch program](./getting-started-setup.md) or [Exporting to ExecuTorch tutorial](./tutorials/export-to-executorch-tutorial).
+
+### Step 2: Construct `List[MethodTestSuite]` to hold test info
+
+In `BundledProgram`, we create two new classes, `MethodTestCase` and `MethodTestSuite`, to hold the essential info for ExecuTorch program verification.
+
+`MethodTestCase` represents a single test case. Each `MethodTestCase` contains inputs and expected outputs for a single execution.
+
+:::{dropdown} `MethodTestCase`
+
+```{eval-rst}
+.. autofunction:: executorch.devtools.bundled_program.config.MethodTestCase.__init__
+    :noindex:
+```
+:::
+
+`MethodTestSuite` contains all testing info for a single method, including a string representing the method name and a `List[MethodTestCase]` of all its test cases:
+
+:::{dropdown} `MethodTestSuite`
+
+```{eval-rst}
+.. autofunction:: executorch.devtools.bundled_program.config.MethodTestSuite
+    :noindex:
+```
+:::
+
+Since each model may have multiple inference methods, we need to generate a `List[MethodTestSuite]` to hold all the essential info.
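To make the two classes concrete before the full example below, here is a minimal sketch. It assumes a hypothetical eager `model` whose `forward` method takes the inputs collected in `example_input` (a list matching the method's signature); only the `MethodTestCase`/`MethodTestSuite` construction described in this document is used.

```python
from executorch.devtools.bundled_program.config import MethodTestCase, MethodTestSuite

# `model` and `example_input` are hypothetical placeholders for your eager
# module and one set of inputs matching its `forward` signature.
test_case = MethodTestCase(
    inputs=example_input,                       # inputs for one execution
    expected_outputs=(model(*example_input),),  # reference output from eager mode
)

method_test_suites = [
    MethodTestSuite(method_name="forward", test_cases=[test_case]),
]
```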
+
+
+### Step 3: Generate `BundledProgram`
+
+We provide the `BundledProgram` class under `executorch/devtools/bundled_program/core.py` to bundle the `ExecutorchProgram`-like variable, including
+ `ExecutorchProgram`, `MultiMethodExecutorchProgram` or `ExecutorchProgramManager`, with the `List[MethodTestSuite]`:
+
+:::{dropdown} `BundledProgram`
+
+```{eval-rst}
+.. autofunction:: executorch.devtools.bundled_program.core.BundledProgram.__init__
+    :noindex:
+```
+:::
+
+The constructor of `BundledProgram` will do a sanity check internally to see if the given `List[MethodTestSuite]` matches the given program's requirements. Specifically:
+1. The method_names of each `MethodTestSuite` in `List[MethodTestSuite]` should also exist in the program. Note that there is no need to set test cases for every method in the Program.
+2. The metadata of each test case should meet the requirements of the corresponding inference method's inputs.
+
+### Step 4: Serialize `BundledProgram` to Flatbuffer.
+
+To serialize `BundledProgram` so that the runtime APIs can use it, we provide two APIs, both under `executorch/devtools/bundled_program/serialize/__init__.py`.
+
+:::{dropdown} Serialize and Deserialize
+
+```{eval-rst}
+.. currentmodule:: executorch.devtools.bundled_program.serialize
+.. autofunction:: serialize_from_bundled_program_to_flatbuffer
+    :noindex:
+```
+
+```{eval-rst}
+.. currentmodule:: executorch.devtools.bundled_program.serialize
+.. autofunction:: deserialize_from_flatbuffer_to_bundled_program
+    :noindex:
+```
+:::
+
+### Emit Example
+
+Here is a flow highlighting how to generate a `BundledProgram` given a PyTorch model and the representative inputs we want to test it with.
+
+```python
+import torch
+
+from executorch.exir import to_edge
+from executorch.devtools import BundledProgram
+
+from executorch.devtools.bundled_program.config import MethodTestCase, MethodTestSuite
+from executorch.devtools.bundled_program.serialize import (
+    serialize_from_bundled_program_to_flatbuffer,
+)
+from torch.export import export, export_for_training
+
+
+# Step 1: ExecuTorch Program Export
+class SampleModel(torch.nn.Module):
+    """An example model with multiple methods. Each method has multiple inputs and a single output."""
+
+    def __init__(self) -> None:
+        super().__init__()
+        self.a: torch.Tensor = 3 * torch.ones(2, 2, dtype=torch.int32)
+        self.b: torch.Tensor = 2 * torch.ones(2, 2, dtype=torch.int32)
+
+    def forward(self, x: torch.Tensor, q: torch.Tensor) -> torch.Tensor:
+        z = x.clone()
+        torch.mul(self.a, x, out=z)
+        y = x.clone()
+        torch.add(z, self.b, out=y)
+        torch.add(y, q, out=y)
+        return y
+
+
+# Inference method name of SampleModel we want to bundle testcases to.
+# Notice that we do not need to bundle testcases for every inference method.
+method_name = "forward"
+model = SampleModel()
+
+# Inputs for graph capture.
+capture_input = (
+    (torch.rand(2, 2) - 0.5).to(dtype=torch.int32),
+    (torch.rand(2, 2) - 0.5).to(dtype=torch.int32),
+)
+
+# Export method's FX Graph.
+method_graph = export(
+    export_for_training(model, capture_input).module(),
+    capture_input,
+)
+
+
+# Emit the traced method into ET Program.
+et_program = to_edge(method_graph).to_executorch()
+
+# Step 2: Construct MethodTestSuite for Each Method
+
+# Prepare the Test Inputs.
+
+# Number of input sets to be verified
+n_input = 10
+
+# Input sets to be verified.
+inputs = [
+    # Each list below is an individual input set.
+    # The number of inputs, dtype and size of each input follow the Program's spec.
+ [ + (torch.rand(2, 2) - 0.5).to(dtype=torch.int32), + (torch.rand(2, 2) - 0.5).to(dtype=torch.int32), + ] + for _ in range(n_input) +] + +# Generate Test Suites +method_test_suites = [ + MethodTestSuite( + method_name=method_name, + test_cases=[ + MethodTestCase( + inputs=input, + expected_outputs=(getattr(model, method_name)(*input), ), + ) + for input in inputs + ], + ), +] + +# Step 3: Generate BundledProgram +bundled_program = BundledProgram(et_program, method_test_suites) + +# Step 4: Serialize BundledProgram to flatbuffer. +serialized_bundled_program = serialize_from_bundled_program_to_flatbuffer( + bundled_program +) +save_path = "bundled_program.bpte" +with open(save_path, "wb") as f: + f.write(serialized_bundled_program) + +``` + +We can also regenerate `BundledProgram` from flatbuffer file if needed: + +```python +from executorch.devtools.bundled_program.serialize import deserialize_from_flatbuffer_to_bundled_program +save_path = "bundled_program.bpte" +with open(save_path, "rb") as f: + serialized_bundled_program = f.read() + +regenerate_bundled_program = deserialize_from_flatbuffer_to_bundled_program(serialized_bundled_program) +``` + +## Runtime Stage +This stage mainly focuses on executing the model with the bundled inputs and and comparing the model's output with the bundled expected output. We provide multiple APIs to handle the key parts of it. + + +### Get ExecuTorch Program Pointer from `BundledProgram` Buffer +We need the pointer to ExecuTorch program to do the execution. To unify the process of loading and executing `BundledProgram` and Program flatbuffer, we create an API: + +:::{dropdown} `GetProgramData` + +```{eval-rst} +.. doxygenfunction:: torch::executor::bundled_program::GetProgramData +``` +::: + +Here's an example of how to use the `GetProgramData` API: +```c++ +// Assume that the user has read the contents of the file into file_data using +// whatever method works best for their application. The file could contain +// either BundledProgram data or Program data. +void* file_data = ...; +size_t file_data_len = ...; + +// If file_data contains a BundledProgram, GetProgramData() will return a +// pointer to the Program data embedded inside it. Otherwise it will return +// file_data, which already pointed to Program data. +const void* program_ptr; +size_t program_len; +status = torch::executor::bundled_program::GetProgramData( + file_data, file_data_len, &program_ptr, &program_len); +ET_CHECK_MSG( + status == Error::Ok, + "GetProgramData() failed with status 0x%" PRIx32, + status); +``` + +### Load Bundled Input to Method +To execute the program on the bundled input, we need to load the bundled input into the method. Here we provided an API called `torch::executor::bundled_program::LoadBundledInput`: + +:::{dropdown} `LoadBundledInput` + +```{eval-rst} +.. doxygenfunction:: torch::executor::bundled_program::LoadBundledInput +``` +::: + +### Verify the Method's Output. +We call `torch::executor::bundled_program::VerifyResultWithBundledExpectedOutput` to verify the method's output with bundled expected outputs. Here's the details of this API: + +:::{dropdown} `VerifyResultWithBundledExpectedOutput` + +```{eval-rst} +.. doxygenfunction:: torch::executor::bundled_program::VerifyResultWithBundledExpectedOutput +``` +::: + + +### Runtime Example + +Here we provide an example about how to run the bundled program step by step. 
Most of the code is borrowed from [executor_runner](https://github.com/pytorch/executorch/blob/main/examples/devtools/example_runner/example_runner.cpp); please review that file if you need more info and context:
+
+```c++
+// method_name is the name for the method we want to test
+// memory_manager is the executor::MemoryManager variable for executor memory allocation.
+// program is the ExecuTorch program.
+Result<Method> method = program->load_method(method_name, &memory_manager);
+
+ET_CHECK_MSG(
+    method.ok(),
+    "load_method() failed with status 0x%" PRIx32,
+    method.error());
+
+// Load testset_idx-th input in the buffer to plan
+status = torch::executor::bundled_program::LoadBundledInput(
+    *method,
+    program_data.bundled_program_data(),
+    FLAGS_testset_idx);
+ET_CHECK_MSG(
+    status == Error::Ok,
+    "LoadBundledInput failed with status 0x%" PRIx32,
+    status);
+
+// Execute the plan
+status = method->execute();
+ET_CHECK_MSG(
+    status == Error::Ok,
+    "method->execute() failed with status 0x%" PRIx32,
+    status);
+
+// Verify the result.
+status = torch::executor::bundled_program::VerifyResultWithBundledExpectedOutput(
+    *method,
+    program_data.bundled_program_data(),
+    FLAGS_testset_idx,
+    FLAGS_rtol,
+    FLAGS_atol);
+ET_CHECK_MSG(
+    status == Error::Ok,
+    "Bundle verification failed with status 0x%" PRIx32,
+    status);
+
+```
+
+## Common Errors
+
+Errors will be raised if the `List[MethodTestSuite]` doesn't match the `Program`. Here are two common situations:
+
+### Test input doesn't match the model's requirement.
+
+Each inference method of a PyTorch model has its own requirements for its inputs, such as the number of inputs and the dtype of each input. `BundledProgram` will raise an error if the test inputs do not meet those requirements.
+
+Here is an example where the dtype of a test input does not meet the model's requirement:
+
+```python
+import torch
+
+from executorch.exir import to_edge
+from executorch.devtools import BundledProgram
+
+from executorch.devtools.bundled_program.config import MethodTestCase, MethodTestSuite
+from torch.export import export, export_for_training
+
+
+class Module(torch.nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.a = 3 * torch.ones(2, 2, dtype=torch.float)
+        self.b = 2 * torch.ones(2, 2, dtype=torch.float)
+
+    def forward(self, x):
+        out_1 = torch.ones(2, 2, dtype=torch.float)
+        out_2 = torch.ones(2, 2, dtype=torch.float)
+        torch.mul(self.a, x, out=out_1)
+        torch.add(out_1, self.b, out=out_2)
+        return out_2
+
+
+model = Module()
+method_names = ["forward"]
+
+inputs = (torch.ones(2, 2, dtype=torch.float), )
+
+# Find each method of the model that needs to be traced by its name, and export its FX Graph.
+method_graph = export(
+    export_for_training(model, inputs).module(),
+    inputs,
+)
+
+# Emit the traced methods into ET Program.
+et_program = to_edge(method_graph).to_executorch()
+
+# number of input sets to be verified
+n_input = 10
+
+# Input sets to be verified for each inference method.
+# To simplify, here we create the same inputs for all methods.
+inputs = {
+    # Inference method name corresponding to its test cases.
+ m_name: [ + # NOTE: executorch program needs torch.float, but here is torch.int + [ + torch.randint(-5, 5, (2, 2), dtype=torch.int), + ] + for _ in range(n_input) + ] + for m_name in method_names +} + +# Generate Test Suites +method_test_suites = [ + MethodTestSuite( + method_name=m_name, + test_cases=[ + MethodTestCase( + inputs=input, + expected_outputs=(getattr(model, m_name)(*input),), + ) + for input in inputs[m_name] + ], + ) + for m_name in method_names +] + +# Generate BundledProgram + +bundled_program = BundledProgram(et_program, method_test_suites) +``` + +:::{dropdown} Raised Error + +``` +The input tensor tensor([[-2, 0], + [-2, -1]], dtype=torch.int32) dtype shall be torch.float32, but now is torch.int32 +--------------------------------------------------------------------------- +AssertionError Traceback (most recent call last) +Cell In[1], line 72 + 56 method_test_suites = [ + 57 MethodTestSuite( + 58 method_name=m_name, + (...) + 67 for m_name in method_names + 68 ] + 70 # Step 3: Generate BundledProgram +---> 72 bundled_program = create_bundled_program(program, method_test_suites) +File /executorch/devtools/bundled_program/core.py:276, in create_bundled_program(program, method_test_suites) + 264 """Create bp_schema.BundledProgram by bundling the given program and method_test_suites together. + 265 + 266 Args: + (...) + 271 The `BundledProgram` variable contains given ExecuTorch program and test cases. + 272 """ + 274 method_test_suites = sorted(method_test_suites, key=lambda x: x.method_name) +--> 276 assert_valid_bundle(program, method_test_suites) + 278 bundled_method_test_suites: List[bp_schema.BundledMethodTestSuite] = [] + 280 # Emit data and metadata of bundled tensor +File /executorch/devtools/bundled_program/core.py:219, in assert_valid_bundle(program, method_test_suites) + 215 # type of tensor input should match execution plan + 216 if type(cur_plan_test_inputs[j]) == torch.Tensor: + 217 # pyre-fixme[16]: Undefined attribute [16]: Item `bool` of `typing.Union[bool, float, int, torch._tensor.Tensor]` + 218 # has no attribute `dtype`. +--> 219 assert cur_plan_test_inputs[j].dtype == get_input_dtype( + 220 program, program_plan_id, j + 221 ), "The input tensor {} dtype shall be {}, but now is {}".format( + 222 cur_plan_test_inputs[j], + 223 get_input_dtype(program, program_plan_id, j), + 224 cur_plan_test_inputs[j].dtype, + 225 ) + 226 elif type(cur_plan_test_inputs[j]) in ( + 227 int, + 228 bool, + 229 float, + 230 ): + 231 assert type(cur_plan_test_inputs[j]) == get_input_type( + 232 program, program_plan_id, j + 233 ), "The input primitive dtype shall be {}, but now is {}".format( + 234 get_input_type(program, program_plan_id, j), + 235 type(cur_plan_test_inputs[j]), + 236 ) +AssertionError: The input tensor tensor([[-2, 0], + [-2, -1]], dtype=torch.int32) dtype shall be torch.float32, but now is torch.int32 + +``` + +::: + +### Method name in `BundleConfig` does not exist. + +Another common error would be the method name in any `MethodTestSuite` does not exist in Model. 
`BundledProgram` will raise error and show the non-exist method name: + +```python +import torch + +from executorch.exir import to_edge +from executorch.devtools import BundledProgram + +from executorch.devtools.bundled_program.config import MethodTestCase, MethodTestSuite +from torch.export import export, export_for_training + + +class Module(torch.nn.Module): + def __init__(self): + super().__init__() + self.a = 3 * torch.ones(2, 2, dtype=torch.float) + self.b = 2 * torch.ones(2, 2, dtype=torch.float) + + def forward(self, x): + out_1 = torch.ones(2, 2, dtype=torch.float) + out_2 = torch.ones(2, 2, dtype=torch.float) + torch.mul(self.a, x, out=out_1) + torch.add(out_1, self.b, out=out_2) + return out_2 + + +model = Module() +method_names = ["forward"] + +inputs = (torch.ones(2, 2, dtype=torch.float),) + +# Find each method of model needs to be traced my its name, export its FX Graph. +method_graph = export( + export_for_training(model, inputs).module(), + inputs, +) + +# Emit the traced methods into ET Program. +et_program = to_edge(method_graph).to_executorch() + +# number of input sets to be verified +n_input = 10 + +# Input sets to be verified for each inference methods. +# To simplify, here we create same inputs for all methods. +inputs = { + # Inference method name corresponding to its test cases. + m_name: [ + [ + torch.randint(-5, 5, (2, 2), dtype=torch.float), + ] + for _ in range(n_input) + ] + for m_name in method_names +} + +# Generate Test Suites +method_test_suites = [ + MethodTestSuite( + method_name=m_name, + test_cases=[ + MethodTestCase( + inputs=input, + expected_outputs=(getattr(model, m_name)(*input),), + ) + for input in inputs[m_name] + ], + ) + for m_name in method_names +] + +# NOTE: MISSING_METHOD_NAME is not an inference method in the above model. +method_test_suites[0].method_name = "MISSING_METHOD_NAME" + +# Generate BundledProgram +bundled_program = BundledProgram(et_program, method_test_suites) + +``` + +:::{dropdown} Raised Error + +``` +All method names in bundled config should be found in program.execution_plan, but {'MISSING_METHOD_NAME'} does not include. +--------------------------------------------------------------------------- +AssertionError Traceback (most recent call last) +Cell In[3], line 73 + 70 method_test_suites[0].method_name = "MISSING_METHOD_NAME" + 72 # Generate BundledProgram +---> 73 bundled_program = create_bundled_program(program, method_test_suites) +File /executorch/devtools/bundled_program/core.py:276, in create_bundled_program(program, method_test_suites) + 264 """Create bp_schema.BundledProgram by bundling the given program and method_test_suites together. + 265 + 266 Args: + (...) + 271 The `BundledProgram` variable contains given ExecuTorch program and test cases. 
+ 272 """ + 274 method_test_suites = sorted(method_test_suites, key=lambda x: x.method_name) +--> 276 assert_valid_bundle(program, method_test_suites) + 278 bundled_method_test_suites: List[bp_schema.BundledMethodTestSuite] = [] + 280 # Emit data and metadata of bundled tensor +File /executorch/devtools/bundled_program/core.py:141, in assert_valid_bundle(program, method_test_suites) + 138 method_name_of_program = {e.name for e in program.execution_plan} + 139 method_name_of_test_suites = {t.method_name for t in method_test_suites} +--> 141 assert method_name_of_test_suites.issubset( + 142 method_name_of_program + 143 ), f"All method names in bundled config should be found in program.execution_plan, \ + 144 but {str(method_name_of_test_suites - method_name_of_program)} does not include." + 146 # check if method_tesdt_suites has been sorted in ascending alphabetical order of method name. + 147 for test_suite_id in range(1, len(method_test_suites)): +AssertionError: All method names in bundled config should be found in program.execution_plan, but {'MISSING_METHOD_NAME'} does not include. +``` +::: diff --git a/docs/source/compiler-backend-dialect.md b/docs/source/compiler-backend-dialect.md index 0a8ad973a79..0ab8fe79f2c 100644 --- a/docs/source/compiler-backend-dialect.md +++ b/docs/source/compiler-backend-dialect.md @@ -145,6 +145,12 @@ There are the backend operators currently using `bind_pattern_to_op` API. * `executorch_prims::floordiv.int(SymInt a, SymInt b) -> SymInt` * pattern: builtin.floordiv * backend: executor +* `executorch_prims::truediv.int(Scalar a, Scalar b) -> Scalar` + * pattern: builtin.div + * backend: executor +* `executorch_prims::sym_float.Scalar(Scalar a) -> Scalar` + * pattern: builtin.float + * backend: executor * `executorch_prims::gt.int(SymInt a, SymInt b) -> bool` * pattern: builtin.gt * backend: executor @@ -160,6 +166,12 @@ There are the backend operators currently using `bind_pattern_to_op` API. * `executorch_prims::eq.int(SymInt a, SymInt b) -> bool` * pattern: builtin.eq * backend: executor +* `executorch_prims::mod.Scalar(SymInt a, SymInt b) -> SymInt` + * pattern: builtin.divmod + * backend: executor +* `executorch_prims::neg.Scalar(Scalar a) -> Scalar` + * pattern: operator.ne + * backend: executor * `quantized_decomposed::embedding_byte(Tensor weight, Tensor weight_scales, Tensor weight_zero_points, int weight_quant_min, int weight_quant_max, Tensor indices) -> Tensor` * pattern: [source](https://github.com/pytorch/executorch/blob/main/exir/passes/_quant_patterns_and_replacements.py) * backend: quantization diff --git a/docs/source/compiler-delegate-and-partitioner.md b/docs/source/compiler-delegate-and-partitioner.md index c82af7d98fe..21a2f4dd392 100644 --- a/docs/source/compiler-delegate-and-partitioner.md +++ b/docs/source/compiler-delegate-and-partitioner.md @@ -129,11 +129,11 @@ static auto success_with_compiler = register_backend(backend); ## Developer Tools Integration: Debuggability -Providing consistent debugging experience, be it for runtime failures or performance profiling, is important. ExecuTorch employs native Developer Tools for this purpose, which enables correlating program instructions to original PyTorch code, via debug handles. You can read more about it [here](./sdk-etrecord). +Providing consistent debugging experience, be it for runtime failures or performance profiling, is important. 
ExecuTorch employs native Developer Tools for this purpose, which enables correlating program instructions to original PyTorch code, via debug handles. You can read more about it [here](./etrecord). Delegated program or subgraphs are opaque to ExecuTorch runtime and appear as a special `call_delegate` instruction, which asks corresponding backend to handle the execution of the subgraph or program. Due to the opaque nature of backend delgates, native Developer Tools does not have visibility into delegated program. Thus the debugging, functional or performance, experiences of delegated execution suffers significantly as compared to it's non-delegated counterpart. -In order to provide consistent debugging experience to users, regardless of the use of delegation for a model, Developer Tools provide an interface to correlate delegated (sub)graph to original (sub)graph. The Developer Tools do so via debug handles map which allows delegates to generate internal handles that can be associated with the original (sub)graph consumed by the delegate. Then at runtime, backend developer can report error or profiling information using the internal handle, which will be mapped to original (sub)graph using the debug handle map. For more information, please refer to [Developer Tools Delegate Integration](./sdk-delegate-integration). +In order to provide consistent debugging experience to users, regardless of the use of delegation for a model, Developer Tools provide an interface to correlate delegated (sub)graph to original (sub)graph. The Developer Tools do so via debug handles map which allows delegates to generate internal handles that can be associated with the original (sub)graph consumed by the delegate. Then at runtime, backend developer can report error or profiling information using the internal handle, which will be mapped to original (sub)graph using the debug handle map. For more information, please refer to [Delegate Debugging](./delegate-debugging). By leveraging the debug identifier, backend developer can embed the debug as part of the delegated blob diff --git a/docs/source/compiler-memory-planning.md b/docs/source/compiler-memory-planning.md index 1dad3b032fc..0f4489654b4 100644 --- a/docs/source/compiler-memory-planning.md +++ b/docs/source/compiler-memory-planning.md @@ -32,9 +32,8 @@ The `MemoryPlanningPass` exposes the option to not memory plan program inputs an program = edge_program.to_executorch( exir.ExecutorchBackendConfig( memory_planning_pass=MemoryPlanningPass( - memory_planning_algo="greedy", alloc_graph_input=False, # Inputs will not be memory planned, the data_ptr for input tensors after model load will be nullptr - alloc_graph_output=True, # Outputs will be memory planned, the data_ptr for input tensors after model load will be in the `planned_memory`. + alloc_graph_output=True, # Outputs will be memory planned, the data_ptr for output tensors after model load will be in the `planned_memory`. ) ) ) @@ -77,10 +76,14 @@ Then later when lowering to ExecuTorch you can use your custom plan in the follo program = edge_program.to_executorch( exir.ExecutorchBackendConfig( memory_planning_pass=CustomPoolMemoryPlanningPass( - memory_planning_algo="greedy", + memory_planning_algo=greedy, ) ) ) ``` Users attempting to write a custom memory planning algorithm should start by looking at [the greedy algorithm's implementation](https://github.com/pytorch/executorch/blob/d62c41ca86435e5316e7ed292b6d68aff27a2fb7/exir/memory_planning.py#L459C1-L459C12). 
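As a concrete starting point, a minimal sketch of wiring a custom algorithm into `MemoryPlanningPass` is shown below. It assumes the built-in `greedy` planner can be imported from `executorch.exir.memory_planning` (where the linked implementation lives) and simply wraps it, forwarding all arguments, so no planner signature has to be spelled out here; treat the import paths and the wrapper pattern as illustrative assumptions rather than the prescribed approach.

```python
from executorch.exir.memory_planning import greedy
from executorch.exir.passes import MemoryPlanningPass


def custom_planning_algo(*args, **kwargs):
    # Delegate the actual planning to the built-in greedy algorithm, then
    # hook in custom behavior (logging, inspecting planned buffers, etc.).
    result = greedy(*args, **kwargs)
    return result


memory_planning_pass = MemoryPlanningPass(memory_planning_algo=custom_planning_algo)
```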
+ +## Debugging Tool + +Please refer to [Memory Planning Inspection](./memory-planning-inspection.md) for a tool to inspect the result of memory planning. diff --git a/docs/source/concepts.md b/docs/source/concepts.md index 0c1512b5519..c085505b61a 100644 --- a/docs/source/concepts.md +++ b/docs/source/concepts.md @@ -283,7 +283,7 @@ Techniques for performing computations and memory accesses on tensors with lower The ExecuTorch runtime executes models on edge devices. It is responsible for program initialization, program execution and, optionally, destruction (releasing backend owned resources). -## [Developer Tools](./sdk-overview.md) +## [Developer Tools](./devtools-overview.md) A collection of tools users need to profile, debug and visualize programs that are running with ExecuTorch. diff --git a/docs/source/debug-backend-delegate.md b/docs/source/debug-backend-delegate.md index 17e4afe82a6..68914aaed90 100644 --- a/docs/source/debug-backend-delegate.md +++ b/docs/source/debug-backend-delegate.md @@ -6,7 +6,7 @@ We provide a list of util functions to give users insights on what happened to t The `get_delegation_info()` method provides a summary of what happened to the model after the `to_backend()` call: ```python -from executorch.exir.backend.utils import get_delegation_info +from executorch.devtools.backend_debug import get_delegation_info from tabulate import tabulate # ... After call to to_backend(), but before to_executorch() diff --git a/docs/source/delegate-debugging.md b/docs/source/delegate-debugging.md new file mode 100644 index 00000000000..e4e6b0ddcc9 --- /dev/null +++ b/docs/source/delegate-debugging.md @@ -0,0 +1,152 @@ +# Delegate Debugging + +[Delegate backends](compiler-delegate-and-partitioner.md) are a prominent component of on-device models due to their flexibility in defining behavior. A side effect of this flexibility is that it operates as an opaque transformation. This obfuscates rich associations and mutations that are valuable in post-processing. +- For example, if two different operator fusions were to occur within a delegate, post processing wouldn’t be able to separate the two transformations. + +Specifically, it makes associating runtime information (such as profiling results) through delegated graphs difficult. Delegate Debug Identifiers provides a framework through which delegate authors can propagate this information and utilize it for post run analysis. + +The preparation is broken down into three stages: +- **Ahead-of-time (AOT)**: Delegate authors generate a __Debug Handle Map__. +- **Runtime**: Delegate authors log using the __Delegate Debug Identifiers__ registered AOT in the __Debug Handle Map__. +- **Deserialization**: Delegate authors provide a parser for custom metadata in delegate events. + +## Ahead-of-Time Integration +Delegate authors propagate what transformations occur in a lowered backend by returning a **Debug Handle Map** from the backend implementation. + +### Generating a Debug Handle Map +**Debug Handle Maps** communicate what transformations occurred in a backend by mapping **Delegate Debug Identifiers** to debug handles. + +**Delegate Debug Identifiers** are generated or user-provided identifiers for representing points of interest during runtime. Recall that debug handles are unique identifiers to operator instances in the model graph. + +For example: +- **{ 0: (10, 11), 1: (11, 12) }:** Identifiers 0 and 1 in the runtime correspond to operators with the debug handles (10, 11) and (11, 12) respectively. 
+- **{ “fused_op_1_2_3”: (11, 12, 15) }**: Identifier “fused_op_1_2_3” in the runtime corresponds to operators with debug handles (11, 12, 15), and 11, 12, 15 corresponds to the op 1, op 2 and op 3. + +```{Note} +Identifiers are a means of connecting runtime results to the model graph; the interpretation of the identifiers is defined by the delegate author. +``` + +**Debug Handle Maps** are constructed through the use of **DelegateMappingBuilder** and returned as a part of `PreprocessResult`. + +```python +class PreprocessResult: + processed_bytes: bytes = bytes() + + debug_handle_map: Optional[ + Union[Dict[int, Tuple[int]], Dict[str, Tuple[int]]] + ] = None +``` +PreprocessResult is defined [here](https://github.com/pytorch/executorch/blob/main/exir/backend/backend_details.py). + +#### DelegateMappingBuilder +`DelegateMappingBuilder` is a helper class for managing and constructing Debug Handle Maps. The result of the builder should be passed in when constructing PreprocessResult. + +`DelegateMappingBuilder` is defined [here](https://github.com/pytorch/executorch/blob/main/exir/backend/utils.py) + +A `DelegateMappingBuilder` instance can be constructed in one of 2 modes: manual identifiers or generated identifiers. + +```python +# Manual Identifiers, Default +builder = DelegateMappingBuilder(generated_identifiers=False) + +# Generated Identifiers +builder = DelegateMappingBuilder(generated_identifiers=True) +``` + +With **manual identifiers**, users pass in a **Delegate Debug Identifier** when creating entries. +With **generated identifiers**, the builder will auto-assign a **Delegate Debug Identifier**. + +To add an entry to the **Debug Handle Map**, use `insert_delegate_mapping_entry`. It associates one of `fx.Node(s)` or debug handles(s) (sourced from node.meta["debug_handle"]) to an optional **Delegate Debug Identifier** (used for the manual identifiers). The identifier recorded is returned from the call. + +```python +def insert_delegate_mapping_entry( + self, + nodes: Optional[Union[Node, List[Node]]] = None, + handles: Optional[Union[int, List[int]]] = None, + identifier: Optional[Union[int, str]] = None, +) -> Union[int, str]: +``` + +To retrieve the **Debug Handle Map**, use `get_delegate_mapping`. +```python +def get_delegate_mapping( + self, +) -> Union[Dict[int, Tuple[int]], Dict[str, Tuple[int]]] +``` + +A demo of the AOT mapping can be found [here](https://github.com/pytorch/executorch/blob/main/exir/backend/test/backend_with_delegate_mapping_demo.py) + + +## Runtime Logging +Corresponding to the AOT map, the runtime then defines the functionality through which these events are logged. + +### Real-Time Logging + +ExecuTorch allows you to log in real time. **Real time Logging** is useful when timestamps are available as the execution occurs. It provides minimal overhead and is intuitive for authors to call. + +To log events in real-time (for example, explicitly denoting the profiling start and stop), `event_tracer_start_profiling_delegate` is used to create an `EventEntry` and `event_tracer_end_profiling_delegate` is used to conclude the `EventEntry` for the provided `EventTracer`. 
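Before the argument details below, here is a rough C++ sketch of the start/end pattern inside a backend's runtime code. It assumes `event_tracer` is the `EventTracer*` available to the backend, uses the integer identifier `0` from the example map above, and passes `nullptr` for the unused string name argument, which is an assumption about how the unused parameter is conveyed.

```c++
// Sketch only: bracket the runtime work that corresponds to Delegate Debug
// Identifier 0 from the example Debug Handle Map { 0: (10, 11), 1: (11, 12) }.
EventTracerEntry entry = event_tracer_start_profiling_delegate(
    event_tracer, /*name=*/nullptr, /*delegate_debug_id=*/0);

// ... execute the fused kernel(s) that identifier 0 stands for ...

event_tracer_end_profiling_delegate(event_tracer, entry);
```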
+ +To start an `EventTracerEntry` using `event_tracer_start_profiling_delegate`, the **Delegate Debug Identifier** (provided AOT to the `debug_handle_map`) is passed as either the name or `delegate_debug_id` argument depending on the **Delegate Debug Identifier** type (str and int respectively) + +```c++ +EventTracerEntry event_tracer_start_profiling_delegate( + EventTracer* event_tracer, + const char* name, + DebugHandle delegate_debug_id) +``` + +To conclude an `EventTracerEntry`, `event_tracer_end_profiling_delegate` is simply provided the original `EventTracerEntry`. + +Optionally, additional runtime `metadata` can also be logged at this point. + +```c++ +void event_tracer_end_profiling_delegate( + EventTracer* event_tracer, + EventTracerEntry event_tracer_entry, + const void* metadata = nullptr, + size_t metadata_len = 0) +``` + +### Post-Time Logging +ExecuTorch also allows you to log in post time. Some runtime settings don't have access to timestamps while it is executing. **Post-Time Logging** enables authors to still be able to log these events. + +To log events in post (for example, logging start and end time simultaneously) `event_tracer_log_profiling_delegate` is called with a combination of the arguments used in the real-time logging API’s and timestamps. + +```c++ +void event_tracer_log_profiling_delegate( + EventTracer* event_tracer, + const char* name, + DebugHandle delegate_debug_id, + et_timestamp_t start_time, + et_timestamp_t end_time, + const void* metadata = nullptr, + size_t metadata_len = 0) +``` +A demo of the runtime code can be found [here](https://github.com/pytorch/executorch/blob/main/runtime/executor/test/test_backend_with_delegate_mapping.cpp). + + +## Surfacing custom metadata from delegate events + +As seen in the runtime logging API's above, users can log an array of bytes along with their delegate profiling event. We make this data available for users in post processing via the [Inspector API](./model-inspector.rst). + +Users can pass a metadata parser when creating an instance of the Inspector. The parser is a callable that deserializes the data and returns a list of strings or a dictionary containing key-value pairs. The deserialized data is then added back to the corresponding event in the event block for user consumption. Here's an example of how to write this parser: + +NOTE: The input to the deserializer is a list where each entry is a series of bytes (essentially each entry is an immutable bytearray). Users are expected to iterate over this list, deserialize each entry and then return it in the expected format which is either a list of strings, or a dict. + +```python +Inspector( + etdump_path=etdump_path, + # Optional + etrecord=etrecord_path, + # Optional, only needed if debugging was enabled. 
+ buffer_path=buffer_path, + delegate_metadata_parser=parse_delegate_metadata +) + + +def parse_delegate_metadata(delegate_metadatas: List[bytes]) -> Union[List[str], Dict[str, Any]]: + metadata_str = [] + for metadata_bytes in delegate_metadatas: + metadata_str += str(metadata_bytes) + return metadata_str +``` diff --git a/docs/source/devtools-overview.md b/docs/source/devtools-overview.md new file mode 100644 index 00000000000..259eaf562c3 --- /dev/null +++ b/docs/source/devtools-overview.md @@ -0,0 +1,45 @@ +# Introduction to the ExecuTorch Developer Tools + +ExecuTorch has been designed with [productivity](./intro-overview.md) as one of its core objectives and the ExecuTorch Developer Tools enable this through the comprehensive suite of tools it provides users to help them profile, debug, and visualize models that they have onboarded onto ExecuTorch. + +All the components of the Developer Tools have been designed from the ground up with deep integration in both the export process and the runtime. This enables us to provide unique features such as linking back operator execution in the runtime to the line of code in the original eager model that this operator originated from. + +## Developer Tools Features + +The ExecuTorch Developer Tools support the following features: + +- **BundledProgram** is a utility tool for exporting the model bundled with a sample set of (representative) inputs and expected outputs, so that during runtime users can validate that the actual output is in fact the same as the expected output. +- **Profiling** models with operator level breakdown of performance stats + - Linking back operator performance stats to source code and module hierarchy + - Model loading and execution time +- **Delegate Integration** - Surfacing performance details from delegate backends + - Link back delegate operator execution to the nodes they represent in the edge dialect graph (and subsequently linking back to source code and module hierarchy) +- **Debugging** - Intermediate outputs and output quality analysis +- **Memory Allocation Insights** - Visualize how memory is planned, where all the live tensors are at any point in time +- **Visualization** - Coming soon + +## Fundamental components of the Developer Tools + +In order to fully understand and leverage the power of the Developer Tools in this section, the fundamental components that power the Developer Tools will be detailed. + +### ETRecord +ETRecord (ExecuTorch Record) is an artifact generated during the export process that stores the graphs and other metadata that is critical for the Developer Tools to be able to link back the performance/debug data sourced from the runtime to the source code of the eager model. + +To draw a rough equivalence to conventional software development ETRecord can be considered as the binary built with debug symbols that is used for debugging in GNU Project debugger (gdb). + +More details are available in the [ETRecord documentation](etrecord.rst) on how to generate and store an ETRecord. + +### ETDump +ETDump (ExecuTorch Dump) is the binary blob that is generated by the runtime after running a model. Similarly as above, to draw a rough equivalence to conventional software development, ETDump can be considered as the coredump of ExecuTorch, but in this case within ETDump we store all the performance and debug data that was generated by the runtime during model execution. 
+
+```{note}
+If you only care about looking at the raw performance data without linking back to source code and other extensive features, an ETDump alone will be enough to leverage the basic features of the Developer Tools. For the full experience, it is recommended that users also generate an ETRecord.
+```
+
+More details are available in the [ETDump documentation](etdump.md) on how to generate and store an ETDump from the runtime.
+
+
+### Inspector APIs
+The Inspector Python APIs are the main user entry point into the Developer Tools. They join the data sourced from ETDump and ETRecord to give users access to all the performance and debug data sourced from the runtime, along with linkage back to the eager model source code and module hierarchy, in an easy-to-use API.
+
+More details are available in the [Inspector API documentation](model-inspector.rst) on how to use the Inspector APIs.
diff --git a/docs/source/devtools-tutorial.md b/docs/source/devtools-tutorial.md
new file mode 100644
index 00000000000..33d78cf58da
--- /dev/null
+++ b/docs/source/devtools-tutorial.md
@@ -0,0 +1,3 @@
+## Developer Tools Usage Tutorial
+
+Please refer to the [Developer Tools tutorial](./tutorials/devtools-integration-tutorial) for a walkthrough on how to profile a model in ExecuTorch using the Developer Tools.
diff --git a/docs/source/etdump.md b/docs/source/etdump.md
new file mode 100644
index 00000000000..42391cf40e9
--- /dev/null
+++ b/docs/source/etdump.md
@@ -0,0 +1,44 @@
+# Prerequisite | ETDump - ExecuTorch Dump
+
+ETDump (ExecuTorch Dump) is one of the core components of the ExecuTorch Developer Tools. It is the mechanism through which all forms of profiling and debugging data are extracted from the runtime. Users can't parse ETDump directly; instead, they should pass it into the Inspector API, which deserializes the data, offering interfaces for flexible analysis and debugging.
+
+
+## Generating an ETDump
+
+Generating an ETDump is a relatively straightforward process. Users can follow the steps detailed below to integrate it into their application that uses ExecuTorch.
+
+1. ***Include*** the ETDump header in your code.
+```C++
+#include <executorch/devtools/etdump/etdump_flatcc.h>
+```
+
+2. ***Create*** an instance of the ETDumpGen class and pass it into the `load_method` call that is invoked in the runtime.
+
+```C++
+torch::executor::ETDumpGen etdump_gen = torch::executor::ETDumpGen();
+Result<Method> method =
+    program->load_method(method_name, &memory_manager, &etdump_gen);
+```
+
+3. ***Dump Out the ETDump Buffer*** - after the inference iterations have been completed, users can dump out the ETDump buffer. If users are on a device which has a filesystem, they could just write it out to the filesystem. For more constrained embedded devices, users will have to extract the ETDump buffer from the device through a mechanism that best suits them (e.g. UART, JTAG etc.)
+
+```C++
+etdump_result result = etdump_gen.get_etdump_data();
+if (result.buf != nullptr && result.size > 0) {
+  // On a device with a file system users can just write it out
+  // to the file-system.
+  FILE* f = fopen(FLAGS_etdump_path.c_str(), "w+");
+  fwrite((uint8_t*)result.buf, 1, result.size, f);
+  fclose(f);
+  free(result.buf);
+}
+```
+
+4. ***Compile*** your binary using CMake with the `ET_EVENT_TRACER_ENABLED` pre-processor flag to enable events to be traced and logged into ETDump inside the ExecuTorch runtime. This flag needs to be added to the ExecuTorch library and any operator library that you are compiling into your binary.
For reference, you can take a look at `examples/sdk/CMakeLists.txt`. The lines of interest are: +``` +target_compile_options(executorch INTERFACE -DET_EVENT_TRACER_ENABLED) +target_compile_options(portable_ops_lib INTERFACE -DET_EVENT_TRACER_ENABLED) +``` +## Using an ETDump + +Pass this ETDump into the [Inspector API](./model-inspector.rst) to access this data and do post-run analysis. diff --git a/docs/source/etrecord.rst b/docs/source/etrecord.rst new file mode 100644 index 00000000000..1ab84a6ee10 --- /dev/null +++ b/docs/source/etrecord.rst @@ -0,0 +1,40 @@ +Prerequisite | ETRecord - ExecuTorch Record +=========================================== + +Overview +-------- + +``ETRecord`` is intended to be the debug artifact that is generated by +users ahead of time (when they export their model to run on ExecuTorch). +To draw a rough equivalent to conventional software development, +``ETRecord`` can be considered as the binary built with debug symbols +that is used for debugging in GNU Debugger (gdb). It is expected that +the user will supply this to the ExecuTorch Developer Tools in order for +them to debug and visualize their model. + +``ETRecord`` contains numerous components such as: + +* Edge dialect graph with debug handles +* Delegate debug handle maps + +The ``ETRecord`` object itself is intended to be opaque to users and they should not access any components inside it directly. +It should be provided to the `Inspector API `__ to link back performance and debug data sourced from the runtime back to the Python source code. + +Generating an ``ETRecord`` +-------------------------- + +The user should use the following API to generate an ``ETRecord`` file. They +will be expected to provide the Edge Dialect program (returned by the call to ``to_edge()``), +the ExecuTorch program (returned by the call to ``to_executorch()``), and optional models that +they are interested in working with via our tooling. + +.. warning:: + Users should do a deepcopy of the output of ``to_edge()`` and pass in the deepcopy to the ``generate_etrecord`` API. This is needed because the subsequent call, ``to_executorch()``, does an in-place mutation and will lose debug data in the process. + +.. currentmodule:: executorch.devtools.etrecord._etrecord +.. autofunction:: generate_etrecord + +Using an ``ETRecord`` +--------------------- + +Pass the ``ETRecord`` as an optional argument into the `Inspector API `__ to access this data and do post-run analysis. diff --git a/docs/source/extension-module.md b/docs/source/extension-module.md index 7516184d1cc..58d2b7e3494 100644 --- a/docs/source/extension-module.md +++ b/docs/source/extension-module.md @@ -2,27 +2,27 @@ **Author:** [Anthony Shoumikhin](https://github.com/shoumikhin) -In the [Running an ExecuTorch Model in C++ Tutorial](running-a-model-cpp-tutorial.md), we explored the lower-level ExecuTorch APIs for running an exported model. While these APIs offer zero overhead, great flexibility, and control, they can be verbose and complex for regular use. To simplify this and resemble PyTorch's eager mode in Python, we introduce the Module facade APIs over the regular ExecuTorch runtime APIs. The Module APIs provide the same flexibility but default to commonly used components like `DataLoader` and `MemoryAllocator`, hiding most intricate details. +In the [Running an ExecuTorch Model in C++ Tutorial](running-a-model-cpp-tutorial.md), we explored the lower-level ExecuTorch APIs for running an exported model. 
While these APIs offer zero overhead, great flexibility, and control, they can be verbose and complex for regular use. To simplify this and resemble PyTorch's eager mode in Python, we introduce the `Module` facade APIs over the regular ExecuTorch runtime APIs. The `Module` APIs provide the same flexibility but default to commonly used components like `DataLoader` and `MemoryAllocator`, hiding most intricate details. ## Example -Let's see how we can run the `SimpleConv` model generated from the [Exporting to ExecuTorch tutorial](./tutorials/export-to-executorch-tutorial) using the `Module` APIs: +Let's see how we can run the `SimpleConv` model generated from the [Exporting to ExecuTorch tutorial](./tutorials/export-to-executorch-tutorial) using the `Module` and [`TensorPtr`](extension-tensor.md) APIs: ```cpp #include <executorch/extension/module/module.h> +#include <executorch/extension/tensor/tensor.h> -using namespace ::torch::executor; +using namespace ::executorch::extension; // Create a Module. Module module("/path/to/model.pte"); // Wrap the input data with a Tensor. float input[1 * 3 * 256 * 256]; -Tensor::SizesType sizes[] = {1, 3, 256, 256}; -TensorImpl tensor(ScalarType::Float, std::size(sizes), sizes, input); +auto tensor = from_blob(input, {1, 3, 256, 256}); // Perform an inference. -const auto result = module.forward(Tensor(&tensor)); +const auto result = module.forward(tensor); // Check for success or failure. if (result.ok()) { @@ -37,7 +37,7 @@ The code now boils down to creating a `Module` and calling `forward()` on it, wi ### Creating a Module -Creating a `Module` object is an extremely fast operation that does not involve significant processing time or memory allocation. The actual loading of a `Program` and a `Method` happens lazily on the first inference unless explicitly requested with a dedicated API. +Creating a `Module` object is a fast operation that does not involve significant processing time or memory allocation. The actual loading of a `Program` and a `Method` happens lazily on the first inference unless explicitly requested with a dedicated API. ```cpp Module module("/path/to/model.pte"); @@ -60,23 +60,32 @@ const auto error = module.load_method("forward"); assert(module.is_method_loaded("forward")); ``` -Note: the `Program` is loaded automatically before any `Method` is loaded. Subsequent attemps to load them have no effect if one of the previous attemps was successful. + +You can also use the convenience function to load the `forward` method: + +```cpp +const auto error = module.load_forward(); + +assert(module.is_method_loaded("forward")); +``` + +**Note:** The `Program` is loaded automatically before any `Method` is loaded. Subsequent attempts to load them have no effect if a previous attempt was successful. ### Querying for Metadata -Get a set of method names that a Module contains udsing the `method_names()` function: +Get a set of method names that a `Module` contains using the `method_names()` function: ```cpp const auto method_names = module.method_names(); if (method_names.ok()) { - assert(method_names.count("forward")); + assert(method_names->count("forward")); } ``` -Note: `method_names()` will try to force-load the `Program` when called the first time. +**Note:** `method_names()` will force-load the `Program` when called for the first time.
-Introspect miscellaneous metadata about a particular method via `MethodMeta` struct returned by `method_meta()` function: +To introspect miscellaneous metadata about a particular method, use the `method_meta()` function, which returns a `MethodMeta` struct: ```cpp const auto method_meta = module.method_meta("forward"); @@ -86,59 +95,136 @@ if (method_meta.ok()) { assert(method_meta->num_inputs() > 1); const auto input_meta = method_meta->input_tensor_meta(0); - if (input_meta.ok()) { assert(input_meta->scalar_type() == ScalarType::Float); } - const auto output_meta = meta->output_tensor_meta(0); + const auto output_meta = method_meta->output_tensor_meta(0); if (output_meta.ok()) { assert(output_meta->sizes().size() == 1); } } ``` -Note: `method_meta()` will try to force-load the `Method` when called for the first time. +**Note:** `method_meta()` will also force-load the `Method` the first time it is called. + +### Performing an Inference + +Assuming the `Program`'s method names and their input format are known ahead of time, you can run methods directly by name using the `execute()` function: + +```cpp +const auto result = module.execute("forward", tensor); +``` + +For the standard `forward()` method, the above can be simplified: -### Perform an Inference +```cpp +const auto result = module.forward(tensor); +``` + +**Note:** `execute()` or `forward()` will load the `Program` and the `Method` the first time they are called. Therefore, the first inference will take longer, as the model is loaded lazily and prepared for execution unless it was explicitly loaded earlier. + +### Setting Input and Output + +You can set individual input and output values for methods with the following APIs. + +#### Setting Inputs + +Inputs can be any `EValue`, which includes tensors, scalars, lists, and other supported types. To set a specific input value for a method: -Assuming that the `Program`'s method names and their input format is known ahead of time, we rarely need to query for those and can run the methods directly by name using the `execute()` function: +```cpp +module.set_input("forward", input_value, input_index); +``` + +- `input_value` is an `EValue` representing the input you want to set. +- `input_index` is the zero-based index of the input to set. + +For example, to set the first input tensor: ```cpp -const auto result = module.execute("forward", Tensor(&tensor)); +module.set_input("forward", tensor_value, 0); ``` -Which can also be simplified for the standard `forward()` method name as: +You can also set multiple inputs at once: ```cpp -const auto result = module.forward(Tensor(&tensor)); +std::vector inputs = {input1, input2, input3}; +module.set_inputs("forward", inputs); ``` -Note: `execute()` or `forward()` will try to force load the `Program` and the `Method` when called for the first time. Therefore, the first inference will take more time than subsequent ones as it loads the model lazily and prepares it for execution unless the `Program` or `Method` was loaded explicitly earlier using the corresponding functions. +**Note:** You can skip the method name argument for the `forward()` method. + +By pre-setting all inputs, you can perform an inference without passing any arguments: + +```cpp +const auto result = module.forward(); +``` + +Or just setting and then passing the inputs partially: + +```cpp +// Set the second input ahead of time. +module.set_input(input_value_1, 1); + +// Execute the method, providing the first input at call time. 
+const auto result = module.forward(input_value_0); +``` + +**Note:** The pre-set inputs are stored in the `Module` and can be reused multiple times for the next executions. + +Don't forget to clear or reset the inputs if you don't need them anymore by setting them to default-constructed `EValue`: + +```cpp +module.set_input(runtime::EValue(), 1); +``` + +#### Setting Outputs + +Only outputs of type Tensor can be set at runtime, and they must not be memory-planned at model export time. Memory-planned tensors are preallocated during model export and cannot be replaced. + +To set the output tensor for a specific method: + +```cpp +module.set_output("forward", output_tensor, output_index); +``` + +- `output_tensor` is an `EValue` containing the tensor you want to set as the output. +- `output_index` is the zero-based index of the output to set. + +**Note:** Ensure that the output tensor you're setting matches the expected shape and data type of the method's output. + +You can skip the method name for `forward()` and the index for the first output: + +```cpp +module.set_output(output_tensor); +``` + +**Note:** The pre-set outputs are stored in the `Module` and can be reused multiple times for the next executions, just like inputs. ### Result and Error Types -Most of the ExecuTorch APIs, including those described above, return either `Result` or `Error` types. Let's understand what those are: +Most of the ExecuTorch APIs return either `Result` or `Error` types: -* [`Error`](https://github.com/pytorch/executorch/blob/main/runtime/core/error.h) is a C++ enum containing a collection of valid error codes, where the default is `Error::Ok`, denoting success. +- [`Error`](https://github.com/pytorch/executorch/blob/main/runtime/core/error.h) is a C++ enum containing valid error codes. The default is `Error::Ok`, denoting success. -* [`Result`](https://github.com/pytorch/executorch/blob/main/runtime/core/result.h) can hold either an `Error` if the operation has failed or a payload, i.e., the actual result of the operation like an `EValue` wrapping a `Tensor` or any other standard C++ data type if the operation succeeded. To check if `Result` has a valid value, call the `ok()` function. To get the `Error` use the `error()` function, and to get the actual data, use the overloaded `get()` function or dereferencing pointer operators like `*` and `->`. +- [`Result`](https://github.com/pytorch/executorch/blob/main/runtime/core/result.h) can hold either an `Error` if the operation fails, or a payload such as an `EValue` wrapping a `Tensor` if successful. To check if a `Result` is valid, call `ok()`. To retrieve the `Error`, use `error()`, and to get the data, use `get()` or dereference operators like `*` and `->`. -### Profile the Module +### Profiling the Module -Use [ExecuTorch Dump](sdk-etdump.md) to trace model execution. Create an instance of the `ETDumpGen` class and pass it to the `Module` constructor. After executing a method, save the `ETDump` to a file for further analysis. You can capture multiple executions in a single trace if desired. +Use [ExecuTorch Dump](etdump.md) to trace model execution. Create an `ETDumpGen` instance and pass it to the `Module` constructor. After executing a method, save the `ETDump` data to a file for further analysis: ```cpp #include #include + #include #include -using namespace ::torch::executor; +using namespace ::executorch::extension; Module module("/path/to/model.pte", Module::LoadMode::MmapUseMlock, std::make_unique()); -// Execute a method, e.g. 
module.forward(...); or module.execute("my_method", ...); +// Execute a method, e.g., module.forward(...); or module.execute("my_method", ...); if (auto* etdump = dynamic_cast(module.event_tracer())) { const auto trace = etdump->get_etdump_data(); @@ -153,3 +239,7 @@ if (auto* etdump = dynamic_cast(module.event_tracer())) { } } ``` + +# Conclusion + +The `Module` APIs provide a simplified interface for running ExecuTorch models in C++, closely resembling the experience of PyTorch's eager mode. By abstracting away the complexities of the lower-level runtime APIs, developers can focus on model execution without worrying about the underlying details. diff --git a/docs/source/extension-tensor.md b/docs/source/extension-tensor.md new file mode 100644 index 00000000000..910c06053ed --- /dev/null +++ b/docs/source/extension-tensor.md @@ -0,0 +1,411 @@ +# Managing Tensor Memory in C++ + +**Author:** [Anthony Shoumikhin](https://github.com/shoumikhin) + +Tensors are fundamental data structures in ExecuTorch, representing multi-dimensional arrays used in computations for neural networks and other numerical algorithms. In ExecuTorch, the [Tensor](https://github.com/pytorch/executorch/blob/main/runtime/core/portable_type/tensor.h) class doesn’t own its metadata (sizes, strides, dim_order) or data, keeping the runtime lightweight. Users are responsible for supplying all these memory buffers and ensuring that the metadata and data outlive the `Tensor` instance. While this design is lightweight and flexible, especially for tiny embedded systems, it places a significant burden on the user. If your environment requires minimal dynamic allocations, a small binary footprint, or limited C++ standard library support, you’ll need to accept that trade-off and stick with the regular `Tensor` type. + +Imagine you’re working with a [`Module`](extension-module.md) interface, and you need to pass a `Tensor` to the `forward()` method. You would need to declare and maintain at least the sizes array and data separately, sometimes the strides too, often leading to the following pattern: + +```cpp +#include + +using namespace executorch::aten; +using namespace executorch::extension; + +SizesType sizes[] = {2, 3}; +DimOrderType dim_order[] = {0, 1}; +StridesType strides[] = {3, 1}; +float data[] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f}; +TensorImpl tensor_impl( + ScalarType::Float, + std::size(sizes), + sizes, + data, + dim_order, + strides); +// ... +module.forward(Tensor(&tensor_impl)); +``` + +You must ensure `sizes`, `dim_order`, `strides`, and `data` stay valid. This makes code maintenance difficult and error-prone. Users have struggled to manage lifetimes, and many have created their own ad-hoc managed tensor abstractions to hold all the pieces together, leading to a fragmented and inconsistent ecosystem. + +## Introducing TensorPtr + +To alleviate these issues, ExecuTorch provides `TensorPtr`, a smart pointer that manages the lifecycle of both the tensor's data and its dynamic metadata. + +With `TensorPtr`, users no longer need to worry about metadata lifetimes separately. Data ownership is determined based on whether it is passed by pointer or moved into the `TensorPtr` as an `std::vector`. Everything is bundled in one place and managed automatically, enabling you to focus on actual computations. + +Here’s how you can use it: + +```cpp +#include +#include + +using namespace executorch::extension; + +auto tensor = make_tensor_ptr( + {2, 3}, // sizes + {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f}); // data +// ... 
+module.forward(tensor); +``` + +The data is now owned by the tensor instance because it's provided as a vector. To create a non-owning `TensorPtr`, just pass the data by pointer. The `type` is deduced automatically based on the data vector (`float`). `strides` and `dim_order` are computed automatically to default values based on the `sizes` if not specified explicitly as additional arguments. + +`EValue` in `Module::forward()` accepts `TensorPtr` directly, ensuring seamless integration. `EValue` can now be constructed implicitly with a smart pointer to any type that it can hold. This allows `TensorPtr` to be dereferenced implicitly when passed to `forward()`, and `EValue` will hold the `Tensor` that the `TensorPtr` points to. + +## API Overview + +`TensorPtr` is literally an alias for `std::shared_ptr`, so you can work with it easily without duplicating the data and metadata. Each `Tensor` instance may either own its data or reference external data. + +### Creating Tensors + +There are several ways to create a `TensorPtr`. + +#### Creating Scalar Tensors + +You can create a scalar tensor, i.e. a tensor with zero dimensions or with one of the sizes being zero. + +*Providing A Single Data Value* + +```cpp +auto tensor = make_tensor_ptr(3.14); +``` + +The resulting tensor will contain a single value `3.14` of type double, which is deduced automatically. + +*Providing A Single Data Value with a Type* + +```cpp +auto tensor = make_tensor_ptr(42, ScalarType::Float); +``` + +Now the integer `42` will be cast to float and the tensor will contain a single value `42` of type float. + +#### Owning Data from a Vector + +When you provide sizes and data vectors, `TensorPtr` takes ownership of both the data and the sizes. + +*Providing Data Vector* + +```cpp +auto tensor = make_tensor_ptr( + {2, 3}, // sizes + {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f}); // data (float) +``` + +The type is deduced automatically as `ScalarType::Float` from the data vector. + +*Providing Data Vector with a Type* + +If you provide data of one type but specify a different scalar type, the data will be cast to the given type. + +```cpp +auto tensor = make_tensor_ptr( + {1, 2, 3, 4, 5, 6}, // data (int) + ScalarType::Double); // double scalar type +``` + +In this example, even though the data vector contains integers, we specify the scalar type as `Double`. The integers are cast to double, and the new data vector is owned by the `TensorPtr`. Since the `sizes` argument is skipped in this example, the tensor is one-dimensional with a size equal to the length of the data vector. Note that the reverse cast, from a floating-point type to an integral type, is not allowed because that loses precision. Similarly, casting other types to `Bool` is disallowed. + +*Providing Data Vector as `std::vector`* + +You can also provide raw data in the form of a `std::vector`, specifying the sizes and scalar type. The data will be reinterpreted according to the provided type. + +```cpp +std::vector data = /* raw data */; +auto tensor = make_tensor_ptr( + {2, 3}, // sizes + std::move(data), // data as uint8_t vector + ScalarType::Int); // int scalar type +``` + +The `data` vector must be large enough to accommodate all the elements according to the provided sizes and scalar type. + +#### Non-Owning Data from Raw Pointer + +You can create a `TensorPtr` that references existing data without taking ownership. 
+ +*Providing Raw Data* + +```cpp +float data[] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f}; +auto tensor = make_tensor_ptr( + {2, 3}, // sizes + data, // raw data pointer + ScalarType::Float); // float scalar type +``` + +The `TensorPtr` does not own the data, so you must ensure the `data` remains valid. + +*Providing Raw Data with Custom Deleter* + +If you want the `TensorPtr` to manage the lifetime of the data, you can provide a custom deleter. + +```cpp +auto* data = new double[6]{1.0, 2.0, 3.0, 4.0, 5.0, 6.0}; +auto tensor = make_tensor_ptr( + {2, 3}, // sizes + data, // data pointer + ScalarType::Double, // double scalar type + TensorShapeDynamism::DYNAMIC_BOUND, // default dynamism + [](void* ptr) { delete[] static_cast(ptr); }); +``` + +The `TensorPtr` will call the custom deleter when it is destroyed, i.e., when the smart pointer is reset and no more references to the underlying `Tensor` exist. + +#### Sharing Existing Tensor + +Since `TensorPtr` is a `std::shared_ptr`, you can easily create a `TensorPtr` that shares an existing `Tensor`. Any changes made to the shared data are reflected across all instances sharing the same data. + +*Sharing Existing TensorPtr* + +```cpp +auto tensor = make_tensor_ptr({2, 3}, {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f}); +auto tensor_copy = tensor; +``` + +Now `tensor` and `tensor_copy` point to the same data and metadata. + +#### Viewing Existing Tensor + +You can create a `TensorPtr` from an existing `Tensor`, copying its properties and referencing the same data. + +*Viewing Existing Tensor* + +```cpp +Tensor original_tensor = /* some existing tensor */; +auto tensor = make_tensor_ptr(original_tensor); +``` + +Now the newly created `TensorPtr` references the same data as the original tensor, but has its own metadata copy, so it can interpret or "view" the data differently, but any modifications to the data will be reflected in the original `Tensor` as well. + +### Cloning Tensors + +To create a new `TensorPtr` that owns a copy of the data from an existing tensor: + +```cpp +Tensor original_tensor = /* some existing tensor */; +auto tensor = clone_tensor_ptr(original_tensor); +``` + +The newly created `TensorPtr` has its own copy of the data, so it can modify and manage it independently. +Likewise, you can create a clone of an existing `TensorPtr`. + +```cpp +auto original_tensor = make_tensor_ptr(/* ... */); +auto tensor = clone_tensor_ptr(original_tensor); +``` + +Note that, regardless of whether the original `TensorPtr` owns the data or not, the newly created `TensorPtr` will own a copy of the data. + +### Resizing Tensors + +The `TensorShapeDynamism` enum specifies the mutability of a tensor's shape: + +- `STATIC`: The tensor's shape cannot be changed. +- `DYNAMIC_BOUND`: The tensor's shape can be changed but cannot contain more elements than it originally had at creation based on the initial sizes. +- `DYNAMIC`: The tensor's shape can be changed arbitrarily. Currently, `DYNAMIC` is an alias for `DYNAMIC_BOUND`. + +When resizing a tensor, you must respect its dynamism setting. Resizing is only allowed for tensors with `DYNAMIC` or `DYNAMIC_BOUND` shapes, and you cannot resize `DYNAMIC_BOUND` tensors to contain more elements than they had initially. 
+ +```cpp +auto tensor = make_tensor_ptr( + {2, 3}, // sizes + {1, 2, 3, 4, 5, 6}, // data + ScalarType::Int, + TensorShapeDynamism::DYNAMIC_BOUND); +// Initial sizes: {2, 3} +// Number of elements: 6 + +resize_tensor_ptr(tensor, {2, 2}); +// The tensor sizes are now {2, 2} +// Number of elements is 4 < initial 6 + +resize_tensor_ptr(tensor, {1, 3}); +// The tensor sizes are now {1, 3} +// Number of elements is 3 < initial 6 + +resize_tensor_ptr(tensor, {3, 2}); +// The tensor sizes are now {3, 2} +// Number of elements is 6 == initial 6 + +resize_tensor_ptr(tensor, {6, 1}); +// The tensor sizes are now {6, 1} +// Number of elements is 6 == initial 6 +``` + +## Convenience Helpers + +ExecuTorch provides several helper functions to create tensors conveniently. + +### Creating Non-Owning Tensors with `for_blob` and `from_blob` + +These helpers allow you to create tensors that do not own the data. + +*Using `from_blob()`* + +```cpp +float data[] = {1.0f, 2.0f, 3.0f}; +auto tensor = from_blob( + data, // data pointer + {3}, // sizes + ScalarType::Float); // float scalar type +``` + +*Using `for_blob()` with Fluent Syntax* + +```cpp +double data[] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0}; +auto tensor = for_blob(data, {2, 3}, ScalarType::Double) + .strides({3, 1}) + .dynamism(TensorShapeDynamism::STATIC) + .make_tensor_ptr(); +``` + +*Using Custom Deleter with `from_blob()`* + +```cpp +int* data = new int[3]{1, 2, 3}; +auto tensor = from_blob( + data, // data pointer + {3}, // sizes + ScalarType::Int, // int scalar type + [](void* ptr) { delete[] static_cast(ptr); }); +``` + +The `TensorPtr` will call the custom deleter when it is destroyed. + +### Creating Empty Tensors + +`empty()` creates an uninitialized tensor with sizes specified. + +```cpp +auto tensor = empty({2, 3}); +``` + +`empty_like()` creates an uninitialized tensor with the same sizes as an existing `TensorPtr`. + +```cpp +TensorPtr original_tensor = /* some existing tensor */; +auto tensor = empty_like(original_tensor); +``` + +And `empty_strided()` creates an uninitialized tensor with sizes and strides specified. + +```cpp +auto tensor = empty_strided({2, 3}, {3, 1}); +``` + +### Creating Tensors Filled with Specific Values + +`full()`, `zeros()` and `ones()` create a tensor filled with a provided value, zeros or ones respectively. + +```cpp +auto tensor_full = full({2, 3}, 42.0f); +auto tensor_zeros = zeros({2, 3}); +auto tensor_ones = ones({3, 4}); +``` + +Similarly to `empty()`, there are extra helper functions `full_like()`, `full_strided()`, `zeros_like()`, `zeros_strided()`, `ones_like()` and `ones_strided()` to create filled tensors with the same properties as an existing `TensorPtr` or with custom strides. + +### Creating Random Tensors + +`rand()` creates a tensor filled with random values between 0 and 1. + +```cpp +auto tensor_rand = rand({2, 3}); +``` + +`randn()` creates a tensor filled with random values from a normal distribution. + +```cpp +auto tensor_randn = randn({2, 3}); +``` + +`randint()` creates a tensor filled with random integers between min (inclusive) and max (exclusive) integers specified. + +```cpp +auto tensor_randint = randint(0, 10, {2, 3}); +``` + +### Creating Scalar Tensors + +In addition to `make_tensor_ptr()` with a single data value, you can create a scalar tensor with `scalar_tensor()`. + +```cpp +auto tensor = scalar_tensor(3.14f); +``` + +Note that the `scalar_tensor()` function expects a value of type `Scalar`. 
In ExecuTorch, `Scalar` can represent `bool`, `int`, or floating-point types, but not types like `Half` or `BFloat16`, etc. for which you'd need to use `make_tensor_ptr()` to skip the `Scalar` type. + +## Notes on EValue and Lifetime Management + +The [`Module`](extension-module.md) interface expects data in the form of `EValue`, a variant type that can hold a `Tensor` or other scalar types. When you pass a `TensorPtr` to a function expecting an `EValue`, you can dereference the `TensorPtr` to get the underlying `Tensor`. + +```cpp +TensorPtr tensor = /* create a TensorPtr */ +//... +module.forward(tensor); +``` + +Or even a vector of `EValues` for multiple parameters. + +```cpp +TensorPtr tensor = /* create a TensorPtr */ +TensorPtr tensor2 = /* create another TensorPtr */ +//... +module.forward({tensor, tensor2}); +``` + +However, be cautious: `EValue` will not hold onto the dynamic data and metadata from the `TensorPtr`. It merely holds a regular `Tensor`, which does not own the data or metadata but refers to them using raw pointers. You need to ensure that the `TensorPtr` remains valid for as long as the `EValue` is in use. + +This also applies when using functions like `set_input()` or `set_output()` that expect `EValue`. + +## Interoperability with ATen + +If your code is compiled with the preprocessor flag `USE_ATEN_LIB` enabled, all the `TensorPtr` APIs will use `at::` APIs under the hood. E.g. `TensorPtr` becomes a `std::shared_ptr`. This allows for seamless integration with [PyTorch ATen](https://pytorch.org/cppdocs) library. + +### API Equivalence Table + +Here's a table matching `TensorPtr` creation functions with their corresponding ATen APIs: + +| ATen | ExecuTorch | +|---------------------------------------------|---------------------------------------------| +| `at::tensor(data, type)` | `make_tensor_ptr(data, type)` | +| `at::tensor(data, type).reshape(sizes)` | `make_tensor_ptr(sizes, data, type)` | +| `tensor.clone()` | `clone_tensor_ptr(tensor)` | +| `tensor.resize_(new_sizes)` | `resize_tensor_ptr(tensor, new_sizes)` | +| `at::scalar_tensor(value)` | `scalar_tensor(value)` | +| `at::from_blob(data, sizes, type)` | `from_blob(data, sizes, type)` | +| `at::empty(sizes)` | `empty(sizes)` | +| `at::empty_like(tensor)` | `empty_like(tensor)` | +| `at::empty_strided(sizes, strides)` | `empty_strided(sizes, strides)` | +| `at::full(sizes, value)` | `full(sizes, value)` | +| `at::full_like(tensor, value)` | `full_like(tensor, value)` | +| `at::full_strided(sizes, strides, value)` | `full_strided(sizes, strides, value)` | +| `at::zeros(sizes)` | `zeros(sizes)` | +| `at::zeros_like(tensor)` | `zeros_like(tensor)` | +| `at::zeros_strided(sizes, strides)` | `zeros_strided(sizes, strides)` | +| `at::ones(sizes)` | `ones(sizes)` | +| `at::ones_like(tensor)` | `ones_like(tensor)` | +| `at::ones_strided(sizes, strides)` | `ones_strided(sizes, strides)` | +| `at::rand(sizes)` | `rand(sizes)` | +| `at::rand_like(tensor)` | `rand_like(tensor)` | +| `at::randn(sizes)` | `randn(sizes)` | +| `at::randn_like(tensor)` | `randn_like(tensor)` | +| `at::randint(low, high, sizes)` | `randint(low, high, sizes)` | +| `at::randint_like(tensor, low, high)` | `randint_like(tensor, low, high)` | + +## Best Practices + +- *Manage Lifetimes Carefully*: Even though `TensorPtr` handles memory management, ensure that any non-owned data (e.g., when using `from_blob()`) remains valid while the tensor is in use. 
+- *Use Convenience Functions*: Utilize helper functions for common tensor creation patterns to write cleaner and more readable code. +- *Be Aware of Data Ownership*: Know whether your tensor owns its data or references external data to avoid unintended side effects or memory leaks. +- *Ensure `TensorPtr` Outlives `EValue`*: When passing tensors to modules that expect `EValue`, ensure that the `TensorPtr` remains valid as long as the `EValue` is in use. + +## Conclusion + +The `TensorPtr` in ExecuTorch simplifies tensor memory management by bundling the data and dynamic metadata into a smart pointer. This design eliminates the need for users to manage multiple pieces of data and ensures safer and more maintainable code. + +By providing interfaces similar to PyTorch's ATen library, ExecuTorch simplifies the adoption of the new API, allowing developers to transition without a steep learning curve. diff --git a/docs/source/getting-started-architecture.md b/docs/source/getting-started-architecture.md index bccb74b2104..937b5b389f5 100644 --- a/docs/source/getting-started-architecture.md +++ b/docs/source/getting-started-architecture.md @@ -89,6 +89,6 @@ _Executor_ is the entry point to load the program and execute it. The execution ## Developer Tools -It should be efficient for users to go from research to production using the flow above. Productivity is essentially important, for users to author, optimize and deploy their models. We provide [ExecuTorch Developer Tools](./sdk-overview.md) to improve productivity. The Developer Tools are not in the diagram. Instead it's a tool set that covers the developer workflow in all three phases. +It should be efficient for users to go from research to production using the flow above. Productivity is essentially important, for users to author, optimize and deploy their models. We provide [ExecuTorch Developer Tools](./devtools-overview.md) to improve productivity. The Developer Tools are not in the diagram. Instead it's a tool set that covers the developer workflow in all three phases. During the program preparation and execution, users can use the ExecuTorch Developer Tools to profile, debug, or visualize the program. Since the end-to-end flow is within the PyTorch ecosystem, users can correlate and display performance data along with graph visualization as well as direct references to the program source code and model hierarchy. We consider this to be a critical component for quickly iterating and lowering PyTorch programs to edge devices and environments. diff --git a/docs/source/getting-started-setup.md b/docs/source/getting-started-setup.md index 1fbe35c72bc..15fa084e33f 100644 --- a/docs/source/getting-started-setup.md +++ b/docs/source/getting-started-setup.md @@ -110,6 +110,23 @@ Alternatively, if you would like to experiment with ExecuTorch quickly and easil ``` After setting up your environment, you are ready to convert your PyTorch programs to ExecuTorch. + +> **_NOTE:_** Cleaning the build system +> +> When fetching a new version of the upstream repo (via `git fetch` or `git +> pull`) it is a good idea to clean the old build artifacts. The build system +> does not currently adapt well to changes in build dependencies. +> +> You should also update and pull the submodules again, in case their versions +> have changed. 
+> +> ```bash +> # From the root of the executorch repo: +> rm -rf cmake-out pip-out +> git submodule sync +> git submodule update --init +> ``` + ## Create an ExecuTorch program After setting up your environment, you are ready to convert your PyTorch programs @@ -169,13 +186,30 @@ For now, let's use [`executor_runner`](https://github.com/pytorch/executorch/blo ### Build Tooling Setup The ExecuTorch repo uses CMake to build its C++ code. Here, we'll configure it to build the `executor_runner` tool to run it on our desktop OS. ```bash - # Clean and configure the CMake build system. Compiled programs will appear in the executorch/cmake-out directory we create here. + # Clean and configure the CMake build system. Compiled programs will + # appear in the executorch/cmake-out directory we create here. (rm -rf cmake-out && mkdir cmake-out && cd cmake-out && cmake ..) # Build the executor_runner target cmake --build cmake-out --target executor_runner -j9 ``` +> **_NOTE:_** Cleaning the build system +> +> When fetching a new version of the upstream repo (via `git fetch` or `git +> pull`) it is a good idea to clean the old build artifacts. The build system +> does not currently adapt well to changes in build dependencies. +> +> You should also update and pull the submodules again, in case their versions +> have changed. +> +> ```bash +> # From the root of the executorch repo: +> rm -rf cmake-out pip-out +> git submodule sync +> git submodule update --init +> ``` + ### Run Your Program Now that we've exported a program and built the runtime, let's execute it! diff --git a/docs/source/index.rst b/docs/source/index.rst index 3b0e0959cd7..1e1060f70b7 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -94,7 +94,8 @@ Topics in this section will help you get started with ExecuTorch. tutorials/export-to-executorch-tutorial running-a-model-cpp-tutorial extension-module - tutorials/sdk-integration-tutorial + extension-tensor + tutorials/devtools-integration-tutorial apple-runtime demo-apps-ios demo-apps-android @@ -117,6 +118,9 @@ Topics in this section will help you get started with ExecuTorch. :hidden: llm/getting-started + llm/llama-demo-android + llm/build-run-llama3-qualcomm-ai-engine-direct-backend + llm/llama-demo-ios .. toctree:: :glob: @@ -196,15 +200,16 @@ Topics in this section will help you get started with ExecuTorch. :caption: Developer Tools :hidden: - sdk-overview - sdk-bundled-io - sdk-etrecord - sdk-etdump - sdk-profiling - sdk-debugging - sdk-inspector - sdk-delegate-integration - sdk-tutorial + devtools-overview + bundled-io + etrecord + etdump + runtime-profiling + model-debugging + model-inspector + memory-planning-inspection + delegate-debugging + devtools-tutorial .. toctree:: :glob: @@ -243,11 +248,18 @@ ExecuTorch tutorials. :link: extension-module.html :tags: +.. customcarditem:: + :header: Managing Tensor Memory in C++ Tutorial + :card_description: A tutorial for managing the dynamic memory when working with tensors. + :image: _static/img/generic-pytorch-logo.png + :link: extension-tensor.html + :tags: + .. customcarditem:: :header: Using the ExecuTorch Developer Tools to Profile a Model :card_description: A tutorial for using the ExecuTorch Developer Tools to profile and analyze a model with linkage back to source code. :image: _static/img/generic-pytorch-logo.png - :link: tutorials/sdk-integration-tutorial.html + :link: tutorials/devtools-integration-tutorial.html :tags: devtools .. 
customcarditem:: diff --git a/docs/source/kernel-library-custom-aten-kernel.md b/docs/source/kernel-library-custom-aten-kernel.md index 2cf87ca4588..0f060d1c5e5 100644 --- a/docs/source/kernel-library-custom-aten-kernel.md +++ b/docs/source/kernel-library-custom-aten-kernel.md @@ -3,23 +3,49 @@ At the last stage of [ExecuTorch model exporting](./export-overview.md), we lower the operators in the dialect to the _out variants_ of the [core ATen operators](./ir-ops-set-definition.md). Then we serialize these operator names into the model artifact. During runtime execution, for each operator name we will need to find the actual _kernels_, i.e., the C++ functions that do the heavy-lifting calculations and return results. -Portable kernel library is the in-house default kernel library, it’s easy to use and portable for most of the target backends. However it’s not optimized for performance, because it’s not specialized for any certain target. Therefore we provide kernel registration APIs for ExecuTorch users to easily register their own optimized kernels. +## Kernel Libraries +### First-party kernel libraries: +**[Portable kernel library](https://github.com/pytorch/executorch/tree/main/kernels/portable)** is the in-house default kernel library that covers most of the core ATen operators. It’s easy to use/read and is written in portable C++17. However, it’s not optimized for performance because it’s not specialized for any particular target. Therefore we provide kernel registration APIs for ExecuTorch users to easily register their own optimized kernels. -## Design Principles +**[Optimized kernel library](https://github.com/pytorch/executorch/tree/main/kernels/optimized)** specializes in performance for some of the operators, leveraging existing third-party libraries such as [EigenBLAS](https://gitlab.com/libeigen/eigen). It works best alongside the portable kernel library, with a good balance between portability and performance. One example of combining these two libraries can be found [here](https://github.com/pytorch/executorch/blob/main/configurations/CMakeLists.txt). -**What do we support?** On the operator coverage side, the kernel registration APIs allow users to register kernels for all core ATen ops as well as custom ops, as long as the custom ops schemas are specified. +**[Quantized kernel library](https://github.com/pytorch/executorch/tree/main/kernels/quantized)** implements operators for quantization and dequantization. These operators are outside of the core ATen operator set, but are vital to most production use cases. -Notice that we also support partial kernels, for example the kernel only supports a subset of tensor dtypes and/or dim orders. +### Custom kernel libraries: -**Kernel contract**: kernels need to comply with the following requirements: +**Custom kernels implementing core ATen ops**. Even though we don't have an internal example for custom kernels for core ATen ops, the optimized kernel library can be viewed as a good example. We have optimized [`add.out`](https://github.com/pytorch/executorch/blob/main/kernels/optimized/cpu/op_add.cpp) and a portable [`add.out`](https://github.com/pytorch/executorch/blob/main/kernels/portable/cpu/op_add.cpp). When a user combines these two libraries, we provide APIs to choose which kernel to use for `add.out`. In order to author and use custom kernels implementing core ATen ops, using the [YAML based approach](#yaml-entry-api-for-core-aten-op-out-variant) is recommended, because it provides full-fledged support for + 1.
combining kernel libraries and defining fallback kernels; + 2. using selective build to minimize the kernel size. + +A **[Custom operator](https://github.com/pytorch/executorch/tree/main/extension/llm/custom_ops)** is any operator that an ExecuTorch user defines outside of PyTorch's [`native_functions.yaml`](https://github.com/pytorch/pytorch/blob/main/aten/src/ATen/native/native_functions.yaml). + +## Operator & Kernel Contract + +All the kernels mentioned above, whether they are in-house or customized, should comply with the following requirements: * Match the calling convention derived from operator schema. The kernel registration API will generate headers for the custom kernels as references. -* Satisfy the dtype constraints defined in edge dialect. For tensors with certain dtypes as arguments, the result of a custom kernel needs to match the expected dtypes. The constraints are available in edge dialect ops. -* Gives correct result. We will provide a testing framework to automatically test the custom kernels. +* Satisfy the dtype constraints defined in edge dialect. For tensors with certain dtypes as arguments, the result of a custom kernel needs to match the expected dtypes. The constraints are available in edge dialect ops. +* Give correct results. We will provide a testing framework to automatically test the custom kernels. + + +## APIs + +These are the APIs available to register kernels/custom kernels/custom ops into ExecuTorch: + +* [YAML Entry API](#yaml-entry-api-high-level-architecture) + - [for core ATen op with custom kernels](#yaml-entry-api-for-core-aten-op-out-variant) + - [for custom ops](#yaml-entry-api-for-custom-ops) + - [CMake Macros](#cmake-macros) +* C++ API + - [for custom ops](#c-api-for-custom-ops) + - [CMake Example](#compile-and-link-the-custom-kernel) + +If it's not clear which API to use, please see [Best Practices](#custom-ops-api-best-practices). + -## High Level Architecture +### YAML Entry API High Level Architecture ![](./_static/img/kernel-library-custom-aten-kernel.png) @@ -27,10 +53,10 @@ ExecuTorch users are asked to provide: 1. the custom kernel library with C++ implementations -2. a yaml file associated with the library that describes what operators are being implemented by this library. For partial kernels, the yaml file also contains information on the dtypes and dim orders supported by the kernel. More details in the API section. +2. a YAML file associated with the library that describes what operators are being implemented by this library. For partial kernels, the YAML file also contains information on the dtypes and dim orders supported by the kernel. More details in the API section. -### Workflow +### YAML Entry API Workflow At build time, the yaml files associated with kernel libraries will be passed to the _kernel resolver_ along with the model op info (see selective build doc) and the outcome is a mapping between a combination of operator names and tensor metadata, to kernel symbols. Then codegen tools will use this mapping to generate C++ bindings that connect the kernels to ExecuTorch runtime. ExecuTorch users need to link this generated library into their application to use these kernels. @@ -38,18 +64,10 @@ At static object initialization time, kernels will be registered into the ExecuT At runtime initialization stage, ExecuTorch will use the operator name and argument metadata as a key to lookup for the kernels.
For example, with “aten::add.out” and inputs being float tensors with dim order (0, 1, 2, 3), ExecuTorch will go into the kernel registry and lookup for a kernel that matches the name and the input metadata. - -## APIs - -There are two sets of APIs: yaml files that describe kernel - operator mappings and codegen tools to consume these mappings. - - -### Yaml Entry for Core ATen Op Out Variant +### YAML Entry API for Core ATen Op Out Variant Top level attributes: - - * `op` (if the operator appears in `native_functions.yaml`) or `func` for custom operator. The value for this key needs to be the full operator name (including overload name) for `op` key, or a full operator schema (namespace, operator name, operator overload name and schema string), if we are describing a custom operator. For schema syntax please refer to this [instruction](https://github.com/pytorch/pytorch/blob/main/aten/src/ATen/native/README.md). * `kernels`: defines kernel information. It consists of `arg_meta` and `kernel_name`, which are bound together to describe "for input tensors with these metadata, use this kernel". * `type_alias`(optional): we are giving aliases to possible dtype options. `T0: [Double, Float]` means `T0` can be one of `Double` or `Float`. @@ -86,86 +104,9 @@ ATen operator with a dtype/dim order specialized kernel (works for `Double` dtyp kernel_name: torch::executor::add_out ``` -### Custom Ops C++ API - -For a custom kernel that implements a custom operator, we provides 2 ways to register it into ExecuTorch runtime: -1. Using `EXECUTORCH_LIBRARY` and `WRAP_TO_ATEN` C++ macros, covered by this section. -2. Using `functions.yaml` and codegen'd C++ libraries, covered by [next section](#custom-ops-yaml-entry). - -Please refer to [Custom Ops Best Practices](#custom-ops-api-best-practices) on which API to use. - -The first option requires C++17 and doesn't have selective build support yet, but it's faster than the second option where we have to go through yaml authoring and build system tweaking. - -The first option is particularly suitable for fast prototyping but can also be used in production. - -Similar to `TORCH_LIBRARY`, `EXECUTORCH_LIBRARY` takes the operator name and the C++ function name and register them into ExecuTorch runtime. - -#### Prepare custom kernel implementation - -Define your custom operator schema for both functional variant (used in AOT compilation) and out variant (used in ExecuTorch runtime). The schema needs to follow PyTorch ATen convention (see native_functions.yaml). For example: - -```yaml -custom_linear(Tensor weight, Tensor input, Tensor(?) bias) -> Tensor -custom_linear.out(Tensor weight, Tensor input, Tensor(?) bias, *, Tensor(a!) out) -> Tensor(a!) -``` - -Then write your custom kernel according to the schema using ExecuTorch types, along with APIs to register to ExecuTorch runtime: - - -```c++ -// custom_linear.h/custom_linear.cpp -#include -Tensor& custom_linear_out(const Tensor& weight, const Tensor& input, optional bias, Tensor& out) { - // calculation - return out; -} -``` -#### Use a C++ macro to register it into PyTorch & ExecuTorch - -Append the following line in the example above: -```c++ -// custom_linear.h/custom_linear.cpp -// opset namespace myop -EXECUTORCH_LIBRARY(myop, "custom_linear.out", custom_linear_out); -``` - -Now we need to write some wrapper for this op to show up in PyTorch, but don’t worry we don’t need to rewrite the kernel. 
Create a separate .cpp for this purpose: - -```c++ -// custom_linear_pytorch.cpp -#include "custom_linear.h" -#include - -at::Tensor custom_linear(const at::Tensor& weight, const at::Tensor& input, std::optional bias) { - // initialize out - at::Tensor out = at::empty({weight.size(1), input.size(1)}); - // wrap kernel in custom_linear.cpp into ATen kernel - WRAP_TO_ATEN(custom_linear_out, 3)(weight, input, bias, out); - return out; -} -// standard API to register ops into PyTorch -TORCH_LIBRARY(myop, m) { - m.def("custom_linear(Tensor weight, Tensor input, Tensor(?) bias) -> Tensor", custom_linear); - m.def("custom_linear.out(Tensor weight, Tensor input, Tensor(?) bias, *, Tensor(a!) out) -> Tensor(a!)", WRAP_TO_ATEN(custom_linear_out, 3)); -} -``` - -#### Compile and link the custom kernel - -Link it into ExecuTorch runtime: In our `CMakeLists.txt`` that builds the binary/application, we just need to add custom_linear.h/cpp into the binary target. We can build a dynamically loaded library (.so or .dylib) and link it as well. - -Link it into PyTorch runtime: We need to package custom_linear.h, custom_linear.cpp and custom_linear_pytorch.cpp into a dynamically loaded library (.so or .dylib) and load it into our python environment. One way of doing this is: - -```python -import torch -torch.ops.load_library("libcustom_linear.so/dylib") - -# Now we have access to the custom op, backed by kernel implemented in custom_linear.cpp. -op = torch.ops.myop.custom_linear.default -``` -### Custom Ops Yaml Entry +### YAML Entry API for Custom Ops As mentioned above, this option provides more support in terms of selective build and features such as merging operator libraries. @@ -215,14 +156,11 @@ ExecuTorch does not support all of the argument types that core PyTorch supports * List> * Optional> - -### Build Tool Macros +#### CMake Macros We provide build time macros to help users to build their kernel registration library. The macro takes the yaml file describing the kernel library as well as model operator metadata, and packages the generated C++ bindings into a C++ library. The macro is available on CMake. -#### CMake - `generate_bindings_for_kernels(FUNCTIONS_YAML functions_yaml CUSTOM_OPS_YAML custom_ops_yaml)` takes a yaml file for core ATen op out variants and also a yaml file for custom ops, generate C++ bindings for kernel registration. It also depends on the selective build artifact generated by `gen_selected_ops()`, see selective build doc for more information. Then `gen_operators_lib` will package those bindings to be a C++ library. As an example: ```cmake # SELECT_OPS_LIST: aten::add.out,aten::mm.out @@ -263,6 +201,103 @@ And out fallback: The merged yaml will have the entry in functions.yaml. +### C++ API for Custom Ops + +Unlike the YAML entry API, the C++ API uses only the C++ macros `EXECUTORCH_LIBRARY` and `WRAP_TO_ATEN` for kernel registration, and it does not have selective build support yet. This makes the C++ API faster in terms of development speed, since users don't have to do YAML authoring or build system tweaking. + +Please refer to [Custom Ops Best Practices](#custom-ops-api-best-practices) for guidance on which API to use. + +Similar to [`TORCH_LIBRARY`](https://pytorch.org/cppdocs/library.html#library_8h_1a0bd5fb09d25dfb58e750d712fc5afb84) in PyTorch, `EXECUTORCH_LIBRARY` takes the operator name and the C++ function name and registers them into the ExecuTorch runtime.
+ +#### Prepare custom kernel implementation + +Define your custom operator schema for both functional variant (used in AOT compilation) and out variant (used in ExecuTorch runtime). The schema needs to follow PyTorch ATen convention (see `native_functions.yaml`). For example: + +```yaml +custom_linear(Tensor weight, Tensor input, Tensor(?) bias) -> Tensor +custom_linear.out(Tensor weight, Tensor input, Tensor(?) bias, *, Tensor(a!) out) -> Tensor(a!) +``` + +Then write your custom kernel according to the schema using ExecuTorch types, along with APIs to register to ExecuTorch runtime: + + +```c++ +// custom_linear.h/custom_linear.cpp +#include +Tensor& custom_linear_out(const Tensor& weight, const Tensor& input, optional<Tensor> bias, Tensor& out) { + // calculation + return out; +} +``` +#### Use a C++ macro to register it into ExecuTorch + +Append the following line in the example above: +```c++ +// custom_linear.h/custom_linear.cpp +// opset namespace myop +EXECUTORCH_LIBRARY(myop, "custom_linear.out", custom_linear_out); +``` + +Now we need to write some wrapper for this op to show up in PyTorch, but don’t worry we don’t need to rewrite the kernel. Create a separate .cpp for this purpose: + +```c++ +// custom_linear_pytorch.cpp +#include "custom_linear.h" +#include + +at::Tensor custom_linear(const at::Tensor& weight, const at::Tensor& input, std::optional<at::Tensor> bias) { + // initialize out + at::Tensor out = at::empty({weight.size(1), input.size(1)}); + // wrap kernel in custom_linear.cpp into ATen kernel + WRAP_TO_ATEN(custom_linear_out, 3)(weight, input, bias, out); + return out; +} +// standard API to register ops into PyTorch +TORCH_LIBRARY(myop, m) { + m.def("custom_linear(Tensor weight, Tensor input, Tensor(?) bias) -> Tensor", custom_linear); + m.def("custom_linear.out(Tensor weight, Tensor input, Tensor(?) bias, *, Tensor(a!) out) -> Tensor(a!)", WRAP_TO_ATEN(custom_linear_out, 3)); +} +``` + +#### Compile and link the custom kernel + +Link it into ExecuTorch runtime: In our `CMakeLists.txt` that builds the binary/application, we need to add custom_linear.h/cpp into the binary target. We can build a dynamically loaded library (.so or .dylib) and link it as well. + +Here's an example to do it: + +```cmake +# For target_link_options_shared_lib +include(${EXECUTORCH_ROOT}/build/Utils.cmake) + +# Add a custom op library +add_library(custom_op_lib SHARED ${CMAKE_CURRENT_SOURCE_DIR}/custom_op.cpp) + +# Include the header +target_include_directories(custom_op_lib PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include) + +# Link ExecuTorch library +target_link_libraries(custom_op_lib PUBLIC executorch) + +# Define a binary target +add_executable(custom_op_runner main.cpp) + +# Link this library with --whole-archive !! IMPORTANT !! this is to avoid the operators being stripped by the linker +target_link_options_shared_lib(custom_op_lib) + +# Link custom op lib +target_link_libraries(custom_op_runner PUBLIC custom_op_lib) + +``` + +Link it into the PyTorch runtime: We need to package custom_linear.h, custom_linear.cpp and custom_linear_pytorch.cpp into a dynamically loaded library (.so or .dylib) and load it into our python environment. One way of doing this is: + +```python +import torch +torch.ops.load_library("libcustom_linear.so/dylib") + +# Now we have access to the custom op, backed by kernel implemented in custom_linear.cpp.
+op = torch.ops.myop.custom_linear.default +``` ### Custom Ops API Best Practices diff --git a/docs/source/kernel-library-selective-build.md b/docs/source/kernel-library-selective-build.md index 68d999bbc00..1a7562942e0 100644 --- a/docs/source/kernel-library-selective-build.md +++ b/docs/source/kernel-library-selective-build.md @@ -36,11 +36,16 @@ The basic flow looks like this: ## APIs -We expose build macros for CMake, to allow users specifying op info: +We expose a CMake macro, [`gen_selected_ops`](https://github.com/pytorch/executorch/blob/main/build/Codegen.cmake#L12), to allow users to specify op info: -[gen_selected_ops](https://github.com/pytorch/executorch/blob/main/build/Codegen.cmake#L12) - -Build macros take the following inputs: +``` +gen_selected_ops( + LIB_NAME # the name of the selective build operator library to be generated + OPS_SCHEMA_YAML # path to a yaml file containing operators to be selected + ROOT_OPS # comma separated operator names to be selected + INCLUDE_ALL_OPS # boolean flag to include all operators +) +``` ### Select all ops diff --git a/docs/source/llm/build-run-llama3-qualcomm-ai-engine-direct-backend.md b/docs/source/llm/build-run-llama3-qualcomm-ai-engine-direct-backend.md new file mode 100644 index 00000000000..ac95fb21bd8 --- /dev/null +++ b/docs/source/llm/build-run-llama3-qualcomm-ai-engine-direct-backend.md @@ -0,0 +1,128 @@ +# Building and Running Llama 3 8B Instruct with Qualcomm AI Engine Direct Backend + +This tutorial demonstrates how to export Llama 3 8B Instruct for the Qualcomm AI Engine Direct Backend and run the model on a Qualcomm device. + +## Prerequisites + +- Set up your ExecuTorch repo and dev environment if you haven’t done so already by following [Setting up ExecuTorch](../getting-started-setup.md). +- Read [the Building and Running ExecuTorch with Qualcomm AI Engine Direct Backend page](../build-run-qualcomm-ai-engine-direct-backend.md) to understand how to export and run a model with Qualcomm AI Engine Direct Backend on a Qualcomm device. +- Follow [the README for executorch llama](https://github.com/pytorch/executorch/tree/main/examples/models/llama2) to learn how to run a Llama model on mobile via ExecuTorch. +- A Qualcomm device with 16GB RAM + - We are continuing to optimize our memory usage to ensure compatibility with lower memory devices. +- The version of the [Qualcomm AI Engine Direct SDK](https://developer.qualcomm.com/software/qualcomm-ai-engine-direct-sdk) must be 2.26.0 or above. + +## Instructions + +### Step 1: Prepare the checkpoint of the model and the optimized matrix from [Spin Quant](https://github.com/facebookresearch/SpinQuant) + +1. For the Llama 3 tokenizer and checkpoint, please refer to https://github.com/meta-llama/llama-models/blob/main/README.md for further instructions on how to download `tokenizer.model`, `consolidated.00.pth` and `params.json`. +2. To get the optimized matrix, please refer to [SpinQuant on GitHub](https://github.com/facebookresearch/SpinQuant). You can download the optimized rotation matrices in the Quantized Models section. Please choose **LLaMA-3-8B/8B_W4A16KV16_lr_1.5_seed_0**. + +### Step 2: Export to ExecuTorch with Qualcomm AI Engine Direct Backend +Deploying large language models like Llama 3 on-device presents the following challenges: + +1. The model size is too large to fit in device memory for inference. +2. High model loading and inference time. +3. Difficulty in quantization. + +To address these challenges, we have implemented the following solutions: +1.
Using `--pt2e_quantize qnn_16a4w` to quantize activations and weights, thereby reducing the on-disk model size and alleviating memory pressure during inference. +2. Using `--num_sharding 8` to shard the model into sub-parts. +3. Performing graph transformations to convert or decompose operations into more accelerator-friendly operations. +4. Using `--optimized_rotation_path ` to apply R1 and R2 of [Spin Quant](https://github.com/facebookresearch/SpinQuant) to improve accuracy. +5. Using `--calibration_data "<|start_header_id|>system<|end_header_id|..."` to ensure that during the quantization of Llama 3 8B instruct, the calibration includes special tokens in the prompt template. For more details on the prompt template, refer to [the model card of meta llama3 instruct](https://llama.meta.com/docs/model-cards-and-prompt-formats/meta-llama-3/). + +To export Llama 3 8B instruct with the Qualcomm AI Engine Direct Backend, ensure the following: + +1. The host machine has more than 100GB of memory (RAM + swap space). +2. The entire process takes a few hours. + +```bash +# Please note that calibration_data must include the prompt template for special tokens. +python -m examples.models.llama2.export_llama -t +llama3/Meta-Llama-3-8B-Instruct/tokenizer.model -p -c --use_kv_cache --qnn --pt2e_quantize qnn_16a4w --disable_dynamic_shape --num_sharding 8 --calibration_tasks wikitext --calibration_limit 1 --calibration_seq_length 128 --optimized_rotation_path --calibration_data "<|start_header_id|>system<|end_header_id|>\n\nYou are a funny chatbot.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nCould you tell me about Facebook?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n" +``` + +### Step3: Invoke the Runtime on an Android smartphone with Qualcomm SoCs +1. Build executorch with Qualcomm AI Engine Direct Backend for android + ```bash + cmake \ + -DCMAKE_TOOLCHAIN_FILE="${ANDROID_NDK_ROOT}/build/cmake/android.toolchain.cmake" \ + -DANDROID_ABI=arm64-v8a \ + -DANDROID_PLATFORM=android-23 \ + -DCMAKE_INSTALL_PREFIX=cmake-android-out \ + -DCMAKE_BUILD_TYPE=Release \ + -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ + -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ + -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ + -DEXECUTORCH_BUILD_QNN=ON \ + -DQNN_SDK_ROOT=${QNN_SDK_ROOT} \ + -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ + -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \ + -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \ + -Bcmake-android-out . + + cmake --build cmake-android-out -j16 --target install --config Release + ``` +2. Build llama runner for android +```bash + cmake \ + -DCMAKE_TOOLCHAIN_FILE="${ANDROID_NDK_ROOT}"/build/cmake/android.toolchain.cmake \ + -DANDROID_ABI=arm64-v8a \ + -DANDROID_PLATFORM=android-23 \ + -DCMAKE_INSTALL_PREFIX=cmake-android-out \ + -DCMAKE_BUILD_TYPE=Release -DPYTHON_EXECUTABLE=python \ + -DEXECUTORCH_BUILD_QNN=ON \ + -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ + -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \ + -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \ + -Bcmake-android-out/examples/models/llama2 examples/models/llama2 + + cmake --build cmake-android-out/examples/models/llama2 -j16 --config Release +``` +3. Run on Android via adb shell +*Pre-requisite*: Make sure you enable USB debugging via developer options on your phone + +**3.1 Connect your android phone** + +**3.2 We need to push required QNN libraries to the device.** +```bash +# make sure you have write-permission on below path. 
+DEVICE_DIR=/data/local/tmp/llama +adb shell mkdir -p ${DEVICE_DIR} +adb push ${QNN_SDK_ROOT}/lib/aarch64-android/libQnnHtp.so ${DEVICE_DIR} +adb push ${QNN_SDK_ROOT}/lib/aarch64-android/libQnnSystem.so ${DEVICE_DIR} +adb push ${QNN_SDK_ROOT}/lib/aarch64-android/libQnnHtpV69Stub.so ${DEVICE_DIR} +adb push ${QNN_SDK_ROOT}/lib/aarch64-android/libQnnHtpV73Stub.so ${DEVICE_DIR} +adb push ${QNN_SDK_ROOT}/lib/aarch64-android/libQnnHtpV75Stub.so ${DEVICE_DIR} +adb push ${QNN_SDK_ROOT}/lib/hexagon-v69/unsigned/libQnnHtpV69Skel.so ${DEVICE_DIR} +adb push ${QNN_SDK_ROOT}/lib/hexagon-v73/unsigned/libQnnHtpV73Skel.so ${DEVICE_DIR} +adb push ${QNN_SDK_ROOT}/lib/hexagon-v75/unsigned/libQnnHtpV75Skel.so ${DEVICE_DIR} +``` + +**3.3 Upload the model, tokenizer and llama runner binary to the phone** +```bash +adb push ${DEVICE_DIR} +adb push ${DEVICE_DIR} +adb push cmake-android-out/lib/libqnn_executorch_backend.so ${DEVICE_DIR} +adb push cmake-android-out/examples/models/llama2/llama_main ${DEVICE_DIR} +``` + +**3.4 Run model** +```bash +adb shell "cd ${DEVICE_DIR} && ./llama_main --model_path --tokenizer_path --prompt \"<|start_header_id|>system<|end_header_id|>\n\nYou are a funny chatbot.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nCould you tell me about Facebook?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n\" --seq_len 128" +``` +You should see output like the following: +``` +<|start_header_id|>system<|end_header_id|>\n\nYou are a funny chatbot.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nCould you tell me about Facebook?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nHello! I'd be delighted to chat with you about Facebook. Facebook is a social media platform that was created in 2004 by Mark Zuckerberg and his colleagues while he was a student at Harvard University. It was initially called "Facemaker" but later changed to Facebook, which is a combination of the words "face" and "book". The platform was initially intended for people to share their thoughts and share information with their friends, but it quickly grew to become one of the +``` + +## What is coming? +- Improve the performance for Llama 3 Instruct +- Reduce the memory pressure during inference to support 12GB Qualcomm devices +- Support more LLMs + +## FAQ + +If you encounter any issues while reproducing the tutorial, please file a GitHub +issue on the ExecuTorch repo and tag it with `#qcom_aisw`. diff --git a/docs/source/llm/getting-started.md b/docs/source/llm/getting-started.md index 46a5dc604fc..272098d4445 100644 --- a/docs/source/llm/getting-started.md +++ b/docs/source/llm/getting-started.md @@ -587,8 +587,8 @@ I'm not sure if you've heard of the "Curse of the Dragon" or not, but it's a ver The delegated model should be noticeably faster compared to the non-delegated model. For more information regarding backend delegateion, see the ExecuTorch guides -for the [XNNPACK Backend](../tutorial-xnnpack-delegate-lowering.md) and [Core ML -Backend](../build-run-coreml.md). +for the [XNNPACK Backend](../tutorial-xnnpack-delegate-lowering.md), [Core ML +Backend](../build-run-coreml.md) and [Qualcomm AI Engine Direct Backend](build-run-llama3-qualcomm-ai-engine-direct-backend.md). ## Quantization @@ -752,7 +752,7 @@ Through the ExecuTorch Developer Tools, users are able to profile model executio ##### ETRecord generation (Optional) -An ETRecord is an artifact generated at the time of export that contains model graphs and source-level metadata linking the ExecuTorch program to the original PyTorch model.
You can view all profiling events without an ETRecord, though with an ETRecord, you will also be able to link each event to the types of operators being executed, module hierarchy, and stack traces of the original PyTorch source code. For more information, see [the ETRecord docs](../sdk-etrecord.md). +An ETRecord is an artifact generated at the time of export that contains model graphs and source-level metadata linking the ExecuTorch program to the original PyTorch model. You can view all profiling events without an ETRecord, though with an ETRecord, you will also be able to link each event to the types of operators being executed, module hierarchy, and stack traces of the original PyTorch source code. For more information, see [the ETRecord docs](../etrecord.md). In your export script, after calling `to_edge()` and `to_executorch()`, call `generate_etrecord()` with the `EdgeProgramManager` from `to_edge()` and the `ExecuTorchProgramManager` from `to_executorch()`. Make sure to copy the `EdgeProgramManager`, as the call to `to_backend()` mutates the graph in-place. @@ -774,7 +774,7 @@ Run the export script and the ETRecord will be generated as `etrecord.bin`. ##### ETDump generation -An ETDump is an artifact generated at runtime containing a trace of the model execution. For more information, see [the ETDump docs](../sdk-etdump.md). +An ETDump is an artifact generated at runtime containing a trace of the model execution. For more information, see [the ETDump docs](../etdump.md). Include the ETDump header in your code. ```cpp @@ -808,7 +808,7 @@ if (result.buf != nullptr && result.size > 0) { Additionally, update CMakeLists.txt to build with Developer Tools and enable events to be traced and logged into ETDump: ``` -option(EXECUTORCH_BUILD_SDK "" ON) +option(EXECUTORCH_BUILD_DEVTOOLS "" ON) # ... @@ -843,7 +843,7 @@ This prints the performance data in a tabular format in “inspector_out.txt”, ![](../_static/img/llm_manual_print_data_tabular.png) View in full size -To learn more about the Inspector and the rich functionality it provides, see the [Inspector API Reference](../sdk-inspector.md). +To learn more about the Inspector and the rich functionality it provides, see the [Inspector API Reference](../model-inspector.md). ## Custom Kernels With the ExecuTorch custom operator APIs, custom operator and kernel authors can easily bring in their kernel into PyTorch/ExecuTorch. diff --git a/docs/source/memory-planning-inspection.md b/docs/source/memory-planning-inspection.md new file mode 100644 index 00000000000..47951a72038 --- /dev/null +++ b/docs/source/memory-planning-inspection.md @@ -0,0 +1,30 @@ +# Memory Planning Inspection in ExecuTorch + +After the [Memory Planning](https://pytorch.org/executorch/main/concepts.html#memory-planning) pass of ExecuTorch, memory allocation information is stored on the nodes of the [`ExportedProgram`](https://pytorch.org/executorch/main/concepts.html#exportedprogram). Here, we present a tool designed to inspect memory allocation and visualize all active tensor objects. + +## Usage +User should add this code after they call [to_executorch()](https://pytorch.org/executorch/main/export-to-executorch-api-reference.html#executorch.exir.EdgeProgramManager.to_executorch), and it will write memory allocation information stored on the nodes to the file path "memory_profile.json". The file is compatible with the Chrome trace viewer; see below for more information about interpreting the results. 
+ +```python +from executorch.util.activation_memory_profiler import generate_memory_trace +generate_memory_trace( + executorch_program_manager=prog, + chrome_trace_filename="memory_profile.json", + enable_memory_offsets=True, +) +``` +* `prog` is an instance of [`ExecutorchProgramManager`](https://pytorch.org/executorch/main/export-to-executorch-api-reference.html#executorch.exir.ExecutorchProgramManager), returned by [to_executorch()](https://pytorch.org/executorch/main/export-to-executorch-api-reference.html#executorch.exir.EdgeProgramManager.to_executorch). +* Set `enable_memory_offsets` to `True` to show the location of each tensor in the memory space. + +## Chrome Trace +Open a Chrome browser tab and navigate to . Upload the generated `.json` file to view. +Example of a [MobileNet V2](https://pytorch.org/vision/main/models/mobilenetv2.html) model: + +![Memory planning Chrome trace visualization](/_static/img/memory_planning_inspection.png) + +Note that, since we are repurposing the Chrome trace tool, the axes in this context may have different meanings compared to other Chrome trace graphs you may have encountered previously: +* The horizontal axis, despite being labeled in seconds (s), actually represents megabytes (MBs). +* The vertical axis has a 2-level hierarchy. The first level, "pid", represents memory space. For CPU, everything is allocated on one "space"; other backends may have multiple. In the second level, each row represents one time step. Since nodes are executed sequentially, each node represents one time step, thus there are as many rows as there are nodes. + +## Further Reading +* [Memory Planning](https://pytorch.org/executorch/main/compiler-memory-planning.html) diff --git a/docs/source/model-debugging.md b/docs/source/model-debugging.md new file mode 100644 index 00000000000..5475a703bd7 --- /dev/null +++ b/docs/source/model-debugging.md @@ -0,0 +1,82 @@ +# Debugging Models in ExecuTorch + +With the ExecuTorch Developer Tools, users can debug their models for numerical inaccuracies and extract model outputs from their device to do quality analysis (such as signal-to-noise ratio, mean squared error, etc.). + +Currently, ExecuTorch supports the following debugging flows: +- Extraction of model-level outputs via ETDump. +- Extraction of intermediate outputs (outside of delegates) via ETDump: + - Linking of these intermediate outputs back to the eager model Python code. + + +## Steps to debug a model in ExecuTorch + +### Runtime +For a real example reflecting the steps below, please refer to [example_runner.cpp](https://github.com/pytorch/executorch/blob/main/examples/devtools/example_runner/example_runner.cpp). + +1. [Optional] Generate an [ETRecord](./etrecord.rst) while exporting your model. When provided, this enables users to link profiling information back to the eager model source code (with stack traces and module hierarchy). +2. Integrate [ETDump generation](./etdump.md) into the runtime and set the debugging level by configuring the `ETDumpGen` object. Then, provide an additional buffer to which intermediate outputs and program outputs will be written.
Currently we support two levels of debugging: + - Program level outputs + ```C++ + Span<uint8_t> buffer((uint8_t*)debug_buffer, debug_buffer_size); + etdump_gen.set_debug_buffer(buffer); + etdump_gen.set_event_tracer_debug_level( + EventTracerDebugLogLevel::kProgramOutputs); + ``` + + - Intermediate outputs of executed (non-delegated) operations (will include the program level outputs too) + ```C++ + Span<uint8_t> buffer((uint8_t*)debug_buffer, debug_buffer_size); + etdump_gen.set_debug_buffer(buffer); + etdump_gen.set_event_tracer_debug_level( + EventTracerDebugLogLevel::kIntermediateOutputs); + ``` +3. Build the runtime with the pre-processor flag that enables tracking of debug events. Instructions are in the [ETDump documentation](./etdump.md). +4. Run your model and dump out the ETDump buffer as described [here](./etdump.md). (Do so similarly for the debug buffer if configured above.) + + +### Accessing the debug outputs post-run using the Inspector APIs +Once a model has been run, using the generated ETDump and debug buffers, users can leverage the [Inspector APIs](./model-inspector.rst) to inspect these debug outputs. + +```python +from executorch.devtools import Inspector + +# Create an Inspector instance with etdump and the debug buffer. +inspector = Inspector(etdump_path=etdump_path, + buffer_path = buffer_path, + # etrecord is optional, if provided it'll link back + # the runtime events to the eager model Python source code. + etrecord = etrecord_path) + +# Accessing program outputs is as simple as this: +for event_block in inspector.event_blocks: + if event_block.name == "Execute": + print(event_block.run_output) + +# Accessing intermediate outputs from each event (an event here is essentially an instruction that executed in the runtime). +for event_block in inspector.event_blocks: + if event_block.name == "Execute": + for event in event_block.events: + print(event.debug_data) + # If an ETRecord was provided by the user during Inspector initialization, users + # can print the stacktraces and module hierarchy of these events. + print(event.stack_traces) + print(event.module_hierarchy) +``` + +We've also provided a simple set of utilities that let users perform quality analysis of their model outputs with respect to a set of reference outputs (possibly from the eager mode model). + + +```python +from executorch.devtools.inspector import compare_results + +# Run a simple quality analysis between the model outputs sourced from the +# runtime and a set of reference outputs. +# +# Setting plot to True will result in the quality metrics being graphed +# and displayed (when run from a notebook) and will be written out to the +# filesystem. A dictionary will always be returned which will contain the +# results. +for event_block in inspector.event_blocks: + if event_block.name == "Execute": + compare_results(event_block.run_output, ref_outputs, plot = True) +``` diff --git a/docs/source/model-inspector.rst b/docs/source/model-inspector.rst new file mode 100644 index 00000000000..d80a8960b1b --- /dev/null +++ b/docs/source/model-inspector.rst @@ -0,0 +1,159 @@ +Inspector APIs +============== + +Overview +-------- + +The Inspector APIs provide a convenient interface for analyzing the +contents of `ETRecord `__ and +`ETDump `__, helping developers get insights about model +architecture and performance statistics. It’s built on top of the `EventBlock Class <#eventblock-class>`__ data structure, +which organizes a group of `Event <#event-class>`__\ s for easy access to details of profiling events.
+ +There are multiple ways in which users can interact with the Inspector +APIs: + +* By using `public methods <#inspector-methods>`__ provided by the ``Inspector`` class. +* By accessing the `public attributes <#inspector-attributes>`__ of the ``Inspector``, ``EventBlock``, and ``Event`` classes. +* By using a `CLI <#cli>`__ tool for basic functionalities. + +Please refer to the `e2e use case doc `__ get an understanding of how to use these in a real world example. + + +Inspector Methods +----------------- + +Constructor +~~~~~~~~~~~ + +.. autofunction:: executorch.devtools.Inspector.__init__ + +**Example Usage:** + +.. code:: python + + from executorch.devtools import Inspector + + inspector = Inspector(etdump_path="/path/to/etdump.etdp", etrecord="/path/to/etrecord.bin") + +to_dataframe +~~~~~~~~~~~~~~~~ + +.. autofunction:: executorch.devtools.Inspector.to_dataframe + + +print_data_tabular +~~~~~~~~~~~~~~~~~~ + +.. autofunction:: executorch.devtools.Inspector.print_data_tabular + +.. _example-usage-1: + +**Example Usage:** + +.. code:: python + + inspector.print_data_tabular() + +.. image:: _static/img/print_data_tabular.png +Note that the unit of delegate profiling events is "cycles". We're working on providing a way to set different units in the future. + + +find_total_for_module +~~~~~~~~~~~~~~~~~~~~~ + +.. autofunction:: executorch.devtools.Inspector.find_total_for_module + +.. _example-usage-2: + +**Example Usage:** + +.. code:: python + + print(inspector.find_total_for_module("L__self___conv_layer")) + +:: + + 0.002 + + +get_exported_program +~~~~~~~~~~~~~~~~~~~~ + +.. autofunction:: executorch.devtools.Inspector.get_exported_program + +.. _example-usage-3: + +**Example Usage:** + +.. code:: python + + print(inspector.get_exported_program()) + +:: + + ExportedProgram: + class GraphModule(torch.nn.Module): + def forward(self, arg0_1: f32[4, 3, 64, 64]): + # No stacktrace found for following nodes + _param_constant0 = self._param_constant0 + _param_constant1 = self._param_constant1 + + ### ... Omit part of the program for documentation readability ... ### + + Graph signature: ExportGraphSignature(parameters=[], buffers=[], user_inputs=['arg0_1'], user_outputs=['aten_tan_default'], inputs_to_parameters={}, inputs_to_buffers={}, buffers_to_mutate={}, backward_signature=None, assertion_dep_token=None) + Range constraints: {} + Equality constraints: [] + + +Inspector Attributes +-------------------- + +``EventBlock`` Class +~~~~~~~~~~~~~~~~~~~~ + +Access ``EventBlock`` instances through the ``event_blocks`` attribute +of an ``Inspector`` instance, for example: + +.. code:: python + + inspector.event_blocks + +.. autoclass:: executorch.devtools.inspector.EventBlock + +``Event`` Class +~~~~~~~~~~~~~~~ + +Access ``Event`` instances through the ``events`` attribute of an +``EventBlock`` instance. + +.. autoclass:: executorch.devtools.inspector.Event + +**Example Usage:** + +.. 
code:: python + + for event_block in inspector.event_blocks: + for event in event_block.events: + if event.name == "Method::execute": + print(event.perf_data.raw) + +:: + + [175.748, 78.678, 70.429, 122.006, 97.495, 67.603, 70.2, 90.139, 66.344, 64.575, 134.135, 93.85, 74.593, 83.929, 75.859, 73.909, 66.461, 72.102, 84.142, 77.774, 70.038, 80.246, 59.134, 68.496, 67.496, 100.491, 81.162, 74.53, 70.709, 77.112, 59.775, 79.674, 67.54, 79.52, 66.753, 70.425, 71.703, 81.373, 72.306, 72.404, 94.497, 77.588, 79.835, 68.597, 71.237, 88.528, 71.884, 74.047, 81.513, 76.116] + + +CLI +--- + +Execute the following command in your terminal to display the data +table. This command produces the identical table output as calling the +`print_data_tabular <#print-data-tabular>`__ mentioned earlier: + +.. code:: bash + + python3 -m devtools.inspector.inspector_cli --etdump_path --etrecord_path + +Note that the `etrecord_path` argument is optional. + +We plan to extend the capabilities of the CLI in the future. diff --git a/docs/source/native-delegates-executorch-xnnpack-delegate.md b/docs/source/native-delegates-executorch-xnnpack-delegate.md index 72e90161d0c..a26ae0c63ec 100644 --- a/docs/source/native-delegates-executorch-xnnpack-delegate.md +++ b/docs/source/native-delegates-executorch-xnnpack-delegate.md @@ -25,22 +25,22 @@ The partitioner is implemented by backend delegates to mark nodes suitable for l ##### Module-based partitioning -`source_fn` is embedded in the node’s metadata and gives information on where these nodes come from. For example, modules like `torch.nn.Linear` when captured and exported `to_edge` generate groups of nodes for their computation. The group of nodes associated with computing the linear module then has a `source_fn` of `torch.nn.Linear. Partitioning based on `source_fn` allows us to identify groups of nodes which are lowerable via XNNPACK. +`source_fn_stack` is embedded in the node’s metadata and gives information on where these nodes come from. For example, modules like `torch.nn.Linear` when captured and exported `to_edge` generate groups of nodes for their computation. The group of nodes associated with computing the linear module then has a `source_fn_stack` of `torch.nn.Linear. Partitioning based on `source_fn_stack` allows us to identify groups of nodes which are lowerable via XNNPACK. For example after capturing `torch.nn.Linear` you would find the following key in the metadata for the addmm node associated with linear: ```python ->>> print(linear_node.meta["source_fn"]) -'source_fn': ('fn', ) +>>> print(linear_node.meta["source_fn_stack"]) +'source_fn_stack': ('fn', ) ``` ##### Op-based partitioning -The `XnnpackPartitioner` also partitions using op targets. It traverses the graph and identifies individual nodes which are lowerable to XNNPACK. A drawback to module-based partitioning is that operators which come from [decompositions](https://github.com/pytorch/pytorch/blob/main/torch/_decomp/decompositions.py) may be skipped. For example, an operator like `torch.nn.Hardsigmoid` is decomposed into add, muls, divs, and clamps. While hardsigmoid is not lowerable, we can lower the decomposed ops. Relying on `source_fn` metadata would skip these lowerables because they belong to a non-lowerable module, so in order to improve model performance, we greedily lower operators based on the op targets as well as the `source_fn`. +The `XnnpackPartitioner` also partitions using op targets. It traverses the graph and identifies individual nodes which are lowerable to XNNPACK. 
A drawback to module-based partitioning is that operators which come from [decompositions](https://github.com/pytorch/pytorch/blob/main/torch/_decomp/decompositions.py) may be skipped. For example, an operator like `torch.nn.Hardsigmoid` is decomposed into add, muls, divs, and clamps. While hardsigmoid is not lowerable, we can lower the decomposed ops. Relying on `source_fn_stack` metadata would skip these lowerables because they belong to a non-lowerable module, so in order to improve model performance, we greedily lower operators based on the op targets as well as the `source_fn_stack`. ##### Passes -Before any serialization, we apply passes on the subgraphs to prepare the graph. These passes are essentially graph transformations that help improve the performance of the delegate. We give an overview of the most significant passes and their function below. For a description of all passes see [here](https://github.com/pytorch/executorch/tree/main/backends/xnnpack/passes): +Before any serialization, we apply passes on the subgraphs to prepare the graph. These passes are essentially graph transformations that help improve the performance of the delegate. We give an overview of the most significant passes and their function below. For a description of all passes see [here](https://github.com/pytorch/executorch/tree/main/backends/xnnpack/_passes): * Channels Last Reshape * ExecuTorch tensors tend to be contiguous before passing them into delegates, while XNNPACK only accepts channels-last memory layout. This pass minimizes the number of permutation operators inserted to pass in channels-last memory format. @@ -74,7 +74,7 @@ Since weight packing creates an extra copy of the weights inside XNNPACK, We fre When executing the XNNPACK subgraphs, we prepare the tensor inputs and outputs and feed them to the XNNPACK runtime graph. After executing the runtime graph, the output pointers are filled with the computed tensors. #### **Profiling** -We have enabled basic profiling for XNNPACK delegate that can be enabled with the following compiler flag `-DENABLE_XNNPACK_PROFILING`. With ExecuTorch's Developer Tools integration, you can also now use the Developer Tools to profile the model. You can follow the steps in [Using the ExecuTorch Developer Tools to Profile a Model](./tutorials/sdk-integration-tutorial) on how to profile ExecuTorch models and use Developer Tools' Inspector API to view XNNPACK's internal profiling information. +We have enabled basic profiling for XNNPACK delegate that can be enabled with the following compiler flag `-DENABLE_XNNPACK_PROFILING`. With ExecuTorch's Developer Tools integration, you can also now use the Developer Tools to profile the model. You can follow the steps in [Using the ExecuTorch Developer Tools to Profile a Model](./tutorials/devtools-integration-tutorial) on how to profile ExecuTorch models and use Developer Tools' Inspector API to view XNNPACK's internal profiling information. 
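As an illustration, one generic way to pass this preprocessor define when configuring an ExecuTorch CMake build is through `CMAKE_CXX_FLAGS` (a minimal sketch only; the other options shown are illustrative, and your build may expose a dedicated option for this flag instead):

```bash
# Configure an ExecuTorch build with the XNNPACK backend and the
# ENABLE_XNNPACK_PROFILING compile definition added to the C++ flags.
cmake \
  -DCMAKE_BUILD_TYPE=Release \
  -DEXECUTORCH_BUILD_XNNPACK=ON \
  -DCMAKE_CXX_FLAGS="-DENABLE_XNNPACK_PROFILING" \
  -Bcmake-out .

cmake --build cmake-out -j16
```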
[comment]: <> (TODO: Refactor quantizer to a more official quantization doc) @@ -110,9 +110,9 @@ quantizer.set_global(quantization_config) ### Quantizing your model with the XNNPACKQuantizer After configuring our quantizer, we are now ready to quantize our model ```python -from torch._export import capture_pre_autograd_graph +from torch.export import export_for_training -exported_model = capture_pre_autograd_graph(model_to_quantize, example_inputs) +exported_model = export_for_training(model_to_quantize, example_inputs).module() prepared_model = prepare_pt2e(exported_model, quantizer) print(prepared_model.graph) ``` diff --git a/docs/source/runtime-build-and-cross-compilation.md b/docs/source/runtime-build-and-cross-compilation.md index 5ad64cb24c3..4d42357618c 100644 --- a/docs/source/runtime-build-and-cross-compilation.md +++ b/docs/source/runtime-build-and-cross-compilation.md @@ -98,21 +98,20 @@ If it worked, you should see the message "Model executed successfully" followed by the output values. ``` -I 00:00:00.002052 executorch:executor_runner.cpp:75] Model file add.pte is loaded. -I 00:00:00.002086 executorch:executor_runner.cpp:85] Running method forward -I 00:00:00.002092 executorch:executor_runner.cpp:140] Setting up non-const buffer 1, size 48. -I 00:00:00.002149 executorch:executor_runner.cpp:181] Method loaded. -I 00:00:00.002154 executorch:util.h:105] input already initialized, refilling. -I 00:00:00.002157 executorch:util.h:105] input already initialized, refilling. -I 00:00:00.002159 executorch:executor_runner.cpp:186] Inputs prepared. -I 00:00:00.011684 executorch:executor_runner.cpp:195] Model executed successfully. -I 00:00:00.011709 executorch:executor_runner.cpp:210] 8.000000 +I 00:00:00.000526 executorch:executor_runner.cpp:82] Model file add.pte is loaded. +I 00:00:00.000595 executorch:executor_runner.cpp:91] Using method forward +I 00:00:00.000612 executorch:executor_runner.cpp:138] Setting up planned buffer 0, size 48. +I 00:00:00.000669 executorch:executor_runner.cpp:161] Method loaded. +I 00:00:00.000685 executorch:executor_runner.cpp:171] Inputs prepared. +I 00:00:00.000764 executorch:executor_runner.cpp:180] Model executed successfully. +I 00:00:00.000770 executorch:executor_runner.cpp:184] 1 outputs: +Output 0: tensor(sizes=[1], [2.]) ``` ## Cross compilation -Follwing are instruction on how to perform cross compilation for Android and iOS. +Following are instruction on how to perform cross compilation for Android and iOS. ### Android - Prerequisite: [Android NDK](https://developer.android.com/ndk), choose one of the following: diff --git a/docs/source/runtime-overview.md b/docs/source/runtime-overview.md index 6766e678e0e..1a421fdcc0a 100644 --- a/docs/source/runtime-overview.md +++ b/docs/source/runtime-overview.md @@ -33,7 +33,7 @@ The runtime is also responsible for: semantics of those operators. * Dispatching predetermined sections of the model to [backend delegates](compiler-delegate-and-partitioner.md) for acceleration. -* Optionally gathering [profiling data](sdk-profiling.md) during load and +* Optionally gathering [profiling data](runtime-profiling.md) during load and execution. 
## Design Goals @@ -159,7 +159,7 @@ For more details about the ExecuTorch runtime, please see: * [Simplified Runtime APIs Tutorial](extension-module.md) * [Runtime Build and Cross Compilation](runtime-build-and-cross-compilation.md) * [Runtime Platform Abstraction Layer](runtime-platform-abstraction-layer.md) -* [Runtime Profiling](sdk-profiling.md) +* [Runtime Profiling](runtime-profiling.md) * [Backends and Delegates](compiler-delegate-and-partitioner.md) * [Backend Delegate Implementation](runtime-backend-delegate-implementation-and-linking.md) * [Kernel Library Overview](kernel-library-overview.md) diff --git a/docs/source/runtime-profiling.md b/docs/source/runtime-profiling.md new file mode 100644 index 00000000000..c228971d28c --- /dev/null +++ b/docs/source/runtime-profiling.md @@ -0,0 +1,23 @@ +# Profiling Models in ExecuTorch + +Profiling in ExecuTorch gives users access to these runtime metrics: +- Model Load Time. +- Operator Level Execution Time. +- Delegate Execution Time. + - If the delegate that the user is calling into has been integrated with the [Developer Tools](./delegate-debugging.md), then users will also be able to access delegated operator execution time. +- End-to-end Inference Execution Time. + +One unique aspect of ExecuTorch profiling is the ability to link every runtime-executed operator back to the exact line of Python code from which the operator originated. This capability enables users to easily identify hotspots in their model, trace them back to the exact line of Python code, and optimize them if they choose to. + +We provide access to all the profiling data via the Python [Inspector API](./model-inspector.rst). The data mentioned above can be accessed through these interfaces, allowing users to perform any post-run analysis of their choice. + +## Steps to Profile a Model in ExecuTorch + +1. [Optional] Generate an [ETRecord](./etrecord.rst) while you're exporting your model. If provided, this enables users to link profiling details back to the eager model source code (with stack traces and module hierarchy). +2. Build the runtime with the pre-processor flags that enable profiling, as detailed in the [ETDump documentation](./etdump.md). +3. Run your Program on the ExecuTorch runtime and generate an [ETDump](./etdump.md). +4. Create an instance of the [Inspector API](./model-inspector.rst) by passing in the ETDump you have sourced from the runtime along with the optionally generated ETRecord from step 1. + - Through the Inspector API, users can perform a wide range of analyses, from printing out performance details to doing finer-grained calculations at the module level. + + +Please refer to the [Developer Tools tutorial](./tutorials/devtools-integration-tutorial.rst) for a step-by-step walkthrough of the above process on a sample model. diff --git a/docs/source/sdk-bundled-io.md b/docs/source/sdk-bundled-io.md index c399bf1e27c..488ade7bac8 100644 --- a/docs/source/sdk-bundled-io.md +++ b/docs/source/sdk-bundled-io.md @@ -1,555 +1,3 @@ # Bundled Program -- a Tool for ExecuTorch Model Validation -## Introduction -`BundledProgram` is a wrapper around the core ExecuTorch program designed to help users wrapping test cases with the model they deploy. `BundledProgram` is not necessarily a core part of the program and not needed for its execution, but is particularly important for various other use-cases, such as model correctness evaluation, including e2e testing during the model bring-up process.
- -Overall, the procedure can be broken into two stages, and in each stage we are supporting: - -* **Emit stage**: Bundling the test I/O cases along with the ExecuTorch program, serializing into flatbuffer. -* **Runtime stage**: Accessing, executing, and verifying the bundled test cases during runtime. - -## Emit stage -This stage mainly focuses on the creation of a `BundledProgram` and dumping it out to the disk as a flatbuffer file. The main procedure is as follow: -1. Create a model and emit its ExecuTorch program. -2. Construct a `List[MethodTestSuite]` to record all test cases that needs to be bundled. -3. Generate `BundledProgram` by using the emited model and `List[MethodTestSuite]`. -4. Serialize the `BundledProgram` and dump it out to the disk. - -### Step 1: Create a Model and Emit its ExecuTorch Program. - -ExecuTorch Program can be emitted from user's model by using ExecuTorch APIs. Follow the [Generate Sample ExecuTorch program](./getting-started-setup.md) or [Exporting to ExecuTorch tutorial](./tutorials/export-to-executorch-tutorial). - -### Step 2: Construct `List[MethodTestSuite]` to hold test info - -In `BundledProgram`, we create two new classes, `MethodTestCase` and `MethodTestSuite`, to hold essential info for ExecuTorch program verification. - -`MethodTestCase` represents a single testcase. Each `MethodTestCase` contains inputs and expected outputs for a single execution. - -:::{dropdown} `MethodTestCase` - -```{eval-rst} -.. autofunction:: executorch.devtools.bundled_program.config.MethodTestCase.__init__ - :noindex: -``` -::: - -`MethodTestSuite` contains all testing info for single method, including a str representing method name, and a `List[MethodTestCase]` for all testcases: - -:::{dropdown} `MethodTestSuite` - -```{eval-rst} -.. autofunction:: executorch.devtools.bundled_program.config.MethodTestSuite - :noindex: -``` -::: - -Since each model may have multiple inference methods, we need to generate `List[MethodTestSuite]` to hold all essential infos. - - -### Step 3: Generate `BundledProgram` - -We provide `BundledProgram` class under `executorch/devtools/bundled_program/core.py` to bundled the `ExecutorchProgram`-like variable, including - `ExecutorchProgram`, `MultiMethodExecutorchProgram` or `ExecutorchProgramManager`, with the `List[MethodTestSuite]`: - -:::{dropdown} `BundledProgram` - -```{eval-rst} -.. autofunction:: executorch.devtools.bundled_program.core.BundledProgram.__init__ - :noindex: -``` -::: - -Construtor of `BundledProgram `will do sannity check internally to see if the given `List[MethodTestSuite]` matches the given Program's requirements. Specifically: -1. The method_names of each `MethodTestSuite` in `List[MethodTestSuite]` for should be also in program. Please notice that it is no need to set testcases for every method in the Program. -2. The metadata of each testcase should meet the requirement of the coresponding inference methods input. - -### Step 4: Serialize `BundledProgram` to Flatbuffer. - -To serialize `BundledProgram` to make runtime APIs use it, we provide two APIs, both under `executorch/devtools/bundled_program/serialize/__init__.py`. - -:::{dropdown} Serialize and Deserialize - -```{eval-rst} -.. currentmodule:: executorch.devtools.bundled_program.serialize -.. autofunction:: serialize_from_bundled_program_to_flatbuffer - :noindex: -``` - -```{eval-rst} -.. currentmodule:: executorch.devtools.bundled_program.serialize -.. 
autofunction:: deserialize_from_flatbuffer_to_bundled_program - :noindex: -``` -::: - -### Emit Example - -Here is a flow highlighting how to generate a `BundledProgram` given a PyTorch model and the representative inputs we want to test it along with. - -```python -import torch - -from executorch.exir import to_edge -from executorch.devtools import BundledProgram - -from executorch.devtools.bundled_program.config import MethodTestCase, MethodTestSuite -from executorch.devtools.bundled_program.serialize import ( - serialize_from_bundled_program_to_flatbuffer, -) -from torch._export import capture_pre_autograd_graph -from torch.export import export - - -# Step 1: ExecuTorch Program Export -class SampleModel(torch.nn.Module): - """An example model with multi-methods. Each method has multiple input and single output""" - - def __init__(self) -> None: - super().__init__() - self.a: torch.Tensor = 3 * torch.ones(2, 2, dtype=torch.int32) - self.b: torch.Tensor = 2 * torch.ones(2, 2, dtype=torch.int32) - - def forward(self, x: torch.Tensor, q: torch.Tensor) -> torch.Tensor: - z = x.clone() - torch.mul(self.a, x, out=z) - y = x.clone() - torch.add(z, self.b, out=y) - torch.add(y, q, out=y) - return y - - -# Inference method name of SampleModel we want to bundle testcases to. -# Notices that we do not need to bundle testcases for every inference methods. -method_name = "forward" -model = SampleModel() - -# Inputs for graph capture. -capture_input = ( - (torch.rand(2, 2) - 0.5).to(dtype=torch.int32), - (torch.rand(2, 2) - 0.5).to(dtype=torch.int32), -) - -# Export method's FX Graph. -method_graph = export( - capture_pre_autograd_graph(model, capture_input), - capture_input, -) - - -# Emit the traced method into ET Program. -et_program = to_edge(method_graph).to_executorch() - -# Step 2: Construct MethodTestSuite for Each Method - -# Prepare the Test Inputs. - -# Number of input sets to be verified -n_input = 10 - -# Input sets to be verified. -inputs = [ - # Each list below is a individual input set. - # The number of inputs, dtype and size of each input follow Program's spec. - [ - (torch.rand(2, 2) - 0.5).to(dtype=torch.int32), - (torch.rand(2, 2) - 0.5).to(dtype=torch.int32), - ] - for _ in range(n_input) -] - -# Generate Test Suites -method_test_suites = [ - MethodTestSuite( - method_name=method_name, - test_cases=[ - MethodTestCase( - inputs=input, - expected_outputs=(getattr(model, method_name)(*input), ), - ) - for input in inputs - ], - ), -] - -# Step 3: Generate BundledProgram -bundled_program = BundledProgram(et_program, method_test_suites) - -# Step 4: Serialize BundledProgram to flatbuffer. -serialized_bundled_program = serialize_from_bundled_program_to_flatbuffer( - bundled_program -) -save_path = "bundled_program.bpte" -with open(save_path, "wb") as f: - f.write(serialized_bundled_program) - -``` - -We can also regenerate `BundledProgram` from flatbuffer file if needed: - -```python -from executorch.devtools.bundled_program.serialize import deserialize_from_flatbuffer_to_bundled_program -save_path = "bundled_program.bpte" -with open(save_path, "rb") as f: - serialized_bundled_program = f.read() - -regenerate_bundled_program = deserialize_from_flatbuffer_to_bundled_program(serialized_bundled_program) -``` - -## Runtime Stage -This stage mainly focuses on executing the model with the bundled inputs and and comparing the model's output with the bundled expected output. We provide multiple APIs to handle the key parts of it. 
- - -### Get ExecuTorch Program Pointer from `BundledProgram` Buffer -We need the pointer to ExecuTorch program to do the execution. To unify the process of loading and executing `BundledProgram` and Program flatbuffer, we create an API: - -:::{dropdown} `GetProgramData` - -```{eval-rst} -.. doxygenfunction:: torch::executor::bundled_program::GetProgramData -``` -::: - -Here's an example of how to use the `GetProgramData` API: -```c++ -// Assume that the user has read the contents of the file into file_data using -// whatever method works best for their application. The file could contain -// either BundledProgram data or Program data. -void* file_data = ...; -size_t file_data_len = ...; - -// If file_data contains a BundledProgram, GetProgramData() will return a -// pointer to the Program data embedded inside it. Otherwise it will return -// file_data, which already pointed to Program data. -const void* program_ptr; -size_t program_len; -status = torch::executor::bundled_program::GetProgramData( - file_data, file_data_len, &program_ptr, &program_len); -ET_CHECK_MSG( - status == Error::Ok, - "GetProgramData() failed with status 0x%" PRIx32, - status); -``` - -### Load Bundled Input to Method -To execute the program on the bundled input, we need to load the bundled input into the method. Here we provided an API called `torch::executor::bundled_program::LoadBundledInput`: - -:::{dropdown} `LoadBundledInput` - -```{eval-rst} -.. doxygenfunction:: torch::executor::bundled_program::LoadBundledInput -``` -::: - -### Verify the Method's Output. -We call `torch::executor::bundled_program::VerifyResultWithBundledExpectedOutput` to verify the method's output with bundled expected outputs. Here's the details of this API: - -:::{dropdown} `VerifyResultWithBundledExpectedOutput` - -```{eval-rst} -.. doxygenfunction:: torch::executor::bundled_program::VerifyResultWithBundledExpectedOutput -``` -::: - - -### Runtime Example - -Here we provide an example about how to run the bundled program step by step. Most of the code is borrowed from [executor_runner](https://github.com/pytorch/executorch/blob/main/examples/sdk/sdk_example_runner/sdk_example_runner.cpp), and please review that file if you need more info and context: - -```c++ -// method_name is the name for the method we want to test -// memory_manager is the executor::MemoryManager variable for executor memory allocation. -// program is the ExecuTorch program. -Result method = program->load_method(method_name, &memory_manager); - -ET_CHECK_MSG( - method.ok(), - "load_method() failed with status 0x%" PRIx32, - method.error()); - -// Load testset_idx-th input in the buffer to plan -status = torch::executor::bundled_program::LoadBundledInput( - *method, - program_data.bundled_program_data(), - FLAGS_testset_idx); -ET_CHECK_MSG( - status == Error::Ok, - "LoadBundledInput failed with status 0x%" PRIx32, - status); - -// Execute the plan -status = method->execute(); -ET_CHECK_MSG( - status == Error::Ok, - "method->execute() failed with status 0x%" PRIx32, - status); - -// Verify the result. -status = torch::executor::bundled_program::VerifyResultWithBundledExpectedOutput( - *method, - program_data.bundled_program_data(), - FLAGS_testset_idx, - FLAGS_rtol, - FLAGS_atol); -ET_CHECK_MSG( - status == Error::Ok, - "Bundle verification failed with status 0x%" PRIx32, - status); - -``` - -## Common Errors - -Errors will be raised if `List[MethodTestSuites]` doesn't match the `Program`. 
Here're two common situations: - -### Test input doesn't match model's requirement. - -Each inference method of PyTorch model has its own requirement for the inputs, like number of input, the dtype of each input, etc. `BundledProgram` will raise error if test input not meet the requirement. - -Here's the example of the dtype of test input not meet model's requirement: - -```python -import torch - -from executorch.exir import to_edge -from executorch.devtools import BundledProgram - -from executorch.devtools.bundled_program.config import MethodTestCase, MethodTestSuite -from torch.export import export - - -class Module(torch.nn.Module): - def __init__(self): - super().__init__() - self.a = 3 * torch.ones(2, 2, dtype=torch.float) - self.b = 2 * torch.ones(2, 2, dtype=torch.float) - - def forward(self, x): - out_1 = torch.ones(2, 2, dtype=torch.float) - out_2 = torch.ones(2, 2, dtype=torch.float) - torch.mul(self.a, x, out=out_1) - torch.add(out_1, self.b, out=out_2) - return out_2 - - -model = Module() -method_names = ["forward"] - -inputs = (torch.ones(2, 2, dtype=torch.float), ) - -# Find each method of model needs to be traced my its name, export its FX Graph. -method_graph = export( - capture_pre_autograd_graph(model, inputs), - inputs, -) - -# Emit the traced methods into ET Program. -et_program = to_edge(method_graph).to_executorch() - -# number of input sets to be verified -n_input = 10 - -# Input sets to be verified for each inference methods. -# To simplify, here we create same inputs for all methods. -inputs = { - # Inference method name corresponding to its test cases. - m_name: [ - # NOTE: executorch program needs torch.float, but here is torch.int - [ - torch.randint(-5, 5, (2, 2), dtype=torch.int), - ] - for _ in range(n_input) - ] - for m_name in method_names -} - -# Generate Test Suites -method_test_suites = [ - MethodTestSuite( - method_name=m_name, - test_cases=[ - MethodTestCase( - inputs=input, - expected_outputs=(getattr(model, m_name)(*input),), - ) - for input in inputs[m_name] - ], - ) - for m_name in method_names -] - -# Generate BundledProgram - -bundled_program = BundledProgram(et_program, method_test_suites) -``` - -:::{dropdown} Raised Error - -``` -The input tensor tensor([[-2, 0], - [-2, -1]], dtype=torch.int32) dtype shall be torch.float32, but now is torch.int32 ---------------------------------------------------------------------------- -AssertionError Traceback (most recent call last) -Cell In[1], line 72 - 56 method_test_suites = [ - 57 MethodTestSuite( - 58 method_name=m_name, - (...) - 67 for m_name in method_names - 68 ] - 70 # Step 3: Generate BundledProgram ----> 72 bundled_program = create_bundled_program(program, method_test_suites) -File /executorch/devtools/bundled_program/core.py:276, in create_bundled_program(program, method_test_suites) - 264 """Create bp_schema.BundledProgram by bundling the given program and method_test_suites together. - 265 - 266 Args: - (...) - 271 The `BundledProgram` variable contains given ExecuTorch program and test cases. 
- 272 """ - 274 method_test_suites = sorted(method_test_suites, key=lambda x: x.method_name) ---> 276 assert_valid_bundle(program, method_test_suites) - 278 bundled_method_test_suites: List[bp_schema.BundledMethodTestSuite] = [] - 280 # Emit data and metadata of bundled tensor -File /executorch/devtools/bundled_program/core.py:219, in assert_valid_bundle(program, method_test_suites) - 215 # type of tensor input should match execution plan - 216 if type(cur_plan_test_inputs[j]) == torch.Tensor: - 217 # pyre-fixme[16]: Undefined attribute [16]: Item `bool` of `typing.Union[bool, float, int, torch._tensor.Tensor]` - 218 # has no attribute `dtype`. ---> 219 assert cur_plan_test_inputs[j].dtype == get_input_dtype( - 220 program, program_plan_id, j - 221 ), "The input tensor {} dtype shall be {}, but now is {}".format( - 222 cur_plan_test_inputs[j], - 223 get_input_dtype(program, program_plan_id, j), - 224 cur_plan_test_inputs[j].dtype, - 225 ) - 226 elif type(cur_plan_test_inputs[j]) in ( - 227 int, - 228 bool, - 229 float, - 230 ): - 231 assert type(cur_plan_test_inputs[j]) == get_input_type( - 232 program, program_plan_id, j - 233 ), "The input primitive dtype shall be {}, but now is {}".format( - 234 get_input_type(program, program_plan_id, j), - 235 type(cur_plan_test_inputs[j]), - 236 ) -AssertionError: The input tensor tensor([[-2, 0], - [-2, -1]], dtype=torch.int32) dtype shall be torch.float32, but now is torch.int32 - -``` - -::: - -### Method name in `BundleConfig` does not exist. - -Another common error would be the method name in any `MethodTestSuite` does not exist in Model. `BundledProgram` will raise error and show the non-exist method name: - -```python -import torch - -from executorch.exir import to_edge -from executorch.devtools import BundledProgram - -from executorch.devtools.bundled_program.config import MethodTestCase, MethodTestSuite -from torch.export import export - - -class Module(torch.nn.Module): - def __init__(self): - super().__init__() - self.a = 3 * torch.ones(2, 2, dtype=torch.float) - self.b = 2 * torch.ones(2, 2, dtype=torch.float) - - def forward(self, x): - out_1 = torch.ones(2, 2, dtype=torch.float) - out_2 = torch.ones(2, 2, dtype=torch.float) - torch.mul(self.a, x, out=out_1) - torch.add(out_1, self.b, out=out_2) - return out_2 - - -model = Module() -method_names = ["forward"] - -inputs = (torch.ones(2, 2, dtype=torch.float),) - -# Find each method of model needs to be traced my its name, export its FX Graph. -method_graph = export( - capture_pre_autograd_graph(model, inputs), - inputs, -) - -# Emit the traced methods into ET Program. -et_program = to_edge(method_graph).to_executorch() - -# number of input sets to be verified -n_input = 10 - -# Input sets to be verified for each inference methods. -# To simplify, here we create same inputs for all methods. -inputs = { - # Inference method name corresponding to its test cases. - m_name: [ - [ - torch.randint(-5, 5, (2, 2), dtype=torch.float), - ] - for _ in range(n_input) - ] - for m_name in method_names -} - -# Generate Test Suites -method_test_suites = [ - MethodTestSuite( - method_name=m_name, - test_cases=[ - MethodTestCase( - inputs=input, - expected_outputs=(getattr(model, m_name)(*input),), - ) - for input in inputs[m_name] - ], - ) - for m_name in method_names -] - -# NOTE: MISSING_METHOD_NAME is not an inference method in the above model. 
-method_test_suites[0].method_name = "MISSING_METHOD_NAME" - -# Generate BundledProgram -bundled_program = BundledProgram(et_program, method_test_suites) - -``` - -:::{dropdown} Raised Error - -``` -All method names in bundled config should be found in program.execution_plan, but {'MISSING_METHOD_NAME'} does not include. ---------------------------------------------------------------------------- -AssertionError Traceback (most recent call last) -Cell In[3], line 73 - 70 method_test_suites[0].method_name = "MISSING_METHOD_NAME" - 72 # Generate BundledProgram ----> 73 bundled_program = create_bundled_program(program, method_test_suites) -File /executorch/devtools/bundled_program/core.py:276, in create_bundled_program(program, method_test_suites) - 264 """Create bp_schema.BundledProgram by bundling the given program and method_test_suites together. - 265 - 266 Args: - (...) - 271 The `BundledProgram` variable contains given ExecuTorch program and test cases. - 272 """ - 274 method_test_suites = sorted(method_test_suites, key=lambda x: x.method_name) ---> 276 assert_valid_bundle(program, method_test_suites) - 278 bundled_method_test_suites: List[bp_schema.BundledMethodTestSuite] = [] - 280 # Emit data and metadata of bundled tensor -File /executorch/devtools/bundled_program/core.py:141, in assert_valid_bundle(program, method_test_suites) - 138 method_name_of_program = {e.name for e in program.execution_plan} - 139 method_name_of_test_suites = {t.method_name for t in method_test_suites} ---> 141 assert method_name_of_test_suites.issubset( - 142 method_name_of_program - 143 ), f"All method names in bundled config should be found in program.execution_plan, \ - 144 but {str(method_name_of_test_suites - method_name_of_program)} does not include." - 146 # check if method_tesdt_suites has been sorted in ascending alphabetical order of method name. - 147 for test_suite_id in range(1, len(method_test_suites)): -AssertionError: All method names in bundled config should be found in program.execution_plan, but {'MISSING_METHOD_NAME'} does not include. -``` -::: +Please update your link to . This URL will be deleted after v0.4.0. diff --git a/docs/source/sdk-debugging.md b/docs/source/sdk-debugging.md index 88c05b8c03f..3e975875f21 100644 --- a/docs/source/sdk-debugging.md +++ b/docs/source/sdk-debugging.md @@ -1,82 +1,3 @@ # Debugging Models in ExecuTorch -With the ExecuTorch Developer Tools, users can debug their models for numerical inaccurcies and extract model outputs from their device to do quality analysis (such as Signal-to-Noise, Mean square error etc.). - -Currently, ExecuTorch supports the following debugging flows: -- Extraction of model level outputs via ETDump. -- Extraction of intermediate outputs (outside of delegates) via ETDump: - - Linking of these intermediate outputs back to the eager model python code. - - -## Steps to debug a model in ExecuTorch - -### Runtime -For a real example reflecting the steps below, please refer to [sdk_example_runner.cpp](https://github.com/pytorch/executorch/blob/main/examples/sdk/sdk_example_runner/sdk_example_runner.cpp). - -1. [Optional] Generate an [ETRecord](./sdk-etrecord.rst) while exporting your model. When provided, this enables users to link profiling information back to the eager model source code (with stack traces and module hierarchy). -2. Integrate [ETDump generation](./sdk-etdump.md) into the runtime and set the debugging level by configuring the `ETDumpGen` object. 
Then, provide an additional buffer to which intermediate outputs and program outputs will be written. Currently we support two levels of debugging: - - Program level outputs - ```C++ - Span buffer((uint8_t*)debug_buffer, debug_buffer_size); - etdump_gen.set_debug_buffer(buffer); - etdump_gen.set_event_tracer_debug_level( - EventTracerDebugLogLevel::kProgramOutputs); - ``` - - - Intermediate outputs of executed (non-delegated) operations (will include the program level outputs too) - ```C++ - Span buffer((uint8_t*)debug_buffer, debug_buffer_size); - etdump_gen.set_debug_buffer(buffer); - etdump_gen.set_event_tracer_debug_level( - EventTracerDebugLogLevel::kIntermediateOutputs); - ``` -3. Build the runtime with the pre-processor flag that enables tracking of debug events. Instructions are in the [ETDump documentation](./sdk-etdump.md). -4. Run your model and dump out the ETDump buffer as described [here](./sdk-etdump.md). (Do so similarly for the debug buffer if configured above) - - -### Accessing the debug outputs post run using the Inspector API's -Once a model has been run, using the generated ETDump and debug buffers, users can leverage the [Inspector API's](./sdk-inspector.rst) to inspect these debug outputs. - -```python -from executorch.devtools import Inspector - -# Create an Inspector instance with etdump and the debug buffer. -inspector = Inspector(etdump_path=etdump_path, - buffer_path = buffer_path, - # etrecord is optional, if provided it'll link back - # the runtime events to the eager model python source code. - etrecord = etrecord_path) - -# Accessing program outputs is as simple as this: -for event_block in inspector.event_blocks: - if event_block.name == "Execute": - print(event_blocks.run_output) - -# Accessing intermediate outputs from each event (an event here is essentially an instruction that executed in the runtime). -for event_block in inspector.event_blocks: - if event_block.name == "Execute": - for event in event_block.events: - print(event.debug_data) - # If an ETRecord was provided by the user during Inspector initialization, users - # can print the stacktraces and module hierarchy of these events. - print(event.stack_traces) - print(event.module_hierarchy) -``` - -We've also provided a simple set of utilities that let users perform quality analysis of their model outputs with respect to a set of reference outputs (possibly from the eager mode model). - - -```python -from executorch.devtools.inspector import compare_results - -# Run a simple quality analysis between the model outputs sourced from the -# runtime and a set of reference outputs. -# -# Setting plot to True will result in the quality metrics being graphed -# and displayed (when run from a notebook) and will be written out to the -# filesystem. A dictionary will always be returned which will contain the -# results. -for event_block in inspector.event_blocks: - if event_block.name == "Execute": - compare_results(event_blocks.run_output, ref_outputs, plot = True) -``` +Please update your link to . This URL will be deleted after v0.4.0. diff --git a/docs/source/sdk-delegate-integration.md b/docs/source/sdk-delegate-integration.md index a2f67157c89..7c2c9e92a90 100644 --- a/docs/source/sdk-delegate-integration.md +++ b/docs/source/sdk-delegate-integration.md @@ -1,152 +1,3 @@ -# Developer Tools Delegate Integration +# Debug in Delegates -[Delegate backends](compiler-delegate-and-partitioner.md) are a prominent component of on-device models due to their flexibility in defining behavior. 
A side effect of this flexibility is that it operates as an opaque transformation. This obfuscates rich associations and mutations that are valuable in post-processing. -- For example, if two different operator fusions were to occur within a delegate, post processing wouldn’t be able to separate the two transformations. - -Specifically, it makes associating runtime information (such as profiling results) through delegated graphs difficult. Delegate Debug Identifiers provides a framework through which delegate authors can propagate this information and utilize it for post run analysis. - -The preparation is broken down into three stages: -- **Ahead-of-time (AOT)**: Delegate authors generate a __Debug Handle Map__. -- **Runtime**: Delegate authors log using the __Delegate Debug Identifiers__ registered AOT in the __Debug Handle Map__. -- **Deserialization**: Delegate authors provide a parser for custom metadata in delegate events. - -## Ahead-of-Time -Delegate authors propagate what transformations occur in a lowered backend by returning a **Debug Handle Map** from the backend implementation. - -### Generating a Debug Handle Map -**Debug Handle Maps** communicate what transformations occurred in a backend by mapping **Delegate Debug Identifiers** to debug handles. - -**Delegate Debug Identifiers** are generated or user-provided identifiers for representing points of interest during runtime. Recall that debug handles are unique identifiers to operator instances in the model graph. - -For example: -- **{ 0: (10, 11), 1: (11, 12) }:** Identifiers 0 and 1 in the runtime correspond to operators with the debug handles (10, 11) and (11, 12) respectively. -- **{ “fused_op_1_2_3”: (11, 12, 15) }**: Identifier “fused_op_1_2_3” in the runtime corresponds to operators with debug handles (11, 12, 15), and 11, 12, 15 corresponds to the op 1, op 2 and op 3. - -```{Note} -Identifiers are a means of connecting runtime results to the model graph; the interpretation of the identifiers is defined by the delegate author. -``` - -**Debug Handle Maps** are constructed through the use of **DelegateMappingBuilder** and returned as a part of `PreprocessResult`. - -```python -class PreprocessResult: - processed_bytes: bytes = bytes() - - debug_handle_map: Optional[ - Union[Dict[int, Tuple[int]], Dict[str, Tuple[int]]] - ] = None -``` -PreprocessResult is defined [here](https://github.com/pytorch/executorch/blob/main/exir/backend/backend_details.py). - -#### DelegateMappingBuilder -`DelegateMappingBuilder` is a helper class for managing and constructing Debug Handle Maps. The result of the builder should be passed in when constructing PreprocessResult. - -`DelegateMappingBuilder` is defined [here](https://github.com/pytorch/executorch/blob/main/exir/backend/utils.py) - -A `DelegateMappingBuilder` instance can be constructed in one of 2 modes: manual identifiers or generated identifiers. - -```python -# Manual Identifiers, Default -builder = DelegateMappingBuilder(generated_identifiers=False) - -# Generated Identifiers -builder = DelegateMappingBuilder(generated_identifiers=True) -``` - -With **manual identifiers**, users pass in a **Delegate Debug Identifier** when creating entries. -With **generated identifiers**, the builder will auto-assign a **Delegate Debug Identifier**. - -To add an entry to the **Debug Handle Map**, use `insert_delegate_mapping_entry`. 
It associates one of `fx.Node(s)` or debug handles(s) (sourced from node.meta["debug_handle"]) to an optional **Delegate Debug Identifier** (used for the manual identifiers). The identifier recorded is returned from the call. - -```python -def insert_delegate_mapping_entry( - self, - nodes: Optional[Union[Node, List[Node]]] = None, - handles: Optional[Union[int, List[int]]] = None, - identifier: Optional[Union[int, str]] = None, -) -> Union[int, str]: -``` - -To retrieve the **Debug Handle Map**, use `get_delegate_mapping`. -```python -def get_delegate_mapping( - self, -) -> Union[Dict[int, Tuple[int]], Dict[str, Tuple[int]]] -``` - -A demo of the AOT mapping can be found [here](https://github.com/pytorch/executorch/blob/main/exir/backend/test/backend_with_delegate_mapping_demo.py) - - -## Runtime -Corresponding to the AOT map, the runtime then defines the functionality through which these events are logged. - -### Real-Time Logging - -ExecuTorch allows you to log in real time. **Real time Logging** is useful when timestamps are available as the execution occurs. It provides minimal overhead and is intuitive for authors to call. - -To log events in real-time (for example, explicitly denoting the profiling start and stop), `event_tracer_start_profiling_delegate` is used to create an `EventEntry` and `event_tracer_end_profiling_delegate` is used to conclude the `EventEntry` for the provided `EventTracer`. - -To start an `EventTracerEntry` using `event_tracer_start_profiling_delegate`, the **Delegate Debug Identifier** (provided AOT to the `debug_handle_map`) is passed as either the name or `delegate_debug_id` argument depending on the **Delegate Debug Identifier** type (str and int respectively) - -```c++ -EventTracerEntry event_tracer_start_profiling_delegate( - EventTracer* event_tracer, - const char* name, - DebugHandle delegate_debug_id) -``` - -To conclude an `EventTracerEntry`, `event_tracer_end_profiling_delegate` is simply provided the original `EventTracerEntry`. - -Optionally, additional runtime `metadata` can also be logged at this point. - -```c++ -void event_tracer_end_profiling_delegate( - EventTracer* event_tracer, - EventTracerEntry event_tracer_entry, - const void* metadata = nullptr, - size_t metadata_len = 0) -``` - -### Post-Time Logging -ExecuTorch also allows you to log in post time. Some runtime settings don't have access to timestamps while it is executing. **Post-Time Logging** enables authors to still be able to log these events. - -To log events in post (for example, logging start and end time simultaneously) `event_tracer_log_profiling_delegate` is called with a combination of the arguments used in the real-time logging API’s and timestamps. - -```c++ -void event_tracer_log_profiling_delegate( - EventTracer* event_tracer, - const char* name, - DebugHandle delegate_debug_id, - et_timestamp_t start_time, - et_timestamp_t end_time, - const void* metadata = nullptr, - size_t metadata_len = 0) -``` -A demo of the runtime code can be found [here](https://github.com/pytorch/executorch/blob/main/runtime/executor/test/test_backend_with_delegate_mapping.cpp). - - -## Surfacing custom metadata from delegate events - -As seen in the runtime logging API's above, users can log an array of bytes along with their delegate profiling event. We make this data available for users in post processing via the [Inspector API](./sdk-inspector.rst). - -Users can pass a metadata parser when creating an instance of the Inspector. 
The parser is a callable that deserializes the data and returns a list of strings or a dictionary containing key-value pairs. The deserialized data is then added back to the corresponding event in the event block for user consumption. Here's an example of how to write this parser: - -NOTE: The input to the deserializer is a list where each entry is a series of bytes (essentially each entry is an immutable bytearray). Users are expected to iterate over this list, deserialize each entry and then return it in the expected format which is either a list of strings, or a dict. - -```python -Inspector( - etdump_path=etdump_path, - # Optional - etrecord=etrecord_path, - # Optional, only needed if debugging was enabled. - buffer_path=buffer_path, - delegate_metadata_parser=parse_delegate_metadata -) - - -def parse_delegate_metadata(delegate_metadatas: List[bytes]) -> Union[List[str], Dict[str, Any]]: - metadata_str = [] - for metadata_bytes in delegate_metadatas: - metadata_str += str(metadata_bytes) - return metadata_str -``` +Please update your link to . This URL will be deleted after v0.4.0. diff --git a/docs/source/sdk-etdump.md b/docs/source/sdk-etdump.md index c58efb40de7..a765d4cf1b4 100644 --- a/docs/source/sdk-etdump.md +++ b/docs/source/sdk-etdump.md @@ -1,44 +1,3 @@ # Prerequisite | ETDump - ExecuTorch Dump -ETDump (ExecuTorch Dump) is one of the core components of the ExecuTorch Developer Tools. It is the mechanism through which all forms of profiling and debugging data is extracted from the runtime. Users can't parse ETDump directly; instead, they should pass it into the Inspector API, which deserializes the data, offering interfaces for flexible analysis and debugging. - - -## Generating an ETDump - -Generating an ETDump is a relatively straightforward process. Users can follow the steps detailed below to integrate it into their application that uses ExecuTorch. - -1. ***Include*** the ETDump header in your code. -```C++ -#include -``` - -2. ***Create*** an Instance of the ETDumpGen class and pass it into the `load_method` call that is invoked in the runtime. - -```C++ -torch::executor::ETDumpGen etdump_gen = torch::executor::ETDumpGen(); -Result method = - program->load_method(method_name, &memory_manager, &etdump_gen); -``` - -3. ***Dump Out the ETDump Buffer*** - after the inference iterations have been completed, users can dump out the ETDump buffer. If users are on a device which has a filesystem, they could just write it out to the filesystem. For more constrained embedded devices, users will have to extract the ETDump buffer from the device through a mechanism that best suits them (e.g. UART, JTAG etc.) - -```C++ -etdump_result result = etdump_gen.get_etdump_data(); -if (result.buf != nullptr && result.size > 0) { - // On a device with a file system users can just write it out - // to the file-system. - FILE* f = fopen(FLAGS_etdump_path.c_str(), "w+"); - fwrite((uint8_t*)result.buf, 1, result.size, f); - fclose(f); - free(result.buf); - } -``` - -4. ***Compile*** your binary using CMake with the `ET_EVENT_TRACER_ENABLED` pre-processor flag to enable events to be traced and logged into ETDump inside the ExecuTorch runtime. This flag needs to be added to the ExecuTorch library and any operator library that you are compiling into your binary. For reference, you can take a look at `examples/sdk/CMakeLists.txt`. 
The lines of interest are: -``` -target_compile_options(executorch INTERFACE -DET_EVENT_TRACER_ENABLED) -target_compile_options(portable_ops_lib INTERFACE -DET_EVENT_TRACER_ENABLED) -``` -## Using an ETDump - -Pass this ETDump into the [Inspector API](./sdk-inspector.rst) to access this data and do post-run analysis. +Please update your link to . This URL will be deleted after v0.4.0. diff --git a/docs/source/sdk-etrecord.rst b/docs/source/sdk-etrecord.rst index 63546f43ca6..ee8f9b2b2d2 100644 --- a/docs/source/sdk-etrecord.rst +++ b/docs/source/sdk-etrecord.rst @@ -1,40 +1,4 @@ Prerequisite | ETRecord - ExecuTorch Record =========================================== -Overview --------- - -``ETRecord`` is intended to be the debug artifact that is generated by -users ahead of time (when they export their model to run on ExecuTorch). -To draw a rough equivalent to conventional software development, -``ETRecord`` can be considered as the binary built with debug symbols -that is used for debugging in GNU Debugger (gdb). It is expected that -the user will supply this to the ExecuTorch Developer Tools in order for -them to debug and visualize their model. - -``ETRecord`` contains numerous components such as: - -* Edge dialect graph with debug handles -* Delegate debug handle maps - -The ``ETRecord`` object itself is intended to be opaque to users and they should not access any components inside it directly. -It should be provided to the `Inspector API `__ to link back performance and debug data sourced from the runtime back to the Python source code. - -Generating an ``ETRecord`` --------------------------- - -The user should use the following API to generate an ``ETRecord`` file. They -will be expected to provide the Edge Dialect program (returned by the call to ``to_edge()``), -the ExecuTorch program (returned by the call to ``to_executorch()``), and optional models that -they are interested in working with via our tooling. - -.. warning:: - Users should do a deepcopy of the output of ``to_edge()`` and pass in the deepcopy to the ``generate_etrecord`` API. This is needed because the subsequent call, ``to_executorch()``, does an in-place mutation and will lose debug data in the process. - -.. currentmodule:: executorch.devtools.etrecord._etrecord -.. autofunction:: generate_etrecord - -Using an ``ETRecord`` ---------------------- - -Pass the ``ETRecord`` as an optional argument into the `Inspector API `__ to access this data and do post-run analysis. +Please update your link to . This URL will be deleted after v0.4.0. diff --git a/docs/source/sdk-inspector.rst b/docs/source/sdk-inspector.rst index 448f30cfb55..0019528f419 100644 --- a/docs/source/sdk-inspector.rst +++ b/docs/source/sdk-inspector.rst @@ -1,159 +1,4 @@ Inspector APIs ============== -Overview --------- - -The Inspector APIs provide a convenient interface for analyzing the -contents of `ETRecord `__ and -`ETDump `__, helping developers get insights about model -architecture and performance statistics. It’s built on top of the `EventBlock Class <#eventblock-class>`__ data structure, -which organizes a group of `Event <#event-class>`__\ s for easy access to details of profiling events. - -There are multiple ways in which users can interact with the Inspector -APIs: - -* By using `public methods <#inspector-methods>`__ provided by the ``Inspector`` class. -* By accessing the `public attributes <#inspector-attributes>`__ of the ``Inspector``, ``EventBlock``, and ``Event`` classes. 
-* By using a `CLI <#cli>`__ tool for basic functionalities. - -Please refer to the `e2e use case doc `__ get an understanding of how to use these in a real world example. - - -Inspector Methods ------------------ - -Constructor -~~~~~~~~~~~ - -.. autofunction:: executorch.devtools.Inspector.__init__ - -**Example Usage:** - -.. code:: python - - from executorch.devtools import Inspector - - inspector = Inspector(etdump_path="/path/to/etdump.etdp", etrecord="/path/to/etrecord.bin") - -to_dataframe -~~~~~~~~~~~~~~~~ - -.. autofunction:: executorch.devtools.Inspector.to_dataframe - - -print_data_tabular -~~~~~~~~~~~~~~~~~~ - -.. autofunction:: executorch.devtools.Inspector.print_data_tabular - -.. _example-usage-1: - -**Example Usage:** - -.. code:: python - - inspector.print_data_tabular() - -.. image:: _static/img/print_data_tabular.png -Note that the unit of delegate profiling events is "cycles". We're working on providing a way to set different units in the future. - - -find_total_for_module -~~~~~~~~~~~~~~~~~~~~~ - -.. autofunction:: executorch.devtools.Inspector.find_total_for_module - -.. _example-usage-2: - -**Example Usage:** - -.. code:: python - - print(inspector.find_total_for_module("L__self___conv_layer")) - -:: - - 0.002 - - -get_exported_program -~~~~~~~~~~~~~~~~~~~~ - -.. autofunction:: executorch.devtools.Inspector.get_exported_program - -.. _example-usage-3: - -**Example Usage:** - -.. code:: python - - print(inspector.get_exported_program()) - -:: - - ExportedProgram: - class GraphModule(torch.nn.Module): - def forward(self, arg0_1: f32[4, 3, 64, 64]): - # No stacktrace found for following nodes - _param_constant0 = self._param_constant0 - _param_constant1 = self._param_constant1 - - ### ... Omit part of the program for documentation readability ... ### - - Graph signature: ExportGraphSignature(parameters=[], buffers=[], user_inputs=['arg0_1'], user_outputs=['aten_tan_default'], inputs_to_parameters={}, inputs_to_buffers={}, buffers_to_mutate={}, backward_signature=None, assertion_dep_token=None) - Range constraints: {} - Equality constraints: [] - - -Inspector Attributes --------------------- - -``EventBlock`` Class -~~~~~~~~~~~~~~~~~~~~ - -Access ``EventBlock`` instances through the ``event_blocks`` attribute -of an ``Inspector`` instance, for example: - -.. code:: python - - inspector.event_blocks - -.. autoclass:: executorch.devtools.inspector.EventBlock - -``Event`` Class -~~~~~~~~~~~~~~~ - -Access ``Event`` instances through the ``events`` attribute of an -``EventBlock`` instance. - -.. autoclass:: executorch.devtools.inspector.Event - -**Example Usage:** - -.. code:: python - - for event_block in inspector.event_blocks: - for event in event_block.events: - if event.name == "Method::execute": - print(event.perf_data.raw) - -:: - - [175.748, 78.678, 70.429, 122.006, 97.495, 67.603, 70.2, 90.139, 66.344, 64.575, 134.135, 93.85, 74.593, 83.929, 75.859, 73.909, 66.461, 72.102, 84.142, 77.774, 70.038, 80.246, 59.134, 68.496, 67.496, 100.491, 81.162, 74.53, 70.709, 77.112, 59.775, 79.674, 67.54, 79.52, 66.753, 70.425, 71.703, 81.373, 72.306, 72.404, 94.497, 77.588, 79.835, 68.597, 71.237, 88.528, 71.884, 74.047, 81.513, 76.116] - - -CLI ---- - -Execute the following command in your terminal to display the data -table. This command produces the identical table output as calling the -`print_data_tabular <#print-data-tabular>`__ mentioned earlier: - -.. 
code:: bash - - python3 -m devtools.inspector.inspector_cli --etdump_path --etrecord_path - -Note that the `etrecord_path` argument is optional. - -We plan to extend the capabilities of the CLI in the future. +Please update your link to . This URL will be deleted after v0.4.0. diff --git a/docs/source/sdk-overview.md b/docs/source/sdk-overview.md index 13fd8e00597..1e8f1fae1ba 100644 --- a/docs/source/sdk-overview.md +++ b/docs/source/sdk-overview.md @@ -1,44 +1,3 @@ # Introduction to the ExecuTorch Developer Tools -ExecuTorch has been designed with [productivity](./intro-overview.md) as one of its core objectives and the ExecuTorch Developer Tools enable this through the comprehensive suite of tools it provides users to help them profile, debug, and visualize models that they have onboarded onto ExecuTorch. - -All the components of the Developer Tools have been designed from the ground up with deep integration in both the export process and the runtime. This enables us to provide unique features such as linking back operator execution in the runtime to the line of code in the original eager model that this operator originated from. - -## Developer Tools Features - -The ExecuTorch Developer Tools support the following features: - -- **BundledProgram** is a utility tool for exporting the model bundled with a sample set of (representative) inputs and expected outputs, so that during runtime users can validate that the actual output is in fact the same as the expected output. -- **Profiling** models with operator level breakdown of performance stats - - Linking back operator performance stats to source code and module hierarchy - - Model loading and execution time -- **Delegate Integration** - Surfacing performance details from delegate backends - - Link back delegate operator execution to the nodes they represent in the edge dialect graph (and subsequently linking back to source code and module hierarchy) -- **Debugging** - Intermediate outputs and output quality analysis -- **Visualization** - Coming soon - -## Fundamental components of the Developer Tools - -In order to fully understand and leverage the power of the Developer Tools in this section, the fundamental components that power the Developer Tools will be detailed. - -### ETRecord -ETRecord (ExecuTorch Record) is an artifact generated during the export process that stores the graphs and other metadata that is critical for the Developer Tools to be able to link back the performance/debug data sourced from the runtime to the source code of the eager model. - -To draw a rough equivalence to conventional software development ETRecord can be considered as the binary built with debug symbols that is used for debugging in GNU Project debugger (gdb). - -More details are available in the [ETRecord documentation](sdk-etrecord.rst) on how to generate and store an ETRecord. - -### ETDump -ETDump (ExecuTorch Dump) is the binary blob that is generated by the runtime after running a model. Similarly as above, to draw a rough equivalence to conventional software development, ETDump can be considered as the coredump of ExecuTorch, but in this case within ETDump we store all the performance and debug data that was generated by the runtime during model execution. - -```{note} -If you only care about looking at the raw performance data without linking back to source code and other extensive features, an ETDump alone will be enough to leverage the basic features of the Developer Tools. 
For the full experience, it is recommended that the users also generate an ETRecord. -``` - -More details are available in the [ETDump documentation](sdk-etdump.md) on how to generate and store an ETDump from the runtime. - - -### Inspector APIs -The Inspector Python APIs are the main user enrty point into the Developer Tools. They join the data sourced from ETDump and ETRecord to give users access to all the performance and debug data sourced from the runtime along with linkage back to eager model source code and module hierarchy in an easy to use API. - -More details are available in the [Inspector API documentation](sdk-inspector.rst) on how to use the Inspector APIs. +Please update your link to . This URL will be deleted after v0.4.0. diff --git a/docs/source/sdk-profiling.md b/docs/source/sdk-profiling.md index f827d108b1f..9c99a979757 100644 --- a/docs/source/sdk-profiling.md +++ b/docs/source/sdk-profiling.md @@ -1,23 +1,3 @@ # Profiling Models in ExecuTorch -Profiling in ExecuTorch gives users access to these runtime metrics: -- Model Load Time. -- Operator Level Execution Time. -- Delegate Execution Time. - - If the delegate that the user is calling into has been integrated with the [Developer Tools](./sdk-delegate-integration.md), then users will also be able to access delegated operator execution time. -- End-to-end Inference Execution Time. - -One uniqe aspect of ExecuTorch Profiling is the ability to link every runtime executed operator back to the exact line of python code from which this operator originated. This capability enables users to easily identify hotspots in their model, source them back to the exact line of Python code, and optimize if chosen to. - -We provide access to all the profiling data via the Python [Inspector API](./sdk-inspector.rst). The data mentioned above can be accessed through these interfaces, allowing users to perform any post-run analysis of their choice. - -## Steps to Profile a Model in ExecuTorch - -1. [Optional] Generate an [ETRecord](./sdk-etrecord.rst) while you're exporting your model. If provided this will enable users to link back profiling details to eager model source code (with stack traces and module hierarchy). -2. Build the runtime with the pre-processor flags that enable profiling. Detailed in the [ETDump documentation](./sdk-etdump.md). -3. Run your Program on the ExecuTorch runtime and generate an [ETDump](./sdk-etdump.md). -4. Create an instance of the [Inspector API](./sdk-inspector.rst) by passing in the ETDump you have sourced from the runtime along with the optionally generated ETRecord from step 1. - - Through the Inspector API, users can do a wide range of analysis varying from printing out performance details to doing more finer granular calculation on module level. - - -Please refer to the [Developer Tools tutorial](./tutorials/sdk-integration-tutorial.rst) for a step-by-step walkthrough of the above process on a sample model. +Please update your link to . This URL will be deleted after v0.4.0. diff --git a/docs/source/sdk-tutorial.md b/docs/source/sdk-tutorial.md index 2fad3ea9366..457d3b47ebf 100644 --- a/docs/source/sdk-tutorial.md +++ b/docs/source/sdk-tutorial.md @@ -1,3 +1,3 @@ ## Developer Tools Usage Tutorial -Please refer to the [Developer Tools tutorial](./tutorials/sdk-integration-tutorial) for a walkthrough on how to profile a model in ExecuTorch using the Developer Tools. +Please update your link to . This URL will be deleted after v0.4.0. 
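The profiling flow described in the deprecated pages above (optionally generate an ETRecord, build the runtime with event tracing, collect an ETDump, then feed both into the Inspector) can be pictured with a minimal Python sketch. The sketch below relies on the `executorch.devtools` and `executorch.exir` APIs that appear elsewhere in this diff; the toy model, file paths, and the runtime step noted in the comments are illustrative placeholders rather than part of this change.

```python
import copy

import torch
from executorch.devtools import Inspector, generate_etrecord
from executorch.exir import to_edge
from torch.export import export

# Illustrative toy model; any exportable nn.Module works here.
model = torch.nn.Sequential(
    torch.nn.Linear(4, 8), torch.nn.ReLU(), torch.nn.Linear(8, 2)
).eval()
example_inputs = (torch.randn(1, 4),)

# Step 1 (optional): export to the edge dialect and generate an ETRecord.
edge_program = to_edge(export(model, example_inputs))
edge_copy = copy.deepcopy(edge_program)  # to_executorch() mutates in place, so keep a copy
et_program = edge_program.to_executorch()
generate_etrecord("etrecord.bin", edge_copy, et_program)

# Steps 2-3 happen on the runtime side: build with event tracing enabled
# (ET_EVENT_TRACER_ENABLED) and dump the ETDump buffer to a file such as "etdump.etdp".

# Step 4: join the ETDump with the optional ETRecord for post-run analysis.
inspector = Inspector(etdump_path="etdump.etdp", etrecord="etrecord.bin")
inspector.print_data_tabular()
```

From here, `event_blocks`, `to_dataframe()`, and `find_total_for_module()` provide the per-operator views described in the Inspector documentation.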
diff --git a/docs/source/tutorial-xnnpack-delegate-lowering.md b/docs/source/tutorial-xnnpack-delegate-lowering.md index 8afa6d6fe77..666ee23aa35 100644 --- a/docs/source/tutorial-xnnpack-delegate-lowering.md +++ b/docs/source/tutorial-xnnpack-delegate-lowering.md @@ -74,13 +74,13 @@ After lowering to the XNNPACK Program, we can then prepare it for executorch and The XNNPACK delegate can also execute symmetrically quantized models. To understand the quantization flow and learn how to quantize models, refer to [Custom Quantization](quantization-custom-quantization.md) note. For the sake of this tutorial, we will leverage the `quantize()` python helper function conveniently added to the `executorch/executorch/examples` folder. ```python -from torch._export import capture_pre_autograd_graph +from torch.export import export_for_training from executorch.exir import EdgeCompileConfig mobilenet_v2 = models.mobilenetv2.mobilenet_v2(weights=MobileNet_V2_Weights.DEFAULT).eval() sample_inputs = (torch.randn(1, 3, 224, 224), ) -mobilenet_v2 = capture_pre_autograd_graph(mobilenet_v2, sample_inputs) # 2-stage export for quantization path +mobilenet_v2 = export_for_training(mobilenet_v2, sample_inputs).module() # 2-stage export for quantization path from torch.ao.quantization.quantize_pt2e import convert_pt2e, prepare_pt2e from torch.ao.quantization.quantizer.xnnpack_quantizer import ( @@ -107,7 +107,7 @@ def quantize(model, example_inputs): quantized_mobilenetv2 = quantize(mobilenet_v2, sample_inputs) ``` -Quantization requires a two stage export. First we use the `capture_pre_autograd_graph` API to capture the model before giving it to `quantize` utility function. After performing the quantization step, we can now leverage the XNNPACK delegate to lower the quantized exported model graph. From here, the procedure is the same as for the non-quantized model lowering to XNNPACK. +Quantization requires a two stage export. First we use the `export_for_training` API to capture the model before giving it to `quantize` utility function. After performing the quantization step, we can now leverage the XNNPACK delegate to lower the quantized exported model graph. From here, the procedure is the same as for the non-quantized model lowering to XNNPACK. ```python # Continued from earlier... diff --git a/docs/source/tutorials_source/devtools-integration-tutorial.py b/docs/source/tutorials_source/devtools-integration-tutorial.py new file mode 100644 index 00000000000..dece18fa8ce --- /dev/null +++ b/docs/source/tutorials_source/devtools-integration-tutorial.py @@ -0,0 +1,301 @@ +# -*- coding: utf-8 -*- +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +""" +Using the ExecuTorch Developer Tools to Profile a Model +======================== + +**Author:** `Jack Khuu `__ +""" + +###################################################################### +# The `ExecuTorch Developer Tools <../devtools-overview.html>`__ is a set of tools designed to +# provide users with the ability to profile, debug, and visualize ExecuTorch +# models. +# +# This tutorial will show a full end-to-end flow of how to utilize the Developer Tools to profile a model. +# Specifically, it will: +# +# 1. Generate the artifacts consumed by the Developer Tools (`ETRecord <../etrecord.html>`__, `ETDump <../etdump.html>`__). +# 2. Create an Inspector class consuming these artifacts. +# 3. 
Utilize the Inspector class to analyze the model profiling result. + +###################################################################### +# Prerequisites +# ------------- +# +# To run this tutorial, you’ll first need to +# `Set up your ExecuTorch environment <../getting-started-setup.html>`__. +# + +###################################################################### +# Generate ETRecord (Optional) +# ---------------------------- +# +# The first step is to generate an ``ETRecord``. ``ETRecord`` contains model +# graphs and metadata for linking runtime results (such as profiling) to +# the eager model. This is generated via ``executorch.devtools.generate_etrecord``. +# +# ``executorch.devtools.generate_etrecord`` takes in an output file path (str), the +# edge dialect model (``EdgeProgramManager``), the ExecuTorch dialect model +# (``ExecutorchProgramManager``), and an optional dictionary of additional models. +# +# In this tutorial, an example model (shown below) is used to demonstrate. + +import copy + +import torch +import torch.nn as nn +import torch.nn.functional as F +from executorch.devtools import generate_etrecord + +from executorch.exir import ( + EdgeCompileConfig, + EdgeProgramManager, + ExecutorchProgramManager, + to_edge, +) +from torch.export import export, ExportedProgram + + +# Generate Model +class Net(nn.Module): + def __init__(self): + super(Net, self).__init__() + # 1 input image channel, 6 output channels, 5x5 square convolution + # kernel + self.conv1 = nn.Conv2d(1, 6, 5) + self.conv2 = nn.Conv2d(6, 16, 5) + # an affine operation: y = Wx + b + self.fc1 = nn.Linear(16 * 5 * 5, 120) # 5*5 from image dimension + self.fc2 = nn.Linear(120, 84) + self.fc3 = nn.Linear(84, 10) + + def forward(self, x): + # Max pooling over a (2, 2) window + x = F.max_pool2d(F.relu(self.conv1(x)), (2, 2)) + # If the size is a square, you can specify with a single number + x = F.max_pool2d(F.relu(self.conv2(x)), 2) + x = torch.flatten(x, 1) # flatten all dimensions except the batch dimension + x = F.relu(self.fc1(x)) + x = F.relu(self.fc2(x)) + x = self.fc3(x) + return x + + +model = Net() + +aten_model: ExportedProgram = export( + model, + (torch.randn(1, 1, 32, 32),), +) + +edge_program_manager: EdgeProgramManager = to_edge( + aten_model, compile_config=EdgeCompileConfig(_check_ir_validity=True) +) +edge_program_manager_copy = copy.deepcopy(edge_program_manager) +et_program_manager: ExecutorchProgramManager = edge_program_manager.to_executorch() + + +# Generate ETRecord +etrecord_path = "etrecord.bin" +generate_etrecord(etrecord_path, edge_program_manager_copy, et_program_manager) + +# sphinx_gallery_start_ignore +from unittest.mock import patch + +# sphinx_gallery_end_ignore + +###################################################################### +# +# .. warning:: +# Users should do a deepcopy of the output of ``to_edge()`` and pass in the +# deepcopy to the ``generate_etrecord`` API. This is needed because the +# subsequent call, ``to_executorch()``, does an in-place mutation and will +# lose debug data in the process. +# + +###################################################################### +# Generate ETDump +# --------------- +# +# Next step is to generate an ``ETDump``. ``ETDump`` contains runtime results +# from executing a `Bundled Program Model <../bundled-io.html>`__. +# +# In this tutorial, a `Bundled Program` is created from the example model above. 
+ +import torch +from executorch.devtools import BundledProgram + +from executorch.devtools.bundled_program.config import MethodTestCase, MethodTestSuite +from executorch.devtools.bundled_program.serialize import ( + serialize_from_bundled_program_to_flatbuffer, +) + +from executorch.exir import to_edge +from torch.export import export + +# Step 1: ExecuTorch Program Export +m_name = "forward" +method_graphs = {m_name: export(model, (torch.randn(1, 1, 32, 32),))} + +# Step 2: Construct Method Test Suites +inputs = [[torch.randn(1, 1, 32, 32)] for _ in range(2)] + +method_test_suites = [ + MethodTestSuite( + method_name=m_name, + test_cases=[ + MethodTestCase(inputs=inp, expected_outputs=getattr(model, m_name)(*inp)) + for inp in inputs + ], + ) +] + +# Step 3: Generate BundledProgram +executorch_program = to_edge(method_graphs).to_executorch() +bundled_program = BundledProgram(executorch_program, method_test_suites) + +# Step 4: Serialize BundledProgram to flatbuffer. +serialized_bundled_program = serialize_from_bundled_program_to_flatbuffer( + bundled_program +) +save_path = "bundled_program.bp" +with open(save_path, "wb") as f: + f.write(serialized_bundled_program) + +###################################################################### +# Use CMake (follow `these instructions <../runtime-build-and-cross-compilation.html#configure-the-cmake-build>`__ to set up cmake) to execute the Bundled Program to generate the ``ETDump``:: +# +# cd executorch +# ./examples/devtools/build_example_runner.sh +# cmake-out/examples/devtools/example_runner --bundled_program_path="bundled_program.bp" + +###################################################################### +# Creating an Inspector +# --------------------- +# +# Final step is to create the ``Inspector`` by passing in the artifact paths. +# Inspector takes the runtime results from ``ETDump`` and correlates them to +# the operators of the Edge Dialect Graph. +# +# Recall: An ``ETRecord`` is not required. If an ``ETRecord`` is not provided, +# the Inspector will show runtime results without operator correlation. +# +# To visualize all runtime events, call Inspector's ``print_data_tabular``. + +from executorch.devtools import Inspector + +# sphinx_gallery_start_ignore +inspector_patch = patch.object(Inspector, "__init__", return_value=None) +inspector_patch_print = patch.object(Inspector, "print_data_tabular", return_value="") +inspector_patch.start() +inspector_patch_print.start() +# sphinx_gallery_end_ignore +etrecord_path = "etrecord.bin" +etdump_path = "etdump.etdp" +inspector = Inspector(etdump_path=etdump_path, etrecord=etrecord_path) +# sphinx_gallery_start_ignore +inspector.event_blocks = [] +# sphinx_gallery_end_ignore +inspector.print_data_tabular() + +# sphinx_gallery_start_ignore +inspector_patch.stop() +inspector_patch_print.stop() +# sphinx_gallery_end_ignore + +###################################################################### +# Analyzing with an Inspector +# --------------------------- +# +# ``Inspector`` provides 2 ways of accessing ingested information: `EventBlocks <../model-inspector#eventblock-class>`__ +# and ``DataFrames``. These mediums give users the ability to perform custom +# analysis about their model performance. +# +# Below are examples usages, with both ``EventBlock`` and ``DataFrame`` approaches. 
+ +# Set Up +import pprint as pp + +import pandas as pd + +pd.set_option("display.max_colwidth", None) +pd.set_option("display.max_columns", None) + +###################################################################### +# If a user wants the raw profiling results, they would do something similar to +# finding the raw runtime data of an ``addmm.out`` event. + +for event_block in inspector.event_blocks: + # Via EventBlocks + for event in event_block.events: + if event.name == "native_call_addmm.out": + print(event.name, event.perf_data.raw) + + # Via Dataframe + df = event_block.to_dataframe() + df = df[df.event_name == "native_call_addmm.out"] + print(df[["event_name", "raw"]]) + print() + +###################################################################### +# If a user wants to trace an operator back to their model code, they would do +# something similar to finding the module hierarchy and stack trace of the +# slowest ``convolution.out`` call. + +for event_block in inspector.event_blocks: + # Via EventBlocks + slowest = None + for event in event_block.events: + if event.name == "native_call_convolution.out": + if slowest is None or event.perf_data.p50 > slowest.perf_data.p50: + slowest = event + if slowest is not None: + print(slowest.name) + print() + pp.pprint(slowest.stack_traces) + print() + pp.pprint(slowest.module_hierarchy) + + # Via Dataframe + df = event_block.to_dataframe() + df = df[df.event_name == "native_call_convolution.out"] + if len(df) > 0: + slowest = df.loc[df["p50"].idxmax()] + print(slowest.event_name) + print() + pp.pprint(slowest.stack_traces) + print() + pp.pprint(slowest.module_hierarchy) + +###################################################################### +# If a user wants the total runtime of a module, they can use +# ``find_total_for_module``. + +print(inspector.find_total_for_module("L__self__")) +print(inspector.find_total_for_module("L__self___conv2")) + +###################################################################### +# Note: ``find_total_for_module`` is a special first class method of +# `Inspector <../model-inspector.html>`__ + +###################################################################### +# Conclusion +# ---------- +# +# In this tutorial, we learned about the steps required to consume an ExecuTorch +# model with the ExecuTorch Developer Tools. It also showed how to use the Inspector APIs +# to analyze the model run results. +# +# Links Mentioned +# ^^^^^^^^^^^^^^^ +# +# - `ExecuTorch Developer Tools Overview <../devtools-overview.html>`__ +# - `ETRecord <../etrecord.html>`__ +# - `ETDump <../etdump.html>`__ +# - `Inspector <../model-inspector.html>`__ diff --git a/docs/source/tutorials_source/export-to-executorch-tutorial.py b/docs/source/tutorials_source/export-to-executorch-tutorial.py index 2071567ddd1..fac3eab08e5 100644 --- a/docs/source/tutorials_source/export-to-executorch-tutorial.py +++ b/docs/source/tutorials_source/export-to-executorch-tutorial.py @@ -179,8 +179,8 @@ def forward(self, x: torch.Tensor, y: torch.Tensor) -> torch.Tensor: # ----------------------- # # To quantize a model, we first need to capture the graph with -# ``torch._export.capture_pre_autograd_graph``, perform quantization, and then -# call ``torch.export``. ``torch._export.capture_pre_autograd_graph`` returns a +# ``torch.export.export_for_training``, perform quantization, and then +# call ``torch.export``. 
``torch.export.export_for_training`` returns a # graph which contains ATen operators which are Autograd safe, meaning they are # safe for eager-mode training, which is needed for quantization. We will call # the graph at this level, the ``Pre-Autograd ATen Dialect`` graph. @@ -193,10 +193,10 @@ def forward(self, x: torch.Tensor, y: torch.Tensor) -> torch.Tensor: # will annotate the nodes in the graph with information needed to quantize the # model properly for a specific backend. -from torch._export import capture_pre_autograd_graph +from torch.export import export_for_training example_args = (torch.randn(1, 3, 256, 256),) -pre_autograd_aten_dialect = capture_pre_autograd_graph(SimpleConv(), example_args) +pre_autograd_aten_dialect = export_for_training(SimpleConv(), example_args).module() print("Pre-Autograd ATen Dialect Graph") print(pre_autograd_aten_dialect) @@ -523,9 +523,7 @@ def forward(self, a, x, b): executorch_program: ExecutorchProgramManager = edge_program.to_executorch( ExecutorchBackendConfig( passes=[], # User-defined passes - memory_planning_pass=MemoryPlanningPass( - "greedy" - ), # Default memory planning pass + memory_planning_pass=MemoryPlanningPass(), # Default memory planning pass ) ) @@ -562,8 +560,7 @@ def forward(self, a, x, b): # Here is an example for an entire end-to-end workflow: import torch -from torch._export import capture_pre_autograd_graph -from torch.export import export, ExportedProgram +from torch.export import export, export_for_training, ExportedProgram class M(torch.nn.Module): @@ -577,7 +574,7 @@ def forward(self, x): example_args = (torch.randn(3, 4),) -pre_autograd_aten_dialect = capture_pre_autograd_graph(M(), example_args) +pre_autograd_aten_dialect = export_for_training(M(), example_args).module() # Optionally do quantization: # pre_autograd_aten_dialect = convert_pt2e(prepare_pt2e(pre_autograd_aten_dialect, CustomBackendQuantizer)) aten_dialect: ExportedProgram = export(pre_autograd_aten_dialect, example_args) diff --git a/docs/source/tutorials_source/sdk-integration-tutorial.py b/docs/source/tutorials_source/sdk-integration-tutorial.py index bc1d2ebe788..b9a8009c646 100644 --- a/docs/source/tutorials_source/sdk-integration-tutorial.py +++ b/docs/source/tutorials_source/sdk-integration-tutorial.py @@ -9,292 +9,5 @@ Using the ExecuTorch Developer Tools to Profile a Model ======================== -**Author:** `Jack Khuu `__ +Please update your link to . This URL will be deleted after v0.4.0. """ - -###################################################################### -# The `ExecuTorch Developer Tools <../sdk-overview.html>`__ is a set of tools designed to -# provide users with the ability to profile, debug, and visualize ExecuTorch -# models. -# -# This tutorial will show a full end-to-end flow of how to utilize the Developer Tools to profile a model. -# Specifically, it will: -# -# 1. Generate the artifacts consumed by the Developer Tools (`ETRecord <../sdk-etrecord.html>`__, `ETDump <../sdk-etdump.html>`__). -# 2. Create an Inspector class consuming these artifacts. -# 3. Utilize the Inspector class to analyze the model profiling result. - -###################################################################### -# Prerequisites -# ------------- -# -# To run this tutorial, you’ll first need to -# `Set up your ExecuTorch environment <../getting-started-setup.html>`__. 
-# - -###################################################################### -# Generate ETRecord (Optional) -# ---------------------------- -# -# The first step is to generate an ``ETRecord``. ``ETRecord`` contains model -# graphs and metadata for linking runtime results (such as profiling) to -# the eager model. This is generated via ``executorch.devtools.generate_etrecord``. -# -# ``executorch.devtools.generate_etrecord`` takes in an output file path (str), the -# edge dialect model (``EdgeProgramManager``), the ExecuTorch dialect model -# (``ExecutorchProgramManager``), and an optional dictionary of additional models. -# -# In this tutorial, an example model (shown below) is used to demonstrate. - -import copy - -import torch -import torch.nn as nn -import torch.nn.functional as F -from executorch.devtools import generate_etrecord - -from executorch.exir import ( - EdgeCompileConfig, - EdgeProgramManager, - ExecutorchProgramManager, - to_edge, -) -from torch.export import export, ExportedProgram - - -# Generate Model -class Net(nn.Module): - def __init__(self): - super(Net, self).__init__() - # 1 input image channel, 6 output channels, 5x5 square convolution - # kernel - self.conv1 = nn.Conv2d(1, 6, 5) - self.conv2 = nn.Conv2d(6, 16, 5) - # an affine operation: y = Wx + b - self.fc1 = nn.Linear(16 * 5 * 5, 120) # 5*5 from image dimension - self.fc2 = nn.Linear(120, 84) - self.fc3 = nn.Linear(84, 10) - - def forward(self, x): - # Max pooling over a (2, 2) window - x = F.max_pool2d(F.relu(self.conv1(x)), (2, 2)) - # If the size is a square, you can specify with a single number - x = F.max_pool2d(F.relu(self.conv2(x)), 2) - x = torch.flatten(x, 1) # flatten all dimensions except the batch dimension - x = F.relu(self.fc1(x)) - x = F.relu(self.fc2(x)) - x = self.fc3(x) - return x - - -model = Net() - -aten_model: ExportedProgram = export( - model, - (torch.randn(1, 1, 32, 32),), -) - -edge_program_manager: EdgeProgramManager = to_edge( - aten_model, compile_config=EdgeCompileConfig(_check_ir_validity=True) -) -edge_program_manager_copy = copy.deepcopy(edge_program_manager) -et_program_manager: ExecutorchProgramManager = edge_program_manager.to_executorch() - - -# Generate ETRecord -etrecord_path = "etrecord.bin" -generate_etrecord(etrecord_path, edge_program_manager_copy, et_program_manager) - -# sphinx_gallery_start_ignore -from unittest.mock import patch - -# sphinx_gallery_end_ignore - -###################################################################### -# -# .. warning:: -# Users should do a deepcopy of the output of ``to_edge()`` and pass in the -# deepcopy to the ``generate_etrecord`` API. This is needed because the -# subsequent call, ``to_executorch()``, does an in-place mutation and will -# lose debug data in the process. -# - -###################################################################### -# Generate ETDump -# --------------- -# -# Next step is to generate an ``ETDump``. ``ETDump`` contains runtime results -# from executing a `Bundled Program Model <../sdk-bundled-io.html>`__. -# -# In this tutorial, a `Bundled Program` is created from the example model above. 
- -import torch -from executorch.devtools import BundledProgram - -from executorch.devtools.bundled_program.config import MethodTestCase, MethodTestSuite -from executorch.devtools.bundled_program.serialize import ( - serialize_from_bundled_program_to_flatbuffer, -) - -from executorch.exir import to_edge -from torch.export import export - -# Step 1: ExecuTorch Program Export -m_name = "forward" -method_graphs = {m_name: export(model, (torch.randn(1, 1, 32, 32),))} - -# Step 2: Construct Method Test Suites -inputs = [[torch.randn(1, 1, 32, 32)] for _ in range(2)] - -method_test_suites = [ - MethodTestSuite( - method_name=m_name, - test_cases=[ - MethodTestCase(inputs=inp, expected_outputs=getattr(model, m_name)(*inp)) - for inp in inputs - ], - ) -] - -# Step 3: Generate BundledProgram -executorch_program = to_edge(method_graphs).to_executorch() -bundled_program = BundledProgram(executorch_program, method_test_suites) - -# Step 4: Serialize BundledProgram to flatbuffer. -serialized_bundled_program = serialize_from_bundled_program_to_flatbuffer( - bundled_program -) -save_path = "bundled_program.bp" -with open(save_path, "wb") as f: - f.write(serialized_bundled_program) - -###################################################################### -# Use CMake (follow `these instructions <../runtime-build-and-cross-compilation.html#configure-the-cmake-build>`__ to set up cmake) to execute the Bundled Program to generate the ``ETDump``:: -# -# cd executorch -# ./examples/sdk/build_sdk_example_runner.sh -# cmake-out/examples/sdk/sdk_example_runner --bundled_program_path="bundled_program.bp" - -###################################################################### -# Creating an Inspector -# --------------------- -# -# Final step is to create the ``Inspector`` by passing in the artifact paths. -# Inspector takes the runtime results from ``ETDump`` and correlates them to -# the operators of the Edge Dialect Graph. -# -# Recall: An ``ETRecord`` is not required. If an ``ETRecord`` is not provided, -# the Inspector will show runtime results without operator correlation. -# -# To visualize all runtime events, call Inspector's ``print_data_tabular``. - -from executorch.devtools import Inspector - -# sphinx_gallery_start_ignore -inspector_patch = patch.object(Inspector, "__init__", return_value=None) -inspector_patch_print = patch.object(Inspector, "print_data_tabular", return_value="") -inspector_patch.start() -inspector_patch_print.start() -# sphinx_gallery_end_ignore -etdump_path = "etdump.etdp" -inspector = Inspector(etdump_path=etdump_path, etrecord=etrecord_path) -# sphinx_gallery_start_ignore -inspector.event_blocks = [] -# sphinx_gallery_end_ignore -inspector.print_data_tabular() - -# sphinx_gallery_start_ignore -inspector_patch.stop() -inspector_patch_print.stop() -# sphinx_gallery_end_ignore - -###################################################################### -# Analyzing with an Inspector -# --------------------------- -# -# ``Inspector`` provides 2 ways of accessing ingested information: `EventBlocks <../sdk-inspector#eventblock-class>`__ -# and ``DataFrames``. These mediums give users the ability to perform custom -# analysis about their model performance. -# -# Below are examples usages, with both ``EventBlock`` and ``DataFrame`` approaches. 
- -# Set Up -import pprint as pp - -import pandas as pd - -pd.set_option("display.max_colwidth", None) -pd.set_option("display.max_columns", None) - -###################################################################### -# If a user wants the raw profiling results, they would do something similar to -# finding the raw runtime data of an ``addmm.out`` event. - -for event_block in inspector.event_blocks: - # Via EventBlocks - for event in event_block.events: - if event.name == "native_call_addmm.out": - print(event.name, event.perf_data.raw) - - # Via Dataframe - df = event_block.to_dataframe() - df = df[df.event_name == "native_call_addmm.out"] - print(df[["event_name", "raw"]]) - print() - -###################################################################### -# If a user wants to trace an operator back to their model code, they would do -# something similar to finding the module hierarchy and stack trace of the -# slowest ``convolution.out`` call. - -for event_block in inspector.event_blocks: - # Via EventBlocks - slowest = None - for event in event_block.events: - if event.name == "native_call_convolution.out": - if slowest is None or event.perf_data.p50 > slowest.perf_data.p50: - slowest = event - if slowest is not None: - print(slowest.name) - print() - pp.pprint(slowest.stack_traces) - print() - pp.pprint(slowest.module_hierarchy) - - # Via Dataframe - df = event_block.to_dataframe() - df = df[df.event_name == "native_call_convolution.out"] - if len(df) > 0: - slowest = df.loc[df["p50"].idxmax()] - print(slowest.event_name) - print() - pp.pprint(slowest.stack_traces) - print() - pp.pprint(slowest.module_hierarchy) - -###################################################################### -# If a user wants the total runtime of a module, they can use -# ``find_total_for_module``. - -print(inspector.find_total_for_module("L__self__")) -print(inspector.find_total_for_module("L__self___conv2")) - -###################################################################### -# Note: ``find_total_for_module`` is a special first class method of -# `Inspector <../sdk-inspector.html>`__ - -###################################################################### -# Conclusion -# ---------- -# -# In this tutorial, we learned about the steps required to consume an ExecuTorch -# model with the ExecuTorch Developer Tools. It also showed how to use the Inspector APIs -# to analyze the model run results. -# -# Links Mentioned -# ^^^^^^^^^^^^^^^ -# -# - `ExecuTorch Developer Tools Overview <../sdk-overview.html>`__ -# - `ETRecord <../sdk-etrecord.html>`__ -# - `ETDump <../sdk-etdump.html>`__ -# - `Inspector <../sdk-inspector.html>`__ diff --git a/docs/website/docs/tutorials/bundled_program.md b/docs/website/docs/tutorials/bundled_program.md deleted file mode 100644 index e477d8e6a61..00000000000 --- a/docs/website/docs/tutorials/bundled_program.md +++ /dev/null @@ -1,162 +0,0 @@ -DEPRECATED: This document is moving to //executorch/docs/source/sdk-bundled-io.md - -# Bundled Program - -## Introduction -Bundled Program is a wrapper around the core ExecuTorch program designed to help users wrapping test cases and other related info with the models they deploy. Bundled Program is not necessarily a core part of the program and not needed for its execution but is more necessary for various other use-cases, especially for model correctness evaluation such as e2e testing during model bring-up etc. 
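The introduction above describes wrapping test cases together with the deployed program. As a rough orientation for the emit stage (discussed further below), here is a condensed Python sketch based on the `executorch.devtools.bundled_program` APIs used in the devtools tutorial added elsewhere in this diff; the toy model and output file name are placeholders.

```python
import torch
from executorch.devtools import BundledProgram
from executorch.devtools.bundled_program.config import MethodTestCase, MethodTestSuite
from executorch.devtools.bundled_program.serialize import (
    serialize_from_bundled_program_to_flatbuffer,
)
from executorch.exir import to_edge
from torch.export import export

# Illustrative model and a single test case for its "forward" method.
model = torch.nn.Linear(4, 2).eval()
sample = (torch.randn(1, 4),)

executorch_program = to_edge({"forward": export(model, sample)}).to_executorch()
test_suites = [
    MethodTestSuite(
        method_name="forward",
        test_cases=[MethodTestCase(inputs=list(sample), expected_outputs=model(*sample))],
    )
]
bundled_program = BundledProgram(executorch_program, test_suites)

# Serialize to flatbuffer; the runtime-stage APIs below consume this file.
with open("bundled_program.bp", "wb") as f:
    f.write(serialize_from_bundled_program_to_flatbuffer(bundled_program))
```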
- -Overall procedure can be broken into two stages, and in each stage we are supporting: -* **Emit stage**: Bundling test I/O cases as well as other useful info in key-value pairs along with the ExecuTorch program. -* **Runtime stage**: Accessing, executing and verifying the bundled test cases during runtime. - -## Emit stage - - This stage mainly focuses on the creation of a BundledProgram, and dump it out to the disk as a flatbuffer file. Please refer to Bento notebook [N2744997](https://www.internalfb.com/intern/anp/view/?id=2744997) for details on how to create a bundled program. - -## Runtime Stage -This stage mainly focuses on executing the model with the bundled inputs and and comparing the model's output with the bundled expected output. We provide multiple APIs to handle the key parts of it. - -### Get executorch program ptr from BundledProgram buffer -We need the pointer to executorch program to do the execution. To unify the process of loading and executing BundledProgram and Program flatbuffer, we create an API: - ```c++ - -/** - * Finds the serialized ExecuTorch program data in the provided file data. - * - * The returned buffer is appropriate for constructing a - * torch::executor::Program. - * - * Calling this is only necessary if the file could be a bundled program. If the - * file will only contain an unwrapped ExecuTorch program, callers can construct - * torch::executor::Program with file_data directly. - * - * @param[in] file_data The contents of an ExecuTorch program or bundled program - * file. - * @param[in] file_data_len The length of file_data, in bytes. - * @param[out] out_program_data The serialized Program data, if found. - * @param[out] out_program_data_len The length of out_program_data, in bytes. - * - * @returns Error::Ok if the program was found, and - * out_program_data/out_program_data_len point to the data. Other values - * on failure. - */ -Error GetProgramData( - void* file_data, - size_t file_data_len, - const void** out_program_data, - size_t* out_program_data_len); -``` - -Here's an example of how to use the GetProgramData API: -```c++ - // Assume that the user has read the contents of the file into file_data using - // whatever method works best for their application. The file could contain - // either BundledProgram data or Program data. - void* file_data = ...; - size_t file_data_len = ...; - - // If file_data contains a BundledProgram, GetProgramData() will return a - // pointer to the Program data embedded inside it. Otherwise it will return - // file_data, which already pointed to Program data. - const void* program_ptr; - size_t program_len; - status = torch::executor::bundled_program::GetProgramData( - buff_ptr.get(), buff_len, &program_ptr, &program_len); - ET_CHECK_MSG( - status == Error::Ok, - "GetProgramData() failed with status 0x%" PRIx32, - status); -``` - -### Load bundled input to ExecutionPlan -To execute the program on the bundled input, we need to load the bundled input into the ExecutionPlan. Here we provided an API called `torch::executor::bundled_program::LoadBundledInput`: - -```c++ - -/** - * Load testset_idx-th bundled input of method_idx-th Method test in - * bundled_program_ptr to given Method. - * - * @param[in] method The Method to verify. - * @param[in] bundled_program_ptr The bundled program contains expected output. - * @param[in] testset_idx The index of input needs to be set into given Method. - * - * @returns Return Error::Ok if load successfully, or the error happens during - * execution. 
- */ -ET_NODISCARD Error LoadBundledInput( - Method& method, - serialized_bundled_program* bundled_program_ptr, - size_t testset_idx); -``` - -### Verify the plan's output. -We call `torch::executor::bundled_program::VerifyResultWithBundledExpectedOutput` to verify the method's output with bundled expected outputs. Here's the details of this API: - -```c++ -/** - * Compare the Method's output with testset_idx-th bundled expected - * output in method_idx-th Method test. - * - * @param[in] method The Method to extract outputs from. - * @param[in] bundled_program_ptr The bundled program contains expected output. - * @param[in] testset_idx The index of expected output needs to be compared. - * @param[in] rtol Relative tolerance used for data comparsion. - * @param[in] atol Absolute tolerance used for data comparsion. - * - * @returns Return Error::Ok if two outputs match, or the error happens during - * execution. - */ -ET_NODISCARD Error VerifyResultWithBundledExpectedOutput( - Method& method, - serialized_bundled_program* bundled_program_ptr, - size_t testset_idx, - double rtol = 1e-5, - double atol = 1e-8); - -``` - -### Example - -Here we provide an example about how to run the bundled program step by step. - -```c++ - // method_name is the name for the method we want to test - // memory_manager is the executor::MemoryManager variable for executor memory allocation. - // program is the executorch program. - Result method = program->load_method(method_name, &memory_manager); - ET_CHECK_MSG( - method.ok(), - "load_method() failed with status 0x%" PRIx32, - method.error()); - - // Load testset_idx-th input in the buffer to plan - status = torch::executor::bundled_program::LoadBundledInput( - *method, - program_data.bundled_program_data(), - FLAGS_testset_idx); - ET_CHECK_MSG( - status == Error::Ok, - "LoadBundledInput failed with status 0x%" PRIx32, - status); - - // Execute the plan - status = method->execute(); - ET_CHECK_MSG( - status == Error::Ok, - "method->execute() failed with status 0x%" PRIx32, - status); - - // Verify the result. - status = torch::executor::bundled_program::VerifyResultWithBundledExpectedOutput( - *method, - program_data.bundled_program_data(), - FLAGS_testset_idx, - FLAGS_rtol, - FLAGS_atol); - ET_CHECK_MSG( - status == Error::Ok, - "Bundle verification failed with status 0x%" PRIx32, - status); - -``` diff --git a/examples/README.md b/examples/README.md index f36e873e843..2c1093296cb 100644 --- a/examples/README.md +++ b/examples/README.md @@ -13,7 +13,7 @@ examples ├── models # Contains a set of popular and representative PyTorch models ├── portable # Contains end-to-end demos for ExecuTorch in portable mode ├── selective_build # Contains demos of selective build for optimizing the binary size of the ExecuTorch runtime -├── sdk # Contains demos of BundledProgram and ETDump +├── devtools # Contains demos of BundledProgram and ETDump ├── demo-apps # Contains demo apps for Android and iOS ├── xnnpack # Contains end-to-end ExecuTorch demos with first-party optimization using XNNPACK ├── apple @@ -31,39 +31,45 @@ examples A user's journey may commence by exploring the demos located in the [`portable/`](./portable) directory. Here, you will gain insights into the fundamental end-to-end workflow to generate a binary file from a ML model in [portable mode](../docs/source/concepts.md##portable-mode-lean-mode) and run it on the ExecuTorch runtime. 
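As a quick illustration of the portable-mode workflow the paragraph above refers to, the sketch below exports a model, lowers it to an ExecuTorch program, and writes out the `.pte` file that the portable runner consumes. The toy model and file names are placeholders; the export APIs are the ones used in the tutorials elsewhere in this diff.

```python
import torch
from executorch.exir import to_edge
from torch.export import export

# Illustrative model; replace with any exportable nn.Module.
model = torch.nn.Sequential(
    torch.nn.Linear(8, 16), torch.nn.ReLU(), torch.nn.Linear(16, 4)
).eval()
example_inputs = (torch.randn(1, 8),)

# Export, lower to the edge dialect, then to an ExecuTorch program.
et_program = to_edge(export(model, example_inputs)).to_executorch()

# Save the flatbuffer that the portable executor_runner loads.
with open("model.pte", "wb") as f:
    f.write(et_program.buffer)
```

The resulting file can then be run with the portable runner built by the demos, e.g. `./cmake-out/executor_runner --model_path model.pte`.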
-## Demo of Llama 2 and Llama 3 +## Demo Apps -[This page](./models/llama2/README.md) demonstrates how to run Llama 2 7B and Llama 3 8B models on mobile via ExecuTorch. We use XNNPACK to accelerate the performance and 4-bit groupwise PTQ quantization to fit the model on Android and iOS mobile phones. +Explore mobile apps with ExecuTorch models integrated and deployable on [Android](./demo-apps/android) and [iOS](./demo-apps/apple_ios). These apps provide end-to-end instructions on how to export Llama models, load them on device, build the app, and run it on device. -## Demo of Selective Build +For specific details related to models and backends, you can explore the various subsections. -To understand how to deploy the ExecuTorch runtime with optimization for binary size, explore the demos available in the [`selective_build/`](./selective_build) directory. These demos are specifically designed to illustrate the [Selective Build](../docs/source/kernel-library-selective_build.md), offering insights into reducing the binary size while maintaining efficiency. +### Llama Models + +[This page](./models/llama2/README.md) demonstrates how to run Llama 3.2 (1B, 3B), Llama 3.1 (8B), Llama 3 (8B), and Llama 2 7B models on mobile via ExecuTorch. We use XNNPACK, QNNPACK, MediaTek, and MPS to accelerate the performance and 4-bit groupwise PTQ quantization to fit the model on Android and iOS mobile phones. + +### Llava 1.5 7B -## Demo of ExecuTorch SDK +[This page](./models/llava/README.md) demonstrates how to run the [Llava 1.5 7B](https://github.com/haotian-liu/LLaVA) model on mobile via ExecuTorch. We use XNNPACK to accelerate the performance and 4-bit groupwise PTQ quantization to fit the model on Android and iOS mobile phones. -You will find demos of [ExecuTorch SDK](./sdk/) in the [`sdk/`](./sdk/) directory. The examples focuses on exporting and executing BundledProgram for ExecuTorch model verification and ETDump for collecting profiling and debug data. +### Selective Build + +To understand how to deploy the ExecuTorch runtime with optimization for binary size, explore the demos available in the [`selective_build/`](./selective_build) directory. These demos are specifically designed to illustrate the [Selective Build](../docs/source/kernel-library-selective_build.md), offering insights into reducing the binary size while maintaining efficiency. -## Demo Apps +### Developer Tools -Explore mobile apps with ExecuTorch models integrated and deployable on Android and iOS in the [`demo-apps/android/`](./demo-apps/android) and [`demo-apps/apple_ios/`](./demo-apps/apple_ios) directories, respectively. +You will find demos of [ExecuTorch Developer Tools](./devtools/) in the [`devtools/`](./devtools/) directory. The examples focus on exporting and executing BundledProgram for ExecuTorch model verification and ETDump for collecting profiling and debug data. -## Demo of XNNPACK delegation +### XNNPACK delegation The demos in the [`xnnpack/`](./xnnpack) directory provide valuable insights into the process of lowering and executing an ExecuTorch model with built-in performance enhancements. These demos specifically showcase the workflow involving [XNNPACK backend](https://github.com/pytorch/executorch/tree/main/backends/xnnpack) delegation and quantization. -## Demo of ExecuTorch Apple Backend +### Apple Backend You will find demos of [ExecuTorch Core ML Backend](./apple/coreml/) in the [`apple/coreml/`](./apple/coreml) directory and [MPS Backend](./apple/mps/) in the [`apple/mps/`](./apple/mps) directory. 
-## Demo of ExecuTorch on ARM Cortex-M55 + Ethos-U55 +### ARM Cortex-M55 + Ethos-U55 Backend The [`arm/`](./arm) directory contains scripts to help you run a PyTorch model on a ARM Corstone-300 platform via ExecuTorch. -## Demo of ExecuTorch QNN Backend +### QNN Backend You will find demos of [ExecuTorch QNN Backend](./qualcomm) in the [`qualcomm/`](./qualcomm) directory. -## Demo of ExecuTorch on Cadence HiFi4 DSP +### Cadence HiFi4 DSP The [`Cadence/`](./cadence) directory hosts a demo that showcases the process of exporting and executing a model on Xtensa Hifi4 DSP. You can utilize [this tutorial](../docs/source/build-run-xtensa.md) to guide you in configuring the demo and running it. diff --git a/examples/apple/coreml/executor_runner/coreml_executor_runner.xcodeproj/project.pbxproj b/examples/apple/coreml/executor_runner/coreml_executor_runner.xcodeproj/project.pbxproj index 16e9e590027..31e6eba6f1e 100644 --- a/examples/apple/coreml/executor_runner/coreml_executor_runner.xcodeproj/project.pbxproj +++ b/examples/apple/coreml/executor_runner/coreml_executor_runner.xcodeproj/project.pbxproj @@ -18,7 +18,7 @@ C97BFFA42BC0C17300F55BAC /* libportable_kernels.a in Frameworks */ = {isa = PBXBuildFile; fileRef = C97BFFA32BC0C17300F55BAC /* libportable_kernels.a */; }; C97BFFA62BC0C1F200F55BAC /* libportable_ops_lib.a in Frameworks */ = {isa = PBXBuildFile; fileRef = C97BFFA52BC0C1F200F55BAC /* libportable_ops_lib.a */; }; C988D69D2B998CDE00979CF6 /* libprotobuf-lite.a in Frameworks */ = {isa = PBXBuildFile; fileRef = C988D69C2B998CD700979CF6 /* libprotobuf-lite.a */; }; - F24817E72BC65B2000E80D98 /* libexecutorch_no_prim_ops.a in Frameworks */ = {isa = PBXBuildFile; fileRef = F24817E62BC65B2000E80D98 /* libexecutorch_no_prim_ops.a */; }; + F24817E72BC65B2000E80D98 /* libexecutorch_core.a in Frameworks */ = {isa = PBXBuildFile; fileRef = F24817E62BC65B2000E80D98 /* libexecutorch_core.a */; }; /* End PBXBuildFile section */ /* Begin PBXCopyFilesBuildPhase section */ @@ -46,7 +46,7 @@ C97BFFA32BC0C17300F55BAC /* libportable_kernels.a */ = {isa = PBXFileReference; lastKnownFileType = archive.ar; name = libportable_kernels.a; path = libraries/libportable_kernels.a; sourceTree = ""; }; C97BFFA52BC0C1F200F55BAC /* libportable_ops_lib.a */ = {isa = PBXFileReference; lastKnownFileType = archive.ar; name = libportable_ops_lib.a; path = libraries/libportable_ops_lib.a; sourceTree = ""; }; C988D69C2B998CD700979CF6 /* libprotobuf-lite.a */ = {isa = PBXFileReference; lastKnownFileType = archive.ar; name = "libprotobuf-lite.a"; path = "libraries/libprotobuf-lite.a"; sourceTree = ""; }; - F24817E62BC65B2000E80D98 /* libexecutorch_no_prim_ops.a */ = {isa = PBXFileReference; lastKnownFileType = archive.ar; name = libexecutorch_no_prim_ops.a; path = libraries/libexecutorch_no_prim_ops.a; sourceTree = ""; }; + F24817E62BC65B2000E80D98 /* libexecutorch_core.a */ = {isa = PBXFileReference; lastKnownFileType = archive.ar; name = libexecutorch_core.a; path = libraries/libexecutorch_core.a; sourceTree = ""; }; /* End PBXFileReference section */ /* Begin PBXFrameworksBuildPhase section */ @@ -55,7 +55,7 @@ buildActionMask = 2147483647; files = ( 38626BB52B225A890059413D /* libetdump.a in Frameworks */, - F24817E72BC65B2000E80D98 /* libexecutorch_no_prim_ops.a in Frameworks */, + F24817E72BC65B2000E80D98 /* libexecutorch_core.a in Frameworks */, 38626BB42B225A560059413D /* libflatccrt.a in Frameworks */, C94D51682ACFCC7100AF47FD /* libcoremldelegate.a in Frameworks */, C94D51662ACFCBCB00AF47FD /* Accelerate.framework in 
Frameworks */, @@ -99,7 +99,7 @@ C94D515C2ACFCBA000AF47FD /* libexecutorch.a */, C94D51612ACFCBBA00AF47FD /* libsqlite3.tbd */, C94D51672ACFCC7100AF47FD /* libcoremldelegate.a */, - F24817E62BC65B2000E80D98 /* libexecutorch_no_prim_ops.a */, + F24817E62BC65B2000E80D98 /* libexecutorch_core.a */, C97BFFA32BC0C17300F55BAC /* libportable_kernels.a */, C97BFFA52BC0C1F200F55BAC /* libportable_ops_lib.a */, ); diff --git a/examples/apple/coreml/executor_runner/main.mm b/examples/apple/coreml/executor_runner/main.mm index c83287fb44d..405bfb9c6c4 100644 --- a/examples/apple/coreml/executor_runner/main.mm +++ b/examples/apple/coreml/executor_runner/main.mm @@ -24,8 +24,25 @@ static inline id check_class(id obj, Class cls) { #define SAFE_CAST(Object, Type) ((Type *)check_class(Object, [Type class])) -using namespace torch::executor; -using torch::executor::util::FileDataLoader; +using executorch::etdump::ETDumpGen; +using executorch::etdump::ETDumpResult; +using executorch::extension::FileDataLoader; +using executorch::runtime::DataLoader; +using executorch::runtime::EValue; +using executorch::runtime::Error; +using executorch::runtime::EventTracer; +using executorch::runtime::EventTracerDebugLogLevel; +using executorch::runtime::FreeableBuffer; +using executorch::runtime::HierarchicalAllocator; +using executorch::runtime::MemoryAllocator; +using executorch::runtime::MemoryManager; +using executorch::runtime::Method; +using executorch::runtime::MethodMeta; +using executorch::runtime::Program; +using executorch::runtime::Result; +using executorch::runtime::Span; +using executorch::runtime::TensorInfo; +using torch::executor::CoreMLBackendDelegate; static constexpr size_t kRuntimeMemorySize = 16 * 1024U * 1024U; // 16 MB @@ -294,7 +311,7 @@ bool is_model_analysis_enabled(const Args& args) { } void dump_etdump_gen(ETDumpGen *etdump_gen, const Buffer& debug_buffer, const Args& args) { - etdump_result result = (etdump_gen != nullptr) ? etdump_gen->get_etdump_data() : etdump_result{.buf = nullptr, .size = 0}; + ETDumpResult result = (etdump_gen != nullptr) ? etdump_gen->get_etdump_data() : ETDumpResult{.buf = nullptr, .size = 0}; if (result.size == 0) { return; } @@ -316,7 +333,7 @@ void dump_etdump_gen(ETDumpGen *etdump_gen, const Buffer& debug_buffer, const Ar int main(int argc, char * argv[]) { @autoreleasepool { - runtime_init(); + executorch::runtime::runtime_init(); auto args = parse_command_line_args([[NSProcessInfo processInfo] arguments]); if (args.purge_models_cache) { diff --git a/examples/apple/coreml/scripts/TARGETS b/examples/apple/coreml/scripts/TARGETS new file mode 100644 index 00000000000..c47af5235f7 --- /dev/null +++ b/examples/apple/coreml/scripts/TARGETS @@ -0,0 +1,18 @@ +# Any targets that should be shared between fbcode and xplat must be defined in +# targets.bzl. This file can contain fbcode-only targets. 
+load("@fbcode_macros//build_defs:python_binary.bzl", "python_binary") + +python_binary( + name = "export", + srcs = [ + "export.py", + ], + main_function = "executorch.examples.apple.coreml.scripts.export.main", + deps = [ + "//executorch/backends/apple/coreml:backend", + "//executorch/backends/apple/coreml:partitioner", + "//executorch/backends/apple/coreml:quantizer", + "//executorch/devtools/etrecord:etrecord", + "//executorch/examples/models:models", + ], +) diff --git a/examples/apple/coreml/scripts/build_executor_runner.sh b/examples/apple/coreml/scripts/build_executor_runner.sh index b57a8f12e7c..9d20f289bf6 100755 --- a/examples/apple/coreml/scripts/build_executor_runner.sh +++ b/examples/apple/coreml/scripts/build_executor_runner.sh @@ -36,7 +36,7 @@ cmake "$EXECUTORCH_ROOT_PATH" -B"$CMAKE_BUILD_DIR_PATH" \ -DFLATC_EXECUTABLE="$(which flatc)" \ -DEXECUTORCH_BUILD_EXECUTOR_RUNNER=OFF \ -DEXECUTORCH_BUILD_XNNPACK=OFF \ --DEXECUTORCH_BUILD_SDK=ON \ +-DEXECUTORCH_BUILD_DEVTOOLS=ON \ -DEXECUTORCH_BUILD_COREML=ON \ -Dprotobuf_BUILD_TESTS=OFF \ -Dprotobuf_BUILD_EXAMPLES=OFF \ @@ -63,7 +63,7 @@ cp -rf "$COREML_DIR_PATH/runtime/include/" "$INCLUDE_DIR_PATH" echo "ExecuTorch: Copying libraries" mkdir "$LIBRARIES_DIR_PATH" find "$CMAKE_BUILD_DIR_PATH/" -name 'libexecutorch.a' -exec cp -f "{}" "$LIBRARIES_DIR_PATH/libexecutorch.a" \; -find "$CMAKE_BUILD_DIR_PATH/" -name 'libexecutorch_no_prim_ops.a' -exec cp -f "{}" "$LIBRARIES_DIR_PATH/libexecutorch_no_prim_ops.a" \; +find "$CMAKE_BUILD_DIR_PATH/" -name 'libexecutorch_core.a' -exec cp -f "{}" "$LIBRARIES_DIR_PATH/libexecutorch_core.a" \; find "$CMAKE_BUILD_DIR_PATH/" -name 'libprotobuf-lite.a' -exec cp -f "{}" "$LIBRARIES_DIR_PATH/libprotobuf-lite.a" \; find "$CMAKE_BUILD_DIR_PATH/" -name 'libprotobuf-lited.a' -exec cp -f "{}" "$LIBRARIES_DIR_PATH/libprotobuf-lite.a" \; find "$CMAKE_BUILD_DIR_PATH/" -name 'libetdump.a' -exec cp -f "{}" "$LIBRARIES_DIR_PATH/libetdump.a" \; diff --git a/examples/apple/coreml/scripts/debugger_cli.py b/examples/apple/coreml/scripts/debugger_cli.py index cb978de0746..88390f8d8cb 100644 --- a/examples/apple/coreml/scripts/debugger_cli.py +++ b/examples/apple/coreml/scripts/debugger_cli.py @@ -24,7 +24,7 @@ def get_root_dir_path() -> Path: sys.path.append(str((get_root_dir_path() / "examples").resolve())) from inspector_utils import ( - build_sdk_runner_including_coreml, + build_devtools_runner_including_coreml, ComparisonResult, create_inspector_coreml, create_inspector_reference, @@ -145,7 +145,7 @@ def main() -> None: f"Valid compute units are {valid_compute_units}." 
) - build_sdk_runner_including_coreml( + build_devtools_runner_including_coreml( root_dir_path=get_root_dir_path(), conda_env_name=args.conda_environment_name ) diff --git a/examples/apple/coreml/scripts/export.py b/examples/apple/coreml/scripts/export.py index e906c0704cb..1aa5806e371 100644 --- a/examples/apple/coreml/scripts/export.py +++ b/examples/apple/coreml/scripts/export.py @@ -21,14 +21,15 @@ from executorch.exir import to_edge from executorch.exir.backend.backend_api import to_backend + from torch.export import export REPO_ROOT = pathlib.Path(__file__).resolve().parent.parent.parent.parent.parent EXAMPLES_DIR = REPO_ROOT / "examples" sys.path.append(str(EXAMPLES_DIR.absolute())) -from models import MODEL_NAME_TO_MODEL -from models.model_factory import EagerModelFactory +from executorch.examples.models import MODEL_NAME_TO_MODEL +from executorch.examples.models.model_factory import EagerModelFactory # Script to export a model with coreml delegation. @@ -82,7 +83,7 @@ def partition_module_to_coreml(module): module = module.eval() -def lower_module_to_coreml(module, compile_specs): +def lower_module_to_coreml(module, compile_specs, example_inputs): module = module.eval() edge = to_edge(export(module, example_inputs), compile_config=_EDGE_COMPILE_CONFIG) # All of the subsequent calls on the edge_dialect_graph generated above (such as delegation or @@ -141,7 +142,7 @@ def generate_compile_specs_from_args(args): ) -if __name__ == "__main__": +def main(): args = parse_args() if args.model_name not in MODEL_NAME_TO_MODEL: @@ -167,6 +168,7 @@ def generate_compile_specs_from_args(args): if args.use_partitioner: model.eval() exir_program_aten = torch.export.export(model, example_inputs) + edge_program_manager = exir.to_edge(exir_program_aten) edge_copy = copy.deepcopy(edge_program_manager) partitioner = CoreMLPartitioner( @@ -179,6 +181,7 @@ def generate_compile_specs_from_args(args): else: lowered_module, edge_copy = lower_module_to_coreml( module=model, + example_inputs=example_inputs, compile_specs=compile_specs, ) exec_program = export_lowered_module_to_executorch_program( @@ -193,3 +196,7 @@ def generate_compile_specs_from_args(args): save_processed_bytes( lowered_module.processed_bytes, args.model_name, args.compute_unit ) + + +if __name__ == "__main__": + main() diff --git a/examples/apple/coreml/scripts/inspector_utils.py b/examples/apple/coreml/scripts/inspector_utils.py index 9d7420a920a..08af6fb3484 100644 --- a/examples/apple/coreml/scripts/inspector_utils.py +++ b/examples/apple/coreml/scripts/inspector_utils.py @@ -47,26 +47,26 @@ ] -def build_sdk_runner_including_coreml( +def build_devtools_runner_including_coreml( root_dir_path: Path, conda_env_name: str, force: bool = False, ): if not force: - sdk_executable_path = ( - root_dir_path / "cmake-out" / "examples" / "sdk" / "sdk_example_runner" + devtools_executable_path = ( + root_dir_path / "cmake-out" / "examples" / "devtools" / "example_runner" ) - print(sdk_executable_path) - if sdk_executable_path.is_file(): + print(devtools_executable_path) + if devtools_executable_path.is_file(): return cd_root_command: str = f"cd {root_dir_path.resolve()}" conda_activate_env_command: str = f"source conda activate {conda_env_name}" - build_sdk_runner_command: str = ( - "./examples/sdk/build_sdk_example_runner.sh --coreml" + build_devtools_runner_command: str = ( + "./examples/devtools/build_example_runner.sh --coreml" ) build_command: str = ( - f"{cd_root_command} && {conda_activate_env_command} && {build_sdk_runner_command}" + 
f"{cd_root_command} && {conda_activate_env_command} && {build_devtools_runner_command}" ) subprocess.run( f'bash -c "{build_command}"', shell=True, check=True @@ -173,22 +173,24 @@ def generate_etdump_with_intermediate_values( debug_buffer_path: Path, debug_buffer_size: int, ): - sdk_executable_path = ( - root_dir_path / "cmake-out" / "examples" / "sdk" / "sdk_example_runner" + devtools_executable_path = ( + root_dir_path / "cmake-out" / "examples" / "devtools" / "example_runner" ) - if not sdk_executable_path.is_file(): + if not devtools_executable_path.is_file(): raise FileNotFoundError( - errno.ENOENT, os.strerror(errno.ENOENT), str(sdk_executable_path.resolve()) + errno.ENOENT, + os.strerror(errno.ENOENT), + str(devtools_executable_path.resolve()), ) - sdk_runner_command: str = f""" - {sdk_executable_path.resolve()} -dump_intermediate_outputs\ + devtools_runner_command: str = f""" + {devtools_executable_path.resolve()} -dump_intermediate_outputs\ -bundled_program_path {bundled_program_path.resolve()}\ -etdump_path {et_dump_path.resolve()}\ -debug_output_path {debug_buffer_path.resolve()}\ -debug_buffer_size {debug_buffer_size}""" subprocess.run( - f'bash -c "{sdk_runner_command}"', shell=True, check=True + f'bash -c "{devtools_runner_command}"', shell=True, check=True ).check_returncode() diff --git a/examples/apple/mps/README.md b/examples/apple/mps/README.md index bebd1329be4..dc01d585f84 100644 --- a/examples/apple/mps/README.md +++ b/examples/apple/mps/README.md @@ -30,7 +30,7 @@ Once we have the model binary file, then let's run it with the ExecuTorch runtim # Build and install executorch cmake -DCMAKE_INSTALL_PREFIX=cmake-out \ -DCMAKE_BUILD_TYPE=Release \ - -DEXECUTORCH_BUILD_SDK=ON \ + -DEXECUTORCH_BUILD_DEVTOOLS=ON \ -DEXECUTORCH_ENABLE_EVENT_TRACER=ON \ -DEXECUTORCH_BUILD_MPS=ON \ -DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" \ diff --git a/examples/apple/mps/executor_runner/mps_executor_runner.mm b/examples/apple/mps/executor_runner/mps_executor_runner.mm index 040b2fcd996..e3d0e2978b6 100644 --- a/examples/apple/mps/executor_runner/mps_executor_runner.mm +++ b/examples/apple/mps/executor_runner/mps_executor_runner.mm @@ -97,8 +97,26 @@ 262144, // 256 KB "Size of the debug buffer in bytes to allocate for intermediate outputs and program outputs logging."); -using namespace torch::executor; -using torch::executor::util::FileDataLoader; +using executorch::etdump::ETDumpGen; +using executorch::etdump::ETDumpResult; +using executorch::extension::BufferCleanup; +using executorch::extension::BufferDataLoader; +using executorch::extension::FileDataLoader; +using executorch::runtime::DataLoader; +using executorch::runtime::EValue; +using executorch::runtime::Error; +using executorch::runtime::EventTracerDebugLogLevel; +using executorch::runtime::FreeableBuffer; +using executorch::runtime::HierarchicalAllocator; +using executorch::runtime::MemoryAllocator; +using executorch::runtime::MemoryManager; +using executorch::runtime::Method; +using executorch::runtime::MethodMeta; +using executorch::runtime::Program; +using executorch::runtime::Result; +using executorch::runtime::Span; + +namespace bundled_program = executorch::bundled_program; int main(int argc, char** argv) { { @@ -113,7 +131,7 @@ int main(int argc, char** argv) { return 1; } - runtime_init(); + executorch::runtime::runtime_init(); gflags::ParseCommandLineFlags(&argc, &argv, true); if (argc != 1) { @@ -144,20 +162,20 @@ int main(int argc, char** argv) { // Find the offset to the embedded Program. 
const void* program_data; size_t program_data_len; - Error status = torch::executor::bundled_program::GetProgramData( + Error status = bundled_program::get_program_data( const_cast(file_data->data()), file_data->size(), &program_data, &program_data_len); ET_CHECK_MSG( status == Error::Ok, - "GetProgramData() failed on file '%s': 0x%x", + "get_program_data() failed on file '%s': 0x%x", model_path, (unsigned int)status); // Wrap the buffer in a DataLoader. auto buffer_data_loader = - util::BufferDataLoader(program_data, program_data_len); + BufferDataLoader(program_data, program_data_len); // Parse the program file. This is immutable, and can also be reused between // multiple execution invocations across multiple threads. @@ -239,7 +257,7 @@ HierarchicalAllocator planned_memory( // be used by a single thread at at time, but it can be reused. // - torch::executor::ETDumpGen etdump_gen = torch::executor::ETDumpGen(); + ETDumpGen etdump_gen; Result method = program->load_method(method_name, &memory_manager, &etdump_gen); ET_CHECK_MSG( @@ -263,11 +281,11 @@ HierarchicalAllocator planned_memory( } // Prepare the inputs. - std::unique_ptr inputs; + std::unique_ptr inputs; if (FLAGS_bundled_program) { ET_LOG(Info, "Loading bundled program..."); // Use the inputs embedded in the bundled program. - status = torch::executor::bundled_program::LoadBundledInput( + status = bundled_program::load_bundled_input( *method, file_data->data(), FLAGS_testset_idx); @@ -278,11 +296,11 @@ HierarchicalAllocator planned_memory( } else { ET_LOG(Info, "Loading non-bundled program...\n"); // Use ones-initialized inputs. - auto inputs_result = torch::executor::util::prepare_input_tensors(*method); + auto inputs_result = executorch::extension::prepare_input_tensors(*method); if (inputs_result.ok()) { // Will free the inputs when destroyed. inputs = - std::make_unique(std::move(inputs_result.get())); + std::make_unique(std::move(inputs_result.get())); } } ET_LOG(Info, "Inputs prepared."); @@ -322,14 +340,14 @@ HierarchicalAllocator planned_memory( status = method->get_outputs(outputs.data(), outputs.size()); ET_CHECK(status == Error::Ok); // Print the first and last 100 elements of long lists of scalars. - std::cout << torch::executor::util::evalue_edge_items(100); + std::cout << executorch::extension::evalue_edge_items(100); for (int i = 0; i < outputs.size(); ++i) { std::cout << "Output " << i << ": " << outputs[i] << std::endl; } // Dump the etdump data containing profiling/debugging data to the specified // file. 
- etdump_result result = etdump_gen.get_etdump_data(); + ETDumpResult result = etdump_gen.get_etdump_data(); if (result.buf != nullptr && result.size > 0) { FILE* f = fopen(FLAGS_etdump_path.c_str(), "w+"); fwrite((uint8_t*)result.buf, 1, result.size, f); @@ -362,7 +380,7 @@ HierarchicalAllocator planned_memory( atol = 1e-01; rtol = 1e-01; } - status = torch::executor::bundled_program::VerifyResultWithBundledExpectedOutput( + status = bundled_program::verify_method_outputs( *method, file_data->data(), FLAGS_testset_idx, diff --git a/examples/apple/mps/scripts/build_mps_executor_runner.sh b/examples/apple/mps/scripts/build_mps_executor_runner.sh index 16754588b67..31ab54fd4d3 100755 --- a/examples/apple/mps/scripts/build_mps_executor_runner.sh +++ b/examples/apple/mps/scripts/build_mps_executor_runner.sh @@ -41,7 +41,7 @@ rm -rf "$OUTPUT" cmake -DBUCK2="$BUCK" \ -DCMAKE_INSTALL_PREFIX=cmake-out \ -DCMAKE_BUILD_TYPE="$MODE" \ - -DEXECUTORCH_BUILD_SDK=ON \ + -DEXECUTORCH_BUILD_DEVTOOLS=ON \ -DEXECUTORCH_ENABLE_EVENT_TRACER=ON \ -DEXECUTORCH_BUILD_MPS=ON \ -DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" \ diff --git a/examples/apple/mps/scripts/mps_example.py b/examples/apple/mps/scripts/mps_example.py index d6416e0ffc8..dfb958dce53 100644 --- a/examples/apple/mps/scripts/mps_example.py +++ b/examples/apple/mps/scripts/mps_example.py @@ -166,7 +166,7 @@ def get_model_config(args): # pre-autograd export. eventually this will become torch.export with torch.no_grad(): - model = torch._export.capture_pre_autograd_graph(model, example_inputs) + model = torch.export.export_for_training(model, example_inputs).module() edge: EdgeProgramManager = export_to_edge( model, example_inputs, diff --git a/examples/apple/mps/test_mps.sh b/examples/apple/mps/test_mps.sh index 55712089e07..555161dd3f7 100755 --- a/examples/apple/mps/test_mps.sh +++ b/examples/apple/mps/test_mps.sh @@ -11,14 +11,14 @@ set -e # shellcheck source=/dev/null source "$(dirname "${BASH_SOURCE[0]}")/../../../.ci/scripts/utils.sh" -cmake_install_executorch_sdk_lib() { +cmake_install_executorch_devtools_lib() { echo "Installing libexecutorch.a, libportable_kernels.a, libetdump.a, libbundled_program.a" rm -rf cmake-out retry cmake -DBUCK2="$BUCK" \ -DCMAKE_INSTALL_PREFIX=cmake-out \ -DCMAKE_BUILD_TYPE=Release \ - -DEXECUTORCH_BUILD_SDK=ON \ + -DEXECUTORCH_BUILD_DEVTOOLS=ON \ -DEXECUTORCH_BUILD_MPS=ON \ -DEXECUTORCH_ENABLE_EVENT_TRACER=ON \ -DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" \ @@ -60,5 +60,5 @@ then fi -cmake_install_executorch_sdk_lib +cmake_install_executorch_devtools_lib test_cmake_mps diff --git a/examples/arm/README.md b/examples/arm/README.md index 6e050768d1d..717a96c13e2 100644 --- a/examples/arm/README.md +++ b/examples/arm/README.md @@ -25,7 +25,7 @@ $ ./setup.sh --i-agree-to-the-contained-eula [optional-scratch-dir] # Step [2] - build + run ExecuTorch and executor_runner baremetal application # suited for Corstone300 to run a simple PyTorch model. 
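The `mps_example.py` hunk above (and the matching change in `examples/arm/aot_arm_compiler.py` further down) replaces `torch._export.capture_pre_autograd_graph` with `torch.export.export_for_training(...).module()`. A self-contained sketch of the new capture pattern, assuming a PyTorch build that ships `export_for_training`; the toy module and shapes are illustrative only.

```python
import torch


class TinyConv(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.conv = torch.nn.Conv2d(3, 8, kernel_size=3)

    def forward(self, x):
        return torch.relu(self.conv(x))


model = TinyConv().eval()
example_inputs = (torch.randn(1, 3, 16, 16),)

with torch.no_grad():
    # Previously: torch._export.capture_pre_autograd_graph(model, example_inputs)
    # Now: training-style export, then unwrap the captured module, which is what
    # the downstream quantization and lowering steps consume.
    captured_module = torch.export.export_for_training(model, example_inputs).module()

print(captured_module.graph)  # captured FX graph, ready for further lowering
```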
-$ ./run.sh [same-optional-scratch-dir-as-before] +$ ./run.sh [--scratch-dir=same-optional-scratch-dir-as-before] ``` ### Online Tutorial diff --git a/examples/arm/aot_arm_compiler.py b/examples/arm/aot_arm_compiler.py index 4d77e819089..fed8b8b0b49 100644 --- a/examples/arm/aot_arm_compiler.py +++ b/examples/arm/aot_arm_compiler.py @@ -9,6 +9,7 @@ import argparse import logging +import os import torch @@ -48,6 +49,19 @@ def get_model_and_inputs_from_name(model_name: str): model, example_inputs, _ = EagerModelFactory.create_model( *MODEL_NAME_TO_MODEL[model_name] ) + # Case 3: Model is in an external python file loaded as a module. + # ModelUnderTest should be a torch.nn.module instance + # ModelInputs should be a tuple of inputs to the forward function + elif model_name.endswith(".py"): + import importlib.util + + # load model's module and add it + spec = importlib.util.spec_from_file_location("tmp_model", model_name) + module = importlib.util.module_from_spec(spec) + spec.loader.exec_module(module) + model = module.ModelUnderTest + example_inputs = module.ModelInputs + else: raise RuntimeError( f"Model '{model_name}' is not a valid name. Use --help for a list of available models." @@ -133,7 +147,58 @@ def forward(self, x): "softmax": SoftmaxModule, } -if __name__ == "__main__": +targets = [ + "ethos-u55-32", + "ethos-u55-64", + "ethos-u55-128", + "ethos-u55-256", + "ethos-u85-128", + "ethos-u85-256", + "ethos-u85-512", + "ethos-u85-1024", + "ethos-u85-2048", + "TOSA", +] + + +def get_compile_spec(target: str, intermediates: bool) -> ArmCompileSpecBuilder: + spec_builder = None + if target == "TOSA": + spec_builder = ( + ArmCompileSpecBuilder().tosa_compile_spec().set_permute_memory_format(True) + ) + elif "ethos-u55" in target: + spec_builder = ( + ArmCompileSpecBuilder() + .ethosu_compile_spec( + target, + system_config="Ethos_U55_High_End_Embedded", + memory_mode="Shared_Sram", + extra_flags="--debug-force-regor --output-format=raw", + ) + .set_permute_memory_format(args.model_name in MODEL_NAME_TO_MODEL.keys()) + .set_quantize_io(True) + ) + elif "ethos-u85" in target: + spec_builder = ( + ArmCompileSpecBuilder() + .ethosu_compile_spec( + target, + system_config="Ethos_U85_SYS_DRAM_Mid", + memory_mode="Shared_Sram", + extra_flags="--output-format=raw", + ) + .set_permute_memory_format(True) + .set_quantize_io(True) + ) + + if intermediates is not None: + spec_builder.dump_intermediate_artifacts_to(args.intermediates) + + return spec_builder.build() + + +def get_args(): parser = argparse.ArgumentParser() parser.add_argument( "-m", @@ -149,6 +214,15 @@ def forward(self, x): default=False, help="Flag for producing ArmBackend delegated model", ) + parser.add_argument( + "-t", + "--target", + action="store", + required=False, + default="ethos-u55-128", + choices=targets, + help=f"For ArmBackend delegated models, pick the target, and therefore the instruction set generated. valid targets are {targets}", + ) parser.add_argument( "-q", "--quantize", @@ -167,8 +241,26 @@ def forward(self, x): parser.add_argument( "--debug", action="store_true", help="Set the logging level to debug." 
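The new `.py` branch in `get_model_and_inputs_from_name` loads an arbitrary user model via `importlib` and expects two module-level names, `ModelUnderTest` and `ModelInputs`. A minimal sketch of such an external file is below; the file name and model are hypothetical, for illustration only.

```python
# my_tiny_model.py -- passed to the compiler as: --model_name=my_tiny_model.py
import torch


class TinyAdd(torch.nn.Module):
    def forward(self, x, y):
        return x + y


# aot_arm_compiler.py reads exactly these two attributes after exec_module():
ModelUnderTest = TinyAdd().eval()             # a torch.nn.Module instance
ModelInputs = (torch.ones(4), torch.ones(4))  # tuple of inputs to forward()
```

Since the output name is later derived with `os.path.splitext` and `os.path.basename`, this file would produce a `my_tiny_model_arm_<target>.pte`-style artifact (with `_delegate` added when `--delegate` is used), matching the file names `run.sh` looks for.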
) - + parser.add_argument( + "-i", + "--intermediates", + action="store", + required=False, + help="Store intermediate output (like TOSA artefacts) somewhere.", + ) + parser.add_argument( + "-o", + "--output", + action="store", + required=False, + help="Location for outputs, if not the default of cwd.", + ) args = parser.parse_args() + return args + + +if __name__ == "__main__": + args = get_args() if args.debug: logging.basicConfig(level=logging.DEBUG, format=FORMAT, force=True) @@ -191,12 +283,12 @@ def forward(self, x): ): raise RuntimeError(f"Model {args.model_name} cannot be delegated.") - # 1. pick model from one of the supported lists + # Pick model from one of the supported lists model, example_inputs = get_model_and_inputs_from_name(args.model_name) model = model.eval() # pre-autograd export. eventually this will become torch.export - model = torch._export.capture_pre_autograd_graph(model, example_inputs) + model = torch.export.export_for_training(model, example_inputs).module() # Quantize if required if args.quantize: @@ -209,19 +301,18 @@ def forward(self, x): _check_ir_validity=False, ), ) + + # As we can target multiple output encodings from ArmBackend, one must + # be specified. + compile_spec = ( + get_compile_spec(args.target, args.intermediates) + if args.delegate is True + else None + ) + logging.debug(f"Exported graph:\n{edge.exported_program().graph}") if args.delegate is True: - edge = edge.to_backend( - ArmPartitioner( - ArmCompileSpecBuilder() - .ethosu_compile_spec("ethos-u55-128") - .set_permute_memory_format( - args.model_name in MODEL_NAME_TO_MODEL.keys() - ) - .set_quantize_io(True) - .build() - ) - ) + edge = edge.to_backend(ArmPartitioner(compile_spec)) logging.debug(f"Lowered graph:\n{edge.exported_program().graph}") try: @@ -237,7 +328,14 @@ def forward(self, x): else: raise e - model_name = f"{args.model_name}" + ( - "_arm_delegate" if args.delegate is True else "" + model_name = os.path.basename(os.path.splitext(args.model_name)[0]) + output_name = f"{model_name}" + ( + f"_arm_delegate_{args.target}" + if args.delegate is True + else f"_arm_{args.target}" ) - save_pte_program(exec_prog, model_name) + + if args.output is not None: + output_name = os.path.join(args.output, output_name) + + save_pte_program(exec_prog, output_name) diff --git a/examples/arm/ethos-u-setup/core_platform/patches/0001-Add-.data-fixup-from-Corestone-300.patch b/examples/arm/ethos-u-setup/core_platform/patches/0001-Add-.data-fixup-from-Corestone-300.patch new file mode 100644 index 00000000000..f2df3350d04 --- /dev/null +++ b/examples/arm/ethos-u-setup/core_platform/patches/0001-Add-.data-fixup-from-Corestone-300.patch @@ -0,0 +1,24 @@ +From 162ea6b51bd94fabf623cc6b63cf271497eaff8d Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Per=20=C3=85strand?= +Date: Fri, 13 Sep 2024 11:47:03 +0200 +Subject: [PATCH] Add .data fixup from Corestone-300 + +--- + targets/corstone-320/platform.ld | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/targets/corstone-320/platform.ld b/targets/corstone-320/platform.ld +index 2010d14..fb4e7b7 100644 +--- a/targets/corstone-320/platform.ld ++++ b/targets/corstone-320/platform.ld +@@ -77,6 +77,7 @@ PHDRS + rom_boot PT_LOAD; + rom_exec PT_LOAD; + rom_dram PT_LOAD; ++ data PT_LOAD; /* HACK: New prog header for .data (and friends) going in DTCM */ + null PT_NULL; + } + +-- +2.39.3 (Apple Git-146) + diff --git a/examples/arm/executor_runner/CMakeLists.txt b/examples/arm/executor_runner/CMakeLists.txt index 68c5435dffe..93fd8cca602 100644 --- 
a/examples/arm/executor_runner/CMakeLists.txt +++ b/examples/arm/executor_runner/CMakeLists.txt @@ -16,6 +16,8 @@ if(NOT DEFINED ET_PTE_FILE_PATH AND NOT ${SEMIHOSTING}) ) endif() +set(TARGET_BOARD "corstone-300" CACHE STRING "Target board") + # Example ExecuTorch demo for bare metal Cortex-M based systems set(ET_DIR_PATH "../../.." @@ -55,10 +57,13 @@ endif() # libraries. We link against ethosu_target_init which includes all of these # dependencies. -# For Corstone-300 FVP builds we put models into the larger DRAM area -set(MEMORY_MODEL "dram") -set(MEMORY_ARENA "dram") -add_subdirectory(${ETHOS_SDK_PATH}/core_platform/targets/corstone-300 target) +if(TARGET_BOARD STREQUAL "corstone-300") + add_subdirectory(${ETHOS_SDK_PATH}/core_platform/targets/corstone-300 target) +elseif(TARGET_BOARD STREQUAL "corstone-320") + add_subdirectory(${ETHOS_SDK_PATH}/core_platform/targets/corstone-320 target) +else() + message(FATAL_ERROR "Unsupported TARGET_BOARD: ${TARGET_BOARD}") +endif() # Dependencies from the ExecuTorch build add_library(executorch STATIC IMPORTED) @@ -67,12 +72,12 @@ set_property( "${ET_BUILD_DIR_PATH}/libexecutorch.a" ) -add_library(executorch_no_prim_ops STATIC IMPORTED) +add_library(executorch_core STATIC IMPORTED) set_property( - TARGET executorch_no_prim_ops - PROPERTY IMPORTED_LOCATION "${ET_BUILD_DIR_PATH}/libexecutorch_no_prim_ops.a" + TARGET executorch_core + PROPERTY IMPORTED_LOCATION "${ET_BUILD_DIR_PATH}/libexecutorch_core.a" ) -target_link_libraries(executorch INTERFACE executorch_no_prim_ops) +target_link_libraries(executorch INTERFACE executorch_core) add_library(executorch_delegate_ethos_u STATIC IMPORTED) set_property( @@ -171,7 +176,7 @@ endif() if(SEMIHOSTING) # Remove this when MLBEDSW-8910 is closed. set_source_files_properties( - ${ETHOS_SDK_PATH}/core_platform/targets/corstone-300/retarget.c + ${ETHOS_SDK_PATH}/core_platform/targets/${TARGET_BOARD}/retarget.c PROPERTIES HEADER_FILE_ONLY TRUE ) endif() diff --git a/examples/arm/executor_runner/arm_executor_runner.cpp b/examples/arm/executor_runner/arm_executor_runner.cpp index f8f9d34ecfc..3cfb96b99a6 100644 --- a/examples/arm/executor_runner/arm_executor_runner.cpp +++ b/examples/arm/executor_runner/arm_executor_runner.cpp @@ -22,35 +22,91 @@ #include "arm_perf_monitor.h" +#ifdef SEMIHOSTING + +/** + * The input_file_allocation_pool should be large enough to fit the various + * input file data used when loading the data files when running semihosting + * e.g. the input file data and the pte file data + * In our unit test flow, we have the capability to provide an enitre model to + * the Corstone-3xx FVP using semi hosting. Hence, the input file allocation + * pool needs to be large enough to take an entire model and input. On the FVP, + * input_data_sec is linked to the DDR, which is large (256MB on + * Corstone-300). + * If you use semihosting on your HW this can be lowered to fit your + * files/memory + */ + +const size_t input_file_allocation_pool_size = 60 * 1024 * 1024; +unsigned char __attribute__(( + section("input_data_sec"), + aligned(16))) input_file_allocation_pool[input_file_allocation_pool_size]; +char* model_pte = nullptr; + +#else + /** * This header file is generated by the build process based on the .pte file * specified in the ET_PTE_FILE_PATH variable to the cmake build. * Control of the action of the .pte, it's use of operators and delegates, and * which are included in the bare metal build are also orchestrated by the * CMakeLists file. For example use see examples/arm/run.sh + * + * e.g. 
This includes the pte as a big chunk of data struct into this file */ -#ifdef SEMIHOSTING -// TODO: Verify the section attribute to match the linker script -// pending MLETORCH-39 -const size_t input_allocation_pool_size = 1 * 1024 * 1024; -unsigned char __attribute__(( - section("network_model_sec"), - aligned(16))) input_allocation_pool[input_allocation_pool_size]; -// memory for the model will be allocated from the input_allocation_pool -char* model_pte = nullptr; -#else #include "model_pte.h" + #endif -using namespace exec_aten; -using namespace std; -using torch::executor::Error; -using torch::executor::Result; +using executorch::aten::ScalarType; +using executorch::aten::Tensor; +using executorch::aten::TensorImpl; +using executorch::extension::BufferCleanup; +using executorch::extension::BufferDataLoader; +using executorch::runtime::Error; +using executorch::runtime::EValue; +using executorch::runtime::HierarchicalAllocator; +using executorch::runtime::MemoryAllocator; +using executorch::runtime::MemoryManager; +using executorch::runtime::Method; +using executorch::runtime::MethodMeta; +using executorch::runtime::Program; +using executorch::runtime::Result; +using executorch::runtime::Span; +using executorch::runtime::Tag; +using executorch::runtime::TensorInfo; -#define METHOD_ALLOCATOR_POOL_SIZE (70 * 1024 * 1024) +/** + * The method_allocation_pool should be large enough to fit the setup, input + * used and other data used like the planned memory pool (e.g. memory-planned + * buffers to use for mutable tensor data) In this example we run on a + * Corstone-3xx FVP so we can use a lot of memory to be able to run and test + * large models if you run on HW this should be lowered to fit into your + * availible memory. + */ +#ifndef ET_ARM_BAREMETAL_METHOD_ALLOCATOR_POOL_SIZE +#define ET_ARM_BAREMETAL_METHOD_ALLOCATOR_POOL_SIZE (60 * 1024 * 1024) +#endif +const size_t method_allocation_pool_size = + ET_ARM_BAREMETAL_METHOD_ALLOCATOR_POOL_SIZE; unsigned char __attribute__(( - section("network_model_sec"), - aligned(16))) method_allocation_pool[METHOD_ALLOCATOR_POOL_SIZE]; + section("input_data_sec"), + aligned(16))) method_allocation_pool[method_allocation_pool_size]; + +/** + * The temp_allocation_pool is used for allocating temporary data during kernel + * or delegate execution. This will be reset after each kernel or delegate call. 
+ * Currently a MemoryAllocator is used but a PlatformMemoryAllocator is probably + * a better fit + */ +#ifndef ET_ARM_BAREMETAL_TEMP_ALLOCATOR_POOL_SIZE +#define ET_ARM_BAREMETAL_TEMP_ALLOCATOR_POOL_SIZE (1 * 1024 * 1024) +#endif +const size_t temp_allocation_pool_size = + ET_ARM_BAREMETAL_TEMP_ALLOCATOR_POOL_SIZE; +unsigned char __attribute__(( + section("input_data_sec"), + aligned(16))) temp_allocation_pool[temp_allocation_pool_size]; void et_pal_init(void) {} @@ -78,20 +134,59 @@ void et_pal_emit_log_message( } namespace { -using namespace torch::executor; -Result prepare_input_tensors( +// Setup our own allocator that can show some extra stuff like used and free +// memory info +class ArmMemoryAllocator : public executorch::runtime::MemoryAllocator { + public: + ArmMemoryAllocator(uint32_t size, uint8_t* base_address) + : MemoryAllocator(size, base_address), used_(0) {} + + void* allocate(size_t size, size_t alignment = kDefaultAlignment) override { + void* ret = executorch::runtime::MemoryAllocator::allocate(size, alignment); + if (ret != nullptr) { + // Align with the same code as in MemoryAllocator::allocate() to keep + // used_ "in sync" As alignment is expected to be power of 2 (checked by + // MemoryAllocator::allocate()) we can check it the lower bits + // (same as alignment - 1) is zero or not. + if ((size & (alignment - 1)) == 0) { + // Already aligned. + used_ += size; + } else { + used_ = (used_ | (alignment - 1)) + 1 + size; + } + } + return ret; + } + + // Returns the used size of the allocator's memory buffer. + size_t used_size() const { + return used_; + } + + // Returns the free size of the allocator's memory buffer. + size_t free_size() const { + return executorch::runtime::MemoryAllocator::size() - used_; + } + + private: + size_t used_; +}; + +Result prepare_input_tensors( Method& method, - torch::executor::MemoryAllocator& allocator, + MemoryAllocator& allocator, std::vector>& input_buffers) { MethodMeta method_meta = method.method_meta(); size_t num_inputs = method_meta.num_inputs(); size_t num_allocated = 0; +#ifdef SEMIHOSTING ET_CHECK_OR_RETURN_ERROR( input_buffers.size() > 0 && num_inputs == input_buffers.size(), InvalidArgument, "Wrong number of inputs allocated compared to method"); +#endif void** inputs = static_cast(allocator.allocate(num_inputs * sizeof(void*))); @@ -165,18 +260,18 @@ Result prepare_input_tensors( ET_LOG( Error, "Failed to prepare input %zu: 0x%" PRIx32, i, (uint32_t)err); // The BufferCleanup will free the inputs when it goes out of scope. 
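The `used_` bookkeeping in `ArmMemoryAllocator` mirrors the base allocator's alignment behavior: if the requested size is already a multiple of the alignment it is counted directly, otherwise the running counter is first rounded up to the next alignment boundary (note that, as in the C++ above, the check keys off the request size rather than the current offset). A small Python illustration of that arithmetic, for a power-of-two alignment; it is a sketch of the accounting only, not part of the runtime.

```python
def track_allocation(used: int, size: int, alignment: int = 16) -> int:
    """Mirror of ArmMemoryAllocator's used_ tracking (alignment is a power of two)."""
    if size & (alignment - 1) == 0:
        # Size is already a multiple of the alignment: count it directly.
        return used + size
    # Otherwise round the counter up to the next alignment boundary, then add size.
    return (used | (alignment - 1)) + 1 + size


used = 0
for request in (64, 10, 100):  # illustrative request sizes in bytes
    used = track_allocation(used, request)
    print(f"after {request:>3}-byte allocation: used = {used}")
```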
- util::BufferCleanup cleanup({inputs, num_allocated}); + BufferCleanup cleanup({inputs, num_allocated}); return err; } } - return util::BufferCleanup({inputs, num_allocated}); + return BufferCleanup({inputs, num_allocated}); } #ifdef SEMIHOSTING std::pair read_binary_file( const char* filename, - torch::executor::MemoryAllocator& allocator) { + MemoryAllocator& allocator) { FILE* fp = fopen(filename, "rb"); if (!fp) { ET_LOG( @@ -228,14 +323,14 @@ int main(int argc, const char* argv[]) { (void)argv; #endif - torch::executor::runtime_init(); + executorch::runtime::runtime_init(); std::vector> input_buffers; size_t pte_size = sizeof(model_pte); #ifdef SEMIHOSTING const char* output_basename = nullptr; - torch::executor::MemoryAllocator input_allocator( - input_allocation_pool_size, input_allocation_pool); + ArmMemoryAllocator input_file_allocator( + input_file_allocation_pool_size, input_file_allocation_pool); /* parse input parameters */ for (int i = 0; i < argc; i++) { @@ -249,13 +344,13 @@ int main(int argc, const char* argv[]) { ++nbr_inputs, input_tensor_filename); auto [buffer, buffer_size] = - read_binary_file(input_tensor_filename, input_allocator); + read_binary_file(input_tensor_filename, input_file_allocator); input_buffers.push_back(std::make_pair(buffer, buffer_size)); } else if (std::strcmp(argv[i], "-m") == 0) { const char* pte_filename = argv[++i]; ET_LOG(Info, "Reading pte model from file %s", pte_filename); auto [buffer, buffer_size] = - read_binary_file(pte_filename, input_allocator); + read_binary_file(pte_filename, input_file_allocator); // Store the model data with the same variable as if it was loaded // from compiled in location. model_pte = buffer; @@ -267,10 +362,9 @@ int main(int argc, const char* argv[]) { } #endif ET_LOG(Info, "Model in %p %c", model_pte, model_pte[0]); - auto loader = torch::executor::util::BufferDataLoader(model_pte, pte_size); + auto loader = BufferDataLoader(model_pte, pte_size); ET_LOG(Info, "Model PTE file loaded. 
Size: %lu bytes.", pte_size); - Result program = - torch::executor::Program::load(&loader); + Result program = Program::load(&loader); if (!program.ok()) { ET_LOG( Info, @@ -289,8 +383,7 @@ int main(int argc, const char* argv[]) { } ET_LOG(Info, "Running method %s", method_name); - Result method_meta = - program->method_meta(method_name); + Result method_meta = program->method_meta(method_name); if (!method_meta.ok()) { ET_LOG( Info, @@ -299,15 +392,15 @@ int main(int argc, const char* argv[]) { (unsigned int)method_meta.error()); } - torch::executor::MemoryAllocator method_allocator{ - torch::executor::MemoryAllocator( - METHOD_ALLOCATOR_POOL_SIZE, method_allocation_pool)}; + ArmMemoryAllocator method_allocator( + method_allocation_pool_size, method_allocation_pool); std::vector planned_buffers; // Owns the memory - std::vector> - planned_spans; // Passed to the allocator + std::vector> planned_spans; // Passed to the allocator size_t num_memory_planned_buffers = method_meta->num_memory_planned_buffers(); + size_t planned_buffer_membase = method_allocator.used_size(); + for (size_t id = 0; id < num_memory_planned_buffers; ++id) { size_t buffer_size = static_cast(method_meta->memory_planned_buffer_size(id).get()); @@ -320,14 +413,21 @@ int main(int argc, const char* argv[]) { planned_spans.push_back({planned_buffers.back(), buffer_size}); } - torch::executor::HierarchicalAllocator planned_memory( + size_t planned_buffer_memsize = + method_allocator.used_size() - planned_buffer_membase; + + HierarchicalAllocator planned_memory( {planned_spans.data(), planned_spans.size()}); - torch::executor::MemoryManager memory_manager( - &method_allocator, &planned_memory); + ArmMemoryAllocator temp_allocator( + temp_allocation_pool_size, temp_allocation_pool); + + MemoryManager memory_manager( + &method_allocator, &planned_memory, &temp_allocator); + + size_t method_loaded_membase = method_allocator.used_size(); - Result method = - program->load_method(method_name, &memory_manager); + Result method = program->load_method(method_name, &memory_manager); if (!method.ok()) { ET_LOG( Info, @@ -335,9 +435,12 @@ int main(int argc, const char* argv[]) { method_name, method.error()); } + size_t method_loaded_memsize = + method_allocator.used_size() - method_loaded_membase; ET_LOG(Info, "Method loaded."); ET_LOG(Info, "Preparing inputs..."); + size_t input_membase = method_allocator.used_size(); auto inputs = ::prepare_input_tensors(*method, method_allocator, input_buffers); @@ -349,12 +452,52 @@ int main(int argc, const char* argv[]) { method_name, inputs.error()); } + size_t input_memsize = method_allocator.used_size() - input_membase; ET_LOG(Info, "Input prepared."); ET_LOG(Info, "Starting the model execution..."); + size_t executor_membase = method_allocator.used_size(); StartMeasurements(); Error status = method->execute(); StopMeasurements(); + size_t executor_memsize = method_allocator.used_size() - executor_membase; + + ET_LOG(Info, "model_pte_loaded_size: %lu bytes.", pte_size); +#ifdef SEMIHOSTING + if (input_file_allocator.size() > 0) { + ET_LOG( + Info, + "input_file_allocator_used: %zu / %zu free: %zu ( used: %zu %% ) ", + input_file_allocator.used_size(), + input_file_allocator.size(), + input_file_allocator.free_size(), + 100 * input_file_allocator.used_size() / input_file_allocator.size()); + } +#endif + if (method_allocator.size() != 0) { + size_t method_allocator_used = method_allocator.used_size(); + ET_LOG( + Info, + "method_allocator_used: %zu / %zu free: %zu ( used: %zu %% ) ", + 
method_allocator_used, + method_allocator.size(), + method_allocator.free_size(), + 100 * method_allocator_used / method_allocator.size()); + ET_LOG( + Info, "method_allocator_planned: %zu bytes", planned_buffer_memsize); + ET_LOG(Info, "method_allocator_loaded: %zu bytes", method_loaded_memsize); + ET_LOG(Info, "method_allocator_input: %zu bytes", input_memsize); + ET_LOG(Info, "method_allocator_executor: %zu bytes", executor_memsize); + } + if (temp_allocator.size() > 0) { + ET_LOG( + Info, + "temp_allocator_used: %zu / %zu free: %zu ( used: %zu %% ) ", + temp_allocator.used_size(), + temp_allocator.size(), + temp_allocator.free_size(), + 100 * temp_allocator.used_size() / temp_allocator.size()); + } if (status != Error::Ok) { ET_LOG( @@ -366,7 +509,7 @@ int main(int argc, const char* argv[]) { ET_LOG(Info, "Model executed successfully."); } - std::vector outputs(method->outputs_size()); + std::vector outputs(method->outputs_size()); ET_LOG(Info, "%zu outputs: ", outputs.size()); status = method->get_outputs(outputs.data(), outputs.size()); ET_CHECK(status == Error::Ok); diff --git a/examples/arm/executor_runner/arm_perf_monitor.cpp b/examples/arm/executor_runner/arm_perf_monitor.cpp index c53d28baab4..323010bfd71 100644 --- a/examples/arm/executor_runner/arm_perf_monitor.cpp +++ b/examples/arm/executor_runner/arm_perf_monitor.cpp @@ -39,10 +39,20 @@ void ethosu_inference_begin(struct ethosu_driver* drv, void*) { ETHOSU_PMU_PMCCNTR_CFG_Set_Start_Event(drv, ETHOSU_PMU_NPU_ACTIVE); // Setup 4 counters +#if defined(ETHOSU55) || defined(ETHOSU65) ETHOSU_PMU_Set_EVTYPER(drv, 0, ETHOSU_PMU_AXI0_RD_DATA_BEAT_RECEIVED); ETHOSU_PMU_Set_EVTYPER(drv, 1, ETHOSU_PMU_AXI1_RD_DATA_BEAT_RECEIVED); ETHOSU_PMU_Set_EVTYPER(drv, 2, ETHOSU_PMU_AXI0_WR_DATA_BEAT_WRITTEN); ETHOSU_PMU_Set_EVTYPER(drv, 3, ETHOSU_PMU_NPU_IDLE); +#elif defined(ETHOSU85) + ETHOSU_PMU_Set_EVTYPER(drv, 0, ETHOSU_PMU_EXT0_RD_DATA_BEAT_RECEIVED); + ETHOSU_PMU_Set_EVTYPER(drv, 1, ETHOSU_PMU_EXT1_RD_DATA_BEAT_RECEIVED); + ETHOSU_PMU_Set_EVTYPER(drv, 2, ETHOSU_PMU_EXT0_WR_DATA_BEAT_WRITTEN); + ETHOSU_PMU_Set_EVTYPER(drv, 3, ETHOSU_PMU_NPU_IDLE); +#else +#error No NPU target defined +#endif + // Enable 4 counters ETHOSU_PMU_CNTR_Enable(drv, 0xf); @@ -160,9 +170,17 @@ void StopMeasurements() { for (size_t i = 0; i < ethosu_pmuCountersUsed; i++) { ET_LOG(Info, "ethosu_pmu_cntr%zd : %" PRIu64, i, ethosu_pmuEventCounts[i]); } +#if defined(ETHOSU55) || defined(ETHOSU65) ET_LOG( Info, "Ethos-U PMU Events:[ETHOSU_PMU_AXI0_RD_DATA_BEAT_RECEIVED, ETHOSU_PMU_AXI1_RD_DATA_BEAT_RECEIVED, ETHOSU_PMU_AXI0_WR_DATA_BEAT_WRITTEN, ETHOSU_PMU_NPU_IDLE]"); +#elif defined(ETHOSU85) + ET_LOG( + Info, + "Ethos-U PMU Events:[ETHOSU_PMU_EXT0_RD_DATA_BEAT_RECEIVED, ETHOSU_PMU_EXT1_RD_DATA_BEAT_RECEIVED, ETHOSU_PMU_EXT0_WR_DATA_BEAT_WRITTEN, ETHOSU_PMU_NPU_IDLE]"); +#else +#error No NPU target defined +#endif } #else diff --git a/examples/arm/run.sh b/examples/arm/run.sh index f41e0ef50c6..bdf80029049 100755 --- a/examples/arm/run.sh +++ b/examples/arm/run.sh @@ -9,20 +9,61 @@ set -eu -if [[ "${1:-'.'}" == "-h" || "${#}" -gt 2 ]]; then - echo "Usage: $(basename $0) [path-to-a-scratch-dir]" - echo "Supplied args: $*" - exit 1 -fi + ######## ### Hardcoded constants ######## script_dir=$(cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) -# Ethos-u -root_dir=${1:-"${script_dir}/ethos-u-scratch"} +# Default Ethos-u tool folder override with --scratch-dir= +root_dir=${script_dir}/ethos-u-scratch + +model_name="" +aot_arm_compiler_flags="--delegate 
--quantize" +target="ethos-u55-128" +output_folder_set=false +output_folder="." +build_only=false +portable_kernels="aten::_softmax.out" + +help() { + echo "Usage: $(basename $0) [options]" + echo "Options:" + echo " --model_name= Model to run, can be a builtin, examples/models or a filename Default to all builtin models" + echo " --aot_arm_compiler_flags= Only used if --model_name is used Default: ${aot_arm_compiler_flags}" + echo " --portable_kernels= Comma separated list of portable (non delagated) kernels to include Default: ${portable_kernels}" + echo " --target= Target to build and run for Default: ${target}" + echo " --output= Output folder Default: ${output_folder}" + echo " --build_only Only build, don't run FVP" + echo " --scratch-dir= Path to your Ethos-U scrach dir if you not using default" + exit 0 +} + +for arg in "$@"; do + case $arg in + -h|--help) help ;; + --model_name=*) model_name="${arg#*=}";; + --aot_arm_compiler_flags=*) aot_arm_compiler_flags="${arg#*=}";; + --portable_kernels=*) portable_kernels="${arg#*=}";; + --target=*) target="${arg#*=}";; + --output=*) output_folder="${arg#*=}" ; output_folder_set=true ;; + --build_only) build_only=true ;; + --scratch-dir=*) root_dir="${arg#*=}";; + *) + ;; + esac +done + root_dir=$(realpath ${root_dir}) +output_folder=$(realpath ${output_folder}) +mkdir -p ${output_folder} +if [ "$output_folder_set" = true ] ; then + executor_runner_path=${output_folder} +else + executor_runner_path=${script_dir}/executor_runner +fi +executor_runner_path=$(realpath ${executor_runner_path}) ethos_u_root_dir="$(cd ${root_dir}/ethos-u && pwd)" ethos_u_build_dir=${ethos_u_root_dir}/core_platform/build @@ -33,30 +74,45 @@ et_root_dir=$(cd ${script_dir}/../.. && pwd) et_build_dir=${et_root_dir}/cmake-out fvp_model=FVP_Corstone_SSE-300_Ethos-U55 +if [[ ${target} =~ "ethos-u85" ]] +then + echo "target is ethos-u85 variant so switching to CS320 FVP" + fvp_model=FVP_Corstone_SSE-320 +fi + toolchain_cmake=${script_dir}/ethos-u-setup/arm-none-eabi-gcc.cmake _setup_msg="please refer to ${script_dir}/ethos-u-setup/setup.sh to properly install necessary tools." +if ! [[ $portable_kernels =~ ^((^|,)aten::[a-zA-Z0-9_]+\.out)*$ ]]; then + echo " ERROR: specified argument --portable_kernels=${portable_kernels}" + echo " is in the wrong format please use \"aten::.out,aten::.out,...\"" + echo " e.g. 
\"aten::_softmax.out,aten::add.out\"" + exit 1 +fi + # Generate a pte file function generate_pte_file() { - [[ $# -ne 2 ]] && { echo "[${FUNCNAME[0]}]" "Expecting model and delegate flag, got, $*"; exit 1; } + [[ $# -ne 2 ]] && { echo "[${FUNCNAME[0]}]" "Expecting model and model_compiler_flags flag, got, $*"; exit 1; } local model=${1} - local delegate=${2} + local model_compiler_flags=${2} - local model_filename=${model}.pte - if [[ "${delegate}" == *"--delegate"* ]]; then - model_filename=${model}_arm_delegate.pte + local model_filename=${model}_arm_${target}.pte + if [[ "${model_compiler_flags}" == *"--delegate"* ]]; then + # Name aligned with default aot_arm_compiler output + model_filename=${model}_arm_delegate_${target}.pte fi cd $et_root_dir local pte_file - pte_file=$(realpath ${model_filename}) + pte_file=$(realpath ${output_folder}/${model_filename}) rm -f "${pte_file}" + SO_EXT=$(python3 -c 'import platform; print({"Darwin": "dylib", "Linux": "so", "Windows": "dll"}.get(platform.system(), None))') # We are using the aot_lib from build_quantization_aot_lib below - SO_LIB=$(find cmake-out-aot-lib -name libquantized_ops_aot_lib.so) + SO_LIB=$(find cmake-out-aot-lib -name libquantized_ops_aot_lib.${SO_EXT}) - python3 -m examples.arm.aot_arm_compiler --model_name="${model}" ${delegate} --so_library="$SO_LIB" 1>&2 - [[ -f ${pte_file} ]] || { echo "Failed to generate a pte file - ${pte_file}"; exit 1; } + python3 -m examples.arm.aot_arm_compiler --model_name="${model}" --target=${target} ${model_compiler_flags} --output ${output_folder} --so_library="$SO_LIB" 1>&2 + [[ -f ${pte_file} ]] || { >&2 echo "Failed to generate a pte file - ${pte_file}"; exit 1; } echo "${pte_file}" } @@ -78,8 +134,7 @@ function build_quantization_aot_lib() -Bcmake-out-aot-lib \ "${et_root_dir}" - n=$(nproc) - cmake --build cmake-out-aot-lib -j"$((n - 5))" -- quantized_ops_aot_lib + cmake --build cmake-out-aot-lib --parallel -- quantized_ops_aot_lib } @@ -98,7 +153,7 @@ function build_executorch() { -DCMAKE_BUILD_TYPE=Release \ -DEXECUTORCH_ENABLE_LOGGING=ON \ -DEXECUTORCH_BUILD_ARM_BAREMETAL=ON \ - -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \ + -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \ -DEXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL=ON \ -DFLATC_EXECUTABLE="$(which flatc)" \ -DCMAKE_TOOLCHAIN_FILE="${toolchain_cmake}" \ @@ -107,18 +162,17 @@ function build_executorch() { echo "[${FUNCNAME[0]}] Configured CMAKE" - n=$(nproc) - cmake --build ${et_build_dir} -j"$((n - 5))" --target install --config Release + cmake --build ${et_build_dir} --parallel --target install --config Release cmake \ -DCMAKE_INSTALL_PREFIX=${et_build_dir} \ -DCMAKE_BUILD_TYPE=Release \ - -DEXECUTORCH_SELECT_OPS_LIST="aten::_softmax.out" \ + -DEXECUTORCH_SELECT_OPS_LIST=${portable_kernels} \ -DEXECUTORCH_BUILD_ARM_BAREMETAL=ON \ -DCMAKE_TOOLCHAIN_FILE="${toolchain_cmake}" \ -B"${et_build_dir}"/examples/arm \ "${et_root_dir}"/examples/arm - cmake --build ${et_build_dir}/examples/arm -- -j"$((n - 5))" + cmake --build ${et_build_dir}/examples/arm --parallel -- set +x @@ -132,39 +186,69 @@ function build_executorch_runner() { echo "[${FUNCNAME[0]}] Generating ExecuTorch libraries" [[ $# -ne 1 ]] && { echo "[${FUNCNAME[0]}]" "Expecting a single pte file as argument got, $*"; exit 1; } local pte=${1} + if [[ ${target} == *"ethos-u55"* ]]; then + local target_cpu=cortex-m55 + local target_board=corstone-300 + else + local target_cpu=cortex-m85 + local target_board=corstone-320 + fi cd ${script_dir}/executor_runner - cmake 
-DCMAKE_TOOLCHAIN_FILE=${toolchain_cmake} \ - -DTARGET_CPU=cortex-m55 \ - -B cmake-out \ - -DETHOS_SDK_PATH:PATH=${ethos_u_root_dir} \ - -DET_DIR_PATH:PATH=${et_root_dir} \ - -DET_BUILD_DIR_PATH:PATH=${et_build_dir} \ - -DET_PTE_FILE_PATH:PATH="${pte}" \ + cmake -DCMAKE_TOOLCHAIN_FILE=${toolchain_cmake} \ + -DTARGET_CPU=${target_cpu} \ + -DTARGET_BOARD=${target_board} \ + -DETHOSU_TARGET_NPU_CONFIG=${target} \ + -B ${executor_runner_path}/cmake-out \ + -DETHOS_SDK_PATH:PATH=${ethos_u_root_dir} \ + -DET_DIR_PATH:PATH=${et_root_dir} \ + -DET_BUILD_DIR_PATH:PATH=${et_build_dir} \ + -DET_PTE_FILE_PATH:PATH="${pte}" \ -DPYTHON_EXECUTABLE=$(which python3) echo "[${FUNCNAME[0]}] Configured CMAKE" - n=$(nproc) - cmake --build cmake-out -- -j"$((n - 5))" arm_executor_runner + cmake --build ${executor_runner_path}/cmake-out --parallel -- arm_executor_runner echo "[${FUNCNAME[0]}] Generated baremetal elf file:" - find cmake-out -name "arm_executor_runner" + find ${executor_runner_path}/cmake-out -name "arm_executor_runner" + echo "executable_text: $(find ${executor_runner_path}/cmake-out -name arm_executor_runner -exec size {} \; | grep -v filename | awk '{print $1}') bytes" + echo "executable_data: $(find ${executor_runner_path}/cmake-out -name arm_executor_runner -exec size {} \; | grep -v filename | awk '{print $2}') bytes" + echo "executable_bss: $(find ${executor_runner_path}/cmake-out -name arm_executor_runner -exec size {} \; | grep -v filename | awk '{print $3}') bytes" } # Execute the executor_runner on FVP Simulator function run_fvp() { [[ $# -ne 1 ]] && { echo "[${FUNCNAME[0]}]" "Expexted elf binary name, got $*"; exit 1; } local elf_name=${1} - elf=$(find ${script_dir}/executor_runner -name "${elf_name}") + elf=$(find ${executor_runner_path} -name "${elf_name}") [[ ! -f $elf ]] && { echo "[${FUNCNAME[0]}]: Unable to find executor_runner elf: ${elf}"; exit 1; } - FVP_Corstone_SSE-300_Ethos-U55 \ - -C cpu0.CFGITCMSZ=11 \ - -C ethosu.num_macs=128 \ - -C mps3_board.visualisation.disable-visualisation=1 \ - -C mps3_board.telnetterminal0.start_telnet=0 \ - -C mps3_board.uart0.out_file='-' \ - -C mps3_board.uart0.shutdown_on_eot=1 \ - -a "${elf}" \ - --timelimit 120 || true # seconds - echo "[${FUNCNAME[0]} Simulation complete, $?" + num_macs=$(echo ${target} | cut -d - -f 3) + + if [[ ${target} == *"ethos-u55"* ]]; then + echo "Running ${elf} for ${target} run with FVP:${fvp_model} num_macs:${num_macs}" + ${fvp_model} \ + -C cpu0.CFGITCMSZ=11 \ + -C ethosu.num_macs=${num_macs} \ + -C mps3_board.visualisation.disable-visualisation=1 \ + -C mps3_board.telnetterminal0.start_telnet=0 \ + -C mps3_board.uart0.out_file='-' \ + -C mps3_board.uart0.shutdown_on_eot=1 \ + -a "${elf}" \ + --timelimit 120 || true # seconds + echo "[${FUNCNAME[0]} Simulation complete, $?" 
+ elif [[ ${target} == *"ethos-u85"* ]]; then + ${fvp_model} \ + -C mps4_board.subsystem.cpu0.CFGITCMSZ=11 \ + -C mps4_board.subsystem.ethosu.num_macs=${num_macs} \ + -C mps4_board.visualisation.disable-visualisation=1 \ + -C vis_hdlcd.disable_visualisation=1 \ + -C mps4_board.telnetterminal0.start_telnet=0 \ + -C mps4_board.uart0.out_file='-' \ + -C mps4_board.uart0.shutdown_on_eot=1 \ + -a "${elf}" \ + --timelimit 120 || true # seconds + else + echo "Running ${elf} for ${target} is not supported" + exit 1 + fi } ####### @@ -193,17 +277,31 @@ hash arm-none-eabi-gcc \ build_executorch build_quantization_aot_lib -# the test models run, and whether to delegate -test_model=( "softmax" "add" "add3" "mv2" ) -test_delegate=( "" "--delegate" "--delegate" "--delegate --quantize" ) +if [[ -z "$model_name" ]]; then + # the test models run, and whether to delegate + test_model=( "softmax" "add" "add3" "mv2" ) + model_compiler_flags=( "" "--delegate" "--delegate" "--delegate --quantize" ) +else + test_model=( "$model_name" ) + model_compiler_flags=( "$aot_arm_compiler_flags" ) +fi # loop over running the AoT flow and executing the model on device for i in "${!test_model[@]}"; do - printf "Running e2e flow for model '%s' with flags '%s'\n" "${test_model[i]}" "${test_delegate[i]}" - pte=$(generate_pte_file "${test_model[i]}" "${test_delegate[i]}") - # Rebuild the application as the pte is imported as a header/c array - build_executorch_runner "${pte}" - run_fvp arm_executor_runner + echo "--------------------------------------------------------------------------------" + printf "Running e2e flow for model '%s' with flags '%s'\n" "${test_model[i]}" "${model_compiler_flags[i]}" + echo "--------------------------------------------------------------------------------" + pte=$(generate_pte_file "${test_model[i]}" "${model_compiler_flags[i]}") + stat --printf="Generated pte_data_size: %s bytes\npte_file:%n\n" ${pte} + if [[ ${target} == *"TOSA"* ]]; then + echo "Build for ${target} skip generating .elf and running" + else + # Rebuild the application as the pte is imported as a header/c array + build_executorch_runner "${pte}" + if [ "$build_only" = false ] ; then + run_fvp arm_executor_runner + fi + fi done exit 0 diff --git a/examples/arm/setup.sh b/examples/arm/setup.sh index 9cef98e6227..c445dfe8d2c 100755 --- a/examples/arm/setup.sh +++ b/examples/arm/setup.sh @@ -9,7 +9,7 @@ set -eu -if [[ "${1:-'.'}" == "-h" || "${#}" -eq 0 || "${#}" -gt 2 ]]; then +if [[ "${1:-'.'}" == "-h" || "${#}" -gt 2 ]]; then echo "Usage: $(basename $0) <--i-agree-to-the-contained-eula> [path-to-a-scratch-dir]" echo "Supplied args: $*" exit 1 @@ -45,20 +45,28 @@ function verify_md5() { script_dir=$(cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) if [[ "${ARCH}" == "x86_64" ]]; then - # FVP - fvp_url="https://developer.arm.com/-/media/Arm%20Developer%20Community/Downloads/OSS/FVP/Corstone-300/FVP_Corstone_SSE-300_11.22_20_Linux64.tgz?rev=018659bd574f4e7b95fa647e7836ccf4&hash=22A79103C6FA5FFA7AFF3BE0447F3FF9" - fvp_model_dir="Linux64_GCC-9.3" - fvp_md5_checksum="98e93b949d0fbac977292d8668d34523" + # FVPs + corstone300_url="https://developer.arm.com/-/media/Arm%20Developer%20Community/Downloads/OSS/FVP/Corstone-300/FVP_Corstone_SSE-300_11.22_20_Linux64.tgz?rev=018659bd574f4e7b95fa647e7836ccf4&hash=22A79103C6FA5FFA7AFF3BE0447F3FF9" + corstone300_model_dir="Linux64_GCC-9.3" + corstone300_md5_checksum="98e93b949d0fbac977292d8668d34523" + + 
corstone320_url="https://developer.arm.com/-/media/Arm%20Developer%20Community/Downloads/OSS/FVP/Corstone-320/FVP_Corstone_SSE-320_11.27_25_Linux64.tgz?rev=a507bffc219a4d5792f1192ab7002d89&hash=D9A824AA8227D2E679C9B9787FF4E8B6FBE3D7C6" + corstone320_model_dir="Linux64_GCC-9.3" + corstone320_md5_checksum="3deb3c68f9b2d145833f15374203514d" # toochain toolchain_url="https://armkeil.blob.core.windows.net/developer/Files/downloads/gnu/12.3.rel1/binrel/arm-gnu-toolchain-12.3.rel1-x86_64-arm-none-eabi.tar.xz" toolchain_dir="arm-gnu-toolchain-12.3.rel1-x86_64-arm-none-eabi" toolchain_md5_checksum="00ebb1b70b1f88906c61206457eacb61" elif [[ "${ARCH}" == "aarch64" ]] || [[ "${ARCH}" == "arm64" ]]; then - # FVP - fvp_url="https://developer.arm.com/-/media/Arm%20Developer%20Community/Downloads/OSS/FVP/Corstone-300/FVP_Corstone_SSE-300_11.22_20_Linux64_armv8l.tgz?rev=9cc6e9a32bb947ca9b21fa162144cb01&hash=7657A4CF27D42E892E3F08D452AAB073" - fvp_model_dir="Linux64_armv8l_GCC-9.3" - fvp_md5_checksum="cbbabbe39b07939cff7a3738e1492ef1" + # FVPs + corstone300_url="https://developer.arm.com/-/media/Arm%20Developer%20Community/Downloads/OSS/FVP/Corstone-300/FVP_Corstone_SSE-300_11.22_20_Linux64_armv8l.tgz?rev=9cc6e9a32bb947ca9b21fa162144cb01&hash=7657A4CF27D42E892E3F08D452AAB073" + corstone300_model_dir="Linux64_armv8l_GCC-9.3" + corstone300_md5_checksum="cbbabbe39b07939cff7a3738e1492ef1" + + corstone320_url="https://developer.arm.com/-/media/Arm%20Developer%20Community/Downloads/OSS/FVP/Corstone-320/FVP_Corstone_SSE-320_11.27_25_Linux64_armv8l.tgz?rev=b6ebe0923cb84f739e017385fd3c333c&hash=8965C4B98E2FF7F792A099B08831FE3CB6120493" + corstone320_model_dir="Linux64_armv8l_GCC-9.3" + corstone320_md5_checksum="3889f1d80a6d9861ea4aa6f1c88dd0ae" # toochain if [[ "${OS}" == "Darwin" ]]; then @@ -76,15 +84,21 @@ fi # ethos-u ethos_u_repo_url="https://review.mlplatform.org/ml/ethos-u/ethos-u" -ethos_u_base_rev="24.05" +ethos_u_base_rev="24.08" ######## ### Mandatory user args ######## -eula_acceptance="${1:-'.'}"; shift +eula_acceptance="${1:-'.'}" if [[ "${eula_acceptance}" != "--i-agree-to-the-contained-eula" ]]; then - echo "Must pass first positional argument '--i-agree-to-the-contained-eula' to agree to EULA associated with downloading the FVP. Exiting!" - exit 1 + if [[ ${ARM_FVP_INSTALL_I_AGREE_TO_THE_CONTAINED_EULA} != "True" ]]; then + echo "Must pass first positional argument '--i-agree-to-the-contained-eula' to agree to EULA associated with downloading the FVP. Exiting!" + exit 1 + else + echo "Arm EULA for FVP agreed to with ARM_FVP_INSTALL_I_AGREE_TO_THE_CONTAINED_EULA=True environment variable" + fi +else + shift; # drop this arg fi ######## @@ -101,30 +115,56 @@ root_dir=$(realpath ${root_dir}) function setup_fvp() { if [[ "${OS}" != "Linux" ]]; then echo "[${FUNCNAME[0]}] Warning: FVP only supported with Linux OS, skipping FVP setup..." + echo "[${FUNCNAME[0]}] Warning: For MacOS, using https://github.com/Arm-Examples/FVPs-on-Mac is recommended." + echo "[${FUNCNAME[0]}] Warning: Follow the instructions and make sure the path is set correctly." return 1 fi # Download and install the Corstone 300 FVP simulator platform - cd "${root_dir}" - if [[ ! -e FVP_cs300.tgz ]]; then - echo "[${FUNCNAME[0]}] Downloading FVP ..." - curl --output FVP_cs300.tgz "${fvp_url}" - verify_md5 ${fvp_md5_checksum} FVP_cs300.tgz - fi - - echo "[${FUNCNAME[0]}] Installing FVP ..." 
- rm -rf FVP - mkdir -p FVP - cd FVP - tar xf ../FVP_cs300.tgz - ./FVP_Corstone_SSE-300.sh --i-agree-to-the-contained-eula --force --destination ./ --quiet --no-interactive - - fvp_bin_path="$(cd models/${fvp_model_dir} && pwd)" - export PATH=${PATH}:${fvp_bin_path} - - hash FVP_Corstone_SSE-300_Ethos-U55 - echo "export PATH=\${PATH}:${fvp_bin_path}" >> ${setup_path_script} - + fvps=("corstone300" "corstone320") + + for fvp in "${fvps[@]}"; do + cd "${root_dir}" + if [[ ! -e "FVP_${fvp}.tgz" ]]; then + echo "[${FUNCNAME[0]}] Downloading FVP ${fvp}..." + url_variable=${fvp}_url + fvp_url=${!url_variable} + curl --output "FVP_${fvp}.tgz" "${fvp_url}" + md5_variable=${fvp}_md5_checksum + fvp_md5_checksum=${!md5_variable} + verify_md5 ${fvp_md5_checksum} FVP_${fvp}.tgz + fi + + echo "[${FUNCNAME[0]}] Installing FVP ${fvp}..." + rm -rf FVP-${fvp} + mkdir -p FVP-${fvp} + cd FVP-${fvp} + tar xf ../FVP_${fvp}.tgz + + # Install the FVP + case ${fvp} in + corstone300) + ./FVP_Corstone_SSE-300.sh --i-agree-to-the-contained-eula --force --destination ./ --quiet --no-interactive + ;; + corstone320) + ./FVP_Corstone_SSE-320.sh --i-agree-to-the-contained-eula --force --destination ./ --quiet --no-interactive + ;; + *) + echo "[${FUNCNAME[0]}] Error: Unknown FVP model ${fvp}. Exiting." + exit 1 + ;; + esac + + model_dir_variable=${fvp}_model_dir + fvp_model_dir=${!model_dir_variable} + fvp_bin_path="$(cd models/${fvp_model_dir} && pwd)" + export PATH=${PATH}:${fvp_bin_path} + + echo "export PATH=\${PATH}:${fvp_bin_path}" >> ${setup_path_script} + done + + # Fixup for Corstone-320 python dependency + echo "export LD_LIBRARY_PATH=${root_dir}/FVP-corstone320/python/lib/" >> ${setup_path_script} } function setup_toolchain() { @@ -163,7 +203,7 @@ function patch_repo() { name="$(basename $repo_dir)" echo -e "[${FUNCNAME[0]}] Preparing ${name}..." cd $repo_dir - + git fetch git reset --hard ${base_rev} patch_dir=${script_dir}/ethos-u-setup/${name}/patches/ @@ -216,7 +256,7 @@ function setup_vela() { if [[ ! -e ethos-u-vela ]]; then git clone https://review.mlplatform.org/ml/ethos-u/ethos-u-vela repo_dir="${root_dir}/ethos-u-vela" - base_rev=7706c1281166e7611f4300ed26338087152a33c9 + base_rev=fe0eaa55c5ed319f78c01978f3b40eb11a9bcb38 patch_repo fi cd "${root_dir}/ethos-u-vela" @@ -261,7 +301,7 @@ setup_ethos_u # Patch the ethos-u dev environment to include executorch application repo_dir="${root_dir}/ethos-u/core_platform" -base_rev=204210b1074071532627da9dc69950d058a809f4 +base_rev=b728c774158248ba2cad8e78a515809e1eb9b77f patch_repo # Setup the tosa_reference_model diff --git a/examples/demo-apps/android/ExecuTorchDemo/README.md b/examples/demo-apps/android/ExecuTorchDemo/README.md index 9af1f5266eb..a60307dd90f 100644 --- a/examples/demo-apps/android/ExecuTorchDemo/README.md +++ b/examples/demo-apps/android/ExecuTorchDemo/README.md @@ -78,6 +78,7 @@ cmake . -DCMAKE_INSTALL_PREFIX=cmake-android-out \ -DEXECUTORCH_BUILD_XNNPACK=ON \ -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ + -DEXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL=ON \ -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ -Bcmake-android-out @@ -120,6 +121,7 @@ cmake . 
-DCMAKE_INSTALL_PREFIX=cmake-android-out \ -DQNN_SDK_ROOT="${QNN_SDK_ROOT}" \ -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ + -DEXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL=ON \ -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ -Bcmake-android-out diff --git a/examples/demo-apps/android/LlamaDemo/README.md b/examples/demo-apps/android/LlamaDemo/README.md index fc58d70a2f1..294a0ccecef 100644 --- a/examples/demo-apps/android/LlamaDemo/README.md +++ b/examples/demo-apps/android/LlamaDemo/README.md @@ -1,103 +1,147 @@ -# Building ExecuTorch LLaMA Android Demo App - -This app demonstrates the use of the LLaMA chat app demonstrating local inference use case with ExecuTorch. - -## Prerequisites -* Set up your ExecuTorch repo and environment if you haven’t done so by following the [Setting up ExecuTorch](https://pytorch.org/executorch/stable/getting-started-setup) to set up the repo and dev environment. -* Install [Java 17 JDK](https://www.oracle.com/java/technologies/javase/jdk17-archive-downloads.html). -* Install the [Android SDK API Level 34](https://developer.android.com/about/versions/14/setup-sdk) and - [Android NDK 25.0.8775105](https://developer.android.com/studio/projects/install-ndk). - * If you have Android Studio set up, you can install them with - * Android Studio Settings -> Language & Frameworks -> Android SDK -> SDK Platforms -> Check the row with API Level 34. - * Android Studio Settings -> Language & Frameworks -> Android SDK -> SDK Tools -> Check NDK (Side by side) row. - * Alternatively, you can follow [this guide](https://github.com/pytorch/executorch/blob/856e085b9344c8b0bf220a97976140a5b76356aa/examples/demo-apps/android/LlamaDemo/SDK.md) to set up Java/SDK/NDK with CLI. -* Supported Host OS: CentOS, macOS Sonoma on Apple Silicon. - -Note: This demo app and tutorial has only been validated with arm64-v8a [ABI](https://developer.android.com/ndk/guides/abis), with NDK 25.0.8775105. - -## Getting models -Please refer to the [ExecuTorch Llama2 docs](https://github.com/pytorch/executorch/blob/main/examples/models/llama2/README.md) to export the model. - -After you export the model and generate tokenizer.bin, push them device: -```bash -adb shell mkdir -p /data/local/tmp/llama -adb push llama2.pte /data/local/tmp/llama -adb push tokenizer.bin /data/local/tmp/llama -``` +# ExecuTorch Llama Android Demo App -Note: The demo app searches in `/data/local/tmp/llama` for .pte and .bin files as LLAMA model and tokenizer. +We’re excited to share that the newly revamped Android demo app is live and includes many new updates to provide a more intuitive and smoother user experience with a chat use case! The primary goal of this app is to showcase how easily ExecuTorch can be integrated into an Android demo app and how to exercise the many features ExecuTorch and Llama models have to offer. -## Build library -For the demo app to build, we need to build the ExecuTorch AAR library first. +This app serves as a valuable resource to inspire your creativity and provide foundational code that you can customize and adapt for your particular use case. -The AAR library contains the required Java package and the corresponding JNI -library for using ExecuTorch in your Android app. +Please dive in and start exploring our demo app today! We look forward to any feedback and are excited to see your innovative ideas. -### Alternative 1: Use prebuilt AAR library (recommended) -1. Open a terminal window and navigate to the root directory of the `executorch`. -2. 
Run the following command to download the prebuilt library: -```bash -bash examples/demo-apps/android/LlamaDemo/download_prebuilt_lib.sh -``` +## Key Concepts +From this demo app, you will learn many key concepts such as: +* How to prepare Llama models, build the ExecuTorch library, and model inferencing across delegates +* Expose the ExecuTorch library via JNI layer +* Familiarity with current ExecuTorch app-facing capabilities -The prebuilt AAR library contains the Java library and the JNI binding for -NativePeer.java and ExecuTorch native library, including core ExecuTorch -runtime libraries, XNNPACK backend, Portable kernels, Optimized kernels, -and Quantized kernels. It comes with two ABI variants, arm64-v8a and x86_64. +The goal is for you to see the type of support ExecuTorch provides and feel comfortable with leveraging it for your use cases. -If you want to use the prebuilt library for your own app, please refer to -[Using Android prebuilt libraries (AAR)](./android-prebuilt-library.md) for -tutorial. +## Supporting Models +As a whole, the models that this app supports are (varies by delegate): +* Llama 3.2 1B/3B +* Llama Guard 3 1B +* Llama 3.1 8B +* Llama 3 8B +* Llama 2 7B +* LLaVA-1.5 vision model (only XNNPACK) -If you need to use other dependencies (like tokenizer), please refer to -Alternative 2: Build from local machine option. -### Alternative 2: Build from local machine -1. Open a terminal window and navigate to the root directory of the `executorch`. -2. Set the following environment variables: -```bash -export ANDROID_NDK= -export ANDROID_ABI=arm64-v8a -``` -Note: `` is the root for the NDK, which is usually under -`~/Library/Android/sdk/ndk/XX.Y.ZZZZZ` for macOS, and contains NOTICE and README.md. -We use `/build/cmake/android.toolchain.cmake` for CMake to cross-compile. - -3. Build the Android Java extension code: -```bash -pushd extension/android -./gradlew build -popd -``` +## Building the APK +First it’s important to note that currently ExecuTorch provides support across 3 delegates. Once you identify the delegate of your choice, select the README link to get a complete end-to-end instructions for environment set-up to exporting the models to build ExecuTorch libraries and apps to run on device: -4. Run the following command set up the required JNI library: -```bash -pushd examples/demo-apps/android/LlamaDemo -./gradlew :app:setup -popd -``` -This is running the shell script [setup.sh](./setup.sh) which configures the required core ExecuTorch, LLAMA2, and Android libraries, builds them, and copy to jniLibs. +| Delegate | Resource | +| ------------- | ------------- | +| XNNPACK (CPU-based library) | [link](https://github.com/pytorch/executorch/blob/main/examples/demo-apps/android/LlamaDemo/docs/delegates/xnnpack_README.md) | +| QNN (Qualcomm AI Accelerators) | [link](https://github.com/pytorch/executorch/blob/main/examples/demo-apps/android/LlamaDemo/docs/delegates/qualcomm_README.md) | +| MediaTek (MediaTek AI Accelerators) | [link](https://github.com/pytorch/executorch/blob/main/examples/demo-apps/android/LlamaDemo/docs/delegates/mediatek_README.md) | -## Build APK -### Alternative 1: Android Studio (Recommended) +**WARNING** NDK r27 will cause issues like: + ``` + java.lang.UnsatisfiedLinkError: dlopen failed: cannot locate symbol "_ZTVNSt6__ndk114basic_ifstreamIcNS_11char_traitsIcEEEE" referenced by "/data/app/~~F5IwquaXUZPdLpSEYA-JGA==/com.example.executorchllamademo-FSyx80gEhsQCsxz7hvS2Ew==/lib/arm64/libexecutorch.so"... 
+ ``` + Please use NDK version 26.3.11579264. + +## How to Use the App + +This section will provide the main steps to use the app, along with a code snippet of the ExecuTorch API. + +For loading the app, development, and running on device we recommend Android Studio: 1. Open Android Studio and select "Open an existing Android Studio project" to open examples/demo-apps/android/LlamaDemo. 2. Run the app (^R). This builds and launches the app on the phone. -### Alternative 2: Command line -Without Android Studio UI, we can run gradle directly to build the app. We need to set up the Android SDK path and invoke gradle. -```bash -export ANDROID_HOME= -pushd examples/demo-apps/android/LlamaDemo -./gradlew :app:installDebug -popd +### Opening the App + +Below are the UI features for the app. + +Select the settings widget to get started with picking a model, its parameters and any prompts. +


+ + + +### Select Models and Parameters + +Once you've selected the model, tokenizer, and model type, you are ready to click on "Load Model" to have the app load the model and return to the main Chat activity. +


+ + + +Optional Parameters: +* Temperature: Defaulted to 0, you can adjust the temperature for the model as well. The model will reload upon any adjustments. +* System Prompt: Without any formatting, you can enter a system prompt. For example, "you are a travel assistant" or "give me a response in a few sentences". +* User Prompt: More for the advanced user, if you would like to manually input a prompt you can do so by modifying the `{{user prompt}}`. You can also modify the special tokens. Once changed, go back to the main Chat activity to send. + +#### ExecuTorch App API + +```java +// Upon returning to the Main Chat Activity +mModule = new LlamaModule( + ModelUtils.getModelCategory(mCurrentSettingsFields.getModelType()), + modelPath, + tokenizerPath, + temperature); +int loadResult = mModule.load(); +``` + +* `modelCategory`: Indicates whether it’s a text-only or vision model +* `modelPath`: path to the .pte file +* `tokenizerPath`: path to the tokenizer .bin file +* `temperature`: model parameter to adjust the randomness of the model’s output + + +### User Prompt +Once the model is successfully loaded, enter any prompt and click the send (i.e. generate) button to send it to the model. +
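As a rough sketch (not part of this diff) of how the app can gate prompt input on a successful `load()` call from the snippet above: `mModule` and `mSendButton` are fields used elsewhere in `MainActivity`, while the logging and the enable/disable logic here are assumptions.

```java
// Sketch only: allow prompts to be sent once load() (shown above) reports success.
int loadResult = mModule.load();
if (loadResult == 0) {
  mSendButton.setEnabled(true); // model is ready, enable the send (generate) button
} else {
  Log.e("LlamaDemo", "Model load failed with status " + loadResult);
}
```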


+ +You can also ask it follow-up questions. +
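Follow-up questions work because the app folds recent chat history back into the next prompt. Below is a minimal sketch of that idea; `MessageAdapter.getRecentSavedTextMessages()` comes from the changes later in this diff (it returns messages ordered as [input1, output1, input2, output2, ...]), while the `getText()` accessor and the plain-text concatenation are assumptions.

```java
// Sketch only: build the next prompt from recent history plus the new question.
private String buildFollowUpPrompt(MessageAdapter adapter, String rawPrompt) {
  StringBuilder history = new StringBuilder();
  for (Message m : adapter.getRecentSavedTextMessages(2)) { // last two prompt/response pairs
    history.append(m.getText()).append('\n');               // getText() is assumed here
  }
  return history.append(rawPrompt).toString();
}
```

The real app formats this history with the model's prompt template (see `getTotalFormattedPrompt()` in the `MainActivity` changes) rather than concatenating raw text.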


+ +#### ExecuTorch App API + +```java +mModule.generate(prompt,sequence_length, MainActivity.this); ``` +* `prompt`: User formatted prompt +* `sequence_length`: Number of tokens to generate in response to a prompt +* `MainActivity.this`: Indicate that the callback functions (OnResult(), OnStats()) are present in this class. -On the phone or emulator, you can try running the model: -Android LLaMA App
+[*LLaVA-1.5: Only for XNNPACK delegate*] -## Takeaways -Through this tutorial we've learnt how to build the ExecuTorch LLAMA library, and expose it to JNI layer to build the Android app. +For the LLaVA-1.5 implementation, select the exported LLaVA .pte file and its tokenizer file in the Settings menu and load the model. After this, you can send an image from your gallery, or take a live picture, along with a text prompt to the model. + +
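For context, the choice between the text and image paths is made in the `MainActivity` changes later in this diff; a condensed sketch of that dispatch (variable names as in the diff, surrounding plumbing omitted) looks like this:

```java
// Condensed from the MainActivity changes in this diff: vision models use
// generateFromPos(), text models use generate().
if (ModelUtils.getModelCategory(mCurrentSettingsFields.getModelType())
    == ModelUtils.VISION_MODEL) {
  mModule.generateFromPos(
      finalPrompt, ModelUtils.VISION_MODEL_SEQ_LEN, startPos, MainActivity.this, false);
} else {
  mModule.generate(finalPrompt, (int) (finalPrompt.length() * 0.75) + 64, MainActivity.this);
}
```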


+ + +### Output Generated +To show the completion of the follow-up question, here is the complete, detailed response from the model. +
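The response above is streamed token by token through the callback methods covered in the next snippet. Below is a minimal sketch of accumulating those tokens into the final text; the `StringBuilder` field and the log line are assumptions, not part of this diff.

```java
// Sketch only: collect streamed tokens into the full response shown above.
private final StringBuilder mResponseBuilder = new StringBuilder();

@Override
public void onResult(String result) {
  mResponseBuilder.append(result); // each callback delivers the next piece of the response
}

@Override
public void onStats(float tps) {
  Log.i("LlamaDemo", "tokens/sec reported by the runtime: " + tps);
}
```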


+ +#### ExecuTorch App API + +Ensure you have the following functions in your callback class that you provided in the `mModule.generate()`. For this example, it is `MainActivity.this`. +```java + @Override + public void onResult(String result) { + //...result contains token from response + //.. onResult will continue to be invoked until response is complete + } + + @Override + public void onStats(float tps) { + //...tps (tokens per second) stats is provided by framework + } + +``` ## Reporting Issues If you encountered any bugs or issues following this tutorial please file a bug/issue here on [Github](https://github.com/pytorch/executorch/issues/new). diff --git a/examples/demo-apps/android/LlamaDemo/SDK-quick-setup-guide.md b/examples/demo-apps/android/LlamaDemo/SDK-quick-setup-guide.md index 85c70057318..9ae79e96763 100644 --- a/examples/demo-apps/android/LlamaDemo/SDK-quick-setup-guide.md +++ b/examples/demo-apps/android/LlamaDemo/SDK-quick-setup-guide.md @@ -70,9 +70,9 @@ export ANDROID_HOME="$(realpath $DEV_HOME/sdk)" # Install SDK 34 ./cmdline-tools/bin/sdkmanager --sdk_root="${ANDROID_HOME}" --install "platforms;android-34" # Install NDK -./cmdline-tools/bin/sdkmanager --sdk_root="${ANDROID_HOME}" --install "ndk;25.0.8775105" +./cmdline-tools/bin/sdkmanager --sdk_root="${ANDROID_HOME}" --install "ndk;26.3.11579264" # The NDK root is then under `ndk/`. -export ANDROID_NDK="$ANDROID_HOME/ndk/25.0.8775105" +export ANDROID_NDK="$ANDROID_HOME/ndk/26.3.11579264" ``` ### (Optional) Android Studio Setup diff --git a/examples/demo-apps/android/LlamaDemo/app/build.gradle.kts b/examples/demo-apps/android/LlamaDemo/app/build.gradle.kts index db4ea8f74c6..37c8cbf0ba2 100644 --- a/examples/demo-apps/android/LlamaDemo/app/build.gradle.kts +++ b/examples/demo-apps/android/LlamaDemo/app/build.gradle.kts @@ -57,7 +57,7 @@ dependencies { implementation("androidx.constraintlayout:constraintlayout:2.2.0-alpha12") implementation("com.facebook.fbjni:fbjni:0.5.1") implementation("com.google.code.gson:gson:2.8.6") - implementation(files("libs/executorch-llama-mtk31.aar")) + implementation(files("libs/executorch-llama.aar")) implementation("com.google.android.material:material:1.12.0") implementation("androidx.activity:activity:1.9.0") testImplementation("junit:junit:4.13.2") diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/LlmBenchmarkRunner.java b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/LlmBenchmarkRunner.java index cee623507fd..7236fe317b0 100644 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/LlmBenchmarkRunner.java +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/LlmBenchmarkRunner.java @@ -9,7 +9,9 @@ package com.example.executorchllamademo; import android.app.Activity; +import android.app.ActivityManager; import android.content.Intent; +import android.os.Build; import android.os.Bundle; import android.util.Log; import android.widget.TextView; @@ -18,7 +20,11 @@ import java.io.File; import java.io.FileWriter; import java.io.IOException; +import java.util.ArrayList; import java.util.Arrays; +import java.util.List; +import java.util.regex.Matcher; +import java.util.regex.Pattern; public class LlmBenchmarkRunner extends Activity implements ModelRunnerCallback { ModelRunner mModelRunner; @@ -50,19 +56,21 @@ protected void onCreate(Bundle savedInstanceState) { } mStatsDump = new StatsDump(); + mStatsDump.modelName = 
model.getName().replace(".pte", ""); mModelRunner = new ModelRunner(model.getPath(), tokenizerPath, temperature, this); - mStatsDump.loadStart = System.currentTimeMillis(); + mStatsDump.loadStart = System.nanoTime(); } @Override public void onModelLoaded(int status) { - mStatsDump.loadEnd = System.currentTimeMillis(); + mStatsDump.loadEnd = System.nanoTime(); + mStatsDump.loadStatus = status; if (status != 0) { Log.e("LlmBenchmarkRunner", "Loaded failed: " + status); onGenerationStopped(); return; } - mStatsDump.generateStart = System.currentTimeMillis(); + mStatsDump.generateStart = System.nanoTime(); mModelRunner.generate(mPrompt); } @@ -81,36 +89,122 @@ public void onStats(String stats) { @Override public void onGenerationStopped() { - mStatsDump.generateEnd = System.currentTimeMillis(); + mStatsDump.generateEnd = System.nanoTime(); runOnUiThread( () -> { mTextView.append(mStatsDump.toString()); }); - // TODO (huydhn): Remove txt files here once the JSON format is ready - try (FileWriter writer = new FileWriter(getFilesDir() + "/benchmark_results.txt")) { - writer.write(mStatsDump.toString()); - } catch (IOException e) { - e.printStackTrace(); - } + final BenchmarkMetric.BenchmarkModel benchmarkModel = + BenchmarkMetric.extractBackendAndQuantization(mStatsDump.modelName); + final List results = new ArrayList<>(); + // The list of metrics we have atm includes: + // Load status + results.add(new BenchmarkMetric(benchmarkModel, "load_status", mStatsDump.loadStatus, 0)); + // Model load time + results.add( + new BenchmarkMetric( + benchmarkModel, + "model_load_time(ms)", + (mStatsDump.loadEnd - mStatsDump.loadStart) * 1e-6, + 0.0f)); + // LLM generate time + results.add( + new BenchmarkMetric( + benchmarkModel, + "generate_time(ms)", + (mStatsDump.generateEnd - mStatsDump.generateStart) * 1e-6, + 0.0f)); + // Token per second + results.add( + new BenchmarkMetric(benchmarkModel, "token_per_sec", extractTPS(mStatsDump.tokens), 0.0f)); - // TODO (huydhn): Figure out on what the final JSON results looks like, we need something - // with the same number of fields as https://github.com/pytorch/pytorch/pull/135042 try (FileWriter writer = new FileWriter(getFilesDir() + "/benchmark_results.json")) { Gson gson = new Gson(); - writer.write(gson.toJson(mStatsDump)); + writer.write(gson.toJson(results)); } catch (IOException e) { e.printStackTrace(); } } + + private double extractTPS(final String tokens) { + final Matcher m = Pattern.compile("\\d+\\.?\\d*").matcher(tokens); + if (m.find()) { + return Double.parseDouble(m.group()); + } else { + return 0.0f; + } + } +} + +class BenchmarkMetric { + public static class BenchmarkModel { + // The model name, i.e. stories110M + String name; + String backend; + String quantization; + + public BenchmarkModel(final String name, final String backend, final String quantization) { + this.name = name; + this.backend = backend; + this.quantization = quantization; + } + } + + BenchmarkModel benchmarkModel; + + // The metric name, i.e. 
TPS + String metric; + + // The actual value and the option target value + double actualValue; + double targetValue; + + public static class DeviceInfo { + // Let's see which information we want to include here + final String device = Build.BRAND; + // The phone model and Android release version + final String arch = Build.MODEL; + final String os = "Android " + Build.VERSION.RELEASE; + final long totalMem = new ActivityManager.MemoryInfo().totalMem; + final long availMem = new ActivityManager.MemoryInfo().availMem; + } + + DeviceInfo deviceInfo = new DeviceInfo(); + + public BenchmarkMetric( + final BenchmarkModel benchmarkModel, + final String metric, + final double actualValue, + final double targetValue) { + this.benchmarkModel = benchmarkModel; + this.metric = metric; + this.actualValue = actualValue; + this.targetValue = targetValue; + } + + // TODO (huydhn): Figure out a way to extract the backend and quantization information from + // the .pte model itself instead of parsing its name + public static BenchmarkMetric.BenchmarkModel extractBackendAndQuantization(final String model) { + final Matcher m = + Pattern.compile("(?\\w+)_(?\\w+)_(?\\w+)").matcher(model); + if (m.matches()) { + return new BenchmarkMetric.BenchmarkModel( + m.group("name"), m.group("backend"), m.group("quantization")); + } else { + return new BenchmarkMetric.BenchmarkModel(model, "", ""); + } + } } class StatsDump { + int loadStatus; long loadStart; long loadEnd; long generateStart; long generateEnd; String tokens; + String modelName; @NonNull @Override diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MainActivity.java b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MainActivity.java index fbd6948880f..b495a6b8457 100644 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MainActivity.java +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MainActivity.java @@ -107,10 +107,6 @@ public void onStats(float tps) { } private void setLocalModel(String modelPath, String tokenizerPath, float temperature) { - if (mModule != null) { - mModule.resetNative(); - mModule = null; - } Message modelLoadingMessage = new Message("Loading model...", false, MessageType.SYSTEM, 0); ETLogging.getInstance().log("Loading model " + modelPath + " with tokenizer " + tokenizerPath); runOnUiThread( @@ -119,6 +115,12 @@ private void setLocalModel(String modelPath, String tokenizerPath, float tempera mMessageAdapter.add(modelLoadingMessage); mMessageAdapter.notifyDataSetChanged(); }); + if (mModule != null) { + ETLogging.getInstance().log("Start deallocating existing module instance"); + mModule.resetNative(); + mModule = null; + ETLogging.getInstance().log("Completed deallocating existing module instance"); + } long runStartTime = System.currentTimeMillis(); mModule = new LlamaModule( @@ -665,7 +667,14 @@ private void onModelRunStopped() { mSendButton.setOnClickListener( view -> { addSelectedImagesToChatThread(mSelectedImageUri); + String finalPrompt; String rawPrompt = mEditTextMessage.getText().toString(); + if (ModelUtils.getModelCategory(mCurrentSettingsFields.getModelType()) + == ModelUtils.VISION_MODEL) { + finalPrompt = mCurrentSettingsFields.getFormattedSystemAndUserPrompt(rawPrompt); + } else { + finalPrompt = getTotalFormattedPrompt(getConversationHistory(), rawPrompt); + } // We store raw prompt into message adapter, because we don't want to show the extra // 
tokens from system prompt mMessageAdapter.add(new Message(rawPrompt, true, MessageType.TEXT, promptID)); @@ -693,23 +702,31 @@ public void run() { } }); long generateStartTime = System.currentTimeMillis(); - /* if (ModelUtils.getModelCategory(mCurrentSettingsFields.getModelType()) + if (ModelUtils.getModelCategory(mCurrentSettingsFields.getModelType()) == ModelUtils.VISION_MODEL) { mModule.generateFromPos( - mCurrentSettingsFields.getFormattedSystemAndUserPrompt(rawPrompt), + finalPrompt, ModelUtils.VISION_MODEL_SEQ_LEN, startPos, MainActivity.this, false); - } else {*/ - String finalPrompt = - getTotalFormattedPrompt(getConversationHistory(), rawPrompt); + } else if (mCurrentSettingsFields.getModelType() == ModelType.LLAMA_GUARD_3) { + String llamaGuardPromptForClassification = + PromptFormat.getFormattedLlamaGuardPrompt(rawPrompt); + ETLogging.getInstance() + .log("Running inference.. prompt=" + llamaGuardPromptForClassification); + mModule.generate( + llamaGuardPromptForClassification, + llamaGuardPromptForClassification.length() + 64, + MainActivity.this, + false); + } else { ETLogging.getInstance().log("Running inference.. prompt=" + finalPrompt); mModule.generate( finalPrompt, (int) (finalPrompt.length() * 0.75) + 64, MainActivity.this); - //} + } long generateDuration = System.currentTimeMillis() - generateStartTime; mResultMessage.setTotalGenerationTime(generateDuration); diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MessageAdapter.java b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MessageAdapter.java index 2538c852e48..31aaa9a1d5f 100644 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MessageAdapter.java +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MessageAdapter.java @@ -95,28 +95,32 @@ public ArrayList getSavedMessages() { public ArrayList getRecentSavedTextMessages(int numOfLatestPromptMessages) { ArrayList recentMessages = new ArrayList(); int lastIndex = savedMessages.size() - 1; - Message messageToAdd = savedMessages.get(lastIndex); - int oldPromptID = messageToAdd.getPromptID(); - - for (int i = 0; i < savedMessages.size(); i++) { - messageToAdd = savedMessages.get(lastIndex - i); - if (messageToAdd.getMessageType() != MessageType.SYSTEM) { - if (messageToAdd.getPromptID() != oldPromptID) { - numOfLatestPromptMessages--; - oldPromptID = messageToAdd.getPromptID(); - } - if (numOfLatestPromptMessages > 0) { - if (messageToAdd.getMessageType() == MessageType.TEXT) { - recentMessages.add(messageToAdd); + // In most cases lastIndex >=0 . + // A situation where the user clears chat history and enters prompt. Causes lastIndex=-1 . + if (lastIndex >= 0) { + Message messageToAdd = savedMessages.get(lastIndex); + int oldPromptID = messageToAdd.getPromptID(); + + for (int i = 0; i < savedMessages.size(); i++) { + messageToAdd = savedMessages.get(lastIndex - i); + if (messageToAdd.getMessageType() != MessageType.SYSTEM) { + if (messageToAdd.getPromptID() != oldPromptID) { + numOfLatestPromptMessages--; + oldPromptID = messageToAdd.getPromptID(); + } + if (numOfLatestPromptMessages > 0) { + if (messageToAdd.getMessageType() == MessageType.TEXT) { + recentMessages.add(messageToAdd); + } + } else { + break; } - } else { - break; } } + // To place the order in [input1, output1, input2, output2...] + Collections.reverse(recentMessages); } - // To place the order in [input1, output1, input2, output2...] 
- Collections.reverse(recentMessages); return recentMessages; } diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/ModelType.java b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/ModelType.java index 91e84be0590..b1074ee2cc6 100644 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/ModelType.java +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/ModelType.java @@ -11,5 +11,7 @@ public enum ModelType { LLAMA_3, LLAMA_3_1, + LLAMA_3_2, LLAVA_1_5, + LLAMA_GUARD_3, } diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/ModelUtils.java b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/ModelUtils.java index ab1f1bc92fc..28e14cdac01 100644 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/ModelUtils.java +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/ModelUtils.java @@ -21,6 +21,7 @@ public static int getModelCategory(ModelType modelType) { return VISION_MODEL; case LLAMA_3: case LLAMA_3_1: + case LLAMA_3_2: default: return TEXT_MODEL; } diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/PromptFormat.java b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/PromptFormat.java index 36e738c3d0e..1d794733d27 100644 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/PromptFormat.java +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/PromptFormat.java @@ -19,6 +19,7 @@ public static String getSystemPromptTemplate(ModelType modelType) { switch (modelType) { case LLAMA_3: case LLAMA_3_1: + case LLAMA_3_2: return "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n" + SYSTEM_PLACEHOLDER + "<|eot_id|>"; @@ -33,6 +34,8 @@ public static String getUserPromptTemplate(ModelType modelType) { switch (modelType) { case LLAMA_3: case LLAMA_3_1: + case LLAMA_3_2: + case LLAMA_GUARD_3: return "<|start_header_id|>user<|end_header_id|>\n" + USER_PLACEHOLDER + "<|eot_id|>" @@ -48,6 +51,7 @@ public static String getConversationFormat(ModelType modelType) { switch (modelType) { case LLAMA_3: case LLAMA_3_1: + case LLAMA_3_2: return getUserPromptTemplate(modelType) + "\n" + ASSISTANT_PLACEHOLDER + "<|eot_id|>"; case LLAVA_1_5: return USER_PLACEHOLDER + " ASSISTANT:"; @@ -60,6 +64,8 @@ public static String getStopToken(ModelType modelType) { switch (modelType) { case LLAMA_3: case LLAMA_3_1: + case LLAMA_3_2: + case LLAMA_GUARD_3: return "<|eot_id|>"; case LLAVA_1_5: return ""; @@ -72,4 +78,44 @@ public static String getLlavaPresetPrompt() { return "A chat between a curious human and an artificial intelligence assistant. The assistant" + " gives helpful, detailed, and polite answers to the human's questions. USER: "; } + + public static String getFormattedLlamaGuardPrompt(String userPrompt) { + return getUserPromptTemplate(ModelType.LLAMA_GUARD_3) + .replace( + USER_PLACEHOLDER, getLlamaGuardPresetPrompt().replace(USER_PLACEHOLDER, userPrompt)); + } + + public static String getLlamaGuardPresetPrompt() { + return "Task: Check if there is unsafe content in 'User' messages in conversations" + + " according our safety policy with the below categories.\n" + + "\n" + + "\n" + + "S1: Violent Crimes. 
\n" + + "S2: Non-Violent Crimes. \n" + + "S3: Sex Crimes. \n" + + "S4: Child Exploitation. \n" + + "S5: Defamation. \n" + + "S6: Specialized Advice. \n" + + "S7: Privacy. \n" + + "S8: Intellectual Property. \n" + + "S9: Indiscriminate Weapons. \n" + + "S10: Hate. \n" + + "S11: Self-Harm. \n" + + "S12: Sexual Content. \n" + + "S13: Elections. \n" + + "S14: Code Interpreter Abuse.\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "User: " + + USER_PLACEHOLDER + + "\n" + + "\n" + + "Provide your safety assessment for ONLY THE LAST User message in the above" + + " conversation:\n" + + " - First line must read 'safe' or 'unsafe'.\n" + + " - If unsafe, a second line must include a comma-separated list of violated" + + " categories."; + } } diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/SettingsActivity.java b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/SettingsActivity.java index 9d7d2f4ec2a..93a03b5bc7f 100644 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/SettingsActivity.java +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/SettingsActivity.java @@ -317,7 +317,7 @@ private static String[] listLocalFile(String path, String suffix) { } return result; } - return null; + return new String[] {}; } private void setupModelTypeSelectorDialog() { @@ -344,9 +344,11 @@ private void setupModelTypeSelectorDialog() { } private void setupTokenizerSelectorDialog() { - String[] binFiles = listLocalFile(MODEL_PATH, ".bin"); - String[] tokenizerFiles = new String[binFiles.length]; + String[] binFiles = listLocalFile("/data/local/tmp/llama/", ".bin"); + String[] modelFiles = listLocalFile("/data/local/tmp/llama/", ".model"); + String[] tokenizerFiles = new String[binFiles.length + modelFiles.length]; System.arraycopy(binFiles, 0, tokenizerFiles, 0, binFiles.length); + System.arraycopy(modelFiles, 0, tokenizerFiles, binFiles.length, modelFiles.length); AlertDialog.Builder tokenizerPathBuilder = new AlertDialog.Builder(this); tokenizerPathBuilder.setTitle("Select tokenizer path"); tokenizerPathBuilder.setSingleChoiceItems( diff --git a/examples/demo-apps/android/LlamaDemo/docs/delegates/mediatek_README.md b/examples/demo-apps/android/LlamaDemo/docs/delegates/mediatek_README.md new file mode 100644 index 00000000000..b0567d11fd1 --- /dev/null +++ b/examples/demo-apps/android/LlamaDemo/docs/delegates/mediatek_README.md @@ -0,0 +1,157 @@ +# Building ExecuTorch Android Demo for Llama running MediaTek +This tutorial covers the end to end workflow for running Llama 3-8B-instruct inference on MediaTek AI accelerators on an Android device. +More specifically, it covers: +1. Export and quantization of Llama models against the MediaTek backend. +2. Building and linking libraries that are required to inference on-device for Android platform using MediaTek AI accelerators. +3. Loading the needed files on the device and running inference. + +Verified on MacOS, Linux CentOS (model export), Python 3.10, Android NDK 26.3.11579264 +Phone verified: MediaTek Dimensity 9300 (D9300) chip. + +## Prerequisites +* Download and link the Buck2 build, Android NDK, and MediaTek ExecuTorch Libraries from the MediaTek Backend Readme ([link](https://github.com/pytorch/executorch/tree/main/backends/mediatek/scripts#prerequisites)). +* MediaTek Dimensity 9300 (D9300) chip device +* Desired Llama 3 model weights. 
You can download them on HuggingFace [Example](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct)). +* `libneuronusdk_adapter.mtk.so`, `libneuron_buffer_allocator.so`, and `.whl` files (will be available soon by MediaTek) + +## Setup ExecuTorch +In this section, we will need to set up the ExecuTorch repo first with Conda environment management. Make sure you have Conda available in your system (or follow the instructions to install it [here](https://anaconda.org/anaconda/conda)). The commands below are running on Linux (CentOS). + +Create a Conda environment +``` +conda create -yn et_mtk python=3.10.0 +conda activate et_mtk +``` + +Checkout ExecuTorch repo and sync submodules +``` +git clone https://github.com/pytorch/executorch.git +cd executorch +git submodule sync +git submodule update --init +``` +Install dependencies +``` +./install_requirements.sh +``` +## Setup Environment Variables +### Download Buck2 and make executable +* Download Buck2 from the official [Release Page](https://github.com/facebook/buck2/releases/tag/2024-02-01) +* Create buck2 executable +``` +zstd -cdq ".zst" > "/buck2" && chmod +x "/buck2" +``` + +### MediaTek ExecuTorch Libraries +The following libraries will be available soon by MediaTek: +libneuronusdk_adapter.mtk.so: This universal SDK contains the implementation required for executing target-dependent code on the MediaTek chip. +libneuron_buffer_allocator.so: This utility library is designed for allocating DMA buffers necessary for model inference. + +### Set Environment Variables +``` +export BUCK2=path_to_buck/buck2 # Download BUCK2 and create BUCK2 executable +export ANDROID_NDK=path_to_android_ndk +export NEURON_BUFFER_ALLOCATOR_LIB=path_to_buffer_allocator/libneuron_buffer_allocator.so +``` + +## Build Backend and MTK Llama Runner +Next we need to build and compile the MTK backend and MTK Llama runner. +``` +cd examples/mediatek +./mtk_build_examples.sh +``` + +This will generate a cmake-android-out folder that will contain a runner executable for inferring with Llama models and another library file: +* `cmake-android-out/examples/mediatek/mtk_llama_executor_runner` +* `cmake-android-out/backends/mediatek/libneuron_backend.so` + +## Export Llama Model +MTK currently supports Llama 3 exporting. + +### Set up Environment +1. Follow the ExecuTorch set-up environment instructions found on the [Getting Started](https://pytorch.org/executorch/stable/getting-started-setup.html) page +2. Set-up MTK AoT environment +``` +// Ensure that you are inside executorch/examples/mediatek directory +pip3 install -r requirements.txt + +// The following .whl file will be available soon +pip3 install mtk_neuron-8.2.2-py3-none-linux_x86_64.whl +pip3 install mtk_converter-8.8.0.dev20240723+public.d1467db9-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl +``` + +This was tested with transformers version 4.40 and numpy version 1.23. If you do not have these version then, use the following commands: +``` +pip install transformers==4.40 + +pip install numpy=1.23 +``` + +### Running Export +Prior to exporting, place the config.json, relevant tokenizer files and .bin or .safetensor weight files in `examples/mediatek/models/llm_models/weights`. 
+ +Here is an export example ([details](https://github.com/pytorch/executorch/tree/main/examples/mediatek#aot-flow)): +``` +cd examples/mediatek +# num_chunks=4, num_tokens=128, cache_size=512 +source shell_scripts/export_llama.sh llama3 "" "" "" alpaca.txt +``` + +There will be 3 main set of files generated: +* num_chunks*2 pte files: half are for prompt and the other half are for generation. Generation pte files are denoted by “1t” in the file name. +* Token embedding bin file: located in the weights folder where `config.json` is placed (`examples/mediatek/modes/llm_models/weight//embedding__fp32.bin`) +* Tokenizer file: `tokenizer.model` file + +Note: Exporting model flow can take 2.5 hours (114GB RAM for num_chunks=4) to complete. (Results may vary depending on hardware) + +Before continuing forward, make sure to modify the tokenizer, token embedding, and model paths in the examples/mediatek/executor_runner/run_llama3_sample.sh. + +## Deploy Files on Device + +### Prepare to Deploy +Prior to deploying the files on device, make sure to modify the tokenizer, token embedding, and model file names in examples/mediatek/executor_runner/run_llama3_sample.sh reflect what was generated during the Export Llama Model step. + +


+ +In addition, create a sample_prompt.txt file with a prompt. This will be deployed to the device in the next step. +* Example content of a sample_prompt.txt file: +``` +<|begin_of_text|><|start_header_id|>system<|end_header_id|> + +You are a helpful AI assistant for travel tips and recommendations<|eot_id|><|start_header_id|>user<|end_header_id|> + +What can you help me with?<|eot_id|><|start_header_id|>assistant<|end_header_id|> +``` + +### Deploy +First, make sure your Android phone uses a chipset compatible with this demo (MediaTek Dimensity 9300 (D9300)). Once you have generated the model, tokenizer, and runner, push them and the .so files to the device before running the runner via shell. + +``` +adb shell mkdir -p /data/local/tmp/llama +adb push examples/mediatek/executor_runner/run_llama3_sample.sh /data/local/tmp/llama +adb push sample_prompt.txt /data/local/tmp/llama +adb push cmake-android-out/examples/mediatek/mtk_llama_executor_runner /data/local/tmp/llama +adb push cmake-android-out/backends/mediatek/libneuron_backend.so /data/local/tmp/llama +adb push libneuron_buffer_allocator.so /data/local/tmp/llama +adb push libneuronusdk_adapter.mtk.so /data/local/tmp/llama +adb push embedding__fp32.bin /data/local/tmp/llama +adb push tokenizer.model /data/local/tmp/llama +``` + +## Run Demo +At this point we have pushed all the required files to the device and we are ready to run the demo! +``` +adb shell + +:/ $ cd data/local/tmp/llama +:/data/local/tmp/llama $ sh run_llama3_sample.sh +``` + +


+ +## Reporting Issues +If you encountered any bugs or issues following this tutorial please file a bug/issue here on [Github](https://github.com/pytorch/executorch/issues/new). diff --git a/examples/demo-apps/android/LlamaDemo/docs/delegates/qualcomm_README.md b/examples/demo-apps/android/LlamaDemo/docs/delegates/qualcomm_README.md new file mode 100644 index 00000000000..5f850e31f30 --- /dev/null +++ b/examples/demo-apps/android/LlamaDemo/docs/delegates/qualcomm_README.md @@ -0,0 +1,228 @@ +# Building ExecuTorch Android Demo App for Llama running Qualcomm + +This tutorial covers the end to end workflow for building an android demo app using Qualcomm AI accelerators on device. +More specifically, it covers: +1. Export and quantization of Llama models against the Qualcomm backend. +2. Building and linking libraries that are required to inference on-device for Android platform using Qualcomm AI accelerators. +3. Building the Android demo app itself. + +Verified on Linux CentOS, QNN SDK [v2.26](https://softwarecenter.qualcomm.com/api/download/software/qualcomm_neural_processing_sdk/v2.26.0.240828.zip), python 3.10, Android SDK r26c. + +Phone verified: OnePlus 12, Samsung 24+, Samsung 23 + +## Prerequisites +* Download and unzip QNN SDK [v2.26](https://softwarecenter.qualcomm.com/api/download/software/qualcomm_neural_processing_sdk/v2.26.0.240828.zip) +* Download and unzip Android SDK [r26](https://developer.android.com/ndk/downloads) +* Android phone with Snapdragon8 Gen3 (SM8650) or Gen2 (SM8550). Gen 1 and lower SoC might be supported but not fully validated. +* Desired Llama model weights in .PTH format. You can download them on HuggingFace ([Example](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct)). + +## Setup ExecuTorch +In this section, we will need to set up the ExecuTorch repo first with Conda environment management. Make sure you have Conda available in your system (or follow the instructions to install it [here](https://anaconda.org/anaconda/conda)). The commands below are running on Linux (CentOS). + +Create a Conda environment +``` +conda create -n et_qnn python=3.10.0 +conda activate et_qnn +``` + +Checkout ExecuTorch repo and sync submodules +``` +git clone https://github.com/pytorch/executorch.git +cd executorch +git submodule sync +git submodule update --init +``` +Install dependencies +``` +./install_requirements.sh +``` + +## Setup QNN +``` +# Set these variables correctly for your environment +export ANDROID_NDK_ROOT=$HOME/android-ndk-r26 # Download android SDK and unzip to home directory +export QNN_SDK_ROOT=$HOME/Your-SDK-Root #Folder contains lib +export EXECUTORCH_ROOT=$HOME/repos/executorch +export LD_LIBRARY_PATH=$QNN_SDK_ROOT/lib/x86_64-linux-clang/:$LD_LIBRARY_PATH +export PYTHONPATH=$EXECUTORCH_ROOT/.. +cp schema/program.fbs exir/_serialize/program.fbs +cp schema/scalar_type.fbs exir/_serialize/scalar_type.fbs +``` + +### Build QNN backend with ExecuTorch +``` +./backends/qualcomm/scripts/build.sh --release + +cmake -DPYTHON_EXECUTABLE=python \ + -DCMAKE_INSTALL_PREFIX=cmake-out \ + -DEXECUTORCH_ENABLE_LOGGING=1 \ + -DCMAKE_BUILD_TYPE=Release \ + -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ + -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ + -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ + -DEXECUTORCH_BUILD_QNN=ON \ + -DQNN_SDK_ROOT=${QNN_SDK_ROOT} \ + -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \ + -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ + -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \ + -Bcmake-out . 
+cmake --build cmake-out -j16 --target install --config Release +``` + + + +### Setup Llama Runner +Next we need to build and compile the Llama runner. This is similar to the requirements for running Llama with XNNPACK. +``` +sh examples/models/llama2/install_requirements.sh + +cmake -DPYTHON_EXECUTABLE=python \ + -DCMAKE_INSTALL_PREFIX=cmake-out \ + -DCMAKE_BUILD_TYPE=Release \ + -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ + -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \ + -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \ + -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ + -DEXECUTORCH_BUILD_QNN=ON \ + -Bcmake-out/examples/models/llama2 \ + examples/models/llama2 +cmake --build cmake-out/examples/models/llama2 -j16 --config Release +``` + +## Export Llama Model +QNN backend currently supports exporting to these data types: fp32, int4/ int8 with PTQ, int4 with SpinQuant (Llama 3 only). + +We also support export for different Qualcomm SoC. We have verified SM8650(V75) and SM8550(V73). To export for different SoC, add “--soc_model SM8550” in your export command. Without setting this flag, the export will default to SM8650. + +### Export with PTQ +We support PTQ by default. The entire export may take ~20 minutes (Llama 3.1 8B). However, there is accuracy regression and we are working on improving it. +8B models might need 16GB RAM on the device to run. + +Examples: +``` +# 4 bits weight only quantize +python -m examples.models.llama2.export_llama --checkpoint "${MODEL_DIR}/consolidated.00.pth" -p "${MODEL_DIR}/params.json" -kv --disable_dynamic_shape --qnn --pt2e_quantize qnn_16a4w -d fp32 --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --output_name="test.pte” +``` +If the model is really big, it may require model sharding because the Qualcomm DSP is a 32bit system and has a 4GB size limit . For example for Llama 3 8B models, we need to shard the model into 4, but ExecuTorch still packages it into one PTE file. Here is an example: +``` +# 8 bits quantization with 4 shards +python -m examples.models.llama2.export_llama --checkpoint "${MODEL_DIR}/consolidated.00.pth" -p "${MODEL_DIR}/params.json" -kv --disable_dynamic_shape --qnn --pt2e_quantize qnn_8a8w -d fp32 --num_sharding 4 --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --output_name="test.pte” +``` +Note: if you encountered issues below +``` +[ERROR] [Qnn ExecuTorch]: Cannot Open QNN library libQnnHtp.so, with error: libc++.so.1: cannot open shared object file: No such file or directory +``` + +Resolve by: + +* Install older QNN such as 2.23 or below and copy it from ${QNN_SDK_ROOT}/lib/x86_64-linux-clang +* Install it with apt-get by yourself +* Install it with script in ${QNN_SDK_ROOT}/bin/check-linux-dependency.sh +You could refer to [QNN SDK document](https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-50/setup.html?product=1601111740009302#linux-platform-dependencies) +* Install it with Conda: +``` +conda install -c conda-forge libcxx=14.0.0 +``` + +After installment, you will need to check libc++.so.1 in your LD_LIBRARY_PATH or system lib. Refer to this [PR](https://github.com/pytorch/executorch/issues/5120) for more detail. + +You may also wonder what the "--metadata" flag is doing. This flag helps export the model with proper special tokens added that the runner can detect EOS tokens easily. + +Convert tokenizer for Llama 2 +``` +python -m extension.llm.tokenizer.tokenizer -t tokenizer.model -o tokenizer.bin +``` +Rename tokenizer for Llama 3 with command: `mv tokenizer.model tokenizer.bin`. 
We are updating the demo app to support tokenizer in original format directly. + + +### Export with Spinquant (Llama 3 8B only) +We also support Llama 3 8B for Spinquant where the accuracy regression is minimal. + +Deploying large language models like Llama 3 on-device presents the following challenges: +* The model size is too large to fit in device memory for inference. +* High model loading and inference time. +* Difficulty in quantization. + +To address these challenges, we have implemented the following solutions: +* Using --pt2e_quantize qnn_16a4w to quantize activations and weights, thereby reducing the on-disk model size and alleviating memory pressure during inference. +* Using --num_sharding 8 to shard the model into sub-parts. +* Performing graph transformations to convert or decompose operations into more accelerator-friendly operations. +* Using --optimized_rotation_path to apply R1 and R2 of [Spin Quant](https://github.com/facebookresearch/SpinQuant) to improve accuracy. +* Using --calibration_data "<|start_header_id|>system<|end_header_id|..." to ensure that during the quantization of Llama 3 8B Instruct, the calibration includes special tokens in the prompt template. For more details on the prompt template, refer to the [model card](https://llama.meta.com/docs/model-cards-and-prompt-formats/meta-llama-3/) of meta llama3 instruct. + +To get the optimized matrix, please refer to [SpinQuant](https://github.com/facebookresearch/SpinQuant) on GitHub. You can download the optimized rotation matrices in the Quantized Models section. Please choose "LLaMA-3-8B/8B_W4A16KV16_lr_1.5_seed_0". + +To export Llama 3 8B instruct with the Qualcomm AI Engine Direct Backend, ensure the following: +* The host machine has more than 100GB of memory (RAM + swap space). +* The entire process takes a few hours. +* 8B models might need 16GB RAM on the device to run. +``` +# Please note that calibration_data must include the prompt template for special tokens. +python -m examples.models.llama2.export_llama -t -p -c --use_kv_cache --qnn --pt2e_quantize qnn_16a4w --disable_dynamic_shape --num_sharding 8 --calibration_tasks wikitext --calibration_limit 1 --calibration_seq_length 128 --optimized_rotation_path --calibration_data "<|start_header_id|>system<|end_header_id|>\n\nYou are a funny chatbot.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nCould you tell me about Facebook?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n" +``` + +## Pushing Model and Tokenizer + +Once you have the model and tokenizer ready, you can push them to the device before we start building the android demo app. +``` +adb shell mkdir -p /data/local/tmp/llama +adb push llama-exported.pte /data/local/tmp/llama +adb push tokenizer.bin /data/local/tmp/llama +``` + + + +## Build AAR Library +Open a terminal window and navigate to the root directory of the executorch. +Set the following environment variables: +``` +export ANDROID_NDK= +export ANDROID_ABI=arm64-v8a +``` +Note: is the root for the NDK, which is usually under ~/Library/Android/sdk/ndk/XX.Y.ZZZZZ for macOS, and contains NOTICE and README.md. We use /build/cmake/android.toolchain.cmake for CMake to cross-compile. 
+Build the Android Java extension code: +``` +pushd extension/android +./gradlew build +popd +``` +Run the following command set up the required JNI library: +``` +pushd examples/demo-apps/android/LlamaDemo +./gradlew :app:setupQnn +popd +``` +Alternative you can also just run the shell script directly as in the root directory: +``` +sh examples/demo-apps/android/LlamaDemo/setup-with-qnn.sh +``` +This is running the shell script which configures the required core ExecuTorch, Llama2/3, and Android libraries, builds them, and copies them to jniLibs. +Note: If you are building the Android app mentioned in the next section on a separate machine (i.e. MacOS but building and exporting for QNN backend on Linux), make sure you copy the aar file generated from setup-with-qnn script to “examples/demo-apps/android/LlamaDemo/app/libs” before building the Android app. + + +## Run the Android Demo App + +First, make sure your Android phone’s chipset version is compatible with this demo (SM8650, SM8550). You can find the Qualcomm chipset version here in the [mapping](https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-50/overview.html). + +If you build and run the setup-with-qnn script on a separate machine rather than where you are building the Android app, make sure you copy the aar file it generated into “examples/demo-apps/android/LlamaDemo/app/libs” + +### Alternative 1: Android Studio (Recommended) +Open Android Studio and select “Open an existing Android Studio project” to open examples/demo-apps/android/LlamaDemo. +Run the app (^R). This builds and launches the app on the phone. + +### Alternative 2: Command line +Without Android Studio UI, we can run gradle directly to build the app. We need to set up the Android SDK path and invoke gradle. +``` +export ANDROID_HOME= +pushd examples/demo-apps/android/LlamaDemo +./gradlew :app:installDebug +popd +``` +If the app successfully run on your device, you should see something like below: + +


+ +## Reporting Issues +If you encountered any bugs or issues following this tutorial please file a bug/issue here on Github. diff --git a/examples/demo-apps/android/LlamaDemo/docs/delegates/xnnpack_README.md b/examples/demo-apps/android/LlamaDemo/docs/delegates/xnnpack_README.md new file mode 100644 index 00000000000..9a8b86b8a50 --- /dev/null +++ b/examples/demo-apps/android/LlamaDemo/docs/delegates/xnnpack_README.md @@ -0,0 +1,195 @@ +# Building ExecuTorch Android Demo App for Llama/Llava running XNNPACK + +**[UPDATE - 09/25]** We have added support for running [Llama 3.2 models](#for-llama-32-1b-and-3b-models) on the XNNPACK backend. We currently support inference on their original data type (BFloat16). We have also added instructions to run [Llama Guard 1B models](#for-llama-guard-1b-models) on-device. + +This tutorial covers the end to end workflow for building an android demo app using CPU on device via XNNPACK framework. +More specifically, it covers: +1. Export and quantization of Llama and Llava models against the XNNPACK backend. +2. Building and linking libraries that are required to inference on-device for Android platform. +3. Building the Android demo app itself. + +Phone verified: OnePlus 12, OnePlus 9 Pro. Samsung S23 (Llama only), Samsung S24+ (Llama only), Pixel 8 Pro (Llama only) + + +## Known Issues +* With prompts like “What is the maxwell equation” the runner+jni is unable to handle odd unicodes. + +## Prerequisites +* Install [Java 17 JDK](https://www.oracle.com/java/technologies/javase/jdk17-archive-downloads.html). +* Install the [Android SDK API Level 34](https://developer.android.com/about/versions/15/setup-sdk) and [Android NDK 26.3.11579264](https://developer.android.com/studio/projects/install-ndk). **WARNING** NDK r27 will cause issues like: + ``` + java.lang.UnsatisfiedLinkError: dlopen failed: cannot locate symbol "_ZTVNSt6__ndk114basic_ifstreamIcNS_11char_traitsIcEEEE" referenced by "/data/app/~~F5IwquaXUZPdLpSEYA-JGA==/com.example.executorchllamademo-FSyx80gEhsQCsxz7hvS2Ew==/lib/arm64/libexecutorch.so"... + ``` + Please downgrade to version 26.3.11579264. +* If you have Android Studio set up, you can install them with + * Android Studio Settings -> Language & Frameworks -> Android SDK -> SDK Platforms -> Check the row with API Level 34. + * Android Studio Settings -> Language & Frameworks -> Android SDK -> SDK Tools -> Check NDK (Side by side) row. +* Alternatively, you can follow [this guide](https://github.com/pytorch/executorch/blob/856e085b9344c8b0bf220a97976140a5b76356aa/examples/demo-apps/android/LlamaDemo/SDK.md) to set up Java/SDK/NDK with CLI. +Supported Host OS: CentOS, macOS Sonoma on Apple Silicon. + + +Note: This demo app and tutorial has only been validated with arm64-v8a [ABI](https://developer.android.com/ndk/guides/abis), with NDK 26.3.11579264. + + + +## Setup ExecuTorch +In this section, we will need to set up the ExecuTorch repo first with Conda environment management. Make sure you have Conda available in your system (or follow the instructions to install it [here](https://anaconda.org/anaconda/conda)). The commands below are running on Linux (CentOS). 
+ +Create a Conda environment +``` +conda create -yn executorch python=3.10.0 +conda activate executorch +``` + +Checkout ExecuTorch repo and sync submodules +``` +git clone https://github.com/pytorch/executorch.git +cd executorch +git submodule sync +git submodule update --init +``` +Install dependencies +``` +./install_requirements.sh +``` + +Optional: Use the --pybind flag to install with pybindings. +``` +./install_requirements.sh --pybind xnnpack +``` + + +## Prepare Models +In this demo app, we support text-only inference with up-to-date Llama models and image reasoning inference with LLaVA 1.5. + +### For Llama 3.2 1B and 3B models +We have supported BFloat16 as a data type on the XNNPACK backend for Llama 3.2 1B/3B models. +* You can request and download model weights for Llama through Meta official [website](https://llama.meta.com/). +* For chat use-cases, download the instruct models instead of pretrained. +* Run `examples/models/llama2/install_requirements.sh` to install dependencies. +* The 1B model in BFloat16 format can run on mobile devices with 8GB RAM. The 3B model will require 12GB+ RAM. +* Export Llama model and generate .pte file as below: + +``` +python -m examples.models.llama2.export_llama --checkpoint --params -kv --use_sdpa_with_kv_cache -X -d bf16 --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --output_name="llama3_2.pte" +``` + +* Rename tokenizer for Llama 3.2 with command: `mv tokenizer.model tokenizer.bin`. We are updating the demo app to support tokenizer in original format directly. + +For more detail using Llama 3.2 lightweight models including prompt template, please go to our official [website](https://www.llama.com/docs/model-cards-and-prompt-formats/llama3_2#-llama-3.2-lightweight-models-(1b/3b)-). + + +### For Llama Guard 1B models +To safeguard your application, you can use our Llama Guard models for prompt classification or response classification as mentioned [here](https://www.llama.com/docs/model-cards-and-prompt-formats/llama-guard-3/). +* Llama Guard 3-1B is a fine-tuned Llama-3.2-1B pretrained model for content safety classification. It is aligned to safeguard against the [MLCommons standardized hazards taxonomy](https://arxiv.org/abs/2404.12241). +* You can download the latest Llama Guard 1B INT4 model, which is already exported for ExecuTorch, using instructions from [here](https://github.com/meta-llama/PurpleLlama/tree/main/Llama-Guard3). This model is pruned and quantized to 4-bit weights using 8da4w mode and reduced the size to <450MB to optimize deployment on edge devices. +* You can use the same tokenizer from Llama 3.2. +* To try this model, choose Model Type as LLAMA_GUARD_3 in the demo app below and try prompt classification for a given user prompt. +* We prepared this model using the following command + +``` +python -m examples.models.llama2.export_llama --checkpoint --params -d fp32 -kv --use_sdpa_with_kv_cache --quantization_mode 8da4w --group_size 256 --xnnpack --max_seq_length 8193 --embedding-quantize 4,32 --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --output_prune_map --output_name="llama_guard_3_1b_pruned_xnnpack.pte" +``` + + +### For Llama 3.1 and Llama 2 models +* You can download original model weights for Llama through Meta official [website](https://llama.meta.com/). +* For Llama 2 models, Edit params.json file. Replace "vocab_size": -1 with "vocab_size": 32000. This is a short-term workaround +* Run `examples/models/llama2/install_requirements.sh` to install dependencies. 
+* The Llama 3.1 and Llama 2 models (8B and 7B) can run on devices with 12GB+ RAM. +* Export Llama model and generate .pte file + +``` +python -m examples.models.llama2.export_llama --checkpoint --params -kv --use_sdpa_with_kv_cache -X -qmode 8da4w --group_size 128 -d fp32 --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --output_name="llama.pte" +``` + +You may wonder what the ‘--metadata’ flag is doing. This flag helps export the model with proper special tokens added that the runner can detect EOS tokens easily. + +* Convert tokenizer for Llama 2 and Llava (skip this for Llama 3.x) +``` +python -m extension.llm.tokenizer.tokenizer -t tokenizer.model -o tokenizer.bin +``` +* Rename tokenizer for Llama 3.1 with command: `mv tokenizer.model tokenizer.bin`. We are updating the demo app to support tokenizer in original format directly. + + +### For LLaVA model +* For the Llava 1.5 model, you can get it from Huggingface [here](https://huggingface.co/llava-hf/llava-1.5-7b-hf). +* Run `examples/models/llava/install_requirements.sh` to install dependencies. +* Run the following command to generate llava.pte, tokenizer.bin and an image tensor (serialized in TorchScript) image.pt. + +``` +python -m executorch.examples.models.llava.export_llava --pte-name llava.pte --with-artifacts +``` +* You can find more information [here](https://github.com/pytorch/executorch/tree/main/examples/models/llava). + + +## Pushing Model and Tokenizer +Once you have the model and tokenizer ready, you can push them to the device before we start building the Android demo app. +``` +adb shell mkdir -p /data/local/tmp/llama +adb push llama.pte /data/local/tmp/llama +adb push tokenizer.bin /data/local/tmp/llama +``` + +## Build AAR Library +1. Open a terminal window and navigate to the root directory of the executorch +2. Set the following environment variables: +``` +export ANDROID_NDK= +export ANDROID_ABI=arm64-v8a +``` +*Note: is the root for the NDK, which is usually under ~/Library/Android/sdk/ndk/XX.Y.ZZZZZ for macOS, and contains NOTICE and README.md. We use /build/cmake/android.toolchain.cmake for CMake to cross-compile.* + +3. Build the Android Java extension code: +``` +pushd extension/android +./gradlew build +popd +``` +4. Run the following command set up the required JNI library: +``` +pushd examples/demo-apps/android/LlamaDemo +./gradlew :app:setup +popd +``` +Alternative you can also just run the shell script directly as in the root directory: +``` +sh examples/demo-apps/android/LlamaDemo/setup.sh +``` + +This is running the shell script which configures the required core ExecuTorch, Llama2/3, and Android libraries, builds them, and copies them to jniLibs. + +**Output**: The executorch-llama.aar file will be generated in a newly created folder in the example/demo-apps/android/LlamaDemo/app/libs directory. This is the path that the Android app expects it to be in. + +**Note**: If you are building the Android app mentioned in the next section on a separate machine (i.e. MacOS but building and exporting on Linux), make sure you copy the aar file generated from setup script to “examples/demo-apps/android/LlamaDemo/app/libs” before building the Android app. + +### Alternative: Use prebuilt AAR library +1. Open a terminal window and navigate to the root directory of the executorch. +2. 
Run the following command to download the prebuilt library: +``` +bash examples/demo-apps/android/LlamaDemo/download_prebuilt_lib.sh +``` +The prebuilt AAR library contains the Java library, the JNI binding for NativePeer.java, and the ExecuTorch native library, including the core ExecuTorch runtime libraries, the XNNPACK backend, and the Portable, Optimized, and Quantized kernels. It comes with two ABI variants, arm64-v8a and x86_64. +If you need other dependencies (like the tokenizer), please use the local build option described above. + +## Run the Android Demo App +### Alternative 1: Android Studio (Recommended) +1. Open Android Studio and select “Open an existing Android Studio project” to open examples/demo-apps/android/LlamaDemo. +2. Run the app (^R). This builds and launches the app on the phone. + +### Alternative 2: Command line +Without the Android Studio UI, we can run Gradle directly to build the app. We need to set up the Android SDK path and invoke Gradle. +``` +export ANDROID_HOME= +pushd examples/demo-apps/android/LlamaDemo +./gradlew :app:installDebug +popd +``` +If the app runs successfully on your device, you should see something like below: + +

+ +

+ +## Reporting Issues +If you encountered any bugs or issues following this tutorial please file a bug/issue here on [Github](https://github.com/pytorch/executorch/issues/new). diff --git a/examples/demo-apps/android/LlamaDemo/setup-with-qnn.sh b/examples/demo-apps/android/LlamaDemo/setup-with-qnn.sh index c3b778d9b11..df70725942d 100644 --- a/examples/demo-apps/android/LlamaDemo/setup-with-qnn.sh +++ b/examples/demo-apps/android/LlamaDemo/setup-with-qnn.sh @@ -50,7 +50,7 @@ mkdir -p "${JNI_LIBS_PATH}/${ANDROID_ABI}" BUILD_AAR_DIR="$(mktemp -d)" mkdir -p "${BUILD_AAR_DIR}/jni/${ANDROID_ABI}" "${BUILD_AAR_DIR}/libs" JNI_LIBS_PATH="${BUILD_AAR_DIR}/jni" -cp "${CMAKE_OUT}"/extension/android/libexecutorch_jni.so "${JNI_LIBS_PATH}/${ANDROID_ABI}/libexecutorch_jni.so" +cp "${CMAKE_OUT}"/extension/android/libexecutorch_jni.so "${JNI_LIBS_PATH}/${ANDROID_ABI}/libexecutorch.so" cp "${CMAKE_OUT}"/lib/libqnn_executorch_backend.so "${JNI_LIBS_PATH}/${ANDROID_ABI}/" cp "${QNN_SDK_ROOT}"/lib/aarch64-android/libQnnHtp.so "${JNI_LIBS_PATH}/${ANDROID_ABI}/" cp "${QNN_SDK_ROOT}"/lib/aarch64-android/libQnnSystem.so "${JNI_LIBS_PATH}/${ANDROID_ABI}/" diff --git a/examples/demo-apps/apple_ios/ExecuTorchDemo/ExecuTorchDemo.xcodeproj/project.pbxproj b/examples/demo-apps/apple_ios/ExecuTorchDemo/ExecuTorchDemo.xcodeproj/project.pbxproj index 857c5252845..714a43297c5 100644 --- a/examples/demo-apps/apple_ios/ExecuTorchDemo/ExecuTorchDemo.xcodeproj/project.pbxproj +++ b/examples/demo-apps/apple_ios/ExecuTorchDemo/ExecuTorchDemo.xcodeproj/project.pbxproj @@ -806,8 +806,8 @@ isa = XCRemoteSwiftPackageReference; repositoryURL = "https://github.com/pytorch/executorch"; requirement = { - branch = latest; - kind = branch; + kind = revision; + revision = bdf3f5a1047c73ef61bb3e956d1d4528de743077; }; }; /* End XCRemoteSwiftPackageReference section */ diff --git a/examples/demo-apps/apple_ios/LLaMA/LLaMA.xcodeproj/project.pbxproj b/examples/demo-apps/apple_ios/LLaMA/LLaMA.xcodeproj/project.pbxproj index e8cb47091a5..71444492da3 100644 --- a/examples/demo-apps/apple_ios/LLaMA/LLaMA.xcodeproj/project.pbxproj +++ b/examples/demo-apps/apple_ios/LLaMA/LLaMA.xcodeproj/project.pbxproj @@ -43,7 +43,6 @@ 03729F132BB2042B00152F2E /* sampler.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 03729F112BB2042B00152F2E /* sampler.cpp */; }; 03729F162BB2043600152F2E /* bpe_tokenizer.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 03729F142BB2043600152F2E /* bpe_tokenizer.cpp */; }; 03729F172BB2043600152F2E /* tokenizer.h in Headers */ = {isa = PBXBuildFile; fileRef = 03729F152BB2043600152F2E /* tokenizer.h */; }; - 0372C3112C893FE900CD942A /* CoreGraphics.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 0372C3102C893FE900CD942A /* CoreGraphics.framework */; }; 0372C3142C89418E00CD942A /* llava_runner.h in Headers */ = {isa = PBXBuildFile; fileRef = 0372C3122C89418E00CD942A /* llava_runner.h */; }; 0372C3152C89418E00CD942A /* llava_runner.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 0372C3132C89418E00CD942A /* llava_runner.cpp */; }; 038D678C2C482C1E00B88CF2 /* llama_tiktoken.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 038D678A2C482C1D00B88CF2 /* llama_tiktoken.cpp */; }; @@ -56,6 +55,7 @@ 03D03DAB2C7823830088D6A7 /* text_decoder_runner.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 03D03DA92C7823830088D6A7 /* text_decoder_runner.cpp */; }; 03D03DAC2C7823830088D6A7 /* text_decoder_runner.h in Headers */ = {isa = PBXBuildFile; fileRef = 03D03DAA2C7823830088D6A7 /* text_decoder_runner.h */; }; 
03DDA0FB2BD6368100D234B3 /* base64.h in Headers */ = {isa = PBXBuildFile; fileRef = 03DDA0FA2BD6368100D234B3 /* base64.h */; }; + 26A6A4282C8A3769005A761E /* ImagePicker.swift in Sources */ = {isa = PBXBuildFile; fileRef = 26A6A4272C8A3769005A761E /* ImagePicker.swift */; }; 84DD94742C8105EB00C765A6 /* LLaMAPerfBenchmarkApp.swift in Sources */ = {isa = PBXBuildFile; fileRef = 84DD94732C8105EB00C765A6 /* LLaMAPerfBenchmarkApp.swift */; }; 84DD94812C81060E00C765A6 /* backend_coreml in Frameworks */ = {isa = PBXBuildFile; productRef = 84DD94802C81060E00C765A6 /* backend_coreml */; }; 84DD94832C81060E00C765A6 /* backend_coreml_debug in Frameworks */ = {isa = PBXBuildFile; productRef = 84DD94822C81060E00C765A6 /* backend_coreml_debug */; }; @@ -162,6 +162,7 @@ 03D03DA92C7823830088D6A7 /* text_decoder_runner.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = text_decoder_runner.cpp; sourceTree = ""; }; 03D03DAA2C7823830088D6A7 /* text_decoder_runner.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = text_decoder_runner.h; sourceTree = ""; }; 03DDA0FA2BD6368100D234B3 /* base64.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = base64.h; path = ../../../../extension/llm/tokenizer/base64.h; sourceTree = ""; }; + 26A6A4272C8A3769005A761E /* ImagePicker.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ImagePicker.swift; sourceTree = ""; }; 84DD94712C8105EB00C765A6 /* LLaMAPerfBenchmark.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = LLaMAPerfBenchmark.app; sourceTree = BUILT_PRODUCTS_DIR; }; 84DD94732C8105EB00C765A6 /* LLaMAPerfBenchmarkApp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = LLaMAPerfBenchmarkApp.swift; sourceTree = ""; }; 84DD94A72C8107AB00C765A6 /* LLaMAPerfBenchmark.entitlements */ = {isa = PBXFileReference; lastKnownFileType = text.plist.entitlements; path = LLaMAPerfBenchmark.entitlements; sourceTree = ""; }; @@ -182,13 +183,13 @@ 03312C352BBFC940002106EF /* backend_xnnpack in Frameworks */, 03312C372BBFC940002106EF /* backend_xnnpack_debug in Frameworks */, 03312C1B2BBFC940002106EF /* backend_coreml_debug in Frameworks */, - 03729EDB2BB1F8DE00152F2E /* LLaMARunner.framework in Frameworks */, 03312C2B2BBFC940002106EF /* kernels_optimized_debug in Frameworks */, 03312C2F2BBFC940002106EF /* kernels_portable_debug in Frameworks */, 03312C292BBFC940002106EF /* kernels_optimized in Frameworks */, 03312C192BBFC940002106EF /* backend_coreml in Frameworks */, 03312C332BBFC940002106EF /* kernels_quantized_debug in Frameworks */, 03312C1F2BBFC940002106EF /* kernels_custom_debug in Frameworks */, + 03729EDB2BB1F8DE00152F2E /* LLaMARunner.framework in Frameworks */, ); runOnlyForDeploymentPostprocessing = 0; }; @@ -196,7 +197,6 @@ isa = PBXFrameworksBuildPhase; buildActionMask = 2147483647; files = ( - 0372C3112C893FE900CD942A /* CoreGraphics.framework in Frameworks */, 03312C3E2BBFD076002106EF /* executorch_debug in Frameworks */, ); runOnlyForDeploymentPostprocessing = 0; @@ -253,6 +253,7 @@ 0324D6862BAACB6900DEF36F /* MessageView.swift */, 0324D6872BAACB6900DEF36F /* ResourceManager.swift */, 0324D6882BAACB6900DEF36F /* ResourceMonitor.swift */, + 26A6A4272C8A3769005A761E /* ImagePicker.swift */, ); path = Application; sourceTree = ""; @@ -629,7 +630,7 @@ ); runOnlyForDeploymentPostprocessing = 0; shellPath = /bin/sh; - shellScript = "set 
-e\n\nif ! command -v cmake &> /dev/null\nthen\n echo \"cmake not found, please install cmake.\"\n exit 1\nfi\n\nCMAKE_DIR=\"$TEMP_DIR/cmake\"\nrm -rf \"$CMAKE_DIR\"\n\nPLATFORM=\"SIMULATORARM64\"\nDEPLOYMENT_TARGET=\"17.0\"\n\nif [[ \"$PLATFORM_NAME\" == *\"iphoneos\"* ]]; then\n PLATFORM=\"OS64\"\nelif [[ \"$PLATFORM_NAME\" == *\"macos\"* ]]; then\n PLATFORM=\"MAC_ARM64\"\n DEPLOYMENT_TARGET=\"10.15\"\nfi\n\ncmake_build() {\n local src_dir=$1\n shift\n local extra_args=(\"$@\")\n local build_dir=\"$CMAKE_DIR/build/$(basename \"$src_dir\")\"\n\n mkdir -p \"$build_dir\" && cd \"$build_dir\"\n cmake -G Xcode \\\n -DCMAKE_BUILD_TYPE=\"Release\" \\\n -DCMAKE_CXX_STANDARD=17 \\\n -DCMAKE_TOOLCHAIN_FILE=\"$SRCROOT/../../../../third-party/ios-cmake/ios.toolchain.cmake\" \\\n -DCMAKE_XCODE_ATTRIBUTE_CLANG_CXX_LANGUAGE_STANDARD=\"c++17\" \\\n -DCMAKE_XCODE_ATTRIBUTE_CLANG_CXX_LIBRARY=\"libc++\" \\\n -DPLATFORM=\"$PLATFORM\" \\\n -DDEPLOYMENT_TARGET=\"$DEPLOYMENT_TARGET\" \\\n \"${extra_args[@]}\" \\\n \"$src_dir\"\n cmake --build . --config \"Release\"\n cmake --install . --prefix \"$CMAKE_DIR\"\n}\n\ncmake_build \"$SRCROOT/../../../../extension/llm/third-party/abseil-cpp\" \\\n -DABSL_PROPAGATE_CXX_STD=ON\n\ncmake_build \"$SRCROOT/../../../../extension/llm/third-party/re2\" \\\n -DCMAKE_PREFIX_PATH=\"$CMAKE_DIR/lib/cmake/absl\"\n \ncmake_build \"$SRCROOT/../../../../extension/llm/third-party/sentencepiece\" \\\n -DSPM_ENABLE_SHARED=OFF\n\necho \"$(find $CMAKE_DIR/lib -name \"*.a\" | sed -E 's|^.*/lib([^/]+)\\.a|-l\\1|g' | tr '\\n' ' ')\" > \"$CMAKE_DIR/linker_flags\"\n"; + shellScript = "set -e\n\nif ! command -v cmake &> /dev/null\nthen\n echo \"cmake not found, please install cmake. \\n1. Download Cmake.app from https://cmake.org/download with version > 3.19. \\n2. Install it to Applications/ folder and run sudo /Applications/CMake.app/Contents/bin/cmake-gui --install to install CMake commandline tools.\"\n exit 1\nfi\n\nCMAKE_DIR=\"$TEMP_DIR/cmake\"\nrm -rf \"$CMAKE_DIR\"\n\nPLATFORM=\"SIMULATORARM64\"\nDEPLOYMENT_TARGET=\"17.0\"\n\nif [[ \"$PLATFORM_NAME\" == *\"iphoneos\"* ]]; then\n PLATFORM=\"OS64\"\nelif [[ \"$PLATFORM_NAME\" == *\"macos\"* ]]; then\n PLATFORM=\"MAC_ARM64\"\n DEPLOYMENT_TARGET=\"10.15\"\nfi\n\ncmake_build() {\n local src_dir=$1\n shift\n local extra_args=(\"$@\")\n local build_dir=\"$CMAKE_DIR/build/$(basename \"$src_dir\")\"\n\n mkdir -p \"$build_dir\" && cd \"$build_dir\"\n cmake -G Xcode \\\n -DCMAKE_BUILD_TYPE=\"Release\" \\\n -DCMAKE_CXX_STANDARD=17 \\\n -DCMAKE_TOOLCHAIN_FILE=\"$SRCROOT/../../../../third-party/ios-cmake/ios.toolchain.cmake\" \\\n -DCMAKE_XCODE_ATTRIBUTE_CLANG_CXX_LANGUAGE_STANDARD=\"c++17\" \\\n -DCMAKE_XCODE_ATTRIBUTE_CLANG_CXX_LIBRARY=\"libc++\" \\\n -DPLATFORM=\"$PLATFORM\" \\\n -DDEPLOYMENT_TARGET=\"$DEPLOYMENT_TARGET\" \\\n \"${extra_args[@]}\" \\\n \"$src_dir\"\n cmake --build . --config \"Release\"\n cmake --install . 
--prefix \"$CMAKE_DIR\"\n}\n\ncmake_build \"$SRCROOT/../../../../extension/llm/third-party/abseil-cpp\" \\\n -DABSL_PROPAGATE_CXX_STD=ON\n\ncmake_build \"$SRCROOT/../../../../extension/llm/third-party/re2\" \\\n -DCMAKE_PREFIX_PATH=\"$CMAKE_DIR/lib/cmake/absl\"\n \ncmake_build \"$SRCROOT/../../../../extension/llm/third-party/sentencepiece\" \\\n -DSPM_ENABLE_SHARED=OFF\n\necho \"$(find $CMAKE_DIR/lib -name \"*.a\" | sed -E 's|^.*/lib([^/]+)\\.a|-l\\1|g' | tr '\\n' ' ')\" > \"$CMAKE_DIR/linker_flags\"\n"; }; /* End PBXShellScriptBuildPhase section */ @@ -645,6 +646,7 @@ 0324D6922BAACB6900DEF36F /* ResourceManager.swift in Sources */, 0324D68C2BAACB6900DEF36F /* ContentView.swift in Sources */, 0324D6902BAACB6900DEF36F /* MessageListView.swift in Sources */, + 26A6A4282C8A3769005A761E /* ImagePicker.swift in Sources */, 0324D6912BAACB6900DEF36F /* MessageView.swift in Sources */, 0324D68B2BAACB6900DEF36F /* App.swift in Sources */, ); @@ -860,7 +862,7 @@ OTHER_LDFLAGS = ""; "OTHER_LDFLAGS[sdk=iphoneos*]" = ( "-force_load", - "$(BUILT_PRODUCTS_DIR)/libkernels_portable-ios-debug.a", + "$(BUILT_PRODUCTS_DIR)/libkernels_optimized-ios-debug.a", "-force_load", "$(BUILT_PRODUCTS_DIR)/libkernels_custom-ios-debug.a", "-force_load", @@ -874,7 +876,7 @@ ); "OTHER_LDFLAGS[sdk=iphonesimulator*]" = ( "-force_load", - "$(BUILT_PRODUCTS_DIR)/libkernels_portable-simulator-debug.a", + "$(BUILT_PRODUCTS_DIR)/libkernels_optimized-simulator-debug.a", "-force_load", "$(BUILT_PRODUCTS_DIR)/libkernels_custom-simulator-debug.a", "-force_load", @@ -922,7 +924,7 @@ OTHER_LDFLAGS = ""; "OTHER_LDFLAGS[sdk=iphoneos*]" = ( "-force_load", - "$(BUILT_PRODUCTS_DIR)/libkernels_portable-ios-release.a", + "$(BUILT_PRODUCTS_DIR)/libkernels_optimized-ios-release.a", "-force_load", "$(BUILT_PRODUCTS_DIR)/libkernels_custom-ios-release.a", "-force_load", @@ -936,7 +938,7 @@ ); "OTHER_LDFLAGS[sdk=iphonesimulator*]" = ( "-force_load", - "$(BUILT_PRODUCTS_DIR)/libkernels_portable-simulator-release.a", + "$(BUILT_PRODUCTS_DIR)/libkernels_optimized-simulator-release.a", "-force_load", "$(BUILT_PRODUCTS_DIR)/libkernels_custom-simulator-release.a", "-force_load", @@ -1092,6 +1094,8 @@ ); MARKETING_VERSION = 1.0; "OTHER_LDFLAGS[sdk=iphoneos*]" = ( + "-force_load", + "$(BUILT_PRODUCTS_DIR)/libkernels_optimized-ios-debug.a", "-force_load", "$(BUILT_PRODUCTS_DIR)/libkernels_portable-ios-debug.a", "-force_load", @@ -1106,6 +1110,8 @@ "$(BUILT_PRODUCTS_DIR)/libbackend_mps-ios-debug.a", ); "OTHER_LDFLAGS[sdk=iphonesimulator*]" = ( + "-force_load", + "$(BUILT_PRODUCTS_DIR)/libkernels_optimized-simulator-debug.a", "-force_load", "$(BUILT_PRODUCTS_DIR)/libkernels_portable-simulator-debug.a", "-force_load", @@ -1280,8 +1286,8 @@ isa = XCRemoteSwiftPackageReference; repositoryURL = "https://github.com/pytorch/executorch"; requirement = { - branch = latest; - kind = branch; + kind = revision; + revision = bdf3f5a1047c73ef61bb3e956d1d4528de743077; }; }; /* End XCRemoteSwiftPackageReference section */ diff --git a/examples/demo-apps/apple_ios/LLaMA/LLaMA/Application/ContentView.swift b/examples/demo-apps/apple_ios/LLaMA/LLaMA/Application/ContentView.swift index bac1b9ccf28..f47771b8f26 100644 --- a/examples/demo-apps/apple_ios/LLaMA/LLaMA/Application/ContentView.swift +++ b/examples/demo-apps/apple_ios/LLaMA/LLaMA/Application/ContentView.swift @@ -16,6 +16,72 @@ class RunnerHolder: ObservableObject { var llavaRunner: LLaVARunner? 
} +extension UIImage { + func resized(to newSize: CGSize) -> UIImage { + let format = UIGraphicsImageRendererFormat.default() + let renderer = UIGraphicsImageRenderer(size: newSize, format: format) + let image = renderer.image { _ in + draw(in: CGRect(origin: .zero, size: newSize)) + } + return image + } + + func toRGBArray() -> [UInt8]? { + guard let cgImage = self.cgImage else { + NSLog("Failed to get CGImage from UIImage") + return nil + } + + let width = cgImage.width + let height = cgImage.height + let colorSpace = CGColorSpaceCreateDeviceRGB() + let bytesPerPixel = 4 + let bytesPerRow = bytesPerPixel * width + let bitsPerComponent = 8 + let bitmapInfo = CGImageAlphaInfo.premultipliedLast.rawValue + + guard let context = CGContext( + data: nil, + width: width, + height: height, + bitsPerComponent: bitsPerComponent, + bytesPerRow: bytesPerRow, + space: colorSpace, + bitmapInfo: bitmapInfo + ) else { + NSLog("Failed to create CGContext") + return nil + } + + context.draw(cgImage, in: CGRect(x: 0, y: 0, width: width, height: height)) + + guard let pixelBuffer = context.data else { + NSLog("Failed to get pixel data from CGContext") + return nil + } + + let pixelData = pixelBuffer.bindMemory(to: UInt8.self, capacity: width * height * bytesPerPixel) + + var rgbArray = [UInt8](repeating: 0, count: width * height * 3) + + for y in 0.. 2 { - let text = tokens.joined() - let count = tokens.count - tokens = [] - DispatchQueue.main.async { - var message = messages.removeLast() - message.text += text - message.tokenCount += count - message.dateUpdated = Date() - messages.append(message) + var rgbArray: [UInt8]? + let MAX_WIDTH = 336.0 + var newHeight = 0.0 + var imageBuffer: UnsafeMutableRawPointer? + + if let img = selectedImage { + let llava_prompt = "\(text) ASSISTANT" + + newHeight = MAX_WIDTH * img.size.height / img.size.width + let resizedImage = img.resized(to: CGSize(width: MAX_WIDTH, height: newHeight)) + rgbArray = resizedImage.toRGBArray() + imageBuffer = UnsafeMutableRawPointer(mutating: rgbArray) + + try runnerHolder.llavaRunner?.generate(imageBuffer!, width: MAX_WIDTH, height: newHeight, prompt: llava_prompt, sequenceLength: seq_len) { token in + + if token != llava_prompt { + if token == "" { + shouldStopGenerating = true + runnerHolder.runner?.stop() + } else { + tokens.append(token) + if tokens.count > 2 { + let text = tokens.joined() + let count = tokens.count + tokens = [] + DispatchQueue.main.async { + var message = messages.removeLast() + message.text += text + message.tokenCount += count + message.dateUpdated = Date() + messages.append(message) + } + } + if shouldStopGenerating { + runnerHolder.runner?.stop() + } + } } } - if shouldStopGenerating { - runnerHolder.runner?.stop() + } else { + let llama3_prompt = "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\(text)<|eot_id|><|start_header_id|>assistant<|end_header_id|>" + + try runnerHolder.runner?.generate(llama3_prompt, sequenceLength: seq_len) { token in + + NSLog(">>> token={\(token)}") + if token != llama3_prompt && !shouldStopShowingToken { + // hack to fix the issue that extension/llm/runner/text_token_generator.h + // keeps generating after <|eot_id|> + if token == "<|eot_id|>" { + shouldStopShowingToken = true + } else { + tokens.append(token.trimmingCharacters(in: .newlines)) + if tokens.count > 2 { + let text = tokens.joined() + let count = tokens.count + tokens = [] + DispatchQueue.main.async { + var message = messages.removeLast() + message.text += text + message.tokenCount += count + 
message.dateUpdated = Date() + messages.append(message) + } + } + if shouldStopGenerating { + runnerHolder.runner?.stop() + } + } + } } } } catch { @@ -283,3 +499,9 @@ struct ContentView: View { } } } + +extension View { + func hideKeyboard() { + UIApplication.shared.sendAction(#selector(UIResponder.resignFirstResponder), to: nil, from: nil, for: nil) + } +} diff --git a/examples/demo-apps/apple_ios/LLaMA/LLaMA/Application/ImagePicker.swift b/examples/demo-apps/apple_ios/LLaMA/LLaMA/Application/ImagePicker.swift new file mode 100644 index 00000000000..57d71f6686d --- /dev/null +++ b/examples/demo-apps/apple_ios/LLaMA/LLaMA/Application/ImagePicker.swift @@ -0,0 +1,50 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +import SwiftUI +import UIKit + +struct ImagePicker: UIViewControllerRepresentable { + class Coordinator: NSObject, UINavigationControllerDelegate, UIImagePickerControllerDelegate { + let parent: ImagePicker + + init(parent: ImagePicker) { + self.parent = parent + } + + func imagePickerController(_ picker: UIImagePickerController, didFinishPickingMediaWithInfo info: [UIImagePickerController.InfoKey: Any]) { + if let image = info[.originalImage] as? UIImage { + parent.selectedImage = image + } + + parent.presentationMode.wrappedValue.dismiss() + } + + func imagePickerControllerDidCancel(_ picker: UIImagePickerController) { + parent.selectedImage = nil + parent.presentationMode.wrappedValue.dismiss() + } + } + + @Environment(\.presentationMode) var presentationMode + @Binding var selectedImage: UIImage? + var sourceType: UIImagePickerController.SourceType = .photoLibrary + + func makeCoordinator() -> Coordinator { + Coordinator(parent: self) + } + + func makeUIViewController(context: Context) -> UIImagePickerController { + let picker = UIImagePickerController() + picker.delegate = context.coordinator + picker.sourceType = sourceType + return picker + } + + func updateUIViewController(_ uiViewController: UIImagePickerController, context: Context) {} +} diff --git a/examples/demo-apps/apple_ios/LLaMA/LLaMA/Application/Message.swift b/examples/demo-apps/apple_ios/LLaMA/LLaMA/Application/Message.swift index b7e1b88c6aa..400941f496a 100644 --- a/examples/demo-apps/apple_ios/LLaMA/LLaMA/Application/Message.swift +++ b/examples/demo-apps/apple_ios/LLaMA/LLaMA/Application/Message.swift @@ -6,11 +6,12 @@ * LICENSE file in the root directory of this source tree. */ -import Foundation +import UIKit enum MessageType { case prompted - case generated + case llamagenerated + case llavagenerated case info } @@ -21,4 +22,5 @@ struct Message: Identifiable, Equatable { var type: MessageType = .prompted var text = "" var tokenCount = 0 + var image: UIImage? } diff --git a/examples/demo-apps/apple_ios/LLaMA/LLaMA/Application/MessageView.swift b/examples/demo-apps/apple_ios/LLaMA/LLaMA/Application/MessageView.swift index e9ebbe953a2..542a88377b7 100644 --- a/examples/demo-apps/apple_ios/LLaMA/LLaMA/Application/MessageView.swift +++ b/examples/demo-apps/apple_ios/LLaMA/LLaMA/Application/MessageView.swift @@ -19,21 +19,34 @@ struct MessageView: View { .foregroundColor(.secondary) .padding([.leading, .trailing], 10) } else { - VStack(alignment: message.type == .generated ? .leading : .trailing) { - Text(message.type == .generated ? 
"LLaMA" : "Prompt") - .font(.caption) - .foregroundColor(.secondary) - .padding(message.type == .generated ? .trailing : .leading, 20) + VStack(alignment: message.type == .llamagenerated || message.type == .llavagenerated ? .leading : .trailing) { + if message.type == .llamagenerated || message.type == .llavagenerated || message.type == .prompted { + Text(message.type == .llamagenerated ? "Llama" : (message.type == .llavagenerated ? "Llava" : "Prompt")) + .font(.caption) + .foregroundColor(.secondary) + .padding(message.type == .llamagenerated || message.type == .llavagenerated ? .trailing : .leading, 20) + } HStack { - if message.type != .generated { Spacer() } + if message.type != .llamagenerated && message.type != .llavagenerated { Spacer() } if message.text.isEmpty { - ProgressView() - .progressViewStyle(CircularProgressViewStyle()) + if let img = message.image { + Image(uiImage: img) + .resizable() + .scaledToFit() + .frame(maxWidth: 200, maxHeight: 200) + .padding() + .background(Color.gray.opacity(0.2)) + .cornerRadius(8) + .padding(.vertical, 2) + } else { + ProgressView() + .progressViewStyle(CircularProgressViewStyle()) + } } else { Text(message.text) .padding(10) - .foregroundColor(message.type == .generated ? .primary : .white) - .background(message.type == .generated ? Color(UIColor.secondarySystemBackground) : Color.blue) + .foregroundColor(message.type == .llamagenerated || message.type == .llavagenerated ? .primary : .white) + .background(message.type == .llamagenerated || message.type == .llavagenerated ? Color(UIColor.secondarySystemBackground) : Color.blue) .cornerRadius(20) .contextMenu { Button(action: { @@ -44,14 +57,14 @@ struct MessageView: View { } } } - if message.type == .generated { Spacer() } + if message.type == .llamagenerated || message.type == .llavagenerated { Spacer() } } let elapsedTime = message.dateUpdated.timeIntervalSince(message.dateCreated) if elapsedTime > 0 && message.type != .info { Text(String(format: "%.1f t/s", Double(message.tokenCount) / elapsedTime)) .font(.caption) .foregroundColor(.secondary) - .padding(message.type == .generated ? .trailing : .leading, 20) + .padding(message.type == .llamagenerated || message.type == .llavagenerated ? .trailing : .leading, 20) } }.padding([.leading, .trailing], message.type == .info ? 
0 : 10) } diff --git a/examples/demo-apps/apple_ios/LLaMA/LLaMARunner/LLaMARunner/Exported/LLaMARunner.h b/examples/demo-apps/apple_ios/LLaMA/LLaMARunner/LLaMARunner/Exported/LLaMARunner.h index 5f8b3c8449a..c4edea2af93 100644 --- a/examples/demo-apps/apple_ios/LLaMA/LLaMARunner/LLaMARunner/Exported/LLaMARunner.h +++ b/examples/demo-apps/apple_ios/LLaMA/LLaMARunner/LLaMARunner/Exported/LLaMARunner.h @@ -38,7 +38,9 @@ NS_SWIFT_NAME(LLaVARunner) tokenizerPath:(NSString*)tokenizerPath; - (BOOL)isloaded; - (BOOL)loadWithError:(NSError**)error; -- (BOOL)generate:(NSArray*)images +- (BOOL)generate:(void*)imageBuffer + width:(CGFloat)width + height:(CGFloat)height prompt:(NSString*)prompt sequenceLength:(NSInteger)seq_len withTokenCallback:(nullable void (^)(NSString*))callback diff --git a/examples/demo-apps/apple_ios/LLaMA/LLaMARunner/LLaMARunner/Exported/LLaMARunner.mm b/examples/demo-apps/apple_ios/LLaMA/LLaMARunner/LLaMARunner/Exported/LLaMARunner.mm index 9b169c33890..3136d2745fd 100644 --- a/examples/demo-apps/apple_ios/LLaMA/LLaMARunner/LLaMARunner/Exported/LLaMARunner.mm +++ b/examples/demo-apps/apple_ios/LLaMA/LLaMARunner/LLaMARunner/Exported/LLaMARunner.mm @@ -12,7 +12,8 @@ #import #import -using namespace ::torch::executor; +using executorch::extension::llm::Image; +using executorch::runtime::Error; NSErrorDomain const LLaMARunnerErrorDomain = @"LLaMARunnerErrorDomain"; NSErrorDomain const LLaVARunnerErrorDomain = @"LLaVARunnerErrorDomain"; @@ -21,7 +22,7 @@ @interface LLaMARunner () @end @implementation LLaMARunner { - std::unique_ptr _runner; + std::unique_ptr _runner; } - (instancetype)initWithModelPath:(NSString*)modelPath @@ -29,7 +30,7 @@ - (instancetype)initWithModelPath:(NSString*)modelPath self = [super init]; if (self) { [ExecuTorchLog.sharedLog addSink:self]; - _runner = std::make_unique( + _runner = std::make_unique( modelPath.UTF8String, tokenizerPath.UTF8String); } return self; @@ -109,7 +110,7 @@ @interface LLaVARunner () @end @implementation LLaVARunner { - std::unique_ptr _runner; + std::unique_ptr _runner; } - (instancetype)initWithModelPath:(NSString*)modelPath @@ -117,7 +118,7 @@ - (instancetype)initWithModelPath:(NSString*)modelPath self = [super init]; if (self) { [ExecuTorchLog.sharedLog addSink:self]; - _runner = std::make_unique( + _runner = std::make_unique( modelPath.UTF8String, tokenizerPath.UTF8String); } return self; @@ -144,39 +145,27 @@ - (BOOL)loadWithError:(NSError**)error { return YES; } -- (BOOL)generate:(NSArray*)images +- (BOOL)generate:(void*)imageBuffer + width:(CGFloat)width + height:(CGFloat)height prompt:(NSString*)prompt sequenceLength:(NSInteger)seq_len withTokenCallback:(nullable void (^)(NSString*))callback error:(NSError**)error { - std::vector rawImages; - rawImages.reserve(images.count); - - for (UIImage* image in images) { - CGImageRef cgImage = image.CGImage; - const int32_t width = CGImageGetWidth(cgImage); - const int32_t height = CGImageGetHeight(cgImage); - std::vector buffer(height * width * 4); - CGContextRef context = CGBitmapContextCreate( - buffer.data(), - width, - height, - 8, - width * 4, - CGColorSpaceCreateDeviceRGB(), - kCGImageAlphaPremultipliedLast); - CGContextDrawImage(context, CGRectMake(0, 0, width, height), cgImage); - CGContextRelease(context); - rawImages.push_back({std::move(buffer), width, height, 4}); - } + const auto* data = static_cast(imageBuffer); const auto status = _runner->generate( - std::move(rawImages), + {Image{ + std::vector( + data, data + (int32_t)width * (int32_t)height * 3), + 
(int32_t)width, + (int32_t)height, + 3}}, prompt.UTF8String, seq_len, [callback](const std::string& token) { callback(@(token.c_str())); }); if (status != Error::Ok) { if (error) { - *error = [NSError errorWithDomain:LLaVARunnerErrorDomain + *error = [NSError errorWithDomain:LLaMARunnerErrorDomain code:(NSInteger)status userInfo:nil]; return NO; diff --git a/examples/demo-apps/apple_ios/LLaMA/README.md b/examples/demo-apps/apple_ios/LLaMA/README.md index ddd542a0066..c149f0e5db5 100644 --- a/examples/demo-apps/apple_ios/LLaMA/README.md +++ b/examples/demo-apps/apple_ios/LLaMA/README.md @@ -1,52 +1,97 @@ -# Building ExecuTorch LLaMA iOS Demo App +# ExecuTorch Llama iOS Demo App -This app demonstrates the use of the LLaMA chat app demonstrating local inference use case with ExecuTorch. +We’re excited to share that the newly revamped iOS demo app is live and includes many new updates to provide a more intuitive and smoother user experience with a chat use case! The primary goal of this app is to showcase how easily ExecuTorch can be integrated into an iOS demo app and how to exercise the many features ExecuTorch and Llama models have to offer. -## Prerequisites -* [Xcode 15](https://developer.apple.com/xcode) -* [iOS 17 SDK](https://developer.apple.com/ios) -* Set up your ExecuTorch repo and environment if you haven’t done so by following the [Setting up ExecuTorch](https://pytorch.org/executorch/stable/getting-started-setup) to set up the repo and dev environment: +This app serves as a valuable resource to inspire your creativity and provide foundational code that you can customize and adapt for your particular use case. -```bash -git clone -b release/0.2 https://github.com/pytorch/executorch.git -cd executorch -git submodule update --init +Please dive in and start exploring our demo app today! We look forward to any feedback and are excited to see your innovative ideas. -python3 -m venv .venv && source .venv/bin/activate +## Key Concepts +From this demo app, you will learn many key concepts such as: +* How to prepare Llama models, build the ExecuTorch library, and perform model inference across delegates +* Expose the ExecuTorch library via Swift Package Manager +* Familiarity with current ExecuTorch app-facing capabilities -./install_requirements.sh -``` +The goal is for you to see the type of support ExecuTorch provides and feel comfortable with leveraging it for your use cases. + +## Supported Models + +As a whole, the models that this app supports are (varies by delegate): +* Llama 3.2 1B/3B +* Llama 3.1 8B +* Llama 3 8B +* Llama 2 7B +* Llava 1.5 (only XNNPACK) + +## Building the application +First it’s important to note that currently ExecuTorch provides support across several delegates. Once you identify the delegate of your choice, select the README link to get a complete end-to-end instructions for environment set-up to export the models to build ExecuTorch libraries and apps to run on device: -## Exporting models -Please refer to the [ExecuTorch Llama2 docs](https://github.com/pytorch/executorch/blob/main/examples/models/llama2/README.md) to export the model. 
+| Delegate | Resource | +| ------------------------------ | --------------------------------- | +| XNNPACK (CPU-based library) | [link](https://github.com/pytorch/executorch/blob/main/examples/demo-apps/apple_ios/LLaMA/docs/delegates/xnnpack_README.md)| +| MPS (Metal Performance Shader) | [link](https://github.com/pytorch/executorch/blob/main/examples/demo-apps/apple_ios/LLaMA/docs/delegates/mps_README.md) | -## Run the App +## How to Use the App +This section will provide the main steps to use the app, along with a code snippet of the ExecuTorch API. -1. Open the [project](https://github.com/pytorch/executorch/blob/main/examples/demo-apps/apple_ios/LLaMA/LLaMA.xcodeproj) in Xcode. -2. Run the app (cmd+R). -3. In app UI pick a model and tokenizer to use, type a prompt and tap the arrow buton +### Swift Package Manager -```{note} ExecuTorch runtime is distributed as a Swift package providing some .xcframework as prebuilt binary targets. -Xcode will dowload and cache the package on the first run, which will take some time. +Xcode will download and cache the package on the first run, which will take some time. + +Note: If you're running into any issues related to package dependencies, quit Xcode entirely, delete the whole executorch repo, clean the caches by running the command below in terminal and clone the repo again. + ``` +rm -rf \ + ~/Library/org.swift.swiftpm \ + ~/Library/Caches/org.swift.swiftpm \ + ~/Library/Caches/com.apple.dt.Xcode \ + ~/Library/Developer/Xcode/DerivedData +``` + +Link your binary with the ExecuTorch runtime and any backends or kernels used by the exported ML model. It is recommended to link the core runtime to the components that use ExecuTorch directly, and link kernels and backends against the main app target. + +Note: To access logs, link against the Debug build of the ExecuTorch runtime, i.e., the executorch_debug framework. For optimal performance, always link against the Release version of the deliverables (those without the _debug suffix), which have all logging overhead removed. + +For more details integrating and Running ExecuTorch on Apple Platforms, checkout this [link](https://pytorch.org/executorch/main/apple-runtime.html). + +### XCode +* Open XCode and select "Open an existing project" to open `examples/demo-apps/apple_ios/LLama`. +* Ensure that the ExecuTorch package dependencies are installed correctly, then select which ExecuTorch framework should link against which target. + +

+*(Screenshot: iOS LLaMA App Swift PM)*

+ +

+*(Screenshot: iOS LLaMA App Choosing package)*

+ +* Run the app. This builds and launches the app on the phone. +* In the app UI, pick a model and tokenizer to use, type a prompt, and tap the arrow button ## Copy the model to Simulator -1. Drag&drop the model and tokenizer files onto the Simulator window and save them somewhere inside the iLLaMA folder. -2. Pick the files in the app dialog, type a prompt and click the arrow-up button. +* Drag&drop the model and tokenizer files onto the Simulator window and save them somewhere inside the iLLaMA folder. +* Pick the files in the app dialog, type a prompt and click the arrow-up button. ## Copy the model to Device -1. Wire-connect the device and open the contents in Finder. -2. Navigate to the Files tab and drag&drop the model and tokenizer files onto the iLLaMA folder. -3. Wait until the files are copied. +* Wire-connect the device and open the contents in Finder. +* Navigate to the Files tab and drag&drop the model and tokenizer files onto the iLLaMA folder. +* Wait until the files are copied. + +If the app runs successfully on your device, you should see something like below (a sketch of the underlying runner call follows the screenshot): + +

+*(Screenshot: iOS LLaMA App)*
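+
+Under the hood, the app drives generation through the LLaMARunner framework built by this Xcode project. The sketch below is a minimal illustration based on the calls made in ContentView.swift; the paths, the 128-token sequence length, and the `runLlama` helper are placeholders, and the exact Swift signatures (initializer and `load()`) should be verified against LLaMARunner.h.
+
+```swift
+import LLaMARunner  // framework target provided by this Xcode project
+
+// Minimal sketch mirroring ContentView.swift: stream tokens from an exported .pte model.
+func runLlama(modelPath: String, tokenizerPath: String, prompt: String) {
+  let runner = Runner(modelPath: modelPath, tokenizerPath: tokenizerPath)
+  do {
+    try runner.load()  // surface model/tokenizer loading errors before generating
+    try runner.generate(prompt, sequenceLength: 128) { token in
+      // Tokens arrive incrementally; the demo app appends them to the chat view.
+      print(token, terminator: "")
+    }
+  } catch {
+    print("Generation failed: \(error)")
+  }
+}
+```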

+ -Click the image below to see it in action! + - - iOS app running a LlaMA model - +For Llava 1.5 models, you can select an image (via the image/camera selector button) before typing the prompt and tapping the send button. + +

+*(Screenshot: iOS LLaMA App)*

## Reporting Issues If you encountered any bugs or issues following this tutorial please file a bug/issue here on [Github](https://github.com/pytorch/executorch/issues/new). diff --git a/examples/demo-apps/apple_ios/LLaMA/docs/delegates/mps_README.md b/examples/demo-apps/apple_ios/LLaMA/docs/delegates/mps_README.md new file mode 100644 index 00000000000..20ee73b821f --- /dev/null +++ b/examples/demo-apps/apple_ios/LLaMA/docs/delegates/mps_README.md @@ -0,0 +1,114 @@ +# Building Llama iOS Demo for MPS Backend + +This tutorial covers the end to end workflow for building an iOS demo app using MPS backend on device. +More specifically, it covers: +1. Export and quantization of Llama models against the MPS backend. +2. Building and linking libraries that are required to inference on-device for iOS platform using MPS. +3. Building the iOS demo app itself. + +## Prerequisites +* [Xcode 15](https://developer.apple.com/xcode) +* [iOS 18 SDK](https://developer.apple.com/ios) +* Set up your ExecuTorch repo and environment if you haven’t done so by following the [Setting up ExecuTorch](https://pytorch.org/executorch/stable/getting-started-setup) to set up the repo and dev environment: + +## Setup ExecuTorch +In this section, we will need to set up the ExecuTorch repo first with Conda environment management. Make sure you have Conda available in your system (or follow the instructions to install it [here](https://conda.io/projects/conda/en/latest/user-guide/install/index.html)). The commands below are running on Linux (CentOS). + +Create a Conda environment + +``` +conda create -n et_mps python=3.10.0 +conda activate et_mps +``` + +Checkout ExecuTorch repo and sync submodules + +``` +git clone https://github.com/pytorch/executorch.git +cd executorch +git submodule sync +git submodule update --init +``` + +Install dependencies + +``` +./install_requirements.sh +``` + +## Prepare Models +In this demo app, we support text-only inference with Llama 3.1, Llama 3, and Llama 2 models. + +Install the required packages to export the model + +``` +sh examples/models/llama2/install_requirements.sh +``` + +Export the model +``` +python -m examples.models.llama2.export_llama --checkpoint "${MODEL_DIR}/consolidated.00.pth" --params "${MODEL_DIR}/params.json" -kv --use_sdpa_with_kv_cache --mps -d fp32 --disable_dynamic_shape -qmode 8da4w -G 32 +``` + +## Pushing Model and Tokenizer + +### Copy the model to Simulator +* Drag&drop the model and tokenizer files onto the Simulator window and save them somewhere inside the iLLaMA folder. +* Pick the files in the app dialog, type a prompt and click the arrow-up button. + +### Copy the model to Device +* Wire-connect the device and open the contents in Finder. +* Navigate to the Files tab and drag & drop the model and tokenizer files onto the iLLaMA folder. +* Wait until the files are copied. + +## Configure the XCode Project + +### Install CMake +Download and open the macOS .dmg installer at https://cmake.org/download and move the Cmake app to /Applications folder. +Install Cmake command line tools: + +``` +sudo /Applications/CMake.app/Contents/bin/cmake-gui --install +``` + + +### Swift Package Manager +The prebuilt ExecuTorch runtime, backend, and kernels are available as a Swift PM package. + +### Xcode +Open the project in Xcode.In Xcode, go to `File > Add Package Dependencies`. Paste the URL of the ExecuTorch repo into the search bar and select it. 
Make sure to change the branch name to the desired ExecuTorch version, e.g., “0.3.0”, or just use the “latest” branch name for the latest stable build. + +Note: If you're running into any issues related to package dependencies, quit Xcode entirely, delete the whole executorch repo, clean the caches by running the command below in terminal and clone the repo again. + +``` +rm -rf \ + ~/Library/org.swift.swiftpm \ + ~/Library/Caches/org.swift.swiftpm \ + ~/Library/Caches/com.apple.dt.Xcode \ + ~/Library/Developer/Xcode/DerivedData +``` + +Link your binary with the ExecuTorch runtime and any backends or kernels used by the exported ML model. It is recommended to link the core runtime to the components that use ExecuTorch directly, and link kernels and backends against the main app target. + +Note: To access logs, link against the Debug build of the ExecuTorch runtime, i.e., the executorch_debug framework. For optimal performance, always link against the Release version of the deliverables (those without the _debug suffix), which have all logging overhead removed. + +For more details integrating and Running ExecuTorch on Apple Platforms, checkout this [link](https://pytorch.org/executorch/main/apple-runtime.html). + +

+*(Screenshot: iOS LLaMA App Swift PM)*

+ +Then select which ExecuTorch framework should link against which target. + +

+*(Screenshot: iOS LLaMA App Choosing package)*
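+
+If you prefer to consume the package from a SwiftPM manifest rather than the Xcode UI, the hypothetical Package.swift fragment below shows what the linking guidance above can look like in practice. The pinned revision and the product names are taken from the demo app's Xcode project in this change; adjust them for your own app and ExecuTorch version.
+
+```swift
+// swift-tools-version:5.9
+// Hypothetical manifest sketch; the demo app itself adds the package via the Xcode UI.
+import PackageDescription
+
+let package = Package(
+  name: "MyLlamaApp",
+  platforms: [.iOS(.v17)],
+  dependencies: [
+    .package(url: "https://github.com/pytorch/executorch",
+             revision: "bdf3f5a1047c73ef61bb3e956d1d4528de743077")
+  ],
+  targets: [
+    .target(
+      name: "MyLlamaApp",
+      dependencies: [
+        .product(name: "executorch_debug", package: "executorch"),  // core runtime with logging
+        .product(name: "backend_mps", package: "executorch"),       // MPS backend
+        .product(name: "kernels_optimized", package: "executorch")  // optimized CPU kernels
+      ]
+    )
+  ]
+)
+```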

+ +Click “Run” to build the app and run it on your iPhone. If the app runs successfully on your device, you should see something like below: + +

+*(Screenshot: iOS LLaMA App mps)*

+ +## Reporting Issues +If you encountered any bugs or issues following this tutorial please file a bug/issue here on [Github](https://github.com/pytorch/executorch/issues/new). diff --git a/examples/demo-apps/apple_ios/LLaMA/docs/delegates/xnnpack_README.md b/examples/demo-apps/apple_ios/LLaMA/docs/delegates/xnnpack_README.md new file mode 100644 index 00000000000..d7a76da6434 --- /dev/null +++ b/examples/demo-apps/apple_ios/LLaMA/docs/delegates/xnnpack_README.md @@ -0,0 +1,152 @@ +# Building Llama iOS Demo for XNNPACK Backend + +**[UPDATE - 09/25]** We have added support for running [Llama 3.2 models](#for-llama-32-1b-and-3b-models) on the XNNPACK backend. We currently support inference on their original data type (BFloat16). + +This tutorial covers the end to end workflow for building an iOS demo app using XNNPACK backend on device. +More specifically, it covers: +1. Export and quantization of Llama models against the XNNPACK backend. +2. Building and linking libraries that are required to inference on-device for iOS platform using XNNPACK. +3. Building the iOS demo app itself. + +## Prerequisites +* [Xcode 15](https://developer.apple.com/xcode) +* [iOS 17 SDK](https://developer.apple.com/ios) + +## Setup ExecuTorch +In this section, we will need to set up the ExecuTorch repo first with Conda environment management. Make sure you have Conda available in your system (or follow the instructions to install it [here](https://conda.io/projects/conda/en/latest/user-guide/install/index.html)). The commands below are running on Linux (CentOS). + +Create a Conda environment + +``` +conda create -n et_xnnpack python=3.10.0 +conda activate et_xnnpack +``` + +Checkout ExecuTorch repo and sync submodules + +``` +git clone https://github.com/pytorch/executorch.git +cd executorch +git submodule sync +git submodule update --init +``` + +Install dependencies + +``` +./install_requirements.sh +``` + +## Prepare Models +In this demo app, we support text-only inference with up-to-date Llama models. + +Install the required packages to export the model + +``` +sh examples/models/llama2/install_requirements.sh +``` + +### For Llama 3.2 1B and 3B models +We have supported BFloat16 as a data type on the XNNPACK backend for Llama 3.2 1B/3B models. +* You can download original model weights for Llama through Meta official [website](https://llama.meta.com/). +* For chat use-cases, download the instruct models instead of pretrained. +* Run “examples/models/llama2/install_requirements.sh” to install dependencies. +* The 1B model in BFloat16 format can run on mobile devices with 8GB RAM (iPhone 15 Pro and later). The 3B model will require 12GB+ RAM and hence will not fit on 8GB RAM phones. +* Export Llama model and generate .pte file as below: + +``` +python -m examples.models.llama2.export_llama --checkpoint --params -kv --use_sdpa_with_kv_cache -X -d bf16 --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --output_name="llama3_2.pte" +``` + +For more detail using Llama 3.2 lightweight models including prompt template, please go to our official [website](https://www.llama.com/docs/model-cards-and-prompt-formats/llama3_2#-llama-3.2-lightweight-models-(1b/3b)-). 
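+
+As a concrete illustration of the prompt template, the demo app's ContentView.swift (added in this change) wraps the user's text in the Llama 3 instruct format before calling the runner. A rough sketch, with the user text as a placeholder:
+
+```swift
+// Llama 3 instruct wrapping as constructed in ContentView.swift; the special tokens
+// must match the tokenizer, so consult the official prompt-format docs for details.
+let userText = "What is the capital of France?"
+let llama3Prompt =
+  "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\(userText)<|eot_id|>" +
+  "<|start_header_id|>assistant<|end_header_id|>"
+```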
+ +### For Llama 3.1 and Llama 2 models + +Export the model +``` +python -m examples.models.llama2.export_llama --checkpoint -p -kv --use_sdpa_with_kv_cache -X -qmode 8da4w --group_size 128 -d fp32 --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --embedding-quantize 4,32 --output_name="llama3_kv_sdpa_xnn_qe_4_32.pte" +``` + +### For LLaVA model +* For the Llava 1.5 model, you can get it from Huggingface [here](https://huggingface.co/llava-hf/llava-1.5-7b-hf). +* Run `examples/models/llava/install_requirements.sh` to install dependencies. +* Run the following command to generate llava.pte, tokenizer.bin and an image tensor (serialized in TorchScript) image.pt. + +``` +python -m executorch.examples.models.llava.export_llava --pte-name llava.pte --with-artifacts +``` +* You can find more information [here](https://github.com/pytorch/executorch/tree/main/examples/models/llava). + + +## Configure the XCode Project + +### Install CMake +Download and open the macOS .dmg installer at https://cmake.org/download and move the Cmake app to /Applications folder. +Install Cmake command line tools: + +``` +sudo /Applications/CMake.app/Contents/bin/cmake-gui --install +``` + + +### Swift Package Manager +The prebuilt ExecuTorch runtime, backend, and kernels are available as a Swift PM package. + +### Xcode +Open the project in Xcode.In Xcode, go to `File > Add Package Dependencies`. Paste the URL of the ExecuTorch repo into the search bar and select it. Make sure to change the branch name to the desired ExecuTorch version, e.g., “0.3.0”, or just use the “latest” branch name for the latest stable build. + +Note: If you're running into any issues related to package dependencies, quit Xcode entirely, delete the whole executorch repo, clean the caches by running the command below in terminal and clone the repo again. + +``` +rm -rf \ + ~/Library/org.swift.swiftpm \ + ~/Library/Caches/org.swift.swiftpm \ + ~/Library/Caches/com.apple.dt.Xcode \ + ~/Library/Developer/Xcode/DerivedData +``` + +Link your binary with the ExecuTorch runtime and any backends or kernels used by the exported ML model. It is recommended to link the core runtime to the components that use ExecuTorch directly, and link kernels and backends against the main app target. + +Note: To access logs, link against the Debug build of the ExecuTorch runtime, i.e., the executorch_debug framework. For optimal performance, always link against the Release version of the deliverables (those without the _debug suffix), which have all logging overhead removed. + +For more details integrating and Running ExecuTorch on Apple Platforms, checkout this [link](https://pytorch.org/executorch/main/apple-runtime.html). + +

+*(Screenshot: iOS LLaMA App Swift PM)*

+ +Then select which ExecuTorch framework should link against which target. + +

+*(Screenshot: iOS LLaMA App Choosing package)*

+ +Click “Run” to build the app and run it on your iPhone. + +## Pushing Model and Tokenizer + +### Copy the model to Simulator +* Drag&drop the model and tokenizer files onto the Simulator window and save them somewhere inside the iLLaMA folder. +* Pick the files in the app dialog, type a prompt and click the arrow-up button. + +### Copy the model to Device +* Wire-connect the device and open the contents in Finder. +* Navigate to the Files tab and drag & drop the model and tokenizer files onto the iLLaMA folder. +* Wait until the files are copied. + +Open the iLLaMA app and click the settings button at the top left of the app to select the model and tokenizer files. When the app successfully runs on your device, you should see something like below: + +

+*(Screenshot: iOS LLaMA App)*

+ + + +For Llava 1.5 models, you can select an image (via the image/camera selector button) before typing the prompt and tapping the send button; a sketch of how the app passes the image to the LLaVA runner follows the screenshot below. + +

+*(Screenshot: iOS LLaMA App)*
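+
+The sketch below shows roughly how the demo app hands a picked image to the LLaVA runner: ContentView.swift resizes the image to 336 pixels wide, converts it to a packed RGB byte array, and passes it to the runner together with a Llava-style prompt. The helper names mirror that file; the `describeImage` wrapper and the sequence length are placeholders, and the byte buffer is kept alive here with `withUnsafeMutableBytes`.
+
+```swift
+import UIKit
+import LLaMARunner  // framework target provided by this Xcode project
+
+// Mirrors the LLaVA path in ContentView.swift; resized(to:) and toRGBArray() are the
+// UIImage helpers defined there (packed RGB, 3 bytes per pixel).
+func describeImage(_ image: UIImage, userText: String, with llavaRunner: LLaVARunner) throws {
+  let maxWidth = 336.0
+  let newHeight = maxWidth * image.size.height / image.size.width
+  let resized = image.resized(to: CGSize(width: maxWidth, height: newHeight))
+  guard var rgb = resized.toRGBArray() else { return }
+  let llavaPrompt = "\(userText) ASSISTANT"  // prompt suffix used by the demo app
+  try rgb.withUnsafeMutableBytes { buffer in
+    try llavaRunner.generate(buffer.baseAddress!, width: maxWidth, height: newHeight,
+                             prompt: llavaPrompt, sequenceLength: 768) { token in
+      print(token, terminator: "")
+    }
+  }
+}
+```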

+ +## Reporting Issues +If you encountered any bugs or issues following this tutorial please file a bug/issue here on [Github](https://github.com/pytorch/executorch/issues/new). diff --git a/examples/models/flamingo/__init__.py b/examples/demo-apps/apple_ios/LLaMA/llama31.png similarity index 100% rename from examples/models/flamingo/__init__.py rename to examples/demo-apps/apple_ios/LLaMA/llama31.png diff --git a/examples/models/flamingo/preprocess/__init__.py b/examples/demo-apps/apple_ios/LLaMA/llava.png similarity index 100% rename from examples/models/flamingo/preprocess/__init__.py rename to examples/demo-apps/apple_ios/LLaMA/llava.png diff --git a/examples/sdk/CMakeLists.txt b/examples/devtools/CMakeLists.txt similarity index 83% rename from examples/sdk/CMakeLists.txt rename to examples/devtools/CMakeLists.txt index 29248d10738..7ed5232ba41 100644 --- a/examples/sdk/CMakeLists.txt +++ b/examples/devtools/CMakeLists.txt @@ -4,10 +4,11 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. -# Example CMakeLists.txt for building executor_runner with sdk support. In this -# example we link sdk and bundled_program libraries into executor_runner binary +# Example CMakeLists.txt for building executor_runner with Developer Tools +# support. In this example we link devtools and bundled_program libraries into +# executor_runner binary cmake_minimum_required(VERSION 3.19) -project(sdk_example) +project(devtools_example) option(EXECUTORCH_BUILD_COREML "Build the Core ML backend" OFF) @@ -45,7 +46,7 @@ find_package( gflags REQUIRED PATHS ${CMAKE_CURRENT_BINARY_DIR}/../../third-party ) -add_executable(sdk_example_runner sdk_example_runner/sdk_example_runner.cpp) +add_executable(example_runner example_runner/example_runner.cpp) target_compile_options(executorch INTERFACE -DET_EVENT_TRACER_ENABLED) target_include_directories( @@ -53,7 +54,7 @@ target_include_directories( ${EXECUTORCH_ROOT}/third-party/flatcc/include ) target_link_libraries( - sdk_example_runner + example_runner executorch gflags etdump @@ -80,10 +81,12 @@ if(EXECUTORCH_BUILD_COREML) NO_DEFAULT_PATH ) - target_link_libraries(sdk_example_runner "-Wl,-force_load" coremldelegate) + target_link_libraries( + example_runner "-Wl,-force_load" coremldelegate + ) target_link_libraries( - sdk_example_runner ${PROTOBUF_LITE} ${ACCELERATE_FRAMEWORK} + example_runner ${PROTOBUF_LITE} ${ACCELERATE_FRAMEWORK} ${COREML_FRAMEWORK} ${FOUNDATION_FRAMEWORK} ${SQLITE_LIBRARY} ) endif() diff --git a/examples/sdk/README.md b/examples/devtools/README.md similarity index 71% rename from examples/sdk/README.md rename to examples/devtools/README.md index 096f90864e7..c06d3eac3fc 100644 --- a/examples/sdk/README.md +++ b/examples/devtools/README.md @@ -1,43 +1,41 @@ -# SDK Examples +# Developer Tools Examples This directory contains examples of BundledProgram and ETDump generation. ## Directory structure ```bash -examples/sdk +examples/devtools ├── scripts # Python scripts to illustrate export workflow of bundled program. -├── sdk_executor_runner # Contains an example for both BundledProgram to verify ExecuTorch model, and generate ETDump for runtime results. +├── executor_runner # Contains an example for both BundledProgram to verify ExecuTorch model, and generate ETDump for runtime results. 
└── README.md # Current file ``` ## BundledProgram -We will use an example model (in `torch.nn.Module`) and its representative inputs, both from [`models/`](../models) directory, to generate a [BundledProgram(`.bpte`)](../../docs/source/sdk-bundled-io.md) file using the [script](scripts/export_bundled_program.py). Then we will use [sdk_example_runner](sdk_example_runner/sdk_example_runner.cpp) to execute the `.bpte` model on the ExecuTorch runtime and verify the model on BundledProgram API. +We will use an example model (in `torch.nn.Module`) and its representative inputs, both from [`models/`](../models) directory, to generate a [BundledProgram(`.bpte`)](../../docs/source/bundled-io.md) file using the [script](scripts/export_bundled_program.py). Then we will use [devtools/example_runner](example_runner/example_runner.cpp) to execute the `.bpte` model on the ExecuTorch runtime and verify the model on BundledProgram API. 1. Sets up the basic development environment for ExecuTorch by [Setting up ExecuTorch from GitHub](https://pytorch.org/executorch/stable/getting-started-setup). -2. Using the [script](scripts/export_bundled_program.py) to generate a BundledProgram binary file by retreiving a `torch.nn.Module` model and its representative inputs from the list of available models in the [`models/`](../models) dir。 +2. Using the [script](scripts/export_bundled_program.py) to generate a BundledProgram binary file by retreiving a `torch.nn.Module` model and its representative inputs from the list of available models in the [`models/`](../models) dir. ```bash cd executorch # To the top level dir # To get a list of example models -python3 -m examples.sdk.scripts.export_bundled_program -h +python3 -m examples.devtools.scripts.export_bundled_program -h # To generate a specific `.bpte` model -python3 -m examples.sdk.scripts.export_bundled_program -m mv2 # for MobileNetv2 +python3 -m examples.devtools.scripts.export_bundled_program -m mv2 # for MobileNetv2 # This should generate ./mv2_bundled.bpte file, if successful. ``` -3. Once we have the BundledProgram binary (`.bpte`) file, then let's run and verify it with ExecuTorch runtime and BundledProgram APIs using the [sdk_example_runner](sdk_example_runner/sdk_example_runner.cpp). +3. Once we have the BundledProgram binary (`.bpte`) file, then let's run and verify it with ExecuTorch runtime and BundledProgram APIs using the [devtools/example_runner](example_runner/example_runner.cpp). ```bash cd executorch - rm -rf cmake-out && mkdir cmake-out && cd cmake-out && cmake -DEXECUTORCH_BUILD_SDK=1 -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=1 .. - cd .. - cmake --build cmake-out -j8 -t sdk_example_runner - ./cmake-out/examples/sdk/sdk_example_runner --bundled_program_path mv2_bundled.bpte --output_verification + ./examples/devtools/build_example_runner.sh + ./cmake-out/examples/devtools/example_runner --bundled_program_path mv2_bundled.bpte --output_verification ``` @@ -51,7 +49,7 @@ We offer an example runner that accepts a `BundledProgram` (`.bpte`) and runs a Running the program will generate an `ETDump` file (`.etdp`) at the location specified by `--etdump_path`. Make sure to build the program as specified below to enable the event tracer. 
```bash - ./cmake-out/examples/sdk/sdk_example_runner --bundled_program_path mv2_bundled.bpte --etdump_path mv2_etdump.etdp + ./cmake-out/examples/devtools/example_runner --bundled_program_path mv2_bundled.bpte --etdump_path mv2_etdump.etdp ``` ### Parsing ETDump @@ -66,7 +64,7 @@ Once an `ETDump` has been generated, it can be viewed using the CLI inspector. T ETDump profiling can also be used in a custom C++ program. `ETDumpGen` is an implementation of the abstract `EventTracer` class. Include the header file located at `devtools/etdump/etdump_flatcc.h`. To initialize the ETDump generator, construct it before loading the method from the program. ```cpp - torch::executor::ETDumpGen etdump_gen = torch::executor::ETDumpGen(); + executorch::etdump::ETDumpGen etdump_gen; Result method = program->load_method(method_name, &memory_manager, &etdump_gen); ``` diff --git a/examples/sdk/build_sdk_example_runner.sh b/examples/devtools/build_example_runner.sh similarity index 82% rename from examples/sdk/build_sdk_example_runner.sh rename to examples/devtools/build_example_runner.sh index be0e61cef79..9f35abb1a35 100755 --- a/examples/sdk/build_sdk_example_runner.sh +++ b/examples/devtools/build_example_runner.sh @@ -5,7 +5,7 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. -# Builds sdk_example_runner and prints its path. +# Builds example_runner and prints its path. set -euo pipefail @@ -20,9 +20,9 @@ export CMAKE_BUILD_PARALLEL_LEVEL="${CMAKE_BUILD_PARALLEL_LEVEL:-9}" BUILD_COREML=OFF usage() { - echo "Builds sdk example runner." + echo "Builds example runner." echo "Options:" - echo " --coreml Include this flag to enable Core ML backend when building the SDK." + echo " --coreml Include this flag to enable Core ML backend when building the Developer Tools." exit 0 } @@ -42,7 +42,7 @@ main() { if [[ "${BUILD_COREML}" == "ON" ]]; then cmake -DCMAKE_INSTALL_PREFIX=cmake-out \ -DCMAKE_BUILD_TYPE=Release \ - -DEXECUTORCH_BUILD_SDK=ON \ + -DEXECUTORCH_BUILD_DEVTOOLS=ON \ -DEXECUTORCH_ENABLE_EVENT_TRACER=ON \ -DEXECUTORCH_BUILD_COREML=ON \ -Dprotobuf_BUILD_TESTS=OFF \ @@ -52,14 +52,14 @@ main() { else cmake -DCMAKE_INSTALL_PREFIX=cmake-out \ -DCMAKE_BUILD_TYPE=Release \ - -DEXECUTORCH_BUILD_SDK=ON \ + -DEXECUTORCH_BUILD_DEVTOOLS=ON \ -DEXECUTORCH_ENABLE_EVENT_TRACER=ON \ -Bcmake-out . fi cmake --build cmake-out --target install --config Release - local example_dir=examples/sdk + local example_dir=examples/devtools local build_dir="cmake-out/${example_dir}" local cmake_prefix_path="${PWD}/cmake-out/lib/cmake/ExecuTorch;${PWD}/cmake-out/third-party/gflags" rm -rf ${build_dir} @@ -70,12 +70,12 @@ main() { "${example_dir}" cmake --build "${build_dir}" --config Release - local runner="${PWD}/${build_dir}/sdk_example_runner" + local runner="${PWD}/${build_dir}/example_runner" if [[ ! -f "${runner}" ]]; then - echo "ERROR: Failed to build ${build_dir}/sdk_example_runner" >&2 + echo "ERROR: Failed to build ${build_dir}/example_runner" >&2 exit 1 else - echo "Built ${build_dir}/sdk_example_runner" + echo "Built ${build_dir}/example_runner" fi } diff --git a/examples/devtools/example_runner/TARGETS b/examples/devtools/example_runner/TARGETS new file mode 100644 index 00000000000..2341af9282f --- /dev/null +++ b/examples/devtools/example_runner/TARGETS @@ -0,0 +1,8 @@ +# Any targets that should be shared between fbcode and xplat must be defined in +# targets.bzl. This file can contain fbcode-only targets. 
+ +load(":targets.bzl", "define_common_targets") + +oncall("executorch") + +define_common_targets() diff --git a/examples/sdk/sdk_example_runner/sdk_example_runner.cpp b/examples/devtools/example_runner/example_runner.cpp similarity index 88% rename from examples/sdk/sdk_example_runner/sdk_example_runner.cpp rename to examples/devtools/example_runner/example_runner.cpp index fc47d17f42b..1aae0f2a98f 100644 --- a/examples/sdk/sdk_example_runner/sdk_example_runner.cpp +++ b/examples/devtools/example_runner/example_runner.cpp @@ -30,9 +30,7 @@ #include #include -static uint8_t method_allocator_pool[4 * 1024U * 1024U]; // 4MB -static constexpr size_t kBundledAllocatorPoolSize = 16 * 1024U; -static uint8_t bundled_allocator_pool[kBundledAllocatorPoolSize]; +static std::array method_allocator_pool; // 4MB DEFINE_string( bundled_program_path, @@ -77,7 +75,20 @@ DEFINE_int32( 262144, // 256 KB "Size of the debug buffer in bytes to allocate for intermediate outputs and program outputs logging."); -using namespace torch::executor; +using executorch::etdump::ETDumpGen; +using executorch::etdump::ETDumpResult; +using executorch::extension::BufferDataLoader; +using executorch::runtime::Error; +using executorch::runtime::EValue; +using executorch::runtime::EventTracerDebugLogLevel; +using executorch::runtime::HierarchicalAllocator; +using executorch::runtime::MemoryAllocator; +using executorch::runtime::MemoryManager; +using executorch::runtime::Method; +using executorch::runtime::MethodMeta; +using executorch::runtime::Program; +using executorch::runtime::Result; +using executorch::runtime::Span; std::vector load_file_or_die(const char* path) { std::ifstream file(path, std::ios::binary | std::ios::ate); @@ -92,7 +103,7 @@ std::vector load_file_or_die(const char* path) { } int main(int argc, char** argv) { - runtime_init(); + executorch::runtime::runtime_init(); gflags::ParseCommandLineFlags(&argc, &argv, true); if (argc != 1) { @@ -111,19 +122,18 @@ int main(int argc, char** argv) { // Find the offset to the embedded Program. const void* program_data; size_t program_data_len; - Error status = torch::executor::bundled_program::GetProgramData( + Error status = executorch::bundled_program::get_program_data( reinterpret_cast(file_data.data()), file_data.size(), &program_data, &program_data_len); ET_CHECK_MSG( status == Error::Ok, - "GetProgramData() failed on file '%s': 0x%x", + "get_program_data() failed on file '%s': 0x%x", bundled_program_path, (unsigned int)status); - auto buffer_data_loader = - util::BufferDataLoader(program_data, program_data_len); + auto buffer_data_loader = BufferDataLoader(program_data, program_data_len); // Parse the program file. This is immutable, and can also be reused // between multiple execution invocations across multiple threads. @@ -170,8 +180,8 @@ int main(int argc, char** argv) { // MallocMemoryAllocator). // // In this example we use a statically allocated memory pool. - MemoryAllocator method_allocator{ - MemoryAllocator(sizeof(method_allocator_pool), method_allocator_pool)}; + MemoryAllocator method_allocator{MemoryAllocator( + sizeof(method_allocator_pool), method_allocator_pool.data())}; // The memory-planned buffers will back the mutable tensors used by the // method. The sizes of these buffers were determined ahead of time during the @@ -204,7 +214,7 @@ int main(int argc, char** argv) { // the method can mutate the memory-planned buffers, so the method should only // be used by a single thread at at time, but it can be reused. 
// - torch::executor::ETDumpGen etdump_gen = torch::executor::ETDumpGen(); + ETDumpGen etdump_gen; Result method = program->load_method(method_name, &memory_manager, &etdump_gen); ET_CHECK_MSG( @@ -227,7 +237,7 @@ int main(int argc, char** argv) { EventTracerDebugLogLevel::kProgramOutputs); } // Use the inputs embedded in the bundled program. - status = torch::executor::bundled_program::LoadBundledInput( + status = executorch::bundled_program::load_bundled_input( *method, file_data.data(), FLAGS_testset_idx); ET_CHECK_MSG( status == Error::Ok, @@ -264,7 +274,7 @@ int main(int argc, char** argv) { // Dump the etdump data containing profiling/debugging data to the specified // file. - etdump_result result = etdump_gen.get_etdump_data(); + ETDumpResult result = etdump_gen.get_etdump_data(); if (result.buf != nullptr && result.size > 0) { FILE* f = fopen(FLAGS_etdump_path.c_str(), "w+"); fwrite((uint8_t*)result.buf, 1, result.size, f); @@ -274,14 +284,13 @@ int main(int argc, char** argv) { if (FLAGS_output_verification) { // Verify the outputs. - status = - torch::executor::bundled_program::VerifyResultWithBundledExpectedOutput( - *method, - file_data.data(), - FLAGS_testset_idx, - 1e-3, // rtol - 1e-5 // atol - ); + status = executorch::bundled_program::verify_method_outputs( + *method, + file_data.data(), + FLAGS_testset_idx, + 1e-3, // rtol + 1e-5 // atol + ); ET_CHECK_MSG( status == Error::Ok, "Bundle verification failed with status 0x%" PRIx32, diff --git a/examples/sdk/sdk_example_runner/targets.bzl b/examples/devtools/example_runner/targets.bzl similarity index 93% rename from examples/sdk/sdk_example_runner/targets.bzl rename to examples/devtools/example_runner/targets.bzl index 680bdacc40c..6faf53173d5 100644 --- a/examples/sdk/sdk_example_runner/targets.bzl +++ b/examples/devtools/example_runner/targets.bzl @@ -9,9 +9,9 @@ def define_common_targets(): # Test driver for models with bundled inputs. 
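For reference, the renamed runner keeps the same command-line surface, so output verification and ETDump collection can be combined in a single run. A minimal sketch using only the flags that appear above (`--bundled_program_path`, `--testset_idx`, `--output_verification`, `--etdump_path`); the file names come from the BundledProgram walkthrough and may differ in your setup:

```bash
# Run test set 0 of the bundled program, check the outputs against the bundled
# expected outputs, and write profiling events to mv2_etdump.etdp in one pass.
./cmake-out/examples/devtools/example_runner \
  --bundled_program_path mv2_bundled.bpte \
  --testset_idx 0 \
  --output_verification \
  --etdump_path mv2_etdump.etdp
```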
runtime.cxx_binary( - name = "sdk_example_runner", + name = "example_runner", srcs = [ - "sdk_example_runner.cpp", + "example_runner.cpp", ], deps = [ "//executorch/runtime/executor/test:test_backend_compiler_lib", diff --git a/examples/sdk/scripts/etrecord.bin b/examples/devtools/scripts/etrecord.bin similarity index 100% rename from examples/sdk/scripts/etrecord.bin rename to examples/devtools/scripts/etrecord.bin diff --git a/examples/sdk/scripts/export_bundled_program.py b/examples/devtools/scripts/export_bundled_program.py similarity index 99% rename from examples/sdk/scripts/export_bundled_program.py rename to examples/devtools/scripts/export_bundled_program.py index 052f5e99629..143a7b0e666 100644 --- a/examples/sdk/scripts/export_bundled_program.py +++ b/examples/devtools/scripts/export_bundled_program.py @@ -6,6 +6,8 @@ # Example script for exporting simple models to flatbuffer +# pyre-unsafe + import argparse from typing import List diff --git a/examples/sdk/scripts/gen_sample_etrecord.py b/examples/devtools/scripts/gen_sample_etrecord.py similarity index 100% rename from examples/sdk/scripts/gen_sample_etrecord.py rename to examples/devtools/scripts/gen_sample_etrecord.py diff --git a/examples/sdk/test_sdk_example_runner.sh b/examples/devtools/test_example_runner.sh similarity index 72% rename from examples/sdk/test_sdk_example_runner.sh rename to examples/devtools/test_example_runner.sh index 5185def6552..9c9ed782cbe 100644 --- a/examples/sdk/test_sdk_example_runner.sh +++ b/examples/devtools/test_example_runner.sh @@ -5,7 +5,7 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. -# Test the end-to-end flow of building sdk_example_runner and use it to run +# Test the end-to-end flow of building devtools/example_runner and use it to run # an actual model. @@ -14,23 +14,23 @@ set -e # shellcheck source=/dev/null source "$(dirname "${BASH_SOURCE[0]}")/../../.ci/scripts/utils.sh" -cmake_install_executorch_sdk_lib() { +cmake_install_executorch_devtools_lib() { echo "Installing libexecutorch.a, libportable_kernels.a, libetdump.a, libbundled_program.a" rm -rf cmake-out retry cmake -DCMAKE_INSTALL_PREFIX=cmake-out \ -DCMAKE_BUILD_TYPE=Release \ - -DEXECUTORCH_BUILD_SDK=ON \ + -DEXECUTORCH_BUILD_DEVTOOLS=ON \ -DEXECUTORCH_ENABLE_EVENT_TRACER=ON \ -DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" \ -Bcmake-out . 
cmake --build cmake-out -j9 --target install --config Release } -test_cmake_sdk_example_runner() { +test_cmake_devtools_example_runner() { echo "Exporting MobilenetV2" - ${PYTHON_EXECUTABLE} -m examples.sdk.scripts.export_bundled_program --model_name="mv2" - local example_dir=examples/sdk + ${PYTHON_EXECUTABLE} -m examples.devtools.scripts.export_bundled_program --model_name="mv2" + local example_dir=examples/devtools local build_dir=cmake-out/${example_dir} CMAKE_PREFIX_PATH="${PWD}/cmake-out/lib/cmake/ExecuTorch;${PWD}/cmake-out/third-party/gflags" rm -rf ${build_dir} @@ -44,8 +44,8 @@ test_cmake_sdk_example_runner() { echo "Building ${example_dir}" cmake --build ${build_dir} -j9 --config Release - echo 'Running sdk_example_runner' - ${build_dir}/sdk_example_runner --bundled_program_path="./mv2_bundled.bpte" + echo 'Running example_runner' + ${build_dir}/example_runner --bundled_program_path="./mv2_bundled.bpte" } if [[ -z $PYTHON_EXECUTABLE ]]; @@ -58,5 +58,5 @@ then BUCK=buck2 fi -cmake_install_executorch_sdk_lib -test_cmake_sdk_example_runner +cmake_install_executorch_devtools_lib +test_cmake_devtools_example_runner diff --git a/examples/mediatek/README.md b/examples/mediatek/README.md index 9727f2587fd..17b73ce9372 100644 --- a/examples/mediatek/README.md +++ b/examples/mediatek/README.md @@ -23,30 +23,18 @@ examples/mediatek ├── mtk_build_examples.sh # Script for building MediaTek backend and the examples └── README.md # Documentation for the examples (this file) ``` -# Examples -## Build MediaTek examples -1. Set up the environment by folllowing the instructions in `backends/mediatek/scripts` -2. Build the backend and the examples by exedcuting the script: -```bash -./mtk_build_examples.sh -``` +# Examples Build Instructions -# AoT ## Environment Setup -1. Setup ET Environment -- Follow the instructions found in: https://pytorch.org/executorch/stable/getting-started-setup.html -2. Setup MTK AoT Environment -```bash -// Ensure that you are inside executorch/examples/mediatek directory -pip3 install -r requirements.txt +- Follow the instructions of **Prerequisites** and **Setup** in `backends/mediatek/scripts/README.md`. -// Download the two whl files from NeuroPilot Portal -pip3 install mtk_neuron-8.2.2-py3-none-linux_x86_64.whl -pip3 install mtk_converter-8.8.0.dev20240723+public.d1467db9-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl +## Build MediaTek Examples +1. Build the backend and the examples by exedcuting the script: +```bash +./mtk_build_examples.sh ``` -## AoT Flow -### llama +## LLaMa Example Instructions ##### Note: Verify that localhost connection is available before running AoT Flow 1. 
Exporting Models to `.pte` - In the `examples/mediatek directory`, run: diff --git a/examples/mediatek/aot_utils/oss_utils/utils.py b/examples/mediatek/aot_utils/oss_utils/utils.py index f447b2ac68f..8b4de4aac3a 100755 --- a/examples/mediatek/aot_utils/oss_utils/utils.py +++ b/examples/mediatek/aot_utils/oss_utils/utils.py @@ -58,9 +58,8 @@ def build_executorch_binary( partitioner=[neuro_partitioner], ) - exec_prog = edge_prog.to_executorch( - config=exir.ExecutorchBackendConfig(extract_constant_segment=False) - ) + exec_prog = edge_prog.to_executorch(config=exir.ExecutorchBackendConfig()) + with open(f"{file_name}.pte", "wb") as file: file.write(exec_prog.buffer) diff --git a/examples/mediatek/executor_runner/llama_runner/FileMemMapper.h b/examples/mediatek/executor_runner/llama_runner/FileMemMapper.h index 5fc09428db8..1382e14d3e1 100644 --- a/examples/mediatek/executor_runner/llama_runner/FileMemMapper.h +++ b/examples/mediatek/executor_runner/llama_runner/FileMemMapper.h @@ -15,7 +15,7 @@ #include #include -namespace torch::executor { +namespace example { class FileMemMapper { // Read-only mmap public: @@ -97,4 +97,4 @@ class FileMemMapper { // Read-only mmap size_t mSize = 0; }; -} // namespace torch::executor \ No newline at end of file +} // namespace example diff --git a/examples/mediatek/executor_runner/llama_runner/LlamaConfig.h b/examples/mediatek/executor_runner/llama_runner/LlamaConfig.h index 5465299b32d..f512d59b5c5 100644 --- a/examples/mediatek/executor_runner/llama_runner/LlamaConfig.h +++ b/examples/mediatek/executor_runner/llama_runner/LlamaConfig.h @@ -13,7 +13,7 @@ #include "llm_helper/include/llm_types.h" -namespace torch::executor { +namespace example { using llm_helper::LLMType; @@ -42,4 +42,4 @@ struct LlamaModelPaths { std::vector gen_model_paths; }; -} // namespace torch::executor \ No newline at end of file +} // namespace example diff --git a/examples/mediatek/executor_runner/llama_runner/LlamaModelChunk.cpp b/examples/mediatek/executor_runner/llama_runner/LlamaModelChunk.cpp index 1757c63fe21..288ee7105a3 100644 --- a/examples/mediatek/executor_runner/llama_runner/LlamaModelChunk.cpp +++ b/examples/mediatek/executor_runner/llama_runner/LlamaModelChunk.cpp @@ -26,7 +26,7 @@ #include "llm_helper/include/mask_builder.h" #include "llm_helper/include/rotary_embedding.h" -namespace torch::executor { +namespace example { inline std::vector getIndexRange( const size_t startIndex, @@ -353,4 +353,4 @@ void LlamaModelChunk::InitCache() { } } -} // namespace torch::executor +} // namespace example diff --git a/examples/mediatek/executor_runner/llama_runner/LlamaModelChunk.h b/examples/mediatek/executor_runner/llama_runner/LlamaModelChunk.h index c8955378cbf..0a5002199db 100644 --- a/examples/mediatek/executor_runner/llama_runner/LlamaModelChunk.h +++ b/examples/mediatek/executor_runner/llama_runner/LlamaModelChunk.h @@ -27,12 +27,12 @@ #include "llm_helper/include/mask_builder.h" #include "llm_helper/include/rotary_embedding.h" -namespace torch::executor { +namespace example { using llm_helper::MaskBuilder; using llm_helper::RotaryEmbeddingMasterLut; -using TensorShape = Span; +using TensorShape = executorch::runtime::Span; using ModelIndexMap = std::unordered_map; // Llama decoder chunk @@ -135,4 +135,4 @@ class LlamaModelChunk : public ModelChunk { size_t mCurrentTokenIndex = 0; }; -} // namespace torch::executor +} // namespace example diff --git a/examples/mediatek/executor_runner/llama_runner/LlamaRuntime.cpp 
b/examples/mediatek/executor_runner/llama_runner/LlamaRuntime.cpp index 8a12ce90ecb..fd033b031bb 100644 --- a/examples/mediatek/executor_runner/llama_runner/LlamaRuntime.cpp +++ b/examples/mediatek/executor_runner/llama_runner/LlamaRuntime.cpp @@ -18,7 +18,7 @@ #include "llm_helper/include/rotary_embedding.h" #include "llm_helper/include/token_embedding.h" -namespace torch::executor { +namespace example { void LlamaRuntime::Initialize( const LlamaModelOptions& modelOptions, @@ -239,4 +239,4 @@ const LlamaModelOptions& LlamaRuntime::GetModelOptions() const { return mModelOptions; } -} // namespace torch::executor +} // namespace example diff --git a/examples/mediatek/executor_runner/llama_runner/LlamaRuntime.h b/examples/mediatek/executor_runner/llama_runner/LlamaRuntime.h index d788e73dcaa..fc2fca2e105 100644 --- a/examples/mediatek/executor_runner/llama_runner/LlamaRuntime.h +++ b/examples/mediatek/executor_runner/llama_runner/LlamaRuntime.h @@ -20,7 +20,7 @@ #include "llm_helper/include/rotary_embedding.h" #include "llm_helper/include/token_embedding.h" -namespace torch::executor { +namespace example { class LlamaRuntime { public: @@ -56,4 +56,4 @@ class LlamaRuntime { size_t mTokenIndex = 0; }; -} // namespace torch::executor \ No newline at end of file +} // namespace example diff --git a/examples/mediatek/executor_runner/llama_runner/ModelChunk.cpp b/examples/mediatek/executor_runner/llama_runner/ModelChunk.cpp index 2c7e236968d..7447c21c309 100644 --- a/examples/mediatek/executor_runner/llama_runner/ModelChunk.cpp +++ b/examples/mediatek/executor_runner/llama_runner/ModelChunk.cpp @@ -23,9 +23,21 @@ #define ENSURE_INIT \ ET_CHECK_MSG(Initialized(), "Error: Model chunk not initialized."); -namespace torch::executor { - -using util::FileDataLoader; +namespace example { + +using executorch::aten::Tensor; +using executorch::aten::TensorImpl; +using executorch::extension::FileDataLoader; +using executorch::runtime::Error; +using executorch::runtime::HierarchicalAllocator; +using executorch::runtime::MemoryAllocator; +using executorch::runtime::MemoryManager; +using executorch::runtime::Method; +using executorch::runtime::MethodMeta; +using executorch::runtime::Program; +using executorch::runtime::Result; +using executorch::runtime::Span; +using executorch::runtime::Tag; static constexpr size_t kMethodAllocatorPoolSize = 4 * 1024U * 1024U; // 4MB @@ -595,4 +607,4 @@ void ModelChunk::ReleaseModelInstance(void* modelInstance) { } } -} // namespace torch::executor +} // namespace example diff --git a/examples/mediatek/executor_runner/llama_runner/ModelChunk.h b/examples/mediatek/executor_runner/llama_runner/ModelChunk.h index 988747e47e3..67d9e30b5f1 100644 --- a/examples/mediatek/executor_runner/llama_runner/ModelChunk.h +++ b/examples/mediatek/executor_runner/llama_runner/ModelChunk.h @@ -16,7 +16,7 @@ #include "MultiModelLoader.h" -namespace torch::executor { +namespace example { struct BufferInfo { void* data = nullptr; @@ -91,7 +91,7 @@ class ModelChunk : protected MultiTokenSizeModelLoader { // Release allocated buffers for model IOs void ReleaseIoBuffers(); - Method& GetModelMethod(); + executorch::runtime::Method& GetModelMethod(); private: // Override the virtual functions @@ -119,4 +119,4 @@ class ModelChunk : protected MultiTokenSizeModelLoader { std::unordered_map mModelOutToInIndexLinks; }; -} // namespace torch::executor \ No newline at end of file +} // namespace example diff --git a/examples/mediatek/executor_runner/llama_runner/MultiModelLoader.cpp 
b/examples/mediatek/executor_runner/llama_runner/MultiModelLoader.cpp index e20eac3b248..7c7d7267638 100644 --- a/examples/mediatek/executor_runner/llama_runner/MultiModelLoader.cpp +++ b/examples/mediatek/executor_runner/llama_runner/MultiModelLoader.cpp @@ -16,7 +16,7 @@ #include #include -namespace torch::executor { +namespace example { template void MultiModelLoader::LoadModels() { @@ -188,4 +188,4 @@ std::string MultiModelLoader::GetIdString(const IdType& id) { template class MultiModelLoader; template class MultiModelLoader; -} // namespace torch::executor +} // namespace example diff --git a/examples/mediatek/executor_runner/llama_runner/MultiModelLoader.h b/examples/mediatek/executor_runner/llama_runner/MultiModelLoader.h index 49a400f4477..7c364b60c03 100644 --- a/examples/mediatek/executor_runner/llama_runner/MultiModelLoader.h +++ b/examples/mediatek/executor_runner/llama_runner/MultiModelLoader.h @@ -12,7 +12,7 @@ #include #include -namespace torch::executor { +namespace example { template class MultiModelLoader { @@ -92,4 +92,4 @@ class MultiModelLoader { IdType mCurrentModelId = 0; }; -} // namespace torch::executor \ No newline at end of file +} // namespace example diff --git a/examples/mediatek/executor_runner/llama_runner/Utils.h b/examples/mediatek/executor_runner/llama_runner/Utils.h index 9aed7ec08d4..24e8a4d6e50 100644 --- a/examples/mediatek/executor_runner/llama_runner/Utils.h +++ b/examples/mediatek/executor_runner/llama_runner/Utils.h @@ -18,7 +18,7 @@ #include #include -namespace torch::executor { +namespace example { namespace utils { class Timer { @@ -113,4 +113,4 @@ static std::string to_string(const std::vector vec) { } } // namespace utils -} // namespace torch::executor \ No newline at end of file +} // namespace example diff --git a/examples/mediatek/executor_runner/llama_runner/llm_helper/include/llm_types.h b/examples/mediatek/executor_runner/llama_runner/llm_helper/include/llm_types.h index e4cb14a2c98..59290f820fc 100644 --- a/examples/mediatek/executor_runner/llama_runner/llm_helper/include/llm_types.h +++ b/examples/mediatek/executor_runner/llama_runner/llm_helper/include/llm_types.h @@ -11,7 +11,7 @@ #include #include -namespace torch::executor { +namespace example { namespace llm_helper { typedef enum { INT4, INT8, INT16, FP16, INT32, FP32, INVALID } LLMType; @@ -72,4 +72,4 @@ inline const char* getLLMTypeName(const LLMType llm_type) { } } // namespace llm_helper -} // namespace torch::executor \ No newline at end of file +} // namespace example diff --git a/examples/mediatek/executor_runner/llama_runner/llm_helper/include/mask_builder.h b/examples/mediatek/executor_runner/llama_runner/llm_helper/include/mask_builder.h index 14b40619ad3..5ab6741c11c 100644 --- a/examples/mediatek/executor_runner/llama_runner/llm_helper/include/mask_builder.h +++ b/examples/mediatek/executor_runner/llama_runner/llm_helper/include/mask_builder.h @@ -12,7 +12,7 @@ #include -namespace torch::executor { +namespace example { namespace llm_helper { class MaskBuilder { @@ -76,4 +76,4 @@ class MaskBuilder { }; } // namespace llm_helper -} // namespace torch::executor +} // namespace example diff --git a/examples/mediatek/executor_runner/llama_runner/llm_helper/include/rotary_embedding.h b/examples/mediatek/executor_runner/llama_runner/llm_helper/include/rotary_embedding.h index cef7ec09e2a..d4c017cf82b 100644 --- a/examples/mediatek/executor_runner/llama_runner/llm_helper/include/rotary_embedding.h +++ 
b/examples/mediatek/executor_runner/llama_runner/llm_helper/include/rotary_embedding.h @@ -13,7 +13,7 @@ #include #include -namespace torch::executor { +namespace example { namespace llm_helper { class RotaryEmbeddingMasterLut { @@ -77,4 +77,4 @@ class RotaryEmbeddingMasterLut { }; } // namespace llm_helper -} // namespace torch::executor \ No newline at end of file +} // namespace example diff --git a/examples/mediatek/executor_runner/llama_runner/llm_helper/include/token_embedding.h b/examples/mediatek/executor_runner/llama_runner/llm_helper/include/token_embedding.h index d3ed623f5f0..43d6413be66 100644 --- a/examples/mediatek/executor_runner/llama_runner/llm_helper/include/token_embedding.h +++ b/examples/mediatek/executor_runner/llama_runner/llm_helper/include/token_embedding.h @@ -13,7 +13,7 @@ #include #include -namespace torch::executor { +namespace example { class FileMemMapper; @@ -49,4 +49,4 @@ class TokenEmbeddingLut { }; } // namespace llm_helper -} // namespace torch::executor \ No newline at end of file +} // namespace example diff --git a/examples/mediatek/executor_runner/llama_runner/llm_helper/mask_builder.cpp b/examples/mediatek/executor_runner/llama_runner/llm_helper/mask_builder.cpp index 9a7dafb2b7f..e83e8b37082 100644 --- a/examples/mediatek/executor_runner/llama_runner/llm_helper/mask_builder.cpp +++ b/examples/mediatek/executor_runner/llama_runner/llm_helper/mask_builder.cpp @@ -11,7 +11,7 @@ #include #include -namespace torch::executor { +namespace example { namespace llm_helper { // Define mask values for different types @@ -260,4 +260,4 @@ bool MaskBuilder::adjustMaskForPadding(const size_t tokenBatchSize) { } } // namespace llm_helper -} // namespace torch::executor \ No newline at end of file +} // namespace example diff --git a/examples/mediatek/executor_runner/llama_runner/llm_helper/rotary_embedding.cpp b/examples/mediatek/executor_runner/llama_runner/llm_helper/rotary_embedding.cpp index 9015d875495..6f1a64bedbc 100644 --- a/examples/mediatek/executor_runner/llama_runner/llm_helper/rotary_embedding.cpp +++ b/examples/mediatek/executor_runner/llama_runner/llm_helper/rotary_embedding.cpp @@ -16,7 +16,7 @@ #include #include -namespace torch::executor { +namespace example { namespace llm_helper { RotaryEmbeddingMasterLut::RotaryEmbeddingMasterLut( @@ -394,4 +394,4 @@ size_t RotaryEmbeddingMasterLut::getRotEmbedLength() const { } } // namespace llm_helper -} // namespace torch::executor \ No newline at end of file +} // namespace example diff --git a/examples/mediatek/executor_runner/llama_runner/llm_helper/token_embedding.cpp b/examples/mediatek/executor_runner/llama_runner/llm_helper/token_embedding.cpp index 1e20cc22594..b69bb713083 100644 --- a/examples/mediatek/executor_runner/llama_runner/llm_helper/token_embedding.cpp +++ b/examples/mediatek/executor_runner/llama_runner/llm_helper/token_embedding.cpp @@ -19,7 +19,7 @@ namespace fs = std::filesystem; -namespace torch::executor { +namespace example { namespace llm_helper { TokenEmbeddingLut::TokenEmbeddingLut( @@ -90,4 +90,4 @@ void TokenEmbeddingLut::lookupEmbedding(const std::vector& tokens) { } } // namespace llm_helper -} // namespace torch::executor \ No newline at end of file +} // namespace example diff --git a/examples/mediatek/executor_runner/mtk_executor_runner.cpp b/examples/mediatek/executor_runner/mtk_executor_runner.cpp index a6ab4eedab2..1d9d5522161 100644 --- a/examples/mediatek/executor_runner/mtk_executor_runner.cpp +++ b/examples/mediatek/executor_runner/mtk_executor_runner.cpp @@ -41,11 
+41,21 @@ DEFINE_string( "Model serialized in flatbuffer format."); DEFINE_int32(iteration, 1, "Iterations of inference."); -using namespace torch::executor; -using torch::executor::util::FileDataLoader; +using executorch::extension::FileDataLoader; +using executorch::extension::prepare_input_tensors; +using executorch::runtime::Error; +using executorch::runtime::EValue; +using executorch::runtime::HierarchicalAllocator; +using executorch::runtime::MemoryAllocator; +using executorch::runtime::MemoryManager; +using executorch::runtime::Method; +using executorch::runtime::MethodMeta; +using executorch::runtime::Program; +using executorch::runtime::Result; +using executorch::runtime::Span; int main(int argc, char** argv) { - runtime_init(); + executorch::runtime::runtime_init(); gflags::ParseCommandLineFlags(&argc, &argv, true); if (argc != 1) { @@ -158,7 +168,7 @@ int main(int argc, char** argv) { // Allocate input tensors and set all of their elements to 1. The `inputs` // variable owns the allocated memory and must live past the last call to // `execute()`. - auto inputs = util::prepare_input_tensors(*method); + auto inputs = prepare_input_tensors(*method); ET_CHECK_MSG( inputs.ok(), "Could not prepare inputs: 0x%" PRIx32, @@ -196,7 +206,7 @@ int main(int argc, char** argv) { status = method->get_outputs(outputs.data(), outputs.size()); ET_CHECK(status == Error::Ok); // Print the first and last 100 elements of long lists of scalars. - std::cout << torch::executor::util::evalue_edge_items(100); + std::cout << executorch::extension::evalue_edge_items(100); for (int i = 0; i < outputs.size(); ++i) { std::cout << "Output " << i << ": " << outputs[i] << std::endl; } diff --git a/examples/mediatek/executor_runner/mtk_llama_executor_runner.cpp b/examples/mediatek/executor_runner/mtk_llama_executor_runner.cpp index 1193e2b1830..794034584c1 100644 --- a/examples/mediatek/executor_runner/mtk_llama_executor_runner.cpp +++ b/examples/mediatek/executor_runner/mtk_llama_executor_runner.cpp @@ -131,9 +131,19 @@ DEFINE_string(prompt_file, "", "File containing the prompt text."); static constexpr int8_t kAddBos = 1; static constexpr int8_t kAddEos = 0; -using namespace torch::executor; -using namespace torch::executor::llm_helper; -using torch::executor::utils::Timer; +using namespace example::llm_helper; +using example::LlamaModelOptions; +using example::LlamaModelPaths; +using example::LlamaRuntime; +using example::utils::argmax; +using example::utils::read_file; +using example::utils::split; +using example::utils::Timer; +using example::utils::to_string; +using executorch::extension::llm::BPETokenizer; +using executorch::extension::llm::Tokenizer; +using executorch::runtime::Error; +using executorch::runtime::Result; LlamaModelOptions get_model_options() { LlamaModelOptions options = { @@ -159,8 +169,8 @@ LlamaModelPaths get_model_paths() { LlamaModelPaths model_paths = { .tokenizer_path = FLAGS_tokenizer_path, .token_embedding_path = FLAGS_token_embedding_path, - .prompt_model_paths = utils::split("/data/local/tmp/et-mtk/llama3/llama3-8B-instruct_A16W4_4_chunks_128t512c_0.pte,/data/local/tmp/et-mtk/llama3/llama3-8B-instruct_A16W4_4_chunks_128t512c_1.pte,/data/local/tmp/et-mtk/llama3/llama3-8B-instruct_A16W4_4_chunks_128t512c_2.pte,/data/local/tmp/et-mtk/llama3/llama3-8B-instruct_A16W4_4_chunks_128t512c_3.pte,", ','), - .gen_model_paths = 
utils::split("/data/local/tmp/et-mtk/llama3/llama3-8B-instruct_A16W4_4_chunks_1t512c_0.pte,/data/local/tmp/et-mtk/llama3/llama3-8B-instruct_A16W4_4_chunks_1t512c_1.pte,/data/local/tmp/et-mtk/llama3/llama3-8B-instruct_A16W4_4_chunks_1t512c_2.pte,/data/local/tmp/et-mtk/llama3/llama3-8B-instruct_A16W4_4_chunks_1t512c_3.pte,", ',')}; + .prompt_model_paths = split(FLAGS_prompt_model_paths, ','), + .gen_model_paths = split(FLAGS_gen_model_paths, ',')}; return model_paths; } @@ -211,8 +221,7 @@ Result digest_prompt( const auto vocab_size = tokenizer->vocab_size(); const auto logits_type = llama_runtime.GetModelOptions().model_output_type; - const auto first_output_token = - utils::argmax(logits_type, logits, vocab_size); + const auto first_output_token = argmax(logits_type, logits, vocab_size); return first_output_token; } @@ -259,7 +268,7 @@ Error gen_response( timer_gen_token.End(); prev_token = output_token; - output_token = utils::argmax(logits_type, logits, vocab_size); + output_token = argmax(logits_type, logits, vocab_size); full_response_tokens.push_back(output_token); // Stop when output is EOS @@ -279,7 +288,7 @@ Error gen_response( } std::cout << "\n\n[Generated Tokens]\n" - << utils::to_string(full_response_tokens) << std::endl; + << to_string(full_response_tokens) << std::endl; ET_LOG( Info, @@ -314,9 +323,9 @@ Error inference( std::unique_ptr load_tokenizer() { std::unique_ptr tokenizer; if (FLAGS_tokenizer_type == "bpe") { - tokenizer = std::make_unique(); + tokenizer = std::make_unique(); } else if (FLAGS_tokenizer_type == "tiktoken") { - tokenizer = torch::executor::get_tiktoken_for_llama(); + tokenizer = example::get_tiktoken_for_llama(); } ET_CHECK_MSG( tokenizer, "Invalid tokenizer type: %s", FLAGS_tokenizer_type.c_str()); @@ -325,7 +334,7 @@ std::unique_ptr load_tokenizer() { } int main(int argc, char** argv) { - runtime_init(); + executorch::runtime::runtime_init(); gflags::ParseCommandLineFlags(&argc, &argv, true); if (argc != 1) { diff --git a/examples/mediatek/executor_runner/mtk_oss_executor_runner.cpp b/examples/mediatek/executor_runner/mtk_oss_executor_runner.cpp index 3a1ad1d863b..bfa8aef38f0 100755 --- a/examples/mediatek/executor_runner/mtk_oss_executor_runner.cpp +++ b/examples/mediatek/executor_runner/mtk_oss_executor_runner.cpp @@ -51,14 +51,27 @@ DEFINE_string( "outputs", "Model output folder. 
Default to 'outputs'"); -using namespace torch::executor; -using torch::executor::MemoryAllocator; -using torch::executor::util::BufferCleanup; -using torch::executor::util::FileDataLoader; +using executorch::aten::Tensor; +using executorch::aten::TensorImpl; +using executorch::extension::BufferCleanup; +using executorch::extension::FileDataLoader; +using executorch::runtime::Error; +using executorch::runtime::EValue; +using executorch::runtime::HierarchicalAllocator; +using executorch::runtime::MemoryAllocator; +using executorch::runtime::MemoryManager; +using executorch::runtime::Method; +using executorch::runtime::MethodMeta; +using executorch::runtime::Program; +using executorch::runtime::Result; +using executorch::runtime::Span; +using executorch::runtime::Tag; +using executorch::runtime::TensorInfo; + using namespace std::filesystem; int main(int argc, char** argv) { - runtime_init(); + executorch::runtime::runtime_init(); gflags::ParseCommandLineFlags(&argc, &argv, true); if (argc != 1) { diff --git a/examples/mediatek/model_export_scripts/llama.py b/examples/mediatek/model_export_scripts/llama.py index 980a502c5ae..b2fef26a4cf 100644 --- a/examples/mediatek/model_export_scripts/llama.py +++ b/examples/mediatek/model_export_scripts/llama.py @@ -365,7 +365,6 @@ def export_to_et_ir( executorch_program = delegated_program.to_executorch( config=exir.ExecutorchBackendConfig( memory_planning_pass=exir.passes.MemoryPlanningPass( - memory_planning_algo="greedy", alloc_graph_input=False, alloc_graph_output=False, ), diff --git a/examples/models/flamingo/preprocess/export_preprocess.py b/examples/models/flamingo/preprocess/export_preprocess.py deleted file mode 100644 index c5a930c88c8..00000000000 --- a/examples/models/flamingo/preprocess/export_preprocess.py +++ /dev/null @@ -1,19 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. 
- -from export_preprocess_lib import export_preprocess, lower_to_executorch_preprocess - - -def main(): - ep = export_preprocess() - et = lower_to_executorch_preprocess(ep) - - with open("preprocess.pte", "wb") as file: - et.write_to_file(file) - - -if __name__ == "__main__": - main() diff --git a/examples/models/llama2/Android3_2_1B_bf16.gif b/examples/models/llama2/Android3_2_1B_bf16.gif new file mode 100644 index 00000000000..d40a8c2db97 Binary files /dev/null and b/examples/models/llama2/Android3_2_1B_bf16.gif differ diff --git a/examples/models/llama2/Android3_2_3B_SpinQuant.gif b/examples/models/llama2/Android3_2_3B_SpinQuant.gif new file mode 100644 index 00000000000..fbd072c39ff Binary files /dev/null and b/examples/models/llama2/Android3_2_3B_SpinQuant.gif differ diff --git a/examples/models/llama2/CMakeLists.txt b/examples/models/llama2/CMakeLists.txt index 7a9b69d65b1..b1401a0bca6 100644 --- a/examples/models/llama2/CMakeLists.txt +++ b/examples/models/llama2/CMakeLists.txt @@ -141,7 +141,10 @@ endif() # XNNPACK if(TARGET xnnpack_backend) - set(xnnpack_backend_libs xnnpack_backend XNNPACK) + set(xnnpack_backend_libs xnnpack_backend XNNPACK microkernels-prod) + if(TARGET kleidiai) + list(APPEND xnnpack_backend_libs kleidiai) + endif() list(APPEND link_libraries ${xnnpack_backend_libs}) target_link_options_shared_lib(xnnpack_backend) endif() diff --git a/examples/models/llama2/README.md b/examples/models/llama2/README.md index 09ada515a10..f5686eccd95 100644 --- a/examples/models/llama2/README.md +++ b/examples/models/llama2/README.md @@ -1,7 +1,12 @@ # Summary -This example demonstrates how to run a [Llama 2](https://llama.meta.com/llama2/) 7B or [Llama 3](https://ai.meta.com/llama/) 8B model on mobile via ExecuTorch. We use XNNPACK to accelerate the performance and 4-bit groupwise PTQ quantization to fit the model on a phone. +This example demonstrates how to run a [llama models](https://www.llama.com/) on mobile via ExecuTorch. We use XNNPACK to accelerate the performance and 4-bit groupwise PTQ quantization to fit the model on a phone. -For more details, see [Llama 2 repo](https://github.com/facebookresearch/llama) or [Llama 3 repo](https://github.com/facebookresearch/llama3). +Here are supported models: + +- Llama 3.2 1B and 3B +- Llama 3.1 8B +- Llama 3 8B +- Llama 2 7B Pretrained models are not included in this repo. Users are suggested to download them [here](https://ai.meta.com/resources/models-and-libraries/llama-downloads/). @@ -19,6 +24,26 @@ Please note that the models are subject to the [Llama 2 Acceptable Use Policy](h Since Llama 2 7B or Llama 3 8B model needs at least 4-bit quantization to fit even within some of the highend phones, results presented here correspond to 4-bit groupwise post-training quantized model. +For Llama 3.2 1B/3B, we validated the models by running them in their original bf16 datatype and unquantized on both Android and iOS phones. The 3B version required high-end phones with larger RAMs to fit the model. + +Additionally, 1B/3B models are sensitive to accuracy loss when regular PTQ quantization is applied, so we employed 4bit quantization using [SpinQuant](https://github.com/facebookresearch/SpinQuant/tree/main) to achieve a good balance between accuracy, performance and memory. + + + + + + +
+ *(Demo GIFs: Llama3.1 8B, 4bit quantized on Android phone; Llama3.2 1B, unquantized, bf16 on Android phone.)*
+ ## Quantization: We employed 4-bit groupwise per token dynamic quantization of all the linear layers of the model. Dynamic quantization refers to quantizating activations dynamically, such that quantization parameters for activations are calculated, from min/max range, at runtime. Here we quantized activations with 8bits (signed integer). Furthermore, weights are statically quantized. In our case weights were per-channel groupwise quantized with 4bit signed integer. For more information refer to this [page](https://github.com/pytorch/ao). @@ -31,24 +56,43 @@ We evaluated WikiText perplexity using [LM Eval](https://github.com/EleutherAI/l Note that groupsize less than 128 was not enabled, since such models were still too large. This is because our current efforts have focused on enabling FP32 and support for FP16 is under way. What this implies for model size is that 1) embedding table is in FP32 and 2) quantized weights scales are FP32. +### SpinQuant for Llama 3.2 1B/3B models (Optional) + +To improve accuracy, we can use [SpinQuant](https://github.com/facebookresearch/SpinQuant/tree/main), a post-training quantization (PTQ) technique that generates new quantized weights. In the standard PTQ process, quantization may lead to a decrease in accuracy when there are outliers. The SpinQuant method takes the original weights and produces optimized quantized weights with minimal outliers, resulting in higher accuracy. This can be achieved without any finetuning of the weights and only requires 100 iterations on a single A100 node. + +SpinQuant can generate quantized weights that are [compatible with ExecuTorch](https://github.com/facebookresearch/SpinQuant/tree/main?tab=readme-ov-file#3-export-to-executorch), specifically, it can be integrated with the existing optimized XNNPACK kernels (e.g., group-wise 4bit weight and 8bit dynamic activation). This allows developers to benefit from the higher accuracy of SpinQuant while also taking advantage of the strong performance of ExecuTorch acceleration. We enabled SpinQuant for Llama3.2 1B/3B models on ExecuTorch. + +

+ *(Demo GIF: Running Llama3.2 3B on Android phone, 4bit quantization using SpinQuant.)*

+ ## Enablement -We have verified running Llama 2 7B [mobile applications](#step-6-build-mobile-apps) efficiently on select devices including the iPhone 15 Pro, iPhone 15 Pro Max, Samsung Galaxy S22 and S24, and OnePlus 12. +For Llama 3 8B and Llama3.1 8B, we have verified so far on iPhone 15 Pro, iPhone 15 Pro Max, Samsung Galaxy S24+ and OnePlus 12 (with 16GB RAM). -For Llama 3 8B, we have verified so far on iPhone 15 Pro, iPhone 15 Pro Max, Samsung Galaxy S24+ and OnePlus 12 (with 16GB RAM). +We have verified running Llama 2 7B [mobile applications](#step-6-build-mobile-apps) efficiently on select devices including the iPhone 15 Pro, iPhone 15 Pro Max, Samsung Galaxy S22 and S24, and OnePlus 12. ## Performance -### Llama2 7B -Llama 2 7B performance was measured on the Samsung Galaxy S22, S24, and OnePlus 12 devices. The performance measurement is expressed in terms of tokens per second using an [adb binary-based approach](#step-5-run-benchmark-on). +### Llama 3.2 1B and 3B +Llama 3.2 1B and 3B performance was measured on the OnePlus 12 device. The performance measurement is expressed in terms of tokens per second using an [adb binary-based approach](#step-5-run-benchmark-on) for generating 128 tokens. -|Device | Groupwise 4-bit (128) | Groupwise 4-bit (256) +|Model | bf16 | 4bit(*) via SpinQuant |--------| ---------------------- | --------------- -|Galaxy S22 | 8.15 tokens/second | 8.3 tokens/second | -|Galaxy S24 | 10.66 tokens/second | 11.26 tokens/second | -|OnePlus 12 | 11.55 tokens/second | 11.6 tokens/second | +|1B | 19.4 tokens/second | 53.41 tokens/second | +|3B | 7.76 tokens/second | 22.98 tokens/second | + +(*) With SpinQuant, we currently quantize 4-bit groupwise (with groupsize 32) weight, 8bit dynamic activation of all the linear layers of the model, except embedding and output layers. The embedding and output layers are quantized as 8-bit per-channel weight and 8-bit dynamic activation. -### Llama3 8B +### Llama3 8B and Llama3.1 8B Llama 3 8B performance was measured on the Samsung Galaxy S22, S24, and OnePlus 12 devices. The performance measurement is expressed in terms of tokens per second using an [adb binary-based approach](#step-5-run-benchmark-on). Note that since Llama3's vocabulary size is 4x that of Llama2, we had to quantize embedding lookup table as well. For these results embedding lookup table was groupwise quantized with 4-bits and group size of 32. @@ -59,8 +103,14 @@ Note that since Llama3's vocabulary size is 4x that of Llama2, we had to quantiz |Galaxy S24 | 10.91 tokens/second | 11.21 tokens/second | |OnePlus 12 | 10.85 tokens/second | 11.02 tokens/second | -### Llama3.1 -> :warning: **use the main branch**: Llama3.1 is supported on the ExecuTorch main branch (not release 0.3). +### Llama2 7B +Llama 2 7B performance was measured on the Samsung Galaxy S22, S24, and OnePlus 12 devices. The performance measurement is expressed in terms of tokens per second using an [adb binary-based approach](#step-5-run-benchmark-on). + +|Device | Groupwise 4-bit (128) | Groupwise 4-bit (256) +|--------| ---------------------- | --------------- +|Galaxy S22 | 8.15 tokens/second | 8.3 tokens/second | +|Galaxy S24 | 10.66 tokens/second | 11.26 tokens/second | +|OnePlus 12 | 11.55 tokens/second | 11.6 tokens/second | # Instructions @@ -78,25 +128,67 @@ Note that since Llama3's vocabulary size is 4x that of Llama2, we had to quantiz ## Step 2: Prepare model -### Option A: Download and export Llama 2 7B model +### Option A: Download and export Llama3.2 1B/3B model. 
-You can export and run the original Llama 2 7B model. +1. Download `consolidated.00.pth`, `params.json` and `tokenizer.model` from [Llama website](https://www.llama.com/llama-downloads/) or [Hugging Face](https://huggingface.co/meta-llama/Llama-3.2-1B). For chat use-cases, download the instruct models. -1. Llama 2 pretrained parameters can be downloaded from [Meta's official website](https://ai.meta.com/resources/models-and-libraries/llama-downloads/) or from [Hugging Face](https://huggingface.co/meta-llama/Llama-2-7b). +2. Export model and generate `.pte` file. Use original bfloat16 version, without any quantization. -2. Edit `params.json` file. Replace `"vocab_size": -1` with `"vocab_size": 32000`. This is a short-term workaround. +``` +# Set these paths to point to the downloaded files +LLAMA_CHECKPOINT=path/to/checkpoint.pth +LLAMA_PARAMS=path/to/params.json + +python -m examples.models.llama2.export_llama \ + --checkpoint "${LLAMA_CHECKPOINT:?}" \ + --params "${LLAMA_PARAMS:?}" \ + -kv \ + --use_sdpa_with_kv_cache \ + -X \ + -d bf16 \ + --metadata '{"append_eos_to_prompt": 0, "get_bos_id":128000, "get_eos_ids":[128009, 128001], "get_n_bos": 0, "get_n_eos": 0}' \ + --output_name="llama3_2.pte" +``` -3. Export model and generate `.pte` file: - ``` - python -m examples.models.llama2.export_llama --checkpoint --params -kv --use_sdpa_with_kv_cache -X -qmode 8da4w --group_size 128 -d fp32 - ``` -4. Create tokenizer.bin. +Optionally, we can apply SpinQuant to quantize the model without sacrifacing too much accuracy loss. +To use SpinQuant, follow its [instruction](https://github.com/facebookresearch/SpinQuant/tree/main?tab=readme-ov-file#3-export-to-executorch) for exporting checkpoint to ExecuTorch and then export the SpinQuant checkpoint. + +``` +# Set these paths to point to the exported files +LLAMA_QUANTIZED_CHECKPOINT=path/to/spinquant/checkpoint.pth +LLAMA_PARAMS=path/to/params.json + +python -m examples.models.llama2.export_llama \ + --checkpoint "${LLAMA_QUANTIZED_CHECKPOINT:?}" \ + --params "${LLAMA_PARAMS:?}" \ + --use_sdpa_with_kv_cache \ + -X \ + --preq_mode 8da4w_output_8da8w \ + --preq_group_size 32 \ + --max_seq_length 2048 \ + --output_name "llama3_2.pte" \ + -kv \ + -d fp32 \ + --preq_embedding_quantize 8,0 \ + --use_spin_quant native \ + --metadata '{"append_eos_to_prompt": 0, "get_bos_id":128000, "get_eos_ids":[128009, 128001], "get_n_bos": 0, "get_n_eos": 0}' +``` + +### Option B: Download and export Llama 3 8B instruct model + +You can export and run the original Llama 3 8B instruct model. + +1. Llama 3 pretrained parameters can be downloaded from [Meta's official Llama 3 repository](https://github.com/meta-llama/llama3/). + +2. Export model and generate `.pte` file ``` - python -m extension.llm.tokenizer.tokenizer -t -o tokenizer.bin + python -m examples.models.llama2.export_llama --checkpoint -p -kv --use_sdpa_with_kv_cache -X -qmode 8da4w --group_size 128 -d fp32 --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --embedding-quantize 4,32 --output_name="llama3_kv_sdpa_xnn_qe_4_32.pte" ``` -### Option B: Download and export stories110M model + Due to the larger vocabulary size of Llama 3, we recommend quantizing the embeddings with `--embedding-quantize 4,32` as shown above to further reduce the model size. + +### Option C: Download and export stories110M model If you want to deploy and run a smaller model for educational purposes. From `executorch` root: @@ -111,7 +203,7 @@ If you want to deploy and run a smaller model for educational purposes. 
From `ex ``` 3. Export model and generate `.pte` file. ``` - python -m examples.models.llama2.export_llama -c stories110M.pt -p params.json -X + python -m examples.models.llama2.export_llama -c stories110M.pt -p params.json -X -kv ``` 4. Create tokenizer.bin. @@ -119,23 +211,30 @@ If you want to deploy and run a smaller model for educational purposes. From `ex python -m extension.llm.tokenizer.tokenizer -t -o tokenizer.bin ``` -### Option C: Download and export Llama 3 8B instruct model +### Option D: Download and export Llama 2 7B model -You can export and run the original Llama 3 8B instruct model. +You can export and run the original Llama 2 7B model. -1. Llama 3 pretrained parameters can be downloaded from [Meta's official Llama 3 repository](https://github.com/meta-llama/llama3/). +1. Llama 2 pretrained parameters can be downloaded from [Meta's official website](https://ai.meta.com/resources/models-and-libraries/llama-downloads/) or from [Hugging Face](https://huggingface.co/meta-llama/Llama-2-7b). -2. Export model and generate `.pte` file +2. Edit `params.json` file. Replace `"vocab_size": -1` with `"vocab_size": 32000`. This is a short-term workaround. + +3. Export model and generate `.pte` file: ``` - python -m examples.models.llama2.export_llama --checkpoint -p -kv --use_sdpa_with_kv_cache -X -qmode 8da4w --group_size 128 -d fp32 --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --embedding-quantize 4,32 --output_name="llama3_kv_sdpa_xnn_qe_4_32.pte" + python -m examples.models.llama2.export_llama --checkpoint --params -kv --use_sdpa_with_kv_cache -X -qmode 8da4w --group_size 128 -d fp32 + ``` +4. Create tokenizer.bin. + + ``` + python -m extension.llm.tokenizer.tokenizer -t -o tokenizer.bin ``` - Due to the larger vocabulary size of Llama 3, we recommend quantizing the embeddings with `--embedding-quantize 4,32` as shown above to further reduce the model size. +### Option E: Download models from Hugging Face and convert from safetensor format to state dict -### Option D: Download models from Hugging Face and convert from safetensor format to state dict You can also download above models from [Hugging Face](https://huggingface.co/). Since ExecuTorch starts from a PyTorch model, a script like below can be used to convert the Hugging Face safetensors format to PyTorch's state dict. It leverages the utils provided by [TorchTune](https://github.com/pytorch/torchtune). + ```Python from torchtune.utils import FullModelHFCheckpointer from torchtune.models import convert_weights @@ -230,10 +329,12 @@ Note for Mac users: There's a known linking issue with Xcode 15.1. Refer to the 3. Run model. Run options available [here](https://github.com/pytorch/executorch/blob/main/examples/models/llama2/main.cpp#L18-L40). ``` - cmake-out/examples/models/llama2/llama_main --model_path= --tokenizer_path= --prompt= + cmake-out/examples/models/llama2/llama_main --model_path= --tokenizer_path= --prompt= ``` -For Llama3, you can pass the original `tokenizer.model` (without converting to `.bin` file). +For Llama2 and stories models, pass the converted `tokenizer.bin` file instead of `tokenizer.model`. 
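For example, with the `llama3_2.pte` and `tokenizer.model` files produced in Step 2, Option A, a run might look like the sketch below (substitute your own model and tokenizer paths):

```bash
# Sample invocation; the full list of run options is in main.cpp.
cmake-out/examples/models/llama2/llama_main \
  --model_path=llama3_2.pte \
  --tokenizer_path=tokenizer.model \
  --prompt="Once upon a time" \
  --seq_len=120
```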
+ +To build for CoreML backend and validate on Mac, replace `-DEXECUTORCH_BUILD_XNNPACK=ON` with `-DEXECUTORCH_BUILD_COREML=ON` ## Step 5: Run benchmark on Android phone @@ -294,19 +395,19 @@ cmake --build cmake-out-android/examples/models/llama2 -j16 --config Release ``` adb shell mkdir -p /data/local/tmp/llama adb push /data/local/tmp/llama/ -adb push /data/local/tmp/llama/ +adb push /data/local/tmp/llama/ adb push cmake-out-android/examples/models/llama2/llama_main /data/local/tmp/llama/ ``` **2.3 Run model** ``` -adb shell "cd /data/local/tmp/llama && ./llama_main --model_path --tokenizer_path --prompt \"Once upon a time\" --seq_len 120" +adb shell "cd /data/local/tmp/llama && ./llama_main --model_path --tokenizer_path --prompt \"Once upon a time\" --seq_len 120" ``` ## Step 6: Build Mobile apps ### iOS -Please refer to [this tutorial](https://pytorch.org/executorch/main/llm/llama-demo-ios.html) to for full instructions on building the iOS LLAMA Demo App. Note that to use Llama 3 8B instruct in the iOS demo app, you don't need to convert the downloaded `tokenizer.model` to `tokenizer.bin`, required for Llama 2 (shown in Step 2 - Option A - 4 above), but you need to rename `tokenizer.model` file to `tokenizer.bin` because the demo app looks for the tokenizer file with .bin extension. +Please refer to [this tutorial](https://pytorch.org/executorch/main/llm/llama-demo-ios.html) to for full instructions on building the iOS LLAMA Demo App. Rename `tokenizer.model` file to `tokenizer.bin` because the demo app looks for the tokenizer file with .bin extension. ### Android Please refer to [this tutorial](https://pytorch.org/executorch/main/llm/llama-demo-android.html) to for full instructions on building the Android LLAMA Demo App. @@ -321,6 +422,10 @@ for each backend ([CoreML](https://pytorch.org/executorch/main/build-run-coreml. The iOS LLAMA app supports the CoreML and MPS model and the Android LLAMA app supports the QNN model. On Android, it also allow to cross compiler the llama runner binary, push to the device and run. +For CoreML, there are 2 additional optional arguments: +* `--coreml-ios`: Specify the minimum iOS version to deploy (and turn on available optimizations). E.g. `--coreml-ios 18` will turn on [in-place KV cache](https://developer.apple.com/documentation/coreml/mlstate?language=objc) and [fused scaled dot product attention kernel](https://apple.github.io/coremltools/source/coremltools.converters.mil.mil.ops.defs.html#coremltools.converters.mil.mil.ops.defs.iOS18.transformers.scaled_dot_product_attention) (the resulting model will then need at least iOS 18 to run, though) +* `--coreml-quantize`: Use [quantization tailored for CoreML](https://apple.github.io/coremltools/docs-guides/source/opt-quantization-overview.html). E.g. `--coreml-quantize b4w` will perform per-block 4-bit weight-only quantization in a way tailored for CoreML + # What is coming next? ## Quantization - Enabling FP16 model to leverage smaller groupsize for 4-bit quantization. 
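As an illustration of the CoreML options described above, an export command might look like the following sketch. `--coreml-ios` and `--coreml-quantize` are the arguments described earlier; the `--coreml` flag and the output name are assumptions here, so adjust them to the flags your checkout actually exposes:

```bash
# Sketch: export for the CoreML backend, enabling iOS 18-only optimizations and
# CoreML-tailored per-block 4-bit weight-only quantization.
python -m examples.models.llama2.export_llama \
  --checkpoint "${LLAMA_CHECKPOINT:?}" \
  --params "${LLAMA_PARAMS:?}" \
  -kv \
  --coreml \
  --coreml-ios 18 \
  --coreml-quantize b4w \
  --output_name="llama3_2_coreml.pte"
```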
diff --git a/examples/models/llama2/TARGETS b/examples/models/llama2/TARGETS index f1c56a5bda3..a80c62514df 100644 --- a/examples/models/llama2/TARGETS +++ b/examples/models/llama2/TARGETS @@ -54,6 +54,7 @@ runtime.python_binary( main_function = "executorch.examples.models.llama2.export_llama.main", # visibility = ["//executorch/examples/..."], preload_deps = [ + "//executorch/extension/llm/custom_ops:model_sharding_py", "//executorch/extension/llm/custom_ops:custom_ops_aot_lib", "//executorch/kernels/quantized:aot_lib", ], @@ -64,6 +65,14 @@ runtime.python_binary( ], ) +runtime.command_alias( + name = "export_llama_qnn", + env = { + "LD_LIBRARY_PATH": "$(location fbsource//third-party/qualcomm/qnn/qnn-2.25:qnn_offline_compile_libs)", + }, + exe = ":export_llama", +) + runtime.python_library( name = "export_library", srcs = [ @@ -71,7 +80,11 @@ runtime.python_library( "export_llama_lib.py", "model.py", "source_transformation/apply_spin_quant_r1_r2.py", + "source_transformation/lora.py", + "source_transformation/pre_quantization.py", + "source_transformation/prune_output.py", "source_transformation/quantize.py", + "source_transformation/quantized_kv_cache.py", "source_transformation/rms_norm.py", "source_transformation/rope.py", "source_transformation/sdpa.py", @@ -102,5 +115,100 @@ runtime.python_library( "//executorch/util:python_profiler", "fbsource//third-party/pypi/coremltools:coremltools", "fbsource//third-party/pypi/sentencepiece:sentencepiece", + "//pytorch/ao:torchao", + ], +) + +runtime.python_binary( + name = "eval_llama", + main_function = "executorch.examples.models.llama2.eval_llama.main", + preload_deps = [ + "//executorch/extension/llm/custom_ops:custom_ops_aot_lib", + "//executorch/kernels/quantized:aot_lib", + ], + deps = [ + ":eval_library", + "//caffe2:torch", + ], +) + +runtime.python_library( + name = "eval_library", + srcs = [ + "eval_llama.py", + "eval_llama_lib.py", + "evaluate/eager_eval.py", + ], + _is_external_target = True, + base_module = "executorch.examples.models.llama2", + visibility = [ + "//bento/...", + "//bento_kernels/...", + "//executorch/examples/...", + "@EXECUTORCH_CLIENTS", + ], + deps = [ + "fbsource//third-party/pypi/lm-eval:lm-eval", + "fbsource//third-party/pypi/tiktoken:tiktoken", + ":export_library", + "//executorch/examples/models/llama2/tokenizer:tiktoken_py", + "//executorch/extension/llm/export:export_lib", + "//executorch/extension/llm/tokenizer:tokenizer_py_lib", + "//executorch/extension/pybindings:portable_lib", + ], +) + +runtime.python_library( + name = "quantized_kv_cache", + srcs = [ + "source_transformation/quantized_kv_cache.py", + ], + _is_external_target = True, + visibility = ["//executorch/..."], + deps = [ + "//caffe2:torch", + ], +) + +runtime.python_library( + name = "sdpa", + srcs = [ + "source_transformation/sdpa.py", + ], + _is_external_target = True, + visibility = ["//executorch/..."], + deps = [ + "//caffe2:torch", + ], +) + +runtime.python_test( + name = "quantized_kv_cache_test", + srcs = [ + "source_transformation/test_quantized_kv_cache.py", + ], + preload_deps = [ + "//executorch/extension/llm/custom_ops:custom_ops_aot_lib", + ], + deps = [ + ":quantized_kv_cache", + "//caffe2:torch", + "//executorch/examples/models/llama2:llama_transformer", + ], +) + +runtime.python_test( + name = "quantized_sdpa_with_kv_cache_test", + srcs = [ + "source_transformation/test_sdpa_with_quantized_kv_cache.py", + ], + preload_deps = [ + "//executorch/extension/llm/custom_ops:custom_ops_aot_lib", + ], + deps = [ + 
":quantized_kv_cache", + ":sdpa", + "//caffe2:torch", + "//executorch/examples/models/llama2:llama_transformer", ], ) diff --git a/examples/models/llama2/eval_llama.py b/examples/models/llama2/eval_llama.py index 4daeaf7afa5..09157789bde 100644 --- a/examples/models/llama2/eval_llama.py +++ b/examples/models/llama2/eval_llama.py @@ -24,7 +24,7 @@ def main() -> None: args = parser.parse_args() # Overrides this arg, because evaluation requires full logits. args.generate_full_logits = True - eval_llama(modelname, args) + eval_llama(modelname, args) # pyre-ignore if __name__ == "__main__": diff --git a/examples/models/llama2/eval_llama_lib.py b/examples/models/llama2/eval_llama_lib.py index b8987ac5d49..3061d290bdc 100644 --- a/examples/models/llama2/eval_llama_lib.py +++ b/examples/models/llama2/eval_llama_lib.py @@ -10,19 +10,20 @@ from typing import Optional, Union import torch -from executorch.examples.models.llama2.evaluate import EagerEvalWrapper, evaluate_model from executorch.examples.models.llama2.export_llama_lib import ( get_quantizer_and_quant_params, ) from executorch.examples.models.llama2.tokenizer.tiktoken import Tokenizer as Tiktoken -from executorch.extension.llm.export import LLMEdgeManager +from executorch.extension.llm.export.builder import LLMEdgeManager from executorch.extension.llm.tokenizer.tokenizer import ( Tokenizer as SentencePieceTokenizer, ) from executorch.extension.llm.tokenizer.utils import get_tokenizer from lm_eval.api.model import LM +from .evaluate.eager_eval import EagerEvalWrapper, evaluate_model + from .export_llama_lib import ( _prepare_for_llama_export, build_args_parser as _build_args_parser, @@ -91,7 +92,7 @@ def __init__( tokenizer: Union[SentencePieceTokenizer, Tiktoken], max_seq_length: Optional[int] = None, ): - super().__init__(None, tokenizer, max_seq_length) + super().__init__(None, tokenizer, max_seq_length) # pyre-ignore self._model = model # Expects model to be path to a .pte file from executorch.extension.pybindings.portable_lib import _load_for_executorch @@ -106,7 +107,7 @@ def __init__( from executorch.kernels import quantized # noqa self._et_model = _load_for_executorch(self._model) - self._use_kv_cache = self._et_model.run_method("use_kv_cache")[0] + self._use_kv_cache = self._et_model.run_method("use_kv_cache")[0] # pyre-ignore def _model_call(self, inps): # Given inps (tokens), return the logits from a single forward call @@ -140,7 +141,7 @@ def __init__( tokenizer_bin: str, max_seq_length: Optional[int] = None, ): - super().__init__(None, tokenizer, max_seq_length) + super().__init__(None, tokenizer, max_seq_length) # pyre-ignore self._model = model self._tokenizer_bin = tokenizer_bin @@ -165,17 +166,17 @@ def gen_eval_wrapper( Returns: eval_wrapper (LM): A wrapper interface for the lm-evaluation-harness library. 
""" - tokenizer = get_tokenizer(args.tokenizer_path) + tokenizer = get_tokenizer(args.tokenizer_path) # pyre-ignore # ExecuTorch Binary Evaluation - if (model := args.pte) is not None: - if (tokenizer_bin := args.tokenizer_bin) is not None: + if (model := args.pte) is not None: # pyre-ignore + if (tokenizer_bin := args.tokenizer_bin) is not None: # pyre-ignore # ETRunnerEvalWrapper: Create a wrapper around an ExecuTorch model, evaluated at runtime return ETRunnerEvalWrapper( model=model, tokenizer=tokenizer, tokenizer_bin=tokenizer_bin, - max_seq_length=args.max_seq_length, + max_seq_length=args.max_seq_length, # pyre-ignore ) # ETPybindEvalWrapper: Create a wrapper around an ExecuTorch model, evaluated with pybindings @@ -194,7 +195,7 @@ def gen_eval_wrapper( if len(quantizers) != 0: manager = manager.capture_pre_autograd_graph().pt2e_quantize(quantizers) model = ( - manager.pre_autograd_graph_module.to(device="cuda") + manager.pre_autograd_graph_module.to(device="cuda") # pyre-ignore if torch.cuda.is_available() else manager.pre_autograd_graph_module.to(device="cpu") ) @@ -202,8 +203,8 @@ def gen_eval_wrapper( model=model, tokenizer=tokenizer, max_seq_length=args.max_seq_length, - use_kv_cache=args.use_kv_cache, - enable_dynamic_shape=args.enable_dynamic_shape, + use_kv_cache=args.use_kv_cache, # pyre-ignore + enable_dynamic_shape=args.enable_dynamic_shape, # pyre-ignore ) else: # TODO: use manager.pre_autograd_graph_module for the eval to remove the if-else branch @@ -221,7 +222,7 @@ def gen_eval_wrapper( # that is not available in this eval_llama. We save the checkpoint # here for consistency with eval_llama. The accuracy results we # get from eval_llama can be used as a reference to other evaluations. - if args.output_eager_checkpoint_file is not None: + if args.output_eager_checkpoint_file is not None: # pyre-ignore torch.save(model, args.output_eager_checkpoint_file) return EagerEvalWrapper( @@ -282,8 +283,8 @@ def eval_llama( # Evaluate the model eval_results = evaluate_model( eval_wrapper, - args.tasks, - args.limit, + args.tasks, # pyre-ignore + args.limit, # pyre-ignore ) for task, res in eval_results["results"].items(): diff --git a/examples/models/llama2/evaluate/eager_eval.py b/examples/models/llama2/evaluate/eager_eval.py index e8a540f95e2..8f2659ab308 100644 --- a/examples/models/llama2/evaluate/eager_eval.py +++ b/examples/models/llama2/evaluate/eager_eval.py @@ -62,7 +62,7 @@ def batch_size(self): def device(self): return self._device - def tok_encode(self, string: str, **kwargs): + def tok_encode(self, string: str, **kwargs): # pyre-ignore tokens = self._tokenizer.encode(string, bos=True, eos=False) encoded = torch.tensor(tokens, dtype=torch.int, device=self.device) # encoded is a pytorch tensor, but some internal logic in the @@ -111,7 +111,9 @@ def evaluate_model( if "hendrycks_test" in tasks: tasks.remove("hendrycks_test") - tasks += list(lm_eval.tasks.hendrycks_test.create_all_tasks().keys()) + tasks += list( + lm_eval.tasks.hendrycks_test.create_all_tasks().keys() # pyre-ignore + ) task_dict = get_task_dict(tasks) eval_results = evaluate( diff --git a/examples/models/llama2/export_llama_lib.py b/examples/models/llama2/export_llama_lib.py index 2a03c0cebda..cf8d221c8e5 100644 --- a/examples/models/llama2/export_llama_lib.py +++ b/examples/models/llama2/export_llama_lib.py @@ -53,11 +53,17 @@ get_quant_embedding_transform, get_quant_weight_transform, ) +from .source_transformation.quantized_kv_cache import ( + replace_kv_cache_with_quantized_kv_cache, +) from 
.source_transformation.rms_norm import replace_rms_norm_with_native_rms_norm + from .source_transformation.rope import materialze_broadcast_of_rope_freq_cis from .source_transformation.sdpa import ( replace_causal_mask, + replace_kv_cache_with_coreml_kv_cache, replace_kv_cache_with_simple_kv_cache, + replace_sdpa_with_coreml_sdpa, replace_sdpa_with_custom_op, replace_sdpa_with_flex_sdpa, replace_sdpa_with_simple_sdpa, @@ -204,6 +210,12 @@ def build_args_parser() -> argparse.ArgumentParser: action="store_true", help="Whether or not to export a model using kv cache", ) + parser.add_argument( + "--quantize_kv_cache", + default=False, + action="store_true", + help="Whether or not to export a model using int8 per token quantized kv cache", + ) parser.add_argument( "--num_sharding", type=int, @@ -295,7 +307,17 @@ def build_args_parser() -> argparse.ArgumentParser: parser.add_argument("-2", "--fairseq2", action="store_true") parser.add_argument("-v", "--verbose", action="store_true") - parser.add_argument("-X", "--xnnpack", action="store_true") + parser.add_argument( + "-X", + "--xnnpack", + action="store_true", + help="Delegate DQLinear ops to the xnnpack backend", + ) + parser.add_argument( + "--xnnpack-extended-ops", + action="store_true", + help="Delegate more operators beyond DQLinear to the xnnpack backend. Requires -X or --xnnpack to be set.", + ) parser.add_argument("-V", "--vulkan", action="store_true") parser.add_argument("--mps", action="store_true") parser.add_argument("--coreml", action="store_true") @@ -304,12 +326,24 @@ action="store_true", help="This option is only for coreml, and is only supported for MacOS15+/iOS18+", ) + parser.add_argument( + "--coreml-preserve-sdpa", + action="store_true", + help="This option is only for coreml: Preserve sdpa in torch edge program to use coreml iOS18.sdpa op", + ) parser.add_argument( "--coreml-quantize", default=None, choices=["b4w"], help="This option is only for coreml: Use coreml quantization, e.g. b4w (for blockwise 4 bit weight)", ) + parser.add_argument( + "--coreml-ios", + type=int, + default=15, + choices=(15, 16, 17, 18), + help="This option is only for coreml: The minimum iOS version to deploy", + ) parser.add_argument( "--qnn", action="store_true", @@ -355,6 +389,51 @@ def build_args_parser() -> argparse.ArgumentParser: choices=["cuda", "native"], help="Use SpinQuant for better quantization performance. Only support cuda and native.", ) + + parser.add_argument( + "-qat", + "--use_qat", + default=False, + action="store_true", + help="Whether the checkpoint is pre-quantized with QAT or not.", + ) + + parser.add_argument( + "-lora", + "--use_lora", + type=int, + default=0, + help="Whether the checkpoint contains LoRA adaptors or not. 0: no LoRA adaptors; " + "otherwise, it means the rank of LoRA adaptors. Currently it only works if QAT is enabled.", + ) + + parser.add_argument( + "--preq_mode", + type=str, + default=None, + choices=["8da4w", "8da4w_output_8da8w"], + help="Quantization mode used for pre-quantized checkpoint.
Only support 8da4w and 8da4w_output_8da8w right now.", + ) + + parser.add_argument( + "--preq_group_size", + type=int, + default=32, + help="group_size for pre-quantized checkpoint weight quantization", + ) + + parser.add_argument( + "--preq_embedding_quantize", + default="8,0", + type=str, + help="type of embedding quantization for pre-quantized checkpoint, ',', e.g., '8,1024'.", + ) + + parser.add_argument( + "--output_prune_map", + default=None, + help="path to the output pruning token mapping file (token_map.json)", + ) return parser @@ -408,7 +487,6 @@ def _prepare_for_llama_export(modelname: str, args) -> LLMEdgeManager: Returns a LLMEdgeManager prior to calling export_to_edge with quantizers """ - # load model from checkpoint and params.json checkpoint_path = canonical_path(args.checkpoint) if args.checkpoint else None checkpoint_dir = ( @@ -444,11 +522,12 @@ def _prepare_for_llama_export(modelname: str, args) -> LLMEdgeManager: tokenizer_path=args.tokenizer_path, verbose=args.verbose, max_seq_len=args.max_seq_length, + output_prune_map_path=args.output_prune_map, metadata_str=args.metadata, + dtype_override=dtype_override, args=args, ) .set_output_dir(output_dir_path) - .to_dtype(dtype_override) .source_transform(_get_source_transforms(modelname, dtype_override, args)) ) @@ -503,12 +582,24 @@ def _export_llama(modelname, args) -> LLMEdgeManager: # noqa: C901 # to_backend partitioners = [] - if pt2e_quant_params is not None and pt2e_quant_params.quantize_linear is not None: - partitioners.append(get_xnnpack_partitioner()) + + # Order matters here, dynamic quantization should be applied first when both xnnpack and xnnpack_extended_ops are enabled + if ( + pt2e_quant_params is not None and pt2e_quant_params.quantize_linear is not None + ) or (args.xnnpack): + partitioners.append( + get_xnnpack_partitioner(dynamic_quant_only_partitioner=True) + ) + + # force xnnpack to be true if pt2e_quant_params is not None and args.xnnpack is False + args.xnnpack = True modelname = f"xnnpack_dq_{modelname}" - if args.xnnpack: - partitioners.append(get_xnnpack_partitioner()) + if args.xnnpack_extended_ops: + assert args.xnnpack, "xnnpack_extended_ops requires xnnpack to be enabled" + partitioners.append( + get_xnnpack_partitioner(dynamic_quant_only_partitioner=False) + ) modelname = f"xnnpack_{modelname}" if args.vulkan: @@ -526,7 +617,7 @@ def _export_llama(modelname, args) -> LLMEdgeManager: # noqa: C901 if args.coreml: coreml_partitioner = get_coreml_partitioner( - args.use_kv_cache and args.coreml_enable_state, + args.coreml_ios, args.embedding_quantize, args.pt2e_quantize, args.coreml_quantize, @@ -555,6 +646,10 @@ def _export_llama(modelname, args) -> LLMEdgeManager: # noqa: C901 shares=args.num_sharding, ) + logging.info("Lowering model using following partitioner(s): ") + for partitioner in partitioners: + logging.info(f"--> {partitioner.__class__.__name__}") + if args.generate_etrecord: if not builder_exported_to_edge.edge_manager: raise ValueError("Unable to generate etrecord due to missing edge manager.") @@ -668,7 +763,9 @@ def _load_llama_model( tokenizer_path: Optional[str] = None, verbose: bool = False, max_seq_len: int = 128, + output_prune_map_path: Optional[str] = None, metadata_str: Optional[str] = None, + dtype_override: Optional[DType] = None, args, ) -> "LLMEdgeManager": """ @@ -695,25 +792,35 @@ def _load_llama_model( fairseq2=weight_type == WeightType.FAIRSEQ2, max_seq_len=max_seq_len, enable_dynamic_shape=enable_dynamic_shape, + output_prune_map_path=output_prune_map_path, 
args=args, ) - state_dict = model.state_dict() - dtype = state_dict[next(iter(state_dict))].dtype - assert dtype in [ - torch.bfloat16, - torch.float16, - torch.float32, - ], f"Only support bfloat16, fp16 or fp32 got {dtype}" - logging.info(f"Loaded model with dtype={dtype}") - - if dtype == torch.bfloat16: - dtype = DType.bf16 - elif dtype == torch.float16: - dtype = DType.fp16 - elif dtype == torch.float32: - dtype = DType.fp32 + if dtype_override: + assert isinstance( + dtype_override, DType + ), "Override dtype needs to be of type " + torch_dtype = dtype_override.to_torch_dtype() + logging.info(f"model.to {torch_dtype}") + model = model.to(dtype=torch_dtype) + dtype = dtype_override else: - raise ValueError(f"Unsupported dtype {dtype}") + state_dict = model.state_dict() + dtype = state_dict[next(iter(state_dict))].dtype + assert dtype in [ + torch.bfloat16, + torch.float16, + torch.float32, + ], f"Only support bfloat16, fp16 or fp32 got {dtype}" + logging.info(f"Loaded model with dtype={dtype}") + + if dtype == torch.bfloat16: + dtype = DType.bf16 + elif dtype == torch.float16: + dtype = DType.fp16 + elif dtype == torch.float32: + dtype = DType.fp32 + else: + raise ValueError(f"Unsupported dtype {dtype}") return LLMEdgeManager( model=model, @@ -742,34 +849,54 @@ def _load_llama_model( ) -def _get_source_transforms( +def _get_source_transforms( # noqa modelname: str, dtype_override: Optional[DType], args ) -> List[Callable[[torch.nn.Module], torch.nn.Module]]: transforms = [] - if args.quantization_mode: - modelname = f"{modelname}_q" - if args.use_spin_quant is None: - transforms.append( - get_quant_weight_transform(args, dtype_override, verbose_export()) - ) - # For SpinQuant, the checkpoints are already quantized - # aka the weights have corresponding scales value, - # So that means, we don't need to apply quantization - # transform. However, we will still need to apply - # transformations that change the model structure to - # match the checkpoint format. - # transform_for_spinquant() will apply these transformations - # later in model.py file. - elif args.use_spin_quant == "cuda": + + if args.use_spin_quant: + if args.use_spin_quant == "cuda": from .source_transformation.spin_quant import ( inject_fast_hadamard_transform_cuda_for_spin_quant, ) transforms.append(inject_fast_hadamard_transform_cuda_for_spin_quant) elif args.use_spin_quant == "native": - raise NotImplementedError("native SpinQuant is not implemented yet.") + from .source_transformation.spin_quant import ( + inject_fast_hadamard_transform_native_for_spin_quant, + ) + + transforms.append(inject_fast_hadamard_transform_native_for_spin_quant) + + if args.quantization_mode: + """ + When this option is selected, it finds all linear layers and transforms + into quantized linear equivalent module. + + There are cases where the checkpoint is already quantized, for example + on use_spin_quant is enabled. In that case, it will do the appropriate + transformations based on the given checkpoint first. In those cases, + if quantization_mode is enabled, it will quantize any remaining linear + ops that is not quantized. + + There are cases where this may be a no-op, namely, if all linears are + quantized in the checkpoint. + """ + modelname = f"{modelname}_q" + transforms.append( + get_quant_weight_transform(args, dtype_override, verbose_export()) + ) if args.embedding_quantize: + """ + When this option is selected, it finds all embedding layers and transforms + into quantized embedding equivalent module. 
+ + There are cases where the checkpoint is already quantized, for example + on use_spin_quant is enabled. In that case, it will do the appropriate + transformations based on the given checkpoint first. In those cases, + this wil be a no-op. + """ modelname = f"{modelname}_e" transforms.append(get_quant_embedding_transform(args)) @@ -779,6 +906,10 @@ def _get_source_transforms( if args.use_sdpa_with_kv_cache: transforms.append(replace_sdpa_with_custom_op) + if args.quantize_kv_cache: + assert args.use_kv_cache, "quantize_kv_cache requires use_kv_cache=True" + transforms.append(replace_kv_cache_with_quantized_kv_cache) + if args.use_kv_cache: if args.qnn: # pyre-ignore: Undefined import [21]: Could not find a module corresponding to import `executorch.backends.qualcomm.utils.utils` @@ -795,10 +926,18 @@ def _get_source_transforms( transforms.append(get_model_with_r1_r2(args.optimized_rotation_path)) transforms.append(convert_linear_to_conv2d) - elif args.coreml or args.mps: - # Currently qnn/coreml/mps doesn't support sdpa op, use the simpler decomposition + elif args.mps: + # Currently mps doesn't support sdpa op, use the simpler decomposition # to get free perf gain. transforms.append(replace_sdpa_with_simple_sdpa) transforms.append(replace_causal_mask) + elif args.coreml: + # iOS 18 introduced fused sdpa op + if args.coreml_ios >= 18: + transforms.append(replace_sdpa_with_coreml_sdpa) + else: + transforms.append(replace_sdpa_with_simple_sdpa) + transforms.append(replace_kv_cache_with_coreml_kv_cache) + return transforms diff --git a/examples/models/llama2/llama_transformer.py b/examples/models/llama2/llama_transformer.py index 534d90c6ed9..8e17013ae3d 100644 --- a/examples/models/llama2/llama_transformer.py +++ b/examples/models/llama2/llama_transformer.py @@ -9,7 +9,7 @@ from dataclasses import dataclass from functools import partial -from typing import Optional, Tuple +from typing import Dict, Optional, Tuple import torch import torch.nn.functional as F @@ -102,6 +102,8 @@ class ModelArgs: # logits for all input tokens.) generate_full_logits: bool = False enable_dynamic_shape: bool = False # export model with dynamic shape support + # A dictionary mapping from pruned token-id to original token-id + output_prune_map: Optional[Dict[int, int]] = None use_hf_rope: bool = False # Use HuggingFace's RoPE implementation rope_theta: Optional[float] = ( None # The official name to override self.rope_freq_base. 
@@ -149,6 +151,7 @@ def __init__( ): super().__init__() self.max_seq_length = max_seq_length + self.is_tranposed = transpose_cache if transpose_cache: cache_shape = (max_batch_size, n_heads, max_seq_length, head_dim) else: @@ -171,19 +174,21 @@ def update( ) -> Tuple[torch.Tensor, torch.Tensor]: # input_pos: [S], k_val: [B, H, S, D] or [B, S, H, D] depending on transpose_cache if self.enable_dynamic_shape: - start_pos = input_pos[-1].item() + start_pos = input_pos[0].item() torch._check_is_size(start_pos) torch._check(start_pos < self.max_seq_length) - seq_length = k_val.size(2) + dim_to_slice = 2 if self.transpose_cache else 1 + seq_length = k_val.size(dim_to_slice) # Replace the entry in the cache for this token # The following lines are equivalent to: # cache_k[:bsz, start_pos : start_pos + seqlen] = xk # cache_v[:bsz, start_pos : start_pos + seqlen] = xv + # when dim_to_slice is 1 # We use .narrow() here to make the compiler happy # pyre-ignore: Incompatible parameter type [6] - narrowed_k = self.k_cache.narrow(2, start_pos, seq_length) + narrowed_k = self.k_cache.narrow(dim_to_slice, start_pos, seq_length) # pyre-ignore: Incompatible parameter type [6] - narrowed_v = self.v_cache.narrow(2, start_pos, seq_length) + narrowed_v = self.v_cache.narrow(dim_to_slice, start_pos, seq_length) narrowed_k.copy_(k_val) narrowed_v.copy_(v_val) @@ -191,8 +196,12 @@ def update( else: k_out = self.k_cache v_out = self.v_cache - k_out[:, :, input_pos] = k_val - v_out[:, :, input_pos] = v_val + if self.transpose_cache: + k_out[:, :, input_pos] = k_val + v_out[:, :, input_pos] = v_val + else: + k_out[:, input_pos] = k_val + v_out[:, input_pos] = v_val return k_out, v_out @@ -218,9 +227,9 @@ def __init__( def forward( self, input_pos: torch.Tensor, - q: torch.Tensor, - k: torch.Tensor, - v: torch.Tensor, + q: torch.Tensor, # Already have rotary embeddings. (bs, seqlen, n_local_heads, head_dim) + k: torch.Tensor, # Already have rotary embeddings. (bs, seqlen, n_local_kv_heads, head_dim) + v: torch.Tensor, # (bs, seqlen, n_local_kv_heads, head_dim) bsz, seqlen, mask: torch.Tensor, @@ -449,6 +458,7 @@ def __init__(self, params: ModelArgs): self.use_kv_cache = params.use_kv_cache self.generate_full_logits = params.generate_full_logits self.max_seq_len = params.max_seq_len + self.output_prune_map = params.output_prune_map if params.use_hf_rope: self.precompute_freqs_cis = hf_precompute_freqs_cis else: @@ -525,4 +535,27 @@ def forward( h = self.norm(h) logits = self.output(h) + + if self.output_prune_map is not None: + # expand to original size so that downstream applications can use the logits as-is. 
+ if self.generate_full_logits: + # (1, seq_len, pruned_size) -> (1, seq_len, original_size) + expanded_logits = torch.full( + [logits.shape[0], logits.shape[1], self.vocab_size], + float("-inf"), + device=logits.device, + dtype=logits.dtype, + ) + expanded_logits[:, :, list(self.output_prune_map.values())] = logits + else: + # (1, pruned_size) -> (1, original_size) + expanded_logits = torch.full( + [logits.shape[0], self.vocab_size], + float("-inf"), + device=logits.device, + dtype=logits.dtype, + ) + expanded_logits[:, list(self.output_prune_map.values())] = logits + logits = expanded_logits + return logits diff --git a/examples/models/llama2/llama_via_xnnpack.gif b/examples/models/llama2/llama_via_xnnpack.gif new file mode 100644 index 00000000000..046011f5f87 Binary files /dev/null and b/examples/models/llama2/llama_via_xnnpack.gif differ diff --git a/examples/models/llama2/main.cpp b/examples/models/llama2/main.cpp index 1e1c3a10b3b..339b2abfdb4 100644 --- a/examples/models/llama2/main.cpp +++ b/examples/models/llama2/main.cpp @@ -39,6 +39,8 @@ DEFINE_int32( -1, "Number of CPU threads for inference. Defaults to -1, which implies we'll use a heuristic to derive the # of performant cores for a specific device."); +DEFINE_bool(warmup, false, "Whether to run a warmup run."); + int32_t main(int32_t argc, char** argv) { gflags::ParseCommandLineFlags(&argc, &argv, true); @@ -57,20 +59,25 @@ int32_t main(int32_t argc, char** argv) { int32_t cpu_threads = FLAGS_cpu_threads; + bool warmup = FLAGS_warmup; + #if defined(ET_USE_THREADPOOL) uint32_t num_performant_cores = cpu_threads == -1 - ? torch::executorch::cpuinfo::get_num_performant_cores() + ? ::executorch::extension::cpuinfo::get_num_performant_cores() : static_cast(cpu_threads); ET_LOG( Info, "Resetting threadpool with num threads = %d", num_performant_cores); if (num_performant_cores > 0) { - torch::executorch::threadpool::get_threadpool()->_unsafe_reset_threadpool( - num_performant_cores); + ::executorch::extension::threadpool::get_threadpool() + ->_unsafe_reset_threadpool(num_performant_cores); } #endif // create llama runner - ::torch::executor::Runner runner(model_path, tokenizer_path, temperature); + example::Runner runner(model_path, tokenizer_path, temperature); + if (warmup) { + runner.warmup(prompt, seq_len); + } // generate runner.generate(prompt, seq_len); diff --git a/examples/models/llama2/model.py b/examples/models/llama2/model.py index 174f562f93a..d8d0ff00ffa 100644 --- a/examples/models/llama2/model.py +++ b/examples/models/llama2/model.py @@ -63,6 +63,7 @@ def __init__(self, **kwargs): self.use_sdpa_with_kv_cache_op = kwargs.get("use_sdpa_with_kv_cache", False) self.generate_full_logits = kwargs.get("generate_full_logits", False) self.enable_dynamic_shape = kwargs.get("enable_dynamic_shape", False) + self.output_prune_map_path = kwargs.get("output_prune_map_path", None) self.max_seq_len = kwargs.get("max_seq_len", 128) self.args = kwargs.get("args", None) @@ -141,6 +142,12 @@ def __init__(self, **kwargs): ) with open(params_path, "r") as f: params = json.loads(f.read()) + output_prune_map = None + if self.output_prune_map_path is not None: + with open(self.output_prune_map_path, "r") as f: + output_prune_map = json.load(f) + # change keys from string to int (json only supports string keys) + output_prune_map = {int(k): v for (k, v) in output_prune_map.items()} max_seq_len = self.max_seq_len max_batch_size = 1 model_args: ModelArgs = ModelArgs( @@ -149,6 +156,7 @@ def __init__(self, **kwargs): 
use_kv_cache=self.use_kv_cache, use_sdpa_with_kv_cache_op=self.use_sdpa_with_kv_cache_op, generate_full_logits=self.generate_full_logits, + output_prune_map=output_prune_map, enable_dynamic_shape=self.enable_dynamic_shape, **params, ) @@ -183,37 +191,33 @@ def __init__(self, **kwargs): ) elif hasattr(self.args, "use_spin_quant") and self.args.use_spin_quant: print("Using SPIN quantization.") - assert hasattr(self.args, "group_size"), "group_size must be specified" - assert hasattr( - self.args, "quantization_mode" - ), "quantization_mode must be specified" - assert hasattr( - self.args, "dtype_override" - ), "dtype_override must be specified" - from .source_transformation.spin_quant import ( - sanitize_checkpoint_from_spinquant, - transform_for_spinquant, + self._transform_for_pre_quantization(checkpoint) + + from .source_transformation.pre_quantization import ( + sanitize_checkpoint_from_pre_quantization, ) - mapping = { - "fp32": torch.float32, - "fp16": torch.float16, - "bf16": torch.bfloat16, - } + sanitize_checkpoint_from_pre_quantization(checkpoint) + elif hasattr(self.args, "use_qat") and self.args.use_qat: + print("Using QAT quantization.") + self._transform_for_pre_quantization(checkpoint) + if hasattr(self.args, "use_lora") and self.args.use_lora: + from .source_transformation.lora import ( + transform_linear_for_lora_after_quantization, + ) - self.model_ = transform_for_spinquant( - self.model_, - checkpoint, - self.args.group_size, - self.args.quantization_mode, - mapping[self.args.dtype_override], - ) + self.model_ = transform_linear_for_lora_after_quantization( + self.model_, + checkpoint, + self.args.use_lora, + ) - sanitize_checkpoint_from_spinquant( - checkpoint, - self.args.group_size, + from .source_transformation.pre_quantization import ( + sanitize_checkpoint_from_pre_quantization, ) + sanitize_checkpoint_from_pre_quantization(checkpoint) + # assign=True: load params/buffers by assignment instead of performing an in-place copy. # Because we are using device="meta", tensors do not have memory associated with them # and an in-place copy is a no-op. Use assign=True in load_state_dict for this scenario. @@ -230,6 +234,12 @@ def __init__(self, **kwargs): print(unexpected) print("============= /unexpected ================") + # prune the output layer if output_prune_map is provided + if output_prune_map is not None: + from .source_transformation.prune_output import prune_output_vocab + + self.model_ = prune_output_vocab(self.model_, output_prune_map) + def get_eager_model(self): if self.dtype: # convert to the type of the provided checkpoint @@ -266,3 +276,68 @@ def get_example_inputs_kvcache_sdpa(self): [0], dtype=torch.long ), # start_pos, what token of output are we on. ) + + def _transform_for_pre_quantization(self, checkpoint): + assert hasattr(self.args, "preq_mode"), "preq_mode must be specified" + assert self.args.preq_mode in [ + "8da4w", + "8da4w_output_8da8w", + ], f"Quantization mode {self.args.preq_mode} is not compatible with SpinQuant." + assert hasattr( + self.args, "preq_group_size" + ), "preq_group_size must be specified" + assert hasattr(self.args, "dtype_override"), "dtype_override must be specified" + from .source_transformation.pre_quantization import ( + transform_linear_for_pre_quantization, + ) + + mapping = { + "fp32": torch.float32, + "fp16": torch.float16, + "bf16": torch.bfloat16, + } + + # Transform the output layer first if needed. 
+ if self.args.preq_mode == "8da4w_output_8da8w": + from .source_transformation.pre_quantization import ( + transform_output_linear_for_pre_quantization, + ) + + self.model_ = transform_output_linear_for_pre_quantization( + module=self.model_, + checkpoint=checkpoint, + dtype=mapping[self.args.dtype_override], + ) + + self.model_ = transform_linear_for_pre_quantization( + self.model_, + checkpoint, + self.args.preq_group_size, + mapping[self.args.dtype_override], + ) + + embedding_bit_width, embedding_group_size = None, None + if hasattr(self.args, "preq_embedding_quantize"): + embedding_bit_width, embedding_group_size = ( + self.args.preq_embedding_quantize.split(",") + ) + from .source_transformation.pre_quantization import ( + transform_embedding_for_pre_quantization, + ) + + if ( + embedding_group_size == "none" + or embedding_group_size == "None" + or embedding_group_size == "0" + ): + embedding_group_size = None + else: + embedding_group_size = int(embedding_group_size) + + self.model_ = transform_embedding_for_pre_quantization( + self.model_, + checkpoint, + mapping[self.args.dtype_override], + int(embedding_bit_width), + embedding_group_size, + ) diff --git a/examples/models/llama2/runner/runner.cpp b/examples/models/llama2/runner/runner.cpp index 1e17c754007..499bfbedf15 100644 --- a/examples/models/llama2/runner/runner.cpp +++ b/examples/models/llama2/runner/runner.cpp @@ -18,7 +18,14 @@ #include #include -namespace torch::executor { +namespace example { + +using ::executorch::extension::Module; +using ::executorch::runtime::Error; +using ::executorch::runtime::Result; + +namespace llm = ::executorch::extension::llm; + namespace { static constexpr auto kAppendEosToPrompt = "append_eos_to_prompt"; static constexpr auto kEnableDynamicShape = "enable_dynamic_shape"; @@ -80,7 +87,7 @@ Error Runner::load() { "Failed to load %s as a Tiktoken artifact, trying BPE tokenizer", tokenizer_path_.c_str()); tokenizer_.reset(); - tokenizer_ = std::make_unique(); + tokenizer_ = std::make_unique(); tokenizer_->load(tokenizer_path_); } @@ -119,17 +126,17 @@ Error Runner::load() { ET_LOG(Info, "eos_id = %" PRId64, value); } } - text_decoder_runner_ = std::make_unique( + text_decoder_runner_ = std::make_unique( module_.get(), metadata_.at(kUseKVCache), metadata_.at(kVocabSize), temperature_); - text_prefiller_ = std::make_unique( + text_prefiller_ = std::make_unique( text_decoder_runner_.get(), metadata_.at(kUseKVCache), metadata_.at(kEnableDynamicShape)); - text_token_generator_ = std::make_unique( + text_token_generator_ = std::make_unique( tokenizer_.get(), text_decoder_runner_.get(), metadata_.at(kUseKVCache), @@ -139,31 +146,46 @@ Error Runner::load() { return Error::Ok; } +// Don't print with the same priority during warmup +#define RUNNER_ET_LOG(warmup, format, ...) \ + if (warmup) { \ + ET_LOG(Debug, format, __VA_ARGS__); \ + } else { \ + ET_LOG(Info, format, __VA_ARGS__); \ + } + Error Runner::generate( const std::string& prompt, int32_t seq_len, std::function token_callback, - std::function stats_callback, - bool echo) { + std::function stats_callback, + bool echo, + bool warmup) { // Prepare the inputs. // Use ones-initialized inputs. 
ET_CHECK_MSG(!prompt.empty(), "Prompt cannot be null"); if (!is_loaded()) { - stats_.model_load_start_ms = util::time_in_ms(); + stats_.model_load_start_ms = llm::time_in_ms(); ET_CHECK_OK_OR_RETURN_ERROR(load()); - stats_.model_load_end_ms = util::time_in_ms(); + stats_.model_load_end_ms = llm::time_in_ms(); } - ET_LOG( - Info, + if (warmup) { + ET_LOG(Info, "Doing a warmup run..."); + } + + RUNNER_ET_LOG( + warmup, "RSS after loading model: %f MiB (0 if unsupported)", - util::get_rss_bytes() / 1024.0 / 1024.0); + llm::get_rss_bytes() / 1024.0 / 1024.0); // Wrap the token_callback with print function std::function wrapped_callback = - [token_callback](const std::string& piece) { - util::safe_printf(piece.c_str()); - fflush(stdout); + [token_callback, warmup](const std::string& piece) { + if (!warmup) { + llm::safe_printf(piece.c_str()); + fflush(stdout); + } if (token_callback) { token_callback(piece); } @@ -171,7 +193,7 @@ Error Runner::generate( // First token time only measures the time it takes to encode the prompt and // return a response token. - stats_.inference_start_ms = util::time_in_ms(); + stats_.inference_start_ms = llm::time_in_ms(); shouldStop_ = false; // Set the sequence length to the max seq length if not provided @@ -214,37 +236,45 @@ Error Runner::generate( } int64_t pos = 0; auto prefill_res = text_prefiller_->prefill(prompt_tokens, pos); - stats_.first_token_ms = util::time_in_ms(); - stats_.prompt_eval_end_ms = util::time_in_ms(); + stats_.first_token_ms = llm::time_in_ms(); + stats_.prompt_eval_end_ms = llm::time_in_ms(); ET_CHECK_OK_OR_RETURN_ERROR(prefill_res.error()); uint64_t cur_token = prefill_res.get(); // print the first token from prefill. No prev_token so use cur_token for it. wrapped_callback(ET_UNWRAP(tokenizer_->decode(cur_token, cur_token))); - ET_LOG( - Info, + RUNNER_ET_LOG( + warmup, "RSS after prompt prefill: %f MiB (0 if unsupported)", - util::get_rss_bytes() / 1024.0 / 1024.0); + llm::get_rss_bytes() / 1024.0 / 1024.0); // start the main loop prompt_tokens.push_back(cur_token); int64_t num_generated_tokens = ET_UNWRAP(text_token_generator_->generate( prompt_tokens, num_prompt_tokens, seq_len, wrapped_callback)); - stats_.inference_end_ms = util::time_in_ms(); - printf("\n"); - ET_LOG( - Info, + stats_.inference_end_ms = llm::time_in_ms(); + if (!warmup) { + printf("\n"); + } + RUNNER_ET_LOG( + warmup, "RSS after finishing text generation: %f MiB (0 if unsupported)", - util::get_rss_bytes() / 1024.0 / 1024.0); + llm::get_rss_bytes() / 1024.0 / 1024.0); if (num_prompt_tokens + num_generated_tokens == seq_len) { - ET_LOG(Info, "Sequence length (%i tokens) reached!", seq_len); + RUNNER_ET_LOG(warmup, "Sequence length (%i tokens) reached!", seq_len); } stats_.num_prompt_tokens = num_prompt_tokens; stats_.num_generated_tokens = num_generated_tokens; - ::executorch::llm::print_report(stats_); + + if (warmup) { + ET_LOG(Info, "Warmup run finished!"); + } else { + // Do not print report during warmup + ::executorch::llm::print_report(stats_); + } if (stats_callback) { stats_callback(stats_); } @@ -252,6 +282,18 @@ Error Runner::generate( return Error::Ok; } +Error Runner::warmup(const std::string& prompt, int32_t seq_len) { + Error err = generate( + prompt, + seq_len, + /*token_callback=*/nullptr, + /*stats_callbak=*/nullptr, + /*echo=*/false, + /*warmup=*/true); + stats_.reset(); + return err; +} + void Runner::stop() { if (is_loaded()) { text_token_generator_->stop(); @@ -259,4 +301,4 @@ void Runner::stop() { ET_LOG(Error, "Token generator is not 
loaded, cannot stop"); } } -} // namespace torch::executor +} // namespace example diff --git a/examples/models/llama2/runner/runner.h b/examples/models/llama2/runner/runner.h index cec8c61157f..ca843427a7b 100644 --- a/examples/models/llama2/runner/runner.h +++ b/examples/models/llama2/runner/runner.h @@ -24,8 +24,7 @@ #include #include -namespace torch::executor { -using Stats = ::executorch::llm::Stats; +namespace example { class Runner { public: @@ -35,13 +34,18 @@ class Runner { const float temperature = 0.8f); bool is_loaded() const; - Error load(); - Error generate( + ::executorch::runtime::Error load(); + ::executorch::runtime::Error generate( const std::string& prompt, int32_t seq_len = 128, std::function token_callback = {}, - std::function stats_callback = {}, - bool echo = true); + std::function + stats_callback = {}, + bool echo = true, + bool warming = false); + ::executorch::runtime::Error warmup( + const std::string& prompt, + int32_t seq_len = 128); void stop(); private: @@ -49,16 +53,18 @@ class Runner { bool shouldStop_{false}; // model - std::unique_ptr module_; + std::unique_ptr<::executorch::extension::Module> module_; std::string tokenizer_path_; - std::unique_ptr tokenizer_; + std::unique_ptr<::executorch::extension::llm::Tokenizer> tokenizer_; std::unordered_map metadata_; - std::unique_ptr text_decoder_runner_; - std::unique_ptr text_prefiller_; - std::unique_ptr text_token_generator_; + std::unique_ptr<::executorch::extension::llm::TextDecoderRunner> + text_decoder_runner_; + std::unique_ptr<::executorch::extension::llm::TextPrefiller> text_prefiller_; + std::unique_ptr<::executorch::extension::llm::TextTokenGenerator> + text_token_generator_; // stats - Stats stats_; + ::executorch::extension::llm::Stats stats_; }; -} // namespace torch::executor +} // namespace example diff --git a/examples/models/llama2/source_transformation/lora.py b/examples/models/llama2/source_transformation/lora.py new file mode 100644 index 00000000000..11fcba76c77 --- /dev/null +++ b/examples/models/llama2/source_transformation/lora.py @@ -0,0 +1,141 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# pyre-unsafe + +# Helper functions for tranforming the model to be able to load checkpoints with +# LoRA adaptors. See https://arxiv.org/abs/2106.09685 for more details about LoRA. + +from typing import Any + +import torch +from torch import nn +from torchao.quantization.GPTQ import Int8DynActInt4WeightLinear +from torchao.quantization.quant_api import _replace_with_custom_fn_if_matches_filter + + +class LoRAAdaptorLinear(nn.Module): + """ + LoRA adaptor for linear layers. + + This class implements Low-Rank Adaptation(LoRA) for linear layers. + See more details about LoRA here https://arxiv.org/abs/2106.09685. + """ + + def __init__( + self, + in_features: int, + out_features: int, + rank: int, + scale: float = 2.0, + dtype=torch.float32, + device=None, + ) -> None: + super().__init__() + self.scale = scale + self.A = nn.Linear(in_features, rank, bias=False, dtype=dtype, device=device) + self.B = nn.Linear(rank, out_features, bias=False, dtype=dtype, device=device) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + return self.scale * self.B(self.A(x)) # pyre-ignore[7] + + +class Int8DynActInt4WeightLinearLoRA(Int8DynActInt4WeightLinear): + """ + Int8DynActInt4WeightLinear with LoRA adaptor. 
+ """ + + def __init__( + self, + in_features: int, + out_features: int, + lora_rank: int, + bias=True, + device=None, + groupsize: int = 256, + precision: torch.dtype = torch.float32, + scales_precision: torch.dtype = torch.float32, + lora_adaptor_precision: torch.dtype = torch.bfloat16, + lora_scale: float = 2.0, + ) -> None: + super().__init__( + in_features, + out_features, + bias=bias, + device=device, + groupsize=groupsize, + precision=precision, + scales_precision=scales_precision, + ) + self.adaptor = LoRAAdaptorLinear( + in_features, + out_features, + lora_rank, + scale=lora_scale, + dtype=lora_adaptor_precision, + device=device, + ) + + def forward(self, input: torch.Tensor) -> torch.Tensor: + return super().forward(input) + self.adaptor(input).to(dtype=self.precision) + + +def _replace_linear_8da4w_for_lora( + module: torch.nn.Module, + checkpoint: Any, + lora_rank: int, +): + def filter_fn(child: torch.nn.Module, cur_fqn: str) -> bool: + # Only replace linear layers where the checkpoint contains explicit adaptors + adaptor_A_key = f"{cur_fqn}.adaptor.A.weight" + adaptor_B_key = f"{cur_fqn}.adaptor.B.weight" + if ( + isinstance(child, Int8DynActInt4WeightLinear) + and adaptor_A_key in checkpoint + and adaptor_B_key in checkpoint + ): + assert checkpoint[adaptor_A_key].dtype == torch.bfloat16 + assert checkpoint[adaptor_A_key].shape[0] == lora_rank + assert checkpoint[adaptor_A_key].shape[1] == child.in_features + assert checkpoint[adaptor_B_key].dtype == torch.bfloat16 + assert checkpoint[adaptor_B_key].shape[0] == child.out_features + assert checkpoint[adaptor_B_key].shape[1] == lora_rank + return True + return False + + def replacement_fn(child: torch.nn.Module) -> torch.nn.Module: + new_linear = Int8DynActInt4WeightLinearLoRA( + child.in_features, + child.out_features, + lora_rank=lora_rank, + bias=False, + device=child.weight.device, + groupsize=child.groupsize, + precision=child.precision, + scales_precision=child.scales.dtype, + ) + return new_linear + + _replace_with_custom_fn_if_matches_filter(module, replacement_fn, filter_fn) + + +def transform_linear_for_lora_after_quantization( + module: torch.nn.Module, + checkpoint: Any, + lora_rank: int, +) -> torch.nn.Module: + """ + Transform the model to be able to load checkpoints with LoRA adaptors. + The model should be already transformed to be able to load pre-quantized + checkpoints. The checkpoint should have been pre-quantized and added with + LoRA adaptors. + """ + _replace_linear_8da4w_for_lora( + module, + checkpoint, + lora_rank, + ) + return module diff --git a/examples/models/llama2/source_transformation/pre_quantization.py b/examples/models/llama2/source_transformation/pre_quantization.py new file mode 100644 index 00000000000..38937c5ab4e --- /dev/null +++ b/examples/models/llama2/source_transformation/pre_quantization.py @@ -0,0 +1,191 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# pyre-unsafe + +# Helper functions for tranforming the model to be able to load pre-quantized checkpoints. 
+ +from typing import Any, Optional + +import torch +from torch import nn + +from torchao.quantization.GPTQ import _check_linear_int4_k, Int8DynActInt4WeightLinear +from torchao.quantization.quant_api import _replace_with_custom_fn_if_matches_filter + +from .quantize import Int8DynActInt8WeightLinear, QuantizedGroupEmbedding + + +def _replace_linear_with_linear_8da4w_for_pre_quantization( + module: torch.nn.Module, + checkpoint: Any, + group_size: int, + precision: torch.dtype, + scales_precision: torch.dtype, +): + def filter_fn(child: torch.nn.Module, cur_fqn: str) -> bool: + # Only replace linear layers where the checkpoint contains explicit scales + scales_key = f"{cur_fqn}.scales" + if isinstance(child, nn.Linear) and scales_key in checkpoint: + assert _check_linear_int4_k(child.in_features, group_size) + assert checkpoint[f"{cur_fqn}.weight"].dtype == torch.int8 + assert checkpoint[scales_key].dtype == scales_precision + return True + return False + + def replacement_fn(child: torch.nn.Module) -> torch.nn.Module: + new_linear = Int8DynActInt4WeightLinear( + child.in_features, + child.out_features, + bias=False, + device=child.weight.device, + groupsize=group_size, + precision=precision, + scales_precision=scales_precision, + ) + return new_linear + + _replace_with_custom_fn_if_matches_filter(module, replacement_fn, filter_fn) + + +def transform_linear_for_pre_quantization( + module: torch.nn.Module, + checkpoint: Any, + group_size: int, + dtype: torch.dtype, +) -> torch.nn.Module: + """ + Transform the model to be able to load pre-quantized checkpoints that + are quantized with the given group size and quantization mode for + linear layers. + """ + + if group_size not in [32, 64, 128, 256]: + raise ValueError( + f"Group size {group_size} is not supported for pre-quantized checkpoint." + ) + _replace_linear_with_linear_8da4w_for_pre_quantization( + module, + checkpoint, + group_size, + dtype, + dtype, + ) + return module + + +def _replace_output_linear_with_linear_int8_for_pre_quantization( + module: torch.nn.Module, + checkpoint: Any, + dtype: torch.dtype, +): + def filter_fn(child: torch.nn.Module, cur_fqn: str) -> bool: + scales_key = f"{cur_fqn}.scales" + if ( + isinstance(child, nn.Linear) + and scales_key in checkpoint + and "output" in cur_fqn + ): + assert checkpoint[f"{cur_fqn}.weight"].dtype == torch.int8 + assert checkpoint[scales_key].dtype == dtype + return True + return False + + def replacement_fn(child: torch.nn.Module) -> torch.nn.Module: + new_linear = Int8DynActInt8WeightLinear( + device=child.weight.device, + in_features=child.in_features, + out_features=child.out_features, + precision=dtype, + bias=False, + ) + return new_linear + + _replace_with_custom_fn_if_matches_filter(module, replacement_fn, filter_fn) + + +def transform_output_linear_for_pre_quantization( + module: torch.nn.Module, + checkpoint: Any, + dtype: torch.dtype, +) -> torch.nn.Module: + """ + Transform the model to be able to load pre-quantized checkpoints that + has the output layer quantized per-channel. 
+ """ + _replace_output_linear_with_linear_int8_for_pre_quantization( + module, + checkpoint, + dtype, + ) + return module + + +def _replace_embedding_with_quantized_group_embedding_for_pre_quantization( + module: torch.nn.Module, + checkpoint: Any, + dtype: torch.dtype, + bit_width: int, + group_size: Optional[int] = None, +): + def filter_fn(child: torch.nn.Module, cur_fqn: str) -> bool: + # Only replace embedding layers where the checkpoint contains explicit scales + scales_key = f"{cur_fqn}.scales" + if isinstance(child, nn.Embedding) and scales_key in checkpoint: + assert checkpoint[f"{cur_fqn}.weight"].dtype == torch.int8 + assert checkpoint[scales_key].dtype == torch.float32 + return True + return False + + def replacement_fn(child: torch.nn.Module) -> torch.nn.Module: + new_embedding = QuantizedGroupEmbedding( + device=child.weight.device, + vocab_size=child.weight.shape[0], + embedding_dim=child.weight.shape[1], + group_size=group_size, + dtype=dtype, + packed=False, # TODO(lunwenh): support packed embedding for pre-quantized + ) + return new_embedding + + _replace_with_custom_fn_if_matches_filter(module, replacement_fn, filter_fn) + + +def transform_embedding_for_pre_quantization( + module: torch.nn.Module, + checkpoint: Any, + dtype: torch.dtype, + bit_width: int, + group_size: Optional[int] = None, +) -> torch.nn.Module: + """ + Transform the model to be able to load pre-quantized checkpoints that + are quantized with the given bit_width and group size for embedding. + """ + if group_size is not None and group_size not in [0, 32, 64, 128, 256]: + raise ValueError( + f"Group size {group_size} is not supported for pre-quantized checkpoint." + ) + _replace_embedding_with_quantized_group_embedding_for_pre_quantization( + module, + checkpoint, + dtype, + bit_width, + group_size, + ) + return module + + +def sanitize_checkpoint_from_pre_quantization( + checkpoint: Any, +): + """ + Sanitize the pre-quantized checkpoint. + - Converts all tensors to contiguous format + - Squeeze all tensors + """ + for k, v in checkpoint.items(): + checkpoint[k] = torch.squeeze(v.contiguous()) diff --git a/examples/models/llama2/source_transformation/prune_output.py b/examples/models/llama2/source_transformation/prune_output.py new file mode 100644 index 00000000000..6d02d52fa5c --- /dev/null +++ b/examples/models/llama2/source_transformation/prune_output.py @@ -0,0 +1,71 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +from typing import Dict + +import numpy as np + +import torch + + +def prune_output_vocab( + model: torch.nn.Module, + token_map: Dict[int, int], + output_layer_name: str = "output", +) -> torch.nn.Module: + """Prune the model output linear layer while keeping the tokens in the token map. + + Note: Pruning is performed in-place. + + Args: + model: The model to prune. + token_map: A dictionary mapping from new token ids to the old token ids to preserve. + e.g. {0: 221, 1: 1325, 2: 1542, 3: 1728, 4: 18243} + output_layer_name: name of the output layer to prune + + Returns: + The pruned model. 
+ """ + assert hasattr( + model, output_layer_name + ), f"Model does not have {output_layer_name} layer" + output_layer = getattr(model, output_layer_name) + assert isinstance( + output_layer, torch.nn.Linear + ), "Output layer is not a linear layer" + original_shape = output_layer.weight.shape + input_features = original_shape[1] + num_pruned_tokens = len(token_map) + has_bias = output_layer.bias is not None + weight_dtype = output_layer.weight.dtype + pruned_layer = torch.nn.Linear(input_features, num_pruned_tokens, bias=has_bias) + pruned_layer.to(dtype=weight_dtype) + pruned_layer_weights = np.zeros(pruned_layer.weight.shape, dtype=np.float32) + pruned_layer_bias = None + if has_bias: + pruned_layer_bias = np.zeros(pruned_layer.bias.shape, dtype=np.float32) + for i, token_id in token_map.items(): + # Copy the weights and biases from the original layer to the pruned layer + pruned_wt = output_layer.weight[token_id].detach() + if weight_dtype == torch.bfloat16: + pruned_wt = pruned_wt.float() + pruned_layer_weights[i] = pruned_wt.numpy() + if has_bias: + pruned_bias = output_layer.bias[token_id].detach() + if weight_dtype == torch.bfloat16: + pruned_bias = pruned_bias.float() + pruned_layer_bias[i] = pruned_bias.numpy() + with torch.no_grad(): + pruned_layer.weight.copy_( + torch.tensor(pruned_layer_weights, dtype=weight_dtype) + ) + if has_bias: + pruned_layer.bias.copy_(torch.tensor(pruned_layer_bias, dtype=weight_dtype)) + + # Replace the original layer with the pruned layer + setattr(model, output_layer_name, pruned_layer) + + return model diff --git a/examples/models/llama2/source_transformation/quantize.py b/examples/models/llama2/source_transformation/quantize.py index da832f8285a..7ef51ac93c0 100644 --- a/examples/models/llama2/source_transformation/quantize.py +++ b/examples/models/llama2/source_transformation/quantize.py @@ -379,6 +379,99 @@ def forward(self, input: torch.Tensor) -> torch.Tensor: # return F.linear(input, self.weight.to(dtype=input.dtype)) * se... +def linear_forward_8da8w( + x, + weight_int8, + scales, + zeros, + out_features, + precision, +): + from torchao.quantization.utils import per_token_dynamic_quant + + x = per_token_dynamic_quant(x) + n_bit = 8 + quant_min = -(2 ** (n_bit - 1)) + quant_max = 2 ** (n_bit - 1) - 1 + w_dq = torch.ops.quantized_decomposed.dequantize_per_channel( + weight_int8, + scales, + zeros, + 0, + quant_min, + quant_max, + torch.int8, + out_dtype=precision, + ) + c = torch.nn.functional.linear(x, w_dq) + + return c + + +class Int8DynActInt8WeightLinear(torch.nn.Module): + __constants__ = ["in_features", "out_features"] + + in_features: int + out_features: int + weight: torch.Tensor + + """ + This module implements a dynamic quantized linear layer with int8 weight. + Weights are per channel quantized. Parameters of importance + precision: precision of input and output. e.g. torch.float32 means input + activation is float32 and output is float32. 
+ """ + + def __init__( + self, + in_features: int, + out_features: int, + bias=True, + device=None, + dtype=None, + precision: torch.dtype = torch.float32, + ) -> None: + super().__init__() + self.in_features = in_features + self.out_features = out_features + assert not bias, "require bias=False" + self.precision = precision + + if dtype is not None: + raise ValueError("Please specify 'precision' instead of 'dtype'") + + # currently storing unpacked int8 weights + self.register_buffer( + "weight", + torch.empty((out_features, in_features), dtype=torch.int8), + ) + self.register_buffer( + "scales", + torch.empty( + (out_features), + dtype=torch.float32, + ), + ) + self.register_buffer( + "zeros", + torch.empty( + (out_features), + dtype=torch.float32, + ), + ) + + def forward(self, input: torch.Tensor) -> torch.Tensor: + input = input.to(self.precision) + return linear_forward_8da8w( + input, + self.weight, + self.scales, + self.zeros, + self.out_features, + self.precision, + ) + + ######################################################################### ##### embedding table quantization ###### diff --git a/examples/models/llama2/source_transformation/quantized_kv_cache.py b/examples/models/llama2/source_transformation/quantized_kv_cache.py new file mode 100644 index 00000000000..8eec7846d3c --- /dev/null +++ b/examples/models/llama2/source_transformation/quantized_kv_cache.py @@ -0,0 +1,224 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import logging +from enum import Enum + +import torch +import torch.nn as nn +from executorch.examples.models.llama2.llama_transformer import KVCache +from torch.ao.quantization.fx._decomposed import quantized_decomposed_lib # noqa: F401 + + +""" + Heavily "inspired" by AO's implementation of the same in torchao/_models/llama/model.py +""" + + +# Doesnt have to abide by affine quantizaiton laws +# However, if we do implement quantized sdpa, then this might be handy +class QuantizedCacheType(Enum): + AffineSymmetric = 0 + AffineAsymmetric = 1 + AffineSymmetricGroupWise = 2 + AffineAsymmetricGroupWise = 3 + + +class QuantizedKVCache(nn.Module): + def __init__( + self, + max_batch_size, + max_seq_length, + n_heads, + head_dim, + cache_type: QuantizedCacheType = QuantizedCacheType.AffineSymmetric, + tranposed=False, + enable_dynamic_shape=False, + ): + super().__init__() + if cache_type not in ( + QuantizedCacheType.AffineSymmetric, + QuantizedCacheType.AffineAsymmetric, + ): + + raise ValueError( + f"Only affine symmetric and asymmetric cache types are supported: got {cache_type}" + ) + + # For now supporting int8 only + self.quantized_cache_dtype = torch.int8 + self.cache_fp_type = torch.float32 + self.is_transposed = tranposed + self.enable_dynamic_shape = enable_dynamic_shape + if self.is_transposed: + cache_shape = (max_batch_size, n_heads, max_seq_length, head_dim) + scale_shape = (max_batch_size, n_heads, max_seq_length, 1) + else: + cache_shape = (max_batch_size, max_seq_length, n_heads, head_dim) + scale_shape = (max_batch_size, max_seq_length, n_heads, 1) + self.register_buffer( + "k_cache", torch.zeros(cache_shape, dtype=self.quantized_cache_dtype) + ) + self.register_buffer( + "v_cache", torch.zeros(cache_shape, dtype=self.quantized_cache_dtype) + ) + self.register_buffer( + "k_cache_scales", torch.ones(scale_shape, dtype=torch.float64) + ) + self.register_buffer( + 
"v_cache_scales", torch.ones(scale_shape, dtype=torch.float64) + ) + if cache_type == QuantizedCacheType.AffineAsymmetric: + self.register_buffer( + "k_cache_zero_points", torch.ones(scale_shape, dtype=torch.int64) + ) + self.register_buffer( + "v_cache_zero_points", torch.ones(scale_shape, dtype=torch.int64) + ) + + def _quantize(self, value): + scales, zero_points = ( + torch.ops.quantized_decomposed.choose_qparams_per_token_asymmetric.default( + value, self.quantized_cache_dtype + ) + ) + quantized_value = torch.ops.quantized_decomposed.quantize_per_token( + value, + scales, + zero_points, + torch.iinfo(self.quantized_cache_dtype).min, + torch.iinfo(self.quantized_cache_dtype).max, + self.quantized_cache_dtype, + ) + return quantized_value, scales, zero_points + + def update(self, input_pos, k_val, v_val): + # quantize current k_val and store it in the cache + quantized_k_val, k_scales, k_zero_points = self._quantize(k_val) + + quantized_v_val, v_scales, v_zero_points = self._quantize(v_val) + + if self.is_transposed: + # We cannot use update_cache op at the moment + # if the cache is transposed + # Also note that we shold not need separate paths + # for dynamic shape vs ! + # Only reason it is done this way is to accommodate + # for lowering pains of backends that work better + # with index_put op. + if self.enable_dynamic_shape: + start_pos = input_pos[0].item() + torch._check_is_size(start_pos) + dim_to_slice = 2 if self.is_transposed else 1 + torch._check(start_pos < self.k_cache.size(dim_to_slice)) + seq_length = k_val.size(dim_to_slice) + narrowed_k = self.k_cache.narrow(dim_to_slice, start_pos, seq_length) + narrowed_k_scales = self.k_cache_scales.narrow( + dim_to_slice, start_pos, seq_length + ) + narrowed_k_zp = self.k_cache_zero_points.narrow( + dim_to_slice, start_pos, seq_length + ) + narrowed_k.copy_(quantized_k_val) + narrowed_k_scales.copy_(k_scales) + narrowed_k_zp.copy_(k_zero_points) + narrowed_v = self.v_cache.narrow(dim_to_slice, start_pos, seq_length) + narrowed_v_scales = self.v_cache_scales.narrow( + dim_to_slice, start_pos, seq_length + ) + narrowed_v_zp = self.v_cache_zero_points.narrow( + dim_to_slice, start_pos, seq_length + ) + narrowed_v.copy_(quantized_v_val) + narrowed_v_scales.copy_(v_scales) + narrowed_v_zp.copy_(v_zero_points) + else: + self.k_cache[:, :, input_pos] = quantized_k_val + self.k_cache_scales[:, :, input_pos] = k_scales + self.k_cache_zero_points[:, :, input_pos] = k_zero_points + self.v_cache[:, :, input_pos] = quantized_v_val + self.v_cache_scales[:, :, input_pos] = v_scales + self.v_cache_zero_points[:, :, input_pos] = v_zero_points + else: + # Right now using custom ops on this path. + # In future we can update custom op to handle transposed cache + # as well. + # Note that we may have to revert this change if other ET + # backends such as QNN want to use quantized cache, with dynamic shape, + # instead of quantizing on their own. 
+ # But until this opting for code simplicity + start_pos = input_pos[0].item() + _ = torch.ops.llama.update_quantized_cache( + quantized_k_val, self.k_cache, start_pos + ) + _ = torch.ops.llama.update_quantized_cache( + k_scales, self.k_cache_scales, start_pos + ) + _ = torch.ops.llama.update_quantized_cache( + k_zero_points, self.k_cache_zero_points, start_pos + ) + _ = torch.ops.llama.update_quantized_cache( + quantized_v_val, self.v_cache, start_pos + ) + _ = torch.ops.llama.update_quantized_cache( + v_scales, self.v_cache_scales, start_pos + ) + _ = torch.ops.llama.update_quantized_cache( + v_zero_points, self.v_cache_zero_points, start_pos + ) + + k_out = torch.ops.quantized_decomposed.dequantize_per_token( + self.k_cache, + self.k_cache_scales, + self.k_cache_zero_points, + torch.iinfo(self.quantized_cache_dtype).min, + torch.iinfo(self.quantized_cache_dtype).max, + self.quantized_cache_dtype, + self.cache_fp_type, + ) + v_out = torch.ops.quantized_decomposed.dequantize_per_token( + self.v_cache, + self.v_cache_scales, + self.v_cache_zero_points, + torch.iinfo(self.quantized_cache_dtype).min, + torch.iinfo(self.quantized_cache_dtype).max, + self.quantized_cache_dtype, + self.cache_fp_type, + ) + return k_out, v_out + + @classmethod + def from_float(cls, kv_cache, cache_type: QuantizedCacheType): + cache_shape = kv_cache.k_cache.shape + if kv_cache.is_tranposed: + max_batch_size, n_heads, max_seq_length, head_dim = cache_shape + else: + max_batch_size, max_seq_length, n_heads, head_dim = cache_shape + return cls( + max_batch_size, + max_seq_length, + n_heads, + head_dim, + cache_type, + kv_cache.is_tranposed, + kv_cache.enable_dynamic_shape, + ) + + +def replace_kv_cache_with_quantized_kv_cache(module): + logging.warning( + "Replacing KVCache with QuantizedKVCache. This modifies the model in place." + ) + for name, child in module.named_children(): + if isinstance(child, KVCache): + setattr( + module, + name, + QuantizedKVCache.from_float(child, QuantizedCacheType.AffineAsymmetric), + ) + else: + replace_kv_cache_with_quantized_kv_cache(child) + return module diff --git a/examples/models/llama2/source_transformation/rope.py b/examples/models/llama2/source_transformation/rope.py index 7061636f0c6..a2a2264b247 100644 --- a/examples/models/llama2/source_transformation/rope.py +++ b/examples/models/llama2/source_transformation/rope.py @@ -16,11 +16,11 @@ def materialze_broadcast_of_rope_freq_cis( assert module.freqs_cos.dim() == 2 dim0 = module.freqs_cos.size(0) dim1 = module.freqs_cos.size(1) + module_attention = module.layers[0].attention assert ( - module.layers[0].attention.n_local_kv_heads - == module.layers[0].attention.n_local_heads - ), f"For rope freqs to be materialzed for broadcast q, k, v num heads must match. For q got {module.attention.n_kv_heads} for k got {module.attention.n_local_heads} and v got {module.attention.n_local_kv_heads}" - num_heads = module.layers[0].attention.n_local_heads + module_attention.n_local_kv_heads == module_attention.n_local_heads + ), f"For rope freqs to be materialized for broadcast, q, k, v num heads must match. 
For q got {module_attention.n_kv_heads} for k got {module_attention.n_local_heads} and v got {module_attention.n_local_kv_heads}" + num_heads = module_attention.n_local_heads module.freqs_cos = module.freqs_cos.view(dim0, 1, dim1) module.freqs_cos = module.freqs_cos.expand(dim0, num_heads, dim1).contiguous() assert module.freqs_sin.dim() == 2 diff --git a/examples/models/llama2/source_transformation/sdpa.py b/examples/models/llama2/source_transformation/sdpa.py index c48fdf0ae58..bda6966fa16 100644 --- a/examples/models/llama2/source_transformation/sdpa.py +++ b/examples/models/llama2/source_transformation/sdpa.py @@ -9,21 +9,32 @@ # Example script for exporting Llama2 to flatbuffer import math -from typing import Tuple +from typing import Tuple, Union import torch from executorch.examples.models.llama2.llama_transformer import KVCache, SDPA +from executorch.examples.models.llama2.source_transformation.quantized_kv_cache import ( + QuantizedKVCache, +) class SDPACustom(torch.nn.Module): def __init__( self, - kv_cache: KVCache, + kv_cache: Union[KVCache, QuantizedKVCache], dim: int, ): super().__init__() + # Custom op only supports float32 currently. Converting to/from float32 is + # faster than not having the op. self.kv_cache = kv_cache + if not isinstance(kv_cache, QuantizedKVCache): + self.kv_cache = kv_cache.to(torch.float) + else: + assert ( + kv_cache.cache_fp_type == torch.float32 + ), "Only float32 is supported for custom SDPA" self.dim = dim def forward( @@ -36,19 +47,43 @@ def forward( seqlen, mask, ): - output = torch.ops.llama.sdpa_with_kv_cache( - q, - k, - v, - self.kv_cache.k_cache, - self.kv_cache.v_cache, - input_pos[-1].item(), - seqlen, - None, # Attention mask - 0, # dropout probability. Ignored by the code - True, # is_causal - ) - return output.view(bsz, seqlen, self.dim) + # Custom op only supports float32 currently. Converting to/from float32 is + # faster than not having the op. + input_dtype = q.dtype + q = q.to(dtype=torch.float) + k = k.to(dtype=torch.float) + v = v.to(dtype=torch.float) + + k_cache = self.kv_cache.k_cache + v_cache = self.kv_cache.v_cache + if isinstance(self.kv_cache, QuantizedKVCache): + # updated quantize cache, scale and zero points + # returns dequantized kv cache + # Not most optimal. Optimizations to follow next + k_cache, v_cache = self.kv_cache.update(input_pos, k, v) + output = torch.ops.llama.custom_sdpa( + q, + k_cache, + v_cache, + input_pos[0].item(), + None, # Attention mask + 0, # dropout probability. Ignored by the code + True, # is_causal + ) + else: + output = torch.ops.llama.sdpa_with_kv_cache( + q, + k, + v, + k_cache, + v_cache, + input_pos[0].item(), + seqlen, + None, # Attention mask + 0, # dropout probability. 
Ignored by the code + True, # is_causal + ) + return output.view(bsz, seqlen, self.dim).to(dtype=input_dtype) def _replace_sdpa_with_custom_op(module: torch.nn.Module): @@ -195,6 +230,136 @@ def replace_sdpa_with_flex_sdpa(module: torch.nn.Module): return module +@torch.library.custom_op("coreml::sdpa", mutates_args=()) +def sdpa( + q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, attn_mask: torch.Tensor +) -> torch.Tensor: + """Same as F.scaled_dot_product_attention, but with custom op to avoid lowering during dialect conversion.""" + return torch.ops.aten.scaled_dot_product_attention.default( + q, k, v, attn_mask=attn_mask + ) + + +@torch.library.register_fake("coreml::sdpa") +def _( + q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, attn_mask: torch.Tensor +) -> torch.Tensor: + """Fake implementation with the right output shape, which is required for torch.compile/export/fx tracing.""" + expected_shape = list(q.shape) + expected_shape[-1] = v.shape[-1] + return q.new_empty(expected_shape) + + +class SDPACoreML(torch.nn.Module): + """Similar to SDPASimple, but with coreml custom op to do SDPA calculation.""" + + def __init__( + self, + kv_cache: KVCache, + dim: int, + head_dim: int, + n_rep: int, + ): + super().__init__() + self.kv_cache = kv_cache + self.dim = dim + self.head_dim = head_dim + self.n_rep = n_rep + + def forward( + self, + input_pos: torch.Tensor, + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + bsz, + seqlen, + mask, + ): + q = q.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim) + k = k.transpose(1, 2) + v = v.transpose(1, 2) + + k, v = self.kv_cache.update(input_pos, k, v) + attn_mask = mask[None, None, input_pos] + + if self.n_rep > 1: + k = k.repeat_interleave(self.n_rep, dim=1) + v = v.repeat_interleave(self.n_rep, dim=1) + + y = torch.ops.coreml.sdpa(q, k, v, attn_mask) + + return y.transpose(1, 2).contiguous().view(bsz, seqlen, self.dim) + + +def replace_sdpa_with_coreml_sdpa(module: torch.nn.Module): + for name, child in module.named_children(): + if isinstance(child, SDPA): + setattr( + module, + name, + SDPACoreML(child.kv_cache, child.dim, child.head_dim, child.n_rep), + ) + else: + replace_sdpa_with_coreml_sdpa(child) + return module + + +class KVCacheCoreML(torch.nn.Module): + """ + Rather than k_out[:, :, input_pos] = k_val, use torch.ops.aten.index_put_, + which can directly translate to CoreML iOS18.silce_update + """ + + def __init__( + self, + max_batch_size: int, + max_seq_length: int, + n_heads: int, + head_dim: int, + dtype=torch.float32, + ): + super().__init__() + self.max_seq_length = max_seq_length + cache_shape = (max_batch_size, n_heads, max_seq_length, head_dim) + + self.max_batch_size = max_batch_size + self.n_heads = n_heads + self.head_dim = head_dim + self.register_buffer( + "k_cache", torch.zeros(cache_shape, dtype=dtype, device="cpu") + ) + self.register_buffer( + "v_cache", torch.zeros(cache_shape, dtype=dtype, device="cpu") + ) + + def update( + self, input_pos: torch.Tensor, k_val: torch.Tensor, v_val: torch.Tensor + ) -> Tuple[torch.Tensor, torch.Tensor]: + k_out = torch.ops.aten.index_put_(self.k_cache, [None, None, input_pos], k_val) + v_out = torch.ops.aten.index_put_(self.v_cache, [None, None, input_pos], v_val) + return k_out, v_out + + +def replace_kv_cache_with_coreml_kv_cache(module: torch.nn.Module): + for name, child in module.named_children(): + if isinstance(child, KVCache): + setattr( + module, + name, + KVCacheCoreML( + child.max_batch_size, + child.max_seq_length, + child.n_heads, + child.head_dim, + 
child.k_cache.dtype, + ), + ) + else: + replace_kv_cache_with_coreml_kv_cache(child) + return module + + class KVCacheSimple(torch.nn.Module): def __init__( self, diff --git a/examples/models/llama2/source_transformation/spin_quant.py b/examples/models/llama2/source_transformation/spin_quant.py index a45db190f48..f544e9e1f6e 100644 --- a/examples/models/llama2/source_transformation/spin_quant.py +++ b/examples/models/llama2/source_transformation/spin_quant.py @@ -9,7 +9,6 @@ # Helper functions for tranforming the model to be able to run SpinQuant. # See https://github.com/facebookresearch/SpinQuant for more details about SpinQuant. -from typing import Any import torch @@ -17,8 +16,6 @@ from executorch.examples.models.llama2.llama_transformer import FeedForward from torch import nn -from torchao.quantization.GPTQ import _check_linear_int4_k, Int8DynActInt4WeightLinear -from torchao.quantization.quant_api import _replace_with_custom_fn_if_matches_filter def _inject_fast_hadamard_transform_cuda_for_spin_quant(module: torch.nn.Module): @@ -33,7 +30,7 @@ def _inject_fast_hadamard_transform_cuda_for_spin_quant(module: torch.nn.Module) "Please install fast-hadamard-transform: pip install fast-hadamard-transform" ) - class FeedForwardCustom(nn.Module): + class FeedForwardCudaCustom(nn.Module): def __init__(self, w1, w2, w3): super().__init__() self.w1 = w1 @@ -47,7 +44,7 @@ def forward(self, x): for name, child in module.named_children(): if isinstance(child, FeedForward): - setattr(module, name, FeedForwardCustom(child.w1, child.w2, child.w3)) + setattr(module, name, FeedForwardCudaCustom(child.w1, child.w2, child.w3)) else: _inject_fast_hadamard_transform_cuda_for_spin_quant(child) @@ -59,90 +56,33 @@ def inject_fast_hadamard_transform_cuda_for_spin_quant( return module -def _replace_linear_with_linear_8da4w_for_spin_quant( - module: torch.nn.Module, - checkpoint: Any, - group_size: int, - precision: torch.dtype, - scales_precision: torch.dtype, -): - def filter_fn(child: torch.nn.Module, cur_fqn: str) -> bool: - # Only replace linear layers where the checkpoint contains explicit scales - scales_key = f"{cur_fqn}.scale" - if isinstance(child, nn.Linear) and scales_key in checkpoint: - assert _check_linear_int4_k(child.in_features, group_size) - assert checkpoint[f"{cur_fqn}.weight"].dtype == torch.int8 - assert checkpoint[scales_key].dtype == scales_precision - return True - return False - - def replacement_fn(child: torch.nn.Module) -> torch.nn.Module: - new_linear = Int8DynActInt4WeightLinear( - child.in_features, - child.out_features, - bias=False, - device=child.weight.device, - groupsize=group_size, - precision=precision, - scales_precision=scales_precision, - ) - return new_linear +def _inject_fast_hadamard_transform_native_for_spin_quant(module: torch.nn.Module): + """ + SpinQuant needs two Hadmard matrixes: R3 and R4. Here we are only injecting R4 in the feed forward layer. + R3 needs to be injected as well when KV cache quantization is enabled. 
+ """ - _replace_with_custom_fn_if_matches_filter(module, replacement_fn, filter_fn) + class FeedForwardNativeCustom(nn.Module): + def __init__(self, w1, w2, w3): + super().__init__() + self.w1 = w1 + self.w2 = w2 + self.w3 = w3 + def forward(self, x): + return self.w2( + torch.ops.llama.fast_hadamard_transform(F.silu(self.w1(x)) * self.w3(x)) + ) -def transform_for_spinquant( + for name, child in module.named_children(): + if isinstance(child, FeedForward): + setattr(module, name, FeedForwardNativeCustom(child.w1, child.w2, child.w3)) + else: + _inject_fast_hadamard_transform_native_for_spin_quant(child) + + +def inject_fast_hadamard_transform_native_for_spin_quant( module: torch.nn.Module, - checkpoint: Any, - group_size: int, - quantization_mode: str, - dtype: torch.dtype, ) -> torch.nn.Module: - """ - Transform the model to be able to load SpinQuant checkpoints that - are quantized with the given group size and quantization mode. - """ - - if group_size not in [32, 64, 128, 256]: - raise ValueError(f"Group size {group_size} is not supported for SpinQuant.") - if quantization_mode not in ["8da4w"]: - raise ValueError( - f"Quantization mode {quantization_mode} is not compatible with SpinQuant." - ) - _replace_linear_with_linear_8da4w_for_spin_quant( - module, - checkpoint, - group_size, - dtype, - dtype, - ) + _inject_fast_hadamard_transform_native_for_spin_quant(module) return module - - -def sanitize_checkpoint_from_spinquant( - checkpoint: Any, - group_size: int, -): - """ - Sanitize the SpinQuant checkpoint. - - Renames 'scale' to 'scales' - - Groups scales - - Removes 'o_weight' - - Converts all tensors to contiguous format - """ - keys_to_rename = [] - keys_to_remove = [] - for k, _ in checkpoint.items(): - if k.endswith(".scale"): - new_key = k + "s" - keys_to_rename.append((k, new_key)) - if k.endswith(".o_weight"): - keys_to_remove.append(k) - - for old_key, new_key in keys_to_rename: - old_val = checkpoint.pop(old_key) - checkpoint[new_key] = old_val if group_size == -1 else old_val[:, ::group_size] - for k in keys_to_remove: - checkpoint.pop(k) - for k, v in checkpoint.items(): - checkpoint[k] = v.contiguous() diff --git a/examples/models/llama2/source_transformation/test_quantized_kv_cache.py b/examples/models/llama2/source_transformation/test_quantized_kv_cache.py new file mode 100644 index 00000000000..5fa5d1958de --- /dev/null +++ b/examples/models/llama2/source_transformation/test_quantized_kv_cache.py @@ -0,0 +1,128 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +import unittest + +import torch + +from executorch.examples.models.llama2.llama_transformer import KVCache + +from executorch.examples.models.llama2.source_transformation.quantized_kv_cache import ( + QuantizedCacheType, + QuantizedKVCache, +) + + +class QuantizedKVCacheTest(unittest.TestCase): + + def _init_cache(self): + self.kv_cache = KVCache( + self.max_batch_size, + self.max_seq_len, + self.n_kv_heads, + self.head_dim, + self.transpose_kv_cache, + self.enable_dynamic_shape, + dtype=self.dtype, + ) + + def _init_kv(self): + if self.transpose_kv_cache: + shape = (1, self.n_kv_heads, self.seq_len, self.head_dim) + else: + shape = (1, self.seq_len, self.n_kv_heads, self.head_dim) + k = torch.rand(shape, dtype=self.dtype) + v = torch.rand(shape, dtype=self.dtype) + return k, v + + def setUp(self): + torch.manual_seed(42) + self.max_batch_size = 1 + self.max_seq_len = 5 + self.n_kv_heads = 8 + self.head_dim = 17 + self.enable_dynamic_shape = False + self.transpose_kv_cache = False + self.dtype = torch.float32 + + def _test_simple_update_fetch(self, is_tranposed=False, is_dynamic_shape=False): + self.transpose_kv_cache = is_tranposed + self.enable_dynamic_shape = is_dynamic_shape + input_pos = torch.tensor([0, 1, 2]) + self.seq_len = input_pos.size(0) + self._init_cache() + k, v = self._init_kv() + quantized_kv_cache = QuantizedKVCache.from_float( + self.kv_cache, QuantizedCacheType.AffineAsymmetric + ) + updated_k_cache, updated_v_cache = self.kv_cache.update(input_pos, k, v) + updated_dequantized_k_cache, updated_dequantized_v_cache = ( + quantized_kv_cache.update(input_pos, k, v) + ) + + def index(t, input_pos): + if self.transpose_kv_cache: + return t[:, :, input_pos, :] + else: + return t[:, input_pos, :, :] + + sliced_k_cache = index(updated_k_cache, input_pos) + sliced_v_cache = index(updated_v_cache, input_pos) + + sliced_dequantized_k_cache = index(updated_dequantized_k_cache, input_pos) + sliced_dequantized_v_cache = index(updated_dequantized_v_cache, input_pos) + + torch.testing.assert_close( + sliced_k_cache, + sliced_dequantized_k_cache, + rtol=1e-02, + atol=1e-02, + ) + torch.testing.assert_close( + sliced_v_cache, + sliced_dequantized_v_cache, + rtol=1e-02, + atol=1e-02, + ) + + input_pos = torch.tensor([3]) + self.seq_len = input_pos.size(0) + k, v = self._init_kv() + pos_to_check = torch.tensor([0, 1, 2, 3]) + updated_k_cache, updated_v_cache = self.kv_cache.update(input_pos, k, v) + updated_dequantized_k_cache, updated_dequantized_v_cache = ( + quantized_kv_cache.update(input_pos, k, v) + ) + sliced_k_cache = index(updated_k_cache, pos_to_check) + sliced_v_cache = index(updated_v_cache, pos_to_check) + + sliced_dequantized_k_cache = index(updated_dequantized_k_cache, pos_to_check) + sliced_dequantized_v_cache = index(updated_dequantized_v_cache, pos_to_check) + + torch.testing.assert_close( + sliced_k_cache, + sliced_dequantized_k_cache, + rtol=1e-02, + atol=1e-02, + ) + torch.testing.assert_close( + sliced_v_cache, + sliced_dequantized_v_cache, + rtol=1e-02, + atol=1e-02, + ) + + def test_simple_update_fetch_not_transposed(self): + self._test_simple_update_fetch() + + def test_simple_update_fetch_not_transposed_dynamic_shape(self): + self._test_simple_update_fetch(is_dynamic_shape=True) + + def test_simple_update_fetch_transposed(self): + self._test_simple_update_fetch(is_tranposed=True) + + def test_simple_update_fetch_transposed_dynamic_shape(self): + self._test_simple_update_fetch(is_tranposed=True, is_dynamic_shape=True) diff --git 
a/examples/models/llama2/source_transformation/test_sdpa_with_quantized_kv_cache.py b/examples/models/llama2/source_transformation/test_sdpa_with_quantized_kv_cache.py new file mode 100644 index 00000000000..4755d45499d --- /dev/null +++ b/examples/models/llama2/source_transformation/test_sdpa_with_quantized_kv_cache.py @@ -0,0 +1,87 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import unittest + +import torch + +from executorch.examples.models.llama2.llama_transformer import KVCache + +from executorch.examples.models.llama2.source_transformation.quantized_kv_cache import ( + QuantizedCacheType, + QuantizedKVCache, +) + +from executorch.examples.models.llama2.source_transformation.sdpa import SDPACustom + + +class SDPAWithQuantizedKVCacheTest(unittest.TestCase): + + def _init_cache(self): + self.kv_cache = KVCache( + self.max_batch_size, + self.max_seq_len, + self.n_kv_heads, + self.head_dim, + False, + self.enable_dynamic_shape, + dtype=self.dtype, + ) + self.quantized_kv_cache = QuantizedKVCache.from_float( + self.kv_cache, QuantizedCacheType.AffineAsymmetric + ) + + def _init_kv(self): + kv_shape = (1, self.seq_len, self.n_kv_heads, self.head_dim) + q_shape = (1, self.seq_len, self.n_heads, self.head_dim) + q = torch.rand(q_shape, dtype=self.dtype) + k = torch.rand(kv_shape, dtype=self.dtype) + v = torch.rand(kv_shape, dtype=self.dtype) + return q, k, v + + def setUp(self): + torch.manual_seed(42) + self.max_batch_size = 1 + self.max_seq_len = 5 + self.n_kv_heads = 4 + self.n_heads = 8 + self.head_dim = 17 + self.dim = self.n_heads * self.head_dim + self.enable_dynamic_shape = False + self.dtype = torch.float32 + + def test_simple(self, is_dynamic_shape=False): + self.enable_dynamic_shape = is_dynamic_shape + input_pos = torch.tensor([0], dtype=torch.int64) + self.seq_len = 3 + self._init_cache() + q, k, v = self._init_kv() + self.float_sdpa = SDPACustom(self.kv_cache, self.dim) + self.quantized_sdpa = SDPACustom(self.quantized_kv_cache, self.dim) + float_out = self.float_sdpa(input_pos, q, k, v, 1, self.seq_len, None) + quantized_out = self.quantized_sdpa(input_pos, q, k, v, 1, self.seq_len, None) + torch.testing.assert_close( + float_out, + quantized_out, + # had to adjust rtol because switching to using custom_sdpa means we + # will use dequantized k and v instead of original k and v + # this leads to larger differences in the output. + # subsequent diff in the stack will address this issue. 
+ rtol=1e-01, + atol=1e-03, + ) + + input_pos = torch.tensor([3], dtype=torch.int64) + self.seq_len = 1 + q, k, v = self._init_kv() + float_out = self.float_sdpa(input_pos, q, k, v, 1, self.seq_len, None) + quantized_out = self.quantized_sdpa(input_pos, q, k, v, 1, self.seq_len, None) + torch.testing.assert_close( + float_out, + quantized_out, + rtol=1e-03, + atol=1e-03, + ) diff --git a/examples/models/llama2/tests/TARGETS b/examples/models/llama2/tests/TARGETS index 76981d8f317..2e4dcf7d1f6 100644 --- a/examples/models/llama2/tests/TARGETS +++ b/examples/models/llama2/tests/TARGETS @@ -15,9 +15,9 @@ python_unittest( ) python_unittest( - name = "test_spinquant_transforms", + name = "test_pre_quantization_transforms", srcs = [ - "test_spinquant_transforms.py", + "test_pre_quantization_transforms.py", ], deps = [ "//caffe2:torch", diff --git a/examples/models/llama2/tests/test_pre_quantization_transforms.py b/examples/models/llama2/tests/test_pre_quantization_transforms.py new file mode 100644 index 00000000000..59cec2e72ab --- /dev/null +++ b/examples/models/llama2/tests/test_pre_quantization_transforms.py @@ -0,0 +1,184 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import unittest + +import torch +from executorch.examples.models.llama2.llama_transformer import ModelArgs, Transformer +from executorch.examples.models.llama2.source_transformation.pre_quantization import ( + sanitize_checkpoint_from_pre_quantization, + transform_embedding_for_pre_quantization, + transform_linear_for_pre_quantization, + transform_output_linear_for_pre_quantization, +) +from executorch.examples.models.llama2.source_transformation.quantize import ( + dynamically_quantize_per_channel, +) +from torchao.quantization.utils import group_quantize_tensor_symmetric + + +class PreQuantizationTests(unittest.TestCase): + + def _prepare_dummy_model(self) -> Transformer: + model_args = ModelArgs( + max_seq_len=2048, + max_batch_size=1, + use_kv_cache=False, + use_sdpa_with_kv_cache_op=False, + generate_full_logits=False, + enable_dynamic_shape=True, + dim=768, + multiple_of=32, + n_heads=12, + n_layers=12, + norm_eps=1e-05, + vocab_size=32000, + ) + + model = Transformer(model_args) + + return model + + def test_transform_linear_for_pre_quantization(self): + + # Step 1: Create llama class with dummy weights + model = self._prepare_dummy_model() + checkpoint = model.state_dict() + + # Step 2: + # Do group-wise quantization and amend the checkpoints with + # int8 weight and fp32 scales + group_size = 32 + n_bit = 4 + scales_precision = torch.float32 + for fqn, mod in model.named_modules(): + if isinstance(mod, torch.nn.Linear): + weight = mod.weight.data + ( + weight_int8, + scales, + zeros, + ) = group_quantize_tensor_symmetric( + weight.to(torch.float32), n_bit, group_size, scales_precision + ) + checkpoint[f"{fqn}.weight"] = weight_int8.to("cpu") + checkpoint[f"{fqn}.scales"] = scales.to("cpu") + + # Step 3: + # Transform the model so that it is compatible with the new checkpoint + transform_linear_for_pre_quantization( + model, + checkpoint, + 32, + torch.float32, + ) + sanitize_checkpoint_from_pre_quantization(checkpoint) + + model.load_state_dict( + checkpoint, + strict=False, + assign=True, + ) + + new_checkpoint = model.state_dict() + + for k, v in checkpoint.items(): + # The new_checkpoint contains zeros so + # have to iterate over the keys. 
+ self.assertTrue(torch.allclose(new_checkpoint[k], v)) + + def test_transform_output_linear_for_pre_quantization(self): + # Step 1: Create llama class with dummy weights + model = self._prepare_dummy_model() + checkpoint = model.state_dict() + + # Step 2: + # Do per-channel quantization and amend the checkpoints with + # int8 weight and fp32 scales + for fqn, mod in model.named_modules(): + if isinstance(mod, torch.nn.Linear) and fqn == "output": + weight = mod.weight.data + weight_int8, scales, _ = dynamically_quantize_per_channel( + weight, + quant_min=-128, + quant_max=127, + target_dtype=torch.int8, + scales_dtype=torch.float32, + ) + checkpoint[f"{fqn}.weight"] = weight_int8.to("cpu") + checkpoint[f"{fqn}.scales"] = scales.to("cpu") + + # Step 3: + # Transform the model so that it is compatible with the new checkpoint + transform_output_linear_for_pre_quantization( + model, + checkpoint, + torch.float32, + ) + sanitize_checkpoint_from_pre_quantization(checkpoint) + + model.load_state_dict( + checkpoint, + strict=False, + assign=True, + ) + + new_checkpoint = model.state_dict() + + for k, v in checkpoint.items(): + # The new_checkpoint contains zeros so + # have to iterate over the keys. + self.assertTrue(torch.allclose(new_checkpoint[k], v)) + + def test_transform_embedding_for_pre_quantization(self): + + # Step 1: Create llama class with dummy weights + model = self._prepare_dummy_model() + checkpoint = model.state_dict() + + # Step 2: + # Do group-wise quantization and amend the checkpoints with + # int8 weight and fp32 scales + group_size = 32 + n_bit = 4 + scales_precision = torch.float32 + for fqn, mod in model.named_modules(): + # Quantize everything except the last layer + if isinstance(mod, torch.nn.Embedding): + weight = mod.weight.data + ( + weight_int8, + scales, + zeros, + ) = group_quantize_tensor_symmetric( + weight.to(torch.float32), n_bit, group_size, scales_precision + ) + checkpoint[f"{fqn}.weight"] = weight_int8.to("cpu") + checkpoint[f"{fqn}.scales"] = scales.to("cpu") + + # Step 3: + # Transform the model so that it is compatible with the new checkpoint + transform_embedding_for_pre_quantization( + model, + checkpoint, + torch.float32, + n_bit, + group_size, + ) + sanitize_checkpoint_from_pre_quantization(checkpoint) + + model.load_state_dict( + checkpoint, + strict=False, + assign=True, + ) + + new_checkpoint = model.state_dict() + + for k, v in checkpoint.items(): + # The new_checkpoint contains zeros so + # have to iterate over the keys. + self.assertTrue(torch.allclose(new_checkpoint[k], v)) diff --git a/examples/models/llama2/tests/test_spinquant_transforms.py b/examples/models/llama2/tests/test_spinquant_transforms.py deleted file mode 100644 index bd56632c5f5..00000000000 --- a/examples/models/llama2/tests/test_spinquant_transforms.py +++ /dev/null @@ -1,89 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. 
- -import unittest - -import torch -from executorch.examples.models.llama2.llama_transformer import ModelArgs, Transformer -from executorch.examples.models.llama2.source_transformation.spin_quant import ( - sanitize_checkpoint_from_spinquant, - transform_for_spinquant, -) -from torchao.quantization.utils import group_quantize_tensor_symmetric - - -class SpinQuantTests(unittest.TestCase): - def test_transforms_for_spinquant(self): - - # Step 1: Create llama class with dummy weights - params = { - "dim": 768, - "multiple_of": 32, - "n_heads": 12, - "n_layers": 12, - "norm_eps": 1e-05, - "vocab_size": 32000, - } - - model_args = ModelArgs( - max_seq_len=2048, - max_batch_size=1, - use_kv_cache=False, - use_sdpa_with_kv_cache_op=False, - generate_full_logits=False, - enable_dynamic_shape=True, - **params, - ) - - model = Transformer(model_args) - checkpoint = model.state_dict() - - # Step 2: - # Do group-wise quantization and amend the checkpoints with - # int8 weight and fp32 scales - group_size = 32 - n_bit = 4 - scales_precision = torch.float32 - for fqn, mod in model.named_modules(): - # Quantize everything except the last layer - if isinstance(mod, torch.nn.Linear) and ("output" not in fqn): - weight = mod.weight.data - ( - weight_int8, - scales, - zeros, - ) = group_quantize_tensor_symmetric( - weight.to(torch.float32), n_bit, group_size, scales_precision - ) - checkpoint[f"{fqn}.weight"] = weight_int8.to("cpu") - checkpoint[f"{fqn}.scale"] = scales.to("cpu") - - # Step 3: - # Transform the model so that it is compatible with the new checkpoint - transform_for_spinquant( - model, - checkpoint, - 32, - "8da4w", - torch.float32, - ) - sanitize_checkpoint_from_spinquant( - checkpoint, - -1, - ) - - model.load_state_dict( - checkpoint, - strict=False, - assign=True, - ) - - new_checkpoint = model.state_dict() - - for k, v in checkpoint.items(): - # The new_checkpoint contains zeros so - # have to iterate over the keys. 
- self.assertTrue(torch.allclose(new_checkpoint[k], v)) diff --git a/examples/models/llama2/tokenizer/llama_tiktoken.cpp b/examples/models/llama2/tokenizer/llama_tiktoken.cpp index 0a1dddcc22e..5ce9d7f14cc 100644 --- a/examples/models/llama2/tokenizer/llama_tiktoken.cpp +++ b/examples/models/llama2/tokenizer/llama_tiktoken.cpp @@ -8,8 +8,10 @@ #include -namespace torch { -namespace executor { +namespace example { + +using ::executorch::extension::llm::Tiktoken; + namespace { static constexpr int32_t kSpecialTokensSize = 256; static constexpr size_t kBOSTokenIndex = 0; @@ -72,7 +74,7 @@ _get_multimodal_special_tokens() { std::unique_ptr> _get_special_tokens(Version version) { switch (version) { - case MULTIMODAL: + case Version::Multimodal: return _get_multimodal_special_tokens(); default: return _get_default_special_tokens(); @@ -86,5 +88,4 @@ std::unique_ptr get_tiktoken_for_llama(Version version) { _get_special_tokens(version), kBOSTokenIndex, kEOSTokenIndex); } -} // namespace executor -} // namespace torch +} // namespace example diff --git a/examples/models/llama2/tokenizer/llama_tiktoken.h b/examples/models/llama2/tokenizer/llama_tiktoken.h index 5e05b946d16..6baa3f49cc6 100644 --- a/examples/models/llama2/tokenizer/llama_tiktoken.h +++ b/examples/models/llama2/tokenizer/llama_tiktoken.h @@ -10,15 +10,14 @@ #include -namespace torch { -namespace executor { +namespace example { -enum Version { - DEFAULT, - MULTIMODAL, +enum class Version { + Default, + Multimodal, }; -std::unique_ptr get_tiktoken_for_llama(Version version = DEFAULT); +std::unique_ptr<::executorch::extension::llm::Tiktoken> get_tiktoken_for_llama( + Version version = Version::Default); -} // namespace executor -} // namespace torch +} // namespace example diff --git a/examples/models/llama2/tokenizer/test/test_tiktoken.cpp b/examples/models/llama2/tokenizer/test/test_tiktoken.cpp index 0bc1e7d9dc1..5bd6515b676 100644 --- a/examples/models/llama2/tokenizer/test/test_tiktoken.cpp +++ b/examples/models/llama2/tokenizer/test/test_tiktoken.cpp @@ -7,20 +7,25 @@ */ #include + +#include + #include + #include -#include using namespace ::testing; -namespace torch { -namespace executor { +using ::example::Version; +using ::executorch::extension::llm::Tokenizer; +using ::executorch::runtime::Error; +using ::executorch::runtime::Result; class MultimodalTiktokenV5ExtensionTest : public Test { public: void SetUp() override { - torch::executor::runtime_init(); - tokenizer_ = get_tiktoken_for_llama(MULTIMODAL); + executorch::runtime::runtime_init(); + tokenizer_ = get_tiktoken_for_llama(Version::Multimodal); modelPath_ = std::getenv("RESOURCES_PATH") + std::string("/test_tiktoken_tokenizer.model"); } @@ -79,5 +84,3 @@ TEST_F(MultimodalTiktokenV5ExtensionTest, TokenizerDecodeCorrectly) { EXPECT_EQ(out.get(), expected[i]); } } -} // namespace executor -} // namespace torch diff --git a/examples/models/llama3_2_mm/__init__.py b/examples/models/llama3_2_mm/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/examples/models/flamingo/cross_attention/cross_attention_mask.cpp b/examples/models/llama3_2_mm/cross_attention/cross_attention_mask.cpp similarity index 96% rename from examples/models/flamingo/cross_attention/cross_attention_mask.cpp rename to examples/models/llama3_2_mm/cross_attention/cross_attention_mask.cpp index 06887ec4735..9951165f46c 100644 --- a/examples/models/flamingo/cross_attention/cross_attention_mask.cpp +++ b/examples/models/llama3_2_mm/cross_attention/cross_attention_mask.cpp @@ -6,12 +6,16 
@@ * LICENSE file in the root directory of this source tree. */ -#include +#include #include #include -namespace torch::executor { +namespace example { + +using ::executorch::aten::ScalarType; +using ::executorch::aten::Tensor; +using ::executorch::aten::TensorImpl; // Fowrward declaration needed for ARM compilers. int32_t safe_size_t_to_sizes_type(size_t value); @@ -166,4 +170,4 @@ std::vector cross_attention_mask( return cross_attention_masks; } -} // namespace torch::executor +} // namespace example diff --git a/examples/models/flamingo/cross_attention/cross_attention_mask.h b/examples/models/llama3_2_mm/cross_attention/cross_attention_mask.h similarity index 96% rename from examples/models/flamingo/cross_attention/cross_attention_mask.h rename to examples/models/llama3_2_mm/cross_attention/cross_attention_mask.h index ccbc9eb1710..ae6df0a6be4 100644 --- a/examples/models/flamingo/cross_attention/cross_attention_mask.h +++ b/examples/models/llama3_2_mm/cross_attention/cross_attention_mask.h @@ -13,8 +13,7 @@ #include #include -namespace torch { -namespace executor { +namespace example { /** * Computes the cross-attention mask for text + image inputs. Text tokens that @@ -61,11 +60,10 @@ namespace executor { */ std::vector<::executorch::extension::TensorPtr> cross_attention_mask( const std::vector& tokens, - const std::vector& images, + const std::vector<::executorch::aten::Tensor>& images, size_t tile_size, size_t patch_size, int image_token_id, std::vector>& out); -} // namespace executor -} // namespace torch +} // namespace example diff --git a/examples/models/flamingo/cross_attention/cross_attention_mask_test.cpp b/examples/models/llama3_2_mm/cross_attention/cross_attention_mask_test.cpp similarity index 88% rename from examples/models/flamingo/cross_attention/cross_attention_mask_test.cpp rename to examples/models/llama3_2_mm/cross_attention/cross_attention_mask_test.cpp index b232212fa31..fffbff5e402 100644 --- a/examples/models/flamingo/cross_attention/cross_attention_mask_test.cpp +++ b/examples/models/llama3_2_mm/cross_attention/cross_attention_mask_test.cpp @@ -6,14 +6,14 @@ * LICENSE file in the root directory of this source tree. 
*/ -#include +#include #include using namespace ::testing; -using torch::executor::ScalarType; -using torch::executor::Tensor; -using torch::executor::TensorImpl; +using exec_aten::ScalarType; +using exec_aten::Tensor; +using exec_aten::TensorImpl; TEST(CrossAttentxnMaskTest, TestCrossAttentionMask) { std::vector tokens = { @@ -41,7 +41,7 @@ TEST(CrossAttentxnMaskTest, TestCrossAttentionMask) { std::vector images = {a, b, c}; std::vector> mask_data; - auto output_masks = torch::executor::cross_attention_mask( + auto output_masks = example::cross_attention_mask( tokens, images, /*tile_size=*/1, diff --git a/examples/models/flamingo/cross_attention/targets.bzl b/examples/models/llama3_2_mm/cross_attention/targets.bzl similarity index 100% rename from examples/models/flamingo/cross_attention/targets.bzl rename to examples/models/llama3_2_mm/cross_attention/targets.bzl diff --git a/examples/models/flamingo/install_requirements.sh b/examples/models/llama3_2_mm/install_requirements.sh similarity index 100% rename from examples/models/flamingo/install_requirements.sh rename to examples/models/llama3_2_mm/install_requirements.sh diff --git a/examples/models/llama3_2_mm/preprocess/__init__.py b/examples/models/llama3_2_mm/preprocess/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/examples/models/llama3_2_mm/preprocess/export_preprocess.py b/examples/models/llama3_2_mm/preprocess/export_preprocess.py new file mode 100644 index 00000000000..7946d89f8a4 --- /dev/null +++ b/examples/models/llama3_2_mm/preprocess/export_preprocess.py @@ -0,0 +1,33 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import torch +from executorch.examples.models.llama3_2_mm.preprocess.export_preprocess_lib import ( + export_preprocess, + get_example_inputs, + lower_to_executorch_preprocess, +) + + +def main(): + # Export + ep = export_preprocess() + + # ExecuTorch + et = lower_to_executorch_preprocess(ep) + with open("preprocess_et.pte", "wb") as file: + et.write_to_file(file) + + # AOTInductor + torch._inductor.aot_compile( + ep.module(), + get_example_inputs(), + options={"aot_inductor.output_path": "preprocess_aoti.so"}, + ) + + +if __name__ == "__main__": + main() diff --git a/examples/models/flamingo/preprocess/export_preprocess_lib.py b/examples/models/llama3_2_mm/preprocess/export_preprocess_lib.py similarity index 94% rename from examples/models/flamingo/preprocess/export_preprocess_lib.py rename to examples/models/llama3_2_mm/preprocess/export_preprocess_lib.py index 366f5989222..53bb2e400d0 100644 --- a/examples/models/flamingo/preprocess/export_preprocess_lib.py +++ b/examples/models/llama3_2_mm/preprocess/export_preprocess_lib.py @@ -11,7 +11,7 @@ from executorch.exir.passes.sym_shape_eval_pass import ConstraintBasedSymShapeEvalPass from executorch.exir.program._program import ExecutorchProgramManager -from executorch.extension.llm.custom_ops import preprocess_custom_ops # noqa +from executorch.extension.llm.custom_ops import op_tile_crop_aot # noqa from torch.export import Dim, ExportedProgram from torchtune.models.clip.inference._transform import _CLIPImageTransform @@ -43,6 +43,7 @@ def export_preprocess( max_num_tiles: int = 4, tile_size: int = 224, antialias: bool = False, + pad_max_tiles: bool = True, ) -> ExportedProgram: # Instantiate eager model. 
@@ -53,6 +54,7 @@ def export_preprocess( max_num_tiles=max_num_tiles, tile_size=tile_size, antialias=antialias, + pad_max_tiles=pad_max_tiles, ) # Replace non-exportable ops with custom ops. diff --git a/examples/models/flamingo/preprocess/preprocess.cpp b/examples/models/llama3_2_mm/preprocess/preprocess.cpp similarity index 97% rename from examples/models/flamingo/preprocess/preprocess.cpp rename to examples/models/llama3_2_mm/preprocess/preprocess.cpp index ff46070f669..fb8ebb3aa4c 100644 --- a/examples/models/flamingo/preprocess/preprocess.cpp +++ b/examples/models/llama3_2_mm/preprocess/preprocess.cpp @@ -6,7 +6,7 @@ * LICENSE file in the root directory of this source tree. */ -#include "executorch/examples/models/flamingo/preprocess/preprocess.h" +#include "executorch/examples/models/llama3_2_mm/preprocess/preprocess.h" #include #include diff --git a/examples/models/flamingo/preprocess/preprocess.h b/examples/models/llama3_2_mm/preprocess/preprocess.h similarity index 100% rename from examples/models/flamingo/preprocess/preprocess.h rename to examples/models/llama3_2_mm/preprocess/preprocess.h diff --git a/examples/models/flamingo/preprocess/preprocess_test.cpp b/examples/models/llama3_2_mm/preprocess/preprocess_test.cpp similarity index 97% rename from examples/models/flamingo/preprocess/preprocess_test.cpp rename to examples/models/llama3_2_mm/preprocess/preprocess_test.cpp index deede877223..7b822d10079 100644 --- a/examples/models/flamingo/preprocess/preprocess_test.cpp +++ b/examples/models/llama3_2_mm/preprocess/preprocess_test.cpp @@ -6,7 +6,7 @@ * LICENSE file in the root directory of this source tree. */ -#include +#include #include using namespace ::testing; diff --git a/examples/models/flamingo/preprocess/targets.bzl b/examples/models/llama3_2_mm/preprocess/targets.bzl similarity index 100% rename from examples/models/flamingo/preprocess/targets.bzl rename to examples/models/llama3_2_mm/preprocess/targets.bzl diff --git a/examples/models/flamingo/preprocess/test_preprocess.py b/examples/models/llama3_2_mm/preprocess/test_preprocess.py similarity index 81% rename from examples/models/flamingo/preprocess/test_preprocess.py rename to examples/models/llama3_2_mm/preprocess/test_preprocess.py index b990f44ca1b..313097020a1 100644 --- a/examples/models/flamingo/preprocess/test_preprocess.py +++ b/examples/models/llama3_2_mm/preprocess/test_preprocess.py @@ -37,7 +37,11 @@ ) from torchvision.transforms.v2 import functional as F -from .export_preprocess_lib import export_preprocess, lower_to_executorch_preprocess +from .export_preprocess_lib import ( + export_preprocess, + get_example_inputs, + lower_to_executorch_preprocess, +) @dataclass @@ -50,6 +54,7 @@ class PreprocessConfig: tile_size: int = 224 max_num_tiles: int = 4 possible_resolutions = None + pad_max_tiles: bool = True class TestImageTransform(unittest.TestCase): @@ -132,6 +137,17 @@ def prepare_inputs( [1.0, 1.0], # expected_tile_max [0.0, 0.0], # expected_tile_min [1, 2], # expected_aspect_ratio + False, # pad_max_tiles + ), + ( + (100, 400, 3), # image_size + torch.Size([4, 3, 224, 224]), # expected shape + False, # resize_to_max_canvas + [0.2230, 0.1763, 0.0, 0.0], # expected_tile_means + [1.0, 1.0, 0.0, 0.0], # expected_tile_max + [0.0, 0.0, 0.0, 0.0], # expected_tile_min + [1, 2], # expected_aspect_ratio + True, # pad_max_tiles ), ( (1000, 300, 3), # image_size @@ -141,6 +157,7 @@ def prepare_inputs( [0.9976, 0.9940, 0.9936, 0.9906], # expected_tile_max [0.0037, 0.0047, 0.0039, 0.0], # expected_tile_min [4, 1], # 
expected_aspect_ratio + False, # pad_max_tiles ), ( (200, 200, 3), # image_size @@ -150,6 +167,7 @@ def prepare_inputs( [0.9921, 0.9925, 0.9969, 0.9908], # expected_tile_max [0.0056, 0.0069, 0.0059, 0.0032], # expected_tile_min [2, 2], # expected_aspect_ratio + False, # pad_max_tiles ), ( (600, 200, 3), # image_size @@ -159,6 +177,17 @@ def prepare_inputs( [1.0, 1.0, 1.0], # expected_tile_max [0.0, 0.0, 0.0], # expected_tile_min [3, 1], # expected_aspect_ratio + False, # pad_max_tiles + ), + ( + (600, 200, 3), # image_size + torch.Size([4, 3, 224, 224]), # expected shape + False, # resize_to_max_canvas + [0.4472, 0.4468, 0.3031, 0.0], # expected_tile_means + [1.0, 1.0, 1.0, 0.0], # expected_tile_max + [0.0, 0.0, 0.0, 0.0], # expected_tile_min + [3, 1], # expected_aspect_ratio + True, # pad_max_tiles ), ] ) @@ -171,8 +200,11 @@ def test_preprocess( expected_tile_max: List[float], expected_tile_min: List[float], expected_ar: List[int], + pad_max_tiles: bool, ) -> None: - config = PreprocessConfig(resize_to_max_canvas=resize_to_max_canvas) + config = PreprocessConfig( + resize_to_max_canvas=resize_to_max_canvas, pad_max_tiles=pad_max_tiles + ) reference_model = CLIPImageTransform( image_mean=config.image_mean, @@ -183,6 +215,7 @@ def test_preprocess( tile_size=config.tile_size, max_num_tiles=config.max_num_tiles, possible_resolutions=None, + pad_max_tiles=config.pad_max_tiles, ) eager_model = _CLIPImageTransform( @@ -192,6 +225,7 @@ def test_preprocess( antialias=config.antialias, tile_size=config.tile_size, max_num_tiles=config.max_num_tiles, + pad_max_tiles=config.pad_max_tiles, ) exported_model = export_preprocess( @@ -201,11 +235,17 @@ def test_preprocess( antialias=config.antialias, tile_size=config.tile_size, max_num_tiles=config.max_num_tiles, + pad_max_tiles=config.pad_max_tiles, ) executorch_model = lower_to_executorch_preprocess(exported_model) executorch_module = _load_for_executorch_from_buffer(executorch_model.buffer) + aoti_path = torch._inductor.aot_compile( + exported_model.module(), + get_example_inputs(), + ) + # Prepare image input. image = ( np.random.randint(0, 256, np.prod(image_size)) @@ -235,8 +275,11 @@ def test_preprocess( self.assertAlmostEqual(tile.min().item(), expected_tile_min[i], delta=1e-4) # Check num tiles matches the product of the aspect ratio. - expected_num_tiles = reference_ar[0] * reference_ar[1] - self.assertEqual(expected_num_tiles, reference_image.shape[0]) + if pad_max_tiles: + self.assertEqual(config.max_num_tiles, reference_image.shape[0]) + else: + expected_num_tiles = reference_ar[0] * reference_ar[1] + self.assertEqual(expected_num_tiles, reference_image.shape[0]) # Pre-work for eager and exported models. The reference model performs these # calculations and passes the result to _CLIPImageTransform, the exportable model. @@ -266,3 +309,9 @@ def test_preprocess( ) self.assertTrue(torch.allclose(reference_image, et_image)) self.assertEqual(reference_ar, et_ar.tolist()) + + # Run aoti model and check it matches reference model. 
+ aoti_model = torch._export.aot_load(aoti_path, "cpu") + aoti_image, aoti_ar = aoti_model(image_tensor, inscribed_size, best_resolution) + self.assertTrue(torch.allclose(reference_image, aoti_image)) + self.assertEqual(reference_ar, aoti_ar.tolist()) diff --git a/examples/models/llava/CMakeLists.txt b/examples/models/llava/CMakeLists.txt index c36e39a04cb..ed4cbc46344 100644 --- a/examples/models/llava/CMakeLists.txt +++ b/examples/models/llava/CMakeLists.txt @@ -157,7 +157,10 @@ endif() # XNNPACK if(TARGET xnnpack_backend) - set(xnnpack_backend_libs xnnpack_backend XNNPACK) + set(xnnpack_backend_libs xnnpack_backend XNNPACK microkernels-prod) + if(TARGET kleidiai) + list(APPEND xnnpack_backend_libs kleidiai) + endif() list(APPEND link_libraries ${xnnpack_backend_libs}) target_link_options_shared_lib(xnnpack_backend) endif() diff --git a/examples/models/llava/README.md b/examples/models/llava/README.md index 8cb605d75fc..ad2f3f3dc99 100644 --- a/examples/models/llava/README.md +++ b/examples/models/llava/README.md @@ -1,89 +1,196 @@ ## Summary LLaVA is the first multi-modal LLM ExecuTorch supports. In this directory, we -- Host a model definition for LLaVA. -- Demonstrate how to export [LLavA](https://github.com/haotian-liu/LLaVA) multimodal model to a .pte file. -- Provide a C++ runner that loads the .pte file, the tokenizer and an image, then generate responses based on user prompt. +- Host a model definition for [LLavA](https://github.com/haotian-liu/LLaVA). +- Demonstrate how to export LLavA multimodal model to generate ExecuTorch .PTE file. +- Provide a C++ runner, Android/iOS Apps that loads the .pte file, the tokenizer and an image, then generate responses based on user prompt. +- Discuss optimizations went into enabling LlaVA on a phone, and early performance numbers + +Tokenizer, image encoder, and the pretrained text model, which is based on Meta +[Llama2-7b](https://llama.meta.com/llama2/), is loaded from Llava +huggingface page [llava-hf/llava-1.5-7b-hf](https://huggingface.co/llava-hf/llava-1.5-7b-hf) . + + +

+
+*Running Llava1.5 7B on Android phone* (demo: `llava_via_xnnpack.gif`)
+
+ +## What is LLaVA? + +[LLaVA](https://llava-vl.github.io/) is a novel end-to-end trained large +multimodal model that combines a vision encoder and Vicuna (a LLama2 based text +model) for general-purpose visual and language understanding, achieving +impressive chat capabilities mimicking spirits of the cutting edge multimodal +models and setting a high bar for accuracy on Science QA. ## Instructions -### Export .pte & other artifacts -Run the following command to generate `llava.pte`, `tokenizer.bin` and an image tensor (serialized in TorchScript) `image.pt`. +First you need to generate a .PTE file for the model, along with input image, +and other artifacts. Then you need either a C++ runner, or Android or iOS +application to test things out on device. + +### Generate ExecuTorch .PTE and other artifacts -Prerequisite: run `install_requirements.sh` to install ExecuTorch and run `examples/models/llava/install_requirements.sh` to install dependencies. +Run the following command to generate `llava.pte`, `tokenizer.bin` and an image +tensor (serialized in TorchScript) `image.pt`. + +Prerequisite: run `install_requirements.sh` to install ExecuTorch and run +`examples/models/llava/install_requirements.sh` to install dependencies. ```bash python -m executorch.examples.models.llava.export_llava --pte-name llava.pte --with-artifacts ``` -Currently the whole export process takes about 6 minutes. We also provide a small test util to verify the correctness of the exported .pte file. Just run: +Currently the whole export process takes about 6 minutes. We also provide a +small test utility to verify the correctness of the exported .pte file. Just run: ```bash python -m executorch.examples.models.llava.test.test_pte llava.pte ``` -If everything works correctly it should give you some meaningful response such as: - +### Build C++ Runner +See or run `.ci/scripts/test_llava.sh` shell script to build a C++ runner. This +script also has preliminary support to build the C++ runner for Android. -### Build C++ runner +This also has an image utility Python script to generate image in PyTorch +loadable format. Alternatively, we are working on generating image format which +doesn't need PyTorch to load an image. Motivation for this is to build the C++ +runner on Android. -Run the following cmake commands from `executorch/`: +Then you should be able to find `llava_main` binary: ```bash -# build libraries -cmake \ - -DCMAKE_INSTALL_PREFIX=cmake-out \ - -DCMAKE_BUILD_TYPE=Debug \ - -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ - -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ - -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ - -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \ - -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ - -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \ - -DEXECUTORCH_BUILD_XNNPACK=ON \ - -DEXECUTORCH_DO_NOT_USE_CXX11_ABI=ON \ - -DEXECUTORCH_XNNPACK_SHARED_WORKSPACE=ON \ - -Bcmake-out . - - -cmake --build cmake-out -j9 --target install --config Debug - -# build llava runner - -dir=examples/models/llava -python_lib=$(python -c 'from distutils.sysconfig import get_python_lib; print(get_python_lib())') - -cmake \ - -DCMAKE_INSTALL_PREFIX=cmake-out \ - -DCMAKE_BUILD_TYPE=Debug \ - -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \ - -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ - -DEXECUTORCH_BUILD_XNNPACK=ON \ - -DCMAKE_PREFIX_PATH="$python_lib" \ - -Bcmake-out/${dir} \ - ${dir} - - -cmake --build cmake-out/${dir} -j9 --config Debug +cmake-out/examples/models/llava/llava_main ``` -Or simply run `.ci/scripts/test_llava.sh`. 
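As a rough illustration of the image-preparation step mentioned in the Build C++ Runner section above (an image tensor with its longest edge resized to 336, saved as `image.pt`), here is a minimal Python sketch. The input file name and the `torch.save` call are assumptions for illustration only; the canonical artifact is the `image.pt` emitted by the `export_llava.py --with-artifacts` flow, so prefer that if the formats ever disagree.

```python
# Illustrative sketch only: build a (3, H, W) uint8 image tensor whose longest
# edge is 336 pixels and save it with torch.save(). The input file name and the
# save format are assumptions -- the canonical image.pt comes from
# `python -m executorch.examples.models.llava.export_llava --with-artifacts`.
import torch
from torchvision.io import read_image
from torchvision.transforms.functional import resize

img = read_image("basketball.jpg")   # uint8 tensor, shape (3, H, W); hypothetical file
_, h, w = img.shape
scale = 336 / max(h, w)              # the runner expects the longest edge resized to 336
img = resize(img, [round(h * scale), round(w * scale)])
torch.save(img, "image.pt")
```

If a hand-rolled tensor does not load in `llava_main`, fall back to the `image.pt` generated by the export step.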
+### Build Mobile Apps -Then you should be able to find `llava_main` binary: +#### Android -```bash -cmake-out/examples/models/llava/llava_main -``` +We can run LLAVA using the LLAMA Demo Apps. Please refer to [this +tutorial](https://github.com/pytorch/executorch/tree/main/examples/demo-apps/android/LlamaDemo) +to for full instructions on building the Android LLAMA Demo App. + +#### iOS -### Run LLaVA +We can run LLAVA using the LLAMA Demo Apps. Please refer to [this +tutorial](https://github.com/pytorch/executorch/tree/main/examples/demo-apps/apple_ios/LLaMA) +to for full instructions on building the iOS LLAMA Demo App. + +### Running LLaVA Run: ```bash -cmake-out/examples/models/llava/llava_main --model_path=llava.pte --tokenizer_path=tokenizer.bin --image_path=image.pt --prompt="What are the things I should be cautious about when I visit here? ASSISTANT:" --seq_len=768 --temperature=0 +cmake-out/examples/models/llava/llava_main \ + --model_path=llava.pte \ + --tokenizer_path=tokenizer.bin \ + --image_path=image.pt \ + --prompt="ASSISTANT:" \ + --seq_len=768 \ + --temperature=0 ``` +(see --help for other options). + +For this example input used in this example, + +![image](https://upload.wikimedia.org/wikipedia/commons/3/3e/Chicago_Bulls_-_New_Jersey_Nets_match_on_March_28%2C_1991.jpg) -You should get a response like: +You should get a response like (tested on Arm CPUs with ET XNNPACK delegate): ``` -When visiting a place like this, ... +ASSISTANT: image captures a basketball game in progress, with several players on the court. ... ``` + +## Optimizations and Results + +Since LLaVA model needs at least 4-bit quantization to fit even within some of +the high-end phones, results presented here correspond to 4-bit groupwise +post-training quantized model. + +In addition to that, work is mainly focused on using Arm CPUs and ET XNNPACK delegate. + +### Memory Footprint Reduction Techniques + +With Llava, we needed to find a way to reduce the memory footprint in order to +make it feasible to run on edge devices. Out of the box, even with 4-bit +quantized weights, the memory footprint is around ~11 GiB, which is +prohibitively large even for high-end Android or iOS devices. + +We did several optimizations, which should be already enabled if you follow this +tutorial, to get the memory footprint down to ~5 GiB, which unblocks us to run +on high-end devices. + +#### Sharing intermediate memory across delegates + +Sharing working memory across ET XNNPACK delegates helps reduce the peak memory +usage for LLMs with many DQLinears. We reduced it by 36.1% (from 10.44GiB to +6.67GiB) for Llava towards unblocking it to run on Phones. + +#### Reducing maximum sequence length + +To free up more memory, we examined non-constant memory usage, specifically +focusing on intermediate tensors used throughout the model during inference. +The majority of these were found in the KV-cache allocations. Based on “minimum +can get away with” heuristic, we reduced max sequence length number to 768 from +previous default 2048. This adjustment led to a further memory reduction of +approximately 1.23 GiB (from 6.67 GiB to 5.44 GiB). + +#### Quantizing embedding weights to 8b + +By quantizing the embedding layer to 8 bit, we were able to achieve an +additional memory footprint reduction of approximately 300 MiB, bringing the +total down to ~5 GiB. + +### Performance Optimizations + +#### Decode performance + +This was already heavily optimized through KV-cache and GEMV kernel +optimization efforts for LLama2/3. 
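To make the saving from the "Reducing maximum sequence length" change above concrete, here is a back-of-envelope sketch. It assumes a Llama2-7B-style decoder (32 layers, 32 KV heads, head dim 128) with an fp32 KV cache; the real allocation also depends on the exported dtype and memory planning, so treat the numbers as indicative only.

```python
# Back-of-envelope KV-cache sizing. Assumes a Llama2-7B-style decoder
# (32 layers, 32 KV heads, head_dim 128) and an fp32 cache (4 bytes/element);
# K and V each hold max_seq_len * n_kv_heads * head_dim elements per layer.
def kv_cache_bytes(max_seq_len, n_layers=32, n_kv_heads=32, head_dim=128, elem_bytes=4):
    return 2 * n_layers * max_seq_len * n_kv_heads * head_dim * elem_bytes

saved = kv_cache_bytes(2048) - kv_cache_bytes(768)
print(f"~{saved / 2**30:.2f} GiB")  # ~1.25 GiB, close to the ~1.23 GiB quoted above
```

The estimate ignores scales and other intermediate buffers, but it shows why `max_seq_len` dominates the non-constant memory for this model.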
+ +#### Encode performance + +With image based large prompts, this was the focus of performance +optimizations for LLaVA. We implemented two main optimizations to bring the decode or +prefill performance for the image down by more than 100% from the baseline. + +* **Two XNNPACK Partitioners** + +For text-only LLMs, our approach involved lowering only DQLinear ops +to XNNPACK and relying on ExecuTorch-optimized operators or custom ops +(utilizing Neon SIMD) to support multiplication, addition, and other +operations. Lowering these operations to XNNPACK significantly improves Time to +First Token (TTFT). + + +* **New Arm Neon I8mm GEMM kernels** + +We introduced new kernels in XNNPACK for the quantization scheme used +here, which upgrades our existing dot-prod based GEMM kernels to i8mm based +GEMM kernels. The new kernel offers significantly improved performance by +leveraging the more efficient SMMLA instruction from Arm Neon. However, it's +worth noting that this instruction is only available on newer Arm CPUs. + + +### Results + +Note this is an active area of development in the ExecuTorch repository. You +will need this PR [5380](https://github.com/pytorch/executorch/pull/5380) to +supply an image to the C++ runner on Android without Torch dependency. This +should be merged soon. + +With those caveats out of the way, here are some preliminary numbers (as average of +three runs) for LLaVA using a C++ runner on Android OnePlus12 device with 12GiB +memory. + +| Experiment Setup | Prefill time in seconds | Decode tokens/second | +| :------------- | -------------: | -------------: | +| Baseline | 29.95 | 8.75 | +| + Two XNNPACK Partitioners | 17.82 | 8.93 | +| + New Arm Neon i8mm GEMM Kernels | 14.60 | 8.92 | + +We appreciate your feedback. Please let us know if you run into any issues. diff --git a/examples/models/llava/export_llava.py b/examples/models/llava/export_llava.py index bdeaef15fe6..47a5407cf18 100644 --- a/examples/models/llava/export_llava.py +++ b/examples/models/llava/export_llava.py @@ -233,7 +233,7 @@ def export_all(llava_model: LlavaModel): passes=[ QuantFusionPass(), ], - memory_planning_pass=MemoryPlanningPass("greedy", alloc_graph_input=False), + memory_planning_pass=MemoryPlanningPass(alloc_graph_input=False), sym_shape_eval_pass={ "image_encoder": ConstraintBasedSymShapeEvalPass(), "text_model": ConstraintBasedSymShapeEvalPass(), @@ -273,6 +273,7 @@ def main(): parser.add_argument( "--max-seq-len", default=768, + type=int, help="Maximum sequence length for the text model.", ) parser.add_argument( diff --git a/examples/models/llava/install_requirements.sh b/examples/models/llava/install_requirements.sh index 931d63b3919..917e740150a 100644 --- a/examples/models/llava/install_requirements.sh +++ b/examples/models/llava/install_requirements.sh @@ -7,6 +7,6 @@ set -x -pip install transformers accelerate sentencepiece +pip install transformers accelerate sentencepiece tiktoken pip list diff --git a/examples/models/llava/llava_via_xnnpack.gif b/examples/models/llava/llava_via_xnnpack.gif new file mode 100644 index 00000000000..83c32cf0465 Binary files /dev/null and b/examples/models/llava/llava_via_xnnpack.gif differ diff --git a/examples/models/llava/main.cpp b/examples/models/llava/main.cpp index 53f6329b4d8..b01b33f5dd8 100644 --- a/examples/models/llava/main.cpp +++ b/examples/models/llava/main.cpp @@ -48,6 +48,8 @@ DEFINE_int32( -1, "Number of CPU threads for inference. 
Defaults to -1, which implies we'll use a heuristic to derive the # of performant cores for a specific device."); +using executorch::extension::llm::Image; + int32_t main(int32_t argc, char** argv) { gflags::ParseCommandLineFlags(&argc, &argv, true); @@ -70,17 +72,17 @@ int32_t main(int32_t argc, char** argv) { #if defined(ET_USE_THREADPOOL) uint32_t num_performant_cores = cpu_threads == -1 - ? torch::executorch::cpuinfo::get_num_performant_cores() + ? ::executorch::extension::cpuinfo::get_num_performant_cores() : static_cast(cpu_threads); ET_LOG( Info, "Resetting threadpool with num threads = %d", num_performant_cores); if (num_performant_cores > 0) { - torch::executorch::threadpool::get_threadpool()->_unsafe_reset_threadpool( - num_performant_cores); + ::executorch::extension::threadpool::get_threadpool() + ->_unsafe_reset_threadpool(num_performant_cores); } #endif // create llama runner - torch::executor::LlavaRunner runner(model_path, tokenizer_path, temperature); + example::LlavaRunner runner(model_path, tokenizer_path, temperature); // read image and resize the longest edge to 336 std::vector image_data; @@ -90,7 +92,7 @@ int32_t main(int32_t argc, char** argv) { image_data.resize(3 * 240 * 336); std::fill(image_data.begin(), image_data.end(), 0); // black std::array image_shape = {3, 240, 336}; - std::vector images = { + std::vector images = { {.data = image_data, .width = image_shape[2], .height = image_shape[1]}}; #else // LLAVA_NO_TORCH_DUMMY_IMAGE // cv::Mat image = cv::imread(image_path, cv::IMREAD_COLOR); @@ -111,7 +113,7 @@ int32_t main(int32_t argc, char** argv) { image_data.assign( image_tensor.data_ptr(), image_tensor.data_ptr() + image_tensor.numel()); - std::vector images = { + std::vector images = { {.data = image_data, .width = static_cast(image_tensor.size(2)), .height = static_cast(image_tensor.size(1))}}; diff --git a/examples/models/llava/runner/llava_image_prefiller.h b/examples/models/llava/runner/llava_image_prefiller.h index 3597ff82efe..ace28ac2c1f 100644 --- a/examples/models/llava/runner/llava_image_prefiller.h +++ b/examples/models/llava/runner/llava_image_prefiller.h @@ -13,28 +13,33 @@ #include #include -namespace torch::executor { +namespace example { -class LlavaImagePrefiller : public ImagePrefiller { +class LlavaImagePrefiller + : public ::executorch::extension::llm::ImagePrefiller { public: - LlavaImagePrefiller(Module* module) : ImagePrefiller(module){}; + LlavaImagePrefiller(::executorch::extension::Module* module) + : ImagePrefiller(module){}; /** * Prefill an LLM Module with the given image input. * @param image The image input to LLaVa. * @param start_pos The starting position in KV cache of the input in the LLM * @return logits of the image prefill. 
*/ - inline Result prefill(Image& image, int64_t& start_pos) - override { + inline ::executorch::runtime::Result prefill( + ::executorch::extension::llm::Image& image, + int64_t& start_pos) override { auto image_tensor = executorch::extension::from_blob( - image.data.data(), {3, image.height, image.width}, ScalarType::Byte); + image.data.data(), + {3, image.height, image.width}, + ::executorch::aten::ScalarType::Byte); // Run image encoder auto image_encoder_outputs = ET_UNWRAP(module_->execute(kImageEncoderMethod, image_tensor)); // inputs:[start_pos, embeds] - auto start_pos_tensor = - executorch::extension::from_blob(&start_pos, {1}, ScalarType::Long); + auto start_pos_tensor = executorch::extension::from_blob( + &start_pos, {1}, ::executorch::aten::ScalarType::Long); // Run text model auto outputs_res = ET_UNWRAP(module_->execute( @@ -54,13 +59,13 @@ class LlavaImagePrefiller : public ImagePrefiller { * Load the Module for image prefill purpose. * @return The error code. */ - inline Error load() override { + inline ::executorch::runtime::Error load() override { if (is_method_loaded()) { - return Error::Ok; + return ::executorch::runtime::Error::Ok; } ET_CHECK_OK_OR_RETURN_ERROR(module_->load_method(kImageEncoderMethod)); ET_CHECK_OK_OR_RETURN_ERROR(module_->load_method(kTextModelMethod)); - return Error::Ok; + return ::executorch::runtime::Error::Ok; } /** @@ -68,9 +73,9 @@ class LlavaImagePrefiller : public ImagePrefiller { * @return True if the Module is loaded, false otherwise. */ inline bool is_method_loaded() override { - Result> methods_res = + ::executorch::runtime::Result> methods_res = module_->method_names(); - if (methods_res.error() != Error::Ok) { + if (methods_res.error() != ::executorch::runtime::Error::Ok) { ET_CHECK_MSG(false, "Failed to get method names"); } std::unordered_set methods = methods_res.get(); @@ -95,4 +100,4 @@ class LlavaImagePrefiller : public ImagePrefiller { inline static const std::string kTextModelMethod = "text_model"; }; -} // namespace torch::executor +} // namespace example diff --git a/examples/models/llava/runner/llava_runner.cpp b/examples/models/llava/runner/llava_runner.cpp index 1924b057ec4..b3c0cce5c33 100644 --- a/examples/models/llava/runner/llava_runner.cpp +++ b/examples/models/llava/runner/llava_runner.cpp @@ -20,9 +20,11 @@ #include #include -using ::executorch::extension::llm::Stats; +namespace llm = ::executorch::extension::llm; +using ::executorch::runtime::Error; +using ::executorch::runtime::Result; -namespace torch::executor { +namespace example { bool LlavaRunner::is_loaded() { bool instantiated = tokenizer_ && text_decoder_runner_ && text_prefiller_ && @@ -38,10 +40,10 @@ Error LlavaRunner::load() { if (is_loaded()) { return Error::Ok; } - stats_.model_load_start_ms = util::time_in_ms(); + stats_.model_load_start_ms = llm::time_in_ms(); // Load the tokenizer - tokenizer_ = std::make_unique(); + tokenizer_ = std::make_unique(); tokenizer_->load(tokenizer_path_); // Load the text decoder runner @@ -50,7 +52,7 @@ Error LlavaRunner::load() { text_decoder_runner_->load(); // Load the text prefiller - text_prefiller_ = std::make_unique( + text_prefiller_ = std::make_unique( text_decoder_runner_.get(), /*use_kv_cache=*/true, /*enable_parallel_prefill=*/true); @@ -60,7 +62,7 @@ Error LlavaRunner::load() { image_prefiller_->load(); // Load the text token generator - text_token_generator_ = std::make_unique( + text_token_generator_ = std::make_unique( tokenizer_.get(), text_decoder_runner_.get(), /*use_kv_cache=*/true, @@ -68,12 
+70,12 @@ Error LlavaRunner::load() { std::unordered_set{tokenizer_->eos_tok()}), &stats_); - stats_.model_load_end_ms = util::time_in_ms(); + stats_.model_load_end_ms = llm::time_in_ms(); return Error::Ok; } Error LlavaRunner::prefill_images( - std::vector& images, + std::vector& images, int64_t& start_pos) { for (auto& image : images) { // pos is updated inside image prefill. @@ -108,8 +110,8 @@ Error LlavaRunner::generate_from_pos( uint64_t prefill_next_token = ET_UNWRAP(prefill_prompt(prompt, start_pos, /*bos=*/0, /*eos*/ 0)); - stats_.first_token_ms = util::time_in_ms(); - stats_.prompt_eval_end_ms = util::time_in_ms(); + stats_.first_token_ms = llm::time_in_ms(); + stats_.prompt_eval_end_ms = llm::time_in_ms(); stats_.num_prompt_tokens = start_pos; // Generate tokens @@ -125,11 +127,11 @@ Error LlavaRunner::generate_from_pos( } Error LlavaRunner::generate( - std::vector images, + std::vector images, const std::string& prompt, int32_t seq_len, std::function token_callback, - std::function stats_callback, + std::function stats_callback, bool echo) { ET_CHECK_MSG(!prompt.empty(), "Prompt cannot be null"); if (!is_loaded()) { @@ -139,12 +141,12 @@ Error LlavaRunner::generate( ET_LOG( Info, "RSS after loading model: %f MiB (0 if unsupported)", - util::get_rss_bytes() / 1024.0 / 1024.0); + llm::get_rss_bytes() / 1024.0 / 1024.0); // Wrap the token_callback with print function std::function wrapped_callback = [token_callback](const std::string& piece) { - util::safe_printf(piece.c_str()); + llm::safe_printf(piece.c_str()); fflush(stdout); if (token_callback) { token_callback(piece); @@ -152,7 +154,7 @@ Error LlavaRunner::generate( }; int64_t pos = 0; - stats_.inference_start_ms = util::time_in_ms(); + stats_.inference_start_ms = llm::time_in_ms(); // prefill preset prompt prefill_prompt(kPresetPrompt, pos, /*bos=*/1, /*eos*/ 0); @@ -163,21 +165,21 @@ Error LlavaRunner::generate( ET_LOG( Info, "RSS after prompt and image prefill: %f MiB (0 if unsupported)", - util::get_rss_bytes() / 1024.0 / 1024.0); + llm::get_rss_bytes() / 1024.0 / 1024.0); // Generate tokens Error err = generate_from_pos( prompt, seq_len, pos, wrapped_callback, stats_callback, echo); - stats_.inference_end_ms = util::time_in_ms(); + stats_.inference_end_ms = llm::time_in_ms(); ::executorch::llm::print_report(stats_); ET_LOG( Info, "RSS after finishing text generation: %f MiB (0 if unsupported)", - util::get_rss_bytes() / 1024.0 / 1024.0); + llm::get_rss_bytes() / 1024.0 / 1024.0); return err; } -} // namespace torch::executor +} // namespace example diff --git a/examples/models/llava/runner/llava_runner.h b/examples/models/llava/runner/llava_runner.h index e671718ae5e..79cc22fb240 100644 --- a/examples/models/llava/runner/llava_runner.h +++ b/examples/models/llava/runner/llava_runner.h @@ -19,9 +19,9 @@ #include -namespace torch::executor { +namespace example { -class LlavaRunner : public MultimodalRunner { +class LlavaRunner : public ::executorch::extension::llm::MultimodalRunner { public: explicit LlavaRunner( const std::string& model_path, @@ -29,9 +29,9 @@ class LlavaRunner : public MultimodalRunner { const float temperature = 0.8f) : MultimodalRunner(model_path, tokenizer_path, temperature){}; bool is_loaded(); - Error load(); - Error generate( - std::vector images, + ::executorch::runtime::Error load(); + ::executorch::runtime::Error generate( + std::vector<::executorch::extension::llm::Image> images, const std::string& prompt, int32_t seq_len = 1024, std::function token_callback = {}, @@ -46,7 +46,9 @@ class 
LlavaRunner : public MultimodalRunner { * It's passed as reference and will be updated inside this function. * @return The error status of prefilling images. */ - Error prefill_images(std::vector& images, int64_t& start_pos); + ::executorch::runtime::Error prefill_images( + std::vector<::executorch::extension::llm::Image>& images, + int64_t& start_pos); /** * Prefill an LLaVA Module with the given text input. @@ -57,7 +59,7 @@ class LlavaRunner : public MultimodalRunner { * @param eos The number of EOS (end of sequence) token. * @return The generated token of the LLaVA Module after prefill prompt. */ - Result prefill_prompt( + ::executorch::runtime::Result prefill_prompt( const std::string& prompt, int64_t& start_pos, int8_t bos = 0, @@ -74,7 +76,7 @@ class LlavaRunner : public MultimodalRunner { * @param echo Whether to echo the input prompt or not. * @return The error code. */ - Error generate_from_pos( + ::executorch::runtime::Error generate_from_pos( const std::string& prompt, int32_t seq_len = 1024, int64_t start_pos = 0, @@ -88,4 +90,4 @@ class LlavaRunner : public MultimodalRunner { "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions. USER: "; }; -} // namespace torch::executor +} // namespace example diff --git a/examples/models/llava/runner/llava_text_decoder_runner.h b/examples/models/llava/runner/llava_text_decoder_runner.h index a58bcc47e0a..b2ee56f321a 100644 --- a/examples/models/llava/runner/llava_text_decoder_runner.h +++ b/examples/models/llava/runner/llava_text_decoder_runner.h @@ -12,14 +12,18 @@ #include -namespace torch::executor { +namespace example { -class LlavaTextDecoderRunner : public TextDecoderRunner { +class LlavaTextDecoderRunner + : public executorch::extension::llm::TextDecoderRunner { public: - LlavaTextDecoderRunner(Module* module, int32_t vocab_size, float temperature) + LlavaTextDecoderRunner( + executorch::extension::Module* module, + int32_t vocab_size, + float temperature) : TextDecoderRunner(module, true, vocab_size, temperature){}; - inline Result step( + inline executorch::runtime::Result step( executorch::extension::TensorPtr& tokens, executorch::extension::TensorPtr& start_pos) override { // run token embedding @@ -45,13 +49,13 @@ class LlavaTextDecoderRunner : public TextDecoderRunner { * Load the Module for text decode purpose. * @return The error code. */ - inline Error load() override { + inline executorch::runtime::Error load() override { if (is_method_loaded()) { - return Error::Ok; + return executorch::runtime::Error::Ok; } ET_CHECK_OK_OR_RETURN_ERROR(module_->load_method(kTokenEmbeddingMethod)); ET_CHECK_OK_OR_RETURN_ERROR(module_->load_method(kTextModelMethod)); - return Error::Ok; + return executorch::runtime::Error::Ok; } /** @@ -59,9 +63,9 @@ class LlavaTextDecoderRunner : public TextDecoderRunner { * @return True if the Module is loaded, false otherwise. 
*/ inline bool is_method_loaded() override { - Result> methods_res = + executorch::runtime::Result> methods_res = module_->method_names(); - if (methods_res.error() != Error::Ok) { + if (methods_res.error() != executorch::runtime::Error::Ok) { ET_CHECK_MSG(false, "Failed to get method names"); } std::unordered_set methods = methods_res.get(); @@ -86,4 +90,4 @@ class LlavaTextDecoderRunner : public TextDecoderRunner { inline static const std::string kTextModelMethod = "text_model"; }; -} // namespace torch::executor +} // namespace example diff --git a/examples/models/phi-3-mini-lora/README.md b/examples/models/phi-3-mini-lora/README.md index 69564581af3..987052dbf24 100644 --- a/examples/models/phi-3-mini-lora/README.md +++ b/examples/models/phi-3-mini-lora/README.md @@ -1,5 +1,7 @@ ## Summary -In this example, we export to ExecuTorch a model ([phi-3-mini](https://github.com/pytorch/executorch/tree/main/examples/models/phi-3-mini)) appended with attention and mlp LoRA layers. The model is exported to ExecuTorch for both inference and training. Note: the exported training model can only train at the moment. +In this example, we showcase how to export a model ([phi-3-mini](https://github.com/pytorch/executorch/tree/main/examples/models/phi-3-mini)) appended with LoRA layers to ExecuTorch. The model is exported to ExecuTorch for both inference and training. + +To see how you can use the exported training model in a full finetuning loop, please see our example on [LLM PTE Finetuning](https://github.com/pytorch/executorch/tree/main/examples/llm_pte_finetuning). ## Instructions ### Step 1: [Optional] Install ExecuTorch dependencies @@ -9,7 +11,7 @@ In this example, we export to ExecuTorch a model ([phi-3-mini](https://github.co - `./examples/models/phi-3-mini-lora/install_requirements.sh` ### Step 3: Export and run the model -1. Export the inferenace and training models to ExecuTorch. +1. Export the inference and training models to ExecuTorch. ``` python export_model.py ``` diff --git a/examples/models/phi-3-mini-lora/export_model.py b/examples/models/phi-3-mini-lora/export_model.py index eb8fbc07fe8..e6f291bd581 100644 --- a/examples/models/phi-3-mini-lora/export_model.py +++ b/examples/models/phi-3-mini-lora/export_model.py @@ -28,11 +28,13 @@ def __init__(self, model, loss): self.model = model self.loss = loss - def forward(self, input): + def forward(self, input: torch.Tensor, labels: torch.Tensor) -> torch.Tensor: # Output is of the shape (seq_len, vocab_size). - output = self.model(input) - target = zeros((1, vocab_size), dtype=long) - return self.loss(output, target) + logits = self.model(input) + logits = logits[..., :-1, :].contiguous() + labels = labels[..., 1:].contiguous() + logits = logits.transpose(1, 2) + return self.loss(logits, labels) @no_grad() @@ -47,7 +49,11 @@ def export_phi3_mini_lora(model) -> None: model.eval() # 1. torch.export: Defines the program with the ATen operator set. print("Exporting to aten dialect") - example_args = (randint(0, 100, (1, 100), dtype=long),) + batch_size = 1 + vocab_size = 100 + seq_len = 10 + tokens = randint(0, vocab_size, (batch_size, seq_len), dtype=long) + example_args = (tokens,) with sdpa_kernel([SDPBackend.MATH]): aten_dialect: ExportedProgram = export(model, example_args) @@ -80,7 +86,12 @@ def export_phi3_mini_lora_training(model) -> None: print("Exporting phi3-mini with LoRA for training") # 1. torch.export: Defines the program with the ATen operator set.
print("Exporting to aten dialect") - example_args = (randint(0, 100, (1, 100), dtype=long),) + batch_size = 1 + vocab_size = 100 + seq_len = 10 + tokens = randint(0, vocab_size, (batch_size, seq_len), dtype=long) + labels = tokens + example_args = (tokens, labels) with sdpa_kernel([SDPBackend.MATH]): exported_graph: ExportedProgram = export(model, example_args) print("Creating a joint forward-backwards graph for training") diff --git a/examples/models/phi-3-mini/README.md b/examples/models/phi-3-mini/README.md index 19269716211..e5a86c41777 100644 --- a/examples/models/phi-3-mini/README.md +++ b/examples/models/phi-3-mini/README.md @@ -4,9 +4,9 @@ This example demonstrates how to run a [Phi-3-mini](https://huggingface.co/micro # Instructions ## Step 1: Setup 1. Follow the [tutorial](https://pytorch.org/executorch/main/getting-started-setup) to set up ExecuTorch. For installation run `./install_requirements.sh --pybind xnnpack` -2. To export Phi-3-mini, we need this [PR](https://github.com/huggingface/transformers/pull/32339). Install transformers from master with the following command: +2. Currently, we support transformers v4.44.2. Install transformers with the following command: ``` -pip uninstall -y transformers ; pip install git+https://github.com/huggingface/transformers +pip uninstall -y transformers ; pip install transformers==4.44.2 ``` ## Step 2: Prepare and run the model 1. Download the `tokenizer.model` from HuggingFace and create `tokenizer.bin`. @@ -53,5 +53,14 @@ cmake --build cmake-out/examples/models/phi-3-mini -j16 --config Release ``` - Run model. Options available [here](https://github.com/pytorch/executorch/blob/main/examples/models/phi-3-mini/main.cpp#L13-L30) ``` -cmake-out/examples/models/phi-3-mini/phi_3_mini_runner --model_path= --tokenizer_path= --seq_len=128 --prompt= +cmake-out/examples/models/phi-3-mini/phi_3_mini_runner \ + --model_path=phi-3-mini.pte \ + --tokenizer_path=tokenizer.bin \ + --seq_len=128 \ + --temperature=0 \ + --prompt="<|system|> +You are a helpful assistant.<|end|> +<|user|> +What is the capital of France?<|end|> +<|assistant|>" ``` diff --git a/examples/models/phi-3-mini/export_phi-3-mini.py b/examples/models/phi-3-mini/export_phi-3-mini.py index c2e97a21b1e..305b83457dc 100644 --- a/examples/models/phi-3-mini/export_phi-3-mini.py +++ b/examples/models/phi-3-mini/export_phi-3-mini.py @@ -15,13 +15,13 @@ from executorch.backends.xnnpack.partition.xnnpack_partitioner import XnnpackPartitioner from executorch.backends.xnnpack.utils.configs import get_xnnpack_edge_compile_config from executorch.exir import to_edge -from torch._export import capture_pre_autograd_graph from torch.ao.quantization.quantize_pt2e import convert_pt2e, prepare_pt2e from torch.ao.quantization.quantizer.xnnpack_quantizer import ( get_symmetric_quantization_config, XNNPACKQuantizer, ) +from torch.export import export_for_training from transformers import Phi3ForCausalLM @@ -64,9 +64,9 @@ def export(args) -> None: xnnpack_quantizer = XNNPACKQuantizer() xnnpack_quantizer.set_global(xnnpack_quant_config) - model = capture_pre_autograd_graph( + model = export_for_training( model, example_inputs, dynamic_shapes=dynamic_shapes - ) + ).module() model = prepare_pt2e(model, xnnpack_quantizer) # pyre-fixme[6] model(*example_inputs) model = convert_pt2e(model) diff --git a/examples/models/phi-3-mini/install_requirements.sh b/examples/models/phi-3-mini/install_requirements.sh new file mode 100644 index 00000000000..b8ad5233100 --- /dev/null +++ 
b/examples/models/phi-3-mini/install_requirements.sh @@ -0,0 +1,14 @@ +#!/bin/bash +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +set -x + +pip install transformers==4.44.2 + +pip install sentencepiece + +pip list diff --git a/examples/models/phi-3-mini/main.cpp b/examples/models/phi-3-mini/main.cpp index 7aedcb75b28..86446a8bde3 100644 --- a/examples/models/phi-3-mini/main.cpp +++ b/examples/models/phi-3-mini/main.cpp @@ -42,7 +42,7 @@ int main(int32_t argc, char** argv) { int32_t seq_len = FLAGS_seq_len; - ::torch::executor::Runner runner(model_path, tokenizer_path, temperature); + example::Runner runner(model_path, tokenizer_path, temperature); runner.generate(prompt, seq_len); diff --git a/examples/models/phi-3-mini/runner.cpp b/examples/models/phi-3-mini/runner.cpp index 9da323278f5..ca299d3b11e 100644 --- a/examples/models/phi-3-mini/runner.cpp +++ b/examples/models/phi-3-mini/runner.cpp @@ -15,7 +15,13 @@ #include #include -namespace torch::executor { +using executorch::aten::ScalarType; +using executorch::extension::Module; +using executorch::extension::llm::BPETokenizer; +using executorch::extension::llm::Sampler; +using executorch::runtime::Error; + +namespace example { #define SAMPLER_TOP 0.9f #define ENDOFTEXT_TOKEN 32000 @@ -48,17 +54,8 @@ void Runner::generate(const std::string& prompt, std::size_t max_seq_len) { ET_CHECK_MSG( encode_res.error() == Error::Ok, "Failed to encode %s", prompt.c_str()); auto input_tokens = encode_res.get(); - - std::cout << "Prefilling tokens ..." << std::endl; - for (auto token : input_tokens) { - std::cout << token << " "; - } - std::cout << std::endl; - std::cout.flush(); auto prev_token = input_tokens.back(); auto current_token = prefill(input_tokens); - - std::cout << "Generating tokens ..." 
<< std::endl; std::cout << tokenizer_->decode(prev_token, current_token).get(); std::cout.flush(); @@ -81,7 +78,7 @@ uint64_t Runner::logits_to_token(const exec_aten::Tensor& logits_tensor) { } uint64_t Runner::prefill(std::vector& tokens) { - auto result = module_->forward(from_blob( + auto result = module_->forward(executorch::extension::from_blob( tokens.data(), {1, static_cast(tokens.size())}, ScalarType::Long)); @@ -91,7 +88,8 @@ uint64_t Runner::prefill(std::vector& tokens) { } uint64_t Runner::run_model_step(uint64_t token) { - auto result = module_->forward(from_blob(&token, {1, 1}, ScalarType::Long)); + auto result = module_->forward( + executorch::extension::from_blob(&token, {1, 1}, ScalarType::Long)); ET_CHECK_MSG( result.error() == Error::Ok, "Failed to run forward() for token %" PRIu64, @@ -100,4 +98,4 @@ uint64_t Runner::run_model_step(uint64_t token) { return logits_to_token(result.get()[0].toTensor()); } -} // namespace torch::executor +} // namespace example diff --git a/examples/models/phi-3-mini/runner.h b/examples/models/phi-3-mini/runner.h index 15022751a80..9b24f971708 100644 --- a/examples/models/phi-3-mini/runner.h +++ b/examples/models/phi-3-mini/runner.h @@ -19,7 +19,7 @@ #include #include -namespace torch::executor { +namespace example { class Runner { public: @@ -42,9 +42,9 @@ class Runner { uint64_t prefill(std::vector& tokens); uint64_t run_model_step(uint64_t token); - std::unique_ptr module_; - std::unique_ptr tokenizer_; - std::unique_ptr sampler_; + std::unique_ptr module_; + std::unique_ptr tokenizer_; + std::unique_ptr sampler_; }; -} // namespace torch::executor +} // namespace example diff --git a/examples/portable/scripts/export_and_delegate.py b/examples/portable/scripts/export_and_delegate.py index 8df476f3dfc..50f2ce6d901 100644 --- a/examples/portable/scripts/export_and_delegate.py +++ b/examples/portable/scripts/export_and_delegate.py @@ -61,7 +61,7 @@ def export_composite_module_with_lower_graph(): m_compile_spec = m.get_compile_spec() # pre-autograd export. eventually this will become torch.export - m = torch._export.capture_pre_autograd_graph(m, m_inputs) + m = torch.export.export_for_training(m, m_inputs).module() edge = export_to_edge(m, m_inputs) logging.info(f"Exported graph:\n{edge.exported_program().graph}") @@ -84,7 +84,7 @@ def forward(self, *args): m = CompositeModule() m = m.eval() # pre-autograd export. eventually this will become torch.export - m = torch._export.capture_pre_autograd_graph(m, m_inputs) + m = torch.export.export_for_training(m, m_inputs).module() composited_edge = export_to_edge(m, m_inputs) # The graph module is still runnerable @@ -134,7 +134,7 @@ def get_example_inputs(self): m = Model() m_inputs = m.get_example_inputs() # pre-autograd export. eventually this will become torch.export - m = torch._export.capture_pre_autograd_graph(m, m_inputs) + m = torch.export.export_for_training(m, m_inputs).module() edge = export_to_edge(m, m_inputs) logging.info(f"Exported graph:\n{edge.exported_program().graph}") @@ -171,7 +171,7 @@ def export_and_lower_the_whole_graph(): m_inputs = m.get_example_inputs() # pre-autograd export. 
eventually this will become torch.export - m = torch._export.capture_pre_autograd_graph(m, m_inputs) + m = torch.export.export_for_training(m, m_inputs).module() edge = export_to_edge(m, m_inputs) logging.info(f"Exported graph:\n{edge.exported_program().graph}") diff --git a/examples/qualcomm/README.md b/examples/qualcomm/README.md index fa73c92ee11..ae953be7739 100644 --- a/examples/qualcomm/README.md +++ b/examples/qualcomm/README.md @@ -4,7 +4,7 @@ This directory contains examples for some AI models. We have seperated the example scripts into the following subfolders, please refer to [README.md](../../backends/qualcomm/README.md) for the example scripts' directory structure: -1. executor_runner: This folder contains a general executor runner capable of running most of the models. As a rule of thumb, if a model does not have its own customized runner, execute the model using [executor_runner](./executor_runner/qnn_executor_runner.cpp). On the other hand, if a model has its own runner, such as [llama2](./oss_scripts/llama2/qnn_llama_runner.cpp), use the customized runner to execute the model. Customized runner should be located under the same folder as the model's python script. +1. executor_runner: This folder contains a general executor runner capable of running most of the models. As a rule of thumb, if a model does not have its own customized runner, execute the model using [executor_runner](./executor_runner/qnn_executor_runner.cpp). On the other hand, if a model has its own runner, such as [llama2](./oss_scripts/llama2/qnn_llama_runner.cpp), use the customized runner to execute the model. Customized runner should be located under the same folder as the model's python script. 2. oss_scripts: OSS stands for Open Source Software. This folder contains python scripts for open source models. Some models under this folder might also have their own customized runner. For example, [llama2](./oss_scripts/llama2/qnn_llama_runner.cpp) contains not only the python scripts to prepare the model but also a customized runner for executing the model. @@ -83,7 +83,7 @@ pip install scikit-learn pandas ## Limitation -1. QNN 2.13 is used for all examples. Newer or older QNN might work, +1. QNN 2.24 is used for all examples. Newer or older QNN might work, but the performance and accuracy number can differ. 2. The mobilebert example is on QNN HTP fp16, which is only supported by a limited diff --git a/examples/qualcomm/TARGETS b/examples/qualcomm/TARGETS new file mode 100644 index 00000000000..d6232977478 --- /dev/null +++ b/examples/qualcomm/TARGETS @@ -0,0 +1,22 @@ +# Any targets that should be shared between fbcode and xplat must be defined in +# targets.bzl. This file can contain fbcode-only targets. 
+ +load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") + +oncall("executorch") + +runtime.python_binary( + name = "export_example", + srcs = ["scripts/export_example.py"], + main_function = ".scripts.export_example.main", + visibility = ["//executorch/examples/..."], + deps = [ + "//caffe2:torch", + "//executorch/extension/pybindings:aten_lib", + "//executorch/backends/qualcomm/partition:partition", + "//executorch/backends/qualcomm/quantizer:quantizer", + "//executorch/devtools:lib", + "//executorch/examples/models:models", + "//executorch/extension/export_util:export_util", + ], +) diff --git a/examples/qualcomm/executor_runner/CMakeLists.txt b/examples/qualcomm/executor_runner/CMakeLists.txt index b950a4f82fd..214e0a58547 100644 --- a/examples/qualcomm/executor_runner/CMakeLists.txt +++ b/examples/qualcomm/executor_runner/CMakeLists.txt @@ -22,3 +22,6 @@ target_link_libraries( qnn_executor_runner qnn_executorch_backend full_portable_ops_lib etdump ${FLATCCRT_LIB} gflags ) +set_target_properties( + qnn_executor_runner PROPERTIES LINK_FLAGS "-Wl,-rpath='$ORIGIN'" +) diff --git a/examples/qualcomm/executor_runner/qnn_executor_runner.cpp b/examples/qualcomm/executor_runner/qnn_executor_runner.cpp index f1c84bc6650..7235e36681e 100644 --- a/examples/qualcomm/executor_runner/qnn_executor_runner.cpp +++ b/examples/qualcomm/executor_runner/qnn_executor_runner.cpp @@ -25,7 +25,6 @@ #include #include #include -#include #include #include @@ -40,10 +39,6 @@ DEFINE_string( model_path, "model.pte", "Model serialized in flatbuffer format."); -DEFINE_string( - prof_result_path, - "prof_result.bin", - "Executorch profiler output path."); DEFINE_string( output_folder_path, "outputs", @@ -60,9 +55,40 @@ DEFINE_string( etdump_path, "etdump.etdp", "If etdump generation is enabled an etdump will be written out to this path"); -using namespace torch::executor; -using torch::executor::MemoryAllocator; -using torch::executor::util::FileDataLoader; + +DEFINE_bool( + dump_intermediate_outputs, + false, + "Dump intermediate outputs to etdump file."); + +DEFINE_string( + debug_output_path, + "debug_output.bin", + "Path to dump debug outputs to."); + +DEFINE_int32( + debug_buffer_size, + 20000000, // 20MB + "Size of the debug buffer in bytes to allocate for intermediate outputs and program outputs logging."); + +using executorch::aten::Tensor; +using executorch::aten::TensorImpl; +using executorch::etdump::ETDumpGen; +using executorch::etdump::ETDumpResult; +using executorch::extension::FileDataLoader; +using executorch::extension::prepare_input_tensors; +using executorch::runtime::Error; +using executorch::runtime::EValue; +using executorch::runtime::EventTracerDebugLogLevel; +using executorch::runtime::HierarchicalAllocator; +using executorch::runtime::MemoryAllocator; +using executorch::runtime::MemoryManager; +using executorch::runtime::Method; +using executorch::runtime::MethodMeta; +using executorch::runtime::Program; +using executorch::runtime::Result; +using executorch::runtime::Span; +using executorch::runtime::TensorInfo; class CustomMemory { public: @@ -101,7 +127,7 @@ class CustomMemory { }; int main(int argc, char** argv) { - runtime_init(); + executorch::runtime::runtime_init(); gflags::ParseCommandLineFlags(&argc, &argv, true); if (argc != 1) { @@ -168,7 +194,6 @@ int main(int argc, char** argv) { // In this example we use a statically allocated memory pool. 
MemoryAllocator method_allocator{ MemoryAllocator(sizeof(method_allocator_pool), method_allocator_pool)}; - method_allocator.enable_profiling("method allocator"); // The memory-planned buffers will back the mutable tensors used by the // method. The sizes of these buffers were determined ahead of time during the @@ -201,7 +226,7 @@ int main(int argc, char** argv) { // the method can mutate the memory-planned buffers, so the method should only // be used by a single thread at at time, but it can be reused. // - torch::executor::ETDumpGen etdump_gen = torch::executor::ETDumpGen(); + ETDumpGen etdump_gen; Result method = program->load_method(method_name, &memory_manager, &etdump_gen); ET_CHECK_MSG( @@ -211,6 +236,15 @@ int main(int argc, char** argv) { method.error()); ET_LOG(Info, "Method loaded."); + void* debug_buffer; + if (FLAGS_dump_intermediate_outputs) { + debug_buffer = malloc(FLAGS_debug_buffer_size); + Span buffer((uint8_t*)debug_buffer, FLAGS_debug_buffer_size); + etdump_gen.set_debug_buffer(buffer); + etdump_gen.set_event_tracer_debug_level( + EventTracerDebugLogLevel::kIntermediateOutputs); + } + // Prepare the inputs. // Allocate data memory for inputs and outputs std::vector> in_custom_mem; @@ -242,7 +276,7 @@ int main(int argc, char** argv) { } for (int output_index = 0; output_index < method->outputs_size(); ++output_index) { - const exec_aten::Tensor& t = method->get_output(output_index).toTensor(); + const Tensor& t = method->get_output(output_index).toTensor(); out_custom_mem.push_back( std::make_unique(FLAGS_shared_buffer)); std::unique_ptr& custom_mem_ptr = out_custom_mem.back(); @@ -386,14 +420,6 @@ int main(int argc, char** argv) { fout.close(); } - // Dump the profiling data to the specified file. - torch::executor::prof_result_t prof_result; - EXECUTORCH_DUMP_PROFILE_RESULTS(&prof_result); - if (prof_result.num_bytes != 0) { - FILE* ptr = fopen(FLAGS_prof_result_path.c_str(), "w+"); - fwrite(prof_result.prof_data, 1, prof_result.num_bytes, ptr); - fclose(ptr); - } ++inference_index; } ET_LOG( @@ -404,7 +430,7 @@ int main(int argc, char** argv) { elapsed_time / inference_index); } else { // if no input is provided, fill the inputs with default values - auto inputs = util::prepare_input_tensors(*method); + auto inputs = prepare_input_tensors(*method); ET_CHECK_MSG( inputs.ok(), "Could not prepare inputs: 0x%" PRIx32, @@ -423,7 +449,7 @@ int main(int argc, char** argv) { // Dump the etdump data containing profiling/debugging data to the specified // file. 
- etdump_result result = etdump_gen.get_etdump_data(); + ETDumpResult result = etdump_gen.get_etdump_data(); if (result.buf != nullptr && result.size > 0) { ET_LOG( Info, @@ -436,5 +462,17 @@ int main(int argc, char** argv) { free(result.buf); } + if (FLAGS_dump_intermediate_outputs) { + ET_LOG( + Info, + "Write debug output binary to %s, Size = %zu", + FLAGS_debug_output_path.c_str(), + (size_t)FLAGS_debug_buffer_size); + FILE* f = fopen(FLAGS_debug_output_path.c_str(), "w+"); + fwrite((uint8_t*)debug_buffer, 1, FLAGS_debug_buffer_size, f); + fclose(f); + free(debug_buffer); + } + return 0; } diff --git a/examples/qualcomm/oss_scripts/dino_v2.py b/examples/qualcomm/oss_scripts/dino_v2.py index a8241e34a73..2eb26e6cece 100644 --- a/examples/qualcomm/oss_scripts/dino_v2.py +++ b/examples/qualcomm/oss_scripts/dino_v2.py @@ -6,7 +6,6 @@ import json import os -import sys from multiprocessing.connection import Client import numpy as np @@ -15,6 +14,7 @@ from executorch.examples.qualcomm.utils import ( build_executorch_binary, + get_imagenet_dataset, make_output_dir, parse_skip_delegation_node, setup_common_args_and_variables, @@ -25,40 +25,6 @@ from transformers import Dinov2ForImageClassification -def get_dataset(dataset_path, data_size): - from torchvision import datasets, transforms - - def get_data_loader(): - preprocess = transforms.Compose( - [ - transforms.Resize(256), - transforms.CenterCrop(224), - transforms.ToTensor(), - transforms.Normalize( - mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225] - ), - ] - ) - imagenet_data = datasets.ImageFolder(dataset_path, transform=preprocess) - return torch.utils.data.DataLoader( - imagenet_data, - shuffle=True, - ) - - # prepare input data - inputs, targets, input_list = [], [], "" - data_loader = get_data_loader() - for index, data in enumerate(data_loader): - if index >= data_size: - break - feature, target = data - inputs.append((feature,)) - targets.append(target) - input_list += f"input_{index}_0.raw\n" - - return inputs, targets, input_list - - def get_instance(): model = Dinov2ForImageClassification.from_pretrained( "facebook/dinov2-small-imagenet1k-1-layer" @@ -79,18 +45,17 @@ def main(args): "Please specify a device serial by -s/--device argument." 
) - data_num = 100 - inputs, targets, input_list = get_dataset( + img_size, data_num = 224, 100 + inputs, targets, input_list = get_imagenet_dataset( dataset_path=f"{args.dataset}", data_size=data_num, + image_shape=(256, 256), + crop_size=img_size, ) - - img_size = 224 sample_input = (torch.randn((1, 3, img_size, img_size)),) pte_filename = "dino_v2" instance = get_instance() - build_executorch_binary( instance, sample_input, @@ -100,10 +65,11 @@ def main(args): skip_node_id_set=skip_node_id_set, skip_node_op_set=skip_node_op_set, quant_dtype=QuantDtype.use_8a8w, + shared_buffer=args.shared_buffer, ) if args.compile_only: - sys.exit(0) + return adb = SimpleADB( qnn_sdk=os.getenv("QNN_SDK_ROOT"), diff --git a/examples/qualcomm/oss_scripts/esrgan.py b/examples/qualcomm/oss_scripts/esrgan.py index df02374e4bb..a5f027f79a6 100644 --- a/examples/qualcomm/oss_scripts/esrgan.py +++ b/examples/qualcomm/oss_scripts/esrgan.py @@ -6,7 +6,6 @@ import json import os -import sys from multiprocessing.connection import Client import numpy as np @@ -69,10 +68,11 @@ def main(args): skip_node_id_set=skip_node_id_set, skip_node_op_set=skip_node_op_set, quant_dtype=QuantDtype.use_8a8w, + shared_buffer=args.shared_buffer, ) if args.compile_only: - sys.exit(0) + return adb = SimpleADB( qnn_sdk=os.getenv("QNN_SDK_ROOT"), diff --git a/examples/qualcomm/oss_scripts/fastvit.py b/examples/qualcomm/oss_scripts/fastvit.py new file mode 100644 index 00000000000..30fe74f35b5 --- /dev/null +++ b/examples/qualcomm/oss_scripts/fastvit.py @@ -0,0 +1,215 @@ +# Copyright (c) Qualcomm Innovation Center, Inc. +# All rights reserved +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import json +import os +from multiprocessing.connection import Client + +import numpy as np +import torch + +from executorch.backends.qualcomm.quantizer.quantizer import QuantDtype +from executorch.backends.qualcomm.quantizer.utils import ( + _derived_bias_quant_spec, + MovingAverageMinMaxObserver, + ParamObserver, + QuantizationConfig, + QuantizationSpec, +) +from executorch.backends.qualcomm.utils.constants import ( + QCOM_PASS_EXPAND_BROADCAST_SHAPE, +) +from executorch.backends.qualcomm.utils.utils import convert_linear_to_conv2d +from executorch.examples.qualcomm.utils import ( + build_executorch_binary, + get_imagenet_dataset, + make_output_dir, + make_quantizer, + parse_skip_delegation_node, + setup_common_args_and_variables, + SimpleADB, + topk_accuracy, +) + + +def get_instance(repo_path: str, checkpoint_path: str): + import sys + + sys.path.insert(0, repo_path) + + from models.modules.mobileone import reparameterize_model + from timm.models import create_model + + checkpoint = torch.load(checkpoint_path, weights_only=True) + model = create_model("fastvit_s12") + model = reparameterize_model(model).eval() + model.load_state_dict(checkpoint["state_dict"]) + return model + + +def main(args): + skip_node_id_set, skip_node_op_set = parse_skip_delegation_node(args) + + # ensure the working directory exist. + os.makedirs(args.artifact, exist_ok=True) + + if not args.compile_only and args.device is None: + raise RuntimeError( + "device serial is required if not compile only. " + "Please specify a device serial by -s/--device argument." 
+ ) + + data_num = 100 + inputs, targets, input_list = get_imagenet_dataset( + dataset_path=f"{args.dataset}", + data_size=data_num, + image_shape=(256, 256), + ) + + pte_filename = "fastvit_qnn" + quantizer = make_quantizer(quant_dtype=QuantDtype.use_8a8w) + + # there are lots of outliers appearing in fastvit parameters + # we need to apply special configuration to saturate their impact + act_qspec = QuantizationSpec( + dtype=torch.uint8, + qscheme=torch.per_tensor_affine, + observer_or_fake_quant_ctr=MovingAverageMinMaxObserver.with_args( + **{"averaging_constant": 0.02} + ), + ) + weight_qspec = QuantizationSpec( + dtype=torch.int8, + quant_min=torch.iinfo(torch.int8).min + 1, + quant_max=torch.iinfo(torch.int8).max, + qscheme=torch.per_channel_symmetric, + ch_axis=0, + observer_or_fake_quant_ctr=ParamObserver.with_args( + **{"steps": 200, "use_mse": True} + ), + ) + # rewrite default per-channel ptq config + quantizer.per_channel_quant_config = QuantizationConfig( + input_activation=act_qspec, + output_activation=act_qspec, + weight=weight_qspec, + bias=_derived_bias_quant_spec, + ) + # rewrite default ptq config + q_config = quantizer.bit8_quant_config + quantizer.bit8_quant_config = QuantizationConfig( + input_activation=act_qspec, + output_activation=act_qspec, + weight=q_config.weight, + bias=q_config.bias, + ) + # lower to QNN + build_executorch_binary( + convert_linear_to_conv2d(get_instance(args.oss_repo, args.pretrained_weight)), + inputs[0], + args.model, + f"{args.artifact}/{pte_filename}", + dataset=inputs, + skip_node_id_set=skip_node_id_set, + skip_node_op_set=skip_node_op_set, + quant_dtype=QuantDtype.use_8a8w, + custom_quantizer=quantizer, + custom_pass_config={QCOM_PASS_EXPAND_BROADCAST_SHAPE}, + shared_buffer=args.shared_buffer, + ) + + if args.compile_only: + return + + adb = SimpleADB( + qnn_sdk=os.getenv("QNN_SDK_ROOT"), + build_path=f"{args.build_folder}", + pte_path=f"{args.artifact}/{pte_filename}.pte", + workspace=f"/data/local/tmp/executorch/{pte_filename}", + device_id=args.device, + host_id=args.host, + soc_model=args.model, + ) + adb.push(inputs=inputs, input_list=input_list) + adb.execute() + + # collect output data + output_data_folder = f"{args.artifact}/outputs" + make_output_dir(output_data_folder) + + adb.pull(output_path=args.artifact) + + # top-k analysis + predictions = [] + for i in range(data_num): + predictions.append( + np.fromfile( + os.path.join(output_data_folder, f"output_{i}_0.raw"), dtype=np.float32 + ) + ) + + k_val = [1, 5] + topk = [topk_accuracy(predictions, targets, k).item() for k in k_val] + if args.ip and args.port != -1: + with Client((args.ip, args.port)) as conn: + conn.send(json.dumps({f"top_{k}": topk[i] for i, k in enumerate(k_val)})) + else: + for i, k in enumerate(k_val): + print(f"top_{k}->{topk[i]}%") + + +if __name__ == "__main__": + parser = setup_common_args_and_variables() + + parser.add_argument( + "-a", + "--artifact", + help="path for storing generated artifacts by this example. Default ./fastvit", + default="./fastvit", + type=str, + ) + + parser.add_argument( + "-d", + "--dataset", + help=( + "path to the validation folder of ImageNet dataset. " + "e.g. 
--dataset imagenet-mini/val " + "for https://www.kaggle.com/datasets/ifigotin/imagenetmini-1000)" + ), + type=str, + required=True, + ) + + parser.add_argument( + "--oss_repo", + help="Path to cloned https://github.com/apple/ml-fastvit", + type=str, + required=True, + ) + + parser.add_argument( + "-p", + "--pretrained_weight", + help=( + "Location of model pretrained weight." + "e.g., -p ./fastvit_s12_reparam.pth.tar" + "Pretrained model can be found in " + "https://docs-assets.developer.apple.com/ml-research/models/fastvit/image_classification_distilled_models/fastvit_s12_reparam.pth.tar" + ), + type=str, + required=True, + ) + + args = parser.parse_args() + try: + main(args) + except Exception as e: + if args.ip and args.port != -1: + with Client((args.ip, args.port)) as conn: + conn.send(json.dumps({"Error": str(e)})) + else: + raise Exception(e) diff --git a/examples/qualcomm/oss_scripts/fbnet.py b/examples/qualcomm/oss_scripts/fbnet.py index 495b08b413b..67fe2fba380 100755 --- a/examples/qualcomm/oss_scripts/fbnet.py +++ b/examples/qualcomm/oss_scripts/fbnet.py @@ -7,15 +7,14 @@ import json import os import re -import sys from multiprocessing.connection import Client import numpy as np import timm from executorch.backends.qualcomm.quantizer.quantizer import QuantDtype -from executorch.examples.qualcomm.scripts.inception_v4 import get_dataset from executorch.examples.qualcomm.utils import ( build_executorch_binary, + get_imagenet_dataset, make_output_dir, setup_common_args_and_variables, SimpleADB, @@ -36,9 +35,10 @@ def main(args): instance = timm.create_model("fbnetc_100", pretrained=True).eval() data_num = 100 - inputs, targets, input_list = get_dataset( + inputs, targets, input_list = get_imagenet_dataset( dataset_path=f"{args.dataset}", data_size=data_num, + image_shape=(299, 299), ) pte_filename = "fbnet" @@ -50,10 +50,11 @@ def main(args): f"{args.artifact}/{pte_filename}", inputs, quant_dtype=QuantDtype.use_8a8w, + shared_buffer=args.shared_buffer, ) if args.compile_only: - sys.exit(0) + return adb = SimpleADB( qnn_sdk=os.getenv("QNN_SDK_ROOT"), diff --git a/examples/qualcomm/oss_scripts/gMLP_image_classification.py b/examples/qualcomm/oss_scripts/gMLP_image_classification.py index cbcd6d88cbf..1dffa6831b4 100644 --- a/examples/qualcomm/oss_scripts/gMLP_image_classification.py +++ b/examples/qualcomm/oss_scripts/gMLP_image_classification.py @@ -7,7 +7,6 @@ import json import os -import sys from multiprocessing.connection import Client import numpy as np @@ -17,6 +16,7 @@ from executorch.backends.qualcomm.quantizer.quantizer import QuantDtype from executorch.examples.qualcomm.utils import ( build_executorch_binary, + get_imagenet_dataset, make_output_dir, parse_skip_delegation_node, setup_common_args_and_variables, @@ -25,40 +25,6 @@ ) -def get_dataset(dataset_path, data_size): - from torchvision import datasets, transforms - - def get_data_loader(): - preprocess = transforms.Compose( - [ - transforms.Resize((224, 224)), - transforms.ToTensor(), - transforms.Normalize( - mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225] - ), - ] - ) - imagenet_data = datasets.ImageFolder(dataset_path, transform=preprocess) - return torch.utils.data.DataLoader( - imagenet_data, - shuffle=True, - ) - - # prepare input data - inputs, targets, input_list = [], [], "" - data_loader = get_data_loader() - for index, data in enumerate(data_loader): - if index >= data_size: - break - feature, target = data - inputs.append((feature,)) - for element in target: - targets.append(element) - input_list += 
f"input_{index}_0.raw\n" - - return inputs, targets, input_list - - def main(args): skip_node_id_set, skip_node_op_set = parse_skip_delegation_node(args) @@ -72,9 +38,10 @@ def main(args): ) data_num = 100 - inputs, targets, input_list = get_dataset( + inputs, targets, input_list = get_imagenet_dataset( dataset_path=f"{args.dataset}", data_size=data_num, + image_shape=(224, 224), ) pte_filename = "gMLP_image_classification_qnn" @@ -94,7 +61,7 @@ def main(args): ) if args.compile_only: - sys.exit(0) + return adb = SimpleADB( qnn_sdk=os.getenv("QNN_SDK_ROOT"), diff --git a/examples/qualcomm/oss_scripts/install_requirements.sh b/examples/qualcomm/oss_scripts/install_requirements.sh deleted file mode 100755 index 9987f7cda98..00000000000 --- a/examples/qualcomm/oss_scripts/install_requirements.sh +++ /dev/null @@ -1 +0,0 @@ -pip install timm diff --git a/examples/qualcomm/oss_scripts/llama2/CMakeLists.txt b/examples/qualcomm/oss_scripts/llama2/CMakeLists.txt index 97995086335..61a2ecda56b 100644 --- a/examples/qualcomm/oss_scripts/llama2/CMakeLists.txt +++ b/examples/qualcomm/oss_scripts/llama2/CMakeLists.txt @@ -33,3 +33,6 @@ target_link_libraries( re2::re2 ) target_compile_options(qnn_llama_runner PUBLIC ${_common_compile_options}) +set_target_properties( + qnn_llama_runner PROPERTIES LINK_FLAGS "-Wl,-rpath='$ORIGIN'" +) diff --git a/examples/qualcomm/oss_scripts/llama2/llama.py b/examples/qualcomm/oss_scripts/llama2/llama.py index df8c876abf2..222f2717ed2 100644 --- a/examples/qualcomm/oss_scripts/llama2/llama.py +++ b/examples/qualcomm/oss_scripts/llama2/llama.py @@ -48,6 +48,7 @@ soc_to_chipset_map = { + "SSG2115P": QcomChipset.SSG2115P, "SM8650": QcomChipset.SM8650, "SM8550": QcomChipset.SM8550, "SM8475": QcomChipset.SM8475, @@ -150,7 +151,7 @@ def annotate_matmul_input1(node: Node): def annotate_linear_16a8w_in_affine_layer(gm: torch.fx.GraphModule) -> None: from executorch.backends.qualcomm.quantizer.quantizer import ( - get_ptq_per_channel_weight_config, + get_ptq_per_channel_quant_config, QuantizationConfig, ) from executorch.backends.qualcomm.quantizer.utils import QUANT_ANNOTATION_KEY @@ -172,7 +173,7 @@ def annotate_conv2d(node: Node, quantization_config: QuantizationConfig) -> None _annotated=True, ) - quantization_config_16a8w_per_channel = get_ptq_per_channel_weight_config( + quantization_config_16a8w_per_channel = get_ptq_per_channel_quant_config( torch.uint16, weight_dtype=torch.int8 ) for node in gm.graph.nodes: @@ -311,7 +312,6 @@ def lowering_modules( # Therefore, won't want to pre-allocate # by memory manager in runtime. memory_planning_pass=MemoryPlanningPass( - memory_planning_algo="greedy", alloc_graph_input=False, alloc_graph_output=False, ), @@ -424,8 +424,6 @@ def inference(args, pre_gen_pte=""): runner_cmd = " ".join( [ f"cd {workspace} &&", - "export ADSP_LIBRARY_PATH=. &&", - "export LD_LIBRARY_PATH=. 
&&", f"./qnn_llama_runner {runner_args}", ] ) @@ -573,11 +571,11 @@ def post_process(): inference(args, args.pre_gen_pte) exit(f"Finish the running pre_gen_pte from {args.pre_gen_pte}") - compile(args) if args.compile_only: exit(f"Finish compile_only and save to {args.artifact}") try: + compile(args) inference(args) except Exception as e: if args.ip and args.port != -1: diff --git a/examples/qualcomm/oss_scripts/llama2/qnn_llama_runner.cpp b/examples/qualcomm/oss_scripts/llama2/qnn_llama_runner.cpp index 599accfd1ed..1e46f919dca 100644 --- a/examples/qualcomm/oss_scripts/llama2/qnn_llama_runner.cpp +++ b/examples/qualcomm/oss_scripts/llama2/qnn_llama_runner.cpp @@ -23,8 +23,6 @@ #include #include -using torch::executor::MemoryAllocator; - DEFINE_string( model_path, "qnn_llama2.pte", @@ -49,9 +47,12 @@ DEFINE_int32( 128, "Total number of tokens to generate (prompt + output). Defaults to max_seq_len. If the number of input tokens + seq_len > max_seq_len, the output will be truncated to max_seq_len tokens."); -int main(int argc, char** argv) { - using namespace torch::executor; +using executorch::runtime::Error; +using executorch::runtime::MemoryAllocator; +using executorch::runtime::MethodMeta; +using executorch::runtime::Result; +int main(int argc, char** argv) { gflags::ParseCommandLineFlags(&argc, &argv, true); const char* tokenizer_path = FLAGS_tokenizer_path.c_str(); @@ -60,7 +61,7 @@ int main(int argc, char** argv) { int32_t seq_len = FLAGS_seq_len; // create llama runner - Runner runner(FLAGS_model_path, tokenizer_path, temperature); + example::Runner runner(FLAGS_model_path, tokenizer_path, temperature); ET_CHECK_MSG(runner.load() == Error::Ok, "Runner failed to load method"); // MethodMeta describes the memory requirements of the method. diff --git a/examples/qualcomm/oss_scripts/llama2/runner/runner.cpp b/examples/qualcomm/oss_scripts/llama2/runner/runner.cpp index d8da43c74ce..4d8dd0b91f5 100644 --- a/examples/qualcomm/oss_scripts/llama2/runner/runner.cpp +++ b/examples/qualcomm/oss_scripts/llama2/runner/runner.cpp @@ -22,11 +22,27 @@ #include #include -namespace torch { -namespace executor { +using executorch::aten::ScalarType; +using executorch::aten::SizesType; +using executorch::aten::Tensor; +using executorch::extension::from_blob; +using executorch::extension::Module; +using executorch::extension::TensorPtr; +using executorch::extension::llm::BPETokenizer; +using executorch::extension::llm::Sampler; +using executorch::extension::llm::time_in_ms; +using executorch::runtime::Error; +using executorch::runtime::EValue; +using executorch::runtime::MethodMeta; +using executorch::runtime::Result; +using executorch::runtime::TensorInfo; + +// TODO: Remove this usage of an internal-only function. 
+using executorch::runtime::internal::set_tensor_data; + +namespace example { namespace { -using namespace executorch::extension; static constexpr auto kTopp = 0.9f; void printReport(const Runner::Stats& stats); std::string statsToJsonString(const Runner::Stats& stats); @@ -57,7 +73,7 @@ Error Runner::load() { if (is_loaded()) { return Error::Ok; } - stats_.model_load_start_ms = util::time_in_ms(); + stats_.model_load_start_ms = time_in_ms(); ET_CHECK_OK_OR_RETURN_ERROR(module_->load_method("forward")); // Read out metadata from the model @@ -97,7 +113,7 @@ Error Runner::load() { temperature_, kTopp, static_cast(std::time(nullptr))); - stats_.model_load_end_ms = util::time_in_ms(); + stats_.model_load_end_ms = time_in_ms(); return Error::Ok; } @@ -125,7 +141,7 @@ T Runner::getMetadataHelper(std::string method_name, T default_val) { } template -int32_t Runner::logitsToToken(const exec_aten::Tensor& logits_tensor) { +int32_t Runner::logitsToToken(const Tensor& logits_tensor) { T* logits = logits_tensor.mutable_data_ptr(); // Since the logits are for all tokens, get the last token probabilities @@ -135,7 +151,7 @@ int32_t Runner::logitsToToken(const exec_aten::Tensor& logits_tensor) { // Given an input token. Set up the inputs for the model and execute a single // step. Returning the logits tensor. -Result Runner::run_model_step( +Result Runner::run_model_step( int64_t input_token, TensorPtr& token, TensorPtr& start_pos, @@ -145,7 +161,10 @@ Result Runner::run_model_step( token->mutable_data_ptr()[0] = input_token; // inputs:[tokens, start_pos, atten_mask, k_cache, v_cache] - auto outputs_res = module_->forward({token, start_pos, atten_mask}); + std::vector inputs = { + token, start_pos, atten_mask}; + inputs.insert(inputs.end(), kv_tensors.begin(), kv_tensors.end()); + auto outputs_res = module_->forward(inputs); ET_CHECK_OK_OR_RETURN_ERROR(outputs_res.error()); // TODO: need to handle batch size != 1 @@ -167,7 +186,7 @@ Result Runner::run_model_step( char* new_inp_addr = io_mem_mgr_.update_k_caches_read(j, el_size); // inputs ET_CHECK_MSG( - internal::set_tensor_data( + set_tensor_data( *kv_tensors[j], new_inp_addr, kv_tensors[j]->nbytes()) == Error::Ok, "Failed to set input tensor when updating k_cache"); } @@ -177,17 +196,17 @@ Result Runner::run_model_step( char* new_inp_addr = io_mem_mgr_.update_v_caches_read(v_idx, v_offset); ET_CHECK_MSG( - internal::set_tensor_data( + set_tensor_data( *kv_tensors[j], new_inp_addr, kv_tensors[j]->nbytes()) == Error::Ok, "Failed to set input tensor when updating v_cache"); // outputs char* new_out_addr = io_mem_mgr_.update_v_caches_write(v_idx, v_offset); ET_CHECK_MSG( - internal::set_tensor_data( + set_tensor_data( *kv_outputs[j], new_out_addr, kv_outputs[j]->nbytes()) == Error::Ok, "Failed to set output tensor when updating v_cache"); ET_CHECK_MSG( - module_->set_output_data_ptr(*kv_outputs[j], j + 1) == Error::Ok, + module_->set_output(*kv_outputs[j], j + 1) == Error::Ok, "Failed to set llama output data pointer"); } @@ -210,7 +229,7 @@ Error Runner::generate( // First token time only measures the time it takes to encode the prompt and // return a response token. 
- stats_.inference_start_ms = util::time_in_ms(); + stats_.inference_start_ms = time_in_ms(); shouldStop_ = false; // Set the sequence length to the max seq length if not provided @@ -235,21 +254,21 @@ Error Runner::generate( "Sequence length exceeded - please increase the seq_len value passed to generate()"); int32_t pos = 0, prev_token, cur_token = prompt_tokens[0]; - std::vector token_shape = {1, 1}; + std::vector token_shape = {1, 1}; io_mem_mgr_.get_input_token_ptr()[0] = 0; - std::vector start_pos_shape = {1, 1}; + std::vector start_pos_shape = {1, 1}; float* atten_mask_ptr = reinterpret_cast(io_mem_mgr_.get_atten_mask_ptr()); std::fill(atten_mask_ptr, atten_mask_ptr + max_seq_len_, -255); atten_mask_ptr[max_seq_len_ - 1] = 0; - std::vector atten_mask_shape = {1, max_seq_len_}; + std::vector atten_mask_shape = {1, max_seq_len_}; - std::vector logits_data_shape = {1, vocab_size_}; + std::vector logits_data_shape = {1, vocab_size_}; - std::vector hidden_states_data_shape = {1, 1, dim_}; + std::vector hidden_states_data_shape = {1, 1, dim_}; // initialize tensor wrappers auto token = from_blob( @@ -274,7 +293,7 @@ Error Runner::generate( method_meta->input_tensor_meta(input_index); auto tensor_shape = tensor_meta->sizes(); - std::vector sizes( + std::vector sizes( tensor_shape.data(), tensor_shape.data() + tensor_shape.size()); kv_tensors.emplace_back(from_blob( io_mem_mgr_.get_k_caches_read_ptr(i), @@ -284,14 +303,14 @@ Error Runner::generate( // outpus Result out_tensor_meta = method_meta->output_tensor_meta(i + 1); tensor_shape = out_tensor_meta->sizes(); - sizes = std::vector{ + sizes = std::vector{ tensor_shape.data(), tensor_shape.data() + tensor_shape.size()}; kv_outputs.emplace_back(from_blob( io_mem_mgr_.get_k_caches_write_ptr(i), sizes, kv_tensors.back()->scalar_type())); ET_CHECK_MSG( - module_->set_output_data_ptr(kv_outputs.back(), i + 1) == Error::Ok, + module_->set_output(kv_outputs.back(), i + 1) == Error::Ok, "Failed to set output tensor for kv cache"); } @@ -303,7 +322,7 @@ Error Runner::generate( Result tensor_meta = method_meta->input_tensor_meta(input_index); auto tensor_shape = tensor_meta->sizes(); - std::vector sizes( + std::vector sizes( tensor_shape.data(), tensor_shape.data() + tensor_shape.size()); kv_tensors.emplace_back(from_blob( @@ -315,7 +334,7 @@ Error Runner::generate( Result out_tensor_meta = method_meta->output_tensor_meta(output_index); tensor_shape = out_tensor_meta->sizes(); - sizes = std::vector{ + sizes = std::vector{ tensor_shape.data(), tensor_shape.data() + tensor_shape.size()}; kv_outputs.push_back(from_blob( @@ -323,8 +342,7 @@ Error Runner::generate( sizes, kv_tensors.back()->scalar_type())); ET_CHECK_MSG( - module_->set_output_data_ptr(kv_outputs.back(), output_index) == - Error::Ok, + module_->set_output(kv_outputs.back(), output_index) == Error::Ok, "Failed to set output tensor for llama block"); } @@ -333,7 +351,7 @@ Error Runner::generate( logits_data_shape, ScalarType::Float); ET_CHECK_MSG( - module_->set_output_data_ptr(affine_logits, 0) == Error::Ok, + module_->set_output(affine_logits) == Error::Ok, "Failed to set output tensor for affine module - logits"); // Start consuming user's prompts and generating new tokens @@ -343,19 +361,18 @@ Error Runner::generate( auto logits_res = run_model_step( cur_token, token, start_pos, atten_mask, kv_tensors, kv_outputs); if (pos == num_prompt_tokens) { - stats_.first_token_ms = util::time_in_ms(); + stats_.first_token_ms = time_in_ms(); } else if (pos == num_prompt_tokens - 1) { - 
stats_.prompt_eval_end_ms = util::time_in_ms(); + stats_.prompt_eval_end_ms = time_in_ms(); } ET_CHECK_OK_OR_RETURN_ERROR(logits_res.error()); - exec_aten::Tensor& logits_tensor = logits_res.get(); + Tensor& logits_tensor = logits_res.get(); prev_token = cur_token; - long sample_start_time_ms = util::time_in_ms(); + long sample_start_time_ms = time_in_ms(); cur_token = logitsToToken(logits_tensor); - stats_.aggregate_sampling_time_ms += - util::time_in_ms() - sample_start_time_ms; + stats_.aggregate_sampling_time_ms += time_in_ms() - sample_start_time_ms; // advance the state machine if (pos < num_prompt_tokens - 1) { @@ -382,7 +399,7 @@ Error Runner::generate( break; } } - stats_.inference_end_ms = util::time_in_ms(); + stats_.inference_end_ms = time_in_ms(); if (pos == seq_len) { ET_LOG(Info, "Sequence length (%i tokens) reached!", seq_len); @@ -651,5 +668,4 @@ template bool Runner::getMetadataHelper( std::string method_name, bool default_val); -} // namespace executor -} // namespace torch +} // namespace example diff --git a/examples/qualcomm/oss_scripts/llama2/runner/runner.h b/examples/qualcomm/oss_scripts/llama2/runner/runner.h index 1c35c821ceb..700cb94f52c 100644 --- a/examples/qualcomm/oss_scripts/llama2/runner/runner.h +++ b/examples/qualcomm/oss_scripts/llama2/runner/runner.h @@ -106,21 +106,20 @@ class RpcMemAllocator { return reinterpret_cast(ptr_) + name##_pos_[idx]; \ } -namespace torch { -namespace executor { +namespace example { class IoMemMgr { public: // Allocate a big memory which is capable to contain all IO of all modules IoMemMgr(){}; - IoMemMgr(MethodMeta method_meta); + IoMemMgr(executorch::runtime::MethodMeta method_meta); struct InfoAttrs { - std::unique_ptr tensor_meta; + std::unique_ptr tensor_meta; size_t size = 0; std::vector shape; uint32_t rank; size_t element_size; - torch::executor::ScalarType dtype; + executorch::aten::ScalarType dtype; }; struct IoInfo { @@ -186,15 +185,16 @@ class IoMemMgr { std::vector v_caches_write_pos_; IoInfo io_info_; - std::unique_ptr method_meta_; + std::unique_ptr method_meta_; RpcMemAllocator rpc_mem_allocator{QnnMemDescriptor::kCustom}; - std::unordered_map scalar_type_to_size = { - {ScalarType::Int, sizeof(int32_t)}, - {ScalarType::Float, sizeof(float)}, - {ScalarType::Char, sizeof(int8_t)}, - {ScalarType::Short, sizeof(int16_t)}, - {ScalarType::Byte, sizeof(uint8_t)}, - {ScalarType::Bits16, sizeof(uint16_t)}, + std::unordered_map scalar_type_to_size = + { + {executorch::aten::ScalarType::Int, sizeof(int32_t)}, + {executorch::aten::ScalarType::Float, sizeof(float)}, + {executorch::aten::ScalarType::Char, sizeof(int8_t)}, + {executorch::aten::ScalarType::Short, sizeof(int16_t)}, + {executorch::aten::ScalarType::Byte, sizeof(uint8_t)}, + {executorch::aten::ScalarType::Bits16, sizeof(uint16_t)}, }; }; @@ -232,23 +232,24 @@ class Runner { }; bool is_loaded() const; - Error load(); - Error mem_alloc(size_t alignment, size_t seq_len); - Error generate( + executorch::runtime::Error load(); + executorch::runtime::Error mem_alloc(size_t alignment, size_t seq_len); + executorch::runtime::Error generate( const std::string& prompt, int32_t seq_len, std::function token_callback = {}, std::function stats_callback = {}); void stop(); - Result get_method_meta(); + executorch::runtime::Result + get_method_meta(); private: // metadata template T getMetadataHelper(std::string method_name, T default_val); template - int32_t logitsToToken(const exec_aten::Tensor& logits_tensor); - Result run_model_step( + int32_t logitsToToken(const 
executorch::aten::Tensor& logits_tensor); + executorch::runtime::Result run_model_step( int64_t input_token, ::executorch::extension::TensorPtr& token, ::executorch::extension::TensorPtr& start_pos, @@ -265,16 +266,15 @@ class Runner { int32_t head_dim_; int32_t dim_; std::unordered_set model_methods_; - std::unique_ptr module_; + std::unique_ptr module_; std::string tokenizer_path_; std::string model_path_; float temperature_; - std::unique_ptr tokenizer_; - std::unique_ptr sampler_; + std::unique_ptr tokenizer_; + std::unique_ptr sampler_; bool shouldStop_{false}; Stats stats_; IoMemMgr io_mem_mgr_; }; -} // namespace executor -} // namespace torch +} // namespace example diff --git a/examples/qualcomm/oss_scripts/regnet.py b/examples/qualcomm/oss_scripts/regnet.py index 0dc70608daf..01b6bb9937e 100644 --- a/examples/qualcomm/oss_scripts/regnet.py +++ b/examples/qualcomm/oss_scripts/regnet.py @@ -6,14 +6,13 @@ import json import os -import sys from multiprocessing.connection import Client import numpy as np -import torch from executorch.backends.qualcomm.quantizer.quantizer import QuantDtype from executorch.examples.qualcomm.utils import ( build_executorch_binary, + get_imagenet_dataset, make_output_dir, parse_skip_delegation_node, setup_common_args_and_variables, @@ -29,41 +28,6 @@ ) -def get_dataset(dataset_path, data_size): - from torchvision import datasets, transforms - - def get_data_loader(): - preprocess = transforms.Compose( - [ - transforms.Resize(256), - transforms.CenterCrop(224), - transforms.ToTensor(), - transforms.Normalize( - mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225] - ), - ] - ) - imagenet_data = datasets.ImageFolder(dataset_path, transform=preprocess) - return torch.utils.data.DataLoader( - imagenet_data, - shuffle=True, - ) - - # prepare input data - inputs, targets, input_list = [], [], "" - data_loader = get_data_loader() - for index, data in enumerate(data_loader): - if index >= data_size: - break - feature, target = data - inputs.append((feature,)) - for element in target: - targets.append(element) - input_list += f"input_{index}_0.raw\n" - - return inputs, targets, input_list - - def main(args): skip_node_id_set, skip_node_op_set = parse_skip_delegation_node(args) @@ -77,9 +41,11 @@ def main(args): ) data_num = 100 - inputs, targets, input_list = get_dataset( + inputs, targets, input_list = get_imagenet_dataset( dataset_path=f"{args.dataset}", data_size=data_num, + image_shape=(256, 256), + crop_size=224, ) if args.weights == "regnet_y_400mf": @@ -97,11 +63,14 @@ def main(args): args.model, f"{args.artifact}/{pte_filename}", inputs, + skip_node_id_set=skip_node_id_set, + skip_node_op_set=skip_node_op_set, quant_dtype=QuantDtype.use_8a8w, + shared_buffer=args.shared_buffer, ) if args.compile_only: - sys.exit(0) + return adb = SimpleADB( qnn_sdk=os.getenv("QNN_SDK_ROOT"), diff --git a/examples/qualcomm/oss_scripts/squeezenet.py b/examples/qualcomm/oss_scripts/squeezenet.py index 64b317068ce..9e486e94c07 100644 --- a/examples/qualcomm/oss_scripts/squeezenet.py +++ b/examples/qualcomm/oss_scripts/squeezenet.py @@ -6,7 +6,6 @@ import json import os -import sys from multiprocessing.connection import Client import numpy as np @@ -15,6 +14,7 @@ from executorch.backends.qualcomm.quantizer.quantizer import QuantDtype from executorch.examples.qualcomm.utils import ( build_executorch_binary, + get_imagenet_dataset, make_output_dir, parse_skip_delegation_node, setup_common_args_and_variables, @@ -23,40 +23,6 @@ ) -def get_dataset(dataset_path, data_size): - from 
torchvision import datasets, transforms - - def get_data_loader(): - preprocess = transforms.Compose( - [ - transforms.Resize(256), - transforms.CenterCrop(224), - transforms.ToTensor(), - transforms.Normalize( - mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225] - ), - ] - ) - imagenet_data = datasets.ImageFolder(dataset_path, transform=preprocess) - return torch.utils.data.DataLoader( - imagenet_data, - shuffle=True, - ) - - # prepare input data - inputs, targets, input_list = [], [], "" - data_loader = get_data_loader() - for index, data in enumerate(data_loader): - if index >= data_size: - break - feature, target = data - inputs.append((feature,)) - targets.append(target) - input_list += f"input_{index}_0.raw\n" - - return inputs, targets, input_list - - def main(args): skip_node_id_set, skip_node_op_set = parse_skip_delegation_node(args) @@ -70,13 +36,17 @@ def main(args): ) data_num = 100 - inputs, targets, input_list = get_dataset( + inputs, targets, input_list = get_imagenet_dataset( dataset_path=f"{args.dataset}", data_size=data_num, + image_shape=(256, 256), + crop_size=224, ) pte_filename = "squeezenet_qnn" instance = torch.hub.load( - "pytorch/vision:v0.10.0", "squeezenet1_1", pretrained=True + "pytorch/vision:v0.13.0", + "squeezenet1_1", + weights="SqueezeNet1_1_Weights.DEFAULT", ) build_executorch_binary( instance.eval(), @@ -86,11 +56,12 @@ def main(args): inputs, skip_node_id_set=skip_node_id_set, skip_node_op_set=skip_node_op_set, - quant_dtype=QuantDtype.use_16a16w, + quant_dtype=QuantDtype.use_8a8w, + shared_buffer=args.shared_buffer, ) if args.compile_only: - sys.exit(0) + return adb = SimpleADB( qnn_sdk=os.getenv("QNN_SDK_ROOT"), diff --git a/examples/qualcomm/oss_scripts/ssd300_vgg16.py b/examples/qualcomm/oss_scripts/ssd300_vgg16.py index a5db138233e..2db51cd5c48 100644 --- a/examples/qualcomm/oss_scripts/ssd300_vgg16.py +++ b/examples/qualcomm/oss_scripts/ssd300_vgg16.py @@ -150,10 +150,11 @@ def main(args): skip_node_id_set=skip_node_id_set, skip_node_op_set=skip_node_op_set, quant_dtype=QuantDtype.use_8a8w, + shared_buffer=args.shared_buffer, ) if args.compile_only: - sys.exit(0) + return adb = SimpleADB( qnn_sdk=os.getenv("QNN_SDK_ROOT"), diff --git a/examples/qualcomm/qaihub_scripts/llama/CMakeLists.txt b/examples/qualcomm/qaihub_scripts/llama/CMakeLists.txt index 1a9406ca955..947b3ef975c 100644 --- a/examples/qualcomm/qaihub_scripts/llama/CMakeLists.txt +++ b/examples/qualcomm/qaihub_scripts/llama/CMakeLists.txt @@ -32,7 +32,7 @@ target_include_directories( target_link_libraries( qaihub_llama2_7b_runner qnn_executorch_backend - executorch_no_prim_ops + executorch_core extension_data_loader extension_module extension_tensor @@ -42,6 +42,9 @@ target_link_libraries( target_compile_options( qaihub_llama2_7b_runner PUBLIC ${_common_compile_options} ) +set_target_properties( + qaihub_llama2_7b_runner PROPERTIES LINK_FLAGS "-Wl,-rpath='$ORIGIN'" +) # preprocess qaihub llama3 8b runner src files set(_qaihub_llama3_8b_runner__srcs ${_qaihub_llama_runner__srcs}) @@ -87,7 +90,7 @@ target_include_directories( target_link_libraries( qaihub_llama3_8b_runner qnn_executorch_backend - executorch_no_prim_ops + executorch_core extension_data_loader extension_module extension_tensor @@ -97,3 +100,6 @@ target_link_libraries( target_compile_options( qaihub_llama3_8b_runner PUBLIC ${_common_compile_options} ) +set_target_properties( + qaihub_llama3_8b_runner PROPERTIES LINK_FLAGS "-Wl,-rpath='$ORIGIN'" +) diff --git 
a/examples/qualcomm/qaihub_scripts/llama/llama2/qaihub_llama2_7b.py b/examples/qualcomm/qaihub_scripts/llama/llama2/qaihub_llama2_7b.py index 9966d665aec..c54b75a6b6a 100644 --- a/examples/qualcomm/qaihub_scripts/llama/llama2/qaihub_llama2_7b.py +++ b/examples/qualcomm/qaihub_scripts/llama/llama2/qaihub_llama2_7b.py @@ -134,8 +134,6 @@ def get_logit_encoding(path_to_last_shard: str): runner_cmds = " ".join( [ f"cd {adb.workspace} &&", - "export ADSP_LIBRARY_PATH=. &&", - "export LD_LIBRARY_PATH=. &&", f"./qaihub_llama2_7b_runner {' '.join(runner_args)}", ] ) diff --git a/examples/qualcomm/qaihub_scripts/llama/llama2/qaihub_llama2_7b_runner.cpp b/examples/qualcomm/qaihub_scripts/llama/llama2/qaihub_llama2_7b_runner.cpp index d69aa0aa7a8..3de97cde7e8 100644 --- a/examples/qualcomm/qaihub_scripts/llama/llama2/qaihub_llama2_7b_runner.cpp +++ b/examples/qualcomm/qaihub_scripts/llama/llama2/qaihub_llama2_7b_runner.cpp @@ -49,8 +49,6 @@ DEFINE_double(logits_scale, 0.0, "Path to logits scale file"); DEFINE_int32(logits_offset, 0, "Path to logits offset file"); int main(int argc, char** argv) { - using namespace torch::executor; - gflags::ParseCommandLineFlags(&argc, &argv, true); std::vector models_path = { @@ -62,7 +60,7 @@ int main(int argc, char** argv) { FLAGS_freq_cos_path, FLAGS_freq_sin_path}; // create llama runner - Runner runner( + example::Runner runner( models_path, pos_embs_path, {8, 8, 8, 8}, diff --git a/examples/qualcomm/qaihub_scripts/llama/llama3/qaihub_llama3_8b.py b/examples/qualcomm/qaihub_scripts/llama/llama3/qaihub_llama3_8b.py index bdcd7ad6a2e..9acbeebef2d 100644 --- a/examples/qualcomm/qaihub_scripts/llama/llama3/qaihub_llama3_8b.py +++ b/examples/qualcomm/qaihub_scripts/llama/llama3/qaihub_llama3_8b.py @@ -124,8 +124,6 @@ def main(args): runner_cmds = " ".join( [ f"cd {adb.workspace} &&", - "export ADSP_LIBRARY_PATH=. &&", - "export LD_LIBRARY_PATH=. &&", f"./qaihub_llama3_8b_runner {' '.join(runner_args)}", ] ) diff --git a/examples/qualcomm/qaihub_scripts/llama/llama3/qaihub_llama3_8b_runner.cpp b/examples/qualcomm/qaihub_scripts/llama/llama3/qaihub_llama3_8b_runner.cpp index 9d06e8118da..7591b7ae1e9 100644 --- a/examples/qualcomm/qaihub_scripts/llama/llama3/qaihub_llama3_8b_runner.cpp +++ b/examples/qualcomm/qaihub_scripts/llama/llama3/qaihub_llama3_8b_runner.cpp @@ -54,8 +54,6 @@ DEFINE_double(logits_scale, 0.0, "Path to logits scale file"); DEFINE_int32(logits_offset, 0, "Path to logits offset file"); int main(int argc, char** argv) { - using namespace torch::executor; - gflags::ParseCommandLineFlags(&argc, &argv, true); std::vector models_path = { @@ -68,7 +66,7 @@ int main(int argc, char** argv) { FLAGS_freq_cos_path, FLAGS_freq_sin_path}; // create llama runner - Runner runner( + example::Runner runner( models_path, pos_embs_path, {4, 8, 8, 8, 4}, diff --git a/examples/qualcomm/qaihub_scripts/llama/runner/io_memory.cpp b/examples/qualcomm/qaihub_scripts/llama/runner/io_memory.cpp index 3283d81a9f3..9dc1ee7e254 100644 --- a/examples/qualcomm/qaihub_scripts/llama/runner/io_memory.cpp +++ b/examples/qualcomm/qaihub_scripts/llama/runner/io_memory.cpp @@ -6,13 +6,21 @@ * LICENSE file in the root directory of this source tree. 
*/ +#include #include #include #include -namespace torch { -namespace executor { +using executorch::aten::Tensor; +using executorch::aten::TensorImpl; +using executorch::extension::Module; +using executorch::runtime::Error; +using executorch::runtime::MethodMeta; +using executorch::runtime::Result; +using executorch::runtime::TensorInfo; + +namespace example { Memory::Memory( const std::vector& pos_embs_path, @@ -427,7 +435,7 @@ void KVCachedMemory::update_io( // k, v are placed interleaved int index = (cache_stride << 1) + (cache_group << 5) + head; ET_CHECK_MSG( - modules_[shard]->set_output_data_ptr( + modules_[shard]->set_output( output_tensors[shard][index], index) == Error::Ok, "failed to set output tensor for module %d's %d'th output " "while updating kv_cache output tensors", @@ -450,8 +458,8 @@ void KVCachedMemory::update_io( for (int shard = 0; shard < output_tensors.size(); shard++) { for (int index = 0; index < output_tensors[shard].size(); index++) { ET_CHECK_MSG( - modules_[shard]->set_output_data_ptr( - output_tensors[shard][index], index) == Error::Ok, + modules_[shard]->set_output(output_tensors[shard][index], index) == + Error::Ok, "failed to set output tensor for module %d's %d'th output " "while updating kv_cache output tensors", shard, @@ -476,7 +484,7 @@ void KVCachedMemory::update_io( ThreadPool::ThreadPool() : stop_(false) { size_t hc = (std::thread::hardware_concurrency() + 3) / 4; // maximum number should be divisible by head dimension which equals to 32 - num_workers_ = min(32, hc * 4); + num_workers_ = std::min(32, hc * 4); for (size_t i = 0; i < num_workers_; ++i) { threads_.emplace_back([this]() { while (1) { @@ -520,5 +528,4 @@ size_t ThreadPool::num_workers() { return num_workers_; } -} // namespace executor -} // namespace torch +} // namespace example diff --git a/examples/qualcomm/qaihub_scripts/llama/runner/io_memory.h b/examples/qualcomm/qaihub_scripts/llama/runner/io_memory.h index df64bf8263d..4ad7264cc91 100644 --- a/examples/qualcomm/qaihub_scripts/llama/runner/io_memory.h +++ b/examples/qualcomm/qaihub_scripts/llama/runner/io_memory.h @@ -26,44 +26,47 @@ #define QAIHUB_LLAMA_LOGITS 32000 #endif -namespace torch { -namespace executor { +namespace example { class Memory { public: Memory( const std::vector& pos_embs_path, - std::vector>& modules); + std::vector>& modules); virtual ~Memory(); virtual void prepare_io( - const std::vector>& methods_meta) = 0; + const std::vector< + executorch::runtime::Result>& + methods_meta) = 0; virtual void update_io( int64_t cur_token, int64_t pos, - std::vector>& output_tensors) = 0; + std::vector>& output_tensors) = 0; void* get_mutable_ptr(); - std::vector get_input_tensors(int shard_index); - std::vector get_output_tensors(int shard_index); + std::vector get_input_tensors(int shard_index); + std::vector get_output_tensors(int shard_index); protected: std::unique_ptr data_ptr_; - std::vector> input_tensors_; - std::vector> output_tensors_; + std::vector> input_tensors_; + std::vector> output_tensors_; std::vector pos_embs_path_; - std::vector> modules_; + std::vector> modules_; }; class BertMemory : public Memory { public: BertMemory( const std::vector& pos_embs_path, - std::vector>& modules, + std::vector>& modules, std::vector shard_layers); - void prepare_io(const std::vector>& methods_meta) override; + void prepare_io(const std::vector>& methods_meta) override; void update_io( int64_t cur_token, int64_t pos, - std::vector>& output_tensors) override; + std::vector>& output_tensors) + override; struct IO { 
int32_t input_ids[1024 * 2]; uint16_t hidden_state[1024 * 4096]; @@ -76,14 +79,14 @@ class BertMemory : public Memory { }; private: - std::unique_ptr input_ids_; - std::unique_ptr hidden_state_; - std::unique_ptr attention_mask_; - std::unique_ptr position_ids_cos_; - std::unique_ptr position_ids_sin_; - std::vector> k_cache_; - std::vector> v_cache_; - std::unique_ptr logits_; + std::unique_ptr input_ids_; + std::unique_ptr hidden_state_; + std::unique_ptr attention_mask_; + std::unique_ptr position_ids_cos_; + std::unique_ptr position_ids_sin_; + std::vector> k_cache_; + std::vector> v_cache_; + std::unique_ptr logits_; std::vector shard_layers_; int num_heads_; }; @@ -117,13 +120,15 @@ class KVCachedMemory : public Memory { public: KVCachedMemory( const std::vector& pos_embs_path, - std::vector>& modules, + std::vector>& modules, std::vector shard_layers); - void prepare_io(const std::vector>& methods_meta) override; + void prepare_io(const std::vector>& methods_meta) override; void update_io( int64_t cur_token, int64_t pos, - std::vector>& output_tensors) override; + std::vector>& output_tensors) + override; struct IO { int32_t input_ids; uint16_t hidden_state[4096]; @@ -142,16 +147,16 @@ class KVCachedMemory : public Memory { }; private: - std::unique_ptr input_ids_; - std::unique_ptr hidden_state_; - std::unique_ptr attention_mask_; - std::unique_ptr position_ids_cos_; - std::unique_ptr position_ids_sin_; - std::vector> k_cache_in_; - std::vector> v_cache_in_; - std::vector> k_cache_out_; - std::vector> v_cache_out_; - std::unique_ptr logits_; + std::unique_ptr input_ids_; + std::unique_ptr hidden_state_; + std::unique_ptr attention_mask_; + std::unique_ptr position_ids_cos_; + std::unique_ptr position_ids_sin_; + std::vector> k_cache_in_; + std::vector> v_cache_in_; + std::vector> k_cache_out_; + std::vector> v_cache_out_; + std::unique_ptr logits_; std::vector lr_update_kv_; std::vector> futures_; ThreadPool thread_pool_; @@ -159,5 +164,4 @@ class KVCachedMemory : public Memory { int num_heads_; }; -} // namespace executor -} // namespace torch +} // namespace example diff --git a/examples/qualcomm/qaihub_scripts/llama/runner/runner.cpp b/examples/qualcomm/qaihub_scripts/llama/runner/runner.cpp index d6d99112932..721c16209c2 100644 --- a/examples/qualcomm/qaihub_scripts/llama/runner/runner.cpp +++ b/examples/qualcomm/qaihub_scripts/llama/runner/runner.cpp @@ -29,8 +29,16 @@ #include "arm_neon.h" #endif -namespace torch { -namespace executor { +using executorch::aten::Tensor; +using executorch::extension::Module; +using executorch::extension::llm::Sampler; +using executorch::extension::llm::time_in_ms; +using executorch::runtime::Error; +using executorch::runtime::EValue; +using executorch::runtime::MethodMeta; +using executorch::runtime::Result; + +namespace example { namespace { static constexpr auto kTopp = 0.9f; @@ -66,12 +74,12 @@ Runner::Runner( // load tokenizer #if defined(QAIHUB_LLAMA3_RUNNER) - tokenizer_ = get_tiktoken_for_llama(); + tokenizer_ = example::get_tiktoken_for_llama(); tokenizer_->load(tokenizer_path_); eos_id_.insert(tokenizer_->encode("<|eot_id|>", 0, 0).get()[0]); version_ = LlamaVersion::kLlama3; #else - tokenizer_ = std::make_unique(); + tokenizer_ = std::make_unique(); tokenizer_->load(tokenizer_path_); version_ = LlamaVersion::kLlama2; #endif @@ -170,15 +178,14 @@ Error Runner::generate( std::vector> input_tensors, output_tensors; std::vector> inputs; if (!is_loaded()) { - stats_.model_load_start_ms = util::time_in_ms(); + stats_.model_load_start_ms = 
time_in_ms(); ET_CHECK_OK_OR_RETURN_ERROR(load()); for (int i = 0; i < modules_.size(); ++i) { input_tensors.emplace_back(io_mem_->get_input_tensors(i)); output_tensors.emplace_back(io_mem_->get_output_tensors(i)); for (size_t j = 0; j < output_tensors[i].size(); ++j) { ET_CHECK_MSG( - modules_[i]->set_output_data_ptr(output_tensors[i][j], j) == - Error::Ok, + modules_[i]->set_output(output_tensors[i][j], j) == Error::Ok, "failed to set output tensor for module %d's %zu'th output", i, j); @@ -186,10 +193,10 @@ Error Runner::generate( inputs.emplace_back( std::vector(begin(input_tensors[i]), end(input_tensors[i]))); } - stats_.model_load_end_ms = util::time_in_ms(); + stats_.model_load_end_ms = time_in_ms(); } - stats_.inference_start_ms = util::time_in_ms(); + stats_.inference_start_ms = time_in_ms(); seq_len = (seq_len > 0 && seq_len <= max_seq_len_) ? seq_len : max_seq_len_; std::string post_process_prompt; @@ -276,16 +283,15 @@ Error Runner::generate( Tensor& logits_tensor = output_tensors.back().back(); if (pos == num_prompt_tokens) { - stats_.first_token_ms = util::time_in_ms(); + stats_.first_token_ms = time_in_ms(); } else if (pos == num_prompt_tokens - 1) { - stats_.prompt_eval_end_ms = util::time_in_ms(); + stats_.prompt_eval_end_ms = time_in_ms(); } - long sample_start_time_ms = util::time_in_ms(); + long sample_start_time_ms = time_in_ms(); prev_token = cur_token; cur_token = logitsToToken(logits_tensor); - stats_.aggregate_sampling_time_ms += - util::time_in_ms() - sample_start_time_ms; + stats_.aggregate_sampling_time_ms += time_in_ms() - sample_start_time_ms; if (pos < num_prompt_tokens - 1) { cur_token = prompt_tokens[pos + 1]; @@ -304,7 +310,7 @@ Error Runner::generate( break; } } - stats_.inference_end_ms = util::time_in_ms(); + stats_.inference_end_ms = time_in_ms(); if (pos == seq_len) { ET_LOG(Info, "\nSequence length (%i tokens) reached!", seq_len); @@ -406,5 +412,4 @@ std::vector> Runner::get_methods_meta() { } return methods_meta; } -} // namespace executor -} // namespace torch +} // namespace example diff --git a/examples/qualcomm/qaihub_scripts/llama/runner/runner.h b/examples/qualcomm/qaihub_scripts/llama/runner/runner.h index bd24ea6beb4..0d15114bc64 100644 --- a/examples/qualcomm/qaihub_scripts/llama/runner/runner.h +++ b/examples/qualcomm/qaihub_scripts/llama/runner/runner.h @@ -22,8 +22,7 @@ #include #include -namespace torch { -namespace executor { +namespace example { class Runner { public: @@ -64,15 +63,16 @@ class Runner { }; bool is_loaded() const; - Error load(); - Error generate( + executorch::runtime::Error load(); + executorch::runtime::Error generate( const std::string& prompt, const std::string& system_prompt, int32_t seq_len, std::function token_callback = {}, std::function stats_callback = {}); void stop(); - std::vector> get_methods_meta(); + std::vector> + get_methods_meta(); private: enum EvalMode { @@ -86,8 +86,9 @@ class Runner { kLlama3, }; - int32_t logitsToToken(const exec_aten::Tensor& logits_tensor); - void run_model_step(std::vector>& inputs); + int32_t logitsToToken(const executorch::aten::Tensor& logits_tensor); + void run_model_step( + std::vector>& inputs); // metadata int32_t bos_id_; std::unordered_set eos_id_; @@ -96,11 +97,11 @@ class Runner { const int32_t vocab_size_; const int32_t max_seq_len_; int32_t eval_mode_; - std::vector> modules_; + std::vector> modules_; std::string tokenizer_path_; float temperature_; - std::unique_ptr tokenizer_; - std::unique_ptr sampler_; + std::unique_ptr tokenizer_; + std::unique_ptr sampler_; 
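// --- Illustrative sketch, not part of the patch -----------------------------
// Convention used throughout this migration: .cpp files pull the new names in
// with using-declarations, while headers such as this one spell them out in
// full and swap the old `namespace torch { namespace executor {` wrapper for
// `namespace example`. The template arguments below are assumptions added for
// illustration only (they are not taken from the patch text):
//
//   namespace example {
//   class Runner {
//    public:
//     executorch::runtime::Error load();
//    private:
//     std::unique_ptr<executorch::extension::llm::Sampler> sampler_;
//     std::vector<std::shared_ptr<executorch::extension::Module>> modules_;
//   };
//   }  // namespace example
// -----------------------------------------------------------------------------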
Stats stats_; std::unique_ptr io_mem_; const float logits_scale_; @@ -108,5 +109,4 @@ class Runner { LlamaVersion version_; }; -} // namespace executor -} // namespace torch +} // namespace example diff --git a/examples/qualcomm/qaihub_scripts/stable_diffusion/CMakeLists.txt b/examples/qualcomm/qaihub_scripts/stable_diffusion/CMakeLists.txt index c59cea32b9f..ff22f08cd09 100644 --- a/examples/qualcomm/qaihub_scripts/stable_diffusion/CMakeLists.txt +++ b/examples/qualcomm/qaihub_scripts/stable_diffusion/CMakeLists.txt @@ -21,7 +21,7 @@ target_include_directories( target_link_libraries( qaihub_stable_diffusion_runner qnn_executorch_backend - executorch_no_prim_ops + executorch_core extension_data_loader extension_module extension_tensor @@ -31,3 +31,6 @@ target_link_libraries( target_compile_options( qaihub_stable_diffusion_runner PUBLIC ${_common_compile_options} ) +set_target_properties( + qaihub_stable_diffusion_runner PROPERTIES LINK_FLAGS "-Wl,-rpath='$ORIGIN'" +) diff --git a/examples/qualcomm/qaihub_scripts/stable_diffusion/qaihub_stable_diffusion.py b/examples/qualcomm/qaihub_scripts/stable_diffusion/qaihub_stable_diffusion.py index 64393fddfee..defce876ba0 100644 --- a/examples/qualcomm/qaihub_scripts/stable_diffusion/qaihub_stable_diffusion.py +++ b/examples/qualcomm/qaihub_scripts/stable_diffusion/qaihub_stable_diffusion.py @@ -311,8 +311,6 @@ def inference(args, compiler_specs, pte_files): qnn_executor_runner_args = " ".join( [ f"cd {adb.workspace} &&", - "export ADSP_LIBRARY_PATH=. &&", - "export LD_LIBRARY_PATH=. &&", f"./qaihub_stable_diffusion_runner {' '.join(qnn_executor_runner_args)}", ] ) diff --git a/examples/qualcomm/qaihub_scripts/stable_diffusion/qaihub_stable_diffusion_runner.cpp b/examples/qualcomm/qaihub_scripts/stable_diffusion/qaihub_stable_diffusion_runner.cpp index 687a260c4a5..9c15ceadf8a 100644 --- a/examples/qualcomm/qaihub_scripts/stable_diffusion/qaihub_stable_diffusion_runner.cpp +++ b/examples/qualcomm/qaihub_scripts/stable_diffusion/qaihub_stable_diffusion_runner.cpp @@ -66,9 +66,10 @@ void usage_message() { gflags::SetUsageMessage(usage_message); } +using executorch::runtime::Error; + int main(int argc, char** argv) { - using namespace torch::executor; - runtime_init(); + executorch::runtime::runtime_init(); usage_message(); gflags::ParseCommandLineFlags(&argc, &argv, true); bool is_default = @@ -101,7 +102,7 @@ int main(int argc, char** argv) { FLAGS_text_encoder_path, FLAGS_unet_path, FLAGS_vae_path}; // Create stable_diffusion_runner - Runner runner( + example::Runner runner( models_path, FLAGS_num_time_steps, FLAGS_guidance_scale, diff --git a/examples/qualcomm/qaihub_scripts/stable_diffusion/runner/runner.cpp b/examples/qualcomm/qaihub_scripts/stable_diffusion/runner/runner.cpp index b6c211d8acb..cc54a801737 100644 --- a/examples/qualcomm/qaihub_scripts/stable_diffusion/runner/runner.cpp +++ b/examples/qualcomm/qaihub_scripts/stable_diffusion/runner/runner.cpp @@ -22,10 +22,15 @@ #include #include -using namespace ::executorch::extension; +using executorch::extension::from_blob; +using executorch::extension::Module; +using executorch::extension::TensorPtr; +using executorch::extension::llm::time_in_ms; +using executorch::runtime::Error; +using executorch::runtime::MethodMeta; +using executorch::runtime::Result; -namespace torch { -namespace executor { +namespace example { Runner::Runner( const std::vector& models_path, @@ -88,11 +93,11 @@ Error Runner::load() { if (is_loaded()) { return Error::Ok; } - stats_.model_load_start_ms = 
util::time_in_ms(); + stats_.model_load_start_ms = time_in_ms(); for (auto& module : modules_) { ET_CHECK_OK_OR_RETURN_ERROR(module->load_method("forward")); } - stats_.model_load_end_ms = util::time_in_ms(); + stats_.model_load_end_ms = time_in_ms(); return Error::Ok; } @@ -119,7 +124,7 @@ Error Runner::parse_input_list(std::string& path) { Error Runner::init_tokenizer(const std::string& vocab_json_path) { ET_LOG(Info, "Loading Tokenizer from json"); - stats_.tokenizer_load_start_ms = util::time_in_ms(); + stats_.tokenizer_load_start_ms = time_in_ms(); std::ifstream fin(vocab_json_path); auto update_map = [this](std::string& target, std::regex& re) { std::smatch sm; @@ -159,7 +164,7 @@ Error Runner::init_tokenizer(const std::string& vocab_json_path) { std::string target = text.substr(pos); update_map(target, re_pattern); } - stats_.tokenizer_load_end_ms = util::time_in_ms(); + stats_.tokenizer_load_end_ms = time_in_ms(); return Error::Ok; } @@ -338,15 +343,15 @@ void Runner::step( Error Runner::generate(std::string prompt) { ET_LOG(Info, "Start generating"); - stats_.generate_start_ms = util::time_in_ms(); + stats_.generate_start_ms = time_in_ms(); // Start tokenize - stats_.tokenizer_parsing_start_ms = util::time_in_ms(); + stats_.tokenizer_parsing_start_ms = time_in_ms(); std::vector cond_tokens = tokenize(prompt); cond_tokens.resize(max_tokens_); std::vector uncond_tokens = tokenize(""); uncond_tokens.resize(max_tokens_); - stats_.tokenizer_parsing_end_ms = util::time_in_ms(); + stats_.tokenizer_parsing_end_ms = time_in_ms(); std::vector> method_metas = get_methods_meta(); @@ -373,14 +378,14 @@ Error Runner::generate(std::string prompt) { uncond_emb_vec.data(), {1, 77, 1024}, encoder_method_meta.output_tensor_meta(0)->scalar_type()); - modules_[0]->set_output_data_ptr(cond_emb_tensor, 0); - long encoder_start = util::time_in_ms(); + modules_[0]->set_output(cond_emb_tensor); + long encoder_start = time_in_ms(); auto cond_res = modules_[0]->forward(cond_tokens_tensor); - stats_.text_encoder_execution_time += (util::time_in_ms() - encoder_start); - modules_[0]->set_output_data_ptr(uncond_emb_tensor, 0); - encoder_start = util::time_in_ms(); + stats_.text_encoder_execution_time += (time_in_ms() - encoder_start); + modules_[0]->set_output(uncond_emb_tensor); + encoder_start = time_in_ms(); auto uncond_res = modules_[0]->forward(uncond_tokens_tensor); - stats_.text_encoder_execution_time += (util::time_in_ms() - encoder_start); + stats_.text_encoder_execution_time += (time_in_ms() - encoder_start); // Initialize unet parameters MethodMeta unet_method_meta = method_metas[1].get(); @@ -451,7 +456,7 @@ Error Runner::generate(std::string prompt) { // Execute unet for (int step_index = 0; step_index < num_time_steps_; step_index++) { - long start_post_process = util::time_in_ms(); + long start_post_process = time_in_ms(); scale_model_input(latent, fp_latent_model_input, sigmas[step_index]); quant_tensor( @@ -461,24 +466,24 @@ Error Runner::generate(std::string prompt) { unet_input_latent_offset_); stats_.unet_aggregate_post_processing_time += - (util::time_in_ms() - start_post_process); - modules_[1]->set_output_data_ptr(noise_pred_text_tensor, 0); - long start_unet_execution = util::time_in_ms(); + (time_in_ms() - start_post_process); + modules_[1]->set_output(noise_pred_text_tensor); + long start_unet_execution = time_in_ms(); auto cond_res = modules_[1]->forward( {latent_tensor, time_emb_tensors[step_index], cond_emb_tensor}); stats_.unet_aggregate_execution_time += - (util::time_in_ms() - 
start_unet_execution); - modules_[1]->set_output_data_ptr(noise_pred_uncond_tensor, 0); - start_unet_execution = util::time_in_ms(); + (time_in_ms() - start_unet_execution); + modules_[1]->set_output(noise_pred_uncond_tensor); + start_unet_execution = time_in_ms(); auto uncond_res = modules_[1]->forward( {latent_tensor, time_emb_tensors[step_index], uncond_emb_tensor}); // results in noise_pred_uncond_vec stats_.unet_aggregate_execution_time += - (util::time_in_ms() - start_unet_execution); + (time_in_ms() - start_unet_execution); // start unet post processing - start_post_process = util::time_in_ms(); + start_post_process = time_in_ms(); dequant_tensor( noise_pred_text, @@ -497,7 +502,7 @@ Error Runner::generate(std::string prompt) { } step(fp_noise_pred_text, sigmas, latent, prev_sample, step_index); stats_.unet_aggregate_post_processing_time += - (util::time_in_ms() - start_post_process); + (time_in_ms() - start_post_process); } // Start VAE @@ -519,11 +524,11 @@ Error Runner::generate(std::string prompt) { quant_tensor(latent, vae_input, vae_input_scale_, vae_input_offset_); - modules_[2]->set_output_data_ptr(output_tensor, 0); - long start_vae_execution = util::time_in_ms(); + modules_[2]->set_output(output_tensor); + long start_vae_execution = time_in_ms(); auto vae_res = modules_[2]->forward(vae_input_tensor); - stats_.vae_execution_time = (util::time_in_ms() - start_vae_execution); - stats_.generate_end_ms = util::time_in_ms(); + stats_.vae_execution_time = (time_in_ms() - start_vae_execution); + stats_.generate_end_ms = time_in_ms(); // Dequant uint16 output to fp32 output dequant_tensor(q_out, out, vae_output_scale_, vae_output_offset_); @@ -605,5 +610,4 @@ Error Runner::print_performance() { return Error::Ok; } -} // namespace executor -} // namespace torch +} // namespace example diff --git a/examples/qualcomm/qaihub_scripts/stable_diffusion/runner/runner.h b/examples/qualcomm/qaihub_scripts/stable_diffusion/runner/runner.h index e081ab80ccc..f91efd5b832 100644 --- a/examples/qualcomm/qaihub_scripts/stable_diffusion/runner/runner.h +++ b/examples/qualcomm/qaihub_scripts/stable_diffusion/runner/runner.h @@ -17,8 +17,7 @@ #include -namespace torch { -namespace executor { +namespace example { class Runner { public: @@ -77,9 +76,9 @@ class Runner { }; bool is_loaded() const; - Error load(); - Error init_tokenizer(const std::string& vocab_json_path); - Error print_performance(); + executorch::runtime::Error load(); + executorch::runtime::Error init_tokenizer(const std::string& vocab_json_path); + executorch::runtime::Error print_performance(); std::vector tokenize(std::string prompt); std::vector gen_latent_from_file(); std::vector gen_random_latent(float sigma); @@ -89,15 +88,16 @@ class Runner { std::vector& sample, std::vector& prev_sample, int step_index); - std::vector> get_methods_meta(); + std::vector> + get_methods_meta(); std::vector get_time_steps(); std::vector get_sigmas(const std::vector& time_steps); void scale_model_input( const std::vector& vec, std::vector& latent_model_input, float sigma); - Error parse_input_list(std::string& path); - Error generate(std::string prompt); + executorch::runtime::Error parse_input_list(std::string& path); + executorch::runtime::Error generate(std::string prompt); void quant_tensor( const std::vector& fp_vec, std::vector& quant_vec, @@ -111,7 +111,7 @@ class Runner { private: Stats stats_; - std::vector> modules_; + std::vector> modules_; std::vector> time_emb_list_; std::unordered_map vocab_to_token_map_; @@ -137,5 +137,4 @@ class 
Runner { const bool fix_latents_ = false; }; -} // namespace executor -} // namespace torch +} // namespace example diff --git a/examples/qualcomm/qaihub_scripts/utils/export.py b/examples/qualcomm/qaihub_scripts/utils/export.py index 9dfe6796491..b742f59f1d4 100644 --- a/examples/qualcomm/qaihub_scripts/utils/export.py +++ b/examples/qualcomm/qaihub_scripts/utils/export.py @@ -220,7 +220,6 @@ def compile(args): ) # setup memory planning memory_planning_pass = MemoryPlanningPass( - memory_planning_algo="greedy", alloc_graph_input=args.allocate_graph_io, alloc_graph_output=args.allocate_graph_io, ) diff --git a/examples/qualcomm/qaihub_scripts/utils/utils.py b/examples/qualcomm/qaihub_scripts/utils/utils.py index 67d519a688e..ad55d7fd10b 100644 --- a/examples/qualcomm/qaihub_scripts/utils/utils.py +++ b/examples/qualcomm/qaihub_scripts/utils/utils.py @@ -68,7 +68,6 @@ def gen_pte_from_ctx_bin( for pte_name in pte_names: print(f"{pte_name} generating...") memory_planning_pass = MemoryPlanningPass( - memory_planning_algo="greedy", alloc_graph_input=False, alloc_graph_output=False, ) diff --git a/examples/qualcomm/qnn_intermediate_output_inspector.py b/examples/qualcomm/qnn_intermediate_output_inspector.py new file mode 100644 index 00000000000..59e1a279d82 --- /dev/null +++ b/examples/qualcomm/qnn_intermediate_output_inspector.py @@ -0,0 +1,52 @@ +# Copyright (c) Qualcomm Innovation Center, Inc. +# All rights reserved +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import argparse + +from executorch.devtools import Inspector + + +def main(args): + # Create an Inspector instance with etdump and the debug buffer. + inspector = Inspector( + etdump_path=args.etdump_path, + etrecord=args.etrecord_path, + debug_buffer_path=args.debug_buffer_path, + ) + + # Accessing intermediate outputs from each event (an event here is essentially an instruction that executed in the runtime). + for event_block in inspector.event_blocks: + if event_block.name == "Execute": + for event in event_block.events: + # If user enables profiling and dump intermediate outputs the same time, we need to skip the profiling event + if event.perf_data is not None and event.is_delegated_op: + continue + print("Event Name: ", event.name) + print(event.debug_data) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "--etdump_path", + required=True, + help="Provide an ETDump file path. File extension should be .etdp", + ) + parser.add_argument( + "--etrecord_path", + required=False, + default=None, + help="Provide an optional ETRecord file path. File extension should be .bin", + ) + parser.add_argument( + "--debug_buffer_path", + required=False, + default=None, + help="Provide an optional debug buffer file path. 
File extension should be .bin", + ) + args = parser.parse_args() + + main(args) diff --git a/examples/qualcomm/scripts/deeplab_v3.py b/examples/qualcomm/scripts/deeplab_v3.py index 7f24d616182..4563a1062fc 100755 --- a/examples/qualcomm/scripts/deeplab_v3.py +++ b/examples/qualcomm/scripts/deeplab_v3.py @@ -8,7 +8,6 @@ import os import random import re -import sys from multiprocessing.connection import Client import numpy as np @@ -98,7 +97,7 @@ def main(args): ) if args.compile_only: - sys.exit(0) + return adb = SimpleADB( qnn_sdk=os.getenv("QNN_SDK_ROOT"), diff --git a/examples/qualcomm/scripts/edsr.py b/examples/qualcomm/scripts/edsr.py index 8852cf0e4c7..d4d826b5b13 100755 --- a/examples/qualcomm/scripts/edsr.py +++ b/examples/qualcomm/scripts/edsr.py @@ -7,7 +7,6 @@ import json import os import re -import sys from multiprocessing.connection import Client import numpy as np @@ -124,7 +123,7 @@ def main(args): ) if args.compile_only: - sys.exit(0) + return adb = SimpleADB( qnn_sdk=os.getenv("QNN_SDK_ROOT"), diff --git a/examples/qualcomm/scripts/export_example.py b/examples/qualcomm/scripts/export_example.py index 08f18d6ac6a..9e073b998d7 100644 --- a/examples/qualcomm/scripts/export_example.py +++ b/examples/qualcomm/scripts/export_example.py @@ -1,3 +1,4 @@ +# pyre-ignore-all-errors import argparse import copy @@ -24,7 +25,8 @@ from torch.ao.quantization.quantize_pt2e import convert_pt2e, prepare_pt2e -if __name__ == "__main__": + +def main() -> None: parser = argparse.ArgumentParser() parser.add_argument( "-m", @@ -104,3 +106,7 @@ generate_etrecord(etrecord_path, edge_copy, executorch_program) save_pte_program(executorch_program, args.model_name, args.output_folder) + + +if __name__ == "__main__": + main() # pragma: no cover diff --git a/examples/qualcomm/scripts/inception_v3.py b/examples/qualcomm/scripts/inception_v3.py index 9cc35463d41..67a2509016b 100755 --- a/examples/qualcomm/scripts/inception_v3.py +++ b/examples/qualcomm/scripts/inception_v3.py @@ -6,7 +6,6 @@ import json import os -import sys from multiprocessing.connection import Client import numpy as np @@ -16,6 +15,7 @@ from executorch.examples.models.inception_v3.model import InceptionV3Model from executorch.examples.qualcomm.utils import ( build_executorch_binary, + get_imagenet_dataset, make_output_dir, parse_skip_delegation_node, setup_common_args_and_variables, @@ -24,40 +24,6 @@ ) -def get_dataset(dataset_path, data_size): - from torchvision import datasets, transforms - - def get_data_loader(): - preprocess = transforms.Compose( - [ - transforms.Resize(256), - transforms.CenterCrop(224), - transforms.ToTensor(), - transforms.Normalize( - mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225] - ), - ] - ) - imagenet_data = datasets.ImageFolder(dataset_path, transform=preprocess) - return torch.utils.data.DataLoader( - imagenet_data, - shuffle=True, - ) - - # prepare input data - inputs, targets, input_list = [], [], "" - data_loader = get_data_loader() - for index, data in enumerate(data_loader): - if index >= data_size: - break - feature, target = data - inputs.append((feature,)) - targets.append(target) - input_list += f"input_{index}_0.raw\n" - - return inputs, targets, input_list - - def main(args): skip_node_id_set, skip_node_op_set = parse_skip_delegation_node(args) @@ -74,9 +40,11 @@ def main(args): if args.compile_only: inputs = [(torch.rand(1, 3, 224, 224),)] else: - inputs, targets, input_list = get_dataset( + inputs, targets, input_list = get_imagenet_dataset( dataset_path=f"{args.dataset}", 
data_size=data_num, + image_shape=(256, 256), + crop_size=224, ) pte_filename = "ic3_qnn" instance = InceptionV3Model() @@ -93,7 +61,7 @@ def main(args): ) if args.compile_only: - sys.exit(0) + return adb = SimpleADB( qnn_sdk=os.getenv("QNN_SDK_ROOT"), diff --git a/examples/qualcomm/scripts/inception_v4.py b/examples/qualcomm/scripts/inception_v4.py index 9a19de1a37a..2c69a00c1bd 100755 --- a/examples/qualcomm/scripts/inception_v4.py +++ b/examples/qualcomm/scripts/inception_v4.py @@ -6,7 +6,6 @@ import json import os -import sys from multiprocessing.connection import Client import numpy as np @@ -16,6 +15,7 @@ from executorch.examples.models.inception_v4 import InceptionV4Model from executorch.examples.qualcomm.utils import ( build_executorch_binary, + get_imagenet_dataset, make_output_dir, parse_skip_delegation_node, setup_common_args_and_variables, @@ -24,39 +24,6 @@ ) -def get_dataset(dataset_path, data_size): - from torchvision import datasets, transforms - - def get_data_loader(): - preprocess = transforms.Compose( - [ - transforms.Resize((299, 299)), - transforms.ToTensor(), - transforms.Normalize( - mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225] - ), - ] - ) - imagenet_data = datasets.ImageFolder(dataset_path, transform=preprocess) - return torch.utils.data.DataLoader( - imagenet_data, - shuffle=True, - ) - - # prepare input data - inputs, targets, input_list = [], [], "" - data_loader = get_data_loader() - for index, data in enumerate(data_loader): - if index >= data_size: - break - feature, target = data - inputs.append((feature,)) - targets.append(target) - input_list += f"input_{index}_0.raw\n" - - return inputs, targets, input_list - - def main(args): skip_node_id_set, skip_node_op_set = parse_skip_delegation_node(args) @@ -71,11 +38,12 @@ def main(args): data_num = 100 if args.compile_only: - inputs = [(torch.rand(1, 3, 224, 224),)] + inputs = [(torch.rand(1, 3, 299, 299),)] else: - inputs, targets, input_list = get_dataset( + inputs, targets, input_list = get_imagenet_dataset( dataset_path=f"{args.dataset}", data_size=data_num, + image_shape=(299, 299), ) pte_filename = "ic4_qnn" instance = InceptionV4Model() @@ -92,7 +60,7 @@ def main(args): ) if args.compile_only: - sys.exit(0) + return adb = SimpleADB( qnn_sdk=os.getenv("QNN_SDK_ROOT"), diff --git a/examples/qualcomm/scripts/mobilebert_fine_tune.py b/examples/qualcomm/scripts/mobilebert_fine_tune.py index 605bb27d330..573e23640b2 100755 --- a/examples/qualcomm/scripts/mobilebert_fine_tune.py +++ b/examples/qualcomm/scripts/mobilebert_fine_tune.py @@ -6,7 +6,6 @@ import json import os -import sys from multiprocessing.connection import Client import numpy as np @@ -302,7 +301,7 @@ def calibrator(gm): file.write(exec_prog.buffer) if args.compile_only: - sys.exit(0) + return adb = SimpleADB( qnn_sdk=os.getenv("QNN_SDK_ROOT"), diff --git a/examples/qualcomm/scripts/mobilenet_v2.py b/examples/qualcomm/scripts/mobilenet_v2.py index a915e26c6be..13a31956782 100755 --- a/examples/qualcomm/scripts/mobilenet_v2.py +++ b/examples/qualcomm/scripts/mobilenet_v2.py @@ -6,7 +6,6 @@ import json import os -import sys from multiprocessing.connection import Client import numpy as np @@ -16,6 +15,7 @@ from executorch.examples.models.mobilenet_v2 import MV2Model from executorch.examples.qualcomm.utils import ( build_executorch_binary, + get_imagenet_dataset, make_output_dir, parse_skip_delegation_node, setup_common_args_and_variables, @@ -24,40 +24,6 @@ ) -def get_dataset(dataset_path, data_size): - from torchvision import datasets, 
transforms - - def get_data_loader(): - preprocess = transforms.Compose( - [ - transforms.Resize(256), - transforms.CenterCrop(224), - transforms.ToTensor(), - transforms.Normalize( - mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225] - ), - ] - ) - imagenet_data = datasets.ImageFolder(dataset_path, transform=preprocess) - return torch.utils.data.DataLoader( - imagenet_data, - shuffle=True, - ) - - # prepare input data - inputs, targets, input_list = [], [], "" - data_loader = get_data_loader() - for index, data in enumerate(data_loader): - if index >= data_size: - break - feature, target = data - inputs.append((feature,)) - targets.append(target) - input_list += f"input_{index}_0.raw\n" - - return inputs, targets, input_list - - def main(args): skip_node_id_set, skip_node_op_set = parse_skip_delegation_node(args) @@ -74,9 +40,11 @@ def main(args): if args.compile_only: inputs = [(torch.rand(1, 3, 224, 224),)] else: - inputs, targets, input_list = get_dataset( + inputs, targets, input_list = get_imagenet_dataset( dataset_path=f"{args.dataset}", data_size=data_num, + image_shape=(256, 256), + crop_size=224, ) pte_filename = "mv2_qnn" instance = MV2Model() @@ -93,7 +61,7 @@ def main(args): ) if args.compile_only: - sys.exit(0) + return adb = SimpleADB( qnn_sdk=os.getenv("QNN_SDK_ROOT"), diff --git a/examples/qualcomm/scripts/mobilenet_v3.py b/examples/qualcomm/scripts/mobilenet_v3.py index 068e9cba3a7..b15ca81b0fa 100644 --- a/examples/qualcomm/scripts/mobilenet_v3.py +++ b/examples/qualcomm/scripts/mobilenet_v3.py @@ -6,7 +6,6 @@ import json import os -import sys from multiprocessing.connection import Client import numpy as np @@ -15,6 +14,7 @@ from executorch.examples.models.mobilenet_v3 import MV3Model from executorch.examples.qualcomm.utils import ( build_executorch_binary, + get_imagenet_dataset, make_output_dir, parse_skip_delegation_node, setup_common_args_and_variables, @@ -23,40 +23,6 @@ ) -def get_dataset(dataset_path, data_size): - from torchvision import datasets, transforms - - def get_data_loader(): - preprocess = transforms.Compose( - [ - transforms.Resize(256), - transforms.CenterCrop(224), - transforms.ToTensor(), - transforms.Normalize( - mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225] - ), - ] - ) - imagenet_data = datasets.ImageFolder(dataset_path, transform=preprocess) - return torch.utils.data.DataLoader( - imagenet_data, - shuffle=True, - ) - - # prepare input data - inputs, targets, input_list = [], [], "" - data_loader = get_data_loader() - for index, data in enumerate(data_loader): - if index >= data_size: - break - feature, target = data - inputs.append((feature,)) - targets.append(target) - input_list += f"input_{index}_0.raw\n" - - return inputs, targets, input_list - - def main(args): skip_node_id_set, skip_node_op_set = parse_skip_delegation_node(args) @@ -73,9 +39,11 @@ def main(args): if args.compile_only: inputs = [(torch.rand(1, 3, 224, 224),)] else: - inputs, targets, input_list = get_dataset( + inputs, targets, input_list = get_imagenet_dataset( dataset_path=f"{args.dataset}", data_size=data_num, + image_shape=(256, 256), + crop_size=224, ) pte_filename = "mv3_qnn" instance = MV3Model() @@ -91,7 +59,7 @@ def main(args): ) if args.compile_only: - sys.exit(0) + return adb = SimpleADB( qnn_sdk=os.getenv("QNN_SDK_ROOT"), diff --git a/examples/qualcomm/scripts/torchvision_vit.py b/examples/qualcomm/scripts/torchvision_vit.py index c9fc988d560..e48d80e6180 100755 --- a/examples/qualcomm/scripts/torchvision_vit.py +++ 
b/examples/qualcomm/scripts/torchvision_vit.py @@ -6,7 +6,6 @@ import json import os -import sys from multiprocessing.connection import Client import numpy as np @@ -16,6 +15,7 @@ from executorch.examples.models.torchvision_vit.model import TorchVisionViTModel from executorch.examples.qualcomm.utils import ( build_executorch_binary, + get_imagenet_dataset, make_output_dir, setup_common_args_and_variables, SimpleADB, @@ -23,40 +23,6 @@ ) -def get_dataset(dataset_path, data_size): - from torchvision import datasets, transforms - - def get_data_loader(): - preprocess = transforms.Compose( - [ - transforms.Resize(256), - transforms.CenterCrop(224), - transforms.ToTensor(), - transforms.Normalize( - mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225] - ), - ] - ) - imagenet_data = datasets.ImageFolder(dataset_path, transform=preprocess) - return torch.utils.data.DataLoader( - imagenet_data, - shuffle=True, - ) - - # prepare input data - inputs, targets, input_list = [], [], "" - data_loader = get_data_loader() - for index, data in enumerate(data_loader): - if index >= data_size: - break - feature, target = data - inputs.append(feature) - targets.append(target) - input_list += f"input_{index}_0.raw\n" - - return inputs, targets, input_list - - def main(args): # ensure the working directory exist. os.makedirs(args.artifact, exist_ok=True) @@ -65,9 +31,11 @@ def main(args): if args.compile_only: inputs = [(torch.rand(1, 3, 224, 224),)] else: - inputs, targets, input_list = get_dataset( + inputs, targets, input_list = get_imagenet_dataset( dataset_path=f"{args.dataset}", data_size=data_num, + image_shape=(256, 256), + crop_size=224, ) pte_filename = "vit_qnn" @@ -83,7 +51,7 @@ def main(args): ) if args.compile_only: - sys.exit(0) + return adb = SimpleADB( qnn_sdk=os.getenv("QNN_SDK_ROOT"), diff --git a/examples/qualcomm/utils.py b/examples/qualcomm/utils.py index 5d9a3aef262..d88f14e4b96 100755 --- a/examples/qualcomm/utils.py +++ b/examples/qualcomm/utils.py @@ -67,6 +67,7 @@ def __init__( host_id=None, error_only=False, shared_buffer=False, + dump_intermediate_outputs=False, runner="examples/qualcomm/executor_runner/qnn_executor_runner", ): self.qnn_sdk = qnn_sdk @@ -78,8 +79,11 @@ def __init__( self.working_dir = Path(self.pte_path[0]).parent.absolute() self.input_list_filename = "input_list.txt" self.etdump_path = f"{self.workspace}/etdump.etdp" + self.dump_intermediate_outputs = dump_intermediate_outputs + self.debug_output_path = f"{self.workspace}/debug_output.bin" self.output_folder = f"{self.workspace}/outputs" self.arch_table = { + "SSG2115P": "73", "SM8650": "75", "SM8550": "73", "SM8475": "69", @@ -153,13 +157,17 @@ def execute(self, custom_runner_cmd=None): f"--input_list_path {self.input_list_filename}", f"--etdump_path {self.etdump_path}", "--shared_buffer" if self.shared_buffer else "", + f"--debug_output_path {self.debug_output_path}", + ( + "--dump_intermediate_outputs" + if self.dump_intermediate_outputs + else "" + ), ] ) qnn_executor_runner_cmds = " ".join( [ f"cd {self.workspace} &&", - "export ADSP_LIBRARY_PATH=. &&", - "export LD_LIBRARY_PATH=. 
&&", f"./qnn_executor_runner {qnn_executor_runner_args}", ] ) @@ -178,6 +186,12 @@ def pull_etdump(self, output_path, callback=None): if callback: callback() + def pull_debug_output(self, etdump_path, debug_ouput_path, callback=None): + self._adb(["pull", self.etdump_path, etdump_path]) + self._adb(["pull", self.debug_output_path, debug_ouput_path]) + if callback: + callback() + def make_quantizer( quant_dtype: Optional[QuantDtype], @@ -219,23 +233,17 @@ def build_executorch_binary( soc_model, file_name, dataset: List[torch.Tensor] | Callable[[torch.fx.GraphModule], None], - custom_annotations=(), skip_node_id_set=None, skip_node_op_set=None, quant_dtype: Optional[QuantDtype] = None, - per_channel_linear=False, # TODO: remove this once QNN fully supports linear + custom_quantizer=None, shared_buffer=False, metadata=None, - act_observer=MovingAverageMinMaxObserver, + dump_intermediate_outputs=False, + custom_pass_config=None, ): if quant_dtype is not None: - quantizer = make_quantizer( - quant_dtype=quant_dtype, - custom_annotations=custom_annotations, - per_channel_conv=True, - per_channel_linear=per_channel_linear, - act_observer=act_observer, - ) + quantizer = custom_quantizer or make_quantizer(quant_dtype=quant_dtype) captured_model = torch.export.export(model, inputs).module() annotated_model = prepare_pt2e(captured_model, quantizer) print("Quantizing the model...") @@ -247,9 +255,9 @@ def build_executorch_binary( annotated_model(*data) quantized_model = convert_pt2e(annotated_model) - edge_prog = capture_program(quantized_model, inputs) + edge_prog = capture_program(quantized_model, inputs, custom_pass_config) else: - edge_prog = capture_program(model, inputs) + edge_prog = capture_program(model, inputs, custom_pass_config) backend_options = generate_htp_compiler_spec( use_fp16=False if quant_dtype else True @@ -259,6 +267,7 @@ def build_executorch_binary( soc_model=getattr(QcomChipset, soc_model), backend_options=backend_options, shared_buffer=shared_buffer, + dump_intermediate_outputs=dump_intermediate_outputs, ), skip_node_id_set, skip_node_op_set, @@ -270,7 +279,6 @@ def build_executorch_binary( # Therefore, won't want to pre-allocate # by memory manager in runtime. 
memory_planning_pass=MemoryPlanningPass( - memory_planning_algo="greedy", alloc_graph_input=not shared_buffer, alloc_graph_output=not shared_buffer, ), @@ -346,6 +354,40 @@ def histogram(golden, predict): return (pa, mpa, miou, cls_iou) +def get_imagenet_dataset(dataset_path, data_size, image_shape, crop_size=None): + from torchvision import datasets, transforms + + def get_data_loader(): + preprocess = transforms.Compose( + [ + transforms.Resize(image_shape), + transforms.CenterCrop(crop_size or image_shape[0]), + transforms.ToTensor(), + transforms.Normalize( + mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225] + ), + ] + ) + imagenet_data = datasets.ImageFolder(dataset_path, transform=preprocess) + return torch.utils.data.DataLoader( + imagenet_data, + shuffle=True, + ) + + # prepare input data + inputs, targets, input_list = [], [], "" + data_loader = get_data_loader() + for index, data in enumerate(data_loader): + if index >= data_size: + break + feature, target = data + inputs.append((feature,)) + targets.append(target) + input_list += f"input_{index}_0.raw\n" + + return inputs, targets, input_list + + def setup_common_args_and_variables(): parser = argparse.ArgumentParser() @@ -432,20 +474,19 @@ def setup_common_args_and_variables(): default=False, ) + parser.add_argument( + "--dump_intermediate_outputs", + help="If specified, enable dump intermediate outputs", + action="store_true", + default=False, + ) + # QNN_SDK_ROOT might also be an argument, but it is used in various places. # So maybe it's fine to just use the environment. if "QNN_SDK_ROOT" not in os.environ: raise RuntimeError("Environment variable QNN_SDK_ROOT must be set") print(f"QNN_SDK_ROOT={os.getenv('QNN_SDK_ROOT')}") - if "LD_LIBRARY_PATH" not in os.environ: - print( - "[Warning] LD_LIBRARY_PATH is not set. If errors like libQnnHtp.so " - "not found happen, please follow setup.md to set environment." 
- ) - else: - print(f"LD_LIBRARY_PATH={os.getenv('LD_LIBRARY_PATH')}") - return parser diff --git a/examples/selective_build/test_selective_build.sh b/examples/selective_build/test_selective_build.sh index fd2ae421e22..5af3de5f3e5 100644 --- a/examples/selective_build/test_selective_build.sh +++ b/examples/selective_build/test_selective_build.sh @@ -48,9 +48,9 @@ test_buck2_select_ops_in_list() { ${PYTHON_EXECUTABLE} -m examples.portable.scripts.export --model_name="add_mul" echo "Running selective build test" - # set max_kernel_num=21: 19 primops, add, mul + # set max_kernel_num=22: 19 primops, add, mul $BUCK run //examples/selective_build:selective_build_test \ - --config=executorch.max_kernel_num=21 \ + --config=executorch.max_kernel_num=22 \ --config=executorch.select_ops=list \ -- --model_path=./add_mul.pte @@ -117,11 +117,11 @@ test_cmake_select_ops_in_list() { local example_dir=examples/selective_build local build_dir=cmake-out/${example_dir} - # set MAX_KERNEL_NUM=21: 19 primops, add, mul + # set MAX_KERNEL_NUM=22: 19 primops, add, mul rm -rf ${build_dir} retry cmake -DBUCK2="$BUCK" \ -DCMAKE_BUILD_TYPE=Release \ - -DMAX_KERNEL_NUM=21 \ + -DMAX_KERNEL_NUM=22 \ -DEXECUTORCH_SELECT_OPS_LIST="aten::convolution.out,\ aten::_native_batch_norm_legit_no_training.out,aten::hardtanh.out,aten::add.out,\ aten::mean.out,aten::view_copy.out,aten::permute_copy.out,aten::addmm.out,\ diff --git a/examples/xnnpack/aot_compiler.py b/examples/xnnpack/aot_compiler.py index 0ae84c0197a..520aa82d7cf 100644 --- a/examples/xnnpack/aot_compiler.py +++ b/examples/xnnpack/aot_compiler.py @@ -81,7 +81,7 @@ model = model.eval() # pre-autograd export. eventually this will become torch.export - model = torch._export.capture_pre_autograd_graph(model, example_inputs) + model = torch.export.export_for_training(model, example_inputs).module() if args.quantize: logging.info("Quantizing Model...") diff --git a/examples/xnnpack/quantization/example.py b/examples/xnnpack/quantization/example.py index bd23f7f383e..e5453842281 100644 --- a/examples/xnnpack/quantization/example.py +++ b/examples/xnnpack/quantization/example.py @@ -60,7 +60,7 @@ def verify_xnnpack_quantizer_matching_fx_quant_model(model_name, model, example_ m = model # 1. pytorch 2.0 export quantization flow (recommended/default flow) - m = torch._export.capture_pre_autograd_graph(m, copy.deepcopy(example_inputs)) + m = torch.export.export_for_training(m, copy.deepcopy(example_inputs)).module() quantizer = XNNPACKQuantizer() quantization_config = get_symmetric_quantization_config(is_per_channel=True) quantizer.set_global(quantization_config) @@ -177,7 +177,7 @@ def main() -> None: model = model.eval() # pre-autograd export. eventually this will become torch.export - model = torch._export.capture_pre_autograd_graph(model, example_inputs) + model = torch.export.export_for_training(model, example_inputs).module() start = time.perf_counter() quantized_model = quantize(model, example_inputs) end = time.perf_counter() diff --git a/exir/backend/backend_api.py b/exir/backend/backend_api.py index d114d8b4705..966cae5f022 100644 --- a/exir/backend/backend_api.py +++ b/exir/backend/backend_api.py @@ -261,6 +261,15 @@ def _partition_and_lower_one_graph_module( call_delegate_args.append(inp_node) break + def generate_debug_handle(ep: ExportedProgram) -> int: + """ + Generate a debug handle for the given ExportedProgram. 
+ """ + debug_handle = 0 + for node in ep.graph_module.graph.nodes: + debug_handle = max(debug_handle, node.meta.get("debug_handle", 0)) + return debug_handle + 1 + # Replace the partitioned submodule with a lowered submodule # Add call_method node with function "forward" with tagged_graph_module.graph.inserting_before(call_module_node): @@ -273,8 +282,8 @@ def _partition_and_lower_one_graph_module( (lowered_node,) + tuple(call_delegate_args), call_module_node.kwargs, ) - call_delegate_node.meta["debug_handle"] = len( - tagged_graph_module.graph.nodes + call_delegate_node.meta["debug_handle"] = generate_debug_handle( + owning_program ) call_delegate_node.meta["val"] = submodule_output_node.meta["val"] call_module_node.replace_all_uses_with(call_delegate_node) diff --git a/exir/backend/test/demos/rpc/CMakeLists.txt b/exir/backend/test/demos/rpc/CMakeLists.txt index cd1b6e73ff2..d3722e830d4 100644 --- a/exir/backend/test/demos/rpc/CMakeLists.txt +++ b/exir/backend/test/demos/rpc/CMakeLists.txt @@ -29,7 +29,7 @@ set(_common_compile_options -Wno-deprecated-declarations -fPIC) add_library( executor_backend STATIC ExecutorBackendRegister.cpp ExecutorBackend.cpp ) -target_link_libraries(executor_backend PRIVATE executorch_no_prim_ops) +target_link_libraries(executor_backend PRIVATE executorch_core) target_include_directories( executor_backend PUBLIC ${_common_include_directories} diff --git a/exir/backend/test/demos/rpc/ExecutorBackend.cpp b/exir/backend/test/demos/rpc/ExecutorBackend.cpp index aeef621a271..d398b87123b 100644 --- a/exir/backend/test/demos/rpc/ExecutorBackend.cpp +++ b/exir/backend/test/demos/rpc/ExecutorBackend.cpp @@ -21,8 +21,30 @@ #include #include -namespace torch { -namespace executor { +using ::executorch::aten::Tensor; +using ::executorch::extension::BufferDataLoader; +using ::executorch::runtime::ArrayRef; +using ::executorch::runtime::Backend; +using ::executorch::runtime::BackendExecutionContext; +using ::executorch::runtime::BackendInitContext; +using ::executorch::runtime::CompileSpec; +using ::executorch::runtime::DelegateHandle; +using ::executorch::runtime::Error; +using ::executorch::runtime::EValue; +using ::executorch::runtime::FreeableBuffer; +using ::executorch::runtime::HierarchicalAllocator; +using ::executorch::runtime::MemoryAllocator; +using ::executorch::runtime::MemoryManager; +using ::executorch::runtime::Method; +using ::executorch::runtime::MethodMeta; +using ::executorch::runtime::Program; +using ::executorch::runtime::Result; +using ::executorch::runtime::Span; +using ::executorch::runtime::Tag; +using ::executorch::runtime::internal::copy_tensor_data; + +namespace example { + /** * ExecutorBackend is a backend to execute an executorch program via delegate. * In preprocess, the preprocesed bytes (delegate blob) is an executorch @@ -51,8 +73,8 @@ class ExecutorBackend final : public ::executorch::runtime::BackendInterface { // will return the data directly without copying it. MemoryAllocator* runtime_allocator = context.get_runtime_allocator(); auto loader = ET_ALLOCATE_INSTANCE_OR_RETURN_ERROR( - runtime_allocator, util::BufferDataLoader); - new (loader) util::BufferDataLoader(processed->data(), processed->size()); + runtime_allocator, BufferDataLoader); + new (loader) BufferDataLoader(processed->data(), processed->size()); // Can't free `processed` because the program will point into that memory. // Try loading the program. 
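The `generate_debug_handle` helper added in `exir/backend/backend_api.py` above derives a delegate node's `debug_handle` from the highest handle already present in the owning program, rather than from `len(graph.nodes)`. A minimal, self-contained sketch of that idea, with plain dictionaries standing in for `torch.fx` node metadata (the helper name and data below are illustrative, not ExecuTorch APIs):

```python
# Toy restatement of the max-plus-one debug-handle scheme from the diff above.
# The real code walks ep.graph_module.graph.nodes and reads node.meta["debug_handle"];
# here a list of plain dicts stands in for that metadata.
def next_debug_handle(node_metas):
    """Return a handle strictly greater than every handle already assigned."""
    highest = 0
    for meta in node_metas:
        highest = max(highest, meta.get("debug_handle", 0))
    return highest + 1


nodes = [{"debug_handle": 3}, {}, {"debug_handle": 42}]
assert next_debug_handle(nodes) == 43  # never collides with an existing handle
# The previous len(graph.nodes)-based value (3 here) could clash with handle 3.
```

Taking max-plus-one keeps the new handle distinct from every handle that earlier passes may have assigned, which a raw node count does not guarantee.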
@@ -150,7 +172,7 @@ class ExecutorBackend final : public ::executorch::runtime::BackendInterface { if (output.tag == Tag::Tensor) { Tensor t_src = output.toTensor(); Tensor t_dst = args[num_inputs + i]->toTensor(); - status = internal::copy_tensor_data(t_dst, t_src); + status = copy_tensor_data(t_dst, t_src); } } @@ -165,12 +187,11 @@ class ExecutorBackend final : public ::executorch::runtime::BackendInterface { } }; -Error registerExecutorBackend() { +Error register_executor_backend() { static auto cls = ExecutorBackend(); static Backend backend{"ExecutorBackend", &cls}; static auto success_with_compiler = register_backend(backend); return success_with_compiler; } -} // namespace executor -} // namespace torch +} // namespace example diff --git a/exir/backend/test/demos/rpc/ExecutorBackend.h b/exir/backend/test/demos/rpc/ExecutorBackend.h index a7b6bf94ebc..747c48265db 100644 --- a/exir/backend/test/demos/rpc/ExecutorBackend.h +++ b/exir/backend/test/demos/rpc/ExecutorBackend.h @@ -10,10 +10,8 @@ #include -namespace torch { -namespace executor { +namespace example { -Error registerExecutorBackend(); +::executorch::runtime::Error register_executor_backend(); -} // namespace executor -} // namespace torch +} // namespace example diff --git a/exir/backend/test/demos/rpc/ExecutorBackendRegister.cpp b/exir/backend/test/demos/rpc/ExecutorBackendRegister.cpp index fdb9fc2d423..b697b24c492 100644 --- a/exir/backend/test/demos/rpc/ExecutorBackendRegister.cpp +++ b/exir/backend/test/demos/rpc/ExecutorBackendRegister.cpp @@ -10,10 +10,9 @@ #include #include -namespace torch { -namespace executor { +namespace example { namespace { -static Error register_success = registerExecutorBackend(); +static ::executorch::runtime::Error register_success = + register_executor_backend(); } // namespace -} // namespace executor -} // namespace torch +} // namespace example diff --git a/exir/capture/_config.py b/exir/capture/_config.py index 11a0d6d069d..24865e7a841 100644 --- a/exir/capture/_config.py +++ b/exir/capture/_config.py @@ -56,9 +56,7 @@ class ExecutorchBackendConfig: # A single memory planning pass can be defined for all the programs in the # EdgeProgramManager or can be defined per program. 
- memory_planning_pass: Union[PassType, Dict[str, PassType]] = MemoryPlanningPass( - "greedy" - ) + memory_planning_pass: Union[PassType, Dict[str, PassType]] = MemoryPlanningPass() to_out_var_pass: PassType = ToOutVarPass(ignore_to_out_var_failure=False) dynamic_memory_planning_mode: DynamicMemoryPlanningMode = ( DynamicMemoryPlanningMode.UPPER_BOUND diff --git a/exir/emit/test/test_emit.py b/exir/emit/test/test_emit.py index 123896ecdba..2feeefc4ef9 100644 --- a/exir/emit/test/test_emit.py +++ b/exir/emit/test/test_emit.py @@ -1145,7 +1145,6 @@ def forward(self, k: torch.Tensor) -> torch.Tensor: config = exir.ExecutorchBackendConfig( sym_shape_eval_pass=ConstraintBasedSymShapeEvalPass(), memory_planning_pass=MemoryPlanningPass( - memory_planning_algo="greedy", # allow_lifetime_and_storage_overlap: bool = False, alloc_graph_input=True, alloc_graph_output=False, @@ -1606,9 +1605,7 @@ def forward(self, x): ) model = model.to_executorch( config=ExecutorchBackendConfig( - memory_planning_pass=MemoryPlanningPass( - "greedy", alloc_graph_input=False - ), + memory_planning_pass=MemoryPlanningPass(alloc_graph_input=False), sym_shape_eval_pass=ConstraintBasedSymShapeEvalPass(), ) ) @@ -1652,3 +1649,23 @@ def forward(self, x): self.assertEqual( pte_data.execution_plan, model.executorch_program.execution_plan ) + + def test_mutate_input_tensor(self) -> None: + class MutateInputTensorModule(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x): + x.add_(1) + + model = to_edge( + export(MutateInputTensorModule(), (torch.zeros(1),)) + ).to_executorch( + config=ExecutorchBackendConfig( + memory_planning_pass=MemoryPlanningPass(alloc_graph_input=False) + ) + ) + executorch_model = _load_for_executorch_from_buffer(model.buffer) + input = torch.zeros(1) + executorch_model(input) + self.assertEqual(input, torch.ones(1)) diff --git a/exir/lowered_backend_module.py b/exir/lowered_backend_module.py index e50d3038dac..bc42bba9a26 100644 --- a/exir/lowered_backend_module.py +++ b/exir/lowered_backend_module.py @@ -326,7 +326,7 @@ def program( verifiers=[lowered_exported_program.verifier], ) if memory_planning is None: - memory_planning = MemoryPlanningPass("greedy") + memory_planning = MemoryPlanningPass() exported_program = _transform(exported_program, SpecPropPass(), memory_planning) emitted_program = emit_program( exported_program, emit_stacktrace=emit_stacktrace diff --git a/exir/memory_planning.py b/exir/memory_planning.py index 859bd069013..3c28639ba13 100644 --- a/exir/memory_planning.py +++ b/exir/memory_planning.py @@ -18,12 +18,7 @@ from executorch.exir import memory from executorch.exir.control_flow import while_loop as exir_while from executorch.exir.delegate import executorch_call_delegate -from executorch.exir.error import ( - ExportError, - ExportErrorType, - internal_assert, - InternalError, -) +from executorch.exir.error import internal_assert, InternalError from executorch.exir.operator.convert import is_inplace_variant, is_out_variant from executorch.exir.schema import TensorShapeDynamism from executorch.exir.tensor import TensorSpec @@ -255,17 +250,6 @@ def verify_graph_input_output(self) -> None: ), f"Misallocate graph output {graph_output_allocated} v.s. 
{self.alloc_graph_output}" -def register_algo(fn: Callable[..., List[int]]) -> Callable[..., List[int]]: - algo_name = fn.__name__ - if algo_name in REGISTERED_ALGOS: - raise ExportError( - ExportErrorType.VIOLATION_OF_SPEC, - f"Re-registering memory planning algorithm {algo_name}", - ) - REGISTERED_ALGOS[algo_name] = fn - return fn - - def _is_out_var_node(node: torch.fx.Node) -> bool: return ( node.op == "call_function" @@ -561,7 +545,6 @@ def get_node_tensor_specs( ] -@register_algo def greedy( graph_module: torch.fx.GraphModule, alignment: int, @@ -615,7 +598,6 @@ def greedy( return total_sizes -@register_algo def naive( graph_module: torch.fx.GraphModule, alignment: int, @@ -656,15 +638,6 @@ def _allocate_buf(bufsizes: List[int], mem_id: int, allocated: int) -> int: return bufsizes -def get_algo(algo_name: str) -> Callable[..., List[int]]: - if algo_name not in REGISTERED_ALGOS: - raise ExportError( - ExportErrorType.NOT_SUPPORTED, - f"Memory planning algorithm '{algo_name}' not found", - ) - return REGISTERED_ALGOS[algo_name] - - def get_cond_nodes(graph_module: torch.fx.GraphModule) -> Iterable[Node]: for nd in graph_module.graph.nodes: if nd.target is torch.ops.higher_order.cond: diff --git a/exir/passes/insert_write_back_for_buffers_pass.py b/exir/passes/insert_write_back_for_buffers_pass.py index 7aef3571910..1ddbf98e7ee 100644 --- a/exir/passes/insert_write_back_for_buffers_pass.py +++ b/exir/passes/insert_write_back_for_buffers_pass.py @@ -15,6 +15,7 @@ OutputKind, OutputSpec, ) +from torch.export.graph_signature import TensorArgument from torch.utils import _pytree as pytree @@ -73,20 +74,21 @@ def insert_write_back_for_buffers_pass( ep: ExportedProgram, ) -> Tuple[torch.fx.GraphModule, ExportGraphSignature]: gm: torch.fx.GraphModule = ep.graph_module - lifted_inputs: List[Optional[str]] = [ - ( - in_spec.target - if in_spec.kind - in ( - InputKind.BUFFER, - InputKind.CONSTANT_TENSOR, - InputKind.PARAMETER, - InputKind.CUSTOM_OBJ, - ) - else None - ) - for in_spec in ep.graph_signature.input_specs - ] + lifted_inputs: List[Optional[str]] = [] + for in_spec in ep.graph_signature.input_specs: + if in_spec.kind in ( + InputKind.BUFFER, + InputKind.CONSTANT_TENSOR, + InputKind.PARAMETER, + InputKind.CUSTOM_OBJ, + ): + lifted_inputs.append(in_spec.target) + elif in_spec.kind is InputKind.USER_INPUT and isinstance( + in_spec.arg, TensorArgument + ): + lifted_inputs.append(in_spec.arg.name) + else: + lifted_inputs.append(None) input_name_to_node: Dict[str, torch.fx.Node] = {} @@ -101,7 +103,8 @@ def insert_write_back_for_buffers_pass( mutated_outputs: List[Optional[str]] = [ ( out_spec.target - if out_spec.kind in (OutputKind.BUFFER_MUTATION,) + if out_spec.kind + in (OutputKind.BUFFER_MUTATION, OutputKind.USER_INPUT_MUTATION) and out_spec.arg.name not in { val.name for val in input_name_to_node.values() @@ -121,7 +124,10 @@ def insert_write_back_for_buffers_pass( new_output_specs: List[OutputSpec] = [] i = 0 for output_spec in ep.graph_signature.output_specs: - if output_spec.kind == OutputKind.BUFFER_MUTATION: + if output_spec.kind in ( + OutputKind.BUFFER_MUTATION, + OutputKind.USER_INPUT_MUTATION, + ): output_spec.arg.name = buffer_output_nodes[i].name i += 1 new_output_specs.append(output_spec) diff --git a/exir/passes/memory_planning_pass.py b/exir/passes/memory_planning_pass.py index 9295cabcab6..112b8f5fc52 100644 --- a/exir/passes/memory_planning_pass.py +++ b/exir/passes/memory_planning_pass.py @@ -6,7 +6,7 @@ import logging import warnings -from typing import Optional 
+from typing import Callable, List, Optional import torch from executorch.exir.error import internal_assert @@ -14,8 +14,8 @@ from executorch.exir.memory_planning import ( _is_out_var_node, apply_algo, - get_algo, get_node_tensor_specs, + greedy, Verifier, ) from executorch.exir.operator.convert import get_out_args_from_opoverload @@ -27,7 +27,7 @@ class MemoryPlanningPass(PassBase): def __init__( self, - memory_planning_algo: str = "greedy", + memory_planning_algo: Callable[..., List[int]] = greedy, allow_lifetime_and_storage_overlap: bool = False, alloc_graph_input: bool = True, alloc_graph_output: bool = True, @@ -96,14 +96,13 @@ def run( memory_planning_algo """ self._set_alloc_node_spec(graph_module) - algo = get_algo(self.memory_planning_algo) # TODO(shunting) if people have concern of adding a field to GraphModule # directly, we should define a GraphModule subclass that we can add our # customized fields. Using the graph_module object to convey information across # passes/stages is quite natural and avoid yet another 'context' data structure # to do the job. _ = apply_algo( - algo, + self.memory_planning_algo, graph_module, self.alignment, graph_signature, @@ -125,7 +124,7 @@ def run( self.allow_lifetime_and_storage_overlap ) logging.debug( - f"The {self.memory_planning_algo} algorithm reuses storage for {num_reuse_pairs} pair of tensors" + f"The {getattr(self.memory_planning_algo, '__name__', repr(self.memory_planning_algo))} algorithm reuses storage for {num_reuse_pairs} pair of tensors" ) verifier.verify_graph_input_output() return PassResult(graph_module, True) diff --git a/exir/program/TARGETS b/exir/program/TARGETS index 730c9e93aed..fc73abf1ff7 100644 --- a/exir/program/TARGETS +++ b/exir/program/TARGETS @@ -22,6 +22,7 @@ python_library( "//caffe2:torch", "//executorch/exir:error", "//executorch/exir:graph_module", + "//executorch/exir:pass_base", "//executorch/exir:pass_manager", "//executorch/exir:print_program", "//executorch/exir:schema", diff --git a/exir/program/_program.py b/exir/program/_program.py index 6b72d190f9d..144cd0d0e8e 100644 --- a/exir/program/_program.py +++ b/exir/program/_program.py @@ -9,12 +9,13 @@ import copy import io import logging -from typing import Any, Dict, List, Optional, Sequence, Set, TextIO, Union +from typing import Any, Dict, List, Optional, Sequence, Set, TextIO, Tuple, Union import torch import torch._export from executorch.exir._serialize import _serialize_pte_binary from executorch.exir._serialize._cord import Cord +from executorch.exir._warnings import experimental from executorch.exir.backend.backend_api import to_backend from executorch.exir.backend.partitioner import Partitioner from executorch.exir.capture._config import EdgeCompileConfig, ExecutorchBackendConfig @@ -1057,6 +1058,54 @@ def to_edge_transform_and_lower( return edge_manager +@experimental( + """ + This is an experimental API which overloads to_edge by preserving specified ops to not be decomposed. + This function will be combined with to_edge in the future. + """ +) +def to_edge_with_preserved_ops( + programs: Union[ExportedProgram, Dict[str, ExportedProgram]], + constant_methods: Optional[Dict[str, Any]] = None, + compile_config: Optional[EdgeCompileConfig] = None, + preserve_ops: Tuple[torch._ops.OpOverload, ...] = (), +) -> "EdgeProgramManager": + """ + :func:`to_edge` constructs an EdgeProgramManager from a set of exported programs in + ATen dialect. Upon construction those programs are transformed into edge dialect. 
+ + Args: + programs: Can be a single ExportedProgram or a dictionary mapping function names to their corresponding ExportedPrograms. If only a single ExportedProgram is provided it will be assigned the name "forward". + constant_methods: An optional dictionary of method name to the constant value returned by that method in eager mode. Often used to store config information on Edge models. + compile_config: An optional argument used to provide greater control over the transformation to edge dialect process. + preserve_ops: An argument used to specify ops that should not be decomposed. + + Returns: + EdgeProgramManager + """ + assert not isinstance(constant_methods, EdgeCompileConfig) + config = compile_config or EdgeCompileConfig() + if not isinstance(programs, dict): + aten_programs = {"forward": programs} + else: + aten_programs = programs + + edge_programs: Dict[str, ExportedProgram] = {} + + for name, program in aten_programs.items(): + # Decompose to Core ATen + program = program.run_decompositions( + _default_decomposition_table(), _preserve_ops=preserve_ops + ) + edge_programs[name] = _generate_edge_program( + name, config, program, list(preserve_ops) + ) + + return EdgeProgramManager( + edge_programs, constant_methods, config, list(preserve_ops) + ) + + def to_edge( programs: Union[ExportedProgram, Dict[str, ExportedProgram]], constant_methods: Optional[Dict[str, Any]] = None, diff --git a/exir/program/test/test_program.py b/exir/program/test/test_program.py index 73f023e778b..73eea7b93ef 100644 --- a/exir/program/test/test_program.py +++ b/exir/program/test/test_program.py @@ -26,6 +26,7 @@ ExecutorchProgramManager, to_edge, to_edge_transform_and_lower, + to_edge_with_preserved_ops, ) from executorch.exir.tracer import _default_decomposition_table from executorch.exir.verification.verifier import EXIREdgeDialectVerifier @@ -249,12 +250,10 @@ def test_executorch_manager_multi_config(self): def get_executorch_memory_planning_passes() -> Dict[str, MemoryPlanningPass]: return { "forward": MemoryPlanningPass( - memory_planning_algo="greedy", alloc_graph_input=True, alloc_graph_output=False, ), "foo": MemoryPlanningPass( - memory_planning_algo="greedy", alloc_graph_input=False, alloc_graph_output=True, ), @@ -716,3 +715,89 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: ) except SpecViolationError: self.fail("Should not error out on linalg_vector_norm op") + + def _test_to_edge_with_preserved_ops( + self, program, preserved_ops, expected_preserved_ops + ): + edge = to_edge_with_preserved_ops(program, preserve_ops=preserved_ops) + + def count_nodes(graph_module, target): + count = 0 + for node in graph_module.graph.nodes: + if node.op == "call_function" and node.target in target: + count += 1 + return count + + aten_ops_non_decomposed = count_nodes( + program.graph_module, + preserved_ops, + ) + + edge_ops_non_decomposed = count_nodes( + edge.exported_program().graph_module, + expected_preserved_ops, + ) + + self.assertEqual(aten_ops_non_decomposed, edge_ops_non_decomposed) + + def test_to_edge_with_single_preserved_op(self): + model = TestLinear() + program = torch.export.export(model, model._get_random_inputs()) + + ops_not_to_decompose = [ + torch.ops.aten.linear.default, + ] + expected_non_decomposed_edge_ops = [ + exir_ops.edge.aten.linear.default, + ] + + self._test_to_edge_with_preserved_ops( + program, ops_not_to_decompose, expected_non_decomposed_edge_ops + ) + + def test_to_edge_with_partial_ops_preserved(self): + model = TestLinearSDPACombined() + program = 
torch.export.export(model, model._get_random_inputs()) + + ops_not_to_decompose = [ + torch.ops.aten.linear.default, + ] + expected_non_decomposed_edge_ops = [ + exir_ops.edge.aten.linear.default, + ] + + self._test_to_edge_with_preserved_ops( + program, ops_not_to_decompose, expected_non_decomposed_edge_ops + ) + + def test_to_edge_with_multiple_ops_preserved(self): + model = TestLinearSDPACombined() + program = torch.export.export(model, model._get_random_inputs()) + + ops_not_to_decompose = [ + torch.ops.aten.linear.default, + torch.ops.aten.scaled_dot_product_attention.default, + ] + expected_non_decomposed_edge_ops = [ + exir_ops.edge.aten.linear.default, + exir_ops.edge.aten.scaled_dot_product_attention.default, + ] + + self._test_to_edge_with_preserved_ops( + program, ops_not_to_decompose, expected_non_decomposed_edge_ops + ) + + def test_to_edge_with_preserved_ops_not_in_model(self): + model = TestSDPA() + program = torch.export.export(model, model._get_random_inputs()) + + ops_not_to_decompose = [ + torch.ops.aten.linear.default, + ] + expected_non_decomposed_edge_ops = [ + exir_ops.edge.aten.linear.default, + ] + + self._test_to_edge_with_preserved_ops( + program, ops_not_to_decompose, expected_non_decomposed_edge_ops + ) diff --git a/exir/tests/test_memory_planning.py b/exir/tests/test_memory_planning.py index 12a0583ab41..ebea0acf0f4 100644 --- a/exir/tests/test_memory_planning.py +++ b/exir/tests/test_memory_planning.py @@ -17,6 +17,8 @@ from executorch.exir.memory_planning import ( filter_nodes, get_node_tensor_specs, + greedy, + naive, Verifier, ) from executorch.exir.pass_base import PassResult @@ -208,7 +210,7 @@ def forward(self, a: torch.Tensor) -> torch.Tensor: def maketest( module_cls: Type[torch.nn.Module], - criteria: Optional[List[Tuple[str, bool]]] = None, + criteria: Optional[List[Tuple[Callable[..., List[int]], bool]]] = None, extra_check: Optional[Callable[..., None]] = None, use_functionalization: bool = True, alloc_graph_input: bool = True, @@ -222,13 +224,15 @@ def wrapper(self: "TestMemoryPlanning") -> None: if not criteria: criteria = [ # naive algorithm does not reuse tensor storages - ("naive", False), + (naive, False), # greedy algorithm should reuse tensor storages in the testing model - ("greedy", True), + (greedy, True), ] for algo, expect_reuse in criteria: - print(f"algo {algo}, expect_reuse {expect_reuse}") + print( + f"algo {getattr(algo, '__name__', repr(algo))}, expect_reuse {expect_reuse}" + ) eager_module = module_cls().eval() inputs = eager_module.get_random_inputs() graph_module = ( @@ -353,8 +357,8 @@ def verify_overlap_placeholders( test_return_two: Callable[..., None] = maketest( ModuleReturnTwo, criteria=[ - ("naive", False), - ("greedy", True), + (naive, False), + (greedy, True), ], ) @@ -363,8 +367,8 @@ def verify_overlap_placeholders( test_list_arg: Callable[..., None] = maketest( ModuleListArg, criteria=[ - ("naive", False), - ("greedy", True), + (naive, False), + (greedy, True), ], extra_check=ModuleListArg.extra_check, ) @@ -466,12 +470,12 @@ def quantize(self, eager_model: nn.Module) -> nn.Module: @parameterized.expand( [ ( - "naive", + naive, [(1, 0), (3, 0), (1, 4), (3, 4), (1, 8)], [0, 12, 0, 8], ), ( - "greedy", + greedy, [(1, 0), (3, 0), (1, 4), (3, 4), (1, 0)], [0, 8, 0, 8], ), @@ -479,7 +483,7 @@ def quantize(self, eager_model: nn.Module) -> nn.Module: ) def test_multiple_pools( self, - algo: str, + algo: Callable[..., List[int]], expected_allocs: List[Tuple[int, int]], expected_bufsizes: List[int], ) -> None: @@ -550,9 
+554,7 @@ def count_planned_inputs( ep_no_input_planning = to_edge(export(model, inputs)).to_executorch( config=ExecutorchBackendConfig( - memory_planning_pass=MemoryPlanningPass( - "greedy", alloc_graph_input=False - ), + memory_planning_pass=MemoryPlanningPass(alloc_graph_input=False), sym_shape_eval_pass=ConstraintBasedSymShapeEvalPass(), ) ) @@ -572,9 +574,7 @@ def count_planned_inputs( ep_input_planning = to_edge(export(model, inputs)).to_executorch( config=ExecutorchBackendConfig( - memory_planning_pass=MemoryPlanningPass( - "greedy", alloc_graph_input=True - ), + memory_planning_pass=MemoryPlanningPass(alloc_graph_input=True), sym_shape_eval_pass=ConstraintBasedSymShapeEvalPass(), ) ) diff --git a/exir/tests/test_passes.py b/exir/tests/test_passes.py index a167a67dd94..d039db51876 100644 --- a/exir/tests/test_passes.py +++ b/exir/tests/test_passes.py @@ -713,7 +713,7 @@ def test_alloc_node_spec(self) -> None: self.assertIsNotNone(new_gm_res) new_gm = new_gm_res.graph_module - new_gm_res = MemoryPlanningPass("greedy")(new_gm) + new_gm_res = MemoryPlanningPass()(new_gm) self.assertIsNotNone(new_gm_res) new_gm = new_gm_res.graph_module @@ -1413,10 +1413,10 @@ def quantize_model( m_eager: torch.nn.Module, example_inputs: Tuple[torch.Tensor] ) -> Tuple[EdgeProgramManager, int, int]: # program capture - m = torch._export.capture_pre_autograd_graph( + m = torch.export.export_for_training( m_eager, example_inputs, - ) + ).module() quantizer = XNNPACKQuantizer() quantization_config = get_symmetric_quantization_config() diff --git a/exir/tests/test_remove_view_copy.py b/exir/tests/test_remove_view_copy.py index f64a1f19981..0925a8abc89 100644 --- a/exir/tests/test_remove_view_copy.py +++ b/exir/tests/test_remove_view_copy.py @@ -48,9 +48,7 @@ def test_disable(self) -> None: etpm = to_edge(ep).to_executorch( config=ExecutorchBackendConfig( remove_view_copy=False, - memory_planning_pass=MemoryPlanningPass( - "greedy", alloc_graph_input=False - ), + memory_planning_pass=MemoryPlanningPass(alloc_graph_input=False), ), ) @@ -72,9 +70,7 @@ def test_output_matches(self) -> None: etpm_remove = epm_remove.to_executorch( config=ExecutorchBackendConfig( remove_view_copy=True, - memory_planning_pass=MemoryPlanningPass( - "greedy", alloc_graph_input=False - ), + memory_planning_pass=MemoryPlanningPass(alloc_graph_input=False), ), ) @@ -82,9 +78,7 @@ def test_output_matches(self) -> None: etpm_no_remove = epm_no_remove.to_executorch( config=ExecutorchBackendConfig( remove_view_copy=True, - memory_planning_pass=MemoryPlanningPass( - "greedy", alloc_graph_input=False - ), + memory_planning_pass=MemoryPlanningPass(alloc_graph_input=False), ), ) @@ -107,9 +101,7 @@ def test_spec(self) -> None: etpm = to_edge(ep).to_executorch( config=ExecutorchBackendConfig( remove_view_copy=True, - memory_planning_pass=MemoryPlanningPass( - "greedy", alloc_graph_input=False - ), + memory_planning_pass=MemoryPlanningPass(alloc_graph_input=False), ), ) diff --git a/exir/tests/test_tracer.py b/exir/tests/test_tracer.py index 82c7ab118cb..415443c4c1e 100644 --- a/exir/tests/test_tracer.py +++ b/exir/tests/test_tracer.py @@ -100,6 +100,17 @@ def f(x: torch.Tensor) -> torch.Tensor: any(node.meta.get("stack_trace", None) for node in traced_f.graph.nodes) ) + def test_ones(self) -> None: + class M(torch.nn.Module): + def forward(self, x): + y = torch.ones(x.shape[0]) + return x + y + + ep = torch.export.export( + M(), (torch.ones(3),), dynamic_shapes={"x": {0: torch.export.Dim("x")}} + ) + exir.to_edge(ep) + def 
test_possible_input_mutation(self) -> None: def f(x: torch.Tensor) -> torch.Tensor: return torch.add(torch.ones(5), torch.ones(5), out=x) diff --git a/exir/verification/verifier.py b/exir/verification/verifier.py index b519e20393a..2c45929bf23 100644 --- a/exir/verification/verifier.py +++ b/exir/verification/verifier.py @@ -7,6 +7,7 @@ import itertools import operator import types +from contextlib import nullcontext from typing import Any, List, Optional, Tuple, Type import torch @@ -19,6 +20,7 @@ RunHigherOrderOperatorError, ) from torch._dispatch.python import enable_python_dispatcher +from torch._export.utils import _detect_fake_mode_from_gm from torch._export.verifier import SpecViolationError, Verifier from torch._ops import OpOverload @@ -161,8 +163,9 @@ def extract_input(node: torch.fx.Node) -> Optional[FakeTensor]: def _check_tensor_args_matching_op_allowed_dtype(gm: GraphModule) -> None: validator = EdgeOpArgValidator(gm) inputs = _get_inputs(gm) + fake_mode = _detect_fake_mode_from_gm(gm) or nullcontext() try: - with enable_python_dispatcher(): + with enable_python_dispatcher(), fake_mode: validator.run(*inputs) except RunHigherOrderOperatorError: # NB: ignore higher order operator in the graph. diff --git a/extension/android/BUCK b/extension/android/BUCK index dfc5db18137..5d021250e6d 100644 --- a/extension/android/BUCK +++ b/extension/android/BUCK @@ -10,6 +10,7 @@ fb_android_library( "src/main/java/org/pytorch/executorch/Module.java", "src/main/java/org/pytorch/executorch/NativePeer.java", "src/main/java/org/pytorch/executorch/Tensor.java", + "src/main/java/org/pytorch/executorch/annotations/Experimental.java", ], autoglob = False, language = "JAVA", @@ -22,8 +23,14 @@ fb_android_library( fb_android_library( name = "executorch_llama", srcs = [ + "src/main/java/org/pytorch/executorch/DType.java", + "src/main/java/org/pytorch/executorch/EValue.java", "src/main/java/org/pytorch/executorch/LlamaCallback.java", "src/main/java/org/pytorch/executorch/LlamaModule.java", + "src/main/java/org/pytorch/executorch/Module.java", + "src/main/java/org/pytorch/executorch/NativePeer.java", + "src/main/java/org/pytorch/executorch/Tensor.java", + "src/main/java/org/pytorch/executorch/annotations/Experimental.java", ], autoglob = False, language = "JAVA", diff --git a/extension/android/CMakeLists.txt b/extension/android/CMakeLists.txt index 9a1a14b113a..17b3e30eb41 100644 --- a/extension/android/CMakeLists.txt +++ b/extension/android/CMakeLists.txt @@ -30,6 +30,8 @@ set(executorch_DIR ${CMAKE_CURRENT_BINARY_DIR}/../../lib/cmake/ExecuTorch) find_package(executorch CONFIG REQUIRED) target_link_options_shared_lib(executorch) +add_library(executorch_jni SHARED jni/jni_layer.cpp) + set(link_libraries) list( APPEND @@ -58,13 +60,21 @@ else() list(APPEND link_libraries portable_ops_lib portable_kernels) target_link_options_shared_lib(portable_ops_lib) endif() + +if(TARGET quantized_kernels) + list(APPEND link_libraries quantized_kernels quantized_ops_lib) + target_link_options_shared_lib(quantized_ops_lib) +endif() + if(TARGET qnn_executorch_backend) list(APPEND link_libraries qnn_executorch_backend) endif() + if(TARGET xnnpack_backend) target_link_options_shared_lib(xnnpack_backend) - list(APPEND link_libraries xnnpack_backend XNNPACK pthreadpool cpuinfo) + list(APPEND link_libraries xnnpack_backend XNNPACK pthreadpool cpuinfo microkernels-prod) endif() + if(TARGET vulkan_backend) target_link_options_shared_lib(vulkan_backend) list(APPEND link_libraries vulkan_backend) @@ -79,7 +89,27 @@ 
if(EXECUTORCH_BUILD_KERNELS_CUSTOM) target_link_options_shared_lib(custom_ops) endif() -add_library(executorch_jni SHARED jni/jni_layer.cpp) +if(TARGET pthreadpool) + target_compile_definitions(executorch_jni PRIVATE ET_USE_THREADPOOL=1) + target_include_directories( + executorch_jni + PUBLIC + ${CMAKE_CURRENT_SOURCE_DIR}/../../backends/xnnpack/third-party/cpuinfo/include + ) + target_include_directories( + executorch_jni + PUBLIC + ${CMAKE_CURRENT_SOURCE_DIR}/../../backends/xnnpack/third-party/pthreadpool/include + ) +endif() + +if(EXECUTORCH_JNI_CUSTOM_LIBRARY) + list(APPEND link_libraries ${EXECUTORCH_JNI_CUSTOM_LIBRARY}) + target_link_libraries( + executorch_jni -Wl,--whole-archive ${EXECUTORCH_JNI_CUSTOM_LIBRARY} + -Wl,--no-whole-archive + ) +endif() if(EXECUTORCH_BUILD_LLAMA_JNI) target_sources(executorch_jni PRIVATE jni/jni_layer_llama.cpp) @@ -95,6 +125,17 @@ if(EXECUTORCH_BUILD_LLAMA_JNI) ${CMAKE_CURRENT_BINARY_DIR}/../../examples/models/llama2/runner ) +endif() + +target_include_directories( + executorch_jni PRIVATE ${_common_include_directories} +) + +target_compile_options(executorch_jni PUBLIC ${_common_compile_options}) + +target_link_libraries(executorch_jni ${link_libraries}) + +if (NEURON_BUFFER_ALLOCATOR_LIB) target_sources( executorch_jni PRIVATE ${EXECUTORCH_ROOT}/examples/mediatek/executor_runner/mtk_llama_runner.cpp @@ -112,33 +153,6 @@ if(EXECUTORCH_BUILD_LLAMA_JNI) ${EXECUTORCH_ROOT}/examples/mediatek/executor_runner/llama_runner ) ADD_LIBRARY(libneuron_buffer_allocator SHARED IMPORTED) - SET_PROPERTY(TARGET libneuron_buffer_allocator PROPERTY IMPORTED_LOCATION /Users/cmodi/Documents/ai/clean/executorch/backends/mediatek/libneuron_buffer_allocator.so) + SET_PROPERTY(TARGET libneuron_buffer_allocator PROPERTY IMPORTED_LOCATION ${NEURON_BUFFER_ALLOCATOR_LIB}/libneuron_buffer_allocator.so) list(APPEND link_libraries neuron_backend libneuron_buffer_allocator) endif() - -if(TARGET quantized_kernels) - list(APPEND link_libraries quantized_kernels quantized_ops_lib) - target_link_options_shared_lib(quantized_ops_lib) -endif() - -target_include_directories( - executorch_jni PRIVATE ${_common_include_directories} -) - -target_compile_options(executorch_jni PUBLIC ${_common_compile_options}) - -target_link_libraries(executorch_jni ${link_libraries}) - -if(TARGET pthreadpool) - target_compile_definitions(executorch_jni PRIVATE ET_USE_THREADPOOL=1) - target_include_directories( - executorch_jni - PUBLIC - ${CMAKE_CURRENT_SOURCE_DIR}/../../backends/xnnpack/third-party/cpuinfo/include - ) - target_include_directories( - executorch_jni - PUBLIC - ${CMAKE_CURRENT_SOURCE_DIR}/../../backends/xnnpack/third-party/pthreadpool/include - ) -endif() diff --git a/extension/android/benchmark/README.md b/extension/android/benchmark/README.md new file mode 100644 index 00000000000..cfc5ef0e594 --- /dev/null +++ b/extension/android/benchmark/README.md @@ -0,0 +1,60 @@ +Minibench: ExecuTorch Android Benchmark App +=== + +Minibench is a benchmarking app for testing the performance of the ExecuTorch runtime on Android devices. + +It supports both generic models (vision, audio, etc.) and LLMs. + +- For generic models, it reports metrics such as model load time and average inference time. +- For LLMs, it reports metrics such as model load time and tokens per second. +- We are working on providing more metrics in the future. + +Minibench is useful for providing reference performance data when developers integrate ExecuTorch with their own Android app.
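The metrics listed above are written by `BenchmarkActivity` and `LlmBenchmarkActivity` (further down in this diff) to `files/benchmark_results.json`, which the "Fetch results" section below pulls with `adb`. The following is a small host-side sketch for summarizing that file; it is illustrative only and not part of the app, and it assumes `adb` is on the `PATH` and that Gson's default field naming is in effect, so each JSON entry exposes `metric` and `actualValue` keys:

```python
# Illustrative helper only: pull minibench results over adb and print them.
# Assumes the adb command shown in the "Fetch results" section works on the
# attached device and that the JSON is a list of BenchmarkMetric objects.
import json
import subprocess


def fetch_minibench_results(serial=None):
    cmd = ["adb"]
    if serial:
        cmd += ["-s", serial]
    cmd += ["shell", "run-as", "org.pytorch.minibench",
            "cat", "files/benchmark_results.json"]
    raw = subprocess.check_output(cmd, text=True)
    return {entry["metric"]: entry["actualValue"] for entry in json.loads(raw)}


if __name__ == "__main__":
    for metric, value in fetch_minibench_results().items():
        print(f"{metric}: {value}")
```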
+ +## Build +You will need the ExecuTorch AAR for the Java and JNI dependencies. +``` +export ANDROID_NDK= +sh build/build_android_llm_demo.sh +``` +and copy the AAR to `app/libs`. +``` +mkdir -p app/libs +cp $BUILD_AAR_DIR/executorch.aar app/libs +``` + +You can also refer to [this script](https://github.com/pytorch/executorch/blob/62024d8/.github/workflows/android-perf.yml#L226-L235) to see how it is built. + +Then you can build and install the app in Android Studio, or simply run +``` +./gradlew installDebug +``` + +## Usage +This APK does not come with a launcher icon. Instead, trigger it from the command line. + +### Push model to a directory +``` +adb shell mkdir /data/local/tmp/minibench +adb push my_model.pte /data/local/tmp/minibench +# optionally, push tokenizer for LLM +adb push tokenizer.bin /data/local/tmp/minibench +``` + +### Generic model +``` +adb shell am start -W -S -n org.pytorch.minibench/org.pytorch.minibench.BenchmarkActivity \ + --es model_dir /data/local/tmp/minibench +``` + +### LLM +``` +adb shell am start -W -S -n org.pytorch.minibench/org.pytorch.minibench.LlmBenchmarkActivity \ + --es model_dir /data/local/tmp/minibench --es tokenizer_path /data/local/tmp/minibench/tokenizer.bin +``` + +### Fetch results +``` +adb shell run-as org.pytorch.minibench cat files/benchmark_results.json +``` +If the ExecuTorch runner is initialized and loads your model, but there is a load error or run error, you will see the error code in that JSON. diff --git a/extension/android/benchmark/android-llm-device-farm-test-spec.yml b/extension/android/benchmark/android-llm-device-farm-test-spec.yml index 4e3274ce66f..ffb528767a5 100644 --- a/extension/android/benchmark/android-llm-device-farm-test-spec.yml +++ b/extension/android/benchmark/android-llm-device-farm-test-spec.yml @@ -10,18 +10,21 @@ phases: commands: # Prepare the model and the tokenizer - adb -s $DEVICEFARM_DEVICE_UDID shell "ls -la /sdcard/" - - adb -s $DEVICEFARM_DEVICE_UDID shell "mkdir -p /data/local/tmp/llama/" - - adb -s $DEVICEFARM_DEVICE_UDID shell "mv /sdcard/*.bin /data/local/tmp/llama/" - - adb -s $DEVICEFARM_DEVICE_UDID shell "mv /sdcard/*.pte /data/local/tmp/llama/" - - adb -s $DEVICEFARM_DEVICE_UDID shell "chmod 664 /data/local/tmp/llama/*.bin" - - adb -s $DEVICEFARM_DEVICE_UDID shell "chmod 664 /data/local/tmp/llama/*.pte" - - adb -s $DEVICEFARM_DEVICE_UDID shell "ls -la /data/local/tmp/llama/" + - adb -s $DEVICEFARM_DEVICE_UDID shell "mkdir -p /data/local/tmp/minibench/" + - adb -s $DEVICEFARM_DEVICE_UDID shell "mv /sdcard/*.bin /data/local/tmp/minibench/" + - adb -s $DEVICEFARM_DEVICE_UDID shell "mv /sdcard/*.pte /data/local/tmp/minibench/" + - adb -s $DEVICEFARM_DEVICE_UDID shell "chmod 664 /data/local/tmp/minibench/*.bin" + - adb -s $DEVICEFARM_DEVICE_UDID shell "chmod 664 /data/local/tmp/minibench/*.pte" + - adb -s $DEVICEFARM_DEVICE_UDID shell "ls -la /data/local/tmp/minibench/" + - adb -s $DEVICEFARM_DEVICE_UDID shell "run-as org.pytorch.minibench rm -rf files" test: commands: # By default, the following ADB command is used by Device Farm to run your Instrumentation test.
# Please refer to Android's documentation for more options on running instrumentation tests with adb: # https://developer.android.com/studio/test/command-line#run-tests-with-adb + + # Run the Instrumentation test for sanity check - echo "Starting the Instrumentation test" - | adb -s $DEVICEFARM_DEVICE_UDID shell "am instrument -r -w --no-window-animation \ @@ -67,17 +70,33 @@ phases: fi; # Run the new generic benchmark activity https://developer.android.com/tools/adb#am - - echo "Run LLM benchmark" + - echo "Determine model type" + - | + BIN_FOUND="$(adb -s $DEVICEFARM_DEVICE_UDID shell find /data/local/tmp/minibench/ -name '*.bin')" + if [ -z "$BIN_FOUND" ]; then + echo "No tokenizer files found in /data/local/tmp/minibench/" + else + echo "tokenizer files found in /data/local/tmp/minibench/" + fi + + - echo "Run benchmark" - | - adb -s $DEVICEFARM_DEVICE_UDID shell am start -W -n org.pytorch.minibench/.LlmBenchmarkActivity \ - --es "model_dir" "/data/local/tmp/llama" \ - --es "tokenizer_path" "/data/local/tmp/llama/tokenizer.bin" + adb -s $DEVICEFARM_DEVICE_UDID shell am force-stop org.pytorch.minibench + if [ -z "$BIN_FOUND" ]; then + adb -s $DEVICEFARM_DEVICE_UDID shell am start -W -n org.pytorch.minibench/.BenchmarkActivity \ + --es "model_dir" "/data/local/tmp/minibench" + else + adb -s $DEVICEFARM_DEVICE_UDID shell am start -W -n org.pytorch.minibench/.LlmBenchmarkActivity \ + --es "model_dir" "/data/local/tmp/minibench" \ + --es "tokenizer_path" "/data/local/tmp/minibench/tokenizer.bin" + fi + post_test: commands: - - echo "Gather LLM benchmark results" + - echo "Gather benchmark results" - | - BENCHMARK_RESULTS="" + BENCHMARK_RESULTS=$(adb -s $DEVICEFARM_DEVICE_UDID shell run-as org.pytorch.minibench cat files/benchmark_results.json) ATTEMPT=0 MAX_ATTEMPT=10 while [ -z "${BENCHMARK_RESULTS}" ] && [ $ATTEMPT -lt $MAX_ATTEMPT ]; do diff --git a/extension/android/benchmark/app/src/main/AndroidManifest.xml b/extension/android/benchmark/app/src/main/AndroidManifest.xml index 098905c052c..7f62c509d55 100644 --- a/extension/android/benchmark/app/src/main/AndroidManifest.xml +++ b/extension/android/benchmark/app/src/main/AndroidManifest.xml @@ -3,11 +3,16 @@ xmlns:tools="http://schemas.android.com/tools"> + + diff --git a/extension/android/benchmark/app/src/main/java/org/pytorch/minibench/BenchmarkActivity.java b/extension/android/benchmark/app/src/main/java/org/pytorch/minibench/BenchmarkActivity.java index a79f668f80b..2397bcfb851 100644 --- a/extension/android/benchmark/app/src/main/java/org/pytorch/minibench/BenchmarkActivity.java +++ b/extension/android/benchmark/app/src/main/java/org/pytorch/minibench/BenchmarkActivity.java @@ -11,16 +11,29 @@ import android.app.Activity; import android.content.Intent; import android.os.Bundle; +import android.system.ErrnoException; +import android.system.Os; +import com.google.gson.Gson; import java.io.File; import java.io.FileWriter; import java.io.IOException; +import java.util.ArrayList; import java.util.Arrays; +import java.util.List; +import java.util.stream.Collectors; import org.pytorch.executorch.Module; public class BenchmarkActivity extends Activity { @Override protected void onCreate(Bundle savedInstanceState) { super.onCreate(savedInstanceState); + + try { + Os.setenv("ADSP_LIBRARY_PATH", getApplicationInfo().nativeLibraryDir, true); + } catch (ErrnoException e) { + finish(); + } + Intent intent = getIntent(); File modelDir = new File(intent.getStringExtra("model_dir")); File model = @@ -32,20 +45,56 @@ protected void onCreate(Bundle 
savedInstanceState) { int numIter = intent.getIntExtra("num_iter", 10); // TODO: Format the string with a parsable format - StringBuilder resultText = new StringBuilder(); + Stats stats = new Stats(); + // Record the time it takes to load the model and the forward method + stats.loadStart = System.nanoTime(); Module module = Module.load(model.getPath()); + stats.errorCode = module.loadMethod("forward"); + stats.loadEnd = System.nanoTime(); + for (int i = 0; i < numIter; i++) { - long start = System.currentTimeMillis(); + long start = System.nanoTime(); module.forward(); - long forwardMs = System.currentTimeMillis() - start; - resultText.append(forwardMs).append(";"); + double forwardMs = (System.nanoTime() - start) * 1e-6; + stats.latency.add(forwardMs); } - try (FileWriter writer = new FileWriter(getFilesDir() + "/benchmark_results.txt")) { - writer.write(resultText.toString()); + final BenchmarkMetric.BenchmarkModel benchmarkModel = + BenchmarkMetric.extractBackendAndQuantization(model.getName().replace(".pte", "")); + final List results = new ArrayList<>(); + // The list of metrics we have atm includes: + // Avg inference latency after N iterations + results.add( + new BenchmarkMetric( + benchmarkModel, + "avg_inference_latency(ms)", + stats.latency.stream().mapToDouble(l -> l).average().orElse(0.0f), + 0.0f)); + // Model load time + results.add( + new BenchmarkMetric( + benchmarkModel, "model_load_time(ms)", (stats.loadEnd - stats.loadStart) * 1e-6, 0.0f)); + // Load status + results.add(new BenchmarkMetric(benchmarkModel, "load_status", stats.errorCode, 0)); + + try (FileWriter writer = new FileWriter(getFilesDir() + "/benchmark_results.json")) { + Gson gson = new Gson(); + writer.write(gson.toJson(results)); } catch (IOException e) { e.printStackTrace(); } } } + +class Stats { + long loadStart; + long loadEnd; + List latency = new ArrayList<>(); + int errorCode = 0; + + @Override + public String toString() { + return "latency: " + latency.stream().map(Object::toString).collect(Collectors.joining("")); + } +} diff --git a/extension/android/benchmark/app/src/main/java/org/pytorch/minibench/BenchmarkMetric.java b/extension/android/benchmark/app/src/main/java/org/pytorch/minibench/BenchmarkMetric.java new file mode 100644 index 00000000000..22ee7b84804 --- /dev/null +++ b/extension/android/benchmark/app/src/main/java/org/pytorch/minibench/BenchmarkMetric.java @@ -0,0 +1,74 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +package org.pytorch.minibench; + +import android.app.ActivityManager; +import android.os.Build; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +class BenchmarkMetric { + public static class BenchmarkModel { + // The model name, i.e. stories110M + String name; + String backend; + String quantization; + + public BenchmarkModel(final String name, final String backend, final String quantization) { + this.name = name; + this.backend = backend; + this.quantization = quantization; + } + } + + BenchmarkModel benchmarkModel; + + // The metric name, i.e. 
TPS + String metric; + + // The actual value and the option target value + double actualValue; + double targetValue; + + public static class DeviceInfo { + // Let's see which information we want to include here + final String device = Build.BRAND; + // The phone model and Android release version + final String arch = Build.MODEL; + final String os = "Android " + Build.VERSION.RELEASE; + final long totalMem = new ActivityManager.MemoryInfo().totalMem; + final long availMem = new ActivityManager.MemoryInfo().availMem; + } + + DeviceInfo deviceInfo = new DeviceInfo(); + + public BenchmarkMetric( + final BenchmarkModel benchmarkModel, + final String metric, + final double actualValue, + final double targetValue) { + this.benchmarkModel = benchmarkModel; + this.metric = metric; + this.actualValue = actualValue; + this.targetValue = targetValue; + } + + // TODO (huydhn): Figure out a way to extract the backend and quantization information from + // the .pte model itself instead of parsing its name + public static BenchmarkMetric.BenchmarkModel extractBackendAndQuantization(final String model) { + final Matcher m = + Pattern.compile("(?\\w+)_(?\\w+)_(?\\w+)").matcher(model); + if (m.matches()) { + return new BenchmarkMetric.BenchmarkModel( + m.group("name"), m.group("backend"), m.group("quantization")); + } else { + return new BenchmarkMetric.BenchmarkModel(model, "", ""); + } + } +} diff --git a/extension/android/benchmark/app/src/main/java/org/pytorch/minibench/LlmBenchmarkActivity.java b/extension/android/benchmark/app/src/main/java/org/pytorch/minibench/LlmBenchmarkActivity.java index 496cbde53d6..3bc38aad403 100644 --- a/extension/android/benchmark/app/src/main/java/org/pytorch/minibench/LlmBenchmarkActivity.java +++ b/extension/android/benchmark/app/src/main/java/org/pytorch/minibench/LlmBenchmarkActivity.java @@ -11,12 +11,18 @@ import android.app.Activity; import android.content.Intent; import android.os.Bundle; +import android.system.ErrnoException; +import android.system.Os; import android.util.Log; import com.google.gson.Gson; import java.io.File; import java.io.FileWriter; import java.io.IOException; +import java.util.ArrayList; import java.util.Arrays; +import java.util.List; +import java.util.regex.Matcher; +import java.util.regex.Pattern; public class LlmBenchmarkActivity extends Activity implements ModelRunnerCallback { ModelRunner mModelRunner; @@ -44,20 +50,28 @@ protected void onCreate(Bundle savedInstanceState) { mPrompt = "The ultimate answer"; } + try { + Os.setenv("ADSP_LIBRARY_PATH", getApplicationInfo().nativeLibraryDir, true); + } catch (ErrnoException e) { + finish(); + } + mStatsInfo = new StatsInfo(); + mStatsInfo.modelName = model.getName().replace(".pte", ""); mModelRunner = new ModelRunner(model.getPath(), tokenizerPath, temperature, this); - mStatsInfo.loadStart = System.currentTimeMillis(); + mStatsInfo.loadStart = System.nanoTime(); } @Override public void onModelLoaded(int status) { - mStatsInfo.loadEnd = System.currentTimeMillis(); + mStatsInfo.loadEnd = System.nanoTime(); + mStatsInfo.loadStatus = status; if (status != 0) { Log.e("LlmBenchmarkRunner", "Loaded failed: " + status); onGenerationStopped(); return; } - mStatsInfo.generateStart = System.currentTimeMillis(); + mStatsInfo.generateStart = System.nanoTime(); mModelRunner.generate(mPrompt); } @@ -71,32 +85,58 @@ public void onStats(String stats) { @Override public void onGenerationStopped() { - mStatsInfo.generateEnd = System.currentTimeMillis(); + mStatsInfo.generateEnd = System.nanoTime(); + + final 
BenchmarkMetric.BenchmarkModel benchmarkModel = + BenchmarkMetric.extractBackendAndQuantization(mStatsInfo.modelName); + final List results = new ArrayList<>(); + // The list of metrics we have atm includes: + // Load status + results.add(new BenchmarkMetric(benchmarkModel, "load_status", mStatsInfo.loadStatus, 0)); + // Model load time + results.add( + new BenchmarkMetric( + benchmarkModel, + "model_load_time(ms)", + (mStatsInfo.loadEnd - mStatsInfo.loadStart) * 1e-6, + 0.0f)); + // LLM generate time + results.add( + new BenchmarkMetric( + benchmarkModel, + "generate_time(ms)", + (mStatsInfo.generateEnd - mStatsInfo.generateStart) * 1e-6, + 0.0f)); + // Token per second + results.add( + new BenchmarkMetric(benchmarkModel, "token_per_sec", extractTPS(mStatsInfo.tokens), 0.0f)); - // TODO (huydhn): Remove txt files here once the JSON format is ready - try (FileWriter writer = new FileWriter(getFilesDir() + "/benchmark_results.txt")) { - writer.write(mStatsInfo.toString()); - } catch (IOException e) { - e.printStackTrace(); - } - - // TODO (huydhn): Figure out on what the final JSON results looks like, we need something - // with the same number of fields as https://github.com/pytorch/pytorch/pull/135042 try (FileWriter writer = new FileWriter(getFilesDir() + "/benchmark_results.json")) { Gson gson = new Gson(); - writer.write(gson.toJson(mStatsInfo)); + writer.write(gson.toJson(results)); } catch (IOException e) { e.printStackTrace(); } } + + private double extractTPS(final String tokens) { + final Matcher m = Pattern.compile("\\d+\\.?\\d*").matcher(tokens); + if (m.find()) { + return Double.parseDouble(m.group()); + } else { + return 0.0f; + } + } } class StatsInfo { + int loadStatus; long loadStart; long loadEnd; long generateStart; long generateEnd; String tokens; + String modelName; @Override public String toString() { diff --git a/extension/android/jni/jni_layer_constants.h b/extension/android/jni/jni_layer_constants.h index 43946ffab6e..b710dbe8e08 100644 --- a/extension/android/jni/jni_layer_constants.h +++ b/extension/android/jni/jni_layer_constants.h @@ -8,7 +8,7 @@ #include -#include +#include namespace executorch::extension { @@ -37,7 +37,7 @@ constexpr static int kTensorDTypeBits4x2 = 20; constexpr static int kTensorDTypeBits8 = 21; constexpr static int kTensorDTypeBits16 = 22; -using torch::executor::ScalarType; +using exec_aten::ScalarType; const std::unordered_map scalar_type_to_java_dtype = { {ScalarType::Byte, kTensorDTypeUInt8}, diff --git a/extension/android/jni/jni_layer_llama.cpp b/extension/android/jni/jni_layer_llama.cpp index 50476df5690..1fa90280e75 100644 --- a/extension/android/jni/jni_layer_llama.cpp +++ b/extension/android/jni/jni_layer_llama.cpp @@ -31,7 +31,8 @@ #include #include -using namespace torch::executor; +namespace llm = ::executorch::extension::llm; +using ::executorch::runtime::Error; namespace executorch_jni { @@ -49,7 +50,7 @@ class ExecuTorchLlamaCallbackJni method(self(), s); } - void onStats(const Stats& result) const { + void onStats(const llm::Stats& result) const { static auto cls = ExecuTorchLlamaCallbackJni::javaClassStatic(); static const auto method = cls->getMethod("onStats"); double eval_time = @@ -67,8 +68,8 @@ class ExecuTorchLlamaJni private: friend HybridBase; int model_type_category_; - std::unique_ptr runner_; - std::unique_ptr multi_modal_runner_; + std::unique_ptr runner_; + std::unique_ptr multi_modal_runner_; std::unique_ptr mtk_llama_runner_; public: @@ -97,22 +98,22 @@ class ExecuTorchLlamaJni #if defined(ET_USE_THREADPOOL) 
// Reserve 1 thread for the main thread. uint32_t num_performant_cores = - torch::executorch::cpuinfo::get_num_performant_cores() - 1; + ::executorch::extension::cpuinfo::get_num_performant_cores() - 1; if (num_performant_cores > 0) { ET_LOG(Info, "Resetting threadpool to %d threads", num_performant_cores); - torch::executorch::threadpool::get_threadpool()->_unsafe_reset_threadpool( - num_performant_cores); + ::executorch::extension::threadpool::get_threadpool() + ->_unsafe_reset_threadpool(num_performant_cores); } #endif model_type_category_ = model_type_category; if (model_type_category == MODEL_TYPE_CATEGORY_MULTIMODAL) { - multi_modal_runner_ = std::make_unique( + multi_modal_runner_ = std::make_unique( model_path->toStdString().c_str(), tokenizer_path->toStdString().c_str(), temperature); } else if (model_type_category == MODEL_TYPE_CATEGORY_LLM) { - runner_ = std::make_unique( + runner_ = std::make_unique( model_path->toStdString().c_str(), tokenizer_path->toStdString().c_str(), temperature); @@ -135,7 +136,7 @@ class ExecuTorchLlamaJni jboolean echo) { if (model_type_category_ == MODEL_TYPE_CATEGORY_MULTIMODAL) { auto image_size = image->size(); - std::vector images; + std::vector images; if (image_size != 0) { std::vector image_data_jint(image_size); std::vector image_data(image_size); @@ -143,7 +144,7 @@ class ExecuTorchLlamaJni for (int i = 0; i < image_size; i++) { image_data[i] = image_data_jint[i]; } - Image image_runner{image_data, width, height, channels}; + llm::Image image_runner{image_data, width, height, channels}; images.push_back(image_runner); } multi_modal_runner_->generate( @@ -151,14 +152,14 @@ class ExecuTorchLlamaJni prompt->toStdString(), seq_len, [callback](std::string result) { callback->onResult(result); }, - [callback](const Stats& result) { callback->onStats(result); }, + [callback](const llm::Stats& result) { callback->onStats(result); }, echo); } else if (model_type_category_ == MODEL_TYPE_CATEGORY_LLM) { runner_->generate( prompt->toStdString(), seq_len, [callback](std::string result) { callback->onResult(result); }, - [callback](const Stats& result) { callback->onStats(result); }, + [callback](const llm::Stats& result) { callback->onStats(result); }, echo); } else if (model_type_category_ == MODEL_TYPE_MEDIATEK_LLAMA) { mtk_llama_runner_->generate( @@ -213,7 +214,7 @@ class ExecuTorchLlamaJni } auto image_size = image->size(); - std::vector images; + std::vector images; if (image_size != 0) { std::vector image_data_jint(image_size); std::vector image_data(image_size); @@ -221,7 +222,7 @@ class ExecuTorchLlamaJni for (int i = 0; i < image_size; i++) { image_data[i] = image_data_jint[i]; } - Image image_runner{image_data, width, height, channels}; + llm::Image image_runner{image_data, width, height, channels}; images.push_back(image_runner); } // TODO(hsz): make start_pos a reference and update it here @@ -246,9 +247,7 @@ class ExecuTorchLlamaJni seq_len, start_pos, [callback](const std::string& result) { callback->onResult(result); }, - [callback](const ::executorch::extension::llm::Stats& stats) { - callback->onStats(stats); - }, + [callback](const llm::Stats& stats) { callback->onStats(stats); }, echo)); } diff --git a/extension/android/src/main/java/org/pytorch/executorch/DType.java b/extension/android/src/main/java/org/pytorch/executorch/DType.java index 97da05a0af1..f5d33d0b71e 100644 --- a/extension/android/src/main/java/org/pytorch/executorch/DType.java +++ b/extension/android/src/main/java/org/pytorch/executorch/DType.java @@ -8,60 +8,63 @@ 
package org.pytorch.executorch; +import org.pytorch.executorch.annotations.Experimental; + /** * Codes representing tensor data types. * *
<p>
Warning: These APIs are experimental and subject to change without notice */ +@Experimental public enum DType { // NOTE: "jniCode" must be kept in sync with scalar_type.h. // NOTE: Never serialize "jniCode", because it can change between releases. - /** Code for dtype torch::executor::Byte */ + /** Code for dtype ScalarType::Byte */ UINT8(0), - /** Code for dtype torch::executor::Char */ + /** Code for dtype ScalarType::Char */ INT8(1), - /** Code for dtype torch::executor::Short */ + /** Code for dtype ScalarType::Short */ INT16(2), - /** Code for dtype torch::executor::Int */ + /** Code for dtype ScalarType::Int */ INT32(3), - /** Code for dtype torch::executor::Long */ + /** Code for dtype ScalarType::Long */ INT64(4), - /** Code for dtype torch::executor::Half */ + /** Code for dtype ScalarType::Half */ HALF(5), - /** Code for dtype torch::executor::Float */ + /** Code for dtype ScalarType::Float */ FLOAT(6), - /** Code for dtype torch::executor::Double */ + /** Code for dtype ScalarType::Double */ DOUBLE(7), - /** Code for dtype torch::executor::ComplexHalf */ + /** Code for dtype ScalarType::ComplexHalf */ COMPLEX_HALF(8), - /** Code for dtype torch::executor::ComplexFloat */ + /** Code for dtype ScalarType::ComplexFloat */ COMPLEX_FLOAT(9), - /** Code for dtype torch::executor::ComplexDouble */ + /** Code for dtype ScalarType::ComplexDouble */ COMPLEX_DOUBLE(10), - /** Code for dtype torch::executor::Bool */ + /** Code for dtype ScalarType::Bool */ BOOL(11), - /** Code for dtype torch::executor::QInt8 */ + /** Code for dtype ScalarType::QInt8 */ QINT8(12), - /** Code for dtype torch::executor::QUInt8 */ + /** Code for dtype ScalarType::QUInt8 */ QUINT8(13), - /** Code for dtype torch::executor::QInt32 */ + /** Code for dtype ScalarType::QInt32 */ QINT32(14), - /** Code for dtype torch::executor::BFloat16 */ + /** Code for dtype ScalarType::BFloat16 */ BFLOAT16(15), - /** Code for dtype torch::executor::QUInt4x2 */ + /** Code for dtype ScalarType::QUInt4x2 */ QINT4X2(16), - /** Code for dtype torch::executor::QUInt2x4 */ + /** Code for dtype ScalarType::QUInt2x4 */ QINT2X4(17), - /** Code for dtype torch::executor::Bits1x8 */ + /** Code for dtype ScalarType::Bits1x8 */ BITS1X8(18), - /** Code for dtype torch::executor::Bits2x4 */ + /** Code for dtype ScalarType::Bits2x4 */ BITS2X4(19), - /** Code for dtype torch::executor::Bits4x2 */ + /** Code for dtype ScalarType::Bits4x2 */ BITS4X2(20), - /** Code for dtype torch::executor::Bits8 */ + /** Code for dtype ScalarType::Bits8 */ BITS8(21), - /** Code for dtype torch::executor::Bits16 */ + /** Code for dtype ScalarType::Bits16 */ BITS16(22), ; diff --git a/extension/android/src/main/java/org/pytorch/executorch/EValue.java b/extension/android/src/main/java/org/pytorch/executorch/EValue.java index 971545d62fb..0065d808728 100644 --- a/extension/android/src/main/java/org/pytorch/executorch/EValue.java +++ b/extension/android/src/main/java/org/pytorch/executorch/EValue.java @@ -11,6 +11,7 @@ import com.facebook.jni.annotations.DoNotStrip; import java.util.Locale; import java.util.Optional; +import org.pytorch.executorch.annotations.Experimental; /** * Java representation of an ExecuTorch value, which is implemented as tagged union that can be one @@ -30,6 +31,7 @@ * *

Warning: These APIs are experimental and subject to change without notice */ +@Experimental @DoNotStrip public class EValue { private static final int TYPE_CODE_NONE = 0; diff --git a/extension/android/src/main/java/org/pytorch/executorch/LlamaCallback.java b/extension/android/src/main/java/org/pytorch/executorch/LlamaCallback.java index 2d327925d17..b30fa2515a9 100644 --- a/extension/android/src/main/java/org/pytorch/executorch/LlamaCallback.java +++ b/extension/android/src/main/java/org/pytorch/executorch/LlamaCallback.java @@ -9,6 +9,7 @@ package org.pytorch.executorch; import com.facebook.jni.annotations.DoNotStrip; +import org.pytorch.executorch.annotations.Experimental; /** * Callback interface for Llama model. Users can implement this interface to receive the generated @@ -16,6 +17,7 @@ * *

Warning: These APIs are experimental and subject to change without notice */ +@Experimental public interface LlamaCallback { /** * Called when a new result is available from JNI. Users will keep getting onResult() invocations diff --git a/extension/android/src/main/java/org/pytorch/executorch/LlamaModule.java b/extension/android/src/main/java/org/pytorch/executorch/LlamaModule.java index 7c77dbae08f..6de26bc7fe8 100644 --- a/extension/android/src/main/java/org/pytorch/executorch/LlamaModule.java +++ b/extension/android/src/main/java/org/pytorch/executorch/LlamaModule.java @@ -12,6 +12,7 @@ import com.facebook.jni.annotations.DoNotStrip; import com.facebook.soloader.nativeloader.NativeLoader; import com.facebook.soloader.nativeloader.SystemDelegate; +import org.pytorch.executorch.annotations.Experimental; /** * LlamaModule is a wrapper around the Executorch Llama model. It provides a simple interface to @@ -19,6 +20,7 @@ * *

Warning: These APIs are experimental and subject to change without notice */ +@Experimental public class LlamaModule { public static final int MODEL_TYPE_TEXT = 1; @@ -171,7 +173,7 @@ public long prefillPrompt(String prompt, long startPos, int bos, int eos) { * @param prompt The text prompt to LLaVA. * @param seqLen The total sequence length, including the prompt tokens and new tokens. * @param startPos The starting position in KV cache of the input in the LLM. - * @param llamaCallback callback object to receive results. + * @param callback callback object to receive results. * @param echo indicate whether to echo the input prompt or not. * @return The error code. */ diff --git a/extension/android/src/main/java/org/pytorch/executorch/Module.java b/extension/android/src/main/java/org/pytorch/executorch/Module.java index de2ed78b520..084f1be23f5 100644 --- a/extension/android/src/main/java/org/pytorch/executorch/Module.java +++ b/extension/android/src/main/java/org/pytorch/executorch/Module.java @@ -11,12 +11,14 @@ import com.facebook.soloader.nativeloader.NativeLoader; import com.facebook.soloader.nativeloader.SystemDelegate; import java.util.Map; +import org.pytorch.executorch.annotations.Experimental; /** * Java wrapper for ExecuTorch Module. * *

Warning: These APIs are experimental and subject to change without notice */ +@Experimental public class Module { /** Load mode for the module. Load the whole file as a buffer. */ diff --git a/extension/android/src/main/java/org/pytorch/executorch/NativePeer.java b/extension/android/src/main/java/org/pytorch/executorch/NativePeer.java index f63de985069..6d0078b0b62 100644 --- a/extension/android/src/main/java/org/pytorch/executorch/NativePeer.java +++ b/extension/android/src/main/java/org/pytorch/executorch/NativePeer.java @@ -12,12 +12,14 @@ import com.facebook.jni.annotations.DoNotStrip; import com.facebook.soloader.nativeloader.NativeLoader; import java.util.Map; +import org.pytorch.executorch.annotations.Experimental; /** * Interface for the native peer object for entry points to the Module * *

Warning: These APIs are experimental and subject to change without notice */ +@Experimental class NativePeer { static { // Loads libexecutorch.so from jniLibs diff --git a/extension/android/src/main/java/org/pytorch/executorch/Tensor.java b/extension/android/src/main/java/org/pytorch/executorch/Tensor.java index 8a1639703d3..685110ff9ae 100644 --- a/extension/android/src/main/java/org/pytorch/executorch/Tensor.java +++ b/extension/android/src/main/java/org/pytorch/executorch/Tensor.java @@ -19,6 +19,7 @@ import java.nio.LongBuffer; import java.util.Arrays; import java.util.Locale; +import org.pytorch.executorch.annotations.Experimental; /** * Representation of an ExecuTorch Tensor. Behavior is similar to PyTorch's tensor objects. @@ -39,6 +40,7 @@ * *

Warning: These APIs are experimental and subject to change without notice */ +@Experimental public abstract class Tensor { private static final String ERROR_MSG_DATA_BUFFER_NOT_NULL = "Data buffer must be not null"; private static final String ERROR_MSG_DATA_ARRAY_NOT_NULL = "Data array must be not null"; diff --git a/extension/android/src/main/java/org/pytorch/executorch/annotations/Experimental.java b/extension/android/src/main/java/org/pytorch/executorch/annotations/Experimental.java new file mode 100644 index 00000000000..f5f36fc56da --- /dev/null +++ b/extension/android/src/main/java/org/pytorch/executorch/annotations/Experimental.java @@ -0,0 +1,18 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +package org.pytorch.executorch.annotations; + +/** + * This annotation indicates that an API is experimental and may change or be removed at any time. + * It does not provide any guarantees for API stability or backward-compatibility. + * + *

This status is not permanent, and APIs marked with this annotation will need to be either made + * more robust or removed in the future. + */ +public @interface Experimental {} diff --git a/extension/apple/Benchmark/Benchmark.xcodeproj/project.pbxproj b/extension/apple/Benchmark/Benchmark.xcodeproj/project.pbxproj index 41b2bd16a53..b86ceff806b 100644 --- a/extension/apple/Benchmark/Benchmark.xcodeproj/project.pbxproj +++ b/extension/apple/Benchmark/Benchmark.xcodeproj/project.pbxproj @@ -7,9 +7,19 @@ objects = { /* Begin PBXBuildFile section */ + 032A73CA2CAFBA8600932D36 /* LLaMATests.mm in Sources */ = {isa = PBXBuildFile; fileRef = 032A73C82CAFBA8600932D36 /* LLaMATests.mm */; }; + 032A74182CAFBB7800932D36 /* text_decoder_runner.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 032A73DB2CAFBB7800932D36 /* text_decoder_runner.cpp */; }; + 032A741A2CAFBB7800932D36 /* bpe_tokenizer.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 032A73FA2CAFBB7800932D36 /* bpe_tokenizer.cpp */; }; + 032A741D2CAFBB7800932D36 /* text_prefiller.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 032A73DD2CAFBB7800932D36 /* text_prefiller.cpp */; }; + 032A741E2CAFBB7800932D36 /* tiktoken.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 032A73FE2CAFBB7800932D36 /* tiktoken.cpp */; }; + 032A741F2CAFBB7800932D36 /* sampler.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 032A73E62CAFBB7800932D36 /* sampler.cpp */; }; + 032A74232CAFC1B300932D36 /* runner.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 032A74222CAFC1B300932D36 /* runner.cpp */; }; + 032A74262CAFC34800932D36 /* llama_tiktoken.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 032A74252CAFC34800932D36 /* llama_tiktoken.cpp */; }; + 0351D9D72CAFC9A200607121 /* Resources in Resources */ = {isa = PBXBuildFile; fileRef = 03C7FA322C8AA24200E6E9AE /* Resources */; }; + 03B0118E2CAC567900054791 /* DynamicTestCase.m in Sources */ = {isa = PBXBuildFile; fileRef = 03B0118C2CAC567900054791 /* DynamicTestCase.m */; }; + 03B011912CAD114E00054791 /* ResourceTestCase.m in Sources */ = {isa = PBXBuildFile; fileRef = 03B011902CAD114E00054791 /* ResourceTestCase.m */; }; 03B2D3682C8A515A0046936E /* App.swift in Sources */ = {isa = PBXBuildFile; fileRef = 03B2D3672C8A515A0046936E /* App.swift */; }; - 03B2D37A2C8A515C0046936E /* Tests.mm in Sources */ = {isa = PBXBuildFile; fileRef = 03B2D3792C8A515C0046936E /* Tests.mm */; }; - 03C7FA382C8AA3EC00E6E9AE /* Models in Resources */ = {isa = PBXBuildFile; fileRef = 03C7FA322C8AA24200E6E9AE /* Models */; }; + 03B2D37A2C8A515C0046936E /* GenericTests.mm in Sources */ = {isa = PBXBuildFile; fileRef = 03B2D3792C8A515C0046936E /* GenericTests.mm */; }; 03DD00A92C8FE44600FE4619 /* backend_coreml.xcframework in Frameworks */ = {isa = PBXBuildFile; fileRef = 03DD00992C8FE44600FE4619 /* backend_coreml.xcframework */; }; 03DD00AA2C8FE44600FE4619 /* kernels_custom.xcframework in Frameworks */ = {isa = PBXBuildFile; fileRef = 03DD009A2C8FE44600FE4619 /* kernels_custom.xcframework */; }; 03DD00AF2C8FE44600FE4619 /* kernels_portable.xcframework in Frameworks */ = {isa = PBXBuildFile; fileRef = 03DD009F2C8FE44600FE4619 /* kernels_portable.xcframework */; }; @@ -37,14 +47,41 @@ /* End PBXContainerItemProxy section */ /* Begin PBXFileReference section */ + 032A73C82CAFBA8600932D36 /* LLaMATests.mm */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.objcpp; path = LLaMATests.mm; sourceTree = ""; }; + 032A73D42CAFBB7800932D36 /* image.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = image.h; 
sourceTree = ""; }; + 032A73D52CAFBB7800932D36 /* image_prefiller.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = image_prefiller.h; sourceTree = ""; }; + 032A73D62CAFBB7800932D36 /* multimodal_runner.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = multimodal_runner.h; sourceTree = ""; }; + 032A73D72CAFBB7800932D36 /* stats.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = stats.h; sourceTree = ""; }; + 032A73DA2CAFBB7800932D36 /* text_decoder_runner.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = text_decoder_runner.h; sourceTree = ""; }; + 032A73DB2CAFBB7800932D36 /* text_decoder_runner.cpp */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.cpp; path = text_decoder_runner.cpp; sourceTree = ""; }; + 032A73DC2CAFBB7800932D36 /* text_prefiller.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = text_prefiller.h; sourceTree = ""; }; + 032A73DD2CAFBB7800932D36 /* text_prefiller.cpp */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.cpp; path = text_prefiller.cpp; sourceTree = ""; }; + 032A73DE2CAFBB7800932D36 /* text_token_generator.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = text_token_generator.h; sourceTree = ""; }; + 032A73DF2CAFBB7800932D36 /* util.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = util.h; sourceTree = ""; }; + 032A73E52CAFBB7800932D36 /* sampler.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = sampler.h; sourceTree = ""; }; + 032A73E62CAFBB7800932D36 /* sampler.cpp */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.cpp; path = sampler.cpp; sourceTree = ""; }; + 032A73F82CAFBB7800932D36 /* base64.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = base64.h; sourceTree = ""; }; + 032A73F92CAFBB7800932D36 /* bpe_tokenizer.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = bpe_tokenizer.h; sourceTree = ""; }; + 032A73FA2CAFBB7800932D36 /* bpe_tokenizer.cpp */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.cpp; path = bpe_tokenizer.cpp; sourceTree = ""; }; + 032A73FD2CAFBB7800932D36 /* tiktoken.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = tiktoken.h; sourceTree = ""; }; + 032A73FE2CAFBB7800932D36 /* tiktoken.cpp */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.cpp; path = tiktoken.cpp; sourceTree = ""; }; + 032A73FF2CAFBB7800932D36 /* tokenizer.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = tokenizer.h; sourceTree = ""; }; + 032A74212CAFC1B300932D36 /* runner.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; name = runner.h; path = ../../../examples/models/llama2/runner/runner.h; sourceTree = SOURCE_ROOT; }; + 032A74222CAFC1B300932D36 /* runner.cpp */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.cpp; name = runner.cpp; path = ../../../examples/models/llama2/runner/runner.cpp; sourceTree = SOURCE_ROOT; }; + 032A74242CAFC34800932D36 /* llama_tiktoken.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; name = llama_tiktoken.h; path = ../../../examples/models/llama2/tokenizer/llama_tiktoken.h; sourceTree = SOURCE_ROOT; }; + 032A74252CAFC34800932D36 /* llama_tiktoken.cpp */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.cpp; name = llama_tiktoken.cpp; path = ../../../examples/models/llama2/tokenizer/llama_tiktoken.cpp; sourceTree = 
SOURCE_ROOT; }; 037C96A02C8A570B00B3DF38 /* Tests.xctestplan */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text; path = Tests.xctestplan; sourceTree = ""; }; + 03B0118B2CAC567900054791 /* DynamicTestCase.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = DynamicTestCase.h; sourceTree = ""; }; + 03B0118C2CAC567900054791 /* DynamicTestCase.m */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.objc; path = DynamicTestCase.m; sourceTree = ""; }; + 03B0118F2CAD114E00054791 /* ResourceTestCase.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = ResourceTestCase.h; sourceTree = ""; }; + 03B011902CAD114E00054791 /* ResourceTestCase.m */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.objc; path = ResourceTestCase.m; sourceTree = ""; }; 03B019502C8A80D30044D558 /* Tests.xcconfig */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text.xcconfig; path = Tests.xcconfig; sourceTree = ""; }; 03B2D3642C8A515A0046936E /* Benchmark.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = Benchmark.app; sourceTree = BUILT_PRODUCTS_DIR; }; 03B2D3672C8A515A0046936E /* App.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = App.swift; sourceTree = ""; }; 03B2D36D2C8A515B0046936E /* App.entitlements */ = {isa = PBXFileReference; lastKnownFileType = text.plist.entitlements; path = App.entitlements; sourceTree = ""; }; 03B2D3752C8A515C0046936E /* Tests.xctest */ = {isa = PBXFileReference; explicitFileType = wrapper.cfbundle; includeInIndex = 0; path = Tests.xctest; sourceTree = BUILT_PRODUCTS_DIR; }; - 03B2D3792C8A515C0046936E /* Tests.mm */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.objcpp; path = Tests.mm; sourceTree = ""; }; - 03C7FA322C8AA24200E6E9AE /* Models */ = {isa = PBXFileReference; lastKnownFileType = folder; path = Models; sourceTree = SOURCE_ROOT; }; + 03B2D3792C8A515C0046936E /* GenericTests.mm */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.objcpp; path = GenericTests.mm; sourceTree = ""; }; + 03C7FA322C8AA24200E6E9AE /* Resources */ = {isa = PBXFileReference; lastKnownFileType = folder; path = Resources; sourceTree = SOURCE_ROOT; }; 03DD00992C8FE44600FE4619 /* backend_coreml.xcframework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.xcframework; name = backend_coreml.xcframework; path = Frameworks/backend_coreml.xcframework; sourceTree = ""; }; 03DD009A2C8FE44600FE4619 /* kernels_custom.xcframework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.xcframework; name = kernels_custom.xcframework; path = Frameworks/kernels_custom.xcframework; sourceTree = ""; }; 03DD009F2C8FE44600FE4619 /* kernels_portable.xcframework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.xcframework; name = kernels_portable.xcframework; path = Frameworks/kernels_portable.xcframework; sourceTree = ""; }; @@ -93,13 +130,82 @@ /* End PBXFrameworksBuildPhase section */ /* Begin PBXGroup section */ + 032A73C92CAFBA8600932D36 /* LLaMA */ = { + isa = PBXGroup; + children = ( + 032A73E02CAFBB7800932D36 /* runner */, + 032A73E92CAFBB7800932D36 /* sampler */, + 032A74022CAFBB7800932D36 /* tokenizer */, + 032A73C82CAFBA8600932D36 /* LLaMATests.mm */, + ); + path = LLaMA; + sourceTree = ""; + }; + 032A73E02CAFBB7800932D36 /* runner */ = { + isa = PBXGroup; + children = ( + 032A73D42CAFBB7800932D36 /* image.h */, + 032A73D52CAFBB7800932D36 /* image_prefiller.h */, + 032A73D62CAFBB7800932D36 /* 
multimodal_runner.h */, + 032A74212CAFC1B300932D36 /* runner.h */, + 032A74222CAFC1B300932D36 /* runner.cpp */, + 032A73D72CAFBB7800932D36 /* stats.h */, + 032A73DA2CAFBB7800932D36 /* text_decoder_runner.h */, + 032A73DB2CAFBB7800932D36 /* text_decoder_runner.cpp */, + 032A73DC2CAFBB7800932D36 /* text_prefiller.h */, + 032A73DD2CAFBB7800932D36 /* text_prefiller.cpp */, + 032A73DE2CAFBB7800932D36 /* text_token_generator.h */, + 032A73DF2CAFBB7800932D36 /* util.h */, + ); + name = runner; + path = ../../llm/runner; + sourceTree = SOURCE_ROOT; + }; + 032A73E92CAFBB7800932D36 /* sampler */ = { + isa = PBXGroup; + children = ( + 032A73E52CAFBB7800932D36 /* sampler.h */, + 032A73E62CAFBB7800932D36 /* sampler.cpp */, + ); + name = sampler; + path = ../../llm/sampler; + sourceTree = SOURCE_ROOT; + }; + 032A74022CAFBB7800932D36 /* tokenizer */ = { + isa = PBXGroup; + children = ( + 032A73F82CAFBB7800932D36 /* base64.h */, + 032A73F92CAFBB7800932D36 /* bpe_tokenizer.h */, + 032A73FA2CAFBB7800932D36 /* bpe_tokenizer.cpp */, + 032A74242CAFC34800932D36 /* llama_tiktoken.h */, + 032A74252CAFC34800932D36 /* llama_tiktoken.cpp */, + 032A73FD2CAFBB7800932D36 /* tiktoken.h */, + 032A73FE2CAFBB7800932D36 /* tiktoken.cpp */, + 032A73FF2CAFBB7800932D36 /* tokenizer.h */, + ); + name = tokenizer; + path = ../../llm/tokenizer; + sourceTree = SOURCE_ROOT; + }; + 03B0118D2CAC567900054791 /* TestUtils */ = { + isa = PBXGroup; + children = ( + 03B0118B2CAC567900054791 /* DynamicTestCase.h */, + 03B0118C2CAC567900054791 /* DynamicTestCase.m */, + 03B0118F2CAD114E00054791 /* ResourceTestCase.h */, + 03B011902CAD114E00054791 /* ResourceTestCase.m */, + ); + path = TestUtils; + sourceTree = ""; + }; 03B2D35B2C8A515A0046936E = { isa = PBXGroup; children = ( 03B2D3662C8A515A0046936E /* App */, 03ED6CEB2C8AAF5300F2D6EE /* Frameworks */, - 03C7FA322C8AA24200E6E9AE /* Models */, + 03C7FA322C8AA24200E6E9AE /* Resources */, 03B2D3782C8A515C0046936E /* Tests */, + 03B0118D2CAC567900054791 /* TestUtils */, 03B2D3652C8A515A0046936E /* Products */, ); sourceTree = ""; @@ -125,7 +231,8 @@ 03B2D3782C8A515C0046936E /* Tests */ = { isa = PBXGroup; children = ( - 03B2D3792C8A515C0046936E /* Tests.mm */, + 032A73C92CAFBA8600932D36 /* LLaMA */, + 03B2D3792C8A515C0046936E /* GenericTests.mm */, 03B019502C8A80D30044D558 /* Tests.xcconfig */, 037C96A02C8A570B00B3DF38 /* Tests.xctestplan */, ); @@ -162,7 +269,6 @@ buildPhases = ( 03B2D3602C8A515A0046936E /* Sources */, 03B2D3612C8A515A0046936E /* Frameworks */, - 03B2D3622C8A515A0046936E /* Resources */, ); buildRules = ( ); @@ -177,9 +283,10 @@ isa = PBXNativeTarget; buildConfigurationList = 03B2D38C2C8A515C0046936E /* Build configuration list for PBXNativeTarget "Tests" */; buildPhases = ( + 032A74202CAFBE6200932D36 /* Build Cmake Dependencies */, 03B2D3712C8A515C0046936E /* Sources */, 03B2D3722C8A515C0046936E /* Frameworks */, - 03B2D3732C8A515C0046936E /* Resources */, + 0351D9D62CAFC99C00607121 /* Resources */, ); buildRules = ( ); @@ -230,22 +337,36 @@ /* End PBXProject section */ /* Begin PBXResourcesBuildPhase section */ - 03B2D3622C8A515A0046936E /* Resources */ = { + 0351D9D62CAFC99C00607121 /* Resources */ = { isa = PBXResourcesBuildPhase; buildActionMask = 2147483647; files = ( + 0351D9D72CAFC9A200607121 /* Resources in Resources */, ); runOnlyForDeploymentPostprocessing = 0; }; - 03B2D3732C8A515C0046936E /* Resources */ = { - isa = PBXResourcesBuildPhase; +/* End PBXResourcesBuildPhase section */ + +/* Begin PBXShellScriptBuildPhase section */ + 032A74202CAFBE6200932D36 /* 
Build Cmake Dependencies */ = { + isa = PBXShellScriptBuildPhase; buildActionMask = 2147483647; files = ( - 03C7FA382C8AA3EC00E6E9AE /* Models in Resources */, + ); + inputFileListPaths = ( + ); + inputPaths = ( + ); + name = "Build Cmake Dependencies"; + outputFileListPaths = ( + ); + outputPaths = ( ); runOnlyForDeploymentPostprocessing = 0; + shellPath = /bin/sh; + shellScript = "set -e\n\nif ! command -v cmake &> /dev/null\nthen\n echo \"Cmake not found, please install Cmake. \\n1. Download Cmake.app from https://cmake.org/download with version > 3.19. \\n2. Install it to Applications/ folder and run sudo /Applications/CMake.app/Contents/bin/cmake-gui --install to install CMake commandline tools.\"\n exit 1\nfi\n\nCMAKE_DIR=\"$TEMP_DIR/cmake\"\nrm -rf \"$CMAKE_DIR\"\n\nPLATFORM=\"SIMULATORARM64\"\nDEPLOYMENT_TARGET=\"17.0\"\n\nif [[ \"$PLATFORM_NAME\" == *\"iphoneos\"* ]]; then\n PLATFORM=\"OS64\"\nelif [[ \"$PLATFORM_NAME\" == *\"macos\"* ]]; then\n PLATFORM=\"MAC_ARM64\"\n DEPLOYMENT_TARGET=\"10.15\"\nfi\n\ncmake_build() {\n local src_dir=$1\n local target=$2\n shift 2\n local extra_args=(\"$@\")\n local build_dir=\"$CMAKE_DIR/build/$(basename \"$src_dir\")\"\n\n mkdir -p \"$build_dir\" && cd \"$build_dir\"\n\n if [[ \"$PLATFORM\" == \"MAC_ARM64\" ]]; then\n extra_args+=(-DCMAKE_INSTALL_BUNDLEDIR=\"${CMAKE_DIR}/bin\")\n extra_args+=(-DCMAKE_MACOSX_BUNDLE=OFF)\n fi\n cmake -G Xcode \\\n -DCMAKE_BUILD_TYPE=\"Release\" \\\n -DCMAKE_CXX_STANDARD=17 \\\n -DCMAKE_TOOLCHAIN_FILE=\"$SRCROOT/../../../third-party/ios-cmake/ios.toolchain.cmake\" \\\n -DCMAKE_XCODE_ATTRIBUTE_CLANG_CXX_LANGUAGE_STANDARD=\"c++17\" \\\n -DCMAKE_XCODE_ATTRIBUTE_CLANG_CXX_LIBRARY=\"libc++\" \\\n -DPLATFORM=\"$PLATFORM\" \\\n -DDEPLOYMENT_TARGET=\"$DEPLOYMENT_TARGET\" \\\n -DCMAKE_INSTALL_PREFIX=\"$CMAKE_DIR\" \\\n \"${extra_args[@]}\" \\\n \"$src_dir\"\n cmake --build . --config \"Release\" --target \"$target\"\n if [[ \"$target\" == \"install\" ]]; then\n cmake --install . 
--prefix \"$CMAKE_DIR\"\n fi\n}\n\ncmake_build \"$SRCROOT/../../llm/third-party/abseil-cpp\" \"install\" \\\n -DABSL_PROPAGATE_CXX_STD=ON\n\ncmake_build \"$SRCROOT/../../llm/third-party/re2\" \"install\"\n\ncmake_build \"$SRCROOT/../../llm/third-party/sentencepiece\" \"sentencepiece-static\" \\\n -DSPM_ENABLE_SHARED=OFF\n\necho \"$(find $CMAKE_DIR/lib -name \"*.a\" | sed -E 's|^.*/lib([^/]+)\\.a|-l\\1|g' | tr '\\n' ' ')\" > \"$CMAKE_DIR/linker_flags\"\n"; }; -/* End PBXResourcesBuildPhase section */ +/* End PBXShellScriptBuildPhase section */ /* Begin PBXSourcesBuildPhase section */ 03B2D3602C8A515A0046936E /* Sources */ = { @@ -260,7 +381,17 @@ isa = PBXSourcesBuildPhase; buildActionMask = 2147483647; files = ( - 03B2D37A2C8A515C0046936E /* Tests.mm in Sources */, + 03B0118E2CAC567900054791 /* DynamicTestCase.m in Sources */, + 032A74182CAFBB7800932D36 /* text_decoder_runner.cpp in Sources */, + 032A741A2CAFBB7800932D36 /* bpe_tokenizer.cpp in Sources */, + 032A741D2CAFBB7800932D36 /* text_prefiller.cpp in Sources */, + 032A741E2CAFBB7800932D36 /* tiktoken.cpp in Sources */, + 032A741F2CAFBB7800932D36 /* sampler.cpp in Sources */, + 03B011912CAD114E00054791 /* ResourceTestCase.m in Sources */, + 032A74232CAFC1B300932D36 /* runner.cpp in Sources */, + 03B2D37A2C8A515C0046936E /* GenericTests.mm in Sources */, + 032A73CA2CAFBA8600932D36 /* LLaMATests.mm in Sources */, + 032A74262CAFC34800932D36 /* llama_tiktoken.cpp in Sources */, ); runOnlyForDeploymentPostprocessing = 0; }; @@ -389,7 +520,6 @@ isa = XCBuildConfiguration; buildSettings = { CODE_SIGN_ENTITLEMENTS = App/App.entitlements; - "CODE_SIGN_IDENTITY[sdk=macosx*]" = "Apple Development"; CODE_SIGN_STYLE = Automatic; CURRENT_PROJECT_VERSION = 1; DEVELOPMENT_TEAM = ""; @@ -410,6 +540,7 @@ MARKETING_VERSION = 1.0; PRODUCT_BUNDLE_IDENTIFIER = org.pytorch.executorch.Benchmark; PRODUCT_NAME = Benchmark; + REGISTER_APP_GROUPS = NO; SDKROOT = auto; SUPPORTED_PLATFORMS = "iphoneos iphonesimulator"; SUPPORTS_MACCATALYST = NO; @@ -423,7 +554,6 @@ isa = XCBuildConfiguration; buildSettings = { CODE_SIGN_ENTITLEMENTS = App/App.entitlements; - "CODE_SIGN_IDENTITY[sdk=macosx*]" = "Apple Development"; CODE_SIGN_STYLE = Automatic; CURRENT_PROJECT_VERSION = 1; DEVELOPMENT_TEAM = ""; @@ -444,6 +574,7 @@ MARKETING_VERSION = 1.0; PRODUCT_BUNDLE_IDENTIFIER = org.pytorch.executorch.Benchmark; PRODUCT_NAME = Benchmark; + REGISTER_APP_GROUPS = NO; SDKROOT = auto; SUPPORTED_PLATFORMS = "iphoneos iphonesimulator"; SUPPORTS_MACCATALYST = NO; @@ -468,6 +599,7 @@ MARKETING_VERSION = 1.0; PRODUCT_BUNDLE_IDENTIFIER = org.pytorch.executorch.BenchmarkTests; PRODUCT_NAME = "$(TARGET_NAME)"; + REGISTER_APP_GROUPS = NO; SDKROOT = auto; SUPPORTED_PLATFORMS = "iphoneos iphonesimulator"; SUPPORTS_MACCATALYST = NO; @@ -493,6 +625,7 @@ MARKETING_VERSION = 1.0; PRODUCT_BUNDLE_IDENTIFIER = org.pytorch.executorch.BenchmarkTests; PRODUCT_NAME = "$(TARGET_NAME)"; + REGISTER_APP_GROUPS = NO; SDKROOT = auto; SUPPORTED_PLATFORMS = "iphoneos iphonesimulator"; SUPPORTS_MACCATALYST = NO; diff --git a/extension/apple/Benchmark/Frameworks/download_frameworks.sh b/extension/apple/Benchmark/Frameworks/download_frameworks.sh new file mode 100755 index 00000000000..e27b8c20111 --- /dev/null +++ b/extension/apple/Benchmark/Frameworks/download_frameworks.sh @@ -0,0 +1,27 @@ +#!/bin/bash +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. 
+# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +frameworks=( + "backend_coreml" + "backend_mps" + "backend_xnnpack" + "executorch" + "kernels_custom" + "kernels_optimized" + "kernels_portable" + "kernels_quantized" +) + +cd "$(dirname "$0")" || exit + +for framework in "${frameworks[@]}"; do + rm -f "${framework}-latest.zip" + rm -rf "${framework}.xcframework" + curl -sSLO "https://ossci-ios.s3.amazonaws.com/executorch/${framework}-latest.zip" && \ + unzip -q "${framework}-latest.zip" && \ + rm "${framework}-latest.zip" +done diff --git a/extension/apple/Benchmark/README.md b/extension/apple/Benchmark/README.md new file mode 100644 index 00000000000..5db84915a2b --- /dev/null +++ b/extension/apple/Benchmark/README.md @@ -0,0 +1,367 @@ +# ExecuTorch Benchmark App for Apple Platforms + +## Introduction + +The **Benchmark App** is a tool designed to help developers measure the performance of PyTorch models on Apple devices using the ExecuTorch runtime. +It provides a flexible framework for dynamically generating and running performance tests on your models, allowing you to assess metrics such as load times, inference speeds, memory usage, and more. + +

+[Image: Benchmark App]

+ +## Prerequisites + +- [Xcode](https://apps.apple.com/us/app/xcode/id497799835?mt=12/) 15.0 or later with command-line tools if not already installed (`xcode-select --install`). +- [CMake](https://cmake.org/download/) 3.19 or later + - Download and open the macOS `.dmg` installer and move the CMake app to `/Applications` folder. + - Install CMake command line tools: `sudo /Applications/CMake.app/Contents/bin/cmake-gui --install` +- A development provisioning profile with the [`increased-memory-limit`](https://developer.apple.com/documentation/bundleresources/entitlements/com_apple_developer_kernel_increased-memory-limit) entitlement if targeting iOS devices. + +## Setting Up the App + +### Get the Code + +To get started, clone the ExecuTorch repository and cd into the source code directory: + +```bash +git clone https://github.com/pytorch/executorch.git --depth 1 --recurse-submodules --shallow-submodules +cd executorch +``` + +This command performs a shallow clone to speed up the process. + +### Set Up the Frameworks + +The Benchmark App relies on prebuilt ExecuTorch frameworks. +You have two options: + +
+**Option 1: Download Prebuilt Frameworks**
+ +Run the provided script to download the prebuilt frameworks: + +```bash +./extension/apple/Benchmark/Frameworks/download_frameworks.sh +``` +
+ +
+**Option 2: Build Frameworks Locally**
+ +Alternatively, you can build the frameworks yourself by following the [guide](https://pytorch.org/executorch/main/apple-runtime.html#local-build). +
+ +Once the frameworks are downloaded or built, verify that the `Frameworks` directory contains the necessary `.xcframework` files: + +```bash +ls extension/apple/Benchmark/Frameworks +``` + +You should see: + +``` +backend_coreml.xcframework +backend_mps.xcframework +backend_xnnpack.xcframework +executorch.xcframework +kernels_custom.xcframework +kernels_optimized.xcframework +kernels_portable.xcframework +kernels_quantized.xcframework +``` + +## Adding Models and Resources + +Place your exported model files (`.pte`) and any other resources (e.g., `tokenizer.bin`) into the `extension/apple/Benchmark/Resources` directory: + +```bash +cp extension/apple/Benchmark/Resources +``` + +Optionally, check that the files are there: + +```bash +ls extension/apple/Benchmark/Resources +``` + +For this example you should see: + +``` +llama3.pte +my_model.pte +tokenizer.bin +``` + +The app automatically bundles these resources and makes them available to the test suite. + +## Running the Tests + +### Build and Run the Tests + +Open the Benchmark Xcode project: + +```bash +open extension/apple/Benchmark/Benchmark.xcodeproj +``` + +Select the destination device or simulator and press `Command+U`, or click `Product` > `Test` in the menu to run the test suite. + +

+[Image: Benchmark App Tests]

+ +### Configure Signing (if necessary) + +If you plan to run the app on a physical device, you may need to set up code signing: + +1. Open the **Project Navigator** by pressing `Command+1` and click on the `Benchmark` root of the file tree. +2. Under Targets section go to the **Signing & Capabilities** tab of both the `App` and `Tests` targets. +3. Select your development team. Alternatively, manually pick a provisioning profile that supports the increased memory limit entitlement and modify the bundle identifier if needed. + +

+[Image: Benchmark App Signing]

+ +## Viewing Test Results and Metrics + +After running the tests, you can view the results in Xcode: + +1. Open the **Test Report Navigator** by pressing `Command+9`. +2. Select the most recent test run. +3. You'll see a list of tests that ran, along with their status (passed or failed). +4. To view metrics for a specific test: + - Double-click on the test in the list. + - Switch to the **Metrics** tab to see detailed performance data. + +**Note**: The tests use `XCTMeasureOptions` to run each test multiple times (usually five) to obtain average performance metrics. + +
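+For illustration, a test block can pass explicit `XCTMeasureOptions` to control how many iterations are measured. The sketch below is not part of the bundled tests; it only shows the `measureWithMetrics:options:block:` call the suite already uses, with an arbitrary iteration count of 10:
+
+```objective-c
+// Minimal sketch: customize the number of measured iterations for a dynamic test block.
+XCTMeasureOptions *options = [XCTMeasureOptions defaultOptions];
+options.iterationCount = 10; // Arbitrary value; the suite's default is five runs.
+[testCase measureWithMetrics:@[ [XCTClockMetric new], [XCTMemoryMetric new] ]
+                     options:options
+                       block:^{
+                         // Put the work to measure here, e.g. a model's forward pass.
+                       }];
+```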

+[Image: Benchmark App Test Load]
+
+[Image: Benchmark App Test Forward]
+
+[Image: Benchmark App Test Generate]

+ +## Understanding the Test Suite + +The Benchmark App uses a dynamic test generation framework to create tests based on the resources you provide. + +### Dynamic Test Generation + +The key components are: + +- **`DynamicTestCase`**: A subclass of `XCTestCase` that allows for the dynamic creation of test methods. +- **`ResourceTestCase`**: Builds upon `DynamicTestCase` to generate tests based on resources that match specified criteria. + +### How It Works + +1. **Define Directories and Predicates**: Override the `directories` and `predicates` methods to specify where to look for resources and how to match them. + +2. **Generate Resource Combinations**: The framework searches the specified `directories` for files matching the `predicates`, generating all possible combinations. + +3. **Create Dynamic Tests**: For each combination of resources, it calls `dynamicTestsForResources`, where you define the tests to run. + +4. **Test Naming**: Test names are dynamically formed using the format: + + ``` + test____...___ + ``` + + This ensures that each test is uniquely identifiable based on the resources and device. + +### Example: Generic Model Tests + +Here's how you might create a test to measure model load and inference times: + +```objective-c +@interface GenericTests : ResourceTestCase +@end + +@implementation GenericTests + ++ (NSArray *)directories { + return @[@"Resources"]; +} + ++ (NSDictionary *)predicates { + return @{ + @"model" : ^BOOL(NSString *filename) { + return [filename hasSuffix:@".pte"]; + }, + }; +} + ++ (NSDictionary *)dynamicTestsForResources:(NSDictionary *)resources { + NSString *modelPath = resources[@"model"]; + return @{ + @"load" : ^(XCTestCase *testCase) { + [testCase measureWithMetrics:@[[XCTClockMetric new], [XCTMemoryMetric new]] block:^{ + XCTAssertEqual(Module(modelPath.UTF8String).load_forward(), Error::Ok); + }]; + }, + @"forward" : ^(XCTestCase *testCase) { + // Set up and measure the forward pass... + }, + }; +} + +@end +``` + +In this example: + +- We look for `.pte` files in the `Resources` directory. +- For each model found, we create two tests: `load` and `forward`. +- The tests measure the time and memory usage of loading and running the model. + +## Extending the Test Suite + +You can create custom tests by subclassing `ResourceTestCase` and overriding the necessary methods. + +### Steps to Create Custom Tests + +1. **Subclass `ResourceTestCase`**: + + ```objective-c + @interface MyCustomTests : ResourceTestCase + @end + ``` + +2. **Override `directories` and `predicates`**: + + Specify where to look for resources and how to match them. + + ```objective-c + + (NSArray *)directories { + return @[@"Resources"]; + } + + + (NSDictionary *)predicates { + return @{ + @"model" : ^BOOL(NSString *filename) { + return [filename hasSuffix:@".pte"]; + }, + @"config" : ^BOOL(NSString *filename) { + return [filename isEqualToString:@"config.json"]; + }, + }; + } + ``` + +3. **Implement `dynamicTestsForResources`**: + + Define the tests to run for each combination of resources. + + ```objective-c + + (NSDictionary *)dynamicTestsForResources:(NSDictionary *)resources { + NSString *modelPath = resources[@"model"]; + NSString *configPath = resources[@"config"]; + return @{ + @"customTest" : ^(XCTestCase *testCase) { + // Implement your test logic here. + }, + }; + } + ``` + +4. **Add the Test Class to the Test Target**: + + Ensure your new test class is included in the test target in Xcode. 
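+Putting the steps above together, a minimal custom test class could look like the following sketch. The class name `MyCustomTests`, the `.pte` predicate, and the generic parameter types are illustrative assumptions based on how the base class methods are used elsewhere in the suite:
+
+```objective-c
+#import "ResourceTestCase.h"
+
+@interface MyCustomTests : ResourceTestCase
+@end
+
+@implementation MyCustomTests
+
+// Search the bundled Resources directory.
++ (NSArray<NSString *> *)directories {
+  return @[ @"Resources" ];
+}
+
+// Match every exported model file.
++ (NSDictionary<NSString *, BOOL (^)(NSString *)> *)predicates {
+  return @{
+    @"model" : ^BOOL(NSString *filename) {
+      return [filename hasSuffix:@".pte"];
+    },
+  };
+}
+
+// Generate one sanity-check test per matched model file.
++ (NSDictionary<NSString *, void (^)(XCTestCase *)> *)dynamicTestsForResources:
+    (NSDictionary<NSString *, NSString *> *)resources {
+  NSString *modelPath = resources[@"model"];
+  return @{
+    @"exists" : ^(XCTestCase *testCase) {
+      XCTAssertTrue([NSFileManager.defaultManager fileExistsAtPath:modelPath]);
+    },
+  };
+}
+
+@end
+```
+
+Each generated test then follows the dynamic naming scheme described above, so individual models show up as separate entries in the test report.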
+ +### Example: LLaMA Token Generation Test + +An example of a more advanced test is measuring the tokens per second during text generation with the LLaMA model. + +```objective-c +@interface LLaMATests : ResourceTestCase +@end + +@implementation LLaMATests + ++ (NSArray *)directories { + return @[@"Resources"]; +} + ++ (NSDictionary *)predicates { + return @{ + @"model" : ^BOOL(NSString *filename) { + return [filename hasSuffix:@".pte"] && [filename containsString:@"llama"]; + }, + @"tokenizer" : ^BOOL(NSString *filename) { + return [filename isEqualToString:@"tokenizer.bin"]; + }, + }; +} + ++ (NSDictionary *)dynamicTestsForResources:(NSDictionary *)resources { + NSString *modelPath = resources[@"model"]; + NSString *tokenizerPath = resources[@"tokenizer"]; + return @{ + @"generate" : ^(XCTestCase *testCase) { + // Implement the token generation test... + }, + }; +} + +@end +``` + +In this test: + +- We look for LLaMA model files and a `tokenizer.bin`. +- We measure tokens per second and memory usage during text generation. + +## Measuring Performance + +The Benchmark App leverages Apple's performance testing APIs to measure metrics such as execution time and memory usage. + +- **Measurement Options**: By default, each test is run five times to calculate average metrics. +- **Custom Metrics**: You can define custom metrics by implementing the `XCTMetric` protocol. +- **Available Metrics**: + - `XCTClockMetric`: Measures wall-clock time. + - `XCTMemoryMetric`: Measures memory usage. + - **Custom Metrics**: For example, the LLaMA test includes a `TokensPerSecondMetric`. + +## Running Tests from the Command Line + +You can also run the tests using `xcodebuild`: + +```bash +# Run on an iOS Simulator +xcodebuild test -project extension/apple/Benchmark/Benchmark.xcodeproj \ +-scheme Benchmark \ +-destination 'platform=iOS Simulator,name=' \ +-testPlan Tests + +# Run on a physical iOS device +xcodebuild test -project extension/apple/Benchmark/Benchmark.xcodeproj \ +-scheme Benchmark \ +-destination 'platform=iOS,name=' \ +-testPlan Tests \ +-allowProvisioningUpdates DEVELOPMENT_TEAM= +``` + +Replace ``, ``, and `` with your simulator/device name and Apple development team ID. + +## macOS + +The app can be built and run on macOS, just add it as the destination platform. + +

+[Image: Benchmark App macOS]

+ +Also, set up code signing in order to run the app locally. + +

+[Image: Benchmark App macOS Signing]

+ +## Conclusion + +The ExecuTorch Benchmark App provides a flexible and powerful framework for testing and measuring the performance of PyTorch models on Apple devices. By leveraging dynamic test generation, you can easily add your models and resources to assess their performance metrics. Whether you're optimizing existing models or developing new ones, this tool can help you gain valuable insights into their runtime behavior. diff --git a/extension/apple/Benchmark/Resources/README.md b/extension/apple/Benchmark/Resources/README.md new file mode 100644 index 00000000000..ad6a37104d0 --- /dev/null +++ b/extension/apple/Benchmark/Resources/README.md @@ -0,0 +1,4 @@ +# Resources + +This directory and all files in it will be copied to the bundle’s root directory. +Place here any resource files you want to access at runtime. diff --git a/extension/apple/Benchmark/TestUtils/DynamicTestCase.h b/extension/apple/Benchmark/TestUtils/DynamicTestCase.h new file mode 100644 index 00000000000..4cf7b525602 --- /dev/null +++ b/extension/apple/Benchmark/TestUtils/DynamicTestCase.h @@ -0,0 +1,24 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#import + +/** + * DynamicTestCase is a subclass of XCTestCase that allows dynamic creation of + * test methods. Subclasses should override the `+dynamicTests` method to + * provide a dictionary of test names and corresponding test blocks. + */ +@interface DynamicTestCase : XCTestCase + +/** + * Returns a dictionary mapping test names to test blocks. + * Subclasses should override this method to provide dynamic tests. + */ ++ (NSDictionary *)dynamicTests; + +@end diff --git a/extension/apple/Benchmark/TestUtils/DynamicTestCase.m b/extension/apple/Benchmark/TestUtils/DynamicTestCase.m new file mode 100644 index 00000000000..a232e1c50b1 --- /dev/null +++ b/extension/apple/Benchmark/TestUtils/DynamicTestCase.m @@ -0,0 +1,67 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#import "DynamicTestCase.h" + +#import +#import + +#if TARGET_OS_IOS +#import +#endif + +static NSString *deviceInfoString(void) { + static NSString *deviceInfo; + static dispatch_once_t onceToken; + dispatch_once(&onceToken, ^{ + struct utsname systemInfo; + uname(&systemInfo); +#if TARGET_OS_IOS + UIDevice *device = UIDevice.currentDevice; + deviceInfo = [NSString stringWithFormat:@"%@_%@_%@", + device.systemName, + device.systemVersion, + @(systemInfo.machine)]; +#elif TARGET_OS_MAC + NSOperatingSystemVersion version = NSProcessInfo.processInfo.operatingSystemVersion; + deviceInfo = [NSString stringWithFormat:@"macOS_%ld_%ld_%ld_%@", + (long)version.majorVersion, + (long)version.minorVersion, (long)version.patchVersion, @(systemInfo.machine)]; +#endif // TARGET_OS_IOS + deviceInfo = [[deviceInfo + componentsSeparatedByCharactersInSet:[NSCharacterSet + punctuationCharacterSet]] + componentsJoinedByString:@"_"]; + }); + return deviceInfo; +} + +@implementation DynamicTestCase + ++ (void)initialize { + if (self != [DynamicTestCase class]) { + NSString *deviceInfo = deviceInfoString(); + [[self dynamicTests] + enumerateKeysAndObjectsUsingBlock:^(NSString *testName, + void (^testCase)(XCTestCase *), + BOOL __unused *stop) { + NSString *methodName = + [NSString stringWithFormat:@"test_%@_%@", testName, deviceInfo]; + class_addMethod(self, + NSSelectorFromString(methodName), + imp_implementationWithBlock(testCase), + "v@:"); + }]; + } +} + ++ (NSDictionary *)dynamicTests { + return @{}; +} + +@end diff --git a/extension/apple/Benchmark/TestUtils/ResourceTestCase.h b/extension/apple/Benchmark/TestUtils/ResourceTestCase.h new file mode 100644 index 00000000000..3bab25d0604 --- /dev/null +++ b/extension/apple/Benchmark/TestUtils/ResourceTestCase.h @@ -0,0 +1,50 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#import "DynamicTestCase.h" + +/** + * ResourceTestCase is a subclass of DynamicTestCase that generates tests based + * on resources. Subclasses should override the bundle, directories, predicates, + * and dynamicTestsForResources methods. + */ +@interface ResourceTestCase : DynamicTestCase + +/** + * Returns an array of NSBundle objects to search for resources. + * By default, returns the main bundle and the bundle for the class. + */ ++ (NSArray *)bundles; + +/** + * Returns an array of directory paths (relative to the bundles' resource paths) + * to search. Subclasses should override to specify directories containing + * resources. + */ ++ (NSArray *)directories; + +/** + * Returns a dictionary mapping resource keys to predicates. + * Each predicate is a block that takes a filename and returns a BOOL indicating + * a match. Subclasses should override to specify predicates for matching + * resources. + */ ++ (NSDictionary *)predicates; + +/** + * Returns a dictionary mapping test names to test blocks, given a dictionary of + * resources. Subclasses should override to provide tests for combinations of + * resources. + * + * @param resources A dictionary mapping resource keys to resource file paths. + * @return A dictionary mapping test names to test blocks. 
+ */ ++ (NSDictionary *)dynamicTestsForResources: + (NSDictionary *)resources; + +@end diff --git a/extension/apple/Benchmark/TestUtils/ResourceTestCase.m b/extension/apple/Benchmark/TestUtils/ResourceTestCase.m new file mode 100644 index 00000000000..36aab0c3fd1 --- /dev/null +++ b/extension/apple/Benchmark/TestUtils/ResourceTestCase.m @@ -0,0 +1,119 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#import "ResourceTestCase.h" + +static void generateCombinations( + NSDictionary *> *matchesByResource, + NSArray *keys, + NSMutableDictionary *result, + NSUInteger index, + NSMutableSet *> *combinations) { + if (index == keys.count) { + if (result.count == keys.count) { + [combinations addObject:[result copy]]; + } + return; + } + NSString *key = keys[index]; + NSArray *matches = matchesByResource[key] ?: @[]; + if (!matches.count) { + generateCombinations( + matchesByResource, keys, result, index + 1, combinations); + return; + } + for (NSString *match in matches) { + result[key] = match; + generateCombinations( + matchesByResource, keys, result, index + 1, combinations); + [result removeObjectForKey:key]; + } +} + +@implementation ResourceTestCase + ++ (NSArray *)bundles { + return @[ [NSBundle mainBundle], [NSBundle bundleForClass:self] ]; +} + ++ (NSArray *)directories { + return @[]; +} + ++ (NSDictionary *)predicates { + return @{}; +} + ++ (NSDictionary *)dynamicTestsForResources: + (NSDictionary *)resources { + return @{}; +} + ++ (NSDictionary *)dynamicTests { + NSMutableDictionary *tests = + [NSMutableDictionary new]; + NSMutableSet *> *combinations = + [NSMutableSet new]; + NSDictionary *predicates = + [self predicates]; + NSArray *sortedKeys = + [predicates.allKeys sortedArrayUsingSelector:@selector(compare:)]; + + if (predicates.count == 0) + return @{}; + + for (NSBundle *bundle in self.bundles) { + for (NSString *directory in self.directories) { + NSArray *resourceURLs = + [bundle URLsForResourcesWithExtension:nil subdirectory:directory]; + if (!resourceURLs.count) { + continue; + }; + NSMutableDictionary *> + *matchesByResource = [NSMutableDictionary new]; + + for (NSURL *url in resourceURLs) { + NSString *file = url.lastPathComponent; + NSString *fullPath = url.path; + + for (NSString *key in sortedKeys) { + if (predicates[key](file)) { + matchesByResource[key] = + matchesByResource[key] ?: [NSMutableArray new]; + [matchesByResource[key] addObject:fullPath]; + } + } + } + NSMutableDictionary *result = + [NSMutableDictionary new]; + generateCombinations( + matchesByResource, sortedKeys, result, 0, combinations); + } + } + for (NSDictionary *resources in combinations) { + NSMutableString *resourceString = [NSMutableString new]; + NSCharacterSet *punctuationSet = [NSCharacterSet punctuationCharacterSet]; + for (NSString *key in sortedKeys) { + NSString *lastComponent = [resources[key] lastPathComponent]; + NSString *cleanedComponent = + [[lastComponent componentsSeparatedByCharactersInSet:punctuationSet] + componentsJoinedByString:@"_"]; + [resourceString appendFormat:@"_%@", cleanedComponent]; + } + NSDictionary *resourceTests = + [self dynamicTestsForResources:resources]; + [resourceTests + enumerateKeysAndObjectsUsingBlock:^( + NSString *testName, void (^testBlock)(XCTestCase *), BOOL *stop) { + tests[[testName stringByAppendingString:resourceString]] = testBlock; + }]; + } + return tests; +} + 
+@end diff --git a/extension/apple/Benchmark/Tests/GenericTests.mm b/extension/apple/Benchmark/Tests/GenericTests.mm new file mode 100644 index 00000000000..ce685335767 --- /dev/null +++ b/extension/apple/Benchmark/Tests/GenericTests.mm @@ -0,0 +1,96 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#import "ResourceTestCase.h" + +#import +#import + +using namespace ::executorch::extension; +using namespace ::executorch::runtime; + +#define ASSERT_OK_OR_RETURN(value__) \ + ({ \ + XCTAssertEqual(value__.error(), Error::Ok); \ + if (!value__.ok()) { \ + return; \ + } \ + }) + +@interface GenericTests : ResourceTestCase +@end + +@implementation GenericTests + ++ (NSArray *)directories { + return @[ + @"Resources", + @"aatp/data", // AWS Farm devices look for resources here. + ]; +} + ++ (NSDictionary *)predicates { + return @{ + @"model" : ^BOOL(NSString *filename){ + return [filename hasSuffix:@".pte"]; + }, + }; +} + ++ (NSDictionary *)dynamicTestsForResources: + (NSDictionary *)resources { + NSString *modelPath = resources[@"model"]; + return @{ + @"load" : ^(XCTestCase *testCase){ + [testCase + measureWithMetrics:@[ [XCTClockMetric new], [XCTMemoryMetric new] ] + block:^{ + XCTAssertEqual( + Module(modelPath.UTF8String).load_forward(), + Error::Ok); + }]; + }, + @"forward" : ^(XCTestCase *testCase) { + auto __block module = std::make_unique(modelPath.UTF8String); + + const auto method_meta = module->method_meta("forward"); + ASSERT_OK_OR_RETURN(method_meta); + + const auto num_inputs = method_meta->num_inputs(); + XCTAssertGreaterThan(num_inputs, 0); + + std::vector tensors; + tensors.reserve(num_inputs); + + for (auto index = 0; index < num_inputs; ++index) { + const auto input_tag = method_meta->input_tag(index); + ASSERT_OK_OR_RETURN(input_tag); + + switch (*input_tag) { + case Tag::Tensor: { + const auto tensor_meta = method_meta->input_tensor_meta(index); + ASSERT_OK_OR_RETURN(tensor_meta); + + const auto sizes = tensor_meta->sizes(); + tensors.emplace_back( + ones({sizes.begin(), sizes.end()}, tensor_meta->scalar_type())); + XCTAssertEqual(module->set_input(tensors.back(), index), Error::Ok); + } break; + default: + XCTFail("Unsupported tag %i at input %d", *input_tag, index); + } + } + [testCase measureWithMetrics:@[ [XCTClockMetric new], [XCTMemoryMetric new] ] + block:^{ + XCTAssertEqual(module->forward().error(), Error::Ok); + }]; + }, + }; +} + +@end diff --git a/extension/apple/Benchmark/Tests/LLaMA/LLaMATests.mm b/extension/apple/Benchmark/Tests/LLaMA/LLaMATests.mm new file mode 100644 index 00000000000..f3558308c82 --- /dev/null +++ b/extension/apple/Benchmark/Tests/LLaMA/LLaMATests.mm @@ -0,0 +1,102 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#import "ResourceTestCase.h" + +#import + +using namespace ::executorch::extension; +using namespace ::executorch::runtime; + +@interface TokensPerSecondMetric : NSObject + +@property(nonatomic, assign) NSUInteger tokenCount; + +@end + +@implementation TokensPerSecondMetric + +- (id)copyWithZone:(NSZone *)zone { + TokensPerSecondMetric *copy = [[[self class] allocWithZone:zone] init]; + copy.tokenCount = self.tokenCount; + return copy; +} + +- (NSArray *) + reportMeasurementsFromStartTime: + (XCTPerformanceMeasurementTimestamp *)startTime + toEndTime: + (XCTPerformanceMeasurementTimestamp *)endTime + error:(NSError **)error { + double elapsedTime = + (endTime.absoluteTimeNanoSeconds - startTime.absoluteTimeNanoSeconds) / + (double)NSEC_PER_SEC; + return @[ [[XCTPerformanceMeasurement alloc] + initWithIdentifier:NSStringFromClass([self class]) + displayName:@"Tokens Per Second" + doubleValue:(self.tokenCount / elapsedTime) + unitSymbol:@"t/s"] ]; +} + +@end + +@interface LLaMATests : ResourceTestCase +@end + +@implementation LLaMATests + ++ (NSArray *)directories { + return @[ + @"Resources", + @"aatp/data", // AWS Farm devices look for resources here. + ]; +} + ++ (NSDictionary *)predicates { + return @{ + @"model" : ^BOOL(NSString *filename){ + return [filename hasSuffix:@".pte"] && [filename containsString:@"llama"]; + }, + @"tokenizer" : ^BOOL(NSString *filename) { + return [filename isEqual:@"tokenizer.bin"]; + }, + }; +} + ++ (NSDictionary *)dynamicTestsForResources: + (NSDictionary *)resources { + NSString *modelPath = resources[@"model"]; + NSString *tokenizerPath = resources[@"tokenizer"]; + return @{ + @"generate" : ^(XCTestCase *testCase){ + auto __block runner = std::make_unique( + modelPath.UTF8String, tokenizerPath.UTF8String); + const auto status = runner->load(); + if (status != Error::Ok) { + XCTFail("Load failed with error %i", status); + return; + } + TokensPerSecondMetric *tokensPerSecondMetric = [TokensPerSecondMetric new]; + [testCase measureWithMetrics:@[ tokensPerSecondMetric, [XCTMemoryMetric new] ] + block:^{ + tokensPerSecondMetric.tokenCount = 0; + const auto status = runner->generate( + "Once upon a time", + 128, + [=](const std::string &token) { + tokensPerSecondMetric.tokenCount++; + }, + nullptr, + false); + XCTAssertEqual(status, Error::Ok); + }]; + }, + }; +} + +@end diff --git a/extension/apple/Benchmark/Tests/Tests.mm b/extension/apple/Benchmark/Tests/Tests.mm deleted file mode 100644 index 2730cd8963f..00000000000 --- a/extension/apple/Benchmark/Tests/Tests.mm +++ /dev/null @@ -1,126 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#import - -#import - -#import -#import - -using namespace ::executorch::extension; -using namespace ::executorch::runtime; - -@interface Tests : XCTestCase -@end - -@implementation Tests - -+ (void)initialize { - if (self != [self class]) { - return; - } - for (NSBundle *bundle in @[ - [NSBundle mainBundle], - [NSBundle bundleForClass:[self class]], - ]) { - for (NSString *directory in @[ - @"Models", - @"aatp/data", - ]) { - NSString *directoryPath = - [bundle.resourcePath stringByAppendingPathComponent:directory]; - NSArray *filePaths = - [NSFileManager.defaultManager contentsOfDirectoryAtPath:directoryPath - error:nil]; - for (NSString *filePath in filePaths) { - if (![filePath hasSuffix:@".pte"]) { - continue; - } - NSString *modelPath = - [directoryPath stringByAppendingPathComponent:filePath]; - NSString *directoryName = - [directory stringByReplacingOccurrencesOfString:@"/" - withString:@"_"] - .lowercaseString; - NSString *modelName = - modelPath.lastPathComponent.stringByDeletingPathExtension; - - SEL testLoadSelector = NSSelectorFromString([NSString - stringWithFormat:@"test_load_%@_%@", directoryName, modelName]); - IMP testLoadImplementation = imp_implementationWithBlock(^(id _self) { - auto __block module = std::make_unique(modelPath.UTF8String); - [_self measureWithMetrics:@[ - [XCTClockMetric new], - [XCTMemoryMetric new], - ] - options:XCTMeasureOptions.defaultOptions - block:^{ - XCTAssertEqual(module->load_method("forward"), - Error::Ok); - }]; - }); - class_addMethod( - [self class], testLoadSelector, testLoadImplementation, "v@:"); - - SEL testForwardSelector = NSSelectorFromString([NSString - stringWithFormat:@"test_forward_%@_%@", directoryName, modelName]); - IMP testForwardImplementation = imp_implementationWithBlock(^( - id _self) { - auto __block module = std::make_unique(modelPath.UTF8String); - XCTAssertEqual(module->load_method("forward"), Error::Ok); - - const auto method_meta = module->method_meta("forward"); - XCTAssertEqual(method_meta.error(), Error::Ok); - - const auto num_inputs = method_meta->num_inputs(); - XCTAssertGreaterThan(num_inputs, 0); - - std::vector __block tensors; - tensors.reserve(num_inputs); - std::vector __block inputs; - inputs.reserve(num_inputs); - - for (auto index = 0; index < num_inputs; ++index) { - const auto input_tag = method_meta->input_tag(index); - XCTAssertEqual(input_tag.error(), Error::Ok); - - switch (*input_tag) { - case Tag::Tensor: { - const auto tensor_meta = method_meta->input_tensor_meta(index); - XCTAssertEqual(tensor_meta.error(), Error::Ok); - - const auto sizes = tensor_meta->sizes(); - tensors.emplace_back(ones({sizes.begin(), sizes.end()}, - tensor_meta->scalar_type())); - inputs.emplace_back(tensors.back()); - } break; - default: - XCTFail("Unsupported tag %i at input %d", *input_tag, index); - } - } - [_self measureWithMetrics:@[ - [XCTClockMetric new], - [XCTMemoryMetric new], - ] - options:XCTMeasureOptions.defaultOptions - block:^{ - XCTAssertEqual(module->forward(inputs).error(), - Error::Ok); - }]; - }); - class_addMethod([self class], - testForwardSelector, - testForwardImplementation, - "v@:"); - } - } - } -} - -@end diff --git a/extension/apple/Benchmark/Tests/Tests.xcconfig b/extension/apple/Benchmark/Tests/Tests.xcconfig index 838cc61a43d..f99e5068feb 100644 --- a/extension/apple/Benchmark/Tests/Tests.xcconfig +++ b/extension/apple/Benchmark/Tests/Tests.xcconfig @@ -1,26 +1,50 @@ +HEADER_SEARCH_PATHS[sdk=iphonesimulator*] = $(inherited) \ +$(SRCROOT)/../../../.. 
\ +$(TEMP_DIR)/cmake/include + +LIBRARY_SEARCH_PATHS[sdk=iphonesimulator*] = $(inherited) \ +$(TEMP_DIR)/cmake/lib + OTHER_LDFLAGS[sdk=iphonesimulator*] = $(inherited) \ +@$(TEMP_DIR)/cmake/linker_flags \ -force_load $(BUILT_PRODUCTS_DIR)/libexecutorch-simulator-release.a \ -force_load $(BUILT_PRODUCTS_DIR)/libbackend_coreml-simulator-release.a \ -force_load $(BUILT_PRODUCTS_DIR)/libbackend_mps-simulator-release.a \ -force_load $(BUILT_PRODUCTS_DIR)/libbackend_xnnpack-simulator-release.a \ -force_load $(BUILT_PRODUCTS_DIR)/libkernels_custom-simulator-release.a \ --force_load $(BUILT_PRODUCTS_DIR)/libkernels_portable-simulator-release.a \ +-force_load $(BUILT_PRODUCTS_DIR)/libkernels_optimized-simulator-release.a \ -force_load $(BUILT_PRODUCTS_DIR)/libkernels_quantized-simulator-release.a +HEADER_SEARCH_PATHS[sdk=iphoneos*] = $(inherited) \ +$(SRCROOT)/../../../.. \ +$(TEMP_DIR)/cmake/include + +LIBRARY_SEARCH_PATHS[sdk=iphoneos*] = $(inherited) \ +$(TEMP_DIR)/cmake/lib + OTHER_LDFLAGS[sdk=iphoneos*] = $(inherited) \ +@$(TEMP_DIR)/cmake/linker_flags \ -force_load $(BUILT_PRODUCTS_DIR)/libexecutorch-ios-release.a \ -force_load $(BUILT_PRODUCTS_DIR)/libbackend_coreml-ios-release.a \ -force_load $(BUILT_PRODUCTS_DIR)/libbackend_mps-ios-release.a \ -force_load $(BUILT_PRODUCTS_DIR)/libbackend_xnnpack-ios-release.a \ -force_load $(BUILT_PRODUCTS_DIR)/libkernels_custom-ios-release.a \ --force_load $(BUILT_PRODUCTS_DIR)/libkernels_portable-ios-release.a \ +-force_load $(BUILT_PRODUCTS_DIR)/libkernels_optimized-ios-release.a \ -force_load $(BUILT_PRODUCTS_DIR)/libkernels_quantized-ios-release.a +HEADER_SEARCH_PATHS[sdk=macos*] = $(inherited) \ +$(SRCROOT)/../../../.. \ +$(TEMP_DIR)/cmake/include + +LIBRARY_SEARCH_PATHS[sdk=macos*] = $(inherited) \ +$(TEMP_DIR)/cmake/lib + OTHER_LDFLAGS[sdk=macos*] = $(inherited) \ +@$(TEMP_DIR)/cmake/linker_flags \ -force_load $(BUILT_PRODUCTS_DIR)/libexecutorch-macos-release.a \ -force_load $(BUILT_PRODUCTS_DIR)/libbackend_coreml-macos-release.a \ -force_load $(BUILT_PRODUCTS_DIR)/libbackend_mps-macos-release.a \ -force_load $(BUILT_PRODUCTS_DIR)/libbackend_xnnpack-macos-release.a \ -force_load $(BUILT_PRODUCTS_DIR)/libkernels_custom-macos-release.a \ --force_load $(BUILT_PRODUCTS_DIR)/libkernels_portable-macos-release.a \ +-force_load $(BUILT_PRODUCTS_DIR)/libkernels_optimized-macos-release.a \ -force_load $(BUILT_PRODUCTS_DIR)/libkernels_quantized-macos-release.a diff --git a/extension/aten_util/aten_bridge.cpp b/extension/aten_util/aten_bridge.cpp index 362dc57c37d..1305ae75ce0 100644 --- a/extension/aten_util/aten_bridge.cpp +++ b/extension/aten_util/aten_bridge.cpp @@ -73,6 +73,8 @@ torch::executor::ScalarType torch_to_executorch_scalar_type( return torch::executor::ScalarType::Short; case c10::ScalarType::Half: return torch::executor::ScalarType::Half; + case c10::ScalarType::BFloat16: + return torch::executor::ScalarType::BFloat16; case c10::ScalarType::Int: return torch::executor::ScalarType::Int; case c10::ScalarType::Float: @@ -103,6 +105,8 @@ c10::ScalarType executorch_to_torch_scalar_type( return c10::ScalarType::Short; case torch::executor::ScalarType::Half: return c10::ScalarType::Half; + case torch::executor::ScalarType::BFloat16: + return c10::ScalarType::BFloat16; case torch::executor::ScalarType::Int: return c10::ScalarType::Int; case torch::executor::ScalarType::Float: @@ -166,5 +170,12 @@ at::Tensor alias_attensor_to_etensor(const torch::executor::Tensor& etensor) { return t; } +TensorPtr alias_tensor_ptr_to_attensor(at::Tensor& t) { + return 
make_tensor_ptr( + {t.sizes().begin(), t.sizes().end()}, + t.mutable_data_ptr(), + torch::executor::ScalarType(t.scalar_type())); +} + } // namespace extension } // namespace executorch diff --git a/extension/aten_util/aten_bridge.h b/extension/aten_util/aten_bridge.h index 0d6b697463c..62b07eee51d 100644 --- a/extension/aten_util/aten_bridge.h +++ b/extension/aten_util/aten_bridge.h @@ -8,6 +8,7 @@ #pragma once +#include #include #include // @manual=//caffe2/aten:ATen-cpu @@ -48,6 +49,8 @@ void alias_etensor_to_attensor(at::Tensor& at, torch::executor::Tensor& et); */ at::Tensor alias_attensor_to_etensor(const torch::executor::Tensor& et); +TensorPtr alias_tensor_ptr_to_attensor(at::Tensor& t); + } // namespace extension } // namespace executorch diff --git a/extension/aten_util/test/aten_bridge_test.cpp b/extension/aten_util/test/aten_bridge_test.cpp index cf6d2b85978..ba331162fca 100644 --- a/extension/aten_util/test/aten_bridge_test.cpp +++ b/extension/aten_util/test/aten_bridge_test.cpp @@ -18,6 +18,7 @@ using namespace ::testing; using namespace torch::executor; using namespace torch::executor::util; +using namespace executorch::extension; namespace { at::Tensor generate_at_tensor() { @@ -146,3 +147,10 @@ TEST(ATenBridgeTest, AliasATTensorToETensor) { auto aliased_at_tensor = alias_attensor_to_etensor(etensor); EXPECT_EQ(aliased_at_tensor.const_data_ptr(), etensor_data.data()); } + +TEST(ATenBridgeTest, AliasTensorPtrToATenTensor) { + auto at_tensor = generate_at_tensor(); + const auto& et_tensor_ptr = alias_tensor_ptr_to_attensor(at_tensor); + alias_etensor_to_attensor(at_tensor, *et_tensor_ptr); + EXPECT_EQ(at_tensor.const_data_ptr(), et_tensor_ptr->const_data_ptr()); +} diff --git a/extension/benchmark/README.md b/extension/benchmark/README.md new file mode 100644 index 00000000000..e22cc2121e3 --- /dev/null +++ b/extension/benchmark/README.md @@ -0,0 +1,81 @@ +# Benchmarking Infrastructure (Experimental) + +The ExecuTorch project introduces an advanced benchmarking infrastructure designed to measure the performance of models on Android and iOS devices. It supports various backend delegates and devices, enabling reproducible performance measurements and facilitating collaborative efforts in performance tuning and debugging. This infrastructure is built on top of the [Nova reusable mobile workflow](https://github.com/pytorch/test-infra/wiki/Testing-Android-and-iOS-apps-on-OSS-CI-using-Nova-reusable-mobile-workflow) powered by PyTorch test-infra. + +### Key Features + +- **Multiple Models**: Supports a variety of ExecuTorch-enabled models such as `MobileNetV2` etc. Integration with compatible Hugging Face models is coming soon. + +- **Device Support**: Includes popular phones like latest Apple iPhone, Google Pixel, and Samsung Galaxy, etc. + +- **Backend Delegates**: Supports XNNPACK, Apple CoreML, Qualcomm QNN, and more in the near future. + +- **Benchmark Apps:** Generic apps that support both GenAI and non-GenAI models, capable of measuring performance offline. [Android App](../android/benchmark/) | [iOS App](../apple/Benchmark/). Popular Android and iOS profilers with in-depth performance analysis will be integrated with these apps in the future. + +- **Performance Monitoring**: Stores results in a database with a dashboard for tracking performance and detecting regressions. + +> **Disclaimer:** The infrastructure is new and experimental. We're working on improving its accessibility and stability over time. 
+ + +## Preliminary Benchmark Results + +Below is a table summarizing some example data points obtained via the infra. These numbers represent model load time and average inference latency across different platforms and backends. + +| Model | Backend | Model Load Time (ms) | Avg Inference Latency (ms) | Device | +|-----------------------|--------------|----------------------|----------------------------|------------------------| +| MobileNetV3 (mv3) | XNNPACK Q8 | [34.024](https://github.com/pytorch/executorch/actions/runs/10875550238/job/30218988461) | [252.628](https://github.com/pytorch/executorch/actions/runs/10875550238/job/30218988461) | Samsung S22 | +| MobileNetV3 (mv3) | QNN FP16 | [168.414](https://github.com/pytorch/executorch/actions/runs/10875550238/job/30218987785) | [1.182](https://github.com/pytorch/executorch/actions/runs/10875550238/job/30218987785) | Samsung S22 | +| MobileNetV3 (mv3) | COREML FP16 | [57.372](https://github.com/pytorch/executorch/actions/runs/11136241814/job/30999930691) | [0.429](https://github.com/pytorch/executorch/actions/runs/11136241814/job/30999930691) | Apple iPhone 15 Pro | +| MobileNetV2 (mv2) | XNNPACK Q8 | [14.397](https://github.com/pytorch/executorch/actions/runs/10875550238/job/30218987379) | [10.796](https://github.com/pytorch/executorch/actions/runs/10875550238/job/30218987379) | Samsung S22 | +| MobileNetV2 (mv2) | QNN FP16 | [136.862](https://github.com/pytorch/executorch/actions/runs/10875550238/job/30218987097) | [0.673](https://github.com/pytorch/executorch/actions/runs/10875550238/job/30218987097) | Samsung S22 | +| MobileNetV2 (mv2) | COREML FP16 | [50.892](https://github.com/pytorch/executorch/actions/runs/11171117103/job/31056078594) | [0.631](https://github.com/pytorch/executorch/actions/runs/11171117103/job/31056078594) | Apple iPhone 15 Pro | +| InceptionV4 (ic4) | XNNPACK Q8 | [87.617](https://github.com/pytorch/executorch/actions/runs/10875550238/job/30218986716) | [117.937](https://github.com/pytorch/executorch/actions/runs/10875550238/job/30218986716) | Samsung S22 | +| InceptionV4 (ic4) | QNN FP16 | [163.943](https://github.com/pytorch/executorch/actions/runs/10875550238/job/30218986381) | [2.734](https://github.com/pytorch/executorch/actions/runs/10875550238/job/30218986381) | Samsung S22 | +| InceptionV4 (ic4) | COREML FP16 | [118.686](https://github.com/pytorch/executorch/actions/runs/11136241814/job/30999930140) | [4.289](https://github.com/pytorch/executorch/actions/runs/11136241814/job/30999930140) | Apple iPhone 15 Pro | +| InceptionV3 (ic3) | XNNPACK Q8 | [60.708](https://github.com/pytorch/executorch/actions/runs/10875550238/job/30218986023) | [98.390](https://github.com/pytorch/executorch/actions/runs/10875550238/job/30218986023) | Samsung S22 | +| InceptionV3 (ic3) | QNN FP16 | [134.732](https://github.com/pytorch/executorch/actions/runs/10875550238/job/30218985425) | [1.351](https://github.com/pytorch/executorch/actions/runs/10875550238/job/30218985425) | Samsung S22 | +| InceptionV3 (ic3) | COREML FP16 | [86.728](https://github.com/pytorch/executorch/actions/runs/11171117103/job/31056078753) | [1.391](https://github.com/pytorch/executorch/actions/runs/11171117103/job/31056078753) | Apple iPhone 15 Pro | +| DeepLabV3 (dl3) | XNNPACK Q8 | [90.616](https://github.com/pytorch/executorch/actions/runs/10875550238/job/30218985758) | [666.219](https://github.com/pytorch/executorch/actions/runs/10875550238/job/30218985758) | Samsung S22 | +| DeepLabV3 (dl3) | QNN FP16 | 
[182.207](https://github.com/pytorch/executorch/actions/runs/10875550238/job/30218985141) | [9.759](https://github.com/pytorch/executorch/actions/runs/10875550238/job/30218985141) | Samsung S22 | +| ResNet50 (resnet50) | XNNPACK Q8 | [55.462](https://github.com/pytorch/executorch/actions/runs/10911725781/job/30285857102) | [37.021](https://github.com/pytorch/executorch/actions/runs/10911725781/job/30285857102) | Apple iPhone 15 Pro | +| ResNet50 (resnet50) | COREML FP16 | [68.943](https://github.com/pytorch/executorch/actions/runs/11136241814/job/30999930818) | [1.979](https://github.com/pytorch/executorch/actions/runs/11136241814/job/30999930818) | Apple iPhone 15 Pro | +| TorchVisionVit (vit) | QNN FP16 | [174.430](https://github.com/pytorch/executorch/actions/runs/10875550238/job/30218989581) | [199.279](https://github.com/pytorch/executorch/actions/runs/10875550238/job/30218989581) | Samsung S22 | +| Wave2Letter (w2l) | XNNPACK Q8 | [33.913](https://github.com/pytorch/executorch/actions/runs/10857890364/job/30139445319) | [135.584](https://github.com/pytorch/executorch/actions/runs/10857890364/job/30139445319) | Apple iPhone 15 Pro | +| Wave2Letter (w2l) | COREML FP16 | [109.254](https://github.com/pytorch/executorch/actions/runs/11136241814/job/30999931566) | [28.465](https://github.com/pytorch/executorch/actions/runs/11136241814/job/30999931566) | Apple iPhone 15 | +| MobileBERT (mobilebert) | XNNPACK FP32 | [26.499](https://github.com/pytorch/executorch/actions/runs/11136241814/job/30999930558) | [33.978](https://github.com/pytorch/executorch/actions/runs/11136241814/job/30999930558) | Apple iPhone 15 Pro | +| MobileBERT (mobilebert) | COREML FP16 | [206.202](https://github.com/pytorch/executorch/actions/runs/11136241814/job/30999930398) | [1.873](https://github.com/pytorch/executorch/actions/runs/11136241814/job/30999930398) | Apple iPhone 15 Pro | +| EDSR (edsr) | XNNPACK Q8 | [3.190](https://github.com/pytorch/executorch/actions/runs/11136241814/job/30999929836) | [168.429](https://github.com/pytorch/executorch/actions/runs/11136241814/job/30999929836) | Apple iPhone 15 Pro | +| EDSR (edsr) | COREML FP16 | [156.075](https://github.com/pytorch/executorch/actions/runs/11136241814/job/30999929690) | [77.346](https://github.com/pytorch/executorch/actions/runs/11136241814/job/30999929690) | Apple iPhone 15 Pro | + + +## Supported Use Cases + +The benchmarking infrastructure currently supports two major use-cases: + +- **On-Demand Model Benchmarking:** Users can trigger benchmarking requests via GitHub Actions workflow dispatch UI. This feature will help backend developers collaborate with the ExecuTorch team to debug performance issues and advance state-of-the-art (SOTA) performance. + +- **Automated Nightly Batched Benchmarking:** The infrastructure performs automated nightly benchmarking to track and monitor performance over time. This allows for consistent performance monitoring and regression detection. + + +## High-Level Diagram + +![Benchmarking Infrastructure](../../docs/source/_static/img/benchmark-infra.png) + + +## Scheduling On-Demand Benchmarking + +Users can schedule a benchmarking workflow on a pull request through GitHub Actions using the workflow dispatch UI. Follow the steps below to trigger benchmarking: +1. Access `pytorch/executorch` repository on GitHub and navigate to the "Actions" tab. +2. Select `android-perf` or `apple-perf` workflow from the list of workflows. +3. 
Click "Run workflow" and fill in the required parameters for the model you want to benchmark, e.g. branch name, model name and delegate, and device pool, etc. + +> **Note:** Write permission to the repo will be needed in order to run the on-demand workflow. + + +## Retrieving Benchmark Results + +Currently, retrieving benchmark results involves manually extracting the `benchmark_results.json` from the `Customer_Artifacts.zip` stored on AWS S3 from the benchmarking job. This process is not yet streamlined. We are working on simplifying this process and linking the results directly to the dashboard, which will be available soon. + + +## Feedback and Issue Reporting +We encourage users to share feedback or report any issues while using the infra. Please submit your feedback via GitHub Issues. diff --git a/extension/export_util/export_hf_model.py b/extension/export_util/export_hf_model.py new file mode 100644 index 00000000000..201e085fcc8 --- /dev/null +++ b/extension/export_util/export_hf_model.py @@ -0,0 +1,119 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import argparse +import os + +import torch +import torch.export._trace +from executorch.backends.xnnpack.partition.xnnpack_partitioner import XnnpackPartitioner +from executorch.exir import EdgeCompileConfig, ExecutorchBackendConfig, to_edge +from torch.nn.attention import SDPBackend +from transformers import AutoModelForCausalLM, AutoTokenizer +from transformers.generation.configuration_utils import GenerationConfig +from transformers.integrations.executorch import convert_and_export_with_cache +from transformers.modeling_utils import PreTrainedModel + + +def main() -> None: + parser = argparse.ArgumentParser() + parser.add_argument( + "-hfm", + "--hf_model_repo", + required=True, + default=None, + help="a valid huggingface model repo name", + ) + parser.add_argument( + "-d", + "--dtype", + type=str, + choices=["float32", "float16", "bfloat16"], + default="float32", + help="specify the dtype for loading the model", + ) + parser.add_argument( + "-o", + "--output_name", + required=False, + default=None, + help="output name of the exported model", + ) + + args = parser.parse_args() + + # Configs to HF model + device = "cpu" + # TODO: remove getattr once https://github.com/huggingface/transformers/pull/33741 is merged + dtype = getattr(torch, args.dtype) + batch_size = 1 + max_length = 123 + cache_implementation = "static" + attn_implementation = "sdpa" + + # Load and configure a HF model + model = AutoModelForCausalLM.from_pretrained( + args.hf_model_repo, + attn_implementation=attn_implementation, + device_map=device, + torch_dtype=dtype, + generation_config=GenerationConfig( + use_cache=True, + cache_implementation=cache_implementation, + max_length=max_length, + cache_config={ + "batch_size": batch_size, + "max_cache_len": max_length, + }, + ), + ) + print(f"{model.config}") + print(f"{model.generation_config}") + + tokenizer = AutoTokenizer.from_pretrained(args.hf_model_repo) + input_ids = tokenizer([""], return_tensors="pt").to(device)["input_ids"] + cache_position = torch.tensor([0], dtype=torch.long) + + def _get_constant_methods(model: PreTrainedModel): + return { + "get_dtype": 5 if model.config.torch_dtype == torch.float16 else 6, + "get_bos_id": model.config.bos_token_id, + "get_eos_id": model.config.eos_token_id, + "get_head_dim": model.config.hidden_size / 
model.config.num_attention_heads, + "get_max_batch_size": model.generation_config.cache_config.batch_size, + "get_max_seq_len": model.generation_config.cache_config.max_cache_len, + "get_n_bos": 1, + "get_n_eos": 1, + "get_n_kv_heads": model.config.num_key_value_heads, + "get_n_layers": model.config.num_hidden_layers, + "get_vocab_size": model.config.vocab_size, + "use_kv_cache": model.generation_config.use_cache, + } + + with torch.nn.attention.sdpa_kernel([SDPBackend.MATH]), torch.no_grad(): + + exported_prog = convert_and_export_with_cache(model, input_ids, cache_position) + prog = ( + to_edge( + exported_prog, + compile_config=EdgeCompileConfig( + _check_ir_validity=False, + _skip_dim_order=True, + ), + constant_methods=_get_constant_methods(model), + ) + .to_backend(XnnpackPartitioner()) + .to_executorch(ExecutorchBackendConfig(extract_delegate_segments=True)) + ) + out_name = args.output_name if args.output_name else model.config.model_type + filename = os.path.join("./", f"{out_name}.pte") + with open(filename, "wb") as f: + prog.write_to_file(f) + print(f"Saved exported program to {filename}") + + +if __name__ == "__main__": + main() diff --git a/extension/export_util/utils.py b/extension/export_util/utils.py index 37e09babbb9..40ceb6ffec2 100644 --- a/extension/export_util/utils.py +++ b/extension/export_util/utils.py @@ -14,8 +14,7 @@ import torch from executorch.exir import EdgeProgramManager, ExecutorchProgramManager, to_edge from executorch.exir.tracer import Value -from torch._export import capture_pre_autograd_graph -from torch.export import export, ExportedProgram +from torch.export import export, export_for_training, ExportedProgram _EDGE_COMPILE_CONFIG = exir.EdgeCompileConfig( @@ -95,7 +94,7 @@ def export_to_exec_prog( ) -> ExecutorchProgramManager: m = model.eval() # pre-autograd export. eventually this will become torch.export - m = capture_pre_autograd_graph(m, example_inputs) + m = export_for_training(m, example_inputs).module() core_aten_ep = _to_core_aten(m, example_inputs, dynamic_shapes, strict=strict) diff --git a/extension/kernel_util/make_boxed_from_unboxed_functor.h b/extension/kernel_util/make_boxed_from_unboxed_functor.h index 409c981cbb1..5d9645f0930 100644 --- a/extension/kernel_util/make_boxed_from_unboxed_functor.h +++ b/extension/kernel_util/make_boxed_from_unboxed_functor.h @@ -173,10 +173,19 @@ static executorch::runtime::Kernel make_boxed_kernel( } // namespace extension } // namespace executorch -#define EXECUTORCH_LIBRARY(ns, op_name, func) \ - static auto res_##ns = ::executorch::runtime::register_kernel( \ - ::executorch::extension::make_boxed_kernel( \ - #ns "::" op_name, EXECUTORCH_FN(func))) +// Inspired from C10_CONCATENATE +#define ET_CONCATENATE_IMPL(s1, s2) s1##s2 +#define ET_CONCATENATE(s1, s2) ET_CONCATENATE_IMPL(s1, s2) +#define ET_UID __LINE__ + +#define EXECUTORCH_LIBRARY(ns, op_name, func) \ + _EXECUTORCH_LIBRARY_IMPL(ns, op_name, func, ET_UID) + +#define _EXECUTORCH_LIBRARY_IMPL(ns, op_name, func, uid) \ + static auto ET_CONCATENATE(res_##ns##_, uid) = \ + ::executorch::runtime::register_kernel( \ + ::executorch::extension::make_boxed_kernel( \ + #ns "::" op_name, EXECUTORCH_FN(func))) namespace torch { namespace executor { diff --git a/extension/llm/README.md b/extension/llm/README.md index dfc193e41e1..ddcf4c727d2 100644 --- a/extension/llm/README.md +++ b/extension/llm/README.md @@ -2,8 +2,9 @@ This subtree contains libraries and utils of running generative AI, including La Below is a list of sub folders. 
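The kernel-registration macro rework above (the `ET_UID`/`ET_CONCATENATE` change in `make_boxed_from_unboxed_functor.h`) is what lets a single translation unit register several kernels under the same namespace, which `op_sdpa.cpp` relies on later in this diff when it registers both `sdpa_with_kv_cache.out` and `custom_sdpa.out` for `llama`. A minimal compile-level sketch of that pattern follows, assuming the usual `executorch/` include prefix and the `torch::executor` aliases used elsewhere in this diff; the `myops` namespace, the op names, and the placeholder bodies are illustrative, not real kernels.

```cpp
// Sketch: two kernels registered from one translation unit under the same
// namespace. With the old macro both expansions would define `res_myops` and
// collide; with ET_UID they expand to res_myops_<line> and stay distinct.
#include <executorch/extension/kernel_util/make_boxed_from_unboxed_functor.h>

using torch::executor::RuntimeContext;
using torch::executor::Tensor;

Tensor& copy_out(RuntimeContext& ctx, const Tensor& in, Tensor& out) {
  (void)ctx;
  (void)in;
  return out;  // placeholder body, for illustration only
}

Tensor& zero_out(RuntimeContext& ctx, const Tensor& in, Tensor& out) {
  (void)ctx;
  (void)in;
  return out;  // placeholder body, for illustration only
}

EXECUTORCH_LIBRARY(myops, "copy.out", copy_out);  // defines res_myops_<line A>
EXECUTORCH_LIBRARY(myops, "zero.out", zero_out);  // defines res_myops_<line B>, no clash
```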
## export Model preparation codes are in _export_ folder. The main entry point is the _LLMEdgeManager_ class. It hosts a _torch.nn.Module_, with a list of methods that can be used to prepare the LLM model for ExecuTorch runtime. -Note that ExecuTorch supports two [quantization APIs](https://pytorch.org/docs/stable/quantization.html#quantization-api-summary): eager mode quantization (aka source transform based quantization), and PyTorch 2 Export based quantization (aka pt2e quantization). -Typical methods include: +Note that ExecuTorch supports two [quantization APIs](https://pytorch.org/docs/stable/quantization.html#quantization-api-summary): eager mode quantization (aka source transform based quantization) and PyTorch 2 Export based quantization (aka pt2e quantization). + +Commonly used methods in this class include: - _set_output_dir_: where users want to save the exported .pte file. - _to_dtype_: override the data type of the module. - _source_transform_: execute a series of source transform passes. Some transform passes include @@ -19,7 +20,7 @@ Typical methods include: Some usage of LLMEdgeManager can be found in executorch/examples/models/llama2, and executorch/examples/models/llava. -When the .pte file is exported and saved, we can prepare a load and run it in a runner. +When the .pte file is exported and saved, we can load and run it in a runner (see below). ## tokenizer Currently, we support two types of tokenizers: sentencepiece and Tiktoken. @@ -28,20 +29,21 @@ Currently, we support two types of tokenizers: sentencepiece and Tiktoken. - _tokenizer.py_: rewrite a sentencepiece tokenizer model to a serialization format that the runtime can load. - In C++: - _tokenizer.h_: a simple tokenizer interface. Actual tokenizer classes can be implemented based on this. In this folder, we provide two tokenizer implementations: - - _bpe_tokenizer_. We need the rewritten version of tokenizer artifact (refer to _tokenizer.py_ above), for bpe tokenizer to work. - - _tiktokern_. It's for llama3 and llama3.1. + - _bpe_tokenizer_. Note: we need the rewritten version of tokenizer artifact (refer to _tokenizer.py_ above), for bpe tokenizer to work. + - _tiktoken_. For llama3 and llama3.1. ## sampler A sampler class in C++ to sample the logistics given some hyperparameters. ## custom_ops -It hosts a custom sdpa operator. This sdpa operator implements CPU flash attention, it avoids copies by taking the kv cache as one of the arguments to this custom operator. -- _sdpa_with_kv_cache.py_, _op_sdpa_aot.cpp_: custom op definition in PyTorch with C++ registration. -- _op_sdpa.cpp_: the optimized operator implementation and registration of _sdpa_with_kv_cache.out_. +Contains custom op, such as: +- custom sdpa: implements CPU flash attention and avoids copies by taking the kv cache as one of its arguments. + - _sdpa_with_kv_cache.py_, _op_sdpa_aot.cpp_: custom op definition in PyTorch with C++ registration. + - _op_sdpa.cpp_: the optimized operator implementation and registration of _sdpa_with_kv_cache.out_. ## runner It hosts the libary components used in a C++ llm runner. Currently, it hosts _stats.h_ on runtime status like token numbers and latency. -With the components above, an actual runner can be built for a model or a series of models. An exmaple is in //executorch/examples/models/llama2/runner, where a C++ runner code is built to run Llama 2, 3, 3.1 and other models using the same architecture. +With the components above, an actual runner can be built for a model or a series of models. 
An example is in //executorch/examples/models/llama2/runner, where a C++ runner code is built to run Llama 2, 3, 3.1 and other models using the same architecture. Usages can also be found in the [torchchat repo](https://github.com/pytorch/torchchat/tree/main/runner). diff --git a/extension/llm/custom_ops/CMakeLists.txt b/extension/llm/custom_ops/CMakeLists.txt index 723444498a4..d42e37f9bd0 100644 --- a/extension/llm/custom_ops/CMakeLists.txt +++ b/extension/llm/custom_ops/CMakeLists.txt @@ -60,7 +60,7 @@ target_include_directories( custom_ops PRIVATE "${CMAKE_CURRENT_BINARY_DIR}/../../../include" ) target_link_libraries( - custom_ops PUBLIC ${custom_ops_libs} executorch_no_prim_ops + custom_ops PUBLIC ${custom_ops_libs} executorch_core ) target_compile_options( @@ -75,7 +75,9 @@ if(EXECUTORCH_BUILD_KERNELS_CUSTOM_AOT) add_library( custom_ops_aot_lib SHARED ${_custom_ops__srcs} ${CMAKE_CURRENT_SOURCE_DIR}/op_sdpa_aot.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/op_fast_hadamard_transform_aten.cpp ${CMAKE_CURRENT_SOURCE_DIR}/op_tile_crop.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/op_tile_crop_aot.cpp ) target_include_directories( custom_ops_aot_lib PUBLIC "${_common_include_directories}" @@ -90,7 +92,7 @@ if(EXECUTORCH_BUILD_KERNELS_CUSTOM_AOT) else() # If no portable_lib, custom_ops_aot_lib still gives the ability to use the # ops in PyTorch - target_link_libraries(custom_ops_aot_lib PUBLIC executorch_no_prim_ops) + target_link_libraries(custom_ops_aot_lib PUBLIC executorch_core) endif() target_link_libraries( diff --git a/extension/llm/custom_ops/TARGETS b/extension/llm/custom_ops/TARGETS index 8fe776ab095..c12795fd249 100644 --- a/extension/llm/custom_ops/TARGETS +++ b/extension/llm/custom_ops/TARGETS @@ -22,6 +22,19 @@ runtime.python_test( ], ) +runtime.python_test( + name = "test_update_quantized_cache", + srcs = [ + "test_update_quantized_cache.py", + ], + preload_deps = [ + ":custom_ops_aot_lib", + ], + deps = [ + "//caffe2:torch", + ], +) + runtime.python_test( name = "test_preprocess_custom_ops", srcs = [ diff --git a/extension/llm/custom_ops/model_sharding.py b/extension/llm/custom_ops/model_sharding.py index 75d6fd25740..244c036c9b7 100644 --- a/extension/llm/custom_ops/model_sharding.py +++ b/extension/llm/custom_ops/model_sharding.py @@ -3,6 +3,7 @@ # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +# pyre-ignore-all-errors import re from typing import List diff --git a/extension/llm/custom_ops/op_fast_hadamard_transform.cpp b/extension/llm/custom_ops/op_fast_hadamard_transform.cpp new file mode 100644 index 00000000000..2d005ecd68d --- /dev/null +++ b/extension/llm/custom_ops/op_fast_hadamard_transform.cpp @@ -0,0 +1,105 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include +#include +#include // For apply_over_dim. 
+#include + +namespace torch { +namespace executor { +namespace native { + +Tensor& fast_hadamard_transform_out( + RuntimeContext& ctx, + const Tensor& mat, + Tensor& out) { + ET_KERNEL_CHECK_MSG( + ctx, + resize_tensor(out, mat.sizes()) == Error::Ok, + InvalidArgument, + out, + "Failed to resize output tensor."); + + ET_KERNEL_CHECK( + ctx, mat.scalar_type() == out.scalar_type(), InvalidArgument, out); + + if (mat.dim() == 0 || mat.numel() == 0) { + return out; + } + + ET_KERNEL_CHECK( + ctx, + is_contiguous_dim_order(mat.dim_order().data(), mat.dim()), + InvalidArgument, + out); + + ET_KERNEL_CHECK( + ctx, + is_contiguous_dim_order(out.dim_order().data(), out.dim()), + InvalidArgument, + out); + + ET_KERNEL_CHECK_MSG( + ctx, + mat.strides().back() == 1, + InvalidArgument, + out, + "input matrix that isn't contiguous in the last dimension is not supported!"); + + const auto last_dim_size = mat.sizes().back(); + const auto divisible_by_28 = last_dim_size % 28 == 0; + auto power_of_two_size = divisible_by_28 ? last_dim_size / 28 : last_dim_size; + ET_KERNEL_CHECK_MSG( + ctx, + (power_of_two_size & (power_of_two_size - 1)) == 0, + InvalidArgument, + out, + "This implementation requires power-of-2 (or power-of-2 * 28) input size in the last dimension!"); + + const auto log2_power_of_two_size = executorch::llvm::countTrailingZeros( + static_cast(power_of_two_size), + executorch::llvm::ZeroBehavior::ZB_Undefined); + + ET_SWITCH_FLOATH_TYPES(mat.scalar_type(), ctx, __func__, CTYPE, [&] { + const CTYPE* const mat_data = mat.const_data_ptr(); + CTYPE* const out_data = out.mutable_data_ptr(); + + std::memcpy(out_data, mat_data, mat.numel() * sizeof(CTYPE)); + + if (divisible_by_28) { + apply_over_dim( + [log2_power_of_two_size, out_data]( + const size_t size, const size_t stride, const size_t base) { + executorch::fast_hadamard_transform_28N( + out_data + base, log2_power_of_two_size); + }, + out, + out.dim() - 1); + } else { + apply_over_dim( + [log2_power_of_two_size, out_data]( + const size_t size, const size_t stride, const size_t base) { + executorch::fast_hadamard_transform( + out_data + base, log2_power_of_two_size); + }, + out, + out.dim() - 1); + } + }); + return out; +} +} // namespace native +} // namespace executor +} // namespace torch + +EXECUTORCH_LIBRARY( + llama, + "fast_hadamard_transform.out", + torch::executor::native::fast_hadamard_transform_out); diff --git a/extension/llm/custom_ops/op_fast_hadamard_transform.h b/extension/llm/custom_ops/op_fast_hadamard_transform.h new file mode 100644 index 00000000000..399401c3558 --- /dev/null +++ b/extension/llm/custom_ops/op_fast_hadamard_transform.h @@ -0,0 +1,25 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include + +namespace torch::executor::native { + +// Compute the fast Walsh-Hadamard transform +// (https://en.wikipedia.org/wiki/Fast_Walsh%E2%80%93Hadamard_transform) +// of mat along the last dimension (which must be contiguous). +// +// mat.sizes().back() is currently required to be either a power of +// two, or 28 * a power of two. 
+Tensor& fast_hadamard_transform_out( + RuntimeContext& ctx, + const Tensor& mat, + Tensor& out); +} // namespace torch::executor::native diff --git a/extension/llm/custom_ops/op_fast_hadamard_transform_aten.cpp b/extension/llm/custom_ops/op_fast_hadamard_transform_aten.cpp new file mode 100644 index 00000000000..d2e4c01d25f --- /dev/null +++ b/extension/llm/custom_ops/op_fast_hadamard_transform_aten.cpp @@ -0,0 +1,43 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include + +#include + +namespace torch::executor::native { +namespace { +Tensor& fast_hadamard_transform_out_no_context(const Tensor& vec, Tensor& out) { + exec_aten::RuntimeContext context; + return fast_hadamard_transform_out(context, vec, out); +} +at::Tensor fast_hadamard_transform_aten(const at::Tensor& vec) { + auto out = at::empty_like(vec); + WRAP_TO_ATEN(fast_hadamard_transform_out_no_context, 1) + (vec, out); + return out; +} +} // namespace +} // namespace torch::executor::native + +TORCH_LIBRARY_FRAGMENT(llama, m) { + m.def("fast_hadamard_transform(Tensor mat) -> Tensor"); + m.def( + "fast_hadamard_transform.out(Tensor mat, *, Tensor(a!) out) -> Tensor(a!)"); +} + +TORCH_LIBRARY_IMPL(llama, CompositeExplicitAutograd, m) { + m.impl( + "fast_hadamard_transform", + torch::executor::native::fast_hadamard_transform_aten); + m.impl( + "fast_hadamard_transform.out", + WRAP_TO_ATEN( + torch::executor::native::fast_hadamard_transform_out_no_context, 1)); +} diff --git a/extension/llm/custom_ops/op_sdpa.cpp b/extension/llm/custom_ops/op_sdpa.cpp index 0bb168bdadb..8afec156d53 100644 --- a/extension/llm/custom_ops/op_sdpa.cpp +++ b/extension/llm/custom_ops/op_sdpa.cpp @@ -224,7 +224,7 @@ void cpu_flash_attention( bool is_causal, const optional& attn_mask, const optional& scale, - bool is_with_kv_cache = false, + bool is_seq_at_dim_1 = false, const int64_t start_pos = 0) { (void)dropout_p; // Query (Batch x Num_heads x Q_seq_len x Dim_per_head) @@ -265,7 +265,7 @@ void cpu_flash_attention( int64_t kvSize = value.size(2); int64_t num_heads_kv = key.size(1); - if (is_with_kv_cache) { + if (is_seq_at_dim_1) { num_head = query.size(2); num_heads_kv = key.size(2); qSize = query.size(1); @@ -311,7 +311,7 @@ void cpu_flash_attention( int64_t qStrideH = strides[1]; int64_t qStrideM = strides[2]; - if (is_with_kv_cache) { + if (is_seq_at_dim_1) { qStrideH = strides[2]; qStrideM = strides[1]; } @@ -321,7 +321,7 @@ void cpu_flash_attention( int64_t kStrideH = strides[1]; int64_t kStrideN = strides[2]; - if (is_with_kv_cache) { + if (is_seq_at_dim_1) { kStrideH = strides[2]; kStrideN = strides[1]; } @@ -331,7 +331,7 @@ void cpu_flash_attention( int64_t vStrideH = strides[1]; int64_t vStrideN = strides[2]; - if (is_with_kv_cache) { + if (is_seq_at_dim_1) { vStrideH = strides[2]; vStrideN = strides[1]; } @@ -341,7 +341,7 @@ void cpu_flash_attention( int64_t oStrideH = strides[1]; int64_t oStrideM = strides[2]; - if (is_with_kv_cache) { + if (is_seq_at_dim_1) { oStrideH = strides[2]; oStrideM = strides[1]; } @@ -367,7 +367,7 @@ void cpu_flash_attention( int64_t qSlice = (qSize - 1) / qSplitSize + 1; #ifdef ET_USE_THREADPOOL int64_t num_thread = - torch::executorch::threadpool::get_threadpool()->get_thread_count(); + ::executorch::extension::threadpool::get_threadpool()->get_thread_count(); #else int64_t num_thread = 1; #endif @@ -700,10 +700,23 @@ 
void update_cache( const Tensor& cache, int64_t start_pos, int64_t seq_length) { // NOLINT: unused parameter 'seq_length' + // 1) Cache shape should be [bs, max_seq_len, num heads, head dim] + // 2) projected_value shape should be [bs, seq_len, num heads, head dim] + // 3) We're updating the cache with projected_value, at position start_pos + + ET_CHECK_MSG( + projected_value.size(0) == cache.size(0), + "projected_value batch size should be equal to the cache batch size."); ET_CHECK_MSG( - projected_value.size(0) == 1, - "projected_value must have batch size of 1"); - ET_CHECK_MSG(cache.size(0) == 1, "cache must have batch size of 1"); + projected_value.size(2) == cache.size(2), + "projected_value number of heads should be equal to the cache number of heads."); + ET_CHECK_MSG( + projected_value.size(3) == cache.size(3), + "projected_value embedding dimension should be equal to the cache embedding dimension."); + ET_CHECK_MSG( + projected_value.element_size() == cache.element_size(), + "projected_value data type size should be equal to the cache data type size."); + ET_CHECK_MSG( is_contiguous_dim_order( projected_value.dim_order().data(), projected_value.dim()), @@ -714,22 +727,37 @@ void update_cache( ET_CHECK_MSG(projected_value_data != nullptr, "projected_value data is null"); ET_CHECK_MSG(cache_data, "cache data is null"); - auto strides = cache.strides(); - exec_aten::StridesType seq_dim_stride = strides[1]; - exec_aten::SizesType pos_offset = start_pos * seq_dim_stride; - exec_aten::SizesType pos_offset_bytes = - pos_offset * projected_value.element_size(); - exec_aten::SizesType num_bytes = - projected_value.numel() * projected_value.element_size(); - // NOLINTNEXTLINE - std::memcpy( - (uint8_t*)cache_data + pos_offset_bytes, projected_value_data, num_bytes); + auto cache_strides = cache.strides(); + exec_aten::StridesType cache_batch_dim_stride = cache_strides[0]; + exec_aten::StridesType cache_seq_dim_stride = cache_strides[1]; + + auto value_strides = projected_value.strides(); + exec_aten::StridesType value_batch_dim_stride = value_strides[0]; + + exec_aten::SizesType num_bytes_to_copy = + (projected_value.numel() / projected_value.size(0)) * + projected_value.element_size(); + + for (int64_t batch_line = 0; batch_line < projected_value.size(0); + ++batch_line) { + exec_aten::SizesType cache_pos_offset = + (batch_line * cache_batch_dim_stride + + start_pos * cache_seq_dim_stride) * + cache.element_size(); + exec_aten::SizesType value_pos_offset = + (batch_line * value_batch_dim_stride) * cache.element_size(); + + std::memcpy( + (uint8_t*)cache_data + cache_pos_offset, + (uint8_t*)projected_value_data + value_pos_offset, + num_bytes_to_copy); + } } } // anonymous namespace Tensor& flash_attention_kernel_out( - KernelRuntimeContext& ctx, + RuntimeContext& ctx, const Tensor& query, const Tensor& key, const Tensor& value, @@ -810,28 +838,18 @@ Tensor& flash_attention_kernel_out( @param[in] start_pos: sequence position @param[in] seq_len: Seq length. e.g. seq_len dim of q_projected. 
*/ -Tensor& sdpa_with_kv_cache_out( - KernelRuntimeContext& ctx, - const Tensor& q_projected, - const Tensor& k_projected, - const Tensor& v_projected, - Tensor& key_cache, - Tensor& value_cache, +Tensor& custom_sdpa_out( + RuntimeContext& ctx, + const Tensor& q, + const Tensor& k, + const Tensor& v, const int64_t start_pos, - const int64_t seq_len, const optional& attn_mask, const double dropout_p, const bool is_causal, // @lint-ignore CLANGTIDY facebook-hte-ParameterMightThrowOnCopy const optional scale, Tensor& output) { - (void)ctx; - ET_KERNEL_CHECK( - ctx, - validate_cache_params(key_cache, value_cache, start_pos, seq_len), - InvalidArgument, - output); - ET_KERNEL_CHECK_MSG( ctx, !attn_mask.has_value() || !is_causal, @@ -839,29 +857,31 @@ Tensor& sdpa_with_kv_cache_out( output, "attn_mask and is_causal cannot be set at the same time"); - ET_CHECK_MSG(q_projected.dim() == 4, "query must be a 4D tensor"); - - update_cache(k_projected, key_cache, start_pos, seq_len); - update_cache(v_projected, value_cache, start_pos, seq_len); + ET_CHECK_MSG(q.dim() == 4, "query must be a 4D tensor"); - auto q_seq_len = q_projected.size(1); + const int64_t seq_len = q.size(1); + auto q_seq_len = q.size(1); + // Refactor the following into create_view util perhaps using + // TensorPtr std::array sliced_key_dim_order{ 0, 1, 2, 3}; std::array sliced_key_sizes; - sliced_key_sizes[0] = key_cache.size(0); + sliced_key_sizes[0] = k.size(0); sliced_key_sizes[1] = start_pos + seq_len; // key_cache.size(2); - sliced_key_sizes[2] = key_cache.size(2); - sliced_key_sizes[3] = key_cache.size(3); + sliced_key_sizes[2] = k.size(2); + sliced_key_sizes[3] = k.size(3); std::array sliced_key_strides; dim_order_to_stride_nocheck( sliced_key_sizes.data(), sliced_key_dim_order.data(), util::kKVDim, sliced_key_strides.data()); - void* key_cache_data = key_cache.mutable_data_ptr(); + // since the cache is sliced, the batch stride needs to stay the same. + sliced_key_strides[0] = k.strides()[0]; + void* key_cache_data = k.mutable_data_ptr(); TensorImpl k_impl = TensorImpl( - key_cache.scalar_type(), + k.scalar_type(), util::kKVDim, sliced_key_sizes.data(), key_cache_data, @@ -873,19 +893,21 @@ Tensor& sdpa_with_kv_cache_out( std::array sliced_value_dim_order{ 0, 1, 2, 3}; std::array sliced_value_sizes; - sliced_value_sizes[0] = value_cache.size(0); + sliced_value_sizes[0] = v.size(0); sliced_value_sizes[1] = start_pos + seq_len; // value_cache.size(2); - sliced_value_sizes[2] = value_cache.size(2); - sliced_value_sizes[3] = value_cache.size(3); + sliced_value_sizes[2] = v.size(2); + sliced_value_sizes[3] = v.size(3); std::array sliced_value_strides; dim_order_to_stride_nocheck( sliced_value_sizes.data(), sliced_value_dim_order.data(), util::kKVDim, sliced_value_strides.data()); - void* value_cache_data = value_cache.mutable_data_ptr(); + // since the cache is sliced, the batch stride needs to stay the same. + sliced_value_strides[0] = v.strides()[0]; + void* value_cache_data = v.mutable_data_ptr(); TensorImpl value_impl = TensorImpl( - value_cache.scalar_type(), + v.scalar_type(), util::kKVDim, sliced_value_sizes.data(), value_cache_data, @@ -894,63 +916,113 @@ Tensor& sdpa_with_kv_cache_out( TensorShapeDynamism::STATIC); Tensor sliced_value_cache(&value_impl); - // Is this true? 
- // Cant do this as is because the expectation of this kernel is - // that q, k, v are [B, num heads, seq length, head dim] - // and the cache is [B, max seq len, num heads, head dim] - // and q, k, v are all [B, seq length, num heads, head dim] - ET_KERNEL_CHECK( ctx, - resize_tensor(output, q_projected.sizes()) == Error::Ok, + resize_tensor(output, q.sizes()) == Error::Ok, InvalidArgument, output); // TODO(task): replace the template param selection logic // with whatever apprpriately makes more sense for - ET_SWITCH_FLOAT_TYPES( - q_projected.scalar_type(), ctx, "flash_attention", CTYPE, [&] { - // TODO we need to re-evaluate this for ARM CPUs - // And there can be many so instead of templatizing - // we might consider another appraoch - if (q_seq_len >= 768) { - cpu_flash_attention( - output, - q_projected, - sliced_key_cache, - sliced_value_cache, - dropout_p, - is_causal, - attn_mask, - scale, - true, - start_pos); - } else if (q_seq_len >= 192) { - cpu_flash_attention( - output, - q_projected, - sliced_key_cache, - sliced_value_cache, - dropout_p, - is_causal, - attn_mask, - scale, - true, - start_pos); - } else { - cpu_flash_attention( - output, - q_projected, - sliced_key_cache, - sliced_value_cache, - dropout_p, - is_causal, - attn_mask, - scale, - true, - start_pos); - } - }); + ET_SWITCH_FLOAT_TYPES(q.scalar_type(), ctx, "flash_attention", CTYPE, [&] { + // TODO we need to re-evaluate this for ARM CPUs + // And there can be many so instead of templatizing + // we might consider another appraoch + if (q_seq_len >= 768) { + cpu_flash_attention( + output, + q, + sliced_key_cache, + sliced_value_cache, + dropout_p, + is_causal, + attn_mask, + scale, + true, /* is_seq_at_dim_1 */ + start_pos); + } else if (q_seq_len >= 192) { + cpu_flash_attention( + output, + q, + sliced_key_cache, + sliced_value_cache, + dropout_p, + is_causal, + attn_mask, + scale, + true, /* is_seq_at_dim_1 */ + start_pos); + } else { + cpu_flash_attention( + output, + q, + sliced_key_cache, + sliced_value_cache, + dropout_p, + is_causal, + attn_mask, + scale, + true, /* is_seq_at_dim_1 */ + start_pos); + } + }); + return output; +} +/* + Input params + @param[in] q_projected Projected query with query weights. + Format [n_layers, batch size, seq_len, num heads, head dim] + @param[in] k_projected Projected query with key weights. + Format [n_layers, batch size, seq_len, num heads, head dim] + @param[in] v_projected Projected query with value weights. + Format [n_layers, batch size, seq_len, num heads, head dim] + @param[in] key_cache Cache of previous k_projected. + Format [n_layers, batch size, max_seq_len, num heads, head dim] + @param[in] key_cache Cache of previous v_projected. + Format [n_layers, batch size, max_seq_len, num heads, head dim] + .... + @param[in] start_pos: sequence position + @param[in] seq_len: Seq length. e.g. seq_len dim of q_projected. 
+*/ +Tensor& sdpa_with_kv_cache_out( + KernelRuntimeContext& ctx, + const Tensor& q_projected, + const Tensor& k_projected, + const Tensor& v_projected, + Tensor& key_cache, + Tensor& value_cache, + const int64_t start_pos, + const int64_t seq_len, + const optional& attn_mask, + const double dropout_p, + const bool is_causal, + // @lint-ignore CLANGTIDY facebook-hte-ParameterMightThrowOnCopy + const optional scale, + Tensor& output) { + (void)ctx; + ET_KERNEL_CHECK( + ctx, + validate_cache_params(key_cache, value_cache, start_pos, seq_len), + InvalidArgument, + output); + + ET_CHECK_MSG(q_projected.dim() == 4, "query must be a 4D tensor"); + + update_cache(k_projected, key_cache, start_pos, seq_len); + update_cache(v_projected, value_cache, start_pos, seq_len); + + custom_sdpa_out( + ctx, + q_projected, + key_cache, + value_cache, + start_pos, + attn_mask, + dropout_p, + is_causal, + scale, + output); + return output; } } // namespace native @@ -961,3 +1033,8 @@ EXECUTORCH_LIBRARY( llama, "sdpa_with_kv_cache.out", torch::executor::native::sdpa_with_kv_cache_out); + +EXECUTORCH_LIBRARY( + llama, + "custom_sdpa.out", + torch::executor::native::custom_sdpa_out); diff --git a/extension/llm/custom_ops/op_sdpa.h b/extension/llm/custom_ops/op_sdpa.h index ce969b013d2..bc2202b9bd8 100644 --- a/extension/llm/custom_ops/op_sdpa.h +++ b/extension/llm/custom_ops/op_sdpa.h @@ -31,6 +31,19 @@ Tensor& sdpa_with_kv_cache_out( const optional scale, Tensor& output); +Tensor& custom_sdpa_out( + RuntimeContext& ctx, + const Tensor& q, + const Tensor& k, + const Tensor& v, + const int64_t start_pos, + const optional& attn_mask, + const double dropout_p, + const bool is_causal, + // @lint-ignore CLANGTIDY facebook-hte-ParameterMightThrowOnCopy + const optional scale, + Tensor& output); + Tensor& flash_attention_kernel_out( KernelRuntimeContext& ctx, const Tensor& query, diff --git a/extension/llm/custom_ops/op_sdpa_aot.cpp b/extension/llm/custom_ops/op_sdpa_aot.cpp index 6db8a0ed7cb..b129a5a3cde 100644 --- a/extension/llm/custom_ops/op_sdpa_aot.cpp +++ b/extension/llm/custom_ops/op_sdpa_aot.cpp @@ -9,6 +9,7 @@ #include #include #include +#include #include @@ -16,7 +17,6 @@ namespace torch { namespace executor { namespace native { - Tensor& sdpa_with_kv_cache_out_no_context( const Tensor& q_projected, const Tensor& k_projected, @@ -60,11 +60,11 @@ at::Tensor sdpa_with_kv_cache_aten( const int64_t seq_len, // @lint-ignore CLANGTIDY facebook-hte-ConstantArgumentPassByValue // @lint-ignore CLANGTIDY facebook-hte-ParameterMightThrowOnCopy - const c10::optional attn_mask, + const std::optional attn_mask, const double dropout_p, const bool is_causal, // @lint-ignore CLANGTIDY facebook-hte-ParameterMightThrowOnCopy - const c10::optional scale) { + const std::optional scale) { auto output = at::empty_like(q_projected); WRAP_TO_ATEN(sdpa_with_kv_cache_out_no_context, 11) (q_projected, @@ -82,11 +82,76 @@ at::Tensor sdpa_with_kv_cache_aten( return output; } +Tensor& custom_sdpa_out_no_context( + const Tensor& q, + const Tensor& k, + const Tensor& v, + const int64_t start_pos, + // @lint-ignore CLANGTIDY facebook-hte-ConstantArgumentPassByValue + // @lint-ignore CLANGTIDY facebook-hte-ParameterMightThrowOnCopy + const optional attn_mask, + const double dropout_p, + const bool is_causal, + // @lint-ignore CLANGTIDY facebook-hte-ParameterMightThrowOnCopy + const optional scale, + Tensor& output) { + exec_aten::RuntimeContext context{}; + return torch::executor::native::custom_sdpa_out( + context, + q, + k, + v, + start_pos, 
+ attn_mask, + dropout_p, + is_causal, + scale, + output); +} + +at::Tensor custom_sdpa_aten( + const at::Tensor& q, + const at::Tensor& k, + const at::Tensor& v, + const int64_t start_pos, + // @lint-ignore CLANGTIDY facebook-hte-ConstantArgumentPassByValue + // @lint-ignore CLANGTIDY facebook-hte-ParameterMightThrowOnCopy + const c10::optional attn_mask, + const double dropout_p, + const bool is_causal, + // @lint-ignore CLANGTIDY facebook-hte-ParameterMightThrowOnCopy + const c10::optional scale) { + auto output = at::empty_like(q); + WRAP_TO_ATEN(custom_sdpa_out_no_context, 8) + (q, k, v, start_pos, attn_mask, dropout_p, is_causal, scale, output); + return output; +} + +Tensor& update_quantized_cache_out_no_context( + const Tensor& value, + Tensor& cache, + const int64_t start_pos, + Tensor& output) { + exec_aten::RuntimeContext context{}; + return torch::executor::native::update_quantized_cache_out( + context, value, cache, start_pos, output); +} + +at::Tensor update_quantized_cache_aten( + const at::Tensor& value, + at::Tensor& cache, + const int64_t start_pos) { + auto output = at::empty({1}); + WRAP_TO_ATEN(update_quantized_cache_out_no_context, 3) + (value, cache, start_pos, output); + return output; +} + } // namespace native } // namespace executor } // namespace torch -TORCH_LIBRARY(llama, m) { +TORCH_LIBRARY_FRAGMENT(llama, m) { m.def( "sdpa_with_kv_cache(Tensor query, Tensor key, Tensor value, Tensor(a!) key_cache, " "Tensor(b!) value_cache, SymInt start_pos, SymInt seq_len, Tensor? attn_mask=None, " @@ -95,8 +160,23 @@ TORCH_LIBRARY(llama, m) { "sdpa_with_kv_cache.out(Tensor query, Tensor key, Tensor value, Tensor(a!) key_cache, " "Tensor(b!) value_cache, SymInt start_pos, SymInt seq_len, Tensor? attn_mask=None, " "float drpout_p=0.0, bool is_causal=False, float? scale=None, *, Tensor(c!) out) -> Tensor(c!)"); + m.def( + "custom_sdpa(Tensor query, Tensor key, Tensor value, SymInt start_pos, " + "Tensor? attn_mask=None, float drpout_p=0.0, bool is_causal=False, " + "float? scale=None) -> Tensor"); + m.def( + "custom_sdpa.out(Tensor query, Tensor key, Tensor value, SymInt start_pos, " + "Tensor? attn_mask=None, float drpout_p=0.0, bool is_causal=False, " + "float? scale=None, *, Tensor(a!) out) -> Tensor(a!)"); + m.def( + "update_quantized_cache(Tensor value, Tensor(a!) cache, " + "SymInt start_pos) -> Tensor"); + m.def( + "update_quantized_cache.out(Tensor value, Tensor(a!) cache, " + "SymInt start_pos, *, Tensor(b!) 
out) -> Tensor(b!)"); } +// TODO: Rename this file to op_custom_ops_aot.cpp TORCH_LIBRARY_IMPL(llama, CompositeExplicitAutograd, m) { m.impl( "sdpa_with_kv_cache", torch::executor::native::sdpa_with_kv_cache_aten); @@ -104,4 +184,15 @@ TORCH_LIBRARY_IMPL(llama, CompositeExplicitAutograd, m) { "sdpa_with_kv_cache.out", WRAP_TO_ATEN( torch::executor::native::sdpa_with_kv_cache_out_no_context, 11)); + m.impl("custom_sdpa", torch::executor::native::custom_sdpa_aten); + m.impl( + "custom_sdpa.out", + WRAP_TO_ATEN(torch::executor::native::custom_sdpa_out_no_context, 8)); + m.impl( + "update_quantized_cache", + torch::executor::native::update_quantized_cache_aten); + m.impl( + "update_quantized_cache.out", + WRAP_TO_ATEN( + torch::executor::native::update_quantized_cache_out_no_context, 3)); } diff --git a/extension/llm/custom_ops/op_tile_crop_aot.cpp b/extension/llm/custom_ops/op_tile_crop_aot.cpp new file mode 100644 index 00000000000..1755e543ebe --- /dev/null +++ b/extension/llm/custom_ops/op_tile_crop_aot.cpp @@ -0,0 +1,50 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include +#include + +#include + +namespace torch { +namespace executor { + +namespace native { + +Tensor& +tile_crop_out_no_context(const Tensor& input, int64_t tile_size, Tensor& out) { + exec_aten::RuntimeContext context{}; + return tile_crop_out_impl(context, input, tile_size, out); +} + +at::Tensor tile_crop_aten(const at::Tensor& input, int64_t tile_size) { + // max_num_tiles = 4, num_channels = 3. + auto output = at::empty({4, 3, tile_size, tile_size}); + + WRAP_TO_ATEN(torch::executor::native::tile_crop_out_no_context, 2) + (input, tile_size, output); + return output; +} + +} // namespace native +} // namespace executor +} // namespace torch + +TORCH_LIBRARY(preprocess, m) { + m.def("tile_crop(Tensor input, int tile_size) -> Tensor"); + m.def( + "tile_crop.out(Tensor input, int tile_size, *, Tensor(a!) out) -> Tensor(a!)"); +} + +TORCH_LIBRARY_IMPL(preprocess, CompositeExplicitAutograd, m) { + m.impl("tile_crop", torch::executor::native::tile_crop_aten); + m.impl( + "tile_crop.out", + WRAP_TO_ATEN(torch::executor::native::tile_crop_out_no_context, 2)); +} diff --git a/extension/llm/custom_ops/op_tile_crop_aot.py b/extension/llm/custom_ops/op_tile_crop_aot.py new file mode 100644 index 00000000000..701aabc441c --- /dev/null +++ b/extension/llm/custom_ops/op_tile_crop_aot.py @@ -0,0 +1,38 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import logging +from pathlib import Path + +import torch + +try: + tile_crop = torch.ops.preprocess.tile_crop.default + assert tile_crop is not None +except: + libs = list(Path(__file__).parent.resolve().glob("libcustom_ops_aot_lib.*")) + assert len(libs) == 1, f"Expected 1 library but got {len(libs)}" + logging.info(f"Loading custom ops library: {libs[0]}") + torch.ops.load_library(libs[0]) + tile_crop = torch.ops.preprocess.tile_crop.default + assert tile_crop is not None + +preprocess_ops_lib = torch.library.Library("preprocess", "IMPL") + +MAX_NUM_TILES = 4 + + +# Register meta kernel to prevent export tracing into the tile_crop impl. 
+@torch.library.register_fake("preprocess::tile_crop") +def tile_crop(output: torch.Tensor, tile_size: int) -> torch.Tensor: + # Returned tensor is of size [n, 3, 224, 224], where n = number of tiles. + # Use an unbacked symint to create an upper-bounded dynamic shape output. + # Otherwise, output is set to a static shape, and we can only output + # tensors of shape [MAX_NUM_TILES, 3, 224, 224]. + ctx = torch._custom_ops.get_ctx() + s0 = ctx.create_unbacked_symint() + torch._constrain_as_size(s0, 0, MAX_NUM_TILES) + return torch.empty([s0, output.size(0), tile_size, tile_size]) diff --git a/extension/llm/custom_ops/op_update_quantized_cache.cpp b/extension/llm/custom_ops/op_update_quantized_cache.cpp new file mode 100644 index 00000000000..54ec999cb8f --- /dev/null +++ b/extension/llm/custom_ops/op_update_quantized_cache.cpp @@ -0,0 +1,143 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include + +#include +// @lint-ignore CLANGTIDY facebook-unused-include-check +#include + +#include + +namespace torch { +namespace executor { + +namespace native { + +namespace { +bool validate_cache_params( + const Tensor& quantized_value, + const Tensor& quantized_cache, + int64_t start_pos, + int64_t seq_length) { + ET_LOG_MSG_AND_RETURN_IF_FALSE( + quantized_cache.dim() == 4, "quantized cache must be a 4D tensor"); + + ET_LOG_MSG_AND_RETURN_IF_FALSE( + quantized_value.dim() == 4, "quantized_value must be a 4D tensor"); + + ET_LOG_MSG_AND_RETURN_IF_FALSE( + start_pos < quantized_cache.size(1), + "start_pos must be less than cache size at dim 1"); + + ET_LOG_MSG_AND_RETURN_IF_FALSE( + (start_pos + seq_length) <= quantized_cache.size(1), + "start_post + seq_length must be less than max seq length supported by cache." + "start pos: %" PRId64 ", seq_length: %" PRId64 + "." 
+ "cache size: %zd", + start_pos, + seq_length, + quantized_cache.size(1)); + + // Make sure they are in contiguous dim order + ET_LOG_MSG_AND_RETURN_IF_FALSE( + is_contiguous_dim_order( + quantized_cache.dim_order().data(), quantized_cache.dim()), + "quantized cache must be in contiguous dim order"); + + ET_LOG_MSG_AND_RETURN_IF_FALSE( + is_contiguous_dim_order( + quantized_value.dim_order().data(), quantized_value.dim()), + "quantized value must be in contiguous dim order"); + + return true; +} +} // anonymous namespace + +Tensor& update_quantized_cache_out( + RuntimeContext& ctx, + const Tensor& value, + Tensor& cache, + const int64_t start_pos, + Tensor& output) { + (void)ctx; + int64_t seq_len = value.size(1); + ET_KERNEL_CHECK( + ctx, + validate_cache_params(value, cache, start_pos, seq_len), + InvalidArgument, + output); + + ET_CHECK_MSG( + value.size(0) == cache.size(0), + "projected_value batch size should be equal to the cache batch size."); + ET_CHECK_MSG( + value.size(2) == cache.size(2), + "projected_value number of heads should be equal to the cache number of heads."); + ET_CHECK_MSG( + value.size(3) == cache.size(3), + "projected_value embedding dimension should be equal to the cache embedding dimension."); + ET_CHECK_MSG( + value.element_size() == cache.element_size(), + "projected_value data type size should be equal to the cache data type size."); + + ET_CHECK_MSG( + is_contiguous_dim_order(value.dim_order().data(), value.dim()), + "projected value must be in contiguous dim order"); + ET_CHECK_MSG( + is_contiguous_dim_order(cache.dim_order().data(), cache.dim()), + "projected value must be in contiguous dim order"); + + const void* value_data = value.const_data_ptr(); + void* cache_data = cache.mutable_data_ptr(); + + ET_CHECK_MSG(value_data, "projected_value data is null"); + ET_CHECK_MSG(cache_data, "cache data is null"); + + auto cache_strides = cache.strides(); + exec_aten::StridesType cache_batch_dim_stride = cache_strides[0]; + exec_aten::StridesType cache_seq_dim_stride = cache_strides[1]; + + auto value_strides = value.strides(); + exec_aten::StridesType value_batch_dim_stride = value_strides[0]; + + exec_aten::SizesType num_bytes_to_copy = + (value.numel() / value.size(0)) * value.element_size(); + + for (int64_t batch_line = 0; batch_line < value.size(0); ++batch_line) { + exec_aten::SizesType cache_pos_offset = + (batch_line * cache_batch_dim_stride + + start_pos * cache_seq_dim_stride) * + cache.element_size(); + exec_aten::SizesType value_pos_offset = + (batch_line * value_batch_dim_stride) * cache.element_size(); + + std::memcpy( + (uint8_t*)cache_data + cache_pos_offset, + (uint8_t*)value_data + value_pos_offset, + num_bytes_to_copy); + } + + // Noone uses output. Just a placeholder. + return output; +} +} // namespace native +} // namespace executor +} // namespace torch + +// Really this is just an inplace tensor update op +// which makes assumption on the rank of a tensor, +// and the dim order (memory layout) of the tensor. +// Furthermore assumes that the indexing is along +// sequence dimension (dim 1) of the tensor. +// In later diffs will rename this to update_cache. 
+EXECUTORCH_LIBRARY( + llama, + "update_quantized_cache.out", + torch::executor::native::update_quantized_cache_out); diff --git a/extension/llm/custom_ops/op_update_quantized_cache.h b/extension/llm/custom_ops/op_update_quantized_cache.h new file mode 100644 index 00000000000..9cd8090839a --- /dev/null +++ b/extension/llm/custom_ops/op_update_quantized_cache.h @@ -0,0 +1,26 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include + +namespace torch { +namespace executor { + +namespace native { + +Tensor& update_quantized_cache_out( + RuntimeContext& ctx, + const Tensor& value, + Tensor& cache, + const int64_t start_pos, + Tensor& output); +} // namespace native +} // namespace executor +} // namespace torch diff --git a/extension/llm/custom_ops/sdpa_with_kv_cache.py b/extension/llm/custom_ops/sdpa_with_kv_cache.py index 7673f64d924..85021266b59 100644 --- a/extension/llm/custom_ops/sdpa_with_kv_cache.py +++ b/extension/llm/custom_ops/sdpa_with_kv_cache.py @@ -17,9 +17,12 @@ from torch.library import impl +# TODO rename this file to custom_ops_meta_registration.py try: op = torch.ops.llama.sdpa_with_kv_cache.default assert op is not None + op2 = torch.ops.llama.fast_hadamard_transform.default + assert op2 is not None except: libs = list(Path(__file__).parent.resolve().glob("libcustom_ops_aot_lib.*")) assert len(libs) == 1, f"Expected 1 library but got {len(libs)}" @@ -27,6 +30,8 @@ torch.ops.load_library(libs[0]) op = torch.ops.llama.sdpa_with_kv_cache.default assert op is not None + op2 = torch.ops.llama.fast_hadamard_transform.default + assert op2 is not None custom_ops_lib = torch.library.Library("llama", "IMPL") @@ -126,3 +131,91 @@ def sdpa_with_kv_cache_meta( ) return torch.empty_like(query) + + +@impl(custom_ops_lib, "fast_hadamard_transform", "Meta") +def fast_hadamard_transform_meta(mat): + # assert(mat.strides[-1] == 1, "input matrix must be contiguous in the last dimension!") + # assert(mat.shape[-1] == 128 or mat.shape[-1] == 14336, "unexpected input size for llama3 demo!") + # assert(mat.is_contiguous(), "input matrix must be contiguous currently!") + return torch.empty_like(mat) + + +@impl(custom_ops_lib, "custom_sdpa", "Meta") +def custom_sdpa( + query, + key_cache, + value_cache, + start_pos, + attn_mask=None, + drpout_p=0.0, + is_causal=False, + scale=None, +): + seq_len = query.size(1) + _validate_params( + query, + key_cache, + value_cache, + key_cache, + value_cache, + start_pos, + seq_len, + attn_mask, + drpout_p, + is_causal, + scale, + ) + + return torch.empty_like(query) + + +def _validate_update_cache_params( + value, + cache, + start_pos, +): + seq_len = value.size(1) + assert ( + value.dim() == 4 + ), f"Expected value to be 4 dimensional but got {value.dim()} dimensions." 
+ + assert ( + value.dtype == cache.dtype + ), f"Expected value and cache to be of the same type but got value type {value.dtype} and cache type {cache.dtype}" + + for i in [0, 2, 3]: + assert value.size(i) == cache.size( + i + ), f"Expected value and cache to have same size in dimension {i} but got {value.size(i)} and {cache.size(i)}" + + torch._check_is_size(start_pos) + # Setting to arbitrary limit of 256 for now since there is no way + # to plumb this information from model config + torch._check(start_pos < cache.size(1)) + assert start_pos < cache.size( + 1 + ), f"Start position {start_pos} must be less than sequence length {cache.size(1)}" + + torch._check((start_pos + seq_len) < cache.size(1)) + assert (start_pos + seq_len) < cache.size( + 1 + ), f"Start position + length = {start_pos + seq_len} must be less than sequence length {cache.size(1)}" + + +@impl(custom_ops_lib, "update_quantized_cache", "Meta") +def update_quantized_cache_meta( + value, + cache, + start_pos, +): + _validate_update_cache_params( + value, + cache, + start_pos, + ) + + # Update cache doesnt really return anything but I dont know a better + # workaround. Should we just return cache instead? But I am afraid that + # will result in extra memory allocation + return torch.empty((1,), dtype=value.dtype, device="meta") diff --git a/extension/llm/custom_ops/spinquant/README.md b/extension/llm/custom_ops/spinquant/README.md new file mode 100644 index 00000000000..e946e0ee60e --- /dev/null +++ b/extension/llm/custom_ops/spinquant/README.md @@ -0,0 +1,16 @@ +# SpinQuant + +This is an implementation of the [Fast Hadamard +Transform](https://en.wikipedia.org/wiki/Fast_Walsh–Hadamard_transform) +as used in [SpinQuant](https://arxiv.org/abs/2405.16406) (for the R3 +and R4 matrices), [QuaRot](https://arxiv.org/abs/2404.00456), and +[Quip#](https://arxiv.org/pdf/2402.04396). We follow those papers' +method (as implemented in +https://github.com/Dao-AILab/fast-hadamard-transform/) for extending +the transform to non-power-of-two input sizes. CUDA is not considered +because https://github.com/Dao-AILab/fast-hadamard-transform/ is +already available. + +The intended long-term destination for this code is pytorch/ao; it is +in ExecuTorch temporarily until we get C++ dependency from ExecuTorch +on torchao figured out. diff --git a/extension/llm/custom_ops/spinquant/TARGETS b/extension/llm/custom_ops/spinquant/TARGETS new file mode 100644 index 00000000000..0a42614a385 --- /dev/null +++ b/extension/llm/custom_ops/spinquant/TARGETS @@ -0,0 +1,5 @@ +load(":targets.bzl", "define_common_targets") + +oncall("executorch") + +define_common_targets() diff --git a/extension/llm/custom_ops/spinquant/fast_hadamard_transform.cpp b/extension/llm/custom_ops/spinquant/fast_hadamard_transform.cpp new file mode 100644 index 00000000000..dd34e8da852 --- /dev/null +++ b/extension/llm/custom_ops/spinquant/fast_hadamard_transform.cpp @@ -0,0 +1,101 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include "fast_hadamard_transform.h" + +#include + +namespace executorch { +namespace { +// Normalization step: divide by sqrt(1 << log2_vec_size). Similar +// to fast_sqrt above, if N is even, then the maximum-precision way +// to do this is right-shift by log2_vec_size / 2. 
If N is odd, we +// still do the right-shift, and then we have an extra division by +// sqrt(2) that we perform by making use of a sufficiently accurate +// rational approximation. Our initial idea was to divide by sqrt(2) +// by adjusting the quantization scale, but that would cause this +// function to tend to increase the magnitude of the elements of +// vec, which would result in clipping and therefore accuracy +// loss, especially compounded over 30+ transformer layers. +void quantized_normalize_after_fht( + const int32_t* tmp, + int16_t* out, + int log2_vec_size, + int vec_size) { + const int log2_sqrt_vec_size = log2_vec_size / 2; + constexpr int32_t qmin = -(1 << 15) + 1; + constexpr int32_t qmax = -qmin; + if (log2_vec_size % 2 != 0) { + // 408 / 577 - 1.0 / sqrt(2) ~= 1.062e-06, which should be close enough. + static const int32_t inv_sqrt_2_numerator = 408; + static const int32_t inv_sqrt_2_denominator = 577; + for (int ii = 0; ii < vec_size; ++ii) { + const auto val_over_sqrt_vec_size = + (tmp[ii] * inv_sqrt_2_numerator / inv_sqrt_2_denominator) >> + log2_sqrt_vec_size; + out[ii] = std::clamp(val_over_sqrt_vec_size, qmin, qmax); + } + } else { + for (int ii = 0; ii < vec_size; ++ii) { + out[ii] = std::clamp(tmp[ii] >> log2_sqrt_vec_size, qmin, qmax); + } + } +} +} // namespace + +void fast_hadamard_transform_symmetric_quantized_s16( + int16_t* vec, + int log2_vec_size) { + if (log2_vec_size == 0) { + return; + } + + const int vec_size = 1 << log2_vec_size; + // We perform log2_vec_size rounds where each round's maximum output + // is at most double the maximum input, so we can at most multiply + // the maximum input by vec_size. Performing intermediate arithmetic + // in 32-bit precision should prevent overflow, since 16 + + // log2_vec_size should be much less than 32. + auto tmp = std::make_unique<int32_t[]>(vec_size); + std::copy(vec, vec + vec_size, tmp.get()); + + // Per the function-level comment above, we can ignore the + // quantization scale, so we just delegate to the usual unnormalized + // implementation. + // NOTE: if we need this to be fast on CPU, we can use FFHT to + // generate fht_uint32 similar to fht_float. + internal::fast_hadamard_transform_unnormalized_simple_impl( + tmp.get(), log2_vec_size); + + quantized_normalize_after_fht(tmp.get(), vec, log2_vec_size, vec_size); +} + +void fast_hadamard_transform_symmetric_quantized_s16_28N( + int16_t* vec, + int log2_vec_size) { + if (log2_vec_size == 0) { + return; + } + const int vec_size = (1 << log2_vec_size); + + auto tmp = std::make_unique<int32_t[]>(vec_size * 28); + std::copy(vec, vec + vec_size * 28, tmp.get()); + + for (int ii = 0; ii < 28; ++ii) { + internal::fast_hadamard_transform_unnormalized_simple_impl( + &tmp[ii * vec_size], log2_vec_size); + } + + for (int ii = 0; ii < vec_size; ++ii) { + hadamard_mult_28_strided(&tmp[ii], vec_size); + } + + quantized_normalize_after_fht(tmp.get(), vec, log2_vec_size, vec_size * 28); +} + +} // namespace executorch diff --git a/extension/llm/custom_ops/spinquant/fast_hadamard_transform.h b/extension/llm/custom_ops/spinquant/fast_hadamard_transform.h new file mode 100644 index 00000000000..712b7787683 --- /dev/null +++ b/extension/llm/custom_ops/spinquant/fast_hadamard_transform.h @@ -0,0 +1,144 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +// (c) Meta Platforms, Inc. and affiliates.
+#pragma once + +#include +#include +#include +#include + +#include + +#include "fast_hadamard_transform_special.h" + +namespace executorch { +namespace internal { + +// Square root of 1 << log2_n. +template <typename T> +T fast_sqrt_of_power_of_2(int log2_n) { + // The square root of 2**N is, by definition, 2**(N/2), which is + // trivial to compute for even N using a left shift. + // + // For odd N, 2**(N/2) = 2**(floor(N/2) + 1/2) + // = 2**(floor(N/2)) * (2 ** (1/2)) + // = 2**(floor(N/2)) * sqrt(2) + // which is again fast to compute. + return T(1 << (log2_n / 2)) * ((log2_n % 2) ? T(std::sqrt(2)) : T(1)); +} + +template <typename T> +void normalize_after_fht(T* out, int log2_vec_size) { + const T inv_sqrt = T(1) / fast_sqrt_of_power_of_2<T>(log2_vec_size); + const int vec_size = 1 << log2_vec_size; + for (int ii = 0; ii < vec_size; ++ii) { + out[ii] *= inv_sqrt; + } +} + +template <typename T> +void fast_hadamard_transform_unnormalized_simple_impl( + T* vec, + int log2_vec_size) { + // NOTE: If you're here because you're profiling a model and this is + // slow, consider updating FFHT to generate efficient assembly for + // your data type! + if (log2_vec_size == 0) { + return; + } + + int step = 1; + const auto vec_size = 1 << log2_vec_size; + while (step < vec_size) { + for (int ii = 0; ii < vec_size; ii += step * 2) { + for (int jj = ii; jj < ii + step; ++jj) { + auto x = vec[jj]; + auto y = vec[jj + step]; + vec[jj] = x + y; + vec[jj + step] = x - y; + } + } + step *= 2; + } +} + +template <typename T> +void fast_hadamard_transform_simple_impl(T* vec, int log2_vec_size) { + fast_hadamard_transform_unnormalized_simple_impl(vec, log2_vec_size); + normalize_after_fht(vec, log2_vec_size); +} + +inline void fast_hadamard_transform_ffht_impl(float* vec, int log2_vec_size) { +#if defined(__aarch64__) || defined(__x86_64__) + if (log2_vec_size <= 0) { + return; + } + + fht_float(vec, log2_vec_size); + normalize_after_fht(vec, log2_vec_size); +#else + fast_hadamard_transform_simple_impl(vec, log2_vec_size); +#endif +} + +} // namespace internal + +// Compute the fast Walsh-Hadamard transform +// (https://en.wikipedia.org/wiki/Fast_Walsh%E2%80%93Hadamard_transform) +// of vec, which must be of length (1 << log2_vec_size). +template <typename T> +void fast_hadamard_transform(T* vec, int log2_vec_size) { + if constexpr (std::is_same_v<T, float>) { + internal::fast_hadamard_transform_ffht_impl(vec, log2_vec_size); + } else { + internal::fast_hadamard_transform_simple_impl(vec, log2_vec_size); + } +} + +// Compute a quantized fast Walsh-Hadamard transform of vec, which +// must be of length (1 << log2_vec_size) and symmetrically quantized. +// +// Note that we do not need to know the quantization scale, because +// the Fast Hadamard transform is a series of additions and +// subtractions with a final multiplication step, and we have the +// following trivial identities: +// +// scale * a + scale * b = scale * (a + b) (addition doesn't need the scale) +// alpha * (scale * a) = scale * (alpha * a) (multiplication doesn't need the +// scale) +void fast_hadamard_transform_symmetric_quantized_s16( + int16_t* vec, + int log2_vec_size); + +// Like fast_hadamard_transform, but vec must be of length 28 * (1 << +// log2_vec_size) and the transform is computed by interpreting vec as +// a (28, 1 << log2_vec_size) matrix and performing 28 FHTs, followed +// by (1 << log2_vec_size) multiplications by a particular Hadamard +// matrix of size 28x28 (see special_hadamard_code_gen.py for the +// exact matrix).
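+//
+// Illustrative usage sketch (an example under stated assumptions, not an
+// additional API guarantee). It assumes the row-major
+// (28, 1 << log2_vec_size) interpretation described above and the
+// 14336 = 28 * 512 activation size that the llama3 demo check in
+// sdpa_with_kv_cache.py refers to:
+//
+//   std::vector<float> activations(28 * 512);  // one 14336-element row
+//   // ... fill activations ...
+//   executorch::fast_hadamard_transform_28N(activations.data(), 9);  // 1 << 9 == 512
+//
+// Up to normalization, this applies the Kronecker product of the 28x28
+// Hadamard matrix with the length-(1 << log2_vec_size) Hadamard transform
+// to the flattened vector.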
+template +void fast_hadamard_transform_28N(T* vec, int log2_vec_size) { + const int vec_size = (1 << log2_vec_size); + for (int ii = 0; ii < 28; ++ii) { + fast_hadamard_transform(&vec[ii * vec_size], log2_vec_size); + } + for (int ii = 0; ii < vec_size; ++ii) { + hadamard_mult_28_strided(&vec[ii], vec_size); + } +} + +// We don't need the quantization scale; see the function-level +// comment on fast_hadamard_transform_symmetric_quantized_s16 for +// details. +void fast_hadamard_transform_symmetric_quantized_s16_28N( + int16_t* vec, + int log2_vec_size); + +} // namespace executorch diff --git a/extension/llm/custom_ops/spinquant/fast_hadamard_transform_special.h b/extension/llm/custom_ops/spinquant/fast_hadamard_transform_special.h new file mode 100644 index 00000000000..ca5a8d61e73 --- /dev/null +++ b/extension/llm/custom_ops/spinquant/fast_hadamard_transform_special.h @@ -0,0 +1,241 @@ +// @generated by special_hadamard_code_gen.py strided_cpu + + +#pragma once + + +template +void hadamard_mult_12_strided(T* input, int stride) { + T x[12]; + T out[12]; + x[0] = input[0 * stride]; + x[1] = input[1 * stride]; + x[2] = input[2 * stride]; + x[3] = input[3 * stride]; + x[4] = input[4 * stride]; + x[5] = input[5 * stride]; + x[6] = input[6 * stride]; + x[7] = input[7 * stride]; + x[8] = input[8 * stride]; + x[9] = input[9 * stride]; + x[10] = input[10 * stride]; + x[11] = input[11 * stride]; + out[0] = + x[0] - x[1] - x[2] - x[3] - x[4] - x[5] - x[6] - x[7] - x[8] - x[9] - x[10] - x[11]; + out[1] = + x[0] + x[1] - x[2] + x[3] - x[4] - x[5] - x[6] + x[7] + x[8] + x[9] - x[10] + x[11]; + out[2] = + x[0] + x[1] + x[2] - x[3] + x[4] - x[5] - x[6] - x[7] + x[8] + x[9] + x[10] - x[11]; + out[3] = + x[0] - x[1] + x[2] + x[3] - x[4] + x[5] - x[6] - x[7] - x[8] + x[9] + x[10] + x[11]; + out[4] = + x[0] + x[1] - x[2] + x[3] + x[4] - x[5] + x[6] - x[7] - x[8] - x[9] + x[10] + x[11]; + out[5] = + x[0] + x[1] + x[2] - x[3] + x[4] + x[5] - x[6] + x[7] - x[8] - x[9] - x[10] + x[11]; + out[6] = + x[0] + x[1] + x[2] + x[3] - x[4] + x[5] + x[6] - x[7] + x[8] - x[9] - x[10] - x[11]; + out[7] = + x[0] - x[1] + x[2] + x[3] + x[4] - x[5] + x[6] + x[7] - x[8] + x[9] - x[10] - x[11]; + out[8] = + x[0] - x[1] - x[2] + x[3] + x[4] + x[5] - x[6] + x[7] + x[8] - x[9] + x[10] - x[11]; + out[9] = + x[0] - x[1] - x[2] - x[3] + x[4] + x[5] + x[6] - x[7] + x[8] + x[9] - x[10] + x[11]; + out[10] = + x[0] + x[1] - x[2] - x[3] - x[4] + x[5] + x[6] + x[7] - x[8] + x[9] + x[10] - x[11]; + out[11] = + x[0] - x[1] + x[2] - x[3] - x[4] - x[5] + x[6] + x[7] + x[8] - x[9] + x[10] + x[11]; + #pragma unroll + for (int ii = 0; ii < 12; ++ii) { input[stride * ii] = out[ii]; } +} + + +template +void hadamard_mult_20_strided(T* input, int stride) { + T x[20]; + T out[20]; + x[0] = input[0 * stride]; + x[1] = input[1 * stride]; + x[2] = input[2 * stride]; + x[3] = input[3 * stride]; + x[4] = input[4 * stride]; + x[5] = input[5 * stride]; + x[6] = input[6 * stride]; + x[7] = input[7 * stride]; + x[8] = input[8 * stride]; + x[9] = input[9 * stride]; + x[10] = input[10 * stride]; + x[11] = input[11 * stride]; + x[12] = input[12 * stride]; + x[13] = input[13 * stride]; + x[14] = input[14 * stride]; + x[15] = input[15 * stride]; + x[16] = input[16 * stride]; + x[17] = input[17 * stride]; + x[18] = input[18 * stride]; + x[19] = input[19 * stride]; + out[0] = + x[0] - x[1] - x[2] - x[3] - x[4] + x[5] - x[6] - x[7] - x[8] - x[9] + x[10] + x[11] - x[12] - x[13] + x[14] + x[15] - x[16] + x[17] + x[18] - x[19]; + out[1] = - x[0] + x[1] - x[2] - x[3] - 
x[4] - x[5] + x[6] - x[7] - x[8] - x[9] + x[10] + x[11] + x[12] - x[13] - x[14] - x[15] + x[16] - x[17] + x[18] + x[19]; + out[2] = - x[0] - x[1] + x[2] - x[3] - x[4] - x[5] - x[6] + x[7] - x[8] - x[9] - x[10] + x[11] + x[12] + x[13] - x[14] + x[15] - x[16] + x[17] - x[18] + x[19]; + out[3] = - x[0] - x[1] - x[2] + x[3] - x[4] - x[5] - x[6] - x[7] + x[8] - x[9] - x[10] - x[11] + x[12] + x[13] + x[14] + x[15] + x[16] - x[17] + x[18] - x[19]; + out[4] = - x[0] - x[1] - x[2] - x[3] + x[4] - x[5] - x[6] - x[7] - x[8] + x[9] + x[10] - x[11] - x[12] + x[13] + x[14] - x[15] + x[16] + x[17] - x[18] + x[19]; + out[5] = - x[0] + x[1] + x[2] + x[3] + x[4] + x[5] - x[6] - x[7] - x[8] - x[9] - x[10] + x[11] - x[12] - x[13] + x[14] + x[15] + x[16] - x[17] - x[18] + x[19]; + out[6] = + x[0] - x[1] + x[2] + x[3] + x[4] - x[5] + x[6] - x[7] - x[8] - x[9] + x[10] - x[11] + x[12] - x[13] - x[14] + x[15] + x[16] + x[17] - x[18] - x[19]; + out[7] = + x[0] + x[1] - x[2] + x[3] + x[4] - x[5] - x[6] + x[7] - x[8] - x[9] - x[10] + x[11] - x[12] + x[13] - x[14] - x[15] + x[16] + x[17] + x[18] - x[19]; + out[8] = + x[0] + x[1] + x[2] - x[3] + x[4] - x[5] - x[6] - x[7] + x[8] - x[9] - x[10] - x[11] + x[12] - x[13] + x[14] - x[15] - x[16] + x[17] + x[18] + x[19]; + out[9] = + x[0] + x[1] + x[2] + x[3] - x[4] - x[5] - x[6] - x[7] - x[8] + x[9] + x[10] - x[11] - x[12] + x[13] - x[14] + x[15] - x[16] - x[17] + x[18] + x[19]; + out[10] = - x[0] - x[1] + x[2] + x[3] - x[4] + x[5] - x[6] + x[7] + x[8] - x[9] + x[10] - x[11] - x[12] - x[13] - x[14] - x[15] + x[16] + x[17] + x[18] + x[19]; + out[11] = - x[0] - x[1] - x[2] + x[3] + x[4] - x[5] + x[6] - x[7] + x[8] + x[9] - x[10] + x[11] - x[12] - x[13] - x[14] + x[15] - x[16] + x[17] + x[18] + x[19]; + out[12] = + x[0] - x[1] - x[2] - x[3] + x[4] + x[5] - x[6] + x[7] - x[8] + x[9] - x[10] - x[11] + x[12] - x[13] - x[14] + x[15] + x[16] - x[17] + x[18] + x[19]; + out[13] = + x[0] + x[1] - x[2] - x[3] - x[4] + x[5] + x[6] - x[7] + x[8] - x[9] - x[10] - x[11] - x[12] + x[13] - x[14] + x[15] + x[16] + x[17] - x[18] + x[19]; + out[14] = - x[0] + x[1] + x[2] - x[3] - x[4] - x[5] + x[6] + x[7] - x[8] + x[9] - x[10] - x[11] - x[12] - x[13] + x[14] + x[15] + x[16] + x[17] + x[18] - x[19]; + out[15] = - x[0] + x[1] - x[2] - x[3] + x[4] - x[5] - x[6] + x[7] + x[8] - x[9] + x[10] - x[11] - x[12] - x[13] - x[14] + x[15] - x[16] - x[17] - x[18] - x[19]; + out[16] = + x[0] - x[1] + x[2] - x[3] - x[4] - x[5] - x[6] - x[7] + x[8] + x[9] - x[10] + x[11] - x[12] - x[13] - x[14] - x[15] + x[16] - x[17] - x[18] - x[19]; + out[17] = - x[0] + x[1] - x[2] + x[3] - x[4] + x[5] - x[6] - x[7] - x[8] + x[9] - x[10] - x[11] + x[12] - x[13] - x[14] - x[15] - x[16] + x[17] - x[18] - x[19]; + out[18] = - x[0] - x[1] + x[2] - x[3] + x[4] + x[5] + x[6] - x[7] - x[8] - x[9] - x[10] - x[11] - x[12] + x[13] - x[14] - x[15] - x[16] - x[17] + x[18] - x[19]; + out[19] = + x[0] - x[1] - x[2] + x[3] - x[4] - x[5] + x[6] + x[7] - x[8] - x[9] - x[10] - x[11] - x[12] - x[13] + x[14] - x[15] - x[16] - x[17] - x[18] + x[19]; + #pragma unroll + for (int ii = 0; ii < 20; ++ii) { input[stride * ii] = out[ii]; } +} + + +template +void hadamard_mult_28_strided(T* input, int stride) { + T x[28]; + T out[28]; + x[0] = input[0 * stride]; + x[1] = input[1 * stride]; + x[2] = input[2 * stride]; + x[3] = input[3 * stride]; + x[4] = input[4 * stride]; + x[5] = input[5 * stride]; + x[6] = input[6 * stride]; + x[7] = input[7 * stride]; + x[8] = input[8 * stride]; + x[9] = input[9 * stride]; + x[10] = input[10 * stride]; + x[11] = 
input[11 * stride]; + x[12] = input[12 * stride]; + x[13] = input[13 * stride]; + x[14] = input[14 * stride]; + x[15] = input[15 * stride]; + x[16] = input[16 * stride]; + x[17] = input[17 * stride]; + x[18] = input[18 * stride]; + x[19] = input[19 * stride]; + x[20] = input[20 * stride]; + x[21] = input[21 * stride]; + x[22] = input[22 * stride]; + x[23] = input[23 * stride]; + x[24] = input[24 * stride]; + x[25] = input[25 * stride]; + x[26] = input[26 * stride]; + x[27] = input[27 * stride]; + out[0] = + x[0] - x[1] - x[2] - x[3] - x[4] - x[5] - x[6] + x[7] + x[8] - x[9] - x[10] - x[11] - x[12] + x[13] + x[14] - x[15] + x[16] - x[17] - x[18] + x[19] - x[20] + x[21] - x[22] - x[23] + x[24] + x[25] - x[26] - x[27]; + out[1] = - x[0] + x[1] - x[2] - x[3] - x[4] - x[5] - x[6] + x[7] + x[8] + x[9] - x[10] - x[11] - x[12] - x[13] - x[14] + x[15] - x[16] + x[17] - x[18] - x[19] + x[20] - x[21] + x[22] - x[23] - x[24] + x[25] + x[26] - x[27]; + out[2] = - x[0] - x[1] + x[2] - x[3] - x[4] - x[5] - x[6] - x[7] + x[8] + x[9] + x[10] - x[11] - x[12] - x[13] + x[14] - x[15] + x[16] - x[17] + x[18] - x[19] - x[20] - x[21] - x[22] + x[23] - x[24] - x[25] + x[26] + x[27]; + out[3] = - x[0] - x[1] - x[2] + x[3] - x[4] - x[5] - x[6] - x[7] - x[8] + x[9] + x[10] + x[11] - x[12] - x[13] - x[14] + x[15] - x[16] + x[17] - x[18] + x[19] - x[20] + x[21] - x[22] - x[23] + x[24] - x[25] - x[26] + x[27]; + out[4] = - x[0] - x[1] - x[2] - x[3] + x[4] - x[5] - x[6] - x[7] - x[8] - x[9] + x[10] + x[11] + x[12] - x[13] - x[14] - x[15] + x[16] - x[17] + x[18] - x[19] + x[20] + x[21] + x[22] - x[23] - x[24] + x[25] - x[26] - x[27]; + out[5] = - x[0] - x[1] - x[2] - x[3] - x[4] + x[5] - x[6] - x[7] - x[8] - x[9] - x[10] + x[11] + x[12] + x[13] + x[14] - x[15] - x[16] + x[17] - x[18] + x[19] - x[20] - x[21] + x[22] + x[23] - x[24] - x[25] + x[26] - x[27]; + out[6] = - x[0] - x[1] - x[2] - x[3] - x[4] - x[5] + x[6] + x[7] - x[8] - x[9] - x[10] - x[11] + x[12] + x[13] - x[14] + x[15] - x[16] - x[17] + x[18] - x[19] + x[20] - x[21] - x[22] + x[23] + x[24] - x[25] - x[26] + x[27]; + out[7] = - x[0] - x[1] + x[2] + x[3] + x[4] + x[5] - x[6] + x[7] - x[8] - x[9] - x[10] - x[11] - x[12] - x[13] - x[14] + x[15] + x[16] - x[17] - x[18] + x[19] + x[20] + x[21] - x[22] + x[23] - x[24] - x[25] + x[26] - x[27]; + out[8] = - x[0] - x[1] - x[2] + x[3] + x[4] + x[5] + x[6] - x[7] + x[8] - x[9] - x[10] - x[11] - x[12] - x[13] + x[14] - x[15] + x[16] + x[17] - x[18] - x[19] + x[20] - x[21] + x[22] - x[23] + x[24] - x[25] - x[26] + x[27]; + out[9] = + x[0] - x[1] - x[2] - x[3] + x[4] + x[5] + x[6] - x[7] - x[8] + x[9] - x[10] - x[11] - x[12] - x[13] + x[14] + x[15] - x[16] + x[17] + x[18] - x[19] - x[20] + x[21] - x[22] + x[23] - x[24] + x[25] - x[26] - x[27]; + out[10] = + x[0] + x[1] - x[2] - x[3] - x[4] + x[5] + x[6] - x[7] - x[8] - x[9] + x[10] - x[11] - x[12] - x[13] - x[14] + x[15] + x[16] - x[17] + x[18] + x[19] - x[20] - x[21] + x[22] - x[23] + x[24] - x[25] + x[26] - x[27]; + out[11] = + x[0] + x[1] + x[2] - x[3] - x[4] - x[5] + x[6] - x[7] - x[8] - x[9] - x[10] + x[11] - x[12] - x[13] - x[14] - x[15] + x[16] + x[17] - x[18] + x[19] + x[20] - x[21] - x[22] + x[23] - x[24] + x[25] - x[26] + x[27]; + out[12] = + x[0] + x[1] + x[2] + x[3] - x[4] - x[5] - x[6] - x[7] - x[8] - x[9] - x[10] - x[11] + x[12] - x[13] + x[14] - x[15] - x[16] + x[17] + x[18] - x[19] + x[20] + x[21] - x[22] - x[23] + x[24] - x[25] + x[26] - x[27]; + out[13] = - x[0] + x[1] + x[2] + x[3] + x[4] - x[5] - x[6] - x[7] - x[8] - x[9] - x[10] - x[11] - x[12] + x[13] 
+ x[14] + x[15] - x[16] - x[17] + x[18] + x[19] - x[20] - x[21] + x[22] - x[23] - x[24] + x[25] - x[26] + x[27]; + out[14] = - x[0] + x[1] - x[2] + x[3] + x[4] - x[5] + x[6] + x[7] - x[8] - x[9] + x[10] + x[11] - x[12] - x[13] + x[14] - x[15] - x[16] - x[17] - x[18] - x[19] - x[20] - x[21] - x[22] + x[23] + x[24] + x[25] + x[26] - x[27]; + out[15] = + x[0] - x[1] + x[2] - x[3] + x[4] + x[5] - x[6] - x[7] + x[8] - x[9] - x[10] + x[11] + x[12] - x[13] - x[14] + x[15] - x[16] - x[17] - x[18] - x[19] - x[20] - x[21] - x[22] - x[23] + x[24] + x[25] + x[26] + x[27]; + out[16] = - x[0] + x[1] - x[2] + x[3] - x[4] + x[5] + x[6] - x[7] - x[8] + x[9] - x[10] - x[11] + x[12] + x[13] - x[14] - x[15] + x[16] - x[17] - x[18] - x[19] - x[20] + x[21] - x[22] - x[23] - x[24] + x[25] + x[26] + x[27]; + out[17] = + x[0] - x[1] + x[2] - x[3] + x[4] - x[5] + x[6] + x[7] - x[8] - x[9] + x[10] - x[11] - x[12] + x[13] - x[14] - x[15] - x[16] + x[17] - x[18] - x[19] - x[20] + x[21] + x[22] - x[23] - x[24] - x[25] + x[26] + x[27]; + out[18] = + x[0] + x[1] - x[2] + x[3] - x[4] + x[5] - x[6] + x[7] + x[8] - x[9] - x[10] + x[11] - x[12] - x[13] - x[14] - x[15] - x[16] - x[17] + x[18] - x[19] - x[20] + x[21] + x[22] + x[23] - x[24] - x[25] - x[26] + x[27]; + out[19] = - x[0] + x[1] + x[2] - x[3] + x[4] - x[5] + x[6] - x[7] + x[8] + x[9] - x[10] - x[11] + x[12] - x[13] - x[14] - x[15] - x[16] - x[17] - x[18] + x[19] - x[20] + x[21] + x[22] + x[23] + x[24] - x[25] - x[26] - x[27]; + out[20] = + x[0] - x[1] + x[2] + x[3] - x[4] + x[5] - x[6] - x[7] - x[8] + x[9] + x[10] - x[11] - x[12] + x[13] - x[14] - x[15] - x[16] - x[17] - x[18] - x[19] + x[20] - x[21] + x[22] + x[23] + x[24] + x[25] - x[26] - x[27]; + out[21] = - x[0] + x[1] + x[2] - x[3] - x[4] + x[5] + x[6] - x[7] + x[8] - x[9] + x[10] + x[11] - x[12] + x[13] + x[14] + x[15] - x[16] - x[17] - x[18] - x[19] + x[20] + x[21] - x[22] - x[23] - x[24] - x[25] - x[26] - x[27]; + out[22] = + x[0] - x[1] + x[2] + x[3] - x[4] - x[5] + x[6] + x[7] - x[8] + x[9] - x[10] + x[11] + x[12] - x[13] + x[14] + x[15] + x[16] - x[17] - x[18] - x[19] - x[20] - x[21] + x[22] - x[23] - x[24] - x[25] - x[26] - x[27]; + out[23] = + x[0] + x[1] - x[2] + x[3] + x[4] - x[5] - x[6] - x[7] + x[8] - x[9] + x[10] - x[11] + x[12] + x[13] - x[14] + x[15] + x[16] + x[17] - x[18] - x[19] - x[20] - x[21] - x[22] + x[23] - x[24] - x[25] - x[26] - x[27]; + out[24] = - x[0] + x[1] + x[2] - x[3] + x[4] + x[5] - x[6] + x[7] - x[8] + x[9] - x[10] + x[11] - x[12] + x[13] - x[14] - x[15] + x[16] + x[17] + x[18] - x[19] - x[20] - x[21] - x[22] - x[23] + x[24] - x[25] - x[26] - x[27]; + out[25] = - x[0] - x[1] + x[2] + x[3] - x[4] + x[5] + x[6] + x[7] + x[8] - x[9] + x[10] - x[11] + x[12] - x[13] - x[14] - x[15] - x[16] + x[17] + x[18] + x[19] - x[20] - x[21] - x[22] - x[23] - x[24] + x[25] - x[26] - x[27]; + out[26] = + x[0] - x[1] - x[2] + x[3] + x[4] - x[5] + x[6] - x[7] + x[8] + x[9] - x[10] + x[11] - x[12] + x[13] - x[14] - x[15] - x[16] - x[17] + x[18] + x[19] + x[20] - x[21] - x[22] - x[23] - x[24] - x[25] + x[26] - x[27]; + out[27] = + x[0] + x[1] - x[2] - x[3] + x[4] + x[5] - x[6] + x[7] - x[8] + x[9] + x[10] - x[11] + x[12] - x[13] + x[14] - x[15] - x[16] - x[17] - x[18] + x[19] + x[20] - x[21] - x[22] - x[23] - x[24] - x[25] - x[26] + x[27]; + #pragma unroll + for (int ii = 0; ii < 28; ++ii) { input[stride * ii] = out[ii]; } +} + + +template +void hadamard_mult_40_strided(T* input, int stride) { + T x[40]; + T out[40]; + x[0] = input[0 * stride]; + x[1] = input[1 * stride]; + x[2] = input[2 * 
stride]; + x[3] = input[3 * stride]; + x[4] = input[4 * stride]; + x[5] = input[5 * stride]; + x[6] = input[6 * stride]; + x[7] = input[7 * stride]; + x[8] = input[8 * stride]; + x[9] = input[9 * stride]; + x[10] = input[10 * stride]; + x[11] = input[11 * stride]; + x[12] = input[12 * stride]; + x[13] = input[13 * stride]; + x[14] = input[14 * stride]; + x[15] = input[15 * stride]; + x[16] = input[16 * stride]; + x[17] = input[17 * stride]; + x[18] = input[18 * stride]; + x[19] = input[19 * stride]; + x[20] = input[20 * stride]; + x[21] = input[21 * stride]; + x[22] = input[22 * stride]; + x[23] = input[23 * stride]; + x[24] = input[24 * stride]; + x[25] = input[25 * stride]; + x[26] = input[26 * stride]; + x[27] = input[27 * stride]; + x[28] = input[28 * stride]; + x[29] = input[29 * stride]; + x[30] = input[30 * stride]; + x[31] = input[31 * stride]; + x[32] = input[32 * stride]; + x[33] = input[33 * stride]; + x[34] = input[34 * stride]; + x[35] = input[35 * stride]; + x[36] = input[36 * stride]; + x[37] = input[37 * stride]; + x[38] = input[38 * stride]; + x[39] = input[39 * stride]; + out[0] = + x[0] - x[1] - x[2] - x[3] - x[4] - x[5] - x[6] - x[7] - x[8] - x[9] - x[10] - x[11] - x[12] - x[13] - x[14] - x[15] - x[16] - x[17] - x[18] - x[19] + x[20] - x[21] - x[22] - x[23] - x[24] - x[25] - x[26] - x[27] - x[28] - x[29] - x[30] - x[31] - x[32] - x[33] - x[34] - x[35] - x[36] - x[37] - x[38] - x[39]; + out[1] = + x[0] + x[1] - x[2] + x[3] + x[4] - x[5] - x[6] - x[7] - x[8] + x[9] - x[10] + x[11] - x[12] + x[13] + x[14] + x[15] + x[16] - x[17] - x[18] + x[19] + x[20] + x[21] - x[22] + x[23] + x[24] - x[25] - x[26] - x[27] - x[28] + x[29] - x[30] + x[31] - x[32] + x[33] + x[34] + x[35] + x[36] - x[37] - x[38] + x[39]; + out[2] = + x[0] + x[1] + x[2] - x[3] + x[4] + x[5] - x[6] - x[7] - x[8] - x[9] + x[10] - x[11] + x[12] - x[13] + x[14] + x[15] + x[16] + x[17] - x[18] - x[19] + x[20] + x[21] + x[22] - x[23] + x[24] + x[25] - x[26] - x[27] - x[28] - x[29] + x[30] - x[31] + x[32] - x[33] + x[34] + x[35] + x[36] + x[37] - x[38] - x[39]; + out[3] = + x[0] - x[1] + x[2] + x[3] - x[4] + x[5] + x[6] - x[7] - x[8] - x[9] - x[10] + x[11] - x[12] + x[13] - x[14] + x[15] + x[16] + x[17] + x[18] - x[19] + x[20] - x[21] + x[22] + x[23] - x[24] + x[25] + x[26] - x[27] - x[28] - x[29] - x[30] + x[31] - x[32] + x[33] - x[34] + x[35] + x[36] + x[37] + x[38] - x[39]; + out[4] = + x[0] - x[1] - x[2] + x[3] + x[4] - x[5] + x[6] + x[7] - x[8] - x[9] - x[10] - x[11] + x[12] - x[13] + x[14] - x[15] + x[16] + x[17] + x[18] + x[19] + x[20] - x[21] - x[22] + x[23] + x[24] - x[25] + x[26] + x[27] - x[28] - x[29] - x[30] - x[31] + x[32] - x[33] + x[34] - x[35] + x[36] + x[37] + x[38] + x[39]; + out[5] = + x[0] + x[1] - x[2] - x[3] + x[4] + x[5] - x[6] + x[7] + x[8] - x[9] - x[10] - x[11] - x[12] + x[13] - x[14] + x[15] - x[16] + x[17] + x[18] + x[19] + x[20] + x[21] - x[22] - x[23] + x[24] + x[25] - x[26] + x[27] + x[28] - x[29] - x[30] - x[31] - x[32] + x[33] - x[34] + x[35] - x[36] + x[37] + x[38] + x[39]; + out[6] = + x[0] + x[1] + x[2] - x[3] - x[4] + x[5] + x[6] - x[7] + x[8] + x[9] - x[10] - x[11] - x[12] - x[13] + x[14] - x[15] + x[16] - x[17] + x[18] + x[19] + x[20] + x[21] + x[22] - x[23] - x[24] + x[25] + x[26] - x[27] + x[28] + x[29] - x[30] - x[31] - x[32] - x[33] + x[34] - x[35] + x[36] - x[37] + x[38] + x[39]; + out[7] = + x[0] + x[1] + x[2] + x[3] - x[4] - x[5] + x[6] + x[7] - x[8] + x[9] + x[10] - x[11] - x[12] - x[13] - x[14] + x[15] - x[16] + x[17] - x[18] + x[19] + x[20] + x[21] + x[22] + x[23] - 
x[24] - x[25] + x[26] + x[27] - x[28] + x[29] + x[30] - x[31] - x[32] - x[33] - x[34] + x[35] - x[36] + x[37] - x[38] + x[39]; + out[8] = + x[0] + x[1] + x[2] + x[3] + x[4] - x[5] - x[6] + x[7] + x[8] - x[9] + x[10] + x[11] - x[12] - x[13] - x[14] - x[15] + x[16] - x[17] + x[18] - x[19] + x[20] + x[21] + x[22] + x[23] + x[24] - x[25] - x[26] + x[27] + x[28] - x[29] + x[30] + x[31] - x[32] - x[33] - x[34] - x[35] + x[36] - x[37] + x[38] - x[39]; + out[9] = + x[0] - x[1] + x[2] + x[3] + x[4] + x[5] - x[6] - x[7] + x[8] + x[9] - x[10] + x[11] + x[12] - x[13] - x[14] - x[15] - x[16] + x[17] - x[18] + x[19] + x[20] - x[21] + x[22] + x[23] + x[24] + x[25] - x[26] - x[27] + x[28] + x[29] - x[30] + x[31] + x[32] - x[33] - x[34] - x[35] - x[36] + x[37] - x[38] + x[39]; + out[10] = + x[0] + x[1] - x[2] + x[3] + x[4] + x[5] + x[6] - x[7] - x[8] + x[9] + x[10] - x[11] + x[12] + x[13] - x[14] - x[15] - x[16] - x[17] + x[18] - x[19] + x[20] + x[21] - x[22] + x[23] + x[24] + x[25] + x[26] - x[27] - x[28] + x[29] + x[30] - x[31] + x[32] + x[33] - x[34] - x[35] - x[36] - x[37] + x[38] - x[39]; + out[11] = + x[0] - x[1] + x[2] - x[3] + x[4] + x[5] + x[6] + x[7] - x[8] - x[9] + x[10] + x[11] - x[12] + x[13] + x[14] - x[15] - x[16] - x[17] - x[18] + x[19] + x[20] - x[21] + x[22] - x[23] + x[24] + x[25] + x[26] + x[27] - x[28] - x[29] + x[30] + x[31] - x[32] + x[33] + x[34] - x[35] - x[36] - x[37] - x[38] + x[39]; + out[12] = + x[0] + x[1] - x[2] + x[3] - x[4] + x[5] + x[6] + x[7] + x[8] - x[9] - x[10] + x[11] + x[12] - x[13] + x[14] + x[15] - x[16] - x[17] - x[18] - x[19] + x[20] + x[21] - x[22] + x[23] - x[24] + x[25] + x[26] + x[27] + x[28] - x[29] - x[30] + x[31] + x[32] - x[33] + x[34] + x[35] - x[36] - x[37] - x[38] - x[39]; + out[13] = + x[0] - x[1] + x[2] - x[3] + x[4] - x[5] + x[6] + x[7] + x[8] + x[9] - x[10] - x[11] + x[12] + x[13] - x[14] + x[15] + x[16] - x[17] - x[18] - x[19] + x[20] - x[21] + x[22] - x[23] + x[24] - x[25] + x[26] + x[27] + x[28] + x[29] - x[30] - x[31] + x[32] + x[33] - x[34] + x[35] + x[36] - x[37] - x[38] - x[39]; + out[14] = + x[0] - x[1] - x[2] + x[3] - x[4] + x[5] - x[6] + x[7] + x[8] + x[9] + x[10] - x[11] - x[12] + x[13] + x[14] - x[15] + x[16] + x[17] - x[18] - x[19] + x[20] - x[21] - x[22] + x[23] - x[24] + x[25] - x[26] + x[27] + x[28] + x[29] + x[30] - x[31] - x[32] + x[33] + x[34] - x[35] + x[36] + x[37] - x[38] - x[39]; + out[15] = + x[0] - x[1] - x[2] - x[3] + x[4] - x[5] + x[6] - x[7] + x[8] + x[9] + x[10] + x[11] - x[12] - x[13] + x[14] + x[15] - x[16] + x[17] + x[18] - x[19] + x[20] - x[21] - x[22] - x[23] + x[24] - x[25] + x[26] - x[27] + x[28] + x[29] + x[30] + x[31] - x[32] - x[33] + x[34] + x[35] - x[36] + x[37] + x[38] - x[39]; + out[16] = + x[0] - x[1] - x[2] - x[3] - x[4] + x[5] - x[6] + x[7] - x[8] + x[9] + x[10] + x[11] + x[12] - x[13] - x[14] + x[15] + x[16] - x[17] + x[18] + x[19] + x[20] - x[21] - x[22] - x[23] - x[24] + x[25] - x[26] + x[27] - x[28] + x[29] + x[30] + x[31] + x[32] - x[33] - x[34] + x[35] + x[36] - x[37] + x[38] + x[39]; + out[17] = + x[0] + x[1] - x[2] - x[3] - x[4] - x[5] + x[6] - x[7] + x[8] - x[9] + x[10] + x[11] + x[12] + x[13] - x[14] - x[15] + x[16] + x[17] - x[18] + x[19] + x[20] + x[21] - x[22] - x[23] - x[24] - x[25] + x[26] - x[27] + x[28] - x[29] + x[30] + x[31] + x[32] + x[33] - x[34] - x[35] + x[36] + x[37] - x[38] + x[39]; + out[18] = + x[0] + x[1] + x[2] - x[3] - x[4] - x[5] - x[6] + x[7] - x[8] + x[9] - x[10] + x[11] + x[12] + x[13] + x[14] - x[15] - x[16] + x[17] + x[18] - x[19] + x[20] + x[21] + x[22] - x[23] - 
x[24] - x[25] - x[26] + x[27] - x[28] + x[29] - x[30] + x[31] + x[32] + x[33] + x[34] - x[35] - x[36] + x[37] + x[38] - x[39]; + out[19] = + x[0] - x[1] + x[2] + x[3] - x[4] - x[5] - x[6] - x[7] + x[8] - x[9] + x[10] - x[11] + x[12] + x[13] + x[14] + x[15] - x[16] - x[17] + x[18] + x[19] + x[20] - x[21] + x[22] + x[23] - x[24] - x[25] - x[26] - x[27] + x[28] - x[29] + x[30] - x[31] + x[32] + x[33] + x[34] + x[35] - x[36] - x[37] + x[38] + x[39]; + out[20] = + x[0] - x[1] - x[2] - x[3] - x[4] - x[5] - x[6] - x[7] - x[8] - x[9] - x[10] - x[11] - x[12] - x[13] - x[14] - x[15] - x[16] - x[17] - x[18] - x[19] - x[20] + x[21] + x[22] + x[23] + x[24] + x[25] + x[26] + x[27] + x[28] + x[29] + x[30] + x[31] + x[32] + x[33] + x[34] + x[35] + x[36] + x[37] + x[38] + x[39]; + out[21] = + x[0] + x[1] - x[2] + x[3] + x[4] - x[5] - x[6] - x[7] - x[8] + x[9] - x[10] + x[11] - x[12] + x[13] + x[14] + x[15] + x[16] - x[17] - x[18] + x[19] - x[20] - x[21] + x[22] - x[23] - x[24] + x[25] + x[26] + x[27] + x[28] - x[29] + x[30] - x[31] + x[32] - x[33] - x[34] - x[35] - x[36] + x[37] + x[38] - x[39]; + out[22] = + x[0] + x[1] + x[2] - x[3] + x[4] + x[5] - x[6] - x[7] - x[8] - x[9] + x[10] - x[11] + x[12] - x[13] + x[14] + x[15] + x[16] + x[17] - x[18] - x[19] - x[20] - x[21] - x[22] + x[23] - x[24] - x[25] + x[26] + x[27] + x[28] + x[29] - x[30] + x[31] - x[32] + x[33] - x[34] - x[35] - x[36] - x[37] + x[38] + x[39]; + out[23] = + x[0] - x[1] + x[2] + x[3] - x[4] + x[5] + x[6] - x[7] - x[8] - x[9] - x[10] + x[11] - x[12] + x[13] - x[14] + x[15] + x[16] + x[17] + x[18] - x[19] - x[20] + x[21] - x[22] - x[23] + x[24] - x[25] - x[26] + x[27] + x[28] + x[29] + x[30] - x[31] + x[32] - x[33] + x[34] - x[35] - x[36] - x[37] - x[38] + x[39]; + out[24] = + x[0] - x[1] - x[2] + x[3] + x[4] - x[5] + x[6] + x[7] - x[8] - x[9] - x[10] - x[11] + x[12] - x[13] + x[14] - x[15] + x[16] + x[17] + x[18] + x[19] - x[20] + x[21] + x[22] - x[23] - x[24] + x[25] - x[26] - x[27] + x[28] + x[29] + x[30] + x[31] - x[32] + x[33] - x[34] + x[35] - x[36] - x[37] - x[38] - x[39]; + out[25] = + x[0] + x[1] - x[2] - x[3] + x[4] + x[5] - x[6] + x[7] + x[8] - x[9] - x[10] - x[11] - x[12] + x[13] - x[14] + x[15] - x[16] + x[17] + x[18] + x[19] - x[20] - x[21] + x[22] + x[23] - x[24] - x[25] + x[26] - x[27] - x[28] + x[29] + x[30] + x[31] + x[32] - x[33] + x[34] - x[35] + x[36] - x[37] - x[38] - x[39]; + out[26] = + x[0] + x[1] + x[2] - x[3] - x[4] + x[5] + x[6] - x[7] + x[8] + x[9] - x[10] - x[11] - x[12] - x[13] + x[14] - x[15] + x[16] - x[17] + x[18] + x[19] - x[20] - x[21] - x[22] + x[23] + x[24] - x[25] - x[26] + x[27] - x[28] - x[29] + x[30] + x[31] + x[32] + x[33] - x[34] + x[35] - x[36] + x[37] - x[38] - x[39]; + out[27] = + x[0] + x[1] + x[2] + x[3] - x[4] - x[5] + x[6] + x[7] - x[8] + x[9] + x[10] - x[11] - x[12] - x[13] - x[14] + x[15] - x[16] + x[17] - x[18] + x[19] - x[20] - x[21] - x[22] - x[23] + x[24] + x[25] - x[26] - x[27] + x[28] - x[29] - x[30] + x[31] + x[32] + x[33] + x[34] - x[35] + x[36] - x[37] + x[38] - x[39]; + out[28] = + x[0] + x[1] + x[2] + x[3] + x[4] - x[5] - x[6] + x[7] + x[8] - x[9] + x[10] + x[11] - x[12] - x[13] - x[14] - x[15] + x[16] - x[17] + x[18] - x[19] - x[20] - x[21] - x[22] - x[23] - x[24] + x[25] + x[26] - x[27] - x[28] + x[29] - x[30] - x[31] + x[32] + x[33] + x[34] + x[35] - x[36] + x[37] - x[38] + x[39]; + out[29] = + x[0] - x[1] + x[2] + x[3] + x[4] + x[5] - x[6] - x[7] + x[8] + x[9] - x[10] + x[11] + x[12] - x[13] - x[14] - x[15] - x[16] + x[17] - x[18] + x[19] - x[20] + x[21] - x[22] - x[23] - 
x[24] - x[25] + x[26] + x[27] - x[28] - x[29] + x[30] - x[31] - x[32] + x[33] + x[34] + x[35] + x[36] - x[37] + x[38] - x[39]; + out[30] = + x[0] + x[1] - x[2] + x[3] + x[4] + x[5] + x[6] - x[7] - x[8] + x[9] + x[10] - x[11] + x[12] + x[13] - x[14] - x[15] - x[16] - x[17] + x[18] - x[19] - x[20] - x[21] + x[22] - x[23] - x[24] - x[25] - x[26] + x[27] + x[28] - x[29] - x[30] + x[31] - x[32] - x[33] + x[34] + x[35] + x[36] + x[37] - x[38] + x[39]; + out[31] = + x[0] - x[1] + x[2] - x[3] + x[4] + x[5] + x[6] + x[7] - x[8] - x[9] + x[10] + x[11] - x[12] + x[13] + x[14] - x[15] - x[16] - x[17] - x[18] + x[19] - x[20] + x[21] - x[22] + x[23] - x[24] - x[25] - x[26] - x[27] + x[28] + x[29] - x[30] - x[31] + x[32] - x[33] - x[34] + x[35] + x[36] + x[37] + x[38] - x[39]; + out[32] = + x[0] + x[1] - x[2] + x[3] - x[4] + x[5] + x[6] + x[7] + x[8] - x[9] - x[10] + x[11] + x[12] - x[13] + x[14] + x[15] - x[16] - x[17] - x[18] - x[19] - x[20] - x[21] + x[22] - x[23] + x[24] - x[25] - x[26] - x[27] - x[28] + x[29] + x[30] - x[31] - x[32] + x[33] - x[34] - x[35] + x[36] + x[37] + x[38] + x[39]; + out[33] = + x[0] - x[1] + x[2] - x[3] + x[4] - x[5] + x[6] + x[7] + x[8] + x[9] - x[10] - x[11] + x[12] + x[13] - x[14] + x[15] + x[16] - x[17] - x[18] - x[19] - x[20] + x[21] - x[22] + x[23] - x[24] + x[25] - x[26] - x[27] - x[28] - x[29] + x[30] + x[31] - x[32] - x[33] + x[34] - x[35] - x[36] + x[37] + x[38] + x[39]; + out[34] = + x[0] - x[1] - x[2] + x[3] - x[4] + x[5] - x[6] + x[7] + x[8] + x[9] + x[10] - x[11] - x[12] + x[13] + x[14] - x[15] + x[16] + x[17] - x[18] - x[19] - x[20] + x[21] + x[22] - x[23] + x[24] - x[25] + x[26] - x[27] - x[28] - x[29] - x[30] + x[31] + x[32] - x[33] - x[34] + x[35] - x[36] - x[37] + x[38] + x[39]; + out[35] = + x[0] - x[1] - x[2] - x[3] + x[4] - x[5] + x[6] - x[7] + x[8] + x[9] + x[10] + x[11] - x[12] - x[13] + x[14] + x[15] - x[16] + x[17] + x[18] - x[19] - x[20] + x[21] + x[22] + x[23] - x[24] + x[25] - x[26] + x[27] - x[28] - x[29] - x[30] - x[31] + x[32] + x[33] - x[34] - x[35] + x[36] - x[37] - x[38] + x[39]; + out[36] = + x[0] - x[1] - x[2] - x[3] - x[4] + x[5] - x[6] + x[7] - x[8] + x[9] + x[10] + x[11] + x[12] - x[13] - x[14] + x[15] + x[16] - x[17] + x[18] + x[19] - x[20] + x[21] + x[22] + x[23] + x[24] - x[25] + x[26] - x[27] + x[28] - x[29] - x[30] - x[31] - x[32] + x[33] + x[34] - x[35] - x[36] + x[37] - x[38] - x[39]; + out[37] = + x[0] + x[1] - x[2] - x[3] - x[4] - x[5] + x[6] - x[7] + x[8] - x[9] + x[10] + x[11] + x[12] + x[13] - x[14] - x[15] + x[16] + x[17] - x[18] + x[19] - x[20] - x[21] + x[22] + x[23] + x[24] + x[25] - x[26] + x[27] - x[28] + x[29] - x[30] - x[31] - x[32] - x[33] + x[34] + x[35] - x[36] - x[37] + x[38] - x[39]; + out[38] = + x[0] + x[1] + x[2] - x[3] - x[4] - x[5] - x[6] + x[7] - x[8] + x[9] - x[10] + x[11] + x[12] + x[13] + x[14] - x[15] - x[16] + x[17] + x[18] - x[19] - x[20] - x[21] - x[22] + x[23] + x[24] + x[25] + x[26] - x[27] + x[28] - x[29] + x[30] - x[31] - x[32] - x[33] - x[34] + x[35] + x[36] - x[37] - x[38] + x[39]; + out[39] = + x[0] - x[1] + x[2] + x[3] - x[4] - x[5] - x[6] - x[7] + x[8] - x[9] + x[10] - x[11] + x[12] + x[13] + x[14] + x[15] - x[16] - x[17] + x[18] + x[19] - x[20] + x[21] - x[22] - x[23] + x[24] + x[25] + x[26] + x[27] - x[28] + x[29] - x[30] + x[31] - x[32] - x[33] - x[34] - x[35] + x[36] + x[37] - x[38] - x[39]; + #pragma unroll + for (int ii = 0; ii < 40; ++ii) { input[stride * ii] = out[ii]; } +} + diff --git a/extension/llm/custom_ops/spinquant/special_hadamard_code_gen.py 
b/extension/llm/custom_ops/spinquant/special_hadamard_code_gen.py new file mode 100644 index 00000000000..a8b9feb0785 --- /dev/null +++ b/extension/llm/custom_ops/spinquant/special_hadamard_code_gen.py @@ -0,0 +1,279 @@ +# Portions (c) Meta Platforms, Inc. and affiliates. +# This file is adapted from +# https://github.com/Dao-AILab/fast-hadamard-transform/blob/master/csrc/code_gen.py . + +# BSD 3-Clause License + +# Copyright (c) 2022, the respective contributors, as shown by the AUTHORS file. +# All rights reserved. + +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: + +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. + +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. + +# * Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. + +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +from pathlib import Path + +import numpy as np + +# From https://en.wikipedia.org/wiki/Paley_construction (construction II for q = 5) + +had_12_paley = """ ++-++++++++++ +--+-+-+-+-+- ++++-++----++ ++---+--+-++- ++++++-++---- ++-+---+--+-+ +++--+++-++-- ++--++---+--+ +++----+++-++ ++--+-++---+- +++++----+++- ++-+--+-++--- +""" + +# From http://neilsloane.com/hadamard/ + +had_12 = """ ++----------- +++-+---+++-+ ++++-+---+++- ++-++-+---+++ +++-++-+---++ ++++-++-+---+ +++++-++-+--- ++-+++-++-+-- ++--+++-++-+- ++---+++-++-+ +++---+++-++- ++-+---+++-++ +""" + +had_20_will = """ ++----+----++--++-++- +-+----+---+++---+-++ +--+----+---+++-+-+-+ +---+----+---+++++-+- +----+----++--++-++-+ +-+++++-----+--+++--+ ++-+++-+---+-+--+++-- +++-++--+---+-+--+++- ++++-+---+---+-+--+++ +++++-----++--+-+--++ +--++-+-++-+-----++++ +---++-+-++-+---+-+++ ++---++-+-+--+--++-++ +++---++-+----+-+++-+ +-++---++-+----+++++- +-+--+--++-+----+---- ++-+-----++-+----+--- +-+-+-+---+--+----+-- +--+-+++------+----+- ++--+--++------+----+ +""" + + +had_28_will = """ ++------++----++-+--+-+--++-- +-+-----+++-----+-+--+-+--++- +--+-----+++---+-+-+----+--++ +---+-----+++---+-+-+-+--+--+ +----+-----+++---+-+-+++--+-- +-----+-----++++--+-+--++--+- +------++----++-+--+-+--++--+ +--++++-+-------++--+++-+--+- +---++++-+-----+-++--+-+-+--+ ++---+++--+----++-++--+-+-+-- +++---++---+----++-++--+-+-+- ++++---+----+----++-++--+-+-+ +++++--------+-+--++-++--+-+- +-++++--------+++--++--+--+-+ +-+-++-++--++--+--------++++- ++-+-++--+--++--+--------++++ +-+-+-++--+--++--+----+---+++ ++-+-+-++--+--+---+---++---++ +++-+-+-++--+------+--+++---+ +-++-+-+-++--+------+-++++--- ++-++-+---++--+------+-++++-- +-++--++-+-++-+++----++------ ++-++--++-+-++-+++-----+----- +++-++---+-+-++-+++-----+---- +-++-++-+-+-+-+--+++-----+--- +--++-++++-+-+----+++-----+-- ++--++-+-++-+-+----+++-----+- +++--++-+-++-+-+----++------+ +""" + + +had_40_tpal = """ ++-------------------+------------------- +++-++----+-+-++++--+++-++----+-+-++++--+ ++++-++----+-+-++++--+++-++----+-+-++++-- ++-++-++----+-+-++++-+-++-++----+-+-++++- ++--++-++----+-+-+++++--++-++----+-+-++++ +++--++-++----+-+-+++++--++-++----+-+-+++ ++++--++-++----+-+-+++++--++-++----+-+-++ +++++--++-++----+-+-+++++--++-++----+-+-+ ++++++--++-++----+-+-+++++--++-++----+-+- ++-++++--++-++----+-++-++++--++-++----+-+ +++-++++--++-++----+-++-++++--++-++----+- ++-+-++++--++-++----++-+-++++--++-++----+ +++-+-++++--++-++----++-+-++++--++-++---- ++-+-+-++++--++-++---+-+-+-++++--++-++--- ++--+-+-++++--++-++--+--+-+-++++--++-++-- ++---+-+-++++--++-++-+---+-+-++++--++-++- ++----+-+-++++--++-+++----+-+-++++--++-++ +++----+-+-++++--++-+++----+-+-++++--++-+ ++++----+-+-++++--++-+++----+-+-++++--++- ++-++----+-+-++++--+++-++----+-+-++++--++ ++--------------------+++++++++++++++++++ +++-++----+-+-++++--+--+--++++-+-+----++- ++++-++----+-+-++++-----+--++++-+-+----++ ++-++-++----+-+-++++--+--+--++++-+-+----+ ++--++-++----+-+-++++-++--+--++++-+-+---- +++--++-++----+-+-+++--++--+--++++-+-+--- ++++--++-++----+-+-++---++--+--++++-+-+-- +++++--++-++----+-+-+----++--+--++++-+-+- ++++++--++-++----+-+------++--+--++++-+-+ ++-++++--++-++----+-+-+----++--+--++++-+- +++-++++--++-++----+---+----++--+--++++-+ ++-+-++++--++-++----+-+-+----++--+--++++- +++-+-++++--++-++------+-+----++--+--++++ ++-+-+-++++--++-++----+-+-+----++--+--+++ ++--+-+-++++--++-++---++-+-+----++--+--++ ++---+-+-++++--++-++--+++-+-+----++--+--+ ++----+-+-++++--++-++-++++-+-+----++--+-- +++----+-+-++++--++-+--++++-+-+----++--+- 
++++----+-+-++++--++----++++-+-+----++--+ ++-++----+-+-++++--++-+--++++-+-+----++-- +""" + +# NOTE: the original Dao-AILab/fast-hadamard-transform uses had_12_paley rather than +# had_12 here. However, SpinQuant and QuaRot seem to use had_12, so we follow them here. +had_strings = [had_12, had_20_will, had_28_will, had_40_tpal] + +header = """ + +#pragma once + +""" + + +TEMPLATE = """ +__device__ __forceinline__ void hadamard_mult_thread_{N}(float x[{N}]) {{ + float out[{N}]; + {code} + #pragma unroll + for (int i = 0; i < {N}; i++) {{ x[i] = out[i]; }} +}} + +""" + + +CPU_TEMPLATE = """ +template +void hadamard_mult_{N}(T* x) {{ + float out[{N}]; + {code} + #pragma unroll + for (int i = 0; i < {N}; i++) {{ x[i] = out[i]; }} +}} + +""" + +STRIDED_CPU_TEMPLATE = """ +template +void hadamard_mult_{N}_strided(T* input, int stride) {{ + T x[{N}]; + T out[{N}]; + {strided_load_code} + {code} + #pragma unroll + for (int ii = 0; ii < {N}; ++ii) {{ input[stride * ii] = out[ii]; }} +}} + +""" + + +def string_to_array(string): + # Convert strings of + and - to bool arrays + string = string.strip().replace("+", "1").replace("-", "-1").split() + return np.stack( + [ + np.fromstring(" ".join(string[i]), dtype=np.int32, sep=" ") + for i in range(len(string)) + ] + ) + + +def strided_load_code_gen(N): + return "\n ".join([f"x[{i}] = input[{i} * stride];" for i in range(N)]) + + +def array_code_gen(arr, template): + N = arr.shape[0] + assert arr.shape[0] == arr.shape[1] + out = [] + for i in range(N): + out.append( + f"out[{i}] = " + + " ".join([f"{'+' if arr[i, j] == 1 else '-'} x[{j}]" for j in range(N)]) + + ";" + ) + return template.format( + N=str(N), code="\n ".join(out), strided_load_code=strided_load_code_gen(N) + ) + + +OPTION_TO_TEMPLATE = { + "cuda": TEMPLATE, + "cpu": CPU_TEMPLATE, + "strided_cpu": STRIDED_CPU_TEMPLATE, +} + + +def main(option="cuda"): + try: + template = OPTION_TO_TEMPLATE[option] + except KeyError: + raise Exception( + f"bad target option {option}; options are {', '.join(OPTION_TO_TEMPLATE.keys())}" + ) + output_dir = Path(__file__).parent / "fast_hadamard_transform_special.h" + generated_line = f"// @{'generated'} by special_hadamard_code_gen.py {option}\n" + + output_dir.write_text( + generated_line + + header + + "".join(array_code_gen(string_to_array(s), template) for s in had_strings) + ) + + +if __name__ == "__main__": + import sys + + option = "cuda" + if len(sys.argv) > 1: + option = sys.argv[1] + main(option) diff --git a/extension/llm/custom_ops/spinquant/targets.bzl b/extension/llm/custom_ops/spinquant/targets.bzl new file mode 100644 index 00000000000..e87af3b80d8 --- /dev/null +++ b/extension/llm/custom_ops/spinquant/targets.bzl @@ -0,0 +1,22 @@ +load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") + +def define_common_targets(): + """Defines targets that should be shared between fbcode and xplat. + + The directory containing this targets.bzl file should also contain both + TARGETS and BUCK files that call this function. 
+ """ + runtime.cxx_library( + name = "fast_hadamard_transform", + exported_headers = [ + "fast_hadamard_transform.h", + "fast_hadamard_transform_special.h", + ], + srcs = [ + "fast_hadamard_transform.cpp", + ], + exported_deps = [ + "//executorch/extension/llm/custom_ops/spinquant/third-party/FFHT:fht", + ], + visibility = ["@EXECUTORCH_CLIENTS"], + ) diff --git a/extension/llm/custom_ops/spinquant/test/TARGETS b/extension/llm/custom_ops/spinquant/test/TARGETS new file mode 100644 index 00000000000..0a42614a385 --- /dev/null +++ b/extension/llm/custom_ops/spinquant/test/TARGETS @@ -0,0 +1,5 @@ +load(":targets.bzl", "define_common_targets") + +oncall("executorch") + +define_common_targets() diff --git a/extension/llm/custom_ops/spinquant/test/fast_hadamard_transform_special_unstrided_cpu.h b/extension/llm/custom_ops/spinquant/test/fast_hadamard_transform_special_unstrided_cpu.h new file mode 100644 index 00000000000..c0b27809598 --- /dev/null +++ b/extension/llm/custom_ops/spinquant/test/fast_hadamard_transform_special_unstrided_cpu.h @@ -0,0 +1,137 @@ +// @generated by special_hadamard_code_gen.py cpu + + +#pragma once + + +template +void hadamard_mult_12(T* x) { + float out[12]; + out[0] = + x[0] - x[1] - x[2] - x[3] - x[4] - x[5] - x[6] - x[7] - x[8] - x[9] - x[10] - x[11]; + out[1] = + x[0] + x[1] - x[2] + x[3] - x[4] - x[5] - x[6] + x[7] + x[8] + x[9] - x[10] + x[11]; + out[2] = + x[0] + x[1] + x[2] - x[3] + x[4] - x[5] - x[6] - x[7] + x[8] + x[9] + x[10] - x[11]; + out[3] = + x[0] - x[1] + x[2] + x[3] - x[4] + x[5] - x[6] - x[7] - x[8] + x[9] + x[10] + x[11]; + out[4] = + x[0] + x[1] - x[2] + x[3] + x[4] - x[5] + x[6] - x[7] - x[8] - x[9] + x[10] + x[11]; + out[5] = + x[0] + x[1] + x[2] - x[3] + x[4] + x[5] - x[6] + x[7] - x[8] - x[9] - x[10] + x[11]; + out[6] = + x[0] + x[1] + x[2] + x[3] - x[4] + x[5] + x[6] - x[7] + x[8] - x[9] - x[10] - x[11]; + out[7] = + x[0] - x[1] + x[2] + x[3] + x[4] - x[5] + x[6] + x[7] - x[8] + x[9] - x[10] - x[11]; + out[8] = + x[0] - x[1] - x[2] + x[3] + x[4] + x[5] - x[6] + x[7] + x[8] - x[9] + x[10] - x[11]; + out[9] = + x[0] - x[1] - x[2] - x[3] + x[4] + x[5] + x[6] - x[7] + x[8] + x[9] - x[10] + x[11]; + out[10] = + x[0] + x[1] - x[2] - x[3] - x[4] + x[5] + x[6] + x[7] - x[8] + x[9] + x[10] - x[11]; + out[11] = + x[0] - x[1] + x[2] - x[3] - x[4] - x[5] + x[6] + x[7] + x[8] - x[9] + x[10] + x[11]; + #pragma unroll + for (int i = 0; i < 12; i++) { x[i] = out[i]; } +} + + +template +void hadamard_mult_20(T* x) { + float out[20]; + out[0] = + x[0] - x[1] - x[2] - x[3] - x[4] + x[5] - x[6] - x[7] - x[8] - x[9] + x[10] + x[11] - x[12] - x[13] + x[14] + x[15] - x[16] + x[17] + x[18] - x[19]; + out[1] = - x[0] + x[1] - x[2] - x[3] - x[4] - x[5] + x[6] - x[7] - x[8] - x[9] + x[10] + x[11] + x[12] - x[13] - x[14] - x[15] + x[16] - x[17] + x[18] + x[19]; + out[2] = - x[0] - x[1] + x[2] - x[3] - x[4] - x[5] - x[6] + x[7] - x[8] - x[9] - x[10] + x[11] + x[12] + x[13] - x[14] + x[15] - x[16] + x[17] - x[18] + x[19]; + out[3] = - x[0] - x[1] - x[2] + x[3] - x[4] - x[5] - x[6] - x[7] + x[8] - x[9] - x[10] - x[11] + x[12] + x[13] + x[14] + x[15] + x[16] - x[17] + x[18] - x[19]; + out[4] = - x[0] - x[1] - x[2] - x[3] + x[4] - x[5] - x[6] - x[7] - x[8] + x[9] + x[10] - x[11] - x[12] + x[13] + x[14] - x[15] + x[16] + x[17] - x[18] + x[19]; + out[5] = - x[0] + x[1] + x[2] + x[3] + x[4] + x[5] - x[6] - x[7] - x[8] - x[9] - x[10] + x[11] - x[12] - x[13] + x[14] + x[15] + x[16] - x[17] - x[18] + x[19]; + out[6] = + x[0] - x[1] + x[2] + x[3] + x[4] - x[5] + x[6] - x[7] - 
x[8] - x[9] + x[10] - x[11] + x[12] - x[13] - x[14] + x[15] + x[16] + x[17] - x[18] - x[19]; + out[7] = + x[0] + x[1] - x[2] + x[3] + x[4] - x[5] - x[6] + x[7] - x[8] - x[9] - x[10] + x[11] - x[12] + x[13] - x[14] - x[15] + x[16] + x[17] + x[18] - x[19]; + out[8] = + x[0] + x[1] + x[2] - x[3] + x[4] - x[5] - x[6] - x[7] + x[8] - x[9] - x[10] - x[11] + x[12] - x[13] + x[14] - x[15] - x[16] + x[17] + x[18] + x[19]; + out[9] = + x[0] + x[1] + x[2] + x[3] - x[4] - x[5] - x[6] - x[7] - x[8] + x[9] + x[10] - x[11] - x[12] + x[13] - x[14] + x[15] - x[16] - x[17] + x[18] + x[19]; + out[10] = - x[0] - x[1] + x[2] + x[3] - x[4] + x[5] - x[6] + x[7] + x[8] - x[9] + x[10] - x[11] - x[12] - x[13] - x[14] - x[15] + x[16] + x[17] + x[18] + x[19]; + out[11] = - x[0] - x[1] - x[2] + x[3] + x[4] - x[5] + x[6] - x[7] + x[8] + x[9] - x[10] + x[11] - x[12] - x[13] - x[14] + x[15] - x[16] + x[17] + x[18] + x[19]; + out[12] = + x[0] - x[1] - x[2] - x[3] + x[4] + x[5] - x[6] + x[7] - x[8] + x[9] - x[10] - x[11] + x[12] - x[13] - x[14] + x[15] + x[16] - x[17] + x[18] + x[19]; + out[13] = + x[0] + x[1] - x[2] - x[3] - x[4] + x[5] + x[6] - x[7] + x[8] - x[9] - x[10] - x[11] - x[12] + x[13] - x[14] + x[15] + x[16] + x[17] - x[18] + x[19]; + out[14] = - x[0] + x[1] + x[2] - x[3] - x[4] - x[5] + x[6] + x[7] - x[8] + x[9] - x[10] - x[11] - x[12] - x[13] + x[14] + x[15] + x[16] + x[17] + x[18] - x[19]; + out[15] = - x[0] + x[1] - x[2] - x[3] + x[4] - x[5] - x[6] + x[7] + x[8] - x[9] + x[10] - x[11] - x[12] - x[13] - x[14] + x[15] - x[16] - x[17] - x[18] - x[19]; + out[16] = + x[0] - x[1] + x[2] - x[3] - x[4] - x[5] - x[6] - x[7] + x[8] + x[9] - x[10] + x[11] - x[12] - x[13] - x[14] - x[15] + x[16] - x[17] - x[18] - x[19]; + out[17] = - x[0] + x[1] - x[2] + x[3] - x[4] + x[5] - x[6] - x[7] - x[8] + x[9] - x[10] - x[11] + x[12] - x[13] - x[14] - x[15] - x[16] + x[17] - x[18] - x[19]; + out[18] = - x[0] - x[1] + x[2] - x[3] + x[4] + x[5] + x[6] - x[7] - x[8] - x[9] - x[10] - x[11] - x[12] + x[13] - x[14] - x[15] - x[16] - x[17] + x[18] - x[19]; + out[19] = + x[0] - x[1] - x[2] + x[3] - x[4] - x[5] + x[6] + x[7] - x[8] - x[9] - x[10] - x[11] - x[12] - x[13] + x[14] - x[15] - x[16] - x[17] - x[18] + x[19]; + #pragma unroll + for (int i = 0; i < 20; i++) { x[i] = out[i]; } +} + + +template +void hadamard_mult_28(T* x) { + float out[28]; + out[0] = + x[0] - x[1] - x[2] - x[3] - x[4] - x[5] - x[6] + x[7] + x[8] - x[9] - x[10] - x[11] - x[12] + x[13] + x[14] - x[15] + x[16] - x[17] - x[18] + x[19] - x[20] + x[21] - x[22] - x[23] + x[24] + x[25] - x[26] - x[27]; + out[1] = - x[0] + x[1] - x[2] - x[3] - x[4] - x[5] - x[6] + x[7] + x[8] + x[9] - x[10] - x[11] - x[12] - x[13] - x[14] + x[15] - x[16] + x[17] - x[18] - x[19] + x[20] - x[21] + x[22] - x[23] - x[24] + x[25] + x[26] - x[27]; + out[2] = - x[0] - x[1] + x[2] - x[3] - x[4] - x[5] - x[6] - x[7] + x[8] + x[9] + x[10] - x[11] - x[12] - x[13] + x[14] - x[15] + x[16] - x[17] + x[18] - x[19] - x[20] - x[21] - x[22] + x[23] - x[24] - x[25] + x[26] + x[27]; + out[3] = - x[0] - x[1] - x[2] + x[3] - x[4] - x[5] - x[6] - x[7] - x[8] + x[9] + x[10] + x[11] - x[12] - x[13] - x[14] + x[15] - x[16] + x[17] - x[18] + x[19] - x[20] + x[21] - x[22] - x[23] + x[24] - x[25] - x[26] + x[27]; + out[4] = - x[0] - x[1] - x[2] - x[3] + x[4] - x[5] - x[6] - x[7] - x[8] - x[9] + x[10] + x[11] + x[12] - x[13] - x[14] - x[15] + x[16] - x[17] + x[18] - x[19] + x[20] + x[21] + x[22] - x[23] - x[24] + x[25] - x[26] - x[27]; + out[5] = - x[0] - x[1] - x[2] - x[3] - x[4] + x[5] - x[6] - x[7] - x[8] - x[9] - 
x[10] + x[11] + x[12] + x[13] + x[14] - x[15] - x[16] + x[17] - x[18] + x[19] - x[20] - x[21] + x[22] + x[23] - x[24] - x[25] + x[26] - x[27]; + out[6] = - x[0] - x[1] - x[2] - x[3] - x[4] - x[5] + x[6] + x[7] - x[8] - x[9] - x[10] - x[11] + x[12] + x[13] - x[14] + x[15] - x[16] - x[17] + x[18] - x[19] + x[20] - x[21] - x[22] + x[23] + x[24] - x[25] - x[26] + x[27]; + out[7] = - x[0] - x[1] + x[2] + x[3] + x[4] + x[5] - x[6] + x[7] - x[8] - x[9] - x[10] - x[11] - x[12] - x[13] - x[14] + x[15] + x[16] - x[17] - x[18] + x[19] + x[20] + x[21] - x[22] + x[23] - x[24] - x[25] + x[26] - x[27]; + out[8] = - x[0] - x[1] - x[2] + x[3] + x[4] + x[5] + x[6] - x[7] + x[8] - x[9] - x[10] - x[11] - x[12] - x[13] + x[14] - x[15] + x[16] + x[17] - x[18] - x[19] + x[20] - x[21] + x[22] - x[23] + x[24] - x[25] - x[26] + x[27]; + out[9] = + x[0] - x[1] - x[2] - x[3] + x[4] + x[5] + x[6] - x[7] - x[8] + x[9] - x[10] - x[11] - x[12] - x[13] + x[14] + x[15] - x[16] + x[17] + x[18] - x[19] - x[20] + x[21] - x[22] + x[23] - x[24] + x[25] - x[26] - x[27]; + out[10] = + x[0] + x[1] - x[2] - x[3] - x[4] + x[5] + x[6] - x[7] - x[8] - x[9] + x[10] - x[11] - x[12] - x[13] - x[14] + x[15] + x[16] - x[17] + x[18] + x[19] - x[20] - x[21] + x[22] - x[23] + x[24] - x[25] + x[26] - x[27]; + out[11] = + x[0] + x[1] + x[2] - x[3] - x[4] - x[5] + x[6] - x[7] - x[8] - x[9] - x[10] + x[11] - x[12] - x[13] - x[14] - x[15] + x[16] + x[17] - x[18] + x[19] + x[20] - x[21] - x[22] + x[23] - x[24] + x[25] - x[26] + x[27]; + out[12] = + x[0] + x[1] + x[2] + x[3] - x[4] - x[5] - x[6] - x[7] - x[8] - x[9] - x[10] - x[11] + x[12] - x[13] + x[14] - x[15] - x[16] + x[17] + x[18] - x[19] + x[20] + x[21] - x[22] - x[23] + x[24] - x[25] + x[26] - x[27]; + out[13] = - x[0] + x[1] + x[2] + x[3] + x[4] - x[5] - x[6] - x[7] - x[8] - x[9] - x[10] - x[11] - x[12] + x[13] + x[14] + x[15] - x[16] - x[17] + x[18] + x[19] - x[20] - x[21] + x[22] - x[23] - x[24] + x[25] - x[26] + x[27]; + out[14] = - x[0] + x[1] - x[2] + x[3] + x[4] - x[5] + x[6] + x[7] - x[8] - x[9] + x[10] + x[11] - x[12] - x[13] + x[14] - x[15] - x[16] - x[17] - x[18] - x[19] - x[20] - x[21] - x[22] + x[23] + x[24] + x[25] + x[26] - x[27]; + out[15] = + x[0] - x[1] + x[2] - x[3] + x[4] + x[5] - x[6] - x[7] + x[8] - x[9] - x[10] + x[11] + x[12] - x[13] - x[14] + x[15] - x[16] - x[17] - x[18] - x[19] - x[20] - x[21] - x[22] - x[23] + x[24] + x[25] + x[26] + x[27]; + out[16] = - x[0] + x[1] - x[2] + x[3] - x[4] + x[5] + x[6] - x[7] - x[8] + x[9] - x[10] - x[11] + x[12] + x[13] - x[14] - x[15] + x[16] - x[17] - x[18] - x[19] - x[20] + x[21] - x[22] - x[23] - x[24] + x[25] + x[26] + x[27]; + out[17] = + x[0] - x[1] + x[2] - x[3] + x[4] - x[5] + x[6] + x[7] - x[8] - x[9] + x[10] - x[11] - x[12] + x[13] - x[14] - x[15] - x[16] + x[17] - x[18] - x[19] - x[20] + x[21] + x[22] - x[23] - x[24] - x[25] + x[26] + x[27]; + out[18] = + x[0] + x[1] - x[2] + x[3] - x[4] + x[5] - x[6] + x[7] + x[8] - x[9] - x[10] + x[11] - x[12] - x[13] - x[14] - x[15] - x[16] - x[17] + x[18] - x[19] - x[20] + x[21] + x[22] + x[23] - x[24] - x[25] - x[26] + x[27]; + out[19] = - x[0] + x[1] + x[2] - x[3] + x[4] - x[5] + x[6] - x[7] + x[8] + x[9] - x[10] - x[11] + x[12] - x[13] - x[14] - x[15] - x[16] - x[17] - x[18] + x[19] - x[20] + x[21] + x[22] + x[23] + x[24] - x[25] - x[26] - x[27]; + out[20] = + x[0] - x[1] + x[2] + x[3] - x[4] + x[5] - x[6] - x[7] - x[8] + x[9] + x[10] - x[11] - x[12] + x[13] - x[14] - x[15] - x[16] - x[17] - x[18] - x[19] + x[20] - x[21] + x[22] + x[23] + x[24] + x[25] - x[26] - x[27]; + out[21] 
= - x[0] + x[1] + x[2] - x[3] - x[4] + x[5] + x[6] - x[7] + x[8] - x[9] + x[10] + x[11] - x[12] + x[13] + x[14] + x[15] - x[16] - x[17] - x[18] - x[19] + x[20] + x[21] - x[22] - x[23] - x[24] - x[25] - x[26] - x[27]; + out[22] = + x[0] - x[1] + x[2] + x[3] - x[4] - x[5] + x[6] + x[7] - x[8] + x[9] - x[10] + x[11] + x[12] - x[13] + x[14] + x[15] + x[16] - x[17] - x[18] - x[19] - x[20] - x[21] + x[22] - x[23] - x[24] - x[25] - x[26] - x[27]; + out[23] = + x[0] + x[1] - x[2] + x[3] + x[4] - x[5] - x[6] - x[7] + x[8] - x[9] + x[10] - x[11] + x[12] + x[13] - x[14] + x[15] + x[16] + x[17] - x[18] - x[19] - x[20] - x[21] - x[22] + x[23] - x[24] - x[25] - x[26] - x[27]; + out[24] = - x[0] + x[1] + x[2] - x[3] + x[4] + x[5] - x[6] + x[7] - x[8] + x[9] - x[10] + x[11] - x[12] + x[13] - x[14] - x[15] + x[16] + x[17] + x[18] - x[19] - x[20] - x[21] - x[22] - x[23] + x[24] - x[25] - x[26] - x[27]; + out[25] = - x[0] - x[1] + x[2] + x[3] - x[4] + x[5] + x[6] + x[7] + x[8] - x[9] + x[10] - x[11] + x[12] - x[13] - x[14] - x[15] - x[16] + x[17] + x[18] + x[19] - x[20] - x[21] - x[22] - x[23] - x[24] + x[25] - x[26] - x[27]; + out[26] = + x[0] - x[1] - x[2] + x[3] + x[4] - x[5] + x[6] - x[7] + x[8] + x[9] - x[10] + x[11] - x[12] + x[13] - x[14] - x[15] - x[16] - x[17] + x[18] + x[19] + x[20] - x[21] - x[22] - x[23] - x[24] - x[25] + x[26] - x[27]; + out[27] = + x[0] + x[1] - x[2] - x[3] + x[4] + x[5] - x[6] + x[7] - x[8] + x[9] + x[10] - x[11] + x[12] - x[13] + x[14] - x[15] - x[16] - x[17] - x[18] + x[19] + x[20] - x[21] - x[22] - x[23] - x[24] - x[25] - x[26] + x[27]; + #pragma unroll + for (int i = 0; i < 28; i++) { x[i] = out[i]; } +} + + +template +void hadamard_mult_40(T* x) { + float out[40]; + out[0] = + x[0] - x[1] - x[2] - x[3] - x[4] - x[5] - x[6] - x[7] - x[8] - x[9] - x[10] - x[11] - x[12] - x[13] - x[14] - x[15] - x[16] - x[17] - x[18] - x[19] + x[20] - x[21] - x[22] - x[23] - x[24] - x[25] - x[26] - x[27] - x[28] - x[29] - x[30] - x[31] - x[32] - x[33] - x[34] - x[35] - x[36] - x[37] - x[38] - x[39]; + out[1] = + x[0] + x[1] - x[2] + x[3] + x[4] - x[5] - x[6] - x[7] - x[8] + x[9] - x[10] + x[11] - x[12] + x[13] + x[14] + x[15] + x[16] - x[17] - x[18] + x[19] + x[20] + x[21] - x[22] + x[23] + x[24] - x[25] - x[26] - x[27] - x[28] + x[29] - x[30] + x[31] - x[32] + x[33] + x[34] + x[35] + x[36] - x[37] - x[38] + x[39]; + out[2] = + x[0] + x[1] + x[2] - x[3] + x[4] + x[5] - x[6] - x[7] - x[8] - x[9] + x[10] - x[11] + x[12] - x[13] + x[14] + x[15] + x[16] + x[17] - x[18] - x[19] + x[20] + x[21] + x[22] - x[23] + x[24] + x[25] - x[26] - x[27] - x[28] - x[29] + x[30] - x[31] + x[32] - x[33] + x[34] + x[35] + x[36] + x[37] - x[38] - x[39]; + out[3] = + x[0] - x[1] + x[2] + x[3] - x[4] + x[5] + x[6] - x[7] - x[8] - x[9] - x[10] + x[11] - x[12] + x[13] - x[14] + x[15] + x[16] + x[17] + x[18] - x[19] + x[20] - x[21] + x[22] + x[23] - x[24] + x[25] + x[26] - x[27] - x[28] - x[29] - x[30] + x[31] - x[32] + x[33] - x[34] + x[35] + x[36] + x[37] + x[38] - x[39]; + out[4] = + x[0] - x[1] - x[2] + x[3] + x[4] - x[5] + x[6] + x[7] - x[8] - x[9] - x[10] - x[11] + x[12] - x[13] + x[14] - x[15] + x[16] + x[17] + x[18] + x[19] + x[20] - x[21] - x[22] + x[23] + x[24] - x[25] + x[26] + x[27] - x[28] - x[29] - x[30] - x[31] + x[32] - x[33] + x[34] - x[35] + x[36] + x[37] + x[38] + x[39]; + out[5] = + x[0] + x[1] - x[2] - x[3] + x[4] + x[5] - x[6] + x[7] + x[8] - x[9] - x[10] - x[11] - x[12] + x[13] - x[14] + x[15] - x[16] + x[17] + x[18] + x[19] + x[20] + x[21] - x[22] - x[23] + x[24] + x[25] - x[26] + x[27] + x[28] 
- x[29] - x[30] - x[31] - x[32] + x[33] - x[34] + x[35] - x[36] + x[37] + x[38] + x[39]; + out[6] = + x[0] + x[1] + x[2] - x[3] - x[4] + x[5] + x[6] - x[7] + x[8] + x[9] - x[10] - x[11] - x[12] - x[13] + x[14] - x[15] + x[16] - x[17] + x[18] + x[19] + x[20] + x[21] + x[22] - x[23] - x[24] + x[25] + x[26] - x[27] + x[28] + x[29] - x[30] - x[31] - x[32] - x[33] + x[34] - x[35] + x[36] - x[37] + x[38] + x[39]; + out[7] = + x[0] + x[1] + x[2] + x[3] - x[4] - x[5] + x[6] + x[7] - x[8] + x[9] + x[10] - x[11] - x[12] - x[13] - x[14] + x[15] - x[16] + x[17] - x[18] + x[19] + x[20] + x[21] + x[22] + x[23] - x[24] - x[25] + x[26] + x[27] - x[28] + x[29] + x[30] - x[31] - x[32] - x[33] - x[34] + x[35] - x[36] + x[37] - x[38] + x[39]; + out[8] = + x[0] + x[1] + x[2] + x[3] + x[4] - x[5] - x[6] + x[7] + x[8] - x[9] + x[10] + x[11] - x[12] - x[13] - x[14] - x[15] + x[16] - x[17] + x[18] - x[19] + x[20] + x[21] + x[22] + x[23] + x[24] - x[25] - x[26] + x[27] + x[28] - x[29] + x[30] + x[31] - x[32] - x[33] - x[34] - x[35] + x[36] - x[37] + x[38] - x[39]; + out[9] = + x[0] - x[1] + x[2] + x[3] + x[4] + x[5] - x[6] - x[7] + x[8] + x[9] - x[10] + x[11] + x[12] - x[13] - x[14] - x[15] - x[16] + x[17] - x[18] + x[19] + x[20] - x[21] + x[22] + x[23] + x[24] + x[25] - x[26] - x[27] + x[28] + x[29] - x[30] + x[31] + x[32] - x[33] - x[34] - x[35] - x[36] + x[37] - x[38] + x[39]; + out[10] = + x[0] + x[1] - x[2] + x[3] + x[4] + x[5] + x[6] - x[7] - x[8] + x[9] + x[10] - x[11] + x[12] + x[13] - x[14] - x[15] - x[16] - x[17] + x[18] - x[19] + x[20] + x[21] - x[22] + x[23] + x[24] + x[25] + x[26] - x[27] - x[28] + x[29] + x[30] - x[31] + x[32] + x[33] - x[34] - x[35] - x[36] - x[37] + x[38] - x[39]; + out[11] = + x[0] - x[1] + x[2] - x[3] + x[4] + x[5] + x[6] + x[7] - x[8] - x[9] + x[10] + x[11] - x[12] + x[13] + x[14] - x[15] - x[16] - x[17] - x[18] + x[19] + x[20] - x[21] + x[22] - x[23] + x[24] + x[25] + x[26] + x[27] - x[28] - x[29] + x[30] + x[31] - x[32] + x[33] + x[34] - x[35] - x[36] - x[37] - x[38] + x[39]; + out[12] = + x[0] + x[1] - x[2] + x[3] - x[4] + x[5] + x[6] + x[7] + x[8] - x[9] - x[10] + x[11] + x[12] - x[13] + x[14] + x[15] - x[16] - x[17] - x[18] - x[19] + x[20] + x[21] - x[22] + x[23] - x[24] + x[25] + x[26] + x[27] + x[28] - x[29] - x[30] + x[31] + x[32] - x[33] + x[34] + x[35] - x[36] - x[37] - x[38] - x[39]; + out[13] = + x[0] - x[1] + x[2] - x[3] + x[4] - x[5] + x[6] + x[7] + x[8] + x[9] - x[10] - x[11] + x[12] + x[13] - x[14] + x[15] + x[16] - x[17] - x[18] - x[19] + x[20] - x[21] + x[22] - x[23] + x[24] - x[25] + x[26] + x[27] + x[28] + x[29] - x[30] - x[31] + x[32] + x[33] - x[34] + x[35] + x[36] - x[37] - x[38] - x[39]; + out[14] = + x[0] - x[1] - x[2] + x[3] - x[4] + x[5] - x[6] + x[7] + x[8] + x[9] + x[10] - x[11] - x[12] + x[13] + x[14] - x[15] + x[16] + x[17] - x[18] - x[19] + x[20] - x[21] - x[22] + x[23] - x[24] + x[25] - x[26] + x[27] + x[28] + x[29] + x[30] - x[31] - x[32] + x[33] + x[34] - x[35] + x[36] + x[37] - x[38] - x[39]; + out[15] = + x[0] - x[1] - x[2] - x[3] + x[4] - x[5] + x[6] - x[7] + x[8] + x[9] + x[10] + x[11] - x[12] - x[13] + x[14] + x[15] - x[16] + x[17] + x[18] - x[19] + x[20] - x[21] - x[22] - x[23] + x[24] - x[25] + x[26] - x[27] + x[28] + x[29] + x[30] + x[31] - x[32] - x[33] + x[34] + x[35] - x[36] + x[37] + x[38] - x[39]; + out[16] = + x[0] - x[1] - x[2] - x[3] - x[4] + x[5] - x[6] + x[7] - x[8] + x[9] + x[10] + x[11] + x[12] - x[13] - x[14] + x[15] + x[16] - x[17] + x[18] + x[19] + x[20] - x[21] - x[22] - x[23] - x[24] + x[25] - x[26] + x[27] - x[28] + 
x[29] + x[30] + x[31] + x[32] - x[33] - x[34] + x[35] + x[36] - x[37] + x[38] + x[39]; + out[17] = + x[0] + x[1] - x[2] - x[3] - x[4] - x[5] + x[6] - x[7] + x[8] - x[9] + x[10] + x[11] + x[12] + x[13] - x[14] - x[15] + x[16] + x[17] - x[18] + x[19] + x[20] + x[21] - x[22] - x[23] - x[24] - x[25] + x[26] - x[27] + x[28] - x[29] + x[30] + x[31] + x[32] + x[33] - x[34] - x[35] + x[36] + x[37] - x[38] + x[39]; + out[18] = + x[0] + x[1] + x[2] - x[3] - x[4] - x[5] - x[6] + x[7] - x[8] + x[9] - x[10] + x[11] + x[12] + x[13] + x[14] - x[15] - x[16] + x[17] + x[18] - x[19] + x[20] + x[21] + x[22] - x[23] - x[24] - x[25] - x[26] + x[27] - x[28] + x[29] - x[30] + x[31] + x[32] + x[33] + x[34] - x[35] - x[36] + x[37] + x[38] - x[39]; + out[19] = + x[0] - x[1] + x[2] + x[3] - x[4] - x[5] - x[6] - x[7] + x[8] - x[9] + x[10] - x[11] + x[12] + x[13] + x[14] + x[15] - x[16] - x[17] + x[18] + x[19] + x[20] - x[21] + x[22] + x[23] - x[24] - x[25] - x[26] - x[27] + x[28] - x[29] + x[30] - x[31] + x[32] + x[33] + x[34] + x[35] - x[36] - x[37] + x[38] + x[39]; + out[20] = + x[0] - x[1] - x[2] - x[3] - x[4] - x[5] - x[6] - x[7] - x[8] - x[9] - x[10] - x[11] - x[12] - x[13] - x[14] - x[15] - x[16] - x[17] - x[18] - x[19] - x[20] + x[21] + x[22] + x[23] + x[24] + x[25] + x[26] + x[27] + x[28] + x[29] + x[30] + x[31] + x[32] + x[33] + x[34] + x[35] + x[36] + x[37] + x[38] + x[39]; + out[21] = + x[0] + x[1] - x[2] + x[3] + x[4] - x[5] - x[6] - x[7] - x[8] + x[9] - x[10] + x[11] - x[12] + x[13] + x[14] + x[15] + x[16] - x[17] - x[18] + x[19] - x[20] - x[21] + x[22] - x[23] - x[24] + x[25] + x[26] + x[27] + x[28] - x[29] + x[30] - x[31] + x[32] - x[33] - x[34] - x[35] - x[36] + x[37] + x[38] - x[39]; + out[22] = + x[0] + x[1] + x[2] - x[3] + x[4] + x[5] - x[6] - x[7] - x[8] - x[9] + x[10] - x[11] + x[12] - x[13] + x[14] + x[15] + x[16] + x[17] - x[18] - x[19] - x[20] - x[21] - x[22] + x[23] - x[24] - x[25] + x[26] + x[27] + x[28] + x[29] - x[30] + x[31] - x[32] + x[33] - x[34] - x[35] - x[36] - x[37] + x[38] + x[39]; + out[23] = + x[0] - x[1] + x[2] + x[3] - x[4] + x[5] + x[6] - x[7] - x[8] - x[9] - x[10] + x[11] - x[12] + x[13] - x[14] + x[15] + x[16] + x[17] + x[18] - x[19] - x[20] + x[21] - x[22] - x[23] + x[24] - x[25] - x[26] + x[27] + x[28] + x[29] + x[30] - x[31] + x[32] - x[33] + x[34] - x[35] - x[36] - x[37] - x[38] + x[39]; + out[24] = + x[0] - x[1] - x[2] + x[3] + x[4] - x[5] + x[6] + x[7] - x[8] - x[9] - x[10] - x[11] + x[12] - x[13] + x[14] - x[15] + x[16] + x[17] + x[18] + x[19] - x[20] + x[21] + x[22] - x[23] - x[24] + x[25] - x[26] - x[27] + x[28] + x[29] + x[30] + x[31] - x[32] + x[33] - x[34] + x[35] - x[36] - x[37] - x[38] - x[39]; + out[25] = + x[0] + x[1] - x[2] - x[3] + x[4] + x[5] - x[6] + x[7] + x[8] - x[9] - x[10] - x[11] - x[12] + x[13] - x[14] + x[15] - x[16] + x[17] + x[18] + x[19] - x[20] - x[21] + x[22] + x[23] - x[24] - x[25] + x[26] - x[27] - x[28] + x[29] + x[30] + x[31] + x[32] - x[33] + x[34] - x[35] + x[36] - x[37] - x[38] - x[39]; + out[26] = + x[0] + x[1] + x[2] - x[3] - x[4] + x[5] + x[6] - x[7] + x[8] + x[9] - x[10] - x[11] - x[12] - x[13] + x[14] - x[15] + x[16] - x[17] + x[18] + x[19] - x[20] - x[21] - x[22] + x[23] + x[24] - x[25] - x[26] + x[27] - x[28] - x[29] + x[30] + x[31] + x[32] + x[33] - x[34] + x[35] - x[36] + x[37] - x[38] - x[39]; + out[27] = + x[0] + x[1] + x[2] + x[3] - x[4] - x[5] + x[6] + x[7] - x[8] + x[9] + x[10] - x[11] - x[12] - x[13] - x[14] + x[15] - x[16] + x[17] - x[18] + x[19] - x[20] - x[21] - x[22] - x[23] + x[24] + x[25] - x[26] - x[27] + x[28] - 
x[29] - x[30] + x[31] + x[32] + x[33] + x[34] - x[35] + x[36] - x[37] + x[38] - x[39]; + out[28] = + x[0] + x[1] + x[2] + x[3] + x[4] - x[5] - x[6] + x[7] + x[8] - x[9] + x[10] + x[11] - x[12] - x[13] - x[14] - x[15] + x[16] - x[17] + x[18] - x[19] - x[20] - x[21] - x[22] - x[23] - x[24] + x[25] + x[26] - x[27] - x[28] + x[29] - x[30] - x[31] + x[32] + x[33] + x[34] + x[35] - x[36] + x[37] - x[38] + x[39]; + out[29] = + x[0] - x[1] + x[2] + x[3] + x[4] + x[5] - x[6] - x[7] + x[8] + x[9] - x[10] + x[11] + x[12] - x[13] - x[14] - x[15] - x[16] + x[17] - x[18] + x[19] - x[20] + x[21] - x[22] - x[23] - x[24] - x[25] + x[26] + x[27] - x[28] - x[29] + x[30] - x[31] - x[32] + x[33] + x[34] + x[35] + x[36] - x[37] + x[38] - x[39]; + out[30] = + x[0] + x[1] - x[2] + x[3] + x[4] + x[5] + x[6] - x[7] - x[8] + x[9] + x[10] - x[11] + x[12] + x[13] - x[14] - x[15] - x[16] - x[17] + x[18] - x[19] - x[20] - x[21] + x[22] - x[23] - x[24] - x[25] - x[26] + x[27] + x[28] - x[29] - x[30] + x[31] - x[32] - x[33] + x[34] + x[35] + x[36] + x[37] - x[38] + x[39]; + out[31] = + x[0] - x[1] + x[2] - x[3] + x[4] + x[5] + x[6] + x[7] - x[8] - x[9] + x[10] + x[11] - x[12] + x[13] + x[14] - x[15] - x[16] - x[17] - x[18] + x[19] - x[20] + x[21] - x[22] + x[23] - x[24] - x[25] - x[26] - x[27] + x[28] + x[29] - x[30] - x[31] + x[32] - x[33] - x[34] + x[35] + x[36] + x[37] + x[38] - x[39]; + out[32] = + x[0] + x[1] - x[2] + x[3] - x[4] + x[5] + x[6] + x[7] + x[8] - x[9] - x[10] + x[11] + x[12] - x[13] + x[14] + x[15] - x[16] - x[17] - x[18] - x[19] - x[20] - x[21] + x[22] - x[23] + x[24] - x[25] - x[26] - x[27] - x[28] + x[29] + x[30] - x[31] - x[32] + x[33] - x[34] - x[35] + x[36] + x[37] + x[38] + x[39]; + out[33] = + x[0] - x[1] + x[2] - x[3] + x[4] - x[5] + x[6] + x[7] + x[8] + x[9] - x[10] - x[11] + x[12] + x[13] - x[14] + x[15] + x[16] - x[17] - x[18] - x[19] - x[20] + x[21] - x[22] + x[23] - x[24] + x[25] - x[26] - x[27] - x[28] - x[29] + x[30] + x[31] - x[32] - x[33] + x[34] - x[35] - x[36] + x[37] + x[38] + x[39]; + out[34] = + x[0] - x[1] - x[2] + x[3] - x[4] + x[5] - x[6] + x[7] + x[8] + x[9] + x[10] - x[11] - x[12] + x[13] + x[14] - x[15] + x[16] + x[17] - x[18] - x[19] - x[20] + x[21] + x[22] - x[23] + x[24] - x[25] + x[26] - x[27] - x[28] - x[29] - x[30] + x[31] + x[32] - x[33] - x[34] + x[35] - x[36] - x[37] + x[38] + x[39]; + out[35] = + x[0] - x[1] - x[2] - x[3] + x[4] - x[5] + x[6] - x[7] + x[8] + x[9] + x[10] + x[11] - x[12] - x[13] + x[14] + x[15] - x[16] + x[17] + x[18] - x[19] - x[20] + x[21] + x[22] + x[23] - x[24] + x[25] - x[26] + x[27] - x[28] - x[29] - x[30] - x[31] + x[32] + x[33] - x[34] - x[35] + x[36] - x[37] - x[38] + x[39]; + out[36] = + x[0] - x[1] - x[2] - x[3] - x[4] + x[5] - x[6] + x[7] - x[8] + x[9] + x[10] + x[11] + x[12] - x[13] - x[14] + x[15] + x[16] - x[17] + x[18] + x[19] - x[20] + x[21] + x[22] + x[23] + x[24] - x[25] + x[26] - x[27] + x[28] - x[29] - x[30] - x[31] - x[32] + x[33] + x[34] - x[35] - x[36] + x[37] - x[38] - x[39]; + out[37] = + x[0] + x[1] - x[2] - x[3] - x[4] - x[5] + x[6] - x[7] + x[8] - x[9] + x[10] + x[11] + x[12] + x[13] - x[14] - x[15] + x[16] + x[17] - x[18] + x[19] - x[20] - x[21] + x[22] + x[23] + x[24] + x[25] - x[26] + x[27] - x[28] + x[29] - x[30] - x[31] - x[32] - x[33] + x[34] + x[35] - x[36] - x[37] + x[38] - x[39]; + out[38] = + x[0] + x[1] + x[2] - x[3] - x[4] - x[5] - x[6] + x[7] - x[8] + x[9] - x[10] + x[11] + x[12] + x[13] + x[14] - x[15] - x[16] + x[17] + x[18] - x[19] - x[20] - x[21] - x[22] + x[23] + x[24] + x[25] + x[26] - x[27] + x[28] - 
x[29] + x[30] - x[31] - x[32] - x[33] - x[34] + x[35] + x[36] - x[37] - x[38] + x[39]; + out[39] = + x[0] - x[1] + x[2] + x[3] - x[4] - x[5] - x[6] - x[7] + x[8] - x[9] + x[10] - x[11] + x[12] + x[13] + x[14] + x[15] - x[16] - x[17] + x[18] + x[19] - x[20] + x[21] - x[22] - x[23] + x[24] + x[25] + x[26] + x[27] - x[28] + x[29] - x[30] + x[31] - x[32] - x[33] - x[34] - x[35] + x[36] + x[37] - x[38] - x[39]; + #pragma unroll + for (int i = 0; i < 40; i++) { x[i] = out[i]; } +} + diff --git a/extension/llm/custom_ops/spinquant/test/fast_hadamard_transform_test.cpp b/extension/llm/custom_ops/spinquant/test/fast_hadamard_transform_test.cpp new file mode 100644 index 00000000000..8587b600a3a --- /dev/null +++ b/extension/llm/custom_ops/spinquant/test/fast_hadamard_transform_test.cpp @@ -0,0 +1,156 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include +#include +#include +#include +#include + +#include + +#include +#include + +using executorch::runtime::testing::fast_hadamard_transform_28N_with_transpose; +using executorch::runtime::testing::random_floats; +using executorch::runtime::testing::reference_fht_impl; + +TEST(FastHadamardTransformTest, SingleElement) { + // FHT of a single element is a no-op. + std::array data = {{42}}; + executorch::fast_hadamard_transform(data.data(), 0); + EXPECT_EQ(data[0], 42); +} + +TEST(FastHadamardTransformTest, LargerInput) { + std::vector data = random_floats(4096); + + auto expected = data; + reference_fht_impl(expected.data(), expected.size()); + + auto actual = data; + executorch::fast_hadamard_transform(actual.data(), 12); + + for (int ii = 0; ii < expected.size(); ++ii) { + EXPECT_FLOAT_EQ(actual[ii], expected[ii]); + } +} + +TEST(FastHadamardTransform28NTest, Basic) { + std::vector data = random_floats(1024 * 28); + + auto expected = data; + fast_hadamard_transform_28N_with_transpose(expected.data(), 10); + + auto actual = data; + executorch::fast_hadamard_transform_28N(actual.data(), 10); + + for (int ii = 0; ii < actual.size(); ++ii) { + EXPECT_FLOAT_EQ(actual[ii], expected[ii]); + } +} + +namespace { +constexpr int32_t qmin = -(1 << 15) + 1; +constexpr int32_t qmax = -qmin; + +int16_t quantize(float x, float scale) { + float scaled = x / scale; + // XXX: Supposed to round ties to even, but this is just test code. 
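  // Aside on the comment above: std::lround (used just below) rounds halfway
  // cases away from zero rather than to even. A ties-to-even variant would be
  // a minimal sketch like the following, assuming <cfenv>/<cmath> are included
  // and the default FE_TONEAREST rounding mode is in effect:
  //   int32_t rounded = (int32_t)std::nearbyint(scaled);
  // For the normally distributed random inputs these tests use, exact halfway
  // cases are essentially never hit, so std::lround is fine for test code.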
+  int32_t scaled_int =
+      std::clamp((int32_t)std::lround(scaled), qmin, qmax);
+  return static_cast<int16_t>(scaled_int);
+}
+
+template <typename T>
+std::vector<T> quantize(const std::vector<float>& data, float scale) {
+  std::vector<T> result;
+  result.reserve(data.size());
+  for (const float unquant : data) {
+    result.push_back(quantize(unquant, scale));
+  }
+  return result;
+}
+
+template <typename T>
+std::pair<std::vector<T>, float> quantize(const std::vector<float>& data) {
+  auto [minIt, maxIt] = std::minmax_element(data.begin(), data.end());
+  float scale = (*maxIt - *minIt) / (qmax - qmin);
+  return {quantize<T>(data, scale), scale};
+}
+
+template <typename T>
+float dequantize(T x, float scale) {
+  return x * scale;
+}
+
+template <typename T>
+std::vector<float> dequantize(const std::vector<T>& data, float scale) {
+  static_assert(!std::is_same_v<T, float>);
+  std::vector<float> result;
+  result.reserve(data.size());
+  for (const T quant : data) {
+    result.push_back(dequantize(quant, scale));
+  }
+  return result;
+}
+
+#define EXPECT_CLOSE_IMPL(a, b, atol, rtol) \
+  EXPECT_LE(std::abs(a - b), atol + rtol * std::abs(b)) \
+      << "a: " << a << ", b: " << b
+#define EXPECT_CLOSE(a, b) EXPECT_CLOSE_IMPL(a, b, 2e-4, 1e-4)
+
+void testQuantizedFastHadamardTransform(int logN) {
+  std::vector<float> data = random_floats(1 << logN);
+
+  auto [qdata, scale] = quantize<int16_t>(data);
+
+  auto expected_unquant = dequantize(qdata, scale);
+  reference_fht_impl(expected_unquant.data(), expected_unquant.size());
+  auto expected = quantize<int16_t>(expected_unquant, scale);
+
+  auto actual = qdata;
+  executorch::fast_hadamard_transform_symmetric_quantized_s16(
+      actual.data(), logN);
+
+  for (int ii = 0; ii < expected.size(); ++ii) {
+    EXPECT_CLOSE(
+        dequantize(actual[ii], scale), dequantize(expected[ii], scale));
+  }
+}
+
+} // namespace
+
+TEST(QuantizedFastHadamardTransformTest, Basic) {
+  testQuantizedFastHadamardTransform(12); // 4096
+}
+
+TEST(QuantizedFastHadamardTransformTest, OddLogN) {
+  testQuantizedFastHadamardTransform(11); // 2048
+}
+
+TEST(QuantizedFastHadamardTransform28NTest, Basic) {
+  std::vector<float> data = random_floats(1024 * 28);
+
+  auto [qdata, scale] = quantize<int16_t>(data);
+
+  auto expected_unquant = dequantize(qdata, scale);
+  fast_hadamard_transform_28N_with_transpose(expected_unquant.data(), 10);
+  auto expected = quantize<int16_t>(expected_unquant, scale);
+
+  auto actual = qdata;
+  executorch::fast_hadamard_transform_symmetric_quantized_s16_28N(
+      actual.data(), 10);
+
+  for (int ii = 0; ii < expected.size(); ++ii) {
+    EXPECT_CLOSE(
+        dequantize(actual[ii], scale), dequantize(expected[ii], scale));
+  }
+}
diff --git a/extension/llm/custom_ops/spinquant/test/fast_hadamard_transform_test_impl.cpp b/extension/llm/custom_ops/spinquant/test/fast_hadamard_transform_test_impl.cpp
new file mode 100644
index 00000000000..25199f481ee
--- /dev/null
+++ b/extension/llm/custom_ops/spinquant/test/fast_hadamard_transform_test_impl.cpp
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */ + +#include +#include + +#include +#include +#include + +namespace executorch::runtime::testing { + +void reference_fht_impl(float* buf, int n) { + dumb_fht(buf, std::log2(n)); + const auto root_n = std::sqrt(n); + for (int ii = 0; ii < n; ++ii) { + buf[ii] /= root_n; + } +} + +std::vector random_floats(int howMany) { + std::random_device rd; + std::mt19937 gen(rd()); + std::normal_distribution dist; + std::vector data(howMany); + for (int ii = 0; ii < data.size(); ++ii) { + data[ii] = dist(gen); + } + return data; +} + +} // namespace executorch::runtime::testing diff --git a/extension/llm/custom_ops/spinquant/test/fast_hadamard_transform_test_impl.h b/extension/llm/custom_ops/spinquant/test/fast_hadamard_transform_test_impl.h new file mode 100644 index 00000000000..aaf4a9e5c0f --- /dev/null +++ b/extension/llm/custom_ops/spinquant/test/fast_hadamard_transform_test_impl.h @@ -0,0 +1,47 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include +#include + +#include +#include + +namespace executorch::runtime::testing { +void reference_fht_impl(float* buf, int n); + +// Alternate implementation of fast_hadamard_transform_28N to mutation +// test against. Benchmarking suggests this one is slower, which is +// why it's in the test. +template +void fast_hadamard_transform_28N_with_transpose(T* vec, int log2_vec_size) { + const int vec_size = (1 << log2_vec_size); + for (int ii = 0; ii < 28; ++ii) { + executorch::fast_hadamard_transform(&vec[ii * vec_size], log2_vec_size); + } + std::unique_ptr transposed = std::make_unique(28 * vec_size); + for (int ii = 0; ii < 28; ++ii) { + for (int jj = 0; jj < vec_size; ++jj) { + transposed[jj * 28 + ii] = vec[ii * vec_size + jj]; + } + } + for (int ii = 0; ii < vec_size; ++ii) { + hadamard_mult_28(&transposed[ii * 28]); + } + for (int jj = 0; jj < vec_size; ++jj) { + for (int ii = 0; ii < 28; ++ii) { + vec[ii * vec_size + jj] = transposed[jj * 28 + ii]; + } + } +} + +std::vector random_floats(int howMany); + +} // namespace executorch::runtime::testing diff --git a/extension/llm/custom_ops/spinquant/test/op_fast_hadamard_transform_test.cpp b/extension/llm/custom_ops/spinquant/test/op_fast_hadamard_transform_test.cpp new file mode 100644 index 00000000000..7ab2d6c3002 --- /dev/null +++ b/extension/llm/custom_ops/spinquant/test/op_fast_hadamard_transform_test.cpp @@ -0,0 +1,118 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include +#include +#include +#include + +#include + +#include + +using exec_aten::Tensor; + +using executorch::runtime::testing::fast_hadamard_transform_28N_with_transpose; +using executorch::runtime::testing::random_floats; +using executorch::runtime::testing::reference_fht_impl; + +namespace { +Tensor& fast_hadamard_transform_nocontext(const Tensor& vec, Tensor& out) { + exec_aten::RuntimeContext context; + return torch::executor::native::fast_hadamard_transform_out( + context, vec, out); +} +} // namespace + +TEST(OpFastHadamardTransformTest, EmptyInput) { + torch::executor::testing::TensorFactory tfFloat; + auto vec = tfFloat.zeros({0}); + auto out = tfFloat.zeros({0}); + auto result = fast_hadamard_transform_nocontext(vec, out); + EXPECT_EQ(result.numel(), 0); +} + +TEST(OpFastHadamardTransformTest, SingleElementInput) { + torch::executor::testing::TensorFactory tfFloat; + auto vec = tfFloat.ones({1}); + auto out = tfFloat.zeros({1}); + auto result = fast_hadamard_transform_nocontext(vec, out); + EXPECT_EQ(result.numel(), 1); + // FHT of a single element is a no-op. + EXPECT_EQ(result.const_data_ptr()[0], 1); +} + +TEST(OpFastHadamardTransformTest, FourKInput) { + torch::executor::testing::TensorFactory tfFloat; + std::vector data = random_floats(4096); + auto vec = tfFloat.make({4096}, data); + auto out = tfFloat.zeros({4096}); + auto result = fast_hadamard_transform_nocontext(vec, out); + + std::vector reference_result = data; + reference_fht_impl(reference_result.data(), reference_result.size()); + + const float* const result_data = result.const_data_ptr(); + for (int ii = 0; ii < data.size(); ++ii) { + EXPECT_FLOAT_EQ(result_data[ii], reference_result[ii]); + } +} + +TEST(OpFastHadamardTransformTest, MultipleRows) { + torch::executor::testing::TensorFactory tfFloat; + std::vector data = random_floats(8 * 8 * 8); + auto mat = tfFloat.make({8, 8, 8}, data); + auto out = tfFloat.zeros({8, 8, 8}); + + auto result = fast_hadamard_transform_nocontext(mat, out); + + std::vector reference_result = data; + for (int ii = 0; ii < 8; ++ii) { + for (int jj = 0; jj < 8; ++jj) { + reference_fht_impl(&reference_result[ii * 64 + jj * 8], 8); + } + } + + const float* const result_data = result.const_data_ptr(); + for (int ii = 0; ii < data.size(); ++ii) { + EXPECT_FLOAT_EQ(result_data[ii], reference_result[ii]); + } +} + +TEST(OpFastHadamardTransformTest, Basic28N) { + torch::executor::testing::TensorFactory tfFloat; + constexpr int kTestLogSize = 7; + constexpr int kTestPowerOfTwoSize = 1 << kTestLogSize; + constexpr int kTestTotalSize = kTestPowerOfTwoSize * 28; + std::vector data = random_floats(kTestTotalSize); + auto vec = tfFloat.make({kTestTotalSize}, data); + auto out = tfFloat.zeros({kTestTotalSize}); + + // The operator is supposed to autodetect 28 * 2**N size and handle + // accordingly. 
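  // As a hedged illustration of that autodetection (names hypothetical; the
  // real check lives in the operator implementation, not in this test), a
  // length qualifies for the 28N path when dividing it by 28 leaves a power
  // of two:
  auto looks_like_28_times_power_of_two = [](size_t n) {
    if (n == 0 || n % 28 != 0) {
      return false;
    }
    const size_t quotient = n / 28;
    return (quotient & (quotient - 1)) == 0; // power of two
  };
  EXPECT_TRUE(looks_like_28_times_power_of_two(kTestTotalSize));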
+ auto result = fast_hadamard_transform_nocontext(vec, out); + + std::vector reference_result = data; + fast_hadamard_transform_28N_with_transpose( + reference_result.data(), kTestLogSize); + + const float* const result_data = result.const_data_ptr(); + for (int ii = 0; ii < data.size(); ++ii) { + EXPECT_FLOAT_EQ(result_data[ii], reference_result[ii]); + } +} + +TEST(OpFastHadamardTransformTest, InvalidSize) { + torch::executor::testing::TensorFactory tfFloat; + auto mat = tfFloat.zeros({3}); + auto out = tfFloat.zeros({3}); + + exec_aten::RuntimeContext context; + torch::executor::native::fast_hadamard_transform_out(context, mat, out); + EXPECT_NE(context.failure_state(), executorch::runtime::Error::Ok); +} diff --git a/extension/llm/custom_ops/spinquant/test/targets.bzl b/extension/llm/custom_ops/spinquant/test/targets.bzl new file mode 100644 index 00000000000..47ae39752a8 --- /dev/null +++ b/extension/llm/custom_ops/spinquant/test/targets.bzl @@ -0,0 +1,42 @@ +load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") + +def define_common_targets(): + """Defines targets that should be shared between fbcode and xplat. + + The directory containing this targets.bzl file should also contain both + TARGETS and BUCK files that call this function. + """ + runtime.cxx_library( + name = "fast_hadamard_transform_test_impl", + srcs = ["fast_hadamard_transform_test_impl.cpp"], + exported_headers = [ + "fast_hadamard_transform_special_unstrided_cpu.h", + "fast_hadamard_transform_test_impl.h", + ], + exported_deps = [ + "//executorch/extension/llm/custom_ops/spinquant:fast_hadamard_transform", + ], + deps = [ + "//executorch/extension/llm/custom_ops/spinquant/third-party/FFHT:dumb_fht", + ], + ) + + runtime.cxx_test( + name = "fast_hadamard_transform_test", + srcs = ["fast_hadamard_transform_test.cpp"], + deps = [ + ":fast_hadamard_transform_test_impl", + "//executorch/extension/llm/custom_ops/spinquant:fast_hadamard_transform", + ], + ) + + runtime.cxx_test( + name = "op_fast_hadamard_transform_test", + srcs = ["op_fast_hadamard_transform_test.cpp"], + deps = [ + ":fast_hadamard_transform_test_impl", + "//executorch/extension/llm/custom_ops:custom_ops", + "//executorch/kernels/test:test_util", + "//executorch/runtime/core/exec_aten/testing_util:tensor_util", + ], + ) diff --git a/extension/llm/custom_ops/spinquant/third-party/FFHT/LICENSE.md b/extension/llm/custom_ops/spinquant/third-party/FFHT/LICENSE.md new file mode 100644 index 00000000000..52c4e01cd49 --- /dev/null +++ b/extension/llm/custom_ops/spinquant/third-party/FFHT/LICENSE.md @@ -0,0 +1,22 @@ +The MIT License (MIT) + +Copyright (c) 2015 Alexandr Andoni, Piotr Indyk, Thijs Laarhoven, +Ilya Razenshteyn, Ludwig Schmidt + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. diff --git a/extension/llm/custom_ops/spinquant/third-party/FFHT/Makefile b/extension/llm/custom_ops/spinquant/third-party/FFHT/Makefile new file mode 100644 index 00000000000..7cbeb3ddae9 --- /dev/null +++ b/extension/llm/custom_ops/spinquant/third-party/FFHT/Makefile @@ -0,0 +1,21 @@ +CC = gcc +CFLAGS = -O3 -march=native -std=c99 -pedantic -Wall -Wextra -Wshadow -Wpointer-arith -Wcast-qual -Wstrict-prototypes -Wmissing-prototypes + +all: test_float test_double fast_copy.o fht.o + +OBJ := dumb_fht.o fast_copy.o fht.o + +%.o: %.c + $(CC) $< -o $@ -c $(CFLAGS) + +test_%: test_%.c $(OBJ) + $(CC) $< $(OBJ) -o $@ $(CFLAGS) + +test_double_header_only: test_double_header_only.c + $(CC) $< -o $@ $(CFLAGS) + +test_float_header_only: test_double_header_only.c + $(CC) $< -o $@ $(CFLAGS) + +clean: + rm -f test_float test_double test_float_header_only test_double_header_only $(OBJ) diff --git a/extension/llm/custom_ops/spinquant/third-party/FFHT/README.md b/extension/llm/custom_ops/spinquant/third-party/FFHT/README.md new file mode 100644 index 00000000000..dcc9840f25a --- /dev/null +++ b/extension/llm/custom_ops/spinquant/third-party/FFHT/README.md @@ -0,0 +1,5 @@ +# Fast Fast Hadamard Transform + +This directory contains a fork of https://github.com/FALCONN-LIB/FFHT +(License: https://github.com/FALCONN-LIB/FFHT/blob/master/LICENSE.md) +focused on ARM64 NEON code generation. diff --git a/extension/llm/custom_ops/spinquant/third-party/FFHT/TARGETS b/extension/llm/custom_ops/spinquant/third-party/FFHT/TARGETS new file mode 100644 index 00000000000..0a42614a385 --- /dev/null +++ b/extension/llm/custom_ops/spinquant/third-party/FFHT/TARGETS @@ -0,0 +1,5 @@ +load(":targets.bzl", "define_common_targets") + +oncall("executorch") + +define_common_targets() diff --git a/extension/llm/custom_ops/spinquant/third-party/FFHT/dumb_fht.c b/extension/llm/custom_ops/spinquant/third-party/FFHT/dumb_fht.c new file mode 100644 index 00000000000..8f30f3e8ea3 --- /dev/null +++ b/extension/llm/custom_ops/spinquant/third-party/FFHT/dumb_fht.c @@ -0,0 +1,17 @@ +#include "dumb_fht.h" + +void dumb_fht(float* buf, int log_n) { + int n = 1 << log_n; + for (int i = 0; i < log_n; ++i) { + int s1 = 1 << i; + int s2 = s1 << 1; + for (int j = 0; j < n; j += s2) { + for (int k = 0; k < s1; ++k) { + float u = buf[j + k]; + float v = buf[j + k + s1]; + buf[j + k] = u + v; + buf[j + k + s1] = u - v; + } + } + } +} diff --git a/extension/llm/custom_ops/spinquant/third-party/FFHT/dumb_fht.h b/extension/llm/custom_ops/spinquant/third-party/FFHT/dumb_fht.h new file mode 100644 index 00000000000..8ea702d449e --- /dev/null +++ b/extension/llm/custom_ops/spinquant/third-party/FFHT/dumb_fht.h @@ -0,0 +1,14 @@ +#ifndef DUMB_FHT_H +#define DUMB_FHT_H + +#ifdef __cplusplus +extern "C" { +#endif + +void dumb_fht(float* buf, int log_n); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif /* DUMB_FHT_H */ diff --git a/extension/llm/custom_ops/spinquant/third-party/FFHT/example.py b/extension/llm/custom_ops/spinquant/third-party/FFHT/example.py new file mode 100644 index 00000000000..576c89830da --- /dev/null +++ b/extension/llm/custom_ops/spinquant/third-party/FFHT/example.py @@ -0,0 +1,20 @@ +import numpy as np +import ffht +import timeit +import sys + +reps = 1000 +n = 2**20 
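# Note: chunk_size below is declared but not passed to the ffht.fht(a) call in
# the timing loop, so it has no effect on this example.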
+chunk_size = 1024 + +a = np.random.randn(n).astype(np.float32) + +t1 = timeit.default_timer() +for i in range(reps): + ffht.fht(a) +t2 = timeit.default_timer() + +if sys.version_info[0] == 2: + print (t2 - t1 + 0.0) / (reps + 0.0) +if sys.version_info[0] == 3: + print('{}'.format((t2 - t1 + 0.0) / (reps + 0.0))) diff --git a/extension/llm/custom_ops/spinquant/third-party/FFHT/fast_copy.c b/extension/llm/custom_ops/spinquant/third-party/FFHT/fast_copy.c new file mode 100644 index 00000000000..bf3cbd1986d --- /dev/null +++ b/extension/llm/custom_ops/spinquant/third-party/FFHT/fast_copy.c @@ -0,0 +1,60 @@ +#include "fast_copy.h" +#include +#include +#if (defined(__x86_64__) || defined(__i386__)) +# include +#endif + +#ifdef FHT_HEADER_ONLY +# define _STORAGE_ static inline +#else +# define _STORAGE_ +#endif + +// These functions all assume that the size of memory being copied is a power of 2. + +#if _FEATURE_AVX512F +// If n is less than 64, defaults to memcpy. Otherwise, being a power of 2, we can just use unaligned stores and loads. +_STORAGE_ void *fast_copy(void *out, void *in, size_t n) { + if(n >= FAST_COPY_MEMCPY_THRESHOLD) { + return memcpy(out, in, n); + } + n >>= 6; + for(__m512 *ov = (__m512 *)out, *iv = (__m512 *)in; n--;) { + _mm512_storeu_ps((float *)(ov++), _mm512_loadu_ps((float *)(iv++))); + } + return out; +} +#elif __AVX2__ +// If n is less than 32, defaults to memcpy. Otherwise, being a power of 2, we can just use unaligned stores and loads. +_STORAGE_ void *fast_copy(void *out, void *in, size_t n) { + if(n >= FAST_COPY_MEMCPY_THRESHOLD) { + return memcpy(out, in, n); + } + n >>= 5; + for(__m256 *ov = (__m256 *)out, *iv = (__m256 *)in; n--;) { + _mm256_storeu_ps((float *)(ov++), _mm256_loadu_ps((float *)(iv++))); + } + return out; +} +#elif __SSE2__ +// If n is less than 16, defaults to memcpy. Otherwise, being a power of 2, we can just use unaligned stores and loads. 
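/*
 * For reference on the dispatch in this file: sizes at or above
 * FAST_COPY_MEMCPY_THRESHOLD (1 MiB by default, see fast_copy.h) go to plain
 * memcpy, and everything smaller takes the vectorized loop. The SSE2 loop
 * below copies in 16-byte chunks, so power-of-two sizes smaller than 16 bytes
 * fall through the loop without copying anything.
 */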
+_STORAGE_ void *fast_copy(void *out, void *in, size_t n) { + if(n >= FAST_COPY_MEMCPY_THRESHOLD) { + return memcpy(out, in, n); + } + n >>= 4; + for(__m128 *ov = (__m128 *)out, *iv = (__m128 *)in; n--;) { + _mm_storeu_ps((float *)(ov++), _mm_loadu_ps((float *)(iv++))); + } + return out; +} +#else +_STORAGE_ void *fast_copy(void *out, void *in, size_t n) { + return memcpy(out, in, n); +} +#endif + +#ifdef FHT_HEADER_ONLY +# undef _STORAGE_ +#endif diff --git a/extension/llm/custom_ops/spinquant/third-party/FFHT/fast_copy.h b/extension/llm/custom_ops/spinquant/third-party/FFHT/fast_copy.h new file mode 100644 index 00000000000..f4d4fabc01a --- /dev/null +++ b/extension/llm/custom_ops/spinquant/third-party/FFHT/fast_copy.h @@ -0,0 +1,21 @@ +#ifndef _FAST_COPY_H__ +#define _FAST_COPY_H__ +#include + +#ifndef FAST_COPY_MEMCPY_THRESHOLD +# define FAST_COPY_MEMCPY_THRESHOLD ((size_t)1ull << 20) +#endif + +#ifdef __cplusplus +extern "C" { +#endif +#ifdef FHT_HEADER_ONLY +#include "fast_copy.c" +#else +void *fast_copy(void *out, void *in, size_t m); +#endif +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // _FAST_COPY_H__ diff --git a/extension/llm/custom_ops/spinquant/third-party/FFHT/fht.c b/extension/llm/custom_ops/spinquant/third-party/FFHT/fht.c new file mode 100644 index 00000000000..c374ff618f6 --- /dev/null +++ b/extension/llm/custom_ops/spinquant/third-party/FFHT/fht.c @@ -0,0 +1,3 @@ +#include "fht.h" + +#include "fht_impl.h" diff --git a/extension/llm/custom_ops/spinquant/third-party/FFHT/fht.h b/extension/llm/custom_ops/spinquant/third-party/FFHT/fht.h new file mode 100644 index 00000000000..3bc78e353bc --- /dev/null +++ b/extension/llm/custom_ops/spinquant/third-party/FFHT/fht.h @@ -0,0 +1,45 @@ +#ifndef _FHT_H_ +#define _FHT_H_ +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +int fht_float(float* buf, int log_n); +#ifndef __aarch64__ +int fht_double(double* buf, int log_n); +#endif +int fht_float_oop(float* in, float* out, int log_n); +#ifndef __aarch64__ +int fht_double_oop(double* in, double* out, int log_n); +#endif + +#ifdef __cplusplus + +} // extern "C" + +static inline int fht(float* buf, int log_n) { + return fht_float(buf, log_n); +} + +#ifndef __aarch64__ +static inline int fht(double* buf, int log_n) { + return fht_double(buf, log_n); +} +#endif + +static inline int fht(float* buf, float* out, int log_n) { + return fht_float_oop(buf, out, log_n); +} + +#ifndef __aarch64__ +static inline int fht(double* buf, double* out, int log_n) { + return fht_double_oop(buf, out, log_n); +} +#endif + +#endif + +#endif diff --git a/extension/llm/custom_ops/spinquant/third-party/FFHT/fht_avx.c b/extension/llm/custom_ops/spinquant/third-party/FFHT/fht_avx.c new file mode 100644 index 00000000000..721130dc9f6 --- /dev/null +++ b/extension/llm/custom_ops/spinquant/third-party/FFHT/fht_avx.c @@ -0,0 +1,19671 @@ +#include "fht.h" +static inline void helper_float_1(float *buf); +static inline void helper_float_1(float *buf) { + for (int j = 0; j < 2; j += 2) { + for (int k = 0; k < 1; ++k) { + float u = buf[j + k]; + float v = buf[j + k + 1]; + buf[j + k] = u + v; + buf[j + k + 1] = u - v; + } + } +} +static inline void helper_float_2(float *buf); +static inline void helper_float_2(float *buf) { + for (int j = 0; j < 4; j += 2) { + for (int k = 0; k < 1; ++k) { + float u = buf[j + k]; + float v = buf[j + k + 1]; + buf[j + k] = u + v; + buf[j + k + 1] = u - v; + } + } + for (int j = 0; j < 4; j += 4) { + for (int k = 0; k < 2; ++k) { + float u = buf[j + k]; + float v = 
buf[j + k + 2]; + buf[j + k] = u + v; + buf[j + k + 2] = u - v; + } + } +} +static inline void helper_float_3(float *buf); +static inline void helper_float_3(float *buf) { + for (int j = 0; j < 8; j += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vpermilps $160, %%ymm0, %%ymm8\n" + "vpermilps $245, %%ymm0, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm0\n" + "vpermilps $68, %%ymm0, %%ymm8\n" + "vpermilps $238, %%ymm0, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm0\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm0, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm0, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm0\n" + "vmovups %%ymm0, (%0)\n" + :: "r"(buf + j) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } +} +static inline void helper_float_4(float *buf); +static inline void helper_float_4(float *buf) { + for (int j = 0; j < 16; j += 16) { + for (int k = 0; k < 8; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vpermilps $160, %%ymm0, %%ymm8\n" + "vpermilps $245, %%ymm0, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm0\n" + "vpermilps $160, %%ymm1, %%ymm8\n" + "vpermilps $245, %%ymm1, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm1\n" + "vpermilps $68, %%ymm0, %%ymm8\n" + "vpermilps $238, %%ymm0, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm0\n" + "vpermilps $68, %%ymm1, %%ymm8\n" + "vpermilps $238, %%ymm1, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm1\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm0, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm0, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm0\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm1, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm1, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm1\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 8) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } +} +static inline void helper_float_5(float *buf); +static inline void helper_float_5(float *buf) { + for (int j = 0; j < 32; j += 32) { + for (int k = 0; k < 8; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vpermilps $160, %%ymm0, %%ymm8\n" + "vpermilps $245, %%ymm0, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm0\n" + "vpermilps $160, %%ymm1, %%ymm8\n" + "vpermilps $245, %%ymm1, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps 
%%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm1\n" + "vpermilps $160, %%ymm2, %%ymm8\n" + "vpermilps $245, %%ymm2, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm2\n" + "vpermilps $160, %%ymm3, %%ymm8\n" + "vpermilps $245, %%ymm3, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm3\n" + "vpermilps $68, %%ymm0, %%ymm8\n" + "vpermilps $238, %%ymm0, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm0\n" + "vpermilps $68, %%ymm1, %%ymm8\n" + "vpermilps $238, %%ymm1, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm1\n" + "vpermilps $68, %%ymm2, %%ymm8\n" + "vpermilps $238, %%ymm2, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm2\n" + "vpermilps $68, %%ymm3, %%ymm8\n" + "vpermilps $238, %%ymm3, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm3\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm0, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm0, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm0\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm1, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm1, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm1\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm2, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm2, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm2\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm3, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm3, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm3\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vmovups %%ymm0, (%0)\n" + "vmovups %%ymm1, (%1)\n" + "vmovups %%ymm2, (%2)\n" + "vmovups %%ymm3, (%3)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 8), "r"(buf + j + k + 16), "r"(buf + j + k + 24) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } +} +static inline void helper_float_6(float *buf); +static inline void helper_float_6(float *buf) { + for (int j = 0; j < 64; j += 64) { + for (int k = 0; k < 8; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vpermilps $160, %%ymm0, %%ymm8\n" + "vpermilps $245, %%ymm0, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm0\n" + "vpermilps $160, %%ymm1, %%ymm8\n" + "vpermilps $245, %%ymm1, %%ymm9\n" + "vxorps %%ymm10, 
%%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm1\n" + "vpermilps $160, %%ymm2, %%ymm8\n" + "vpermilps $245, %%ymm2, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm2\n" + "vpermilps $160, %%ymm3, %%ymm8\n" + "vpermilps $245, %%ymm3, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm3\n" + "vpermilps $160, %%ymm4, %%ymm8\n" + "vpermilps $245, %%ymm4, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm4\n" + "vpermilps $160, %%ymm5, %%ymm8\n" + "vpermilps $245, %%ymm5, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm5\n" + "vpermilps $160, %%ymm6, %%ymm8\n" + "vpermilps $245, %%ymm6, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm6\n" + "vpermilps $160, %%ymm7, %%ymm8\n" + "vpermilps $245, %%ymm7, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm7\n" + "vpermilps $68, %%ymm0, %%ymm8\n" + "vpermilps $238, %%ymm0, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm0\n" + "vpermilps $68, %%ymm1, %%ymm8\n" + "vpermilps $238, %%ymm1, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm1\n" + "vpermilps $68, %%ymm2, %%ymm8\n" + "vpermilps $238, %%ymm2, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm2\n" + "vpermilps $68, %%ymm3, %%ymm8\n" + "vpermilps $238, %%ymm3, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm3\n" + "vpermilps $68, %%ymm4, %%ymm8\n" + "vpermilps $238, %%ymm4, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm4\n" + "vpermilps $68, %%ymm5, %%ymm8\n" + "vpermilps $238, %%ymm5, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm5\n" + "vpermilps $68, %%ymm6, %%ymm8\n" + "vpermilps $238, %%ymm6, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm6\n" + "vpermilps $68, %%ymm7, %%ymm8\n" + "vpermilps $238, %%ymm7, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm7\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm0, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm0, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm0\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm1, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm1, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm1\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + 
"vsubps %%ymm2, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm2, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm2\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm3, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm3, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm3\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm4, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm4, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm4\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm5, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm5, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm5\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm6, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm6, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm6\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm7, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm7, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 8), "r"(buf + j + k + 16), "r"(buf + j + k + 24), "r"(buf + j + k + 32), "r"(buf + j + k + 40), "r"(buf + j + k + 48), "r"(buf + j + k + 56) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } +} +void helper_float_7_recursive(float *buf, int depth); +void helper_float_7_recursive(float *buf, int depth) { + if (depth == 7) { + for (int j = 0; j < 128; j += 64) { + for (int k = 0; k < 8; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vpermilps $160, %%ymm0, %%ymm8\n" + "vpermilps $245, %%ymm0, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm0\n" + "vpermilps $160, %%ymm1, %%ymm8\n" + "vpermilps $245, %%ymm1, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm1\n" + "vpermilps $160, %%ymm2, %%ymm8\n" + "vpermilps $245, %%ymm2, %%ymm9\n" + "vxorps 
%%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm2\n" + "vpermilps $160, %%ymm3, %%ymm8\n" + "vpermilps $245, %%ymm3, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm3\n" + "vpermilps $160, %%ymm4, %%ymm8\n" + "vpermilps $245, %%ymm4, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm4\n" + "vpermilps $160, %%ymm5, %%ymm8\n" + "vpermilps $245, %%ymm5, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm5\n" + "vpermilps $160, %%ymm6, %%ymm8\n" + "vpermilps $245, %%ymm6, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm6\n" + "vpermilps $160, %%ymm7, %%ymm8\n" + "vpermilps $245, %%ymm7, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm7\n" + "vpermilps $68, %%ymm0, %%ymm8\n" + "vpermilps $238, %%ymm0, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm0\n" + "vpermilps $68, %%ymm1, %%ymm8\n" + "vpermilps $238, %%ymm1, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm1\n" + "vpermilps $68, %%ymm2, %%ymm8\n" + "vpermilps $238, %%ymm2, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm2\n" + "vpermilps $68, %%ymm3, %%ymm8\n" + "vpermilps $238, %%ymm3, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm3\n" + "vpermilps $68, %%ymm4, %%ymm8\n" + "vpermilps $238, %%ymm4, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm4\n" + "vpermilps $68, %%ymm5, %%ymm8\n" + "vpermilps $238, %%ymm5, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm5\n" + "vpermilps $68, %%ymm6, %%ymm8\n" + "vpermilps $238, %%ymm6, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm6\n" + "vpermilps $68, %%ymm7, %%ymm8\n" + "vpermilps $238, %%ymm7, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm7\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm0, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm0, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm0\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm1, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm1, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm1\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm2, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm2, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm2\n" + "vxorps %%ymm8, %%ymm8, 
%%ymm8\n" + "vsubps %%ymm3, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm3, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm3\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm4, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm4, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm4\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm5, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm5, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm5\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm6, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm6, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm6\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm7, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm7, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 8), "r"(buf + j + k + 16), "r"(buf + j + k + 24), "r"(buf + j + k + 32), "r"(buf + j + k + 40), "r"(buf + j + k + 48), "r"(buf + j + k + 56) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + for (int j = 0; j < 128; j += 128) { + for (int k = 0; k < 64; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 64) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } +} +void helper_float_7(float *buf); +void helper_float_7(float *buf) { + helper_float_7_recursive(buf, 7); +} +void helper_float_8_recursive(float *buf, int depth); +void helper_float_8_recursive(float *buf, int depth) { + if (depth == 6) { + for (int j = 0; j < 64; j += 64) { + for (int k = 0; k < 8; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vpermilps $160, %%ymm0, %%ymm8\n" + 
"vpermilps $245, %%ymm0, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm0\n" + "vpermilps $160, %%ymm1, %%ymm8\n" + "vpermilps $245, %%ymm1, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm1\n" + "vpermilps $160, %%ymm2, %%ymm8\n" + "vpermilps $245, %%ymm2, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm2\n" + "vpermilps $160, %%ymm3, %%ymm8\n" + "vpermilps $245, %%ymm3, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm3\n" + "vpermilps $160, %%ymm4, %%ymm8\n" + "vpermilps $245, %%ymm4, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm4\n" + "vpermilps $160, %%ymm5, %%ymm8\n" + "vpermilps $245, %%ymm5, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm5\n" + "vpermilps $160, %%ymm6, %%ymm8\n" + "vpermilps $245, %%ymm6, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm6\n" + "vpermilps $160, %%ymm7, %%ymm8\n" + "vpermilps $245, %%ymm7, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm7\n" + "vpermilps $68, %%ymm0, %%ymm8\n" + "vpermilps $238, %%ymm0, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm0\n" + "vpermilps $68, %%ymm1, %%ymm8\n" + "vpermilps $238, %%ymm1, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm1\n" + "vpermilps $68, %%ymm2, %%ymm8\n" + "vpermilps $238, %%ymm2, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm2\n" + "vpermilps $68, %%ymm3, %%ymm8\n" + "vpermilps $238, %%ymm3, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm3\n" + "vpermilps $68, %%ymm4, %%ymm8\n" + "vpermilps $238, %%ymm4, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm4\n" + "vpermilps $68, %%ymm5, %%ymm8\n" + "vpermilps $238, %%ymm5, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm5\n" + "vpermilps $68, %%ymm6, %%ymm8\n" + "vpermilps $238, %%ymm6, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm6\n" + "vpermilps $68, %%ymm7, %%ymm8\n" + "vpermilps $238, %%ymm7, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm7\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm0, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm0, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm0\n" 
+ "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm1, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm1, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm1\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm2, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm2, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm2\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm3, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm3, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm3\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm4, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm4, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm4\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm5, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm5, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm5\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm6, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm6, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm6\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm7, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm7, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 8), "r"(buf + j + k + 16), "r"(buf + j + k + 24), "r"(buf + j + k + 32), "r"(buf + j + k + 40), "r"(buf + j + k + 48), "r"(buf + j + k + 56) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 8) { + helper_float_8_recursive(buf + 0, 6); + helper_float_8_recursive(buf + 64, 6); + helper_float_8_recursive(buf + 128, 6); + helper_float_8_recursive(buf + 192, 6); + for (int j = 0; j < 256; j += 256) { + for (int k = 0; k < 64; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps 
%%ymm11, %%ymm9, %%ymm3\n" + "vmovups %%ymm0, (%0)\n" + "vmovups %%ymm1, (%1)\n" + "vmovups %%ymm2, (%2)\n" + "vmovups %%ymm3, (%3)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 64), "r"(buf + j + k + 128), "r"(buf + j + k + 192) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } +} +void helper_float_8(float *buf); +void helper_float_8(float *buf) { + helper_float_8_recursive(buf, 8); +} +static inline void helper_float_9(float *buf); +static inline void helper_float_9(float *buf) { + for (int j = 0; j < 512; j += 64) { + for (int k = 0; k < 8; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vpermilps $160, %%ymm0, %%ymm8\n" + "vpermilps $245, %%ymm0, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm0\n" + "vpermilps $160, %%ymm1, %%ymm8\n" + "vpermilps $245, %%ymm1, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm1\n" + "vpermilps $160, %%ymm2, %%ymm8\n" + "vpermilps $245, %%ymm2, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm2\n" + "vpermilps $160, %%ymm3, %%ymm8\n" + "vpermilps $245, %%ymm3, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm3\n" + "vpermilps $160, %%ymm4, %%ymm8\n" + "vpermilps $245, %%ymm4, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm4\n" + "vpermilps $160, %%ymm5, %%ymm8\n" + "vpermilps $245, %%ymm5, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm5\n" + "vpermilps $160, %%ymm6, %%ymm8\n" + "vpermilps $245, %%ymm6, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm6\n" + "vpermilps $160, %%ymm7, %%ymm8\n" + "vpermilps $245, %%ymm7, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm7\n" + "vpermilps $68, %%ymm0, %%ymm8\n" + "vpermilps $238, %%ymm0, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm0\n" + "vpermilps $68, %%ymm1, %%ymm8\n" + "vpermilps $238, %%ymm1, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm1\n" + "vpermilps $68, %%ymm2, %%ymm8\n" + "vpermilps $238, %%ymm2, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm2\n" + "vpermilps $68, %%ymm3, %%ymm8\n" + "vpermilps $238, %%ymm3, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm3\n" + "vpermilps $68, %%ymm4, %%ymm8\n" + "vpermilps $238, %%ymm4, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + 
"vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm4\n" + "vpermilps $68, %%ymm5, %%ymm8\n" + "vpermilps $238, %%ymm5, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm5\n" + "vpermilps $68, %%ymm6, %%ymm8\n" + "vpermilps $238, %%ymm6, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm6\n" + "vpermilps $68, %%ymm7, %%ymm8\n" + "vpermilps $238, %%ymm7, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm7\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm0, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm0, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm0\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm1, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm1, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm1\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm2, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm2, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm2\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm3, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm3, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm3\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm4, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm4, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm4\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm5, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm5, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm5\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm6, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm6, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm6\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm7, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm7, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 8), "r"(buf 
+ j + k + 16), "r"(buf + j + k + 24), "r"(buf + j + k + 32), "r"(buf + j + k + 40), "r"(buf + j + k + 48), "r"(buf + j + k + 56) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + for (int j = 0; j < 512; j += 512) { + for (int k = 0; k < 64; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 64), "r"(buf + j + k + 128), "r"(buf + j + k + 192), "r"(buf + j + k + 256), "r"(buf + j + k + 320), "r"(buf + j + k + 384), "r"(buf + j + k + 448) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } +} +void helper_float_10_recursive(float *buf, int depth); +void helper_float_10_recursive(float *buf, int depth) { + if (depth == 10) { + for (int j = 0; j < 1024; j += 64) { + for (int k = 0; k < 8; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vpermilps $160, %%ymm0, %%ymm8\n" + "vpermilps $245, %%ymm0, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm0\n" + "vpermilps $160, %%ymm1, %%ymm8\n" + "vpermilps $245, %%ymm1, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm1\n" + "vpermilps $160, %%ymm2, %%ymm8\n" + "vpermilps $245, %%ymm2, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm2\n" + "vpermilps $160, %%ymm3, %%ymm8\n" + "vpermilps $245, %%ymm3, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm3\n" + "vpermilps $160, %%ymm4, %%ymm8\n" + "vpermilps $245, %%ymm4, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm4\n" + "vpermilps $160, %%ymm5, %%ymm8\n" + "vpermilps $245, 
%%ymm5, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm5\n" + "vpermilps $160, %%ymm6, %%ymm8\n" + "vpermilps $245, %%ymm6, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm6\n" + "vpermilps $160, %%ymm7, %%ymm8\n" + "vpermilps $245, %%ymm7, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm7\n" + "vpermilps $68, %%ymm0, %%ymm8\n" + "vpermilps $238, %%ymm0, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm0\n" + "vpermilps $68, %%ymm1, %%ymm8\n" + "vpermilps $238, %%ymm1, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm1\n" + "vpermilps $68, %%ymm2, %%ymm8\n" + "vpermilps $238, %%ymm2, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm2\n" + "vpermilps $68, %%ymm3, %%ymm8\n" + "vpermilps $238, %%ymm3, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm3\n" + "vpermilps $68, %%ymm4, %%ymm8\n" + "vpermilps $238, %%ymm4, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm4\n" + "vpermilps $68, %%ymm5, %%ymm8\n" + "vpermilps $238, %%ymm5, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm5\n" + "vpermilps $68, %%ymm6, %%ymm8\n" + "vpermilps $238, %%ymm6, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm6\n" + "vpermilps $68, %%ymm7, %%ymm8\n" + "vpermilps $238, %%ymm7, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm7\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm0, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm0, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm0\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm1, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm1, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm1\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm2, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm2, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm2\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm3, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm3, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm3\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm4, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm4, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm4\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm5, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm5, %%ymm11\n" + 
"vaddps %%ymm10, %%ymm11, %%ymm5\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm6, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm6, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm6\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm7, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm7, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 8), "r"(buf + j + k + 16), "r"(buf + j + k + 24), "r"(buf + j + k + 32), "r"(buf + j + k + 40), "r"(buf + j + k + 48), "r"(buf + j + k + 56) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + for (int j = 0; j < 1024; j += 512) { + for (int k = 0; k < 64; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 64), "r"(buf + j + k + 128), "r"(buf + j + k + 192), "r"(buf + j + k + 256), "r"(buf + j + k + 320), "r"(buf + j + k + 384), "r"(buf + j + k + 448) : "%ymm0", "%ymm1", "%ymm2", 
"%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + for (int j = 0; j < 1024; j += 1024) { + for (int k = 0; k < 512; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 512) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } +} +void helper_float_10(float *buf); +void helper_float_10(float *buf) { + helper_float_10_recursive(buf, 10); +} +void helper_float_11_recursive(float *buf, int depth); +void helper_float_11_recursive(float *buf, int depth) { + if (depth == 11) { + for (int j = 0; j < 2048; j += 64) { + for (int k = 0; k < 8; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vpermilps $160, %%ymm0, %%ymm8\n" + "vpermilps $245, %%ymm0, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm0\n" + "vpermilps $160, %%ymm1, %%ymm8\n" + "vpermilps $245, %%ymm1, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm1\n" + "vpermilps $160, %%ymm2, %%ymm8\n" + "vpermilps $245, %%ymm2, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm2\n" + "vpermilps $160, %%ymm3, %%ymm8\n" + "vpermilps $245, %%ymm3, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm3\n" + "vpermilps $160, %%ymm4, %%ymm8\n" + "vpermilps $245, %%ymm4, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm4\n" + "vpermilps $160, %%ymm5, %%ymm8\n" + "vpermilps $245, %%ymm5, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm5\n" + "vpermilps $160, %%ymm6, %%ymm8\n" + "vpermilps $245, %%ymm6, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm6\n" + "vpermilps $160, %%ymm7, %%ymm8\n" + "vpermilps $245, %%ymm7, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm7\n" + "vpermilps $68, %%ymm0, %%ymm8\n" + "vpermilps $238, %%ymm0, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm0\n" + "vpermilps $68, %%ymm1, %%ymm8\n" + "vpermilps $238, %%ymm1, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm1\n" + "vpermilps $68, %%ymm2, %%ymm8\n" + "vpermilps $238, %%ymm2, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm2\n" + "vpermilps $68, %%ymm3, %%ymm8\n" + "vpermilps $238, %%ymm3, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, 
%%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm3\n" + "vpermilps $68, %%ymm4, %%ymm8\n" + "vpermilps $238, %%ymm4, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm4\n" + "vpermilps $68, %%ymm5, %%ymm8\n" + "vpermilps $238, %%ymm5, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm5\n" + "vpermilps $68, %%ymm6, %%ymm8\n" + "vpermilps $238, %%ymm6, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm6\n" + "vpermilps $68, %%ymm7, %%ymm8\n" + "vpermilps $238, %%ymm7, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm7\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm0, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm0, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm0\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm1, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm1, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm1\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm2, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm2, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm2\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm3, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm3, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm3\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm4, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm4, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm4\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm5, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm5, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm5\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm6, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm6, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm6\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm7, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm7, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, 
%%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 8), "r"(buf + j + k + 16), "r"(buf + j + k + 24), "r"(buf + j + k + 32), "r"(buf + j + k + 40), "r"(buf + j + k + 48), "r"(buf + j + k + 56) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + for (int j = 0; j < 2048; j += 512) { + for (int k = 0; k < 64; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 64), "r"(buf + j + k + 128), "r"(buf + j + k + 192), "r"(buf + j + k + 256), "r"(buf + j + k + 320), "r"(buf + j + k + 384), "r"(buf + j + k + 448) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + for (int j = 0; j < 2048; j += 2048) { + for (int k = 0; k < 512; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vmovups %%ymm0, (%0)\n" + "vmovups %%ymm1, (%1)\n" + "vmovups %%ymm2, (%2)\n" + "vmovups %%ymm3, (%3)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 512), "r"(buf + j + k + 1024), "r"(buf + j + k + 1536) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } +} +void helper_float_11(float *buf); +void helper_float_11(float *buf) { + helper_float_11_recursive(buf, 11); +} +static inline void helper_float_12(float *buf); +static inline void helper_float_12(float *buf) { + for (int j = 0; j < 4096; j += 64) { + for (int k = 0; k < 
8; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vpermilps $160, %%ymm0, %%ymm8\n" + "vpermilps $245, %%ymm0, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm0\n" + "vpermilps $160, %%ymm1, %%ymm8\n" + "vpermilps $245, %%ymm1, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm1\n" + "vpermilps $160, %%ymm2, %%ymm8\n" + "vpermilps $245, %%ymm2, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm2\n" + "vpermilps $160, %%ymm3, %%ymm8\n" + "vpermilps $245, %%ymm3, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm3\n" + "vpermilps $160, %%ymm4, %%ymm8\n" + "vpermilps $245, %%ymm4, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm4\n" + "vpermilps $160, %%ymm5, %%ymm8\n" + "vpermilps $245, %%ymm5, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm5\n" + "vpermilps $160, %%ymm6, %%ymm8\n" + "vpermilps $245, %%ymm6, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm6\n" + "vpermilps $160, %%ymm7, %%ymm8\n" + "vpermilps $245, %%ymm7, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm7\n" + "vpermilps $68, %%ymm0, %%ymm8\n" + "vpermilps $238, %%ymm0, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm0\n" + "vpermilps $68, %%ymm1, %%ymm8\n" + "vpermilps $238, %%ymm1, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm1\n" + "vpermilps $68, %%ymm2, %%ymm8\n" + "vpermilps $238, %%ymm2, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm2\n" + "vpermilps $68, %%ymm3, %%ymm8\n" + "vpermilps $238, %%ymm3, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm3\n" + "vpermilps $68, %%ymm4, %%ymm8\n" + "vpermilps $238, %%ymm4, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm4\n" + "vpermilps $68, %%ymm5, %%ymm8\n" + "vpermilps $238, %%ymm5, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm5\n" + "vpermilps $68, %%ymm6, %%ymm8\n" + "vpermilps $238, %%ymm6, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm6\n" + "vpermilps $68, %%ymm7, %%ymm8\n" + "vpermilps $238, %%ymm7, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, 
%%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm7\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm0, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm0, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm0\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm1, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm1, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm1\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm2, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm2, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm2\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm3, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm3, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm3\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm4, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm4, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm4\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm5, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm5, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm5\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm6, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm6, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm6\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm7, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm7, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 8), "r"(buf + j + k + 16), "r"(buf + j + k + 24), "r"(buf + j + k + 32), "r"(buf + j + k + 40), "r"(buf + j + k + 48), "r"(buf + j + k + 56) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + for (int j = 0; j < 4096; j += 512) { + for (int k = 0; k < 64; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, 
%%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 64), "r"(buf + j + k + 128), "r"(buf + j + k + 192), "r"(buf + j + k + 256), "r"(buf + j + k + 320), "r"(buf + j + k + 384), "r"(buf + j + k + 448) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + for (int j = 0; j < 4096; j += 4096) { + for (int k = 0; k < 512; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 512), "r"(buf + j + k + 1024), "r"(buf + j + k + 1536), "r"(buf + j + k + 2048), "r"(buf + j + k + 2560), "r"(buf + j + k + 3072), "r"(buf + j + k + 3584) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } +} +void helper_float_13_recursive(float *buf, int depth); +void helper_float_13_recursive(float *buf, int depth) { + if (depth == 11) { + for (int j = 0; j < 2048; j += 64) { + for (int k = 0; k < 8; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" 
+ "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vpermilps $160, %%ymm0, %%ymm8\n" + "vpermilps $245, %%ymm0, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm0\n" + "vpermilps $160, %%ymm1, %%ymm8\n" + "vpermilps $245, %%ymm1, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm1\n" + "vpermilps $160, %%ymm2, %%ymm8\n" + "vpermilps $245, %%ymm2, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm2\n" + "vpermilps $160, %%ymm3, %%ymm8\n" + "vpermilps $245, %%ymm3, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm3\n" + "vpermilps $160, %%ymm4, %%ymm8\n" + "vpermilps $245, %%ymm4, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm4\n" + "vpermilps $160, %%ymm5, %%ymm8\n" + "vpermilps $245, %%ymm5, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm5\n" + "vpermilps $160, %%ymm6, %%ymm8\n" + "vpermilps $245, %%ymm6, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm6\n" + "vpermilps $160, %%ymm7, %%ymm8\n" + "vpermilps $245, %%ymm7, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm7\n" + "vpermilps $68, %%ymm0, %%ymm8\n" + "vpermilps $238, %%ymm0, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm0\n" + "vpermilps $68, %%ymm1, %%ymm8\n" + "vpermilps $238, %%ymm1, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm1\n" + "vpermilps $68, %%ymm2, %%ymm8\n" + "vpermilps $238, %%ymm2, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm2\n" + "vpermilps $68, %%ymm3, %%ymm8\n" + "vpermilps $238, %%ymm3, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm3\n" + "vpermilps $68, %%ymm4, %%ymm8\n" + "vpermilps $238, %%ymm4, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm4\n" + "vpermilps $68, %%ymm5, %%ymm8\n" + "vpermilps $238, %%ymm5, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm5\n" + "vpermilps $68, %%ymm6, %%ymm8\n" + "vpermilps $238, %%ymm6, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm6\n" + "vpermilps $68, %%ymm7, %%ymm8\n" + "vpermilps $238, %%ymm7, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm7\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm0, 
%%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm0, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm0\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm1, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm1, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm1\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm2, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm2, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm2\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm3, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm3, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm3\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm4, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm4, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm4\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm5, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm5, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm5\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm6, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm6, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm6\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm7, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm7, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 8), "r"(buf + j + k + 16), "r"(buf + j + k + 24), "r"(buf + j + k + 32), "r"(buf + j + k + 40), "r"(buf + j + k + 48), "r"(buf + j + k + 56) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + for (int j = 0; j < 2048; j += 512) { + for (int k = 0; k < 64; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, 
%%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 64), "r"(buf + j + k + 128), "r"(buf + j + k + 192), "r"(buf + j + k + 256), "r"(buf + j + k + 320), "r"(buf + j + k + 384), "r"(buf + j + k + 448) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + for (int j = 0; j < 2048; j += 2048) { + for (int k = 0; k < 512; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vmovups %%ymm0, (%0)\n" + "vmovups %%ymm1, (%1)\n" + "vmovups %%ymm2, (%2)\n" + "vmovups %%ymm3, (%3)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 512), "r"(buf + j + k + 1024), "r"(buf + j + k + 1536) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 13) { + helper_float_13_recursive(buf + 0, 11); + helper_float_13_recursive(buf + 2048, 11); + helper_float_13_recursive(buf + 4096, 11); + helper_float_13_recursive(buf + 6144, 11); + for (int j = 0; j < 8192; j += 8192) { + for (int k = 0; k < 2048; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vmovups %%ymm0, (%0)\n" + "vmovups %%ymm1, (%1)\n" + "vmovups %%ymm2, (%2)\n" + "vmovups %%ymm3, (%3)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 2048), "r"(buf + j + k + 4096), "r"(buf + j + k + 6144) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } +} +void helper_float_13(float *buf); +void helper_float_13(float *buf) { + helper_float_13_recursive(buf, 13); +} +void helper_float_14_recursive(float *buf, int depth); +void helper_float_14_recursive(float *buf, int depth) { + if (depth == 12) { + for (int j = 0; j < 4096; j += 64) { + 
for (int k = 0; k < 8; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vpermilps $160, %%ymm0, %%ymm8\n" + "vpermilps $245, %%ymm0, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm0\n" + "vpermilps $160, %%ymm1, %%ymm8\n" + "vpermilps $245, %%ymm1, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm1\n" + "vpermilps $160, %%ymm2, %%ymm8\n" + "vpermilps $245, %%ymm2, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm2\n" + "vpermilps $160, %%ymm3, %%ymm8\n" + "vpermilps $245, %%ymm3, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm3\n" + "vpermilps $160, %%ymm4, %%ymm8\n" + "vpermilps $245, %%ymm4, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm4\n" + "vpermilps $160, %%ymm5, %%ymm8\n" + "vpermilps $245, %%ymm5, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm5\n" + "vpermilps $160, %%ymm6, %%ymm8\n" + "vpermilps $245, %%ymm6, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm6\n" + "vpermilps $160, %%ymm7, %%ymm8\n" + "vpermilps $245, %%ymm7, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm7\n" + "vpermilps $68, %%ymm0, %%ymm8\n" + "vpermilps $238, %%ymm0, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm0\n" + "vpermilps $68, %%ymm1, %%ymm8\n" + "vpermilps $238, %%ymm1, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm1\n" + "vpermilps $68, %%ymm2, %%ymm8\n" + "vpermilps $238, %%ymm2, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm2\n" + "vpermilps $68, %%ymm3, %%ymm8\n" + "vpermilps $238, %%ymm3, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm3\n" + "vpermilps $68, %%ymm4, %%ymm8\n" + "vpermilps $238, %%ymm4, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm4\n" + "vpermilps $68, %%ymm5, %%ymm8\n" + "vpermilps $238, %%ymm5, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm5\n" + "vpermilps $68, %%ymm6, %%ymm8\n" + "vpermilps $238, %%ymm6, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm6\n" + "vpermilps $68, %%ymm7, %%ymm8\n" + "vpermilps $238, %%ymm7, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps 
%%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm7\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm0, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm0, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm0\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm1, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm1, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm1\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm2, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm2, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm2\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm3, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm3, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm3\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm4, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm4, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm4\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm5, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm5, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm5\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm6, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm6, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm6\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm7, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm7, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 8), "r"(buf + j + k + 16), "r"(buf + j + k + 24), "r"(buf + j + k + 32), "r"(buf + j + k + 40), "r"(buf + j + k + 48), "r"(buf + j + k + 56) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + for (int j = 0; j < 4096; j += 512) { + for (int k = 0; k < 64; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps 
%%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 64), "r"(buf + j + k + 128), "r"(buf + j + k + 192), "r"(buf + j + k + 256), "r"(buf + j + k + 320), "r"(buf + j + k + 384), "r"(buf + j + k + 448) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + for (int j = 0; j < 4096; j += 4096) { + for (int k = 0; k < 512; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 512), "r"(buf + j + k + 1024), "r"(buf + j + k + 1536), "r"(buf + j + k + 2048), "r"(buf + j + k + 2560), "r"(buf + j + k + 3072), "r"(buf + j + k + 3584) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 14) { + helper_float_14_recursive(buf + 0, 12); + helper_float_14_recursive(buf + 4096, 12); + helper_float_14_recursive(buf + 8192, 12); + helper_float_14_recursive(buf + 12288, 12); + for (int j = 0; j < 16384; j += 16384) { + for (int k = 0; k < 4096; k += 8) { + __asm__ volatile ( + 
"vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vmovups %%ymm0, (%0)\n" + "vmovups %%ymm1, (%1)\n" + "vmovups %%ymm2, (%2)\n" + "vmovups %%ymm3, (%3)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 4096), "r"(buf + j + k + 8192), "r"(buf + j + k + 12288) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } +} +void helper_float_14(float *buf); +void helper_float_14(float *buf) { + helper_float_14_recursive(buf, 14); +} +void helper_float_15_recursive(float *buf, int depth); +void helper_float_15_recursive(float *buf, int depth) { + if (depth == 13) { + for (int j = 0; j < 8192; j += 64) { + for (int k = 0; k < 8; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vpermilps $160, %%ymm0, %%ymm8\n" + "vpermilps $245, %%ymm0, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm0\n" + "vpermilps $160, %%ymm1, %%ymm8\n" + "vpermilps $245, %%ymm1, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm1\n" + "vpermilps $160, %%ymm2, %%ymm8\n" + "vpermilps $245, %%ymm2, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm2\n" + "vpermilps $160, %%ymm3, %%ymm8\n" + "vpermilps $245, %%ymm3, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm3\n" + "vpermilps $160, %%ymm4, %%ymm8\n" + "vpermilps $245, %%ymm4, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm4\n" + "vpermilps $160, %%ymm5, %%ymm8\n" + "vpermilps $245, %%ymm5, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm5\n" + "vpermilps $160, %%ymm6, %%ymm8\n" + "vpermilps $245, %%ymm6, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm6\n" + "vpermilps $160, %%ymm7, %%ymm8\n" + "vpermilps $245, %%ymm7, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm7\n" + "vpermilps $68, %%ymm0, %%ymm8\n" + "vpermilps $238, %%ymm0, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm0\n" + "vpermilps $68, %%ymm1, %%ymm8\n" + "vpermilps $238, %%ymm1, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm1\n" + "vpermilps $68, %%ymm2, %%ymm8\n" + "vpermilps $238, %%ymm2, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + 
"vaddps %%ymm8, %%ymm12, %%ymm2\n" + "vpermilps $68, %%ymm3, %%ymm8\n" + "vpermilps $238, %%ymm3, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm3\n" + "vpermilps $68, %%ymm4, %%ymm8\n" + "vpermilps $238, %%ymm4, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm4\n" + "vpermilps $68, %%ymm5, %%ymm8\n" + "vpermilps $238, %%ymm5, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm5\n" + "vpermilps $68, %%ymm6, %%ymm8\n" + "vpermilps $238, %%ymm6, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm6\n" + "vpermilps $68, %%ymm7, %%ymm8\n" + "vpermilps $238, %%ymm7, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm7\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm0, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm0, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm0\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm1, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm1, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm1\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm2, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm2, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm2\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm3, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm3, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm3\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm4, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm4, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm4\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm5, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm5, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm5\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm6, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm6, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm6\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm7, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm7, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + 
"vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 8), "r"(buf + j + k + 16), "r"(buf + j + k + 24), "r"(buf + j + k + 32), "r"(buf + j + k + 40), "r"(buf + j + k + 48), "r"(buf + j + k + 56) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + for (int j = 0; j < 8192; j += 512) { + for (int k = 0; k < 64; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 64), "r"(buf + j + k + 128), "r"(buf + j + k + 192), "r"(buf + j + k + 256), "r"(buf + j + k + 320), "r"(buf + j + k + 384), "r"(buf + j + k + 448) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + for (int j = 0; j < 8192; j += 4096) { + for (int k = 0; k < 512; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, 
%%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 512), "r"(buf + j + k + 1024), "r"(buf + j + k + 1536), "r"(buf + j + k + 2048), "r"(buf + j + k + 2560), "r"(buf + j + k + 3072), "r"(buf + j + k + 3584) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + for (int j = 0; j < 8192; j += 8192) { + for (int k = 0; k < 4096; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 4096) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 15) { + helper_float_15_recursive(buf + 0, 13); + helper_float_15_recursive(buf + 8192, 13); + helper_float_15_recursive(buf + 16384, 13); + helper_float_15_recursive(buf + 24576, 13); + for (int j = 0; j < 32768; j += 32768) { + for (int k = 0; k < 8192; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vmovups %%ymm0, (%0)\n" + "vmovups %%ymm1, (%1)\n" + "vmovups %%ymm2, (%2)\n" + "vmovups %%ymm3, (%3)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 8192), "r"(buf + j + k + 16384), "r"(buf + j + k + 24576) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } +} +void helper_float_15(float *buf); +void helper_float_15(float *buf) { + helper_float_15_recursive(buf, 15); +} +void helper_float_16_recursive(float *buf, int depth); +void helper_float_16_recursive(float *buf, int depth) { + if (depth == 13) { + for (int j = 0; j < 8192; j += 64) { + for (int k = 0; k < 8; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vpermilps $160, %%ymm0, %%ymm8\n" + "vpermilps $245, %%ymm0, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm0\n" + "vpermilps $160, %%ymm1, %%ymm8\n" + "vpermilps $245, %%ymm1, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm1\n" + "vpermilps $160, %%ymm2, %%ymm8\n" + "vpermilps $245, %%ymm2, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm2\n" + "vpermilps $160, %%ymm3, %%ymm8\n" 
+ "vpermilps $245, %%ymm3, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm3\n" + "vpermilps $160, %%ymm4, %%ymm8\n" + "vpermilps $245, %%ymm4, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm4\n" + "vpermilps $160, %%ymm5, %%ymm8\n" + "vpermilps $245, %%ymm5, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm5\n" + "vpermilps $160, %%ymm6, %%ymm8\n" + "vpermilps $245, %%ymm6, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm6\n" + "vpermilps $160, %%ymm7, %%ymm8\n" + "vpermilps $245, %%ymm7, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm7\n" + "vpermilps $68, %%ymm0, %%ymm8\n" + "vpermilps $238, %%ymm0, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm0\n" + "vpermilps $68, %%ymm1, %%ymm8\n" + "vpermilps $238, %%ymm1, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm1\n" + "vpermilps $68, %%ymm2, %%ymm8\n" + "vpermilps $238, %%ymm2, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm2\n" + "vpermilps $68, %%ymm3, %%ymm8\n" + "vpermilps $238, %%ymm3, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm3\n" + "vpermilps $68, %%ymm4, %%ymm8\n" + "vpermilps $238, %%ymm4, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm4\n" + "vpermilps $68, %%ymm5, %%ymm8\n" + "vpermilps $238, %%ymm5, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm5\n" + "vpermilps $68, %%ymm6, %%ymm8\n" + "vpermilps $238, %%ymm6, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm6\n" + "vpermilps $68, %%ymm7, %%ymm8\n" + "vpermilps $238, %%ymm7, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm7\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm0, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm0, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm0\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm1, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm1, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm1\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm2, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm2, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm2\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm3, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm3, %%ymm11\n" + 
"vaddps %%ymm10, %%ymm11, %%ymm3\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm4, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm4, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm4\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm5, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm5, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm5\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm6, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm6, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm6\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm7, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm7, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 8), "r"(buf + j + k + 16), "r"(buf + j + k + 24), "r"(buf + j + k + 32), "r"(buf + j + k + 40), "r"(buf + j + k + 48), "r"(buf + j + k + 56) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + for (int j = 0; j < 8192; j += 512) { + for (int k = 0; k < 64; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups 
%%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 64), "r"(buf + j + k + 128), "r"(buf + j + k + 192), "r"(buf + j + k + 256), "r"(buf + j + k + 320), "r"(buf + j + k + 384), "r"(buf + j + k + 448) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + for (int j = 0; j < 8192; j += 4096) { + for (int k = 0; k < 512; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 512), "r"(buf + j + k + 1024), "r"(buf + j + k + 1536), "r"(buf + j + k + 2048), "r"(buf + j + k + 2560), "r"(buf + j + k + 3072), "r"(buf + j + k + 3584) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + for (int j = 0; j < 8192; j += 8192) { + for (int k = 0; k < 4096; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 4096) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 16) { + helper_float_16_recursive(buf + 0, 13); + helper_float_16_recursive(buf + 8192, 13); + helper_float_16_recursive(buf + 16384, 13); + helper_float_16_recursive(buf + 24576, 13); + helper_float_16_recursive(buf + 32768, 13); + helper_float_16_recursive(buf + 40960, 13); + helper_float_16_recursive(buf + 49152, 13); + helper_float_16_recursive(buf + 57344, 13); + for (int j = 0; j < 65536; j += 65536) { + for (int k = 0; k < 8192; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups 
(%7), %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 8192), "r"(buf + j + k + 16384), "r"(buf + j + k + 24576), "r"(buf + j + k + 32768), "r"(buf + j + k + 40960), "r"(buf + j + k + 49152), "r"(buf + j + k + 57344) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } +} +void helper_float_16(float *buf); +void helper_float_16(float *buf) { + helper_float_16_recursive(buf, 16); +} +void helper_float_17_recursive(float *buf, int depth); +void helper_float_17_recursive(float *buf, int depth) { + if (depth == 12) { + for (int j = 0; j < 4096; j += 64) { + for (int k = 0; k < 8; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vpermilps $160, %%ymm0, %%ymm8\n" + "vpermilps $245, %%ymm0, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm0\n" + "vpermilps $160, %%ymm1, %%ymm8\n" + "vpermilps $245, %%ymm1, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm1\n" + "vpermilps $160, %%ymm2, %%ymm8\n" + "vpermilps $245, %%ymm2, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm2\n" + "vpermilps $160, %%ymm3, %%ymm8\n" + "vpermilps $245, %%ymm3, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm3\n" + "vpermilps $160, %%ymm4, %%ymm8\n" + "vpermilps $245, %%ymm4, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm4\n" + "vpermilps $160, %%ymm5, %%ymm8\n" + "vpermilps $245, %%ymm5, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm5\n" + "vpermilps $160, %%ymm6, %%ymm8\n" + "vpermilps $245, %%ymm6, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm6\n" + "vpermilps $160, %%ymm7, %%ymm8\n" + "vpermilps $245, %%ymm7, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, 
%%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm7\n" + "vpermilps $68, %%ymm0, %%ymm8\n" + "vpermilps $238, %%ymm0, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm0\n" + "vpermilps $68, %%ymm1, %%ymm8\n" + "vpermilps $238, %%ymm1, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm1\n" + "vpermilps $68, %%ymm2, %%ymm8\n" + "vpermilps $238, %%ymm2, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm2\n" + "vpermilps $68, %%ymm3, %%ymm8\n" + "vpermilps $238, %%ymm3, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm3\n" + "vpermilps $68, %%ymm4, %%ymm8\n" + "vpermilps $238, %%ymm4, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm4\n" + "vpermilps $68, %%ymm5, %%ymm8\n" + "vpermilps $238, %%ymm5, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm5\n" + "vpermilps $68, %%ymm6, %%ymm8\n" + "vpermilps $238, %%ymm6, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm6\n" + "vpermilps $68, %%ymm7, %%ymm8\n" + "vpermilps $238, %%ymm7, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm7\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm0, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm0, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm0\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm1, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm1, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm1\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm2, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm2, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm2\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm3, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm3, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm3\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm4, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm4, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm4\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm5, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm5, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm5\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm6, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm6, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm6\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm7, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm7, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm7\n" + "vaddps %%ymm1, 
%%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 8), "r"(buf + j + k + 16), "r"(buf + j + k + 24), "r"(buf + j + k + 32), "r"(buf + j + k + 40), "r"(buf + j + k + 48), "r"(buf + j + k + 56) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + for (int j = 0; j < 4096; j += 512) { + for (int k = 0; k < 64; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 64), "r"(buf + j + k + 128), "r"(buf + j + k + 192), "r"(buf + j + k + 256), "r"(buf + j + k + 320), "r"(buf + j + k + 384), "r"(buf + j + k + 448) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + for (int j = 0; j < 4096; j += 4096) { + for (int k = 0; k < 512; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), 
%%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 512), "r"(buf + j + k + 1024), "r"(buf + j + k + 1536), "r"(buf + j + k + 2048), "r"(buf + j + k + 2560), "r"(buf + j + k + 3072), "r"(buf + j + k + 3584) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 15) { + helper_float_17_recursive(buf + 0, 12); + helper_float_17_recursive(buf + 4096, 12); + helper_float_17_recursive(buf + 8192, 12); + helper_float_17_recursive(buf + 12288, 12); + helper_float_17_recursive(buf + 16384, 12); + helper_float_17_recursive(buf + 20480, 12); + helper_float_17_recursive(buf + 24576, 12); + helper_float_17_recursive(buf + 28672, 12); + for (int j = 0; j < 32768; j += 32768) { + for (int k = 0; k < 4096; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 4096), "r"(buf + j + k + 8192), "r"(buf + j + k + 12288), "r"(buf + j + k + 16384), "r"(buf + j + k + 20480), "r"(buf + j + k + 24576), "r"(buf + j + k + 28672) : "%ymm0", "%ymm1", "%ymm2", 
"%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 17) { + helper_float_17_recursive(buf + 0, 15); + helper_float_17_recursive(buf + 32768, 15); + helper_float_17_recursive(buf + 65536, 15); + helper_float_17_recursive(buf + 98304, 15); + for (int j = 0; j < 131072; j += 131072) { + for (int k = 0; k < 32768; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vmovups %%ymm0, (%0)\n" + "vmovups %%ymm1, (%1)\n" + "vmovups %%ymm2, (%2)\n" + "vmovups %%ymm3, (%3)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 32768), "r"(buf + j + k + 65536), "r"(buf + j + k + 98304) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } +} +void helper_float_17(float *buf); +void helper_float_17(float *buf) { + helper_float_17_recursive(buf, 17); +} +void helper_float_18_recursive(float *buf, int depth); +void helper_float_18_recursive(float *buf, int depth) { + if (depth == 12) { + for (int j = 0; j < 4096; j += 64) { + for (int k = 0; k < 8; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vpermilps $160, %%ymm0, %%ymm8\n" + "vpermilps $245, %%ymm0, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm0\n" + "vpermilps $160, %%ymm1, %%ymm8\n" + "vpermilps $245, %%ymm1, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm1\n" + "vpermilps $160, %%ymm2, %%ymm8\n" + "vpermilps $245, %%ymm2, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm2\n" + "vpermilps $160, %%ymm3, %%ymm8\n" + "vpermilps $245, %%ymm3, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm3\n" + "vpermilps $160, %%ymm4, %%ymm8\n" + "vpermilps $245, %%ymm4, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm4\n" + "vpermilps $160, %%ymm5, %%ymm8\n" + "vpermilps $245, %%ymm5, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm5\n" + "vpermilps $160, %%ymm6, %%ymm8\n" + "vpermilps $245, %%ymm6, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm6\n" + "vpermilps $160, %%ymm7, %%ymm8\n" + "vpermilps $245, %%ymm7, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm7\n" + "vpermilps $68, %%ymm0, %%ymm8\n" + "vpermilps $238, %%ymm0, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + 
"vaddps %%ymm8, %%ymm12, %%ymm0\n" + "vpermilps $68, %%ymm1, %%ymm8\n" + "vpermilps $238, %%ymm1, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm1\n" + "vpermilps $68, %%ymm2, %%ymm8\n" + "vpermilps $238, %%ymm2, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm2\n" + "vpermilps $68, %%ymm3, %%ymm8\n" + "vpermilps $238, %%ymm3, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm3\n" + "vpermilps $68, %%ymm4, %%ymm8\n" + "vpermilps $238, %%ymm4, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm4\n" + "vpermilps $68, %%ymm5, %%ymm8\n" + "vpermilps $238, %%ymm5, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm5\n" + "vpermilps $68, %%ymm6, %%ymm8\n" + "vpermilps $238, %%ymm6, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm6\n" + "vpermilps $68, %%ymm7, %%ymm8\n" + "vpermilps $238, %%ymm7, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm7\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm0, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm0, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm0\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm1, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm1, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm1\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm2, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm2, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm2\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm3, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm3, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm3\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm4, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm4, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm4\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm5, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm5, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm5\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm6, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm6, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm6\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm7, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm7, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, 
%%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 8), "r"(buf + j + k + 16), "r"(buf + j + k + 24), "r"(buf + j + k + 32), "r"(buf + j + k + 40), "r"(buf + j + k + 48), "r"(buf + j + k + 56) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + for (int j = 0; j < 4096; j += 512) { + for (int k = 0; k < 64; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 64), "r"(buf + j + k + 128), "r"(buf + j + k + 192), "r"(buf + j + k + 256), "r"(buf + j + k + 320), "r"(buf + j + k + 384), "r"(buf + j + k + 448) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + for (int j = 0; j < 4096; j += 4096) { + for (int k = 0; k < 512; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, 
%%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 512), "r"(buf + j + k + 1024), "r"(buf + j + k + 1536), "r"(buf + j + k + 2048), "r"(buf + j + k + 2560), "r"(buf + j + k + 3072), "r"(buf + j + k + 3584) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 15) { + helper_float_18_recursive(buf + 0, 12); + helper_float_18_recursive(buf + 4096, 12); + helper_float_18_recursive(buf + 8192, 12); + helper_float_18_recursive(buf + 12288, 12); + helper_float_18_recursive(buf + 16384, 12); + helper_float_18_recursive(buf + 20480, 12); + helper_float_18_recursive(buf + 24576, 12); + helper_float_18_recursive(buf + 28672, 12); + for (int j = 0; j < 32768; j += 32768) { + for (int k = 0; k < 4096; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 4096), "r"(buf + j + k + 8192), "r"(buf + j + k + 12288), "r"(buf + j + k + 16384), "r"(buf + j + k + 20480), "r"(buf + j + k + 24576), "r"(buf + j + k + 28672) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 18) { + helper_float_18_recursive(buf + 0, 15); + helper_float_18_recursive(buf + 
32768, 15); + helper_float_18_recursive(buf + 65536, 15); + helper_float_18_recursive(buf + 98304, 15); + helper_float_18_recursive(buf + 131072, 15); + helper_float_18_recursive(buf + 163840, 15); + helper_float_18_recursive(buf + 196608, 15); + helper_float_18_recursive(buf + 229376, 15); + for (int j = 0; j < 262144; j += 262144) { + for (int k = 0; k < 32768; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 32768), "r"(buf + j + k + 65536), "r"(buf + j + k + 98304), "r"(buf + j + k + 131072), "r"(buf + j + k + 163840), "r"(buf + j + k + 196608), "r"(buf + j + k + 229376) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } +} +void helper_float_18(float *buf); +void helper_float_18(float *buf) { + helper_float_18_recursive(buf, 18); +} +void helper_float_19_recursive(float *buf, int depth); +void helper_float_19_recursive(float *buf, int depth) { + if (depth == 13) { + for (int j = 0; j < 8192; j += 64) { + for (int k = 0; k < 8; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vpermilps $160, %%ymm0, %%ymm8\n" + "vpermilps $245, %%ymm0, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm0\n" + "vpermilps $160, %%ymm1, %%ymm8\n" + "vpermilps $245, %%ymm1, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm1\n" + "vpermilps $160, %%ymm2, %%ymm8\n" + "vpermilps $245, %%ymm2, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm2\n" + "vpermilps $160, %%ymm3, %%ymm8\n" + "vpermilps $245, %%ymm3, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm3\n" + "vpermilps $160, %%ymm4, %%ymm8\n" + "vpermilps $245, %%ymm4, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, 
%%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm4\n" + "vpermilps $160, %%ymm5, %%ymm8\n" + "vpermilps $245, %%ymm5, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm5\n" + "vpermilps $160, %%ymm6, %%ymm8\n" + "vpermilps $245, %%ymm6, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm6\n" + "vpermilps $160, %%ymm7, %%ymm8\n" + "vpermilps $245, %%ymm7, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm7\n" + "vpermilps $68, %%ymm0, %%ymm8\n" + "vpermilps $238, %%ymm0, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm0\n" + "vpermilps $68, %%ymm1, %%ymm8\n" + "vpermilps $238, %%ymm1, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm1\n" + "vpermilps $68, %%ymm2, %%ymm8\n" + "vpermilps $238, %%ymm2, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm2\n" + "vpermilps $68, %%ymm3, %%ymm8\n" + "vpermilps $238, %%ymm3, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm3\n" + "vpermilps $68, %%ymm4, %%ymm8\n" + "vpermilps $238, %%ymm4, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm4\n" + "vpermilps $68, %%ymm5, %%ymm8\n" + "vpermilps $238, %%ymm5, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm5\n" + "vpermilps $68, %%ymm6, %%ymm8\n" + "vpermilps $238, %%ymm6, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm6\n" + "vpermilps $68, %%ymm7, %%ymm8\n" + "vpermilps $238, %%ymm7, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm7\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm0, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm0, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm0\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm1, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm1, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm1\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm2, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm2, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm2\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm3, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm3, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm3\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm4, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm4, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm4\n" + "vxorps %%ymm8, 
%%ymm8, %%ymm8\n" + "vsubps %%ymm5, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm5, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm5\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm6, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm6, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm6\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm7, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm7, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 8), "r"(buf + j + k + 16), "r"(buf + j + k + 24), "r"(buf + j + k + 32), "r"(buf + j + k + 40), "r"(buf + j + k + 48), "r"(buf + j + k + 56) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + for (int j = 0; j < 8192; j += 512) { + for (int k = 0; k < 64; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 64), "r"(buf + j + k + 128), 
"r"(buf + j + k + 192), "r"(buf + j + k + 256), "r"(buf + j + k + 320), "r"(buf + j + k + 384), "r"(buf + j + k + 448) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + for (int j = 0; j < 8192; j += 4096) { + for (int k = 0; k < 512; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 512), "r"(buf + j + k + 1024), "r"(buf + j + k + 1536), "r"(buf + j + k + 2048), "r"(buf + j + k + 2560), "r"(buf + j + k + 3072), "r"(buf + j + k + 3584) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + for (int j = 0; j < 8192; j += 8192) { + for (int k = 0; k < 4096; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 4096) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 16) { + helper_float_19_recursive(buf + 0, 13); + helper_float_19_recursive(buf + 8192, 13); + helper_float_19_recursive(buf + 16384, 13); + helper_float_19_recursive(buf + 24576, 13); + helper_float_19_recursive(buf + 32768, 13); + helper_float_19_recursive(buf + 40960, 13); + helper_float_19_recursive(buf + 49152, 13); + helper_float_19_recursive(buf + 57344, 13); + for (int j = 0; j < 65536; j += 65536) { + for (int k = 0; k < 8192; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, 
%%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 8192), "r"(buf + j + k + 16384), "r"(buf + j + k + 24576), "r"(buf + j + k + 32768), "r"(buf + j + k + 40960), "r"(buf + j + k + 49152), "r"(buf + j + k + 57344) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 19) { + helper_float_19_recursive(buf + 0, 16); + helper_float_19_recursive(buf + 65536, 16); + helper_float_19_recursive(buf + 131072, 16); + helper_float_19_recursive(buf + 196608, 16); + helper_float_19_recursive(buf + 262144, 16); + helper_float_19_recursive(buf + 327680, 16); + helper_float_19_recursive(buf + 393216, 16); + helper_float_19_recursive(buf + 458752, 16); + for (int j = 0; j < 524288; j += 524288) { + for (int k = 0; k < 65536; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 65536), "r"(buf + j + k + 131072), "r"(buf + j + k + 196608), "r"(buf + j + k + 262144), "r"(buf + j + k + 327680), "r"(buf + j + k + 393216), "r"(buf + j + k + 458752) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } +} +void helper_float_19(float *buf); +void 
helper_float_19(float *buf) { + helper_float_19_recursive(buf, 19); +} +void helper_float_20_recursive(float *buf, int depth); +void helper_float_20_recursive(float *buf, int depth) { + if (depth == 12) { + for (int j = 0; j < 4096; j += 64) { + for (int k = 0; k < 8; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vpermilps $160, %%ymm0, %%ymm8\n" + "vpermilps $245, %%ymm0, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm0\n" + "vpermilps $160, %%ymm1, %%ymm8\n" + "vpermilps $245, %%ymm1, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm1\n" + "vpermilps $160, %%ymm2, %%ymm8\n" + "vpermilps $245, %%ymm2, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm2\n" + "vpermilps $160, %%ymm3, %%ymm8\n" + "vpermilps $245, %%ymm3, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm3\n" + "vpermilps $160, %%ymm4, %%ymm8\n" + "vpermilps $245, %%ymm4, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm4\n" + "vpermilps $160, %%ymm5, %%ymm8\n" + "vpermilps $245, %%ymm5, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm5\n" + "vpermilps $160, %%ymm6, %%ymm8\n" + "vpermilps $245, %%ymm6, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm6\n" + "vpermilps $160, %%ymm7, %%ymm8\n" + "vpermilps $245, %%ymm7, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm7\n" + "vpermilps $68, %%ymm0, %%ymm8\n" + "vpermilps $238, %%ymm0, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm0\n" + "vpermilps $68, %%ymm1, %%ymm8\n" + "vpermilps $238, %%ymm1, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm1\n" + "vpermilps $68, %%ymm2, %%ymm8\n" + "vpermilps $238, %%ymm2, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm2\n" + "vpermilps $68, %%ymm3, %%ymm8\n" + "vpermilps $238, %%ymm3, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm3\n" + "vpermilps $68, %%ymm4, %%ymm8\n" + "vpermilps $238, %%ymm4, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm4\n" + "vpermilps $68, %%ymm5, %%ymm8\n" + "vpermilps $238, %%ymm5, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm5\n" + "vpermilps $68, %%ymm6, %%ymm8\n" + "vpermilps $238, %%ymm6, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" 
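+      // [Editorial note: this comment is not part of the original generated output;
+      // it is inferred from the immediate constants, not stated by the author.]
+      // The surrounding vpermilps $68 / vpermilps $238, negate, vblendps $204,
+      // vaddps sequence performs the stride-2 butterfly within each 128-bit lane,
+      // mapping [a0,a1,a2,a3] to [a0+a2, a1+a3, a0-a2, a1-a3] - one stage of what
+      // appears to be the in-register (fast Walsh-)Hadamard transform these
+      // helper_float_N routines compute over 2^N floats in place.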
+ "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm6\n" + "vpermilps $68, %%ymm7, %%ymm8\n" + "vpermilps $238, %%ymm7, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm7\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm0, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm0, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm0\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm1, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm1, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm1\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm2, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm2, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm2\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm3, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm3, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm3\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm4, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm4, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm4\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm5, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm5, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm5\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm6, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm6, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm6\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm7, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm7, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 8), "r"(buf + j + k + 16), "r"(buf + j + k + 24), "r"(buf + j + k + 32), "r"(buf + j + k + 40), "r"(buf + j + k + 48), "r"(buf + j + k + 56) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + for (int j = 0; j < 4096; j += 512) { + for (int k = 0; k < 64; k += 8) { + __asm__ volatile ( + "vmovups (%0), 
%%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 64), "r"(buf + j + k + 128), "r"(buf + j + k + 192), "r"(buf + j + k + 256), "r"(buf + j + k + 320), "r"(buf + j + k + 384), "r"(buf + j + k + 448) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + for (int j = 0; j < 4096; j += 4096) { + for (int k = 0; k < 512; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 512), "r"(buf + j + k + 1024), "r"(buf + j + k + 1536), "r"(buf + j + k + 2048), "r"(buf + j + k + 2560), "r"(buf + j + k + 3072), "r"(buf + j + k + 3584) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 15) { + helper_float_20_recursive(buf + 0, 12); + 
helper_float_20_recursive(buf + 4096, 12); + helper_float_20_recursive(buf + 8192, 12); + helper_float_20_recursive(buf + 12288, 12); + helper_float_20_recursive(buf + 16384, 12); + helper_float_20_recursive(buf + 20480, 12); + helper_float_20_recursive(buf + 24576, 12); + helper_float_20_recursive(buf + 28672, 12); + for (int j = 0; j < 32768; j += 32768) { + for (int k = 0; k < 4096; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 4096), "r"(buf + j + k + 8192), "r"(buf + j + k + 12288), "r"(buf + j + k + 16384), "r"(buf + j + k + 20480), "r"(buf + j + k + 24576), "r"(buf + j + k + 28672) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 18) { + helper_float_20_recursive(buf + 0, 15); + helper_float_20_recursive(buf + 32768, 15); + helper_float_20_recursive(buf + 65536, 15); + helper_float_20_recursive(buf + 98304, 15); + helper_float_20_recursive(buf + 131072, 15); + helper_float_20_recursive(buf + 163840, 15); + helper_float_20_recursive(buf + 196608, 15); + helper_float_20_recursive(buf + 229376, 15); + for (int j = 0; j < 262144; j += 262144) { + for (int k = 0; k < 32768; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + 
"vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 32768), "r"(buf + j + k + 65536), "r"(buf + j + k + 98304), "r"(buf + j + k + 131072), "r"(buf + j + k + 163840), "r"(buf + j + k + 196608), "r"(buf + j + k + 229376) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 20) { + helper_float_20_recursive(buf + 0, 18); + helper_float_20_recursive(buf + 262144, 18); + helper_float_20_recursive(buf + 524288, 18); + helper_float_20_recursive(buf + 786432, 18); + for (int j = 0; j < 1048576; j += 1048576) { + for (int k = 0; k < 262144; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vmovups %%ymm0, (%0)\n" + "vmovups %%ymm1, (%1)\n" + "vmovups %%ymm2, (%2)\n" + "vmovups %%ymm3, (%3)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 262144), "r"(buf + j + k + 524288), "r"(buf + j + k + 786432) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } +} +void helper_float_20(float *buf); +void helper_float_20(float *buf) { + helper_float_20_recursive(buf, 20); +} +void helper_float_21_recursive(float *buf, int depth); +void helper_float_21_recursive(float *buf, int depth) { + if (depth == 9) { + for (int j = 0; j < 512; j += 64) { + for (int k = 0; k < 8; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vpermilps $160, %%ymm0, %%ymm8\n" + "vpermilps $245, %%ymm0, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm0\n" + "vpermilps $160, %%ymm1, %%ymm8\n" + "vpermilps $245, %%ymm1, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm1\n" + "vpermilps $160, %%ymm2, %%ymm8\n" + "vpermilps $245, %%ymm2, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm2\n" + "vpermilps $160, %%ymm3, %%ymm8\n" + "vpermilps $245, %%ymm3, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm3\n" + "vpermilps $160, %%ymm4, %%ymm8\n" + "vpermilps $245, %%ymm4, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm4\n" + "vpermilps $160, %%ymm5, %%ymm8\n" + "vpermilps $245, %%ymm5, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + 
"vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm5\n" + "vpermilps $160, %%ymm6, %%ymm8\n" + "vpermilps $245, %%ymm6, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm6\n" + "vpermilps $160, %%ymm7, %%ymm8\n" + "vpermilps $245, %%ymm7, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm7\n" + "vpermilps $68, %%ymm0, %%ymm8\n" + "vpermilps $238, %%ymm0, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm0\n" + "vpermilps $68, %%ymm1, %%ymm8\n" + "vpermilps $238, %%ymm1, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm1\n" + "vpermilps $68, %%ymm2, %%ymm8\n" + "vpermilps $238, %%ymm2, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm2\n" + "vpermilps $68, %%ymm3, %%ymm8\n" + "vpermilps $238, %%ymm3, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm3\n" + "vpermilps $68, %%ymm4, %%ymm8\n" + "vpermilps $238, %%ymm4, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm4\n" + "vpermilps $68, %%ymm5, %%ymm8\n" + "vpermilps $238, %%ymm5, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm5\n" + "vpermilps $68, %%ymm6, %%ymm8\n" + "vpermilps $238, %%ymm6, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm6\n" + "vpermilps $68, %%ymm7, %%ymm8\n" + "vpermilps $238, %%ymm7, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm7\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm0, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm0, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm0\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm1, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm1, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm1\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm2, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm2, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm2\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm3, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm3, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm3\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm4, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm4, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm4\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm5, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm5, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm5\n" + "vxorps %%ymm8, %%ymm8, 
%%ymm8\n" + "vsubps %%ymm6, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm6, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm6\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm7, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm7, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 8), "r"(buf + j + k + 16), "r"(buf + j + k + 24), "r"(buf + j + k + 32), "r"(buf + j + k + 40), "r"(buf + j + k + 48), "r"(buf + j + k + 56) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + for (int j = 0; j < 512; j += 512) { + for (int k = 0; k < 64; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 64), "r"(buf + j + k + 128), "r"(buf + j + k + 192), "r"(buf + j + k + 256), "r"(buf + j + k + 320), "r"(buf + j + k + 384), "r"(buf + j + k + 448) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", 
"%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 12) { + helper_float_21_recursive(buf + 0, 9); + helper_float_21_recursive(buf + 512, 9); + helper_float_21_recursive(buf + 1024, 9); + helper_float_21_recursive(buf + 1536, 9); + helper_float_21_recursive(buf + 2048, 9); + helper_float_21_recursive(buf + 2560, 9); + helper_float_21_recursive(buf + 3072, 9); + helper_float_21_recursive(buf + 3584, 9); + for (int j = 0; j < 4096; j += 4096) { + for (int k = 0; k < 512; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 512), "r"(buf + j + k + 1024), "r"(buf + j + k + 1536), "r"(buf + j + k + 2048), "r"(buf + j + k + 2560), "r"(buf + j + k + 3072), "r"(buf + j + k + 3584) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 15) { + helper_float_21_recursive(buf + 0, 12); + helper_float_21_recursive(buf + 4096, 12); + helper_float_21_recursive(buf + 8192, 12); + helper_float_21_recursive(buf + 12288, 12); + helper_float_21_recursive(buf + 16384, 12); + helper_float_21_recursive(buf + 20480, 12); + helper_float_21_recursive(buf + 24576, 12); + helper_float_21_recursive(buf + 28672, 12); + for (int j = 0; j < 32768; j += 32768) { + for (int k = 0; k < 4096; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, 
%%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 4096), "r"(buf + j + k + 8192), "r"(buf + j + k + 12288), "r"(buf + j + k + 16384), "r"(buf + j + k + 20480), "r"(buf + j + k + 24576), "r"(buf + j + k + 28672) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 18) { + helper_float_21_recursive(buf + 0, 15); + helper_float_21_recursive(buf + 32768, 15); + helper_float_21_recursive(buf + 65536, 15); + helper_float_21_recursive(buf + 98304, 15); + helper_float_21_recursive(buf + 131072, 15); + helper_float_21_recursive(buf + 163840, 15); + helper_float_21_recursive(buf + 196608, 15); + helper_float_21_recursive(buf + 229376, 15); + for (int j = 0; j < 262144; j += 262144) { + for (int k = 0; k < 32768; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 32768), "r"(buf + j + k + 65536), "r"(buf + j + k + 98304), "r"(buf + j + k + 131072), "r"(buf + j + k + 163840), "r"(buf + j + k + 196608), "r"(buf + j + k + 229376) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 21) { + helper_float_21_recursive(buf + 0, 18); + helper_float_21_recursive(buf + 262144, 18); + helper_float_21_recursive(buf + 524288, 18); + helper_float_21_recursive(buf + 786432, 18); + helper_float_21_recursive(buf + 1048576, 18); + helper_float_21_recursive(buf + 1310720, 18); + helper_float_21_recursive(buf + 1572864, 18); + helper_float_21_recursive(buf + 1835008, 18); + for 
(int j = 0; j < 2097152; j += 2097152) { + for (int k = 0; k < 262144; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 262144), "r"(buf + j + k + 524288), "r"(buf + j + k + 786432), "r"(buf + j + k + 1048576), "r"(buf + j + k + 1310720), "r"(buf + j + k + 1572864), "r"(buf + j + k + 1835008) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } +} +void helper_float_21(float *buf); +void helper_float_21(float *buf) { + helper_float_21_recursive(buf, 21); +} +void helper_float_22_recursive(float *buf, int depth); +void helper_float_22_recursive(float *buf, int depth) { + if (depth == 11) { + for (int j = 0; j < 2048; j += 64) { + for (int k = 0; k < 8; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vpermilps $160, %%ymm0, %%ymm8\n" + "vpermilps $245, %%ymm0, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm0\n" + "vpermilps $160, %%ymm1, %%ymm8\n" + "vpermilps $245, %%ymm1, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm1\n" + "vpermilps $160, %%ymm2, %%ymm8\n" + "vpermilps $245, %%ymm2, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm2\n" + "vpermilps $160, %%ymm3, %%ymm8\n" + "vpermilps $245, %%ymm3, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm3\n" + "vpermilps $160, %%ymm4, %%ymm8\n" + "vpermilps $245, %%ymm4, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm4\n" + "vpermilps $160, %%ymm5, %%ymm8\n" + "vpermilps $245, %%ymm5, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm5\n" + "vpermilps 
$160, %%ymm6, %%ymm8\n" + "vpermilps $245, %%ymm6, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm6\n" + "vpermilps $160, %%ymm7, %%ymm8\n" + "vpermilps $245, %%ymm7, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm7\n" + "vpermilps $68, %%ymm0, %%ymm8\n" + "vpermilps $238, %%ymm0, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm0\n" + "vpermilps $68, %%ymm1, %%ymm8\n" + "vpermilps $238, %%ymm1, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm1\n" + "vpermilps $68, %%ymm2, %%ymm8\n" + "vpermilps $238, %%ymm2, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm2\n" + "vpermilps $68, %%ymm3, %%ymm8\n" + "vpermilps $238, %%ymm3, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm3\n" + "vpermilps $68, %%ymm4, %%ymm8\n" + "vpermilps $238, %%ymm4, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm4\n" + "vpermilps $68, %%ymm5, %%ymm8\n" + "vpermilps $238, %%ymm5, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm5\n" + "vpermilps $68, %%ymm6, %%ymm8\n" + "vpermilps $238, %%ymm6, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm6\n" + "vpermilps $68, %%ymm7, %%ymm8\n" + "vpermilps $238, %%ymm7, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm7\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm0, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm0, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm0\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm1, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm1, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm1\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm2, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm2, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm2\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm3, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm3, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm3\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm4, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm4, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm4\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm5, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm5, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm5\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm6, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm6, %%ymm6, 
%%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm6, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm6\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm7, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm7, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 8), "r"(buf + j + k + 16), "r"(buf + j + k + 24), "r"(buf + j + k + 32), "r"(buf + j + k + 40), "r"(buf + j + k + 48), "r"(buf + j + k + 56) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + for (int j = 0; j < 2048; j += 512) { + for (int k = 0; k < 64; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 64), "r"(buf + j + k + 128), "r"(buf + j + k + 192), "r"(buf + j + k + 256), "r"(buf + j + k + 320), "r"(buf + j + k + 384), "r"(buf + j + k + 448) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + 
} + for (int j = 0; j < 2048; j += 2048) { + for (int k = 0; k < 512; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vmovups %%ymm0, (%0)\n" + "vmovups %%ymm1, (%1)\n" + "vmovups %%ymm2, (%2)\n" + "vmovups %%ymm3, (%3)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 512), "r"(buf + j + k + 1024), "r"(buf + j + k + 1536) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 14) { + helper_float_22_recursive(buf + 0, 11); + helper_float_22_recursive(buf + 2048, 11); + helper_float_22_recursive(buf + 4096, 11); + helper_float_22_recursive(buf + 6144, 11); + helper_float_22_recursive(buf + 8192, 11); + helper_float_22_recursive(buf + 10240, 11); + helper_float_22_recursive(buf + 12288, 11); + helper_float_22_recursive(buf + 14336, 11); + for (int j = 0; j < 16384; j += 16384) { + for (int k = 0; k < 2048; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 2048), "r"(buf + j + k + 4096), "r"(buf + j + k + 6144), "r"(buf + j + k + 8192), "r"(buf + j + k + 10240), "r"(buf + j + k + 12288), "r"(buf + j + k + 14336) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 17) { + helper_float_22_recursive(buf + 0, 14); + helper_float_22_recursive(buf + 16384, 14); + helper_float_22_recursive(buf + 32768, 14); + helper_float_22_recursive(buf + 49152, 14); + helper_float_22_recursive(buf + 65536, 14); + helper_float_22_recursive(buf + 81920, 14); + helper_float_22_recursive(buf + 98304, 14); + helper_float_22_recursive(buf + 114688, 14); + for (int j = 0; j < 131072; j += 131072) { + for (int k = 0; k < 
16384; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 16384), "r"(buf + j + k + 32768), "r"(buf + j + k + 49152), "r"(buf + j + k + 65536), "r"(buf + j + k + 81920), "r"(buf + j + k + 98304), "r"(buf + j + k + 114688) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 20) { + helper_float_22_recursive(buf + 0, 17); + helper_float_22_recursive(buf + 131072, 17); + helper_float_22_recursive(buf + 262144, 17); + helper_float_22_recursive(buf + 393216, 17); + helper_float_22_recursive(buf + 524288, 17); + helper_float_22_recursive(buf + 655360, 17); + helper_float_22_recursive(buf + 786432, 17); + helper_float_22_recursive(buf + 917504, 17); + for (int j = 0; j < 1048576; j += 1048576) { + for (int k = 0; k < 131072; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, 
(%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 131072), "r"(buf + j + k + 262144), "r"(buf + j + k + 393216), "r"(buf + j + k + 524288), "r"(buf + j + k + 655360), "r"(buf + j + k + 786432), "r"(buf + j + k + 917504) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 22) { + helper_float_22_recursive(buf + 0, 20); + helper_float_22_recursive(buf + 1048576, 20); + helper_float_22_recursive(buf + 2097152, 20); + helper_float_22_recursive(buf + 3145728, 20); + for (int j = 0; j < 4194304; j += 4194304) { + for (int k = 0; k < 1048576; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vmovups %%ymm0, (%0)\n" + "vmovups %%ymm1, (%1)\n" + "vmovups %%ymm2, (%2)\n" + "vmovups %%ymm3, (%3)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 1048576), "r"(buf + j + k + 2097152), "r"(buf + j + k + 3145728) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } +} +void helper_float_22(float *buf); +void helper_float_22(float *buf) { + helper_float_22_recursive(buf, 22); +} +void helper_float_23_recursive(float *buf, int depth); +void helper_float_23_recursive(float *buf, int depth) { + if (depth == 9) { + for (int j = 0; j < 512; j += 64) { + for (int k = 0; k < 8; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vpermilps $160, %%ymm0, %%ymm8\n" + "vpermilps $245, %%ymm0, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm0\n" + "vpermilps $160, %%ymm1, %%ymm8\n" + "vpermilps $245, %%ymm1, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm1\n" + "vpermilps $160, %%ymm2, %%ymm8\n" + "vpermilps $245, %%ymm2, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm2\n" + "vpermilps $160, %%ymm3, %%ymm8\n" + "vpermilps $245, %%ymm3, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm3\n" + "vpermilps $160, %%ymm4, %%ymm8\n" + "vpermilps $245, %%ymm4, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm4\n" + "vpermilps $160, %%ymm5, %%ymm8\n" + "vpermilps $245, %%ymm5, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm5\n" + "vpermilps $160, %%ymm6, %%ymm8\n" + "vpermilps $245, %%ymm6, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm6\n" + "vpermilps $160, %%ymm7, %%ymm8\n" + "vpermilps $245, %%ymm7, %%ymm9\n" + "vxorps %%ymm10, 
%%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm7\n" + "vpermilps $68, %%ymm0, %%ymm8\n" + "vpermilps $238, %%ymm0, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm0\n" + "vpermilps $68, %%ymm1, %%ymm8\n" + "vpermilps $238, %%ymm1, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm1\n" + "vpermilps $68, %%ymm2, %%ymm8\n" + "vpermilps $238, %%ymm2, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm2\n" + "vpermilps $68, %%ymm3, %%ymm8\n" + "vpermilps $238, %%ymm3, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm3\n" + "vpermilps $68, %%ymm4, %%ymm8\n" + "vpermilps $238, %%ymm4, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm4\n" + "vpermilps $68, %%ymm5, %%ymm8\n" + "vpermilps $238, %%ymm5, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm5\n" + "vpermilps $68, %%ymm6, %%ymm8\n" + "vpermilps $238, %%ymm6, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm6\n" + "vpermilps $68, %%ymm7, %%ymm8\n" + "vpermilps $238, %%ymm7, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm7\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm0, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm0, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm0\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm1, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm1, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm1\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm2, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm2, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm2\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm3, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm3, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm3\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm4, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm4, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm4\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm5, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm5, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm5\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm6, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm6, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm6\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm7, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm7, %%ymm11\n" + "vaddps %%ymm10, 
%%ymm11, %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 8), "r"(buf + j + k + 16), "r"(buf + j + k + 24), "r"(buf + j + k + 32), "r"(buf + j + k + 40), "r"(buf + j + k + 48), "r"(buf + j + k + 56) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + for (int j = 0; j < 512; j += 512) { + for (int k = 0; k < 64; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 64), "r"(buf + j + k + 128), "r"(buf + j + k + 192), "r"(buf + j + k + 256), "r"(buf + j + k + 320), "r"(buf + j + k + 384), "r"(buf + j + k + 448) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 12) { + helper_float_23_recursive(buf + 0, 9); + helper_float_23_recursive(buf + 512, 9); + helper_float_23_recursive(buf + 1024, 9); + helper_float_23_recursive(buf + 1536, 9); + helper_float_23_recursive(buf + 2048, 9); + 
helper_float_23_recursive(buf + 2560, 9); + helper_float_23_recursive(buf + 3072, 9); + helper_float_23_recursive(buf + 3584, 9); + for (int j = 0; j < 4096; j += 4096) { + for (int k = 0; k < 512; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 512), "r"(buf + j + k + 1024), "r"(buf + j + k + 1536), "r"(buf + j + k + 2048), "r"(buf + j + k + 2560), "r"(buf + j + k + 3072), "r"(buf + j + k + 3584) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 15) { + helper_float_23_recursive(buf + 0, 12); + helper_float_23_recursive(buf + 4096, 12); + helper_float_23_recursive(buf + 8192, 12); + helper_float_23_recursive(buf + 12288, 12); + helper_float_23_recursive(buf + 16384, 12); + helper_float_23_recursive(buf + 20480, 12); + helper_float_23_recursive(buf + 24576, 12); + helper_float_23_recursive(buf + 28672, 12); + for (int j = 0; j < 32768; j += 32768) { + for (int k = 0; k < 4096; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, 
(%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 4096), "r"(buf + j + k + 8192), "r"(buf + j + k + 12288), "r"(buf + j + k + 16384), "r"(buf + j + k + 20480), "r"(buf + j + k + 24576), "r"(buf + j + k + 28672) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 18) { + helper_float_23_recursive(buf + 0, 15); + helper_float_23_recursive(buf + 32768, 15); + helper_float_23_recursive(buf + 65536, 15); + helper_float_23_recursive(buf + 98304, 15); + helper_float_23_recursive(buf + 131072, 15); + helper_float_23_recursive(buf + 163840, 15); + helper_float_23_recursive(buf + 196608, 15); + helper_float_23_recursive(buf + 229376, 15); + for (int j = 0; j < 262144; j += 262144) { + for (int k = 0; k < 32768; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 32768), "r"(buf + j + k + 65536), "r"(buf + j + k + 98304), "r"(buf + j + k + 131072), "r"(buf + j + k + 163840), "r"(buf + j + k + 196608), "r"(buf + j + k + 229376) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 21) { + helper_float_23_recursive(buf + 0, 18); + helper_float_23_recursive(buf + 262144, 18); + helper_float_23_recursive(buf + 524288, 18); + helper_float_23_recursive(buf + 786432, 18); + helper_float_23_recursive(buf + 1048576, 18); + helper_float_23_recursive(buf + 1310720, 18); + helper_float_23_recursive(buf + 1572864, 18); + helper_float_23_recursive(buf + 1835008, 18); + for (int j = 0; j < 2097152; j += 2097152) { + for (int k = 0; k < 262144; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vaddps %%ymm1, 
%%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 262144), "r"(buf + j + k + 524288), "r"(buf + j + k + 786432), "r"(buf + j + k + 1048576), "r"(buf + j + k + 1310720), "r"(buf + j + k + 1572864), "r"(buf + j + k + 1835008) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 23) { + helper_float_23_recursive(buf + 0, 21); + helper_float_23_recursive(buf + 2097152, 21); + helper_float_23_recursive(buf + 4194304, 21); + helper_float_23_recursive(buf + 6291456, 21); + for (int j = 0; j < 8388608; j += 8388608) { + for (int k = 0; k < 2097152; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vmovups %%ymm0, (%0)\n" + "vmovups %%ymm1, (%1)\n" + "vmovups %%ymm2, (%2)\n" + "vmovups %%ymm3, (%3)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 2097152), "r"(buf + j + k + 4194304), "r"(buf + j + k + 6291456) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } +} +void helper_float_23(float *buf); +void helper_float_23(float *buf) { + helper_float_23_recursive(buf, 23); +} +void helper_float_24_recursive(float *buf, int depth); +void helper_float_24_recursive(float *buf, int depth) { + if (depth == 12) { + for (int j = 0; j < 4096; j += 64) { + for (int k = 0; k < 8; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vpermilps $160, %%ymm0, %%ymm8\n" + "vpermilps $245, %%ymm0, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm0\n" + "vpermilps $160, %%ymm1, %%ymm8\n" + "vpermilps $245, %%ymm1, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, 
%%ymm1\n" + "vpermilps $160, %%ymm2, %%ymm8\n" + "vpermilps $245, %%ymm2, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm2\n" + "vpermilps $160, %%ymm3, %%ymm8\n" + "vpermilps $245, %%ymm3, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm3\n" + "vpermilps $160, %%ymm4, %%ymm8\n" + "vpermilps $245, %%ymm4, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm4\n" + "vpermilps $160, %%ymm5, %%ymm8\n" + "vpermilps $245, %%ymm5, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm5\n" + "vpermilps $160, %%ymm6, %%ymm8\n" + "vpermilps $245, %%ymm6, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm6\n" + "vpermilps $160, %%ymm7, %%ymm8\n" + "vpermilps $245, %%ymm7, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm7\n" + "vpermilps $68, %%ymm0, %%ymm8\n" + "vpermilps $238, %%ymm0, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm0\n" + "vpermilps $68, %%ymm1, %%ymm8\n" + "vpermilps $238, %%ymm1, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm1\n" + "vpermilps $68, %%ymm2, %%ymm8\n" + "vpermilps $238, %%ymm2, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm2\n" + "vpermilps $68, %%ymm3, %%ymm8\n" + "vpermilps $238, %%ymm3, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm3\n" + "vpermilps $68, %%ymm4, %%ymm8\n" + "vpermilps $238, %%ymm4, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm4\n" + "vpermilps $68, %%ymm5, %%ymm8\n" + "vpermilps $238, %%ymm5, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm5\n" + "vpermilps $68, %%ymm6, %%ymm8\n" + "vpermilps $238, %%ymm6, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm6\n" + "vpermilps $68, %%ymm7, %%ymm8\n" + "vpermilps $238, %%ymm7, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm7\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm0, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm0, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm0\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm1, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm1, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm1\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm2, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm10\n" + "vperm2f128 
$49, %%ymm9, %%ymm2, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm2\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm3, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm3, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm3\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm4, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm4, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm4\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm5, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm5, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm5\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm6, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm6, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm6\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm7, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm7, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 8), "r"(buf + j + k + 16), "r"(buf + j + k + 24), "r"(buf + j + k + 32), "r"(buf + j + k + 40), "r"(buf + j + k + 48), "r"(buf + j + k + 56) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + for (int j = 0; j < 4096; j += 512) { + for (int k = 0; k < 64; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, 
%%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 64), "r"(buf + j + k + 128), "r"(buf + j + k + 192), "r"(buf + j + k + 256), "r"(buf + j + k + 320), "r"(buf + j + k + 384), "r"(buf + j + k + 448) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + for (int j = 0; j < 4096; j += 4096) { + for (int k = 0; k < 512; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 512), "r"(buf + j + k + 1024), "r"(buf + j + k + 1536), "r"(buf + j + k + 2048), "r"(buf + j + k + 2560), "r"(buf + j + k + 3072), "r"(buf + j + k + 3584) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 15) { + helper_float_24_recursive(buf + 0, 12); + helper_float_24_recursive(buf + 4096, 12); + helper_float_24_recursive(buf + 8192, 12); + helper_float_24_recursive(buf + 12288, 12); + helper_float_24_recursive(buf + 16384, 12); + helper_float_24_recursive(buf + 20480, 12); + helper_float_24_recursive(buf + 24576, 12); + helper_float_24_recursive(buf + 28672, 12); + for (int j = 0; j < 32768; j += 32768) { + for (int k = 0; k < 4096; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" 
+ "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 4096), "r"(buf + j + k + 8192), "r"(buf + j + k + 12288), "r"(buf + j + k + 16384), "r"(buf + j + k + 20480), "r"(buf + j + k + 24576), "r"(buf + j + k + 28672) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 18) { + helper_float_24_recursive(buf + 0, 15); + helper_float_24_recursive(buf + 32768, 15); + helper_float_24_recursive(buf + 65536, 15); + helper_float_24_recursive(buf + 98304, 15); + helper_float_24_recursive(buf + 131072, 15); + helper_float_24_recursive(buf + 163840, 15); + helper_float_24_recursive(buf + 196608, 15); + helper_float_24_recursive(buf + 229376, 15); + for (int j = 0; j < 262144; j += 262144) { + for (int k = 0; k < 32768; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 32768), "r"(buf + j + k + 65536), "r"(buf + j + k + 98304), "r"(buf + j + k + 131072), "r"(buf + j + k + 163840), "r"(buf + j + k + 196608), "r"(buf + j + k + 229376) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 21) { + helper_float_24_recursive(buf + 0, 18); + 
helper_float_24_recursive(buf + 262144, 18); + helper_float_24_recursive(buf + 524288, 18); + helper_float_24_recursive(buf + 786432, 18); + helper_float_24_recursive(buf + 1048576, 18); + helper_float_24_recursive(buf + 1310720, 18); + helper_float_24_recursive(buf + 1572864, 18); + helper_float_24_recursive(buf + 1835008, 18); + for (int j = 0; j < 2097152; j += 2097152) { + for (int k = 0; k < 262144; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 262144), "r"(buf + j + k + 524288), "r"(buf + j + k + 786432), "r"(buf + j + k + 1048576), "r"(buf + j + k + 1310720), "r"(buf + j + k + 1572864), "r"(buf + j + k + 1835008) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 24) { + helper_float_24_recursive(buf + 0, 21); + helper_float_24_recursive(buf + 2097152, 21); + helper_float_24_recursive(buf + 4194304, 21); + helper_float_24_recursive(buf + 6291456, 21); + helper_float_24_recursive(buf + 8388608, 21); + helper_float_24_recursive(buf + 10485760, 21); + helper_float_24_recursive(buf + 12582912, 21); + helper_float_24_recursive(buf + 14680064, 21); + for (int j = 0; j < 16777216; j += 16777216) { + for (int k = 0; k < 2097152; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, 
%%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 2097152), "r"(buf + j + k + 4194304), "r"(buf + j + k + 6291456), "r"(buf + j + k + 8388608), "r"(buf + j + k + 10485760), "r"(buf + j + k + 12582912), "r"(buf + j + k + 14680064) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } +} +void helper_float_24(float *buf); +void helper_float_24(float *buf) { + helper_float_24_recursive(buf, 24); +} +void helper_float_25_recursive(float *buf, int depth); +void helper_float_25_recursive(float *buf, int depth) { + if (depth == 7) { + for (int j = 0; j < 128; j += 64) { + for (int k = 0; k < 8; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vpermilps $160, %%ymm0, %%ymm8\n" + "vpermilps $245, %%ymm0, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm0\n" + "vpermilps $160, %%ymm1, %%ymm8\n" + "vpermilps $245, %%ymm1, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm1\n" + "vpermilps $160, %%ymm2, %%ymm8\n" + "vpermilps $245, %%ymm2, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm2\n" + "vpermilps $160, %%ymm3, %%ymm8\n" + "vpermilps $245, %%ymm3, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm3\n" + "vpermilps $160, %%ymm4, %%ymm8\n" + "vpermilps $245, %%ymm4, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm4\n" + "vpermilps $160, %%ymm5, %%ymm8\n" + "vpermilps $245, %%ymm5, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm5\n" + "vpermilps $160, %%ymm6, %%ymm8\n" + "vpermilps $245, %%ymm6, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm6\n" + "vpermilps $160, %%ymm7, %%ymm8\n" + "vpermilps $245, %%ymm7, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm7\n" + "vpermilps $68, %%ymm0, %%ymm8\n" + "vpermilps $238, %%ymm0, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm0\n" + "vpermilps $68, %%ymm1, %%ymm8\n" + "vpermilps $238, %%ymm1, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm1\n" + "vpermilps $68, %%ymm2, %%ymm8\n" + "vpermilps $238, %%ymm2, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps 
%%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm2\n" + "vpermilps $68, %%ymm3, %%ymm8\n" + "vpermilps $238, %%ymm3, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm3\n" + "vpermilps $68, %%ymm4, %%ymm8\n" + "vpermilps $238, %%ymm4, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm4\n" + "vpermilps $68, %%ymm5, %%ymm8\n" + "vpermilps $238, %%ymm5, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm5\n" + "vpermilps $68, %%ymm6, %%ymm8\n" + "vpermilps $238, %%ymm6, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm6\n" + "vpermilps $68, %%ymm7, %%ymm8\n" + "vpermilps $238, %%ymm7, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm7\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm0, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm0, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm0\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm1, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm1, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm1\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm2, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm2, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm2\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm3, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm3, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm3\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm4, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm4, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm4\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm5, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm5, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm5\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm6, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm6, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm6\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm7, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm7, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + 
"vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 8), "r"(buf + j + k + 16), "r"(buf + j + k + 24), "r"(buf + j + k + 32), "r"(buf + j + k + 40), "r"(buf + j + k + 48), "r"(buf + j + k + 56) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + for (int j = 0; j < 128; j += 128) { + for (int k = 0; k < 64; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 64) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 10) { + helper_float_25_recursive(buf + 0, 7); + helper_float_25_recursive(buf + 128, 7); + helper_float_25_recursive(buf + 256, 7); + helper_float_25_recursive(buf + 384, 7); + helper_float_25_recursive(buf + 512, 7); + helper_float_25_recursive(buf + 640, 7); + helper_float_25_recursive(buf + 768, 7); + helper_float_25_recursive(buf + 896, 7); + for (int j = 0; j < 1024; j += 1024) { + for (int k = 0; k < 128; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 128), "r"(buf + j + k + 256), "r"(buf + j + k + 384), "r"(buf + j + k + 512), "r"(buf + j + k + 640), "r"(buf + j + k + 768), "r"(buf + j + k + 896) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 13) { + helper_float_25_recursive(buf + 0, 10); + 
helper_float_25_recursive(buf + 1024, 10); + helper_float_25_recursive(buf + 2048, 10); + helper_float_25_recursive(buf + 3072, 10); + helper_float_25_recursive(buf + 4096, 10); + helper_float_25_recursive(buf + 5120, 10); + helper_float_25_recursive(buf + 6144, 10); + helper_float_25_recursive(buf + 7168, 10); + for (int j = 0; j < 8192; j += 8192) { + for (int k = 0; k < 1024; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 1024), "r"(buf + j + k + 2048), "r"(buf + j + k + 3072), "r"(buf + j + k + 4096), "r"(buf + j + k + 5120), "r"(buf + j + k + 6144), "r"(buf + j + k + 7168) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 16) { + helper_float_25_recursive(buf + 0, 13); + helper_float_25_recursive(buf + 8192, 13); + helper_float_25_recursive(buf + 16384, 13); + helper_float_25_recursive(buf + 24576, 13); + helper_float_25_recursive(buf + 32768, 13); + helper_float_25_recursive(buf + 40960, 13); + helper_float_25_recursive(buf + 49152, 13); + helper_float_25_recursive(buf + 57344, 13); + for (int j = 0; j < 65536; j += 65536) { + for (int k = 0; k < 8192; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, 
%%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 8192), "r"(buf + j + k + 16384), "r"(buf + j + k + 24576), "r"(buf + j + k + 32768), "r"(buf + j + k + 40960), "r"(buf + j + k + 49152), "r"(buf + j + k + 57344) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 19) { + helper_float_25_recursive(buf + 0, 16); + helper_float_25_recursive(buf + 65536, 16); + helper_float_25_recursive(buf + 131072, 16); + helper_float_25_recursive(buf + 196608, 16); + helper_float_25_recursive(buf + 262144, 16); + helper_float_25_recursive(buf + 327680, 16); + helper_float_25_recursive(buf + 393216, 16); + helper_float_25_recursive(buf + 458752, 16); + for (int j = 0; j < 524288; j += 524288) { + for (int k = 0; k < 65536; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 65536), "r"(buf + j + k + 131072), "r"(buf + j + k + 196608), "r"(buf + j + k + 262144), "r"(buf + j + k + 327680), "r"(buf + j + k + 393216), "r"(buf + j + k + 458752) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 22) { + helper_float_25_recursive(buf + 0, 19); + helper_float_25_recursive(buf + 524288, 19); + helper_float_25_recursive(buf + 1048576, 19); + helper_float_25_recursive(buf + 1572864, 19); + helper_float_25_recursive(buf + 2097152, 19); + helper_float_25_recursive(buf + 2621440, 19); + helper_float_25_recursive(buf + 3145728, 19); + helper_float_25_recursive(buf + 3670016, 19); + for (int j = 0; j < 4194304; j += 4194304) { + for (int k = 0; k < 524288; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups 
(%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 524288), "r"(buf + j + k + 1048576), "r"(buf + j + k + 1572864), "r"(buf + j + k + 2097152), "r"(buf + j + k + 2621440), "r"(buf + j + k + 3145728), "r"(buf + j + k + 3670016) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 25) { + helper_float_25_recursive(buf + 0, 22); + helper_float_25_recursive(buf + 4194304, 22); + helper_float_25_recursive(buf + 8388608, 22); + helper_float_25_recursive(buf + 12582912, 22); + helper_float_25_recursive(buf + 16777216, 22); + helper_float_25_recursive(buf + 20971520, 22); + helper_float_25_recursive(buf + 25165824, 22); + helper_float_25_recursive(buf + 29360128, 22); + for (int j = 0; j < 33554432; j += 33554432) { + for (int k = 0; k < 4194304; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j 
+ k + 0), "r"(buf + j + k + 4194304), "r"(buf + j + k + 8388608), "r"(buf + j + k + 12582912), "r"(buf + j + k + 16777216), "r"(buf + j + k + 20971520), "r"(buf + j + k + 25165824), "r"(buf + j + k + 29360128) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } +} +void helper_float_25(float *buf); +void helper_float_25(float *buf) { + helper_float_25_recursive(buf, 25); +} +void helper_float_26_recursive(float *buf, int depth); +void helper_float_26_recursive(float *buf, int depth) { + if (depth == 12) { + for (int j = 0; j < 4096; j += 64) { + for (int k = 0; k < 8; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vpermilps $160, %%ymm0, %%ymm8\n" + "vpermilps $245, %%ymm0, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm0\n" + "vpermilps $160, %%ymm1, %%ymm8\n" + "vpermilps $245, %%ymm1, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm1\n" + "vpermilps $160, %%ymm2, %%ymm8\n" + "vpermilps $245, %%ymm2, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm2\n" + "vpermilps $160, %%ymm3, %%ymm8\n" + "vpermilps $245, %%ymm3, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm3\n" + "vpermilps $160, %%ymm4, %%ymm8\n" + "vpermilps $245, %%ymm4, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm4\n" + "vpermilps $160, %%ymm5, %%ymm8\n" + "vpermilps $245, %%ymm5, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm5\n" + "vpermilps $160, %%ymm6, %%ymm8\n" + "vpermilps $245, %%ymm6, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm6\n" + "vpermilps $160, %%ymm7, %%ymm8\n" + "vpermilps $245, %%ymm7, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm7\n" + "vpermilps $68, %%ymm0, %%ymm8\n" + "vpermilps $238, %%ymm0, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm0\n" + "vpermilps $68, %%ymm1, %%ymm8\n" + "vpermilps $238, %%ymm1, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm1\n" + "vpermilps $68, %%ymm2, %%ymm8\n" + "vpermilps $238, %%ymm2, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm2\n" + "vpermilps $68, %%ymm3, %%ymm8\n" + "vpermilps $238, %%ymm3, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm3\n" + "vpermilps $68, %%ymm4, %%ymm8\n" + "vpermilps $238, %%ymm4, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, 
%%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm4\n" + "vpermilps $68, %%ymm5, %%ymm8\n" + "vpermilps $238, %%ymm5, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm5\n" + "vpermilps $68, %%ymm6, %%ymm8\n" + "vpermilps $238, %%ymm6, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm6\n" + "vpermilps $68, %%ymm7, %%ymm8\n" + "vpermilps $238, %%ymm7, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm7\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm0, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm0, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm0\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm1, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm1, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm1\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm2, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm2, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm2\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm3, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm3, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm3\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm4, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm4, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm4\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm5, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm5, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm5\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm6, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm6, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm6\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm7, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm7, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf 
+ j + k + 8), "r"(buf + j + k + 16), "r"(buf + j + k + 24), "r"(buf + j + k + 32), "r"(buf + j + k + 40), "r"(buf + j + k + 48), "r"(buf + j + k + 56) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + for (int j = 0; j < 4096; j += 512) { + for (int k = 0; k < 64; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 64), "r"(buf + j + k + 128), "r"(buf + j + k + 192), "r"(buf + j + k + 256), "r"(buf + j + k + 320), "r"(buf + j + k + 384), "r"(buf + j + k + 448) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + for (int j = 0; j < 4096; j += 4096) { + for (int k = 0; k < 512; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + 
j + k + 0), "r"(buf + j + k + 512), "r"(buf + j + k + 1024), "r"(buf + j + k + 1536), "r"(buf + j + k + 2048), "r"(buf + j + k + 2560), "r"(buf + j + k + 3072), "r"(buf + j + k + 3584) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 15) { + helper_float_26_recursive(buf + 0, 12); + helper_float_26_recursive(buf + 4096, 12); + helper_float_26_recursive(buf + 8192, 12); + helper_float_26_recursive(buf + 12288, 12); + helper_float_26_recursive(buf + 16384, 12); + helper_float_26_recursive(buf + 20480, 12); + helper_float_26_recursive(buf + 24576, 12); + helper_float_26_recursive(buf + 28672, 12); + for (int j = 0; j < 32768; j += 32768) { + for (int k = 0; k < 4096; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 4096), "r"(buf + j + k + 8192), "r"(buf + j + k + 12288), "r"(buf + j + k + 16384), "r"(buf + j + k + 20480), "r"(buf + j + k + 24576), "r"(buf + j + k + 28672) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 18) { + helper_float_26_recursive(buf + 0, 15); + helper_float_26_recursive(buf + 32768, 15); + helper_float_26_recursive(buf + 65536, 15); + helper_float_26_recursive(buf + 98304, 15); + helper_float_26_recursive(buf + 131072, 15); + helper_float_26_recursive(buf + 163840, 15); + helper_float_26_recursive(buf + 196608, 15); + helper_float_26_recursive(buf + 229376, 15); + for (int j = 0; j < 262144; j += 262144) { + for (int k = 0; k < 32768; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps 
%%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 32768), "r"(buf + j + k + 65536), "r"(buf + j + k + 98304), "r"(buf + j + k + 131072), "r"(buf + j + k + 163840), "r"(buf + j + k + 196608), "r"(buf + j + k + 229376) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 21) { + helper_float_26_recursive(buf + 0, 18); + helper_float_26_recursive(buf + 262144, 18); + helper_float_26_recursive(buf + 524288, 18); + helper_float_26_recursive(buf + 786432, 18); + helper_float_26_recursive(buf + 1048576, 18); + helper_float_26_recursive(buf + 1310720, 18); + helper_float_26_recursive(buf + 1572864, 18); + helper_float_26_recursive(buf + 1835008, 18); + for (int j = 0; j < 2097152; j += 2097152) { + for (int k = 0; k < 262144; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 262144), "r"(buf + j + k + 524288), "r"(buf + j + k + 786432), "r"(buf + j + k + 1048576), "r"(buf + j + k + 1310720), "r"(buf + j + k + 1572864), "r"(buf + j + k + 1835008) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 24) { + helper_float_26_recursive(buf + 0, 21); + 
helper_float_26_recursive(buf + 2097152, 21); + helper_float_26_recursive(buf + 4194304, 21); + helper_float_26_recursive(buf + 6291456, 21); + helper_float_26_recursive(buf + 8388608, 21); + helper_float_26_recursive(buf + 10485760, 21); + helper_float_26_recursive(buf + 12582912, 21); + helper_float_26_recursive(buf + 14680064, 21); + for (int j = 0; j < 16777216; j += 16777216) { + for (int k = 0; k < 2097152; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 2097152), "r"(buf + j + k + 4194304), "r"(buf + j + k + 6291456), "r"(buf + j + k + 8388608), "r"(buf + j + k + 10485760), "r"(buf + j + k + 12582912), "r"(buf + j + k + 14680064) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 26) { + helper_float_26_recursive(buf + 0, 24); + helper_float_26_recursive(buf + 16777216, 24); + helper_float_26_recursive(buf + 33554432, 24); + helper_float_26_recursive(buf + 50331648, 24); + for (int j = 0; j < 67108864; j += 67108864) { + for (int k = 0; k < 16777216; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vmovups %%ymm0, (%0)\n" + "vmovups %%ymm1, (%1)\n" + "vmovups %%ymm2, (%2)\n" + "vmovups %%ymm3, (%3)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 16777216), "r"(buf + j + k + 33554432), "r"(buf + j + k + 50331648) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } +} +void helper_float_26(float *buf); +void helper_float_26(float *buf) { + helper_float_26_recursive(buf, 26); +} +void helper_float_27_recursive(float *buf, int depth); +void helper_float_27_recursive(float *buf, int depth) { + if 
(depth == 12) { + for (int j = 0; j < 4096; j += 64) { + for (int k = 0; k < 8; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vpermilps $160, %%ymm0, %%ymm8\n" + "vpermilps $245, %%ymm0, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm0\n" + "vpermilps $160, %%ymm1, %%ymm8\n" + "vpermilps $245, %%ymm1, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm1\n" + "vpermilps $160, %%ymm2, %%ymm8\n" + "vpermilps $245, %%ymm2, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm2\n" + "vpermilps $160, %%ymm3, %%ymm8\n" + "vpermilps $245, %%ymm3, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm3\n" + "vpermilps $160, %%ymm4, %%ymm8\n" + "vpermilps $245, %%ymm4, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm4\n" + "vpermilps $160, %%ymm5, %%ymm8\n" + "vpermilps $245, %%ymm5, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm5\n" + "vpermilps $160, %%ymm6, %%ymm8\n" + "vpermilps $245, %%ymm6, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm6\n" + "vpermilps $160, %%ymm7, %%ymm8\n" + "vpermilps $245, %%ymm7, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm7\n" + "vpermilps $68, %%ymm0, %%ymm8\n" + "vpermilps $238, %%ymm0, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm0\n" + "vpermilps $68, %%ymm1, %%ymm8\n" + "vpermilps $238, %%ymm1, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm1\n" + "vpermilps $68, %%ymm2, %%ymm8\n" + "vpermilps $238, %%ymm2, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm2\n" + "vpermilps $68, %%ymm3, %%ymm8\n" + "vpermilps $238, %%ymm3, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm3\n" + "vpermilps $68, %%ymm4, %%ymm8\n" + "vpermilps $238, %%ymm4, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm4\n" + "vpermilps $68, %%ymm5, %%ymm8\n" + "vpermilps $238, %%ymm5, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm5\n" + "vpermilps $68, %%ymm6, %%ymm8\n" + "vpermilps $238, %%ymm6, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm6\n" + "vpermilps $68, %%ymm7, %%ymm8\n" + "vpermilps $238, %%ymm7, 
%%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm7\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm0, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm0, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm0\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm1, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm1, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm1\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm2, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm2, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm2\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm3, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm3, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm3\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm4, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm4, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm4\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm5, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm5, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm5\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm6, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm6, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm6\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm7, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm7, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 8), "r"(buf + j + k + 16), "r"(buf + j + k + 24), "r"(buf + j + k + 32), "r"(buf + j + k + 40), "r"(buf + j + k + 48), "r"(buf + j + k + 56) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + for (int j = 0; j < 4096; j += 512) { + for (int k = 0; k < 64; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), 
%%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 64), "r"(buf + j + k + 128), "r"(buf + j + k + 192), "r"(buf + j + k + 256), "r"(buf + j + k + 320), "r"(buf + j + k + 384), "r"(buf + j + k + 448) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + for (int j = 0; j < 4096; j += 4096) { + for (int k = 0; k < 512; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 512), "r"(buf + j + k + 1024), "r"(buf + j + k + 1536), "r"(buf + j + k + 2048), "r"(buf + j + k + 2560), "r"(buf + j + k + 3072), "r"(buf + j + k + 3584) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 15) { + helper_float_27_recursive(buf + 0, 12); + helper_float_27_recursive(buf + 4096, 12); + helper_float_27_recursive(buf + 8192, 12); + helper_float_27_recursive(buf + 12288, 12); + helper_float_27_recursive(buf + 16384, 12); + 
helper_float_27_recursive(buf + 20480, 12); + helper_float_27_recursive(buf + 24576, 12); + helper_float_27_recursive(buf + 28672, 12); + for (int j = 0; j < 32768; j += 32768) { + for (int k = 0; k < 4096; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 4096), "r"(buf + j + k + 8192), "r"(buf + j + k + 12288), "r"(buf + j + k + 16384), "r"(buf + j + k + 20480), "r"(buf + j + k + 24576), "r"(buf + j + k + 28672) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 18) { + helper_float_27_recursive(buf + 0, 15); + helper_float_27_recursive(buf + 32768, 15); + helper_float_27_recursive(buf + 65536, 15); + helper_float_27_recursive(buf + 98304, 15); + helper_float_27_recursive(buf + 131072, 15); + helper_float_27_recursive(buf + 163840, 15); + helper_float_27_recursive(buf + 196608, 15); + helper_float_27_recursive(buf + 229376, 15); + for (int j = 0; j < 262144; j += 262144) { + for (int k = 0; k < 32768; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, 
%%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 32768), "r"(buf + j + k + 65536), "r"(buf + j + k + 98304), "r"(buf + j + k + 131072), "r"(buf + j + k + 163840), "r"(buf + j + k + 196608), "r"(buf + j + k + 229376) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 21) { + helper_float_27_recursive(buf + 0, 18); + helper_float_27_recursive(buf + 262144, 18); + helper_float_27_recursive(buf + 524288, 18); + helper_float_27_recursive(buf + 786432, 18); + helper_float_27_recursive(buf + 1048576, 18); + helper_float_27_recursive(buf + 1310720, 18); + helper_float_27_recursive(buf + 1572864, 18); + helper_float_27_recursive(buf + 1835008, 18); + for (int j = 0; j < 2097152; j += 2097152) { + for (int k = 0; k < 262144; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 262144), "r"(buf + j + k + 524288), "r"(buf + j + k + 786432), "r"(buf + j + k + 1048576), "r"(buf + j + k + 1310720), "r"(buf + j + k + 1572864), "r"(buf + j + k + 1835008) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 24) { + helper_float_27_recursive(buf + 0, 21); + helper_float_27_recursive(buf + 2097152, 21); + helper_float_27_recursive(buf + 4194304, 21); + helper_float_27_recursive(buf + 6291456, 21); + helper_float_27_recursive(buf + 8388608, 21); + helper_float_27_recursive(buf + 10485760, 21); + helper_float_27_recursive(buf + 12582912, 21); + helper_float_27_recursive(buf + 14680064, 21); + for (int j = 0; j < 16777216; j += 16777216) { + for (int k = 0; k < 2097152; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + 
"vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 2097152), "r"(buf + j + k + 4194304), "r"(buf + j + k + 6291456), "r"(buf + j + k + 8388608), "r"(buf + j + k + 10485760), "r"(buf + j + k + 12582912), "r"(buf + j + k + 14680064) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 27) { + helper_float_27_recursive(buf + 0, 24); + helper_float_27_recursive(buf + 16777216, 24); + helper_float_27_recursive(buf + 33554432, 24); + helper_float_27_recursive(buf + 50331648, 24); + helper_float_27_recursive(buf + 67108864, 24); + helper_float_27_recursive(buf + 83886080, 24); + helper_float_27_recursive(buf + 100663296, 24); + helper_float_27_recursive(buf + 117440512, 24); + for (int j = 0; j < 134217728; j += 134217728) { + for (int k = 0; k < 16777216; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 16777216), "r"(buf + j + k + 33554432), "r"(buf + j + k + 50331648), "r"(buf + j + k + 
67108864), "r"(buf + j + k + 83886080), "r"(buf + j + k + 100663296), "r"(buf + j + k + 117440512) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } +} +void helper_float_27(float *buf); +void helper_float_27(float *buf) { + helper_float_27_recursive(buf, 27); +} +void helper_float_28_recursive(float *buf, int depth); +void helper_float_28_recursive(float *buf, int depth) { + if (depth == 7) { + for (int j = 0; j < 128; j += 64) { + for (int k = 0; k < 8; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vpermilps $160, %%ymm0, %%ymm8\n" + "vpermilps $245, %%ymm0, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm0\n" + "vpermilps $160, %%ymm1, %%ymm8\n" + "vpermilps $245, %%ymm1, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm1\n" + "vpermilps $160, %%ymm2, %%ymm8\n" + "vpermilps $245, %%ymm2, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm2\n" + "vpermilps $160, %%ymm3, %%ymm8\n" + "vpermilps $245, %%ymm3, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm3\n" + "vpermilps $160, %%ymm4, %%ymm8\n" + "vpermilps $245, %%ymm4, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm4\n" + "vpermilps $160, %%ymm5, %%ymm8\n" + "vpermilps $245, %%ymm5, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm5\n" + "vpermilps $160, %%ymm6, %%ymm8\n" + "vpermilps $245, %%ymm6, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm6\n" + "vpermilps $160, %%ymm7, %%ymm8\n" + "vpermilps $245, %%ymm7, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm7\n" + "vpermilps $68, %%ymm0, %%ymm8\n" + "vpermilps $238, %%ymm0, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm0\n" + "vpermilps $68, %%ymm1, %%ymm8\n" + "vpermilps $238, %%ymm1, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm1\n" + "vpermilps $68, %%ymm2, %%ymm8\n" + "vpermilps $238, %%ymm2, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm2\n" + "vpermilps $68, %%ymm3, %%ymm8\n" + "vpermilps $238, %%ymm3, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm3\n" + "vpermilps $68, %%ymm4, %%ymm8\n" + "vpermilps $238, %%ymm4, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm4\n" + "vpermilps 
$68, %%ymm5, %%ymm8\n" + "vpermilps $238, %%ymm5, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm5\n" + "vpermilps $68, %%ymm6, %%ymm8\n" + "vpermilps $238, %%ymm6, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm6\n" + "vpermilps $68, %%ymm7, %%ymm8\n" + "vpermilps $238, %%ymm7, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm7\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm0, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm0, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm0\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm1, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm1, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm1\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm2, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm2, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm2\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm3, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm3, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm3\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm4, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm4, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm4\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm5, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm5, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm5\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm6, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm6, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm6\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm7, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm7, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 8), "r"(buf + j + k + 16), "r"(buf + j + k + 24), "r"(buf + j + k + 32), "r"(buf + j + k + 40), "r"(buf + 
j + k + 48), "r"(buf + j + k + 56) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + for (int j = 0; j < 128; j += 128) { + for (int k = 0; k < 64; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 64) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 10) { + helper_float_28_recursive(buf + 0, 7); + helper_float_28_recursive(buf + 128, 7); + helper_float_28_recursive(buf + 256, 7); + helper_float_28_recursive(buf + 384, 7); + helper_float_28_recursive(buf + 512, 7); + helper_float_28_recursive(buf + 640, 7); + helper_float_28_recursive(buf + 768, 7); + helper_float_28_recursive(buf + 896, 7); + for (int j = 0; j < 1024; j += 1024) { + for (int k = 0; k < 128; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 128), "r"(buf + j + k + 256), "r"(buf + j + k + 384), "r"(buf + j + k + 512), "r"(buf + j + k + 640), "r"(buf + j + k + 768), "r"(buf + j + k + 896) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 13) { + helper_float_28_recursive(buf + 0, 10); + helper_float_28_recursive(buf + 1024, 10); + helper_float_28_recursive(buf + 2048, 10); + helper_float_28_recursive(buf + 3072, 10); + helper_float_28_recursive(buf + 4096, 10); + helper_float_28_recursive(buf + 5120, 10); + helper_float_28_recursive(buf + 6144, 10); + helper_float_28_recursive(buf + 7168, 10); + for (int j = 0; j < 8192; j += 8192) { + for (int k = 0; k < 1024; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), 
%%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 1024), "r"(buf + j + k + 2048), "r"(buf + j + k + 3072), "r"(buf + j + k + 4096), "r"(buf + j + k + 5120), "r"(buf + j + k + 6144), "r"(buf + j + k + 7168) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 16) { + helper_float_28_recursive(buf + 0, 13); + helper_float_28_recursive(buf + 8192, 13); + helper_float_28_recursive(buf + 16384, 13); + helper_float_28_recursive(buf + 24576, 13); + helper_float_28_recursive(buf + 32768, 13); + helper_float_28_recursive(buf + 40960, 13); + helper_float_28_recursive(buf + 49152, 13); + helper_float_28_recursive(buf + 57344, 13); + for (int j = 0; j < 65536; j += 65536) { + for (int k = 0; k < 8192; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 8192), "r"(buf + j + k + 16384), "r"(buf + j + k + 24576), "r"(buf + j + k + 32768), "r"(buf + j + k + 40960), "r"(buf + j + k + 49152), "r"(buf + j + k + 57344) 
: "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 19) { + helper_float_28_recursive(buf + 0, 16); + helper_float_28_recursive(buf + 65536, 16); + helper_float_28_recursive(buf + 131072, 16); + helper_float_28_recursive(buf + 196608, 16); + helper_float_28_recursive(buf + 262144, 16); + helper_float_28_recursive(buf + 327680, 16); + helper_float_28_recursive(buf + 393216, 16); + helper_float_28_recursive(buf + 458752, 16); + for (int j = 0; j < 524288; j += 524288) { + for (int k = 0; k < 65536; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 65536), "r"(buf + j + k + 131072), "r"(buf + j + k + 196608), "r"(buf + j + k + 262144), "r"(buf + j + k + 327680), "r"(buf + j + k + 393216), "r"(buf + j + k + 458752) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 22) { + helper_float_28_recursive(buf + 0, 19); + helper_float_28_recursive(buf + 524288, 19); + helper_float_28_recursive(buf + 1048576, 19); + helper_float_28_recursive(buf + 1572864, 19); + helper_float_28_recursive(buf + 2097152, 19); + helper_float_28_recursive(buf + 2621440, 19); + helper_float_28_recursive(buf + 3145728, 19); + helper_float_28_recursive(buf + 3670016, 19); + for (int j = 0; j < 4194304; j += 4194304) { + for (int k = 0; k < 524288; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps 
%%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 524288), "r"(buf + j + k + 1048576), "r"(buf + j + k + 1572864), "r"(buf + j + k + 2097152), "r"(buf + j + k + 2621440), "r"(buf + j + k + 3145728), "r"(buf + j + k + 3670016) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 25) { + helper_float_28_recursive(buf + 0, 22); + helper_float_28_recursive(buf + 4194304, 22); + helper_float_28_recursive(buf + 8388608, 22); + helper_float_28_recursive(buf + 12582912, 22); + helper_float_28_recursive(buf + 16777216, 22); + helper_float_28_recursive(buf + 20971520, 22); + helper_float_28_recursive(buf + 25165824, 22); + helper_float_28_recursive(buf + 29360128, 22); + for (int j = 0; j < 33554432; j += 33554432) { + for (int k = 0; k < 4194304; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 4194304), "r"(buf + j + k + 8388608), "r"(buf + j + k + 12582912), "r"(buf + j + k + 16777216), "r"(buf + j + k + 20971520), "r"(buf + j + k + 25165824), "r"(buf + j + k + 29360128) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 28) { + helper_float_28_recursive(buf + 0, 25); + helper_float_28_recursive(buf + 33554432, 25); + helper_float_28_recursive(buf + 67108864, 25); + helper_float_28_recursive(buf + 
100663296, 25); + helper_float_28_recursive(buf + 134217728, 25); + helper_float_28_recursive(buf + 167772160, 25); + helper_float_28_recursive(buf + 201326592, 25); + helper_float_28_recursive(buf + 234881024, 25); + for (int j = 0; j < 268435456; j += 268435456) { + for (int k = 0; k < 33554432; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 33554432), "r"(buf + j + k + 67108864), "r"(buf + j + k + 100663296), "r"(buf + j + k + 134217728), "r"(buf + j + k + 167772160), "r"(buf + j + k + 201326592), "r"(buf + j + k + 234881024) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } +} +void helper_float_28(float *buf); +void helper_float_28(float *buf) { + helper_float_28_recursive(buf, 28); +} +void helper_float_29_recursive(float *buf, int depth); +void helper_float_29_recursive(float *buf, int depth) { + if (depth == 12) { + for (int j = 0; j < 4096; j += 64) { + for (int k = 0; k < 8; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vpermilps $160, %%ymm0, %%ymm8\n" + "vpermilps $245, %%ymm0, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm0\n" + "vpermilps $160, %%ymm1, %%ymm8\n" + "vpermilps $245, %%ymm1, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm1\n" + "vpermilps $160, %%ymm2, %%ymm8\n" + "vpermilps $245, %%ymm2, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm2\n" + "vpermilps $160, %%ymm3, %%ymm8\n" + "vpermilps $245, %%ymm3, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm3\n" + "vpermilps $160, %%ymm4, %%ymm8\n" + "vpermilps $245, %%ymm4, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + 
"vaddsubps %%ymm11, %%ymm8, %%ymm4\n" + "vpermilps $160, %%ymm5, %%ymm8\n" + "vpermilps $245, %%ymm5, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm5\n" + "vpermilps $160, %%ymm6, %%ymm8\n" + "vpermilps $245, %%ymm6, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm6\n" + "vpermilps $160, %%ymm7, %%ymm8\n" + "vpermilps $245, %%ymm7, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm7\n" + "vpermilps $68, %%ymm0, %%ymm8\n" + "vpermilps $238, %%ymm0, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm0\n" + "vpermilps $68, %%ymm1, %%ymm8\n" + "vpermilps $238, %%ymm1, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm1\n" + "vpermilps $68, %%ymm2, %%ymm8\n" + "vpermilps $238, %%ymm2, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm2\n" + "vpermilps $68, %%ymm3, %%ymm8\n" + "vpermilps $238, %%ymm3, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm3\n" + "vpermilps $68, %%ymm4, %%ymm8\n" + "vpermilps $238, %%ymm4, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm4\n" + "vpermilps $68, %%ymm5, %%ymm8\n" + "vpermilps $238, %%ymm5, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm5\n" + "vpermilps $68, %%ymm6, %%ymm8\n" + "vpermilps $238, %%ymm6, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm6\n" + "vpermilps $68, %%ymm7, %%ymm8\n" + "vpermilps $238, %%ymm7, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm7\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm0, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm0, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm0\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm1, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm1, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm1\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm2, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm2, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm2\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm3, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm3, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm3\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm4, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm4, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm4\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm5, %%ymm8, %%ymm9\n" + 
"vperm2f128 $0, %%ymm5, %%ymm5, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm5, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm5\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm6, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm6, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm6\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm7, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm7, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 8), "r"(buf + j + k + 16), "r"(buf + j + k + 24), "r"(buf + j + k + 32), "r"(buf + j + k + 40), "r"(buf + j + k + 48), "r"(buf + j + k + 56) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + for (int j = 0; j < 4096; j += 512) { + for (int k = 0; k < 64; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 64), "r"(buf + j + k + 128), "r"(buf + j + k + 192), "r"(buf + j + k + 256), "r"(buf + 
j + k + 320), "r"(buf + j + k + 384), "r"(buf + j + k + 448) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + for (int j = 0; j < 4096; j += 4096) { + for (int k = 0; k < 512; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 512), "r"(buf + j + k + 1024), "r"(buf + j + k + 1536), "r"(buf + j + k + 2048), "r"(buf + j + k + 2560), "r"(buf + j + k + 3072), "r"(buf + j + k + 3584) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 15) { + helper_float_29_recursive(buf + 0, 12); + helper_float_29_recursive(buf + 4096, 12); + helper_float_29_recursive(buf + 8192, 12); + helper_float_29_recursive(buf + 12288, 12); + helper_float_29_recursive(buf + 16384, 12); + helper_float_29_recursive(buf + 20480, 12); + helper_float_29_recursive(buf + 24576, 12); + helper_float_29_recursive(buf + 28672, 12); + for (int j = 0; j < 32768; j += 32768) { + for (int k = 0; k < 4096; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, 
%%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 4096), "r"(buf + j + k + 8192), "r"(buf + j + k + 12288), "r"(buf + j + k + 16384), "r"(buf + j + k + 20480), "r"(buf + j + k + 24576), "r"(buf + j + k + 28672) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 18) { + helper_float_29_recursive(buf + 0, 15); + helper_float_29_recursive(buf + 32768, 15); + helper_float_29_recursive(buf + 65536, 15); + helper_float_29_recursive(buf + 98304, 15); + helper_float_29_recursive(buf + 131072, 15); + helper_float_29_recursive(buf + 163840, 15); + helper_float_29_recursive(buf + 196608, 15); + helper_float_29_recursive(buf + 229376, 15); + for (int j = 0; j < 262144; j += 262144) { + for (int k = 0; k < 32768; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 32768), "r"(buf + j + k + 65536), "r"(buf + j + k + 98304), "r"(buf + j + k + 131072), "r"(buf + j + k + 163840), "r"(buf + j + k + 196608), "r"(buf + j + k + 229376) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 21) { + helper_float_29_recursive(buf + 0, 18); + helper_float_29_recursive(buf + 262144, 18); + helper_float_29_recursive(buf + 524288, 18); + helper_float_29_recursive(buf + 786432, 18); + helper_float_29_recursive(buf + 1048576, 18); + helper_float_29_recursive(buf + 1310720, 18); + helper_float_29_recursive(buf + 1572864, 18); + helper_float_29_recursive(buf + 1835008, 18); + for (int j = 0; j < 2097152; j += 2097152) { + for (int k = 0; k < 262144; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), 
%%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 262144), "r"(buf + j + k + 524288), "r"(buf + j + k + 786432), "r"(buf + j + k + 1048576), "r"(buf + j + k + 1310720), "r"(buf + j + k + 1572864), "r"(buf + j + k + 1835008) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 24) { + helper_float_29_recursive(buf + 0, 21); + helper_float_29_recursive(buf + 2097152, 21); + helper_float_29_recursive(buf + 4194304, 21); + helper_float_29_recursive(buf + 6291456, 21); + helper_float_29_recursive(buf + 8388608, 21); + helper_float_29_recursive(buf + 10485760, 21); + helper_float_29_recursive(buf + 12582912, 21); + helper_float_29_recursive(buf + 14680064, 21); + for (int j = 0; j < 16777216; j += 16777216) { + for (int k = 0; k < 2097152; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 2097152), "r"(buf + j + k + 4194304), "r"(buf + j + k + 
6291456), "r"(buf + j + k + 8388608), "r"(buf + j + k + 10485760), "r"(buf + j + k + 12582912), "r"(buf + j + k + 14680064) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 27) { + helper_float_29_recursive(buf + 0, 24); + helper_float_29_recursive(buf + 16777216, 24); + helper_float_29_recursive(buf + 33554432, 24); + helper_float_29_recursive(buf + 50331648, 24); + helper_float_29_recursive(buf + 67108864, 24); + helper_float_29_recursive(buf + 83886080, 24); + helper_float_29_recursive(buf + 100663296, 24); + helper_float_29_recursive(buf + 117440512, 24); + for (int j = 0; j < 134217728; j += 134217728) { + for (int k = 0; k < 16777216; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 16777216), "r"(buf + j + k + 33554432), "r"(buf + j + k + 50331648), "r"(buf + j + k + 67108864), "r"(buf + j + k + 83886080), "r"(buf + j + k + 100663296), "r"(buf + j + k + 117440512) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 29) { + helper_float_29_recursive(buf + 0, 27); + helper_float_29_recursive(buf + 134217728, 27); + helper_float_29_recursive(buf + 268435456, 27); + helper_float_29_recursive(buf + 402653184, 27); + for (int j = 0; j < 536870912; j += 536870912) { + for (int k = 0; k < 134217728; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vmovups %%ymm0, (%0)\n" + "vmovups %%ymm1, (%1)\n" + "vmovups %%ymm2, (%2)\n" + "vmovups %%ymm3, (%3)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 134217728), "r"(buf + j + k + 268435456), "r"(buf + j + k + 402653184) : "%ymm0", "%ymm1", 
"%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } +} +void helper_float_29(float *buf); +void helper_float_29(float *buf) { + helper_float_29_recursive(buf, 29); +} +void helper_float_30_recursive(float *buf, int depth); +void helper_float_30_recursive(float *buf, int depth) { + if (depth == 6) { + for (int j = 0; j < 64; j += 64) { + for (int k = 0; k < 8; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vpermilps $160, %%ymm0, %%ymm8\n" + "vpermilps $245, %%ymm0, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm0\n" + "vpermilps $160, %%ymm1, %%ymm8\n" + "vpermilps $245, %%ymm1, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm1\n" + "vpermilps $160, %%ymm2, %%ymm8\n" + "vpermilps $245, %%ymm2, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm2\n" + "vpermilps $160, %%ymm3, %%ymm8\n" + "vpermilps $245, %%ymm3, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm3\n" + "vpermilps $160, %%ymm4, %%ymm8\n" + "vpermilps $245, %%ymm4, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm4\n" + "vpermilps $160, %%ymm5, %%ymm8\n" + "vpermilps $245, %%ymm5, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm5\n" + "vpermilps $160, %%ymm6, %%ymm8\n" + "vpermilps $245, %%ymm6, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm6\n" + "vpermilps $160, %%ymm7, %%ymm8\n" + "vpermilps $245, %%ymm7, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm7\n" + "vpermilps $68, %%ymm0, %%ymm8\n" + "vpermilps $238, %%ymm0, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm0\n" + "vpermilps $68, %%ymm1, %%ymm8\n" + "vpermilps $238, %%ymm1, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm1\n" + "vpermilps $68, %%ymm2, %%ymm8\n" + "vpermilps $238, %%ymm2, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm2\n" + "vpermilps $68, %%ymm3, %%ymm8\n" + "vpermilps $238, %%ymm3, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm3\n" + "vpermilps $68, %%ymm4, %%ymm8\n" + "vpermilps $238, %%ymm4, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm4\n" + "vpermilps $68, %%ymm5, %%ymm8\n" + "vpermilps $238, %%ymm5, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, 
%%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm5\n" + "vpermilps $68, %%ymm6, %%ymm8\n" + "vpermilps $238, %%ymm6, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm6\n" + "vpermilps $68, %%ymm7, %%ymm8\n" + "vpermilps $238, %%ymm7, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm7\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm0, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm0, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm0\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm1, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm1, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm1\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm2, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm2, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm2\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm3, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm3, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm3\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm4, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm4, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm4\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm5, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm5, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm5\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm6, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm6, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm6\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm7, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm7, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 8), "r"(buf + j + k + 16), "r"(buf + j + k + 24), "r"(buf + j + k + 32), "r"(buf + j + k + 40), "r"(buf + j + k + 48), "r"(buf + j + k + 56) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", 
"%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 9) { + helper_float_30_recursive(buf + 0, 6); + helper_float_30_recursive(buf + 64, 6); + helper_float_30_recursive(buf + 128, 6); + helper_float_30_recursive(buf + 192, 6); + helper_float_30_recursive(buf + 256, 6); + helper_float_30_recursive(buf + 320, 6); + helper_float_30_recursive(buf + 384, 6); + helper_float_30_recursive(buf + 448, 6); + for (int j = 0; j < 512; j += 512) { + for (int k = 0; k < 64; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 64), "r"(buf + j + k + 128), "r"(buf + j + k + 192), "r"(buf + j + k + 256), "r"(buf + j + k + 320), "r"(buf + j + k + 384), "r"(buf + j + k + 448) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 12) { + helper_float_30_recursive(buf + 0, 9); + helper_float_30_recursive(buf + 512, 9); + helper_float_30_recursive(buf + 1024, 9); + helper_float_30_recursive(buf + 1536, 9); + helper_float_30_recursive(buf + 2048, 9); + helper_float_30_recursive(buf + 2560, 9); + helper_float_30_recursive(buf + 3072, 9); + helper_float_30_recursive(buf + 3584, 9); + for (int j = 0; j < 4096; j += 4096) { + for (int k = 0; k < 512; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, 
%%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 512), "r"(buf + j + k + 1024), "r"(buf + j + k + 1536), "r"(buf + j + k + 2048), "r"(buf + j + k + 2560), "r"(buf + j + k + 3072), "r"(buf + j + k + 3584) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 15) { + helper_float_30_recursive(buf + 0, 12); + helper_float_30_recursive(buf + 4096, 12); + helper_float_30_recursive(buf + 8192, 12); + helper_float_30_recursive(buf + 12288, 12); + helper_float_30_recursive(buf + 16384, 12); + helper_float_30_recursive(buf + 20480, 12); + helper_float_30_recursive(buf + 24576, 12); + helper_float_30_recursive(buf + 28672, 12); + for (int j = 0; j < 32768; j += 32768) { + for (int k = 0; k < 4096; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 4096), "r"(buf + j + k + 8192), "r"(buf + j + k + 12288), "r"(buf + j + k + 16384), "r"(buf + j + k + 20480), "r"(buf + j + k + 24576), "r"(buf + j + k + 28672) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 18) { + helper_float_30_recursive(buf + 0, 15); + helper_float_30_recursive(buf + 32768, 15); + helper_float_30_recursive(buf + 65536, 15); + helper_float_30_recursive(buf + 98304, 15); + helper_float_30_recursive(buf + 131072, 15); + helper_float_30_recursive(buf + 163840, 15); + helper_float_30_recursive(buf + 196608, 15); + helper_float_30_recursive(buf + 229376, 15); + for (int j = 0; j < 262144; j += 262144) { + 
for (int k = 0; k < 32768; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 32768), "r"(buf + j + k + 65536), "r"(buf + j + k + 98304), "r"(buf + j + k + 131072), "r"(buf + j + k + 163840), "r"(buf + j + k + 196608), "r"(buf + j + k + 229376) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 21) { + helper_float_30_recursive(buf + 0, 18); + helper_float_30_recursive(buf + 262144, 18); + helper_float_30_recursive(buf + 524288, 18); + helper_float_30_recursive(buf + 786432, 18); + helper_float_30_recursive(buf + 1048576, 18); + helper_float_30_recursive(buf + 1310720, 18); + helper_float_30_recursive(buf + 1572864, 18); + helper_float_30_recursive(buf + 1835008, 18); + for (int j = 0; j < 2097152; j += 2097152) { + for (int k = 0; k < 262144; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, 
(%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 262144), "r"(buf + j + k + 524288), "r"(buf + j + k + 786432), "r"(buf + j + k + 1048576), "r"(buf + j + k + 1310720), "r"(buf + j + k + 1572864), "r"(buf + j + k + 1835008) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 24) { + helper_float_30_recursive(buf + 0, 21); + helper_float_30_recursive(buf + 2097152, 21); + helper_float_30_recursive(buf + 4194304, 21); + helper_float_30_recursive(buf + 6291456, 21); + helper_float_30_recursive(buf + 8388608, 21); + helper_float_30_recursive(buf + 10485760, 21); + helper_float_30_recursive(buf + 12582912, 21); + helper_float_30_recursive(buf + 14680064, 21); + for (int j = 0; j < 16777216; j += 16777216) { + for (int k = 0; k < 2097152; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 2097152), "r"(buf + j + k + 4194304), "r"(buf + j + k + 6291456), "r"(buf + j + k + 8388608), "r"(buf + j + k + 10485760), "r"(buf + j + k + 12582912), "r"(buf + j + k + 14680064) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 27) { + helper_float_30_recursive(buf + 0, 24); + helper_float_30_recursive(buf + 16777216, 24); + helper_float_30_recursive(buf + 33554432, 24); + helper_float_30_recursive(buf + 50331648, 24); + helper_float_30_recursive(buf + 67108864, 24); + helper_float_30_recursive(buf + 83886080, 24); + helper_float_30_recursive(buf + 100663296, 24); + helper_float_30_recursive(buf + 117440512, 24); + for (int j = 0; j < 134217728; j += 134217728) { + for (int k = 0; k < 16777216; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, 
%%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 16777216), "r"(buf + j + k + 33554432), "r"(buf + j + k + 50331648), "r"(buf + j + k + 67108864), "r"(buf + j + k + 83886080), "r"(buf + j + k + 100663296), "r"(buf + j + k + 117440512) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 30) { + helper_float_30_recursive(buf + 0, 27); + helper_float_30_recursive(buf + 134217728, 27); + helper_float_30_recursive(buf + 268435456, 27); + helper_float_30_recursive(buf + 402653184, 27); + helper_float_30_recursive(buf + 536870912, 27); + helper_float_30_recursive(buf + 671088640, 27); + helper_float_30_recursive(buf + 805306368, 27); + helper_float_30_recursive(buf + 939524096, 27); + for (int j = 0; j < 1073741824; j += 1073741824) { + for (int k = 0; k < 134217728; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 134217728), "r"(buf + j + k + 268435456), "r"(buf + j + k + 402653184), "r"(buf + j + k + 536870912), "r"(buf + j + k + 671088640), "r"(buf + j + k + 805306368), "r"(buf + j + k + 939524096) : "%ymm0", "%ymm1", 
"%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } +} +void helper_float_30(float *buf); +void helper_float_30(float *buf) { + helper_float_30_recursive(buf, 30); +} +int fht_float(float *buf, int log_n) { + if (log_n == 0) { + return 0; + } + if (log_n == 1) { + helper_float_1(buf); + return 0; + } + if (log_n == 2) { + helper_float_2(buf); + return 0; + } + if (log_n == 3) { + helper_float_3(buf); + return 0; + } + if (log_n == 4) { + helper_float_4(buf); + return 0; + } + if (log_n == 5) { + helper_float_5(buf); + return 0; + } + if (log_n == 6) { + helper_float_6(buf); + return 0; + } + if (log_n == 7) { + helper_float_7(buf); + return 0; + } + if (log_n == 8) { + helper_float_8(buf); + return 0; + } + if (log_n == 9) { + helper_float_9(buf); + return 0; + } + if (log_n == 10) { + helper_float_10(buf); + return 0; + } + if (log_n == 11) { + helper_float_11(buf); + return 0; + } + if (log_n == 12) { + helper_float_12(buf); + return 0; + } + if (log_n == 13) { + helper_float_13(buf); + return 0; + } + if (log_n == 14) { + helper_float_14(buf); + return 0; + } + if (log_n == 15) { + helper_float_15(buf); + return 0; + } + if (log_n == 16) { + helper_float_16(buf); + return 0; + } + if (log_n == 17) { + helper_float_17(buf); + return 0; + } + if (log_n == 18) { + helper_float_18(buf); + return 0; + } + if (log_n == 19) { + helper_float_19(buf); + return 0; + } + if (log_n == 20) { + helper_float_20(buf); + return 0; + } + if (log_n == 21) { + helper_float_21(buf); + return 0; + } + if (log_n == 22) { + helper_float_22(buf); + return 0; + } + if (log_n == 23) { + helper_float_23(buf); + return 0; + } + if (log_n == 24) { + helper_float_24(buf); + return 0; + } + if (log_n == 25) { + helper_float_25(buf); + return 0; + } + if (log_n == 26) { + helper_float_26(buf); + return 0; + } + if (log_n == 27) { + helper_float_27(buf); + return 0; + } + if (log_n == 28) { + helper_float_28(buf); + return 0; + } + if (log_n == 29) { + helper_float_29(buf); + return 0; + } + if (log_n == 30) { + helper_float_30(buf); + return 0; + } + return 1; +} +static inline void helper_double_1(double *buf); +static inline void helper_double_1(double *buf) { + for (int j = 0; j < 2; j += 2) { + for (int k = 0; k < 1; ++k) { + double u = buf[j + k]; + double v = buf[j + k + 1]; + buf[j + k] = u + v; + buf[j + k + 1] = u - v; + } + } +} +static inline void helper_double_2(double *buf); +static inline void helper_double_2(double *buf) { + for (int j = 0; j < 4; j += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vpermilpd $0, %%ymm0, %%ymm8\n" + "vpermilpd $15, %%ymm0, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm0\n" + "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm0, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm0, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm0\n" + "vmovupd %%ymm0, (%0)\n" + :: "r"(buf + j) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } +} +static inline void helper_double_3(double *buf); +static inline void helper_double_3(double *buf) { + for (int j = 0; j < 8; j += 8) { + for (int k = 0; k < 4; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vpermilpd $0, %%ymm0, %%ymm8\n" + "vpermilpd 
$15, %%ymm0, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm0\n" + "vpermilpd $0, %%ymm1, %%ymm8\n" + "vpermilpd $15, %%ymm1, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm1\n" + "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm0, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm0, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm0\n" + "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm1, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm1, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm1\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 4) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } +} +void helper_double_4_recursive(double *buf, int depth); +void helper_double_4_recursive(double *buf, int depth) { + if (depth == 4) { + for (int j = 0; j < 16; j += 16) { + for (int k = 0; k < 4; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vpermilpd $0, %%ymm0, %%ymm8\n" + "vpermilpd $15, %%ymm0, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm0\n" + "vpermilpd $0, %%ymm1, %%ymm8\n" + "vpermilpd $15, %%ymm1, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm1\n" + "vpermilpd $0, %%ymm2, %%ymm8\n" + "vpermilpd $15, %%ymm2, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm2\n" + "vpermilpd $0, %%ymm3, %%ymm8\n" + "vpermilpd $15, %%ymm3, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm3\n" + "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm0, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm0, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm0\n" + "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm1, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm1, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm1\n" + "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm2, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm2, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm2\n" + "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm3, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm3, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm3\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vmovupd %%ymm0, (%0)\n" + "vmovupd %%ymm1, (%1)\n" + "vmovupd %%ymm2, (%2)\n" + "vmovupd %%ymm3, (%3)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", 
"%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } +} +void helper_double_4(double *buf); +void helper_double_4(double *buf) { + helper_double_4_recursive(buf, 4); +} +static inline void helper_double_5(double *buf); +static inline void helper_double_5(double *buf) { + for (int j = 0; j < 32; j += 32) { + for (int k = 0; k < 4; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vpermilpd $0, %%ymm0, %%ymm8\n" + "vpermilpd $15, %%ymm0, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm0\n" + "vpermilpd $0, %%ymm1, %%ymm8\n" + "vpermilpd $15, %%ymm1, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm1\n" + "vpermilpd $0, %%ymm2, %%ymm8\n" + "vpermilpd $15, %%ymm2, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm2\n" + "vpermilpd $0, %%ymm3, %%ymm8\n" + "vpermilpd $15, %%ymm3, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm3\n" + "vpermilpd $0, %%ymm4, %%ymm8\n" + "vpermilpd $15, %%ymm4, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm4\n" + "vpermilpd $0, %%ymm5, %%ymm8\n" + "vpermilpd $15, %%ymm5, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm5\n" + "vpermilpd $0, %%ymm6, %%ymm8\n" + "vpermilpd $15, %%ymm6, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm6\n" + "vpermilpd $0, %%ymm7, %%ymm8\n" + "vpermilpd $15, %%ymm7, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm7\n" + "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm0, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm0, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm0\n" + "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm1, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm1, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm1\n" + "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm2, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm2, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm2\n" + "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm3, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm3, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm3\n" + "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm4, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm4, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm4\n" + "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm5, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm5, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm5\n" + "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm6, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm6, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm6\n" + 
"vperm2f128 $0, %%ymm7, %%ymm7, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm7, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm7, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } +} +static inline void helper_double_6(double *buf); +static inline void helper_double_6(double *buf) { + for (int j = 0; j < 64; j += 32) { + for (int k = 0; k < 4; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vpermilpd $0, %%ymm0, %%ymm8\n" + "vpermilpd $15, %%ymm0, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm0\n" + "vpermilpd $0, %%ymm1, %%ymm8\n" + "vpermilpd $15, %%ymm1, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm1\n" + "vpermilpd $0, %%ymm2, %%ymm8\n" + "vpermilpd $15, %%ymm2, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm2\n" + "vpermilpd $0, %%ymm3, %%ymm8\n" + "vpermilpd $15, %%ymm3, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm3\n" + "vpermilpd $0, %%ymm4, %%ymm8\n" + "vpermilpd $15, %%ymm4, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm4\n" + "vpermilpd $0, %%ymm5, %%ymm8\n" + "vpermilpd $15, %%ymm5, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm5\n" + "vpermilpd $0, %%ymm6, %%ymm8\n" + "vpermilpd $15, %%ymm6, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm6\n" + "vpermilpd $0, %%ymm7, %%ymm8\n" + "vpermilpd $15, %%ymm7, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + 
"vaddsubpd %%ymm11, %%ymm8, %%ymm7\n" + "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm0, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm0, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm0\n" + "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm1, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm1, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm1\n" + "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm2, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm2, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm2\n" + "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm3, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm3, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm3\n" + "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm4, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm4, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm4\n" + "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm5, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm5, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm5\n" + "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm6, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm6, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm6\n" + "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm7, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm7, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + for (int j = 0; j < 64; j += 64) { + for (int k = 0; k < 32; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 32) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", 
"%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } +} +static inline void helper_double_7(double *buf); +static inline void helper_double_7(double *buf) { + for (int j = 0; j < 128; j += 32) { + for (int k = 0; k < 4; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vpermilpd $0, %%ymm0, %%ymm8\n" + "vpermilpd $15, %%ymm0, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm0\n" + "vpermilpd $0, %%ymm1, %%ymm8\n" + "vpermilpd $15, %%ymm1, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm1\n" + "vpermilpd $0, %%ymm2, %%ymm8\n" + "vpermilpd $15, %%ymm2, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm2\n" + "vpermilpd $0, %%ymm3, %%ymm8\n" + "vpermilpd $15, %%ymm3, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm3\n" + "vpermilpd $0, %%ymm4, %%ymm8\n" + "vpermilpd $15, %%ymm4, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm4\n" + "vpermilpd $0, %%ymm5, %%ymm8\n" + "vpermilpd $15, %%ymm5, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm5\n" + "vpermilpd $0, %%ymm6, %%ymm8\n" + "vpermilpd $15, %%ymm6, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm6\n" + "vpermilpd $0, %%ymm7, %%ymm8\n" + "vpermilpd $15, %%ymm7, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm7\n" + "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm0, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm0, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm0\n" + "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm1, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm1, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm1\n" + "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm2, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm2, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm2\n" + "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm3, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm3, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm3\n" + "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm4, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm4, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm4\n" + "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm5, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm5, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm5\n" + "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm6, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm6, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm6\n" + "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm7, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm7, %%ymm11\n" + 
"vaddpd %%ymm11, %%ymm8, %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + for (int j = 0; j < 128; j += 128) { + for (int k = 0; k < 32; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vmovupd %%ymm0, (%0)\n" + "vmovupd %%ymm1, (%1)\n" + "vmovupd %%ymm2, (%2)\n" + "vmovupd %%ymm3, (%3)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } +} +static inline void helper_double_8(double *buf); +static inline void helper_double_8(double *buf) { + for (int j = 0; j < 256; j += 32) { + for (int k = 0; k < 4; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vpermilpd $0, %%ymm0, %%ymm8\n" + "vpermilpd $15, %%ymm0, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm0\n" + "vpermilpd $0, %%ymm1, %%ymm8\n" + "vpermilpd $15, %%ymm1, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm1\n" + "vpermilpd $0, %%ymm2, %%ymm8\n" + "vpermilpd $15, %%ymm2, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm2\n" + "vpermilpd $0, %%ymm3, %%ymm8\n" + "vpermilpd $15, %%ymm3, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm3\n" + 
"vpermilpd $0, %%ymm4, %%ymm8\n" + "vpermilpd $15, %%ymm4, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm4\n" + "vpermilpd $0, %%ymm5, %%ymm8\n" + "vpermilpd $15, %%ymm5, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm5\n" + "vpermilpd $0, %%ymm6, %%ymm8\n" + "vpermilpd $15, %%ymm6, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm6\n" + "vpermilpd $0, %%ymm7, %%ymm8\n" + "vpermilpd $15, %%ymm7, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm7\n" + "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm0, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm0, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm0\n" + "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm1, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm1, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm1\n" + "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm2, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm2, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm2\n" + "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm3, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm3, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm3\n" + "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm4, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm4, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm4\n" + "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm5, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm5, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm5\n" + "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm6, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm6, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm6\n" + "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm7, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm7, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k 
+ 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + for (int j = 0; j < 256; j += 256) { + for (int k = 0; k < 32; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96), "r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } +} +static inline void helper_double_9(double *buf); +static inline void helper_double_9(double *buf) { + for (int j = 0; j < 512; j += 32) { + for (int k = 0; k < 4; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vpermilpd $0, %%ymm0, %%ymm8\n" + "vpermilpd $15, %%ymm0, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm0\n" + "vpermilpd $0, %%ymm1, %%ymm8\n" + "vpermilpd $15, %%ymm1, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm1\n" + "vpermilpd $0, %%ymm2, %%ymm8\n" + "vpermilpd $15, %%ymm2, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm2\n" + "vpermilpd $0, %%ymm3, %%ymm8\n" + "vpermilpd $15, %%ymm3, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm3\n" + "vpermilpd $0, %%ymm4, %%ymm8\n" + "vpermilpd $15, %%ymm4, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm4\n" + "vpermilpd $0, %%ymm5, %%ymm8\n" + "vpermilpd $15, %%ymm5, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, 
%%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm5\n" + "vpermilpd $0, %%ymm6, %%ymm8\n" + "vpermilpd $15, %%ymm6, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm6\n" + "vpermilpd $0, %%ymm7, %%ymm8\n" + "vpermilpd $15, %%ymm7, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm7\n" + "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm0, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm0, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm0\n" + "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm1, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm1, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm1\n" + "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm2, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm2, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm2\n" + "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm3, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm3, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm3\n" + "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm4, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm4, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm4\n" + "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm5, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm5, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm5\n" + "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm6, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm6, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm6\n" + "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm7, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm7, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + for (int j = 0; j < 512; j += 256) { + for (int 
k = 0; k < 32; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96), "r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + for (int j = 0; j < 512; j += 512) { + for (int k = 0; k < 256; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 256) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } +} +static inline void helper_double_10(double *buf); +static inline void helper_double_10(double *buf) { + for (int j = 0; j < 1024; j += 32) { + for (int k = 0; k < 4; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vpermilpd $0, %%ymm0, %%ymm8\n" + "vpermilpd $15, %%ymm0, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm0\n" + "vpermilpd $0, %%ymm1, %%ymm8\n" + "vpermilpd $15, %%ymm1, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm1\n" + "vpermilpd $0, %%ymm2, %%ymm8\n" + "vpermilpd $15, %%ymm2, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm2\n" + "vpermilpd $0, %%ymm3, %%ymm8\n" + "vpermilpd $15, %%ymm3, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm3\n" + "vpermilpd $0, %%ymm4, %%ymm8\n" + "vpermilpd $15, %%ymm4, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" 
+ "vaddsubpd %%ymm11, %%ymm8, %%ymm4\n" + "vpermilpd $0, %%ymm5, %%ymm8\n" + "vpermilpd $15, %%ymm5, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm5\n" + "vpermilpd $0, %%ymm6, %%ymm8\n" + "vpermilpd $15, %%ymm6, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm6\n" + "vpermilpd $0, %%ymm7, %%ymm8\n" + "vpermilpd $15, %%ymm7, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm7\n" + "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm0, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm0, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm0\n" + "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm1, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm1, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm1\n" + "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm2, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm2, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm2\n" + "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm3, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm3, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm3\n" + "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm4, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm4, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm4\n" + "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm5, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm5, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm5\n" + "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm6, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm6, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm6\n" + "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm7, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm7, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", 
"%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + for (int j = 0; j < 1024; j += 256) { + for (int k = 0; k < 32; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96), "r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + for (int j = 0; j < 1024; j += 1024) { + for (int k = 0; k < 256; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vmovupd %%ymm0, (%0)\n" + "vmovupd %%ymm1, (%1)\n" + "vmovupd %%ymm2, (%2)\n" + "vmovupd %%ymm3, (%3)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 256), "r"(buf + j + k + 512), "r"(buf + j + k + 768) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } +} +static inline void helper_double_11(double *buf); +static inline void helper_double_11(double *buf) { + for (int j = 0; j < 2048; j += 32) { + for (int k = 0; k < 4; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vpermilpd $0, %%ymm0, %%ymm8\n" + "vpermilpd $15, %%ymm0, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm0\n" + "vpermilpd $0, %%ymm1, %%ymm8\n" + "vpermilpd $15, %%ymm1, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + 
"vaddsubpd %%ymm11, %%ymm8, %%ymm1\n" + "vpermilpd $0, %%ymm2, %%ymm8\n" + "vpermilpd $15, %%ymm2, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm2\n" + "vpermilpd $0, %%ymm3, %%ymm8\n" + "vpermilpd $15, %%ymm3, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm3\n" + "vpermilpd $0, %%ymm4, %%ymm8\n" + "vpermilpd $15, %%ymm4, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm4\n" + "vpermilpd $0, %%ymm5, %%ymm8\n" + "vpermilpd $15, %%ymm5, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm5\n" + "vpermilpd $0, %%ymm6, %%ymm8\n" + "vpermilpd $15, %%ymm6, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm6\n" + "vpermilpd $0, %%ymm7, %%ymm8\n" + "vpermilpd $15, %%ymm7, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm7\n" + "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm0, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm0, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm0\n" + "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm1, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm1, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm1\n" + "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm2, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm2, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm2\n" + "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm3, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm3, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm3\n" + "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm4, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm4, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm4\n" + "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm5, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm5, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm5\n" + "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm6, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm6, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm6\n" + "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm7, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm7, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + 
"vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + for (int j = 0; j < 2048; j += 256) { + for (int k = 0; k < 32; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96), "r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + for (int j = 0; j < 2048; j += 2048) { + for (int k = 0; k < 256; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, 
%%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 256), "r"(buf + j + k + 512), "r"(buf + j + k + 768), "r"(buf + j + k + 1024), "r"(buf + j + k + 1280), "r"(buf + j + k + 1536), "r"(buf + j + k + 1792) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } +} +void helper_double_12_recursive(double *buf, int depth); +void helper_double_12_recursive(double *buf, int depth) { + if (depth == 11) { + for (int j = 0; j < 2048; j += 32) { + for (int k = 0; k < 4; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vpermilpd $0, %%ymm0, %%ymm8\n" + "vpermilpd $15, %%ymm0, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm0\n" + "vpermilpd $0, %%ymm1, %%ymm8\n" + "vpermilpd $15, %%ymm1, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm1\n" + "vpermilpd $0, %%ymm2, %%ymm8\n" + "vpermilpd $15, %%ymm2, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm2\n" + "vpermilpd $0, %%ymm3, %%ymm8\n" + "vpermilpd $15, %%ymm3, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm3\n" + "vpermilpd $0, %%ymm4, %%ymm8\n" + "vpermilpd $15, %%ymm4, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm4\n" + "vpermilpd $0, %%ymm5, %%ymm8\n" + "vpermilpd $15, %%ymm5, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm5\n" + "vpermilpd $0, %%ymm6, %%ymm8\n" + "vpermilpd $15, %%ymm6, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm6\n" + "vpermilpd $0, %%ymm7, %%ymm8\n" + "vpermilpd $15, %%ymm7, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm7\n" + "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm0, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm0, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm0\n" + "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm1, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm1, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm1\n" + "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm2, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm2, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm2\n" + "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm3, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm3, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm3\n" + "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, 
%%ymm9\n" + "vsubpd %%ymm4, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm4, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm4\n" + "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm5, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm5, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm5\n" + "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm6, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm6, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm6\n" + "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm7, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm7, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + for (int j = 0; j < 2048; j += 256) { + for (int k = 0; k < 32; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd 
%%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96), "r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + for (int j = 0; j < 2048; j += 2048) { + for (int k = 0; k < 256; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 256), "r"(buf + j + k + 512), "r"(buf + j + k + 768), "r"(buf + j + k + 1024), "r"(buf + j + k + 1280), "r"(buf + j + k + 1536), "r"(buf + j + k + 1792) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 12) { + helper_double_12_recursive(buf + 0, 11); + helper_double_12_recursive(buf + 2048, 11); + for (int j = 0; j < 4096; j += 4096) { + for (int k = 0; k < 2048; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 2048) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } +} +void helper_double_12(double *buf); +void helper_double_12(double *buf) { + helper_double_12_recursive(buf, 12); +} +void helper_double_13_recursive(double *buf, int depth); +void helper_double_13_recursive(double *buf, int depth) { + if (depth == 11) { + for (int j = 0; j < 2048; j += 32) { + for (int k = 0; k < 4; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vpermilpd $0, %%ymm0, %%ymm8\n" + "vpermilpd $15, %%ymm0, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, 
%%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm0\n" + "vpermilpd $0, %%ymm1, %%ymm8\n" + "vpermilpd $15, %%ymm1, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm1\n" + "vpermilpd $0, %%ymm2, %%ymm8\n" + "vpermilpd $15, %%ymm2, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm2\n" + "vpermilpd $0, %%ymm3, %%ymm8\n" + "vpermilpd $15, %%ymm3, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm3\n" + "vpermilpd $0, %%ymm4, %%ymm8\n" + "vpermilpd $15, %%ymm4, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm4\n" + "vpermilpd $0, %%ymm5, %%ymm8\n" + "vpermilpd $15, %%ymm5, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm5\n" + "vpermilpd $0, %%ymm6, %%ymm8\n" + "vpermilpd $15, %%ymm6, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm6\n" + "vpermilpd $0, %%ymm7, %%ymm8\n" + "vpermilpd $15, %%ymm7, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm7\n" + "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm0, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm0, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm0\n" + "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm1, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm1, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm1\n" + "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm2, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm2, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm2\n" + "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm3, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm3, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm3\n" + "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm4, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm4, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm4\n" + "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm5, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm5, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm5\n" + "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm6, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm6, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm6\n" + "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm7, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm7, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, 
%%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + for (int j = 0; j < 2048; j += 256) { + for (int k = 0; k < 32; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96), "r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + for (int j = 0; j < 2048; j += 2048) { + for (int k = 0; k < 256; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + 
"vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 256), "r"(buf + j + k + 512), "r"(buf + j + k + 768), "r"(buf + j + k + 1024), "r"(buf + j + k + 1280), "r"(buf + j + k + 1536), "r"(buf + j + k + 1792) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 13) { + helper_double_13_recursive(buf + 0, 11); + helper_double_13_recursive(buf + 2048, 11); + helper_double_13_recursive(buf + 4096, 11); + helper_double_13_recursive(buf + 6144, 11); + for (int j = 0; j < 8192; j += 8192) { + for (int k = 0; k < 2048; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vmovupd %%ymm0, (%0)\n" + "vmovupd %%ymm1, (%1)\n" + "vmovupd %%ymm2, (%2)\n" + "vmovupd %%ymm3, (%3)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 2048), "r"(buf + j + k + 4096), "r"(buf + j + k + 6144) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } +} +void helper_double_13(double *buf); +void helper_double_13(double *buf) { + helper_double_13_recursive(buf, 13); +} +void helper_double_14_recursive(double *buf, int depth); +void helper_double_14_recursive(double *buf, int depth) { + if (depth == 12) { + for (int j = 0; j < 4096; j += 32) { + for (int k = 0; k < 4; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vpermilpd $0, %%ymm0, %%ymm8\n" + "vpermilpd $15, %%ymm0, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm0\n" + "vpermilpd $0, %%ymm1, %%ymm8\n" + "vpermilpd $15, %%ymm1, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm1\n" + "vpermilpd $0, %%ymm2, %%ymm8\n" + "vpermilpd $15, %%ymm2, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm2\n" + "vpermilpd $0, %%ymm3, %%ymm8\n" + "vpermilpd $15, %%ymm3, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm3\n" + "vpermilpd $0, %%ymm4, %%ymm8\n" + "vpermilpd $15, %%ymm4, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + 
"vaddsubpd %%ymm11, %%ymm8, %%ymm4\n" + "vpermilpd $0, %%ymm5, %%ymm8\n" + "vpermilpd $15, %%ymm5, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm5\n" + "vpermilpd $0, %%ymm6, %%ymm8\n" + "vpermilpd $15, %%ymm6, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm6\n" + "vpermilpd $0, %%ymm7, %%ymm8\n" + "vpermilpd $15, %%ymm7, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm7\n" + "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm0, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm0, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm0\n" + "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm1, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm1, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm1\n" + "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm2, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm2, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm2\n" + "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm3, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm3, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm3\n" + "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm4, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm4, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm4\n" + "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm5, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm5, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm5\n" + "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm6, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm6, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm6\n" + "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm7, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm7, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", 
"%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + for (int j = 0; j < 4096; j += 256) { + for (int k = 0; k < 32; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96), "r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + for (int j = 0; j < 4096; j += 2048) { + for (int k = 0; k < 256; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 256), "r"(buf + j + k + 512), "r"(buf + j + k + 768), "r"(buf + j + k + 1024), "r"(buf + j + k + 1280), "r"(buf + j + k + 1536), "r"(buf + j + k + 1792) : "%ymm0", 
"%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + for (int j = 0; j < 4096; j += 4096) { + for (int k = 0; k < 2048; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 2048) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 14) { + helper_double_14_recursive(buf + 0, 12); + helper_double_14_recursive(buf + 4096, 12); + helper_double_14_recursive(buf + 8192, 12); + helper_double_14_recursive(buf + 12288, 12); + for (int j = 0; j < 16384; j += 16384) { + for (int k = 0; k < 4096; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vmovupd %%ymm0, (%0)\n" + "vmovupd %%ymm1, (%1)\n" + "vmovupd %%ymm2, (%2)\n" + "vmovupd %%ymm3, (%3)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 4096), "r"(buf + j + k + 8192), "r"(buf + j + k + 12288) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } +} +void helper_double_14(double *buf); +void helper_double_14(double *buf) { + helper_double_14_recursive(buf, 14); +} +void helper_double_15_recursive(double *buf, int depth); +void helper_double_15_recursive(double *buf, int depth) { + if (depth == 12) { + for (int j = 0; j < 4096; j += 32) { + for (int k = 0; k < 4; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vpermilpd $0, %%ymm0, %%ymm8\n" + "vpermilpd $15, %%ymm0, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm0\n" + "vpermilpd $0, %%ymm1, %%ymm8\n" + "vpermilpd $15, %%ymm1, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm1\n" + "vpermilpd $0, %%ymm2, %%ymm8\n" + "vpermilpd $15, %%ymm2, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm2\n" + "vpermilpd $0, %%ymm3, %%ymm8\n" + "vpermilpd $15, %%ymm3, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm3\n" + "vpermilpd $0, %%ymm4, %%ymm8\n" + "vpermilpd $15, %%ymm4, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm4\n" + "vpermilpd $0, %%ymm5, %%ymm8\n" + "vpermilpd $15, %%ymm5, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm5\n" + "vpermilpd $0, %%ymm6, %%ymm8\n" + "vpermilpd $15, %%ymm6, %%ymm9\n" 
+ "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm6\n" + "vpermilpd $0, %%ymm7, %%ymm8\n" + "vpermilpd $15, %%ymm7, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm7\n" + "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm0, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm0, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm0\n" + "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm1, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm1, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm1\n" + "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm2, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm2, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm2\n" + "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm3, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm3, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm3\n" + "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm4, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm4, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm4\n" + "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm5, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm5, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm5\n" + "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm6, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm6, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm6\n" + "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm7, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm7, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + for (int j = 0; j < 4096; j += 256) { + for (int k = 0; k < 32; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), 
%%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96), "r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + for (int j = 0; j < 4096; j += 2048) { + for (int k = 0; k < 256; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 256), "r"(buf + j + k + 512), "r"(buf + j + k + 768), "r"(buf + j + k + 1024), "r"(buf + j + k + 1280), "r"(buf + j + k + 1536), "r"(buf + j + k + 1792) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + for (int j = 0; j < 4096; j += 4096) { + for (int k = 0; k < 2048; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd 
(%1), %%ymm1\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 2048) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 15) { + helper_double_15_recursive(buf + 0, 12); + helper_double_15_recursive(buf + 4096, 12); + helper_double_15_recursive(buf + 8192, 12); + helper_double_15_recursive(buf + 12288, 12); + helper_double_15_recursive(buf + 16384, 12); + helper_double_15_recursive(buf + 20480, 12); + helper_double_15_recursive(buf + 24576, 12); + helper_double_15_recursive(buf + 28672, 12); + for (int j = 0; j < 32768; j += 32768) { + for (int k = 0; k < 4096; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 4096), "r"(buf + j + k + 8192), "r"(buf + j + k + 12288), "r"(buf + j + k + 16384), "r"(buf + j + k + 20480), "r"(buf + j + k + 24576), "r"(buf + j + k + 28672) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } +} +void helper_double_15(double *buf); +void helper_double_15(double *buf) { + helper_double_15_recursive(buf, 15); +} +void helper_double_16_recursive(double *buf, int depth); +void helper_double_16_recursive(double *buf, int depth) { + if (depth == 11) { + for (int j = 0; j < 2048; j += 32) { + for (int k = 0; k < 4; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vpermilpd $0, %%ymm0, %%ymm8\n" + "vpermilpd $15, %%ymm0, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm0\n" + "vpermilpd $0, %%ymm1, %%ymm8\n" + "vpermilpd $15, %%ymm1, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm1\n" + "vpermilpd 
$0, %%ymm2, %%ymm8\n" + "vpermilpd $15, %%ymm2, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm2\n" + "vpermilpd $0, %%ymm3, %%ymm8\n" + "vpermilpd $15, %%ymm3, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm3\n" + "vpermilpd $0, %%ymm4, %%ymm8\n" + "vpermilpd $15, %%ymm4, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm4\n" + "vpermilpd $0, %%ymm5, %%ymm8\n" + "vpermilpd $15, %%ymm5, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm5\n" + "vpermilpd $0, %%ymm6, %%ymm8\n" + "vpermilpd $15, %%ymm6, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm6\n" + "vpermilpd $0, %%ymm7, %%ymm8\n" + "vpermilpd $15, %%ymm7, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm7\n" + "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm0, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm0, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm0\n" + "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm1, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm1, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm1\n" + "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm2, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm2, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm2\n" + "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm3, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm3, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm3\n" + "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm4, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm4, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm4\n" + "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm5, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm5, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm5\n" + "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm6, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm6, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm6\n" + "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm7, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm7, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, 
%%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + for (int j = 0; j < 2048; j += 256) { + for (int k = 0; k < 32; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96), "r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + for (int j = 0; j < 2048; j += 2048) { + for (int k = 0; k < 256; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" 
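+ /* Cross-vector stage: the vaddpd/vsubpd cascade combines the eight ymm vectors loaded at
+    the start of this block pairwise in three rounds (a radix-8 Hadamard butterfly), and the
+    vmovupd instructions below store the results back in place through the same pointers. */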
+ "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 256), "r"(buf + j + k + 512), "r"(buf + j + k + 768), "r"(buf + j + k + 1024), "r"(buf + j + k + 1280), "r"(buf + j + k + 1536), "r"(buf + j + k + 1792) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 14) { + helper_double_16_recursive(buf + 0, 11); + helper_double_16_recursive(buf + 2048, 11); + helper_double_16_recursive(buf + 4096, 11); + helper_double_16_recursive(buf + 6144, 11); + helper_double_16_recursive(buf + 8192, 11); + helper_double_16_recursive(buf + 10240, 11); + helper_double_16_recursive(buf + 12288, 11); + helper_double_16_recursive(buf + 14336, 11); + for (int j = 0; j < 16384; j += 16384) { + for (int k = 0; k < 2048; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 2048), "r"(buf + j + k + 4096), "r"(buf + j + k + 6144), "r"(buf + j + k + 8192), "r"(buf + j + k + 10240), "r"(buf + j + k + 12288), "r"(buf + j + k + 14336) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 16) { + helper_double_16_recursive(buf + 0, 14); + helper_double_16_recursive(buf + 16384, 14); + helper_double_16_recursive(buf + 32768, 14); + helper_double_16_recursive(buf + 49152, 14); + for (int j = 0; j < 65536; j += 65536) { + for (int k = 0; k < 16384; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, 
%%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vmovupd %%ymm0, (%0)\n" + "vmovupd %%ymm1, (%1)\n" + "vmovupd %%ymm2, (%2)\n" + "vmovupd %%ymm3, (%3)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 16384), "r"(buf + j + k + 32768), "r"(buf + j + k + 49152) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } +} +void helper_double_16(double *buf); +void helper_double_16(double *buf) { + helper_double_16_recursive(buf, 16); +} +void helper_double_17_recursive(double *buf, int depth); +void helper_double_17_recursive(double *buf, int depth) { + if (depth == 11) { + for (int j = 0; j < 2048; j += 32) { + for (int k = 0; k < 4; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vpermilpd $0, %%ymm0, %%ymm8\n" + "vpermilpd $15, %%ymm0, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm0\n" + "vpermilpd $0, %%ymm1, %%ymm8\n" + "vpermilpd $15, %%ymm1, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm1\n" + "vpermilpd $0, %%ymm2, %%ymm8\n" + "vpermilpd $15, %%ymm2, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm2\n" + "vpermilpd $0, %%ymm3, %%ymm8\n" + "vpermilpd $15, %%ymm3, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm3\n" + "vpermilpd $0, %%ymm4, %%ymm8\n" + "vpermilpd $15, %%ymm4, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm4\n" + "vpermilpd $0, %%ymm5, %%ymm8\n" + "vpermilpd $15, %%ymm5, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm5\n" + "vpermilpd $0, %%ymm6, %%ymm8\n" + "vpermilpd $15, %%ymm6, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm6\n" + "vpermilpd $0, %%ymm7, %%ymm8\n" + "vpermilpd $15, %%ymm7, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm7\n" + "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm0, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm0, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm0\n" + "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm1, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm1, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm1\n" + "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm2, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm2, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm2\n" + "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm3, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm3, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm3\n" + "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm4, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm4, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm4\n" + "vperm2f128 $0, %%ymm5, 
%%ymm5, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm5, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm5, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm5\n" + "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm6, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm6, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm6\n" + "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm7, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm7, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + for (int j = 0; j < 2048; j += 256) { + for (int k = 0; k < 32; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j 
+ k + 96), "r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + for (int j = 0; j < 2048; j += 2048) { + for (int k = 0; k < 256; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 256), "r"(buf + j + k + 512), "r"(buf + j + k + 768), "r"(buf + j + k + 1024), "r"(buf + j + k + 1280), "r"(buf + j + k + 1536), "r"(buf + j + k + 1792) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 14) { + helper_double_17_recursive(buf + 0, 11); + helper_double_17_recursive(buf + 2048, 11); + helper_double_17_recursive(buf + 4096, 11); + helper_double_17_recursive(buf + 6144, 11); + helper_double_17_recursive(buf + 8192, 11); + helper_double_17_recursive(buf + 10240, 11); + helper_double_17_recursive(buf + 12288, 11); + helper_double_17_recursive(buf + 14336, 11); + for (int j = 0; j < 16384; j += 16384) { + for (int k = 0; k < 2048; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd 
%%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 2048), "r"(buf + j + k + 4096), "r"(buf + j + k + 6144), "r"(buf + j + k + 8192), "r"(buf + j + k + 10240), "r"(buf + j + k + 12288), "r"(buf + j + k + 14336) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 17) { + helper_double_17_recursive(buf + 0, 14); + helper_double_17_recursive(buf + 16384, 14); + helper_double_17_recursive(buf + 32768, 14); + helper_double_17_recursive(buf + 49152, 14); + helper_double_17_recursive(buf + 65536, 14); + helper_double_17_recursive(buf + 81920, 14); + helper_double_17_recursive(buf + 98304, 14); + helper_double_17_recursive(buf + 114688, 14); + for (int j = 0; j < 131072; j += 131072) { + for (int k = 0; k < 16384; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 16384), "r"(buf + j + k + 32768), "r"(buf + j + k + 49152), "r"(buf + j + k + 65536), "r"(buf + j + k + 81920), "r"(buf + j + k + 98304), "r"(buf + j + k + 114688) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } +} +void helper_double_17(double *buf); +void helper_double_17(double *buf) { + helper_double_17_recursive(buf, 17); +} +void helper_double_18_recursive(double *buf, int depth); +void helper_double_18_recursive(double *buf, int depth) { + if (depth == 12) { + for (int j = 0; j < 4096; j += 32) { + for (int k = 0; k < 4; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + 
"vpermilpd $0, %%ymm0, %%ymm8\n" + "vpermilpd $15, %%ymm0, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm0\n" + "vpermilpd $0, %%ymm1, %%ymm8\n" + "vpermilpd $15, %%ymm1, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm1\n" + "vpermilpd $0, %%ymm2, %%ymm8\n" + "vpermilpd $15, %%ymm2, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm2\n" + "vpermilpd $0, %%ymm3, %%ymm8\n" + "vpermilpd $15, %%ymm3, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm3\n" + "vpermilpd $0, %%ymm4, %%ymm8\n" + "vpermilpd $15, %%ymm4, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm4\n" + "vpermilpd $0, %%ymm5, %%ymm8\n" + "vpermilpd $15, %%ymm5, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm5\n" + "vpermilpd $0, %%ymm6, %%ymm8\n" + "vpermilpd $15, %%ymm6, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm6\n" + "vpermilpd $0, %%ymm7, %%ymm8\n" + "vpermilpd $15, %%ymm7, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm7\n" + "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm0, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm0, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm0\n" + "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm1, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm1, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm1\n" + "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm2, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm2, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm2\n" + "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm3, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm3, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm3\n" + "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm4, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm4, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm4\n" + "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm5, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm5, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm5\n" + "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm6, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm6, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm6\n" + "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm7, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm7, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + 
"vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + for (int j = 0; j < 4096; j += 256) { + for (int k = 0; k < 32; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96), "r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + for (int j = 0; j < 4096; j += 2048) { + for (int k = 0; k < 256; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, 
%%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 256), "r"(buf + j + k + 512), "r"(buf + j + k + 768), "r"(buf + j + k + 1024), "r"(buf + j + k + 1280), "r"(buf + j + k + 1536), "r"(buf + j + k + 1792) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + for (int j = 0; j < 4096; j += 4096) { + for (int k = 0; k < 2048; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 2048) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 15) { + helper_double_18_recursive(buf + 0, 12); + helper_double_18_recursive(buf + 4096, 12); + helper_double_18_recursive(buf + 8192, 12); + helper_double_18_recursive(buf + 12288, 12); + helper_double_18_recursive(buf + 16384, 12); + helper_double_18_recursive(buf + 20480, 12); + helper_double_18_recursive(buf + 24576, 12); + helper_double_18_recursive(buf + 28672, 12); + for (int j = 0; j < 32768; j += 32768) { + for (int k = 0; k < 4096; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 4096), "r"(buf + j + k + 8192), "r"(buf + j + k + 12288), "r"(buf + j + k + 16384), "r"(buf + j + k + 20480), 
"r"(buf + j + k + 24576), "r"(buf + j + k + 28672) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 18) { + helper_double_18_recursive(buf + 0, 15); + helper_double_18_recursive(buf + 32768, 15); + helper_double_18_recursive(buf + 65536, 15); + helper_double_18_recursive(buf + 98304, 15); + helper_double_18_recursive(buf + 131072, 15); + helper_double_18_recursive(buf + 163840, 15); + helper_double_18_recursive(buf + 196608, 15); + helper_double_18_recursive(buf + 229376, 15); + for (int j = 0; j < 262144; j += 262144) { + for (int k = 0; k < 32768; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 32768), "r"(buf + j + k + 65536), "r"(buf + j + k + 98304), "r"(buf + j + k + 131072), "r"(buf + j + k + 163840), "r"(buf + j + k + 196608), "r"(buf + j + k + 229376) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } +} +void helper_double_18(double *buf); +void helper_double_18(double *buf) { + helper_double_18_recursive(buf, 18); +} +void helper_double_19_recursive(double *buf, int depth); +void helper_double_19_recursive(double *buf, int depth) { + if (depth == 11) { + for (int j = 0; j < 2048; j += 32) { + for (int k = 0; k < 4; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vpermilpd $0, %%ymm0, %%ymm8\n" + "vpermilpd $15, %%ymm0, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm0\n" + "vpermilpd $0, %%ymm1, %%ymm8\n" + "vpermilpd $15, %%ymm1, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm1\n" + "vpermilpd $0, %%ymm2, %%ymm8\n" + "vpermilpd $15, %%ymm2, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, 
%%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm2\n" + "vpermilpd $0, %%ymm3, %%ymm8\n" + "vpermilpd $15, %%ymm3, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm3\n" + "vpermilpd $0, %%ymm4, %%ymm8\n" + "vpermilpd $15, %%ymm4, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm4\n" + "vpermilpd $0, %%ymm5, %%ymm8\n" + "vpermilpd $15, %%ymm5, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm5\n" + "vpermilpd $0, %%ymm6, %%ymm8\n" + "vpermilpd $15, %%ymm6, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm6\n" + "vpermilpd $0, %%ymm7, %%ymm8\n" + "vpermilpd $15, %%ymm7, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm7\n" + "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm0, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm0, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm0\n" + "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm1, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm1, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm1\n" + "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm2, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm2, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm2\n" + "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm3, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm3, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm3\n" + "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm4, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm4, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm4\n" + "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm5, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm5, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm5\n" + "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm6, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm6, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm6\n" + "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm7, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm7, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd 
%%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + for (int j = 0; j < 2048; j += 256) { + for (int k = 0; k < 32; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96), "r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + for (int j = 0; j < 2048; j += 2048) { + for (int k = 0; k < 256; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, 
(%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 256), "r"(buf + j + k + 512), "r"(buf + j + k + 768), "r"(buf + j + k + 1024), "r"(buf + j + k + 1280), "r"(buf + j + k + 1536), "r"(buf + j + k + 1792) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 14) { + helper_double_19_recursive(buf + 0, 11); + helper_double_19_recursive(buf + 2048, 11); + helper_double_19_recursive(buf + 4096, 11); + helper_double_19_recursive(buf + 6144, 11); + helper_double_19_recursive(buf + 8192, 11); + helper_double_19_recursive(buf + 10240, 11); + helper_double_19_recursive(buf + 12288, 11); + helper_double_19_recursive(buf + 14336, 11); + for (int j = 0; j < 16384; j += 16384) { + for (int k = 0; k < 2048; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 2048), "r"(buf + j + k + 4096), "r"(buf + j + k + 6144), "r"(buf + j + k + 8192), "r"(buf + j + k + 10240), "r"(buf + j + k + 12288), "r"(buf + j + k + 14336) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 17) { + helper_double_19_recursive(buf + 0, 14); + helper_double_19_recursive(buf + 16384, 14); + helper_double_19_recursive(buf + 32768, 14); + helper_double_19_recursive(buf + 49152, 14); + helper_double_19_recursive(buf + 65536, 14); + helper_double_19_recursive(buf + 81920, 14); + helper_double_19_recursive(buf + 98304, 14); + helper_double_19_recursive(buf + 114688, 14); + for (int j = 0; j < 131072; j += 131072) { + for (int k = 0; k < 16384; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, 
%%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 16384), "r"(buf + j + k + 32768), "r"(buf + j + k + 49152), "r"(buf + j + k + 65536), "r"(buf + j + k + 81920), "r"(buf + j + k + 98304), "r"(buf + j + k + 114688) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 19) { + helper_double_19_recursive(buf + 0, 17); + helper_double_19_recursive(buf + 131072, 17); + helper_double_19_recursive(buf + 262144, 17); + helper_double_19_recursive(buf + 393216, 17); + for (int j = 0; j < 524288; j += 524288) { + for (int k = 0; k < 131072; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vmovupd %%ymm0, (%0)\n" + "vmovupd %%ymm1, (%1)\n" + "vmovupd %%ymm2, (%2)\n" + "vmovupd %%ymm3, (%3)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 131072), "r"(buf + j + k + 262144), "r"(buf + j + k + 393216) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } +} +void helper_double_19(double *buf); +void helper_double_19(double *buf) { + helper_double_19_recursive(buf, 19); +} +void helper_double_20_recursive(double *buf, int depth); +void helper_double_20_recursive(double *buf, int depth) { + if (depth == 9) { + for (int j = 0; j < 512; j += 32) { + for (int k = 0; k < 4; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vpermilpd $0, %%ymm0, %%ymm8\n" + "vpermilpd $15, %%ymm0, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm0\n" + "vpermilpd $0, %%ymm1, %%ymm8\n" + "vpermilpd $15, %%ymm1, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm1\n" + "vpermilpd $0, %%ymm2, %%ymm8\n" + "vpermilpd 
$15, %%ymm2, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm2\n" + "vpermilpd $0, %%ymm3, %%ymm8\n" + "vpermilpd $15, %%ymm3, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm3\n" + "vpermilpd $0, %%ymm4, %%ymm8\n" + "vpermilpd $15, %%ymm4, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm4\n" + "vpermilpd $0, %%ymm5, %%ymm8\n" + "vpermilpd $15, %%ymm5, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm5\n" + "vpermilpd $0, %%ymm6, %%ymm8\n" + "vpermilpd $15, %%ymm6, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm6\n" + "vpermilpd $0, %%ymm7, %%ymm8\n" + "vpermilpd $15, %%ymm7, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm7\n" + "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm0, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm0, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm0\n" + "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm1, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm1, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm1\n" + "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm2, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm2, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm2\n" + "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm3, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm3, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm3\n" + "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm4, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm4, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm4\n" + "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm5, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm5, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm5\n" + "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm6, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm6, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm6\n" + "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm7, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm7, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd 
%%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + for (int j = 0; j < 512; j += 256) { + for (int k = 0; k < 32; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96), "r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + for (int j = 0; j < 512; j += 512) { + for (int k = 0; k < 256; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 256) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 12) { + helper_double_20_recursive(buf + 0, 9); + helper_double_20_recursive(buf + 512, 9); + helper_double_20_recursive(buf + 1024, 9); + helper_double_20_recursive(buf + 1536, 9); + helper_double_20_recursive(buf + 2048, 9); + helper_double_20_recursive(buf + 2560, 9); + helper_double_20_recursive(buf + 3072, 9); + helper_double_20_recursive(buf + 3584, 9); + for (int j = 0; j < 4096; j += 4096) { + for (int k = 0; k < 512; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + 
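+ // depth-12 combine: eight vectors spaced 512 doubles apart are merged by a three-level add/sub tree (radix-8 butterfly)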
"vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 512), "r"(buf + j + k + 1024), "r"(buf + j + k + 1536), "r"(buf + j + k + 2048), "r"(buf + j + k + 2560), "r"(buf + j + k + 3072), "r"(buf + j + k + 3584) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 15) { + helper_double_20_recursive(buf + 0, 12); + helper_double_20_recursive(buf + 4096, 12); + helper_double_20_recursive(buf + 8192, 12); + helper_double_20_recursive(buf + 12288, 12); + helper_double_20_recursive(buf + 16384, 12); + helper_double_20_recursive(buf + 20480, 12); + helper_double_20_recursive(buf + 24576, 12); + helper_double_20_recursive(buf + 28672, 12); + for (int j = 0; j < 32768; j += 32768) { + for (int k = 0; k < 4096; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 4096), "r"(buf + j + k + 8192), "r"(buf + j + k + 12288), "r"(buf + j + k + 16384), "r"(buf + j + k + 20480), "r"(buf + j + k + 24576), 
"r"(buf + j + k + 28672) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 18) { + helper_double_20_recursive(buf + 0, 15); + helper_double_20_recursive(buf + 32768, 15); + helper_double_20_recursive(buf + 65536, 15); + helper_double_20_recursive(buf + 98304, 15); + helper_double_20_recursive(buf + 131072, 15); + helper_double_20_recursive(buf + 163840, 15); + helper_double_20_recursive(buf + 196608, 15); + helper_double_20_recursive(buf + 229376, 15); + for (int j = 0; j < 262144; j += 262144) { + for (int k = 0; k < 32768; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 32768), "r"(buf + j + k + 65536), "r"(buf + j + k + 98304), "r"(buf + j + k + 131072), "r"(buf + j + k + 163840), "r"(buf + j + k + 196608), "r"(buf + j + k + 229376) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 20) { + helper_double_20_recursive(buf + 0, 18); + helper_double_20_recursive(buf + 262144, 18); + helper_double_20_recursive(buf + 524288, 18); + helper_double_20_recursive(buf + 786432, 18); + for (int j = 0; j < 1048576; j += 1048576) { + for (int k = 0; k < 262144; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vmovupd %%ymm0, (%0)\n" + "vmovupd %%ymm1, (%1)\n" + "vmovupd %%ymm2, (%2)\n" + "vmovupd %%ymm3, (%3)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 262144), "r"(buf + j + k + 524288), "r"(buf + j + k + 786432) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + 
return; + } +} +void helper_double_20(double *buf); +void helper_double_20(double *buf) { + helper_double_20_recursive(buf, 20); +} +void helper_double_21_recursive(double *buf, int depth); +void helper_double_21_recursive(double *buf, int depth) { + if (depth == 7) { + for (int j = 0; j < 128; j += 32) { + for (int k = 0; k < 4; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vpermilpd $0, %%ymm0, %%ymm8\n" + "vpermilpd $15, %%ymm0, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm0\n" + "vpermilpd $0, %%ymm1, %%ymm8\n" + "vpermilpd $15, %%ymm1, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm1\n" + "vpermilpd $0, %%ymm2, %%ymm8\n" + "vpermilpd $15, %%ymm2, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm2\n" + "vpermilpd $0, %%ymm3, %%ymm8\n" + "vpermilpd $15, %%ymm3, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm3\n" + "vpermilpd $0, %%ymm4, %%ymm8\n" + "vpermilpd $15, %%ymm4, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm4\n" + "vpermilpd $0, %%ymm5, %%ymm8\n" + "vpermilpd $15, %%ymm5, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm5\n" + "vpermilpd $0, %%ymm6, %%ymm8\n" + "vpermilpd $15, %%ymm6, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm6\n" + "vpermilpd $0, %%ymm7, %%ymm8\n" + "vpermilpd $15, %%ymm7, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm7\n" + "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm0, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm0, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm0\n" + "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm1, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm1, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm1\n" + "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm2, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm2, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm2\n" + "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm3, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm3, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm3\n" + "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm4, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm4, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm4\n" + "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm5, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm5, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm5\n" + "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm6, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm6, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm6\n" + "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm8\n" + "vxorpd %%ymm9, 
%%ymm9, %%ymm9\n" + "vsubpd %%ymm7, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm7, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + for (int j = 0; j < 128; j += 128) { + for (int k = 0; k < 32; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vmovupd %%ymm0, (%0)\n" + "vmovupd %%ymm1, (%1)\n" + "vmovupd %%ymm2, (%2)\n" + "vmovupd %%ymm3, (%3)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 10) { + helper_double_21_recursive(buf + 0, 7); + helper_double_21_recursive(buf + 128, 7); + helper_double_21_recursive(buf + 256, 7); + helper_double_21_recursive(buf + 384, 7); + helper_double_21_recursive(buf + 512, 7); + helper_double_21_recursive(buf + 640, 7); + helper_double_21_recursive(buf + 768, 7); + helper_double_21_recursive(buf + 896, 7); + for (int j = 0; j < 1024; j += 1024) { + for (int k = 0; k < 128; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + 
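+ // final stage of the depth-7 block: radix-4 combine across the four 32-double sub-blocks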
"vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 128), "r"(buf + j + k + 256), "r"(buf + j + k + 384), "r"(buf + j + k + 512), "r"(buf + j + k + 640), "r"(buf + j + k + 768), "r"(buf + j + k + 896) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 13) { + helper_double_21_recursive(buf + 0, 10); + helper_double_21_recursive(buf + 1024, 10); + helper_double_21_recursive(buf + 2048, 10); + helper_double_21_recursive(buf + 3072, 10); + helper_double_21_recursive(buf + 4096, 10); + helper_double_21_recursive(buf + 5120, 10); + helper_double_21_recursive(buf + 6144, 10); + helper_double_21_recursive(buf + 7168, 10); + for (int j = 0; j < 8192; j += 8192) { + for (int k = 0; k < 1024; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 1024), "r"(buf + j + k + 2048), "r"(buf + j + k + 3072), "r"(buf + j + k + 4096), "r"(buf + j + k + 5120), "r"(buf + j + k + 6144), "r"(buf + j + k + 7168) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 16) { + helper_double_21_recursive(buf + 0, 13); + helper_double_21_recursive(buf + 8192, 13); + helper_double_21_recursive(buf + 16384, 13); + helper_double_21_recursive(buf + 24576, 13); + 
helper_double_21_recursive(buf + 32768, 13); + helper_double_21_recursive(buf + 40960, 13); + helper_double_21_recursive(buf + 49152, 13); + helper_double_21_recursive(buf + 57344, 13); + for (int j = 0; j < 65536; j += 65536) { + for (int k = 0; k < 8192; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 8192), "r"(buf + j + k + 16384), "r"(buf + j + k + 24576), "r"(buf + j + k + 32768), "r"(buf + j + k + 40960), "r"(buf + j + k + 49152), "r"(buf + j + k + 57344) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 19) { + helper_double_21_recursive(buf + 0, 16); + helper_double_21_recursive(buf + 65536, 16); + helper_double_21_recursive(buf + 131072, 16); + helper_double_21_recursive(buf + 196608, 16); + helper_double_21_recursive(buf + 262144, 16); + helper_double_21_recursive(buf + 327680, 16); + helper_double_21_recursive(buf + 393216, 16); + helper_double_21_recursive(buf + 458752, 16); + for (int j = 0; j < 524288; j += 524288) { + for (int k = 0; k < 65536; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd 
%%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 65536), "r"(buf + j + k + 131072), "r"(buf + j + k + 196608), "r"(buf + j + k + 262144), "r"(buf + j + k + 327680), "r"(buf + j + k + 393216), "r"(buf + j + k + 458752) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 21) { + helper_double_21_recursive(buf + 0, 19); + helper_double_21_recursive(buf + 524288, 19); + helper_double_21_recursive(buf + 1048576, 19); + helper_double_21_recursive(buf + 1572864, 19); + for (int j = 0; j < 2097152; j += 2097152) { + for (int k = 0; k < 524288; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vmovupd %%ymm0, (%0)\n" + "vmovupd %%ymm1, (%1)\n" + "vmovupd %%ymm2, (%2)\n" + "vmovupd %%ymm3, (%3)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 524288), "r"(buf + j + k + 1048576), "r"(buf + j + k + 1572864) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } +} +void helper_double_21(double *buf); +void helper_double_21(double *buf) { + helper_double_21_recursive(buf, 21); +} +void helper_double_22_recursive(double *buf, int depth); +void helper_double_22_recursive(double *buf, int depth) { + if (depth == 11) { + for (int j = 0; j < 2048; j += 32) { + for (int k = 0; k < 4; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vpermilpd $0, %%ymm0, %%ymm8\n" + "vpermilpd $15, %%ymm0, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm0\n" + "vpermilpd $0, %%ymm1, %%ymm8\n" + "vpermilpd $15, %%ymm1, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm1\n" + "vpermilpd $0, %%ymm2, %%ymm8\n" + "vpermilpd $15, %%ymm2, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm2\n" + "vpermilpd $0, %%ymm3, %%ymm8\n" + "vpermilpd $15, %%ymm3, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm3\n" + "vpermilpd $0, %%ymm4, %%ymm8\n" + "vpermilpd $15, %%ymm4, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm4\n" + "vpermilpd $0, %%ymm5, %%ymm8\n" + "vpermilpd $15, %%ymm5, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm5\n" + "vpermilpd $0, %%ymm6, %%ymm8\n" + 
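+ // same per-lane butterfly as in the helpers above, applied register by register (ymm6 here)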
"vpermilpd $15, %%ymm6, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm6\n" + "vpermilpd $0, %%ymm7, %%ymm8\n" + "vpermilpd $15, %%ymm7, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm7\n" + "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm0, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm0, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm0\n" + "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm1, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm1, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm1\n" + "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm2, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm2, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm2\n" + "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm3, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm3, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm3\n" + "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm4, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm4, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm4\n" + "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm5, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm5, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm5\n" + "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm6, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm6, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm6\n" + "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm7, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm7, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + for (int j = 0; j < 2048; j += 256) { + for (int k = 0; k < 32; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd 
(%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96), "r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + for (int j = 0; j < 2048; j += 2048) { + for (int k = 0; k < 256; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 256), "r"(buf + j + k + 512), "r"(buf + j + k + 768), "r"(buf + j + k + 1024), "r"(buf + j + k + 1280), "r"(buf + j + k + 1536), "r"(buf + j + k + 1792) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 14) { + helper_double_22_recursive(buf + 0, 11); + 
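  /* Note: helper_double_N and its helper_double_N_recursive siblings appear to compute an
     in-place, unnormalized fast Walsh-Hadamard transform over 2^N doubles. Each depth-d case
     splits the buffer into eight sub-blocks of 2^(d-3) doubles, transforms each recursively,
     and then merges them with a single radix-8 add/sub butterfly stage at stride 2^(d-3)
     (a radix-4 stage at stride 2^(d-2) is used at the topmost level when only two binary
     stages remain), which is what the AVX block following these recursive calls does.
     A scalar sketch of the equivalent transform, for reference only (n == 1 << depth):

         for (int h = 1; h < n; h <<= 1)
           for (int i = 0; i < n; i += 2 * h)
             for (int k = i; k < i + h; ++k) {
               double a = buf[k], b = buf[k + h];
               buf[k]     = a + b;   // sum goes to the lower half
               buf[k + h] = a - b;   // difference goes to the upper half
             }
  */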
helper_double_22_recursive(buf + 2048, 11); + helper_double_22_recursive(buf + 4096, 11); + helper_double_22_recursive(buf + 6144, 11); + helper_double_22_recursive(buf + 8192, 11); + helper_double_22_recursive(buf + 10240, 11); + helper_double_22_recursive(buf + 12288, 11); + helper_double_22_recursive(buf + 14336, 11); + for (int j = 0; j < 16384; j += 16384) { + for (int k = 0; k < 2048; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 2048), "r"(buf + j + k + 4096), "r"(buf + j + k + 6144), "r"(buf + j + k + 8192), "r"(buf + j + k + 10240), "r"(buf + j + k + 12288), "r"(buf + j + k + 14336) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 17) { + helper_double_22_recursive(buf + 0, 14); + helper_double_22_recursive(buf + 16384, 14); + helper_double_22_recursive(buf + 32768, 14); + helper_double_22_recursive(buf + 49152, 14); + helper_double_22_recursive(buf + 65536, 14); + helper_double_22_recursive(buf + 81920, 14); + helper_double_22_recursive(buf + 98304, 14); + helper_double_22_recursive(buf + 114688, 14); + for (int j = 0; j < 131072; j += 131072) { + for (int k = 0; k < 16384; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, 
%%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 16384), "r"(buf + j + k + 32768), "r"(buf + j + k + 49152), "r"(buf + j + k + 65536), "r"(buf + j + k + 81920), "r"(buf + j + k + 98304), "r"(buf + j + k + 114688) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 20) { + helper_double_22_recursive(buf + 0, 17); + helper_double_22_recursive(buf + 131072, 17); + helper_double_22_recursive(buf + 262144, 17); + helper_double_22_recursive(buf + 393216, 17); + helper_double_22_recursive(buf + 524288, 17); + helper_double_22_recursive(buf + 655360, 17); + helper_double_22_recursive(buf + 786432, 17); + helper_double_22_recursive(buf + 917504, 17); + for (int j = 0; j < 1048576; j += 1048576) { + for (int k = 0; k < 131072; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 131072), "r"(buf + j + k + 262144), "r"(buf + j + k + 393216), "r"(buf + j + k + 524288), "r"(buf + j + k + 655360), "r"(buf + j + k + 786432), "r"(buf + j + k + 917504) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 22) { + helper_double_22_recursive(buf + 0, 20); + helper_double_22_recursive(buf + 1048576, 20); + helper_double_22_recursive(buf + 2097152, 20); + helper_double_22_recursive(buf + 3145728, 20); + for (int j = 0; j < 4194304; j += 4194304) { + for (int k = 0; k < 1048576; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + 
"vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vmovupd %%ymm0, (%0)\n" + "vmovupd %%ymm1, (%1)\n" + "vmovupd %%ymm2, (%2)\n" + "vmovupd %%ymm3, (%3)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 1048576), "r"(buf + j + k + 2097152), "r"(buf + j + k + 3145728) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } +} +void helper_double_22(double *buf); +void helper_double_22(double *buf) { + helper_double_22_recursive(buf, 22); +} +void helper_double_23_recursive(double *buf, int depth); +void helper_double_23_recursive(double *buf, int depth) { + if (depth == 11) { + for (int j = 0; j < 2048; j += 32) { + for (int k = 0; k < 4; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vpermilpd $0, %%ymm0, %%ymm8\n" + "vpermilpd $15, %%ymm0, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm0\n" + "vpermilpd $0, %%ymm1, %%ymm8\n" + "vpermilpd $15, %%ymm1, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm1\n" + "vpermilpd $0, %%ymm2, %%ymm8\n" + "vpermilpd $15, %%ymm2, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm2\n" + "vpermilpd $0, %%ymm3, %%ymm8\n" + "vpermilpd $15, %%ymm3, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm3\n" + "vpermilpd $0, %%ymm4, %%ymm8\n" + "vpermilpd $15, %%ymm4, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm4\n" + "vpermilpd $0, %%ymm5, %%ymm8\n" + "vpermilpd $15, %%ymm5, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm5\n" + "vpermilpd $0, %%ymm6, %%ymm8\n" + "vpermilpd $15, %%ymm6, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm6\n" + "vpermilpd $0, %%ymm7, %%ymm8\n" + "vpermilpd $15, %%ymm7, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm7\n" + "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm0, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm0, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm0\n" + "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm1, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm1, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm1\n" + "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm2, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm2, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm2\n" + "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm3, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm3, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm3\n" + "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm8\n" + 
"vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm4, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm4, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm4\n" + "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm5, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm5, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm5\n" + "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm6, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm6, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm6\n" + "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm7, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm7, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + for (int j = 0; j < 2048; j += 256) { + for (int k = 0; k < 32; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd 
%%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96), "r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + for (int j = 0; j < 2048; j += 2048) { + for (int k = 0; k < 256; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 256), "r"(buf + j + k + 512), "r"(buf + j + k + 768), "r"(buf + j + k + 1024), "r"(buf + j + k + 1280), "r"(buf + j + k + 1536), "r"(buf + j + k + 1792) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 14) { + helper_double_23_recursive(buf + 0, 11); + helper_double_23_recursive(buf + 2048, 11); + helper_double_23_recursive(buf + 4096, 11); + helper_double_23_recursive(buf + 6144, 11); + helper_double_23_recursive(buf + 8192, 11); + helper_double_23_recursive(buf + 10240, 11); + helper_double_23_recursive(buf + 12288, 11); + helper_double_23_recursive(buf + 14336, 11); + for (int j = 0; j < 16384; j += 16384) { + for (int k = 0; k < 2048; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd 
%%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 2048), "r"(buf + j + k + 4096), "r"(buf + j + k + 6144), "r"(buf + j + k + 8192), "r"(buf + j + k + 10240), "r"(buf + j + k + 12288), "r"(buf + j + k + 14336) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 17) { + helper_double_23_recursive(buf + 0, 14); + helper_double_23_recursive(buf + 16384, 14); + helper_double_23_recursive(buf + 32768, 14); + helper_double_23_recursive(buf + 49152, 14); + helper_double_23_recursive(buf + 65536, 14); + helper_double_23_recursive(buf + 81920, 14); + helper_double_23_recursive(buf + 98304, 14); + helper_double_23_recursive(buf + 114688, 14); + for (int j = 0; j < 131072; j += 131072) { + for (int k = 0; k < 16384; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 16384), "r"(buf + j + k + 32768), "r"(buf + j + k + 49152), "r"(buf + j + k + 65536), "r"(buf + j + k + 81920), "r"(buf + j + k + 98304), "r"(buf + j + k + 114688) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 20) { + helper_double_23_recursive(buf + 0, 17); + helper_double_23_recursive(buf + 131072, 17); + helper_double_23_recursive(buf + 262144, 17); + helper_double_23_recursive(buf + 393216, 17); + helper_double_23_recursive(buf + 524288, 17); + helper_double_23_recursive(buf + 655360, 17); + helper_double_23_recursive(buf + 786432, 17); + helper_double_23_recursive(buf + 917504, 
17); + for (int j = 0; j < 1048576; j += 1048576) { + for (int k = 0; k < 131072; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 131072), "r"(buf + j + k + 262144), "r"(buf + j + k + 393216), "r"(buf + j + k + 524288), "r"(buf + j + k + 655360), "r"(buf + j + k + 786432), "r"(buf + j + k + 917504) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 23) { + helper_double_23_recursive(buf + 0, 20); + helper_double_23_recursive(buf + 1048576, 20); + helper_double_23_recursive(buf + 2097152, 20); + helper_double_23_recursive(buf + 3145728, 20); + helper_double_23_recursive(buf + 4194304, 20); + helper_double_23_recursive(buf + 5242880, 20); + helper_double_23_recursive(buf + 6291456, 20); + helper_double_23_recursive(buf + 7340032, 20); + for (int j = 0; j < 8388608; j += 8388608) { + for (int k = 0; k < 1048576; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd 
%%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 1048576), "r"(buf + j + k + 2097152), "r"(buf + j + k + 3145728), "r"(buf + j + k + 4194304), "r"(buf + j + k + 5242880), "r"(buf + j + k + 6291456), "r"(buf + j + k + 7340032) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } +} +void helper_double_23(double *buf); +void helper_double_23(double *buf) { + helper_double_23_recursive(buf, 23); +} +void helper_double_24_recursive(double *buf, int depth); +void helper_double_24_recursive(double *buf, int depth) { + if (depth == 10) { + for (int j = 0; j < 1024; j += 32) { + for (int k = 0; k < 4; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vpermilpd $0, %%ymm0, %%ymm8\n" + "vpermilpd $15, %%ymm0, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm0\n" + "vpermilpd $0, %%ymm1, %%ymm8\n" + "vpermilpd $15, %%ymm1, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm1\n" + "vpermilpd $0, %%ymm2, %%ymm8\n" + "vpermilpd $15, %%ymm2, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm2\n" + "vpermilpd $0, %%ymm3, %%ymm8\n" + "vpermilpd $15, %%ymm3, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm3\n" + "vpermilpd $0, %%ymm4, %%ymm8\n" + "vpermilpd $15, %%ymm4, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm4\n" + "vpermilpd $0, %%ymm5, %%ymm8\n" + "vpermilpd $15, %%ymm5, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm5\n" + "vpermilpd $0, %%ymm6, %%ymm8\n" + "vpermilpd $15, %%ymm6, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm6\n" + "vpermilpd $0, %%ymm7, %%ymm8\n" + "vpermilpd $15, %%ymm7, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm7\n" + "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm0, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm0, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm0\n" + "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm1, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm1, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm1\n" + "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm2, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm2, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm2\n" + "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm3, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm3, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm3\n" + "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm4, %%ymm9, %%ymm10\n" + "vperm2f128 $49, 
%%ymm10, %%ymm4, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm4\n" + "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm5, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm5, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm5\n" + "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm6, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm6, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm6\n" + "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm7, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm7, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + for (int j = 0; j < 1024; j += 256) { + for (int k = 0; k < 32; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, 
(%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96), "r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + for (int j = 0; j < 1024; j += 1024) { + for (int k = 0; k < 256; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vmovupd %%ymm0, (%0)\n" + "vmovupd %%ymm1, (%1)\n" + "vmovupd %%ymm2, (%2)\n" + "vmovupd %%ymm3, (%3)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 256), "r"(buf + j + k + 512), "r"(buf + j + k + 768) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 13) { + helper_double_24_recursive(buf + 0, 10); + helper_double_24_recursive(buf + 1024, 10); + helper_double_24_recursive(buf + 2048, 10); + helper_double_24_recursive(buf + 3072, 10); + helper_double_24_recursive(buf + 4096, 10); + helper_double_24_recursive(buf + 5120, 10); + helper_double_24_recursive(buf + 6144, 10); + helper_double_24_recursive(buf + 7168, 10); + for (int j = 0; j < 8192; j += 8192) { + for (int k = 0; k < 1024; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 1024), "r"(buf + j + k + 2048), "r"(buf + j + k + 3072), "r"(buf + j + k + 4096), "r"(buf + j + k + 5120), "r"(buf + j + k + 6144), "r"(buf + j + k + 7168) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 16) { + helper_double_24_recursive(buf + 0, 13); + 
helper_double_24_recursive(buf + 8192, 13); + helper_double_24_recursive(buf + 16384, 13); + helper_double_24_recursive(buf + 24576, 13); + helper_double_24_recursive(buf + 32768, 13); + helper_double_24_recursive(buf + 40960, 13); + helper_double_24_recursive(buf + 49152, 13); + helper_double_24_recursive(buf + 57344, 13); + for (int j = 0; j < 65536; j += 65536) { + for (int k = 0; k < 8192; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 8192), "r"(buf + j + k + 16384), "r"(buf + j + k + 24576), "r"(buf + j + k + 32768), "r"(buf + j + k + 40960), "r"(buf + j + k + 49152), "r"(buf + j + k + 57344) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 19) { + helper_double_24_recursive(buf + 0, 16); + helper_double_24_recursive(buf + 65536, 16); + helper_double_24_recursive(buf + 131072, 16); + helper_double_24_recursive(buf + 196608, 16); + helper_double_24_recursive(buf + 262144, 16); + helper_double_24_recursive(buf + 327680, 16); + helper_double_24_recursive(buf + 393216, 16); + helper_double_24_recursive(buf + 458752, 16); + for (int j = 0; j < 524288; j += 524288) { + for (int k = 0; k < 65536; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, 
%%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 65536), "r"(buf + j + k + 131072), "r"(buf + j + k + 196608), "r"(buf + j + k + 262144), "r"(buf + j + k + 327680), "r"(buf + j + k + 393216), "r"(buf + j + k + 458752) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 22) { + helper_double_24_recursive(buf + 0, 19); + helper_double_24_recursive(buf + 524288, 19); + helper_double_24_recursive(buf + 1048576, 19); + helper_double_24_recursive(buf + 1572864, 19); + helper_double_24_recursive(buf + 2097152, 19); + helper_double_24_recursive(buf + 2621440, 19); + helper_double_24_recursive(buf + 3145728, 19); + helper_double_24_recursive(buf + 3670016, 19); + for (int j = 0; j < 4194304; j += 4194304) { + for (int k = 0; k < 524288; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 524288), "r"(buf + j + k + 1048576), "r"(buf + j + k + 1572864), "r"(buf + j + k + 2097152), "r"(buf + j + k + 2621440), "r"(buf + j + k + 3145728), "r"(buf + j + k + 3670016) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 24) { + helper_double_24_recursive(buf + 0, 22); + helper_double_24_recursive(buf + 4194304, 22); + helper_double_24_recursive(buf + 8388608, 22); + helper_double_24_recursive(buf + 12582912, 22); + for (int j = 0; j < 16777216; j += 16777216) { + for (int k = 0; k < 4194304; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd 
%%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vmovupd %%ymm0, (%0)\n" + "vmovupd %%ymm1, (%1)\n" + "vmovupd %%ymm2, (%2)\n" + "vmovupd %%ymm3, (%3)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 4194304), "r"(buf + j + k + 8388608), "r"(buf + j + k + 12582912) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } +} +void helper_double_24(double *buf); +void helper_double_24(double *buf) { + helper_double_24_recursive(buf, 24); +} +void helper_double_25_recursive(double *buf, int depth); +void helper_double_25_recursive(double *buf, int depth) { + if (depth == 8) { + for (int j = 0; j < 256; j += 32) { + for (int k = 0; k < 4; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vpermilpd $0, %%ymm0, %%ymm8\n" + "vpermilpd $15, %%ymm0, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm0\n" + "vpermilpd $0, %%ymm1, %%ymm8\n" + "vpermilpd $15, %%ymm1, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm1\n" + "vpermilpd $0, %%ymm2, %%ymm8\n" + "vpermilpd $15, %%ymm2, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm2\n" + "vpermilpd $0, %%ymm3, %%ymm8\n" + "vpermilpd $15, %%ymm3, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm3\n" + "vpermilpd $0, %%ymm4, %%ymm8\n" + "vpermilpd $15, %%ymm4, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm4\n" + "vpermilpd $0, %%ymm5, %%ymm8\n" + "vpermilpd $15, %%ymm5, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm5\n" + "vpermilpd $0, %%ymm6, %%ymm8\n" + "vpermilpd $15, %%ymm6, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm6\n" + "vpermilpd $0, %%ymm7, %%ymm8\n" + "vpermilpd $15, %%ymm7, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm7\n" + "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm0, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm0, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm0\n" + "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm1, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm1, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm1\n" + "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm2, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm2, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm2\n" + "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm3, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm3, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm3\n" + "vperm2f128 $0, 
%%ymm4, %%ymm4, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm4, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm4, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm4\n" + "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm5, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm5, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm5\n" + "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm6, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm6, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm6\n" + "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm7, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm7, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + for (int j = 0; j < 256; j += 256) { + for (int k = 0; k < 32; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd 
%%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96), "r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 11) { + helper_double_25_recursive(buf + 0, 8); + helper_double_25_recursive(buf + 256, 8); + helper_double_25_recursive(buf + 512, 8); + helper_double_25_recursive(buf + 768, 8); + helper_double_25_recursive(buf + 1024, 8); + helper_double_25_recursive(buf + 1280, 8); + helper_double_25_recursive(buf + 1536, 8); + helper_double_25_recursive(buf + 1792, 8); + for (int j = 0; j < 2048; j += 2048) { + for (int k = 0; k < 256; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 256), "r"(buf + j + k + 512), "r"(buf + j + k + 768), "r"(buf + j + k + 1024), "r"(buf + j + k + 1280), "r"(buf + j + k + 1536), "r"(buf + j + k + 1792) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 14) { + helper_double_25_recursive(buf + 0, 11); + helper_double_25_recursive(buf + 2048, 11); + helper_double_25_recursive(buf + 4096, 11); + helper_double_25_recursive(buf + 6144, 11); + helper_double_25_recursive(buf + 8192, 11); + helper_double_25_recursive(buf + 10240, 11); + helper_double_25_recursive(buf + 12288, 11); + helper_double_25_recursive(buf + 14336, 11); + for (int j = 0; j < 16384; j += 16384) { + for (int k = 0; k < 2048; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + 
"vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 2048), "r"(buf + j + k + 4096), "r"(buf + j + k + 6144), "r"(buf + j + k + 8192), "r"(buf + j + k + 10240), "r"(buf + j + k + 12288), "r"(buf + j + k + 14336) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 17) { + helper_double_25_recursive(buf + 0, 14); + helper_double_25_recursive(buf + 16384, 14); + helper_double_25_recursive(buf + 32768, 14); + helper_double_25_recursive(buf + 49152, 14); + helper_double_25_recursive(buf + 65536, 14); + helper_double_25_recursive(buf + 81920, 14); + helper_double_25_recursive(buf + 98304, 14); + helper_double_25_recursive(buf + 114688, 14); + for (int j = 0; j < 131072; j += 131072) { + for (int k = 0; k < 16384; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 16384), "r"(buf + j + k + 32768), "r"(buf + j + k + 49152), "r"(buf + j + k + 65536), "r"(buf + j + k + 81920), "r"(buf + j + k + 98304), "r"(buf + j + k + 114688) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); 
+ } + } + return; + } + if (depth == 20) { + helper_double_25_recursive(buf + 0, 17); + helper_double_25_recursive(buf + 131072, 17); + helper_double_25_recursive(buf + 262144, 17); + helper_double_25_recursive(buf + 393216, 17); + helper_double_25_recursive(buf + 524288, 17); + helper_double_25_recursive(buf + 655360, 17); + helper_double_25_recursive(buf + 786432, 17); + helper_double_25_recursive(buf + 917504, 17); + for (int j = 0; j < 1048576; j += 1048576) { + for (int k = 0; k < 131072; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 131072), "r"(buf + j + k + 262144), "r"(buf + j + k + 393216), "r"(buf + j + k + 524288), "r"(buf + j + k + 655360), "r"(buf + j + k + 786432), "r"(buf + j + k + 917504) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 23) { + helper_double_25_recursive(buf + 0, 20); + helper_double_25_recursive(buf + 1048576, 20); + helper_double_25_recursive(buf + 2097152, 20); + helper_double_25_recursive(buf + 3145728, 20); + helper_double_25_recursive(buf + 4194304, 20); + helper_double_25_recursive(buf + 5242880, 20); + helper_double_25_recursive(buf + 6291456, 20); + helper_double_25_recursive(buf + 7340032, 20); + for (int j = 0; j < 8388608; j += 8388608) { + for (int k = 0; k < 1048576; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + 
"vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 1048576), "r"(buf + j + k + 2097152), "r"(buf + j + k + 3145728), "r"(buf + j + k + 4194304), "r"(buf + j + k + 5242880), "r"(buf + j + k + 6291456), "r"(buf + j + k + 7340032) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 25) { + helper_double_25_recursive(buf + 0, 23); + helper_double_25_recursive(buf + 8388608, 23); + helper_double_25_recursive(buf + 16777216, 23); + helper_double_25_recursive(buf + 25165824, 23); + for (int j = 0; j < 33554432; j += 33554432) { + for (int k = 0; k < 8388608; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vmovupd %%ymm0, (%0)\n" + "vmovupd %%ymm1, (%1)\n" + "vmovupd %%ymm2, (%2)\n" + "vmovupd %%ymm3, (%3)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 8388608), "r"(buf + j + k + 16777216), "r"(buf + j + k + 25165824) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } +} +void helper_double_25(double *buf); +void helper_double_25(double *buf) { + helper_double_25_recursive(buf, 25); +} +void helper_double_26_recursive(double *buf, int depth); +void helper_double_26_recursive(double *buf, int depth) { + if (depth == 11) { + for (int j = 0; j < 2048; j += 32) { + for (int k = 0; k < 4; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vpermilpd $0, %%ymm0, %%ymm8\n" + "vpermilpd $15, %%ymm0, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm0\n" + "vpermilpd $0, %%ymm1, %%ymm8\n" + "vpermilpd $15, %%ymm1, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm1\n" + "vpermilpd $0, %%ymm2, %%ymm8\n" + "vpermilpd $15, %%ymm2, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm2\n" + "vpermilpd $0, %%ymm3, %%ymm8\n" + "vpermilpd $15, %%ymm3, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm3\n" + "vpermilpd $0, %%ymm4, %%ymm8\n" + "vpermilpd $15, %%ymm4, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, 
%%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm4\n" + "vpermilpd $0, %%ymm5, %%ymm8\n" + "vpermilpd $15, %%ymm5, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm5\n" + "vpermilpd $0, %%ymm6, %%ymm8\n" + "vpermilpd $15, %%ymm6, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm6\n" + "vpermilpd $0, %%ymm7, %%ymm8\n" + "vpermilpd $15, %%ymm7, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm7\n" + "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm0, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm0, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm0\n" + "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm1, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm1, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm1\n" + "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm2, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm2, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm2\n" + "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm3, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm3, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm3\n" + "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm4, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm4, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm4\n" + "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm5, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm5, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm5\n" + "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm6, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm6, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm6\n" + "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm7, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm7, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%ymm0", "%ymm1", "%ymm2", 
"%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + for (int j = 0; j < 2048; j += 256) { + for (int k = 0; k < 32; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96), "r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + for (int j = 0; j < 2048; j += 2048) { + for (int k = 0; k < 256; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 256), "r"(buf + j + k + 512), "r"(buf + j + k + 768), "r"(buf + j + k + 1024), "r"(buf + j + k + 1280), "r"(buf + j + k + 1536), "r"(buf + j + k + 
1792) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 14) { + helper_double_26_recursive(buf + 0, 11); + helper_double_26_recursive(buf + 2048, 11); + helper_double_26_recursive(buf + 4096, 11); + helper_double_26_recursive(buf + 6144, 11); + helper_double_26_recursive(buf + 8192, 11); + helper_double_26_recursive(buf + 10240, 11); + helper_double_26_recursive(buf + 12288, 11); + helper_double_26_recursive(buf + 14336, 11); + for (int j = 0; j < 16384; j += 16384) { + for (int k = 0; k < 2048; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 2048), "r"(buf + j + k + 4096), "r"(buf + j + k + 6144), "r"(buf + j + k + 8192), "r"(buf + j + k + 10240), "r"(buf + j + k + 12288), "r"(buf + j + k + 14336) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 17) { + helper_double_26_recursive(buf + 0, 14); + helper_double_26_recursive(buf + 16384, 14); + helper_double_26_recursive(buf + 32768, 14); + helper_double_26_recursive(buf + 49152, 14); + helper_double_26_recursive(buf + 65536, 14); + helper_double_26_recursive(buf + 81920, 14); + helper_double_26_recursive(buf + 98304, 14); + helper_double_26_recursive(buf + 114688, 14); + for (int j = 0; j < 131072; j += 131072) { + for (int k = 0; k < 16384; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, 
%%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 16384), "r"(buf + j + k + 32768), "r"(buf + j + k + 49152), "r"(buf + j + k + 65536), "r"(buf + j + k + 81920), "r"(buf + j + k + 98304), "r"(buf + j + k + 114688) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 20) { + helper_double_26_recursive(buf + 0, 17); + helper_double_26_recursive(buf + 131072, 17); + helper_double_26_recursive(buf + 262144, 17); + helper_double_26_recursive(buf + 393216, 17); + helper_double_26_recursive(buf + 524288, 17); + helper_double_26_recursive(buf + 655360, 17); + helper_double_26_recursive(buf + 786432, 17); + helper_double_26_recursive(buf + 917504, 17); + for (int j = 0; j < 1048576; j += 1048576) { + for (int k = 0; k < 131072; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 131072), "r"(buf + j + k + 262144), "r"(buf + j + k + 393216), "r"(buf + j + k + 524288), "r"(buf + j + k + 655360), "r"(buf + j + k + 786432), "r"(buf + j + k + 917504) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 23) { + helper_double_26_recursive(buf + 0, 20); + helper_double_26_recursive(buf + 1048576, 20); + helper_double_26_recursive(buf + 2097152, 20); + helper_double_26_recursive(buf + 3145728, 20); + 
helper_double_26_recursive(buf + 4194304, 20); + helper_double_26_recursive(buf + 5242880, 20); + helper_double_26_recursive(buf + 6291456, 20); + helper_double_26_recursive(buf + 7340032, 20); + for (int j = 0; j < 8388608; j += 8388608) { + for (int k = 0; k < 1048576; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 1048576), "r"(buf + j + k + 2097152), "r"(buf + j + k + 3145728), "r"(buf + j + k + 4194304), "r"(buf + j + k + 5242880), "r"(buf + j + k + 6291456), "r"(buf + j + k + 7340032) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 26) { + helper_double_26_recursive(buf + 0, 23); + helper_double_26_recursive(buf + 8388608, 23); + helper_double_26_recursive(buf + 16777216, 23); + helper_double_26_recursive(buf + 25165824, 23); + helper_double_26_recursive(buf + 33554432, 23); + helper_double_26_recursive(buf + 41943040, 23); + helper_double_26_recursive(buf + 50331648, 23); + helper_double_26_recursive(buf + 58720256, 23); + for (int j = 0; j < 67108864; j += 67108864) { + for (int k = 0; k < 8388608; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, 
%%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 8388608), "r"(buf + j + k + 16777216), "r"(buf + j + k + 25165824), "r"(buf + j + k + 33554432), "r"(buf + j + k + 41943040), "r"(buf + j + k + 50331648), "r"(buf + j + k + 58720256) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } +} +void helper_double_26(double *buf); +void helper_double_26(double *buf) { + helper_double_26_recursive(buf, 26); +} +void helper_double_27_recursive(double *buf, int depth); +void helper_double_27_recursive(double *buf, int depth) { + if (depth == 9) { + for (int j = 0; j < 512; j += 32) { + for (int k = 0; k < 4; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vpermilpd $0, %%ymm0, %%ymm8\n" + "vpermilpd $15, %%ymm0, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm0\n" + "vpermilpd $0, %%ymm1, %%ymm8\n" + "vpermilpd $15, %%ymm1, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm1\n" + "vpermilpd $0, %%ymm2, %%ymm8\n" + "vpermilpd $15, %%ymm2, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm2\n" + "vpermilpd $0, %%ymm3, %%ymm8\n" + "vpermilpd $15, %%ymm3, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm3\n" + "vpermilpd $0, %%ymm4, %%ymm8\n" + "vpermilpd $15, %%ymm4, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm4\n" + "vpermilpd $0, %%ymm5, %%ymm8\n" + "vpermilpd $15, %%ymm5, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm5\n" + "vpermilpd $0, %%ymm6, %%ymm8\n" + "vpermilpd $15, %%ymm6, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm6\n" + "vpermilpd $0, %%ymm7, %%ymm8\n" + "vpermilpd $15, %%ymm7, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm7\n" + "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm0, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm0, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm0\n" + "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm1, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm1, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm1\n" + "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm2, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm2, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm2\n" + "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm3, %%ymm9, %%ymm10\n" + 
"vperm2f128 $49, %%ymm10, %%ymm3, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm3\n" + "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm4, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm4, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm4\n" + "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm5, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm5, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm5\n" + "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm6, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm6, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm6\n" + "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm7, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm7, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + for (int j = 0; j < 512; j += 256) { + for (int k = 0; k < 32; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" 
+ "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96), "r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + for (int j = 0; j < 512; j += 512) { + for (int k = 0; k < 256; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 256) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 12) { + helper_double_27_recursive(buf + 0, 9); + helper_double_27_recursive(buf + 512, 9); + helper_double_27_recursive(buf + 1024, 9); + helper_double_27_recursive(buf + 1536, 9); + helper_double_27_recursive(buf + 2048, 9); + helper_double_27_recursive(buf + 2560, 9); + helper_double_27_recursive(buf + 3072, 9); + helper_double_27_recursive(buf + 3584, 9); + for (int j = 0; j < 4096; j += 4096) { + for (int k = 0; k < 512; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 512), "r"(buf + j + k + 1024), "r"(buf + j + k + 1536), "r"(buf + j + k + 2048), "r"(buf + j + k + 2560), "r"(buf + j + k + 3072), "r"(buf + j + k + 3584) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 15) { + helper_double_27_recursive(buf + 0, 12); + helper_double_27_recursive(buf + 4096, 12); + helper_double_27_recursive(buf + 8192, 12); + helper_double_27_recursive(buf + 12288, 12); + helper_double_27_recursive(buf + 16384, 12); + 
helper_double_27_recursive(buf + 20480, 12); + helper_double_27_recursive(buf + 24576, 12); + helper_double_27_recursive(buf + 28672, 12); + for (int j = 0; j < 32768; j += 32768) { + for (int k = 0; k < 4096; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 4096), "r"(buf + j + k + 8192), "r"(buf + j + k + 12288), "r"(buf + j + k + 16384), "r"(buf + j + k + 20480), "r"(buf + j + k + 24576), "r"(buf + j + k + 28672) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 18) { + helper_double_27_recursive(buf + 0, 15); + helper_double_27_recursive(buf + 32768, 15); + helper_double_27_recursive(buf + 65536, 15); + helper_double_27_recursive(buf + 98304, 15); + helper_double_27_recursive(buf + 131072, 15); + helper_double_27_recursive(buf + 163840, 15); + helper_double_27_recursive(buf + 196608, 15); + helper_double_27_recursive(buf + 229376, 15); + for (int j = 0; j < 262144; j += 262144) { + for (int k = 0; k < 32768; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, 
%%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 32768), "r"(buf + j + k + 65536), "r"(buf + j + k + 98304), "r"(buf + j + k + 131072), "r"(buf + j + k + 163840), "r"(buf + j + k + 196608), "r"(buf + j + k + 229376) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 21) { + helper_double_27_recursive(buf + 0, 18); + helper_double_27_recursive(buf + 262144, 18); + helper_double_27_recursive(buf + 524288, 18); + helper_double_27_recursive(buf + 786432, 18); + helper_double_27_recursive(buf + 1048576, 18); + helper_double_27_recursive(buf + 1310720, 18); + helper_double_27_recursive(buf + 1572864, 18); + helper_double_27_recursive(buf + 1835008, 18); + for (int j = 0; j < 2097152; j += 2097152) { + for (int k = 0; k < 262144; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 262144), "r"(buf + j + k + 524288), "r"(buf + j + k + 786432), "r"(buf + j + k + 1048576), "r"(buf + j + k + 1310720), "r"(buf + j + k + 1572864), "r"(buf + j + k + 1835008) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 24) { + helper_double_27_recursive(buf + 0, 21); + helper_double_27_recursive(buf + 2097152, 21); + helper_double_27_recursive(buf + 4194304, 21); + helper_double_27_recursive(buf + 6291456, 21); + helper_double_27_recursive(buf + 8388608, 21); + helper_double_27_recursive(buf + 10485760, 21); + helper_double_27_recursive(buf + 12582912, 21); + helper_double_27_recursive(buf + 14680064, 21); + for (int j = 0; j < 16777216; j += 16777216) { + for (int k = 0; k < 2097152; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd 
(%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 2097152), "r"(buf + j + k + 4194304), "r"(buf + j + k + 6291456), "r"(buf + j + k + 8388608), "r"(buf + j + k + 10485760), "r"(buf + j + k + 12582912), "r"(buf + j + k + 14680064) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 27) { + helper_double_27_recursive(buf + 0, 24); + helper_double_27_recursive(buf + 16777216, 24); + helper_double_27_recursive(buf + 33554432, 24); + helper_double_27_recursive(buf + 50331648, 24); + helper_double_27_recursive(buf + 67108864, 24); + helper_double_27_recursive(buf + 83886080, 24); + helper_double_27_recursive(buf + 100663296, 24); + helper_double_27_recursive(buf + 117440512, 24); + for (int j = 0; j < 134217728; j += 134217728) { + for (int k = 0; k < 16777216; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 16777216), "r"(buf + j + k + 33554432), "r"(buf + j + k + 
50331648), "r"(buf + j + k + 67108864), "r"(buf + j + k + 83886080), "r"(buf + j + k + 100663296), "r"(buf + j + k + 117440512) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } +} +void helper_double_27(double *buf); +void helper_double_27(double *buf) { + helper_double_27_recursive(buf, 27); +} +void helper_double_28_recursive(double *buf, int depth); +void helper_double_28_recursive(double *buf, int depth) { + if (depth == 11) { + for (int j = 0; j < 2048; j += 32) { + for (int k = 0; k < 4; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vpermilpd $0, %%ymm0, %%ymm8\n" + "vpermilpd $15, %%ymm0, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm0\n" + "vpermilpd $0, %%ymm1, %%ymm8\n" + "vpermilpd $15, %%ymm1, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm1\n" + "vpermilpd $0, %%ymm2, %%ymm8\n" + "vpermilpd $15, %%ymm2, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm2\n" + "vpermilpd $0, %%ymm3, %%ymm8\n" + "vpermilpd $15, %%ymm3, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm3\n" + "vpermilpd $0, %%ymm4, %%ymm8\n" + "vpermilpd $15, %%ymm4, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm4\n" + "vpermilpd $0, %%ymm5, %%ymm8\n" + "vpermilpd $15, %%ymm5, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm5\n" + "vpermilpd $0, %%ymm6, %%ymm8\n" + "vpermilpd $15, %%ymm6, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm6\n" + "vpermilpd $0, %%ymm7, %%ymm8\n" + "vpermilpd $15, %%ymm7, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm7\n" + "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm0, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm0, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm0\n" + "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm1, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm1, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm1\n" + "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm2, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm2, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm2\n" + "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm3, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm3, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm3\n" + "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm4, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm4, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm4\n" + "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm5, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm5, 
%%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm5\n" + "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm6, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm6, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm6\n" + "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm7, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm7, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + for (int j = 0; j < 2048; j += 256) { + for (int k = 0; k < 32; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96), "r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%ymm0", "%ymm1", 
"%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + for (int j = 0; j < 2048; j += 2048) { + for (int k = 0; k < 256; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 256), "r"(buf + j + k + 512), "r"(buf + j + k + 768), "r"(buf + j + k + 1024), "r"(buf + j + k + 1280), "r"(buf + j + k + 1536), "r"(buf + j + k + 1792) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 14) { + helper_double_28_recursive(buf + 0, 11); + helper_double_28_recursive(buf + 2048, 11); + helper_double_28_recursive(buf + 4096, 11); + helper_double_28_recursive(buf + 6144, 11); + helper_double_28_recursive(buf + 8192, 11); + helper_double_28_recursive(buf + 10240, 11); + helper_double_28_recursive(buf + 12288, 11); + helper_double_28_recursive(buf + 14336, 11); + for (int j = 0; j < 16384; j += 16384) { + for (int k = 0; k < 2048; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, 
%%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 2048), "r"(buf + j + k + 4096), "r"(buf + j + k + 6144), "r"(buf + j + k + 8192), "r"(buf + j + k + 10240), "r"(buf + j + k + 12288), "r"(buf + j + k + 14336) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 17) { + helper_double_28_recursive(buf + 0, 14); + helper_double_28_recursive(buf + 16384, 14); + helper_double_28_recursive(buf + 32768, 14); + helper_double_28_recursive(buf + 49152, 14); + helper_double_28_recursive(buf + 65536, 14); + helper_double_28_recursive(buf + 81920, 14); + helper_double_28_recursive(buf + 98304, 14); + helper_double_28_recursive(buf + 114688, 14); + for (int j = 0; j < 131072; j += 131072) { + for (int k = 0; k < 16384; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 16384), "r"(buf + j + k + 32768), "r"(buf + j + k + 49152), "r"(buf + j + k + 65536), "r"(buf + j + k + 81920), "r"(buf + j + k + 98304), "r"(buf + j + k + 114688) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 20) { + helper_double_28_recursive(buf + 0, 17); + helper_double_28_recursive(buf + 131072, 17); + helper_double_28_recursive(buf + 262144, 17); + helper_double_28_recursive(buf + 393216, 17); + helper_double_28_recursive(buf + 524288, 17); + helper_double_28_recursive(buf + 655360, 17); + helper_double_28_recursive(buf + 786432, 17); + helper_double_28_recursive(buf + 917504, 17); + for (int j = 0; j < 1048576; j += 1048576) { + for (int k = 0; k < 131072; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd 
(%7), %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 131072), "r"(buf + j + k + 262144), "r"(buf + j + k + 393216), "r"(buf + j + k + 524288), "r"(buf + j + k + 655360), "r"(buf + j + k + 786432), "r"(buf + j + k + 917504) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 23) { + helper_double_28_recursive(buf + 0, 20); + helper_double_28_recursive(buf + 1048576, 20); + helper_double_28_recursive(buf + 2097152, 20); + helper_double_28_recursive(buf + 3145728, 20); + helper_double_28_recursive(buf + 4194304, 20); + helper_double_28_recursive(buf + 5242880, 20); + helper_double_28_recursive(buf + 6291456, 20); + helper_double_28_recursive(buf + 7340032, 20); + for (int j = 0; j < 8388608; j += 8388608) { + for (int k = 0; k < 1048576; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 1048576), "r"(buf + j + k + 2097152), "r"(buf + j + k + 3145728), "r"(buf + j + k + 4194304), "r"(buf + j + k + 5242880), "r"(buf + j + k 
+ 6291456), "r"(buf + j + k + 7340032) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 26) { + helper_double_28_recursive(buf + 0, 23); + helper_double_28_recursive(buf + 8388608, 23); + helper_double_28_recursive(buf + 16777216, 23); + helper_double_28_recursive(buf + 25165824, 23); + helper_double_28_recursive(buf + 33554432, 23); + helper_double_28_recursive(buf + 41943040, 23); + helper_double_28_recursive(buf + 50331648, 23); + helper_double_28_recursive(buf + 58720256, 23); + for (int j = 0; j < 67108864; j += 67108864) { + for (int k = 0; k < 8388608; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 8388608), "r"(buf + j + k + 16777216), "r"(buf + j + k + 25165824), "r"(buf + j + k + 33554432), "r"(buf + j + k + 41943040), "r"(buf + j + k + 50331648), "r"(buf + j + k + 58720256) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 28) { + helper_double_28_recursive(buf + 0, 26); + helper_double_28_recursive(buf + 67108864, 26); + helper_double_28_recursive(buf + 134217728, 26); + helper_double_28_recursive(buf + 201326592, 26); + for (int j = 0; j < 268435456; j += 268435456) { + for (int k = 0; k < 67108864; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vmovupd %%ymm0, (%0)\n" + "vmovupd %%ymm1, (%1)\n" + "vmovupd %%ymm2, (%2)\n" + "vmovupd %%ymm3, (%3)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 67108864), "r"(buf + j + k + 134217728), "r"(buf + j + k + 201326592) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", 
"%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } +} +void helper_double_28(double *buf); +void helper_double_28(double *buf) { + helper_double_28_recursive(buf, 28); +} +void helper_double_29_recursive(double *buf, int depth); +void helper_double_29_recursive(double *buf, int depth) { + if (depth == 11) { + for (int j = 0; j < 2048; j += 32) { + for (int k = 0; k < 4; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vpermilpd $0, %%ymm0, %%ymm8\n" + "vpermilpd $15, %%ymm0, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm0\n" + "vpermilpd $0, %%ymm1, %%ymm8\n" + "vpermilpd $15, %%ymm1, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm1\n" + "vpermilpd $0, %%ymm2, %%ymm8\n" + "vpermilpd $15, %%ymm2, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm2\n" + "vpermilpd $0, %%ymm3, %%ymm8\n" + "vpermilpd $15, %%ymm3, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm3\n" + "vpermilpd $0, %%ymm4, %%ymm8\n" + "vpermilpd $15, %%ymm4, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm4\n" + "vpermilpd $0, %%ymm5, %%ymm8\n" + "vpermilpd $15, %%ymm5, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm5\n" + "vpermilpd $0, %%ymm6, %%ymm8\n" + "vpermilpd $15, %%ymm6, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm6\n" + "vpermilpd $0, %%ymm7, %%ymm8\n" + "vpermilpd $15, %%ymm7, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm7\n" + "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm0, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm0, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm0\n" + "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm1, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm1, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm1\n" + "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm2, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm2, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm2\n" + "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm3, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm3, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm3\n" + "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm4, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm4, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm4\n" + "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm5, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm5, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm5\n" + "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm6, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm6, %%ymm11\n" + "vaddpd %%ymm11, 
%%ymm8, %%ymm6\n" + "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm7, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm7, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + for (int j = 0; j < 2048; j += 256) { + for (int k = 0; k < 32; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96), "r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + for (int j = 0; j < 2048; j += 2048) { + for (int k = 0; k < 256; k += 4) { 
+ __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 256), "r"(buf + j + k + 512), "r"(buf + j + k + 768), "r"(buf + j + k + 1024), "r"(buf + j + k + 1280), "r"(buf + j + k + 1536), "r"(buf + j + k + 1792) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 14) { + helper_double_29_recursive(buf + 0, 11); + helper_double_29_recursive(buf + 2048, 11); + helper_double_29_recursive(buf + 4096, 11); + helper_double_29_recursive(buf + 6144, 11); + helper_double_29_recursive(buf + 8192, 11); + helper_double_29_recursive(buf + 10240, 11); + helper_double_29_recursive(buf + 12288, 11); + helper_double_29_recursive(buf + 14336, 11); + for (int j = 0; j < 16384; j += 16384) { + for (int k = 0; k < 2048; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: 
"r"(buf + j + k + 0), "r"(buf + j + k + 2048), "r"(buf + j + k + 4096), "r"(buf + j + k + 6144), "r"(buf + j + k + 8192), "r"(buf + j + k + 10240), "r"(buf + j + k + 12288), "r"(buf + j + k + 14336) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 17) { + helper_double_29_recursive(buf + 0, 14); + helper_double_29_recursive(buf + 16384, 14); + helper_double_29_recursive(buf + 32768, 14); + helper_double_29_recursive(buf + 49152, 14); + helper_double_29_recursive(buf + 65536, 14); + helper_double_29_recursive(buf + 81920, 14); + helper_double_29_recursive(buf + 98304, 14); + helper_double_29_recursive(buf + 114688, 14); + for (int j = 0; j < 131072; j += 131072) { + for (int k = 0; k < 16384; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 16384), "r"(buf + j + k + 32768), "r"(buf + j + k + 49152), "r"(buf + j + k + 65536), "r"(buf + j + k + 81920), "r"(buf + j + k + 98304), "r"(buf + j + k + 114688) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 20) { + helper_double_29_recursive(buf + 0, 17); + helper_double_29_recursive(buf + 131072, 17); + helper_double_29_recursive(buf + 262144, 17); + helper_double_29_recursive(buf + 393216, 17); + helper_double_29_recursive(buf + 524288, 17); + helper_double_29_recursive(buf + 655360, 17); + helper_double_29_recursive(buf + 786432, 17); + helper_double_29_recursive(buf + 917504, 17); + for (int j = 0; j < 1048576; j += 1048576) { + for (int k = 0; k < 131072; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + 
"vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 131072), "r"(buf + j + k + 262144), "r"(buf + j + k + 393216), "r"(buf + j + k + 524288), "r"(buf + j + k + 655360), "r"(buf + j + k + 786432), "r"(buf + j + k + 917504) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 23) { + helper_double_29_recursive(buf + 0, 20); + helper_double_29_recursive(buf + 1048576, 20); + helper_double_29_recursive(buf + 2097152, 20); + helper_double_29_recursive(buf + 3145728, 20); + helper_double_29_recursive(buf + 4194304, 20); + helper_double_29_recursive(buf + 5242880, 20); + helper_double_29_recursive(buf + 6291456, 20); + helper_double_29_recursive(buf + 7340032, 20); + for (int j = 0; j < 8388608; j += 8388608) { + for (int k = 0; k < 1048576; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 1048576), "r"(buf + j + k + 2097152), "r"(buf + j + k + 3145728), "r"(buf + j + k + 4194304), "r"(buf + j + k + 5242880), "r"(buf + j + k + 6291456), "r"(buf + j + k + 7340032) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth 
== 26) { + helper_double_29_recursive(buf + 0, 23); + helper_double_29_recursive(buf + 8388608, 23); + helper_double_29_recursive(buf + 16777216, 23); + helper_double_29_recursive(buf + 25165824, 23); + helper_double_29_recursive(buf + 33554432, 23); + helper_double_29_recursive(buf + 41943040, 23); + helper_double_29_recursive(buf + 50331648, 23); + helper_double_29_recursive(buf + 58720256, 23); + for (int j = 0; j < 67108864; j += 67108864) { + for (int k = 0; k < 8388608; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 8388608), "r"(buf + j + k + 16777216), "r"(buf + j + k + 25165824), "r"(buf + j + k + 33554432), "r"(buf + j + k + 41943040), "r"(buf + j + k + 50331648), "r"(buf + j + k + 58720256) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 29) { + helper_double_29_recursive(buf + 0, 26); + helper_double_29_recursive(buf + 67108864, 26); + helper_double_29_recursive(buf + 134217728, 26); + helper_double_29_recursive(buf + 201326592, 26); + helper_double_29_recursive(buf + 268435456, 26); + helper_double_29_recursive(buf + 335544320, 26); + helper_double_29_recursive(buf + 402653184, 26); + helper_double_29_recursive(buf + 469762048, 26); + for (int j = 0; j < 536870912; j += 536870912) { + for (int k = 0; k < 67108864; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, 
%%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 67108864), "r"(buf + j + k + 134217728), "r"(buf + j + k + 201326592), "r"(buf + j + k + 268435456), "r"(buf + j + k + 335544320), "r"(buf + j + k + 402653184), "r"(buf + j + k + 469762048) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } +} +void helper_double_29(double *buf); +void helper_double_29(double *buf) { + helper_double_29_recursive(buf, 29); +} +void helper_double_30_recursive(double *buf, int depth); +void helper_double_30_recursive(double *buf, int depth) { + if (depth == 9) { + for (int j = 0; j < 512; j += 32) { + for (int k = 0; k < 4; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vpermilpd $0, %%ymm0, %%ymm8\n" + "vpermilpd $15, %%ymm0, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm0\n" + "vpermilpd $0, %%ymm1, %%ymm8\n" + "vpermilpd $15, %%ymm1, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm1\n" + "vpermilpd $0, %%ymm2, %%ymm8\n" + "vpermilpd $15, %%ymm2, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm2\n" + "vpermilpd $0, %%ymm3, %%ymm8\n" + "vpermilpd $15, %%ymm3, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm3\n" + "vpermilpd $0, %%ymm4, %%ymm8\n" + "vpermilpd $15, %%ymm4, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm4\n" + "vpermilpd $0, %%ymm5, %%ymm8\n" + "vpermilpd $15, %%ymm5, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm5\n" + "vpermilpd $0, %%ymm6, %%ymm8\n" + "vpermilpd $15, %%ymm6, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm6\n" + "vpermilpd $0, %%ymm7, %%ymm8\n" + "vpermilpd $15, %%ymm7, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm7\n" + "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm0, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm0, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm0\n" + "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm1, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm1, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm1\n" + "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd 
%%ymm2, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm2, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm2\n" + "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm3, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm3, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm3\n" + "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm4, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm4, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm4\n" + "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm5, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm5, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm5\n" + "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm6, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm6, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm6\n" + "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm7, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm7, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + for (int j = 0; j < 512; j += 256) { + for (int k = 0; k < 32; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, 
%%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96), "r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + for (int j = 0; j < 512; j += 512) { + for (int k = 0; k < 256; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 256) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 12) { + helper_double_30_recursive(buf + 0, 9); + helper_double_30_recursive(buf + 512, 9); + helper_double_30_recursive(buf + 1024, 9); + helper_double_30_recursive(buf + 1536, 9); + helper_double_30_recursive(buf + 2048, 9); + helper_double_30_recursive(buf + 2560, 9); + helper_double_30_recursive(buf + 3072, 9); + helper_double_30_recursive(buf + 3584, 9); + for (int j = 0; j < 4096; j += 4096) { + for (int k = 0; k < 512; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 512), "r"(buf + j + k + 1024), "r"(buf + j + k + 1536), "r"(buf + j + k + 2048), "r"(buf + j + k + 2560), "r"(buf + j + k + 3072), "r"(buf + j + k + 3584) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 15) { + 
helper_double_30_recursive(buf + 0, 12); + helper_double_30_recursive(buf + 4096, 12); + helper_double_30_recursive(buf + 8192, 12); + helper_double_30_recursive(buf + 12288, 12); + helper_double_30_recursive(buf + 16384, 12); + helper_double_30_recursive(buf + 20480, 12); + helper_double_30_recursive(buf + 24576, 12); + helper_double_30_recursive(buf + 28672, 12); + for (int j = 0; j < 32768; j += 32768) { + for (int k = 0; k < 4096; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 4096), "r"(buf + j + k + 8192), "r"(buf + j + k + 12288), "r"(buf + j + k + 16384), "r"(buf + j + k + 20480), "r"(buf + j + k + 24576), "r"(buf + j + k + 28672) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 18) { + helper_double_30_recursive(buf + 0, 15); + helper_double_30_recursive(buf + 32768, 15); + helper_double_30_recursive(buf + 65536, 15); + helper_double_30_recursive(buf + 98304, 15); + helper_double_30_recursive(buf + 131072, 15); + helper_double_30_recursive(buf + 163840, 15); + helper_double_30_recursive(buf + 196608, 15); + helper_double_30_recursive(buf + 229376, 15); + for (int j = 0; j < 262144; j += 262144) { + for (int k = 0; k < 32768; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd 
%%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 32768), "r"(buf + j + k + 65536), "r"(buf + j + k + 98304), "r"(buf + j + k + 131072), "r"(buf + j + k + 163840), "r"(buf + j + k + 196608), "r"(buf + j + k + 229376) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 21) { + helper_double_30_recursive(buf + 0, 18); + helper_double_30_recursive(buf + 262144, 18); + helper_double_30_recursive(buf + 524288, 18); + helper_double_30_recursive(buf + 786432, 18); + helper_double_30_recursive(buf + 1048576, 18); + helper_double_30_recursive(buf + 1310720, 18); + helper_double_30_recursive(buf + 1572864, 18); + helper_double_30_recursive(buf + 1835008, 18); + for (int j = 0; j < 2097152; j += 2097152) { + for (int k = 0; k < 262144; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 262144), "r"(buf + j + k + 524288), "r"(buf + j + k + 786432), "r"(buf + j + k + 1048576), "r"(buf + j + k + 1310720), "r"(buf + j + k + 1572864), "r"(buf + j + k + 1835008) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 24) { + helper_double_30_recursive(buf + 0, 21); + helper_double_30_recursive(buf + 2097152, 21); + helper_double_30_recursive(buf + 4194304, 21); + helper_double_30_recursive(buf + 6291456, 21); + helper_double_30_recursive(buf + 8388608, 21); + helper_double_30_recursive(buf + 10485760, 21); + helper_double_30_recursive(buf + 12582912, 21); + helper_double_30_recursive(buf + 14680064, 21); + for (int j = 0; j < 
16777216; j += 16777216) { + for (int k = 0; k < 2097152; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 2097152), "r"(buf + j + k + 4194304), "r"(buf + j + k + 6291456), "r"(buf + j + k + 8388608), "r"(buf + j + k + 10485760), "r"(buf + j + k + 12582912), "r"(buf + j + k + 14680064) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 27) { + helper_double_30_recursive(buf + 0, 24); + helper_double_30_recursive(buf + 16777216, 24); + helper_double_30_recursive(buf + 33554432, 24); + helper_double_30_recursive(buf + 50331648, 24); + helper_double_30_recursive(buf + 67108864, 24); + helper_double_30_recursive(buf + 83886080, 24); + helper_double_30_recursive(buf + 100663296, 24); + helper_double_30_recursive(buf + 117440512, 24); + for (int j = 0; j < 134217728; j += 134217728) { + for (int k = 0; k < 16777216; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd 
%%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 16777216), "r"(buf + j + k + 33554432), "r"(buf + j + k + 50331648), "r"(buf + j + k + 67108864), "r"(buf + j + k + 83886080), "r"(buf + j + k + 100663296), "r"(buf + j + k + 117440512) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 30) { + helper_double_30_recursive(buf + 0, 27); + helper_double_30_recursive(buf + 134217728, 27); + helper_double_30_recursive(buf + 268435456, 27); + helper_double_30_recursive(buf + 402653184, 27); + helper_double_30_recursive(buf + 536870912, 27); + helper_double_30_recursive(buf + 671088640, 27); + helper_double_30_recursive(buf + 805306368, 27); + helper_double_30_recursive(buf + 939524096, 27); + for (int j = 0; j < 1073741824; j += 1073741824) { + for (int k = 0; k < 134217728; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 134217728), "r"(buf + j + k + 268435456), "r"(buf + j + k + 402653184), "r"(buf + j + k + 536870912), "r"(buf + j + k + 671088640), "r"(buf + j + k + 805306368), "r"(buf + j + k + 939524096) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } +} +void helper_double_30(double *buf); +void helper_double_30(double *buf) { + helper_double_30_recursive(buf, 30); +} +int fht_double(double *buf, int log_n) { + if (log_n == 0) { + return 0; + } + if (log_n == 1) { + helper_double_1(buf); + return 0; + } + if (log_n == 2) { + helper_double_2(buf); + return 0; + } + if (log_n == 3) { + helper_double_3(buf); + return 0; + } + if (log_n == 4) { + helper_double_4(buf); + return 0; + } + if (log_n == 5) { + helper_double_5(buf); + return 0; + } + if (log_n == 6) { + helper_double_6(buf); + return 0; + } + if (log_n == 7) { + helper_double_7(buf); + return 0; + } + if (log_n == 8) { + helper_double_8(buf); + return 0; + } + if (log_n == 9) { + 
helper_double_9(buf); + return 0; + } + if (log_n == 10) { + helper_double_10(buf); + return 0; + } + if (log_n == 11) { + helper_double_11(buf); + return 0; + } + if (log_n == 12) { + helper_double_12(buf); + return 0; + } + if (log_n == 13) { + helper_double_13(buf); + return 0; + } + if (log_n == 14) { + helper_double_14(buf); + return 0; + } + if (log_n == 15) { + helper_double_15(buf); + return 0; + } + if (log_n == 16) { + helper_double_16(buf); + return 0; + } + if (log_n == 17) { + helper_double_17(buf); + return 0; + } + if (log_n == 18) { + helper_double_18(buf); + return 0; + } + if (log_n == 19) { + helper_double_19(buf); + return 0; + } + if (log_n == 20) { + helper_double_20(buf); + return 0; + } + if (log_n == 21) { + helper_double_21(buf); + return 0; + } + if (log_n == 22) { + helper_double_22(buf); + return 0; + } + if (log_n == 23) { + helper_double_23(buf); + return 0; + } + if (log_n == 24) { + helper_double_24(buf); + return 0; + } + if (log_n == 25) { + helper_double_25(buf); + return 0; + } + if (log_n == 26) { + helper_double_26(buf); + return 0; + } + if (log_n == 27) { + helper_double_27(buf); + return 0; + } + if (log_n == 28) { + helper_double_28(buf); + return 0; + } + if (log_n == 29) { + helper_double_29(buf); + return 0; + } + if (log_n == 30) { + helper_double_30(buf); + return 0; + } + return 1; +} diff --git a/extension/llm/custom_ops/spinquant/third-party/FFHT/fht_impl.h b/extension/llm/custom_ops/spinquant/third-party/FFHT/fht_impl.h new file mode 100644 index 00000000000..13ec1086500 --- /dev/null +++ b/extension/llm/custom_ops/spinquant/third-party/FFHT/fht_impl.h @@ -0,0 +1,39 @@ +#ifndef _FHT_IMPL_H__ +#define _FHT_IMPL_H__ + +#include "fast_copy.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#ifdef __aarch64__ +#include "fht_neon.c" +#define VECTOR_WIDTH (16u) +#else +#ifdef __AVX__ +#include "fht_avx.c" +#define VECTOR_WIDTH (32u) +#else +#include "fht_sse.c" +#define VECTOR_WIDTH (16u) +#endif +#endif + +int fht_float_oop(float* in, float* out, int log_n) { + fast_copy(out, in, sizeof(float) << log_n); + return fht_float(out, log_n); +} + +#ifndef __aarch64__ +int fht_double_oop(double* in, double* out, int log_n) { + fast_copy(out, in, sizeof(double) << log_n); + return fht_double(out, log_n); +} +#endif + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // ifndef _FHT_IMPL_H__ diff --git a/extension/llm/custom_ops/spinquant/third-party/FFHT/fht_neon.c b/extension/llm/custom_ops/spinquant/third-party/FFHT/fht_neon.c new file mode 100644 index 00000000000..3d84ee96195 --- /dev/null +++ b/extension/llm/custom_ops/spinquant/third-party/FFHT/fht_neon.c @@ -0,0 +1,3019 @@ +// @generated +#include "fht.h" +static inline void helper_float_1(float* buf); +static inline void helper_float_1(float* buf) { + for (int j = 0; j < 2; j += 2) { + for (int k = 0; k < 1; ++k) { + float u = buf[j + k]; + float v = buf[j + k + 1]; + buf[j + k] = u + v; + buf[j + k + 1] = u - v; + } + } +} +static inline void helper_float_2(float* buf); +static inline void helper_float_2(float* buf) { + for (int j = 0; j < 4; j += 4) { + __asm__ volatile( + "LD1 {v0.4S}, [%0]\n" + "TRN1 v16.4S, v0.4S, v0.4S\n" + "FNEG v17.4S, v0.4S\n" + "TRN2 v17.4S, v0.4S, v17.4S\n" + "FADD v0.4S, v16.4S, v17.4S\n" + "DUP v16.2D, v0.D[0]\n" + "FNEG v17.4S, v0.4S\n" + "INS v17.D[0], v0.D[1]\n" + "FADD v0.4S, v16.4S, v17.4S\n" + "ST1 {v0.4S}, [%0]\n" ::"r"(buf + j) + : "%v0", + "%v1", + "%v2", + "%v3", + "%v4", + "%v5", + "%v6", + "%v7", + "%v8", + "%v9", + "%v10", + "%v11", + "%v12", + 
"%v13", + "%v14", + "%v15", + "%v16", + "%v17", + "%v18", + "%v19", + "%v20", + "%v21", + "%v22", + "%v23", + "%v24", + "%v25", + "%v26", + "%v27", + "%v28", + "%v29", + "%v30", + "%v31", + "memory"); + } +} +void helper_float_3_recursive(float* buf, int depth); +void helper_float_3_recursive(float* buf, int depth) { + if (depth == 2) { + helper_float_2(buf); + return; + } + if (depth == 3) { + helper_float_3_recursive(buf + 0, 2); + helper_float_3_recursive(buf + 4, 2); + for (int j = 0; j < 8; j += 8) { + for (int k = 0; k < 4; k += 4) { + __asm__ volatile( + "LD1 {v0.4S}, [%0]\n" + "LD1 {v1.4S}, [%1]\n" + "FADD v16.4S, v0.4S, v1.4S\n" + "FSUB v17.4S, v0.4S, v1.4S\n" + "ST1 {v16.4S}, [%0]\n" + "ST1 {v17.4S}, [%1]\n" ::"r"(buf + j + k + 0), + "r"(buf + j + k + 4) + : "%v0", + "%v1", + "%v2", + "%v3", + "%v4", + "%v5", + "%v6", + "%v7", + "%v8", + "%v9", + "%v10", + "%v11", + "%v12", + "%v13", + "%v14", + "%v15", + "%v16", + "%v17", + "%v18", + "%v19", + "%v20", + "%v21", + "%v22", + "%v23", + "%v24", + "%v25", + "%v26", + "%v27", + "%v28", + "%v29", + "%v30", + "%v31", + "memory"); + } + } + return; + } +} +void helper_float_3(float* buf); +void helper_float_3(float* buf) { + helper_float_3_recursive(buf, 3); +} +void helper_float_4_recursive(float* buf, int depth); +void helper_float_4_recursive(float* buf, int depth) { + if (depth == 3) { + helper_float_3(buf); + return; + } + if (depth == 4) { + helper_float_4_recursive(buf + 0, 3); + helper_float_4_recursive(buf + 8, 3); + for (int j = 0; j < 16; j += 16) { + for (int k = 0; k < 8; k += 4) { + __asm__ volatile( + "LD1 {v0.4S}, [%0]\n" + "LD1 {v1.4S}, [%1]\n" + "FADD v16.4S, v0.4S, v1.4S\n" + "FSUB v17.4S, v0.4S, v1.4S\n" + "ST1 {v16.4S}, [%0]\n" + "ST1 {v17.4S}, [%1]\n" ::"r"(buf + j + k + 0), + "r"(buf + j + k + 8) + : "%v0", + "%v1", + "%v2", + "%v3", + "%v4", + "%v5", + "%v6", + "%v7", + "%v8", + "%v9", + "%v10", + "%v11", + "%v12", + "%v13", + "%v14", + "%v15", + "%v16", + "%v17", + "%v18", + "%v19", + "%v20", + "%v21", + "%v22", + "%v23", + "%v24", + "%v25", + "%v26", + "%v27", + "%v28", + "%v29", + "%v30", + "%v31", + "memory"); + } + } + return; + } +} +void helper_float_4(float* buf); +void helper_float_4(float* buf) { + helper_float_4_recursive(buf, 4); +} +void helper_float_5_recursive(float* buf, int depth); +void helper_float_5_recursive(float* buf, int depth) { + if (depth == 4) { + helper_float_4(buf); + return; + } + if (depth == 5) { + helper_float_5_recursive(buf + 0, 4); + helper_float_5_recursive(buf + 16, 4); + for (int j = 0; j < 32; j += 32) { + for (int k = 0; k < 16; k += 4) { + __asm__ volatile( + "LD1 {v0.4S}, [%0]\n" + "LD1 {v1.4S}, [%1]\n" + "FADD v16.4S, v0.4S, v1.4S\n" + "FSUB v17.4S, v0.4S, v1.4S\n" + "ST1 {v16.4S}, [%0]\n" + "ST1 {v17.4S}, [%1]\n" ::"r"(buf + j + k + 0), + "r"(buf + j + k + 16) + : "%v0", + "%v1", + "%v2", + "%v3", + "%v4", + "%v5", + "%v6", + "%v7", + "%v8", + "%v9", + "%v10", + "%v11", + "%v12", + "%v13", + "%v14", + "%v15", + "%v16", + "%v17", + "%v18", + "%v19", + "%v20", + "%v21", + "%v22", + "%v23", + "%v24", + "%v25", + "%v26", + "%v27", + "%v28", + "%v29", + "%v30", + "%v31", + "memory"); + } + } + return; + } +} +void helper_float_5(float* buf); +void helper_float_5(float* buf) { + helper_float_5_recursive(buf, 5); +} +void helper_float_6_recursive(float* buf, int depth); +void helper_float_6_recursive(float* buf, int depth) { + if (depth == 3) { + helper_float_3(buf); + return; + } + if (depth == 6) { + helper_float_6_recursive(buf + 0, 3); + helper_float_6_recursive(buf + 8, 
3); + helper_float_6_recursive(buf + 16, 3); + helper_float_6_recursive(buf + 24, 3); + helper_float_6_recursive(buf + 32, 3); + helper_float_6_recursive(buf + 40, 3); + helper_float_6_recursive(buf + 48, 3); + helper_float_6_recursive(buf + 56, 3); + for (int j = 0; j < 64; j += 64) { + for (int k = 0; k < 8; k += 4) { + __asm__ volatile( + "LD1 {v0.4S}, [%0]\n" + "LD1 {v1.4S}, [%1]\n" + "LD1 {v2.4S}, [%2]\n" + "LD1 {v3.4S}, [%3]\n" + "LD1 {v4.4S}, [%4]\n" + "LD1 {v5.4S}, [%5]\n" + "LD1 {v6.4S}, [%6]\n" + "LD1 {v7.4S}, [%7]\n" + "FADD v16.4S, v0.4S, v1.4S\n" + "FSUB v17.4S, v0.4S, v1.4S\n" + "FADD v18.4S, v2.4S, v3.4S\n" + "FSUB v19.4S, v2.4S, v3.4S\n" + "FADD v20.4S, v4.4S, v5.4S\n" + "FSUB v21.4S, v4.4S, v5.4S\n" + "FADD v22.4S, v6.4S, v7.4S\n" + "FSUB v23.4S, v6.4S, v7.4S\n" + "FADD v0.4S, v16.4S, v18.4S\n" + "FSUB v2.4S, v16.4S, v18.4S\n" + "FADD v1.4S, v17.4S, v19.4S\n" + "FSUB v3.4S, v17.4S, v19.4S\n" + "FADD v4.4S, v20.4S, v22.4S\n" + "FSUB v6.4S, v20.4S, v22.4S\n" + "FADD v5.4S, v21.4S, v23.4S\n" + "FSUB v7.4S, v21.4S, v23.4S\n" + "FADD v16.4S, v0.4S, v4.4S\n" + "FSUB v20.4S, v0.4S, v4.4S\n" + "FADD v17.4S, v1.4S, v5.4S\n" + "FSUB v21.4S, v1.4S, v5.4S\n" + "FADD v18.4S, v2.4S, v6.4S\n" + "FSUB v22.4S, v2.4S, v6.4S\n" + "FADD v19.4S, v3.4S, v7.4S\n" + "FSUB v23.4S, v3.4S, v7.4S\n" + "ST1 {v16.4S}, [%0]\n" + "ST1 {v17.4S}, [%1]\n" + "ST1 {v18.4S}, [%2]\n" + "ST1 {v19.4S}, [%3]\n" + "ST1 {v20.4S}, [%4]\n" + "ST1 {v21.4S}, [%5]\n" + "ST1 {v22.4S}, [%6]\n" + "ST1 {v23.4S}, [%7]\n" ::"r"(buf + j + k + 0), + "r"(buf + j + k + 8), + "r"(buf + j + k + 16), + "r"(buf + j + k + 24), + "r"(buf + j + k + 32), + "r"(buf + j + k + 40), + "r"(buf + j + k + 48), + "r"(buf + j + k + 56) + : "%v0", + "%v1", + "%v2", + "%v3", + "%v4", + "%v5", + "%v6", + "%v7", + "%v8", + "%v9", + "%v10", + "%v11", + "%v12", + "%v13", + "%v14", + "%v15", + "%v16", + "%v17", + "%v18", + "%v19", + "%v20", + "%v21", + "%v22", + "%v23", + "%v24", + "%v25", + "%v26", + "%v27", + "%v28", + "%v29", + "%v30", + "%v31", + "memory"); + } + } + return; + } +} +void helper_float_6(float* buf); +void helper_float_6(float* buf) { + helper_float_6_recursive(buf, 6); +} +void helper_float_7_recursive(float* buf, int depth); +void helper_float_7_recursive(float* buf, int depth) { + if (depth == 3) { + helper_float_3(buf); + return; + } + if (depth == 7) { + helper_float_7_recursive(buf + 0, 3); + helper_float_7_recursive(buf + 8, 3); + helper_float_7_recursive(buf + 16, 3); + helper_float_7_recursive(buf + 24, 3); + helper_float_7_recursive(buf + 32, 3); + helper_float_7_recursive(buf + 40, 3); + helper_float_7_recursive(buf + 48, 3); + helper_float_7_recursive(buf + 56, 3); + helper_float_7_recursive(buf + 64, 3); + helper_float_7_recursive(buf + 72, 3); + helper_float_7_recursive(buf + 80, 3); + helper_float_7_recursive(buf + 88, 3); + helper_float_7_recursive(buf + 96, 3); + helper_float_7_recursive(buf + 104, 3); + helper_float_7_recursive(buf + 112, 3); + helper_float_7_recursive(buf + 120, 3); + for (int j = 0; j < 128; j += 128) { + for (int k = 0; k < 8; k += 4) { + __asm__ volatile( + "LD1 {v0.4S}, [%0]\n" + "LD1 {v1.4S}, [%1]\n" + "LD1 {v2.4S}, [%2]\n" + "LD1 {v3.4S}, [%3]\n" + "LD1 {v4.4S}, [%4]\n" + "LD1 {v5.4S}, [%5]\n" + "LD1 {v6.4S}, [%6]\n" + "LD1 {v7.4S}, [%7]\n" + "LD1 {v8.4S}, [%8]\n" + "LD1 {v9.4S}, [%9]\n" + "LD1 {v10.4S}, [%10]\n" + "LD1 {v11.4S}, [%11]\n" + "LD1 {v12.4S}, [%12]\n" + "LD1 {v13.4S}, [%13]\n" + "LD1 {v14.4S}, [%14]\n" + "LD1 {v15.4S}, [%15]\n" + "FADD v16.4S, v0.4S, v1.4S\n" + "FSUB v17.4S, v0.4S, 
v1.4S\n" + "FADD v18.4S, v2.4S, v3.4S\n" + "FSUB v19.4S, v2.4S, v3.4S\n" + "FADD v20.4S, v4.4S, v5.4S\n" + "FSUB v21.4S, v4.4S, v5.4S\n" + "FADD v22.4S, v6.4S, v7.4S\n" + "FSUB v23.4S, v6.4S, v7.4S\n" + "FADD v24.4S, v8.4S, v9.4S\n" + "FSUB v25.4S, v8.4S, v9.4S\n" + "FADD v26.4S, v10.4S, v11.4S\n" + "FSUB v27.4S, v10.4S, v11.4S\n" + "FADD v28.4S, v12.4S, v13.4S\n" + "FSUB v29.4S, v12.4S, v13.4S\n" + "FADD v30.4S, v14.4S, v15.4S\n" + "FSUB v31.4S, v14.4S, v15.4S\n" + "FADD v0.4S, v16.4S, v18.4S\n" + "FSUB v2.4S, v16.4S, v18.4S\n" + "FADD v1.4S, v17.4S, v19.4S\n" + "FSUB v3.4S, v17.4S, v19.4S\n" + "FADD v4.4S, v20.4S, v22.4S\n" + "FSUB v6.4S, v20.4S, v22.4S\n" + "FADD v5.4S, v21.4S, v23.4S\n" + "FSUB v7.4S, v21.4S, v23.4S\n" + "FADD v8.4S, v24.4S, v26.4S\n" + "FSUB v10.4S, v24.4S, v26.4S\n" + "FADD v9.4S, v25.4S, v27.4S\n" + "FSUB v11.4S, v25.4S, v27.4S\n" + "FADD v12.4S, v28.4S, v30.4S\n" + "FSUB v14.4S, v28.4S, v30.4S\n" + "FADD v13.4S, v29.4S, v31.4S\n" + "FSUB v15.4S, v29.4S, v31.4S\n" + "FADD v16.4S, v0.4S, v4.4S\n" + "FSUB v20.4S, v0.4S, v4.4S\n" + "FADD v17.4S, v1.4S, v5.4S\n" + "FSUB v21.4S, v1.4S, v5.4S\n" + "FADD v18.4S, v2.4S, v6.4S\n" + "FSUB v22.4S, v2.4S, v6.4S\n" + "FADD v19.4S, v3.4S, v7.4S\n" + "FSUB v23.4S, v3.4S, v7.4S\n" + "FADD v24.4S, v8.4S, v12.4S\n" + "FSUB v28.4S, v8.4S, v12.4S\n" + "FADD v25.4S, v9.4S, v13.4S\n" + "FSUB v29.4S, v9.4S, v13.4S\n" + "FADD v26.4S, v10.4S, v14.4S\n" + "FSUB v30.4S, v10.4S, v14.4S\n" + "FADD v27.4S, v11.4S, v15.4S\n" + "FSUB v31.4S, v11.4S, v15.4S\n" + "FADD v0.4S, v16.4S, v24.4S\n" + "FSUB v8.4S, v16.4S, v24.4S\n" + "FADD v1.4S, v17.4S, v25.4S\n" + "FSUB v9.4S, v17.4S, v25.4S\n" + "FADD v2.4S, v18.4S, v26.4S\n" + "FSUB v10.4S, v18.4S, v26.4S\n" + "FADD v3.4S, v19.4S, v27.4S\n" + "FSUB v11.4S, v19.4S, v27.4S\n" + "FADD v4.4S, v20.4S, v28.4S\n" + "FSUB v12.4S, v20.4S, v28.4S\n" + "FADD v5.4S, v21.4S, v29.4S\n" + "FSUB v13.4S, v21.4S, v29.4S\n" + "FADD v6.4S, v22.4S, v30.4S\n" + "FSUB v14.4S, v22.4S, v30.4S\n" + "FADD v7.4S, v23.4S, v31.4S\n" + "FSUB v15.4S, v23.4S, v31.4S\n" + "ST1 {v0.4S}, [%0]\n" + "ST1 {v1.4S}, [%1]\n" + "ST1 {v2.4S}, [%2]\n" + "ST1 {v3.4S}, [%3]\n" + "ST1 {v4.4S}, [%4]\n" + "ST1 {v5.4S}, [%5]\n" + "ST1 {v6.4S}, [%6]\n" + "ST1 {v7.4S}, [%7]\n" + "ST1 {v8.4S}, [%8]\n" + "ST1 {v9.4S}, [%9]\n" + "ST1 {v10.4S}, [%10]\n" + "ST1 {v11.4S}, [%11]\n" + "ST1 {v12.4S}, [%12]\n" + "ST1 {v13.4S}, [%13]\n" + "ST1 {v14.4S}, [%14]\n" + "ST1 {v15.4S}, [%15]\n" ::"r"(buf + j + k + 0), + "r"(buf + j + k + 8), + "r"(buf + j + k + 16), + "r"(buf + j + k + 24), + "r"(buf + j + k + 32), + "r"(buf + j + k + 40), + "r"(buf + j + k + 48), + "r"(buf + j + k + 56), + "r"(buf + j + k + 64), + "r"(buf + j + k + 72), + "r"(buf + j + k + 80), + "r"(buf + j + k + 88), + "r"(buf + j + k + 96), + "r"(buf + j + k + 104), + "r"(buf + j + k + 112), + "r"(buf + j + k + 120) + : "%v0", + "%v1", + "%v2", + "%v3", + "%v4", + "%v5", + "%v6", + "%v7", + "%v8", + "%v9", + "%v10", + "%v11", + "%v12", + "%v13", + "%v14", + "%v15", + "%v16", + "%v17", + "%v18", + "%v19", + "%v20", + "%v21", + "%v22", + "%v23", + "%v24", + "%v25", + "%v26", + "%v27", + "%v28", + "%v29", + "%v30", + "%v31", + "memory"); + } + } + return; + } +} +void helper_float_7(float* buf); +void helper_float_7(float* buf) { + helper_float_7_recursive(buf, 7); +} +static inline void helper_float_8(float* buf); +static inline void helper_float_8(float* buf) { + for (int j = 0; j < 256; j += 64) { + for (int k = 0; k < 4; k += 4) { + __asm__ volatile( + "LD1 {v0.4S}, [%0]\n" + "LD1 {v1.4S}, [%1]\n" + 
"LD1 {v2.4S}, [%2]\n" + "LD1 {v3.4S}, [%3]\n" + "LD1 {v4.4S}, [%4]\n" + "LD1 {v5.4S}, [%5]\n" + "LD1 {v6.4S}, [%6]\n" + "LD1 {v7.4S}, [%7]\n" + "LD1 {v8.4S}, [%8]\n" + "LD1 {v9.4S}, [%9]\n" + "LD1 {v10.4S}, [%10]\n" + "LD1 {v11.4S}, [%11]\n" + "LD1 {v12.4S}, [%12]\n" + "LD1 {v13.4S}, [%13]\n" + "LD1 {v14.4S}, [%14]\n" + "LD1 {v15.4S}, [%15]\n" + "TRN1 v16.4S, v0.4S, v0.4S\n" + "FNEG v17.4S, v0.4S\n" + "TRN2 v17.4S, v0.4S, v17.4S\n" + "FADD v0.4S, v16.4S, v17.4S\n" + "TRN1 v16.4S, v1.4S, v1.4S\n" + "FNEG v17.4S, v1.4S\n" + "TRN2 v17.4S, v1.4S, v17.4S\n" + "FADD v1.4S, v16.4S, v17.4S\n" + "TRN1 v16.4S, v2.4S, v2.4S\n" + "FNEG v17.4S, v2.4S\n" + "TRN2 v17.4S, v2.4S, v17.4S\n" + "FADD v2.4S, v16.4S, v17.4S\n" + "TRN1 v16.4S, v3.4S, v3.4S\n" + "FNEG v17.4S, v3.4S\n" + "TRN2 v17.4S, v3.4S, v17.4S\n" + "FADD v3.4S, v16.4S, v17.4S\n" + "TRN1 v16.4S, v4.4S, v4.4S\n" + "FNEG v17.4S, v4.4S\n" + "TRN2 v17.4S, v4.4S, v17.4S\n" + "FADD v4.4S, v16.4S, v17.4S\n" + "TRN1 v16.4S, v5.4S, v5.4S\n" + "FNEG v17.4S, v5.4S\n" + "TRN2 v17.4S, v5.4S, v17.4S\n" + "FADD v5.4S, v16.4S, v17.4S\n" + "TRN1 v16.4S, v6.4S, v6.4S\n" + "FNEG v17.4S, v6.4S\n" + "TRN2 v17.4S, v6.4S, v17.4S\n" + "FADD v6.4S, v16.4S, v17.4S\n" + "TRN1 v16.4S, v7.4S, v7.4S\n" + "FNEG v17.4S, v7.4S\n" + "TRN2 v17.4S, v7.4S, v17.4S\n" + "FADD v7.4S, v16.4S, v17.4S\n" + "TRN1 v16.4S, v8.4S, v8.4S\n" + "FNEG v17.4S, v8.4S\n" + "TRN2 v17.4S, v8.4S, v17.4S\n" + "FADD v8.4S, v16.4S, v17.4S\n" + "TRN1 v16.4S, v9.4S, v9.4S\n" + "FNEG v17.4S, v9.4S\n" + "TRN2 v17.4S, v9.4S, v17.4S\n" + "FADD v9.4S, v16.4S, v17.4S\n" + "TRN1 v16.4S, v10.4S, v10.4S\n" + "FNEG v17.4S, v10.4S\n" + "TRN2 v17.4S, v10.4S, v17.4S\n" + "FADD v10.4S, v16.4S, v17.4S\n" + "TRN1 v16.4S, v11.4S, v11.4S\n" + "FNEG v17.4S, v11.4S\n" + "TRN2 v17.4S, v11.4S, v17.4S\n" + "FADD v11.4S, v16.4S, v17.4S\n" + "TRN1 v16.4S, v12.4S, v12.4S\n" + "FNEG v17.4S, v12.4S\n" + "TRN2 v17.4S, v12.4S, v17.4S\n" + "FADD v12.4S, v16.4S, v17.4S\n" + "TRN1 v16.4S, v13.4S, v13.4S\n" + "FNEG v17.4S, v13.4S\n" + "TRN2 v17.4S, v13.4S, v17.4S\n" + "FADD v13.4S, v16.4S, v17.4S\n" + "TRN1 v16.4S, v14.4S, v14.4S\n" + "FNEG v17.4S, v14.4S\n" + "TRN2 v17.4S, v14.4S, v17.4S\n" + "FADD v14.4S, v16.4S, v17.4S\n" + "TRN1 v16.4S, v15.4S, v15.4S\n" + "FNEG v17.4S, v15.4S\n" + "TRN2 v17.4S, v15.4S, v17.4S\n" + "FADD v15.4S, v16.4S, v17.4S\n" + "DUP v16.2D, v0.D[0]\n" + "FNEG v17.4S, v0.4S\n" + "INS v17.D[0], v0.D[1]\n" + "FADD v0.4S, v16.4S, v17.4S\n" + "DUP v16.2D, v1.D[0]\n" + "FNEG v17.4S, v1.4S\n" + "INS v17.D[0], v1.D[1]\n" + "FADD v1.4S, v16.4S, v17.4S\n" + "DUP v16.2D, v2.D[0]\n" + "FNEG v17.4S, v2.4S\n" + "INS v17.D[0], v2.D[1]\n" + "FADD v2.4S, v16.4S, v17.4S\n" + "DUP v16.2D, v3.D[0]\n" + "FNEG v17.4S, v3.4S\n" + "INS v17.D[0], v3.D[1]\n" + "FADD v3.4S, v16.4S, v17.4S\n" + "DUP v16.2D, v4.D[0]\n" + "FNEG v17.4S, v4.4S\n" + "INS v17.D[0], v4.D[1]\n" + "FADD v4.4S, v16.4S, v17.4S\n" + "DUP v16.2D, v5.D[0]\n" + "FNEG v17.4S, v5.4S\n" + "INS v17.D[0], v5.D[1]\n" + "FADD v5.4S, v16.4S, v17.4S\n" + "DUP v16.2D, v6.D[0]\n" + "FNEG v17.4S, v6.4S\n" + "INS v17.D[0], v6.D[1]\n" + "FADD v6.4S, v16.4S, v17.4S\n" + "DUP v16.2D, v7.D[0]\n" + "FNEG v17.4S, v7.4S\n" + "INS v17.D[0], v7.D[1]\n" + "FADD v7.4S, v16.4S, v17.4S\n" + "DUP v16.2D, v8.D[0]\n" + "FNEG v17.4S, v8.4S\n" + "INS v17.D[0], v8.D[1]\n" + "FADD v8.4S, v16.4S, v17.4S\n" + "DUP v16.2D, v9.D[0]\n" + "FNEG v17.4S, v9.4S\n" + "INS v17.D[0], v9.D[1]\n" + "FADD v9.4S, v16.4S, v17.4S\n" + "DUP v16.2D, v10.D[0]\n" + "FNEG v17.4S, v10.4S\n" + "INS v17.D[0], v10.D[1]\n" + "FADD 
v10.4S, v16.4S, v17.4S\n" + "DUP v16.2D, v11.D[0]\n" + "FNEG v17.4S, v11.4S\n" + "INS v17.D[0], v11.D[1]\n" + "FADD v11.4S, v16.4S, v17.4S\n" + "DUP v16.2D, v12.D[0]\n" + "FNEG v17.4S, v12.4S\n" + "INS v17.D[0], v12.D[1]\n" + "FADD v12.4S, v16.4S, v17.4S\n" + "DUP v16.2D, v13.D[0]\n" + "FNEG v17.4S, v13.4S\n" + "INS v17.D[0], v13.D[1]\n" + "FADD v13.4S, v16.4S, v17.4S\n" + "DUP v16.2D, v14.D[0]\n" + "FNEG v17.4S, v14.4S\n" + "INS v17.D[0], v14.D[1]\n" + "FADD v14.4S, v16.4S, v17.4S\n" + "DUP v16.2D, v15.D[0]\n" + "FNEG v17.4S, v15.4S\n" + "INS v17.D[0], v15.D[1]\n" + "FADD v15.4S, v16.4S, v17.4S\n" + "FADD v16.4S, v0.4S, v1.4S\n" + "FSUB v17.4S, v0.4S, v1.4S\n" + "FADD v18.4S, v2.4S, v3.4S\n" + "FSUB v19.4S, v2.4S, v3.4S\n" + "FADD v20.4S, v4.4S, v5.4S\n" + "FSUB v21.4S, v4.4S, v5.4S\n" + "FADD v22.4S, v6.4S, v7.4S\n" + "FSUB v23.4S, v6.4S, v7.4S\n" + "FADD v24.4S, v8.4S, v9.4S\n" + "FSUB v25.4S, v8.4S, v9.4S\n" + "FADD v26.4S, v10.4S, v11.4S\n" + "FSUB v27.4S, v10.4S, v11.4S\n" + "FADD v28.4S, v12.4S, v13.4S\n" + "FSUB v29.4S, v12.4S, v13.4S\n" + "FADD v30.4S, v14.4S, v15.4S\n" + "FSUB v31.4S, v14.4S, v15.4S\n" + "FADD v0.4S, v16.4S, v18.4S\n" + "FSUB v2.4S, v16.4S, v18.4S\n" + "FADD v1.4S, v17.4S, v19.4S\n" + "FSUB v3.4S, v17.4S, v19.4S\n" + "FADD v4.4S, v20.4S, v22.4S\n" + "FSUB v6.4S, v20.4S, v22.4S\n" + "FADD v5.4S, v21.4S, v23.4S\n" + "FSUB v7.4S, v21.4S, v23.4S\n" + "FADD v8.4S, v24.4S, v26.4S\n" + "FSUB v10.4S, v24.4S, v26.4S\n" + "FADD v9.4S, v25.4S, v27.4S\n" + "FSUB v11.4S, v25.4S, v27.4S\n" + "FADD v12.4S, v28.4S, v30.4S\n" + "FSUB v14.4S, v28.4S, v30.4S\n" + "FADD v13.4S, v29.4S, v31.4S\n" + "FSUB v15.4S, v29.4S, v31.4S\n" + "FADD v16.4S, v0.4S, v4.4S\n" + "FSUB v20.4S, v0.4S, v4.4S\n" + "FADD v17.4S, v1.4S, v5.4S\n" + "FSUB v21.4S, v1.4S, v5.4S\n" + "FADD v18.4S, v2.4S, v6.4S\n" + "FSUB v22.4S, v2.4S, v6.4S\n" + "FADD v19.4S, v3.4S, v7.4S\n" + "FSUB v23.4S, v3.4S, v7.4S\n" + "FADD v24.4S, v8.4S, v12.4S\n" + "FSUB v28.4S, v8.4S, v12.4S\n" + "FADD v25.4S, v9.4S, v13.4S\n" + "FSUB v29.4S, v9.4S, v13.4S\n" + "FADD v26.4S, v10.4S, v14.4S\n" + "FSUB v30.4S, v10.4S, v14.4S\n" + "FADD v27.4S, v11.4S, v15.4S\n" + "FSUB v31.4S, v11.4S, v15.4S\n" + "FADD v0.4S, v16.4S, v24.4S\n" + "FSUB v8.4S, v16.4S, v24.4S\n" + "FADD v1.4S, v17.4S, v25.4S\n" + "FSUB v9.4S, v17.4S, v25.4S\n" + "FADD v2.4S, v18.4S, v26.4S\n" + "FSUB v10.4S, v18.4S, v26.4S\n" + "FADD v3.4S, v19.4S, v27.4S\n" + "FSUB v11.4S, v19.4S, v27.4S\n" + "FADD v4.4S, v20.4S, v28.4S\n" + "FSUB v12.4S, v20.4S, v28.4S\n" + "FADD v5.4S, v21.4S, v29.4S\n" + "FSUB v13.4S, v21.4S, v29.4S\n" + "FADD v6.4S, v22.4S, v30.4S\n" + "FSUB v14.4S, v22.4S, v30.4S\n" + "FADD v7.4S, v23.4S, v31.4S\n" + "FSUB v15.4S, v23.4S, v31.4S\n" + "ST1 {v0.4S}, [%0]\n" + "ST1 {v1.4S}, [%1]\n" + "ST1 {v2.4S}, [%2]\n" + "ST1 {v3.4S}, [%3]\n" + "ST1 {v4.4S}, [%4]\n" + "ST1 {v5.4S}, [%5]\n" + "ST1 {v6.4S}, [%6]\n" + "ST1 {v7.4S}, [%7]\n" + "ST1 {v8.4S}, [%8]\n" + "ST1 {v9.4S}, [%9]\n" + "ST1 {v10.4S}, [%10]\n" + "ST1 {v11.4S}, [%11]\n" + "ST1 {v12.4S}, [%12]\n" + "ST1 {v13.4S}, [%13]\n" + "ST1 {v14.4S}, [%14]\n" + "ST1 {v15.4S}, [%15]\n" ::"r"(buf + j + k + 0), + "r"(buf + j + k + 4), + "r"(buf + j + k + 8), + "r"(buf + j + k + 12), + "r"(buf + j + k + 16), + "r"(buf + j + k + 20), + "r"(buf + j + k + 24), + "r"(buf + j + k + 28), + "r"(buf + j + k + 32), + "r"(buf + j + k + 36), + "r"(buf + j + k + 40), + "r"(buf + j + k + 44), + "r"(buf + j + k + 48), + "r"(buf + j + k + 52), + "r"(buf + j + k + 56), + "r"(buf + j + k + 60) + : "%v0", + "%v1", + "%v2", + "%v3", + 
"%v4", + "%v5", + "%v6", + "%v7", + "%v8", + "%v9", + "%v10", + "%v11", + "%v12", + "%v13", + "%v14", + "%v15", + "%v16", + "%v17", + "%v18", + "%v19", + "%v20", + "%v21", + "%v22", + "%v23", + "%v24", + "%v25", + "%v26", + "%v27", + "%v28", + "%v29", + "%v30", + "%v31", + "memory"); + } + } + for (int j = 0; j < 256; j += 256) { + for (int k = 0; k < 64; k += 4) { + __asm__ volatile( + "LD1 {v0.4S}, [%0]\n" + "LD1 {v1.4S}, [%1]\n" + "LD1 {v2.4S}, [%2]\n" + "LD1 {v3.4S}, [%3]\n" + "FADD v16.4S, v0.4S, v1.4S\n" + "FSUB v17.4S, v0.4S, v1.4S\n" + "FADD v18.4S, v2.4S, v3.4S\n" + "FSUB v19.4S, v2.4S, v3.4S\n" + "FADD v0.4S, v16.4S, v18.4S\n" + "FSUB v2.4S, v16.4S, v18.4S\n" + "FADD v1.4S, v17.4S, v19.4S\n" + "FSUB v3.4S, v17.4S, v19.4S\n" + "ST1 {v0.4S}, [%0]\n" + "ST1 {v1.4S}, [%1]\n" + "ST1 {v2.4S}, [%2]\n" + "ST1 {v3.4S}, [%3]\n" ::"r"(buf + j + k + 0), + "r"(buf + j + k + 64), + "r"(buf + j + k + 128), + "r"(buf + j + k + 192) + : "%v0", + "%v1", + "%v2", + "%v3", + "%v4", + "%v5", + "%v6", + "%v7", + "%v8", + "%v9", + "%v10", + "%v11", + "%v12", + "%v13", + "%v14", + "%v15", + "%v16", + "%v17", + "%v18", + "%v19", + "%v20", + "%v21", + "%v22", + "%v23", + "%v24", + "%v25", + "%v26", + "%v27", + "%v28", + "%v29", + "%v30", + "%v31", + "memory"); + } + } +} +void helper_float_9_recursive(float* buf, int depth); +void helper_float_9_recursive(float* buf, int depth) { + if (depth == 8) { + helper_float_8(buf); + return; + } + if (depth == 9) { + helper_float_9_recursive(buf + 0, 8); + helper_float_9_recursive(buf + 256, 8); + for (int j = 0; j < 512; j += 512) { + for (int k = 0; k < 256; k += 4) { + __asm__ volatile( + "LD1 {v0.4S}, [%0]\n" + "LD1 {v1.4S}, [%1]\n" + "FADD v16.4S, v0.4S, v1.4S\n" + "FSUB v17.4S, v0.4S, v1.4S\n" + "ST1 {v16.4S}, [%0]\n" + "ST1 {v17.4S}, [%1]\n" ::"r"(buf + j + k + 0), + "r"(buf + j + k + 256) + : "%v0", + "%v1", + "%v2", + "%v3", + "%v4", + "%v5", + "%v6", + "%v7", + "%v8", + "%v9", + "%v10", + "%v11", + "%v12", + "%v13", + "%v14", + "%v15", + "%v16", + "%v17", + "%v18", + "%v19", + "%v20", + "%v21", + "%v22", + "%v23", + "%v24", + "%v25", + "%v26", + "%v27", + "%v28", + "%v29", + "%v30", + "%v31", + "memory"); + } + } + return; + } +} +void helper_float_9(float* buf); +void helper_float_9(float* buf) { + helper_float_9_recursive(buf, 9); +} +void helper_float_10_recursive(float* buf, int depth); +void helper_float_10_recursive(float* buf, int depth) { + if (depth == 8) { + helper_float_8(buf); + return; + } + if (depth == 10) { + helper_float_10_recursive(buf + 0, 8); + helper_float_10_recursive(buf + 256, 8); + helper_float_10_recursive(buf + 512, 8); + helper_float_10_recursive(buf + 768, 8); + for (int j = 0; j < 1024; j += 1024) { + for (int k = 0; k < 256; k += 4) { + __asm__ volatile( + "LD1 {v0.4S}, [%0]\n" + "LD1 {v1.4S}, [%1]\n" + "LD1 {v2.4S}, [%2]\n" + "LD1 {v3.4S}, [%3]\n" + "FADD v16.4S, v0.4S, v1.4S\n" + "FSUB v17.4S, v0.4S, v1.4S\n" + "FADD v18.4S, v2.4S, v3.4S\n" + "FSUB v19.4S, v2.4S, v3.4S\n" + "FADD v0.4S, v16.4S, v18.4S\n" + "FSUB v2.4S, v16.4S, v18.4S\n" + "FADD v1.4S, v17.4S, v19.4S\n" + "FSUB v3.4S, v17.4S, v19.4S\n" + "ST1 {v0.4S}, [%0]\n" + "ST1 {v1.4S}, [%1]\n" + "ST1 {v2.4S}, [%2]\n" + "ST1 {v3.4S}, [%3]\n" ::"r"(buf + j + k + 0), + "r"(buf + j + k + 256), + "r"(buf + j + k + 512), + "r"(buf + j + k + 768) + : "%v0", + "%v1", + "%v2", + "%v3", + "%v4", + "%v5", + "%v6", + "%v7", + "%v8", + "%v9", + "%v10", + "%v11", + "%v12", + "%v13", + "%v14", + "%v15", + "%v16", + "%v17", + "%v18", + "%v19", + "%v20", + "%v21", + "%v22", + "%v23", 
+ "%v24", + "%v25", + "%v26", + "%v27", + "%v28", + "%v29", + "%v30", + "%v31", + "memory"); + } + } + return; + } +} +void helper_float_10(float* buf); +void helper_float_10(float* buf) { + helper_float_10_recursive(buf, 10); +} +void helper_float_11_recursive(float* buf, int depth); +void helper_float_11_recursive(float* buf, int depth) { + if (depth == 10) { + helper_float_10(buf); + return; + } + if (depth == 11) { + helper_float_11_recursive(buf + 0, 10); + helper_float_11_recursive(buf + 1024, 10); + for (int j = 0; j < 2048; j += 2048) { + for (int k = 0; k < 1024; k += 4) { + __asm__ volatile( + "LD1 {v0.4S}, [%0]\n" + "LD1 {v1.4S}, [%1]\n" + "FADD v16.4S, v0.4S, v1.4S\n" + "FSUB v17.4S, v0.4S, v1.4S\n" + "ST1 {v16.4S}, [%0]\n" + "ST1 {v17.4S}, [%1]\n" ::"r"(buf + j + k + 0), + "r"(buf + j + k + 1024) + : "%v0", + "%v1", + "%v2", + "%v3", + "%v4", + "%v5", + "%v6", + "%v7", + "%v8", + "%v9", + "%v10", + "%v11", + "%v12", + "%v13", + "%v14", + "%v15", + "%v16", + "%v17", + "%v18", + "%v19", + "%v20", + "%v21", + "%v22", + "%v23", + "%v24", + "%v25", + "%v26", + "%v27", + "%v28", + "%v29", + "%v30", + "%v31", + "memory"); + } + } + return; + } +} +void helper_float_11(float* buf); +void helper_float_11(float* buf) { + helper_float_11_recursive(buf, 11); +} +void helper_float_12_recursive(float* buf, int depth); +void helper_float_12_recursive(float* buf, int depth) { + if (depth == 10) { + helper_float_10(buf); + return; + } + if (depth == 12) { + helper_float_12_recursive(buf + 0, 10); + helper_float_12_recursive(buf + 1024, 10); + helper_float_12_recursive(buf + 2048, 10); + helper_float_12_recursive(buf + 3072, 10); + for (int j = 0; j < 4096; j += 4096) { + for (int k = 0; k < 1024; k += 4) { + __asm__ volatile( + "LD1 {v0.4S}, [%0]\n" + "LD1 {v1.4S}, [%1]\n" + "LD1 {v2.4S}, [%2]\n" + "LD1 {v3.4S}, [%3]\n" + "FADD v16.4S, v0.4S, v1.4S\n" + "FSUB v17.4S, v0.4S, v1.4S\n" + "FADD v18.4S, v2.4S, v3.4S\n" + "FSUB v19.4S, v2.4S, v3.4S\n" + "FADD v0.4S, v16.4S, v18.4S\n" + "FSUB v2.4S, v16.4S, v18.4S\n" + "FADD v1.4S, v17.4S, v19.4S\n" + "FSUB v3.4S, v17.4S, v19.4S\n" + "ST1 {v0.4S}, [%0]\n" + "ST1 {v1.4S}, [%1]\n" + "ST1 {v2.4S}, [%2]\n" + "ST1 {v3.4S}, [%3]\n" ::"r"(buf + j + k + 0), + "r"(buf + j + k + 1024), + "r"(buf + j + k + 2048), + "r"(buf + j + k + 3072) + : "%v0", + "%v1", + "%v2", + "%v3", + "%v4", + "%v5", + "%v6", + "%v7", + "%v8", + "%v9", + "%v10", + "%v11", + "%v12", + "%v13", + "%v14", + "%v15", + "%v16", + "%v17", + "%v18", + "%v19", + "%v20", + "%v21", + "%v22", + "%v23", + "%v24", + "%v25", + "%v26", + "%v27", + "%v28", + "%v29", + "%v30", + "%v31", + "memory"); + } + } + return; + } +} +void helper_float_12(float* buf); +void helper_float_12(float* buf) { + helper_float_12_recursive(buf, 12); +} +static inline void helper_float_13(float* buf); +static inline void helper_float_13(float* buf) { + for (int j = 0; j < 8192; j += 64) { + for (int k = 0; k < 4; k += 4) { + __asm__ volatile( + "LD1 {v0.4S}, [%0]\n" + "LD1 {v1.4S}, [%1]\n" + "LD1 {v2.4S}, [%2]\n" + "LD1 {v3.4S}, [%3]\n" + "LD1 {v4.4S}, [%4]\n" + "LD1 {v5.4S}, [%5]\n" + "LD1 {v6.4S}, [%6]\n" + "LD1 {v7.4S}, [%7]\n" + "LD1 {v8.4S}, [%8]\n" + "LD1 {v9.4S}, [%9]\n" + "LD1 {v10.4S}, [%10]\n" + "LD1 {v11.4S}, [%11]\n" + "LD1 {v12.4S}, [%12]\n" + "LD1 {v13.4S}, [%13]\n" + "LD1 {v14.4S}, [%14]\n" + "LD1 {v15.4S}, [%15]\n" + "TRN1 v16.4S, v0.4S, v0.4S\n" + "FNEG v17.4S, v0.4S\n" + "TRN2 v17.4S, v0.4S, v17.4S\n" + "FADD v0.4S, v16.4S, v17.4S\n" + "TRN1 v16.4S, v1.4S, v1.4S\n" + "FNEG v17.4S, v1.4S\n" + "TRN2 v17.4S, 
v1.4S, v17.4S\n" + "FADD v1.4S, v16.4S, v17.4S\n" + "TRN1 v16.4S, v2.4S, v2.4S\n" + "FNEG v17.4S, v2.4S\n" + "TRN2 v17.4S, v2.4S, v17.4S\n" + "FADD v2.4S, v16.4S, v17.4S\n" + "TRN1 v16.4S, v3.4S, v3.4S\n" + "FNEG v17.4S, v3.4S\n" + "TRN2 v17.4S, v3.4S, v17.4S\n" + "FADD v3.4S, v16.4S, v17.4S\n" + "TRN1 v16.4S, v4.4S, v4.4S\n" + "FNEG v17.4S, v4.4S\n" + "TRN2 v17.4S, v4.4S, v17.4S\n" + "FADD v4.4S, v16.4S, v17.4S\n" + "TRN1 v16.4S, v5.4S, v5.4S\n" + "FNEG v17.4S, v5.4S\n" + "TRN2 v17.4S, v5.4S, v17.4S\n" + "FADD v5.4S, v16.4S, v17.4S\n" + "TRN1 v16.4S, v6.4S, v6.4S\n" + "FNEG v17.4S, v6.4S\n" + "TRN2 v17.4S, v6.4S, v17.4S\n" + "FADD v6.4S, v16.4S, v17.4S\n" + "TRN1 v16.4S, v7.4S, v7.4S\n" + "FNEG v17.4S, v7.4S\n" + "TRN2 v17.4S, v7.4S, v17.4S\n" + "FADD v7.4S, v16.4S, v17.4S\n" + "TRN1 v16.4S, v8.4S, v8.4S\n" + "FNEG v17.4S, v8.4S\n" + "TRN2 v17.4S, v8.4S, v17.4S\n" + "FADD v8.4S, v16.4S, v17.4S\n" + "TRN1 v16.4S, v9.4S, v9.4S\n" + "FNEG v17.4S, v9.4S\n" + "TRN2 v17.4S, v9.4S, v17.4S\n" + "FADD v9.4S, v16.4S, v17.4S\n" + "TRN1 v16.4S, v10.4S, v10.4S\n" + "FNEG v17.4S, v10.4S\n" + "TRN2 v17.4S, v10.4S, v17.4S\n" + "FADD v10.4S, v16.4S, v17.4S\n" + "TRN1 v16.4S, v11.4S, v11.4S\n" + "FNEG v17.4S, v11.4S\n" + "TRN2 v17.4S, v11.4S, v17.4S\n" + "FADD v11.4S, v16.4S, v17.4S\n" + "TRN1 v16.4S, v12.4S, v12.4S\n" + "FNEG v17.4S, v12.4S\n" + "TRN2 v17.4S, v12.4S, v17.4S\n" + "FADD v12.4S, v16.4S, v17.4S\n" + "TRN1 v16.4S, v13.4S, v13.4S\n" + "FNEG v17.4S, v13.4S\n" + "TRN2 v17.4S, v13.4S, v17.4S\n" + "FADD v13.4S, v16.4S, v17.4S\n" + "TRN1 v16.4S, v14.4S, v14.4S\n" + "FNEG v17.4S, v14.4S\n" + "TRN2 v17.4S, v14.4S, v17.4S\n" + "FADD v14.4S, v16.4S, v17.4S\n" + "TRN1 v16.4S, v15.4S, v15.4S\n" + "FNEG v17.4S, v15.4S\n" + "TRN2 v17.4S, v15.4S, v17.4S\n" + "FADD v15.4S, v16.4S, v17.4S\n" + "DUP v16.2D, v0.D[0]\n" + "FNEG v17.4S, v0.4S\n" + "INS v17.D[0], v0.D[1]\n" + "FADD v0.4S, v16.4S, v17.4S\n" + "DUP v16.2D, v1.D[0]\n" + "FNEG v17.4S, v1.4S\n" + "INS v17.D[0], v1.D[1]\n" + "FADD v1.4S, v16.4S, v17.4S\n" + "DUP v16.2D, v2.D[0]\n" + "FNEG v17.4S, v2.4S\n" + "INS v17.D[0], v2.D[1]\n" + "FADD v2.4S, v16.4S, v17.4S\n" + "DUP v16.2D, v3.D[0]\n" + "FNEG v17.4S, v3.4S\n" + "INS v17.D[0], v3.D[1]\n" + "FADD v3.4S, v16.4S, v17.4S\n" + "DUP v16.2D, v4.D[0]\n" + "FNEG v17.4S, v4.4S\n" + "INS v17.D[0], v4.D[1]\n" + "FADD v4.4S, v16.4S, v17.4S\n" + "DUP v16.2D, v5.D[0]\n" + "FNEG v17.4S, v5.4S\n" + "INS v17.D[0], v5.D[1]\n" + "FADD v5.4S, v16.4S, v17.4S\n" + "DUP v16.2D, v6.D[0]\n" + "FNEG v17.4S, v6.4S\n" + "INS v17.D[0], v6.D[1]\n" + "FADD v6.4S, v16.4S, v17.4S\n" + "DUP v16.2D, v7.D[0]\n" + "FNEG v17.4S, v7.4S\n" + "INS v17.D[0], v7.D[1]\n" + "FADD v7.4S, v16.4S, v17.4S\n" + "DUP v16.2D, v8.D[0]\n" + "FNEG v17.4S, v8.4S\n" + "INS v17.D[0], v8.D[1]\n" + "FADD v8.4S, v16.4S, v17.4S\n" + "DUP v16.2D, v9.D[0]\n" + "FNEG v17.4S, v9.4S\n" + "INS v17.D[0], v9.D[1]\n" + "FADD v9.4S, v16.4S, v17.4S\n" + "DUP v16.2D, v10.D[0]\n" + "FNEG v17.4S, v10.4S\n" + "INS v17.D[0], v10.D[1]\n" + "FADD v10.4S, v16.4S, v17.4S\n" + "DUP v16.2D, v11.D[0]\n" + "FNEG v17.4S, v11.4S\n" + "INS v17.D[0], v11.D[1]\n" + "FADD v11.4S, v16.4S, v17.4S\n" + "DUP v16.2D, v12.D[0]\n" + "FNEG v17.4S, v12.4S\n" + "INS v17.D[0], v12.D[1]\n" + "FADD v12.4S, v16.4S, v17.4S\n" + "DUP v16.2D, v13.D[0]\n" + "FNEG v17.4S, v13.4S\n" + "INS v17.D[0], v13.D[1]\n" + "FADD v13.4S, v16.4S, v17.4S\n" + "DUP v16.2D, v14.D[0]\n" + "FNEG v17.4S, v14.4S\n" + "INS v17.D[0], v14.D[1]\n" + "FADD v14.4S, v16.4S, v17.4S\n" + "DUP v16.2D, v15.D[0]\n" + "FNEG v17.4S, v15.4S\n" 
+ "INS v17.D[0], v15.D[1]\n" + "FADD v15.4S, v16.4S, v17.4S\n" + "FADD v16.4S, v0.4S, v1.4S\n" + "FSUB v17.4S, v0.4S, v1.4S\n" + "FADD v18.4S, v2.4S, v3.4S\n" + "FSUB v19.4S, v2.4S, v3.4S\n" + "FADD v20.4S, v4.4S, v5.4S\n" + "FSUB v21.4S, v4.4S, v5.4S\n" + "FADD v22.4S, v6.4S, v7.4S\n" + "FSUB v23.4S, v6.4S, v7.4S\n" + "FADD v24.4S, v8.4S, v9.4S\n" + "FSUB v25.4S, v8.4S, v9.4S\n" + "FADD v26.4S, v10.4S, v11.4S\n" + "FSUB v27.4S, v10.4S, v11.4S\n" + "FADD v28.4S, v12.4S, v13.4S\n" + "FSUB v29.4S, v12.4S, v13.4S\n" + "FADD v30.4S, v14.4S, v15.4S\n" + "FSUB v31.4S, v14.4S, v15.4S\n" + "FADD v0.4S, v16.4S, v18.4S\n" + "FSUB v2.4S, v16.4S, v18.4S\n" + "FADD v1.4S, v17.4S, v19.4S\n" + "FSUB v3.4S, v17.4S, v19.4S\n" + "FADD v4.4S, v20.4S, v22.4S\n" + "FSUB v6.4S, v20.4S, v22.4S\n" + "FADD v5.4S, v21.4S, v23.4S\n" + "FSUB v7.4S, v21.4S, v23.4S\n" + "FADD v8.4S, v24.4S, v26.4S\n" + "FSUB v10.4S, v24.4S, v26.4S\n" + "FADD v9.4S, v25.4S, v27.4S\n" + "FSUB v11.4S, v25.4S, v27.4S\n" + "FADD v12.4S, v28.4S, v30.4S\n" + "FSUB v14.4S, v28.4S, v30.4S\n" + "FADD v13.4S, v29.4S, v31.4S\n" + "FSUB v15.4S, v29.4S, v31.4S\n" + "FADD v16.4S, v0.4S, v4.4S\n" + "FSUB v20.4S, v0.4S, v4.4S\n" + "FADD v17.4S, v1.4S, v5.4S\n" + "FSUB v21.4S, v1.4S, v5.4S\n" + "FADD v18.4S, v2.4S, v6.4S\n" + "FSUB v22.4S, v2.4S, v6.4S\n" + "FADD v19.4S, v3.4S, v7.4S\n" + "FSUB v23.4S, v3.4S, v7.4S\n" + "FADD v24.4S, v8.4S, v12.4S\n" + "FSUB v28.4S, v8.4S, v12.4S\n" + "FADD v25.4S, v9.4S, v13.4S\n" + "FSUB v29.4S, v9.4S, v13.4S\n" + "FADD v26.4S, v10.4S, v14.4S\n" + "FSUB v30.4S, v10.4S, v14.4S\n" + "FADD v27.4S, v11.4S, v15.4S\n" + "FSUB v31.4S, v11.4S, v15.4S\n" + "FADD v0.4S, v16.4S, v24.4S\n" + "FSUB v8.4S, v16.4S, v24.4S\n" + "FADD v1.4S, v17.4S, v25.4S\n" + "FSUB v9.4S, v17.4S, v25.4S\n" + "FADD v2.4S, v18.4S, v26.4S\n" + "FSUB v10.4S, v18.4S, v26.4S\n" + "FADD v3.4S, v19.4S, v27.4S\n" + "FSUB v11.4S, v19.4S, v27.4S\n" + "FADD v4.4S, v20.4S, v28.4S\n" + "FSUB v12.4S, v20.4S, v28.4S\n" + "FADD v5.4S, v21.4S, v29.4S\n" + "FSUB v13.4S, v21.4S, v29.4S\n" + "FADD v6.4S, v22.4S, v30.4S\n" + "FSUB v14.4S, v22.4S, v30.4S\n" + "FADD v7.4S, v23.4S, v31.4S\n" + "FSUB v15.4S, v23.4S, v31.4S\n" + "ST1 {v0.4S}, [%0]\n" + "ST1 {v1.4S}, [%1]\n" + "ST1 {v2.4S}, [%2]\n" + "ST1 {v3.4S}, [%3]\n" + "ST1 {v4.4S}, [%4]\n" + "ST1 {v5.4S}, [%5]\n" + "ST1 {v6.4S}, [%6]\n" + "ST1 {v7.4S}, [%7]\n" + "ST1 {v8.4S}, [%8]\n" + "ST1 {v9.4S}, [%9]\n" + "ST1 {v10.4S}, [%10]\n" + "ST1 {v11.4S}, [%11]\n" + "ST1 {v12.4S}, [%12]\n" + "ST1 {v13.4S}, [%13]\n" + "ST1 {v14.4S}, [%14]\n" + "ST1 {v15.4S}, [%15]\n" ::"r"(buf + j + k + 0), + "r"(buf + j + k + 4), + "r"(buf + j + k + 8), + "r"(buf + j + k + 12), + "r"(buf + j + k + 16), + "r"(buf + j + k + 20), + "r"(buf + j + k + 24), + "r"(buf + j + k + 28), + "r"(buf + j + k + 32), + "r"(buf + j + k + 36), + "r"(buf + j + k + 40), + "r"(buf + j + k + 44), + "r"(buf + j + k + 48), + "r"(buf + j + k + 52), + "r"(buf + j + k + 56), + "r"(buf + j + k + 60) + : "%v0", + "%v1", + "%v2", + "%v3", + "%v4", + "%v5", + "%v6", + "%v7", + "%v8", + "%v9", + "%v10", + "%v11", + "%v12", + "%v13", + "%v14", + "%v15", + "%v16", + "%v17", + "%v18", + "%v19", + "%v20", + "%v21", + "%v22", + "%v23", + "%v24", + "%v25", + "%v26", + "%v27", + "%v28", + "%v29", + "%v30", + "%v31", + "memory"); + } + } + for (int j = 0; j < 8192; j += 1024) { + for (int k = 0; k < 64; k += 4) { + __asm__ volatile( + "LD1 {v0.4S}, [%0]\n" + "LD1 {v1.4S}, [%1]\n" + "LD1 {v2.4S}, [%2]\n" + "LD1 {v3.4S}, [%3]\n" + "LD1 {v4.4S}, [%4]\n" + "LD1 {v5.4S}, [%5]\n" + "LD1 
{v6.4S}, [%6]\n" + "LD1 {v7.4S}, [%7]\n" + "LD1 {v8.4S}, [%8]\n" + "LD1 {v9.4S}, [%9]\n" + "LD1 {v10.4S}, [%10]\n" + "LD1 {v11.4S}, [%11]\n" + "LD1 {v12.4S}, [%12]\n" + "LD1 {v13.4S}, [%13]\n" + "LD1 {v14.4S}, [%14]\n" + "LD1 {v15.4S}, [%15]\n" + "FADD v16.4S, v0.4S, v1.4S\n" + "FSUB v17.4S, v0.4S, v1.4S\n" + "FADD v18.4S, v2.4S, v3.4S\n" + "FSUB v19.4S, v2.4S, v3.4S\n" + "FADD v20.4S, v4.4S, v5.4S\n" + "FSUB v21.4S, v4.4S, v5.4S\n" + "FADD v22.4S, v6.4S, v7.4S\n" + "FSUB v23.4S, v6.4S, v7.4S\n" + "FADD v24.4S, v8.4S, v9.4S\n" + "FSUB v25.4S, v8.4S, v9.4S\n" + "FADD v26.4S, v10.4S, v11.4S\n" + "FSUB v27.4S, v10.4S, v11.4S\n" + "FADD v28.4S, v12.4S, v13.4S\n" + "FSUB v29.4S, v12.4S, v13.4S\n" + "FADD v30.4S, v14.4S, v15.4S\n" + "FSUB v31.4S, v14.4S, v15.4S\n" + "FADD v0.4S, v16.4S, v18.4S\n" + "FSUB v2.4S, v16.4S, v18.4S\n" + "FADD v1.4S, v17.4S, v19.4S\n" + "FSUB v3.4S, v17.4S, v19.4S\n" + "FADD v4.4S, v20.4S, v22.4S\n" + "FSUB v6.4S, v20.4S, v22.4S\n" + "FADD v5.4S, v21.4S, v23.4S\n" + "FSUB v7.4S, v21.4S, v23.4S\n" + "FADD v8.4S, v24.4S, v26.4S\n" + "FSUB v10.4S, v24.4S, v26.4S\n" + "FADD v9.4S, v25.4S, v27.4S\n" + "FSUB v11.4S, v25.4S, v27.4S\n" + "FADD v12.4S, v28.4S, v30.4S\n" + "FSUB v14.4S, v28.4S, v30.4S\n" + "FADD v13.4S, v29.4S, v31.4S\n" + "FSUB v15.4S, v29.4S, v31.4S\n" + "FADD v16.4S, v0.4S, v4.4S\n" + "FSUB v20.4S, v0.4S, v4.4S\n" + "FADD v17.4S, v1.4S, v5.4S\n" + "FSUB v21.4S, v1.4S, v5.4S\n" + "FADD v18.4S, v2.4S, v6.4S\n" + "FSUB v22.4S, v2.4S, v6.4S\n" + "FADD v19.4S, v3.4S, v7.4S\n" + "FSUB v23.4S, v3.4S, v7.4S\n" + "FADD v24.4S, v8.4S, v12.4S\n" + "FSUB v28.4S, v8.4S, v12.4S\n" + "FADD v25.4S, v9.4S, v13.4S\n" + "FSUB v29.4S, v9.4S, v13.4S\n" + "FADD v26.4S, v10.4S, v14.4S\n" + "FSUB v30.4S, v10.4S, v14.4S\n" + "FADD v27.4S, v11.4S, v15.4S\n" + "FSUB v31.4S, v11.4S, v15.4S\n" + "FADD v0.4S, v16.4S, v24.4S\n" + "FSUB v8.4S, v16.4S, v24.4S\n" + "FADD v1.4S, v17.4S, v25.4S\n" + "FSUB v9.4S, v17.4S, v25.4S\n" + "FADD v2.4S, v18.4S, v26.4S\n" + "FSUB v10.4S, v18.4S, v26.4S\n" + "FADD v3.4S, v19.4S, v27.4S\n" + "FSUB v11.4S, v19.4S, v27.4S\n" + "FADD v4.4S, v20.4S, v28.4S\n" + "FSUB v12.4S, v20.4S, v28.4S\n" + "FADD v5.4S, v21.4S, v29.4S\n" + "FSUB v13.4S, v21.4S, v29.4S\n" + "FADD v6.4S, v22.4S, v30.4S\n" + "FSUB v14.4S, v22.4S, v30.4S\n" + "FADD v7.4S, v23.4S, v31.4S\n" + "FSUB v15.4S, v23.4S, v31.4S\n" + "ST1 {v0.4S}, [%0]\n" + "ST1 {v1.4S}, [%1]\n" + "ST1 {v2.4S}, [%2]\n" + "ST1 {v3.4S}, [%3]\n" + "ST1 {v4.4S}, [%4]\n" + "ST1 {v5.4S}, [%5]\n" + "ST1 {v6.4S}, [%6]\n" + "ST1 {v7.4S}, [%7]\n" + "ST1 {v8.4S}, [%8]\n" + "ST1 {v9.4S}, [%9]\n" + "ST1 {v10.4S}, [%10]\n" + "ST1 {v11.4S}, [%11]\n" + "ST1 {v12.4S}, [%12]\n" + "ST1 {v13.4S}, [%13]\n" + "ST1 {v14.4S}, [%14]\n" + "ST1 {v15.4S}, [%15]\n" ::"r"(buf + j + k + 0), + "r"(buf + j + k + 64), + "r"(buf + j + k + 128), + "r"(buf + j + k + 192), + "r"(buf + j + k + 256), + "r"(buf + j + k + 320), + "r"(buf + j + k + 384), + "r"(buf + j + k + 448), + "r"(buf + j + k + 512), + "r"(buf + j + k + 576), + "r"(buf + j + k + 640), + "r"(buf + j + k + 704), + "r"(buf + j + k + 768), + "r"(buf + j + k + 832), + "r"(buf + j + k + 896), + "r"(buf + j + k + 960) + : "%v0", + "%v1", + "%v2", + "%v3", + "%v4", + "%v5", + "%v6", + "%v7", + "%v8", + "%v9", + "%v10", + "%v11", + "%v12", + "%v13", + "%v14", + "%v15", + "%v16", + "%v17", + "%v18", + "%v19", + "%v20", + "%v21", + "%v22", + "%v23", + "%v24", + "%v25", + "%v26", + "%v27", + "%v28", + "%v29", + "%v30", + "%v31", + "memory"); + } + } + for (int j = 0; j < 8192; j += 8192) { + for (int 
k = 0; k < 1024; k += 4) { + __asm__ volatile( + "LD1 {v0.4S}, [%0]\n" + "LD1 {v1.4S}, [%1]\n" + "LD1 {v2.4S}, [%2]\n" + "LD1 {v3.4S}, [%3]\n" + "LD1 {v4.4S}, [%4]\n" + "LD1 {v5.4S}, [%5]\n" + "LD1 {v6.4S}, [%6]\n" + "LD1 {v7.4S}, [%7]\n" + "FADD v16.4S, v0.4S, v1.4S\n" + "FSUB v17.4S, v0.4S, v1.4S\n" + "FADD v18.4S, v2.4S, v3.4S\n" + "FSUB v19.4S, v2.4S, v3.4S\n" + "FADD v20.4S, v4.4S, v5.4S\n" + "FSUB v21.4S, v4.4S, v5.4S\n" + "FADD v22.4S, v6.4S, v7.4S\n" + "FSUB v23.4S, v6.4S, v7.4S\n" + "FADD v0.4S, v16.4S, v18.4S\n" + "FSUB v2.4S, v16.4S, v18.4S\n" + "FADD v1.4S, v17.4S, v19.4S\n" + "FSUB v3.4S, v17.4S, v19.4S\n" + "FADD v4.4S, v20.4S, v22.4S\n" + "FSUB v6.4S, v20.4S, v22.4S\n" + "FADD v5.4S, v21.4S, v23.4S\n" + "FSUB v7.4S, v21.4S, v23.4S\n" + "FADD v16.4S, v0.4S, v4.4S\n" + "FSUB v20.4S, v0.4S, v4.4S\n" + "FADD v17.4S, v1.4S, v5.4S\n" + "FSUB v21.4S, v1.4S, v5.4S\n" + "FADD v18.4S, v2.4S, v6.4S\n" + "FSUB v22.4S, v2.4S, v6.4S\n" + "FADD v19.4S, v3.4S, v7.4S\n" + "FSUB v23.4S, v3.4S, v7.4S\n" + "ST1 {v16.4S}, [%0]\n" + "ST1 {v17.4S}, [%1]\n" + "ST1 {v18.4S}, [%2]\n" + "ST1 {v19.4S}, [%3]\n" + "ST1 {v20.4S}, [%4]\n" + "ST1 {v21.4S}, [%5]\n" + "ST1 {v22.4S}, [%6]\n" + "ST1 {v23.4S}, [%7]\n" ::"r"(buf + j + k + 0), + "r"(buf + j + k + 1024), + "r"(buf + j + k + 2048), + "r"(buf + j + k + 3072), + "r"(buf + j + k + 4096), + "r"(buf + j + k + 5120), + "r"(buf + j + k + 6144), + "r"(buf + j + k + 7168) + : "%v0", + "%v1", + "%v2", + "%v3", + "%v4", + "%v5", + "%v6", + "%v7", + "%v8", + "%v9", + "%v10", + "%v11", + "%v12", + "%v13", + "%v14", + "%v15", + "%v16", + "%v17", + "%v18", + "%v19", + "%v20", + "%v21", + "%v22", + "%v23", + "%v24", + "%v25", + "%v26", + "%v27", + "%v28", + "%v29", + "%v30", + "%v31", + "memory"); + } + } +} +void helper_float_14_recursive(float* buf, int depth); +void helper_float_14_recursive(float* buf, int depth) { + if (depth == 10) { + helper_float_10(buf); + return; + } + if (depth == 14) { + helper_float_14_recursive(buf + 0, 10); + helper_float_14_recursive(buf + 1024, 10); + helper_float_14_recursive(buf + 2048, 10); + helper_float_14_recursive(buf + 3072, 10); + helper_float_14_recursive(buf + 4096, 10); + helper_float_14_recursive(buf + 5120, 10); + helper_float_14_recursive(buf + 6144, 10); + helper_float_14_recursive(buf + 7168, 10); + helper_float_14_recursive(buf + 8192, 10); + helper_float_14_recursive(buf + 9216, 10); + helper_float_14_recursive(buf + 10240, 10); + helper_float_14_recursive(buf + 11264, 10); + helper_float_14_recursive(buf + 12288, 10); + helper_float_14_recursive(buf + 13312, 10); + helper_float_14_recursive(buf + 14336, 10); + helper_float_14_recursive(buf + 15360, 10); + for (int j = 0; j < 16384; j += 16384) { + for (int k = 0; k < 1024; k += 4) { + __asm__ volatile( + "LD1 {v0.4S}, [%0]\n" + "LD1 {v1.4S}, [%1]\n" + "LD1 {v2.4S}, [%2]\n" + "LD1 {v3.4S}, [%3]\n" + "LD1 {v4.4S}, [%4]\n" + "LD1 {v5.4S}, [%5]\n" + "LD1 {v6.4S}, [%6]\n" + "LD1 {v7.4S}, [%7]\n" + "LD1 {v8.4S}, [%8]\n" + "LD1 {v9.4S}, [%9]\n" + "LD1 {v10.4S}, [%10]\n" + "LD1 {v11.4S}, [%11]\n" + "LD1 {v12.4S}, [%12]\n" + "LD1 {v13.4S}, [%13]\n" + "LD1 {v14.4S}, [%14]\n" + "LD1 {v15.4S}, [%15]\n" + "FADD v16.4S, v0.4S, v1.4S\n" + "FSUB v17.4S, v0.4S, v1.4S\n" + "FADD v18.4S, v2.4S, v3.4S\n" + "FSUB v19.4S, v2.4S, v3.4S\n" + "FADD v20.4S, v4.4S, v5.4S\n" + "FSUB v21.4S, v4.4S, v5.4S\n" + "FADD v22.4S, v6.4S, v7.4S\n" + "FSUB v23.4S, v6.4S, v7.4S\n" + "FADD v24.4S, v8.4S, v9.4S\n" + "FSUB v25.4S, v8.4S, v9.4S\n" + "FADD v26.4S, v10.4S, v11.4S\n" + "FSUB v27.4S, v10.4S, 
v11.4S\n" + "FADD v28.4S, v12.4S, v13.4S\n" + "FSUB v29.4S, v12.4S, v13.4S\n" + "FADD v30.4S, v14.4S, v15.4S\n" + "FSUB v31.4S, v14.4S, v15.4S\n" + "FADD v0.4S, v16.4S, v18.4S\n" + "FSUB v2.4S, v16.4S, v18.4S\n" + "FADD v1.4S, v17.4S, v19.4S\n" + "FSUB v3.4S, v17.4S, v19.4S\n" + "FADD v4.4S, v20.4S, v22.4S\n" + "FSUB v6.4S, v20.4S, v22.4S\n" + "FADD v5.4S, v21.4S, v23.4S\n" + "FSUB v7.4S, v21.4S, v23.4S\n" + "FADD v8.4S, v24.4S, v26.4S\n" + "FSUB v10.4S, v24.4S, v26.4S\n" + "FADD v9.4S, v25.4S, v27.4S\n" + "FSUB v11.4S, v25.4S, v27.4S\n" + "FADD v12.4S, v28.4S, v30.4S\n" + "FSUB v14.4S, v28.4S, v30.4S\n" + "FADD v13.4S, v29.4S, v31.4S\n" + "FSUB v15.4S, v29.4S, v31.4S\n" + "FADD v16.4S, v0.4S, v4.4S\n" + "FSUB v20.4S, v0.4S, v4.4S\n" + "FADD v17.4S, v1.4S, v5.4S\n" + "FSUB v21.4S, v1.4S, v5.4S\n" + "FADD v18.4S, v2.4S, v6.4S\n" + "FSUB v22.4S, v2.4S, v6.4S\n" + "FADD v19.4S, v3.4S, v7.4S\n" + "FSUB v23.4S, v3.4S, v7.4S\n" + "FADD v24.4S, v8.4S, v12.4S\n" + "FSUB v28.4S, v8.4S, v12.4S\n" + "FADD v25.4S, v9.4S, v13.4S\n" + "FSUB v29.4S, v9.4S, v13.4S\n" + "FADD v26.4S, v10.4S, v14.4S\n" + "FSUB v30.4S, v10.4S, v14.4S\n" + "FADD v27.4S, v11.4S, v15.4S\n" + "FSUB v31.4S, v11.4S, v15.4S\n" + "FADD v0.4S, v16.4S, v24.4S\n" + "FSUB v8.4S, v16.4S, v24.4S\n" + "FADD v1.4S, v17.4S, v25.4S\n" + "FSUB v9.4S, v17.4S, v25.4S\n" + "FADD v2.4S, v18.4S, v26.4S\n" + "FSUB v10.4S, v18.4S, v26.4S\n" + "FADD v3.4S, v19.4S, v27.4S\n" + "FSUB v11.4S, v19.4S, v27.4S\n" + "FADD v4.4S, v20.4S, v28.4S\n" + "FSUB v12.4S, v20.4S, v28.4S\n" + "FADD v5.4S, v21.4S, v29.4S\n" + "FSUB v13.4S, v21.4S, v29.4S\n" + "FADD v6.4S, v22.4S, v30.4S\n" + "FSUB v14.4S, v22.4S, v30.4S\n" + "FADD v7.4S, v23.4S, v31.4S\n" + "FSUB v15.4S, v23.4S, v31.4S\n" + "ST1 {v0.4S}, [%0]\n" + "ST1 {v1.4S}, [%1]\n" + "ST1 {v2.4S}, [%2]\n" + "ST1 {v3.4S}, [%3]\n" + "ST1 {v4.4S}, [%4]\n" + "ST1 {v5.4S}, [%5]\n" + "ST1 {v6.4S}, [%6]\n" + "ST1 {v7.4S}, [%7]\n" + "ST1 {v8.4S}, [%8]\n" + "ST1 {v9.4S}, [%9]\n" + "ST1 {v10.4S}, [%10]\n" + "ST1 {v11.4S}, [%11]\n" + "ST1 {v12.4S}, [%12]\n" + "ST1 {v13.4S}, [%13]\n" + "ST1 {v14.4S}, [%14]\n" + "ST1 {v15.4S}, [%15]\n" ::"r"(buf + j + k + 0), + "r"(buf + j + k + 1024), + "r"(buf + j + k + 2048), + "r"(buf + j + k + 3072), + "r"(buf + j + k + 4096), + "r"(buf + j + k + 5120), + "r"(buf + j + k + 6144), + "r"(buf + j + k + 7168), + "r"(buf + j + k + 8192), + "r"(buf + j + k + 9216), + "r"(buf + j + k + 10240), + "r"(buf + j + k + 11264), + "r"(buf + j + k + 12288), + "r"(buf + j + k + 13312), + "r"(buf + j + k + 14336), + "r"(buf + j + k + 15360) + : "%v0", + "%v1", + "%v2", + "%v3", + "%v4", + "%v5", + "%v6", + "%v7", + "%v8", + "%v9", + "%v10", + "%v11", + "%v12", + "%v13", + "%v14", + "%v15", + "%v16", + "%v17", + "%v18", + "%v19", + "%v20", + "%v21", + "%v22", + "%v23", + "%v24", + "%v25", + "%v26", + "%v27", + "%v28", + "%v29", + "%v30", + "%v31", + "memory"); + } + } + return; + } +} +void helper_float_14(float* buf); +void helper_float_14(float* buf) { + helper_float_14_recursive(buf, 14); +} +void helper_float_15_recursive(float* buf, int depth); +void helper_float_15_recursive(float* buf, int depth) { + if (depth == 13) { + helper_float_13(buf); + return; + } + if (depth == 15) { + helper_float_15_recursive(buf + 0, 13); + helper_float_15_recursive(buf + 8192, 13); + helper_float_15_recursive(buf + 16384, 13); + helper_float_15_recursive(buf + 24576, 13); + for (int j = 0; j < 32768; j += 32768) { + for (int k = 0; k < 8192; k += 4) { + __asm__ volatile( + "LD1 {v0.4S}, [%0]\n" + "LD1 {v1.4S}, [%1]\n" + 
"LD1 {v2.4S}, [%2]\n" + "LD1 {v3.4S}, [%3]\n" + "FADD v16.4S, v0.4S, v1.4S\n" + "FSUB v17.4S, v0.4S, v1.4S\n" + "FADD v18.4S, v2.4S, v3.4S\n" + "FSUB v19.4S, v2.4S, v3.4S\n" + "FADD v0.4S, v16.4S, v18.4S\n" + "FSUB v2.4S, v16.4S, v18.4S\n" + "FADD v1.4S, v17.4S, v19.4S\n" + "FSUB v3.4S, v17.4S, v19.4S\n" + "ST1 {v0.4S}, [%0]\n" + "ST1 {v1.4S}, [%1]\n" + "ST1 {v2.4S}, [%2]\n" + "ST1 {v3.4S}, [%3]\n" ::"r"(buf + j + k + 0), + "r"(buf + j + k + 8192), + "r"(buf + j + k + 16384), + "r"(buf + j + k + 24576) + : "%v0", + "%v1", + "%v2", + "%v3", + "%v4", + "%v5", + "%v6", + "%v7", + "%v8", + "%v9", + "%v10", + "%v11", + "%v12", + "%v13", + "%v14", + "%v15", + "%v16", + "%v17", + "%v18", + "%v19", + "%v20", + "%v21", + "%v22", + "%v23", + "%v24", + "%v25", + "%v26", + "%v27", + "%v28", + "%v29", + "%v30", + "%v31", + "memory"); + } + } + return; + } +} +void helper_float_15(float* buf); +void helper_float_15(float* buf) { + helper_float_15_recursive(buf, 15); +} +void helper_float_16_recursive(float* buf, int depth); +void helper_float_16_recursive(float* buf, int depth) { + if (depth == 15) { + helper_float_15(buf); + return; + } + if (depth == 16) { + helper_float_16_recursive(buf + 0, 15); + helper_float_16_recursive(buf + 32768, 15); + for (int j = 0; j < 65536; j += 65536) { + for (int k = 0; k < 32768; k += 4) { + __asm__ volatile( + "LD1 {v0.4S}, [%0]\n" + "LD1 {v1.4S}, [%1]\n" + "FADD v16.4S, v0.4S, v1.4S\n" + "FSUB v17.4S, v0.4S, v1.4S\n" + "ST1 {v16.4S}, [%0]\n" + "ST1 {v17.4S}, [%1]\n" ::"r"(buf + j + k + 0), + "r"(buf + j + k + 32768) + : "%v0", + "%v1", + "%v2", + "%v3", + "%v4", + "%v5", + "%v6", + "%v7", + "%v8", + "%v9", + "%v10", + "%v11", + "%v12", + "%v13", + "%v14", + "%v15", + "%v16", + "%v17", + "%v18", + "%v19", + "%v20", + "%v21", + "%v22", + "%v23", + "%v24", + "%v25", + "%v26", + "%v27", + "%v28", + "%v29", + "%v30", + "%v31", + "memory"); + } + } + return; + } +} +void helper_float_16(float* buf); +void helper_float_16(float* buf) { + helper_float_16_recursive(buf, 16); +} +void helper_float_17_recursive(float* buf, int depth); +void helper_float_17_recursive(float* buf, int depth) { + if (depth == 15) { + helper_float_15(buf); + return; + } + if (depth == 17) { + helper_float_17_recursive(buf + 0, 15); + helper_float_17_recursive(buf + 32768, 15); + helper_float_17_recursive(buf + 65536, 15); + helper_float_17_recursive(buf + 98304, 15); + for (int j = 0; j < 131072; j += 131072) { + for (int k = 0; k < 32768; k += 4) { + __asm__ volatile( + "LD1 {v0.4S}, [%0]\n" + "LD1 {v1.4S}, [%1]\n" + "LD1 {v2.4S}, [%2]\n" + "LD1 {v3.4S}, [%3]\n" + "FADD v16.4S, v0.4S, v1.4S\n" + "FSUB v17.4S, v0.4S, v1.4S\n" + "FADD v18.4S, v2.4S, v3.4S\n" + "FSUB v19.4S, v2.4S, v3.4S\n" + "FADD v0.4S, v16.4S, v18.4S\n" + "FSUB v2.4S, v16.4S, v18.4S\n" + "FADD v1.4S, v17.4S, v19.4S\n" + "FSUB v3.4S, v17.4S, v19.4S\n" + "ST1 {v0.4S}, [%0]\n" + "ST1 {v1.4S}, [%1]\n" + "ST1 {v2.4S}, [%2]\n" + "ST1 {v3.4S}, [%3]\n" ::"r"(buf + j + k + 0), + "r"(buf + j + k + 32768), + "r"(buf + j + k + 65536), + "r"(buf + j + k + 98304) + : "%v0", + "%v1", + "%v2", + "%v3", + "%v4", + "%v5", + "%v6", + "%v7", + "%v8", + "%v9", + "%v10", + "%v11", + "%v12", + "%v13", + "%v14", + "%v15", + "%v16", + "%v17", + "%v18", + "%v19", + "%v20", + "%v21", + "%v22", + "%v23", + "%v24", + "%v25", + "%v26", + "%v27", + "%v28", + "%v29", + "%v30", + "%v31", + "memory"); + } + } + return; + } +} +void helper_float_17(float* buf); +void helper_float_17(float* buf) { + helper_float_17_recursive(buf, 17); +} +void 
helper_float_18_recursive(float* buf, int depth); +void helper_float_18_recursive(float* buf, int depth) { + if (depth == 17) { + helper_float_17(buf); + return; + } + if (depth == 18) { + helper_float_18_recursive(buf + 0, 17); + helper_float_18_recursive(buf + 131072, 17); + for (int j = 0; j < 262144; j += 262144) { + for (int k = 0; k < 131072; k += 4) { + __asm__ volatile( + "LD1 {v0.4S}, [%0]\n" + "LD1 {v1.4S}, [%1]\n" + "FADD v16.4S, v0.4S, v1.4S\n" + "FSUB v17.4S, v0.4S, v1.4S\n" + "ST1 {v16.4S}, [%0]\n" + "ST1 {v17.4S}, [%1]\n" ::"r"(buf + j + k + 0), + "r"(buf + j + k + 131072) + : "%v0", + "%v1", + "%v2", + "%v3", + "%v4", + "%v5", + "%v6", + "%v7", + "%v8", + "%v9", + "%v10", + "%v11", + "%v12", + "%v13", + "%v14", + "%v15", + "%v16", + "%v17", + "%v18", + "%v19", + "%v20", + "%v21", + "%v22", + "%v23", + "%v24", + "%v25", + "%v26", + "%v27", + "%v28", + "%v29", + "%v30", + "%v31", + "memory"); + } + } + return; + } +} +void helper_float_18(float* buf); +void helper_float_18(float* buf) { + helper_float_18_recursive(buf, 18); +} +void helper_float_19_recursive(float* buf, int depth); +void helper_float_19_recursive(float* buf, int depth) { + if (depth == 18) { + helper_float_18(buf); + return; + } + if (depth == 19) { + helper_float_19_recursive(buf + 0, 18); + helper_float_19_recursive(buf + 262144, 18); + for (int j = 0; j < 524288; j += 524288) { + for (int k = 0; k < 262144; k += 4) { + __asm__ volatile( + "LD1 {v0.4S}, [%0]\n" + "LD1 {v1.4S}, [%1]\n" + "FADD v16.4S, v0.4S, v1.4S\n" + "FSUB v17.4S, v0.4S, v1.4S\n" + "ST1 {v16.4S}, [%0]\n" + "ST1 {v17.4S}, [%1]\n" ::"r"(buf + j + k + 0), + "r"(buf + j + k + 262144) + : "%v0", + "%v1", + "%v2", + "%v3", + "%v4", + "%v5", + "%v6", + "%v7", + "%v8", + "%v9", + "%v10", + "%v11", + "%v12", + "%v13", + "%v14", + "%v15", + "%v16", + "%v17", + "%v18", + "%v19", + "%v20", + "%v21", + "%v22", + "%v23", + "%v24", + "%v25", + "%v26", + "%v27", + "%v28", + "%v29", + "%v30", + "%v31", + "memory"); + } + } + return; + } +} +void helper_float_19(float* buf); +void helper_float_19(float* buf) { + helper_float_19_recursive(buf, 19); +} +void helper_float_20_recursive(float* buf, int depth); +void helper_float_20_recursive(float* buf, int depth) { + if (depth == 18) { + helper_float_18(buf); + return; + } + if (depth == 20) { + helper_float_20_recursive(buf + 0, 18); + helper_float_20_recursive(buf + 262144, 18); + helper_float_20_recursive(buf + 524288, 18); + helper_float_20_recursive(buf + 786432, 18); + for (int j = 0; j < 1048576; j += 1048576) { + for (int k = 0; k < 262144; k += 4) { + __asm__ volatile( + "LD1 {v0.4S}, [%0]\n" + "LD1 {v1.4S}, [%1]\n" + "LD1 {v2.4S}, [%2]\n" + "LD1 {v3.4S}, [%3]\n" + "FADD v16.4S, v0.4S, v1.4S\n" + "FSUB v17.4S, v0.4S, v1.4S\n" + "FADD v18.4S, v2.4S, v3.4S\n" + "FSUB v19.4S, v2.4S, v3.4S\n" + "FADD v0.4S, v16.4S, v18.4S\n" + "FSUB v2.4S, v16.4S, v18.4S\n" + "FADD v1.4S, v17.4S, v19.4S\n" + "FSUB v3.4S, v17.4S, v19.4S\n" + "ST1 {v0.4S}, [%0]\n" + "ST1 {v1.4S}, [%1]\n" + "ST1 {v2.4S}, [%2]\n" + "ST1 {v3.4S}, [%3]\n" ::"r"(buf + j + k + 0), + "r"(buf + j + k + 262144), + "r"(buf + j + k + 524288), + "r"(buf + j + k + 786432) + : "%v0", + "%v1", + "%v2", + "%v3", + "%v4", + "%v5", + "%v6", + "%v7", + "%v8", + "%v9", + "%v10", + "%v11", + "%v12", + "%v13", + "%v14", + "%v15", + "%v16", + "%v17", + "%v18", + "%v19", + "%v20", + "%v21", + "%v22", + "%v23", + "%v24", + "%v25", + "%v26", + "%v27", + "%v28", + "%v29", + "%v30", + "%v31", + "memory"); + } + } + return; + } +} +void helper_float_20(float* buf); 
+void helper_float_20(float* buf) { + helper_float_20_recursive(buf, 20); +} +void helper_float_21_recursive(float* buf, int depth); +void helper_float_21_recursive(float* buf, int depth) { + if (depth == 20) { + helper_float_20(buf); + return; + } + if (depth == 21) { + helper_float_21_recursive(buf + 0, 20); + helper_float_21_recursive(buf + 1048576, 20); + for (int j = 0; j < 2097152; j += 2097152) { + for (int k = 0; k < 1048576; k += 4) { + __asm__ volatile( + "LD1 {v0.4S}, [%0]\n" + "LD1 {v1.4S}, [%1]\n" + "FADD v16.4S, v0.4S, v1.4S\n" + "FSUB v17.4S, v0.4S, v1.4S\n" + "ST1 {v16.4S}, [%0]\n" + "ST1 {v17.4S}, [%1]\n" ::"r"(buf + j + k + 0), + "r"(buf + j + k + 1048576) + : "%v0", + "%v1", + "%v2", + "%v3", + "%v4", + "%v5", + "%v6", + "%v7", + "%v8", + "%v9", + "%v10", + "%v11", + "%v12", + "%v13", + "%v14", + "%v15", + "%v16", + "%v17", + "%v18", + "%v19", + "%v20", + "%v21", + "%v22", + "%v23", + "%v24", + "%v25", + "%v26", + "%v27", + "%v28", + "%v29", + "%v30", + "%v31", + "memory"); + } + } + return; + } +} +void helper_float_21(float* buf); +void helper_float_21(float* buf) { + helper_float_21_recursive(buf, 21); +} +void helper_float_22_recursive(float* buf, int depth); +void helper_float_22_recursive(float* buf, int depth) { + if (depth == 20) { + helper_float_20(buf); + return; + } + if (depth == 22) { + helper_float_22_recursive(buf + 0, 20); + helper_float_22_recursive(buf + 1048576, 20); + helper_float_22_recursive(buf + 2097152, 20); + helper_float_22_recursive(buf + 3145728, 20); + for (int j = 0; j < 4194304; j += 4194304) { + for (int k = 0; k < 1048576; k += 4) { + __asm__ volatile( + "LD1 {v0.4S}, [%0]\n" + "LD1 {v1.4S}, [%1]\n" + "LD1 {v2.4S}, [%2]\n" + "LD1 {v3.4S}, [%3]\n" + "FADD v16.4S, v0.4S, v1.4S\n" + "FSUB v17.4S, v0.4S, v1.4S\n" + "FADD v18.4S, v2.4S, v3.4S\n" + "FSUB v19.4S, v2.4S, v3.4S\n" + "FADD v0.4S, v16.4S, v18.4S\n" + "FSUB v2.4S, v16.4S, v18.4S\n" + "FADD v1.4S, v17.4S, v19.4S\n" + "FSUB v3.4S, v17.4S, v19.4S\n" + "ST1 {v0.4S}, [%0]\n" + "ST1 {v1.4S}, [%1]\n" + "ST1 {v2.4S}, [%2]\n" + "ST1 {v3.4S}, [%3]\n" ::"r"(buf + j + k + 0), + "r"(buf + j + k + 1048576), + "r"(buf + j + k + 2097152), + "r"(buf + j + k + 3145728) + : "%v0", + "%v1", + "%v2", + "%v3", + "%v4", + "%v5", + "%v6", + "%v7", + "%v8", + "%v9", + "%v10", + "%v11", + "%v12", + "%v13", + "%v14", + "%v15", + "%v16", + "%v17", + "%v18", + "%v19", + "%v20", + "%v21", + "%v22", + "%v23", + "%v24", + "%v25", + "%v26", + "%v27", + "%v28", + "%v29", + "%v30", + "%v31", + "memory"); + } + } + return; + } +} +void helper_float_22(float* buf); +void helper_float_22(float* buf) { + helper_float_22_recursive(buf, 22); +} +void helper_float_23_recursive(float* buf, int depth); +void helper_float_23_recursive(float* buf, int depth) { + if (depth == 22) { + helper_float_22(buf); + return; + } + if (depth == 23) { + helper_float_23_recursive(buf + 0, 22); + helper_float_23_recursive(buf + 4194304, 22); + for (int j = 0; j < 8388608; j += 8388608) { + for (int k = 0; k < 4194304; k += 4) { + __asm__ volatile( + "LD1 {v0.4S}, [%0]\n" + "LD1 {v1.4S}, [%1]\n" + "FADD v16.4S, v0.4S, v1.4S\n" + "FSUB v17.4S, v0.4S, v1.4S\n" + "ST1 {v16.4S}, [%0]\n" + "ST1 {v17.4S}, [%1]\n" ::"r"(buf + j + k + 0), + "r"(buf + j + k + 4194304) + : "%v0", + "%v1", + "%v2", + "%v3", + "%v4", + "%v5", + "%v6", + "%v7", + "%v8", + "%v9", + "%v10", + "%v11", + "%v12", + "%v13", + "%v14", + "%v15", + "%v16", + "%v17", + "%v18", + "%v19", + "%v20", + "%v21", + "%v22", + "%v23", + "%v24", + "%v25", + "%v26", + "%v27", + "%v28", + 
"%v29", + "%v30", + "%v31", + "memory"); + } + } + return; + } +} +void helper_float_23(float* buf); +void helper_float_23(float* buf) { + helper_float_23_recursive(buf, 23); +} +void helper_float_24_recursive(float* buf, int depth); +void helper_float_24_recursive(float* buf, int depth) { + if (depth == 23) { + helper_float_23(buf); + return; + } + if (depth == 24) { + helper_float_24_recursive(buf + 0, 23); + helper_float_24_recursive(buf + 8388608, 23); + for (int j = 0; j < 16777216; j += 16777216) { + for (int k = 0; k < 8388608; k += 4) { + __asm__ volatile( + "LD1 {v0.4S}, [%0]\n" + "LD1 {v1.4S}, [%1]\n" + "FADD v16.4S, v0.4S, v1.4S\n" + "FSUB v17.4S, v0.4S, v1.4S\n" + "ST1 {v16.4S}, [%0]\n" + "ST1 {v17.4S}, [%1]\n" ::"r"(buf + j + k + 0), + "r"(buf + j + k + 8388608) + : "%v0", + "%v1", + "%v2", + "%v3", + "%v4", + "%v5", + "%v6", + "%v7", + "%v8", + "%v9", + "%v10", + "%v11", + "%v12", + "%v13", + "%v14", + "%v15", + "%v16", + "%v17", + "%v18", + "%v19", + "%v20", + "%v21", + "%v22", + "%v23", + "%v24", + "%v25", + "%v26", + "%v27", + "%v28", + "%v29", + "%v30", + "%v31", + "memory"); + } + } + return; + } +} +void helper_float_24(float* buf); +void helper_float_24(float* buf) { + helper_float_24_recursive(buf, 24); +} +void helper_float_25_recursive(float* buf, int depth); +void helper_float_25_recursive(float* buf, int depth) { + if (depth == 23) { + helper_float_23(buf); + return; + } + if (depth == 25) { + helper_float_25_recursive(buf + 0, 23); + helper_float_25_recursive(buf + 8388608, 23); + helper_float_25_recursive(buf + 16777216, 23); + helper_float_25_recursive(buf + 25165824, 23); + for (int j = 0; j < 33554432; j += 33554432) { + for (int k = 0; k < 8388608; k += 4) { + __asm__ volatile( + "LD1 {v0.4S}, [%0]\n" + "LD1 {v1.4S}, [%1]\n" + "LD1 {v2.4S}, [%2]\n" + "LD1 {v3.4S}, [%3]\n" + "FADD v16.4S, v0.4S, v1.4S\n" + "FSUB v17.4S, v0.4S, v1.4S\n" + "FADD v18.4S, v2.4S, v3.4S\n" + "FSUB v19.4S, v2.4S, v3.4S\n" + "FADD v0.4S, v16.4S, v18.4S\n" + "FSUB v2.4S, v16.4S, v18.4S\n" + "FADD v1.4S, v17.4S, v19.4S\n" + "FSUB v3.4S, v17.4S, v19.4S\n" + "ST1 {v0.4S}, [%0]\n" + "ST1 {v1.4S}, [%1]\n" + "ST1 {v2.4S}, [%2]\n" + "ST1 {v3.4S}, [%3]\n" ::"r"(buf + j + k + 0), + "r"(buf + j + k + 8388608), + "r"(buf + j + k + 16777216), + "r"(buf + j + k + 25165824) + : "%v0", + "%v1", + "%v2", + "%v3", + "%v4", + "%v5", + "%v6", + "%v7", + "%v8", + "%v9", + "%v10", + "%v11", + "%v12", + "%v13", + "%v14", + "%v15", + "%v16", + "%v17", + "%v18", + "%v19", + "%v20", + "%v21", + "%v22", + "%v23", + "%v24", + "%v25", + "%v26", + "%v27", + "%v28", + "%v29", + "%v30", + "%v31", + "memory"); + } + } + return; + } +} +void helper_float_25(float* buf); +void helper_float_25(float* buf) { + helper_float_25_recursive(buf, 25); +} +void helper_float_26_recursive(float* buf, int depth); +void helper_float_26_recursive(float* buf, int depth) { + if (depth == 25) { + helper_float_25(buf); + return; + } + if (depth == 26) { + helper_float_26_recursive(buf + 0, 25); + helper_float_26_recursive(buf + 33554432, 25); + for (int j = 0; j < 67108864; j += 67108864) { + for (int k = 0; k < 33554432; k += 4) { + __asm__ volatile( + "LD1 {v0.4S}, [%0]\n" + "LD1 {v1.4S}, [%1]\n" + "FADD v16.4S, v0.4S, v1.4S\n" + "FSUB v17.4S, v0.4S, v1.4S\n" + "ST1 {v16.4S}, [%0]\n" + "ST1 {v17.4S}, [%1]\n" ::"r"(buf + j + k + 0), + "r"(buf + j + k + 33554432) + : "%v0", + "%v1", + "%v2", + "%v3", + "%v4", + "%v5", + "%v6", + "%v7", + "%v8", + "%v9", + "%v10", + "%v11", + "%v12", + "%v13", + "%v14", + "%v15", + "%v16", + "%v17", + 
"%v18", + "%v19", + "%v20", + "%v21", + "%v22", + "%v23", + "%v24", + "%v25", + "%v26", + "%v27", + "%v28", + "%v29", + "%v30", + "%v31", + "memory"); + } + } + return; + } +} +void helper_float_26(float* buf); +void helper_float_26(float* buf) { + helper_float_26_recursive(buf, 26); +} +void helper_float_27_recursive(float* buf, int depth); +void helper_float_27_recursive(float* buf, int depth) { + if (depth == 26) { + helper_float_26(buf); + return; + } + if (depth == 27) { + helper_float_27_recursive(buf + 0, 26); + helper_float_27_recursive(buf + 67108864, 26); + for (int j = 0; j < 134217728; j += 134217728) { + for (int k = 0; k < 67108864; k += 4) { + __asm__ volatile( + "LD1 {v0.4S}, [%0]\n" + "LD1 {v1.4S}, [%1]\n" + "FADD v16.4S, v0.4S, v1.4S\n" + "FSUB v17.4S, v0.4S, v1.4S\n" + "ST1 {v16.4S}, [%0]\n" + "ST1 {v17.4S}, [%1]\n" ::"r"(buf + j + k + 0), + "r"(buf + j + k + 67108864) + : "%v0", + "%v1", + "%v2", + "%v3", + "%v4", + "%v5", + "%v6", + "%v7", + "%v8", + "%v9", + "%v10", + "%v11", + "%v12", + "%v13", + "%v14", + "%v15", + "%v16", + "%v17", + "%v18", + "%v19", + "%v20", + "%v21", + "%v22", + "%v23", + "%v24", + "%v25", + "%v26", + "%v27", + "%v28", + "%v29", + "%v30", + "%v31", + "memory"); + } + } + return; + } +} +void helper_float_27(float* buf); +void helper_float_27(float* buf) { + helper_float_27_recursive(buf, 27); +} +void helper_float_28_recursive(float* buf, int depth); +void helper_float_28_recursive(float* buf, int depth) { + if (depth == 26) { + helper_float_26(buf); + return; + } + if (depth == 28) { + helper_float_28_recursive(buf + 0, 26); + helper_float_28_recursive(buf + 67108864, 26); + helper_float_28_recursive(buf + 134217728, 26); + helper_float_28_recursive(buf + 201326592, 26); + for (int j = 0; j < 268435456; j += 268435456) { + for (int k = 0; k < 67108864; k += 4) { + __asm__ volatile( + "LD1 {v0.4S}, [%0]\n" + "LD1 {v1.4S}, [%1]\n" + "LD1 {v2.4S}, [%2]\n" + "LD1 {v3.4S}, [%3]\n" + "FADD v16.4S, v0.4S, v1.4S\n" + "FSUB v17.4S, v0.4S, v1.4S\n" + "FADD v18.4S, v2.4S, v3.4S\n" + "FSUB v19.4S, v2.4S, v3.4S\n" + "FADD v0.4S, v16.4S, v18.4S\n" + "FSUB v2.4S, v16.4S, v18.4S\n" + "FADD v1.4S, v17.4S, v19.4S\n" + "FSUB v3.4S, v17.4S, v19.4S\n" + "ST1 {v0.4S}, [%0]\n" + "ST1 {v1.4S}, [%1]\n" + "ST1 {v2.4S}, [%2]\n" + "ST1 {v3.4S}, [%3]\n" ::"r"(buf + j + k + 0), + "r"(buf + j + k + 67108864), + "r"(buf + j + k + 134217728), + "r"(buf + j + k + 201326592) + : "%v0", + "%v1", + "%v2", + "%v3", + "%v4", + "%v5", + "%v6", + "%v7", + "%v8", + "%v9", + "%v10", + "%v11", + "%v12", + "%v13", + "%v14", + "%v15", + "%v16", + "%v17", + "%v18", + "%v19", + "%v20", + "%v21", + "%v22", + "%v23", + "%v24", + "%v25", + "%v26", + "%v27", + "%v28", + "%v29", + "%v30", + "%v31", + "memory"); + } + } + return; + } +} +void helper_float_28(float* buf); +void helper_float_28(float* buf) { + helper_float_28_recursive(buf, 28); +} +void helper_float_29_recursive(float* buf, int depth); +void helper_float_29_recursive(float* buf, int depth) { + if (depth == 28) { + helper_float_28(buf); + return; + } + if (depth == 29) { + helper_float_29_recursive(buf + 0, 28); + helper_float_29_recursive(buf + 268435456, 28); + for (int j = 0; j < 536870912; j += 536870912) { + for (int k = 0; k < 268435456; k += 4) { + __asm__ volatile( + "LD1 {v0.4S}, [%0]\n" + "LD1 {v1.4S}, [%1]\n" + "FADD v16.4S, v0.4S, v1.4S\n" + "FSUB v17.4S, v0.4S, v1.4S\n" + "ST1 {v16.4S}, [%0]\n" + "ST1 {v17.4S}, [%1]\n" ::"r"(buf + j + k + 0), + "r"(buf + j + k + 268435456) + : "%v0", + "%v1", + "%v2", + "%v3", + "%v4", 
+ "%v5", + "%v6", + "%v7", + "%v8", + "%v9", + "%v10", + "%v11", + "%v12", + "%v13", + "%v14", + "%v15", + "%v16", + "%v17", + "%v18", + "%v19", + "%v20", + "%v21", + "%v22", + "%v23", + "%v24", + "%v25", + "%v26", + "%v27", + "%v28", + "%v29", + "%v30", + "%v31", + "memory"); + } + } + return; + } +} +void helper_float_29(float* buf); +void helper_float_29(float* buf) { + helper_float_29_recursive(buf, 29); +} +void helper_float_30_recursive(float* buf, int depth); +void helper_float_30_recursive(float* buf, int depth) { + if (depth == 29) { + helper_float_29(buf); + return; + } + if (depth == 30) { + helper_float_30_recursive(buf + 0, 29); + helper_float_30_recursive(buf + 536870912, 29); + for (int j = 0; j < 1073741824; j += 1073741824) { + for (int k = 0; k < 536870912; k += 4) { + __asm__ volatile( + "LD1 {v0.4S}, [%0]\n" + "LD1 {v1.4S}, [%1]\n" + "FADD v16.4S, v0.4S, v1.4S\n" + "FSUB v17.4S, v0.4S, v1.4S\n" + "ST1 {v16.4S}, [%0]\n" + "ST1 {v17.4S}, [%1]\n" ::"r"(buf + j + k + 0), + "r"(buf + j + k + 536870912) + : "%v0", + "%v1", + "%v2", + "%v3", + "%v4", + "%v5", + "%v6", + "%v7", + "%v8", + "%v9", + "%v10", + "%v11", + "%v12", + "%v13", + "%v14", + "%v15", + "%v16", + "%v17", + "%v18", + "%v19", + "%v20", + "%v21", + "%v22", + "%v23", + "%v24", + "%v25", + "%v26", + "%v27", + "%v28", + "%v29", + "%v30", + "%v31", + "memory"); + } + } + return; + } +} +void helper_float_30(float* buf); +void helper_float_30(float* buf) { + helper_float_30_recursive(buf, 30); +} +int fht_float(float* buf, int log_n) { + if (log_n == 0) { + return 0; + } + if (log_n == 1) { + helper_float_1(buf); + return 0; + } + if (log_n == 2) { + helper_float_2(buf); + return 0; + } + if (log_n == 3) { + helper_float_3(buf); + return 0; + } + if (log_n == 4) { + helper_float_4(buf); + return 0; + } + if (log_n == 5) { + helper_float_5(buf); + return 0; + } + if (log_n == 6) { + helper_float_6(buf); + return 0; + } + if (log_n == 7) { + helper_float_7(buf); + return 0; + } + if (log_n == 8) { + helper_float_8(buf); + return 0; + } + if (log_n == 9) { + helper_float_9(buf); + return 0; + } + if (log_n == 10) { + helper_float_10(buf); + return 0; + } + if (log_n == 11) { + helper_float_11(buf); + return 0; + } + if (log_n == 12) { + helper_float_12(buf); + return 0; + } + if (log_n == 13) { + helper_float_13(buf); + return 0; + } + if (log_n == 14) { + helper_float_14(buf); + return 0; + } + if (log_n == 15) { + helper_float_15(buf); + return 0; + } + if (log_n == 16) { + helper_float_16(buf); + return 0; + } + if (log_n == 17) { + helper_float_17(buf); + return 0; + } + if (log_n == 18) { + helper_float_18(buf); + return 0; + } + if (log_n == 19) { + helper_float_19(buf); + return 0; + } + if (log_n == 20) { + helper_float_20(buf); + return 0; + } + if (log_n == 21) { + helper_float_21(buf); + return 0; + } + if (log_n == 22) { + helper_float_22(buf); + return 0; + } + if (log_n == 23) { + helper_float_23(buf); + return 0; + } + if (log_n == 24) { + helper_float_24(buf); + return 0; + } + if (log_n == 25) { + helper_float_25(buf); + return 0; + } + if (log_n == 26) { + helper_float_26(buf); + return 0; + } + if (log_n == 27) { + helper_float_27(buf); + return 0; + } + if (log_n == 28) { + helper_float_28(buf); + return 0; + } + if (log_n == 29) { + helper_float_29(buf); + return 0; + } + if (log_n == 30) { + helper_float_30(buf); + return 0; + } + return 1; +} diff --git a/extension/llm/custom_ops/spinquant/third-party/FFHT/fht_sse.c b/extension/llm/custom_ops/spinquant/third-party/FFHT/fht_sse.c new file mode 
100644 index 00000000000..90d0ffc8180 --- /dev/null +++ b/extension/llm/custom_ops/spinquant/third-party/FFHT/fht_sse.c @@ -0,0 +1,26215 @@ +#include "fht.h" +static inline void helper_float_1(float *buf); +static inline void helper_float_1(float *buf) { + for (int j = 0; j < 2; j += 2) { + for (int k = 0; k < 1; ++k) { + float u = buf[j + k]; + float v = buf[j + k + 1]; + buf[j + k] = u + v; + buf[j + k + 1] = u - v; + } + } +} +static inline void helper_float_2(float *buf); +static inline void helper_float_2(float *buf) { + for (int j = 0; j < 4; j += 4) { + __asm__ volatile ( + "movups (%0), %%xmm0\n" + "movaps %%xmm0, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm0, %%xmm0\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm0, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm0\n" + "movaps %%xmm0, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm0, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm0, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm0\n" + "movups %%xmm0, (%0)\n" + :: "r"(buf + j) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } +} +static inline void helper_float_3(float *buf); +static inline void helper_float_3(float *buf) { + for (int j = 0; j < 8; j += 8) { + for (int k = 0; k < 4; k += 4) { + __asm__ volatile ( + "movups (%0), %%xmm0\n" + "movups (%1), %%xmm1\n" + "movaps %%xmm0, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm0, %%xmm0\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm0, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm0\n" + "movaps %%xmm1, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm1, %%xmm1\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm1, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm1\n" + "movaps %%xmm0, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm0, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm0, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm0\n" + "movaps %%xmm1, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm1, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm1, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm1\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm9\n" + "addps %%xmm1, %%xmm8\n" + "subps %%xmm1, %%xmm9\n" + "movups %%xmm8, (%0)\n" + "movups %%xmm9, (%1)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 4) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } +} +static inline void helper_float_4(float *buf); +static inline void helper_float_4(float *buf) { + for (int j = 0; j < 16; j += 16) { + for (int k = 0; k < 4; k += 4) { + __asm__ volatile ( + "movups (%0), %%xmm0\n" + "movups (%1), %%xmm1\n" + "movups (%2), %%xmm2\n" + "movups (%3), %%xmm3\n" + "movaps %%xmm0, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm0, %%xmm0\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm0, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm0\n" + "movaps %%xmm1, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps 
$245, %%xmm1, %%xmm1\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm1, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm1\n" + "movaps %%xmm2, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm2, %%xmm2\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm2, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm2\n" + "movaps %%xmm3, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm3, %%xmm3\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm3, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm3\n" + "movaps %%xmm0, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm0, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm0, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm0\n" + "movaps %%xmm1, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm1, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm1, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm1\n" + "movaps %%xmm2, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm2, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm2, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm2\n" + "movaps %%xmm3, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm3, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm3, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm3\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm9\n" + "addps %%xmm1, %%xmm8\n" + "subps %%xmm1, %%xmm9\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm11\n" + "addps %%xmm3, %%xmm10\n" + "subps %%xmm3, %%xmm11\n" + "movaps %%xmm8, %%xmm0\n" + "movaps %%xmm8, %%xmm2\n" + "addps %%xmm10, %%xmm0\n" + "subps %%xmm10, %%xmm2\n" + "movaps %%xmm9, %%xmm1\n" + "movaps %%xmm9, %%xmm3\n" + "addps %%xmm11, %%xmm1\n" + "subps %%xmm11, %%xmm3\n" + "movups %%xmm0, (%0)\n" + "movups %%xmm1, (%1)\n" + "movups %%xmm2, (%2)\n" + "movups %%xmm3, (%3)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } +} +static inline void helper_float_5(float *buf); +static inline void helper_float_5(float *buf) { + for (int j = 0; j < 32; j += 32) { + for (int k = 0; k < 4; k += 4) { + __asm__ volatile ( + "movups (%0), %%xmm0\n" + "movups (%1), %%xmm1\n" + "movups (%2), %%xmm2\n" + "movups (%3), %%xmm3\n" + "movups (%4), %%xmm4\n" + "movups (%5), %%xmm5\n" + "movups (%6), %%xmm6\n" + "movups (%7), %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm0, %%xmm0\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm0, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm0\n" + "movaps %%xmm1, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm1, %%xmm1\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm1, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm1\n" + "movaps %%xmm2, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm2, %%xmm2\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm2, %%xmm9\n" + "addsubps 
%%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm2\n" + "movaps %%xmm3, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm3, %%xmm3\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm3, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm3\n" + "movaps %%xmm4, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm4, %%xmm4\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm4, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm4\n" + "movaps %%xmm5, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm5, %%xmm5\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm5, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm5\n" + "movaps %%xmm6, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm6, %%xmm6\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm6, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm6\n" + "movaps %%xmm7, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm7, %%xmm7\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm7, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm0, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm0, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm0\n" + "movaps %%xmm1, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm1, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm1, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm1\n" + "movaps %%xmm2, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm2, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm2, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm2\n" + "movaps %%xmm3, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm3, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm3, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm3\n" + "movaps %%xmm4, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm4, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm4, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm4\n" + "movaps %%xmm5, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm5, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm5, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm5\n" + "movaps %%xmm6, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm6, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm6, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm6\n" + "movaps %%xmm7, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm7, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm7, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm9\n" + "addps 
%%xmm1, %%xmm8\n" + "subps %%xmm1, %%xmm9\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm11\n" + "addps %%xmm3, %%xmm10\n" + "subps %%xmm3, %%xmm11\n" + "movaps %%xmm4, %%xmm12\n" + "movaps %%xmm4, %%xmm13\n" + "addps %%xmm5, %%xmm12\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm6, %%xmm14\n" + "movaps %%xmm6, %%xmm15\n" + "addps %%xmm7, %%xmm14\n" + "subps %%xmm7, %%xmm15\n" + "movaps %%xmm8, %%xmm0\n" + "movaps %%xmm8, %%xmm2\n" + "addps %%xmm10, %%xmm0\n" + "subps %%xmm10, %%xmm2\n" + "movaps %%xmm9, %%xmm1\n" + "movaps %%xmm9, %%xmm3\n" + "addps %%xmm11, %%xmm1\n" + "subps %%xmm11, %%xmm3\n" + "movaps %%xmm12, %%xmm4\n" + "movaps %%xmm12, %%xmm6\n" + "addps %%xmm14, %%xmm4\n" + "subps %%xmm14, %%xmm6\n" + "movaps %%xmm13, %%xmm5\n" + "movaps %%xmm13, %%xmm7\n" + "addps %%xmm15, %%xmm5\n" + "subps %%xmm15, %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm12\n" + "addps %%xmm4, %%xmm8\n" + "subps %%xmm4, %%xmm12\n" + "movaps %%xmm1, %%xmm9\n" + "movaps %%xmm1, %%xmm13\n" + "addps %%xmm5, %%xmm9\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm14\n" + "addps %%xmm6, %%xmm10\n" + "subps %%xmm6, %%xmm14\n" + "movaps %%xmm3, %%xmm11\n" + "movaps %%xmm3, %%xmm15\n" + "addps %%xmm7, %%xmm11\n" + "subps %%xmm7, %%xmm15\n" + "movups %%xmm8, (%0)\n" + "movups %%xmm9, (%1)\n" + "movups %%xmm10, (%2)\n" + "movups %%xmm11, (%3)\n" + "movups %%xmm12, (%4)\n" + "movups %%xmm13, (%5)\n" + "movups %%xmm14, (%6)\n" + "movups %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } +} +static inline void helper_float_6(float *buf); +static inline void helper_float_6(float *buf) { + for (int j = 0; j < 64; j += 32) { + for (int k = 0; k < 4; k += 4) { + __asm__ volatile ( + "movups (%0), %%xmm0\n" + "movups (%1), %%xmm1\n" + "movups (%2), %%xmm2\n" + "movups (%3), %%xmm3\n" + "movups (%4), %%xmm4\n" + "movups (%5), %%xmm5\n" + "movups (%6), %%xmm6\n" + "movups (%7), %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm0, %%xmm0\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm0, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm0\n" + "movaps %%xmm1, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm1, %%xmm1\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm1, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm1\n" + "movaps %%xmm2, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm2, %%xmm2\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm2, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm2\n" + "movaps %%xmm3, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm3, %%xmm3\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm3, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm3\n" + "movaps %%xmm4, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm4, %%xmm4\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm4, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm4\n" + "movaps %%xmm5, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm5, %%xmm5\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm5, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm5\n" + "movaps %%xmm6, %%xmm8\n" + "shufps 
$160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm6, %%xmm6\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm6, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm6\n" + "movaps %%xmm7, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm7, %%xmm7\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm7, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm0, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm0, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm0\n" + "movaps %%xmm1, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm1, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm1, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm1\n" + "movaps %%xmm2, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm2, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm2, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm2\n" + "movaps %%xmm3, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm3, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm3, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm3\n" + "movaps %%xmm4, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm4, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm4, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm4\n" + "movaps %%xmm5, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm5, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm5, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm5\n" + "movaps %%xmm6, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm6, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm6, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm6\n" + "movaps %%xmm7, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm7, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm7, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm9\n" + "addps %%xmm1, %%xmm8\n" + "subps %%xmm1, %%xmm9\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm11\n" + "addps %%xmm3, %%xmm10\n" + "subps %%xmm3, %%xmm11\n" + "movaps %%xmm4, %%xmm12\n" + "movaps %%xmm4, %%xmm13\n" + "addps %%xmm5, %%xmm12\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm6, %%xmm14\n" + "movaps %%xmm6, %%xmm15\n" + "addps %%xmm7, %%xmm14\n" + "subps %%xmm7, %%xmm15\n" + "movaps %%xmm8, %%xmm0\n" + "movaps %%xmm8, %%xmm2\n" + "addps %%xmm10, %%xmm0\n" + "subps %%xmm10, %%xmm2\n" + "movaps %%xmm9, %%xmm1\n" + "movaps %%xmm9, %%xmm3\n" + "addps %%xmm11, %%xmm1\n" + "subps %%xmm11, %%xmm3\n" + "movaps %%xmm12, %%xmm4\n" + "movaps %%xmm12, %%xmm6\n" + "addps %%xmm14, %%xmm4\n" + "subps 
%%xmm14, %%xmm6\n" + "movaps %%xmm13, %%xmm5\n" + "movaps %%xmm13, %%xmm7\n" + "addps %%xmm15, %%xmm5\n" + "subps %%xmm15, %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm12\n" + "addps %%xmm4, %%xmm8\n" + "subps %%xmm4, %%xmm12\n" + "movaps %%xmm1, %%xmm9\n" + "movaps %%xmm1, %%xmm13\n" + "addps %%xmm5, %%xmm9\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm14\n" + "addps %%xmm6, %%xmm10\n" + "subps %%xmm6, %%xmm14\n" + "movaps %%xmm3, %%xmm11\n" + "movaps %%xmm3, %%xmm15\n" + "addps %%xmm7, %%xmm11\n" + "subps %%xmm7, %%xmm15\n" + "movups %%xmm8, (%0)\n" + "movups %%xmm9, (%1)\n" + "movups %%xmm10, (%2)\n" + "movups %%xmm11, (%3)\n" + "movups %%xmm12, (%4)\n" + "movups %%xmm13, (%5)\n" + "movups %%xmm14, (%6)\n" + "movups %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + for (int j = 0; j < 64; j += 64) { + for (int k = 0; k < 32; k += 4) { + __asm__ volatile ( + "movups (%0), %%xmm0\n" + "movups (%1), %%xmm1\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm9\n" + "addps %%xmm1, %%xmm8\n" + "subps %%xmm1, %%xmm9\n" + "movups %%xmm8, (%0)\n" + "movups %%xmm9, (%1)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 32) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } +} +void helper_float_7_recursive(float *buf, int depth); +void helper_float_7_recursive(float *buf, int depth) { + if (depth == 7) { + for (int j = 0; j < 128; j += 32) { + for (int k = 0; k < 4; k += 4) { + __asm__ volatile ( + "movups (%0), %%xmm0\n" + "movups (%1), %%xmm1\n" + "movups (%2), %%xmm2\n" + "movups (%3), %%xmm3\n" + "movups (%4), %%xmm4\n" + "movups (%5), %%xmm5\n" + "movups (%6), %%xmm6\n" + "movups (%7), %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm0, %%xmm0\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm0, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm0\n" + "movaps %%xmm1, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm1, %%xmm1\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm1, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm1\n" + "movaps %%xmm2, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm2, %%xmm2\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm2, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm2\n" + "movaps %%xmm3, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm3, %%xmm3\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm3, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm3\n" + "movaps %%xmm4, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm4, %%xmm4\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm4, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm4\n" + "movaps %%xmm5, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm5, %%xmm5\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm5, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm5\n" + "movaps %%xmm6, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm6, %%xmm6\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm6, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + 
"movaps %%xmm8, %%xmm6\n" + "movaps %%xmm7, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm7, %%xmm7\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm7, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm0, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm0, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm0\n" + "movaps %%xmm1, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm1, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm1, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm1\n" + "movaps %%xmm2, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm2, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm2, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm2\n" + "movaps %%xmm3, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm3, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm3, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm3\n" + "movaps %%xmm4, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm4, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm4, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm4\n" + "movaps %%xmm5, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm5, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm5, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm5\n" + "movaps %%xmm6, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm6, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm6, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm6\n" + "movaps %%xmm7, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm7, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm7, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm9\n" + "addps %%xmm1, %%xmm8\n" + "subps %%xmm1, %%xmm9\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm11\n" + "addps %%xmm3, %%xmm10\n" + "subps %%xmm3, %%xmm11\n" + "movaps %%xmm4, %%xmm12\n" + "movaps %%xmm4, %%xmm13\n" + "addps %%xmm5, %%xmm12\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm6, %%xmm14\n" + "movaps %%xmm6, %%xmm15\n" + "addps %%xmm7, %%xmm14\n" + "subps %%xmm7, %%xmm15\n" + "movaps %%xmm8, %%xmm0\n" + "movaps %%xmm8, %%xmm2\n" + "addps %%xmm10, %%xmm0\n" + "subps %%xmm10, %%xmm2\n" + "movaps %%xmm9, %%xmm1\n" + "movaps %%xmm9, %%xmm3\n" + "addps %%xmm11, %%xmm1\n" + "subps %%xmm11, %%xmm3\n" + "movaps %%xmm12, %%xmm4\n" + "movaps %%xmm12, %%xmm6\n" + "addps %%xmm14, %%xmm4\n" + "subps %%xmm14, %%xmm6\n" + "movaps %%xmm13, %%xmm5\n" + "movaps %%xmm13, %%xmm7\n" + "addps %%xmm15, %%xmm5\n" + "subps %%xmm15, %%xmm7\n" + "movaps 
%%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm12\n" + "addps %%xmm4, %%xmm8\n" + "subps %%xmm4, %%xmm12\n" + "movaps %%xmm1, %%xmm9\n" + "movaps %%xmm1, %%xmm13\n" + "addps %%xmm5, %%xmm9\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm14\n" + "addps %%xmm6, %%xmm10\n" + "subps %%xmm6, %%xmm14\n" + "movaps %%xmm3, %%xmm11\n" + "movaps %%xmm3, %%xmm15\n" + "addps %%xmm7, %%xmm11\n" + "subps %%xmm7, %%xmm15\n" + "movups %%xmm8, (%0)\n" + "movups %%xmm9, (%1)\n" + "movups %%xmm10, (%2)\n" + "movups %%xmm11, (%3)\n" + "movups %%xmm12, (%4)\n" + "movups %%xmm13, (%5)\n" + "movups %%xmm14, (%6)\n" + "movups %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + for (int j = 0; j < 128; j += 128) { + for (int k = 0; k < 32; k += 4) { + __asm__ volatile ( + "movups (%0), %%xmm0\n" + "movups (%1), %%xmm1\n" + "movups (%2), %%xmm2\n" + "movups (%3), %%xmm3\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm9\n" + "addps %%xmm1, %%xmm8\n" + "subps %%xmm1, %%xmm9\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm11\n" + "addps %%xmm3, %%xmm10\n" + "subps %%xmm3, %%xmm11\n" + "movaps %%xmm8, %%xmm0\n" + "movaps %%xmm8, %%xmm2\n" + "addps %%xmm10, %%xmm0\n" + "subps %%xmm10, %%xmm2\n" + "movaps %%xmm9, %%xmm1\n" + "movaps %%xmm9, %%xmm3\n" + "addps %%xmm11, %%xmm1\n" + "subps %%xmm11, %%xmm3\n" + "movups %%xmm0, (%0)\n" + "movups %%xmm1, (%1)\n" + "movups %%xmm2, (%2)\n" + "movups %%xmm3, (%3)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } +} +void helper_float_7(float *buf); +void helper_float_7(float *buf) { + helper_float_7_recursive(buf, 7); +} +static inline void helper_float_8(float *buf); +static inline void helper_float_8(float *buf) { + for (int j = 0; j < 256; j += 32) { + for (int k = 0; k < 4; k += 4) { + __asm__ volatile ( + "movups (%0), %%xmm0\n" + "movups (%1), %%xmm1\n" + "movups (%2), %%xmm2\n" + "movups (%3), %%xmm3\n" + "movups (%4), %%xmm4\n" + "movups (%5), %%xmm5\n" + "movups (%6), %%xmm6\n" + "movups (%7), %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm0, %%xmm0\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm0, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm0\n" + "movaps %%xmm1, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm1, %%xmm1\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm1, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm1\n" + "movaps %%xmm2, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm2, %%xmm2\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm2, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm2\n" + "movaps %%xmm3, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm3, %%xmm3\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm3, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm3\n" + "movaps %%xmm4, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm4, %%xmm4\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm4, %%xmm9\n" + "addsubps 
%%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm4\n" + "movaps %%xmm5, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm5, %%xmm5\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm5, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm5\n" + "movaps %%xmm6, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm6, %%xmm6\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm6, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm6\n" + "movaps %%xmm7, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm7, %%xmm7\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm7, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm0, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm0, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm0\n" + "movaps %%xmm1, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm1, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm1, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm1\n" + "movaps %%xmm2, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm2, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm2, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm2\n" + "movaps %%xmm3, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm3, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm3, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm3\n" + "movaps %%xmm4, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm4, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm4, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm4\n" + "movaps %%xmm5, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm5, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm5, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm5\n" + "movaps %%xmm6, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm6, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm6, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm6\n" + "movaps %%xmm7, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm7, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm7, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm9\n" + "addps %%xmm1, %%xmm8\n" + "subps %%xmm1, %%xmm9\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm11\n" + "addps %%xmm3, %%xmm10\n" + "subps %%xmm3, %%xmm11\n" + "movaps %%xmm4, %%xmm12\n" + "movaps %%xmm4, %%xmm13\n" + "addps %%xmm5, %%xmm12\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm6, %%xmm14\n" + "movaps %%xmm6, %%xmm15\n" + "addps %%xmm7, %%xmm14\n" + "subps %%xmm7, %%xmm15\n" + "movaps %%xmm8, %%xmm0\n" + 
"movaps %%xmm8, %%xmm2\n" + "addps %%xmm10, %%xmm0\n" + "subps %%xmm10, %%xmm2\n" + "movaps %%xmm9, %%xmm1\n" + "movaps %%xmm9, %%xmm3\n" + "addps %%xmm11, %%xmm1\n" + "subps %%xmm11, %%xmm3\n" + "movaps %%xmm12, %%xmm4\n" + "movaps %%xmm12, %%xmm6\n" + "addps %%xmm14, %%xmm4\n" + "subps %%xmm14, %%xmm6\n" + "movaps %%xmm13, %%xmm5\n" + "movaps %%xmm13, %%xmm7\n" + "addps %%xmm15, %%xmm5\n" + "subps %%xmm15, %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm12\n" + "addps %%xmm4, %%xmm8\n" + "subps %%xmm4, %%xmm12\n" + "movaps %%xmm1, %%xmm9\n" + "movaps %%xmm1, %%xmm13\n" + "addps %%xmm5, %%xmm9\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm14\n" + "addps %%xmm6, %%xmm10\n" + "subps %%xmm6, %%xmm14\n" + "movaps %%xmm3, %%xmm11\n" + "movaps %%xmm3, %%xmm15\n" + "addps %%xmm7, %%xmm11\n" + "subps %%xmm7, %%xmm15\n" + "movups %%xmm8, (%0)\n" + "movups %%xmm9, (%1)\n" + "movups %%xmm10, (%2)\n" + "movups %%xmm11, (%3)\n" + "movups %%xmm12, (%4)\n" + "movups %%xmm13, (%5)\n" + "movups %%xmm14, (%6)\n" + "movups %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + for (int j = 0; j < 256; j += 256) { + for (int k = 0; k < 32; k += 4) { + __asm__ volatile ( + "movups (%0), %%xmm0\n" + "movups (%1), %%xmm1\n" + "movups (%2), %%xmm2\n" + "movups (%3), %%xmm3\n" + "movups (%4), %%xmm4\n" + "movups (%5), %%xmm5\n" + "movups (%6), %%xmm6\n" + "movups (%7), %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm9\n" + "addps %%xmm1, %%xmm8\n" + "subps %%xmm1, %%xmm9\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm11\n" + "addps %%xmm3, %%xmm10\n" + "subps %%xmm3, %%xmm11\n" + "movaps %%xmm4, %%xmm12\n" + "movaps %%xmm4, %%xmm13\n" + "addps %%xmm5, %%xmm12\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm6, %%xmm14\n" + "movaps %%xmm6, %%xmm15\n" + "addps %%xmm7, %%xmm14\n" + "subps %%xmm7, %%xmm15\n" + "movaps %%xmm8, %%xmm0\n" + "movaps %%xmm8, %%xmm2\n" + "addps %%xmm10, %%xmm0\n" + "subps %%xmm10, %%xmm2\n" + "movaps %%xmm9, %%xmm1\n" + "movaps %%xmm9, %%xmm3\n" + "addps %%xmm11, %%xmm1\n" + "subps %%xmm11, %%xmm3\n" + "movaps %%xmm12, %%xmm4\n" + "movaps %%xmm12, %%xmm6\n" + "addps %%xmm14, %%xmm4\n" + "subps %%xmm14, %%xmm6\n" + "movaps %%xmm13, %%xmm5\n" + "movaps %%xmm13, %%xmm7\n" + "addps %%xmm15, %%xmm5\n" + "subps %%xmm15, %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm12\n" + "addps %%xmm4, %%xmm8\n" + "subps %%xmm4, %%xmm12\n" + "movaps %%xmm1, %%xmm9\n" + "movaps %%xmm1, %%xmm13\n" + "addps %%xmm5, %%xmm9\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm14\n" + "addps %%xmm6, %%xmm10\n" + "subps %%xmm6, %%xmm14\n" + "movaps %%xmm3, %%xmm11\n" + "movaps %%xmm3, %%xmm15\n" + "addps %%xmm7, %%xmm11\n" + "subps %%xmm7, %%xmm15\n" + "movups %%xmm8, (%0)\n" + "movups %%xmm9, (%1)\n" + "movups %%xmm10, (%2)\n" + "movups %%xmm11, (%3)\n" + "movups %%xmm12, (%4)\n" + "movups %%xmm13, (%5)\n" + "movups %%xmm14, (%6)\n" + "movups %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96), "r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", 
"%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } +} +static inline void helper_float_9(float *buf); +static inline void helper_float_9(float *buf) { + for (int j = 0; j < 512; j += 32) { + for (int k = 0; k < 4; k += 4) { + __asm__ volatile ( + "movups (%0), %%xmm0\n" + "movups (%1), %%xmm1\n" + "movups (%2), %%xmm2\n" + "movups (%3), %%xmm3\n" + "movups (%4), %%xmm4\n" + "movups (%5), %%xmm5\n" + "movups (%6), %%xmm6\n" + "movups (%7), %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm0, %%xmm0\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm0, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm0\n" + "movaps %%xmm1, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm1, %%xmm1\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm1, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm1\n" + "movaps %%xmm2, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm2, %%xmm2\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm2, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm2\n" + "movaps %%xmm3, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm3, %%xmm3\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm3, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm3\n" + "movaps %%xmm4, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm4, %%xmm4\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm4, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm4\n" + "movaps %%xmm5, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm5, %%xmm5\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm5, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm5\n" + "movaps %%xmm6, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm6, %%xmm6\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm6, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm6\n" + "movaps %%xmm7, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm7, %%xmm7\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm7, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm0, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm0, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm0\n" + "movaps %%xmm1, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm1, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm1, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm1\n" + "movaps %%xmm2, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm2, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm2, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm2\n" + "movaps %%xmm3, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm3, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm3, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm3\n" + "movaps %%xmm4, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm4, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + 
"movaps %%xmm4, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm4\n" + "movaps %%xmm5, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm5, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm5, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm5\n" + "movaps %%xmm6, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm6, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm6, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm6\n" + "movaps %%xmm7, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm7, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm7, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm9\n" + "addps %%xmm1, %%xmm8\n" + "subps %%xmm1, %%xmm9\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm11\n" + "addps %%xmm3, %%xmm10\n" + "subps %%xmm3, %%xmm11\n" + "movaps %%xmm4, %%xmm12\n" + "movaps %%xmm4, %%xmm13\n" + "addps %%xmm5, %%xmm12\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm6, %%xmm14\n" + "movaps %%xmm6, %%xmm15\n" + "addps %%xmm7, %%xmm14\n" + "subps %%xmm7, %%xmm15\n" + "movaps %%xmm8, %%xmm0\n" + "movaps %%xmm8, %%xmm2\n" + "addps %%xmm10, %%xmm0\n" + "subps %%xmm10, %%xmm2\n" + "movaps %%xmm9, %%xmm1\n" + "movaps %%xmm9, %%xmm3\n" + "addps %%xmm11, %%xmm1\n" + "subps %%xmm11, %%xmm3\n" + "movaps %%xmm12, %%xmm4\n" + "movaps %%xmm12, %%xmm6\n" + "addps %%xmm14, %%xmm4\n" + "subps %%xmm14, %%xmm6\n" + "movaps %%xmm13, %%xmm5\n" + "movaps %%xmm13, %%xmm7\n" + "addps %%xmm15, %%xmm5\n" + "subps %%xmm15, %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm12\n" + "addps %%xmm4, %%xmm8\n" + "subps %%xmm4, %%xmm12\n" + "movaps %%xmm1, %%xmm9\n" + "movaps %%xmm1, %%xmm13\n" + "addps %%xmm5, %%xmm9\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm14\n" + "addps %%xmm6, %%xmm10\n" + "subps %%xmm6, %%xmm14\n" + "movaps %%xmm3, %%xmm11\n" + "movaps %%xmm3, %%xmm15\n" + "addps %%xmm7, %%xmm11\n" + "subps %%xmm7, %%xmm15\n" + "movups %%xmm8, (%0)\n" + "movups %%xmm9, (%1)\n" + "movups %%xmm10, (%2)\n" + "movups %%xmm11, (%3)\n" + "movups %%xmm12, (%4)\n" + "movups %%xmm13, (%5)\n" + "movups %%xmm14, (%6)\n" + "movups %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + for (int j = 0; j < 512; j += 256) { + for (int k = 0; k < 32; k += 4) { + __asm__ volatile ( + "movups (%0), %%xmm0\n" + "movups (%1), %%xmm1\n" + "movups (%2), %%xmm2\n" + "movups (%3), %%xmm3\n" + "movups (%4), %%xmm4\n" + "movups (%5), %%xmm5\n" + "movups (%6), %%xmm6\n" + "movups (%7), %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm9\n" + "addps %%xmm1, %%xmm8\n" + "subps %%xmm1, %%xmm9\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm11\n" + "addps %%xmm3, %%xmm10\n" + "subps %%xmm3, %%xmm11\n" + "movaps %%xmm4, %%xmm12\n" + "movaps %%xmm4, 
%%xmm13\n" + "addps %%xmm5, %%xmm12\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm6, %%xmm14\n" + "movaps %%xmm6, %%xmm15\n" + "addps %%xmm7, %%xmm14\n" + "subps %%xmm7, %%xmm15\n" + "movaps %%xmm8, %%xmm0\n" + "movaps %%xmm8, %%xmm2\n" + "addps %%xmm10, %%xmm0\n" + "subps %%xmm10, %%xmm2\n" + "movaps %%xmm9, %%xmm1\n" + "movaps %%xmm9, %%xmm3\n" + "addps %%xmm11, %%xmm1\n" + "subps %%xmm11, %%xmm3\n" + "movaps %%xmm12, %%xmm4\n" + "movaps %%xmm12, %%xmm6\n" + "addps %%xmm14, %%xmm4\n" + "subps %%xmm14, %%xmm6\n" + "movaps %%xmm13, %%xmm5\n" + "movaps %%xmm13, %%xmm7\n" + "addps %%xmm15, %%xmm5\n" + "subps %%xmm15, %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm12\n" + "addps %%xmm4, %%xmm8\n" + "subps %%xmm4, %%xmm12\n" + "movaps %%xmm1, %%xmm9\n" + "movaps %%xmm1, %%xmm13\n" + "addps %%xmm5, %%xmm9\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm14\n" + "addps %%xmm6, %%xmm10\n" + "subps %%xmm6, %%xmm14\n" + "movaps %%xmm3, %%xmm11\n" + "movaps %%xmm3, %%xmm15\n" + "addps %%xmm7, %%xmm11\n" + "subps %%xmm7, %%xmm15\n" + "movups %%xmm8, (%0)\n" + "movups %%xmm9, (%1)\n" + "movups %%xmm10, (%2)\n" + "movups %%xmm11, (%3)\n" + "movups %%xmm12, (%4)\n" + "movups %%xmm13, (%5)\n" + "movups %%xmm14, (%6)\n" + "movups %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96), "r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + for (int j = 0; j < 512; j += 512) { + for (int k = 0; k < 256; k += 4) { + __asm__ volatile ( + "movups (%0), %%xmm0\n" + "movups (%1), %%xmm1\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm9\n" + "addps %%xmm1, %%xmm8\n" + "subps %%xmm1, %%xmm9\n" + "movups %%xmm8, (%0)\n" + "movups %%xmm9, (%1)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 256) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } +} +static inline void helper_float_10(float *buf); +static inline void helper_float_10(float *buf) { + for (int j = 0; j < 1024; j += 32) { + for (int k = 0; k < 4; k += 4) { + __asm__ volatile ( + "movups (%0), %%xmm0\n" + "movups (%1), %%xmm1\n" + "movups (%2), %%xmm2\n" + "movups (%3), %%xmm3\n" + "movups (%4), %%xmm4\n" + "movups (%5), %%xmm5\n" + "movups (%6), %%xmm6\n" + "movups (%7), %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm0, %%xmm0\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm0, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm0\n" + "movaps %%xmm1, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm1, %%xmm1\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm1, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm1\n" + "movaps %%xmm2, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm2, %%xmm2\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm2, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm2\n" + "movaps %%xmm3, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm3, %%xmm3\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm3, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm3\n" + "movaps %%xmm4, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm4, %%xmm4\n" + "xorps %%xmm9, 
%%xmm9\n" + "subps %%xmm4, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm4\n" + "movaps %%xmm5, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm5, %%xmm5\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm5, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm5\n" + "movaps %%xmm6, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm6, %%xmm6\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm6, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm6\n" + "movaps %%xmm7, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm7, %%xmm7\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm7, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm0, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm0, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm0\n" + "movaps %%xmm1, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm1, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm1, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm1\n" + "movaps %%xmm2, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm2, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm2, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm2\n" + "movaps %%xmm3, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm3, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm3, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm3\n" + "movaps %%xmm4, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm4, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm4, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm4\n" + "movaps %%xmm5, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm5, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm5, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm5\n" + "movaps %%xmm6, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm6, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm6, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm6\n" + "movaps %%xmm7, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm7, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm7, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm9\n" + "addps %%xmm1, %%xmm8\n" + "subps %%xmm1, %%xmm9\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm11\n" + "addps %%xmm3, %%xmm10\n" + "subps %%xmm3, %%xmm11\n" + "movaps %%xmm4, %%xmm12\n" + "movaps %%xmm4, %%xmm13\n" + "addps %%xmm5, %%xmm12\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm6, %%xmm14\n" + "movaps %%xmm6, %%xmm15\n" + "addps %%xmm7, %%xmm14\n" + "subps 
%%xmm7, %%xmm15\n" + "movaps %%xmm8, %%xmm0\n" + "movaps %%xmm8, %%xmm2\n" + "addps %%xmm10, %%xmm0\n" + "subps %%xmm10, %%xmm2\n" + "movaps %%xmm9, %%xmm1\n" + "movaps %%xmm9, %%xmm3\n" + "addps %%xmm11, %%xmm1\n" + "subps %%xmm11, %%xmm3\n" + "movaps %%xmm12, %%xmm4\n" + "movaps %%xmm12, %%xmm6\n" + "addps %%xmm14, %%xmm4\n" + "subps %%xmm14, %%xmm6\n" + "movaps %%xmm13, %%xmm5\n" + "movaps %%xmm13, %%xmm7\n" + "addps %%xmm15, %%xmm5\n" + "subps %%xmm15, %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm12\n" + "addps %%xmm4, %%xmm8\n" + "subps %%xmm4, %%xmm12\n" + "movaps %%xmm1, %%xmm9\n" + "movaps %%xmm1, %%xmm13\n" + "addps %%xmm5, %%xmm9\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm14\n" + "addps %%xmm6, %%xmm10\n" + "subps %%xmm6, %%xmm14\n" + "movaps %%xmm3, %%xmm11\n" + "movaps %%xmm3, %%xmm15\n" + "addps %%xmm7, %%xmm11\n" + "subps %%xmm7, %%xmm15\n" + "movups %%xmm8, (%0)\n" + "movups %%xmm9, (%1)\n" + "movups %%xmm10, (%2)\n" + "movups %%xmm11, (%3)\n" + "movups %%xmm12, (%4)\n" + "movups %%xmm13, (%5)\n" + "movups %%xmm14, (%6)\n" + "movups %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + for (int j = 0; j < 1024; j += 256) { + for (int k = 0; k < 32; k += 4) { + __asm__ volatile ( + "movups (%0), %%xmm0\n" + "movups (%1), %%xmm1\n" + "movups (%2), %%xmm2\n" + "movups (%3), %%xmm3\n" + "movups (%4), %%xmm4\n" + "movups (%5), %%xmm5\n" + "movups (%6), %%xmm6\n" + "movups (%7), %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm9\n" + "addps %%xmm1, %%xmm8\n" + "subps %%xmm1, %%xmm9\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm11\n" + "addps %%xmm3, %%xmm10\n" + "subps %%xmm3, %%xmm11\n" + "movaps %%xmm4, %%xmm12\n" + "movaps %%xmm4, %%xmm13\n" + "addps %%xmm5, %%xmm12\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm6, %%xmm14\n" + "movaps %%xmm6, %%xmm15\n" + "addps %%xmm7, %%xmm14\n" + "subps %%xmm7, %%xmm15\n" + "movaps %%xmm8, %%xmm0\n" + "movaps %%xmm8, %%xmm2\n" + "addps %%xmm10, %%xmm0\n" + "subps %%xmm10, %%xmm2\n" + "movaps %%xmm9, %%xmm1\n" + "movaps %%xmm9, %%xmm3\n" + "addps %%xmm11, %%xmm1\n" + "subps %%xmm11, %%xmm3\n" + "movaps %%xmm12, %%xmm4\n" + "movaps %%xmm12, %%xmm6\n" + "addps %%xmm14, %%xmm4\n" + "subps %%xmm14, %%xmm6\n" + "movaps %%xmm13, %%xmm5\n" + "movaps %%xmm13, %%xmm7\n" + "addps %%xmm15, %%xmm5\n" + "subps %%xmm15, %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm12\n" + "addps %%xmm4, %%xmm8\n" + "subps %%xmm4, %%xmm12\n" + "movaps %%xmm1, %%xmm9\n" + "movaps %%xmm1, %%xmm13\n" + "addps %%xmm5, %%xmm9\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm14\n" + "addps %%xmm6, %%xmm10\n" + "subps %%xmm6, %%xmm14\n" + "movaps %%xmm3, %%xmm11\n" + "movaps %%xmm3, %%xmm15\n" + "addps %%xmm7, %%xmm11\n" + "subps %%xmm7, %%xmm15\n" + "movups %%xmm8, (%0)\n" + "movups %%xmm9, (%1)\n" + "movups %%xmm10, (%2)\n" + "movups %%xmm11, (%3)\n" + "movups %%xmm12, (%4)\n" + "movups %%xmm13, (%5)\n" + "movups %%xmm14, (%6)\n" + "movups %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96), "r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k 
+ 224) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + for (int j = 0; j < 1024; j += 1024) { + for (int k = 0; k < 256; k += 4) { + __asm__ volatile ( + "movups (%0), %%xmm0\n" + "movups (%1), %%xmm1\n" + "movups (%2), %%xmm2\n" + "movups (%3), %%xmm3\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm9\n" + "addps %%xmm1, %%xmm8\n" + "subps %%xmm1, %%xmm9\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm11\n" + "addps %%xmm3, %%xmm10\n" + "subps %%xmm3, %%xmm11\n" + "movaps %%xmm8, %%xmm0\n" + "movaps %%xmm8, %%xmm2\n" + "addps %%xmm10, %%xmm0\n" + "subps %%xmm10, %%xmm2\n" + "movaps %%xmm9, %%xmm1\n" + "movaps %%xmm9, %%xmm3\n" + "addps %%xmm11, %%xmm1\n" + "subps %%xmm11, %%xmm3\n" + "movups %%xmm0, (%0)\n" + "movups %%xmm1, (%1)\n" + "movups %%xmm2, (%2)\n" + "movups %%xmm3, (%3)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 256), "r"(buf + j + k + 512), "r"(buf + j + k + 768) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } +} +static inline void helper_float_11(float *buf); +static inline void helper_float_11(float *buf) { + for (int j = 0; j < 2048; j += 32) { + for (int k = 0; k < 4; k += 4) { + __asm__ volatile ( + "movups (%0), %%xmm0\n" + "movups (%1), %%xmm1\n" + "movups (%2), %%xmm2\n" + "movups (%3), %%xmm3\n" + "movups (%4), %%xmm4\n" + "movups (%5), %%xmm5\n" + "movups (%6), %%xmm6\n" + "movups (%7), %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm0, %%xmm0\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm0, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm0\n" + "movaps %%xmm1, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm1, %%xmm1\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm1, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm1\n" + "movaps %%xmm2, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm2, %%xmm2\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm2, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm2\n" + "movaps %%xmm3, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm3, %%xmm3\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm3, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm3\n" + "movaps %%xmm4, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm4, %%xmm4\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm4, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm4\n" + "movaps %%xmm5, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm5, %%xmm5\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm5, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm5\n" + "movaps %%xmm6, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm6, %%xmm6\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm6, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm6\n" + "movaps %%xmm7, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm7, %%xmm7\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm7, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm0, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm0, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + 
"movaps %%xmm10, %%xmm0\n" + "movaps %%xmm1, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm1, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm1, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm1\n" + "movaps %%xmm2, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm2, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm2, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm2\n" + "movaps %%xmm3, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm3, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm3, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm3\n" + "movaps %%xmm4, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm4, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm4, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm4\n" + "movaps %%xmm5, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm5, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm5, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm5\n" + "movaps %%xmm6, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm6, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm6, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm6\n" + "movaps %%xmm7, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm7, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm7, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm9\n" + "addps %%xmm1, %%xmm8\n" + "subps %%xmm1, %%xmm9\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm11\n" + "addps %%xmm3, %%xmm10\n" + "subps %%xmm3, %%xmm11\n" + "movaps %%xmm4, %%xmm12\n" + "movaps %%xmm4, %%xmm13\n" + "addps %%xmm5, %%xmm12\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm6, %%xmm14\n" + "movaps %%xmm6, %%xmm15\n" + "addps %%xmm7, %%xmm14\n" + "subps %%xmm7, %%xmm15\n" + "movaps %%xmm8, %%xmm0\n" + "movaps %%xmm8, %%xmm2\n" + "addps %%xmm10, %%xmm0\n" + "subps %%xmm10, %%xmm2\n" + "movaps %%xmm9, %%xmm1\n" + "movaps %%xmm9, %%xmm3\n" + "addps %%xmm11, %%xmm1\n" + "subps %%xmm11, %%xmm3\n" + "movaps %%xmm12, %%xmm4\n" + "movaps %%xmm12, %%xmm6\n" + "addps %%xmm14, %%xmm4\n" + "subps %%xmm14, %%xmm6\n" + "movaps %%xmm13, %%xmm5\n" + "movaps %%xmm13, %%xmm7\n" + "addps %%xmm15, %%xmm5\n" + "subps %%xmm15, %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm12\n" + "addps %%xmm4, %%xmm8\n" + "subps %%xmm4, %%xmm12\n" + "movaps %%xmm1, %%xmm9\n" + "movaps %%xmm1, %%xmm13\n" + "addps %%xmm5, %%xmm9\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm14\n" + "addps %%xmm6, %%xmm10\n" + "subps %%xmm6, %%xmm14\n" + "movaps %%xmm3, %%xmm11\n" + "movaps %%xmm3, %%xmm15\n" + "addps %%xmm7, %%xmm11\n" + "subps %%xmm7, %%xmm15\n" + "movups %%xmm8, (%0)\n" + "movups %%xmm9, (%1)\n" + "movups %%xmm10, 
(%2)\n" + "movups %%xmm11, (%3)\n" + "movups %%xmm12, (%4)\n" + "movups %%xmm13, (%5)\n" + "movups %%xmm14, (%6)\n" + "movups %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + for (int j = 0; j < 2048; j += 256) { + for (int k = 0; k < 32; k += 4) { + __asm__ volatile ( + "movups (%0), %%xmm0\n" + "movups (%1), %%xmm1\n" + "movups (%2), %%xmm2\n" + "movups (%3), %%xmm3\n" + "movups (%4), %%xmm4\n" + "movups (%5), %%xmm5\n" + "movups (%6), %%xmm6\n" + "movups (%7), %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm9\n" + "addps %%xmm1, %%xmm8\n" + "subps %%xmm1, %%xmm9\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm11\n" + "addps %%xmm3, %%xmm10\n" + "subps %%xmm3, %%xmm11\n" + "movaps %%xmm4, %%xmm12\n" + "movaps %%xmm4, %%xmm13\n" + "addps %%xmm5, %%xmm12\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm6, %%xmm14\n" + "movaps %%xmm6, %%xmm15\n" + "addps %%xmm7, %%xmm14\n" + "subps %%xmm7, %%xmm15\n" + "movaps %%xmm8, %%xmm0\n" + "movaps %%xmm8, %%xmm2\n" + "addps %%xmm10, %%xmm0\n" + "subps %%xmm10, %%xmm2\n" + "movaps %%xmm9, %%xmm1\n" + "movaps %%xmm9, %%xmm3\n" + "addps %%xmm11, %%xmm1\n" + "subps %%xmm11, %%xmm3\n" + "movaps %%xmm12, %%xmm4\n" + "movaps %%xmm12, %%xmm6\n" + "addps %%xmm14, %%xmm4\n" + "subps %%xmm14, %%xmm6\n" + "movaps %%xmm13, %%xmm5\n" + "movaps %%xmm13, %%xmm7\n" + "addps %%xmm15, %%xmm5\n" + "subps %%xmm15, %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm12\n" + "addps %%xmm4, %%xmm8\n" + "subps %%xmm4, %%xmm12\n" + "movaps %%xmm1, %%xmm9\n" + "movaps %%xmm1, %%xmm13\n" + "addps %%xmm5, %%xmm9\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm14\n" + "addps %%xmm6, %%xmm10\n" + "subps %%xmm6, %%xmm14\n" + "movaps %%xmm3, %%xmm11\n" + "movaps %%xmm3, %%xmm15\n" + "addps %%xmm7, %%xmm11\n" + "subps %%xmm7, %%xmm15\n" + "movups %%xmm8, (%0)\n" + "movups %%xmm9, (%1)\n" + "movups %%xmm10, (%2)\n" + "movups %%xmm11, (%3)\n" + "movups %%xmm12, (%4)\n" + "movups %%xmm13, (%5)\n" + "movups %%xmm14, (%6)\n" + "movups %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96), "r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + for (int j = 0; j < 2048; j += 2048) { + for (int k = 0; k < 256; k += 4) { + __asm__ volatile ( + "movups (%0), %%xmm0\n" + "movups (%1), %%xmm1\n" + "movups (%2), %%xmm2\n" + "movups (%3), %%xmm3\n" + "movups (%4), %%xmm4\n" + "movups (%5), %%xmm5\n" + "movups (%6), %%xmm6\n" + "movups (%7), %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm9\n" + "addps %%xmm1, %%xmm8\n" + "subps %%xmm1, %%xmm9\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm11\n" + "addps %%xmm3, %%xmm10\n" + "subps %%xmm3, %%xmm11\n" + "movaps %%xmm4, %%xmm12\n" + "movaps %%xmm4, %%xmm13\n" + "addps %%xmm5, %%xmm12\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm6, %%xmm14\n" + "movaps %%xmm6, %%xmm15\n" + "addps %%xmm7, %%xmm14\n" + "subps %%xmm7, %%xmm15\n" + "movaps %%xmm8, %%xmm0\n" + "movaps %%xmm8, 
%%xmm2\n" + "addps %%xmm10, %%xmm0\n" + "subps %%xmm10, %%xmm2\n" + "movaps %%xmm9, %%xmm1\n" + "movaps %%xmm9, %%xmm3\n" + "addps %%xmm11, %%xmm1\n" + "subps %%xmm11, %%xmm3\n" + "movaps %%xmm12, %%xmm4\n" + "movaps %%xmm12, %%xmm6\n" + "addps %%xmm14, %%xmm4\n" + "subps %%xmm14, %%xmm6\n" + "movaps %%xmm13, %%xmm5\n" + "movaps %%xmm13, %%xmm7\n" + "addps %%xmm15, %%xmm5\n" + "subps %%xmm15, %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm12\n" + "addps %%xmm4, %%xmm8\n" + "subps %%xmm4, %%xmm12\n" + "movaps %%xmm1, %%xmm9\n" + "movaps %%xmm1, %%xmm13\n" + "addps %%xmm5, %%xmm9\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm14\n" + "addps %%xmm6, %%xmm10\n" + "subps %%xmm6, %%xmm14\n" + "movaps %%xmm3, %%xmm11\n" + "movaps %%xmm3, %%xmm15\n" + "addps %%xmm7, %%xmm11\n" + "subps %%xmm7, %%xmm15\n" + "movups %%xmm8, (%0)\n" + "movups %%xmm9, (%1)\n" + "movups %%xmm10, (%2)\n" + "movups %%xmm11, (%3)\n" + "movups %%xmm12, (%4)\n" + "movups %%xmm13, (%5)\n" + "movups %%xmm14, (%6)\n" + "movups %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 256), "r"(buf + j + k + 512), "r"(buf + j + k + 768), "r"(buf + j + k + 1024), "r"(buf + j + k + 1280), "r"(buf + j + k + 1536), "r"(buf + j + k + 1792) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } +} +void helper_float_12_recursive(float *buf, int depth); +void helper_float_12_recursive(float *buf, int depth) { + if (depth == 7) { + for (int j = 0; j < 128; j += 32) { + for (int k = 0; k < 4; k += 4) { + __asm__ volatile ( + "movups (%0), %%xmm0\n" + "movups (%1), %%xmm1\n" + "movups (%2), %%xmm2\n" + "movups (%3), %%xmm3\n" + "movups (%4), %%xmm4\n" + "movups (%5), %%xmm5\n" + "movups (%6), %%xmm6\n" + "movups (%7), %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm0, %%xmm0\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm0, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm0\n" + "movaps %%xmm1, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm1, %%xmm1\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm1, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm1\n" + "movaps %%xmm2, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm2, %%xmm2\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm2, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm2\n" + "movaps %%xmm3, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm3, %%xmm3\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm3, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm3\n" + "movaps %%xmm4, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm4, %%xmm4\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm4, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm4\n" + "movaps %%xmm5, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm5, %%xmm5\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm5, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm5\n" + "movaps %%xmm6, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm6, %%xmm6\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm6, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm6\n" + "movaps %%xmm7, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm7, %%xmm7\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm7, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm7\n" + 
"movaps %%xmm0, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm0, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm0, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm0\n" + "movaps %%xmm1, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm1, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm1, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm1\n" + "movaps %%xmm2, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm2, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm2, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm2\n" + "movaps %%xmm3, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm3, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm3, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm3\n" + "movaps %%xmm4, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm4, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm4, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm4\n" + "movaps %%xmm5, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm5, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm5, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm5\n" + "movaps %%xmm6, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm6, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm6, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm6\n" + "movaps %%xmm7, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm7, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm7, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm9\n" + "addps %%xmm1, %%xmm8\n" + "subps %%xmm1, %%xmm9\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm11\n" + "addps %%xmm3, %%xmm10\n" + "subps %%xmm3, %%xmm11\n" + "movaps %%xmm4, %%xmm12\n" + "movaps %%xmm4, %%xmm13\n" + "addps %%xmm5, %%xmm12\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm6, %%xmm14\n" + "movaps %%xmm6, %%xmm15\n" + "addps %%xmm7, %%xmm14\n" + "subps %%xmm7, %%xmm15\n" + "movaps %%xmm8, %%xmm0\n" + "movaps %%xmm8, %%xmm2\n" + "addps %%xmm10, %%xmm0\n" + "subps %%xmm10, %%xmm2\n" + "movaps %%xmm9, %%xmm1\n" + "movaps %%xmm9, %%xmm3\n" + "addps %%xmm11, %%xmm1\n" + "subps %%xmm11, %%xmm3\n" + "movaps %%xmm12, %%xmm4\n" + "movaps %%xmm12, %%xmm6\n" + "addps %%xmm14, %%xmm4\n" + "subps %%xmm14, %%xmm6\n" + "movaps %%xmm13, %%xmm5\n" + "movaps %%xmm13, %%xmm7\n" + "addps %%xmm15, %%xmm5\n" + "subps %%xmm15, %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm12\n" + "addps %%xmm4, %%xmm8\n" + "subps %%xmm4, %%xmm12\n" + "movaps %%xmm1, %%xmm9\n" + "movaps %%xmm1, %%xmm13\n" + "addps %%xmm5, %%xmm9\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm2, %%xmm10\n" 
+ "movaps %%xmm2, %%xmm14\n" + "addps %%xmm6, %%xmm10\n" + "subps %%xmm6, %%xmm14\n" + "movaps %%xmm3, %%xmm11\n" + "movaps %%xmm3, %%xmm15\n" + "addps %%xmm7, %%xmm11\n" + "subps %%xmm7, %%xmm15\n" + "movups %%xmm8, (%0)\n" + "movups %%xmm9, (%1)\n" + "movups %%xmm10, (%2)\n" + "movups %%xmm11, (%3)\n" + "movups %%xmm12, (%4)\n" + "movups %%xmm13, (%5)\n" + "movups %%xmm14, (%6)\n" + "movups %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + for (int j = 0; j < 128; j += 128) { + for (int k = 0; k < 32; k += 4) { + __asm__ volatile ( + "movups (%0), %%xmm0\n" + "movups (%1), %%xmm1\n" + "movups (%2), %%xmm2\n" + "movups (%3), %%xmm3\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm9\n" + "addps %%xmm1, %%xmm8\n" + "subps %%xmm1, %%xmm9\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm11\n" + "addps %%xmm3, %%xmm10\n" + "subps %%xmm3, %%xmm11\n" + "movaps %%xmm8, %%xmm0\n" + "movaps %%xmm8, %%xmm2\n" + "addps %%xmm10, %%xmm0\n" + "subps %%xmm10, %%xmm2\n" + "movaps %%xmm9, %%xmm1\n" + "movaps %%xmm9, %%xmm3\n" + "addps %%xmm11, %%xmm1\n" + "subps %%xmm11, %%xmm3\n" + "movups %%xmm0, (%0)\n" + "movups %%xmm1, (%1)\n" + "movups %%xmm2, (%2)\n" + "movups %%xmm3, (%3)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } + if (depth == 10) { + helper_float_12_recursive(buf + 0, 7); + helper_float_12_recursive(buf + 128, 7); + helper_float_12_recursive(buf + 256, 7); + helper_float_12_recursive(buf + 384, 7); + helper_float_12_recursive(buf + 512, 7); + helper_float_12_recursive(buf + 640, 7); + helper_float_12_recursive(buf + 768, 7); + helper_float_12_recursive(buf + 896, 7); + for (int j = 0; j < 1024; j += 1024) { + for (int k = 0; k < 128; k += 4) { + __asm__ volatile ( + "movups (%0), %%xmm0\n" + "movups (%1), %%xmm1\n" + "movups (%2), %%xmm2\n" + "movups (%3), %%xmm3\n" + "movups (%4), %%xmm4\n" + "movups (%5), %%xmm5\n" + "movups (%6), %%xmm6\n" + "movups (%7), %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm9\n" + "addps %%xmm1, %%xmm8\n" + "subps %%xmm1, %%xmm9\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm11\n" + "addps %%xmm3, %%xmm10\n" + "subps %%xmm3, %%xmm11\n" + "movaps %%xmm4, %%xmm12\n" + "movaps %%xmm4, %%xmm13\n" + "addps %%xmm5, %%xmm12\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm6, %%xmm14\n" + "movaps %%xmm6, %%xmm15\n" + "addps %%xmm7, %%xmm14\n" + "subps %%xmm7, %%xmm15\n" + "movaps %%xmm8, %%xmm0\n" + "movaps %%xmm8, %%xmm2\n" + "addps %%xmm10, %%xmm0\n" + "subps %%xmm10, %%xmm2\n" + "movaps %%xmm9, %%xmm1\n" + "movaps %%xmm9, %%xmm3\n" + "addps %%xmm11, %%xmm1\n" + "subps %%xmm11, %%xmm3\n" + "movaps %%xmm12, %%xmm4\n" + "movaps %%xmm12, %%xmm6\n" + "addps %%xmm14, %%xmm4\n" + "subps %%xmm14, %%xmm6\n" + "movaps %%xmm13, %%xmm5\n" + "movaps %%xmm13, %%xmm7\n" + "addps %%xmm15, %%xmm5\n" + "subps %%xmm15, %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm12\n" + "addps %%xmm4, %%xmm8\n" + "subps %%xmm4, %%xmm12\n" + "movaps %%xmm1, %%xmm9\n" + "movaps %%xmm1, %%xmm13\n" + 
"addps %%xmm5, %%xmm9\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm14\n" + "addps %%xmm6, %%xmm10\n" + "subps %%xmm6, %%xmm14\n" + "movaps %%xmm3, %%xmm11\n" + "movaps %%xmm3, %%xmm15\n" + "addps %%xmm7, %%xmm11\n" + "subps %%xmm7, %%xmm15\n" + "movups %%xmm8, (%0)\n" + "movups %%xmm9, (%1)\n" + "movups %%xmm10, (%2)\n" + "movups %%xmm11, (%3)\n" + "movups %%xmm12, (%4)\n" + "movups %%xmm13, (%5)\n" + "movups %%xmm14, (%6)\n" + "movups %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 128), "r"(buf + j + k + 256), "r"(buf + j + k + 384), "r"(buf + j + k + 512), "r"(buf + j + k + 640), "r"(buf + j + k + 768), "r"(buf + j + k + 896) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } + if (depth == 12) { + helper_float_12_recursive(buf + 0, 10); + helper_float_12_recursive(buf + 1024, 10); + helper_float_12_recursive(buf + 2048, 10); + helper_float_12_recursive(buf + 3072, 10); + for (int j = 0; j < 4096; j += 4096) { + for (int k = 0; k < 1024; k += 4) { + __asm__ volatile ( + "movups (%0), %%xmm0\n" + "movups (%1), %%xmm1\n" + "movups (%2), %%xmm2\n" + "movups (%3), %%xmm3\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm9\n" + "addps %%xmm1, %%xmm8\n" + "subps %%xmm1, %%xmm9\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm11\n" + "addps %%xmm3, %%xmm10\n" + "subps %%xmm3, %%xmm11\n" + "movaps %%xmm8, %%xmm0\n" + "movaps %%xmm8, %%xmm2\n" + "addps %%xmm10, %%xmm0\n" + "subps %%xmm10, %%xmm2\n" + "movaps %%xmm9, %%xmm1\n" + "movaps %%xmm9, %%xmm3\n" + "addps %%xmm11, %%xmm1\n" + "subps %%xmm11, %%xmm3\n" + "movups %%xmm0, (%0)\n" + "movups %%xmm1, (%1)\n" + "movups %%xmm2, (%2)\n" + "movups %%xmm3, (%3)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 1024), "r"(buf + j + k + 2048), "r"(buf + j + k + 3072) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } +} +void helper_float_12(float *buf); +void helper_float_12(float *buf) { + helper_float_12_recursive(buf, 12); +} +void helper_float_13_recursive(float *buf, int depth); +void helper_float_13_recursive(float *buf, int depth) { + if (depth == 11) { + for (int j = 0; j < 2048; j += 32) { + for (int k = 0; k < 4; k += 4) { + __asm__ volatile ( + "movups (%0), %%xmm0\n" + "movups (%1), %%xmm1\n" + "movups (%2), %%xmm2\n" + "movups (%3), %%xmm3\n" + "movups (%4), %%xmm4\n" + "movups (%5), %%xmm5\n" + "movups (%6), %%xmm6\n" + "movups (%7), %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm0, %%xmm0\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm0, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm0\n" + "movaps %%xmm1, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm1, %%xmm1\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm1, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm1\n" + "movaps %%xmm2, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm2, %%xmm2\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm2, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm2\n" + "movaps %%xmm3, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm3, %%xmm3\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm3, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm3\n" + "movaps %%xmm4, %%xmm8\n" + "shufps $160, 
%%xmm8, %%xmm8\n" + "shufps $245, %%xmm4, %%xmm4\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm4, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm4\n" + "movaps %%xmm5, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm5, %%xmm5\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm5, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm5\n" + "movaps %%xmm6, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm6, %%xmm6\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm6, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm6\n" + "movaps %%xmm7, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm7, %%xmm7\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm7, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm0, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm0, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm0\n" + "movaps %%xmm1, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm1, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm1, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm1\n" + "movaps %%xmm2, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm2, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm2, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm2\n" + "movaps %%xmm3, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm3, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm3, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm3\n" + "movaps %%xmm4, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm4, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm4, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm4\n" + "movaps %%xmm5, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm5, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm5, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm5\n" + "movaps %%xmm6, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm6, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm6, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm6\n" + "movaps %%xmm7, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm7, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm7, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm9\n" + "addps %%xmm1, %%xmm8\n" + "subps %%xmm1, %%xmm9\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm11\n" + "addps %%xmm3, %%xmm10\n" + "subps %%xmm3, %%xmm11\n" + "movaps %%xmm4, %%xmm12\n" + "movaps %%xmm4, %%xmm13\n" + "addps %%xmm5, %%xmm12\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm6, %%xmm14\n" 
+ "movaps %%xmm6, %%xmm15\n" + "addps %%xmm7, %%xmm14\n" + "subps %%xmm7, %%xmm15\n" + "movaps %%xmm8, %%xmm0\n" + "movaps %%xmm8, %%xmm2\n" + "addps %%xmm10, %%xmm0\n" + "subps %%xmm10, %%xmm2\n" + "movaps %%xmm9, %%xmm1\n" + "movaps %%xmm9, %%xmm3\n" + "addps %%xmm11, %%xmm1\n" + "subps %%xmm11, %%xmm3\n" + "movaps %%xmm12, %%xmm4\n" + "movaps %%xmm12, %%xmm6\n" + "addps %%xmm14, %%xmm4\n" + "subps %%xmm14, %%xmm6\n" + "movaps %%xmm13, %%xmm5\n" + "movaps %%xmm13, %%xmm7\n" + "addps %%xmm15, %%xmm5\n" + "subps %%xmm15, %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm12\n" + "addps %%xmm4, %%xmm8\n" + "subps %%xmm4, %%xmm12\n" + "movaps %%xmm1, %%xmm9\n" + "movaps %%xmm1, %%xmm13\n" + "addps %%xmm5, %%xmm9\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm14\n" + "addps %%xmm6, %%xmm10\n" + "subps %%xmm6, %%xmm14\n" + "movaps %%xmm3, %%xmm11\n" + "movaps %%xmm3, %%xmm15\n" + "addps %%xmm7, %%xmm11\n" + "subps %%xmm7, %%xmm15\n" + "movups %%xmm8, (%0)\n" + "movups %%xmm9, (%1)\n" + "movups %%xmm10, (%2)\n" + "movups %%xmm11, (%3)\n" + "movups %%xmm12, (%4)\n" + "movups %%xmm13, (%5)\n" + "movups %%xmm14, (%6)\n" + "movups %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + for (int j = 0; j < 2048; j += 256) { + for (int k = 0; k < 32; k += 4) { + __asm__ volatile ( + "movups (%0), %%xmm0\n" + "movups (%1), %%xmm1\n" + "movups (%2), %%xmm2\n" + "movups (%3), %%xmm3\n" + "movups (%4), %%xmm4\n" + "movups (%5), %%xmm5\n" + "movups (%6), %%xmm6\n" + "movups (%7), %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm9\n" + "addps %%xmm1, %%xmm8\n" + "subps %%xmm1, %%xmm9\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm11\n" + "addps %%xmm3, %%xmm10\n" + "subps %%xmm3, %%xmm11\n" + "movaps %%xmm4, %%xmm12\n" + "movaps %%xmm4, %%xmm13\n" + "addps %%xmm5, %%xmm12\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm6, %%xmm14\n" + "movaps %%xmm6, %%xmm15\n" + "addps %%xmm7, %%xmm14\n" + "subps %%xmm7, %%xmm15\n" + "movaps %%xmm8, %%xmm0\n" + "movaps %%xmm8, %%xmm2\n" + "addps %%xmm10, %%xmm0\n" + "subps %%xmm10, %%xmm2\n" + "movaps %%xmm9, %%xmm1\n" + "movaps %%xmm9, %%xmm3\n" + "addps %%xmm11, %%xmm1\n" + "subps %%xmm11, %%xmm3\n" + "movaps %%xmm12, %%xmm4\n" + "movaps %%xmm12, %%xmm6\n" + "addps %%xmm14, %%xmm4\n" + "subps %%xmm14, %%xmm6\n" + "movaps %%xmm13, %%xmm5\n" + "movaps %%xmm13, %%xmm7\n" + "addps %%xmm15, %%xmm5\n" + "subps %%xmm15, %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm12\n" + "addps %%xmm4, %%xmm8\n" + "subps %%xmm4, %%xmm12\n" + "movaps %%xmm1, %%xmm9\n" + "movaps %%xmm1, %%xmm13\n" + "addps %%xmm5, %%xmm9\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm14\n" + "addps %%xmm6, %%xmm10\n" + "subps %%xmm6, %%xmm14\n" + "movaps %%xmm3, %%xmm11\n" + "movaps %%xmm3, %%xmm15\n" + "addps %%xmm7, %%xmm11\n" + "subps %%xmm7, %%xmm15\n" + "movups %%xmm8, (%0)\n" + "movups %%xmm9, (%1)\n" + "movups %%xmm10, (%2)\n" + "movups %%xmm11, (%3)\n" + "movups %%xmm12, (%4)\n" + "movups %%xmm13, (%5)\n" + "movups %%xmm14, (%6)\n" + "movups %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96), "r"(buf + j + k + 
128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + for (int j = 0; j < 2048; j += 2048) { + for (int k = 0; k < 256; k += 4) { + __asm__ volatile ( + "movups (%0), %%xmm0\n" + "movups (%1), %%xmm1\n" + "movups (%2), %%xmm2\n" + "movups (%3), %%xmm3\n" + "movups (%4), %%xmm4\n" + "movups (%5), %%xmm5\n" + "movups (%6), %%xmm6\n" + "movups (%7), %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm9\n" + "addps %%xmm1, %%xmm8\n" + "subps %%xmm1, %%xmm9\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm11\n" + "addps %%xmm3, %%xmm10\n" + "subps %%xmm3, %%xmm11\n" + "movaps %%xmm4, %%xmm12\n" + "movaps %%xmm4, %%xmm13\n" + "addps %%xmm5, %%xmm12\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm6, %%xmm14\n" + "movaps %%xmm6, %%xmm15\n" + "addps %%xmm7, %%xmm14\n" + "subps %%xmm7, %%xmm15\n" + "movaps %%xmm8, %%xmm0\n" + "movaps %%xmm8, %%xmm2\n" + "addps %%xmm10, %%xmm0\n" + "subps %%xmm10, %%xmm2\n" + "movaps %%xmm9, %%xmm1\n" + "movaps %%xmm9, %%xmm3\n" + "addps %%xmm11, %%xmm1\n" + "subps %%xmm11, %%xmm3\n" + "movaps %%xmm12, %%xmm4\n" + "movaps %%xmm12, %%xmm6\n" + "addps %%xmm14, %%xmm4\n" + "subps %%xmm14, %%xmm6\n" + "movaps %%xmm13, %%xmm5\n" + "movaps %%xmm13, %%xmm7\n" + "addps %%xmm15, %%xmm5\n" + "subps %%xmm15, %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm12\n" + "addps %%xmm4, %%xmm8\n" + "subps %%xmm4, %%xmm12\n" + "movaps %%xmm1, %%xmm9\n" + "movaps %%xmm1, %%xmm13\n" + "addps %%xmm5, %%xmm9\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm14\n" + "addps %%xmm6, %%xmm10\n" + "subps %%xmm6, %%xmm14\n" + "movaps %%xmm3, %%xmm11\n" + "movaps %%xmm3, %%xmm15\n" + "addps %%xmm7, %%xmm11\n" + "subps %%xmm7, %%xmm15\n" + "movups %%xmm8, (%0)\n" + "movups %%xmm9, (%1)\n" + "movups %%xmm10, (%2)\n" + "movups %%xmm11, (%3)\n" + "movups %%xmm12, (%4)\n" + "movups %%xmm13, (%5)\n" + "movups %%xmm14, (%6)\n" + "movups %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 256), "r"(buf + j + k + 512), "r"(buf + j + k + 768), "r"(buf + j + k + 1024), "r"(buf + j + k + 1280), "r"(buf + j + k + 1536), "r"(buf + j + k + 1792) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } + if (depth == 13) { + helper_float_13_recursive(buf + 0, 11); + helper_float_13_recursive(buf + 2048, 11); + helper_float_13_recursive(buf + 4096, 11); + helper_float_13_recursive(buf + 6144, 11); + for (int j = 0; j < 8192; j += 8192) { + for (int k = 0; k < 2048; k += 4) { + __asm__ volatile ( + "movups (%0), %%xmm0\n" + "movups (%1), %%xmm1\n" + "movups (%2), %%xmm2\n" + "movups (%3), %%xmm3\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm9\n" + "addps %%xmm1, %%xmm8\n" + "subps %%xmm1, %%xmm9\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm11\n" + "addps %%xmm3, %%xmm10\n" + "subps %%xmm3, %%xmm11\n" + "movaps %%xmm8, %%xmm0\n" + "movaps %%xmm8, %%xmm2\n" + "addps %%xmm10, %%xmm0\n" + "subps %%xmm10, %%xmm2\n" + "movaps %%xmm9, %%xmm1\n" + "movaps %%xmm9, %%xmm3\n" + "addps %%xmm11, %%xmm1\n" + "subps %%xmm11, %%xmm3\n" + "movups %%xmm0, (%0)\n" + "movups %%xmm1, (%1)\n" + "movups %%xmm2, (%2)\n" + "movups %%xmm3, (%3)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 2048), "r"(buf + j + k + 4096), "r"(buf + 
j + k + 6144) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } +} +void helper_float_13(float *buf); +void helper_float_13(float *buf) { + helper_float_13_recursive(buf, 13); +} +void helper_float_14_recursive(float *buf, int depth); +void helper_float_14_recursive(float *buf, int depth) { + if (depth == 11) { + for (int j = 0; j < 2048; j += 32) { + for (int k = 0; k < 4; k += 4) { + __asm__ volatile ( + "movups (%0), %%xmm0\n" + "movups (%1), %%xmm1\n" + "movups (%2), %%xmm2\n" + "movups (%3), %%xmm3\n" + "movups (%4), %%xmm4\n" + "movups (%5), %%xmm5\n" + "movups (%6), %%xmm6\n" + "movups (%7), %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm0, %%xmm0\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm0, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm0\n" + "movaps %%xmm1, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm1, %%xmm1\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm1, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm1\n" + "movaps %%xmm2, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm2, %%xmm2\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm2, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm2\n" + "movaps %%xmm3, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm3, %%xmm3\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm3, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm3\n" + "movaps %%xmm4, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm4, %%xmm4\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm4, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm4\n" + "movaps %%xmm5, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm5, %%xmm5\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm5, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm5\n" + "movaps %%xmm6, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm6, %%xmm6\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm6, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm6\n" + "movaps %%xmm7, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm7, %%xmm7\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm7, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm0, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm0, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm0\n" + "movaps %%xmm1, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm1, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm1, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm1\n" + "movaps %%xmm2, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm2, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm2, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm2\n" + "movaps %%xmm3, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm3, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm3, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, 
%%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm3\n" + "movaps %%xmm4, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm4, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm4, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm4\n" + "movaps %%xmm5, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm5, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm5, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm5\n" + "movaps %%xmm6, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm6, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm6, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm6\n" + "movaps %%xmm7, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm7, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm7, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm9\n" + "addps %%xmm1, %%xmm8\n" + "subps %%xmm1, %%xmm9\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm11\n" + "addps %%xmm3, %%xmm10\n" + "subps %%xmm3, %%xmm11\n" + "movaps %%xmm4, %%xmm12\n" + "movaps %%xmm4, %%xmm13\n" + "addps %%xmm5, %%xmm12\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm6, %%xmm14\n" + "movaps %%xmm6, %%xmm15\n" + "addps %%xmm7, %%xmm14\n" + "subps %%xmm7, %%xmm15\n" + "movaps %%xmm8, %%xmm0\n" + "movaps %%xmm8, %%xmm2\n" + "addps %%xmm10, %%xmm0\n" + "subps %%xmm10, %%xmm2\n" + "movaps %%xmm9, %%xmm1\n" + "movaps %%xmm9, %%xmm3\n" + "addps %%xmm11, %%xmm1\n" + "subps %%xmm11, %%xmm3\n" + "movaps %%xmm12, %%xmm4\n" + "movaps %%xmm12, %%xmm6\n" + "addps %%xmm14, %%xmm4\n" + "subps %%xmm14, %%xmm6\n" + "movaps %%xmm13, %%xmm5\n" + "movaps %%xmm13, %%xmm7\n" + "addps %%xmm15, %%xmm5\n" + "subps %%xmm15, %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm12\n" + "addps %%xmm4, %%xmm8\n" + "subps %%xmm4, %%xmm12\n" + "movaps %%xmm1, %%xmm9\n" + "movaps %%xmm1, %%xmm13\n" + "addps %%xmm5, %%xmm9\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm14\n" + "addps %%xmm6, %%xmm10\n" + "subps %%xmm6, %%xmm14\n" + "movaps %%xmm3, %%xmm11\n" + "movaps %%xmm3, %%xmm15\n" + "addps %%xmm7, %%xmm11\n" + "subps %%xmm7, %%xmm15\n" + "movups %%xmm8, (%0)\n" + "movups %%xmm9, (%1)\n" + "movups %%xmm10, (%2)\n" + "movups %%xmm11, (%3)\n" + "movups %%xmm12, (%4)\n" + "movups %%xmm13, (%5)\n" + "movups %%xmm14, (%6)\n" + "movups %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + for (int j = 0; j < 2048; j += 256) { + for (int k = 0; k < 32; k += 4) { + __asm__ volatile ( + "movups (%0), %%xmm0\n" + "movups (%1), %%xmm1\n" + "movups (%2), %%xmm2\n" + "movups (%3), %%xmm3\n" + "movups (%4), %%xmm4\n" + "movups (%5), %%xmm5\n" + "movups (%6), %%xmm6\n" + "movups (%7), %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, 
%%xmm9\n" + "addps %%xmm1, %%xmm8\n" + "subps %%xmm1, %%xmm9\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm11\n" + "addps %%xmm3, %%xmm10\n" + "subps %%xmm3, %%xmm11\n" + "movaps %%xmm4, %%xmm12\n" + "movaps %%xmm4, %%xmm13\n" + "addps %%xmm5, %%xmm12\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm6, %%xmm14\n" + "movaps %%xmm6, %%xmm15\n" + "addps %%xmm7, %%xmm14\n" + "subps %%xmm7, %%xmm15\n" + "movaps %%xmm8, %%xmm0\n" + "movaps %%xmm8, %%xmm2\n" + "addps %%xmm10, %%xmm0\n" + "subps %%xmm10, %%xmm2\n" + "movaps %%xmm9, %%xmm1\n" + "movaps %%xmm9, %%xmm3\n" + "addps %%xmm11, %%xmm1\n" + "subps %%xmm11, %%xmm3\n" + "movaps %%xmm12, %%xmm4\n" + "movaps %%xmm12, %%xmm6\n" + "addps %%xmm14, %%xmm4\n" + "subps %%xmm14, %%xmm6\n" + "movaps %%xmm13, %%xmm5\n" + "movaps %%xmm13, %%xmm7\n" + "addps %%xmm15, %%xmm5\n" + "subps %%xmm15, %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm12\n" + "addps %%xmm4, %%xmm8\n" + "subps %%xmm4, %%xmm12\n" + "movaps %%xmm1, %%xmm9\n" + "movaps %%xmm1, %%xmm13\n" + "addps %%xmm5, %%xmm9\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm14\n" + "addps %%xmm6, %%xmm10\n" + "subps %%xmm6, %%xmm14\n" + "movaps %%xmm3, %%xmm11\n" + "movaps %%xmm3, %%xmm15\n" + "addps %%xmm7, %%xmm11\n" + "subps %%xmm7, %%xmm15\n" + "movups %%xmm8, (%0)\n" + "movups %%xmm9, (%1)\n" + "movups %%xmm10, (%2)\n" + "movups %%xmm11, (%3)\n" + "movups %%xmm12, (%4)\n" + "movups %%xmm13, (%5)\n" + "movups %%xmm14, (%6)\n" + "movups %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96), "r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + for (int j = 0; j < 2048; j += 2048) { + for (int k = 0; k < 256; k += 4) { + __asm__ volatile ( + "movups (%0), %%xmm0\n" + "movups (%1), %%xmm1\n" + "movups (%2), %%xmm2\n" + "movups (%3), %%xmm3\n" + "movups (%4), %%xmm4\n" + "movups (%5), %%xmm5\n" + "movups (%6), %%xmm6\n" + "movups (%7), %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm9\n" + "addps %%xmm1, %%xmm8\n" + "subps %%xmm1, %%xmm9\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm11\n" + "addps %%xmm3, %%xmm10\n" + "subps %%xmm3, %%xmm11\n" + "movaps %%xmm4, %%xmm12\n" + "movaps %%xmm4, %%xmm13\n" + "addps %%xmm5, %%xmm12\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm6, %%xmm14\n" + "movaps %%xmm6, %%xmm15\n" + "addps %%xmm7, %%xmm14\n" + "subps %%xmm7, %%xmm15\n" + "movaps %%xmm8, %%xmm0\n" + "movaps %%xmm8, %%xmm2\n" + "addps %%xmm10, %%xmm0\n" + "subps %%xmm10, %%xmm2\n" + "movaps %%xmm9, %%xmm1\n" + "movaps %%xmm9, %%xmm3\n" + "addps %%xmm11, %%xmm1\n" + "subps %%xmm11, %%xmm3\n" + "movaps %%xmm12, %%xmm4\n" + "movaps %%xmm12, %%xmm6\n" + "addps %%xmm14, %%xmm4\n" + "subps %%xmm14, %%xmm6\n" + "movaps %%xmm13, %%xmm5\n" + "movaps %%xmm13, %%xmm7\n" + "addps %%xmm15, %%xmm5\n" + "subps %%xmm15, %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm12\n" + "addps %%xmm4, %%xmm8\n" + "subps %%xmm4, %%xmm12\n" + "movaps %%xmm1, %%xmm9\n" + "movaps %%xmm1, %%xmm13\n" + "addps %%xmm5, %%xmm9\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm14\n" + "addps %%xmm6, %%xmm10\n" + "subps %%xmm6, %%xmm14\n" + "movaps %%xmm3, %%xmm11\n" + "movaps %%xmm3, %%xmm15\n" + "addps %%xmm7, %%xmm11\n" + "subps %%xmm7, %%xmm15\n" + 
"movups %%xmm8, (%0)\n" + "movups %%xmm9, (%1)\n" + "movups %%xmm10, (%2)\n" + "movups %%xmm11, (%3)\n" + "movups %%xmm12, (%4)\n" + "movups %%xmm13, (%5)\n" + "movups %%xmm14, (%6)\n" + "movups %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 256), "r"(buf + j + k + 512), "r"(buf + j + k + 768), "r"(buf + j + k + 1024), "r"(buf + j + k + 1280), "r"(buf + j + k + 1536), "r"(buf + j + k + 1792) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } + if (depth == 14) { + helper_float_14_recursive(buf + 0, 11); + helper_float_14_recursive(buf + 2048, 11); + helper_float_14_recursive(buf + 4096, 11); + helper_float_14_recursive(buf + 6144, 11); + helper_float_14_recursive(buf + 8192, 11); + helper_float_14_recursive(buf + 10240, 11); + helper_float_14_recursive(buf + 12288, 11); + helper_float_14_recursive(buf + 14336, 11); + for (int j = 0; j < 16384; j += 16384) { + for (int k = 0; k < 2048; k += 4) { + __asm__ volatile ( + "movups (%0), %%xmm0\n" + "movups (%1), %%xmm1\n" + "movups (%2), %%xmm2\n" + "movups (%3), %%xmm3\n" + "movups (%4), %%xmm4\n" + "movups (%5), %%xmm5\n" + "movups (%6), %%xmm6\n" + "movups (%7), %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm9\n" + "addps %%xmm1, %%xmm8\n" + "subps %%xmm1, %%xmm9\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm11\n" + "addps %%xmm3, %%xmm10\n" + "subps %%xmm3, %%xmm11\n" + "movaps %%xmm4, %%xmm12\n" + "movaps %%xmm4, %%xmm13\n" + "addps %%xmm5, %%xmm12\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm6, %%xmm14\n" + "movaps %%xmm6, %%xmm15\n" + "addps %%xmm7, %%xmm14\n" + "subps %%xmm7, %%xmm15\n" + "movaps %%xmm8, %%xmm0\n" + "movaps %%xmm8, %%xmm2\n" + "addps %%xmm10, %%xmm0\n" + "subps %%xmm10, %%xmm2\n" + "movaps %%xmm9, %%xmm1\n" + "movaps %%xmm9, %%xmm3\n" + "addps %%xmm11, %%xmm1\n" + "subps %%xmm11, %%xmm3\n" + "movaps %%xmm12, %%xmm4\n" + "movaps %%xmm12, %%xmm6\n" + "addps %%xmm14, %%xmm4\n" + "subps %%xmm14, %%xmm6\n" + "movaps %%xmm13, %%xmm5\n" + "movaps %%xmm13, %%xmm7\n" + "addps %%xmm15, %%xmm5\n" + "subps %%xmm15, %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm12\n" + "addps %%xmm4, %%xmm8\n" + "subps %%xmm4, %%xmm12\n" + "movaps %%xmm1, %%xmm9\n" + "movaps %%xmm1, %%xmm13\n" + "addps %%xmm5, %%xmm9\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm14\n" + "addps %%xmm6, %%xmm10\n" + "subps %%xmm6, %%xmm14\n" + "movaps %%xmm3, %%xmm11\n" + "movaps %%xmm3, %%xmm15\n" + "addps %%xmm7, %%xmm11\n" + "subps %%xmm7, %%xmm15\n" + "movups %%xmm8, (%0)\n" + "movups %%xmm9, (%1)\n" + "movups %%xmm10, (%2)\n" + "movups %%xmm11, (%3)\n" + "movups %%xmm12, (%4)\n" + "movups %%xmm13, (%5)\n" + "movups %%xmm14, (%6)\n" + "movups %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 2048), "r"(buf + j + k + 4096), "r"(buf + j + k + 6144), "r"(buf + j + k + 8192), "r"(buf + j + k + 10240), "r"(buf + j + k + 12288), "r"(buf + j + k + 14336) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } +} +void helper_float_14(float *buf); +void helper_float_14(float *buf) { + helper_float_14_recursive(buf, 14); +} +void helper_float_15_recursive(float *buf, int depth); +void helper_float_15_recursive(float *buf, int depth) { + if (depth == 13) { + for (int j = 0; j < 8192; j += 32) { + for (int k = 
0; k < 4; k += 4) { + __asm__ volatile ( + "movups (%0), %%xmm0\n" + "movups (%1), %%xmm1\n" + "movups (%2), %%xmm2\n" + "movups (%3), %%xmm3\n" + "movups (%4), %%xmm4\n" + "movups (%5), %%xmm5\n" + "movups (%6), %%xmm6\n" + "movups (%7), %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm0, %%xmm0\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm0, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm0\n" + "movaps %%xmm1, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm1, %%xmm1\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm1, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm1\n" + "movaps %%xmm2, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm2, %%xmm2\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm2, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm2\n" + "movaps %%xmm3, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm3, %%xmm3\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm3, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm3\n" + "movaps %%xmm4, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm4, %%xmm4\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm4, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm4\n" + "movaps %%xmm5, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm5, %%xmm5\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm5, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm5\n" + "movaps %%xmm6, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm6, %%xmm6\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm6, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm6\n" + "movaps %%xmm7, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm7, %%xmm7\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm7, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm0, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm0, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm0\n" + "movaps %%xmm1, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm1, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm1, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm1\n" + "movaps %%xmm2, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm2, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm2, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm2\n" + "movaps %%xmm3, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm3, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm3, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm3\n" + "movaps %%xmm4, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm4, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm4, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm4\n" + "movaps %%xmm5, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm5, %%xmm10\n" + "shufps $14, %%xmm9, 
%%xmm10\n" + "movaps %%xmm5, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm5\n" + "movaps %%xmm6, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm6, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm6, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm6\n" + "movaps %%xmm7, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm7, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm7, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm9\n" + "addps %%xmm1, %%xmm8\n" + "subps %%xmm1, %%xmm9\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm11\n" + "addps %%xmm3, %%xmm10\n" + "subps %%xmm3, %%xmm11\n" + "movaps %%xmm4, %%xmm12\n" + "movaps %%xmm4, %%xmm13\n" + "addps %%xmm5, %%xmm12\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm6, %%xmm14\n" + "movaps %%xmm6, %%xmm15\n" + "addps %%xmm7, %%xmm14\n" + "subps %%xmm7, %%xmm15\n" + "movaps %%xmm8, %%xmm0\n" + "movaps %%xmm8, %%xmm2\n" + "addps %%xmm10, %%xmm0\n" + "subps %%xmm10, %%xmm2\n" + "movaps %%xmm9, %%xmm1\n" + "movaps %%xmm9, %%xmm3\n" + "addps %%xmm11, %%xmm1\n" + "subps %%xmm11, %%xmm3\n" + "movaps %%xmm12, %%xmm4\n" + "movaps %%xmm12, %%xmm6\n" + "addps %%xmm14, %%xmm4\n" + "subps %%xmm14, %%xmm6\n" + "movaps %%xmm13, %%xmm5\n" + "movaps %%xmm13, %%xmm7\n" + "addps %%xmm15, %%xmm5\n" + "subps %%xmm15, %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm12\n" + "addps %%xmm4, %%xmm8\n" + "subps %%xmm4, %%xmm12\n" + "movaps %%xmm1, %%xmm9\n" + "movaps %%xmm1, %%xmm13\n" + "addps %%xmm5, %%xmm9\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm14\n" + "addps %%xmm6, %%xmm10\n" + "subps %%xmm6, %%xmm14\n" + "movaps %%xmm3, %%xmm11\n" + "movaps %%xmm3, %%xmm15\n" + "addps %%xmm7, %%xmm11\n" + "subps %%xmm7, %%xmm15\n" + "movups %%xmm8, (%0)\n" + "movups %%xmm9, (%1)\n" + "movups %%xmm10, (%2)\n" + "movups %%xmm11, (%3)\n" + "movups %%xmm12, (%4)\n" + "movups %%xmm13, (%5)\n" + "movups %%xmm14, (%6)\n" + "movups %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + for (int j = 0; j < 8192; j += 256) { + for (int k = 0; k < 32; k += 4) { + __asm__ volatile ( + "movups (%0), %%xmm0\n" + "movups (%1), %%xmm1\n" + "movups (%2), %%xmm2\n" + "movups (%3), %%xmm3\n" + "movups (%4), %%xmm4\n" + "movups (%5), %%xmm5\n" + "movups (%6), %%xmm6\n" + "movups (%7), %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm9\n" + "addps %%xmm1, %%xmm8\n" + "subps %%xmm1, %%xmm9\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm11\n" + "addps %%xmm3, %%xmm10\n" + "subps %%xmm3, %%xmm11\n" + "movaps %%xmm4, %%xmm12\n" + "movaps %%xmm4, %%xmm13\n" + "addps %%xmm5, %%xmm12\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm6, %%xmm14\n" + "movaps %%xmm6, %%xmm15\n" + "addps %%xmm7, %%xmm14\n" + "subps %%xmm7, %%xmm15\n" + "movaps %%xmm8, %%xmm0\n" + "movaps %%xmm8, %%xmm2\n" + "addps %%xmm10, %%xmm0\n" + "subps %%xmm10, 
%%xmm2\n" + "movaps %%xmm9, %%xmm1\n" + "movaps %%xmm9, %%xmm3\n" + "addps %%xmm11, %%xmm1\n" + "subps %%xmm11, %%xmm3\n" + "movaps %%xmm12, %%xmm4\n" + "movaps %%xmm12, %%xmm6\n" + "addps %%xmm14, %%xmm4\n" + "subps %%xmm14, %%xmm6\n" + "movaps %%xmm13, %%xmm5\n" + "movaps %%xmm13, %%xmm7\n" + "addps %%xmm15, %%xmm5\n" + "subps %%xmm15, %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm12\n" + "addps %%xmm4, %%xmm8\n" + "subps %%xmm4, %%xmm12\n" + "movaps %%xmm1, %%xmm9\n" + "movaps %%xmm1, %%xmm13\n" + "addps %%xmm5, %%xmm9\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm14\n" + "addps %%xmm6, %%xmm10\n" + "subps %%xmm6, %%xmm14\n" + "movaps %%xmm3, %%xmm11\n" + "movaps %%xmm3, %%xmm15\n" + "addps %%xmm7, %%xmm11\n" + "subps %%xmm7, %%xmm15\n" + "movups %%xmm8, (%0)\n" + "movups %%xmm9, (%1)\n" + "movups %%xmm10, (%2)\n" + "movups %%xmm11, (%3)\n" + "movups %%xmm12, (%4)\n" + "movups %%xmm13, (%5)\n" + "movups %%xmm14, (%6)\n" + "movups %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96), "r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + for (int j = 0; j < 8192; j += 2048) { + for (int k = 0; k < 256; k += 4) { + __asm__ volatile ( + "movups (%0), %%xmm0\n" + "movups (%1), %%xmm1\n" + "movups (%2), %%xmm2\n" + "movups (%3), %%xmm3\n" + "movups (%4), %%xmm4\n" + "movups (%5), %%xmm5\n" + "movups (%6), %%xmm6\n" + "movups (%7), %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm9\n" + "addps %%xmm1, %%xmm8\n" + "subps %%xmm1, %%xmm9\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm11\n" + "addps %%xmm3, %%xmm10\n" + "subps %%xmm3, %%xmm11\n" + "movaps %%xmm4, %%xmm12\n" + "movaps %%xmm4, %%xmm13\n" + "addps %%xmm5, %%xmm12\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm6, %%xmm14\n" + "movaps %%xmm6, %%xmm15\n" + "addps %%xmm7, %%xmm14\n" + "subps %%xmm7, %%xmm15\n" + "movaps %%xmm8, %%xmm0\n" + "movaps %%xmm8, %%xmm2\n" + "addps %%xmm10, %%xmm0\n" + "subps %%xmm10, %%xmm2\n" + "movaps %%xmm9, %%xmm1\n" + "movaps %%xmm9, %%xmm3\n" + "addps %%xmm11, %%xmm1\n" + "subps %%xmm11, %%xmm3\n" + "movaps %%xmm12, %%xmm4\n" + "movaps %%xmm12, %%xmm6\n" + "addps %%xmm14, %%xmm4\n" + "subps %%xmm14, %%xmm6\n" + "movaps %%xmm13, %%xmm5\n" + "movaps %%xmm13, %%xmm7\n" + "addps %%xmm15, %%xmm5\n" + "subps %%xmm15, %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm12\n" + "addps %%xmm4, %%xmm8\n" + "subps %%xmm4, %%xmm12\n" + "movaps %%xmm1, %%xmm9\n" + "movaps %%xmm1, %%xmm13\n" + "addps %%xmm5, %%xmm9\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm14\n" + "addps %%xmm6, %%xmm10\n" + "subps %%xmm6, %%xmm14\n" + "movaps %%xmm3, %%xmm11\n" + "movaps %%xmm3, %%xmm15\n" + "addps %%xmm7, %%xmm11\n" + "subps %%xmm7, %%xmm15\n" + "movups %%xmm8, (%0)\n" + "movups %%xmm9, (%1)\n" + "movups %%xmm10, (%2)\n" + "movups %%xmm11, (%3)\n" + "movups %%xmm12, (%4)\n" + "movups %%xmm13, (%5)\n" + "movups %%xmm14, (%6)\n" + "movups %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 256), "r"(buf + j + k + 512), "r"(buf + j + k + 768), "r"(buf + j + k + 1024), "r"(buf + j + k + 1280), "r"(buf + j + k + 1536), "r"(buf + j + k + 1792) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", 
"%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + for (int j = 0; j < 8192; j += 8192) { + for (int k = 0; k < 2048; k += 4) { + __asm__ volatile ( + "movups (%0), %%xmm0\n" + "movups (%1), %%xmm1\n" + "movups (%2), %%xmm2\n" + "movups (%3), %%xmm3\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm9\n" + "addps %%xmm1, %%xmm8\n" + "subps %%xmm1, %%xmm9\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm11\n" + "addps %%xmm3, %%xmm10\n" + "subps %%xmm3, %%xmm11\n" + "movaps %%xmm8, %%xmm0\n" + "movaps %%xmm8, %%xmm2\n" + "addps %%xmm10, %%xmm0\n" + "subps %%xmm10, %%xmm2\n" + "movaps %%xmm9, %%xmm1\n" + "movaps %%xmm9, %%xmm3\n" + "addps %%xmm11, %%xmm1\n" + "subps %%xmm11, %%xmm3\n" + "movups %%xmm0, (%0)\n" + "movups %%xmm1, (%1)\n" + "movups %%xmm2, (%2)\n" + "movups %%xmm3, (%3)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 2048), "r"(buf + j + k + 4096), "r"(buf + j + k + 6144) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } + if (depth == 15) { + helper_float_15_recursive(buf + 0, 13); + helper_float_15_recursive(buf + 8192, 13); + helper_float_15_recursive(buf + 16384, 13); + helper_float_15_recursive(buf + 24576, 13); + for (int j = 0; j < 32768; j += 32768) { + for (int k = 0; k < 8192; k += 4) { + __asm__ volatile ( + "movups (%0), %%xmm0\n" + "movups (%1), %%xmm1\n" + "movups (%2), %%xmm2\n" + "movups (%3), %%xmm3\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm9\n" + "addps %%xmm1, %%xmm8\n" + "subps %%xmm1, %%xmm9\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm11\n" + "addps %%xmm3, %%xmm10\n" + "subps %%xmm3, %%xmm11\n" + "movaps %%xmm8, %%xmm0\n" + "movaps %%xmm8, %%xmm2\n" + "addps %%xmm10, %%xmm0\n" + "subps %%xmm10, %%xmm2\n" + "movaps %%xmm9, %%xmm1\n" + "movaps %%xmm9, %%xmm3\n" + "addps %%xmm11, %%xmm1\n" + "subps %%xmm11, %%xmm3\n" + "movups %%xmm0, (%0)\n" + "movups %%xmm1, (%1)\n" + "movups %%xmm2, (%2)\n" + "movups %%xmm3, (%3)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 8192), "r"(buf + j + k + 16384), "r"(buf + j + k + 24576) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } +} +void helper_float_15(float *buf); +void helper_float_15(float *buf) { + helper_float_15_recursive(buf, 15); +} +void helper_float_16_recursive(float *buf, int depth); +void helper_float_16_recursive(float *buf, int depth) { + if (depth == 11) { + for (int j = 0; j < 2048; j += 32) { + for (int k = 0; k < 4; k += 4) { + __asm__ volatile ( + "movups (%0), %%xmm0\n" + "movups (%1), %%xmm1\n" + "movups (%2), %%xmm2\n" + "movups (%3), %%xmm3\n" + "movups (%4), %%xmm4\n" + "movups (%5), %%xmm5\n" + "movups (%6), %%xmm6\n" + "movups (%7), %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm0, %%xmm0\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm0, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm0\n" + "movaps %%xmm1, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm1, %%xmm1\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm1, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm1\n" + "movaps %%xmm2, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm2, %%xmm2\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm2, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm2\n" + 
"movaps %%xmm3, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm3, %%xmm3\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm3, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm3\n" + "movaps %%xmm4, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm4, %%xmm4\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm4, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm4\n" + "movaps %%xmm5, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm5, %%xmm5\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm5, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm5\n" + "movaps %%xmm6, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm6, %%xmm6\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm6, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm6\n" + "movaps %%xmm7, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm7, %%xmm7\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm7, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm0, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm0, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm0\n" + "movaps %%xmm1, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm1, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm1, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm1\n" + "movaps %%xmm2, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm2, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm2, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm2\n" + "movaps %%xmm3, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm3, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm3, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm3\n" + "movaps %%xmm4, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm4, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm4, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm4\n" + "movaps %%xmm5, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm5, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm5, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm5\n" + "movaps %%xmm6, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm6, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm6, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm6\n" + "movaps %%xmm7, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm7, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm7, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm9\n" + "addps %%xmm1, %%xmm8\n" + "subps %%xmm1, %%xmm9\n" + 
"movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm11\n" + "addps %%xmm3, %%xmm10\n" + "subps %%xmm3, %%xmm11\n" + "movaps %%xmm4, %%xmm12\n" + "movaps %%xmm4, %%xmm13\n" + "addps %%xmm5, %%xmm12\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm6, %%xmm14\n" + "movaps %%xmm6, %%xmm15\n" + "addps %%xmm7, %%xmm14\n" + "subps %%xmm7, %%xmm15\n" + "movaps %%xmm8, %%xmm0\n" + "movaps %%xmm8, %%xmm2\n" + "addps %%xmm10, %%xmm0\n" + "subps %%xmm10, %%xmm2\n" + "movaps %%xmm9, %%xmm1\n" + "movaps %%xmm9, %%xmm3\n" + "addps %%xmm11, %%xmm1\n" + "subps %%xmm11, %%xmm3\n" + "movaps %%xmm12, %%xmm4\n" + "movaps %%xmm12, %%xmm6\n" + "addps %%xmm14, %%xmm4\n" + "subps %%xmm14, %%xmm6\n" + "movaps %%xmm13, %%xmm5\n" + "movaps %%xmm13, %%xmm7\n" + "addps %%xmm15, %%xmm5\n" + "subps %%xmm15, %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm12\n" + "addps %%xmm4, %%xmm8\n" + "subps %%xmm4, %%xmm12\n" + "movaps %%xmm1, %%xmm9\n" + "movaps %%xmm1, %%xmm13\n" + "addps %%xmm5, %%xmm9\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm14\n" + "addps %%xmm6, %%xmm10\n" + "subps %%xmm6, %%xmm14\n" + "movaps %%xmm3, %%xmm11\n" + "movaps %%xmm3, %%xmm15\n" + "addps %%xmm7, %%xmm11\n" + "subps %%xmm7, %%xmm15\n" + "movups %%xmm8, (%0)\n" + "movups %%xmm9, (%1)\n" + "movups %%xmm10, (%2)\n" + "movups %%xmm11, (%3)\n" + "movups %%xmm12, (%4)\n" + "movups %%xmm13, (%5)\n" + "movups %%xmm14, (%6)\n" + "movups %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + for (int j = 0; j < 2048; j += 256) { + for (int k = 0; k < 32; k += 4) { + __asm__ volatile ( + "movups (%0), %%xmm0\n" + "movups (%1), %%xmm1\n" + "movups (%2), %%xmm2\n" + "movups (%3), %%xmm3\n" + "movups (%4), %%xmm4\n" + "movups (%5), %%xmm5\n" + "movups (%6), %%xmm6\n" + "movups (%7), %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm9\n" + "addps %%xmm1, %%xmm8\n" + "subps %%xmm1, %%xmm9\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm11\n" + "addps %%xmm3, %%xmm10\n" + "subps %%xmm3, %%xmm11\n" + "movaps %%xmm4, %%xmm12\n" + "movaps %%xmm4, %%xmm13\n" + "addps %%xmm5, %%xmm12\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm6, %%xmm14\n" + "movaps %%xmm6, %%xmm15\n" + "addps %%xmm7, %%xmm14\n" + "subps %%xmm7, %%xmm15\n" + "movaps %%xmm8, %%xmm0\n" + "movaps %%xmm8, %%xmm2\n" + "addps %%xmm10, %%xmm0\n" + "subps %%xmm10, %%xmm2\n" + "movaps %%xmm9, %%xmm1\n" + "movaps %%xmm9, %%xmm3\n" + "addps %%xmm11, %%xmm1\n" + "subps %%xmm11, %%xmm3\n" + "movaps %%xmm12, %%xmm4\n" + "movaps %%xmm12, %%xmm6\n" + "addps %%xmm14, %%xmm4\n" + "subps %%xmm14, %%xmm6\n" + "movaps %%xmm13, %%xmm5\n" + "movaps %%xmm13, %%xmm7\n" + "addps %%xmm15, %%xmm5\n" + "subps %%xmm15, %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm12\n" + "addps %%xmm4, %%xmm8\n" + "subps %%xmm4, %%xmm12\n" + "movaps %%xmm1, %%xmm9\n" + "movaps %%xmm1, %%xmm13\n" + "addps %%xmm5, %%xmm9\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm14\n" + "addps %%xmm6, %%xmm10\n" + "subps %%xmm6, %%xmm14\n" + "movaps %%xmm3, %%xmm11\n" + "movaps %%xmm3, %%xmm15\n" + "addps %%xmm7, %%xmm11\n" + "subps %%xmm7, %%xmm15\n" + "movups %%xmm8, (%0)\n" + "movups %%xmm9, (%1)\n" + "movups %%xmm10, 
(%2)\n" + "movups %%xmm11, (%3)\n" + "movups %%xmm12, (%4)\n" + "movups %%xmm13, (%5)\n" + "movups %%xmm14, (%6)\n" + "movups %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96), "r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + for (int j = 0; j < 2048; j += 2048) { + for (int k = 0; k < 256; k += 4) { + __asm__ volatile ( + "movups (%0), %%xmm0\n" + "movups (%1), %%xmm1\n" + "movups (%2), %%xmm2\n" + "movups (%3), %%xmm3\n" + "movups (%4), %%xmm4\n" + "movups (%5), %%xmm5\n" + "movups (%6), %%xmm6\n" + "movups (%7), %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm9\n" + "addps %%xmm1, %%xmm8\n" + "subps %%xmm1, %%xmm9\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm11\n" + "addps %%xmm3, %%xmm10\n" + "subps %%xmm3, %%xmm11\n" + "movaps %%xmm4, %%xmm12\n" + "movaps %%xmm4, %%xmm13\n" + "addps %%xmm5, %%xmm12\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm6, %%xmm14\n" + "movaps %%xmm6, %%xmm15\n" + "addps %%xmm7, %%xmm14\n" + "subps %%xmm7, %%xmm15\n" + "movaps %%xmm8, %%xmm0\n" + "movaps %%xmm8, %%xmm2\n" + "addps %%xmm10, %%xmm0\n" + "subps %%xmm10, %%xmm2\n" + "movaps %%xmm9, %%xmm1\n" + "movaps %%xmm9, %%xmm3\n" + "addps %%xmm11, %%xmm1\n" + "subps %%xmm11, %%xmm3\n" + "movaps %%xmm12, %%xmm4\n" + "movaps %%xmm12, %%xmm6\n" + "addps %%xmm14, %%xmm4\n" + "subps %%xmm14, %%xmm6\n" + "movaps %%xmm13, %%xmm5\n" + "movaps %%xmm13, %%xmm7\n" + "addps %%xmm15, %%xmm5\n" + "subps %%xmm15, %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm12\n" + "addps %%xmm4, %%xmm8\n" + "subps %%xmm4, %%xmm12\n" + "movaps %%xmm1, %%xmm9\n" + "movaps %%xmm1, %%xmm13\n" + "addps %%xmm5, %%xmm9\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm14\n" + "addps %%xmm6, %%xmm10\n" + "subps %%xmm6, %%xmm14\n" + "movaps %%xmm3, %%xmm11\n" + "movaps %%xmm3, %%xmm15\n" + "addps %%xmm7, %%xmm11\n" + "subps %%xmm7, %%xmm15\n" + "movups %%xmm8, (%0)\n" + "movups %%xmm9, (%1)\n" + "movups %%xmm10, (%2)\n" + "movups %%xmm11, (%3)\n" + "movups %%xmm12, (%4)\n" + "movups %%xmm13, (%5)\n" + "movups %%xmm14, (%6)\n" + "movups %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 256), "r"(buf + j + k + 512), "r"(buf + j + k + 768), "r"(buf + j + k + 1024), "r"(buf + j + k + 1280), "r"(buf + j + k + 1536), "r"(buf + j + k + 1792) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } + if (depth == 14) { + helper_float_16_recursive(buf + 0, 11); + helper_float_16_recursive(buf + 2048, 11); + helper_float_16_recursive(buf + 4096, 11); + helper_float_16_recursive(buf + 6144, 11); + helper_float_16_recursive(buf + 8192, 11); + helper_float_16_recursive(buf + 10240, 11); + helper_float_16_recursive(buf + 12288, 11); + helper_float_16_recursive(buf + 14336, 11); + for (int j = 0; j < 16384; j += 16384) { + for (int k = 0; k < 2048; k += 4) { + __asm__ volatile ( + "movups (%0), %%xmm0\n" + "movups (%1), %%xmm1\n" + "movups (%2), %%xmm2\n" + "movups (%3), %%xmm3\n" + "movups (%4), %%xmm4\n" + "movups (%5), %%xmm5\n" + "movups (%6), %%xmm6\n" + "movups (%7), %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm9\n" + "addps %%xmm1, %%xmm8\n" + "subps 
%%xmm1, %%xmm9\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm11\n" + "addps %%xmm3, %%xmm10\n" + "subps %%xmm3, %%xmm11\n" + "movaps %%xmm4, %%xmm12\n" + "movaps %%xmm4, %%xmm13\n" + "addps %%xmm5, %%xmm12\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm6, %%xmm14\n" + "movaps %%xmm6, %%xmm15\n" + "addps %%xmm7, %%xmm14\n" + "subps %%xmm7, %%xmm15\n" + "movaps %%xmm8, %%xmm0\n" + "movaps %%xmm8, %%xmm2\n" + "addps %%xmm10, %%xmm0\n" + "subps %%xmm10, %%xmm2\n" + "movaps %%xmm9, %%xmm1\n" + "movaps %%xmm9, %%xmm3\n" + "addps %%xmm11, %%xmm1\n" + "subps %%xmm11, %%xmm3\n" + "movaps %%xmm12, %%xmm4\n" + "movaps %%xmm12, %%xmm6\n" + "addps %%xmm14, %%xmm4\n" + "subps %%xmm14, %%xmm6\n" + "movaps %%xmm13, %%xmm5\n" + "movaps %%xmm13, %%xmm7\n" + "addps %%xmm15, %%xmm5\n" + "subps %%xmm15, %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm12\n" + "addps %%xmm4, %%xmm8\n" + "subps %%xmm4, %%xmm12\n" + "movaps %%xmm1, %%xmm9\n" + "movaps %%xmm1, %%xmm13\n" + "addps %%xmm5, %%xmm9\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm14\n" + "addps %%xmm6, %%xmm10\n" + "subps %%xmm6, %%xmm14\n" + "movaps %%xmm3, %%xmm11\n" + "movaps %%xmm3, %%xmm15\n" + "addps %%xmm7, %%xmm11\n" + "subps %%xmm7, %%xmm15\n" + "movups %%xmm8, (%0)\n" + "movups %%xmm9, (%1)\n" + "movups %%xmm10, (%2)\n" + "movups %%xmm11, (%3)\n" + "movups %%xmm12, (%4)\n" + "movups %%xmm13, (%5)\n" + "movups %%xmm14, (%6)\n" + "movups %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 2048), "r"(buf + j + k + 4096), "r"(buf + j + k + 6144), "r"(buf + j + k + 8192), "r"(buf + j + k + 10240), "r"(buf + j + k + 12288), "r"(buf + j + k + 14336) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } + if (depth == 16) { + helper_float_16_recursive(buf + 0, 14); + helper_float_16_recursive(buf + 16384, 14); + helper_float_16_recursive(buf + 32768, 14); + helper_float_16_recursive(buf + 49152, 14); + for (int j = 0; j < 65536; j += 65536) { + for (int k = 0; k < 16384; k += 4) { + __asm__ volatile ( + "movups (%0), %%xmm0\n" + "movups (%1), %%xmm1\n" + "movups (%2), %%xmm2\n" + "movups (%3), %%xmm3\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm9\n" + "addps %%xmm1, %%xmm8\n" + "subps %%xmm1, %%xmm9\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm11\n" + "addps %%xmm3, %%xmm10\n" + "subps %%xmm3, %%xmm11\n" + "movaps %%xmm8, %%xmm0\n" + "movaps %%xmm8, %%xmm2\n" + "addps %%xmm10, %%xmm0\n" + "subps %%xmm10, %%xmm2\n" + "movaps %%xmm9, %%xmm1\n" + "movaps %%xmm9, %%xmm3\n" + "addps %%xmm11, %%xmm1\n" + "subps %%xmm11, %%xmm3\n" + "movups %%xmm0, (%0)\n" + "movups %%xmm1, (%1)\n" + "movups %%xmm2, (%2)\n" + "movups %%xmm3, (%3)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 16384), "r"(buf + j + k + 32768), "r"(buf + j + k + 49152) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } +} +void helper_float_16(float *buf); +void helper_float_16(float *buf) { + helper_float_16_recursive(buf, 16); +} +void helper_float_17_recursive(float *buf, int depth); +void helper_float_17_recursive(float *buf, int depth) { + if (depth == 11) { + for (int j = 0; j < 2048; j += 32) { + for (int k = 0; k < 4; k += 4) { + __asm__ volatile ( + "movups (%0), %%xmm0\n" + "movups (%1), %%xmm1\n" + "movups (%2), %%xmm2\n" + "movups 
(%3), %%xmm3\n" + "movups (%4), %%xmm4\n" + "movups (%5), %%xmm5\n" + "movups (%6), %%xmm6\n" + "movups (%7), %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm0, %%xmm0\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm0, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm0\n" + "movaps %%xmm1, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm1, %%xmm1\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm1, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm1\n" + "movaps %%xmm2, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm2, %%xmm2\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm2, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm2\n" + "movaps %%xmm3, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm3, %%xmm3\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm3, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm3\n" + "movaps %%xmm4, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm4, %%xmm4\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm4, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm4\n" + "movaps %%xmm5, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm5, %%xmm5\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm5, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm5\n" + "movaps %%xmm6, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm6, %%xmm6\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm6, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm6\n" + "movaps %%xmm7, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm7, %%xmm7\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm7, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm0, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm0, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm0\n" + "movaps %%xmm1, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm1, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm1, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm1\n" + "movaps %%xmm2, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm2, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm2, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm2\n" + "movaps %%xmm3, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm3, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm3, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm3\n" + "movaps %%xmm4, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm4, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm4, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm4\n" + "movaps %%xmm5, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm5, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm5, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" 
+ "movaps %%xmm10, %%xmm5\n" + "movaps %%xmm6, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm6, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm6, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm6\n" + "movaps %%xmm7, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm7, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm7, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm9\n" + "addps %%xmm1, %%xmm8\n" + "subps %%xmm1, %%xmm9\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm11\n" + "addps %%xmm3, %%xmm10\n" + "subps %%xmm3, %%xmm11\n" + "movaps %%xmm4, %%xmm12\n" + "movaps %%xmm4, %%xmm13\n" + "addps %%xmm5, %%xmm12\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm6, %%xmm14\n" + "movaps %%xmm6, %%xmm15\n" + "addps %%xmm7, %%xmm14\n" + "subps %%xmm7, %%xmm15\n" + "movaps %%xmm8, %%xmm0\n" + "movaps %%xmm8, %%xmm2\n" + "addps %%xmm10, %%xmm0\n" + "subps %%xmm10, %%xmm2\n" + "movaps %%xmm9, %%xmm1\n" + "movaps %%xmm9, %%xmm3\n" + "addps %%xmm11, %%xmm1\n" + "subps %%xmm11, %%xmm3\n" + "movaps %%xmm12, %%xmm4\n" + "movaps %%xmm12, %%xmm6\n" + "addps %%xmm14, %%xmm4\n" + "subps %%xmm14, %%xmm6\n" + "movaps %%xmm13, %%xmm5\n" + "movaps %%xmm13, %%xmm7\n" + "addps %%xmm15, %%xmm5\n" + "subps %%xmm15, %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm12\n" + "addps %%xmm4, %%xmm8\n" + "subps %%xmm4, %%xmm12\n" + "movaps %%xmm1, %%xmm9\n" + "movaps %%xmm1, %%xmm13\n" + "addps %%xmm5, %%xmm9\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm14\n" + "addps %%xmm6, %%xmm10\n" + "subps %%xmm6, %%xmm14\n" + "movaps %%xmm3, %%xmm11\n" + "movaps %%xmm3, %%xmm15\n" + "addps %%xmm7, %%xmm11\n" + "subps %%xmm7, %%xmm15\n" + "movups %%xmm8, (%0)\n" + "movups %%xmm9, (%1)\n" + "movups %%xmm10, (%2)\n" + "movups %%xmm11, (%3)\n" + "movups %%xmm12, (%4)\n" + "movups %%xmm13, (%5)\n" + "movups %%xmm14, (%6)\n" + "movups %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + for (int j = 0; j < 2048; j += 256) { + for (int k = 0; k < 32; k += 4) { + __asm__ volatile ( + "movups (%0), %%xmm0\n" + "movups (%1), %%xmm1\n" + "movups (%2), %%xmm2\n" + "movups (%3), %%xmm3\n" + "movups (%4), %%xmm4\n" + "movups (%5), %%xmm5\n" + "movups (%6), %%xmm6\n" + "movups (%7), %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm9\n" + "addps %%xmm1, %%xmm8\n" + "subps %%xmm1, %%xmm9\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm11\n" + "addps %%xmm3, %%xmm10\n" + "subps %%xmm3, %%xmm11\n" + "movaps %%xmm4, %%xmm12\n" + "movaps %%xmm4, %%xmm13\n" + "addps %%xmm5, %%xmm12\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm6, %%xmm14\n" + "movaps %%xmm6, %%xmm15\n" + "addps %%xmm7, %%xmm14\n" + "subps %%xmm7, %%xmm15\n" + "movaps %%xmm8, %%xmm0\n" + "movaps %%xmm8, %%xmm2\n" + "addps %%xmm10, %%xmm0\n" + "subps %%xmm10, %%xmm2\n" + "movaps %%xmm9, %%xmm1\n" + "movaps %%xmm9, %%xmm3\n" + "addps %%xmm11, %%xmm1\n" + "subps %%xmm11, %%xmm3\n" + "movaps 
%%xmm12, %%xmm4\n" + "movaps %%xmm12, %%xmm6\n" + "addps %%xmm14, %%xmm4\n" + "subps %%xmm14, %%xmm6\n" + "movaps %%xmm13, %%xmm5\n" + "movaps %%xmm13, %%xmm7\n" + "addps %%xmm15, %%xmm5\n" + "subps %%xmm15, %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm12\n" + "addps %%xmm4, %%xmm8\n" + "subps %%xmm4, %%xmm12\n" + "movaps %%xmm1, %%xmm9\n" + "movaps %%xmm1, %%xmm13\n" + "addps %%xmm5, %%xmm9\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm14\n" + "addps %%xmm6, %%xmm10\n" + "subps %%xmm6, %%xmm14\n" + "movaps %%xmm3, %%xmm11\n" + "movaps %%xmm3, %%xmm15\n" + "addps %%xmm7, %%xmm11\n" + "subps %%xmm7, %%xmm15\n" + "movups %%xmm8, (%0)\n" + "movups %%xmm9, (%1)\n" + "movups %%xmm10, (%2)\n" + "movups %%xmm11, (%3)\n" + "movups %%xmm12, (%4)\n" + "movups %%xmm13, (%5)\n" + "movups %%xmm14, (%6)\n" + "movups %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96), "r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + for (int j = 0; j < 2048; j += 2048) { + for (int k = 0; k < 256; k += 4) { + __asm__ volatile ( + "movups (%0), %%xmm0\n" + "movups (%1), %%xmm1\n" + "movups (%2), %%xmm2\n" + "movups (%3), %%xmm3\n" + "movups (%4), %%xmm4\n" + "movups (%5), %%xmm5\n" + "movups (%6), %%xmm6\n" + "movups (%7), %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm9\n" + "addps %%xmm1, %%xmm8\n" + "subps %%xmm1, %%xmm9\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm11\n" + "addps %%xmm3, %%xmm10\n" + "subps %%xmm3, %%xmm11\n" + "movaps %%xmm4, %%xmm12\n" + "movaps %%xmm4, %%xmm13\n" + "addps %%xmm5, %%xmm12\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm6, %%xmm14\n" + "movaps %%xmm6, %%xmm15\n" + "addps %%xmm7, %%xmm14\n" + "subps %%xmm7, %%xmm15\n" + "movaps %%xmm8, %%xmm0\n" + "movaps %%xmm8, %%xmm2\n" + "addps %%xmm10, %%xmm0\n" + "subps %%xmm10, %%xmm2\n" + "movaps %%xmm9, %%xmm1\n" + "movaps %%xmm9, %%xmm3\n" + "addps %%xmm11, %%xmm1\n" + "subps %%xmm11, %%xmm3\n" + "movaps %%xmm12, %%xmm4\n" + "movaps %%xmm12, %%xmm6\n" + "addps %%xmm14, %%xmm4\n" + "subps %%xmm14, %%xmm6\n" + "movaps %%xmm13, %%xmm5\n" + "movaps %%xmm13, %%xmm7\n" + "addps %%xmm15, %%xmm5\n" + "subps %%xmm15, %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm12\n" + "addps %%xmm4, %%xmm8\n" + "subps %%xmm4, %%xmm12\n" + "movaps %%xmm1, %%xmm9\n" + "movaps %%xmm1, %%xmm13\n" + "addps %%xmm5, %%xmm9\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm14\n" + "addps %%xmm6, %%xmm10\n" + "subps %%xmm6, %%xmm14\n" + "movaps %%xmm3, %%xmm11\n" + "movaps %%xmm3, %%xmm15\n" + "addps %%xmm7, %%xmm11\n" + "subps %%xmm7, %%xmm15\n" + "movups %%xmm8, (%0)\n" + "movups %%xmm9, (%1)\n" + "movups %%xmm10, (%2)\n" + "movups %%xmm11, (%3)\n" + "movups %%xmm12, (%4)\n" + "movups %%xmm13, (%5)\n" + "movups %%xmm14, (%6)\n" + "movups %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 256), "r"(buf + j + k + 512), "r"(buf + j + k + 768), "r"(buf + j + k + 1024), "r"(buf + j + k + 1280), "r"(buf + j + k + 1536), "r"(buf + j + k + 1792) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } + if (depth == 14) { + 
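+ /* Annotation (hedged, not in the original generated source): this depth-14 branch appears to split the 16384-float block into eight 2048-float sub-blocks, recurse on each at depth 11, and then combine them in the loop below with three add/sub butterfly stages (strides 2048, 4096 and 8192), i.e. an unnormalized Walsh-Hadamard radix-8 merge. */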
helper_float_17_recursive(buf + 0, 11); + helper_float_17_recursive(buf + 2048, 11); + helper_float_17_recursive(buf + 4096, 11); + helper_float_17_recursive(buf + 6144, 11); + helper_float_17_recursive(buf + 8192, 11); + helper_float_17_recursive(buf + 10240, 11); + helper_float_17_recursive(buf + 12288, 11); + helper_float_17_recursive(buf + 14336, 11); + for (int j = 0; j < 16384; j += 16384) { + for (int k = 0; k < 2048; k += 4) { + __asm__ volatile ( + "movups (%0), %%xmm0\n" + "movups (%1), %%xmm1\n" + "movups (%2), %%xmm2\n" + "movups (%3), %%xmm3\n" + "movups (%4), %%xmm4\n" + "movups (%5), %%xmm5\n" + "movups (%6), %%xmm6\n" + "movups (%7), %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm9\n" + "addps %%xmm1, %%xmm8\n" + "subps %%xmm1, %%xmm9\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm11\n" + "addps %%xmm3, %%xmm10\n" + "subps %%xmm3, %%xmm11\n" + "movaps %%xmm4, %%xmm12\n" + "movaps %%xmm4, %%xmm13\n" + "addps %%xmm5, %%xmm12\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm6, %%xmm14\n" + "movaps %%xmm6, %%xmm15\n" + "addps %%xmm7, %%xmm14\n" + "subps %%xmm7, %%xmm15\n" + "movaps %%xmm8, %%xmm0\n" + "movaps %%xmm8, %%xmm2\n" + "addps %%xmm10, %%xmm0\n" + "subps %%xmm10, %%xmm2\n" + "movaps %%xmm9, %%xmm1\n" + "movaps %%xmm9, %%xmm3\n" + "addps %%xmm11, %%xmm1\n" + "subps %%xmm11, %%xmm3\n" + "movaps %%xmm12, %%xmm4\n" + "movaps %%xmm12, %%xmm6\n" + "addps %%xmm14, %%xmm4\n" + "subps %%xmm14, %%xmm6\n" + "movaps %%xmm13, %%xmm5\n" + "movaps %%xmm13, %%xmm7\n" + "addps %%xmm15, %%xmm5\n" + "subps %%xmm15, %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm12\n" + "addps %%xmm4, %%xmm8\n" + "subps %%xmm4, %%xmm12\n" + "movaps %%xmm1, %%xmm9\n" + "movaps %%xmm1, %%xmm13\n" + "addps %%xmm5, %%xmm9\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm14\n" + "addps %%xmm6, %%xmm10\n" + "subps %%xmm6, %%xmm14\n" + "movaps %%xmm3, %%xmm11\n" + "movaps %%xmm3, %%xmm15\n" + "addps %%xmm7, %%xmm11\n" + "subps %%xmm7, %%xmm15\n" + "movups %%xmm8, (%0)\n" + "movups %%xmm9, (%1)\n" + "movups %%xmm10, (%2)\n" + "movups %%xmm11, (%3)\n" + "movups %%xmm12, (%4)\n" + "movups %%xmm13, (%5)\n" + "movups %%xmm14, (%6)\n" + "movups %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 2048), "r"(buf + j + k + 4096), "r"(buf + j + k + 6144), "r"(buf + j + k + 8192), "r"(buf + j + k + 10240), "r"(buf + j + k + 12288), "r"(buf + j + k + 14336) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } + if (depth == 17) { + helper_float_17_recursive(buf + 0, 14); + helper_float_17_recursive(buf + 16384, 14); + helper_float_17_recursive(buf + 32768, 14); + helper_float_17_recursive(buf + 49152, 14); + helper_float_17_recursive(buf + 65536, 14); + helper_float_17_recursive(buf + 81920, 14); + helper_float_17_recursive(buf + 98304, 14); + helper_float_17_recursive(buf + 114688, 14); + for (int j = 0; j < 131072; j += 131072) { + for (int k = 0; k < 16384; k += 4) { + __asm__ volatile ( + "movups (%0), %%xmm0\n" + "movups (%1), %%xmm1\n" + "movups (%2), %%xmm2\n" + "movups (%3), %%xmm3\n" + "movups (%4), %%xmm4\n" + "movups (%5), %%xmm5\n" + "movups (%6), %%xmm6\n" + "movups (%7), %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm9\n" + "addps %%xmm1, %%xmm8\n" + "subps %%xmm1, %%xmm9\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm11\n" + "addps %%xmm3, %%xmm10\n" + "subps %%xmm3, %%xmm11\n" + 
"movaps %%xmm4, %%xmm12\n" + "movaps %%xmm4, %%xmm13\n" + "addps %%xmm5, %%xmm12\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm6, %%xmm14\n" + "movaps %%xmm6, %%xmm15\n" + "addps %%xmm7, %%xmm14\n" + "subps %%xmm7, %%xmm15\n" + "movaps %%xmm8, %%xmm0\n" + "movaps %%xmm8, %%xmm2\n" + "addps %%xmm10, %%xmm0\n" + "subps %%xmm10, %%xmm2\n" + "movaps %%xmm9, %%xmm1\n" + "movaps %%xmm9, %%xmm3\n" + "addps %%xmm11, %%xmm1\n" + "subps %%xmm11, %%xmm3\n" + "movaps %%xmm12, %%xmm4\n" + "movaps %%xmm12, %%xmm6\n" + "addps %%xmm14, %%xmm4\n" + "subps %%xmm14, %%xmm6\n" + "movaps %%xmm13, %%xmm5\n" + "movaps %%xmm13, %%xmm7\n" + "addps %%xmm15, %%xmm5\n" + "subps %%xmm15, %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm12\n" + "addps %%xmm4, %%xmm8\n" + "subps %%xmm4, %%xmm12\n" + "movaps %%xmm1, %%xmm9\n" + "movaps %%xmm1, %%xmm13\n" + "addps %%xmm5, %%xmm9\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm14\n" + "addps %%xmm6, %%xmm10\n" + "subps %%xmm6, %%xmm14\n" + "movaps %%xmm3, %%xmm11\n" + "movaps %%xmm3, %%xmm15\n" + "addps %%xmm7, %%xmm11\n" + "subps %%xmm7, %%xmm15\n" + "movups %%xmm8, (%0)\n" + "movups %%xmm9, (%1)\n" + "movups %%xmm10, (%2)\n" + "movups %%xmm11, (%3)\n" + "movups %%xmm12, (%4)\n" + "movups %%xmm13, (%5)\n" + "movups %%xmm14, (%6)\n" + "movups %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 16384), "r"(buf + j + k + 32768), "r"(buf + j + k + 49152), "r"(buf + j + k + 65536), "r"(buf + j + k + 81920), "r"(buf + j + k + 98304), "r"(buf + j + k + 114688) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } +} +void helper_float_17(float *buf); +void helper_float_17(float *buf) { + helper_float_17_recursive(buf, 17); +} +void helper_float_18_recursive(float *buf, int depth); +void helper_float_18_recursive(float *buf, int depth) { + if (depth == 13) { + for (int j = 0; j < 8192; j += 32) { + for (int k = 0; k < 4; k += 4) { + __asm__ volatile ( + "movups (%0), %%xmm0\n" + "movups (%1), %%xmm1\n" + "movups (%2), %%xmm2\n" + "movups (%3), %%xmm3\n" + "movups (%4), %%xmm4\n" + "movups (%5), %%xmm5\n" + "movups (%6), %%xmm6\n" + "movups (%7), %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm0, %%xmm0\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm0, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm0\n" + "movaps %%xmm1, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm1, %%xmm1\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm1, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm1\n" + "movaps %%xmm2, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm2, %%xmm2\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm2, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm2\n" + "movaps %%xmm3, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm3, %%xmm3\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm3, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm3\n" + "movaps %%xmm4, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm4, %%xmm4\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm4, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm4\n" + "movaps %%xmm5, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm5, %%xmm5\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm5, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm5\n" + "movaps 
%%xmm6, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm6, %%xmm6\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm6, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm6\n" + "movaps %%xmm7, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm7, %%xmm7\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm7, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm0, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm0, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm0\n" + "movaps %%xmm1, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm1, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm1, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm1\n" + "movaps %%xmm2, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm2, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm2, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm2\n" + "movaps %%xmm3, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm3, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm3, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm3\n" + "movaps %%xmm4, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm4, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm4, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm4\n" + "movaps %%xmm5, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm5, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm5, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm5\n" + "movaps %%xmm6, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm6, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm6, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm6\n" + "movaps %%xmm7, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm7, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm7, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm9\n" + "addps %%xmm1, %%xmm8\n" + "subps %%xmm1, %%xmm9\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm11\n" + "addps %%xmm3, %%xmm10\n" + "subps %%xmm3, %%xmm11\n" + "movaps %%xmm4, %%xmm12\n" + "movaps %%xmm4, %%xmm13\n" + "addps %%xmm5, %%xmm12\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm6, %%xmm14\n" + "movaps %%xmm6, %%xmm15\n" + "addps %%xmm7, %%xmm14\n" + "subps %%xmm7, %%xmm15\n" + "movaps %%xmm8, %%xmm0\n" + "movaps %%xmm8, %%xmm2\n" + "addps %%xmm10, %%xmm0\n" + "subps %%xmm10, %%xmm2\n" + "movaps %%xmm9, %%xmm1\n" + "movaps %%xmm9, %%xmm3\n" + "addps %%xmm11, %%xmm1\n" + "subps %%xmm11, %%xmm3\n" + "movaps %%xmm12, %%xmm4\n" + "movaps %%xmm12, %%xmm6\n" + "addps 
%%xmm14, %%xmm4\n" + "subps %%xmm14, %%xmm6\n" + "movaps %%xmm13, %%xmm5\n" + "movaps %%xmm13, %%xmm7\n" + "addps %%xmm15, %%xmm5\n" + "subps %%xmm15, %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm12\n" + "addps %%xmm4, %%xmm8\n" + "subps %%xmm4, %%xmm12\n" + "movaps %%xmm1, %%xmm9\n" + "movaps %%xmm1, %%xmm13\n" + "addps %%xmm5, %%xmm9\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm14\n" + "addps %%xmm6, %%xmm10\n" + "subps %%xmm6, %%xmm14\n" + "movaps %%xmm3, %%xmm11\n" + "movaps %%xmm3, %%xmm15\n" + "addps %%xmm7, %%xmm11\n" + "subps %%xmm7, %%xmm15\n" + "movups %%xmm8, (%0)\n" + "movups %%xmm9, (%1)\n" + "movups %%xmm10, (%2)\n" + "movups %%xmm11, (%3)\n" + "movups %%xmm12, (%4)\n" + "movups %%xmm13, (%5)\n" + "movups %%xmm14, (%6)\n" + "movups %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + for (int j = 0; j < 8192; j += 256) { + for (int k = 0; k < 32; k += 4) { + __asm__ volatile ( + "movups (%0), %%xmm0\n" + "movups (%1), %%xmm1\n" + "movups (%2), %%xmm2\n" + "movups (%3), %%xmm3\n" + "movups (%4), %%xmm4\n" + "movups (%5), %%xmm5\n" + "movups (%6), %%xmm6\n" + "movups (%7), %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm9\n" + "addps %%xmm1, %%xmm8\n" + "subps %%xmm1, %%xmm9\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm11\n" + "addps %%xmm3, %%xmm10\n" + "subps %%xmm3, %%xmm11\n" + "movaps %%xmm4, %%xmm12\n" + "movaps %%xmm4, %%xmm13\n" + "addps %%xmm5, %%xmm12\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm6, %%xmm14\n" + "movaps %%xmm6, %%xmm15\n" + "addps %%xmm7, %%xmm14\n" + "subps %%xmm7, %%xmm15\n" + "movaps %%xmm8, %%xmm0\n" + "movaps %%xmm8, %%xmm2\n" + "addps %%xmm10, %%xmm0\n" + "subps %%xmm10, %%xmm2\n" + "movaps %%xmm9, %%xmm1\n" + "movaps %%xmm9, %%xmm3\n" + "addps %%xmm11, %%xmm1\n" + "subps %%xmm11, %%xmm3\n" + "movaps %%xmm12, %%xmm4\n" + "movaps %%xmm12, %%xmm6\n" + "addps %%xmm14, %%xmm4\n" + "subps %%xmm14, %%xmm6\n" + "movaps %%xmm13, %%xmm5\n" + "movaps %%xmm13, %%xmm7\n" + "addps %%xmm15, %%xmm5\n" + "subps %%xmm15, %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm12\n" + "addps %%xmm4, %%xmm8\n" + "subps %%xmm4, %%xmm12\n" + "movaps %%xmm1, %%xmm9\n" + "movaps %%xmm1, %%xmm13\n" + "addps %%xmm5, %%xmm9\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm14\n" + "addps %%xmm6, %%xmm10\n" + "subps %%xmm6, %%xmm14\n" + "movaps %%xmm3, %%xmm11\n" + "movaps %%xmm3, %%xmm15\n" + "addps %%xmm7, %%xmm11\n" + "subps %%xmm7, %%xmm15\n" + "movups %%xmm8, (%0)\n" + "movups %%xmm9, (%1)\n" + "movups %%xmm10, (%2)\n" + "movups %%xmm11, (%3)\n" + "movups %%xmm12, (%4)\n" + "movups %%xmm13, (%5)\n" + "movups %%xmm14, (%6)\n" + "movups %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96), "r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + for (int j = 0; j < 8192; j += 2048) { + for (int k = 0; k < 256; k += 4) { + __asm__ volatile ( + "movups (%0), %%xmm0\n" + 
"movups (%1), %%xmm1\n" + "movups (%2), %%xmm2\n" + "movups (%3), %%xmm3\n" + "movups (%4), %%xmm4\n" + "movups (%5), %%xmm5\n" + "movups (%6), %%xmm6\n" + "movups (%7), %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm9\n" + "addps %%xmm1, %%xmm8\n" + "subps %%xmm1, %%xmm9\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm11\n" + "addps %%xmm3, %%xmm10\n" + "subps %%xmm3, %%xmm11\n" + "movaps %%xmm4, %%xmm12\n" + "movaps %%xmm4, %%xmm13\n" + "addps %%xmm5, %%xmm12\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm6, %%xmm14\n" + "movaps %%xmm6, %%xmm15\n" + "addps %%xmm7, %%xmm14\n" + "subps %%xmm7, %%xmm15\n" + "movaps %%xmm8, %%xmm0\n" + "movaps %%xmm8, %%xmm2\n" + "addps %%xmm10, %%xmm0\n" + "subps %%xmm10, %%xmm2\n" + "movaps %%xmm9, %%xmm1\n" + "movaps %%xmm9, %%xmm3\n" + "addps %%xmm11, %%xmm1\n" + "subps %%xmm11, %%xmm3\n" + "movaps %%xmm12, %%xmm4\n" + "movaps %%xmm12, %%xmm6\n" + "addps %%xmm14, %%xmm4\n" + "subps %%xmm14, %%xmm6\n" + "movaps %%xmm13, %%xmm5\n" + "movaps %%xmm13, %%xmm7\n" + "addps %%xmm15, %%xmm5\n" + "subps %%xmm15, %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm12\n" + "addps %%xmm4, %%xmm8\n" + "subps %%xmm4, %%xmm12\n" + "movaps %%xmm1, %%xmm9\n" + "movaps %%xmm1, %%xmm13\n" + "addps %%xmm5, %%xmm9\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm14\n" + "addps %%xmm6, %%xmm10\n" + "subps %%xmm6, %%xmm14\n" + "movaps %%xmm3, %%xmm11\n" + "movaps %%xmm3, %%xmm15\n" + "addps %%xmm7, %%xmm11\n" + "subps %%xmm7, %%xmm15\n" + "movups %%xmm8, (%0)\n" + "movups %%xmm9, (%1)\n" + "movups %%xmm10, (%2)\n" + "movups %%xmm11, (%3)\n" + "movups %%xmm12, (%4)\n" + "movups %%xmm13, (%5)\n" + "movups %%xmm14, (%6)\n" + "movups %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 256), "r"(buf + j + k + 512), "r"(buf + j + k + 768), "r"(buf + j + k + 1024), "r"(buf + j + k + 1280), "r"(buf + j + k + 1536), "r"(buf + j + k + 1792) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + for (int j = 0; j < 8192; j += 8192) { + for (int k = 0; k < 2048; k += 4) { + __asm__ volatile ( + "movups (%0), %%xmm0\n" + "movups (%1), %%xmm1\n" + "movups (%2), %%xmm2\n" + "movups (%3), %%xmm3\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm9\n" + "addps %%xmm1, %%xmm8\n" + "subps %%xmm1, %%xmm9\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm11\n" + "addps %%xmm3, %%xmm10\n" + "subps %%xmm3, %%xmm11\n" + "movaps %%xmm8, %%xmm0\n" + "movaps %%xmm8, %%xmm2\n" + "addps %%xmm10, %%xmm0\n" + "subps %%xmm10, %%xmm2\n" + "movaps %%xmm9, %%xmm1\n" + "movaps %%xmm9, %%xmm3\n" + "addps %%xmm11, %%xmm1\n" + "subps %%xmm11, %%xmm3\n" + "movups %%xmm0, (%0)\n" + "movups %%xmm1, (%1)\n" + "movups %%xmm2, (%2)\n" + "movups %%xmm3, (%3)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 2048), "r"(buf + j + k + 4096), "r"(buf + j + k + 6144) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } + if (depth == 16) { + helper_float_18_recursive(buf + 0, 13); + helper_float_18_recursive(buf + 8192, 13); + helper_float_18_recursive(buf + 16384, 13); + helper_float_18_recursive(buf + 24576, 13); + helper_float_18_recursive(buf + 32768, 13); + helper_float_18_recursive(buf + 40960, 13); + helper_float_18_recursive(buf + 49152, 13); + helper_float_18_recursive(buf + 57344, 13); + 
for (int j = 0; j < 65536; j += 65536) { + for (int k = 0; k < 8192; k += 4) { + __asm__ volatile ( + "movups (%0), %%xmm0\n" + "movups (%1), %%xmm1\n" + "movups (%2), %%xmm2\n" + "movups (%3), %%xmm3\n" + "movups (%4), %%xmm4\n" + "movups (%5), %%xmm5\n" + "movups (%6), %%xmm6\n" + "movups (%7), %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm9\n" + "addps %%xmm1, %%xmm8\n" + "subps %%xmm1, %%xmm9\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm11\n" + "addps %%xmm3, %%xmm10\n" + "subps %%xmm3, %%xmm11\n" + "movaps %%xmm4, %%xmm12\n" + "movaps %%xmm4, %%xmm13\n" + "addps %%xmm5, %%xmm12\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm6, %%xmm14\n" + "movaps %%xmm6, %%xmm15\n" + "addps %%xmm7, %%xmm14\n" + "subps %%xmm7, %%xmm15\n" + "movaps %%xmm8, %%xmm0\n" + "movaps %%xmm8, %%xmm2\n" + "addps %%xmm10, %%xmm0\n" + "subps %%xmm10, %%xmm2\n" + "movaps %%xmm9, %%xmm1\n" + "movaps %%xmm9, %%xmm3\n" + "addps %%xmm11, %%xmm1\n" + "subps %%xmm11, %%xmm3\n" + "movaps %%xmm12, %%xmm4\n" + "movaps %%xmm12, %%xmm6\n" + "addps %%xmm14, %%xmm4\n" + "subps %%xmm14, %%xmm6\n" + "movaps %%xmm13, %%xmm5\n" + "movaps %%xmm13, %%xmm7\n" + "addps %%xmm15, %%xmm5\n" + "subps %%xmm15, %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm12\n" + "addps %%xmm4, %%xmm8\n" + "subps %%xmm4, %%xmm12\n" + "movaps %%xmm1, %%xmm9\n" + "movaps %%xmm1, %%xmm13\n" + "addps %%xmm5, %%xmm9\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm14\n" + "addps %%xmm6, %%xmm10\n" + "subps %%xmm6, %%xmm14\n" + "movaps %%xmm3, %%xmm11\n" + "movaps %%xmm3, %%xmm15\n" + "addps %%xmm7, %%xmm11\n" + "subps %%xmm7, %%xmm15\n" + "movups %%xmm8, (%0)\n" + "movups %%xmm9, (%1)\n" + "movups %%xmm10, (%2)\n" + "movups %%xmm11, (%3)\n" + "movups %%xmm12, (%4)\n" + "movups %%xmm13, (%5)\n" + "movups %%xmm14, (%6)\n" + "movups %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 8192), "r"(buf + j + k + 16384), "r"(buf + j + k + 24576), "r"(buf + j + k + 32768), "r"(buf + j + k + 40960), "r"(buf + j + k + 49152), "r"(buf + j + k + 57344) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } + if (depth == 18) { + helper_float_18_recursive(buf + 0, 16); + helper_float_18_recursive(buf + 65536, 16); + helper_float_18_recursive(buf + 131072, 16); + helper_float_18_recursive(buf + 196608, 16); + for (int j = 0; j < 262144; j += 262144) { + for (int k = 0; k < 65536; k += 4) { + __asm__ volatile ( + "movups (%0), %%xmm0\n" + "movups (%1), %%xmm1\n" + "movups (%2), %%xmm2\n" + "movups (%3), %%xmm3\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm9\n" + "addps %%xmm1, %%xmm8\n" + "subps %%xmm1, %%xmm9\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm11\n" + "addps %%xmm3, %%xmm10\n" + "subps %%xmm3, %%xmm11\n" + "movaps %%xmm8, %%xmm0\n" + "movaps %%xmm8, %%xmm2\n" + "addps %%xmm10, %%xmm0\n" + "subps %%xmm10, %%xmm2\n" + "movaps %%xmm9, %%xmm1\n" + "movaps %%xmm9, %%xmm3\n" + "addps %%xmm11, %%xmm1\n" + "subps %%xmm11, %%xmm3\n" + "movups %%xmm0, (%0)\n" + "movups %%xmm1, (%1)\n" + "movups %%xmm2, (%2)\n" + "movups %%xmm3, (%3)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 65536), "r"(buf + j + k + 131072), "r"(buf + j + k + 196608) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } +} +void 
helper_float_18(float *buf); +void helper_float_18(float *buf) { + helper_float_18_recursive(buf, 18); +} +void helper_float_19_recursive(float *buf, int depth); +void helper_float_19_recursive(float *buf, int depth) { + if (depth == 13) { + for (int j = 0; j < 8192; j += 32) { + for (int k = 0; k < 4; k += 4) { + __asm__ volatile ( + "movups (%0), %%xmm0\n" + "movups (%1), %%xmm1\n" + "movups (%2), %%xmm2\n" + "movups (%3), %%xmm3\n" + "movups (%4), %%xmm4\n" + "movups (%5), %%xmm5\n" + "movups (%6), %%xmm6\n" + "movups (%7), %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm0, %%xmm0\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm0, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm0\n" + "movaps %%xmm1, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm1, %%xmm1\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm1, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm1\n" + "movaps %%xmm2, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm2, %%xmm2\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm2, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm2\n" + "movaps %%xmm3, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm3, %%xmm3\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm3, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm3\n" + "movaps %%xmm4, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm4, %%xmm4\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm4, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm4\n" + "movaps %%xmm5, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm5, %%xmm5\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm5, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm5\n" + "movaps %%xmm6, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm6, %%xmm6\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm6, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm6\n" + "movaps %%xmm7, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm7, %%xmm7\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm7, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm0, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm0, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm0\n" + "movaps %%xmm1, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm1, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm1, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm1\n" + "movaps %%xmm2, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm2, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm2, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm2\n" + "movaps %%xmm3, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm3, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm3, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm3\n" + "movaps %%xmm4, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm4, %%xmm10\n" + "shufps $14, %%xmm9, 
%%xmm10\n" + "movaps %%xmm4, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm4\n" + "movaps %%xmm5, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm5, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm5, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm5\n" + "movaps %%xmm6, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm6, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm6, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm6\n" + "movaps %%xmm7, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm7, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm7, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm9\n" + "addps %%xmm1, %%xmm8\n" + "subps %%xmm1, %%xmm9\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm11\n" + "addps %%xmm3, %%xmm10\n" + "subps %%xmm3, %%xmm11\n" + "movaps %%xmm4, %%xmm12\n" + "movaps %%xmm4, %%xmm13\n" + "addps %%xmm5, %%xmm12\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm6, %%xmm14\n" + "movaps %%xmm6, %%xmm15\n" + "addps %%xmm7, %%xmm14\n" + "subps %%xmm7, %%xmm15\n" + "movaps %%xmm8, %%xmm0\n" + "movaps %%xmm8, %%xmm2\n" + "addps %%xmm10, %%xmm0\n" + "subps %%xmm10, %%xmm2\n" + "movaps %%xmm9, %%xmm1\n" + "movaps %%xmm9, %%xmm3\n" + "addps %%xmm11, %%xmm1\n" + "subps %%xmm11, %%xmm3\n" + "movaps %%xmm12, %%xmm4\n" + "movaps %%xmm12, %%xmm6\n" + "addps %%xmm14, %%xmm4\n" + "subps %%xmm14, %%xmm6\n" + "movaps %%xmm13, %%xmm5\n" + "movaps %%xmm13, %%xmm7\n" + "addps %%xmm15, %%xmm5\n" + "subps %%xmm15, %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm12\n" + "addps %%xmm4, %%xmm8\n" + "subps %%xmm4, %%xmm12\n" + "movaps %%xmm1, %%xmm9\n" + "movaps %%xmm1, %%xmm13\n" + "addps %%xmm5, %%xmm9\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm14\n" + "addps %%xmm6, %%xmm10\n" + "subps %%xmm6, %%xmm14\n" + "movaps %%xmm3, %%xmm11\n" + "movaps %%xmm3, %%xmm15\n" + "addps %%xmm7, %%xmm11\n" + "subps %%xmm7, %%xmm15\n" + "movups %%xmm8, (%0)\n" + "movups %%xmm9, (%1)\n" + "movups %%xmm10, (%2)\n" + "movups %%xmm11, (%3)\n" + "movups %%xmm12, (%4)\n" + "movups %%xmm13, (%5)\n" + "movups %%xmm14, (%6)\n" + "movups %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + for (int j = 0; j < 8192; j += 256) { + for (int k = 0; k < 32; k += 4) { + __asm__ volatile ( + "movups (%0), %%xmm0\n" + "movups (%1), %%xmm1\n" + "movups (%2), %%xmm2\n" + "movups (%3), %%xmm3\n" + "movups (%4), %%xmm4\n" + "movups (%5), %%xmm5\n" + "movups (%6), %%xmm6\n" + "movups (%7), %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm9\n" + "addps %%xmm1, %%xmm8\n" + "subps %%xmm1, %%xmm9\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm11\n" + "addps %%xmm3, %%xmm10\n" + "subps %%xmm3, %%xmm11\n" + "movaps %%xmm4, %%xmm12\n" + 
"movaps %%xmm4, %%xmm13\n" + "addps %%xmm5, %%xmm12\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm6, %%xmm14\n" + "movaps %%xmm6, %%xmm15\n" + "addps %%xmm7, %%xmm14\n" + "subps %%xmm7, %%xmm15\n" + "movaps %%xmm8, %%xmm0\n" + "movaps %%xmm8, %%xmm2\n" + "addps %%xmm10, %%xmm0\n" + "subps %%xmm10, %%xmm2\n" + "movaps %%xmm9, %%xmm1\n" + "movaps %%xmm9, %%xmm3\n" + "addps %%xmm11, %%xmm1\n" + "subps %%xmm11, %%xmm3\n" + "movaps %%xmm12, %%xmm4\n" + "movaps %%xmm12, %%xmm6\n" + "addps %%xmm14, %%xmm4\n" + "subps %%xmm14, %%xmm6\n" + "movaps %%xmm13, %%xmm5\n" + "movaps %%xmm13, %%xmm7\n" + "addps %%xmm15, %%xmm5\n" + "subps %%xmm15, %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm12\n" + "addps %%xmm4, %%xmm8\n" + "subps %%xmm4, %%xmm12\n" + "movaps %%xmm1, %%xmm9\n" + "movaps %%xmm1, %%xmm13\n" + "addps %%xmm5, %%xmm9\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm14\n" + "addps %%xmm6, %%xmm10\n" + "subps %%xmm6, %%xmm14\n" + "movaps %%xmm3, %%xmm11\n" + "movaps %%xmm3, %%xmm15\n" + "addps %%xmm7, %%xmm11\n" + "subps %%xmm7, %%xmm15\n" + "movups %%xmm8, (%0)\n" + "movups %%xmm9, (%1)\n" + "movups %%xmm10, (%2)\n" + "movups %%xmm11, (%3)\n" + "movups %%xmm12, (%4)\n" + "movups %%xmm13, (%5)\n" + "movups %%xmm14, (%6)\n" + "movups %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96), "r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + for (int j = 0; j < 8192; j += 2048) { + for (int k = 0; k < 256; k += 4) { + __asm__ volatile ( + "movups (%0), %%xmm0\n" + "movups (%1), %%xmm1\n" + "movups (%2), %%xmm2\n" + "movups (%3), %%xmm3\n" + "movups (%4), %%xmm4\n" + "movups (%5), %%xmm5\n" + "movups (%6), %%xmm6\n" + "movups (%7), %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm9\n" + "addps %%xmm1, %%xmm8\n" + "subps %%xmm1, %%xmm9\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm11\n" + "addps %%xmm3, %%xmm10\n" + "subps %%xmm3, %%xmm11\n" + "movaps %%xmm4, %%xmm12\n" + "movaps %%xmm4, %%xmm13\n" + "addps %%xmm5, %%xmm12\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm6, %%xmm14\n" + "movaps %%xmm6, %%xmm15\n" + "addps %%xmm7, %%xmm14\n" + "subps %%xmm7, %%xmm15\n" + "movaps %%xmm8, %%xmm0\n" + "movaps %%xmm8, %%xmm2\n" + "addps %%xmm10, %%xmm0\n" + "subps %%xmm10, %%xmm2\n" + "movaps %%xmm9, %%xmm1\n" + "movaps %%xmm9, %%xmm3\n" + "addps %%xmm11, %%xmm1\n" + "subps %%xmm11, %%xmm3\n" + "movaps %%xmm12, %%xmm4\n" + "movaps %%xmm12, %%xmm6\n" + "addps %%xmm14, %%xmm4\n" + "subps %%xmm14, %%xmm6\n" + "movaps %%xmm13, %%xmm5\n" + "movaps %%xmm13, %%xmm7\n" + "addps %%xmm15, %%xmm5\n" + "subps %%xmm15, %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm12\n" + "addps %%xmm4, %%xmm8\n" + "subps %%xmm4, %%xmm12\n" + "movaps %%xmm1, %%xmm9\n" + "movaps %%xmm1, %%xmm13\n" + "addps %%xmm5, %%xmm9\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm14\n" + "addps %%xmm6, %%xmm10\n" + "subps %%xmm6, %%xmm14\n" + "movaps %%xmm3, %%xmm11\n" + "movaps %%xmm3, %%xmm15\n" + "addps %%xmm7, %%xmm11\n" + "subps %%xmm7, %%xmm15\n" + "movups %%xmm8, (%0)\n" + "movups %%xmm9, (%1)\n" + "movups %%xmm10, (%2)\n" + "movups %%xmm11, (%3)\n" + "movups %%xmm12, (%4)\n" + "movups %%xmm13, (%5)\n" + "movups %%xmm14, (%6)\n" + "movups %%xmm15, 
(%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 256), "r"(buf + j + k + 512), "r"(buf + j + k + 768), "r"(buf + j + k + 1024), "r"(buf + j + k + 1280), "r"(buf + j + k + 1536), "r"(buf + j + k + 1792) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + for (int j = 0; j < 8192; j += 8192) { + for (int k = 0; k < 2048; k += 4) { + __asm__ volatile ( + "movups (%0), %%xmm0\n" + "movups (%1), %%xmm1\n" + "movups (%2), %%xmm2\n" + "movups (%3), %%xmm3\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm9\n" + "addps %%xmm1, %%xmm8\n" + "subps %%xmm1, %%xmm9\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm11\n" + "addps %%xmm3, %%xmm10\n" + "subps %%xmm3, %%xmm11\n" + "movaps %%xmm8, %%xmm0\n" + "movaps %%xmm8, %%xmm2\n" + "addps %%xmm10, %%xmm0\n" + "subps %%xmm10, %%xmm2\n" + "movaps %%xmm9, %%xmm1\n" + "movaps %%xmm9, %%xmm3\n" + "addps %%xmm11, %%xmm1\n" + "subps %%xmm11, %%xmm3\n" + "movups %%xmm0, (%0)\n" + "movups %%xmm1, (%1)\n" + "movups %%xmm2, (%2)\n" + "movups %%xmm3, (%3)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 2048), "r"(buf + j + k + 4096), "r"(buf + j + k + 6144) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } + if (depth == 16) { + helper_float_19_recursive(buf + 0, 13); + helper_float_19_recursive(buf + 8192, 13); + helper_float_19_recursive(buf + 16384, 13); + helper_float_19_recursive(buf + 24576, 13); + helper_float_19_recursive(buf + 32768, 13); + helper_float_19_recursive(buf + 40960, 13); + helper_float_19_recursive(buf + 49152, 13); + helper_float_19_recursive(buf + 57344, 13); + for (int j = 0; j < 65536; j += 65536) { + for (int k = 0; k < 8192; k += 4) { + __asm__ volatile ( + "movups (%0), %%xmm0\n" + "movups (%1), %%xmm1\n" + "movups (%2), %%xmm2\n" + "movups (%3), %%xmm3\n" + "movups (%4), %%xmm4\n" + "movups (%5), %%xmm5\n" + "movups (%6), %%xmm6\n" + "movups (%7), %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm9\n" + "addps %%xmm1, %%xmm8\n" + "subps %%xmm1, %%xmm9\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm11\n" + "addps %%xmm3, %%xmm10\n" + "subps %%xmm3, %%xmm11\n" + "movaps %%xmm4, %%xmm12\n" + "movaps %%xmm4, %%xmm13\n" + "addps %%xmm5, %%xmm12\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm6, %%xmm14\n" + "movaps %%xmm6, %%xmm15\n" + "addps %%xmm7, %%xmm14\n" + "subps %%xmm7, %%xmm15\n" + "movaps %%xmm8, %%xmm0\n" + "movaps %%xmm8, %%xmm2\n" + "addps %%xmm10, %%xmm0\n" + "subps %%xmm10, %%xmm2\n" + "movaps %%xmm9, %%xmm1\n" + "movaps %%xmm9, %%xmm3\n" + "addps %%xmm11, %%xmm1\n" + "subps %%xmm11, %%xmm3\n" + "movaps %%xmm12, %%xmm4\n" + "movaps %%xmm12, %%xmm6\n" + "addps %%xmm14, %%xmm4\n" + "subps %%xmm14, %%xmm6\n" + "movaps %%xmm13, %%xmm5\n" + "movaps %%xmm13, %%xmm7\n" + "addps %%xmm15, %%xmm5\n" + "subps %%xmm15, %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm12\n" + "addps %%xmm4, %%xmm8\n" + "subps %%xmm4, %%xmm12\n" + "movaps %%xmm1, %%xmm9\n" + "movaps %%xmm1, %%xmm13\n" + "addps %%xmm5, %%xmm9\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm14\n" + "addps %%xmm6, %%xmm10\n" + "subps %%xmm6, %%xmm14\n" + "movaps %%xmm3, %%xmm11\n" + "movaps %%xmm3, %%xmm15\n" + "addps %%xmm7, %%xmm11\n" + "subps %%xmm7, %%xmm15\n" + "movups %%xmm8, (%0)\n" + "movups %%xmm9, (%1)\n" + "movups %%xmm10, (%2)\n" 
+ "movups %%xmm11, (%3)\n" + "movups %%xmm12, (%4)\n" + "movups %%xmm13, (%5)\n" + "movups %%xmm14, (%6)\n" + "movups %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 8192), "r"(buf + j + k + 16384), "r"(buf + j + k + 24576), "r"(buf + j + k + 32768), "r"(buf + j + k + 40960), "r"(buf + j + k + 49152), "r"(buf + j + k + 57344) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } + if (depth == 19) { + helper_float_19_recursive(buf + 0, 16); + helper_float_19_recursive(buf + 65536, 16); + helper_float_19_recursive(buf + 131072, 16); + helper_float_19_recursive(buf + 196608, 16); + helper_float_19_recursive(buf + 262144, 16); + helper_float_19_recursive(buf + 327680, 16); + helper_float_19_recursive(buf + 393216, 16); + helper_float_19_recursive(buf + 458752, 16); + for (int j = 0; j < 524288; j += 524288) { + for (int k = 0; k < 65536; k += 4) { + __asm__ volatile ( + "movups (%0), %%xmm0\n" + "movups (%1), %%xmm1\n" + "movups (%2), %%xmm2\n" + "movups (%3), %%xmm3\n" + "movups (%4), %%xmm4\n" + "movups (%5), %%xmm5\n" + "movups (%6), %%xmm6\n" + "movups (%7), %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm9\n" + "addps %%xmm1, %%xmm8\n" + "subps %%xmm1, %%xmm9\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm11\n" + "addps %%xmm3, %%xmm10\n" + "subps %%xmm3, %%xmm11\n" + "movaps %%xmm4, %%xmm12\n" + "movaps %%xmm4, %%xmm13\n" + "addps %%xmm5, %%xmm12\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm6, %%xmm14\n" + "movaps %%xmm6, %%xmm15\n" + "addps %%xmm7, %%xmm14\n" + "subps %%xmm7, %%xmm15\n" + "movaps %%xmm8, %%xmm0\n" + "movaps %%xmm8, %%xmm2\n" + "addps %%xmm10, %%xmm0\n" + "subps %%xmm10, %%xmm2\n" + "movaps %%xmm9, %%xmm1\n" + "movaps %%xmm9, %%xmm3\n" + "addps %%xmm11, %%xmm1\n" + "subps %%xmm11, %%xmm3\n" + "movaps %%xmm12, %%xmm4\n" + "movaps %%xmm12, %%xmm6\n" + "addps %%xmm14, %%xmm4\n" + "subps %%xmm14, %%xmm6\n" + "movaps %%xmm13, %%xmm5\n" + "movaps %%xmm13, %%xmm7\n" + "addps %%xmm15, %%xmm5\n" + "subps %%xmm15, %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm12\n" + "addps %%xmm4, %%xmm8\n" + "subps %%xmm4, %%xmm12\n" + "movaps %%xmm1, %%xmm9\n" + "movaps %%xmm1, %%xmm13\n" + "addps %%xmm5, %%xmm9\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm14\n" + "addps %%xmm6, %%xmm10\n" + "subps %%xmm6, %%xmm14\n" + "movaps %%xmm3, %%xmm11\n" + "movaps %%xmm3, %%xmm15\n" + "addps %%xmm7, %%xmm11\n" + "subps %%xmm7, %%xmm15\n" + "movups %%xmm8, (%0)\n" + "movups %%xmm9, (%1)\n" + "movups %%xmm10, (%2)\n" + "movups %%xmm11, (%3)\n" + "movups %%xmm12, (%4)\n" + "movups %%xmm13, (%5)\n" + "movups %%xmm14, (%6)\n" + "movups %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 65536), "r"(buf + j + k + 131072), "r"(buf + j + k + 196608), "r"(buf + j + k + 262144), "r"(buf + j + k + 327680), "r"(buf + j + k + 393216), "r"(buf + j + k + 458752) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } +} +void helper_float_19(float *buf); +void helper_float_19(float *buf) { + helper_float_19_recursive(buf, 19); +} +void helper_float_20_recursive(float *buf, int depth); +void helper_float_20_recursive(float *buf, int depth) { + if (depth == 8) { + for (int j = 0; j < 256; j += 32) { + for (int k = 0; k < 4; k += 4) { + __asm__ volatile ( + 
"movups (%0), %%xmm0\n" + "movups (%1), %%xmm1\n" + "movups (%2), %%xmm2\n" + "movups (%3), %%xmm3\n" + "movups (%4), %%xmm4\n" + "movups (%5), %%xmm5\n" + "movups (%6), %%xmm6\n" + "movups (%7), %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm0, %%xmm0\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm0, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm0\n" + "movaps %%xmm1, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm1, %%xmm1\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm1, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm1\n" + "movaps %%xmm2, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm2, %%xmm2\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm2, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm2\n" + "movaps %%xmm3, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm3, %%xmm3\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm3, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm3\n" + "movaps %%xmm4, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm4, %%xmm4\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm4, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm4\n" + "movaps %%xmm5, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm5, %%xmm5\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm5, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm5\n" + "movaps %%xmm6, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm6, %%xmm6\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm6, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm6\n" + "movaps %%xmm7, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm7, %%xmm7\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm7, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm0, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm0, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm0\n" + "movaps %%xmm1, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm1, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm1, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm1\n" + "movaps %%xmm2, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm2, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm2, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm2\n" + "movaps %%xmm3, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm3, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm3, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm3\n" + "movaps %%xmm4, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm4, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm4, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm4\n" + "movaps %%xmm5, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm5, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm5, %%xmm11\n" + 
"shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm5\n" + "movaps %%xmm6, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm6, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm6, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm6\n" + "movaps %%xmm7, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm7, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm7, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm9\n" + "addps %%xmm1, %%xmm8\n" + "subps %%xmm1, %%xmm9\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm11\n" + "addps %%xmm3, %%xmm10\n" + "subps %%xmm3, %%xmm11\n" + "movaps %%xmm4, %%xmm12\n" + "movaps %%xmm4, %%xmm13\n" + "addps %%xmm5, %%xmm12\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm6, %%xmm14\n" + "movaps %%xmm6, %%xmm15\n" + "addps %%xmm7, %%xmm14\n" + "subps %%xmm7, %%xmm15\n" + "movaps %%xmm8, %%xmm0\n" + "movaps %%xmm8, %%xmm2\n" + "addps %%xmm10, %%xmm0\n" + "subps %%xmm10, %%xmm2\n" + "movaps %%xmm9, %%xmm1\n" + "movaps %%xmm9, %%xmm3\n" + "addps %%xmm11, %%xmm1\n" + "subps %%xmm11, %%xmm3\n" + "movaps %%xmm12, %%xmm4\n" + "movaps %%xmm12, %%xmm6\n" + "addps %%xmm14, %%xmm4\n" + "subps %%xmm14, %%xmm6\n" + "movaps %%xmm13, %%xmm5\n" + "movaps %%xmm13, %%xmm7\n" + "addps %%xmm15, %%xmm5\n" + "subps %%xmm15, %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm12\n" + "addps %%xmm4, %%xmm8\n" + "subps %%xmm4, %%xmm12\n" + "movaps %%xmm1, %%xmm9\n" + "movaps %%xmm1, %%xmm13\n" + "addps %%xmm5, %%xmm9\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm14\n" + "addps %%xmm6, %%xmm10\n" + "subps %%xmm6, %%xmm14\n" + "movaps %%xmm3, %%xmm11\n" + "movaps %%xmm3, %%xmm15\n" + "addps %%xmm7, %%xmm11\n" + "subps %%xmm7, %%xmm15\n" + "movups %%xmm8, (%0)\n" + "movups %%xmm9, (%1)\n" + "movups %%xmm10, (%2)\n" + "movups %%xmm11, (%3)\n" + "movups %%xmm12, (%4)\n" + "movups %%xmm13, (%5)\n" + "movups %%xmm14, (%6)\n" + "movups %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + for (int j = 0; j < 256; j += 256) { + for (int k = 0; k < 32; k += 4) { + __asm__ volatile ( + "movups (%0), %%xmm0\n" + "movups (%1), %%xmm1\n" + "movups (%2), %%xmm2\n" + "movups (%3), %%xmm3\n" + "movups (%4), %%xmm4\n" + "movups (%5), %%xmm5\n" + "movups (%6), %%xmm6\n" + "movups (%7), %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm9\n" + "addps %%xmm1, %%xmm8\n" + "subps %%xmm1, %%xmm9\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm11\n" + "addps %%xmm3, %%xmm10\n" + "subps %%xmm3, %%xmm11\n" + "movaps %%xmm4, %%xmm12\n" + "movaps %%xmm4, %%xmm13\n" + "addps %%xmm5, %%xmm12\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm6, %%xmm14\n" + "movaps %%xmm6, %%xmm15\n" + "addps %%xmm7, %%xmm14\n" + "subps %%xmm7, %%xmm15\n" + "movaps %%xmm8, %%xmm0\n" + "movaps %%xmm8, %%xmm2\n" + "addps %%xmm10, %%xmm0\n" + "subps %%xmm10, %%xmm2\n" + "movaps %%xmm9, %%xmm1\n" + "movaps 
%%xmm9, %%xmm3\n" + "addps %%xmm11, %%xmm1\n" + "subps %%xmm11, %%xmm3\n" + "movaps %%xmm12, %%xmm4\n" + "movaps %%xmm12, %%xmm6\n" + "addps %%xmm14, %%xmm4\n" + "subps %%xmm14, %%xmm6\n" + "movaps %%xmm13, %%xmm5\n" + "movaps %%xmm13, %%xmm7\n" + "addps %%xmm15, %%xmm5\n" + "subps %%xmm15, %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm12\n" + "addps %%xmm4, %%xmm8\n" + "subps %%xmm4, %%xmm12\n" + "movaps %%xmm1, %%xmm9\n" + "movaps %%xmm1, %%xmm13\n" + "addps %%xmm5, %%xmm9\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm14\n" + "addps %%xmm6, %%xmm10\n" + "subps %%xmm6, %%xmm14\n" + "movaps %%xmm3, %%xmm11\n" + "movaps %%xmm3, %%xmm15\n" + "addps %%xmm7, %%xmm11\n" + "subps %%xmm7, %%xmm15\n" + "movups %%xmm8, (%0)\n" + "movups %%xmm9, (%1)\n" + "movups %%xmm10, (%2)\n" + "movups %%xmm11, (%3)\n" + "movups %%xmm12, (%4)\n" + "movups %%xmm13, (%5)\n" + "movups %%xmm14, (%6)\n" + "movups %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96), "r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } + if (depth == 11) { + helper_float_20_recursive(buf + 0, 8); + helper_float_20_recursive(buf + 256, 8); + helper_float_20_recursive(buf + 512, 8); + helper_float_20_recursive(buf + 768, 8); + helper_float_20_recursive(buf + 1024, 8); + helper_float_20_recursive(buf + 1280, 8); + helper_float_20_recursive(buf + 1536, 8); + helper_float_20_recursive(buf + 1792, 8); + for (int j = 0; j < 2048; j += 2048) { + for (int k = 0; k < 256; k += 4) { + __asm__ volatile ( + "movups (%0), %%xmm0\n" + "movups (%1), %%xmm1\n" + "movups (%2), %%xmm2\n" + "movups (%3), %%xmm3\n" + "movups (%4), %%xmm4\n" + "movups (%5), %%xmm5\n" + "movups (%6), %%xmm6\n" + "movups (%7), %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm9\n" + "addps %%xmm1, %%xmm8\n" + "subps %%xmm1, %%xmm9\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm11\n" + "addps %%xmm3, %%xmm10\n" + "subps %%xmm3, %%xmm11\n" + "movaps %%xmm4, %%xmm12\n" + "movaps %%xmm4, %%xmm13\n" + "addps %%xmm5, %%xmm12\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm6, %%xmm14\n" + "movaps %%xmm6, %%xmm15\n" + "addps %%xmm7, %%xmm14\n" + "subps %%xmm7, %%xmm15\n" + "movaps %%xmm8, %%xmm0\n" + "movaps %%xmm8, %%xmm2\n" + "addps %%xmm10, %%xmm0\n" + "subps %%xmm10, %%xmm2\n" + "movaps %%xmm9, %%xmm1\n" + "movaps %%xmm9, %%xmm3\n" + "addps %%xmm11, %%xmm1\n" + "subps %%xmm11, %%xmm3\n" + "movaps %%xmm12, %%xmm4\n" + "movaps %%xmm12, %%xmm6\n" + "addps %%xmm14, %%xmm4\n" + "subps %%xmm14, %%xmm6\n" + "movaps %%xmm13, %%xmm5\n" + "movaps %%xmm13, %%xmm7\n" + "addps %%xmm15, %%xmm5\n" + "subps %%xmm15, %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm12\n" + "addps %%xmm4, %%xmm8\n" + "subps %%xmm4, %%xmm12\n" + "movaps %%xmm1, %%xmm9\n" + "movaps %%xmm1, %%xmm13\n" + "addps %%xmm5, %%xmm9\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm14\n" + "addps %%xmm6, %%xmm10\n" + "subps %%xmm6, %%xmm14\n" + "movaps %%xmm3, %%xmm11\n" + "movaps %%xmm3, %%xmm15\n" + "addps %%xmm7, %%xmm11\n" + "subps %%xmm7, %%xmm15\n" + "movups %%xmm8, (%0)\n" + "movups %%xmm9, (%1)\n" + "movups %%xmm10, (%2)\n" + "movups %%xmm11, (%3)\n" + "movups %%xmm12, (%4)\n" + "movups %%xmm13, (%5)\n" + "movups 
%%xmm14, (%6)\n" + "movups %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 256), "r"(buf + j + k + 512), "r"(buf + j + k + 768), "r"(buf + j + k + 1024), "r"(buf + j + k + 1280), "r"(buf + j + k + 1536), "r"(buf + j + k + 1792) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } + if (depth == 14) { + helper_float_20_recursive(buf + 0, 11); + helper_float_20_recursive(buf + 2048, 11); + helper_float_20_recursive(buf + 4096, 11); + helper_float_20_recursive(buf + 6144, 11); + helper_float_20_recursive(buf + 8192, 11); + helper_float_20_recursive(buf + 10240, 11); + helper_float_20_recursive(buf + 12288, 11); + helper_float_20_recursive(buf + 14336, 11); + for (int j = 0; j < 16384; j += 16384) { + for (int k = 0; k < 2048; k += 4) { + __asm__ volatile ( + "movups (%0), %%xmm0\n" + "movups (%1), %%xmm1\n" + "movups (%2), %%xmm2\n" + "movups (%3), %%xmm3\n" + "movups (%4), %%xmm4\n" + "movups (%5), %%xmm5\n" + "movups (%6), %%xmm6\n" + "movups (%7), %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm9\n" + "addps %%xmm1, %%xmm8\n" + "subps %%xmm1, %%xmm9\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm11\n" + "addps %%xmm3, %%xmm10\n" + "subps %%xmm3, %%xmm11\n" + "movaps %%xmm4, %%xmm12\n" + "movaps %%xmm4, %%xmm13\n" + "addps %%xmm5, %%xmm12\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm6, %%xmm14\n" + "movaps %%xmm6, %%xmm15\n" + "addps %%xmm7, %%xmm14\n" + "subps %%xmm7, %%xmm15\n" + "movaps %%xmm8, %%xmm0\n" + "movaps %%xmm8, %%xmm2\n" + "addps %%xmm10, %%xmm0\n" + "subps %%xmm10, %%xmm2\n" + "movaps %%xmm9, %%xmm1\n" + "movaps %%xmm9, %%xmm3\n" + "addps %%xmm11, %%xmm1\n" + "subps %%xmm11, %%xmm3\n" + "movaps %%xmm12, %%xmm4\n" + "movaps %%xmm12, %%xmm6\n" + "addps %%xmm14, %%xmm4\n" + "subps %%xmm14, %%xmm6\n" + "movaps %%xmm13, %%xmm5\n" + "movaps %%xmm13, %%xmm7\n" + "addps %%xmm15, %%xmm5\n" + "subps %%xmm15, %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm12\n" + "addps %%xmm4, %%xmm8\n" + "subps %%xmm4, %%xmm12\n" + "movaps %%xmm1, %%xmm9\n" + "movaps %%xmm1, %%xmm13\n" + "addps %%xmm5, %%xmm9\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm14\n" + "addps %%xmm6, %%xmm10\n" + "subps %%xmm6, %%xmm14\n" + "movaps %%xmm3, %%xmm11\n" + "movaps %%xmm3, %%xmm15\n" + "addps %%xmm7, %%xmm11\n" + "subps %%xmm7, %%xmm15\n" + "movups %%xmm8, (%0)\n" + "movups %%xmm9, (%1)\n" + "movups %%xmm10, (%2)\n" + "movups %%xmm11, (%3)\n" + "movups %%xmm12, (%4)\n" + "movups %%xmm13, (%5)\n" + "movups %%xmm14, (%6)\n" + "movups %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 2048), "r"(buf + j + k + 4096), "r"(buf + j + k + 6144), "r"(buf + j + k + 8192), "r"(buf + j + k + 10240), "r"(buf + j + k + 12288), "r"(buf + j + k + 14336) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } + if (depth == 17) { + helper_float_20_recursive(buf + 0, 14); + helper_float_20_recursive(buf + 16384, 14); + helper_float_20_recursive(buf + 32768, 14); + helper_float_20_recursive(buf + 49152, 14); + helper_float_20_recursive(buf + 65536, 14); + helper_float_20_recursive(buf + 81920, 14); + helper_float_20_recursive(buf + 98304, 14); + helper_float_20_recursive(buf + 114688, 14); + for (int j = 0; j < 131072; j += 131072) { + for (int k = 0; k < 16384; k += 4) { + 
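/* Descriptive note (added): depth == 17 — the eight 16384-float sub-transforms computed by the recursive calls above are combined with three add/sub butterfly levels at strides 16384, 32768 and 65536. */ +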
__asm__ volatile ( + "movups (%0), %%xmm0\n" + "movups (%1), %%xmm1\n" + "movups (%2), %%xmm2\n" + "movups (%3), %%xmm3\n" + "movups (%4), %%xmm4\n" + "movups (%5), %%xmm5\n" + "movups (%6), %%xmm6\n" + "movups (%7), %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm9\n" + "addps %%xmm1, %%xmm8\n" + "subps %%xmm1, %%xmm9\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm11\n" + "addps %%xmm3, %%xmm10\n" + "subps %%xmm3, %%xmm11\n" + "movaps %%xmm4, %%xmm12\n" + "movaps %%xmm4, %%xmm13\n" + "addps %%xmm5, %%xmm12\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm6, %%xmm14\n" + "movaps %%xmm6, %%xmm15\n" + "addps %%xmm7, %%xmm14\n" + "subps %%xmm7, %%xmm15\n" + "movaps %%xmm8, %%xmm0\n" + "movaps %%xmm8, %%xmm2\n" + "addps %%xmm10, %%xmm0\n" + "subps %%xmm10, %%xmm2\n" + "movaps %%xmm9, %%xmm1\n" + "movaps %%xmm9, %%xmm3\n" + "addps %%xmm11, %%xmm1\n" + "subps %%xmm11, %%xmm3\n" + "movaps %%xmm12, %%xmm4\n" + "movaps %%xmm12, %%xmm6\n" + "addps %%xmm14, %%xmm4\n" + "subps %%xmm14, %%xmm6\n" + "movaps %%xmm13, %%xmm5\n" + "movaps %%xmm13, %%xmm7\n" + "addps %%xmm15, %%xmm5\n" + "subps %%xmm15, %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm12\n" + "addps %%xmm4, %%xmm8\n" + "subps %%xmm4, %%xmm12\n" + "movaps %%xmm1, %%xmm9\n" + "movaps %%xmm1, %%xmm13\n" + "addps %%xmm5, %%xmm9\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm14\n" + "addps %%xmm6, %%xmm10\n" + "subps %%xmm6, %%xmm14\n" + "movaps %%xmm3, %%xmm11\n" + "movaps %%xmm3, %%xmm15\n" + "addps %%xmm7, %%xmm11\n" + "subps %%xmm7, %%xmm15\n" + "movups %%xmm8, (%0)\n" + "movups %%xmm9, (%1)\n" + "movups %%xmm10, (%2)\n" + "movups %%xmm11, (%3)\n" + "movups %%xmm12, (%4)\n" + "movups %%xmm13, (%5)\n" + "movups %%xmm14, (%6)\n" + "movups %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 16384), "r"(buf + j + k + 32768), "r"(buf + j + k + 49152), "r"(buf + j + k + 65536), "r"(buf + j + k + 81920), "r"(buf + j + k + 98304), "r"(buf + j + k + 114688) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } + if (depth == 20) { + helper_float_20_recursive(buf + 0, 17); + helper_float_20_recursive(buf + 131072, 17); + helper_float_20_recursive(buf + 262144, 17); + helper_float_20_recursive(buf + 393216, 17); + helper_float_20_recursive(buf + 524288, 17); + helper_float_20_recursive(buf + 655360, 17); + helper_float_20_recursive(buf + 786432, 17); + helper_float_20_recursive(buf + 917504, 17); + for (int j = 0; j < 1048576; j += 1048576) { + for (int k = 0; k < 131072; k += 4) { + __asm__ volatile ( + "movups (%0), %%xmm0\n" + "movups (%1), %%xmm1\n" + "movups (%2), %%xmm2\n" + "movups (%3), %%xmm3\n" + "movups (%4), %%xmm4\n" + "movups (%5), %%xmm5\n" + "movups (%6), %%xmm6\n" + "movups (%7), %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm9\n" + "addps %%xmm1, %%xmm8\n" + "subps %%xmm1, %%xmm9\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm11\n" + "addps %%xmm3, %%xmm10\n" + "subps %%xmm3, %%xmm11\n" + "movaps %%xmm4, %%xmm12\n" + "movaps %%xmm4, %%xmm13\n" + "addps %%xmm5, %%xmm12\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm6, %%xmm14\n" + "movaps %%xmm6, %%xmm15\n" + "addps %%xmm7, %%xmm14\n" + "subps %%xmm7, %%xmm15\n" + "movaps %%xmm8, %%xmm0\n" + "movaps %%xmm8, %%xmm2\n" + "addps %%xmm10, %%xmm0\n" + "subps %%xmm10, %%xmm2\n" + "movaps %%xmm9, %%xmm1\n" + "movaps %%xmm9, %%xmm3\n" + "addps %%xmm11, %%xmm1\n" + 
"subps %%xmm11, %%xmm3\n" + "movaps %%xmm12, %%xmm4\n" + "movaps %%xmm12, %%xmm6\n" + "addps %%xmm14, %%xmm4\n" + "subps %%xmm14, %%xmm6\n" + "movaps %%xmm13, %%xmm5\n" + "movaps %%xmm13, %%xmm7\n" + "addps %%xmm15, %%xmm5\n" + "subps %%xmm15, %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm12\n" + "addps %%xmm4, %%xmm8\n" + "subps %%xmm4, %%xmm12\n" + "movaps %%xmm1, %%xmm9\n" + "movaps %%xmm1, %%xmm13\n" + "addps %%xmm5, %%xmm9\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm14\n" + "addps %%xmm6, %%xmm10\n" + "subps %%xmm6, %%xmm14\n" + "movaps %%xmm3, %%xmm11\n" + "movaps %%xmm3, %%xmm15\n" + "addps %%xmm7, %%xmm11\n" + "subps %%xmm7, %%xmm15\n" + "movups %%xmm8, (%0)\n" + "movups %%xmm9, (%1)\n" + "movups %%xmm10, (%2)\n" + "movups %%xmm11, (%3)\n" + "movups %%xmm12, (%4)\n" + "movups %%xmm13, (%5)\n" + "movups %%xmm14, (%6)\n" + "movups %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 131072), "r"(buf + j + k + 262144), "r"(buf + j + k + 393216), "r"(buf + j + k + 524288), "r"(buf + j + k + 655360), "r"(buf + j + k + 786432), "r"(buf + j + k + 917504) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } +} +void helper_float_20(float *buf); +void helper_float_20(float *buf) { + helper_float_20_recursive(buf, 20); +} +void helper_float_21_recursive(float *buf, int depth); +void helper_float_21_recursive(float *buf, int depth) { + if (depth == 13) { + for (int j = 0; j < 8192; j += 32) { + for (int k = 0; k < 4; k += 4) { + __asm__ volatile ( + "movups (%0), %%xmm0\n" + "movups (%1), %%xmm1\n" + "movups (%2), %%xmm2\n" + "movups (%3), %%xmm3\n" + "movups (%4), %%xmm4\n" + "movups (%5), %%xmm5\n" + "movups (%6), %%xmm6\n" + "movups (%7), %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm0, %%xmm0\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm0, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm0\n" + "movaps %%xmm1, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm1, %%xmm1\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm1, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm1\n" + "movaps %%xmm2, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm2, %%xmm2\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm2, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm2\n" + "movaps %%xmm3, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm3, %%xmm3\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm3, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm3\n" + "movaps %%xmm4, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm4, %%xmm4\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm4, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm4\n" + "movaps %%xmm5, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm5, %%xmm5\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm5, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm5\n" + "movaps %%xmm6, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm6, %%xmm6\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm6, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm6\n" + "movaps %%xmm7, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm7, %%xmm7\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm7, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm7\n" + "movaps 
%%xmm0, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm0, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm0, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm0\n" + "movaps %%xmm1, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm1, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm1, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm1\n" + "movaps %%xmm2, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm2, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm2, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm2\n" + "movaps %%xmm3, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm3, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm3, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm3\n" + "movaps %%xmm4, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm4, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm4, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm4\n" + "movaps %%xmm5, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm5, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm5, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm5\n" + "movaps %%xmm6, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm6, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm6, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm6\n" + "movaps %%xmm7, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm7, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm7, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm9\n" + "addps %%xmm1, %%xmm8\n" + "subps %%xmm1, %%xmm9\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm11\n" + "addps %%xmm3, %%xmm10\n" + "subps %%xmm3, %%xmm11\n" + "movaps %%xmm4, %%xmm12\n" + "movaps %%xmm4, %%xmm13\n" + "addps %%xmm5, %%xmm12\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm6, %%xmm14\n" + "movaps %%xmm6, %%xmm15\n" + "addps %%xmm7, %%xmm14\n" + "subps %%xmm7, %%xmm15\n" + "movaps %%xmm8, %%xmm0\n" + "movaps %%xmm8, %%xmm2\n" + "addps %%xmm10, %%xmm0\n" + "subps %%xmm10, %%xmm2\n" + "movaps %%xmm9, %%xmm1\n" + "movaps %%xmm9, %%xmm3\n" + "addps %%xmm11, %%xmm1\n" + "subps %%xmm11, %%xmm3\n" + "movaps %%xmm12, %%xmm4\n" + "movaps %%xmm12, %%xmm6\n" + "addps %%xmm14, %%xmm4\n" + "subps %%xmm14, %%xmm6\n" + "movaps %%xmm13, %%xmm5\n" + "movaps %%xmm13, %%xmm7\n" + "addps %%xmm15, %%xmm5\n" + "subps %%xmm15, %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm12\n" + "addps %%xmm4, %%xmm8\n" + "subps %%xmm4, %%xmm12\n" + "movaps %%xmm1, %%xmm9\n" + "movaps %%xmm1, %%xmm13\n" + "addps %%xmm5, %%xmm9\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm2, %%xmm10\n" + 
"movaps %%xmm2, %%xmm14\n" + "addps %%xmm6, %%xmm10\n" + "subps %%xmm6, %%xmm14\n" + "movaps %%xmm3, %%xmm11\n" + "movaps %%xmm3, %%xmm15\n" + "addps %%xmm7, %%xmm11\n" + "subps %%xmm7, %%xmm15\n" + "movups %%xmm8, (%0)\n" + "movups %%xmm9, (%1)\n" + "movups %%xmm10, (%2)\n" + "movups %%xmm11, (%3)\n" + "movups %%xmm12, (%4)\n" + "movups %%xmm13, (%5)\n" + "movups %%xmm14, (%6)\n" + "movups %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + for (int j = 0; j < 8192; j += 256) { + for (int k = 0; k < 32; k += 4) { + __asm__ volatile ( + "movups (%0), %%xmm0\n" + "movups (%1), %%xmm1\n" + "movups (%2), %%xmm2\n" + "movups (%3), %%xmm3\n" + "movups (%4), %%xmm4\n" + "movups (%5), %%xmm5\n" + "movups (%6), %%xmm6\n" + "movups (%7), %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm9\n" + "addps %%xmm1, %%xmm8\n" + "subps %%xmm1, %%xmm9\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm11\n" + "addps %%xmm3, %%xmm10\n" + "subps %%xmm3, %%xmm11\n" + "movaps %%xmm4, %%xmm12\n" + "movaps %%xmm4, %%xmm13\n" + "addps %%xmm5, %%xmm12\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm6, %%xmm14\n" + "movaps %%xmm6, %%xmm15\n" + "addps %%xmm7, %%xmm14\n" + "subps %%xmm7, %%xmm15\n" + "movaps %%xmm8, %%xmm0\n" + "movaps %%xmm8, %%xmm2\n" + "addps %%xmm10, %%xmm0\n" + "subps %%xmm10, %%xmm2\n" + "movaps %%xmm9, %%xmm1\n" + "movaps %%xmm9, %%xmm3\n" + "addps %%xmm11, %%xmm1\n" + "subps %%xmm11, %%xmm3\n" + "movaps %%xmm12, %%xmm4\n" + "movaps %%xmm12, %%xmm6\n" + "addps %%xmm14, %%xmm4\n" + "subps %%xmm14, %%xmm6\n" + "movaps %%xmm13, %%xmm5\n" + "movaps %%xmm13, %%xmm7\n" + "addps %%xmm15, %%xmm5\n" + "subps %%xmm15, %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm12\n" + "addps %%xmm4, %%xmm8\n" + "subps %%xmm4, %%xmm12\n" + "movaps %%xmm1, %%xmm9\n" + "movaps %%xmm1, %%xmm13\n" + "addps %%xmm5, %%xmm9\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm14\n" + "addps %%xmm6, %%xmm10\n" + "subps %%xmm6, %%xmm14\n" + "movaps %%xmm3, %%xmm11\n" + "movaps %%xmm3, %%xmm15\n" + "addps %%xmm7, %%xmm11\n" + "subps %%xmm7, %%xmm15\n" + "movups %%xmm8, (%0)\n" + "movups %%xmm9, (%1)\n" + "movups %%xmm10, (%2)\n" + "movups %%xmm11, (%3)\n" + "movups %%xmm12, (%4)\n" + "movups %%xmm13, (%5)\n" + "movups %%xmm14, (%6)\n" + "movups %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96), "r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + for (int j = 0; j < 8192; j += 2048) { + for (int k = 0; k < 256; k += 4) { + __asm__ volatile ( + "movups (%0), %%xmm0\n" + "movups (%1), %%xmm1\n" + "movups (%2), %%xmm2\n" + "movups (%3), %%xmm3\n" + "movups (%4), %%xmm4\n" + "movups (%5), %%xmm5\n" + "movups (%6), %%xmm6\n" + "movups (%7), %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm9\n" + "addps %%xmm1, %%xmm8\n" + "subps %%xmm1, %%xmm9\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm11\n" + "addps %%xmm3, %%xmm10\n" + "subps %%xmm3, %%xmm11\n" + "movaps 
%%xmm4, %%xmm12\n" + "movaps %%xmm4, %%xmm13\n" + "addps %%xmm5, %%xmm12\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm6, %%xmm14\n" + "movaps %%xmm6, %%xmm15\n" + "addps %%xmm7, %%xmm14\n" + "subps %%xmm7, %%xmm15\n" + "movaps %%xmm8, %%xmm0\n" + "movaps %%xmm8, %%xmm2\n" + "addps %%xmm10, %%xmm0\n" + "subps %%xmm10, %%xmm2\n" + "movaps %%xmm9, %%xmm1\n" + "movaps %%xmm9, %%xmm3\n" + "addps %%xmm11, %%xmm1\n" + "subps %%xmm11, %%xmm3\n" + "movaps %%xmm12, %%xmm4\n" + "movaps %%xmm12, %%xmm6\n" + "addps %%xmm14, %%xmm4\n" + "subps %%xmm14, %%xmm6\n" + "movaps %%xmm13, %%xmm5\n" + "movaps %%xmm13, %%xmm7\n" + "addps %%xmm15, %%xmm5\n" + "subps %%xmm15, %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm12\n" + "addps %%xmm4, %%xmm8\n" + "subps %%xmm4, %%xmm12\n" + "movaps %%xmm1, %%xmm9\n" + "movaps %%xmm1, %%xmm13\n" + "addps %%xmm5, %%xmm9\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm14\n" + "addps %%xmm6, %%xmm10\n" + "subps %%xmm6, %%xmm14\n" + "movaps %%xmm3, %%xmm11\n" + "movaps %%xmm3, %%xmm15\n" + "addps %%xmm7, %%xmm11\n" + "subps %%xmm7, %%xmm15\n" + "movups %%xmm8, (%0)\n" + "movups %%xmm9, (%1)\n" + "movups %%xmm10, (%2)\n" + "movups %%xmm11, (%3)\n" + "movups %%xmm12, (%4)\n" + "movups %%xmm13, (%5)\n" + "movups %%xmm14, (%6)\n" + "movups %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 256), "r"(buf + j + k + 512), "r"(buf + j + k + 768), "r"(buf + j + k + 1024), "r"(buf + j + k + 1280), "r"(buf + j + k + 1536), "r"(buf + j + k + 1792) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + for (int j = 0; j < 8192; j += 8192) { + for (int k = 0; k < 2048; k += 4) { + __asm__ volatile ( + "movups (%0), %%xmm0\n" + "movups (%1), %%xmm1\n" + "movups (%2), %%xmm2\n" + "movups (%3), %%xmm3\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm9\n" + "addps %%xmm1, %%xmm8\n" + "subps %%xmm1, %%xmm9\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm11\n" + "addps %%xmm3, %%xmm10\n" + "subps %%xmm3, %%xmm11\n" + "movaps %%xmm8, %%xmm0\n" + "movaps %%xmm8, %%xmm2\n" + "addps %%xmm10, %%xmm0\n" + "subps %%xmm10, %%xmm2\n" + "movaps %%xmm9, %%xmm1\n" + "movaps %%xmm9, %%xmm3\n" + "addps %%xmm11, %%xmm1\n" + "subps %%xmm11, %%xmm3\n" + "movups %%xmm0, (%0)\n" + "movups %%xmm1, (%1)\n" + "movups %%xmm2, (%2)\n" + "movups %%xmm3, (%3)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 2048), "r"(buf + j + k + 4096), "r"(buf + j + k + 6144) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } + if (depth == 16) { + helper_float_21_recursive(buf + 0, 13); + helper_float_21_recursive(buf + 8192, 13); + helper_float_21_recursive(buf + 16384, 13); + helper_float_21_recursive(buf + 24576, 13); + helper_float_21_recursive(buf + 32768, 13); + helper_float_21_recursive(buf + 40960, 13); + helper_float_21_recursive(buf + 49152, 13); + helper_float_21_recursive(buf + 57344, 13); + for (int j = 0; j < 65536; j += 65536) { + for (int k = 0; k < 8192; k += 4) { + __asm__ volatile ( + "movups (%0), %%xmm0\n" + "movups (%1), %%xmm1\n" + "movups (%2), %%xmm2\n" + "movups (%3), %%xmm3\n" + "movups (%4), %%xmm4\n" + "movups (%5), %%xmm5\n" + "movups (%6), %%xmm6\n" + "movups (%7), %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm9\n" + "addps %%xmm1, %%xmm8\n" + "subps %%xmm1, 
%%xmm9\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm11\n" + "addps %%xmm3, %%xmm10\n" + "subps %%xmm3, %%xmm11\n" + "movaps %%xmm4, %%xmm12\n" + "movaps %%xmm4, %%xmm13\n" + "addps %%xmm5, %%xmm12\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm6, %%xmm14\n" + "movaps %%xmm6, %%xmm15\n" + "addps %%xmm7, %%xmm14\n" + "subps %%xmm7, %%xmm15\n" + "movaps %%xmm8, %%xmm0\n" + "movaps %%xmm8, %%xmm2\n" + "addps %%xmm10, %%xmm0\n" + "subps %%xmm10, %%xmm2\n" + "movaps %%xmm9, %%xmm1\n" + "movaps %%xmm9, %%xmm3\n" + "addps %%xmm11, %%xmm1\n" + "subps %%xmm11, %%xmm3\n" + "movaps %%xmm12, %%xmm4\n" + "movaps %%xmm12, %%xmm6\n" + "addps %%xmm14, %%xmm4\n" + "subps %%xmm14, %%xmm6\n" + "movaps %%xmm13, %%xmm5\n" + "movaps %%xmm13, %%xmm7\n" + "addps %%xmm15, %%xmm5\n" + "subps %%xmm15, %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm12\n" + "addps %%xmm4, %%xmm8\n" + "subps %%xmm4, %%xmm12\n" + "movaps %%xmm1, %%xmm9\n" + "movaps %%xmm1, %%xmm13\n" + "addps %%xmm5, %%xmm9\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm14\n" + "addps %%xmm6, %%xmm10\n" + "subps %%xmm6, %%xmm14\n" + "movaps %%xmm3, %%xmm11\n" + "movaps %%xmm3, %%xmm15\n" + "addps %%xmm7, %%xmm11\n" + "subps %%xmm7, %%xmm15\n" + "movups %%xmm8, (%0)\n" + "movups %%xmm9, (%1)\n" + "movups %%xmm10, (%2)\n" + "movups %%xmm11, (%3)\n" + "movups %%xmm12, (%4)\n" + "movups %%xmm13, (%5)\n" + "movups %%xmm14, (%6)\n" + "movups %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 8192), "r"(buf + j + k + 16384), "r"(buf + j + k + 24576), "r"(buf + j + k + 32768), "r"(buf + j + k + 40960), "r"(buf + j + k + 49152), "r"(buf + j + k + 57344) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } + if (depth == 19) { + helper_float_21_recursive(buf + 0, 16); + helper_float_21_recursive(buf + 65536, 16); + helper_float_21_recursive(buf + 131072, 16); + helper_float_21_recursive(buf + 196608, 16); + helper_float_21_recursive(buf + 262144, 16); + helper_float_21_recursive(buf + 327680, 16); + helper_float_21_recursive(buf + 393216, 16); + helper_float_21_recursive(buf + 458752, 16); + for (int j = 0; j < 524288; j += 524288) { + for (int k = 0; k < 65536; k += 4) { + __asm__ volatile ( + "movups (%0), %%xmm0\n" + "movups (%1), %%xmm1\n" + "movups (%2), %%xmm2\n" + "movups (%3), %%xmm3\n" + "movups (%4), %%xmm4\n" + "movups (%5), %%xmm5\n" + "movups (%6), %%xmm6\n" + "movups (%7), %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm9\n" + "addps %%xmm1, %%xmm8\n" + "subps %%xmm1, %%xmm9\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm11\n" + "addps %%xmm3, %%xmm10\n" + "subps %%xmm3, %%xmm11\n" + "movaps %%xmm4, %%xmm12\n" + "movaps %%xmm4, %%xmm13\n" + "addps %%xmm5, %%xmm12\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm6, %%xmm14\n" + "movaps %%xmm6, %%xmm15\n" + "addps %%xmm7, %%xmm14\n" + "subps %%xmm7, %%xmm15\n" + "movaps %%xmm8, %%xmm0\n" + "movaps %%xmm8, %%xmm2\n" + "addps %%xmm10, %%xmm0\n" + "subps %%xmm10, %%xmm2\n" + "movaps %%xmm9, %%xmm1\n" + "movaps %%xmm9, %%xmm3\n" + "addps %%xmm11, %%xmm1\n" + "subps %%xmm11, %%xmm3\n" + "movaps %%xmm12, %%xmm4\n" + "movaps %%xmm12, %%xmm6\n" + "addps %%xmm14, %%xmm4\n" + "subps %%xmm14, %%xmm6\n" + "movaps %%xmm13, %%xmm5\n" + "movaps %%xmm13, %%xmm7\n" + "addps %%xmm15, %%xmm5\n" + "subps %%xmm15, %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm12\n" + "addps %%xmm4, 
%%xmm8\n" + "subps %%xmm4, %%xmm12\n" + "movaps %%xmm1, %%xmm9\n" + "movaps %%xmm1, %%xmm13\n" + "addps %%xmm5, %%xmm9\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm14\n" + "addps %%xmm6, %%xmm10\n" + "subps %%xmm6, %%xmm14\n" + "movaps %%xmm3, %%xmm11\n" + "movaps %%xmm3, %%xmm15\n" + "addps %%xmm7, %%xmm11\n" + "subps %%xmm7, %%xmm15\n" + "movups %%xmm8, (%0)\n" + "movups %%xmm9, (%1)\n" + "movups %%xmm10, (%2)\n" + "movups %%xmm11, (%3)\n" + "movups %%xmm12, (%4)\n" + "movups %%xmm13, (%5)\n" + "movups %%xmm14, (%6)\n" + "movups %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 65536), "r"(buf + j + k + 131072), "r"(buf + j + k + 196608), "r"(buf + j + k + 262144), "r"(buf + j + k + 327680), "r"(buf + j + k + 393216), "r"(buf + j + k + 458752) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } + if (depth == 21) { + helper_float_21_recursive(buf + 0, 19); + helper_float_21_recursive(buf + 524288, 19); + helper_float_21_recursive(buf + 1048576, 19); + helper_float_21_recursive(buf + 1572864, 19); + for (int j = 0; j < 2097152; j += 2097152) { + for (int k = 0; k < 524288; k += 4) { + __asm__ volatile ( + "movups (%0), %%xmm0\n" + "movups (%1), %%xmm1\n" + "movups (%2), %%xmm2\n" + "movups (%3), %%xmm3\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm9\n" + "addps %%xmm1, %%xmm8\n" + "subps %%xmm1, %%xmm9\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm11\n" + "addps %%xmm3, %%xmm10\n" + "subps %%xmm3, %%xmm11\n" + "movaps %%xmm8, %%xmm0\n" + "movaps %%xmm8, %%xmm2\n" + "addps %%xmm10, %%xmm0\n" + "subps %%xmm10, %%xmm2\n" + "movaps %%xmm9, %%xmm1\n" + "movaps %%xmm9, %%xmm3\n" + "addps %%xmm11, %%xmm1\n" + "subps %%xmm11, %%xmm3\n" + "movups %%xmm0, (%0)\n" + "movups %%xmm1, (%1)\n" + "movups %%xmm2, (%2)\n" + "movups %%xmm3, (%3)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 524288), "r"(buf + j + k + 1048576), "r"(buf + j + k + 1572864) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } +} +void helper_float_21(float *buf); +void helper_float_21(float *buf) { + helper_float_21_recursive(buf, 21); +} +void helper_float_22_recursive(float *buf, int depth); +void helper_float_22_recursive(float *buf, int depth) { + if (depth == 11) { + for (int j = 0; j < 2048; j += 32) { + for (int k = 0; k < 4; k += 4) { + __asm__ volatile ( + "movups (%0), %%xmm0\n" + "movups (%1), %%xmm1\n" + "movups (%2), %%xmm2\n" + "movups (%3), %%xmm3\n" + "movups (%4), %%xmm4\n" + "movups (%5), %%xmm5\n" + "movups (%6), %%xmm6\n" + "movups (%7), %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm0, %%xmm0\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm0, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm0\n" + "movaps %%xmm1, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm1, %%xmm1\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm1, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm1\n" + "movaps %%xmm2, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm2, %%xmm2\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm2, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm2\n" + "movaps %%xmm3, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm3, %%xmm3\n" + "xorps %%xmm9, 
%%xmm9\n" + "subps %%xmm3, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm3\n" + "movaps %%xmm4, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm4, %%xmm4\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm4, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm4\n" + "movaps %%xmm5, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm5, %%xmm5\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm5, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm5\n" + "movaps %%xmm6, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm6, %%xmm6\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm6, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm6\n" + "movaps %%xmm7, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm7, %%xmm7\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm7, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm0, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm0, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm0\n" + "movaps %%xmm1, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm1, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm1, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm1\n" + "movaps %%xmm2, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm2, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm2, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm2\n" + "movaps %%xmm3, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm3, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm3, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm3\n" + "movaps %%xmm4, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm4, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm4, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm4\n" + "movaps %%xmm5, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm5, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm5, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm5\n" + "movaps %%xmm6, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm6, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm6, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm6\n" + "movaps %%xmm7, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm7, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm7, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm9\n" + "addps %%xmm1, %%xmm8\n" + "subps %%xmm1, %%xmm9\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm11\n" + "addps %%xmm3, %%xmm10\n" + "subps %%xmm3, %%xmm11\n" + 
"movaps %%xmm4, %%xmm12\n" + "movaps %%xmm4, %%xmm13\n" + "addps %%xmm5, %%xmm12\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm6, %%xmm14\n" + "movaps %%xmm6, %%xmm15\n" + "addps %%xmm7, %%xmm14\n" + "subps %%xmm7, %%xmm15\n" + "movaps %%xmm8, %%xmm0\n" + "movaps %%xmm8, %%xmm2\n" + "addps %%xmm10, %%xmm0\n" + "subps %%xmm10, %%xmm2\n" + "movaps %%xmm9, %%xmm1\n" + "movaps %%xmm9, %%xmm3\n" + "addps %%xmm11, %%xmm1\n" + "subps %%xmm11, %%xmm3\n" + "movaps %%xmm12, %%xmm4\n" + "movaps %%xmm12, %%xmm6\n" + "addps %%xmm14, %%xmm4\n" + "subps %%xmm14, %%xmm6\n" + "movaps %%xmm13, %%xmm5\n" + "movaps %%xmm13, %%xmm7\n" + "addps %%xmm15, %%xmm5\n" + "subps %%xmm15, %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm12\n" + "addps %%xmm4, %%xmm8\n" + "subps %%xmm4, %%xmm12\n" + "movaps %%xmm1, %%xmm9\n" + "movaps %%xmm1, %%xmm13\n" + "addps %%xmm5, %%xmm9\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm14\n" + "addps %%xmm6, %%xmm10\n" + "subps %%xmm6, %%xmm14\n" + "movaps %%xmm3, %%xmm11\n" + "movaps %%xmm3, %%xmm15\n" + "addps %%xmm7, %%xmm11\n" + "subps %%xmm7, %%xmm15\n" + "movups %%xmm8, (%0)\n" + "movups %%xmm9, (%1)\n" + "movups %%xmm10, (%2)\n" + "movups %%xmm11, (%3)\n" + "movups %%xmm12, (%4)\n" + "movups %%xmm13, (%5)\n" + "movups %%xmm14, (%6)\n" + "movups %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + for (int j = 0; j < 2048; j += 256) { + for (int k = 0; k < 32; k += 4) { + __asm__ volatile ( + "movups (%0), %%xmm0\n" + "movups (%1), %%xmm1\n" + "movups (%2), %%xmm2\n" + "movups (%3), %%xmm3\n" + "movups (%4), %%xmm4\n" + "movups (%5), %%xmm5\n" + "movups (%6), %%xmm6\n" + "movups (%7), %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm9\n" + "addps %%xmm1, %%xmm8\n" + "subps %%xmm1, %%xmm9\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm11\n" + "addps %%xmm3, %%xmm10\n" + "subps %%xmm3, %%xmm11\n" + "movaps %%xmm4, %%xmm12\n" + "movaps %%xmm4, %%xmm13\n" + "addps %%xmm5, %%xmm12\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm6, %%xmm14\n" + "movaps %%xmm6, %%xmm15\n" + "addps %%xmm7, %%xmm14\n" + "subps %%xmm7, %%xmm15\n" + "movaps %%xmm8, %%xmm0\n" + "movaps %%xmm8, %%xmm2\n" + "addps %%xmm10, %%xmm0\n" + "subps %%xmm10, %%xmm2\n" + "movaps %%xmm9, %%xmm1\n" + "movaps %%xmm9, %%xmm3\n" + "addps %%xmm11, %%xmm1\n" + "subps %%xmm11, %%xmm3\n" + "movaps %%xmm12, %%xmm4\n" + "movaps %%xmm12, %%xmm6\n" + "addps %%xmm14, %%xmm4\n" + "subps %%xmm14, %%xmm6\n" + "movaps %%xmm13, %%xmm5\n" + "movaps %%xmm13, %%xmm7\n" + "addps %%xmm15, %%xmm5\n" + "subps %%xmm15, %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm12\n" + "addps %%xmm4, %%xmm8\n" + "subps %%xmm4, %%xmm12\n" + "movaps %%xmm1, %%xmm9\n" + "movaps %%xmm1, %%xmm13\n" + "addps %%xmm5, %%xmm9\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm14\n" + "addps %%xmm6, %%xmm10\n" + "subps %%xmm6, %%xmm14\n" + "movaps %%xmm3, %%xmm11\n" + "movaps %%xmm3, %%xmm15\n" + "addps %%xmm7, %%xmm11\n" + "subps %%xmm7, %%xmm15\n" + "movups %%xmm8, (%0)\n" + "movups %%xmm9, (%1)\n" + "movups %%xmm10, (%2)\n" + "movups %%xmm11, (%3)\n" + "movups %%xmm12, (%4)\n" + "movups %%xmm13, (%5)\n" + "movups %%xmm14, (%6)\n" + 
"movups %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96), "r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + for (int j = 0; j < 2048; j += 2048) { + for (int k = 0; k < 256; k += 4) { + __asm__ volatile ( + "movups (%0), %%xmm0\n" + "movups (%1), %%xmm1\n" + "movups (%2), %%xmm2\n" + "movups (%3), %%xmm3\n" + "movups (%4), %%xmm4\n" + "movups (%5), %%xmm5\n" + "movups (%6), %%xmm6\n" + "movups (%7), %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm9\n" + "addps %%xmm1, %%xmm8\n" + "subps %%xmm1, %%xmm9\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm11\n" + "addps %%xmm3, %%xmm10\n" + "subps %%xmm3, %%xmm11\n" + "movaps %%xmm4, %%xmm12\n" + "movaps %%xmm4, %%xmm13\n" + "addps %%xmm5, %%xmm12\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm6, %%xmm14\n" + "movaps %%xmm6, %%xmm15\n" + "addps %%xmm7, %%xmm14\n" + "subps %%xmm7, %%xmm15\n" + "movaps %%xmm8, %%xmm0\n" + "movaps %%xmm8, %%xmm2\n" + "addps %%xmm10, %%xmm0\n" + "subps %%xmm10, %%xmm2\n" + "movaps %%xmm9, %%xmm1\n" + "movaps %%xmm9, %%xmm3\n" + "addps %%xmm11, %%xmm1\n" + "subps %%xmm11, %%xmm3\n" + "movaps %%xmm12, %%xmm4\n" + "movaps %%xmm12, %%xmm6\n" + "addps %%xmm14, %%xmm4\n" + "subps %%xmm14, %%xmm6\n" + "movaps %%xmm13, %%xmm5\n" + "movaps %%xmm13, %%xmm7\n" + "addps %%xmm15, %%xmm5\n" + "subps %%xmm15, %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm12\n" + "addps %%xmm4, %%xmm8\n" + "subps %%xmm4, %%xmm12\n" + "movaps %%xmm1, %%xmm9\n" + "movaps %%xmm1, %%xmm13\n" + "addps %%xmm5, %%xmm9\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm14\n" + "addps %%xmm6, %%xmm10\n" + "subps %%xmm6, %%xmm14\n" + "movaps %%xmm3, %%xmm11\n" + "movaps %%xmm3, %%xmm15\n" + "addps %%xmm7, %%xmm11\n" + "subps %%xmm7, %%xmm15\n" + "movups %%xmm8, (%0)\n" + "movups %%xmm9, (%1)\n" + "movups %%xmm10, (%2)\n" + "movups %%xmm11, (%3)\n" + "movups %%xmm12, (%4)\n" + "movups %%xmm13, (%5)\n" + "movups %%xmm14, (%6)\n" + "movups %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 256), "r"(buf + j + k + 512), "r"(buf + j + k + 768), "r"(buf + j + k + 1024), "r"(buf + j + k + 1280), "r"(buf + j + k + 1536), "r"(buf + j + k + 1792) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } + if (depth == 14) { + helper_float_22_recursive(buf + 0, 11); + helper_float_22_recursive(buf + 2048, 11); + helper_float_22_recursive(buf + 4096, 11); + helper_float_22_recursive(buf + 6144, 11); + helper_float_22_recursive(buf + 8192, 11); + helper_float_22_recursive(buf + 10240, 11); + helper_float_22_recursive(buf + 12288, 11); + helper_float_22_recursive(buf + 14336, 11); + for (int j = 0; j < 16384; j += 16384) { + for (int k = 0; k < 2048; k += 4) { + __asm__ volatile ( + "movups (%0), %%xmm0\n" + "movups (%1), %%xmm1\n" + "movups (%2), %%xmm2\n" + "movups (%3), %%xmm3\n" + "movups (%4), %%xmm4\n" + "movups (%5), %%xmm5\n" + "movups (%6), %%xmm6\n" + "movups (%7), %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm9\n" + "addps %%xmm1, %%xmm8\n" + "subps %%xmm1, %%xmm9\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm11\n" + "addps %%xmm3, %%xmm10\n" + "subps 
%%xmm3, %%xmm11\n" + "movaps %%xmm4, %%xmm12\n" + "movaps %%xmm4, %%xmm13\n" + "addps %%xmm5, %%xmm12\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm6, %%xmm14\n" + "movaps %%xmm6, %%xmm15\n" + "addps %%xmm7, %%xmm14\n" + "subps %%xmm7, %%xmm15\n" + "movaps %%xmm8, %%xmm0\n" + "movaps %%xmm8, %%xmm2\n" + "addps %%xmm10, %%xmm0\n" + "subps %%xmm10, %%xmm2\n" + "movaps %%xmm9, %%xmm1\n" + "movaps %%xmm9, %%xmm3\n" + "addps %%xmm11, %%xmm1\n" + "subps %%xmm11, %%xmm3\n" + "movaps %%xmm12, %%xmm4\n" + "movaps %%xmm12, %%xmm6\n" + "addps %%xmm14, %%xmm4\n" + "subps %%xmm14, %%xmm6\n" + "movaps %%xmm13, %%xmm5\n" + "movaps %%xmm13, %%xmm7\n" + "addps %%xmm15, %%xmm5\n" + "subps %%xmm15, %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm12\n" + "addps %%xmm4, %%xmm8\n" + "subps %%xmm4, %%xmm12\n" + "movaps %%xmm1, %%xmm9\n" + "movaps %%xmm1, %%xmm13\n" + "addps %%xmm5, %%xmm9\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm14\n" + "addps %%xmm6, %%xmm10\n" + "subps %%xmm6, %%xmm14\n" + "movaps %%xmm3, %%xmm11\n" + "movaps %%xmm3, %%xmm15\n" + "addps %%xmm7, %%xmm11\n" + "subps %%xmm7, %%xmm15\n" + "movups %%xmm8, (%0)\n" + "movups %%xmm9, (%1)\n" + "movups %%xmm10, (%2)\n" + "movups %%xmm11, (%3)\n" + "movups %%xmm12, (%4)\n" + "movups %%xmm13, (%5)\n" + "movups %%xmm14, (%6)\n" + "movups %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 2048), "r"(buf + j + k + 4096), "r"(buf + j + k + 6144), "r"(buf + j + k + 8192), "r"(buf + j + k + 10240), "r"(buf + j + k + 12288), "r"(buf + j + k + 14336) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } + if (depth == 17) { + helper_float_22_recursive(buf + 0, 14); + helper_float_22_recursive(buf + 16384, 14); + helper_float_22_recursive(buf + 32768, 14); + helper_float_22_recursive(buf + 49152, 14); + helper_float_22_recursive(buf + 65536, 14); + helper_float_22_recursive(buf + 81920, 14); + helper_float_22_recursive(buf + 98304, 14); + helper_float_22_recursive(buf + 114688, 14); + for (int j = 0; j < 131072; j += 131072) { + for (int k = 0; k < 16384; k += 4) { + __asm__ volatile ( + "movups (%0), %%xmm0\n" + "movups (%1), %%xmm1\n" + "movups (%2), %%xmm2\n" + "movups (%3), %%xmm3\n" + "movups (%4), %%xmm4\n" + "movups (%5), %%xmm5\n" + "movups (%6), %%xmm6\n" + "movups (%7), %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm9\n" + "addps %%xmm1, %%xmm8\n" + "subps %%xmm1, %%xmm9\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm11\n" + "addps %%xmm3, %%xmm10\n" + "subps %%xmm3, %%xmm11\n" + "movaps %%xmm4, %%xmm12\n" + "movaps %%xmm4, %%xmm13\n" + "addps %%xmm5, %%xmm12\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm6, %%xmm14\n" + "movaps %%xmm6, %%xmm15\n" + "addps %%xmm7, %%xmm14\n" + "subps %%xmm7, %%xmm15\n" + "movaps %%xmm8, %%xmm0\n" + "movaps %%xmm8, %%xmm2\n" + "addps %%xmm10, %%xmm0\n" + "subps %%xmm10, %%xmm2\n" + "movaps %%xmm9, %%xmm1\n" + "movaps %%xmm9, %%xmm3\n" + "addps %%xmm11, %%xmm1\n" + "subps %%xmm11, %%xmm3\n" + "movaps %%xmm12, %%xmm4\n" + "movaps %%xmm12, %%xmm6\n" + "addps %%xmm14, %%xmm4\n" + "subps %%xmm14, %%xmm6\n" + "movaps %%xmm13, %%xmm5\n" + "movaps %%xmm13, %%xmm7\n" + "addps %%xmm15, %%xmm5\n" + "subps %%xmm15, %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm12\n" + "addps %%xmm4, %%xmm8\n" + "subps %%xmm4, %%xmm12\n" + "movaps %%xmm1, %%xmm9\n" + "movaps %%xmm1, %%xmm13\n" + "addps %%xmm5, 
%%xmm9\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm14\n" + "addps %%xmm6, %%xmm10\n" + "subps %%xmm6, %%xmm14\n" + "movaps %%xmm3, %%xmm11\n" + "movaps %%xmm3, %%xmm15\n" + "addps %%xmm7, %%xmm11\n" + "subps %%xmm7, %%xmm15\n" + "movups %%xmm8, (%0)\n" + "movups %%xmm9, (%1)\n" + "movups %%xmm10, (%2)\n" + "movups %%xmm11, (%3)\n" + "movups %%xmm12, (%4)\n" + "movups %%xmm13, (%5)\n" + "movups %%xmm14, (%6)\n" + "movups %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 16384), "r"(buf + j + k + 32768), "r"(buf + j + k + 49152), "r"(buf + j + k + 65536), "r"(buf + j + k + 81920), "r"(buf + j + k + 98304), "r"(buf + j + k + 114688) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } + if (depth == 20) { + helper_float_22_recursive(buf + 0, 17); + helper_float_22_recursive(buf + 131072, 17); + helper_float_22_recursive(buf + 262144, 17); + helper_float_22_recursive(buf + 393216, 17); + helper_float_22_recursive(buf + 524288, 17); + helper_float_22_recursive(buf + 655360, 17); + helper_float_22_recursive(buf + 786432, 17); + helper_float_22_recursive(buf + 917504, 17); + for (int j = 0; j < 1048576; j += 1048576) { + for (int k = 0; k < 131072; k += 4) { + __asm__ volatile ( + "movups (%0), %%xmm0\n" + "movups (%1), %%xmm1\n" + "movups (%2), %%xmm2\n" + "movups (%3), %%xmm3\n" + "movups (%4), %%xmm4\n" + "movups (%5), %%xmm5\n" + "movups (%6), %%xmm6\n" + "movups (%7), %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm9\n" + "addps %%xmm1, %%xmm8\n" + "subps %%xmm1, %%xmm9\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm11\n" + "addps %%xmm3, %%xmm10\n" + "subps %%xmm3, %%xmm11\n" + "movaps %%xmm4, %%xmm12\n" + "movaps %%xmm4, %%xmm13\n" + "addps %%xmm5, %%xmm12\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm6, %%xmm14\n" + "movaps %%xmm6, %%xmm15\n" + "addps %%xmm7, %%xmm14\n" + "subps %%xmm7, %%xmm15\n" + "movaps %%xmm8, %%xmm0\n" + "movaps %%xmm8, %%xmm2\n" + "addps %%xmm10, %%xmm0\n" + "subps %%xmm10, %%xmm2\n" + "movaps %%xmm9, %%xmm1\n" + "movaps %%xmm9, %%xmm3\n" + "addps %%xmm11, %%xmm1\n" + "subps %%xmm11, %%xmm3\n" + "movaps %%xmm12, %%xmm4\n" + "movaps %%xmm12, %%xmm6\n" + "addps %%xmm14, %%xmm4\n" + "subps %%xmm14, %%xmm6\n" + "movaps %%xmm13, %%xmm5\n" + "movaps %%xmm13, %%xmm7\n" + "addps %%xmm15, %%xmm5\n" + "subps %%xmm15, %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm12\n" + "addps %%xmm4, %%xmm8\n" + "subps %%xmm4, %%xmm12\n" + "movaps %%xmm1, %%xmm9\n" + "movaps %%xmm1, %%xmm13\n" + "addps %%xmm5, %%xmm9\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm14\n" + "addps %%xmm6, %%xmm10\n" + "subps %%xmm6, %%xmm14\n" + "movaps %%xmm3, %%xmm11\n" + "movaps %%xmm3, %%xmm15\n" + "addps %%xmm7, %%xmm11\n" + "subps %%xmm7, %%xmm15\n" + "movups %%xmm8, (%0)\n" + "movups %%xmm9, (%1)\n" + "movups %%xmm10, (%2)\n" + "movups %%xmm11, (%3)\n" + "movups %%xmm12, (%4)\n" + "movups %%xmm13, (%5)\n" + "movups %%xmm14, (%6)\n" + "movups %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 131072), "r"(buf + j + k + 262144), "r"(buf + j + k + 393216), "r"(buf + j + k + 524288), "r"(buf + j + k + 655360), "r"(buf + j + k + 786432), "r"(buf + j + k + 917504) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; 
+ } + if (depth == 22) { + helper_float_22_recursive(buf + 0, 20); + helper_float_22_recursive(buf + 1048576, 20); + helper_float_22_recursive(buf + 2097152, 20); + helper_float_22_recursive(buf + 3145728, 20); + for (int j = 0; j < 4194304; j += 4194304) { + for (int k = 0; k < 1048576; k += 4) { + __asm__ volatile ( + "movups (%0), %%xmm0\n" + "movups (%1), %%xmm1\n" + "movups (%2), %%xmm2\n" + "movups (%3), %%xmm3\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm9\n" + "addps %%xmm1, %%xmm8\n" + "subps %%xmm1, %%xmm9\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm11\n" + "addps %%xmm3, %%xmm10\n" + "subps %%xmm3, %%xmm11\n" + "movaps %%xmm8, %%xmm0\n" + "movaps %%xmm8, %%xmm2\n" + "addps %%xmm10, %%xmm0\n" + "subps %%xmm10, %%xmm2\n" + "movaps %%xmm9, %%xmm1\n" + "movaps %%xmm9, %%xmm3\n" + "addps %%xmm11, %%xmm1\n" + "subps %%xmm11, %%xmm3\n" + "movups %%xmm0, (%0)\n" + "movups %%xmm1, (%1)\n" + "movups %%xmm2, (%2)\n" + "movups %%xmm3, (%3)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 1048576), "r"(buf + j + k + 2097152), "r"(buf + j + k + 3145728) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } +} +void helper_float_22(float *buf); +void helper_float_22(float *buf) { + helper_float_22_recursive(buf, 22); +} +void helper_float_23_recursive(float *buf, int depth); +void helper_float_23_recursive(float *buf, int depth) { + if (depth == 6) { + for (int j = 0; j < 64; j += 32) { + for (int k = 0; k < 4; k += 4) { + __asm__ volatile ( + "movups (%0), %%xmm0\n" + "movups (%1), %%xmm1\n" + "movups (%2), %%xmm2\n" + "movups (%3), %%xmm3\n" + "movups (%4), %%xmm4\n" + "movups (%5), %%xmm5\n" + "movups (%6), %%xmm6\n" + "movups (%7), %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm0, %%xmm0\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm0, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm0\n" + "movaps %%xmm1, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm1, %%xmm1\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm1, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm1\n" + "movaps %%xmm2, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm2, %%xmm2\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm2, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm2\n" + "movaps %%xmm3, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm3, %%xmm3\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm3, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm3\n" + "movaps %%xmm4, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm4, %%xmm4\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm4, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm4\n" + "movaps %%xmm5, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm5, %%xmm5\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm5, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm5\n" + "movaps %%xmm6, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm6, %%xmm6\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm6, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm6\n" + "movaps %%xmm7, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm7, %%xmm7\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm7, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + 
"xorps %%xmm9, %%xmm9\n" + "movaps %%xmm0, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm0, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm0\n" + "movaps %%xmm1, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm1, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm1, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm1\n" + "movaps %%xmm2, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm2, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm2, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm2\n" + "movaps %%xmm3, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm3, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm3, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm3\n" + "movaps %%xmm4, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm4, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm4, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm4\n" + "movaps %%xmm5, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm5, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm5, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm5\n" + "movaps %%xmm6, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm6, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm6, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm6\n" + "movaps %%xmm7, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm7, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm7, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm9\n" + "addps %%xmm1, %%xmm8\n" + "subps %%xmm1, %%xmm9\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm11\n" + "addps %%xmm3, %%xmm10\n" + "subps %%xmm3, %%xmm11\n" + "movaps %%xmm4, %%xmm12\n" + "movaps %%xmm4, %%xmm13\n" + "addps %%xmm5, %%xmm12\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm6, %%xmm14\n" + "movaps %%xmm6, %%xmm15\n" + "addps %%xmm7, %%xmm14\n" + "subps %%xmm7, %%xmm15\n" + "movaps %%xmm8, %%xmm0\n" + "movaps %%xmm8, %%xmm2\n" + "addps %%xmm10, %%xmm0\n" + "subps %%xmm10, %%xmm2\n" + "movaps %%xmm9, %%xmm1\n" + "movaps %%xmm9, %%xmm3\n" + "addps %%xmm11, %%xmm1\n" + "subps %%xmm11, %%xmm3\n" + "movaps %%xmm12, %%xmm4\n" + "movaps %%xmm12, %%xmm6\n" + "addps %%xmm14, %%xmm4\n" + "subps %%xmm14, %%xmm6\n" + "movaps %%xmm13, %%xmm5\n" + "movaps %%xmm13, %%xmm7\n" + "addps %%xmm15, %%xmm5\n" + "subps %%xmm15, %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm12\n" + "addps %%xmm4, %%xmm8\n" + "subps %%xmm4, %%xmm12\n" + "movaps %%xmm1, %%xmm9\n" + "movaps %%xmm1, %%xmm13\n" + "addps %%xmm5, %%xmm9\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm14\n" + "addps %%xmm6, %%xmm10\n" + 
"subps %%xmm6, %%xmm14\n" + "movaps %%xmm3, %%xmm11\n" + "movaps %%xmm3, %%xmm15\n" + "addps %%xmm7, %%xmm11\n" + "subps %%xmm7, %%xmm15\n" + "movups %%xmm8, (%0)\n" + "movups %%xmm9, (%1)\n" + "movups %%xmm10, (%2)\n" + "movups %%xmm11, (%3)\n" + "movups %%xmm12, (%4)\n" + "movups %%xmm13, (%5)\n" + "movups %%xmm14, (%6)\n" + "movups %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + for (int j = 0; j < 64; j += 64) { + for (int k = 0; k < 32; k += 4) { + __asm__ volatile ( + "movups (%0), %%xmm0\n" + "movups (%1), %%xmm1\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm9\n" + "addps %%xmm1, %%xmm8\n" + "subps %%xmm1, %%xmm9\n" + "movups %%xmm8, (%0)\n" + "movups %%xmm9, (%1)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 32) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } + if (depth == 9) { + helper_float_23_recursive(buf + 0, 6); + helper_float_23_recursive(buf + 64, 6); + helper_float_23_recursive(buf + 128, 6); + helper_float_23_recursive(buf + 192, 6); + helper_float_23_recursive(buf + 256, 6); + helper_float_23_recursive(buf + 320, 6); + helper_float_23_recursive(buf + 384, 6); + helper_float_23_recursive(buf + 448, 6); + for (int j = 0; j < 512; j += 512) { + for (int k = 0; k < 64; k += 4) { + __asm__ volatile ( + "movups (%0), %%xmm0\n" + "movups (%1), %%xmm1\n" + "movups (%2), %%xmm2\n" + "movups (%3), %%xmm3\n" + "movups (%4), %%xmm4\n" + "movups (%5), %%xmm5\n" + "movups (%6), %%xmm6\n" + "movups (%7), %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm9\n" + "addps %%xmm1, %%xmm8\n" + "subps %%xmm1, %%xmm9\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm11\n" + "addps %%xmm3, %%xmm10\n" + "subps %%xmm3, %%xmm11\n" + "movaps %%xmm4, %%xmm12\n" + "movaps %%xmm4, %%xmm13\n" + "addps %%xmm5, %%xmm12\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm6, %%xmm14\n" + "movaps %%xmm6, %%xmm15\n" + "addps %%xmm7, %%xmm14\n" + "subps %%xmm7, %%xmm15\n" + "movaps %%xmm8, %%xmm0\n" + "movaps %%xmm8, %%xmm2\n" + "addps %%xmm10, %%xmm0\n" + "subps %%xmm10, %%xmm2\n" + "movaps %%xmm9, %%xmm1\n" + "movaps %%xmm9, %%xmm3\n" + "addps %%xmm11, %%xmm1\n" + "subps %%xmm11, %%xmm3\n" + "movaps %%xmm12, %%xmm4\n" + "movaps %%xmm12, %%xmm6\n" + "addps %%xmm14, %%xmm4\n" + "subps %%xmm14, %%xmm6\n" + "movaps %%xmm13, %%xmm5\n" + "movaps %%xmm13, %%xmm7\n" + "addps %%xmm15, %%xmm5\n" + "subps %%xmm15, %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm12\n" + "addps %%xmm4, %%xmm8\n" + "subps %%xmm4, %%xmm12\n" + "movaps %%xmm1, %%xmm9\n" + "movaps %%xmm1, %%xmm13\n" + "addps %%xmm5, %%xmm9\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm14\n" + "addps %%xmm6, %%xmm10\n" + "subps %%xmm6, %%xmm14\n" + "movaps %%xmm3, %%xmm11\n" + "movaps %%xmm3, %%xmm15\n" + "addps %%xmm7, %%xmm11\n" + "subps %%xmm7, %%xmm15\n" + "movups %%xmm8, (%0)\n" + "movups %%xmm9, (%1)\n" + "movups %%xmm10, (%2)\n" + "movups %%xmm11, (%3)\n" + "movups %%xmm12, (%4)\n" + "movups %%xmm13, (%5)\n" + "movups %%xmm14, (%6)\n" + "movups %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 64), "r"(buf + j 
+ k + 128), "r"(buf + j + k + 192), "r"(buf + j + k + 256), "r"(buf + j + k + 320), "r"(buf + j + k + 384), "r"(buf + j + k + 448) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } + if (depth == 12) { + helper_float_23_recursive(buf + 0, 9); + helper_float_23_recursive(buf + 512, 9); + helper_float_23_recursive(buf + 1024, 9); + helper_float_23_recursive(buf + 1536, 9); + helper_float_23_recursive(buf + 2048, 9); + helper_float_23_recursive(buf + 2560, 9); + helper_float_23_recursive(buf + 3072, 9); + helper_float_23_recursive(buf + 3584, 9); + for (int j = 0; j < 4096; j += 4096) { + for (int k = 0; k < 512; k += 4) { + __asm__ volatile ( + "movups (%0), %%xmm0\n" + "movups (%1), %%xmm1\n" + "movups (%2), %%xmm2\n" + "movups (%3), %%xmm3\n" + "movups (%4), %%xmm4\n" + "movups (%5), %%xmm5\n" + "movups (%6), %%xmm6\n" + "movups (%7), %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm9\n" + "addps %%xmm1, %%xmm8\n" + "subps %%xmm1, %%xmm9\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm11\n" + "addps %%xmm3, %%xmm10\n" + "subps %%xmm3, %%xmm11\n" + "movaps %%xmm4, %%xmm12\n" + "movaps %%xmm4, %%xmm13\n" + "addps %%xmm5, %%xmm12\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm6, %%xmm14\n" + "movaps %%xmm6, %%xmm15\n" + "addps %%xmm7, %%xmm14\n" + "subps %%xmm7, %%xmm15\n" + "movaps %%xmm8, %%xmm0\n" + "movaps %%xmm8, %%xmm2\n" + "addps %%xmm10, %%xmm0\n" + "subps %%xmm10, %%xmm2\n" + "movaps %%xmm9, %%xmm1\n" + "movaps %%xmm9, %%xmm3\n" + "addps %%xmm11, %%xmm1\n" + "subps %%xmm11, %%xmm3\n" + "movaps %%xmm12, %%xmm4\n" + "movaps %%xmm12, %%xmm6\n" + "addps %%xmm14, %%xmm4\n" + "subps %%xmm14, %%xmm6\n" + "movaps %%xmm13, %%xmm5\n" + "movaps %%xmm13, %%xmm7\n" + "addps %%xmm15, %%xmm5\n" + "subps %%xmm15, %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm12\n" + "addps %%xmm4, %%xmm8\n" + "subps %%xmm4, %%xmm12\n" + "movaps %%xmm1, %%xmm9\n" + "movaps %%xmm1, %%xmm13\n" + "addps %%xmm5, %%xmm9\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm14\n" + "addps %%xmm6, %%xmm10\n" + "subps %%xmm6, %%xmm14\n" + "movaps %%xmm3, %%xmm11\n" + "movaps %%xmm3, %%xmm15\n" + "addps %%xmm7, %%xmm11\n" + "subps %%xmm7, %%xmm15\n" + "movups %%xmm8, (%0)\n" + "movups %%xmm9, (%1)\n" + "movups %%xmm10, (%2)\n" + "movups %%xmm11, (%3)\n" + "movups %%xmm12, (%4)\n" + "movups %%xmm13, (%5)\n" + "movups %%xmm14, (%6)\n" + "movups %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 512), "r"(buf + j + k + 1024), "r"(buf + j + k + 1536), "r"(buf + j + k + 2048), "r"(buf + j + k + 2560), "r"(buf + j + k + 3072), "r"(buf + j + k + 3584) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } + if (depth == 15) { + helper_float_23_recursive(buf + 0, 12); + helper_float_23_recursive(buf + 4096, 12); + helper_float_23_recursive(buf + 8192, 12); + helper_float_23_recursive(buf + 12288, 12); + helper_float_23_recursive(buf + 16384, 12); + helper_float_23_recursive(buf + 20480, 12); + helper_float_23_recursive(buf + 24576, 12); + helper_float_23_recursive(buf + 28672, 12); + for (int j = 0; j < 32768; j += 32768) { + for (int k = 0; k < 4096; k += 4) { + __asm__ volatile ( + "movups (%0), %%xmm0\n" + "movups (%1), %%xmm1\n" + "movups (%2), %%xmm2\n" + "movups (%3), %%xmm3\n" + "movups 
(%4), %%xmm4\n" + "movups (%5), %%xmm5\n" + "movups (%6), %%xmm6\n" + "movups (%7), %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm9\n" + "addps %%xmm1, %%xmm8\n" + "subps %%xmm1, %%xmm9\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm11\n" + "addps %%xmm3, %%xmm10\n" + "subps %%xmm3, %%xmm11\n" + "movaps %%xmm4, %%xmm12\n" + "movaps %%xmm4, %%xmm13\n" + "addps %%xmm5, %%xmm12\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm6, %%xmm14\n" + "movaps %%xmm6, %%xmm15\n" + "addps %%xmm7, %%xmm14\n" + "subps %%xmm7, %%xmm15\n" + "movaps %%xmm8, %%xmm0\n" + "movaps %%xmm8, %%xmm2\n" + "addps %%xmm10, %%xmm0\n" + "subps %%xmm10, %%xmm2\n" + "movaps %%xmm9, %%xmm1\n" + "movaps %%xmm9, %%xmm3\n" + "addps %%xmm11, %%xmm1\n" + "subps %%xmm11, %%xmm3\n" + "movaps %%xmm12, %%xmm4\n" + "movaps %%xmm12, %%xmm6\n" + "addps %%xmm14, %%xmm4\n" + "subps %%xmm14, %%xmm6\n" + "movaps %%xmm13, %%xmm5\n" + "movaps %%xmm13, %%xmm7\n" + "addps %%xmm15, %%xmm5\n" + "subps %%xmm15, %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm12\n" + "addps %%xmm4, %%xmm8\n" + "subps %%xmm4, %%xmm12\n" + "movaps %%xmm1, %%xmm9\n" + "movaps %%xmm1, %%xmm13\n" + "addps %%xmm5, %%xmm9\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm14\n" + "addps %%xmm6, %%xmm10\n" + "subps %%xmm6, %%xmm14\n" + "movaps %%xmm3, %%xmm11\n" + "movaps %%xmm3, %%xmm15\n" + "addps %%xmm7, %%xmm11\n" + "subps %%xmm7, %%xmm15\n" + "movups %%xmm8, (%0)\n" + "movups %%xmm9, (%1)\n" + "movups %%xmm10, (%2)\n" + "movups %%xmm11, (%3)\n" + "movups %%xmm12, (%4)\n" + "movups %%xmm13, (%5)\n" + "movups %%xmm14, (%6)\n" + "movups %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 4096), "r"(buf + j + k + 8192), "r"(buf + j + k + 12288), "r"(buf + j + k + 16384), "r"(buf + j + k + 20480), "r"(buf + j + k + 24576), "r"(buf + j + k + 28672) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } + if (depth == 18) { + helper_float_23_recursive(buf + 0, 15); + helper_float_23_recursive(buf + 32768, 15); + helper_float_23_recursive(buf + 65536, 15); + helper_float_23_recursive(buf + 98304, 15); + helper_float_23_recursive(buf + 131072, 15); + helper_float_23_recursive(buf + 163840, 15); + helper_float_23_recursive(buf + 196608, 15); + helper_float_23_recursive(buf + 229376, 15); + for (int j = 0; j < 262144; j += 262144) { + for (int k = 0; k < 32768; k += 4) { + __asm__ volatile ( + "movups (%0), %%xmm0\n" + "movups (%1), %%xmm1\n" + "movups (%2), %%xmm2\n" + "movups (%3), %%xmm3\n" + "movups (%4), %%xmm4\n" + "movups (%5), %%xmm5\n" + "movups (%6), %%xmm6\n" + "movups (%7), %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm9\n" + "addps %%xmm1, %%xmm8\n" + "subps %%xmm1, %%xmm9\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm11\n" + "addps %%xmm3, %%xmm10\n" + "subps %%xmm3, %%xmm11\n" + "movaps %%xmm4, %%xmm12\n" + "movaps %%xmm4, %%xmm13\n" + "addps %%xmm5, %%xmm12\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm6, %%xmm14\n" + "movaps %%xmm6, %%xmm15\n" + "addps %%xmm7, %%xmm14\n" + "subps %%xmm7, %%xmm15\n" + "movaps %%xmm8, %%xmm0\n" + "movaps %%xmm8, %%xmm2\n" + "addps %%xmm10, %%xmm0\n" + "subps %%xmm10, %%xmm2\n" + "movaps %%xmm9, %%xmm1\n" + "movaps %%xmm9, %%xmm3\n" + "addps %%xmm11, %%xmm1\n" + "subps %%xmm11, %%xmm3\n" + "movaps %%xmm12, %%xmm4\n" + "movaps %%xmm12, %%xmm6\n" + "addps %%xmm14, %%xmm4\n" + "subps %%xmm14, %%xmm6\n" + 
"movaps %%xmm13, %%xmm5\n" + "movaps %%xmm13, %%xmm7\n" + "addps %%xmm15, %%xmm5\n" + "subps %%xmm15, %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm12\n" + "addps %%xmm4, %%xmm8\n" + "subps %%xmm4, %%xmm12\n" + "movaps %%xmm1, %%xmm9\n" + "movaps %%xmm1, %%xmm13\n" + "addps %%xmm5, %%xmm9\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm14\n" + "addps %%xmm6, %%xmm10\n" + "subps %%xmm6, %%xmm14\n" + "movaps %%xmm3, %%xmm11\n" + "movaps %%xmm3, %%xmm15\n" + "addps %%xmm7, %%xmm11\n" + "subps %%xmm7, %%xmm15\n" + "movups %%xmm8, (%0)\n" + "movups %%xmm9, (%1)\n" + "movups %%xmm10, (%2)\n" + "movups %%xmm11, (%3)\n" + "movups %%xmm12, (%4)\n" + "movups %%xmm13, (%5)\n" + "movups %%xmm14, (%6)\n" + "movups %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 32768), "r"(buf + j + k + 65536), "r"(buf + j + k + 98304), "r"(buf + j + k + 131072), "r"(buf + j + k + 163840), "r"(buf + j + k + 196608), "r"(buf + j + k + 229376) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } + if (depth == 21) { + helper_float_23_recursive(buf + 0, 18); + helper_float_23_recursive(buf + 262144, 18); + helper_float_23_recursive(buf + 524288, 18); + helper_float_23_recursive(buf + 786432, 18); + helper_float_23_recursive(buf + 1048576, 18); + helper_float_23_recursive(buf + 1310720, 18); + helper_float_23_recursive(buf + 1572864, 18); + helper_float_23_recursive(buf + 1835008, 18); + for (int j = 0; j < 2097152; j += 2097152) { + for (int k = 0; k < 262144; k += 4) { + __asm__ volatile ( + "movups (%0), %%xmm0\n" + "movups (%1), %%xmm1\n" + "movups (%2), %%xmm2\n" + "movups (%3), %%xmm3\n" + "movups (%4), %%xmm4\n" + "movups (%5), %%xmm5\n" + "movups (%6), %%xmm6\n" + "movups (%7), %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm9\n" + "addps %%xmm1, %%xmm8\n" + "subps %%xmm1, %%xmm9\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm11\n" + "addps %%xmm3, %%xmm10\n" + "subps %%xmm3, %%xmm11\n" + "movaps %%xmm4, %%xmm12\n" + "movaps %%xmm4, %%xmm13\n" + "addps %%xmm5, %%xmm12\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm6, %%xmm14\n" + "movaps %%xmm6, %%xmm15\n" + "addps %%xmm7, %%xmm14\n" + "subps %%xmm7, %%xmm15\n" + "movaps %%xmm8, %%xmm0\n" + "movaps %%xmm8, %%xmm2\n" + "addps %%xmm10, %%xmm0\n" + "subps %%xmm10, %%xmm2\n" + "movaps %%xmm9, %%xmm1\n" + "movaps %%xmm9, %%xmm3\n" + "addps %%xmm11, %%xmm1\n" + "subps %%xmm11, %%xmm3\n" + "movaps %%xmm12, %%xmm4\n" + "movaps %%xmm12, %%xmm6\n" + "addps %%xmm14, %%xmm4\n" + "subps %%xmm14, %%xmm6\n" + "movaps %%xmm13, %%xmm5\n" + "movaps %%xmm13, %%xmm7\n" + "addps %%xmm15, %%xmm5\n" + "subps %%xmm15, %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm12\n" + "addps %%xmm4, %%xmm8\n" + "subps %%xmm4, %%xmm12\n" + "movaps %%xmm1, %%xmm9\n" + "movaps %%xmm1, %%xmm13\n" + "addps %%xmm5, %%xmm9\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm14\n" + "addps %%xmm6, %%xmm10\n" + "subps %%xmm6, %%xmm14\n" + "movaps %%xmm3, %%xmm11\n" + "movaps %%xmm3, %%xmm15\n" + "addps %%xmm7, %%xmm11\n" + "subps %%xmm7, %%xmm15\n" + "movups %%xmm8, (%0)\n" + "movups %%xmm9, (%1)\n" + "movups %%xmm10, (%2)\n" + "movups %%xmm11, (%3)\n" + "movups %%xmm12, (%4)\n" + "movups %%xmm13, (%5)\n" + "movups %%xmm14, (%6)\n" + "movups %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 262144), "r"(buf + j + k + 524288), "r"(buf + j + 
k + 786432), "r"(buf + j + k + 1048576), "r"(buf + j + k + 1310720), "r"(buf + j + k + 1572864), "r"(buf + j + k + 1835008) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } + if (depth == 23) { + helper_float_23_recursive(buf + 0, 21); + helper_float_23_recursive(buf + 2097152, 21); + helper_float_23_recursive(buf + 4194304, 21); + helper_float_23_recursive(buf + 6291456, 21); + for (int j = 0; j < 8388608; j += 8388608) { + for (int k = 0; k < 2097152; k += 4) { + __asm__ volatile ( + "movups (%0), %%xmm0\n" + "movups (%1), %%xmm1\n" + "movups (%2), %%xmm2\n" + "movups (%3), %%xmm3\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm9\n" + "addps %%xmm1, %%xmm8\n" + "subps %%xmm1, %%xmm9\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm11\n" + "addps %%xmm3, %%xmm10\n" + "subps %%xmm3, %%xmm11\n" + "movaps %%xmm8, %%xmm0\n" + "movaps %%xmm8, %%xmm2\n" + "addps %%xmm10, %%xmm0\n" + "subps %%xmm10, %%xmm2\n" + "movaps %%xmm9, %%xmm1\n" + "movaps %%xmm9, %%xmm3\n" + "addps %%xmm11, %%xmm1\n" + "subps %%xmm11, %%xmm3\n" + "movups %%xmm0, (%0)\n" + "movups %%xmm1, (%1)\n" + "movups %%xmm2, (%2)\n" + "movups %%xmm3, (%3)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 2097152), "r"(buf + j + k + 4194304), "r"(buf + j + k + 6291456) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } +} +void helper_float_23(float *buf); +void helper_float_23(float *buf) { + helper_float_23_recursive(buf, 23); +} +void helper_float_24_recursive(float *buf, int depth); +void helper_float_24_recursive(float *buf, int depth) { + if (depth == 15) { + for (int j = 0; j < 32768; j += 32) { + for (int k = 0; k < 4; k += 4) { + __asm__ volatile ( + "movups (%0), %%xmm0\n" + "movups (%1), %%xmm1\n" + "movups (%2), %%xmm2\n" + "movups (%3), %%xmm3\n" + "movups (%4), %%xmm4\n" + "movups (%5), %%xmm5\n" + "movups (%6), %%xmm6\n" + "movups (%7), %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm0, %%xmm0\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm0, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm0\n" + "movaps %%xmm1, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm1, %%xmm1\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm1, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm1\n" + "movaps %%xmm2, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm2, %%xmm2\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm2, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm2\n" + "movaps %%xmm3, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm3, %%xmm3\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm3, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm3\n" + "movaps %%xmm4, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm4, %%xmm4\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm4, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm4\n" + "movaps %%xmm5, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm5, %%xmm5\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm5, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm5\n" + "movaps %%xmm6, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm6, %%xmm6\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm6, %%xmm9\n" + "addsubps %%xmm9, 
%%xmm8\n" + "movaps %%xmm8, %%xmm6\n" + "movaps %%xmm7, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm7, %%xmm7\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm7, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm0, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm0, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm0\n" + "movaps %%xmm1, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm1, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm1, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm1\n" + "movaps %%xmm2, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm2, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm2, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm2\n" + "movaps %%xmm3, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm3, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm3, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm3\n" + "movaps %%xmm4, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm4, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm4, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm4\n" + "movaps %%xmm5, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm5, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm5, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm5\n" + "movaps %%xmm6, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm6, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm6, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm6\n" + "movaps %%xmm7, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm7, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm7, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm9\n" + "addps %%xmm1, %%xmm8\n" + "subps %%xmm1, %%xmm9\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm11\n" + "addps %%xmm3, %%xmm10\n" + "subps %%xmm3, %%xmm11\n" + "movaps %%xmm4, %%xmm12\n" + "movaps %%xmm4, %%xmm13\n" + "addps %%xmm5, %%xmm12\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm6, %%xmm14\n" + "movaps %%xmm6, %%xmm15\n" + "addps %%xmm7, %%xmm14\n" + "subps %%xmm7, %%xmm15\n" + "movaps %%xmm8, %%xmm0\n" + "movaps %%xmm8, %%xmm2\n" + "addps %%xmm10, %%xmm0\n" + "subps %%xmm10, %%xmm2\n" + "movaps %%xmm9, %%xmm1\n" + "movaps %%xmm9, %%xmm3\n" + "addps %%xmm11, %%xmm1\n" + "subps %%xmm11, %%xmm3\n" + "movaps %%xmm12, %%xmm4\n" + "movaps %%xmm12, %%xmm6\n" + "addps %%xmm14, %%xmm4\n" + "subps %%xmm14, %%xmm6\n" + "movaps %%xmm13, %%xmm5\n" + "movaps %%xmm13, %%xmm7\n" + "addps %%xmm15, %%xmm5\n" + "subps %%xmm15, %%xmm7\n" + 
"movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm12\n" + "addps %%xmm4, %%xmm8\n" + "subps %%xmm4, %%xmm12\n" + "movaps %%xmm1, %%xmm9\n" + "movaps %%xmm1, %%xmm13\n" + "addps %%xmm5, %%xmm9\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm14\n" + "addps %%xmm6, %%xmm10\n" + "subps %%xmm6, %%xmm14\n" + "movaps %%xmm3, %%xmm11\n" + "movaps %%xmm3, %%xmm15\n" + "addps %%xmm7, %%xmm11\n" + "subps %%xmm7, %%xmm15\n" + "movups %%xmm8, (%0)\n" + "movups %%xmm9, (%1)\n" + "movups %%xmm10, (%2)\n" + "movups %%xmm11, (%3)\n" + "movups %%xmm12, (%4)\n" + "movups %%xmm13, (%5)\n" + "movups %%xmm14, (%6)\n" + "movups %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + for (int j = 0; j < 32768; j += 256) { + for (int k = 0; k < 32; k += 4) { + __asm__ volatile ( + "movups (%0), %%xmm0\n" + "movups (%1), %%xmm1\n" + "movups (%2), %%xmm2\n" + "movups (%3), %%xmm3\n" + "movups (%4), %%xmm4\n" + "movups (%5), %%xmm5\n" + "movups (%6), %%xmm6\n" + "movups (%7), %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm9\n" + "addps %%xmm1, %%xmm8\n" + "subps %%xmm1, %%xmm9\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm11\n" + "addps %%xmm3, %%xmm10\n" + "subps %%xmm3, %%xmm11\n" + "movaps %%xmm4, %%xmm12\n" + "movaps %%xmm4, %%xmm13\n" + "addps %%xmm5, %%xmm12\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm6, %%xmm14\n" + "movaps %%xmm6, %%xmm15\n" + "addps %%xmm7, %%xmm14\n" + "subps %%xmm7, %%xmm15\n" + "movaps %%xmm8, %%xmm0\n" + "movaps %%xmm8, %%xmm2\n" + "addps %%xmm10, %%xmm0\n" + "subps %%xmm10, %%xmm2\n" + "movaps %%xmm9, %%xmm1\n" + "movaps %%xmm9, %%xmm3\n" + "addps %%xmm11, %%xmm1\n" + "subps %%xmm11, %%xmm3\n" + "movaps %%xmm12, %%xmm4\n" + "movaps %%xmm12, %%xmm6\n" + "addps %%xmm14, %%xmm4\n" + "subps %%xmm14, %%xmm6\n" + "movaps %%xmm13, %%xmm5\n" + "movaps %%xmm13, %%xmm7\n" + "addps %%xmm15, %%xmm5\n" + "subps %%xmm15, %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm12\n" + "addps %%xmm4, %%xmm8\n" + "subps %%xmm4, %%xmm12\n" + "movaps %%xmm1, %%xmm9\n" + "movaps %%xmm1, %%xmm13\n" + "addps %%xmm5, %%xmm9\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm14\n" + "addps %%xmm6, %%xmm10\n" + "subps %%xmm6, %%xmm14\n" + "movaps %%xmm3, %%xmm11\n" + "movaps %%xmm3, %%xmm15\n" + "addps %%xmm7, %%xmm11\n" + "subps %%xmm7, %%xmm15\n" + "movups %%xmm8, (%0)\n" + "movups %%xmm9, (%1)\n" + "movups %%xmm10, (%2)\n" + "movups %%xmm11, (%3)\n" + "movups %%xmm12, (%4)\n" + "movups %%xmm13, (%5)\n" + "movups %%xmm14, (%6)\n" + "movups %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96), "r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + for (int j = 0; j < 32768; j += 2048) { + for (int k = 0; k < 256; k += 4) { + __asm__ volatile ( + "movups (%0), %%xmm0\n" + "movups (%1), %%xmm1\n" + "movups (%2), %%xmm2\n" + "movups (%3), %%xmm3\n" + "movups (%4), %%xmm4\n" + "movups (%5), %%xmm5\n" + "movups (%6), %%xmm6\n" + 
"movups (%7), %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm9\n" + "addps %%xmm1, %%xmm8\n" + "subps %%xmm1, %%xmm9\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm11\n" + "addps %%xmm3, %%xmm10\n" + "subps %%xmm3, %%xmm11\n" + "movaps %%xmm4, %%xmm12\n" + "movaps %%xmm4, %%xmm13\n" + "addps %%xmm5, %%xmm12\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm6, %%xmm14\n" + "movaps %%xmm6, %%xmm15\n" + "addps %%xmm7, %%xmm14\n" + "subps %%xmm7, %%xmm15\n" + "movaps %%xmm8, %%xmm0\n" + "movaps %%xmm8, %%xmm2\n" + "addps %%xmm10, %%xmm0\n" + "subps %%xmm10, %%xmm2\n" + "movaps %%xmm9, %%xmm1\n" + "movaps %%xmm9, %%xmm3\n" + "addps %%xmm11, %%xmm1\n" + "subps %%xmm11, %%xmm3\n" + "movaps %%xmm12, %%xmm4\n" + "movaps %%xmm12, %%xmm6\n" + "addps %%xmm14, %%xmm4\n" + "subps %%xmm14, %%xmm6\n" + "movaps %%xmm13, %%xmm5\n" + "movaps %%xmm13, %%xmm7\n" + "addps %%xmm15, %%xmm5\n" + "subps %%xmm15, %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm12\n" + "addps %%xmm4, %%xmm8\n" + "subps %%xmm4, %%xmm12\n" + "movaps %%xmm1, %%xmm9\n" + "movaps %%xmm1, %%xmm13\n" + "addps %%xmm5, %%xmm9\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm14\n" + "addps %%xmm6, %%xmm10\n" + "subps %%xmm6, %%xmm14\n" + "movaps %%xmm3, %%xmm11\n" + "movaps %%xmm3, %%xmm15\n" + "addps %%xmm7, %%xmm11\n" + "subps %%xmm7, %%xmm15\n" + "movups %%xmm8, (%0)\n" + "movups %%xmm9, (%1)\n" + "movups %%xmm10, (%2)\n" + "movups %%xmm11, (%3)\n" + "movups %%xmm12, (%4)\n" + "movups %%xmm13, (%5)\n" + "movups %%xmm14, (%6)\n" + "movups %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 256), "r"(buf + j + k + 512), "r"(buf + j + k + 768), "r"(buf + j + k + 1024), "r"(buf + j + k + 1280), "r"(buf + j + k + 1536), "r"(buf + j + k + 1792) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + for (int j = 0; j < 32768; j += 16384) { + for (int k = 0; k < 2048; k += 4) { + __asm__ volatile ( + "movups (%0), %%xmm0\n" + "movups (%1), %%xmm1\n" + "movups (%2), %%xmm2\n" + "movups (%3), %%xmm3\n" + "movups (%4), %%xmm4\n" + "movups (%5), %%xmm5\n" + "movups (%6), %%xmm6\n" + "movups (%7), %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm9\n" + "addps %%xmm1, %%xmm8\n" + "subps %%xmm1, %%xmm9\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm11\n" + "addps %%xmm3, %%xmm10\n" + "subps %%xmm3, %%xmm11\n" + "movaps %%xmm4, %%xmm12\n" + "movaps %%xmm4, %%xmm13\n" + "addps %%xmm5, %%xmm12\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm6, %%xmm14\n" + "movaps %%xmm6, %%xmm15\n" + "addps %%xmm7, %%xmm14\n" + "subps %%xmm7, %%xmm15\n" + "movaps %%xmm8, %%xmm0\n" + "movaps %%xmm8, %%xmm2\n" + "addps %%xmm10, %%xmm0\n" + "subps %%xmm10, %%xmm2\n" + "movaps %%xmm9, %%xmm1\n" + "movaps %%xmm9, %%xmm3\n" + "addps %%xmm11, %%xmm1\n" + "subps %%xmm11, %%xmm3\n" + "movaps %%xmm12, %%xmm4\n" + "movaps %%xmm12, %%xmm6\n" + "addps %%xmm14, %%xmm4\n" + "subps %%xmm14, %%xmm6\n" + "movaps %%xmm13, %%xmm5\n" + "movaps %%xmm13, %%xmm7\n" + "addps %%xmm15, %%xmm5\n" + "subps %%xmm15, %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm12\n" + "addps %%xmm4, %%xmm8\n" + "subps %%xmm4, %%xmm12\n" + "movaps %%xmm1, %%xmm9\n" + "movaps %%xmm1, %%xmm13\n" + "addps %%xmm5, %%xmm9\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm14\n" + "addps %%xmm6, %%xmm10\n" + "subps %%xmm6, %%xmm14\n" + "movaps %%xmm3, %%xmm11\n" + 
"movaps %%xmm3, %%xmm15\n" + "addps %%xmm7, %%xmm11\n" + "subps %%xmm7, %%xmm15\n" + "movups %%xmm8, (%0)\n" + "movups %%xmm9, (%1)\n" + "movups %%xmm10, (%2)\n" + "movups %%xmm11, (%3)\n" + "movups %%xmm12, (%4)\n" + "movups %%xmm13, (%5)\n" + "movups %%xmm14, (%6)\n" + "movups %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 2048), "r"(buf + j + k + 4096), "r"(buf + j + k + 6144), "r"(buf + j + k + 8192), "r"(buf + j + k + 10240), "r"(buf + j + k + 12288), "r"(buf + j + k + 14336) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + for (int j = 0; j < 32768; j += 32768) { + for (int k = 0; k < 16384; k += 4) { + __asm__ volatile ( + "movups (%0), %%xmm0\n" + "movups (%1), %%xmm1\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm9\n" + "addps %%xmm1, %%xmm8\n" + "subps %%xmm1, %%xmm9\n" + "movups %%xmm8, (%0)\n" + "movups %%xmm9, (%1)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 16384) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } + if (depth == 18) { + helper_float_24_recursive(buf + 0, 15); + helper_float_24_recursive(buf + 32768, 15); + helper_float_24_recursive(buf + 65536, 15); + helper_float_24_recursive(buf + 98304, 15); + helper_float_24_recursive(buf + 131072, 15); + helper_float_24_recursive(buf + 163840, 15); + helper_float_24_recursive(buf + 196608, 15); + helper_float_24_recursive(buf + 229376, 15); + for (int j = 0; j < 262144; j += 262144) { + for (int k = 0; k < 32768; k += 4) { + __asm__ volatile ( + "movups (%0), %%xmm0\n" + "movups (%1), %%xmm1\n" + "movups (%2), %%xmm2\n" + "movups (%3), %%xmm3\n" + "movups (%4), %%xmm4\n" + "movups (%5), %%xmm5\n" + "movups (%6), %%xmm6\n" + "movups (%7), %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm9\n" + "addps %%xmm1, %%xmm8\n" + "subps %%xmm1, %%xmm9\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm11\n" + "addps %%xmm3, %%xmm10\n" + "subps %%xmm3, %%xmm11\n" + "movaps %%xmm4, %%xmm12\n" + "movaps %%xmm4, %%xmm13\n" + "addps %%xmm5, %%xmm12\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm6, %%xmm14\n" + "movaps %%xmm6, %%xmm15\n" + "addps %%xmm7, %%xmm14\n" + "subps %%xmm7, %%xmm15\n" + "movaps %%xmm8, %%xmm0\n" + "movaps %%xmm8, %%xmm2\n" + "addps %%xmm10, %%xmm0\n" + "subps %%xmm10, %%xmm2\n" + "movaps %%xmm9, %%xmm1\n" + "movaps %%xmm9, %%xmm3\n" + "addps %%xmm11, %%xmm1\n" + "subps %%xmm11, %%xmm3\n" + "movaps %%xmm12, %%xmm4\n" + "movaps %%xmm12, %%xmm6\n" + "addps %%xmm14, %%xmm4\n" + "subps %%xmm14, %%xmm6\n" + "movaps %%xmm13, %%xmm5\n" + "movaps %%xmm13, %%xmm7\n" + "addps %%xmm15, %%xmm5\n" + "subps %%xmm15, %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm12\n" + "addps %%xmm4, %%xmm8\n" + "subps %%xmm4, %%xmm12\n" + "movaps %%xmm1, %%xmm9\n" + "movaps %%xmm1, %%xmm13\n" + "addps %%xmm5, %%xmm9\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm14\n" + "addps %%xmm6, %%xmm10\n" + "subps %%xmm6, %%xmm14\n" + "movaps %%xmm3, %%xmm11\n" + "movaps %%xmm3, %%xmm15\n" + "addps %%xmm7, %%xmm11\n" + "subps %%xmm7, %%xmm15\n" + "movups %%xmm8, (%0)\n" + "movups %%xmm9, (%1)\n" + "movups %%xmm10, (%2)\n" + "movups %%xmm11, (%3)\n" + "movups %%xmm12, (%4)\n" + "movups %%xmm13, (%5)\n" + "movups %%xmm14, (%6)\n" + "movups %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 
32768), "r"(buf + j + k + 65536), "r"(buf + j + k + 98304), "r"(buf + j + k + 131072), "r"(buf + j + k + 163840), "r"(buf + j + k + 196608), "r"(buf + j + k + 229376) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } + if (depth == 21) { + helper_float_24_recursive(buf + 0, 18); + helper_float_24_recursive(buf + 262144, 18); + helper_float_24_recursive(buf + 524288, 18); + helper_float_24_recursive(buf + 786432, 18); + helper_float_24_recursive(buf + 1048576, 18); + helper_float_24_recursive(buf + 1310720, 18); + helper_float_24_recursive(buf + 1572864, 18); + helper_float_24_recursive(buf + 1835008, 18); + for (int j = 0; j < 2097152; j += 2097152) { + for (int k = 0; k < 262144; k += 4) { + __asm__ volatile ( + "movups (%0), %%xmm0\n" + "movups (%1), %%xmm1\n" + "movups (%2), %%xmm2\n" + "movups (%3), %%xmm3\n" + "movups (%4), %%xmm4\n" + "movups (%5), %%xmm5\n" + "movups (%6), %%xmm6\n" + "movups (%7), %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm9\n" + "addps %%xmm1, %%xmm8\n" + "subps %%xmm1, %%xmm9\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm11\n" + "addps %%xmm3, %%xmm10\n" + "subps %%xmm3, %%xmm11\n" + "movaps %%xmm4, %%xmm12\n" + "movaps %%xmm4, %%xmm13\n" + "addps %%xmm5, %%xmm12\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm6, %%xmm14\n" + "movaps %%xmm6, %%xmm15\n" + "addps %%xmm7, %%xmm14\n" + "subps %%xmm7, %%xmm15\n" + "movaps %%xmm8, %%xmm0\n" + "movaps %%xmm8, %%xmm2\n" + "addps %%xmm10, %%xmm0\n" + "subps %%xmm10, %%xmm2\n" + "movaps %%xmm9, %%xmm1\n" + "movaps %%xmm9, %%xmm3\n" + "addps %%xmm11, %%xmm1\n" + "subps %%xmm11, %%xmm3\n" + "movaps %%xmm12, %%xmm4\n" + "movaps %%xmm12, %%xmm6\n" + "addps %%xmm14, %%xmm4\n" + "subps %%xmm14, %%xmm6\n" + "movaps %%xmm13, %%xmm5\n" + "movaps %%xmm13, %%xmm7\n" + "addps %%xmm15, %%xmm5\n" + "subps %%xmm15, %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm12\n" + "addps %%xmm4, %%xmm8\n" + "subps %%xmm4, %%xmm12\n" + "movaps %%xmm1, %%xmm9\n" + "movaps %%xmm1, %%xmm13\n" + "addps %%xmm5, %%xmm9\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm14\n" + "addps %%xmm6, %%xmm10\n" + "subps %%xmm6, %%xmm14\n" + "movaps %%xmm3, %%xmm11\n" + "movaps %%xmm3, %%xmm15\n" + "addps %%xmm7, %%xmm11\n" + "subps %%xmm7, %%xmm15\n" + "movups %%xmm8, (%0)\n" + "movups %%xmm9, (%1)\n" + "movups %%xmm10, (%2)\n" + "movups %%xmm11, (%3)\n" + "movups %%xmm12, (%4)\n" + "movups %%xmm13, (%5)\n" + "movups %%xmm14, (%6)\n" + "movups %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 262144), "r"(buf + j + k + 524288), "r"(buf + j + k + 786432), "r"(buf + j + k + 1048576), "r"(buf + j + k + 1310720), "r"(buf + j + k + 1572864), "r"(buf + j + k + 1835008) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } + if (depth == 24) { + helper_float_24_recursive(buf + 0, 21); + helper_float_24_recursive(buf + 2097152, 21); + helper_float_24_recursive(buf + 4194304, 21); + helper_float_24_recursive(buf + 6291456, 21); + helper_float_24_recursive(buf + 8388608, 21); + helper_float_24_recursive(buf + 10485760, 21); + helper_float_24_recursive(buf + 12582912, 21); + helper_float_24_recursive(buf + 14680064, 21); + for (int j = 0; j < 16777216; j += 16777216) { + for (int k = 0; k < 2097152; k += 4) { + __asm__ volatile 
( + "movups (%0), %%xmm0\n" + "movups (%1), %%xmm1\n" + "movups (%2), %%xmm2\n" + "movups (%3), %%xmm3\n" + "movups (%4), %%xmm4\n" + "movups (%5), %%xmm5\n" + "movups (%6), %%xmm6\n" + "movups (%7), %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm9\n" + "addps %%xmm1, %%xmm8\n" + "subps %%xmm1, %%xmm9\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm11\n" + "addps %%xmm3, %%xmm10\n" + "subps %%xmm3, %%xmm11\n" + "movaps %%xmm4, %%xmm12\n" + "movaps %%xmm4, %%xmm13\n" + "addps %%xmm5, %%xmm12\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm6, %%xmm14\n" + "movaps %%xmm6, %%xmm15\n" + "addps %%xmm7, %%xmm14\n" + "subps %%xmm7, %%xmm15\n" + "movaps %%xmm8, %%xmm0\n" + "movaps %%xmm8, %%xmm2\n" + "addps %%xmm10, %%xmm0\n" + "subps %%xmm10, %%xmm2\n" + "movaps %%xmm9, %%xmm1\n" + "movaps %%xmm9, %%xmm3\n" + "addps %%xmm11, %%xmm1\n" + "subps %%xmm11, %%xmm3\n" + "movaps %%xmm12, %%xmm4\n" + "movaps %%xmm12, %%xmm6\n" + "addps %%xmm14, %%xmm4\n" + "subps %%xmm14, %%xmm6\n" + "movaps %%xmm13, %%xmm5\n" + "movaps %%xmm13, %%xmm7\n" + "addps %%xmm15, %%xmm5\n" + "subps %%xmm15, %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm12\n" + "addps %%xmm4, %%xmm8\n" + "subps %%xmm4, %%xmm12\n" + "movaps %%xmm1, %%xmm9\n" + "movaps %%xmm1, %%xmm13\n" + "addps %%xmm5, %%xmm9\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm14\n" + "addps %%xmm6, %%xmm10\n" + "subps %%xmm6, %%xmm14\n" + "movaps %%xmm3, %%xmm11\n" + "movaps %%xmm3, %%xmm15\n" + "addps %%xmm7, %%xmm11\n" + "subps %%xmm7, %%xmm15\n" + "movups %%xmm8, (%0)\n" + "movups %%xmm9, (%1)\n" + "movups %%xmm10, (%2)\n" + "movups %%xmm11, (%3)\n" + "movups %%xmm12, (%4)\n" + "movups %%xmm13, (%5)\n" + "movups %%xmm14, (%6)\n" + "movups %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 2097152), "r"(buf + j + k + 4194304), "r"(buf + j + k + 6291456), "r"(buf + j + k + 8388608), "r"(buf + j + k + 10485760), "r"(buf + j + k + 12582912), "r"(buf + j + k + 14680064) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } +} +void helper_float_24(float *buf); +void helper_float_24(float *buf) { + helper_float_24_recursive(buf, 24); +} +void helper_float_25_recursive(float *buf, int depth); +void helper_float_25_recursive(float *buf, int depth) { + if (depth == 8) { + for (int j = 0; j < 256; j += 32) { + for (int k = 0; k < 4; k += 4) { + __asm__ volatile ( + "movups (%0), %%xmm0\n" + "movups (%1), %%xmm1\n" + "movups (%2), %%xmm2\n" + "movups (%3), %%xmm3\n" + "movups (%4), %%xmm4\n" + "movups (%5), %%xmm5\n" + "movups (%6), %%xmm6\n" + "movups (%7), %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm0, %%xmm0\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm0, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm0\n" + "movaps %%xmm1, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm1, %%xmm1\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm1, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm1\n" + "movaps %%xmm2, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm2, %%xmm2\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm2, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm2\n" + "movaps %%xmm3, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm3, %%xmm3\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm3, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + 
"movaps %%xmm8, %%xmm3\n" + "movaps %%xmm4, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm4, %%xmm4\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm4, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm4\n" + "movaps %%xmm5, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm5, %%xmm5\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm5, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm5\n" + "movaps %%xmm6, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm6, %%xmm6\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm6, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm6\n" + "movaps %%xmm7, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm7, %%xmm7\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm7, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm0, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm0, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm0\n" + "movaps %%xmm1, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm1, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm1, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm1\n" + "movaps %%xmm2, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm2, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm2, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm2\n" + "movaps %%xmm3, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm3, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm3, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm3\n" + "movaps %%xmm4, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm4, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm4, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm4\n" + "movaps %%xmm5, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm5, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm5, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm5\n" + "movaps %%xmm6, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm6, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm6, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm6\n" + "movaps %%xmm7, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm7, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm7, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm9\n" + "addps %%xmm1, %%xmm8\n" + "subps %%xmm1, %%xmm9\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm11\n" + "addps %%xmm3, %%xmm10\n" + "subps %%xmm3, %%xmm11\n" + "movaps %%xmm4, %%xmm12\n" + "movaps %%xmm4, %%xmm13\n" + "addps 
%%xmm5, %%xmm12\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm6, %%xmm14\n" + "movaps %%xmm6, %%xmm15\n" + "addps %%xmm7, %%xmm14\n" + "subps %%xmm7, %%xmm15\n" + "movaps %%xmm8, %%xmm0\n" + "movaps %%xmm8, %%xmm2\n" + "addps %%xmm10, %%xmm0\n" + "subps %%xmm10, %%xmm2\n" + "movaps %%xmm9, %%xmm1\n" + "movaps %%xmm9, %%xmm3\n" + "addps %%xmm11, %%xmm1\n" + "subps %%xmm11, %%xmm3\n" + "movaps %%xmm12, %%xmm4\n" + "movaps %%xmm12, %%xmm6\n" + "addps %%xmm14, %%xmm4\n" + "subps %%xmm14, %%xmm6\n" + "movaps %%xmm13, %%xmm5\n" + "movaps %%xmm13, %%xmm7\n" + "addps %%xmm15, %%xmm5\n" + "subps %%xmm15, %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm12\n" + "addps %%xmm4, %%xmm8\n" + "subps %%xmm4, %%xmm12\n" + "movaps %%xmm1, %%xmm9\n" + "movaps %%xmm1, %%xmm13\n" + "addps %%xmm5, %%xmm9\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm14\n" + "addps %%xmm6, %%xmm10\n" + "subps %%xmm6, %%xmm14\n" + "movaps %%xmm3, %%xmm11\n" + "movaps %%xmm3, %%xmm15\n" + "addps %%xmm7, %%xmm11\n" + "subps %%xmm7, %%xmm15\n" + "movups %%xmm8, (%0)\n" + "movups %%xmm9, (%1)\n" + "movups %%xmm10, (%2)\n" + "movups %%xmm11, (%3)\n" + "movups %%xmm12, (%4)\n" + "movups %%xmm13, (%5)\n" + "movups %%xmm14, (%6)\n" + "movups %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + for (int j = 0; j < 256; j += 256) { + for (int k = 0; k < 32; k += 4) { + __asm__ volatile ( + "movups (%0), %%xmm0\n" + "movups (%1), %%xmm1\n" + "movups (%2), %%xmm2\n" + "movups (%3), %%xmm3\n" + "movups (%4), %%xmm4\n" + "movups (%5), %%xmm5\n" + "movups (%6), %%xmm6\n" + "movups (%7), %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm9\n" + "addps %%xmm1, %%xmm8\n" + "subps %%xmm1, %%xmm9\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm11\n" + "addps %%xmm3, %%xmm10\n" + "subps %%xmm3, %%xmm11\n" + "movaps %%xmm4, %%xmm12\n" + "movaps %%xmm4, %%xmm13\n" + "addps %%xmm5, %%xmm12\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm6, %%xmm14\n" + "movaps %%xmm6, %%xmm15\n" + "addps %%xmm7, %%xmm14\n" + "subps %%xmm7, %%xmm15\n" + "movaps %%xmm8, %%xmm0\n" + "movaps %%xmm8, %%xmm2\n" + "addps %%xmm10, %%xmm0\n" + "subps %%xmm10, %%xmm2\n" + "movaps %%xmm9, %%xmm1\n" + "movaps %%xmm9, %%xmm3\n" + "addps %%xmm11, %%xmm1\n" + "subps %%xmm11, %%xmm3\n" + "movaps %%xmm12, %%xmm4\n" + "movaps %%xmm12, %%xmm6\n" + "addps %%xmm14, %%xmm4\n" + "subps %%xmm14, %%xmm6\n" + "movaps %%xmm13, %%xmm5\n" + "movaps %%xmm13, %%xmm7\n" + "addps %%xmm15, %%xmm5\n" + "subps %%xmm15, %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm12\n" + "addps %%xmm4, %%xmm8\n" + "subps %%xmm4, %%xmm12\n" + "movaps %%xmm1, %%xmm9\n" + "movaps %%xmm1, %%xmm13\n" + "addps %%xmm5, %%xmm9\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm14\n" + "addps %%xmm6, %%xmm10\n" + "subps %%xmm6, %%xmm14\n" + "movaps %%xmm3, %%xmm11\n" + "movaps %%xmm3, %%xmm15\n" + "addps %%xmm7, %%xmm11\n" + "subps %%xmm7, %%xmm15\n" + "movups %%xmm8, (%0)\n" + "movups %%xmm9, (%1)\n" + "movups %%xmm10, (%2)\n" + "movups %%xmm11, (%3)\n" + "movups %%xmm12, (%4)\n" + "movups %%xmm13, (%5)\n" + "movups %%xmm14, (%6)\n" + "movups %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k 
+ 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96), "r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } + if (depth == 11) { + helper_float_25_recursive(buf + 0, 8); + helper_float_25_recursive(buf + 256, 8); + helper_float_25_recursive(buf + 512, 8); + helper_float_25_recursive(buf + 768, 8); + helper_float_25_recursive(buf + 1024, 8); + helper_float_25_recursive(buf + 1280, 8); + helper_float_25_recursive(buf + 1536, 8); + helper_float_25_recursive(buf + 1792, 8); + for (int j = 0; j < 2048; j += 2048) { + for (int k = 0; k < 256; k += 4) { + __asm__ volatile ( + "movups (%0), %%xmm0\n" + "movups (%1), %%xmm1\n" + "movups (%2), %%xmm2\n" + "movups (%3), %%xmm3\n" + "movups (%4), %%xmm4\n" + "movups (%5), %%xmm5\n" + "movups (%6), %%xmm6\n" + "movups (%7), %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm9\n" + "addps %%xmm1, %%xmm8\n" + "subps %%xmm1, %%xmm9\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm11\n" + "addps %%xmm3, %%xmm10\n" + "subps %%xmm3, %%xmm11\n" + "movaps %%xmm4, %%xmm12\n" + "movaps %%xmm4, %%xmm13\n" + "addps %%xmm5, %%xmm12\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm6, %%xmm14\n" + "movaps %%xmm6, %%xmm15\n" + "addps %%xmm7, %%xmm14\n" + "subps %%xmm7, %%xmm15\n" + "movaps %%xmm8, %%xmm0\n" + "movaps %%xmm8, %%xmm2\n" + "addps %%xmm10, %%xmm0\n" + "subps %%xmm10, %%xmm2\n" + "movaps %%xmm9, %%xmm1\n" + "movaps %%xmm9, %%xmm3\n" + "addps %%xmm11, %%xmm1\n" + "subps %%xmm11, %%xmm3\n" + "movaps %%xmm12, %%xmm4\n" + "movaps %%xmm12, %%xmm6\n" + "addps %%xmm14, %%xmm4\n" + "subps %%xmm14, %%xmm6\n" + "movaps %%xmm13, %%xmm5\n" + "movaps %%xmm13, %%xmm7\n" + "addps %%xmm15, %%xmm5\n" + "subps %%xmm15, %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm12\n" + "addps %%xmm4, %%xmm8\n" + "subps %%xmm4, %%xmm12\n" + "movaps %%xmm1, %%xmm9\n" + "movaps %%xmm1, %%xmm13\n" + "addps %%xmm5, %%xmm9\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm14\n" + "addps %%xmm6, %%xmm10\n" + "subps %%xmm6, %%xmm14\n" + "movaps %%xmm3, %%xmm11\n" + "movaps %%xmm3, %%xmm15\n" + "addps %%xmm7, %%xmm11\n" + "subps %%xmm7, %%xmm15\n" + "movups %%xmm8, (%0)\n" + "movups %%xmm9, (%1)\n" + "movups %%xmm10, (%2)\n" + "movups %%xmm11, (%3)\n" + "movups %%xmm12, (%4)\n" + "movups %%xmm13, (%5)\n" + "movups %%xmm14, (%6)\n" + "movups %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 256), "r"(buf + j + k + 512), "r"(buf + j + k + 768), "r"(buf + j + k + 1024), "r"(buf + j + k + 1280), "r"(buf + j + k + 1536), "r"(buf + j + k + 1792) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } + if (depth == 14) { + helper_float_25_recursive(buf + 0, 11); + helper_float_25_recursive(buf + 2048, 11); + helper_float_25_recursive(buf + 4096, 11); + helper_float_25_recursive(buf + 6144, 11); + helper_float_25_recursive(buf + 8192, 11); + helper_float_25_recursive(buf + 10240, 11); + helper_float_25_recursive(buf + 12288, 11); + helper_float_25_recursive(buf + 14336, 11); + for (int j = 0; j < 16384; j += 16384) { + for (int k = 0; k < 2048; k += 4) { + __asm__ volatile ( + "movups (%0), %%xmm0\n" + "movups (%1), %%xmm1\n" + "movups (%2), %%xmm2\n" + "movups (%3), %%xmm3\n" + 
"movups (%4), %%xmm4\n" + "movups (%5), %%xmm5\n" + "movups (%6), %%xmm6\n" + "movups (%7), %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm9\n" + "addps %%xmm1, %%xmm8\n" + "subps %%xmm1, %%xmm9\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm11\n" + "addps %%xmm3, %%xmm10\n" + "subps %%xmm3, %%xmm11\n" + "movaps %%xmm4, %%xmm12\n" + "movaps %%xmm4, %%xmm13\n" + "addps %%xmm5, %%xmm12\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm6, %%xmm14\n" + "movaps %%xmm6, %%xmm15\n" + "addps %%xmm7, %%xmm14\n" + "subps %%xmm7, %%xmm15\n" + "movaps %%xmm8, %%xmm0\n" + "movaps %%xmm8, %%xmm2\n" + "addps %%xmm10, %%xmm0\n" + "subps %%xmm10, %%xmm2\n" + "movaps %%xmm9, %%xmm1\n" + "movaps %%xmm9, %%xmm3\n" + "addps %%xmm11, %%xmm1\n" + "subps %%xmm11, %%xmm3\n" + "movaps %%xmm12, %%xmm4\n" + "movaps %%xmm12, %%xmm6\n" + "addps %%xmm14, %%xmm4\n" + "subps %%xmm14, %%xmm6\n" + "movaps %%xmm13, %%xmm5\n" + "movaps %%xmm13, %%xmm7\n" + "addps %%xmm15, %%xmm5\n" + "subps %%xmm15, %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm12\n" + "addps %%xmm4, %%xmm8\n" + "subps %%xmm4, %%xmm12\n" + "movaps %%xmm1, %%xmm9\n" + "movaps %%xmm1, %%xmm13\n" + "addps %%xmm5, %%xmm9\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm14\n" + "addps %%xmm6, %%xmm10\n" + "subps %%xmm6, %%xmm14\n" + "movaps %%xmm3, %%xmm11\n" + "movaps %%xmm3, %%xmm15\n" + "addps %%xmm7, %%xmm11\n" + "subps %%xmm7, %%xmm15\n" + "movups %%xmm8, (%0)\n" + "movups %%xmm9, (%1)\n" + "movups %%xmm10, (%2)\n" + "movups %%xmm11, (%3)\n" + "movups %%xmm12, (%4)\n" + "movups %%xmm13, (%5)\n" + "movups %%xmm14, (%6)\n" + "movups %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 2048), "r"(buf + j + k + 4096), "r"(buf + j + k + 6144), "r"(buf + j + k + 8192), "r"(buf + j + k + 10240), "r"(buf + j + k + 12288), "r"(buf + j + k + 14336) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } + if (depth == 17) { + helper_float_25_recursive(buf + 0, 14); + helper_float_25_recursive(buf + 16384, 14); + helper_float_25_recursive(buf + 32768, 14); + helper_float_25_recursive(buf + 49152, 14); + helper_float_25_recursive(buf + 65536, 14); + helper_float_25_recursive(buf + 81920, 14); + helper_float_25_recursive(buf + 98304, 14); + helper_float_25_recursive(buf + 114688, 14); + for (int j = 0; j < 131072; j += 131072) { + for (int k = 0; k < 16384; k += 4) { + __asm__ volatile ( + "movups (%0), %%xmm0\n" + "movups (%1), %%xmm1\n" + "movups (%2), %%xmm2\n" + "movups (%3), %%xmm3\n" + "movups (%4), %%xmm4\n" + "movups (%5), %%xmm5\n" + "movups (%6), %%xmm6\n" + "movups (%7), %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm9\n" + "addps %%xmm1, %%xmm8\n" + "subps %%xmm1, %%xmm9\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm11\n" + "addps %%xmm3, %%xmm10\n" + "subps %%xmm3, %%xmm11\n" + "movaps %%xmm4, %%xmm12\n" + "movaps %%xmm4, %%xmm13\n" + "addps %%xmm5, %%xmm12\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm6, %%xmm14\n" + "movaps %%xmm6, %%xmm15\n" + "addps %%xmm7, %%xmm14\n" + "subps %%xmm7, %%xmm15\n" + "movaps %%xmm8, %%xmm0\n" + "movaps %%xmm8, %%xmm2\n" + "addps %%xmm10, %%xmm0\n" + "subps %%xmm10, %%xmm2\n" + "movaps %%xmm9, %%xmm1\n" + "movaps %%xmm9, %%xmm3\n" + "addps %%xmm11, %%xmm1\n" + "subps %%xmm11, %%xmm3\n" + "movaps %%xmm12, %%xmm4\n" + "movaps %%xmm12, %%xmm6\n" + "addps %%xmm14, %%xmm4\n" + "subps %%xmm14, %%xmm6\n" + 
"movaps %%xmm13, %%xmm5\n" + "movaps %%xmm13, %%xmm7\n" + "addps %%xmm15, %%xmm5\n" + "subps %%xmm15, %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm12\n" + "addps %%xmm4, %%xmm8\n" + "subps %%xmm4, %%xmm12\n" + "movaps %%xmm1, %%xmm9\n" + "movaps %%xmm1, %%xmm13\n" + "addps %%xmm5, %%xmm9\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm14\n" + "addps %%xmm6, %%xmm10\n" + "subps %%xmm6, %%xmm14\n" + "movaps %%xmm3, %%xmm11\n" + "movaps %%xmm3, %%xmm15\n" + "addps %%xmm7, %%xmm11\n" + "subps %%xmm7, %%xmm15\n" + "movups %%xmm8, (%0)\n" + "movups %%xmm9, (%1)\n" + "movups %%xmm10, (%2)\n" + "movups %%xmm11, (%3)\n" + "movups %%xmm12, (%4)\n" + "movups %%xmm13, (%5)\n" + "movups %%xmm14, (%6)\n" + "movups %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 16384), "r"(buf + j + k + 32768), "r"(buf + j + k + 49152), "r"(buf + j + k + 65536), "r"(buf + j + k + 81920), "r"(buf + j + k + 98304), "r"(buf + j + k + 114688) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } + if (depth == 20) { + helper_float_25_recursive(buf + 0, 17); + helper_float_25_recursive(buf + 131072, 17); + helper_float_25_recursive(buf + 262144, 17); + helper_float_25_recursive(buf + 393216, 17); + helper_float_25_recursive(buf + 524288, 17); + helper_float_25_recursive(buf + 655360, 17); + helper_float_25_recursive(buf + 786432, 17); + helper_float_25_recursive(buf + 917504, 17); + for (int j = 0; j < 1048576; j += 1048576) { + for (int k = 0; k < 131072; k += 4) { + __asm__ volatile ( + "movups (%0), %%xmm0\n" + "movups (%1), %%xmm1\n" + "movups (%2), %%xmm2\n" + "movups (%3), %%xmm3\n" + "movups (%4), %%xmm4\n" + "movups (%5), %%xmm5\n" + "movups (%6), %%xmm6\n" + "movups (%7), %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm9\n" + "addps %%xmm1, %%xmm8\n" + "subps %%xmm1, %%xmm9\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm11\n" + "addps %%xmm3, %%xmm10\n" + "subps %%xmm3, %%xmm11\n" + "movaps %%xmm4, %%xmm12\n" + "movaps %%xmm4, %%xmm13\n" + "addps %%xmm5, %%xmm12\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm6, %%xmm14\n" + "movaps %%xmm6, %%xmm15\n" + "addps %%xmm7, %%xmm14\n" + "subps %%xmm7, %%xmm15\n" + "movaps %%xmm8, %%xmm0\n" + "movaps %%xmm8, %%xmm2\n" + "addps %%xmm10, %%xmm0\n" + "subps %%xmm10, %%xmm2\n" + "movaps %%xmm9, %%xmm1\n" + "movaps %%xmm9, %%xmm3\n" + "addps %%xmm11, %%xmm1\n" + "subps %%xmm11, %%xmm3\n" + "movaps %%xmm12, %%xmm4\n" + "movaps %%xmm12, %%xmm6\n" + "addps %%xmm14, %%xmm4\n" + "subps %%xmm14, %%xmm6\n" + "movaps %%xmm13, %%xmm5\n" + "movaps %%xmm13, %%xmm7\n" + "addps %%xmm15, %%xmm5\n" + "subps %%xmm15, %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm12\n" + "addps %%xmm4, %%xmm8\n" + "subps %%xmm4, %%xmm12\n" + "movaps %%xmm1, %%xmm9\n" + "movaps %%xmm1, %%xmm13\n" + "addps %%xmm5, %%xmm9\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm14\n" + "addps %%xmm6, %%xmm10\n" + "subps %%xmm6, %%xmm14\n" + "movaps %%xmm3, %%xmm11\n" + "movaps %%xmm3, %%xmm15\n" + "addps %%xmm7, %%xmm11\n" + "subps %%xmm7, %%xmm15\n" + "movups %%xmm8, (%0)\n" + "movups %%xmm9, (%1)\n" + "movups %%xmm10, (%2)\n" + "movups %%xmm11, (%3)\n" + "movups %%xmm12, (%4)\n" + "movups %%xmm13, (%5)\n" + "movups %%xmm14, (%6)\n" + "movups %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 131072), "r"(buf + j + k + 262144), "r"(buf + j + k + 
393216), "r"(buf + j + k + 524288), "r"(buf + j + k + 655360), "r"(buf + j + k + 786432), "r"(buf + j + k + 917504) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } + if (depth == 23) { + helper_float_25_recursive(buf + 0, 20); + helper_float_25_recursive(buf + 1048576, 20); + helper_float_25_recursive(buf + 2097152, 20); + helper_float_25_recursive(buf + 3145728, 20); + helper_float_25_recursive(buf + 4194304, 20); + helper_float_25_recursive(buf + 5242880, 20); + helper_float_25_recursive(buf + 6291456, 20); + helper_float_25_recursive(buf + 7340032, 20); + for (int j = 0; j < 8388608; j += 8388608) { + for (int k = 0; k < 1048576; k += 4) { + __asm__ volatile ( + "movups (%0), %%xmm0\n" + "movups (%1), %%xmm1\n" + "movups (%2), %%xmm2\n" + "movups (%3), %%xmm3\n" + "movups (%4), %%xmm4\n" + "movups (%5), %%xmm5\n" + "movups (%6), %%xmm6\n" + "movups (%7), %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm9\n" + "addps %%xmm1, %%xmm8\n" + "subps %%xmm1, %%xmm9\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm11\n" + "addps %%xmm3, %%xmm10\n" + "subps %%xmm3, %%xmm11\n" + "movaps %%xmm4, %%xmm12\n" + "movaps %%xmm4, %%xmm13\n" + "addps %%xmm5, %%xmm12\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm6, %%xmm14\n" + "movaps %%xmm6, %%xmm15\n" + "addps %%xmm7, %%xmm14\n" + "subps %%xmm7, %%xmm15\n" + "movaps %%xmm8, %%xmm0\n" + "movaps %%xmm8, %%xmm2\n" + "addps %%xmm10, %%xmm0\n" + "subps %%xmm10, %%xmm2\n" + "movaps %%xmm9, %%xmm1\n" + "movaps %%xmm9, %%xmm3\n" + "addps %%xmm11, %%xmm1\n" + "subps %%xmm11, %%xmm3\n" + "movaps %%xmm12, %%xmm4\n" + "movaps %%xmm12, %%xmm6\n" + "addps %%xmm14, %%xmm4\n" + "subps %%xmm14, %%xmm6\n" + "movaps %%xmm13, %%xmm5\n" + "movaps %%xmm13, %%xmm7\n" + "addps %%xmm15, %%xmm5\n" + "subps %%xmm15, %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm12\n" + "addps %%xmm4, %%xmm8\n" + "subps %%xmm4, %%xmm12\n" + "movaps %%xmm1, %%xmm9\n" + "movaps %%xmm1, %%xmm13\n" + "addps %%xmm5, %%xmm9\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm14\n" + "addps %%xmm6, %%xmm10\n" + "subps %%xmm6, %%xmm14\n" + "movaps %%xmm3, %%xmm11\n" + "movaps %%xmm3, %%xmm15\n" + "addps %%xmm7, %%xmm11\n" + "subps %%xmm7, %%xmm15\n" + "movups %%xmm8, (%0)\n" + "movups %%xmm9, (%1)\n" + "movups %%xmm10, (%2)\n" + "movups %%xmm11, (%3)\n" + "movups %%xmm12, (%4)\n" + "movups %%xmm13, (%5)\n" + "movups %%xmm14, (%6)\n" + "movups %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 1048576), "r"(buf + j + k + 2097152), "r"(buf + j + k + 3145728), "r"(buf + j + k + 4194304), "r"(buf + j + k + 5242880), "r"(buf + j + k + 6291456), "r"(buf + j + k + 7340032) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } + if (depth == 25) { + helper_float_25_recursive(buf + 0, 23); + helper_float_25_recursive(buf + 8388608, 23); + helper_float_25_recursive(buf + 16777216, 23); + helper_float_25_recursive(buf + 25165824, 23); + for (int j = 0; j < 33554432; j += 33554432) { + for (int k = 0; k < 8388608; k += 4) { + __asm__ volatile ( + "movups (%0), %%xmm0\n" + "movups (%1), %%xmm1\n" + "movups (%2), %%xmm2\n" + "movups (%3), %%xmm3\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm9\n" + "addps %%xmm1, %%xmm8\n" + "subps %%xmm1, %%xmm9\n" + "movaps %%xmm2, 
%%xmm10\n" + "movaps %%xmm2, %%xmm11\n" + "addps %%xmm3, %%xmm10\n" + "subps %%xmm3, %%xmm11\n" + "movaps %%xmm8, %%xmm0\n" + "movaps %%xmm8, %%xmm2\n" + "addps %%xmm10, %%xmm0\n" + "subps %%xmm10, %%xmm2\n" + "movaps %%xmm9, %%xmm1\n" + "movaps %%xmm9, %%xmm3\n" + "addps %%xmm11, %%xmm1\n" + "subps %%xmm11, %%xmm3\n" + "movups %%xmm0, (%0)\n" + "movups %%xmm1, (%1)\n" + "movups %%xmm2, (%2)\n" + "movups %%xmm3, (%3)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 8388608), "r"(buf + j + k + 16777216), "r"(buf + j + k + 25165824) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } +} +void helper_float_25(float *buf); +void helper_float_25(float *buf) { + helper_float_25_recursive(buf, 25); +} +void helper_float_26_recursive(float *buf, int depth); +void helper_float_26_recursive(float *buf, int depth) { + if (depth == 5) { + for (int j = 0; j < 32; j += 32) { + for (int k = 0; k < 4; k += 4) { + __asm__ volatile ( + "movups (%0), %%xmm0\n" + "movups (%1), %%xmm1\n" + "movups (%2), %%xmm2\n" + "movups (%3), %%xmm3\n" + "movups (%4), %%xmm4\n" + "movups (%5), %%xmm5\n" + "movups (%6), %%xmm6\n" + "movups (%7), %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm0, %%xmm0\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm0, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm0\n" + "movaps %%xmm1, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm1, %%xmm1\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm1, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm1\n" + "movaps %%xmm2, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm2, %%xmm2\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm2, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm2\n" + "movaps %%xmm3, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm3, %%xmm3\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm3, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm3\n" + "movaps %%xmm4, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm4, %%xmm4\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm4, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm4\n" + "movaps %%xmm5, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm5, %%xmm5\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm5, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm5\n" + "movaps %%xmm6, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm6, %%xmm6\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm6, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm6\n" + "movaps %%xmm7, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm7, %%xmm7\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm7, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm0, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm0, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm0\n" + "movaps %%xmm1, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm1, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm1, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm1\n" + "movaps %%xmm2, 
%%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm2, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm2, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm2\n" + "movaps %%xmm3, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm3, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm3, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm3\n" + "movaps %%xmm4, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm4, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm4, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm4\n" + "movaps %%xmm5, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm5, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm5, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm5\n" + "movaps %%xmm6, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm6, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm6, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm6\n" + "movaps %%xmm7, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm7, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm7, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm9\n" + "addps %%xmm1, %%xmm8\n" + "subps %%xmm1, %%xmm9\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm11\n" + "addps %%xmm3, %%xmm10\n" + "subps %%xmm3, %%xmm11\n" + "movaps %%xmm4, %%xmm12\n" + "movaps %%xmm4, %%xmm13\n" + "addps %%xmm5, %%xmm12\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm6, %%xmm14\n" + "movaps %%xmm6, %%xmm15\n" + "addps %%xmm7, %%xmm14\n" + "subps %%xmm7, %%xmm15\n" + "movaps %%xmm8, %%xmm0\n" + "movaps %%xmm8, %%xmm2\n" + "addps %%xmm10, %%xmm0\n" + "subps %%xmm10, %%xmm2\n" + "movaps %%xmm9, %%xmm1\n" + "movaps %%xmm9, %%xmm3\n" + "addps %%xmm11, %%xmm1\n" + "subps %%xmm11, %%xmm3\n" + "movaps %%xmm12, %%xmm4\n" + "movaps %%xmm12, %%xmm6\n" + "addps %%xmm14, %%xmm4\n" + "subps %%xmm14, %%xmm6\n" + "movaps %%xmm13, %%xmm5\n" + "movaps %%xmm13, %%xmm7\n" + "addps %%xmm15, %%xmm5\n" + "subps %%xmm15, %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm12\n" + "addps %%xmm4, %%xmm8\n" + "subps %%xmm4, %%xmm12\n" + "movaps %%xmm1, %%xmm9\n" + "movaps %%xmm1, %%xmm13\n" + "addps %%xmm5, %%xmm9\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm14\n" + "addps %%xmm6, %%xmm10\n" + "subps %%xmm6, %%xmm14\n" + "movaps %%xmm3, %%xmm11\n" + "movaps %%xmm3, %%xmm15\n" + "addps %%xmm7, %%xmm11\n" + "subps %%xmm7, %%xmm15\n" + "movups %%xmm8, (%0)\n" + "movups %%xmm9, (%1)\n" + "movups %%xmm10, (%2)\n" + "movups %%xmm11, (%3)\n" + "movups %%xmm12, (%4)\n" + "movups %%xmm13, (%5)\n" + "movups %%xmm14, (%6)\n" + "movups %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%xmm0", 
"%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } + if (depth == 8) { + helper_float_26_recursive(buf + 0, 5); + helper_float_26_recursive(buf + 32, 5); + helper_float_26_recursive(buf + 64, 5); + helper_float_26_recursive(buf + 96, 5); + helper_float_26_recursive(buf + 128, 5); + helper_float_26_recursive(buf + 160, 5); + helper_float_26_recursive(buf + 192, 5); + helper_float_26_recursive(buf + 224, 5); + for (int j = 0; j < 256; j += 256) { + for (int k = 0; k < 32; k += 4) { + __asm__ volatile ( + "movups (%0), %%xmm0\n" + "movups (%1), %%xmm1\n" + "movups (%2), %%xmm2\n" + "movups (%3), %%xmm3\n" + "movups (%4), %%xmm4\n" + "movups (%5), %%xmm5\n" + "movups (%6), %%xmm6\n" + "movups (%7), %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm9\n" + "addps %%xmm1, %%xmm8\n" + "subps %%xmm1, %%xmm9\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm11\n" + "addps %%xmm3, %%xmm10\n" + "subps %%xmm3, %%xmm11\n" + "movaps %%xmm4, %%xmm12\n" + "movaps %%xmm4, %%xmm13\n" + "addps %%xmm5, %%xmm12\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm6, %%xmm14\n" + "movaps %%xmm6, %%xmm15\n" + "addps %%xmm7, %%xmm14\n" + "subps %%xmm7, %%xmm15\n" + "movaps %%xmm8, %%xmm0\n" + "movaps %%xmm8, %%xmm2\n" + "addps %%xmm10, %%xmm0\n" + "subps %%xmm10, %%xmm2\n" + "movaps %%xmm9, %%xmm1\n" + "movaps %%xmm9, %%xmm3\n" + "addps %%xmm11, %%xmm1\n" + "subps %%xmm11, %%xmm3\n" + "movaps %%xmm12, %%xmm4\n" + "movaps %%xmm12, %%xmm6\n" + "addps %%xmm14, %%xmm4\n" + "subps %%xmm14, %%xmm6\n" + "movaps %%xmm13, %%xmm5\n" + "movaps %%xmm13, %%xmm7\n" + "addps %%xmm15, %%xmm5\n" + "subps %%xmm15, %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm12\n" + "addps %%xmm4, %%xmm8\n" + "subps %%xmm4, %%xmm12\n" + "movaps %%xmm1, %%xmm9\n" + "movaps %%xmm1, %%xmm13\n" + "addps %%xmm5, %%xmm9\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm14\n" + "addps %%xmm6, %%xmm10\n" + "subps %%xmm6, %%xmm14\n" + "movaps %%xmm3, %%xmm11\n" + "movaps %%xmm3, %%xmm15\n" + "addps %%xmm7, %%xmm11\n" + "subps %%xmm7, %%xmm15\n" + "movups %%xmm8, (%0)\n" + "movups %%xmm9, (%1)\n" + "movups %%xmm10, (%2)\n" + "movups %%xmm11, (%3)\n" + "movups %%xmm12, (%4)\n" + "movups %%xmm13, (%5)\n" + "movups %%xmm14, (%6)\n" + "movups %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96), "r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } + if (depth == 11) { + helper_float_26_recursive(buf + 0, 8); + helper_float_26_recursive(buf + 256, 8); + helper_float_26_recursive(buf + 512, 8); + helper_float_26_recursive(buf + 768, 8); + helper_float_26_recursive(buf + 1024, 8); + helper_float_26_recursive(buf + 1280, 8); + helper_float_26_recursive(buf + 1536, 8); + helper_float_26_recursive(buf + 1792, 8); + for (int j = 0; j < 2048; j += 2048) { + for (int k = 0; k < 256; k += 4) { + __asm__ volatile ( + "movups (%0), %%xmm0\n" + "movups (%1), %%xmm1\n" + "movups (%2), %%xmm2\n" + "movups (%3), %%xmm3\n" + "movups (%4), %%xmm4\n" + "movups (%5), %%xmm5\n" + "movups (%6), %%xmm6\n" + "movups (%7), %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm9\n" + "addps %%xmm1, %%xmm8\n" + "subps 
%%xmm1, %%xmm9\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm11\n" + "addps %%xmm3, %%xmm10\n" + "subps %%xmm3, %%xmm11\n" + "movaps %%xmm4, %%xmm12\n" + "movaps %%xmm4, %%xmm13\n" + "addps %%xmm5, %%xmm12\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm6, %%xmm14\n" + "movaps %%xmm6, %%xmm15\n" + "addps %%xmm7, %%xmm14\n" + "subps %%xmm7, %%xmm15\n" + "movaps %%xmm8, %%xmm0\n" + "movaps %%xmm8, %%xmm2\n" + "addps %%xmm10, %%xmm0\n" + "subps %%xmm10, %%xmm2\n" + "movaps %%xmm9, %%xmm1\n" + "movaps %%xmm9, %%xmm3\n" + "addps %%xmm11, %%xmm1\n" + "subps %%xmm11, %%xmm3\n" + "movaps %%xmm12, %%xmm4\n" + "movaps %%xmm12, %%xmm6\n" + "addps %%xmm14, %%xmm4\n" + "subps %%xmm14, %%xmm6\n" + "movaps %%xmm13, %%xmm5\n" + "movaps %%xmm13, %%xmm7\n" + "addps %%xmm15, %%xmm5\n" + "subps %%xmm15, %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm12\n" + "addps %%xmm4, %%xmm8\n" + "subps %%xmm4, %%xmm12\n" + "movaps %%xmm1, %%xmm9\n" + "movaps %%xmm1, %%xmm13\n" + "addps %%xmm5, %%xmm9\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm14\n" + "addps %%xmm6, %%xmm10\n" + "subps %%xmm6, %%xmm14\n" + "movaps %%xmm3, %%xmm11\n" + "movaps %%xmm3, %%xmm15\n" + "addps %%xmm7, %%xmm11\n" + "subps %%xmm7, %%xmm15\n" + "movups %%xmm8, (%0)\n" + "movups %%xmm9, (%1)\n" + "movups %%xmm10, (%2)\n" + "movups %%xmm11, (%3)\n" + "movups %%xmm12, (%4)\n" + "movups %%xmm13, (%5)\n" + "movups %%xmm14, (%6)\n" + "movups %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 256), "r"(buf + j + k + 512), "r"(buf + j + k + 768), "r"(buf + j + k + 1024), "r"(buf + j + k + 1280), "r"(buf + j + k + 1536), "r"(buf + j + k + 1792) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } + if (depth == 14) { + helper_float_26_recursive(buf + 0, 11); + helper_float_26_recursive(buf + 2048, 11); + helper_float_26_recursive(buf + 4096, 11); + helper_float_26_recursive(buf + 6144, 11); + helper_float_26_recursive(buf + 8192, 11); + helper_float_26_recursive(buf + 10240, 11); + helper_float_26_recursive(buf + 12288, 11); + helper_float_26_recursive(buf + 14336, 11); + for (int j = 0; j < 16384; j += 16384) { + for (int k = 0; k < 2048; k += 4) { + __asm__ volatile ( + "movups (%0), %%xmm0\n" + "movups (%1), %%xmm1\n" + "movups (%2), %%xmm2\n" + "movups (%3), %%xmm3\n" + "movups (%4), %%xmm4\n" + "movups (%5), %%xmm5\n" + "movups (%6), %%xmm6\n" + "movups (%7), %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm9\n" + "addps %%xmm1, %%xmm8\n" + "subps %%xmm1, %%xmm9\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm11\n" + "addps %%xmm3, %%xmm10\n" + "subps %%xmm3, %%xmm11\n" + "movaps %%xmm4, %%xmm12\n" + "movaps %%xmm4, %%xmm13\n" + "addps %%xmm5, %%xmm12\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm6, %%xmm14\n" + "movaps %%xmm6, %%xmm15\n" + "addps %%xmm7, %%xmm14\n" + "subps %%xmm7, %%xmm15\n" + "movaps %%xmm8, %%xmm0\n" + "movaps %%xmm8, %%xmm2\n" + "addps %%xmm10, %%xmm0\n" + "subps %%xmm10, %%xmm2\n" + "movaps %%xmm9, %%xmm1\n" + "movaps %%xmm9, %%xmm3\n" + "addps %%xmm11, %%xmm1\n" + "subps %%xmm11, %%xmm3\n" + "movaps %%xmm12, %%xmm4\n" + "movaps %%xmm12, %%xmm6\n" + "addps %%xmm14, %%xmm4\n" + "subps %%xmm14, %%xmm6\n" + "movaps %%xmm13, %%xmm5\n" + "movaps %%xmm13, %%xmm7\n" + "addps %%xmm15, %%xmm5\n" + "subps %%xmm15, %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm12\n" + "addps %%xmm4, %%xmm8\n" + "subps 
%%xmm4, %%xmm12\n" + "movaps %%xmm1, %%xmm9\n" + "movaps %%xmm1, %%xmm13\n" + "addps %%xmm5, %%xmm9\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm14\n" + "addps %%xmm6, %%xmm10\n" + "subps %%xmm6, %%xmm14\n" + "movaps %%xmm3, %%xmm11\n" + "movaps %%xmm3, %%xmm15\n" + "addps %%xmm7, %%xmm11\n" + "subps %%xmm7, %%xmm15\n" + "movups %%xmm8, (%0)\n" + "movups %%xmm9, (%1)\n" + "movups %%xmm10, (%2)\n" + "movups %%xmm11, (%3)\n" + "movups %%xmm12, (%4)\n" + "movups %%xmm13, (%5)\n" + "movups %%xmm14, (%6)\n" + "movups %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 2048), "r"(buf + j + k + 4096), "r"(buf + j + k + 6144), "r"(buf + j + k + 8192), "r"(buf + j + k + 10240), "r"(buf + j + k + 12288), "r"(buf + j + k + 14336) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } + if (depth == 17) { + helper_float_26_recursive(buf + 0, 14); + helper_float_26_recursive(buf + 16384, 14); + helper_float_26_recursive(buf + 32768, 14); + helper_float_26_recursive(buf + 49152, 14); + helper_float_26_recursive(buf + 65536, 14); + helper_float_26_recursive(buf + 81920, 14); + helper_float_26_recursive(buf + 98304, 14); + helper_float_26_recursive(buf + 114688, 14); + for (int j = 0; j < 131072; j += 131072) { + for (int k = 0; k < 16384; k += 4) { + __asm__ volatile ( + "movups (%0), %%xmm0\n" + "movups (%1), %%xmm1\n" + "movups (%2), %%xmm2\n" + "movups (%3), %%xmm3\n" + "movups (%4), %%xmm4\n" + "movups (%5), %%xmm5\n" + "movups (%6), %%xmm6\n" + "movups (%7), %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm9\n" + "addps %%xmm1, %%xmm8\n" + "subps %%xmm1, %%xmm9\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm11\n" + "addps %%xmm3, %%xmm10\n" + "subps %%xmm3, %%xmm11\n" + "movaps %%xmm4, %%xmm12\n" + "movaps %%xmm4, %%xmm13\n" + "addps %%xmm5, %%xmm12\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm6, %%xmm14\n" + "movaps %%xmm6, %%xmm15\n" + "addps %%xmm7, %%xmm14\n" + "subps %%xmm7, %%xmm15\n" + "movaps %%xmm8, %%xmm0\n" + "movaps %%xmm8, %%xmm2\n" + "addps %%xmm10, %%xmm0\n" + "subps %%xmm10, %%xmm2\n" + "movaps %%xmm9, %%xmm1\n" + "movaps %%xmm9, %%xmm3\n" + "addps %%xmm11, %%xmm1\n" + "subps %%xmm11, %%xmm3\n" + "movaps %%xmm12, %%xmm4\n" + "movaps %%xmm12, %%xmm6\n" + "addps %%xmm14, %%xmm4\n" + "subps %%xmm14, %%xmm6\n" + "movaps %%xmm13, %%xmm5\n" + "movaps %%xmm13, %%xmm7\n" + "addps %%xmm15, %%xmm5\n" + "subps %%xmm15, %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm12\n" + "addps %%xmm4, %%xmm8\n" + "subps %%xmm4, %%xmm12\n" + "movaps %%xmm1, %%xmm9\n" + "movaps %%xmm1, %%xmm13\n" + "addps %%xmm5, %%xmm9\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm14\n" + "addps %%xmm6, %%xmm10\n" + "subps %%xmm6, %%xmm14\n" + "movaps %%xmm3, %%xmm11\n" + "movaps %%xmm3, %%xmm15\n" + "addps %%xmm7, %%xmm11\n" + "subps %%xmm7, %%xmm15\n" + "movups %%xmm8, (%0)\n" + "movups %%xmm9, (%1)\n" + "movups %%xmm10, (%2)\n" + "movups %%xmm11, (%3)\n" + "movups %%xmm12, (%4)\n" + "movups %%xmm13, (%5)\n" + "movups %%xmm14, (%6)\n" + "movups %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 16384), "r"(buf + j + k + 32768), "r"(buf + j + k + 49152), "r"(buf + j + k + 65536), "r"(buf + j + k + 81920), "r"(buf + j + k + 98304), "r"(buf + j + k + 114688) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", 
"%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } + if (depth == 20) { + helper_float_26_recursive(buf + 0, 17); + helper_float_26_recursive(buf + 131072, 17); + helper_float_26_recursive(buf + 262144, 17); + helper_float_26_recursive(buf + 393216, 17); + helper_float_26_recursive(buf + 524288, 17); + helper_float_26_recursive(buf + 655360, 17); + helper_float_26_recursive(buf + 786432, 17); + helper_float_26_recursive(buf + 917504, 17); + for (int j = 0; j < 1048576; j += 1048576) { + for (int k = 0; k < 131072; k += 4) { + __asm__ volatile ( + "movups (%0), %%xmm0\n" + "movups (%1), %%xmm1\n" + "movups (%2), %%xmm2\n" + "movups (%3), %%xmm3\n" + "movups (%4), %%xmm4\n" + "movups (%5), %%xmm5\n" + "movups (%6), %%xmm6\n" + "movups (%7), %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm9\n" + "addps %%xmm1, %%xmm8\n" + "subps %%xmm1, %%xmm9\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm11\n" + "addps %%xmm3, %%xmm10\n" + "subps %%xmm3, %%xmm11\n" + "movaps %%xmm4, %%xmm12\n" + "movaps %%xmm4, %%xmm13\n" + "addps %%xmm5, %%xmm12\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm6, %%xmm14\n" + "movaps %%xmm6, %%xmm15\n" + "addps %%xmm7, %%xmm14\n" + "subps %%xmm7, %%xmm15\n" + "movaps %%xmm8, %%xmm0\n" + "movaps %%xmm8, %%xmm2\n" + "addps %%xmm10, %%xmm0\n" + "subps %%xmm10, %%xmm2\n" + "movaps %%xmm9, %%xmm1\n" + "movaps %%xmm9, %%xmm3\n" + "addps %%xmm11, %%xmm1\n" + "subps %%xmm11, %%xmm3\n" + "movaps %%xmm12, %%xmm4\n" + "movaps %%xmm12, %%xmm6\n" + "addps %%xmm14, %%xmm4\n" + "subps %%xmm14, %%xmm6\n" + "movaps %%xmm13, %%xmm5\n" + "movaps %%xmm13, %%xmm7\n" + "addps %%xmm15, %%xmm5\n" + "subps %%xmm15, %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm12\n" + "addps %%xmm4, %%xmm8\n" + "subps %%xmm4, %%xmm12\n" + "movaps %%xmm1, %%xmm9\n" + "movaps %%xmm1, %%xmm13\n" + "addps %%xmm5, %%xmm9\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm14\n" + "addps %%xmm6, %%xmm10\n" + "subps %%xmm6, %%xmm14\n" + "movaps %%xmm3, %%xmm11\n" + "movaps %%xmm3, %%xmm15\n" + "addps %%xmm7, %%xmm11\n" + "subps %%xmm7, %%xmm15\n" + "movups %%xmm8, (%0)\n" + "movups %%xmm9, (%1)\n" + "movups %%xmm10, (%2)\n" + "movups %%xmm11, (%3)\n" + "movups %%xmm12, (%4)\n" + "movups %%xmm13, (%5)\n" + "movups %%xmm14, (%6)\n" + "movups %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 131072), "r"(buf + j + k + 262144), "r"(buf + j + k + 393216), "r"(buf + j + k + 524288), "r"(buf + j + k + 655360), "r"(buf + j + k + 786432), "r"(buf + j + k + 917504) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } + if (depth == 23) { + helper_float_26_recursive(buf + 0, 20); + helper_float_26_recursive(buf + 1048576, 20); + helper_float_26_recursive(buf + 2097152, 20); + helper_float_26_recursive(buf + 3145728, 20); + helper_float_26_recursive(buf + 4194304, 20); + helper_float_26_recursive(buf + 5242880, 20); + helper_float_26_recursive(buf + 6291456, 20); + helper_float_26_recursive(buf + 7340032, 20); + for (int j = 0; j < 8388608; j += 8388608) { + for (int k = 0; k < 1048576; k += 4) { + __asm__ volatile ( + "movups (%0), %%xmm0\n" + "movups (%1), %%xmm1\n" + "movups (%2), %%xmm2\n" + "movups (%3), %%xmm3\n" + "movups (%4), %%xmm4\n" + "movups (%5), %%xmm5\n" + "movups (%6), %%xmm6\n" + "movups (%7), %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm9\n" + "addps %%xmm1, 
%%xmm8\n" + "subps %%xmm1, %%xmm9\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm11\n" + "addps %%xmm3, %%xmm10\n" + "subps %%xmm3, %%xmm11\n" + "movaps %%xmm4, %%xmm12\n" + "movaps %%xmm4, %%xmm13\n" + "addps %%xmm5, %%xmm12\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm6, %%xmm14\n" + "movaps %%xmm6, %%xmm15\n" + "addps %%xmm7, %%xmm14\n" + "subps %%xmm7, %%xmm15\n" + "movaps %%xmm8, %%xmm0\n" + "movaps %%xmm8, %%xmm2\n" + "addps %%xmm10, %%xmm0\n" + "subps %%xmm10, %%xmm2\n" + "movaps %%xmm9, %%xmm1\n" + "movaps %%xmm9, %%xmm3\n" + "addps %%xmm11, %%xmm1\n" + "subps %%xmm11, %%xmm3\n" + "movaps %%xmm12, %%xmm4\n" + "movaps %%xmm12, %%xmm6\n" + "addps %%xmm14, %%xmm4\n" + "subps %%xmm14, %%xmm6\n" + "movaps %%xmm13, %%xmm5\n" + "movaps %%xmm13, %%xmm7\n" + "addps %%xmm15, %%xmm5\n" + "subps %%xmm15, %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm12\n" + "addps %%xmm4, %%xmm8\n" + "subps %%xmm4, %%xmm12\n" + "movaps %%xmm1, %%xmm9\n" + "movaps %%xmm1, %%xmm13\n" + "addps %%xmm5, %%xmm9\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm14\n" + "addps %%xmm6, %%xmm10\n" + "subps %%xmm6, %%xmm14\n" + "movaps %%xmm3, %%xmm11\n" + "movaps %%xmm3, %%xmm15\n" + "addps %%xmm7, %%xmm11\n" + "subps %%xmm7, %%xmm15\n" + "movups %%xmm8, (%0)\n" + "movups %%xmm9, (%1)\n" + "movups %%xmm10, (%2)\n" + "movups %%xmm11, (%3)\n" + "movups %%xmm12, (%4)\n" + "movups %%xmm13, (%5)\n" + "movups %%xmm14, (%6)\n" + "movups %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 1048576), "r"(buf + j + k + 2097152), "r"(buf + j + k + 3145728), "r"(buf + j + k + 4194304), "r"(buf + j + k + 5242880), "r"(buf + j + k + 6291456), "r"(buf + j + k + 7340032) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } + if (depth == 26) { + helper_float_26_recursive(buf + 0, 23); + helper_float_26_recursive(buf + 8388608, 23); + helper_float_26_recursive(buf + 16777216, 23); + helper_float_26_recursive(buf + 25165824, 23); + helper_float_26_recursive(buf + 33554432, 23); + helper_float_26_recursive(buf + 41943040, 23); + helper_float_26_recursive(buf + 50331648, 23); + helper_float_26_recursive(buf + 58720256, 23); + for (int j = 0; j < 67108864; j += 67108864) { + for (int k = 0; k < 8388608; k += 4) { + __asm__ volatile ( + "movups (%0), %%xmm0\n" + "movups (%1), %%xmm1\n" + "movups (%2), %%xmm2\n" + "movups (%3), %%xmm3\n" + "movups (%4), %%xmm4\n" + "movups (%5), %%xmm5\n" + "movups (%6), %%xmm6\n" + "movups (%7), %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm9\n" + "addps %%xmm1, %%xmm8\n" + "subps %%xmm1, %%xmm9\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm11\n" + "addps %%xmm3, %%xmm10\n" + "subps %%xmm3, %%xmm11\n" + "movaps %%xmm4, %%xmm12\n" + "movaps %%xmm4, %%xmm13\n" + "addps %%xmm5, %%xmm12\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm6, %%xmm14\n" + "movaps %%xmm6, %%xmm15\n" + "addps %%xmm7, %%xmm14\n" + "subps %%xmm7, %%xmm15\n" + "movaps %%xmm8, %%xmm0\n" + "movaps %%xmm8, %%xmm2\n" + "addps %%xmm10, %%xmm0\n" + "subps %%xmm10, %%xmm2\n" + "movaps %%xmm9, %%xmm1\n" + "movaps %%xmm9, %%xmm3\n" + "addps %%xmm11, %%xmm1\n" + "subps %%xmm11, %%xmm3\n" + "movaps %%xmm12, %%xmm4\n" + "movaps %%xmm12, %%xmm6\n" + "addps %%xmm14, %%xmm4\n" + "subps %%xmm14, %%xmm6\n" + "movaps %%xmm13, %%xmm5\n" + "movaps %%xmm13, %%xmm7\n" + "addps %%xmm15, %%xmm5\n" + "subps %%xmm15, %%xmm7\n" + "movaps %%xmm0, 
%%xmm8\n" + "movaps %%xmm0, %%xmm12\n" + "addps %%xmm4, %%xmm8\n" + "subps %%xmm4, %%xmm12\n" + "movaps %%xmm1, %%xmm9\n" + "movaps %%xmm1, %%xmm13\n" + "addps %%xmm5, %%xmm9\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm14\n" + "addps %%xmm6, %%xmm10\n" + "subps %%xmm6, %%xmm14\n" + "movaps %%xmm3, %%xmm11\n" + "movaps %%xmm3, %%xmm15\n" + "addps %%xmm7, %%xmm11\n" + "subps %%xmm7, %%xmm15\n" + "movups %%xmm8, (%0)\n" + "movups %%xmm9, (%1)\n" + "movups %%xmm10, (%2)\n" + "movups %%xmm11, (%3)\n" + "movups %%xmm12, (%4)\n" + "movups %%xmm13, (%5)\n" + "movups %%xmm14, (%6)\n" + "movups %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 8388608), "r"(buf + j + k + 16777216), "r"(buf + j + k + 25165824), "r"(buf + j + k + 33554432), "r"(buf + j + k + 41943040), "r"(buf + j + k + 50331648), "r"(buf + j + k + 58720256) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } +} +void helper_float_26(float *buf); +void helper_float_26(float *buf) { + helper_float_26_recursive(buf, 26); +} +void helper_float_27_recursive(float *buf, int depth); +void helper_float_27_recursive(float *buf, int depth) { + if (depth == 12) { + for (int j = 0; j < 4096; j += 32) { + for (int k = 0; k < 4; k += 4) { + __asm__ volatile ( + "movups (%0), %%xmm0\n" + "movups (%1), %%xmm1\n" + "movups (%2), %%xmm2\n" + "movups (%3), %%xmm3\n" + "movups (%4), %%xmm4\n" + "movups (%5), %%xmm5\n" + "movups (%6), %%xmm6\n" + "movups (%7), %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm0, %%xmm0\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm0, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm0\n" + "movaps %%xmm1, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm1, %%xmm1\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm1, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm1\n" + "movaps %%xmm2, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm2, %%xmm2\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm2, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm2\n" + "movaps %%xmm3, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm3, %%xmm3\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm3, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm3\n" + "movaps %%xmm4, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm4, %%xmm4\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm4, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm4\n" + "movaps %%xmm5, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm5, %%xmm5\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm5, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm5\n" + "movaps %%xmm6, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm6, %%xmm6\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm6, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm6\n" + "movaps %%xmm7, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm7, %%xmm7\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm7, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm0, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm0, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + 
"movaps %%xmm10, %%xmm0\n" + "movaps %%xmm1, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm1, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm1, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm1\n" + "movaps %%xmm2, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm2, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm2, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm2\n" + "movaps %%xmm3, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm3, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm3, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm3\n" + "movaps %%xmm4, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm4, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm4, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm4\n" + "movaps %%xmm5, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm5, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm5, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm5\n" + "movaps %%xmm6, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm6, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm6, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm6\n" + "movaps %%xmm7, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm7, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm7, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm9\n" + "addps %%xmm1, %%xmm8\n" + "subps %%xmm1, %%xmm9\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm11\n" + "addps %%xmm3, %%xmm10\n" + "subps %%xmm3, %%xmm11\n" + "movaps %%xmm4, %%xmm12\n" + "movaps %%xmm4, %%xmm13\n" + "addps %%xmm5, %%xmm12\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm6, %%xmm14\n" + "movaps %%xmm6, %%xmm15\n" + "addps %%xmm7, %%xmm14\n" + "subps %%xmm7, %%xmm15\n" + "movaps %%xmm8, %%xmm0\n" + "movaps %%xmm8, %%xmm2\n" + "addps %%xmm10, %%xmm0\n" + "subps %%xmm10, %%xmm2\n" + "movaps %%xmm9, %%xmm1\n" + "movaps %%xmm9, %%xmm3\n" + "addps %%xmm11, %%xmm1\n" + "subps %%xmm11, %%xmm3\n" + "movaps %%xmm12, %%xmm4\n" + "movaps %%xmm12, %%xmm6\n" + "addps %%xmm14, %%xmm4\n" + "subps %%xmm14, %%xmm6\n" + "movaps %%xmm13, %%xmm5\n" + "movaps %%xmm13, %%xmm7\n" + "addps %%xmm15, %%xmm5\n" + "subps %%xmm15, %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm12\n" + "addps %%xmm4, %%xmm8\n" + "subps %%xmm4, %%xmm12\n" + "movaps %%xmm1, %%xmm9\n" + "movaps %%xmm1, %%xmm13\n" + "addps %%xmm5, %%xmm9\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm14\n" + "addps %%xmm6, %%xmm10\n" + "subps %%xmm6, %%xmm14\n" + "movaps %%xmm3, %%xmm11\n" + "movaps %%xmm3, %%xmm15\n" + "addps %%xmm7, %%xmm11\n" + "subps %%xmm7, %%xmm15\n" + "movups %%xmm8, (%0)\n" + "movups %%xmm9, (%1)\n" + "movups %%xmm10, 
(%2)\n" + "movups %%xmm11, (%3)\n" + "movups %%xmm12, (%4)\n" + "movups %%xmm13, (%5)\n" + "movups %%xmm14, (%6)\n" + "movups %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + for (int j = 0; j < 4096; j += 256) { + for (int k = 0; k < 32; k += 4) { + __asm__ volatile ( + "movups (%0), %%xmm0\n" + "movups (%1), %%xmm1\n" + "movups (%2), %%xmm2\n" + "movups (%3), %%xmm3\n" + "movups (%4), %%xmm4\n" + "movups (%5), %%xmm5\n" + "movups (%6), %%xmm6\n" + "movups (%7), %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm9\n" + "addps %%xmm1, %%xmm8\n" + "subps %%xmm1, %%xmm9\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm11\n" + "addps %%xmm3, %%xmm10\n" + "subps %%xmm3, %%xmm11\n" + "movaps %%xmm4, %%xmm12\n" + "movaps %%xmm4, %%xmm13\n" + "addps %%xmm5, %%xmm12\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm6, %%xmm14\n" + "movaps %%xmm6, %%xmm15\n" + "addps %%xmm7, %%xmm14\n" + "subps %%xmm7, %%xmm15\n" + "movaps %%xmm8, %%xmm0\n" + "movaps %%xmm8, %%xmm2\n" + "addps %%xmm10, %%xmm0\n" + "subps %%xmm10, %%xmm2\n" + "movaps %%xmm9, %%xmm1\n" + "movaps %%xmm9, %%xmm3\n" + "addps %%xmm11, %%xmm1\n" + "subps %%xmm11, %%xmm3\n" + "movaps %%xmm12, %%xmm4\n" + "movaps %%xmm12, %%xmm6\n" + "addps %%xmm14, %%xmm4\n" + "subps %%xmm14, %%xmm6\n" + "movaps %%xmm13, %%xmm5\n" + "movaps %%xmm13, %%xmm7\n" + "addps %%xmm15, %%xmm5\n" + "subps %%xmm15, %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm12\n" + "addps %%xmm4, %%xmm8\n" + "subps %%xmm4, %%xmm12\n" + "movaps %%xmm1, %%xmm9\n" + "movaps %%xmm1, %%xmm13\n" + "addps %%xmm5, %%xmm9\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm14\n" + "addps %%xmm6, %%xmm10\n" + "subps %%xmm6, %%xmm14\n" + "movaps %%xmm3, %%xmm11\n" + "movaps %%xmm3, %%xmm15\n" + "addps %%xmm7, %%xmm11\n" + "subps %%xmm7, %%xmm15\n" + "movups %%xmm8, (%0)\n" + "movups %%xmm9, (%1)\n" + "movups %%xmm10, (%2)\n" + "movups %%xmm11, (%3)\n" + "movups %%xmm12, (%4)\n" + "movups %%xmm13, (%5)\n" + "movups %%xmm14, (%6)\n" + "movups %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96), "r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + for (int j = 0; j < 4096; j += 2048) { + for (int k = 0; k < 256; k += 4) { + __asm__ volatile ( + "movups (%0), %%xmm0\n" + "movups (%1), %%xmm1\n" + "movups (%2), %%xmm2\n" + "movups (%3), %%xmm3\n" + "movups (%4), %%xmm4\n" + "movups (%5), %%xmm5\n" + "movups (%6), %%xmm6\n" + "movups (%7), %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm9\n" + "addps %%xmm1, %%xmm8\n" + "subps %%xmm1, %%xmm9\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm11\n" + "addps %%xmm3, %%xmm10\n" + "subps %%xmm3, %%xmm11\n" + "movaps %%xmm4, %%xmm12\n" + "movaps %%xmm4, %%xmm13\n" + "addps %%xmm5, %%xmm12\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm6, %%xmm14\n" + "movaps %%xmm6, %%xmm15\n" + "addps %%xmm7, %%xmm14\n" + "subps %%xmm7, %%xmm15\n" + "movaps %%xmm8, %%xmm0\n" + "movaps %%xmm8, 
%%xmm2\n" + "addps %%xmm10, %%xmm0\n" + "subps %%xmm10, %%xmm2\n" + "movaps %%xmm9, %%xmm1\n" + "movaps %%xmm9, %%xmm3\n" + "addps %%xmm11, %%xmm1\n" + "subps %%xmm11, %%xmm3\n" + "movaps %%xmm12, %%xmm4\n" + "movaps %%xmm12, %%xmm6\n" + "addps %%xmm14, %%xmm4\n" + "subps %%xmm14, %%xmm6\n" + "movaps %%xmm13, %%xmm5\n" + "movaps %%xmm13, %%xmm7\n" + "addps %%xmm15, %%xmm5\n" + "subps %%xmm15, %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm12\n" + "addps %%xmm4, %%xmm8\n" + "subps %%xmm4, %%xmm12\n" + "movaps %%xmm1, %%xmm9\n" + "movaps %%xmm1, %%xmm13\n" + "addps %%xmm5, %%xmm9\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm14\n" + "addps %%xmm6, %%xmm10\n" + "subps %%xmm6, %%xmm14\n" + "movaps %%xmm3, %%xmm11\n" + "movaps %%xmm3, %%xmm15\n" + "addps %%xmm7, %%xmm11\n" + "subps %%xmm7, %%xmm15\n" + "movups %%xmm8, (%0)\n" + "movups %%xmm9, (%1)\n" + "movups %%xmm10, (%2)\n" + "movups %%xmm11, (%3)\n" + "movups %%xmm12, (%4)\n" + "movups %%xmm13, (%5)\n" + "movups %%xmm14, (%6)\n" + "movups %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 256), "r"(buf + j + k + 512), "r"(buf + j + k + 768), "r"(buf + j + k + 1024), "r"(buf + j + k + 1280), "r"(buf + j + k + 1536), "r"(buf + j + k + 1792) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + for (int j = 0; j < 4096; j += 4096) { + for (int k = 0; k < 2048; k += 4) { + __asm__ volatile ( + "movups (%0), %%xmm0\n" + "movups (%1), %%xmm1\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm9\n" + "addps %%xmm1, %%xmm8\n" + "subps %%xmm1, %%xmm9\n" + "movups %%xmm8, (%0)\n" + "movups %%xmm9, (%1)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 2048) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } + if (depth == 15) { + helper_float_27_recursive(buf + 0, 12); + helper_float_27_recursive(buf + 4096, 12); + helper_float_27_recursive(buf + 8192, 12); + helper_float_27_recursive(buf + 12288, 12); + helper_float_27_recursive(buf + 16384, 12); + helper_float_27_recursive(buf + 20480, 12); + helper_float_27_recursive(buf + 24576, 12); + helper_float_27_recursive(buf + 28672, 12); + for (int j = 0; j < 32768; j += 32768) { + for (int k = 0; k < 4096; k += 4) { + __asm__ volatile ( + "movups (%0), %%xmm0\n" + "movups (%1), %%xmm1\n" + "movups (%2), %%xmm2\n" + "movups (%3), %%xmm3\n" + "movups (%4), %%xmm4\n" + "movups (%5), %%xmm5\n" + "movups (%6), %%xmm6\n" + "movups (%7), %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm9\n" + "addps %%xmm1, %%xmm8\n" + "subps %%xmm1, %%xmm9\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm11\n" + "addps %%xmm3, %%xmm10\n" + "subps %%xmm3, %%xmm11\n" + "movaps %%xmm4, %%xmm12\n" + "movaps %%xmm4, %%xmm13\n" + "addps %%xmm5, %%xmm12\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm6, %%xmm14\n" + "movaps %%xmm6, %%xmm15\n" + "addps %%xmm7, %%xmm14\n" + "subps %%xmm7, %%xmm15\n" + "movaps %%xmm8, %%xmm0\n" + "movaps %%xmm8, %%xmm2\n" + "addps %%xmm10, %%xmm0\n" + "subps %%xmm10, %%xmm2\n" + "movaps %%xmm9, %%xmm1\n" + "movaps %%xmm9, %%xmm3\n" + "addps %%xmm11, %%xmm1\n" + "subps %%xmm11, %%xmm3\n" + "movaps %%xmm12, %%xmm4\n" + "movaps %%xmm12, %%xmm6\n" + "addps %%xmm14, %%xmm4\n" + "subps %%xmm14, %%xmm6\n" + "movaps %%xmm13, %%xmm5\n" + "movaps %%xmm13, %%xmm7\n" + "addps 
%%xmm15, %%xmm5\n" + "subps %%xmm15, %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm12\n" + "addps %%xmm4, %%xmm8\n" + "subps %%xmm4, %%xmm12\n" + "movaps %%xmm1, %%xmm9\n" + "movaps %%xmm1, %%xmm13\n" + "addps %%xmm5, %%xmm9\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm14\n" + "addps %%xmm6, %%xmm10\n" + "subps %%xmm6, %%xmm14\n" + "movaps %%xmm3, %%xmm11\n" + "movaps %%xmm3, %%xmm15\n" + "addps %%xmm7, %%xmm11\n" + "subps %%xmm7, %%xmm15\n" + "movups %%xmm8, (%0)\n" + "movups %%xmm9, (%1)\n" + "movups %%xmm10, (%2)\n" + "movups %%xmm11, (%3)\n" + "movups %%xmm12, (%4)\n" + "movups %%xmm13, (%5)\n" + "movups %%xmm14, (%6)\n" + "movups %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 4096), "r"(buf + j + k + 8192), "r"(buf + j + k + 12288), "r"(buf + j + k + 16384), "r"(buf + j + k + 20480), "r"(buf + j + k + 24576), "r"(buf + j + k + 28672) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } + if (depth == 18) { + helper_float_27_recursive(buf + 0, 15); + helper_float_27_recursive(buf + 32768, 15); + helper_float_27_recursive(buf + 65536, 15); + helper_float_27_recursive(buf + 98304, 15); + helper_float_27_recursive(buf + 131072, 15); + helper_float_27_recursive(buf + 163840, 15); + helper_float_27_recursive(buf + 196608, 15); + helper_float_27_recursive(buf + 229376, 15); + for (int j = 0; j < 262144; j += 262144) { + for (int k = 0; k < 32768; k += 4) { + __asm__ volatile ( + "movups (%0), %%xmm0\n" + "movups (%1), %%xmm1\n" + "movups (%2), %%xmm2\n" + "movups (%3), %%xmm3\n" + "movups (%4), %%xmm4\n" + "movups (%5), %%xmm5\n" + "movups (%6), %%xmm6\n" + "movups (%7), %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm9\n" + "addps %%xmm1, %%xmm8\n" + "subps %%xmm1, %%xmm9\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm11\n" + "addps %%xmm3, %%xmm10\n" + "subps %%xmm3, %%xmm11\n" + "movaps %%xmm4, %%xmm12\n" + "movaps %%xmm4, %%xmm13\n" + "addps %%xmm5, %%xmm12\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm6, %%xmm14\n" + "movaps %%xmm6, %%xmm15\n" + "addps %%xmm7, %%xmm14\n" + "subps %%xmm7, %%xmm15\n" + "movaps %%xmm8, %%xmm0\n" + "movaps %%xmm8, %%xmm2\n" + "addps %%xmm10, %%xmm0\n" + "subps %%xmm10, %%xmm2\n" + "movaps %%xmm9, %%xmm1\n" + "movaps %%xmm9, %%xmm3\n" + "addps %%xmm11, %%xmm1\n" + "subps %%xmm11, %%xmm3\n" + "movaps %%xmm12, %%xmm4\n" + "movaps %%xmm12, %%xmm6\n" + "addps %%xmm14, %%xmm4\n" + "subps %%xmm14, %%xmm6\n" + "movaps %%xmm13, %%xmm5\n" + "movaps %%xmm13, %%xmm7\n" + "addps %%xmm15, %%xmm5\n" + "subps %%xmm15, %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm12\n" + "addps %%xmm4, %%xmm8\n" + "subps %%xmm4, %%xmm12\n" + "movaps %%xmm1, %%xmm9\n" + "movaps %%xmm1, %%xmm13\n" + "addps %%xmm5, %%xmm9\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm14\n" + "addps %%xmm6, %%xmm10\n" + "subps %%xmm6, %%xmm14\n" + "movaps %%xmm3, %%xmm11\n" + "movaps %%xmm3, %%xmm15\n" + "addps %%xmm7, %%xmm11\n" + "subps %%xmm7, %%xmm15\n" + "movups %%xmm8, (%0)\n" + "movups %%xmm9, (%1)\n" + "movups %%xmm10, (%2)\n" + "movups %%xmm11, (%3)\n" + "movups %%xmm12, (%4)\n" + "movups %%xmm13, (%5)\n" + "movups %%xmm14, (%6)\n" + "movups %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 32768), "r"(buf + j + k + 65536), "r"(buf + j + k + 98304), "r"(buf + j + k + 131072), "r"(buf + j + k + 163840), "r"(buf + j + k 
+ 196608), "r"(buf + j + k + 229376) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } + if (depth == 21) { + helper_float_27_recursive(buf + 0, 18); + helper_float_27_recursive(buf + 262144, 18); + helper_float_27_recursive(buf + 524288, 18); + helper_float_27_recursive(buf + 786432, 18); + helper_float_27_recursive(buf + 1048576, 18); + helper_float_27_recursive(buf + 1310720, 18); + helper_float_27_recursive(buf + 1572864, 18); + helper_float_27_recursive(buf + 1835008, 18); + for (int j = 0; j < 2097152; j += 2097152) { + for (int k = 0; k < 262144; k += 4) { + __asm__ volatile ( + "movups (%0), %%xmm0\n" + "movups (%1), %%xmm1\n" + "movups (%2), %%xmm2\n" + "movups (%3), %%xmm3\n" + "movups (%4), %%xmm4\n" + "movups (%5), %%xmm5\n" + "movups (%6), %%xmm6\n" + "movups (%7), %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm9\n" + "addps %%xmm1, %%xmm8\n" + "subps %%xmm1, %%xmm9\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm11\n" + "addps %%xmm3, %%xmm10\n" + "subps %%xmm3, %%xmm11\n" + "movaps %%xmm4, %%xmm12\n" + "movaps %%xmm4, %%xmm13\n" + "addps %%xmm5, %%xmm12\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm6, %%xmm14\n" + "movaps %%xmm6, %%xmm15\n" + "addps %%xmm7, %%xmm14\n" + "subps %%xmm7, %%xmm15\n" + "movaps %%xmm8, %%xmm0\n" + "movaps %%xmm8, %%xmm2\n" + "addps %%xmm10, %%xmm0\n" + "subps %%xmm10, %%xmm2\n" + "movaps %%xmm9, %%xmm1\n" + "movaps %%xmm9, %%xmm3\n" + "addps %%xmm11, %%xmm1\n" + "subps %%xmm11, %%xmm3\n" + "movaps %%xmm12, %%xmm4\n" + "movaps %%xmm12, %%xmm6\n" + "addps %%xmm14, %%xmm4\n" + "subps %%xmm14, %%xmm6\n" + "movaps %%xmm13, %%xmm5\n" + "movaps %%xmm13, %%xmm7\n" + "addps %%xmm15, %%xmm5\n" + "subps %%xmm15, %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm12\n" + "addps %%xmm4, %%xmm8\n" + "subps %%xmm4, %%xmm12\n" + "movaps %%xmm1, %%xmm9\n" + "movaps %%xmm1, %%xmm13\n" + "addps %%xmm5, %%xmm9\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm14\n" + "addps %%xmm6, %%xmm10\n" + "subps %%xmm6, %%xmm14\n" + "movaps %%xmm3, %%xmm11\n" + "movaps %%xmm3, %%xmm15\n" + "addps %%xmm7, %%xmm11\n" + "subps %%xmm7, %%xmm15\n" + "movups %%xmm8, (%0)\n" + "movups %%xmm9, (%1)\n" + "movups %%xmm10, (%2)\n" + "movups %%xmm11, (%3)\n" + "movups %%xmm12, (%4)\n" + "movups %%xmm13, (%5)\n" + "movups %%xmm14, (%6)\n" + "movups %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 262144), "r"(buf + j + k + 524288), "r"(buf + j + k + 786432), "r"(buf + j + k + 1048576), "r"(buf + j + k + 1310720), "r"(buf + j + k + 1572864), "r"(buf + j + k + 1835008) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } + if (depth == 24) { + helper_float_27_recursive(buf + 0, 21); + helper_float_27_recursive(buf + 2097152, 21); + helper_float_27_recursive(buf + 4194304, 21); + helper_float_27_recursive(buf + 6291456, 21); + helper_float_27_recursive(buf + 8388608, 21); + helper_float_27_recursive(buf + 10485760, 21); + helper_float_27_recursive(buf + 12582912, 21); + helper_float_27_recursive(buf + 14680064, 21); + for (int j = 0; j < 16777216; j += 16777216) { + for (int k = 0; k < 2097152; k += 4) { + __asm__ volatile ( + "movups (%0), %%xmm0\n" + "movups (%1), %%xmm1\n" + "movups (%2), %%xmm2\n" + "movups (%3), %%xmm3\n" + "movups (%4), 
%%xmm4\n" + "movups (%5), %%xmm5\n" + "movups (%6), %%xmm6\n" + "movups (%7), %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm9\n" + "addps %%xmm1, %%xmm8\n" + "subps %%xmm1, %%xmm9\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm11\n" + "addps %%xmm3, %%xmm10\n" + "subps %%xmm3, %%xmm11\n" + "movaps %%xmm4, %%xmm12\n" + "movaps %%xmm4, %%xmm13\n" + "addps %%xmm5, %%xmm12\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm6, %%xmm14\n" + "movaps %%xmm6, %%xmm15\n" + "addps %%xmm7, %%xmm14\n" + "subps %%xmm7, %%xmm15\n" + "movaps %%xmm8, %%xmm0\n" + "movaps %%xmm8, %%xmm2\n" + "addps %%xmm10, %%xmm0\n" + "subps %%xmm10, %%xmm2\n" + "movaps %%xmm9, %%xmm1\n" + "movaps %%xmm9, %%xmm3\n" + "addps %%xmm11, %%xmm1\n" + "subps %%xmm11, %%xmm3\n" + "movaps %%xmm12, %%xmm4\n" + "movaps %%xmm12, %%xmm6\n" + "addps %%xmm14, %%xmm4\n" + "subps %%xmm14, %%xmm6\n" + "movaps %%xmm13, %%xmm5\n" + "movaps %%xmm13, %%xmm7\n" + "addps %%xmm15, %%xmm5\n" + "subps %%xmm15, %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm12\n" + "addps %%xmm4, %%xmm8\n" + "subps %%xmm4, %%xmm12\n" + "movaps %%xmm1, %%xmm9\n" + "movaps %%xmm1, %%xmm13\n" + "addps %%xmm5, %%xmm9\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm14\n" + "addps %%xmm6, %%xmm10\n" + "subps %%xmm6, %%xmm14\n" + "movaps %%xmm3, %%xmm11\n" + "movaps %%xmm3, %%xmm15\n" + "addps %%xmm7, %%xmm11\n" + "subps %%xmm7, %%xmm15\n" + "movups %%xmm8, (%0)\n" + "movups %%xmm9, (%1)\n" + "movups %%xmm10, (%2)\n" + "movups %%xmm11, (%3)\n" + "movups %%xmm12, (%4)\n" + "movups %%xmm13, (%5)\n" + "movups %%xmm14, (%6)\n" + "movups %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 2097152), "r"(buf + j + k + 4194304), "r"(buf + j + k + 6291456), "r"(buf + j + k + 8388608), "r"(buf + j + k + 10485760), "r"(buf + j + k + 12582912), "r"(buf + j + k + 14680064) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } + if (depth == 27) { + helper_float_27_recursive(buf + 0, 24); + helper_float_27_recursive(buf + 16777216, 24); + helper_float_27_recursive(buf + 33554432, 24); + helper_float_27_recursive(buf + 50331648, 24); + helper_float_27_recursive(buf + 67108864, 24); + helper_float_27_recursive(buf + 83886080, 24); + helper_float_27_recursive(buf + 100663296, 24); + helper_float_27_recursive(buf + 117440512, 24); + for (int j = 0; j < 134217728; j += 134217728) { + for (int k = 0; k < 16777216; k += 4) { + __asm__ volatile ( + "movups (%0), %%xmm0\n" + "movups (%1), %%xmm1\n" + "movups (%2), %%xmm2\n" + "movups (%3), %%xmm3\n" + "movups (%4), %%xmm4\n" + "movups (%5), %%xmm5\n" + "movups (%6), %%xmm6\n" + "movups (%7), %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm9\n" + "addps %%xmm1, %%xmm8\n" + "subps %%xmm1, %%xmm9\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm11\n" + "addps %%xmm3, %%xmm10\n" + "subps %%xmm3, %%xmm11\n" + "movaps %%xmm4, %%xmm12\n" + "movaps %%xmm4, %%xmm13\n" + "addps %%xmm5, %%xmm12\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm6, %%xmm14\n" + "movaps %%xmm6, %%xmm15\n" + "addps %%xmm7, %%xmm14\n" + "subps %%xmm7, %%xmm15\n" + "movaps %%xmm8, %%xmm0\n" + "movaps %%xmm8, %%xmm2\n" + "addps %%xmm10, %%xmm0\n" + "subps %%xmm10, %%xmm2\n" + "movaps %%xmm9, %%xmm1\n" + "movaps %%xmm9, %%xmm3\n" + "addps %%xmm11, %%xmm1\n" + "subps %%xmm11, %%xmm3\n" + "movaps %%xmm12, %%xmm4\n" + "movaps %%xmm12, %%xmm6\n" + "addps %%xmm14, 
%%xmm4\n" + "subps %%xmm14, %%xmm6\n" + "movaps %%xmm13, %%xmm5\n" + "movaps %%xmm13, %%xmm7\n" + "addps %%xmm15, %%xmm5\n" + "subps %%xmm15, %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm12\n" + "addps %%xmm4, %%xmm8\n" + "subps %%xmm4, %%xmm12\n" + "movaps %%xmm1, %%xmm9\n" + "movaps %%xmm1, %%xmm13\n" + "addps %%xmm5, %%xmm9\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm14\n" + "addps %%xmm6, %%xmm10\n" + "subps %%xmm6, %%xmm14\n" + "movaps %%xmm3, %%xmm11\n" + "movaps %%xmm3, %%xmm15\n" + "addps %%xmm7, %%xmm11\n" + "subps %%xmm7, %%xmm15\n" + "movups %%xmm8, (%0)\n" + "movups %%xmm9, (%1)\n" + "movups %%xmm10, (%2)\n" + "movups %%xmm11, (%3)\n" + "movups %%xmm12, (%4)\n" + "movups %%xmm13, (%5)\n" + "movups %%xmm14, (%6)\n" + "movups %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 16777216), "r"(buf + j + k + 33554432), "r"(buf + j + k + 50331648), "r"(buf + j + k + 67108864), "r"(buf + j + k + 83886080), "r"(buf + j + k + 100663296), "r"(buf + j + k + 117440512) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } +} +void helper_float_27(float *buf); +void helper_float_27(float *buf) { + helper_float_27_recursive(buf, 27); +} +void helper_float_28_recursive(float *buf, int depth); +void helper_float_28_recursive(float *buf, int depth) { + if (depth == 16) { + for (int j = 0; j < 65536; j += 32) { + for (int k = 0; k < 4; k += 4) { + __asm__ volatile ( + "movups (%0), %%xmm0\n" + "movups (%1), %%xmm1\n" + "movups (%2), %%xmm2\n" + "movups (%3), %%xmm3\n" + "movups (%4), %%xmm4\n" + "movups (%5), %%xmm5\n" + "movups (%6), %%xmm6\n" + "movups (%7), %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm0, %%xmm0\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm0, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm0\n" + "movaps %%xmm1, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm1, %%xmm1\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm1, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm1\n" + "movaps %%xmm2, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm2, %%xmm2\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm2, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm2\n" + "movaps %%xmm3, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm3, %%xmm3\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm3, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm3\n" + "movaps %%xmm4, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm4, %%xmm4\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm4, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm4\n" + "movaps %%xmm5, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm5, %%xmm5\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm5, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm5\n" + "movaps %%xmm6, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm6, %%xmm6\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm6, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm6\n" + "movaps %%xmm7, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm7, %%xmm7\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm7, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps 
%%xmm0, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm0, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm0\n" + "movaps %%xmm1, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm1, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm1, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm1\n" + "movaps %%xmm2, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm2, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm2, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm2\n" + "movaps %%xmm3, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm3, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm3, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm3\n" + "movaps %%xmm4, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm4, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm4, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm4\n" + "movaps %%xmm5, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm5, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm5, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm5\n" + "movaps %%xmm6, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm6, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm6, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm6\n" + "movaps %%xmm7, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm7, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm7, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm9\n" + "addps %%xmm1, %%xmm8\n" + "subps %%xmm1, %%xmm9\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm11\n" + "addps %%xmm3, %%xmm10\n" + "subps %%xmm3, %%xmm11\n" + "movaps %%xmm4, %%xmm12\n" + "movaps %%xmm4, %%xmm13\n" + "addps %%xmm5, %%xmm12\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm6, %%xmm14\n" + "movaps %%xmm6, %%xmm15\n" + "addps %%xmm7, %%xmm14\n" + "subps %%xmm7, %%xmm15\n" + "movaps %%xmm8, %%xmm0\n" + "movaps %%xmm8, %%xmm2\n" + "addps %%xmm10, %%xmm0\n" + "subps %%xmm10, %%xmm2\n" + "movaps %%xmm9, %%xmm1\n" + "movaps %%xmm9, %%xmm3\n" + "addps %%xmm11, %%xmm1\n" + "subps %%xmm11, %%xmm3\n" + "movaps %%xmm12, %%xmm4\n" + "movaps %%xmm12, %%xmm6\n" + "addps %%xmm14, %%xmm4\n" + "subps %%xmm14, %%xmm6\n" + "movaps %%xmm13, %%xmm5\n" + "movaps %%xmm13, %%xmm7\n" + "addps %%xmm15, %%xmm5\n" + "subps %%xmm15, %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm12\n" + "addps %%xmm4, %%xmm8\n" + "subps %%xmm4, %%xmm12\n" + "movaps %%xmm1, %%xmm9\n" + "movaps %%xmm1, %%xmm13\n" + "addps %%xmm5, %%xmm9\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm14\n" + "addps %%xmm6, %%xmm10\n" + "subps %%xmm6, %%xmm14\n" + "movaps 
%%xmm3, %%xmm11\n" + "movaps %%xmm3, %%xmm15\n" + "addps %%xmm7, %%xmm11\n" + "subps %%xmm7, %%xmm15\n" + "movups %%xmm8, (%0)\n" + "movups %%xmm9, (%1)\n" + "movups %%xmm10, (%2)\n" + "movups %%xmm11, (%3)\n" + "movups %%xmm12, (%4)\n" + "movups %%xmm13, (%5)\n" + "movups %%xmm14, (%6)\n" + "movups %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + for (int j = 0; j < 65536; j += 256) { + for (int k = 0; k < 32; k += 4) { + __asm__ volatile ( + "movups (%0), %%xmm0\n" + "movups (%1), %%xmm1\n" + "movups (%2), %%xmm2\n" + "movups (%3), %%xmm3\n" + "movups (%4), %%xmm4\n" + "movups (%5), %%xmm5\n" + "movups (%6), %%xmm6\n" + "movups (%7), %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm9\n" + "addps %%xmm1, %%xmm8\n" + "subps %%xmm1, %%xmm9\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm11\n" + "addps %%xmm3, %%xmm10\n" + "subps %%xmm3, %%xmm11\n" + "movaps %%xmm4, %%xmm12\n" + "movaps %%xmm4, %%xmm13\n" + "addps %%xmm5, %%xmm12\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm6, %%xmm14\n" + "movaps %%xmm6, %%xmm15\n" + "addps %%xmm7, %%xmm14\n" + "subps %%xmm7, %%xmm15\n" + "movaps %%xmm8, %%xmm0\n" + "movaps %%xmm8, %%xmm2\n" + "addps %%xmm10, %%xmm0\n" + "subps %%xmm10, %%xmm2\n" + "movaps %%xmm9, %%xmm1\n" + "movaps %%xmm9, %%xmm3\n" + "addps %%xmm11, %%xmm1\n" + "subps %%xmm11, %%xmm3\n" + "movaps %%xmm12, %%xmm4\n" + "movaps %%xmm12, %%xmm6\n" + "addps %%xmm14, %%xmm4\n" + "subps %%xmm14, %%xmm6\n" + "movaps %%xmm13, %%xmm5\n" + "movaps %%xmm13, %%xmm7\n" + "addps %%xmm15, %%xmm5\n" + "subps %%xmm15, %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm12\n" + "addps %%xmm4, %%xmm8\n" + "subps %%xmm4, %%xmm12\n" + "movaps %%xmm1, %%xmm9\n" + "movaps %%xmm1, %%xmm13\n" + "addps %%xmm5, %%xmm9\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm14\n" + "addps %%xmm6, %%xmm10\n" + "subps %%xmm6, %%xmm14\n" + "movaps %%xmm3, %%xmm11\n" + "movaps %%xmm3, %%xmm15\n" + "addps %%xmm7, %%xmm11\n" + "subps %%xmm7, %%xmm15\n" + "movups %%xmm8, (%0)\n" + "movups %%xmm9, (%1)\n" + "movups %%xmm10, (%2)\n" + "movups %%xmm11, (%3)\n" + "movups %%xmm12, (%4)\n" + "movups %%xmm13, (%5)\n" + "movups %%xmm14, (%6)\n" + "movups %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96), "r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + for (int j = 0; j < 65536; j += 2048) { + for (int k = 0; k < 256; k += 4) { + __asm__ volatile ( + "movups (%0), %%xmm0\n" + "movups (%1), %%xmm1\n" + "movups (%2), %%xmm2\n" + "movups (%3), %%xmm3\n" + "movups (%4), %%xmm4\n" + "movups (%5), %%xmm5\n" + "movups (%6), %%xmm6\n" + "movups (%7), %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm9\n" + "addps %%xmm1, %%xmm8\n" + "subps %%xmm1, %%xmm9\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm11\n" + "addps %%xmm3, %%xmm10\n" + "subps %%xmm3, %%xmm11\n" + "movaps %%xmm4, %%xmm12\n" + "movaps %%xmm4, %%xmm13\n" + "addps %%xmm5, %%xmm12\n" + "subps %%xmm5, 
%%xmm13\n" + "movaps %%xmm6, %%xmm14\n" + "movaps %%xmm6, %%xmm15\n" + "addps %%xmm7, %%xmm14\n" + "subps %%xmm7, %%xmm15\n" + "movaps %%xmm8, %%xmm0\n" + "movaps %%xmm8, %%xmm2\n" + "addps %%xmm10, %%xmm0\n" + "subps %%xmm10, %%xmm2\n" + "movaps %%xmm9, %%xmm1\n" + "movaps %%xmm9, %%xmm3\n" + "addps %%xmm11, %%xmm1\n" + "subps %%xmm11, %%xmm3\n" + "movaps %%xmm12, %%xmm4\n" + "movaps %%xmm12, %%xmm6\n" + "addps %%xmm14, %%xmm4\n" + "subps %%xmm14, %%xmm6\n" + "movaps %%xmm13, %%xmm5\n" + "movaps %%xmm13, %%xmm7\n" + "addps %%xmm15, %%xmm5\n" + "subps %%xmm15, %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm12\n" + "addps %%xmm4, %%xmm8\n" + "subps %%xmm4, %%xmm12\n" + "movaps %%xmm1, %%xmm9\n" + "movaps %%xmm1, %%xmm13\n" + "addps %%xmm5, %%xmm9\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm14\n" + "addps %%xmm6, %%xmm10\n" + "subps %%xmm6, %%xmm14\n" + "movaps %%xmm3, %%xmm11\n" + "movaps %%xmm3, %%xmm15\n" + "addps %%xmm7, %%xmm11\n" + "subps %%xmm7, %%xmm15\n" + "movups %%xmm8, (%0)\n" + "movups %%xmm9, (%1)\n" + "movups %%xmm10, (%2)\n" + "movups %%xmm11, (%3)\n" + "movups %%xmm12, (%4)\n" + "movups %%xmm13, (%5)\n" + "movups %%xmm14, (%6)\n" + "movups %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 256), "r"(buf + j + k + 512), "r"(buf + j + k + 768), "r"(buf + j + k + 1024), "r"(buf + j + k + 1280), "r"(buf + j + k + 1536), "r"(buf + j + k + 1792) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + for (int j = 0; j < 65536; j += 16384) { + for (int k = 0; k < 2048; k += 4) { + __asm__ volatile ( + "movups (%0), %%xmm0\n" + "movups (%1), %%xmm1\n" + "movups (%2), %%xmm2\n" + "movups (%3), %%xmm3\n" + "movups (%4), %%xmm4\n" + "movups (%5), %%xmm5\n" + "movups (%6), %%xmm6\n" + "movups (%7), %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm9\n" + "addps %%xmm1, %%xmm8\n" + "subps %%xmm1, %%xmm9\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm11\n" + "addps %%xmm3, %%xmm10\n" + "subps %%xmm3, %%xmm11\n" + "movaps %%xmm4, %%xmm12\n" + "movaps %%xmm4, %%xmm13\n" + "addps %%xmm5, %%xmm12\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm6, %%xmm14\n" + "movaps %%xmm6, %%xmm15\n" + "addps %%xmm7, %%xmm14\n" + "subps %%xmm7, %%xmm15\n" + "movaps %%xmm8, %%xmm0\n" + "movaps %%xmm8, %%xmm2\n" + "addps %%xmm10, %%xmm0\n" + "subps %%xmm10, %%xmm2\n" + "movaps %%xmm9, %%xmm1\n" + "movaps %%xmm9, %%xmm3\n" + "addps %%xmm11, %%xmm1\n" + "subps %%xmm11, %%xmm3\n" + "movaps %%xmm12, %%xmm4\n" + "movaps %%xmm12, %%xmm6\n" + "addps %%xmm14, %%xmm4\n" + "subps %%xmm14, %%xmm6\n" + "movaps %%xmm13, %%xmm5\n" + "movaps %%xmm13, %%xmm7\n" + "addps %%xmm15, %%xmm5\n" + "subps %%xmm15, %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm12\n" + "addps %%xmm4, %%xmm8\n" + "subps %%xmm4, %%xmm12\n" + "movaps %%xmm1, %%xmm9\n" + "movaps %%xmm1, %%xmm13\n" + "addps %%xmm5, %%xmm9\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm14\n" + "addps %%xmm6, %%xmm10\n" + "subps %%xmm6, %%xmm14\n" + "movaps %%xmm3, %%xmm11\n" + "movaps %%xmm3, %%xmm15\n" + "addps %%xmm7, %%xmm11\n" + "subps %%xmm7, %%xmm15\n" + "movups %%xmm8, (%0)\n" + "movups %%xmm9, (%1)\n" + "movups %%xmm10, (%2)\n" + "movups %%xmm11, (%3)\n" + "movups %%xmm12, (%4)\n" + "movups %%xmm13, (%5)\n" + "movups %%xmm14, (%6)\n" + "movups %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 2048), "r"(buf 
+ j + k + 4096), "r"(buf + j + k + 6144), "r"(buf + j + k + 8192), "r"(buf + j + k + 10240), "r"(buf + j + k + 12288), "r"(buf + j + k + 14336) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + for (int j = 0; j < 65536; j += 65536) { + for (int k = 0; k < 16384; k += 4) { + __asm__ volatile ( + "movups (%0), %%xmm0\n" + "movups (%1), %%xmm1\n" + "movups (%2), %%xmm2\n" + "movups (%3), %%xmm3\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm9\n" + "addps %%xmm1, %%xmm8\n" + "subps %%xmm1, %%xmm9\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm11\n" + "addps %%xmm3, %%xmm10\n" + "subps %%xmm3, %%xmm11\n" + "movaps %%xmm8, %%xmm0\n" + "movaps %%xmm8, %%xmm2\n" + "addps %%xmm10, %%xmm0\n" + "subps %%xmm10, %%xmm2\n" + "movaps %%xmm9, %%xmm1\n" + "movaps %%xmm9, %%xmm3\n" + "addps %%xmm11, %%xmm1\n" + "subps %%xmm11, %%xmm3\n" + "movups %%xmm0, (%0)\n" + "movups %%xmm1, (%1)\n" + "movups %%xmm2, (%2)\n" + "movups %%xmm3, (%3)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 16384), "r"(buf + j + k + 32768), "r"(buf + j + k + 49152) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } + if (depth == 19) { + helper_float_28_recursive(buf + 0, 16); + helper_float_28_recursive(buf + 65536, 16); + helper_float_28_recursive(buf + 131072, 16); + helper_float_28_recursive(buf + 196608, 16); + helper_float_28_recursive(buf + 262144, 16); + helper_float_28_recursive(buf + 327680, 16); + helper_float_28_recursive(buf + 393216, 16); + helper_float_28_recursive(buf + 458752, 16); + for (int j = 0; j < 524288; j += 524288) { + for (int k = 0; k < 65536; k += 4) { + __asm__ volatile ( + "movups (%0), %%xmm0\n" + "movups (%1), %%xmm1\n" + "movups (%2), %%xmm2\n" + "movups (%3), %%xmm3\n" + "movups (%4), %%xmm4\n" + "movups (%5), %%xmm5\n" + "movups (%6), %%xmm6\n" + "movups (%7), %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm9\n" + "addps %%xmm1, %%xmm8\n" + "subps %%xmm1, %%xmm9\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm11\n" + "addps %%xmm3, %%xmm10\n" + "subps %%xmm3, %%xmm11\n" + "movaps %%xmm4, %%xmm12\n" + "movaps %%xmm4, %%xmm13\n" + "addps %%xmm5, %%xmm12\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm6, %%xmm14\n" + "movaps %%xmm6, %%xmm15\n" + "addps %%xmm7, %%xmm14\n" + "subps %%xmm7, %%xmm15\n" + "movaps %%xmm8, %%xmm0\n" + "movaps %%xmm8, %%xmm2\n" + "addps %%xmm10, %%xmm0\n" + "subps %%xmm10, %%xmm2\n" + "movaps %%xmm9, %%xmm1\n" + "movaps %%xmm9, %%xmm3\n" + "addps %%xmm11, %%xmm1\n" + "subps %%xmm11, %%xmm3\n" + "movaps %%xmm12, %%xmm4\n" + "movaps %%xmm12, %%xmm6\n" + "addps %%xmm14, %%xmm4\n" + "subps %%xmm14, %%xmm6\n" + "movaps %%xmm13, %%xmm5\n" + "movaps %%xmm13, %%xmm7\n" + "addps %%xmm15, %%xmm5\n" + "subps %%xmm15, %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm12\n" + "addps %%xmm4, %%xmm8\n" + "subps %%xmm4, %%xmm12\n" + "movaps %%xmm1, %%xmm9\n" + "movaps %%xmm1, %%xmm13\n" + "addps %%xmm5, %%xmm9\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm14\n" + "addps %%xmm6, %%xmm10\n" + "subps %%xmm6, %%xmm14\n" + "movaps %%xmm3, %%xmm11\n" + "movaps %%xmm3, %%xmm15\n" + "addps %%xmm7, %%xmm11\n" + "subps %%xmm7, %%xmm15\n" + "movups %%xmm8, (%0)\n" + "movups %%xmm9, (%1)\n" + "movups %%xmm10, (%2)\n" + "movups %%xmm11, (%3)\n" + "movups %%xmm12, 
(%4)\n" + "movups %%xmm13, (%5)\n" + "movups %%xmm14, (%6)\n" + "movups %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 65536), "r"(buf + j + k + 131072), "r"(buf + j + k + 196608), "r"(buf + j + k + 262144), "r"(buf + j + k + 327680), "r"(buf + j + k + 393216), "r"(buf + j + k + 458752) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } + if (depth == 22) { + helper_float_28_recursive(buf + 0, 19); + helper_float_28_recursive(buf + 524288, 19); + helper_float_28_recursive(buf + 1048576, 19); + helper_float_28_recursive(buf + 1572864, 19); + helper_float_28_recursive(buf + 2097152, 19); + helper_float_28_recursive(buf + 2621440, 19); + helper_float_28_recursive(buf + 3145728, 19); + helper_float_28_recursive(buf + 3670016, 19); + for (int j = 0; j < 4194304; j += 4194304) { + for (int k = 0; k < 524288; k += 4) { + __asm__ volatile ( + "movups (%0), %%xmm0\n" + "movups (%1), %%xmm1\n" + "movups (%2), %%xmm2\n" + "movups (%3), %%xmm3\n" + "movups (%4), %%xmm4\n" + "movups (%5), %%xmm5\n" + "movups (%6), %%xmm6\n" + "movups (%7), %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm9\n" + "addps %%xmm1, %%xmm8\n" + "subps %%xmm1, %%xmm9\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm11\n" + "addps %%xmm3, %%xmm10\n" + "subps %%xmm3, %%xmm11\n" + "movaps %%xmm4, %%xmm12\n" + "movaps %%xmm4, %%xmm13\n" + "addps %%xmm5, %%xmm12\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm6, %%xmm14\n" + "movaps %%xmm6, %%xmm15\n" + "addps %%xmm7, %%xmm14\n" + "subps %%xmm7, %%xmm15\n" + "movaps %%xmm8, %%xmm0\n" + "movaps %%xmm8, %%xmm2\n" + "addps %%xmm10, %%xmm0\n" + "subps %%xmm10, %%xmm2\n" + "movaps %%xmm9, %%xmm1\n" + "movaps %%xmm9, %%xmm3\n" + "addps %%xmm11, %%xmm1\n" + "subps %%xmm11, %%xmm3\n" + "movaps %%xmm12, %%xmm4\n" + "movaps %%xmm12, %%xmm6\n" + "addps %%xmm14, %%xmm4\n" + "subps %%xmm14, %%xmm6\n" + "movaps %%xmm13, %%xmm5\n" + "movaps %%xmm13, %%xmm7\n" + "addps %%xmm15, %%xmm5\n" + "subps %%xmm15, %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm12\n" + "addps %%xmm4, %%xmm8\n" + "subps %%xmm4, %%xmm12\n" + "movaps %%xmm1, %%xmm9\n" + "movaps %%xmm1, %%xmm13\n" + "addps %%xmm5, %%xmm9\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm14\n" + "addps %%xmm6, %%xmm10\n" + "subps %%xmm6, %%xmm14\n" + "movaps %%xmm3, %%xmm11\n" + "movaps %%xmm3, %%xmm15\n" + "addps %%xmm7, %%xmm11\n" + "subps %%xmm7, %%xmm15\n" + "movups %%xmm8, (%0)\n" + "movups %%xmm9, (%1)\n" + "movups %%xmm10, (%2)\n" + "movups %%xmm11, (%3)\n" + "movups %%xmm12, (%4)\n" + "movups %%xmm13, (%5)\n" + "movups %%xmm14, (%6)\n" + "movups %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 524288), "r"(buf + j + k + 1048576), "r"(buf + j + k + 1572864), "r"(buf + j + k + 2097152), "r"(buf + j + k + 2621440), "r"(buf + j + k + 3145728), "r"(buf + j + k + 3670016) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } + if (depth == 25) { + helper_float_28_recursive(buf + 0, 22); + helper_float_28_recursive(buf + 4194304, 22); + helper_float_28_recursive(buf + 8388608, 22); + helper_float_28_recursive(buf + 12582912, 22); + helper_float_28_recursive(buf + 16777216, 22); + helper_float_28_recursive(buf + 20971520, 22); + helper_float_28_recursive(buf + 25165824, 22); + 
helper_float_28_recursive(buf + 29360128, 22); + for (int j = 0; j < 33554432; j += 33554432) { + for (int k = 0; k < 4194304; k += 4) { + __asm__ volatile ( + "movups (%0), %%xmm0\n" + "movups (%1), %%xmm1\n" + "movups (%2), %%xmm2\n" + "movups (%3), %%xmm3\n" + "movups (%4), %%xmm4\n" + "movups (%5), %%xmm5\n" + "movups (%6), %%xmm6\n" + "movups (%7), %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm9\n" + "addps %%xmm1, %%xmm8\n" + "subps %%xmm1, %%xmm9\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm11\n" + "addps %%xmm3, %%xmm10\n" + "subps %%xmm3, %%xmm11\n" + "movaps %%xmm4, %%xmm12\n" + "movaps %%xmm4, %%xmm13\n" + "addps %%xmm5, %%xmm12\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm6, %%xmm14\n" + "movaps %%xmm6, %%xmm15\n" + "addps %%xmm7, %%xmm14\n" + "subps %%xmm7, %%xmm15\n" + "movaps %%xmm8, %%xmm0\n" + "movaps %%xmm8, %%xmm2\n" + "addps %%xmm10, %%xmm0\n" + "subps %%xmm10, %%xmm2\n" + "movaps %%xmm9, %%xmm1\n" + "movaps %%xmm9, %%xmm3\n" + "addps %%xmm11, %%xmm1\n" + "subps %%xmm11, %%xmm3\n" + "movaps %%xmm12, %%xmm4\n" + "movaps %%xmm12, %%xmm6\n" + "addps %%xmm14, %%xmm4\n" + "subps %%xmm14, %%xmm6\n" + "movaps %%xmm13, %%xmm5\n" + "movaps %%xmm13, %%xmm7\n" + "addps %%xmm15, %%xmm5\n" + "subps %%xmm15, %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm12\n" + "addps %%xmm4, %%xmm8\n" + "subps %%xmm4, %%xmm12\n" + "movaps %%xmm1, %%xmm9\n" + "movaps %%xmm1, %%xmm13\n" + "addps %%xmm5, %%xmm9\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm14\n" + "addps %%xmm6, %%xmm10\n" + "subps %%xmm6, %%xmm14\n" + "movaps %%xmm3, %%xmm11\n" + "movaps %%xmm3, %%xmm15\n" + "addps %%xmm7, %%xmm11\n" + "subps %%xmm7, %%xmm15\n" + "movups %%xmm8, (%0)\n" + "movups %%xmm9, (%1)\n" + "movups %%xmm10, (%2)\n" + "movups %%xmm11, (%3)\n" + "movups %%xmm12, (%4)\n" + "movups %%xmm13, (%5)\n" + "movups %%xmm14, (%6)\n" + "movups %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 4194304), "r"(buf + j + k + 8388608), "r"(buf + j + k + 12582912), "r"(buf + j + k + 16777216), "r"(buf + j + k + 20971520), "r"(buf + j + k + 25165824), "r"(buf + j + k + 29360128) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } + if (depth == 28) { + helper_float_28_recursive(buf + 0, 25); + helper_float_28_recursive(buf + 33554432, 25); + helper_float_28_recursive(buf + 67108864, 25); + helper_float_28_recursive(buf + 100663296, 25); + helper_float_28_recursive(buf + 134217728, 25); + helper_float_28_recursive(buf + 167772160, 25); + helper_float_28_recursive(buf + 201326592, 25); + helper_float_28_recursive(buf + 234881024, 25); + for (int j = 0; j < 268435456; j += 268435456) { + for (int k = 0; k < 33554432; k += 4) { + __asm__ volatile ( + "movups (%0), %%xmm0\n" + "movups (%1), %%xmm1\n" + "movups (%2), %%xmm2\n" + "movups (%3), %%xmm3\n" + "movups (%4), %%xmm4\n" + "movups (%5), %%xmm5\n" + "movups (%6), %%xmm6\n" + "movups (%7), %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm9\n" + "addps %%xmm1, %%xmm8\n" + "subps %%xmm1, %%xmm9\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm11\n" + "addps %%xmm3, %%xmm10\n" + "subps %%xmm3, %%xmm11\n" + "movaps %%xmm4, %%xmm12\n" + "movaps %%xmm4, %%xmm13\n" + "addps %%xmm5, %%xmm12\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm6, %%xmm14\n" + "movaps %%xmm6, %%xmm15\n" + "addps %%xmm7, %%xmm14\n" + "subps %%xmm7, %%xmm15\n" + "movaps %%xmm8, 
%%xmm0\n" + "movaps %%xmm8, %%xmm2\n" + "addps %%xmm10, %%xmm0\n" + "subps %%xmm10, %%xmm2\n" + "movaps %%xmm9, %%xmm1\n" + "movaps %%xmm9, %%xmm3\n" + "addps %%xmm11, %%xmm1\n" + "subps %%xmm11, %%xmm3\n" + "movaps %%xmm12, %%xmm4\n" + "movaps %%xmm12, %%xmm6\n" + "addps %%xmm14, %%xmm4\n" + "subps %%xmm14, %%xmm6\n" + "movaps %%xmm13, %%xmm5\n" + "movaps %%xmm13, %%xmm7\n" + "addps %%xmm15, %%xmm5\n" + "subps %%xmm15, %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm12\n" + "addps %%xmm4, %%xmm8\n" + "subps %%xmm4, %%xmm12\n" + "movaps %%xmm1, %%xmm9\n" + "movaps %%xmm1, %%xmm13\n" + "addps %%xmm5, %%xmm9\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm14\n" + "addps %%xmm6, %%xmm10\n" + "subps %%xmm6, %%xmm14\n" + "movaps %%xmm3, %%xmm11\n" + "movaps %%xmm3, %%xmm15\n" + "addps %%xmm7, %%xmm11\n" + "subps %%xmm7, %%xmm15\n" + "movups %%xmm8, (%0)\n" + "movups %%xmm9, (%1)\n" + "movups %%xmm10, (%2)\n" + "movups %%xmm11, (%3)\n" + "movups %%xmm12, (%4)\n" + "movups %%xmm13, (%5)\n" + "movups %%xmm14, (%6)\n" + "movups %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 33554432), "r"(buf + j + k + 67108864), "r"(buf + j + k + 100663296), "r"(buf + j + k + 134217728), "r"(buf + j + k + 167772160), "r"(buf + j + k + 201326592), "r"(buf + j + k + 234881024) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } +} +void helper_float_28(float *buf); +void helper_float_28(float *buf) { + helper_float_28_recursive(buf, 28); +} +void helper_float_29_recursive(float *buf, int depth); +void helper_float_29_recursive(float *buf, int depth) { + if (depth == 12) { + for (int j = 0; j < 4096; j += 32) { + for (int k = 0; k < 4; k += 4) { + __asm__ volatile ( + "movups (%0), %%xmm0\n" + "movups (%1), %%xmm1\n" + "movups (%2), %%xmm2\n" + "movups (%3), %%xmm3\n" + "movups (%4), %%xmm4\n" + "movups (%5), %%xmm5\n" + "movups (%6), %%xmm6\n" + "movups (%7), %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm0, %%xmm0\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm0, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm0\n" + "movaps %%xmm1, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm1, %%xmm1\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm1, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm1\n" + "movaps %%xmm2, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm2, %%xmm2\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm2, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm2\n" + "movaps %%xmm3, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm3, %%xmm3\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm3, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm3\n" + "movaps %%xmm4, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm4, %%xmm4\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm4, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm4\n" + "movaps %%xmm5, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm5, %%xmm5\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm5, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm5\n" + "movaps %%xmm6, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm6, %%xmm6\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm6, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm6\n" + "movaps %%xmm7, 
%%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm7, %%xmm7\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm7, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm0, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm0, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm0\n" + "movaps %%xmm1, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm1, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm1, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm1\n" + "movaps %%xmm2, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm2, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm2, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm2\n" + "movaps %%xmm3, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm3, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm3, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm3\n" + "movaps %%xmm4, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm4, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm4, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm4\n" + "movaps %%xmm5, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm5, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm5, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm5\n" + "movaps %%xmm6, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm6, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm6, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm6\n" + "movaps %%xmm7, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm7, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm7, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm9\n" + "addps %%xmm1, %%xmm8\n" + "subps %%xmm1, %%xmm9\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm11\n" + "addps %%xmm3, %%xmm10\n" + "subps %%xmm3, %%xmm11\n" + "movaps %%xmm4, %%xmm12\n" + "movaps %%xmm4, %%xmm13\n" + "addps %%xmm5, %%xmm12\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm6, %%xmm14\n" + "movaps %%xmm6, %%xmm15\n" + "addps %%xmm7, %%xmm14\n" + "subps %%xmm7, %%xmm15\n" + "movaps %%xmm8, %%xmm0\n" + "movaps %%xmm8, %%xmm2\n" + "addps %%xmm10, %%xmm0\n" + "subps %%xmm10, %%xmm2\n" + "movaps %%xmm9, %%xmm1\n" + "movaps %%xmm9, %%xmm3\n" + "addps %%xmm11, %%xmm1\n" + "subps %%xmm11, %%xmm3\n" + "movaps %%xmm12, %%xmm4\n" + "movaps %%xmm12, %%xmm6\n" + "addps %%xmm14, %%xmm4\n" + "subps %%xmm14, %%xmm6\n" + "movaps %%xmm13, %%xmm5\n" + "movaps %%xmm13, %%xmm7\n" + "addps %%xmm15, %%xmm5\n" + "subps %%xmm15, %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm12\n" + 
"addps %%xmm4, %%xmm8\n" + "subps %%xmm4, %%xmm12\n" + "movaps %%xmm1, %%xmm9\n" + "movaps %%xmm1, %%xmm13\n" + "addps %%xmm5, %%xmm9\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm14\n" + "addps %%xmm6, %%xmm10\n" + "subps %%xmm6, %%xmm14\n" + "movaps %%xmm3, %%xmm11\n" + "movaps %%xmm3, %%xmm15\n" + "addps %%xmm7, %%xmm11\n" + "subps %%xmm7, %%xmm15\n" + "movups %%xmm8, (%0)\n" + "movups %%xmm9, (%1)\n" + "movups %%xmm10, (%2)\n" + "movups %%xmm11, (%3)\n" + "movups %%xmm12, (%4)\n" + "movups %%xmm13, (%5)\n" + "movups %%xmm14, (%6)\n" + "movups %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + for (int j = 0; j < 4096; j += 256) { + for (int k = 0; k < 32; k += 4) { + __asm__ volatile ( + "movups (%0), %%xmm0\n" + "movups (%1), %%xmm1\n" + "movups (%2), %%xmm2\n" + "movups (%3), %%xmm3\n" + "movups (%4), %%xmm4\n" + "movups (%5), %%xmm5\n" + "movups (%6), %%xmm6\n" + "movups (%7), %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm9\n" + "addps %%xmm1, %%xmm8\n" + "subps %%xmm1, %%xmm9\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm11\n" + "addps %%xmm3, %%xmm10\n" + "subps %%xmm3, %%xmm11\n" + "movaps %%xmm4, %%xmm12\n" + "movaps %%xmm4, %%xmm13\n" + "addps %%xmm5, %%xmm12\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm6, %%xmm14\n" + "movaps %%xmm6, %%xmm15\n" + "addps %%xmm7, %%xmm14\n" + "subps %%xmm7, %%xmm15\n" + "movaps %%xmm8, %%xmm0\n" + "movaps %%xmm8, %%xmm2\n" + "addps %%xmm10, %%xmm0\n" + "subps %%xmm10, %%xmm2\n" + "movaps %%xmm9, %%xmm1\n" + "movaps %%xmm9, %%xmm3\n" + "addps %%xmm11, %%xmm1\n" + "subps %%xmm11, %%xmm3\n" + "movaps %%xmm12, %%xmm4\n" + "movaps %%xmm12, %%xmm6\n" + "addps %%xmm14, %%xmm4\n" + "subps %%xmm14, %%xmm6\n" + "movaps %%xmm13, %%xmm5\n" + "movaps %%xmm13, %%xmm7\n" + "addps %%xmm15, %%xmm5\n" + "subps %%xmm15, %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm12\n" + "addps %%xmm4, %%xmm8\n" + "subps %%xmm4, %%xmm12\n" + "movaps %%xmm1, %%xmm9\n" + "movaps %%xmm1, %%xmm13\n" + "addps %%xmm5, %%xmm9\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm14\n" + "addps %%xmm6, %%xmm10\n" + "subps %%xmm6, %%xmm14\n" + "movaps %%xmm3, %%xmm11\n" + "movaps %%xmm3, %%xmm15\n" + "addps %%xmm7, %%xmm11\n" + "subps %%xmm7, %%xmm15\n" + "movups %%xmm8, (%0)\n" + "movups %%xmm9, (%1)\n" + "movups %%xmm10, (%2)\n" + "movups %%xmm11, (%3)\n" + "movups %%xmm12, (%4)\n" + "movups %%xmm13, (%5)\n" + "movups %%xmm14, (%6)\n" + "movups %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96), "r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + for (int j = 0; j < 4096; j += 2048) { + for (int k = 0; k < 256; k += 4) { + __asm__ volatile ( + "movups (%0), %%xmm0\n" + "movups (%1), %%xmm1\n" + "movups (%2), %%xmm2\n" + "movups (%3), %%xmm3\n" + "movups (%4), %%xmm4\n" + "movups (%5), %%xmm5\n" + "movups (%6), %%xmm6\n" + "movups (%7), %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps 
%%xmm0, %%xmm9\n" + "addps %%xmm1, %%xmm8\n" + "subps %%xmm1, %%xmm9\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm11\n" + "addps %%xmm3, %%xmm10\n" + "subps %%xmm3, %%xmm11\n" + "movaps %%xmm4, %%xmm12\n" + "movaps %%xmm4, %%xmm13\n" + "addps %%xmm5, %%xmm12\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm6, %%xmm14\n" + "movaps %%xmm6, %%xmm15\n" + "addps %%xmm7, %%xmm14\n" + "subps %%xmm7, %%xmm15\n" + "movaps %%xmm8, %%xmm0\n" + "movaps %%xmm8, %%xmm2\n" + "addps %%xmm10, %%xmm0\n" + "subps %%xmm10, %%xmm2\n" + "movaps %%xmm9, %%xmm1\n" + "movaps %%xmm9, %%xmm3\n" + "addps %%xmm11, %%xmm1\n" + "subps %%xmm11, %%xmm3\n" + "movaps %%xmm12, %%xmm4\n" + "movaps %%xmm12, %%xmm6\n" + "addps %%xmm14, %%xmm4\n" + "subps %%xmm14, %%xmm6\n" + "movaps %%xmm13, %%xmm5\n" + "movaps %%xmm13, %%xmm7\n" + "addps %%xmm15, %%xmm5\n" + "subps %%xmm15, %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm12\n" + "addps %%xmm4, %%xmm8\n" + "subps %%xmm4, %%xmm12\n" + "movaps %%xmm1, %%xmm9\n" + "movaps %%xmm1, %%xmm13\n" + "addps %%xmm5, %%xmm9\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm14\n" + "addps %%xmm6, %%xmm10\n" + "subps %%xmm6, %%xmm14\n" + "movaps %%xmm3, %%xmm11\n" + "movaps %%xmm3, %%xmm15\n" + "addps %%xmm7, %%xmm11\n" + "subps %%xmm7, %%xmm15\n" + "movups %%xmm8, (%0)\n" + "movups %%xmm9, (%1)\n" + "movups %%xmm10, (%2)\n" + "movups %%xmm11, (%3)\n" + "movups %%xmm12, (%4)\n" + "movups %%xmm13, (%5)\n" + "movups %%xmm14, (%6)\n" + "movups %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 256), "r"(buf + j + k + 512), "r"(buf + j + k + 768), "r"(buf + j + k + 1024), "r"(buf + j + k + 1280), "r"(buf + j + k + 1536), "r"(buf + j + k + 1792) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + for (int j = 0; j < 4096; j += 4096) { + for (int k = 0; k < 2048; k += 4) { + __asm__ volatile ( + "movups (%0), %%xmm0\n" + "movups (%1), %%xmm1\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm9\n" + "addps %%xmm1, %%xmm8\n" + "subps %%xmm1, %%xmm9\n" + "movups %%xmm8, (%0)\n" + "movups %%xmm9, (%1)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 2048) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } + if (depth == 15) { + helper_float_29_recursive(buf + 0, 12); + helper_float_29_recursive(buf + 4096, 12); + helper_float_29_recursive(buf + 8192, 12); + helper_float_29_recursive(buf + 12288, 12); + helper_float_29_recursive(buf + 16384, 12); + helper_float_29_recursive(buf + 20480, 12); + helper_float_29_recursive(buf + 24576, 12); + helper_float_29_recursive(buf + 28672, 12); + for (int j = 0; j < 32768; j += 32768) { + for (int k = 0; k < 4096; k += 4) { + __asm__ volatile ( + "movups (%0), %%xmm0\n" + "movups (%1), %%xmm1\n" + "movups (%2), %%xmm2\n" + "movups (%3), %%xmm3\n" + "movups (%4), %%xmm4\n" + "movups (%5), %%xmm5\n" + "movups (%6), %%xmm6\n" + "movups (%7), %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm9\n" + "addps %%xmm1, %%xmm8\n" + "subps %%xmm1, %%xmm9\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm11\n" + "addps %%xmm3, %%xmm10\n" + "subps %%xmm3, %%xmm11\n" + "movaps %%xmm4, %%xmm12\n" + "movaps %%xmm4, %%xmm13\n" + "addps %%xmm5, %%xmm12\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm6, %%xmm14\n" + "movaps %%xmm6, %%xmm15\n" + "addps 
%%xmm7, %%xmm14\n" + "subps %%xmm7, %%xmm15\n" + "movaps %%xmm8, %%xmm0\n" + "movaps %%xmm8, %%xmm2\n" + "addps %%xmm10, %%xmm0\n" + "subps %%xmm10, %%xmm2\n" + "movaps %%xmm9, %%xmm1\n" + "movaps %%xmm9, %%xmm3\n" + "addps %%xmm11, %%xmm1\n" + "subps %%xmm11, %%xmm3\n" + "movaps %%xmm12, %%xmm4\n" + "movaps %%xmm12, %%xmm6\n" + "addps %%xmm14, %%xmm4\n" + "subps %%xmm14, %%xmm6\n" + "movaps %%xmm13, %%xmm5\n" + "movaps %%xmm13, %%xmm7\n" + "addps %%xmm15, %%xmm5\n" + "subps %%xmm15, %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm12\n" + "addps %%xmm4, %%xmm8\n" + "subps %%xmm4, %%xmm12\n" + "movaps %%xmm1, %%xmm9\n" + "movaps %%xmm1, %%xmm13\n" + "addps %%xmm5, %%xmm9\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm14\n" + "addps %%xmm6, %%xmm10\n" + "subps %%xmm6, %%xmm14\n" + "movaps %%xmm3, %%xmm11\n" + "movaps %%xmm3, %%xmm15\n" + "addps %%xmm7, %%xmm11\n" + "subps %%xmm7, %%xmm15\n" + "movups %%xmm8, (%0)\n" + "movups %%xmm9, (%1)\n" + "movups %%xmm10, (%2)\n" + "movups %%xmm11, (%3)\n" + "movups %%xmm12, (%4)\n" + "movups %%xmm13, (%5)\n" + "movups %%xmm14, (%6)\n" + "movups %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 4096), "r"(buf + j + k + 8192), "r"(buf + j + k + 12288), "r"(buf + j + k + 16384), "r"(buf + j + k + 20480), "r"(buf + j + k + 24576), "r"(buf + j + k + 28672) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } + if (depth == 18) { + helper_float_29_recursive(buf + 0, 15); + helper_float_29_recursive(buf + 32768, 15); + helper_float_29_recursive(buf + 65536, 15); + helper_float_29_recursive(buf + 98304, 15); + helper_float_29_recursive(buf + 131072, 15); + helper_float_29_recursive(buf + 163840, 15); + helper_float_29_recursive(buf + 196608, 15); + helper_float_29_recursive(buf + 229376, 15); + for (int j = 0; j < 262144; j += 262144) { + for (int k = 0; k < 32768; k += 4) { + __asm__ volatile ( + "movups (%0), %%xmm0\n" + "movups (%1), %%xmm1\n" + "movups (%2), %%xmm2\n" + "movups (%3), %%xmm3\n" + "movups (%4), %%xmm4\n" + "movups (%5), %%xmm5\n" + "movups (%6), %%xmm6\n" + "movups (%7), %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm9\n" + "addps %%xmm1, %%xmm8\n" + "subps %%xmm1, %%xmm9\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm11\n" + "addps %%xmm3, %%xmm10\n" + "subps %%xmm3, %%xmm11\n" + "movaps %%xmm4, %%xmm12\n" + "movaps %%xmm4, %%xmm13\n" + "addps %%xmm5, %%xmm12\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm6, %%xmm14\n" + "movaps %%xmm6, %%xmm15\n" + "addps %%xmm7, %%xmm14\n" + "subps %%xmm7, %%xmm15\n" + "movaps %%xmm8, %%xmm0\n" + "movaps %%xmm8, %%xmm2\n" + "addps %%xmm10, %%xmm0\n" + "subps %%xmm10, %%xmm2\n" + "movaps %%xmm9, %%xmm1\n" + "movaps %%xmm9, %%xmm3\n" + "addps %%xmm11, %%xmm1\n" + "subps %%xmm11, %%xmm3\n" + "movaps %%xmm12, %%xmm4\n" + "movaps %%xmm12, %%xmm6\n" + "addps %%xmm14, %%xmm4\n" + "subps %%xmm14, %%xmm6\n" + "movaps %%xmm13, %%xmm5\n" + "movaps %%xmm13, %%xmm7\n" + "addps %%xmm15, %%xmm5\n" + "subps %%xmm15, %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm12\n" + "addps %%xmm4, %%xmm8\n" + "subps %%xmm4, %%xmm12\n" + "movaps %%xmm1, %%xmm9\n" + "movaps %%xmm1, %%xmm13\n" + "addps %%xmm5, %%xmm9\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm14\n" + "addps %%xmm6, %%xmm10\n" + "subps %%xmm6, %%xmm14\n" + "movaps %%xmm3, %%xmm11\n" + "movaps %%xmm3, 
%%xmm15\n" + "addps %%xmm7, %%xmm11\n" + "subps %%xmm7, %%xmm15\n" + "movups %%xmm8, (%0)\n" + "movups %%xmm9, (%1)\n" + "movups %%xmm10, (%2)\n" + "movups %%xmm11, (%3)\n" + "movups %%xmm12, (%4)\n" + "movups %%xmm13, (%5)\n" + "movups %%xmm14, (%6)\n" + "movups %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 32768), "r"(buf + j + k + 65536), "r"(buf + j + k + 98304), "r"(buf + j + k + 131072), "r"(buf + j + k + 163840), "r"(buf + j + k + 196608), "r"(buf + j + k + 229376) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } + if (depth == 21) { + helper_float_29_recursive(buf + 0, 18); + helper_float_29_recursive(buf + 262144, 18); + helper_float_29_recursive(buf + 524288, 18); + helper_float_29_recursive(buf + 786432, 18); + helper_float_29_recursive(buf + 1048576, 18); + helper_float_29_recursive(buf + 1310720, 18); + helper_float_29_recursive(buf + 1572864, 18); + helper_float_29_recursive(buf + 1835008, 18); + for (int j = 0; j < 2097152; j += 2097152) { + for (int k = 0; k < 262144; k += 4) { + __asm__ volatile ( + "movups (%0), %%xmm0\n" + "movups (%1), %%xmm1\n" + "movups (%2), %%xmm2\n" + "movups (%3), %%xmm3\n" + "movups (%4), %%xmm4\n" + "movups (%5), %%xmm5\n" + "movups (%6), %%xmm6\n" + "movups (%7), %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm9\n" + "addps %%xmm1, %%xmm8\n" + "subps %%xmm1, %%xmm9\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm11\n" + "addps %%xmm3, %%xmm10\n" + "subps %%xmm3, %%xmm11\n" + "movaps %%xmm4, %%xmm12\n" + "movaps %%xmm4, %%xmm13\n" + "addps %%xmm5, %%xmm12\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm6, %%xmm14\n" + "movaps %%xmm6, %%xmm15\n" + "addps %%xmm7, %%xmm14\n" + "subps %%xmm7, %%xmm15\n" + "movaps %%xmm8, %%xmm0\n" + "movaps %%xmm8, %%xmm2\n" + "addps %%xmm10, %%xmm0\n" + "subps %%xmm10, %%xmm2\n" + "movaps %%xmm9, %%xmm1\n" + "movaps %%xmm9, %%xmm3\n" + "addps %%xmm11, %%xmm1\n" + "subps %%xmm11, %%xmm3\n" + "movaps %%xmm12, %%xmm4\n" + "movaps %%xmm12, %%xmm6\n" + "addps %%xmm14, %%xmm4\n" + "subps %%xmm14, %%xmm6\n" + "movaps %%xmm13, %%xmm5\n" + "movaps %%xmm13, %%xmm7\n" + "addps %%xmm15, %%xmm5\n" + "subps %%xmm15, %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm12\n" + "addps %%xmm4, %%xmm8\n" + "subps %%xmm4, %%xmm12\n" + "movaps %%xmm1, %%xmm9\n" + "movaps %%xmm1, %%xmm13\n" + "addps %%xmm5, %%xmm9\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm14\n" + "addps %%xmm6, %%xmm10\n" + "subps %%xmm6, %%xmm14\n" + "movaps %%xmm3, %%xmm11\n" + "movaps %%xmm3, %%xmm15\n" + "addps %%xmm7, %%xmm11\n" + "subps %%xmm7, %%xmm15\n" + "movups %%xmm8, (%0)\n" + "movups %%xmm9, (%1)\n" + "movups %%xmm10, (%2)\n" + "movups %%xmm11, (%3)\n" + "movups %%xmm12, (%4)\n" + "movups %%xmm13, (%5)\n" + "movups %%xmm14, (%6)\n" + "movups %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 262144), "r"(buf + j + k + 524288), "r"(buf + j + k + 786432), "r"(buf + j + k + 1048576), "r"(buf + j + k + 1310720), "r"(buf + j + k + 1572864), "r"(buf + j + k + 1835008) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } + if (depth == 24) { + helper_float_29_recursive(buf + 0, 21); + helper_float_29_recursive(buf + 2097152, 21); + helper_float_29_recursive(buf + 4194304, 21); + 
helper_float_29_recursive(buf + 6291456, 21); + helper_float_29_recursive(buf + 8388608, 21); + helper_float_29_recursive(buf + 10485760, 21); + helper_float_29_recursive(buf + 12582912, 21); + helper_float_29_recursive(buf + 14680064, 21); + for (int j = 0; j < 16777216; j += 16777216) { + for (int k = 0; k < 2097152; k += 4) { + __asm__ volatile ( + "movups (%0), %%xmm0\n" + "movups (%1), %%xmm1\n" + "movups (%2), %%xmm2\n" + "movups (%3), %%xmm3\n" + "movups (%4), %%xmm4\n" + "movups (%5), %%xmm5\n" + "movups (%6), %%xmm6\n" + "movups (%7), %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm9\n" + "addps %%xmm1, %%xmm8\n" + "subps %%xmm1, %%xmm9\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm11\n" + "addps %%xmm3, %%xmm10\n" + "subps %%xmm3, %%xmm11\n" + "movaps %%xmm4, %%xmm12\n" + "movaps %%xmm4, %%xmm13\n" + "addps %%xmm5, %%xmm12\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm6, %%xmm14\n" + "movaps %%xmm6, %%xmm15\n" + "addps %%xmm7, %%xmm14\n" + "subps %%xmm7, %%xmm15\n" + "movaps %%xmm8, %%xmm0\n" + "movaps %%xmm8, %%xmm2\n" + "addps %%xmm10, %%xmm0\n" + "subps %%xmm10, %%xmm2\n" + "movaps %%xmm9, %%xmm1\n" + "movaps %%xmm9, %%xmm3\n" + "addps %%xmm11, %%xmm1\n" + "subps %%xmm11, %%xmm3\n" + "movaps %%xmm12, %%xmm4\n" + "movaps %%xmm12, %%xmm6\n" + "addps %%xmm14, %%xmm4\n" + "subps %%xmm14, %%xmm6\n" + "movaps %%xmm13, %%xmm5\n" + "movaps %%xmm13, %%xmm7\n" + "addps %%xmm15, %%xmm5\n" + "subps %%xmm15, %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm12\n" + "addps %%xmm4, %%xmm8\n" + "subps %%xmm4, %%xmm12\n" + "movaps %%xmm1, %%xmm9\n" + "movaps %%xmm1, %%xmm13\n" + "addps %%xmm5, %%xmm9\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm14\n" + "addps %%xmm6, %%xmm10\n" + "subps %%xmm6, %%xmm14\n" + "movaps %%xmm3, %%xmm11\n" + "movaps %%xmm3, %%xmm15\n" + "addps %%xmm7, %%xmm11\n" + "subps %%xmm7, %%xmm15\n" + "movups %%xmm8, (%0)\n" + "movups %%xmm9, (%1)\n" + "movups %%xmm10, (%2)\n" + "movups %%xmm11, (%3)\n" + "movups %%xmm12, (%4)\n" + "movups %%xmm13, (%5)\n" + "movups %%xmm14, (%6)\n" + "movups %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 2097152), "r"(buf + j + k + 4194304), "r"(buf + j + k + 6291456), "r"(buf + j + k + 8388608), "r"(buf + j + k + 10485760), "r"(buf + j + k + 12582912), "r"(buf + j + k + 14680064) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } + if (depth == 27) { + helper_float_29_recursive(buf + 0, 24); + helper_float_29_recursive(buf + 16777216, 24); + helper_float_29_recursive(buf + 33554432, 24); + helper_float_29_recursive(buf + 50331648, 24); + helper_float_29_recursive(buf + 67108864, 24); + helper_float_29_recursive(buf + 83886080, 24); + helper_float_29_recursive(buf + 100663296, 24); + helper_float_29_recursive(buf + 117440512, 24); + for (int j = 0; j < 134217728; j += 134217728) { + for (int k = 0; k < 16777216; k += 4) { + __asm__ volatile ( + "movups (%0), %%xmm0\n" + "movups (%1), %%xmm1\n" + "movups (%2), %%xmm2\n" + "movups (%3), %%xmm3\n" + "movups (%4), %%xmm4\n" + "movups (%5), %%xmm5\n" + "movups (%6), %%xmm6\n" + "movups (%7), %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm9\n" + "addps %%xmm1, %%xmm8\n" + "subps %%xmm1, %%xmm9\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm11\n" + "addps %%xmm3, %%xmm10\n" + "subps %%xmm3, %%xmm11\n" + "movaps %%xmm4, %%xmm12\n" + "movaps %%xmm4, %%xmm13\n" 
+ "addps %%xmm5, %%xmm12\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm6, %%xmm14\n" + "movaps %%xmm6, %%xmm15\n" + "addps %%xmm7, %%xmm14\n" + "subps %%xmm7, %%xmm15\n" + "movaps %%xmm8, %%xmm0\n" + "movaps %%xmm8, %%xmm2\n" + "addps %%xmm10, %%xmm0\n" + "subps %%xmm10, %%xmm2\n" + "movaps %%xmm9, %%xmm1\n" + "movaps %%xmm9, %%xmm3\n" + "addps %%xmm11, %%xmm1\n" + "subps %%xmm11, %%xmm3\n" + "movaps %%xmm12, %%xmm4\n" + "movaps %%xmm12, %%xmm6\n" + "addps %%xmm14, %%xmm4\n" + "subps %%xmm14, %%xmm6\n" + "movaps %%xmm13, %%xmm5\n" + "movaps %%xmm13, %%xmm7\n" + "addps %%xmm15, %%xmm5\n" + "subps %%xmm15, %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm12\n" + "addps %%xmm4, %%xmm8\n" + "subps %%xmm4, %%xmm12\n" + "movaps %%xmm1, %%xmm9\n" + "movaps %%xmm1, %%xmm13\n" + "addps %%xmm5, %%xmm9\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm14\n" + "addps %%xmm6, %%xmm10\n" + "subps %%xmm6, %%xmm14\n" + "movaps %%xmm3, %%xmm11\n" + "movaps %%xmm3, %%xmm15\n" + "addps %%xmm7, %%xmm11\n" + "subps %%xmm7, %%xmm15\n" + "movups %%xmm8, (%0)\n" + "movups %%xmm9, (%1)\n" + "movups %%xmm10, (%2)\n" + "movups %%xmm11, (%3)\n" + "movups %%xmm12, (%4)\n" + "movups %%xmm13, (%5)\n" + "movups %%xmm14, (%6)\n" + "movups %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 16777216), "r"(buf + j + k + 33554432), "r"(buf + j + k + 50331648), "r"(buf + j + k + 67108864), "r"(buf + j + k + 83886080), "r"(buf + j + k + 100663296), "r"(buf + j + k + 117440512) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } + if (depth == 29) { + helper_float_29_recursive(buf + 0, 27); + helper_float_29_recursive(buf + 134217728, 27); + helper_float_29_recursive(buf + 268435456, 27); + helper_float_29_recursive(buf + 402653184, 27); + for (int j = 0; j < 536870912; j += 536870912) { + for (int k = 0; k < 134217728; k += 4) { + __asm__ volatile ( + "movups (%0), %%xmm0\n" + "movups (%1), %%xmm1\n" + "movups (%2), %%xmm2\n" + "movups (%3), %%xmm3\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm9\n" + "addps %%xmm1, %%xmm8\n" + "subps %%xmm1, %%xmm9\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm11\n" + "addps %%xmm3, %%xmm10\n" + "subps %%xmm3, %%xmm11\n" + "movaps %%xmm8, %%xmm0\n" + "movaps %%xmm8, %%xmm2\n" + "addps %%xmm10, %%xmm0\n" + "subps %%xmm10, %%xmm2\n" + "movaps %%xmm9, %%xmm1\n" + "movaps %%xmm9, %%xmm3\n" + "addps %%xmm11, %%xmm1\n" + "subps %%xmm11, %%xmm3\n" + "movups %%xmm0, (%0)\n" + "movups %%xmm1, (%1)\n" + "movups %%xmm2, (%2)\n" + "movups %%xmm3, (%3)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 134217728), "r"(buf + j + k + 268435456), "r"(buf + j + k + 402653184) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } +} +void helper_float_29(float *buf); +void helper_float_29(float *buf) { + helper_float_29_recursive(buf, 29); +} +void helper_float_30_recursive(float *buf, int depth); +void helper_float_30_recursive(float *buf, int depth) { + if (depth == 12) { + for (int j = 0; j < 4096; j += 32) { + for (int k = 0; k < 4; k += 4) { + __asm__ volatile ( + "movups (%0), %%xmm0\n" + "movups (%1), %%xmm1\n" + "movups (%2), %%xmm2\n" + "movups (%3), %%xmm3\n" + "movups (%4), %%xmm4\n" + "movups (%5), %%xmm5\n" + "movups (%6), %%xmm6\n" + "movups (%7), %%xmm7\n" + 
"movaps %%xmm0, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm0, %%xmm0\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm0, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm0\n" + "movaps %%xmm1, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm1, %%xmm1\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm1, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm1\n" + "movaps %%xmm2, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm2, %%xmm2\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm2, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm2\n" + "movaps %%xmm3, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm3, %%xmm3\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm3, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm3\n" + "movaps %%xmm4, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm4, %%xmm4\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm4, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm4\n" + "movaps %%xmm5, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm5, %%xmm5\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm5, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm5\n" + "movaps %%xmm6, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm6, %%xmm6\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm6, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm6\n" + "movaps %%xmm7, %%xmm8\n" + "shufps $160, %%xmm8, %%xmm8\n" + "shufps $245, %%xmm7, %%xmm7\n" + "xorps %%xmm9, %%xmm9\n" + "subps %%xmm7, %%xmm9\n" + "addsubps %%xmm9, %%xmm8\n" + "movaps %%xmm8, %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm0, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm0, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm0\n" + "movaps %%xmm1, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm1, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm1, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm1\n" + "movaps %%xmm2, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm2, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm2, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm2\n" + "movaps %%xmm3, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm3, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm3, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm3\n" + "movaps %%xmm4, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm4, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm4, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm4\n" + "movaps %%xmm5, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm5, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm5, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm5\n" + "movaps %%xmm6, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + 
"movaps %%xmm6, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm6, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm6\n" + "movaps %%xmm7, %%xmm8\n" + "shufps $68, %%xmm8, %%xmm8\n" + "xorps %%xmm9, %%xmm9\n" + "movaps %%xmm7, %%xmm10\n" + "shufps $14, %%xmm9, %%xmm10\n" + "movaps %%xmm7, %%xmm11\n" + "shufps $224, %%xmm11, %%xmm9\n" + "addps %%xmm8, %%xmm10\n" + "subps %%xmm9, %%xmm10\n" + "movaps %%xmm10, %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm9\n" + "addps %%xmm1, %%xmm8\n" + "subps %%xmm1, %%xmm9\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm11\n" + "addps %%xmm3, %%xmm10\n" + "subps %%xmm3, %%xmm11\n" + "movaps %%xmm4, %%xmm12\n" + "movaps %%xmm4, %%xmm13\n" + "addps %%xmm5, %%xmm12\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm6, %%xmm14\n" + "movaps %%xmm6, %%xmm15\n" + "addps %%xmm7, %%xmm14\n" + "subps %%xmm7, %%xmm15\n" + "movaps %%xmm8, %%xmm0\n" + "movaps %%xmm8, %%xmm2\n" + "addps %%xmm10, %%xmm0\n" + "subps %%xmm10, %%xmm2\n" + "movaps %%xmm9, %%xmm1\n" + "movaps %%xmm9, %%xmm3\n" + "addps %%xmm11, %%xmm1\n" + "subps %%xmm11, %%xmm3\n" + "movaps %%xmm12, %%xmm4\n" + "movaps %%xmm12, %%xmm6\n" + "addps %%xmm14, %%xmm4\n" + "subps %%xmm14, %%xmm6\n" + "movaps %%xmm13, %%xmm5\n" + "movaps %%xmm13, %%xmm7\n" + "addps %%xmm15, %%xmm5\n" + "subps %%xmm15, %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm12\n" + "addps %%xmm4, %%xmm8\n" + "subps %%xmm4, %%xmm12\n" + "movaps %%xmm1, %%xmm9\n" + "movaps %%xmm1, %%xmm13\n" + "addps %%xmm5, %%xmm9\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm14\n" + "addps %%xmm6, %%xmm10\n" + "subps %%xmm6, %%xmm14\n" + "movaps %%xmm3, %%xmm11\n" + "movaps %%xmm3, %%xmm15\n" + "addps %%xmm7, %%xmm11\n" + "subps %%xmm7, %%xmm15\n" + "movups %%xmm8, (%0)\n" + "movups %%xmm9, (%1)\n" + "movups %%xmm10, (%2)\n" + "movups %%xmm11, (%3)\n" + "movups %%xmm12, (%4)\n" + "movups %%xmm13, (%5)\n" + "movups %%xmm14, (%6)\n" + "movups %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + for (int j = 0; j < 4096; j += 256) { + for (int k = 0; k < 32; k += 4) { + __asm__ volatile ( + "movups (%0), %%xmm0\n" + "movups (%1), %%xmm1\n" + "movups (%2), %%xmm2\n" + "movups (%3), %%xmm3\n" + "movups (%4), %%xmm4\n" + "movups (%5), %%xmm5\n" + "movups (%6), %%xmm6\n" + "movups (%7), %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm9\n" + "addps %%xmm1, %%xmm8\n" + "subps %%xmm1, %%xmm9\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm11\n" + "addps %%xmm3, %%xmm10\n" + "subps %%xmm3, %%xmm11\n" + "movaps %%xmm4, %%xmm12\n" + "movaps %%xmm4, %%xmm13\n" + "addps %%xmm5, %%xmm12\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm6, %%xmm14\n" + "movaps %%xmm6, %%xmm15\n" + "addps %%xmm7, %%xmm14\n" + "subps %%xmm7, %%xmm15\n" + "movaps %%xmm8, %%xmm0\n" + "movaps %%xmm8, %%xmm2\n" + "addps %%xmm10, %%xmm0\n" + "subps %%xmm10, %%xmm2\n" + "movaps %%xmm9, %%xmm1\n" + "movaps %%xmm9, %%xmm3\n" + "addps %%xmm11, %%xmm1\n" + "subps %%xmm11, %%xmm3\n" + "movaps %%xmm12, %%xmm4\n" + "movaps %%xmm12, %%xmm6\n" + "addps %%xmm14, %%xmm4\n" + "subps %%xmm14, %%xmm6\n" + "movaps %%xmm13, 
%%xmm5\n" + "movaps %%xmm13, %%xmm7\n" + "addps %%xmm15, %%xmm5\n" + "subps %%xmm15, %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm12\n" + "addps %%xmm4, %%xmm8\n" + "subps %%xmm4, %%xmm12\n" + "movaps %%xmm1, %%xmm9\n" + "movaps %%xmm1, %%xmm13\n" + "addps %%xmm5, %%xmm9\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm14\n" + "addps %%xmm6, %%xmm10\n" + "subps %%xmm6, %%xmm14\n" + "movaps %%xmm3, %%xmm11\n" + "movaps %%xmm3, %%xmm15\n" + "addps %%xmm7, %%xmm11\n" + "subps %%xmm7, %%xmm15\n" + "movups %%xmm8, (%0)\n" + "movups %%xmm9, (%1)\n" + "movups %%xmm10, (%2)\n" + "movups %%xmm11, (%3)\n" + "movups %%xmm12, (%4)\n" + "movups %%xmm13, (%5)\n" + "movups %%xmm14, (%6)\n" + "movups %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96), "r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + for (int j = 0; j < 4096; j += 2048) { + for (int k = 0; k < 256; k += 4) { + __asm__ volatile ( + "movups (%0), %%xmm0\n" + "movups (%1), %%xmm1\n" + "movups (%2), %%xmm2\n" + "movups (%3), %%xmm3\n" + "movups (%4), %%xmm4\n" + "movups (%5), %%xmm5\n" + "movups (%6), %%xmm6\n" + "movups (%7), %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm9\n" + "addps %%xmm1, %%xmm8\n" + "subps %%xmm1, %%xmm9\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm11\n" + "addps %%xmm3, %%xmm10\n" + "subps %%xmm3, %%xmm11\n" + "movaps %%xmm4, %%xmm12\n" + "movaps %%xmm4, %%xmm13\n" + "addps %%xmm5, %%xmm12\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm6, %%xmm14\n" + "movaps %%xmm6, %%xmm15\n" + "addps %%xmm7, %%xmm14\n" + "subps %%xmm7, %%xmm15\n" + "movaps %%xmm8, %%xmm0\n" + "movaps %%xmm8, %%xmm2\n" + "addps %%xmm10, %%xmm0\n" + "subps %%xmm10, %%xmm2\n" + "movaps %%xmm9, %%xmm1\n" + "movaps %%xmm9, %%xmm3\n" + "addps %%xmm11, %%xmm1\n" + "subps %%xmm11, %%xmm3\n" + "movaps %%xmm12, %%xmm4\n" + "movaps %%xmm12, %%xmm6\n" + "addps %%xmm14, %%xmm4\n" + "subps %%xmm14, %%xmm6\n" + "movaps %%xmm13, %%xmm5\n" + "movaps %%xmm13, %%xmm7\n" + "addps %%xmm15, %%xmm5\n" + "subps %%xmm15, %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm12\n" + "addps %%xmm4, %%xmm8\n" + "subps %%xmm4, %%xmm12\n" + "movaps %%xmm1, %%xmm9\n" + "movaps %%xmm1, %%xmm13\n" + "addps %%xmm5, %%xmm9\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm14\n" + "addps %%xmm6, %%xmm10\n" + "subps %%xmm6, %%xmm14\n" + "movaps %%xmm3, %%xmm11\n" + "movaps %%xmm3, %%xmm15\n" + "addps %%xmm7, %%xmm11\n" + "subps %%xmm7, %%xmm15\n" + "movups %%xmm8, (%0)\n" + "movups %%xmm9, (%1)\n" + "movups %%xmm10, (%2)\n" + "movups %%xmm11, (%3)\n" + "movups %%xmm12, (%4)\n" + "movups %%xmm13, (%5)\n" + "movups %%xmm14, (%6)\n" + "movups %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 256), "r"(buf + j + k + 512), "r"(buf + j + k + 768), "r"(buf + j + k + 1024), "r"(buf + j + k + 1280), "r"(buf + j + k + 1536), "r"(buf + j + k + 1792) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + for (int j = 0; j < 4096; j += 4096) { + for (int k = 0; k < 2048; k += 4) { + __asm__ volatile ( + "movups (%0), %%xmm0\n" + "movups (%1), %%xmm1\n" + "movaps %%xmm0, %%xmm8\n" 
+ "movaps %%xmm0, %%xmm9\n" + "addps %%xmm1, %%xmm8\n" + "subps %%xmm1, %%xmm9\n" + "movups %%xmm8, (%0)\n" + "movups %%xmm9, (%1)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 2048) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } + if (depth == 15) { + helper_float_30_recursive(buf + 0, 12); + helper_float_30_recursive(buf + 4096, 12); + helper_float_30_recursive(buf + 8192, 12); + helper_float_30_recursive(buf + 12288, 12); + helper_float_30_recursive(buf + 16384, 12); + helper_float_30_recursive(buf + 20480, 12); + helper_float_30_recursive(buf + 24576, 12); + helper_float_30_recursive(buf + 28672, 12); + for (int j = 0; j < 32768; j += 32768) { + for (int k = 0; k < 4096; k += 4) { + __asm__ volatile ( + "movups (%0), %%xmm0\n" + "movups (%1), %%xmm1\n" + "movups (%2), %%xmm2\n" + "movups (%3), %%xmm3\n" + "movups (%4), %%xmm4\n" + "movups (%5), %%xmm5\n" + "movups (%6), %%xmm6\n" + "movups (%7), %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm9\n" + "addps %%xmm1, %%xmm8\n" + "subps %%xmm1, %%xmm9\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm11\n" + "addps %%xmm3, %%xmm10\n" + "subps %%xmm3, %%xmm11\n" + "movaps %%xmm4, %%xmm12\n" + "movaps %%xmm4, %%xmm13\n" + "addps %%xmm5, %%xmm12\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm6, %%xmm14\n" + "movaps %%xmm6, %%xmm15\n" + "addps %%xmm7, %%xmm14\n" + "subps %%xmm7, %%xmm15\n" + "movaps %%xmm8, %%xmm0\n" + "movaps %%xmm8, %%xmm2\n" + "addps %%xmm10, %%xmm0\n" + "subps %%xmm10, %%xmm2\n" + "movaps %%xmm9, %%xmm1\n" + "movaps %%xmm9, %%xmm3\n" + "addps %%xmm11, %%xmm1\n" + "subps %%xmm11, %%xmm3\n" + "movaps %%xmm12, %%xmm4\n" + "movaps %%xmm12, %%xmm6\n" + "addps %%xmm14, %%xmm4\n" + "subps %%xmm14, %%xmm6\n" + "movaps %%xmm13, %%xmm5\n" + "movaps %%xmm13, %%xmm7\n" + "addps %%xmm15, %%xmm5\n" + "subps %%xmm15, %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm12\n" + "addps %%xmm4, %%xmm8\n" + "subps %%xmm4, %%xmm12\n" + "movaps %%xmm1, %%xmm9\n" + "movaps %%xmm1, %%xmm13\n" + "addps %%xmm5, %%xmm9\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm14\n" + "addps %%xmm6, %%xmm10\n" + "subps %%xmm6, %%xmm14\n" + "movaps %%xmm3, %%xmm11\n" + "movaps %%xmm3, %%xmm15\n" + "addps %%xmm7, %%xmm11\n" + "subps %%xmm7, %%xmm15\n" + "movups %%xmm8, (%0)\n" + "movups %%xmm9, (%1)\n" + "movups %%xmm10, (%2)\n" + "movups %%xmm11, (%3)\n" + "movups %%xmm12, (%4)\n" + "movups %%xmm13, (%5)\n" + "movups %%xmm14, (%6)\n" + "movups %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 4096), "r"(buf + j + k + 8192), "r"(buf + j + k + 12288), "r"(buf + j + k + 16384), "r"(buf + j + k + 20480), "r"(buf + j + k + 24576), "r"(buf + j + k + 28672) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } + if (depth == 18) { + helper_float_30_recursive(buf + 0, 15); + helper_float_30_recursive(buf + 32768, 15); + helper_float_30_recursive(buf + 65536, 15); + helper_float_30_recursive(buf + 98304, 15); + helper_float_30_recursive(buf + 131072, 15); + helper_float_30_recursive(buf + 163840, 15); + helper_float_30_recursive(buf + 196608, 15); + helper_float_30_recursive(buf + 229376, 15); + for (int j = 0; j < 262144; j += 262144) { + for (int k = 0; k < 32768; k += 4) { + __asm__ volatile ( + "movups (%0), %%xmm0\n" + 
"movups (%1), %%xmm1\n" + "movups (%2), %%xmm2\n" + "movups (%3), %%xmm3\n" + "movups (%4), %%xmm4\n" + "movups (%5), %%xmm5\n" + "movups (%6), %%xmm6\n" + "movups (%7), %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm9\n" + "addps %%xmm1, %%xmm8\n" + "subps %%xmm1, %%xmm9\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm11\n" + "addps %%xmm3, %%xmm10\n" + "subps %%xmm3, %%xmm11\n" + "movaps %%xmm4, %%xmm12\n" + "movaps %%xmm4, %%xmm13\n" + "addps %%xmm5, %%xmm12\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm6, %%xmm14\n" + "movaps %%xmm6, %%xmm15\n" + "addps %%xmm7, %%xmm14\n" + "subps %%xmm7, %%xmm15\n" + "movaps %%xmm8, %%xmm0\n" + "movaps %%xmm8, %%xmm2\n" + "addps %%xmm10, %%xmm0\n" + "subps %%xmm10, %%xmm2\n" + "movaps %%xmm9, %%xmm1\n" + "movaps %%xmm9, %%xmm3\n" + "addps %%xmm11, %%xmm1\n" + "subps %%xmm11, %%xmm3\n" + "movaps %%xmm12, %%xmm4\n" + "movaps %%xmm12, %%xmm6\n" + "addps %%xmm14, %%xmm4\n" + "subps %%xmm14, %%xmm6\n" + "movaps %%xmm13, %%xmm5\n" + "movaps %%xmm13, %%xmm7\n" + "addps %%xmm15, %%xmm5\n" + "subps %%xmm15, %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm12\n" + "addps %%xmm4, %%xmm8\n" + "subps %%xmm4, %%xmm12\n" + "movaps %%xmm1, %%xmm9\n" + "movaps %%xmm1, %%xmm13\n" + "addps %%xmm5, %%xmm9\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm14\n" + "addps %%xmm6, %%xmm10\n" + "subps %%xmm6, %%xmm14\n" + "movaps %%xmm3, %%xmm11\n" + "movaps %%xmm3, %%xmm15\n" + "addps %%xmm7, %%xmm11\n" + "subps %%xmm7, %%xmm15\n" + "movups %%xmm8, (%0)\n" + "movups %%xmm9, (%1)\n" + "movups %%xmm10, (%2)\n" + "movups %%xmm11, (%3)\n" + "movups %%xmm12, (%4)\n" + "movups %%xmm13, (%5)\n" + "movups %%xmm14, (%6)\n" + "movups %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 32768), "r"(buf + j + k + 65536), "r"(buf + j + k + 98304), "r"(buf + j + k + 131072), "r"(buf + j + k + 163840), "r"(buf + j + k + 196608), "r"(buf + j + k + 229376) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } + if (depth == 21) { + helper_float_30_recursive(buf + 0, 18); + helper_float_30_recursive(buf + 262144, 18); + helper_float_30_recursive(buf + 524288, 18); + helper_float_30_recursive(buf + 786432, 18); + helper_float_30_recursive(buf + 1048576, 18); + helper_float_30_recursive(buf + 1310720, 18); + helper_float_30_recursive(buf + 1572864, 18); + helper_float_30_recursive(buf + 1835008, 18); + for (int j = 0; j < 2097152; j += 2097152) { + for (int k = 0; k < 262144; k += 4) { + __asm__ volatile ( + "movups (%0), %%xmm0\n" + "movups (%1), %%xmm1\n" + "movups (%2), %%xmm2\n" + "movups (%3), %%xmm3\n" + "movups (%4), %%xmm4\n" + "movups (%5), %%xmm5\n" + "movups (%6), %%xmm6\n" + "movups (%7), %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm9\n" + "addps %%xmm1, %%xmm8\n" + "subps %%xmm1, %%xmm9\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm11\n" + "addps %%xmm3, %%xmm10\n" + "subps %%xmm3, %%xmm11\n" + "movaps %%xmm4, %%xmm12\n" + "movaps %%xmm4, %%xmm13\n" + "addps %%xmm5, %%xmm12\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm6, %%xmm14\n" + "movaps %%xmm6, %%xmm15\n" + "addps %%xmm7, %%xmm14\n" + "subps %%xmm7, %%xmm15\n" + "movaps %%xmm8, %%xmm0\n" + "movaps %%xmm8, %%xmm2\n" + "addps %%xmm10, %%xmm0\n" + "subps %%xmm10, %%xmm2\n" + "movaps %%xmm9, %%xmm1\n" + "movaps %%xmm9, %%xmm3\n" + "addps %%xmm11, %%xmm1\n" + "subps %%xmm11, %%xmm3\n" + "movaps 
%%xmm12, %%xmm4\n" + "movaps %%xmm12, %%xmm6\n" + "addps %%xmm14, %%xmm4\n" + "subps %%xmm14, %%xmm6\n" + "movaps %%xmm13, %%xmm5\n" + "movaps %%xmm13, %%xmm7\n" + "addps %%xmm15, %%xmm5\n" + "subps %%xmm15, %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm12\n" + "addps %%xmm4, %%xmm8\n" + "subps %%xmm4, %%xmm12\n" + "movaps %%xmm1, %%xmm9\n" + "movaps %%xmm1, %%xmm13\n" + "addps %%xmm5, %%xmm9\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm14\n" + "addps %%xmm6, %%xmm10\n" + "subps %%xmm6, %%xmm14\n" + "movaps %%xmm3, %%xmm11\n" + "movaps %%xmm3, %%xmm15\n" + "addps %%xmm7, %%xmm11\n" + "subps %%xmm7, %%xmm15\n" + "movups %%xmm8, (%0)\n" + "movups %%xmm9, (%1)\n" + "movups %%xmm10, (%2)\n" + "movups %%xmm11, (%3)\n" + "movups %%xmm12, (%4)\n" + "movups %%xmm13, (%5)\n" + "movups %%xmm14, (%6)\n" + "movups %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 262144), "r"(buf + j + k + 524288), "r"(buf + j + k + 786432), "r"(buf + j + k + 1048576), "r"(buf + j + k + 1310720), "r"(buf + j + k + 1572864), "r"(buf + j + k + 1835008) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } + if (depth == 24) { + helper_float_30_recursive(buf + 0, 21); + helper_float_30_recursive(buf + 2097152, 21); + helper_float_30_recursive(buf + 4194304, 21); + helper_float_30_recursive(buf + 6291456, 21); + helper_float_30_recursive(buf + 8388608, 21); + helper_float_30_recursive(buf + 10485760, 21); + helper_float_30_recursive(buf + 12582912, 21); + helper_float_30_recursive(buf + 14680064, 21); + for (int j = 0; j < 16777216; j += 16777216) { + for (int k = 0; k < 2097152; k += 4) { + __asm__ volatile ( + "movups (%0), %%xmm0\n" + "movups (%1), %%xmm1\n" + "movups (%2), %%xmm2\n" + "movups (%3), %%xmm3\n" + "movups (%4), %%xmm4\n" + "movups (%5), %%xmm5\n" + "movups (%6), %%xmm6\n" + "movups (%7), %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm9\n" + "addps %%xmm1, %%xmm8\n" + "subps %%xmm1, %%xmm9\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm11\n" + "addps %%xmm3, %%xmm10\n" + "subps %%xmm3, %%xmm11\n" + "movaps %%xmm4, %%xmm12\n" + "movaps %%xmm4, %%xmm13\n" + "addps %%xmm5, %%xmm12\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm6, %%xmm14\n" + "movaps %%xmm6, %%xmm15\n" + "addps %%xmm7, %%xmm14\n" + "subps %%xmm7, %%xmm15\n" + "movaps %%xmm8, %%xmm0\n" + "movaps %%xmm8, %%xmm2\n" + "addps %%xmm10, %%xmm0\n" + "subps %%xmm10, %%xmm2\n" + "movaps %%xmm9, %%xmm1\n" + "movaps %%xmm9, %%xmm3\n" + "addps %%xmm11, %%xmm1\n" + "subps %%xmm11, %%xmm3\n" + "movaps %%xmm12, %%xmm4\n" + "movaps %%xmm12, %%xmm6\n" + "addps %%xmm14, %%xmm4\n" + "subps %%xmm14, %%xmm6\n" + "movaps %%xmm13, %%xmm5\n" + "movaps %%xmm13, %%xmm7\n" + "addps %%xmm15, %%xmm5\n" + "subps %%xmm15, %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm12\n" + "addps %%xmm4, %%xmm8\n" + "subps %%xmm4, %%xmm12\n" + "movaps %%xmm1, %%xmm9\n" + "movaps %%xmm1, %%xmm13\n" + "addps %%xmm5, %%xmm9\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm14\n" + "addps %%xmm6, %%xmm10\n" + "subps %%xmm6, %%xmm14\n" + "movaps %%xmm3, %%xmm11\n" + "movaps %%xmm3, %%xmm15\n" + "addps %%xmm7, %%xmm11\n" + "subps %%xmm7, %%xmm15\n" + "movups %%xmm8, (%0)\n" + "movups %%xmm9, (%1)\n" + "movups %%xmm10, (%2)\n" + "movups %%xmm11, (%3)\n" + "movups %%xmm12, (%4)\n" + "movups %%xmm13, (%5)\n" + "movups %%xmm14, (%6)\n" 
+ "movups %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 2097152), "r"(buf + j + k + 4194304), "r"(buf + j + k + 6291456), "r"(buf + j + k + 8388608), "r"(buf + j + k + 10485760), "r"(buf + j + k + 12582912), "r"(buf + j + k + 14680064) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } + if (depth == 27) { + helper_float_30_recursive(buf + 0, 24); + helper_float_30_recursive(buf + 16777216, 24); + helper_float_30_recursive(buf + 33554432, 24); + helper_float_30_recursive(buf + 50331648, 24); + helper_float_30_recursive(buf + 67108864, 24); + helper_float_30_recursive(buf + 83886080, 24); + helper_float_30_recursive(buf + 100663296, 24); + helper_float_30_recursive(buf + 117440512, 24); + for (int j = 0; j < 134217728; j += 134217728) { + for (int k = 0; k < 16777216; k += 4) { + __asm__ volatile ( + "movups (%0), %%xmm0\n" + "movups (%1), %%xmm1\n" + "movups (%2), %%xmm2\n" + "movups (%3), %%xmm3\n" + "movups (%4), %%xmm4\n" + "movups (%5), %%xmm5\n" + "movups (%6), %%xmm6\n" + "movups (%7), %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm9\n" + "addps %%xmm1, %%xmm8\n" + "subps %%xmm1, %%xmm9\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm11\n" + "addps %%xmm3, %%xmm10\n" + "subps %%xmm3, %%xmm11\n" + "movaps %%xmm4, %%xmm12\n" + "movaps %%xmm4, %%xmm13\n" + "addps %%xmm5, %%xmm12\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm6, %%xmm14\n" + "movaps %%xmm6, %%xmm15\n" + "addps %%xmm7, %%xmm14\n" + "subps %%xmm7, %%xmm15\n" + "movaps %%xmm8, %%xmm0\n" + "movaps %%xmm8, %%xmm2\n" + "addps %%xmm10, %%xmm0\n" + "subps %%xmm10, %%xmm2\n" + "movaps %%xmm9, %%xmm1\n" + "movaps %%xmm9, %%xmm3\n" + "addps %%xmm11, %%xmm1\n" + "subps %%xmm11, %%xmm3\n" + "movaps %%xmm12, %%xmm4\n" + "movaps %%xmm12, %%xmm6\n" + "addps %%xmm14, %%xmm4\n" + "subps %%xmm14, %%xmm6\n" + "movaps %%xmm13, %%xmm5\n" + "movaps %%xmm13, %%xmm7\n" + "addps %%xmm15, %%xmm5\n" + "subps %%xmm15, %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm12\n" + "addps %%xmm4, %%xmm8\n" + "subps %%xmm4, %%xmm12\n" + "movaps %%xmm1, %%xmm9\n" + "movaps %%xmm1, %%xmm13\n" + "addps %%xmm5, %%xmm9\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm14\n" + "addps %%xmm6, %%xmm10\n" + "subps %%xmm6, %%xmm14\n" + "movaps %%xmm3, %%xmm11\n" + "movaps %%xmm3, %%xmm15\n" + "addps %%xmm7, %%xmm11\n" + "subps %%xmm7, %%xmm15\n" + "movups %%xmm8, (%0)\n" + "movups %%xmm9, (%1)\n" + "movups %%xmm10, (%2)\n" + "movups %%xmm11, (%3)\n" + "movups %%xmm12, (%4)\n" + "movups %%xmm13, (%5)\n" + "movups %%xmm14, (%6)\n" + "movups %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 16777216), "r"(buf + j + k + 33554432), "r"(buf + j + k + 50331648), "r"(buf + j + k + 67108864), "r"(buf + j + k + 83886080), "r"(buf + j + k + 100663296), "r"(buf + j + k + 117440512) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } + if (depth == 30) { + helper_float_30_recursive(buf + 0, 27); + helper_float_30_recursive(buf + 134217728, 27); + helper_float_30_recursive(buf + 268435456, 27); + helper_float_30_recursive(buf + 402653184, 27); + helper_float_30_recursive(buf + 536870912, 27); + helper_float_30_recursive(buf + 671088640, 27); + helper_float_30_recursive(buf + 805306368, 27); + helper_float_30_recursive(buf 
+ 939524096, 27); + for (int j = 0; j < 1073741824; j += 1073741824) { + for (int k = 0; k < 134217728; k += 4) { + __asm__ volatile ( + "movups (%0), %%xmm0\n" + "movups (%1), %%xmm1\n" + "movups (%2), %%xmm2\n" + "movups (%3), %%xmm3\n" + "movups (%4), %%xmm4\n" + "movups (%5), %%xmm5\n" + "movups (%6), %%xmm6\n" + "movups (%7), %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm9\n" + "addps %%xmm1, %%xmm8\n" + "subps %%xmm1, %%xmm9\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm11\n" + "addps %%xmm3, %%xmm10\n" + "subps %%xmm3, %%xmm11\n" + "movaps %%xmm4, %%xmm12\n" + "movaps %%xmm4, %%xmm13\n" + "addps %%xmm5, %%xmm12\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm6, %%xmm14\n" + "movaps %%xmm6, %%xmm15\n" + "addps %%xmm7, %%xmm14\n" + "subps %%xmm7, %%xmm15\n" + "movaps %%xmm8, %%xmm0\n" + "movaps %%xmm8, %%xmm2\n" + "addps %%xmm10, %%xmm0\n" + "subps %%xmm10, %%xmm2\n" + "movaps %%xmm9, %%xmm1\n" + "movaps %%xmm9, %%xmm3\n" + "addps %%xmm11, %%xmm1\n" + "subps %%xmm11, %%xmm3\n" + "movaps %%xmm12, %%xmm4\n" + "movaps %%xmm12, %%xmm6\n" + "addps %%xmm14, %%xmm4\n" + "subps %%xmm14, %%xmm6\n" + "movaps %%xmm13, %%xmm5\n" + "movaps %%xmm13, %%xmm7\n" + "addps %%xmm15, %%xmm5\n" + "subps %%xmm15, %%xmm7\n" + "movaps %%xmm0, %%xmm8\n" + "movaps %%xmm0, %%xmm12\n" + "addps %%xmm4, %%xmm8\n" + "subps %%xmm4, %%xmm12\n" + "movaps %%xmm1, %%xmm9\n" + "movaps %%xmm1, %%xmm13\n" + "addps %%xmm5, %%xmm9\n" + "subps %%xmm5, %%xmm13\n" + "movaps %%xmm2, %%xmm10\n" + "movaps %%xmm2, %%xmm14\n" + "addps %%xmm6, %%xmm10\n" + "subps %%xmm6, %%xmm14\n" + "movaps %%xmm3, %%xmm11\n" + "movaps %%xmm3, %%xmm15\n" + "addps %%xmm7, %%xmm11\n" + "subps %%xmm7, %%xmm15\n" + "movups %%xmm8, (%0)\n" + "movups %%xmm9, (%1)\n" + "movups %%xmm10, (%2)\n" + "movups %%xmm11, (%3)\n" + "movups %%xmm12, (%4)\n" + "movups %%xmm13, (%5)\n" + "movups %%xmm14, (%6)\n" + "movups %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 134217728), "r"(buf + j + k + 268435456), "r"(buf + j + k + 402653184), "r"(buf + j + k + 536870912), "r"(buf + j + k + 671088640), "r"(buf + j + k + 805306368), "r"(buf + j + k + 939524096) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } +} +void helper_float_30(float *buf); +void helper_float_30(float *buf) { + helper_float_30_recursive(buf, 30); +} +int fht_float(float *buf, int log_n) { + if (log_n == 0) { + return 0; + } + if (log_n == 1) { + helper_float_1(buf); + return 0; + } + if (log_n == 2) { + helper_float_2(buf); + return 0; + } + if (log_n == 3) { + helper_float_3(buf); + return 0; + } + if (log_n == 4) { + helper_float_4(buf); + return 0; + } + if (log_n == 5) { + helper_float_5(buf); + return 0; + } + if (log_n == 6) { + helper_float_6(buf); + return 0; + } + if (log_n == 7) { + helper_float_7(buf); + return 0; + } + if (log_n == 8) { + helper_float_8(buf); + return 0; + } + if (log_n == 9) { + helper_float_9(buf); + return 0; + } + if (log_n == 10) { + helper_float_10(buf); + return 0; + } + if (log_n == 11) { + helper_float_11(buf); + return 0; + } + if (log_n == 12) { + helper_float_12(buf); + return 0; + } + if (log_n == 13) { + helper_float_13(buf); + return 0; + } + if (log_n == 14) { + helper_float_14(buf); + return 0; + } + if (log_n == 15) { + helper_float_15(buf); + return 0; + } + if (log_n == 16) { + helper_float_16(buf); + return 0; + } + if (log_n == 17) { + helper_float_17(buf); + return 0; + } + if 
(log_n == 18) { + helper_float_18(buf); + return 0; + } + if (log_n == 19) { + helper_float_19(buf); + return 0; + } + if (log_n == 20) { + helper_float_20(buf); + return 0; + } + if (log_n == 21) { + helper_float_21(buf); + return 0; + } + if (log_n == 22) { + helper_float_22(buf); + return 0; + } + if (log_n == 23) { + helper_float_23(buf); + return 0; + } + if (log_n == 24) { + helper_float_24(buf); + return 0; + } + if (log_n == 25) { + helper_float_25(buf); + return 0; + } + if (log_n == 26) { + helper_float_26(buf); + return 0; + } + if (log_n == 27) { + helper_float_27(buf); + return 0; + } + if (log_n == 28) { + helper_float_28(buf); + return 0; + } + if (log_n == 29) { + helper_float_29(buf); + return 0; + } + if (log_n == 30) { + helper_float_30(buf); + return 0; + } + return 1; +} +static inline void helper_double_1(double *buf); +static inline void helper_double_1(double *buf) { + for (int j = 0; j < 2; j += 2) { + __asm__ volatile ( + "movupd (%0), %%xmm0\n" + "movapd %%xmm0, %%xmm8\n" + "haddpd %%xmm8, %%xmm8\n" + "movapd %%xmm0, %%xmm9\n" + "hsubpd %%xmm9, %%xmm9\n" + "blendpd $1, %%xmm8, %%xmm9\n" + "movapd %%xmm9, %%xmm0\n" + "movupd %%xmm0, (%0)\n" + :: "r"(buf + j) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } +} +void helper_double_2_recursive(double *buf, int depth); +void helper_double_2_recursive(double *buf, int depth) { + if (depth == 2) { + for (int j = 0; j < 4; j += 4) { + for (int k = 0; k < 2; k += 2) { + __asm__ volatile ( + "movupd (%0), %%xmm0\n" + "movupd (%1), %%xmm1\n" + "movapd %%xmm0, %%xmm8\n" + "haddpd %%xmm8, %%xmm8\n" + "movapd %%xmm0, %%xmm9\n" + "hsubpd %%xmm9, %%xmm9\n" + "blendpd $1, %%xmm8, %%xmm9\n" + "movapd %%xmm9, %%xmm0\n" + "movapd %%xmm1, %%xmm8\n" + "haddpd %%xmm8, %%xmm8\n" + "movapd %%xmm1, %%xmm9\n" + "hsubpd %%xmm9, %%xmm9\n" + "blendpd $1, %%xmm8, %%xmm9\n" + "movapd %%xmm9, %%xmm1\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm9\n" + "addpd %%xmm1, %%xmm8\n" + "subpd %%xmm1, %%xmm9\n" + "movupd %%xmm8, (%0)\n" + "movupd %%xmm9, (%1)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 2) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } +} +void helper_double_2(double *buf); +void helper_double_2(double *buf) { + helper_double_2_recursive(buf, 2); +} +void helper_double_3_recursive(double *buf, int depth); +void helper_double_3_recursive(double *buf, int depth) { + if (depth == 3) { + for (int j = 0; j < 8; j += 8) { + for (int k = 0; k < 2; k += 2) { + __asm__ volatile ( + "movupd (%0), %%xmm0\n" + "movupd (%1), %%xmm1\n" + "movupd (%2), %%xmm2\n" + "movupd (%3), %%xmm3\n" + "movapd %%xmm0, %%xmm8\n" + "haddpd %%xmm8, %%xmm8\n" + "movapd %%xmm0, %%xmm9\n" + "hsubpd %%xmm9, %%xmm9\n" + "blendpd $1, %%xmm8, %%xmm9\n" + "movapd %%xmm9, %%xmm0\n" + "movapd %%xmm1, %%xmm8\n" + "haddpd %%xmm8, %%xmm8\n" + "movapd %%xmm1, %%xmm9\n" + "hsubpd %%xmm9, %%xmm9\n" + "blendpd $1, %%xmm8, %%xmm9\n" + "movapd %%xmm9, %%xmm1\n" + "movapd %%xmm2, %%xmm8\n" + "haddpd %%xmm8, %%xmm8\n" + "movapd %%xmm2, %%xmm9\n" + "hsubpd %%xmm9, %%xmm9\n" + "blendpd $1, %%xmm8, %%xmm9\n" + "movapd %%xmm9, %%xmm2\n" + "movapd %%xmm3, %%xmm8\n" + "haddpd %%xmm8, %%xmm8\n" + "movapd %%xmm3, %%xmm9\n" + "hsubpd %%xmm9, %%xmm9\n" + "blendpd $1, %%xmm8, %%xmm9\n" + "movapd %%xmm9, %%xmm3\n" + "movapd %%xmm0, 
%%xmm8\n" + "movapd %%xmm0, %%xmm9\n" + "addpd %%xmm1, %%xmm8\n" + "subpd %%xmm1, %%xmm9\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm11\n" + "addpd %%xmm3, %%xmm10\n" + "subpd %%xmm3, %%xmm11\n" + "movapd %%xmm8, %%xmm0\n" + "movapd %%xmm8, %%xmm2\n" + "addpd %%xmm10, %%xmm0\n" + "subpd %%xmm10, %%xmm2\n" + "movapd %%xmm9, %%xmm1\n" + "movapd %%xmm9, %%xmm3\n" + "addpd %%xmm11, %%xmm1\n" + "subpd %%xmm11, %%xmm3\n" + "movupd %%xmm0, (%0)\n" + "movupd %%xmm1, (%1)\n" + "movupd %%xmm2, (%2)\n" + "movupd %%xmm3, (%3)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 2), "r"(buf + j + k + 4), "r"(buf + j + k + 6) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } +} +void helper_double_3(double *buf); +void helper_double_3(double *buf) { + helper_double_3_recursive(buf, 3); +} +static inline void helper_double_4(double *buf); +static inline void helper_double_4(double *buf) { + for (int j = 0; j < 16; j += 16) { + for (int k = 0; k < 2; k += 2) { + __asm__ volatile ( + "movupd (%0), %%xmm0\n" + "movupd (%1), %%xmm1\n" + "movupd (%2), %%xmm2\n" + "movupd (%3), %%xmm3\n" + "movupd (%4), %%xmm4\n" + "movupd (%5), %%xmm5\n" + "movupd (%6), %%xmm6\n" + "movupd (%7), %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "haddpd %%xmm8, %%xmm8\n" + "movapd %%xmm0, %%xmm9\n" + "hsubpd %%xmm9, %%xmm9\n" + "blendpd $1, %%xmm8, %%xmm9\n" + "movapd %%xmm9, %%xmm0\n" + "movapd %%xmm1, %%xmm8\n" + "haddpd %%xmm8, %%xmm8\n" + "movapd %%xmm1, %%xmm9\n" + "hsubpd %%xmm9, %%xmm9\n" + "blendpd $1, %%xmm8, %%xmm9\n" + "movapd %%xmm9, %%xmm1\n" + "movapd %%xmm2, %%xmm8\n" + "haddpd %%xmm8, %%xmm8\n" + "movapd %%xmm2, %%xmm9\n" + "hsubpd %%xmm9, %%xmm9\n" + "blendpd $1, %%xmm8, %%xmm9\n" + "movapd %%xmm9, %%xmm2\n" + "movapd %%xmm3, %%xmm8\n" + "haddpd %%xmm8, %%xmm8\n" + "movapd %%xmm3, %%xmm9\n" + "hsubpd %%xmm9, %%xmm9\n" + "blendpd $1, %%xmm8, %%xmm9\n" + "movapd %%xmm9, %%xmm3\n" + "movapd %%xmm4, %%xmm8\n" + "haddpd %%xmm8, %%xmm8\n" + "movapd %%xmm4, %%xmm9\n" + "hsubpd %%xmm9, %%xmm9\n" + "blendpd $1, %%xmm8, %%xmm9\n" + "movapd %%xmm9, %%xmm4\n" + "movapd %%xmm5, %%xmm8\n" + "haddpd %%xmm8, %%xmm8\n" + "movapd %%xmm5, %%xmm9\n" + "hsubpd %%xmm9, %%xmm9\n" + "blendpd $1, %%xmm8, %%xmm9\n" + "movapd %%xmm9, %%xmm5\n" + "movapd %%xmm6, %%xmm8\n" + "haddpd %%xmm8, %%xmm8\n" + "movapd %%xmm6, %%xmm9\n" + "hsubpd %%xmm9, %%xmm9\n" + "blendpd $1, %%xmm8, %%xmm9\n" + "movapd %%xmm9, %%xmm6\n" + "movapd %%xmm7, %%xmm8\n" + "haddpd %%xmm8, %%xmm8\n" + "movapd %%xmm7, %%xmm9\n" + "hsubpd %%xmm9, %%xmm9\n" + "blendpd $1, %%xmm8, %%xmm9\n" + "movapd %%xmm9, %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm9\n" + "addpd %%xmm1, %%xmm8\n" + "subpd %%xmm1, %%xmm9\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm11\n" + "addpd %%xmm3, %%xmm10\n" + "subpd %%xmm3, %%xmm11\n" + "movapd %%xmm4, %%xmm12\n" + "movapd %%xmm4, %%xmm13\n" + "addpd %%xmm5, %%xmm12\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm6, %%xmm14\n" + "movapd %%xmm6, %%xmm15\n" + "addpd %%xmm7, %%xmm14\n" + "subpd %%xmm7, %%xmm15\n" + "movapd %%xmm8, %%xmm0\n" + "movapd %%xmm8, %%xmm2\n" + "addpd %%xmm10, %%xmm0\n" + "subpd %%xmm10, %%xmm2\n" + "movapd %%xmm9, %%xmm1\n" + "movapd %%xmm9, %%xmm3\n" + "addpd %%xmm11, %%xmm1\n" + "subpd %%xmm11, %%xmm3\n" + "movapd %%xmm12, %%xmm4\n" + "movapd %%xmm12, %%xmm6\n" + "addpd %%xmm14, %%xmm4\n" + "subpd %%xmm14, %%xmm6\n" + "movapd %%xmm13, %%xmm5\n" + "movapd %%xmm13, 
%%xmm7\n" + "addpd %%xmm15, %%xmm5\n" + "subpd %%xmm15, %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm12\n" + "addpd %%xmm4, %%xmm8\n" + "subpd %%xmm4, %%xmm12\n" + "movapd %%xmm1, %%xmm9\n" + "movapd %%xmm1, %%xmm13\n" + "addpd %%xmm5, %%xmm9\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm14\n" + "addpd %%xmm6, %%xmm10\n" + "subpd %%xmm6, %%xmm14\n" + "movapd %%xmm3, %%xmm11\n" + "movapd %%xmm3, %%xmm15\n" + "addpd %%xmm7, %%xmm11\n" + "subpd %%xmm7, %%xmm15\n" + "movupd %%xmm8, (%0)\n" + "movupd %%xmm9, (%1)\n" + "movupd %%xmm10, (%2)\n" + "movupd %%xmm11, (%3)\n" + "movupd %%xmm12, (%4)\n" + "movupd %%xmm13, (%5)\n" + "movupd %%xmm14, (%6)\n" + "movupd %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 2), "r"(buf + j + k + 4), "r"(buf + j + k + 6), "r"(buf + j + k + 8), "r"(buf + j + k + 10), "r"(buf + j + k + 12), "r"(buf + j + k + 14) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } +} +void helper_double_5_recursive(double *buf, int depth); +void helper_double_5_recursive(double *buf, int depth) { + if (depth == 2) { + for (int j = 0; j < 4; j += 4) { + for (int k = 0; k < 2; k += 2) { + __asm__ volatile ( + "movupd (%0), %%xmm0\n" + "movupd (%1), %%xmm1\n" + "movapd %%xmm0, %%xmm8\n" + "haddpd %%xmm8, %%xmm8\n" + "movapd %%xmm0, %%xmm9\n" + "hsubpd %%xmm9, %%xmm9\n" + "blendpd $1, %%xmm8, %%xmm9\n" + "movapd %%xmm9, %%xmm0\n" + "movapd %%xmm1, %%xmm8\n" + "haddpd %%xmm8, %%xmm8\n" + "movapd %%xmm1, %%xmm9\n" + "hsubpd %%xmm9, %%xmm9\n" + "blendpd $1, %%xmm8, %%xmm9\n" + "movapd %%xmm9, %%xmm1\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm9\n" + "addpd %%xmm1, %%xmm8\n" + "subpd %%xmm1, %%xmm9\n" + "movupd %%xmm8, (%0)\n" + "movupd %%xmm9, (%1)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 2) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } + if (depth == 5) { + helper_double_5_recursive(buf + 0, 2); + helper_double_5_recursive(buf + 4, 2); + helper_double_5_recursive(buf + 8, 2); + helper_double_5_recursive(buf + 12, 2); + helper_double_5_recursive(buf + 16, 2); + helper_double_5_recursive(buf + 20, 2); + helper_double_5_recursive(buf + 24, 2); + helper_double_5_recursive(buf + 28, 2); + for (int j = 0; j < 32; j += 32) { + for (int k = 0; k < 4; k += 2) { + __asm__ volatile ( + "movupd (%0), %%xmm0\n" + "movupd (%1), %%xmm1\n" + "movupd (%2), %%xmm2\n" + "movupd (%3), %%xmm3\n" + "movupd (%4), %%xmm4\n" + "movupd (%5), %%xmm5\n" + "movupd (%6), %%xmm6\n" + "movupd (%7), %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm9\n" + "addpd %%xmm1, %%xmm8\n" + "subpd %%xmm1, %%xmm9\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm11\n" + "addpd %%xmm3, %%xmm10\n" + "subpd %%xmm3, %%xmm11\n" + "movapd %%xmm4, %%xmm12\n" + "movapd %%xmm4, %%xmm13\n" + "addpd %%xmm5, %%xmm12\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm6, %%xmm14\n" + "movapd %%xmm6, %%xmm15\n" + "addpd %%xmm7, %%xmm14\n" + "subpd %%xmm7, %%xmm15\n" + "movapd %%xmm8, %%xmm0\n" + "movapd %%xmm8, %%xmm2\n" + "addpd %%xmm10, %%xmm0\n" + "subpd %%xmm10, %%xmm2\n" + "movapd %%xmm9, %%xmm1\n" + "movapd %%xmm9, %%xmm3\n" + "addpd %%xmm11, %%xmm1\n" + "subpd %%xmm11, %%xmm3\n" + "movapd %%xmm12, %%xmm4\n" + "movapd %%xmm12, %%xmm6\n" + "addpd %%xmm14, %%xmm4\n" + "subpd %%xmm14, 
%%xmm6\n" + "movapd %%xmm13, %%xmm5\n" + "movapd %%xmm13, %%xmm7\n" + "addpd %%xmm15, %%xmm5\n" + "subpd %%xmm15, %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm12\n" + "addpd %%xmm4, %%xmm8\n" + "subpd %%xmm4, %%xmm12\n" + "movapd %%xmm1, %%xmm9\n" + "movapd %%xmm1, %%xmm13\n" + "addpd %%xmm5, %%xmm9\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm14\n" + "addpd %%xmm6, %%xmm10\n" + "subpd %%xmm6, %%xmm14\n" + "movapd %%xmm3, %%xmm11\n" + "movapd %%xmm3, %%xmm15\n" + "addpd %%xmm7, %%xmm11\n" + "subpd %%xmm7, %%xmm15\n" + "movupd %%xmm8, (%0)\n" + "movupd %%xmm9, (%1)\n" + "movupd %%xmm10, (%2)\n" + "movupd %%xmm11, (%3)\n" + "movupd %%xmm12, (%4)\n" + "movupd %%xmm13, (%5)\n" + "movupd %%xmm14, (%6)\n" + "movupd %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } +} +void helper_double_5(double *buf); +void helper_double_5(double *buf) { + helper_double_5_recursive(buf, 5); +} +static inline void helper_double_6(double *buf); +static inline void helper_double_6(double *buf) { + for (int j = 0; j < 64; j += 16) { + for (int k = 0; k < 2; k += 2) { + __asm__ volatile ( + "movupd (%0), %%xmm0\n" + "movupd (%1), %%xmm1\n" + "movupd (%2), %%xmm2\n" + "movupd (%3), %%xmm3\n" + "movupd (%4), %%xmm4\n" + "movupd (%5), %%xmm5\n" + "movupd (%6), %%xmm6\n" + "movupd (%7), %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "haddpd %%xmm8, %%xmm8\n" + "movapd %%xmm0, %%xmm9\n" + "hsubpd %%xmm9, %%xmm9\n" + "blendpd $1, %%xmm8, %%xmm9\n" + "movapd %%xmm9, %%xmm0\n" + "movapd %%xmm1, %%xmm8\n" + "haddpd %%xmm8, %%xmm8\n" + "movapd %%xmm1, %%xmm9\n" + "hsubpd %%xmm9, %%xmm9\n" + "blendpd $1, %%xmm8, %%xmm9\n" + "movapd %%xmm9, %%xmm1\n" + "movapd %%xmm2, %%xmm8\n" + "haddpd %%xmm8, %%xmm8\n" + "movapd %%xmm2, %%xmm9\n" + "hsubpd %%xmm9, %%xmm9\n" + "blendpd $1, %%xmm8, %%xmm9\n" + "movapd %%xmm9, %%xmm2\n" + "movapd %%xmm3, %%xmm8\n" + "haddpd %%xmm8, %%xmm8\n" + "movapd %%xmm3, %%xmm9\n" + "hsubpd %%xmm9, %%xmm9\n" + "blendpd $1, %%xmm8, %%xmm9\n" + "movapd %%xmm9, %%xmm3\n" + "movapd %%xmm4, %%xmm8\n" + "haddpd %%xmm8, %%xmm8\n" + "movapd %%xmm4, %%xmm9\n" + "hsubpd %%xmm9, %%xmm9\n" + "blendpd $1, %%xmm8, %%xmm9\n" + "movapd %%xmm9, %%xmm4\n" + "movapd %%xmm5, %%xmm8\n" + "haddpd %%xmm8, %%xmm8\n" + "movapd %%xmm5, %%xmm9\n" + "hsubpd %%xmm9, %%xmm9\n" + "blendpd $1, %%xmm8, %%xmm9\n" + "movapd %%xmm9, %%xmm5\n" + "movapd %%xmm6, %%xmm8\n" + "haddpd %%xmm8, %%xmm8\n" + "movapd %%xmm6, %%xmm9\n" + "hsubpd %%xmm9, %%xmm9\n" + "blendpd $1, %%xmm8, %%xmm9\n" + "movapd %%xmm9, %%xmm6\n" + "movapd %%xmm7, %%xmm8\n" + "haddpd %%xmm8, %%xmm8\n" + "movapd %%xmm7, %%xmm9\n" + "hsubpd %%xmm9, %%xmm9\n" + "blendpd $1, %%xmm8, %%xmm9\n" + "movapd %%xmm9, %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm9\n" + "addpd %%xmm1, %%xmm8\n" + "subpd %%xmm1, %%xmm9\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm11\n" + "addpd %%xmm3, %%xmm10\n" + "subpd %%xmm3, %%xmm11\n" + "movapd %%xmm4, %%xmm12\n" + "movapd %%xmm4, %%xmm13\n" + "addpd %%xmm5, %%xmm12\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm6, %%xmm14\n" + "movapd %%xmm6, %%xmm15\n" + "addpd %%xmm7, %%xmm14\n" + "subpd %%xmm7, %%xmm15\n" + "movapd %%xmm8, %%xmm0\n" + "movapd 
%%xmm8, %%xmm2\n" + "addpd %%xmm10, %%xmm0\n" + "subpd %%xmm10, %%xmm2\n" + "movapd %%xmm9, %%xmm1\n" + "movapd %%xmm9, %%xmm3\n" + "addpd %%xmm11, %%xmm1\n" + "subpd %%xmm11, %%xmm3\n" + "movapd %%xmm12, %%xmm4\n" + "movapd %%xmm12, %%xmm6\n" + "addpd %%xmm14, %%xmm4\n" + "subpd %%xmm14, %%xmm6\n" + "movapd %%xmm13, %%xmm5\n" + "movapd %%xmm13, %%xmm7\n" + "addpd %%xmm15, %%xmm5\n" + "subpd %%xmm15, %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm12\n" + "addpd %%xmm4, %%xmm8\n" + "subpd %%xmm4, %%xmm12\n" + "movapd %%xmm1, %%xmm9\n" + "movapd %%xmm1, %%xmm13\n" + "addpd %%xmm5, %%xmm9\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm14\n" + "addpd %%xmm6, %%xmm10\n" + "subpd %%xmm6, %%xmm14\n" + "movapd %%xmm3, %%xmm11\n" + "movapd %%xmm3, %%xmm15\n" + "addpd %%xmm7, %%xmm11\n" + "subpd %%xmm7, %%xmm15\n" + "movupd %%xmm8, (%0)\n" + "movupd %%xmm9, (%1)\n" + "movupd %%xmm10, (%2)\n" + "movupd %%xmm11, (%3)\n" + "movupd %%xmm12, (%4)\n" + "movupd %%xmm13, (%5)\n" + "movupd %%xmm14, (%6)\n" + "movupd %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 2), "r"(buf + j + k + 4), "r"(buf + j + k + 6), "r"(buf + j + k + 8), "r"(buf + j + k + 10), "r"(buf + j + k + 12), "r"(buf + j + k + 14) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + for (int j = 0; j < 64; j += 64) { + for (int k = 0; k < 16; k += 2) { + __asm__ volatile ( + "movupd (%0), %%xmm0\n" + "movupd (%1), %%xmm1\n" + "movupd (%2), %%xmm2\n" + "movupd (%3), %%xmm3\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm9\n" + "addpd %%xmm1, %%xmm8\n" + "subpd %%xmm1, %%xmm9\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm11\n" + "addpd %%xmm3, %%xmm10\n" + "subpd %%xmm3, %%xmm11\n" + "movapd %%xmm8, %%xmm0\n" + "movapd %%xmm8, %%xmm2\n" + "addpd %%xmm10, %%xmm0\n" + "subpd %%xmm10, %%xmm2\n" + "movapd %%xmm9, %%xmm1\n" + "movapd %%xmm9, %%xmm3\n" + "addpd %%xmm11, %%xmm1\n" + "subpd %%xmm11, %%xmm3\n" + "movupd %%xmm0, (%0)\n" + "movupd %%xmm1, (%1)\n" + "movupd %%xmm2, (%2)\n" + "movupd %%xmm3, (%3)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 16), "r"(buf + j + k + 32), "r"(buf + j + k + 48) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } +} +static inline void helper_double_7(double *buf); +static inline void helper_double_7(double *buf) { + for (int j = 0; j < 128; j += 16) { + for (int k = 0; k < 2; k += 2) { + __asm__ volatile ( + "movupd (%0), %%xmm0\n" + "movupd (%1), %%xmm1\n" + "movupd (%2), %%xmm2\n" + "movupd (%3), %%xmm3\n" + "movupd (%4), %%xmm4\n" + "movupd (%5), %%xmm5\n" + "movupd (%6), %%xmm6\n" + "movupd (%7), %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "haddpd %%xmm8, %%xmm8\n" + "movapd %%xmm0, %%xmm9\n" + "hsubpd %%xmm9, %%xmm9\n" + "blendpd $1, %%xmm8, %%xmm9\n" + "movapd %%xmm9, %%xmm0\n" + "movapd %%xmm1, %%xmm8\n" + "haddpd %%xmm8, %%xmm8\n" + "movapd %%xmm1, %%xmm9\n" + "hsubpd %%xmm9, %%xmm9\n" + "blendpd $1, %%xmm8, %%xmm9\n" + "movapd %%xmm9, %%xmm1\n" + "movapd %%xmm2, %%xmm8\n" + "haddpd %%xmm8, %%xmm8\n" + "movapd %%xmm2, %%xmm9\n" + "hsubpd %%xmm9, %%xmm9\n" + "blendpd $1, %%xmm8, %%xmm9\n" + "movapd %%xmm9, %%xmm2\n" + "movapd %%xmm3, %%xmm8\n" + "haddpd %%xmm8, %%xmm8\n" + "movapd %%xmm3, %%xmm9\n" + "hsubpd %%xmm9, %%xmm9\n" + "blendpd $1, %%xmm8, %%xmm9\n" + "movapd %%xmm9, %%xmm3\n" 
+ "movapd %%xmm4, %%xmm8\n" + "haddpd %%xmm8, %%xmm8\n" + "movapd %%xmm4, %%xmm9\n" + "hsubpd %%xmm9, %%xmm9\n" + "blendpd $1, %%xmm8, %%xmm9\n" + "movapd %%xmm9, %%xmm4\n" + "movapd %%xmm5, %%xmm8\n" + "haddpd %%xmm8, %%xmm8\n" + "movapd %%xmm5, %%xmm9\n" + "hsubpd %%xmm9, %%xmm9\n" + "blendpd $1, %%xmm8, %%xmm9\n" + "movapd %%xmm9, %%xmm5\n" + "movapd %%xmm6, %%xmm8\n" + "haddpd %%xmm8, %%xmm8\n" + "movapd %%xmm6, %%xmm9\n" + "hsubpd %%xmm9, %%xmm9\n" + "blendpd $1, %%xmm8, %%xmm9\n" + "movapd %%xmm9, %%xmm6\n" + "movapd %%xmm7, %%xmm8\n" + "haddpd %%xmm8, %%xmm8\n" + "movapd %%xmm7, %%xmm9\n" + "hsubpd %%xmm9, %%xmm9\n" + "blendpd $1, %%xmm8, %%xmm9\n" + "movapd %%xmm9, %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm9\n" + "addpd %%xmm1, %%xmm8\n" + "subpd %%xmm1, %%xmm9\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm11\n" + "addpd %%xmm3, %%xmm10\n" + "subpd %%xmm3, %%xmm11\n" + "movapd %%xmm4, %%xmm12\n" + "movapd %%xmm4, %%xmm13\n" + "addpd %%xmm5, %%xmm12\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm6, %%xmm14\n" + "movapd %%xmm6, %%xmm15\n" + "addpd %%xmm7, %%xmm14\n" + "subpd %%xmm7, %%xmm15\n" + "movapd %%xmm8, %%xmm0\n" + "movapd %%xmm8, %%xmm2\n" + "addpd %%xmm10, %%xmm0\n" + "subpd %%xmm10, %%xmm2\n" + "movapd %%xmm9, %%xmm1\n" + "movapd %%xmm9, %%xmm3\n" + "addpd %%xmm11, %%xmm1\n" + "subpd %%xmm11, %%xmm3\n" + "movapd %%xmm12, %%xmm4\n" + "movapd %%xmm12, %%xmm6\n" + "addpd %%xmm14, %%xmm4\n" + "subpd %%xmm14, %%xmm6\n" + "movapd %%xmm13, %%xmm5\n" + "movapd %%xmm13, %%xmm7\n" + "addpd %%xmm15, %%xmm5\n" + "subpd %%xmm15, %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm12\n" + "addpd %%xmm4, %%xmm8\n" + "subpd %%xmm4, %%xmm12\n" + "movapd %%xmm1, %%xmm9\n" + "movapd %%xmm1, %%xmm13\n" + "addpd %%xmm5, %%xmm9\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm14\n" + "addpd %%xmm6, %%xmm10\n" + "subpd %%xmm6, %%xmm14\n" + "movapd %%xmm3, %%xmm11\n" + "movapd %%xmm3, %%xmm15\n" + "addpd %%xmm7, %%xmm11\n" + "subpd %%xmm7, %%xmm15\n" + "movupd %%xmm8, (%0)\n" + "movupd %%xmm9, (%1)\n" + "movupd %%xmm10, (%2)\n" + "movupd %%xmm11, (%3)\n" + "movupd %%xmm12, (%4)\n" + "movupd %%xmm13, (%5)\n" + "movupd %%xmm14, (%6)\n" + "movupd %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 2), "r"(buf + j + k + 4), "r"(buf + j + k + 6), "r"(buf + j + k + 8), "r"(buf + j + k + 10), "r"(buf + j + k + 12), "r"(buf + j + k + 14) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + for (int j = 0; j < 128; j += 128) { + for (int k = 0; k < 16; k += 2) { + __asm__ volatile ( + "movupd (%0), %%xmm0\n" + "movupd (%1), %%xmm1\n" + "movupd (%2), %%xmm2\n" + "movupd (%3), %%xmm3\n" + "movupd (%4), %%xmm4\n" + "movupd (%5), %%xmm5\n" + "movupd (%6), %%xmm6\n" + "movupd (%7), %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm9\n" + "addpd %%xmm1, %%xmm8\n" + "subpd %%xmm1, %%xmm9\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm11\n" + "addpd %%xmm3, %%xmm10\n" + "subpd %%xmm3, %%xmm11\n" + "movapd %%xmm4, %%xmm12\n" + "movapd %%xmm4, %%xmm13\n" + "addpd %%xmm5, %%xmm12\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm6, %%xmm14\n" + "movapd %%xmm6, %%xmm15\n" + "addpd %%xmm7, %%xmm14\n" + "subpd %%xmm7, %%xmm15\n" + "movapd %%xmm8, %%xmm0\n" + "movapd %%xmm8, %%xmm2\n" + "addpd %%xmm10, %%xmm0\n" + "subpd %%xmm10, %%xmm2\n" + "movapd %%xmm9, %%xmm1\n" + "movapd %%xmm9, %%xmm3\n" + "addpd 
%%xmm11, %%xmm1\n" + "subpd %%xmm11, %%xmm3\n" + "movapd %%xmm12, %%xmm4\n" + "movapd %%xmm12, %%xmm6\n" + "addpd %%xmm14, %%xmm4\n" + "subpd %%xmm14, %%xmm6\n" + "movapd %%xmm13, %%xmm5\n" + "movapd %%xmm13, %%xmm7\n" + "addpd %%xmm15, %%xmm5\n" + "subpd %%xmm15, %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm12\n" + "addpd %%xmm4, %%xmm8\n" + "subpd %%xmm4, %%xmm12\n" + "movapd %%xmm1, %%xmm9\n" + "movapd %%xmm1, %%xmm13\n" + "addpd %%xmm5, %%xmm9\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm14\n" + "addpd %%xmm6, %%xmm10\n" + "subpd %%xmm6, %%xmm14\n" + "movapd %%xmm3, %%xmm11\n" + "movapd %%xmm3, %%xmm15\n" + "addpd %%xmm7, %%xmm11\n" + "subpd %%xmm7, %%xmm15\n" + "movupd %%xmm8, (%0)\n" + "movupd %%xmm9, (%1)\n" + "movupd %%xmm10, (%2)\n" + "movupd %%xmm11, (%3)\n" + "movupd %%xmm12, (%4)\n" + "movupd %%xmm13, (%5)\n" + "movupd %%xmm14, (%6)\n" + "movupd %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 16), "r"(buf + j + k + 32), "r"(buf + j + k + 48), "r"(buf + j + k + 64), "r"(buf + j + k + 80), "r"(buf + j + k + 96), "r"(buf + j + k + 112) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } +} +void helper_double_8_recursive(double *buf, int depth); +void helper_double_8_recursive(double *buf, int depth) { + if (depth == 2) { + for (int j = 0; j < 4; j += 4) { + for (int k = 0; k < 2; k += 2) { + __asm__ volatile ( + "movupd (%0), %%xmm0\n" + "movupd (%1), %%xmm1\n" + "movapd %%xmm0, %%xmm8\n" + "haddpd %%xmm8, %%xmm8\n" + "movapd %%xmm0, %%xmm9\n" + "hsubpd %%xmm9, %%xmm9\n" + "blendpd $1, %%xmm8, %%xmm9\n" + "movapd %%xmm9, %%xmm0\n" + "movapd %%xmm1, %%xmm8\n" + "haddpd %%xmm8, %%xmm8\n" + "movapd %%xmm1, %%xmm9\n" + "hsubpd %%xmm9, %%xmm9\n" + "blendpd $1, %%xmm8, %%xmm9\n" + "movapd %%xmm9, %%xmm1\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm9\n" + "addpd %%xmm1, %%xmm8\n" + "subpd %%xmm1, %%xmm9\n" + "movupd %%xmm8, (%0)\n" + "movupd %%xmm9, (%1)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 2) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } + if (depth == 5) { + helper_double_8_recursive(buf + 0, 2); + helper_double_8_recursive(buf + 4, 2); + helper_double_8_recursive(buf + 8, 2); + helper_double_8_recursive(buf + 12, 2); + helper_double_8_recursive(buf + 16, 2); + helper_double_8_recursive(buf + 20, 2); + helper_double_8_recursive(buf + 24, 2); + helper_double_8_recursive(buf + 28, 2); + for (int j = 0; j < 32; j += 32) { + for (int k = 0; k < 4; k += 2) { + __asm__ volatile ( + "movupd (%0), %%xmm0\n" + "movupd (%1), %%xmm1\n" + "movupd (%2), %%xmm2\n" + "movupd (%3), %%xmm3\n" + "movupd (%4), %%xmm4\n" + "movupd (%5), %%xmm5\n" + "movupd (%6), %%xmm6\n" + "movupd (%7), %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm9\n" + "addpd %%xmm1, %%xmm8\n" + "subpd %%xmm1, %%xmm9\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm11\n" + "addpd %%xmm3, %%xmm10\n" + "subpd %%xmm3, %%xmm11\n" + "movapd %%xmm4, %%xmm12\n" + "movapd %%xmm4, %%xmm13\n" + "addpd %%xmm5, %%xmm12\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm6, %%xmm14\n" + "movapd %%xmm6, %%xmm15\n" + "addpd %%xmm7, %%xmm14\n" + "subpd %%xmm7, %%xmm15\n" + "movapd %%xmm8, %%xmm0\n" + "movapd %%xmm8, %%xmm2\n" + "addpd %%xmm10, %%xmm0\n" + "subpd %%xmm10, %%xmm2\n" + 
"movapd %%xmm9, %%xmm1\n" + "movapd %%xmm9, %%xmm3\n" + "addpd %%xmm11, %%xmm1\n" + "subpd %%xmm11, %%xmm3\n" + "movapd %%xmm12, %%xmm4\n" + "movapd %%xmm12, %%xmm6\n" + "addpd %%xmm14, %%xmm4\n" + "subpd %%xmm14, %%xmm6\n" + "movapd %%xmm13, %%xmm5\n" + "movapd %%xmm13, %%xmm7\n" + "addpd %%xmm15, %%xmm5\n" + "subpd %%xmm15, %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm12\n" + "addpd %%xmm4, %%xmm8\n" + "subpd %%xmm4, %%xmm12\n" + "movapd %%xmm1, %%xmm9\n" + "movapd %%xmm1, %%xmm13\n" + "addpd %%xmm5, %%xmm9\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm14\n" + "addpd %%xmm6, %%xmm10\n" + "subpd %%xmm6, %%xmm14\n" + "movapd %%xmm3, %%xmm11\n" + "movapd %%xmm3, %%xmm15\n" + "addpd %%xmm7, %%xmm11\n" + "subpd %%xmm7, %%xmm15\n" + "movupd %%xmm8, (%0)\n" + "movupd %%xmm9, (%1)\n" + "movupd %%xmm10, (%2)\n" + "movupd %%xmm11, (%3)\n" + "movupd %%xmm12, (%4)\n" + "movupd %%xmm13, (%5)\n" + "movupd %%xmm14, (%6)\n" + "movupd %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } + if (depth == 8) { + helper_double_8_recursive(buf + 0, 5); + helper_double_8_recursive(buf + 32, 5); + helper_double_8_recursive(buf + 64, 5); + helper_double_8_recursive(buf + 96, 5); + helper_double_8_recursive(buf + 128, 5); + helper_double_8_recursive(buf + 160, 5); + helper_double_8_recursive(buf + 192, 5); + helper_double_8_recursive(buf + 224, 5); + for (int j = 0; j < 256; j += 256) { + for (int k = 0; k < 32; k += 2) { + __asm__ volatile ( + "movupd (%0), %%xmm0\n" + "movupd (%1), %%xmm1\n" + "movupd (%2), %%xmm2\n" + "movupd (%3), %%xmm3\n" + "movupd (%4), %%xmm4\n" + "movupd (%5), %%xmm5\n" + "movupd (%6), %%xmm6\n" + "movupd (%7), %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm9\n" + "addpd %%xmm1, %%xmm8\n" + "subpd %%xmm1, %%xmm9\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm11\n" + "addpd %%xmm3, %%xmm10\n" + "subpd %%xmm3, %%xmm11\n" + "movapd %%xmm4, %%xmm12\n" + "movapd %%xmm4, %%xmm13\n" + "addpd %%xmm5, %%xmm12\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm6, %%xmm14\n" + "movapd %%xmm6, %%xmm15\n" + "addpd %%xmm7, %%xmm14\n" + "subpd %%xmm7, %%xmm15\n" + "movapd %%xmm8, %%xmm0\n" + "movapd %%xmm8, %%xmm2\n" + "addpd %%xmm10, %%xmm0\n" + "subpd %%xmm10, %%xmm2\n" + "movapd %%xmm9, %%xmm1\n" + "movapd %%xmm9, %%xmm3\n" + "addpd %%xmm11, %%xmm1\n" + "subpd %%xmm11, %%xmm3\n" + "movapd %%xmm12, %%xmm4\n" + "movapd %%xmm12, %%xmm6\n" + "addpd %%xmm14, %%xmm4\n" + "subpd %%xmm14, %%xmm6\n" + "movapd %%xmm13, %%xmm5\n" + "movapd %%xmm13, %%xmm7\n" + "addpd %%xmm15, %%xmm5\n" + "subpd %%xmm15, %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm12\n" + "addpd %%xmm4, %%xmm8\n" + "subpd %%xmm4, %%xmm12\n" + "movapd %%xmm1, %%xmm9\n" + "movapd %%xmm1, %%xmm13\n" + "addpd %%xmm5, %%xmm9\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm14\n" + "addpd %%xmm6, %%xmm10\n" + "subpd %%xmm6, %%xmm14\n" + "movapd %%xmm3, %%xmm11\n" + "movapd %%xmm3, %%xmm15\n" + "addpd %%xmm7, %%xmm11\n" + "subpd %%xmm7, %%xmm15\n" + "movupd %%xmm8, (%0)\n" + "movupd %%xmm9, (%1)\n" + "movupd %%xmm10, (%2)\n" + "movupd %%xmm11, (%3)\n" + "movupd %%xmm12, (%4)\n" + "movupd %%xmm13, 
(%5)\n" + "movupd %%xmm14, (%6)\n" + "movupd %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96), "r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } +} +void helper_double_8(double *buf); +void helper_double_8(double *buf) { + helper_double_8_recursive(buf, 8); +} +void helper_double_9_recursive(double *buf, int depth); +void helper_double_9_recursive(double *buf, int depth) { + if (depth == 6) { + for (int j = 0; j < 64; j += 16) { + for (int k = 0; k < 2; k += 2) { + __asm__ volatile ( + "movupd (%0), %%xmm0\n" + "movupd (%1), %%xmm1\n" + "movupd (%2), %%xmm2\n" + "movupd (%3), %%xmm3\n" + "movupd (%4), %%xmm4\n" + "movupd (%5), %%xmm5\n" + "movupd (%6), %%xmm6\n" + "movupd (%7), %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "haddpd %%xmm8, %%xmm8\n" + "movapd %%xmm0, %%xmm9\n" + "hsubpd %%xmm9, %%xmm9\n" + "blendpd $1, %%xmm8, %%xmm9\n" + "movapd %%xmm9, %%xmm0\n" + "movapd %%xmm1, %%xmm8\n" + "haddpd %%xmm8, %%xmm8\n" + "movapd %%xmm1, %%xmm9\n" + "hsubpd %%xmm9, %%xmm9\n" + "blendpd $1, %%xmm8, %%xmm9\n" + "movapd %%xmm9, %%xmm1\n" + "movapd %%xmm2, %%xmm8\n" + "haddpd %%xmm8, %%xmm8\n" + "movapd %%xmm2, %%xmm9\n" + "hsubpd %%xmm9, %%xmm9\n" + "blendpd $1, %%xmm8, %%xmm9\n" + "movapd %%xmm9, %%xmm2\n" + "movapd %%xmm3, %%xmm8\n" + "haddpd %%xmm8, %%xmm8\n" + "movapd %%xmm3, %%xmm9\n" + "hsubpd %%xmm9, %%xmm9\n" + "blendpd $1, %%xmm8, %%xmm9\n" + "movapd %%xmm9, %%xmm3\n" + "movapd %%xmm4, %%xmm8\n" + "haddpd %%xmm8, %%xmm8\n" + "movapd %%xmm4, %%xmm9\n" + "hsubpd %%xmm9, %%xmm9\n" + "blendpd $1, %%xmm8, %%xmm9\n" + "movapd %%xmm9, %%xmm4\n" + "movapd %%xmm5, %%xmm8\n" + "haddpd %%xmm8, %%xmm8\n" + "movapd %%xmm5, %%xmm9\n" + "hsubpd %%xmm9, %%xmm9\n" + "blendpd $1, %%xmm8, %%xmm9\n" + "movapd %%xmm9, %%xmm5\n" + "movapd %%xmm6, %%xmm8\n" + "haddpd %%xmm8, %%xmm8\n" + "movapd %%xmm6, %%xmm9\n" + "hsubpd %%xmm9, %%xmm9\n" + "blendpd $1, %%xmm8, %%xmm9\n" + "movapd %%xmm9, %%xmm6\n" + "movapd %%xmm7, %%xmm8\n" + "haddpd %%xmm8, %%xmm8\n" + "movapd %%xmm7, %%xmm9\n" + "hsubpd %%xmm9, %%xmm9\n" + "blendpd $1, %%xmm8, %%xmm9\n" + "movapd %%xmm9, %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm9\n" + "addpd %%xmm1, %%xmm8\n" + "subpd %%xmm1, %%xmm9\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm11\n" + "addpd %%xmm3, %%xmm10\n" + "subpd %%xmm3, %%xmm11\n" + "movapd %%xmm4, %%xmm12\n" + "movapd %%xmm4, %%xmm13\n" + "addpd %%xmm5, %%xmm12\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm6, %%xmm14\n" + "movapd %%xmm6, %%xmm15\n" + "addpd %%xmm7, %%xmm14\n" + "subpd %%xmm7, %%xmm15\n" + "movapd %%xmm8, %%xmm0\n" + "movapd %%xmm8, %%xmm2\n" + "addpd %%xmm10, %%xmm0\n" + "subpd %%xmm10, %%xmm2\n" + "movapd %%xmm9, %%xmm1\n" + "movapd %%xmm9, %%xmm3\n" + "addpd %%xmm11, %%xmm1\n" + "subpd %%xmm11, %%xmm3\n" + "movapd %%xmm12, %%xmm4\n" + "movapd %%xmm12, %%xmm6\n" + "addpd %%xmm14, %%xmm4\n" + "subpd %%xmm14, %%xmm6\n" + "movapd %%xmm13, %%xmm5\n" + "movapd %%xmm13, %%xmm7\n" + "addpd %%xmm15, %%xmm5\n" + "subpd %%xmm15, %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm12\n" + "addpd %%xmm4, %%xmm8\n" + "subpd %%xmm4, %%xmm12\n" + "movapd %%xmm1, %%xmm9\n" + "movapd %%xmm1, %%xmm13\n" + "addpd %%xmm5, %%xmm9\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, 
%%xmm14\n" + "addpd %%xmm6, %%xmm10\n" + "subpd %%xmm6, %%xmm14\n" + "movapd %%xmm3, %%xmm11\n" + "movapd %%xmm3, %%xmm15\n" + "addpd %%xmm7, %%xmm11\n" + "subpd %%xmm7, %%xmm15\n" + "movupd %%xmm8, (%0)\n" + "movupd %%xmm9, (%1)\n" + "movupd %%xmm10, (%2)\n" + "movupd %%xmm11, (%3)\n" + "movupd %%xmm12, (%4)\n" + "movupd %%xmm13, (%5)\n" + "movupd %%xmm14, (%6)\n" + "movupd %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 2), "r"(buf + j + k + 4), "r"(buf + j + k + 6), "r"(buf + j + k + 8), "r"(buf + j + k + 10), "r"(buf + j + k + 12), "r"(buf + j + k + 14) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + for (int j = 0; j < 64; j += 64) { + for (int k = 0; k < 16; k += 2) { + __asm__ volatile ( + "movupd (%0), %%xmm0\n" + "movupd (%1), %%xmm1\n" + "movupd (%2), %%xmm2\n" + "movupd (%3), %%xmm3\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm9\n" + "addpd %%xmm1, %%xmm8\n" + "subpd %%xmm1, %%xmm9\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm11\n" + "addpd %%xmm3, %%xmm10\n" + "subpd %%xmm3, %%xmm11\n" + "movapd %%xmm8, %%xmm0\n" + "movapd %%xmm8, %%xmm2\n" + "addpd %%xmm10, %%xmm0\n" + "subpd %%xmm10, %%xmm2\n" + "movapd %%xmm9, %%xmm1\n" + "movapd %%xmm9, %%xmm3\n" + "addpd %%xmm11, %%xmm1\n" + "subpd %%xmm11, %%xmm3\n" + "movupd %%xmm0, (%0)\n" + "movupd %%xmm1, (%1)\n" + "movupd %%xmm2, (%2)\n" + "movupd %%xmm3, (%3)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 16), "r"(buf + j + k + 32), "r"(buf + j + k + 48) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } + if (depth == 9) { + helper_double_9_recursive(buf + 0, 6); + helper_double_9_recursive(buf + 64, 6); + helper_double_9_recursive(buf + 128, 6); + helper_double_9_recursive(buf + 192, 6); + helper_double_9_recursive(buf + 256, 6); + helper_double_9_recursive(buf + 320, 6); + helper_double_9_recursive(buf + 384, 6); + helper_double_9_recursive(buf + 448, 6); + for (int j = 0; j < 512; j += 512) { + for (int k = 0; k < 64; k += 2) { + __asm__ volatile ( + "movupd (%0), %%xmm0\n" + "movupd (%1), %%xmm1\n" + "movupd (%2), %%xmm2\n" + "movupd (%3), %%xmm3\n" + "movupd (%4), %%xmm4\n" + "movupd (%5), %%xmm5\n" + "movupd (%6), %%xmm6\n" + "movupd (%7), %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm9\n" + "addpd %%xmm1, %%xmm8\n" + "subpd %%xmm1, %%xmm9\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm11\n" + "addpd %%xmm3, %%xmm10\n" + "subpd %%xmm3, %%xmm11\n" + "movapd %%xmm4, %%xmm12\n" + "movapd %%xmm4, %%xmm13\n" + "addpd %%xmm5, %%xmm12\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm6, %%xmm14\n" + "movapd %%xmm6, %%xmm15\n" + "addpd %%xmm7, %%xmm14\n" + "subpd %%xmm7, %%xmm15\n" + "movapd %%xmm8, %%xmm0\n" + "movapd %%xmm8, %%xmm2\n" + "addpd %%xmm10, %%xmm0\n" + "subpd %%xmm10, %%xmm2\n" + "movapd %%xmm9, %%xmm1\n" + "movapd %%xmm9, %%xmm3\n" + "addpd %%xmm11, %%xmm1\n" + "subpd %%xmm11, %%xmm3\n" + "movapd %%xmm12, %%xmm4\n" + "movapd %%xmm12, %%xmm6\n" + "addpd %%xmm14, %%xmm4\n" + "subpd %%xmm14, %%xmm6\n" + "movapd %%xmm13, %%xmm5\n" + "movapd %%xmm13, %%xmm7\n" + "addpd %%xmm15, %%xmm5\n" + "subpd %%xmm15, %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm12\n" + "addpd %%xmm4, %%xmm8\n" + "subpd %%xmm4, %%xmm12\n" + "movapd %%xmm1, %%xmm9\n" + "movapd %%xmm1, %%xmm13\n" + "addpd %%xmm5, %%xmm9\n" + 
"subpd %%xmm5, %%xmm13\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm14\n" + "addpd %%xmm6, %%xmm10\n" + "subpd %%xmm6, %%xmm14\n" + "movapd %%xmm3, %%xmm11\n" + "movapd %%xmm3, %%xmm15\n" + "addpd %%xmm7, %%xmm11\n" + "subpd %%xmm7, %%xmm15\n" + "movupd %%xmm8, (%0)\n" + "movupd %%xmm9, (%1)\n" + "movupd %%xmm10, (%2)\n" + "movupd %%xmm11, (%3)\n" + "movupd %%xmm12, (%4)\n" + "movupd %%xmm13, (%5)\n" + "movupd %%xmm14, (%6)\n" + "movupd %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 64), "r"(buf + j + k + 128), "r"(buf + j + k + 192), "r"(buf + j + k + 256), "r"(buf + j + k + 320), "r"(buf + j + k + 384), "r"(buf + j + k + 448) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } +} +void helper_double_9(double *buf); +void helper_double_9(double *buf) { + helper_double_9_recursive(buf, 9); +} +void helper_double_10_recursive(double *buf, int depth); +void helper_double_10_recursive(double *buf, int depth) { + if (depth == 10) { + for (int j = 0; j < 1024; j += 16) { + for (int k = 0; k < 2; k += 2) { + __asm__ volatile ( + "movupd (%0), %%xmm0\n" + "movupd (%1), %%xmm1\n" + "movupd (%2), %%xmm2\n" + "movupd (%3), %%xmm3\n" + "movupd (%4), %%xmm4\n" + "movupd (%5), %%xmm5\n" + "movupd (%6), %%xmm6\n" + "movupd (%7), %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "haddpd %%xmm8, %%xmm8\n" + "movapd %%xmm0, %%xmm9\n" + "hsubpd %%xmm9, %%xmm9\n" + "blendpd $1, %%xmm8, %%xmm9\n" + "movapd %%xmm9, %%xmm0\n" + "movapd %%xmm1, %%xmm8\n" + "haddpd %%xmm8, %%xmm8\n" + "movapd %%xmm1, %%xmm9\n" + "hsubpd %%xmm9, %%xmm9\n" + "blendpd $1, %%xmm8, %%xmm9\n" + "movapd %%xmm9, %%xmm1\n" + "movapd %%xmm2, %%xmm8\n" + "haddpd %%xmm8, %%xmm8\n" + "movapd %%xmm2, %%xmm9\n" + "hsubpd %%xmm9, %%xmm9\n" + "blendpd $1, %%xmm8, %%xmm9\n" + "movapd %%xmm9, %%xmm2\n" + "movapd %%xmm3, %%xmm8\n" + "haddpd %%xmm8, %%xmm8\n" + "movapd %%xmm3, %%xmm9\n" + "hsubpd %%xmm9, %%xmm9\n" + "blendpd $1, %%xmm8, %%xmm9\n" + "movapd %%xmm9, %%xmm3\n" + "movapd %%xmm4, %%xmm8\n" + "haddpd %%xmm8, %%xmm8\n" + "movapd %%xmm4, %%xmm9\n" + "hsubpd %%xmm9, %%xmm9\n" + "blendpd $1, %%xmm8, %%xmm9\n" + "movapd %%xmm9, %%xmm4\n" + "movapd %%xmm5, %%xmm8\n" + "haddpd %%xmm8, %%xmm8\n" + "movapd %%xmm5, %%xmm9\n" + "hsubpd %%xmm9, %%xmm9\n" + "blendpd $1, %%xmm8, %%xmm9\n" + "movapd %%xmm9, %%xmm5\n" + "movapd %%xmm6, %%xmm8\n" + "haddpd %%xmm8, %%xmm8\n" + "movapd %%xmm6, %%xmm9\n" + "hsubpd %%xmm9, %%xmm9\n" + "blendpd $1, %%xmm8, %%xmm9\n" + "movapd %%xmm9, %%xmm6\n" + "movapd %%xmm7, %%xmm8\n" + "haddpd %%xmm8, %%xmm8\n" + "movapd %%xmm7, %%xmm9\n" + "hsubpd %%xmm9, %%xmm9\n" + "blendpd $1, %%xmm8, %%xmm9\n" + "movapd %%xmm9, %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm9\n" + "addpd %%xmm1, %%xmm8\n" + "subpd %%xmm1, %%xmm9\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm11\n" + "addpd %%xmm3, %%xmm10\n" + "subpd %%xmm3, %%xmm11\n" + "movapd %%xmm4, %%xmm12\n" + "movapd %%xmm4, %%xmm13\n" + "addpd %%xmm5, %%xmm12\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm6, %%xmm14\n" + "movapd %%xmm6, %%xmm15\n" + "addpd %%xmm7, %%xmm14\n" + "subpd %%xmm7, %%xmm15\n" + "movapd %%xmm8, %%xmm0\n" + "movapd %%xmm8, %%xmm2\n" + "addpd %%xmm10, %%xmm0\n" + "subpd %%xmm10, %%xmm2\n" + "movapd %%xmm9, %%xmm1\n" + "movapd %%xmm9, %%xmm3\n" + "addpd %%xmm11, %%xmm1\n" + "subpd %%xmm11, %%xmm3\n" + "movapd %%xmm12, %%xmm4\n" + "movapd %%xmm12, %%xmm6\n" + "addpd %%xmm14, %%xmm4\n" + 
"subpd %%xmm14, %%xmm6\n" + "movapd %%xmm13, %%xmm5\n" + "movapd %%xmm13, %%xmm7\n" + "addpd %%xmm15, %%xmm5\n" + "subpd %%xmm15, %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm12\n" + "addpd %%xmm4, %%xmm8\n" + "subpd %%xmm4, %%xmm12\n" + "movapd %%xmm1, %%xmm9\n" + "movapd %%xmm1, %%xmm13\n" + "addpd %%xmm5, %%xmm9\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm14\n" + "addpd %%xmm6, %%xmm10\n" + "subpd %%xmm6, %%xmm14\n" + "movapd %%xmm3, %%xmm11\n" + "movapd %%xmm3, %%xmm15\n" + "addpd %%xmm7, %%xmm11\n" + "subpd %%xmm7, %%xmm15\n" + "movupd %%xmm8, (%0)\n" + "movupd %%xmm9, (%1)\n" + "movupd %%xmm10, (%2)\n" + "movupd %%xmm11, (%3)\n" + "movupd %%xmm12, (%4)\n" + "movupd %%xmm13, (%5)\n" + "movupd %%xmm14, (%6)\n" + "movupd %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 2), "r"(buf + j + k + 4), "r"(buf + j + k + 6), "r"(buf + j + k + 8), "r"(buf + j + k + 10), "r"(buf + j + k + 12), "r"(buf + j + k + 14) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + for (int j = 0; j < 1024; j += 128) { + for (int k = 0; k < 16; k += 2) { + __asm__ volatile ( + "movupd (%0), %%xmm0\n" + "movupd (%1), %%xmm1\n" + "movupd (%2), %%xmm2\n" + "movupd (%3), %%xmm3\n" + "movupd (%4), %%xmm4\n" + "movupd (%5), %%xmm5\n" + "movupd (%6), %%xmm6\n" + "movupd (%7), %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm9\n" + "addpd %%xmm1, %%xmm8\n" + "subpd %%xmm1, %%xmm9\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm11\n" + "addpd %%xmm3, %%xmm10\n" + "subpd %%xmm3, %%xmm11\n" + "movapd %%xmm4, %%xmm12\n" + "movapd %%xmm4, %%xmm13\n" + "addpd %%xmm5, %%xmm12\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm6, %%xmm14\n" + "movapd %%xmm6, %%xmm15\n" + "addpd %%xmm7, %%xmm14\n" + "subpd %%xmm7, %%xmm15\n" + "movapd %%xmm8, %%xmm0\n" + "movapd %%xmm8, %%xmm2\n" + "addpd %%xmm10, %%xmm0\n" + "subpd %%xmm10, %%xmm2\n" + "movapd %%xmm9, %%xmm1\n" + "movapd %%xmm9, %%xmm3\n" + "addpd %%xmm11, %%xmm1\n" + "subpd %%xmm11, %%xmm3\n" + "movapd %%xmm12, %%xmm4\n" + "movapd %%xmm12, %%xmm6\n" + "addpd %%xmm14, %%xmm4\n" + "subpd %%xmm14, %%xmm6\n" + "movapd %%xmm13, %%xmm5\n" + "movapd %%xmm13, %%xmm7\n" + "addpd %%xmm15, %%xmm5\n" + "subpd %%xmm15, %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm12\n" + "addpd %%xmm4, %%xmm8\n" + "subpd %%xmm4, %%xmm12\n" + "movapd %%xmm1, %%xmm9\n" + "movapd %%xmm1, %%xmm13\n" + "addpd %%xmm5, %%xmm9\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm14\n" + "addpd %%xmm6, %%xmm10\n" + "subpd %%xmm6, %%xmm14\n" + "movapd %%xmm3, %%xmm11\n" + "movapd %%xmm3, %%xmm15\n" + "addpd %%xmm7, %%xmm11\n" + "subpd %%xmm7, %%xmm15\n" + "movupd %%xmm8, (%0)\n" + "movupd %%xmm9, (%1)\n" + "movupd %%xmm10, (%2)\n" + "movupd %%xmm11, (%3)\n" + "movupd %%xmm12, (%4)\n" + "movupd %%xmm13, (%5)\n" + "movupd %%xmm14, (%6)\n" + "movupd %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 16), "r"(buf + j + k + 32), "r"(buf + j + k + 48), "r"(buf + j + k + 64), "r"(buf + j + k + 80), "r"(buf + j + k + 96), "r"(buf + j + k + 112) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + for (int j = 0; j < 1024; j += 1024) { + for (int k = 0; k < 128; k += 2) { + __asm__ volatile ( + "movupd (%0), %%xmm0\n" + "movupd (%1), %%xmm1\n" + 
"movupd (%2), %%xmm2\n" + "movupd (%3), %%xmm3\n" + "movupd (%4), %%xmm4\n" + "movupd (%5), %%xmm5\n" + "movupd (%6), %%xmm6\n" + "movupd (%7), %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm9\n" + "addpd %%xmm1, %%xmm8\n" + "subpd %%xmm1, %%xmm9\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm11\n" + "addpd %%xmm3, %%xmm10\n" + "subpd %%xmm3, %%xmm11\n" + "movapd %%xmm4, %%xmm12\n" + "movapd %%xmm4, %%xmm13\n" + "addpd %%xmm5, %%xmm12\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm6, %%xmm14\n" + "movapd %%xmm6, %%xmm15\n" + "addpd %%xmm7, %%xmm14\n" + "subpd %%xmm7, %%xmm15\n" + "movapd %%xmm8, %%xmm0\n" + "movapd %%xmm8, %%xmm2\n" + "addpd %%xmm10, %%xmm0\n" + "subpd %%xmm10, %%xmm2\n" + "movapd %%xmm9, %%xmm1\n" + "movapd %%xmm9, %%xmm3\n" + "addpd %%xmm11, %%xmm1\n" + "subpd %%xmm11, %%xmm3\n" + "movapd %%xmm12, %%xmm4\n" + "movapd %%xmm12, %%xmm6\n" + "addpd %%xmm14, %%xmm4\n" + "subpd %%xmm14, %%xmm6\n" + "movapd %%xmm13, %%xmm5\n" + "movapd %%xmm13, %%xmm7\n" + "addpd %%xmm15, %%xmm5\n" + "subpd %%xmm15, %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm12\n" + "addpd %%xmm4, %%xmm8\n" + "subpd %%xmm4, %%xmm12\n" + "movapd %%xmm1, %%xmm9\n" + "movapd %%xmm1, %%xmm13\n" + "addpd %%xmm5, %%xmm9\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm14\n" + "addpd %%xmm6, %%xmm10\n" + "subpd %%xmm6, %%xmm14\n" + "movapd %%xmm3, %%xmm11\n" + "movapd %%xmm3, %%xmm15\n" + "addpd %%xmm7, %%xmm11\n" + "subpd %%xmm7, %%xmm15\n" + "movupd %%xmm8, (%0)\n" + "movupd %%xmm9, (%1)\n" + "movupd %%xmm10, (%2)\n" + "movupd %%xmm11, (%3)\n" + "movupd %%xmm12, (%4)\n" + "movupd %%xmm13, (%5)\n" + "movupd %%xmm14, (%6)\n" + "movupd %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 128), "r"(buf + j + k + 256), "r"(buf + j + k + 384), "r"(buf + j + k + 512), "r"(buf + j + k + 640), "r"(buf + j + k + 768), "r"(buf + j + k + 896) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } +} +void helper_double_10(double *buf); +void helper_double_10(double *buf) { + helper_double_10_recursive(buf, 10); +} +void helper_double_11_recursive(double *buf, int depth); +void helper_double_11_recursive(double *buf, int depth) { + if (depth == 2) { + for (int j = 0; j < 4; j += 4) { + for (int k = 0; k < 2; k += 2) { + __asm__ volatile ( + "movupd (%0), %%xmm0\n" + "movupd (%1), %%xmm1\n" + "movapd %%xmm0, %%xmm8\n" + "haddpd %%xmm8, %%xmm8\n" + "movapd %%xmm0, %%xmm9\n" + "hsubpd %%xmm9, %%xmm9\n" + "blendpd $1, %%xmm8, %%xmm9\n" + "movapd %%xmm9, %%xmm0\n" + "movapd %%xmm1, %%xmm8\n" + "haddpd %%xmm8, %%xmm8\n" + "movapd %%xmm1, %%xmm9\n" + "hsubpd %%xmm9, %%xmm9\n" + "blendpd $1, %%xmm8, %%xmm9\n" + "movapd %%xmm9, %%xmm1\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm9\n" + "addpd %%xmm1, %%xmm8\n" + "subpd %%xmm1, %%xmm9\n" + "movupd %%xmm8, (%0)\n" + "movupd %%xmm9, (%1)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 2) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } + if (depth == 5) { + helper_double_11_recursive(buf + 0, 2); + helper_double_11_recursive(buf + 4, 2); + helper_double_11_recursive(buf + 8, 2); + helper_double_11_recursive(buf + 12, 2); + helper_double_11_recursive(buf + 16, 2); + helper_double_11_recursive(buf + 20, 2); + 
helper_double_11_recursive(buf + 24, 2); + helper_double_11_recursive(buf + 28, 2); + for (int j = 0; j < 32; j += 32) { + for (int k = 0; k < 4; k += 2) { + __asm__ volatile ( + "movupd (%0), %%xmm0\n" + "movupd (%1), %%xmm1\n" + "movupd (%2), %%xmm2\n" + "movupd (%3), %%xmm3\n" + "movupd (%4), %%xmm4\n" + "movupd (%5), %%xmm5\n" + "movupd (%6), %%xmm6\n" + "movupd (%7), %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm9\n" + "addpd %%xmm1, %%xmm8\n" + "subpd %%xmm1, %%xmm9\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm11\n" + "addpd %%xmm3, %%xmm10\n" + "subpd %%xmm3, %%xmm11\n" + "movapd %%xmm4, %%xmm12\n" + "movapd %%xmm4, %%xmm13\n" + "addpd %%xmm5, %%xmm12\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm6, %%xmm14\n" + "movapd %%xmm6, %%xmm15\n" + "addpd %%xmm7, %%xmm14\n" + "subpd %%xmm7, %%xmm15\n" + "movapd %%xmm8, %%xmm0\n" + "movapd %%xmm8, %%xmm2\n" + "addpd %%xmm10, %%xmm0\n" + "subpd %%xmm10, %%xmm2\n" + "movapd %%xmm9, %%xmm1\n" + "movapd %%xmm9, %%xmm3\n" + "addpd %%xmm11, %%xmm1\n" + "subpd %%xmm11, %%xmm3\n" + "movapd %%xmm12, %%xmm4\n" + "movapd %%xmm12, %%xmm6\n" + "addpd %%xmm14, %%xmm4\n" + "subpd %%xmm14, %%xmm6\n" + "movapd %%xmm13, %%xmm5\n" + "movapd %%xmm13, %%xmm7\n" + "addpd %%xmm15, %%xmm5\n" + "subpd %%xmm15, %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm12\n" + "addpd %%xmm4, %%xmm8\n" + "subpd %%xmm4, %%xmm12\n" + "movapd %%xmm1, %%xmm9\n" + "movapd %%xmm1, %%xmm13\n" + "addpd %%xmm5, %%xmm9\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm14\n" + "addpd %%xmm6, %%xmm10\n" + "subpd %%xmm6, %%xmm14\n" + "movapd %%xmm3, %%xmm11\n" + "movapd %%xmm3, %%xmm15\n" + "addpd %%xmm7, %%xmm11\n" + "subpd %%xmm7, %%xmm15\n" + "movupd %%xmm8, (%0)\n" + "movupd %%xmm9, (%1)\n" + "movupd %%xmm10, (%2)\n" + "movupd %%xmm11, (%3)\n" + "movupd %%xmm12, (%4)\n" + "movupd %%xmm13, (%5)\n" + "movupd %%xmm14, (%6)\n" + "movupd %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } + if (depth == 8) { + helper_double_11_recursive(buf + 0, 5); + helper_double_11_recursive(buf + 32, 5); + helper_double_11_recursive(buf + 64, 5); + helper_double_11_recursive(buf + 96, 5); + helper_double_11_recursive(buf + 128, 5); + helper_double_11_recursive(buf + 160, 5); + helper_double_11_recursive(buf + 192, 5); + helper_double_11_recursive(buf + 224, 5); + for (int j = 0; j < 256; j += 256) { + for (int k = 0; k < 32; k += 2) { + __asm__ volatile ( + "movupd (%0), %%xmm0\n" + "movupd (%1), %%xmm1\n" + "movupd (%2), %%xmm2\n" + "movupd (%3), %%xmm3\n" + "movupd (%4), %%xmm4\n" + "movupd (%5), %%xmm5\n" + "movupd (%6), %%xmm6\n" + "movupd (%7), %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm9\n" + "addpd %%xmm1, %%xmm8\n" + "subpd %%xmm1, %%xmm9\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm11\n" + "addpd %%xmm3, %%xmm10\n" + "subpd %%xmm3, %%xmm11\n" + "movapd %%xmm4, %%xmm12\n" + "movapd %%xmm4, %%xmm13\n" + "addpd %%xmm5, %%xmm12\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm6, %%xmm14\n" + "movapd %%xmm6, %%xmm15\n" + "addpd %%xmm7, %%xmm14\n" + "subpd %%xmm7, %%xmm15\n" + "movapd %%xmm8, %%xmm0\n" + "movapd %%xmm8, %%xmm2\n" + "addpd %%xmm10, %%xmm0\n" + "subpd %%xmm10, 
%%xmm2\n" + "movapd %%xmm9, %%xmm1\n" + "movapd %%xmm9, %%xmm3\n" + "addpd %%xmm11, %%xmm1\n" + "subpd %%xmm11, %%xmm3\n" + "movapd %%xmm12, %%xmm4\n" + "movapd %%xmm12, %%xmm6\n" + "addpd %%xmm14, %%xmm4\n" + "subpd %%xmm14, %%xmm6\n" + "movapd %%xmm13, %%xmm5\n" + "movapd %%xmm13, %%xmm7\n" + "addpd %%xmm15, %%xmm5\n" + "subpd %%xmm15, %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm12\n" + "addpd %%xmm4, %%xmm8\n" + "subpd %%xmm4, %%xmm12\n" + "movapd %%xmm1, %%xmm9\n" + "movapd %%xmm1, %%xmm13\n" + "addpd %%xmm5, %%xmm9\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm14\n" + "addpd %%xmm6, %%xmm10\n" + "subpd %%xmm6, %%xmm14\n" + "movapd %%xmm3, %%xmm11\n" + "movapd %%xmm3, %%xmm15\n" + "addpd %%xmm7, %%xmm11\n" + "subpd %%xmm7, %%xmm15\n" + "movupd %%xmm8, (%0)\n" + "movupd %%xmm9, (%1)\n" + "movupd %%xmm10, (%2)\n" + "movupd %%xmm11, (%3)\n" + "movupd %%xmm12, (%4)\n" + "movupd %%xmm13, (%5)\n" + "movupd %%xmm14, (%6)\n" + "movupd %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96), "r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } + if (depth == 11) { + helper_double_11_recursive(buf + 0, 8); + helper_double_11_recursive(buf + 256, 8); + helper_double_11_recursive(buf + 512, 8); + helper_double_11_recursive(buf + 768, 8); + helper_double_11_recursive(buf + 1024, 8); + helper_double_11_recursive(buf + 1280, 8); + helper_double_11_recursive(buf + 1536, 8); + helper_double_11_recursive(buf + 1792, 8); + for (int j = 0; j < 2048; j += 2048) { + for (int k = 0; k < 256; k += 2) { + __asm__ volatile ( + "movupd (%0), %%xmm0\n" + "movupd (%1), %%xmm1\n" + "movupd (%2), %%xmm2\n" + "movupd (%3), %%xmm3\n" + "movupd (%4), %%xmm4\n" + "movupd (%5), %%xmm5\n" + "movupd (%6), %%xmm6\n" + "movupd (%7), %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm9\n" + "addpd %%xmm1, %%xmm8\n" + "subpd %%xmm1, %%xmm9\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm11\n" + "addpd %%xmm3, %%xmm10\n" + "subpd %%xmm3, %%xmm11\n" + "movapd %%xmm4, %%xmm12\n" + "movapd %%xmm4, %%xmm13\n" + "addpd %%xmm5, %%xmm12\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm6, %%xmm14\n" + "movapd %%xmm6, %%xmm15\n" + "addpd %%xmm7, %%xmm14\n" + "subpd %%xmm7, %%xmm15\n" + "movapd %%xmm8, %%xmm0\n" + "movapd %%xmm8, %%xmm2\n" + "addpd %%xmm10, %%xmm0\n" + "subpd %%xmm10, %%xmm2\n" + "movapd %%xmm9, %%xmm1\n" + "movapd %%xmm9, %%xmm3\n" + "addpd %%xmm11, %%xmm1\n" + "subpd %%xmm11, %%xmm3\n" + "movapd %%xmm12, %%xmm4\n" + "movapd %%xmm12, %%xmm6\n" + "addpd %%xmm14, %%xmm4\n" + "subpd %%xmm14, %%xmm6\n" + "movapd %%xmm13, %%xmm5\n" + "movapd %%xmm13, %%xmm7\n" + "addpd %%xmm15, %%xmm5\n" + "subpd %%xmm15, %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm12\n" + "addpd %%xmm4, %%xmm8\n" + "subpd %%xmm4, %%xmm12\n" + "movapd %%xmm1, %%xmm9\n" + "movapd %%xmm1, %%xmm13\n" + "addpd %%xmm5, %%xmm9\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm14\n" + "addpd %%xmm6, %%xmm10\n" + "subpd %%xmm6, %%xmm14\n" + "movapd %%xmm3, %%xmm11\n" + "movapd %%xmm3, %%xmm15\n" + "addpd %%xmm7, %%xmm11\n" + "subpd %%xmm7, %%xmm15\n" + "movupd %%xmm8, (%0)\n" + "movupd %%xmm9, (%1)\n" + "movupd %%xmm10, (%2)\n" + "movupd %%xmm11, (%3)\n" + "movupd 
%%xmm12, (%4)\n" + "movupd %%xmm13, (%5)\n" + "movupd %%xmm14, (%6)\n" + "movupd %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 256), "r"(buf + j + k + 512), "r"(buf + j + k + 768), "r"(buf + j + k + 1024), "r"(buf + j + k + 1280), "r"(buf + j + k + 1536), "r"(buf + j + k + 1792) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } +} +void helper_double_11(double *buf); +void helper_double_11(double *buf) { + helper_double_11_recursive(buf, 11); +} +void helper_double_12_recursive(double *buf, int depth); +void helper_double_12_recursive(double *buf, int depth) { + if (depth == 10) { + for (int j = 0; j < 1024; j += 16) { + for (int k = 0; k < 2; k += 2) { + __asm__ volatile ( + "movupd (%0), %%xmm0\n" + "movupd (%1), %%xmm1\n" + "movupd (%2), %%xmm2\n" + "movupd (%3), %%xmm3\n" + "movupd (%4), %%xmm4\n" + "movupd (%5), %%xmm5\n" + "movupd (%6), %%xmm6\n" + "movupd (%7), %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "haddpd %%xmm8, %%xmm8\n" + "movapd %%xmm0, %%xmm9\n" + "hsubpd %%xmm9, %%xmm9\n" + "blendpd $1, %%xmm8, %%xmm9\n" + "movapd %%xmm9, %%xmm0\n" + "movapd %%xmm1, %%xmm8\n" + "haddpd %%xmm8, %%xmm8\n" + "movapd %%xmm1, %%xmm9\n" + "hsubpd %%xmm9, %%xmm9\n" + "blendpd $1, %%xmm8, %%xmm9\n" + "movapd %%xmm9, %%xmm1\n" + "movapd %%xmm2, %%xmm8\n" + "haddpd %%xmm8, %%xmm8\n" + "movapd %%xmm2, %%xmm9\n" + "hsubpd %%xmm9, %%xmm9\n" + "blendpd $1, %%xmm8, %%xmm9\n" + "movapd %%xmm9, %%xmm2\n" + "movapd %%xmm3, %%xmm8\n" + "haddpd %%xmm8, %%xmm8\n" + "movapd %%xmm3, %%xmm9\n" + "hsubpd %%xmm9, %%xmm9\n" + "blendpd $1, %%xmm8, %%xmm9\n" + "movapd %%xmm9, %%xmm3\n" + "movapd %%xmm4, %%xmm8\n" + "haddpd %%xmm8, %%xmm8\n" + "movapd %%xmm4, %%xmm9\n" + "hsubpd %%xmm9, %%xmm9\n" + "blendpd $1, %%xmm8, %%xmm9\n" + "movapd %%xmm9, %%xmm4\n" + "movapd %%xmm5, %%xmm8\n" + "haddpd %%xmm8, %%xmm8\n" + "movapd %%xmm5, %%xmm9\n" + "hsubpd %%xmm9, %%xmm9\n" + "blendpd $1, %%xmm8, %%xmm9\n" + "movapd %%xmm9, %%xmm5\n" + "movapd %%xmm6, %%xmm8\n" + "haddpd %%xmm8, %%xmm8\n" + "movapd %%xmm6, %%xmm9\n" + "hsubpd %%xmm9, %%xmm9\n" + "blendpd $1, %%xmm8, %%xmm9\n" + "movapd %%xmm9, %%xmm6\n" + "movapd %%xmm7, %%xmm8\n" + "haddpd %%xmm8, %%xmm8\n" + "movapd %%xmm7, %%xmm9\n" + "hsubpd %%xmm9, %%xmm9\n" + "blendpd $1, %%xmm8, %%xmm9\n" + "movapd %%xmm9, %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm9\n" + "addpd %%xmm1, %%xmm8\n" + "subpd %%xmm1, %%xmm9\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm11\n" + "addpd %%xmm3, %%xmm10\n" + "subpd %%xmm3, %%xmm11\n" + "movapd %%xmm4, %%xmm12\n" + "movapd %%xmm4, %%xmm13\n" + "addpd %%xmm5, %%xmm12\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm6, %%xmm14\n" + "movapd %%xmm6, %%xmm15\n" + "addpd %%xmm7, %%xmm14\n" + "subpd %%xmm7, %%xmm15\n" + "movapd %%xmm8, %%xmm0\n" + "movapd %%xmm8, %%xmm2\n" + "addpd %%xmm10, %%xmm0\n" + "subpd %%xmm10, %%xmm2\n" + "movapd %%xmm9, %%xmm1\n" + "movapd %%xmm9, %%xmm3\n" + "addpd %%xmm11, %%xmm1\n" + "subpd %%xmm11, %%xmm3\n" + "movapd %%xmm12, %%xmm4\n" + "movapd %%xmm12, %%xmm6\n" + "addpd %%xmm14, %%xmm4\n" + "subpd %%xmm14, %%xmm6\n" + "movapd %%xmm13, %%xmm5\n" + "movapd %%xmm13, %%xmm7\n" + "addpd %%xmm15, %%xmm5\n" + "subpd %%xmm15, %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm12\n" + "addpd %%xmm4, %%xmm8\n" + "subpd %%xmm4, %%xmm12\n" + "movapd %%xmm1, %%xmm9\n" + "movapd %%xmm1, %%xmm13\n" + "addpd %%xmm5, %%xmm9\n" + "subpd %%xmm5, 
%%xmm13\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm14\n" + "addpd %%xmm6, %%xmm10\n" + "subpd %%xmm6, %%xmm14\n" + "movapd %%xmm3, %%xmm11\n" + "movapd %%xmm3, %%xmm15\n" + "addpd %%xmm7, %%xmm11\n" + "subpd %%xmm7, %%xmm15\n" + "movupd %%xmm8, (%0)\n" + "movupd %%xmm9, (%1)\n" + "movupd %%xmm10, (%2)\n" + "movupd %%xmm11, (%3)\n" + "movupd %%xmm12, (%4)\n" + "movupd %%xmm13, (%5)\n" + "movupd %%xmm14, (%6)\n" + "movupd %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 2), "r"(buf + j + k + 4), "r"(buf + j + k + 6), "r"(buf + j + k + 8), "r"(buf + j + k + 10), "r"(buf + j + k + 12), "r"(buf + j + k + 14) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + for (int j = 0; j < 1024; j += 128) { + for (int k = 0; k < 16; k += 2) { + __asm__ volatile ( + "movupd (%0), %%xmm0\n" + "movupd (%1), %%xmm1\n" + "movupd (%2), %%xmm2\n" + "movupd (%3), %%xmm3\n" + "movupd (%4), %%xmm4\n" + "movupd (%5), %%xmm5\n" + "movupd (%6), %%xmm6\n" + "movupd (%7), %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm9\n" + "addpd %%xmm1, %%xmm8\n" + "subpd %%xmm1, %%xmm9\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm11\n" + "addpd %%xmm3, %%xmm10\n" + "subpd %%xmm3, %%xmm11\n" + "movapd %%xmm4, %%xmm12\n" + "movapd %%xmm4, %%xmm13\n" + "addpd %%xmm5, %%xmm12\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm6, %%xmm14\n" + "movapd %%xmm6, %%xmm15\n" + "addpd %%xmm7, %%xmm14\n" + "subpd %%xmm7, %%xmm15\n" + "movapd %%xmm8, %%xmm0\n" + "movapd %%xmm8, %%xmm2\n" + "addpd %%xmm10, %%xmm0\n" + "subpd %%xmm10, %%xmm2\n" + "movapd %%xmm9, %%xmm1\n" + "movapd %%xmm9, %%xmm3\n" + "addpd %%xmm11, %%xmm1\n" + "subpd %%xmm11, %%xmm3\n" + "movapd %%xmm12, %%xmm4\n" + "movapd %%xmm12, %%xmm6\n" + "addpd %%xmm14, %%xmm4\n" + "subpd %%xmm14, %%xmm6\n" + "movapd %%xmm13, %%xmm5\n" + "movapd %%xmm13, %%xmm7\n" + "addpd %%xmm15, %%xmm5\n" + "subpd %%xmm15, %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm12\n" + "addpd %%xmm4, %%xmm8\n" + "subpd %%xmm4, %%xmm12\n" + "movapd %%xmm1, %%xmm9\n" + "movapd %%xmm1, %%xmm13\n" + "addpd %%xmm5, %%xmm9\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm14\n" + "addpd %%xmm6, %%xmm10\n" + "subpd %%xmm6, %%xmm14\n" + "movapd %%xmm3, %%xmm11\n" + "movapd %%xmm3, %%xmm15\n" + "addpd %%xmm7, %%xmm11\n" + "subpd %%xmm7, %%xmm15\n" + "movupd %%xmm8, (%0)\n" + "movupd %%xmm9, (%1)\n" + "movupd %%xmm10, (%2)\n" + "movupd %%xmm11, (%3)\n" + "movupd %%xmm12, (%4)\n" + "movupd %%xmm13, (%5)\n" + "movupd %%xmm14, (%6)\n" + "movupd %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 16), "r"(buf + j + k + 32), "r"(buf + j + k + 48), "r"(buf + j + k + 64), "r"(buf + j + k + 80), "r"(buf + j + k + 96), "r"(buf + j + k + 112) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + for (int j = 0; j < 1024; j += 1024) { + for (int k = 0; k < 128; k += 2) { + __asm__ volatile ( + "movupd (%0), %%xmm0\n" + "movupd (%1), %%xmm1\n" + "movupd (%2), %%xmm2\n" + "movupd (%3), %%xmm3\n" + "movupd (%4), %%xmm4\n" + "movupd (%5), %%xmm5\n" + "movupd (%6), %%xmm6\n" + "movupd (%7), %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm9\n" + "addpd %%xmm1, %%xmm8\n" + "subpd %%xmm1, %%xmm9\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm11\n" + "addpd %%xmm3, %%xmm10\n" + 
"subpd %%xmm3, %%xmm11\n" + "movapd %%xmm4, %%xmm12\n" + "movapd %%xmm4, %%xmm13\n" + "addpd %%xmm5, %%xmm12\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm6, %%xmm14\n" + "movapd %%xmm6, %%xmm15\n" + "addpd %%xmm7, %%xmm14\n" + "subpd %%xmm7, %%xmm15\n" + "movapd %%xmm8, %%xmm0\n" + "movapd %%xmm8, %%xmm2\n" + "addpd %%xmm10, %%xmm0\n" + "subpd %%xmm10, %%xmm2\n" + "movapd %%xmm9, %%xmm1\n" + "movapd %%xmm9, %%xmm3\n" + "addpd %%xmm11, %%xmm1\n" + "subpd %%xmm11, %%xmm3\n" + "movapd %%xmm12, %%xmm4\n" + "movapd %%xmm12, %%xmm6\n" + "addpd %%xmm14, %%xmm4\n" + "subpd %%xmm14, %%xmm6\n" + "movapd %%xmm13, %%xmm5\n" + "movapd %%xmm13, %%xmm7\n" + "addpd %%xmm15, %%xmm5\n" + "subpd %%xmm15, %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm12\n" + "addpd %%xmm4, %%xmm8\n" + "subpd %%xmm4, %%xmm12\n" + "movapd %%xmm1, %%xmm9\n" + "movapd %%xmm1, %%xmm13\n" + "addpd %%xmm5, %%xmm9\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm14\n" + "addpd %%xmm6, %%xmm10\n" + "subpd %%xmm6, %%xmm14\n" + "movapd %%xmm3, %%xmm11\n" + "movapd %%xmm3, %%xmm15\n" + "addpd %%xmm7, %%xmm11\n" + "subpd %%xmm7, %%xmm15\n" + "movupd %%xmm8, (%0)\n" + "movupd %%xmm9, (%1)\n" + "movupd %%xmm10, (%2)\n" + "movupd %%xmm11, (%3)\n" + "movupd %%xmm12, (%4)\n" + "movupd %%xmm13, (%5)\n" + "movupd %%xmm14, (%6)\n" + "movupd %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 128), "r"(buf + j + k + 256), "r"(buf + j + k + 384), "r"(buf + j + k + 512), "r"(buf + j + k + 640), "r"(buf + j + k + 768), "r"(buf + j + k + 896) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } + if (depth == 12) { + helper_double_12_recursive(buf + 0, 10); + helper_double_12_recursive(buf + 1024, 10); + helper_double_12_recursive(buf + 2048, 10); + helper_double_12_recursive(buf + 3072, 10); + for (int j = 0; j < 4096; j += 4096) { + for (int k = 0; k < 1024; k += 2) { + __asm__ volatile ( + "movupd (%0), %%xmm0\n" + "movupd (%1), %%xmm1\n" + "movupd (%2), %%xmm2\n" + "movupd (%3), %%xmm3\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm9\n" + "addpd %%xmm1, %%xmm8\n" + "subpd %%xmm1, %%xmm9\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm11\n" + "addpd %%xmm3, %%xmm10\n" + "subpd %%xmm3, %%xmm11\n" + "movapd %%xmm8, %%xmm0\n" + "movapd %%xmm8, %%xmm2\n" + "addpd %%xmm10, %%xmm0\n" + "subpd %%xmm10, %%xmm2\n" + "movapd %%xmm9, %%xmm1\n" + "movapd %%xmm9, %%xmm3\n" + "addpd %%xmm11, %%xmm1\n" + "subpd %%xmm11, %%xmm3\n" + "movupd %%xmm0, (%0)\n" + "movupd %%xmm1, (%1)\n" + "movupd %%xmm2, (%2)\n" + "movupd %%xmm3, (%3)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 1024), "r"(buf + j + k + 2048), "r"(buf + j + k + 3072) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } +} +void helper_double_12(double *buf); +void helper_double_12(double *buf) { + helper_double_12_recursive(buf, 12); +} +static inline void helper_double_13(double *buf); +static inline void helper_double_13(double *buf) { + for (int j = 0; j < 8192; j += 16) { + for (int k = 0; k < 2; k += 2) { + __asm__ volatile ( + "movupd (%0), %%xmm0\n" + "movupd (%1), %%xmm1\n" + "movupd (%2), %%xmm2\n" + "movupd (%3), %%xmm3\n" + "movupd (%4), %%xmm4\n" + "movupd (%5), %%xmm5\n" + "movupd (%6), %%xmm6\n" + "movupd (%7), %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" 
+ "haddpd %%xmm8, %%xmm8\n" + "movapd %%xmm0, %%xmm9\n" + "hsubpd %%xmm9, %%xmm9\n" + "blendpd $1, %%xmm8, %%xmm9\n" + "movapd %%xmm9, %%xmm0\n" + "movapd %%xmm1, %%xmm8\n" + "haddpd %%xmm8, %%xmm8\n" + "movapd %%xmm1, %%xmm9\n" + "hsubpd %%xmm9, %%xmm9\n" + "blendpd $1, %%xmm8, %%xmm9\n" + "movapd %%xmm9, %%xmm1\n" + "movapd %%xmm2, %%xmm8\n" + "haddpd %%xmm8, %%xmm8\n" + "movapd %%xmm2, %%xmm9\n" + "hsubpd %%xmm9, %%xmm9\n" + "blendpd $1, %%xmm8, %%xmm9\n" + "movapd %%xmm9, %%xmm2\n" + "movapd %%xmm3, %%xmm8\n" + "haddpd %%xmm8, %%xmm8\n" + "movapd %%xmm3, %%xmm9\n" + "hsubpd %%xmm9, %%xmm9\n" + "blendpd $1, %%xmm8, %%xmm9\n" + "movapd %%xmm9, %%xmm3\n" + "movapd %%xmm4, %%xmm8\n" + "haddpd %%xmm8, %%xmm8\n" + "movapd %%xmm4, %%xmm9\n" + "hsubpd %%xmm9, %%xmm9\n" + "blendpd $1, %%xmm8, %%xmm9\n" + "movapd %%xmm9, %%xmm4\n" + "movapd %%xmm5, %%xmm8\n" + "haddpd %%xmm8, %%xmm8\n" + "movapd %%xmm5, %%xmm9\n" + "hsubpd %%xmm9, %%xmm9\n" + "blendpd $1, %%xmm8, %%xmm9\n" + "movapd %%xmm9, %%xmm5\n" + "movapd %%xmm6, %%xmm8\n" + "haddpd %%xmm8, %%xmm8\n" + "movapd %%xmm6, %%xmm9\n" + "hsubpd %%xmm9, %%xmm9\n" + "blendpd $1, %%xmm8, %%xmm9\n" + "movapd %%xmm9, %%xmm6\n" + "movapd %%xmm7, %%xmm8\n" + "haddpd %%xmm8, %%xmm8\n" + "movapd %%xmm7, %%xmm9\n" + "hsubpd %%xmm9, %%xmm9\n" + "blendpd $1, %%xmm8, %%xmm9\n" + "movapd %%xmm9, %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm9\n" + "addpd %%xmm1, %%xmm8\n" + "subpd %%xmm1, %%xmm9\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm11\n" + "addpd %%xmm3, %%xmm10\n" + "subpd %%xmm3, %%xmm11\n" + "movapd %%xmm4, %%xmm12\n" + "movapd %%xmm4, %%xmm13\n" + "addpd %%xmm5, %%xmm12\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm6, %%xmm14\n" + "movapd %%xmm6, %%xmm15\n" + "addpd %%xmm7, %%xmm14\n" + "subpd %%xmm7, %%xmm15\n" + "movapd %%xmm8, %%xmm0\n" + "movapd %%xmm8, %%xmm2\n" + "addpd %%xmm10, %%xmm0\n" + "subpd %%xmm10, %%xmm2\n" + "movapd %%xmm9, %%xmm1\n" + "movapd %%xmm9, %%xmm3\n" + "addpd %%xmm11, %%xmm1\n" + "subpd %%xmm11, %%xmm3\n" + "movapd %%xmm12, %%xmm4\n" + "movapd %%xmm12, %%xmm6\n" + "addpd %%xmm14, %%xmm4\n" + "subpd %%xmm14, %%xmm6\n" + "movapd %%xmm13, %%xmm5\n" + "movapd %%xmm13, %%xmm7\n" + "addpd %%xmm15, %%xmm5\n" + "subpd %%xmm15, %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm12\n" + "addpd %%xmm4, %%xmm8\n" + "subpd %%xmm4, %%xmm12\n" + "movapd %%xmm1, %%xmm9\n" + "movapd %%xmm1, %%xmm13\n" + "addpd %%xmm5, %%xmm9\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm14\n" + "addpd %%xmm6, %%xmm10\n" + "subpd %%xmm6, %%xmm14\n" + "movapd %%xmm3, %%xmm11\n" + "movapd %%xmm3, %%xmm15\n" + "addpd %%xmm7, %%xmm11\n" + "subpd %%xmm7, %%xmm15\n" + "movupd %%xmm8, (%0)\n" + "movupd %%xmm9, (%1)\n" + "movupd %%xmm10, (%2)\n" + "movupd %%xmm11, (%3)\n" + "movupd %%xmm12, (%4)\n" + "movupd %%xmm13, (%5)\n" + "movupd %%xmm14, (%6)\n" + "movupd %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 2), "r"(buf + j + k + 4), "r"(buf + j + k + 6), "r"(buf + j + k + 8), "r"(buf + j + k + 10), "r"(buf + j + k + 12), "r"(buf + j + k + 14) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + for (int j = 0; j < 8192; j += 128) { + for (int k = 0; k < 16; k += 2) { + __asm__ volatile ( + "movupd (%0), %%xmm0\n" + "movupd (%1), %%xmm1\n" + "movupd (%2), %%xmm2\n" + "movupd (%3), %%xmm3\n" + "movupd (%4), %%xmm4\n" + "movupd (%5), %%xmm5\n" + "movupd (%6), 
%%xmm6\n" + "movupd (%7), %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm9\n" + "addpd %%xmm1, %%xmm8\n" + "subpd %%xmm1, %%xmm9\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm11\n" + "addpd %%xmm3, %%xmm10\n" + "subpd %%xmm3, %%xmm11\n" + "movapd %%xmm4, %%xmm12\n" + "movapd %%xmm4, %%xmm13\n" + "addpd %%xmm5, %%xmm12\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm6, %%xmm14\n" + "movapd %%xmm6, %%xmm15\n" + "addpd %%xmm7, %%xmm14\n" + "subpd %%xmm7, %%xmm15\n" + "movapd %%xmm8, %%xmm0\n" + "movapd %%xmm8, %%xmm2\n" + "addpd %%xmm10, %%xmm0\n" + "subpd %%xmm10, %%xmm2\n" + "movapd %%xmm9, %%xmm1\n" + "movapd %%xmm9, %%xmm3\n" + "addpd %%xmm11, %%xmm1\n" + "subpd %%xmm11, %%xmm3\n" + "movapd %%xmm12, %%xmm4\n" + "movapd %%xmm12, %%xmm6\n" + "addpd %%xmm14, %%xmm4\n" + "subpd %%xmm14, %%xmm6\n" + "movapd %%xmm13, %%xmm5\n" + "movapd %%xmm13, %%xmm7\n" + "addpd %%xmm15, %%xmm5\n" + "subpd %%xmm15, %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm12\n" + "addpd %%xmm4, %%xmm8\n" + "subpd %%xmm4, %%xmm12\n" + "movapd %%xmm1, %%xmm9\n" + "movapd %%xmm1, %%xmm13\n" + "addpd %%xmm5, %%xmm9\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm14\n" + "addpd %%xmm6, %%xmm10\n" + "subpd %%xmm6, %%xmm14\n" + "movapd %%xmm3, %%xmm11\n" + "movapd %%xmm3, %%xmm15\n" + "addpd %%xmm7, %%xmm11\n" + "subpd %%xmm7, %%xmm15\n" + "movupd %%xmm8, (%0)\n" + "movupd %%xmm9, (%1)\n" + "movupd %%xmm10, (%2)\n" + "movupd %%xmm11, (%3)\n" + "movupd %%xmm12, (%4)\n" + "movupd %%xmm13, (%5)\n" + "movupd %%xmm14, (%6)\n" + "movupd %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 16), "r"(buf + j + k + 32), "r"(buf + j + k + 48), "r"(buf + j + k + 64), "r"(buf + j + k + 80), "r"(buf + j + k + 96), "r"(buf + j + k + 112) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + for (int j = 0; j < 8192; j += 1024) { + for (int k = 0; k < 128; k += 2) { + __asm__ volatile ( + "movupd (%0), %%xmm0\n" + "movupd (%1), %%xmm1\n" + "movupd (%2), %%xmm2\n" + "movupd (%3), %%xmm3\n" + "movupd (%4), %%xmm4\n" + "movupd (%5), %%xmm5\n" + "movupd (%6), %%xmm6\n" + "movupd (%7), %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm9\n" + "addpd %%xmm1, %%xmm8\n" + "subpd %%xmm1, %%xmm9\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm11\n" + "addpd %%xmm3, %%xmm10\n" + "subpd %%xmm3, %%xmm11\n" + "movapd %%xmm4, %%xmm12\n" + "movapd %%xmm4, %%xmm13\n" + "addpd %%xmm5, %%xmm12\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm6, %%xmm14\n" + "movapd %%xmm6, %%xmm15\n" + "addpd %%xmm7, %%xmm14\n" + "subpd %%xmm7, %%xmm15\n" + "movapd %%xmm8, %%xmm0\n" + "movapd %%xmm8, %%xmm2\n" + "addpd %%xmm10, %%xmm0\n" + "subpd %%xmm10, %%xmm2\n" + "movapd %%xmm9, %%xmm1\n" + "movapd %%xmm9, %%xmm3\n" + "addpd %%xmm11, %%xmm1\n" + "subpd %%xmm11, %%xmm3\n" + "movapd %%xmm12, %%xmm4\n" + "movapd %%xmm12, %%xmm6\n" + "addpd %%xmm14, %%xmm4\n" + "subpd %%xmm14, %%xmm6\n" + "movapd %%xmm13, %%xmm5\n" + "movapd %%xmm13, %%xmm7\n" + "addpd %%xmm15, %%xmm5\n" + "subpd %%xmm15, %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm12\n" + "addpd %%xmm4, %%xmm8\n" + "subpd %%xmm4, %%xmm12\n" + "movapd %%xmm1, %%xmm9\n" + "movapd %%xmm1, %%xmm13\n" + "addpd %%xmm5, %%xmm9\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm14\n" + "addpd %%xmm6, %%xmm10\n" + "subpd %%xmm6, %%xmm14\n" + "movapd %%xmm3, %%xmm11\n" + 
"movapd %%xmm3, %%xmm15\n" + "addpd %%xmm7, %%xmm11\n" + "subpd %%xmm7, %%xmm15\n" + "movupd %%xmm8, (%0)\n" + "movupd %%xmm9, (%1)\n" + "movupd %%xmm10, (%2)\n" + "movupd %%xmm11, (%3)\n" + "movupd %%xmm12, (%4)\n" + "movupd %%xmm13, (%5)\n" + "movupd %%xmm14, (%6)\n" + "movupd %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 128), "r"(buf + j + k + 256), "r"(buf + j + k + 384), "r"(buf + j + k + 512), "r"(buf + j + k + 640), "r"(buf + j + k + 768), "r"(buf + j + k + 896) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + for (int j = 0; j < 8192; j += 8192) { + for (int k = 0; k < 1024; k += 2) { + __asm__ volatile ( + "movupd (%0), %%xmm0\n" + "movupd (%1), %%xmm1\n" + "movupd (%2), %%xmm2\n" + "movupd (%3), %%xmm3\n" + "movupd (%4), %%xmm4\n" + "movupd (%5), %%xmm5\n" + "movupd (%6), %%xmm6\n" + "movupd (%7), %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm9\n" + "addpd %%xmm1, %%xmm8\n" + "subpd %%xmm1, %%xmm9\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm11\n" + "addpd %%xmm3, %%xmm10\n" + "subpd %%xmm3, %%xmm11\n" + "movapd %%xmm4, %%xmm12\n" + "movapd %%xmm4, %%xmm13\n" + "addpd %%xmm5, %%xmm12\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm6, %%xmm14\n" + "movapd %%xmm6, %%xmm15\n" + "addpd %%xmm7, %%xmm14\n" + "subpd %%xmm7, %%xmm15\n" + "movapd %%xmm8, %%xmm0\n" + "movapd %%xmm8, %%xmm2\n" + "addpd %%xmm10, %%xmm0\n" + "subpd %%xmm10, %%xmm2\n" + "movapd %%xmm9, %%xmm1\n" + "movapd %%xmm9, %%xmm3\n" + "addpd %%xmm11, %%xmm1\n" + "subpd %%xmm11, %%xmm3\n" + "movapd %%xmm12, %%xmm4\n" + "movapd %%xmm12, %%xmm6\n" + "addpd %%xmm14, %%xmm4\n" + "subpd %%xmm14, %%xmm6\n" + "movapd %%xmm13, %%xmm5\n" + "movapd %%xmm13, %%xmm7\n" + "addpd %%xmm15, %%xmm5\n" + "subpd %%xmm15, %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm12\n" + "addpd %%xmm4, %%xmm8\n" + "subpd %%xmm4, %%xmm12\n" + "movapd %%xmm1, %%xmm9\n" + "movapd %%xmm1, %%xmm13\n" + "addpd %%xmm5, %%xmm9\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm14\n" + "addpd %%xmm6, %%xmm10\n" + "subpd %%xmm6, %%xmm14\n" + "movapd %%xmm3, %%xmm11\n" + "movapd %%xmm3, %%xmm15\n" + "addpd %%xmm7, %%xmm11\n" + "subpd %%xmm7, %%xmm15\n" + "movupd %%xmm8, (%0)\n" + "movupd %%xmm9, (%1)\n" + "movupd %%xmm10, (%2)\n" + "movupd %%xmm11, (%3)\n" + "movupd %%xmm12, (%4)\n" + "movupd %%xmm13, (%5)\n" + "movupd %%xmm14, (%6)\n" + "movupd %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 1024), "r"(buf + j + k + 2048), "r"(buf + j + k + 3072), "r"(buf + j + k + 4096), "r"(buf + j + k + 5120), "r"(buf + j + k + 6144), "r"(buf + j + k + 7168) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } +} +void helper_double_14_recursive(double *buf, int depth); +void helper_double_14_recursive(double *buf, int depth) { + if (depth == 9) { + for (int j = 0; j < 512; j += 16) { + for (int k = 0; k < 2; k += 2) { + __asm__ volatile ( + "movupd (%0), %%xmm0\n" + "movupd (%1), %%xmm1\n" + "movupd (%2), %%xmm2\n" + "movupd (%3), %%xmm3\n" + "movupd (%4), %%xmm4\n" + "movupd (%5), %%xmm5\n" + "movupd (%6), %%xmm6\n" + "movupd (%7), %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "haddpd %%xmm8, %%xmm8\n" + "movapd %%xmm0, %%xmm9\n" + "hsubpd %%xmm9, %%xmm9\n" + "blendpd $1, %%xmm8, %%xmm9\n" + "movapd %%xmm9, %%xmm0\n" + "movapd %%xmm1, 
%%xmm8\n" + "haddpd %%xmm8, %%xmm8\n" + "movapd %%xmm1, %%xmm9\n" + "hsubpd %%xmm9, %%xmm9\n" + "blendpd $1, %%xmm8, %%xmm9\n" + "movapd %%xmm9, %%xmm1\n" + "movapd %%xmm2, %%xmm8\n" + "haddpd %%xmm8, %%xmm8\n" + "movapd %%xmm2, %%xmm9\n" + "hsubpd %%xmm9, %%xmm9\n" + "blendpd $1, %%xmm8, %%xmm9\n" + "movapd %%xmm9, %%xmm2\n" + "movapd %%xmm3, %%xmm8\n" + "haddpd %%xmm8, %%xmm8\n" + "movapd %%xmm3, %%xmm9\n" + "hsubpd %%xmm9, %%xmm9\n" + "blendpd $1, %%xmm8, %%xmm9\n" + "movapd %%xmm9, %%xmm3\n" + "movapd %%xmm4, %%xmm8\n" + "haddpd %%xmm8, %%xmm8\n" + "movapd %%xmm4, %%xmm9\n" + "hsubpd %%xmm9, %%xmm9\n" + "blendpd $1, %%xmm8, %%xmm9\n" + "movapd %%xmm9, %%xmm4\n" + "movapd %%xmm5, %%xmm8\n" + "haddpd %%xmm8, %%xmm8\n" + "movapd %%xmm5, %%xmm9\n" + "hsubpd %%xmm9, %%xmm9\n" + "blendpd $1, %%xmm8, %%xmm9\n" + "movapd %%xmm9, %%xmm5\n" + "movapd %%xmm6, %%xmm8\n" + "haddpd %%xmm8, %%xmm8\n" + "movapd %%xmm6, %%xmm9\n" + "hsubpd %%xmm9, %%xmm9\n" + "blendpd $1, %%xmm8, %%xmm9\n" + "movapd %%xmm9, %%xmm6\n" + "movapd %%xmm7, %%xmm8\n" + "haddpd %%xmm8, %%xmm8\n" + "movapd %%xmm7, %%xmm9\n" + "hsubpd %%xmm9, %%xmm9\n" + "blendpd $1, %%xmm8, %%xmm9\n" + "movapd %%xmm9, %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm9\n" + "addpd %%xmm1, %%xmm8\n" + "subpd %%xmm1, %%xmm9\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm11\n" + "addpd %%xmm3, %%xmm10\n" + "subpd %%xmm3, %%xmm11\n" + "movapd %%xmm4, %%xmm12\n" + "movapd %%xmm4, %%xmm13\n" + "addpd %%xmm5, %%xmm12\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm6, %%xmm14\n" + "movapd %%xmm6, %%xmm15\n" + "addpd %%xmm7, %%xmm14\n" + "subpd %%xmm7, %%xmm15\n" + "movapd %%xmm8, %%xmm0\n" + "movapd %%xmm8, %%xmm2\n" + "addpd %%xmm10, %%xmm0\n" + "subpd %%xmm10, %%xmm2\n" + "movapd %%xmm9, %%xmm1\n" + "movapd %%xmm9, %%xmm3\n" + "addpd %%xmm11, %%xmm1\n" + "subpd %%xmm11, %%xmm3\n" + "movapd %%xmm12, %%xmm4\n" + "movapd %%xmm12, %%xmm6\n" + "addpd %%xmm14, %%xmm4\n" + "subpd %%xmm14, %%xmm6\n" + "movapd %%xmm13, %%xmm5\n" + "movapd %%xmm13, %%xmm7\n" + "addpd %%xmm15, %%xmm5\n" + "subpd %%xmm15, %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm12\n" + "addpd %%xmm4, %%xmm8\n" + "subpd %%xmm4, %%xmm12\n" + "movapd %%xmm1, %%xmm9\n" + "movapd %%xmm1, %%xmm13\n" + "addpd %%xmm5, %%xmm9\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm14\n" + "addpd %%xmm6, %%xmm10\n" + "subpd %%xmm6, %%xmm14\n" + "movapd %%xmm3, %%xmm11\n" + "movapd %%xmm3, %%xmm15\n" + "addpd %%xmm7, %%xmm11\n" + "subpd %%xmm7, %%xmm15\n" + "movupd %%xmm8, (%0)\n" + "movupd %%xmm9, (%1)\n" + "movupd %%xmm10, (%2)\n" + "movupd %%xmm11, (%3)\n" + "movupd %%xmm12, (%4)\n" + "movupd %%xmm13, (%5)\n" + "movupd %%xmm14, (%6)\n" + "movupd %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 2), "r"(buf + j + k + 4), "r"(buf + j + k + 6), "r"(buf + j + k + 8), "r"(buf + j + k + 10), "r"(buf + j + k + 12), "r"(buf + j + k + 14) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + for (int j = 0; j < 512; j += 128) { + for (int k = 0; k < 16; k += 2) { + __asm__ volatile ( + "movupd (%0), %%xmm0\n" + "movupd (%1), %%xmm1\n" + "movupd (%2), %%xmm2\n" + "movupd (%3), %%xmm3\n" + "movupd (%4), %%xmm4\n" + "movupd (%5), %%xmm5\n" + "movupd (%6), %%xmm6\n" + "movupd (%7), %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm9\n" + "addpd %%xmm1, %%xmm8\n" + "subpd %%xmm1, %%xmm9\n" + "movapd %%xmm2, 
%%xmm10\n" + "movapd %%xmm2, %%xmm11\n" + "addpd %%xmm3, %%xmm10\n" + "subpd %%xmm3, %%xmm11\n" + "movapd %%xmm4, %%xmm12\n" + "movapd %%xmm4, %%xmm13\n" + "addpd %%xmm5, %%xmm12\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm6, %%xmm14\n" + "movapd %%xmm6, %%xmm15\n" + "addpd %%xmm7, %%xmm14\n" + "subpd %%xmm7, %%xmm15\n" + "movapd %%xmm8, %%xmm0\n" + "movapd %%xmm8, %%xmm2\n" + "addpd %%xmm10, %%xmm0\n" + "subpd %%xmm10, %%xmm2\n" + "movapd %%xmm9, %%xmm1\n" + "movapd %%xmm9, %%xmm3\n" + "addpd %%xmm11, %%xmm1\n" + "subpd %%xmm11, %%xmm3\n" + "movapd %%xmm12, %%xmm4\n" + "movapd %%xmm12, %%xmm6\n" + "addpd %%xmm14, %%xmm4\n" + "subpd %%xmm14, %%xmm6\n" + "movapd %%xmm13, %%xmm5\n" + "movapd %%xmm13, %%xmm7\n" + "addpd %%xmm15, %%xmm5\n" + "subpd %%xmm15, %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm12\n" + "addpd %%xmm4, %%xmm8\n" + "subpd %%xmm4, %%xmm12\n" + "movapd %%xmm1, %%xmm9\n" + "movapd %%xmm1, %%xmm13\n" + "addpd %%xmm5, %%xmm9\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm14\n" + "addpd %%xmm6, %%xmm10\n" + "subpd %%xmm6, %%xmm14\n" + "movapd %%xmm3, %%xmm11\n" + "movapd %%xmm3, %%xmm15\n" + "addpd %%xmm7, %%xmm11\n" + "subpd %%xmm7, %%xmm15\n" + "movupd %%xmm8, (%0)\n" + "movupd %%xmm9, (%1)\n" + "movupd %%xmm10, (%2)\n" + "movupd %%xmm11, (%3)\n" + "movupd %%xmm12, (%4)\n" + "movupd %%xmm13, (%5)\n" + "movupd %%xmm14, (%6)\n" + "movupd %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 16), "r"(buf + j + k + 32), "r"(buf + j + k + 48), "r"(buf + j + k + 64), "r"(buf + j + k + 80), "r"(buf + j + k + 96), "r"(buf + j + k + 112) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + for (int j = 0; j < 512; j += 512) { + for (int k = 0; k < 128; k += 2) { + __asm__ volatile ( + "movupd (%0), %%xmm0\n" + "movupd (%1), %%xmm1\n" + "movupd (%2), %%xmm2\n" + "movupd (%3), %%xmm3\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm9\n" + "addpd %%xmm1, %%xmm8\n" + "subpd %%xmm1, %%xmm9\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm11\n" + "addpd %%xmm3, %%xmm10\n" + "subpd %%xmm3, %%xmm11\n" + "movapd %%xmm8, %%xmm0\n" + "movapd %%xmm8, %%xmm2\n" + "addpd %%xmm10, %%xmm0\n" + "subpd %%xmm10, %%xmm2\n" + "movapd %%xmm9, %%xmm1\n" + "movapd %%xmm9, %%xmm3\n" + "addpd %%xmm11, %%xmm1\n" + "subpd %%xmm11, %%xmm3\n" + "movupd %%xmm0, (%0)\n" + "movupd %%xmm1, (%1)\n" + "movupd %%xmm2, (%2)\n" + "movupd %%xmm3, (%3)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 128), "r"(buf + j + k + 256), "r"(buf + j + k + 384) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } + if (depth == 12) { + helper_double_14_recursive(buf + 0, 9); + helper_double_14_recursive(buf + 512, 9); + helper_double_14_recursive(buf + 1024, 9); + helper_double_14_recursive(buf + 1536, 9); + helper_double_14_recursive(buf + 2048, 9); + helper_double_14_recursive(buf + 2560, 9); + helper_double_14_recursive(buf + 3072, 9); + helper_double_14_recursive(buf + 3584, 9); + for (int j = 0; j < 4096; j += 4096) { + for (int k = 0; k < 512; k += 2) { + __asm__ volatile ( + "movupd (%0), %%xmm0\n" + "movupd (%1), %%xmm1\n" + "movupd (%2), %%xmm2\n" + "movupd (%3), %%xmm3\n" + "movupd (%4), %%xmm4\n" + "movupd (%5), %%xmm5\n" + "movupd (%6), %%xmm6\n" + "movupd (%7), %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" 
+ "movapd %%xmm0, %%xmm9\n" + "addpd %%xmm1, %%xmm8\n" + "subpd %%xmm1, %%xmm9\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm11\n" + "addpd %%xmm3, %%xmm10\n" + "subpd %%xmm3, %%xmm11\n" + "movapd %%xmm4, %%xmm12\n" + "movapd %%xmm4, %%xmm13\n" + "addpd %%xmm5, %%xmm12\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm6, %%xmm14\n" + "movapd %%xmm6, %%xmm15\n" + "addpd %%xmm7, %%xmm14\n" + "subpd %%xmm7, %%xmm15\n" + "movapd %%xmm8, %%xmm0\n" + "movapd %%xmm8, %%xmm2\n" + "addpd %%xmm10, %%xmm0\n" + "subpd %%xmm10, %%xmm2\n" + "movapd %%xmm9, %%xmm1\n" + "movapd %%xmm9, %%xmm3\n" + "addpd %%xmm11, %%xmm1\n" + "subpd %%xmm11, %%xmm3\n" + "movapd %%xmm12, %%xmm4\n" + "movapd %%xmm12, %%xmm6\n" + "addpd %%xmm14, %%xmm4\n" + "subpd %%xmm14, %%xmm6\n" + "movapd %%xmm13, %%xmm5\n" + "movapd %%xmm13, %%xmm7\n" + "addpd %%xmm15, %%xmm5\n" + "subpd %%xmm15, %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm12\n" + "addpd %%xmm4, %%xmm8\n" + "subpd %%xmm4, %%xmm12\n" + "movapd %%xmm1, %%xmm9\n" + "movapd %%xmm1, %%xmm13\n" + "addpd %%xmm5, %%xmm9\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm14\n" + "addpd %%xmm6, %%xmm10\n" + "subpd %%xmm6, %%xmm14\n" + "movapd %%xmm3, %%xmm11\n" + "movapd %%xmm3, %%xmm15\n" + "addpd %%xmm7, %%xmm11\n" + "subpd %%xmm7, %%xmm15\n" + "movupd %%xmm8, (%0)\n" + "movupd %%xmm9, (%1)\n" + "movupd %%xmm10, (%2)\n" + "movupd %%xmm11, (%3)\n" + "movupd %%xmm12, (%4)\n" + "movupd %%xmm13, (%5)\n" + "movupd %%xmm14, (%6)\n" + "movupd %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 512), "r"(buf + j + k + 1024), "r"(buf + j + k + 1536), "r"(buf + j + k + 2048), "r"(buf + j + k + 2560), "r"(buf + j + k + 3072), "r"(buf + j + k + 3584) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } + if (depth == 14) { + helper_double_14_recursive(buf + 0, 12); + helper_double_14_recursive(buf + 4096, 12); + helper_double_14_recursive(buf + 8192, 12); + helper_double_14_recursive(buf + 12288, 12); + for (int j = 0; j < 16384; j += 16384) { + for (int k = 0; k < 4096; k += 2) { + __asm__ volatile ( + "movupd (%0), %%xmm0\n" + "movupd (%1), %%xmm1\n" + "movupd (%2), %%xmm2\n" + "movupd (%3), %%xmm3\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm9\n" + "addpd %%xmm1, %%xmm8\n" + "subpd %%xmm1, %%xmm9\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm11\n" + "addpd %%xmm3, %%xmm10\n" + "subpd %%xmm3, %%xmm11\n" + "movapd %%xmm8, %%xmm0\n" + "movapd %%xmm8, %%xmm2\n" + "addpd %%xmm10, %%xmm0\n" + "subpd %%xmm10, %%xmm2\n" + "movapd %%xmm9, %%xmm1\n" + "movapd %%xmm9, %%xmm3\n" + "addpd %%xmm11, %%xmm1\n" + "subpd %%xmm11, %%xmm3\n" + "movupd %%xmm0, (%0)\n" + "movupd %%xmm1, (%1)\n" + "movupd %%xmm2, (%2)\n" + "movupd %%xmm3, (%3)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 4096), "r"(buf + j + k + 8192), "r"(buf + j + k + 12288) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } +} +void helper_double_14(double *buf); +void helper_double_14(double *buf) { + helper_double_14_recursive(buf, 14); +} +void helper_double_15_recursive(double *buf, int depth); +void helper_double_15_recursive(double *buf, int depth) { + if (depth == 10) { + for (int j = 0; j < 1024; j += 16) { + for (int k = 0; k < 2; k += 2) { + __asm__ volatile ( + "movupd (%0), 
%%xmm0\n" + "movupd (%1), %%xmm1\n" + "movupd (%2), %%xmm2\n" + "movupd (%3), %%xmm3\n" + "movupd (%4), %%xmm4\n" + "movupd (%5), %%xmm5\n" + "movupd (%6), %%xmm6\n" + "movupd (%7), %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "haddpd %%xmm8, %%xmm8\n" + "movapd %%xmm0, %%xmm9\n" + "hsubpd %%xmm9, %%xmm9\n" + "blendpd $1, %%xmm8, %%xmm9\n" + "movapd %%xmm9, %%xmm0\n" + "movapd %%xmm1, %%xmm8\n" + "haddpd %%xmm8, %%xmm8\n" + "movapd %%xmm1, %%xmm9\n" + "hsubpd %%xmm9, %%xmm9\n" + "blendpd $1, %%xmm8, %%xmm9\n" + "movapd %%xmm9, %%xmm1\n" + "movapd %%xmm2, %%xmm8\n" + "haddpd %%xmm8, %%xmm8\n" + "movapd %%xmm2, %%xmm9\n" + "hsubpd %%xmm9, %%xmm9\n" + "blendpd $1, %%xmm8, %%xmm9\n" + "movapd %%xmm9, %%xmm2\n" + "movapd %%xmm3, %%xmm8\n" + "haddpd %%xmm8, %%xmm8\n" + "movapd %%xmm3, %%xmm9\n" + "hsubpd %%xmm9, %%xmm9\n" + "blendpd $1, %%xmm8, %%xmm9\n" + "movapd %%xmm9, %%xmm3\n" + "movapd %%xmm4, %%xmm8\n" + "haddpd %%xmm8, %%xmm8\n" + "movapd %%xmm4, %%xmm9\n" + "hsubpd %%xmm9, %%xmm9\n" + "blendpd $1, %%xmm8, %%xmm9\n" + "movapd %%xmm9, %%xmm4\n" + "movapd %%xmm5, %%xmm8\n" + "haddpd %%xmm8, %%xmm8\n" + "movapd %%xmm5, %%xmm9\n" + "hsubpd %%xmm9, %%xmm9\n" + "blendpd $1, %%xmm8, %%xmm9\n" + "movapd %%xmm9, %%xmm5\n" + "movapd %%xmm6, %%xmm8\n" + "haddpd %%xmm8, %%xmm8\n" + "movapd %%xmm6, %%xmm9\n" + "hsubpd %%xmm9, %%xmm9\n" + "blendpd $1, %%xmm8, %%xmm9\n" + "movapd %%xmm9, %%xmm6\n" + "movapd %%xmm7, %%xmm8\n" + "haddpd %%xmm8, %%xmm8\n" + "movapd %%xmm7, %%xmm9\n" + "hsubpd %%xmm9, %%xmm9\n" + "blendpd $1, %%xmm8, %%xmm9\n" + "movapd %%xmm9, %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm9\n" + "addpd %%xmm1, %%xmm8\n" + "subpd %%xmm1, %%xmm9\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm11\n" + "addpd %%xmm3, %%xmm10\n" + "subpd %%xmm3, %%xmm11\n" + "movapd %%xmm4, %%xmm12\n" + "movapd %%xmm4, %%xmm13\n" + "addpd %%xmm5, %%xmm12\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm6, %%xmm14\n" + "movapd %%xmm6, %%xmm15\n" + "addpd %%xmm7, %%xmm14\n" + "subpd %%xmm7, %%xmm15\n" + "movapd %%xmm8, %%xmm0\n" + "movapd %%xmm8, %%xmm2\n" + "addpd %%xmm10, %%xmm0\n" + "subpd %%xmm10, %%xmm2\n" + "movapd %%xmm9, %%xmm1\n" + "movapd %%xmm9, %%xmm3\n" + "addpd %%xmm11, %%xmm1\n" + "subpd %%xmm11, %%xmm3\n" + "movapd %%xmm12, %%xmm4\n" + "movapd %%xmm12, %%xmm6\n" + "addpd %%xmm14, %%xmm4\n" + "subpd %%xmm14, %%xmm6\n" + "movapd %%xmm13, %%xmm5\n" + "movapd %%xmm13, %%xmm7\n" + "addpd %%xmm15, %%xmm5\n" + "subpd %%xmm15, %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm12\n" + "addpd %%xmm4, %%xmm8\n" + "subpd %%xmm4, %%xmm12\n" + "movapd %%xmm1, %%xmm9\n" + "movapd %%xmm1, %%xmm13\n" + "addpd %%xmm5, %%xmm9\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm14\n" + "addpd %%xmm6, %%xmm10\n" + "subpd %%xmm6, %%xmm14\n" + "movapd %%xmm3, %%xmm11\n" + "movapd %%xmm3, %%xmm15\n" + "addpd %%xmm7, %%xmm11\n" + "subpd %%xmm7, %%xmm15\n" + "movupd %%xmm8, (%0)\n" + "movupd %%xmm9, (%1)\n" + "movupd %%xmm10, (%2)\n" + "movupd %%xmm11, (%3)\n" + "movupd %%xmm12, (%4)\n" + "movupd %%xmm13, (%5)\n" + "movupd %%xmm14, (%6)\n" + "movupd %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 2), "r"(buf + j + k + 4), "r"(buf + j + k + 6), "r"(buf + j + k + 8), "r"(buf + j + k + 10), "r"(buf + j + k + 12), "r"(buf + j + k + 14) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + for (int j = 0; j < 1024; j += 128) { + for (int 
k = 0; k < 16; k += 2) { + __asm__ volatile ( + "movupd (%0), %%xmm0\n" + "movupd (%1), %%xmm1\n" + "movupd (%2), %%xmm2\n" + "movupd (%3), %%xmm3\n" + "movupd (%4), %%xmm4\n" + "movupd (%5), %%xmm5\n" + "movupd (%6), %%xmm6\n" + "movupd (%7), %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm9\n" + "addpd %%xmm1, %%xmm8\n" + "subpd %%xmm1, %%xmm9\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm11\n" + "addpd %%xmm3, %%xmm10\n" + "subpd %%xmm3, %%xmm11\n" + "movapd %%xmm4, %%xmm12\n" + "movapd %%xmm4, %%xmm13\n" + "addpd %%xmm5, %%xmm12\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm6, %%xmm14\n" + "movapd %%xmm6, %%xmm15\n" + "addpd %%xmm7, %%xmm14\n" + "subpd %%xmm7, %%xmm15\n" + "movapd %%xmm8, %%xmm0\n" + "movapd %%xmm8, %%xmm2\n" + "addpd %%xmm10, %%xmm0\n" + "subpd %%xmm10, %%xmm2\n" + "movapd %%xmm9, %%xmm1\n" + "movapd %%xmm9, %%xmm3\n" + "addpd %%xmm11, %%xmm1\n" + "subpd %%xmm11, %%xmm3\n" + "movapd %%xmm12, %%xmm4\n" + "movapd %%xmm12, %%xmm6\n" + "addpd %%xmm14, %%xmm4\n" + "subpd %%xmm14, %%xmm6\n" + "movapd %%xmm13, %%xmm5\n" + "movapd %%xmm13, %%xmm7\n" + "addpd %%xmm15, %%xmm5\n" + "subpd %%xmm15, %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm12\n" + "addpd %%xmm4, %%xmm8\n" + "subpd %%xmm4, %%xmm12\n" + "movapd %%xmm1, %%xmm9\n" + "movapd %%xmm1, %%xmm13\n" + "addpd %%xmm5, %%xmm9\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm14\n" + "addpd %%xmm6, %%xmm10\n" + "subpd %%xmm6, %%xmm14\n" + "movapd %%xmm3, %%xmm11\n" + "movapd %%xmm3, %%xmm15\n" + "addpd %%xmm7, %%xmm11\n" + "subpd %%xmm7, %%xmm15\n" + "movupd %%xmm8, (%0)\n" + "movupd %%xmm9, (%1)\n" + "movupd %%xmm10, (%2)\n" + "movupd %%xmm11, (%3)\n" + "movupd %%xmm12, (%4)\n" + "movupd %%xmm13, (%5)\n" + "movupd %%xmm14, (%6)\n" + "movupd %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 16), "r"(buf + j + k + 32), "r"(buf + j + k + 48), "r"(buf + j + k + 64), "r"(buf + j + k + 80), "r"(buf + j + k + 96), "r"(buf + j + k + 112) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + for (int j = 0; j < 1024; j += 1024) { + for (int k = 0; k < 128; k += 2) { + __asm__ volatile ( + "movupd (%0), %%xmm0\n" + "movupd (%1), %%xmm1\n" + "movupd (%2), %%xmm2\n" + "movupd (%3), %%xmm3\n" + "movupd (%4), %%xmm4\n" + "movupd (%5), %%xmm5\n" + "movupd (%6), %%xmm6\n" + "movupd (%7), %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm9\n" + "addpd %%xmm1, %%xmm8\n" + "subpd %%xmm1, %%xmm9\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm11\n" + "addpd %%xmm3, %%xmm10\n" + "subpd %%xmm3, %%xmm11\n" + "movapd %%xmm4, %%xmm12\n" + "movapd %%xmm4, %%xmm13\n" + "addpd %%xmm5, %%xmm12\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm6, %%xmm14\n" + "movapd %%xmm6, %%xmm15\n" + "addpd %%xmm7, %%xmm14\n" + "subpd %%xmm7, %%xmm15\n" + "movapd %%xmm8, %%xmm0\n" + "movapd %%xmm8, %%xmm2\n" + "addpd %%xmm10, %%xmm0\n" + "subpd %%xmm10, %%xmm2\n" + "movapd %%xmm9, %%xmm1\n" + "movapd %%xmm9, %%xmm3\n" + "addpd %%xmm11, %%xmm1\n" + "subpd %%xmm11, %%xmm3\n" + "movapd %%xmm12, %%xmm4\n" + "movapd %%xmm12, %%xmm6\n" + "addpd %%xmm14, %%xmm4\n" + "subpd %%xmm14, %%xmm6\n" + "movapd %%xmm13, %%xmm5\n" + "movapd %%xmm13, %%xmm7\n" + "addpd %%xmm15, %%xmm5\n" + "subpd %%xmm15, %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm12\n" + "addpd %%xmm4, %%xmm8\n" + "subpd %%xmm4, %%xmm12\n" + "movapd %%xmm1, %%xmm9\n" + "movapd %%xmm1, 
%%xmm13\n" + "addpd %%xmm5, %%xmm9\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm14\n" + "addpd %%xmm6, %%xmm10\n" + "subpd %%xmm6, %%xmm14\n" + "movapd %%xmm3, %%xmm11\n" + "movapd %%xmm3, %%xmm15\n" + "addpd %%xmm7, %%xmm11\n" + "subpd %%xmm7, %%xmm15\n" + "movupd %%xmm8, (%0)\n" + "movupd %%xmm9, (%1)\n" + "movupd %%xmm10, (%2)\n" + "movupd %%xmm11, (%3)\n" + "movupd %%xmm12, (%4)\n" + "movupd %%xmm13, (%5)\n" + "movupd %%xmm14, (%6)\n" + "movupd %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 128), "r"(buf + j + k + 256), "r"(buf + j + k + 384), "r"(buf + j + k + 512), "r"(buf + j + k + 640), "r"(buf + j + k + 768), "r"(buf + j + k + 896) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } + if (depth == 13) { + helper_double_15_recursive(buf + 0, 10); + helper_double_15_recursive(buf + 1024, 10); + helper_double_15_recursive(buf + 2048, 10); + helper_double_15_recursive(buf + 3072, 10); + helper_double_15_recursive(buf + 4096, 10); + helper_double_15_recursive(buf + 5120, 10); + helper_double_15_recursive(buf + 6144, 10); + helper_double_15_recursive(buf + 7168, 10); + for (int j = 0; j < 8192; j += 8192) { + for (int k = 0; k < 1024; k += 2) { + __asm__ volatile ( + "movupd (%0), %%xmm0\n" + "movupd (%1), %%xmm1\n" + "movupd (%2), %%xmm2\n" + "movupd (%3), %%xmm3\n" + "movupd (%4), %%xmm4\n" + "movupd (%5), %%xmm5\n" + "movupd (%6), %%xmm6\n" + "movupd (%7), %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm9\n" + "addpd %%xmm1, %%xmm8\n" + "subpd %%xmm1, %%xmm9\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm11\n" + "addpd %%xmm3, %%xmm10\n" + "subpd %%xmm3, %%xmm11\n" + "movapd %%xmm4, %%xmm12\n" + "movapd %%xmm4, %%xmm13\n" + "addpd %%xmm5, %%xmm12\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm6, %%xmm14\n" + "movapd %%xmm6, %%xmm15\n" + "addpd %%xmm7, %%xmm14\n" + "subpd %%xmm7, %%xmm15\n" + "movapd %%xmm8, %%xmm0\n" + "movapd %%xmm8, %%xmm2\n" + "addpd %%xmm10, %%xmm0\n" + "subpd %%xmm10, %%xmm2\n" + "movapd %%xmm9, %%xmm1\n" + "movapd %%xmm9, %%xmm3\n" + "addpd %%xmm11, %%xmm1\n" + "subpd %%xmm11, %%xmm3\n" + "movapd %%xmm12, %%xmm4\n" + "movapd %%xmm12, %%xmm6\n" + "addpd %%xmm14, %%xmm4\n" + "subpd %%xmm14, %%xmm6\n" + "movapd %%xmm13, %%xmm5\n" + "movapd %%xmm13, %%xmm7\n" + "addpd %%xmm15, %%xmm5\n" + "subpd %%xmm15, %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm12\n" + "addpd %%xmm4, %%xmm8\n" + "subpd %%xmm4, %%xmm12\n" + "movapd %%xmm1, %%xmm9\n" + "movapd %%xmm1, %%xmm13\n" + "addpd %%xmm5, %%xmm9\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm14\n" + "addpd %%xmm6, %%xmm10\n" + "subpd %%xmm6, %%xmm14\n" + "movapd %%xmm3, %%xmm11\n" + "movapd %%xmm3, %%xmm15\n" + "addpd %%xmm7, %%xmm11\n" + "subpd %%xmm7, %%xmm15\n" + "movupd %%xmm8, (%0)\n" + "movupd %%xmm9, (%1)\n" + "movupd %%xmm10, (%2)\n" + "movupd %%xmm11, (%3)\n" + "movupd %%xmm12, (%4)\n" + "movupd %%xmm13, (%5)\n" + "movupd %%xmm14, (%6)\n" + "movupd %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 1024), "r"(buf + j + k + 2048), "r"(buf + j + k + 3072), "r"(buf + j + k + 4096), "r"(buf + j + k + 5120), "r"(buf + j + k + 6144), "r"(buf + j + k + 7168) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } + if (depth 
== 15) { + helper_double_15_recursive(buf + 0, 13); + helper_double_15_recursive(buf + 8192, 13); + helper_double_15_recursive(buf + 16384, 13); + helper_double_15_recursive(buf + 24576, 13); + for (int j = 0; j < 32768; j += 32768) { + for (int k = 0; k < 8192; k += 2) { + __asm__ volatile ( + "movupd (%0), %%xmm0\n" + "movupd (%1), %%xmm1\n" + "movupd (%2), %%xmm2\n" + "movupd (%3), %%xmm3\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm9\n" + "addpd %%xmm1, %%xmm8\n" + "subpd %%xmm1, %%xmm9\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm11\n" + "addpd %%xmm3, %%xmm10\n" + "subpd %%xmm3, %%xmm11\n" + "movapd %%xmm8, %%xmm0\n" + "movapd %%xmm8, %%xmm2\n" + "addpd %%xmm10, %%xmm0\n" + "subpd %%xmm10, %%xmm2\n" + "movapd %%xmm9, %%xmm1\n" + "movapd %%xmm9, %%xmm3\n" + "addpd %%xmm11, %%xmm1\n" + "subpd %%xmm11, %%xmm3\n" + "movupd %%xmm0, (%0)\n" + "movupd %%xmm1, (%1)\n" + "movupd %%xmm2, (%2)\n" + "movupd %%xmm3, (%3)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 8192), "r"(buf + j + k + 16384), "r"(buf + j + k + 24576) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } +} +void helper_double_15(double *buf); +void helper_double_15(double *buf) { + helper_double_15_recursive(buf, 15); +} +void helper_double_16_recursive(double *buf, int depth); +void helper_double_16_recursive(double *buf, int depth) { + if (depth == 2) { + for (int j = 0; j < 4; j += 4) { + for (int k = 0; k < 2; k += 2) { + __asm__ volatile ( + "movupd (%0), %%xmm0\n" + "movupd (%1), %%xmm1\n" + "movapd %%xmm0, %%xmm8\n" + "haddpd %%xmm8, %%xmm8\n" + "movapd %%xmm0, %%xmm9\n" + "hsubpd %%xmm9, %%xmm9\n" + "blendpd $1, %%xmm8, %%xmm9\n" + "movapd %%xmm9, %%xmm0\n" + "movapd %%xmm1, %%xmm8\n" + "haddpd %%xmm8, %%xmm8\n" + "movapd %%xmm1, %%xmm9\n" + "hsubpd %%xmm9, %%xmm9\n" + "blendpd $1, %%xmm8, %%xmm9\n" + "movapd %%xmm9, %%xmm1\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm9\n" + "addpd %%xmm1, %%xmm8\n" + "subpd %%xmm1, %%xmm9\n" + "movupd %%xmm8, (%0)\n" + "movupd %%xmm9, (%1)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 2) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } + if (depth == 5) { + helper_double_16_recursive(buf + 0, 2); + helper_double_16_recursive(buf + 4, 2); + helper_double_16_recursive(buf + 8, 2); + helper_double_16_recursive(buf + 12, 2); + helper_double_16_recursive(buf + 16, 2); + helper_double_16_recursive(buf + 20, 2); + helper_double_16_recursive(buf + 24, 2); + helper_double_16_recursive(buf + 28, 2); + for (int j = 0; j < 32; j += 32) { + for (int k = 0; k < 4; k += 2) { + __asm__ volatile ( + "movupd (%0), %%xmm0\n" + "movupd (%1), %%xmm1\n" + "movupd (%2), %%xmm2\n" + "movupd (%3), %%xmm3\n" + "movupd (%4), %%xmm4\n" + "movupd (%5), %%xmm5\n" + "movupd (%6), %%xmm6\n" + "movupd (%7), %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm9\n" + "addpd %%xmm1, %%xmm8\n" + "subpd %%xmm1, %%xmm9\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm11\n" + "addpd %%xmm3, %%xmm10\n" + "subpd %%xmm3, %%xmm11\n" + "movapd %%xmm4, %%xmm12\n" + "movapd %%xmm4, %%xmm13\n" + "addpd %%xmm5, %%xmm12\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm6, %%xmm14\n" + "movapd %%xmm6, %%xmm15\n" + "addpd %%xmm7, %%xmm14\n" + "subpd %%xmm7, %%xmm15\n" + "movapd %%xmm8, %%xmm0\n" + "movapd %%xmm8, %%xmm2\n" 
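+        /* Depth-5 combine for the 2^16 kernel: merges the eight 4-point blocks produced by the recursive calls above. */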
+ "addpd %%xmm10, %%xmm0\n" + "subpd %%xmm10, %%xmm2\n" + "movapd %%xmm9, %%xmm1\n" + "movapd %%xmm9, %%xmm3\n" + "addpd %%xmm11, %%xmm1\n" + "subpd %%xmm11, %%xmm3\n" + "movapd %%xmm12, %%xmm4\n" + "movapd %%xmm12, %%xmm6\n" + "addpd %%xmm14, %%xmm4\n" + "subpd %%xmm14, %%xmm6\n" + "movapd %%xmm13, %%xmm5\n" + "movapd %%xmm13, %%xmm7\n" + "addpd %%xmm15, %%xmm5\n" + "subpd %%xmm15, %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm12\n" + "addpd %%xmm4, %%xmm8\n" + "subpd %%xmm4, %%xmm12\n" + "movapd %%xmm1, %%xmm9\n" + "movapd %%xmm1, %%xmm13\n" + "addpd %%xmm5, %%xmm9\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm14\n" + "addpd %%xmm6, %%xmm10\n" + "subpd %%xmm6, %%xmm14\n" + "movapd %%xmm3, %%xmm11\n" + "movapd %%xmm3, %%xmm15\n" + "addpd %%xmm7, %%xmm11\n" + "subpd %%xmm7, %%xmm15\n" + "movupd %%xmm8, (%0)\n" + "movupd %%xmm9, (%1)\n" + "movupd %%xmm10, (%2)\n" + "movupd %%xmm11, (%3)\n" + "movupd %%xmm12, (%4)\n" + "movupd %%xmm13, (%5)\n" + "movupd %%xmm14, (%6)\n" + "movupd %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } + if (depth == 8) { + helper_double_16_recursive(buf + 0, 5); + helper_double_16_recursive(buf + 32, 5); + helper_double_16_recursive(buf + 64, 5); + helper_double_16_recursive(buf + 96, 5); + helper_double_16_recursive(buf + 128, 5); + helper_double_16_recursive(buf + 160, 5); + helper_double_16_recursive(buf + 192, 5); + helper_double_16_recursive(buf + 224, 5); + for (int j = 0; j < 256; j += 256) { + for (int k = 0; k < 32; k += 2) { + __asm__ volatile ( + "movupd (%0), %%xmm0\n" + "movupd (%1), %%xmm1\n" + "movupd (%2), %%xmm2\n" + "movupd (%3), %%xmm3\n" + "movupd (%4), %%xmm4\n" + "movupd (%5), %%xmm5\n" + "movupd (%6), %%xmm6\n" + "movupd (%7), %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm9\n" + "addpd %%xmm1, %%xmm8\n" + "subpd %%xmm1, %%xmm9\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm11\n" + "addpd %%xmm3, %%xmm10\n" + "subpd %%xmm3, %%xmm11\n" + "movapd %%xmm4, %%xmm12\n" + "movapd %%xmm4, %%xmm13\n" + "addpd %%xmm5, %%xmm12\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm6, %%xmm14\n" + "movapd %%xmm6, %%xmm15\n" + "addpd %%xmm7, %%xmm14\n" + "subpd %%xmm7, %%xmm15\n" + "movapd %%xmm8, %%xmm0\n" + "movapd %%xmm8, %%xmm2\n" + "addpd %%xmm10, %%xmm0\n" + "subpd %%xmm10, %%xmm2\n" + "movapd %%xmm9, %%xmm1\n" + "movapd %%xmm9, %%xmm3\n" + "addpd %%xmm11, %%xmm1\n" + "subpd %%xmm11, %%xmm3\n" + "movapd %%xmm12, %%xmm4\n" + "movapd %%xmm12, %%xmm6\n" + "addpd %%xmm14, %%xmm4\n" + "subpd %%xmm14, %%xmm6\n" + "movapd %%xmm13, %%xmm5\n" + "movapd %%xmm13, %%xmm7\n" + "addpd %%xmm15, %%xmm5\n" + "subpd %%xmm15, %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm12\n" + "addpd %%xmm4, %%xmm8\n" + "subpd %%xmm4, %%xmm12\n" + "movapd %%xmm1, %%xmm9\n" + "movapd %%xmm1, %%xmm13\n" + "addpd %%xmm5, %%xmm9\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm14\n" + "addpd %%xmm6, %%xmm10\n" + "subpd %%xmm6, %%xmm14\n" + "movapd %%xmm3, %%xmm11\n" + "movapd %%xmm3, %%xmm15\n" + "addpd %%xmm7, %%xmm11\n" + "subpd %%xmm7, %%xmm15\n" + "movupd %%xmm8, (%0)\n" + "movupd %%xmm9, (%1)\n" + "movupd %%xmm10, (%2)\n" + "movupd 
%%xmm11, (%3)\n" + "movupd %%xmm12, (%4)\n" + "movupd %%xmm13, (%5)\n" + "movupd %%xmm14, (%6)\n" + "movupd %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96), "r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } + if (depth == 11) { + helper_double_16_recursive(buf + 0, 8); + helper_double_16_recursive(buf + 256, 8); + helper_double_16_recursive(buf + 512, 8); + helper_double_16_recursive(buf + 768, 8); + helper_double_16_recursive(buf + 1024, 8); + helper_double_16_recursive(buf + 1280, 8); + helper_double_16_recursive(buf + 1536, 8); + helper_double_16_recursive(buf + 1792, 8); + for (int j = 0; j < 2048; j += 2048) { + for (int k = 0; k < 256; k += 2) { + __asm__ volatile ( + "movupd (%0), %%xmm0\n" + "movupd (%1), %%xmm1\n" + "movupd (%2), %%xmm2\n" + "movupd (%3), %%xmm3\n" + "movupd (%4), %%xmm4\n" + "movupd (%5), %%xmm5\n" + "movupd (%6), %%xmm6\n" + "movupd (%7), %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm9\n" + "addpd %%xmm1, %%xmm8\n" + "subpd %%xmm1, %%xmm9\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm11\n" + "addpd %%xmm3, %%xmm10\n" + "subpd %%xmm3, %%xmm11\n" + "movapd %%xmm4, %%xmm12\n" + "movapd %%xmm4, %%xmm13\n" + "addpd %%xmm5, %%xmm12\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm6, %%xmm14\n" + "movapd %%xmm6, %%xmm15\n" + "addpd %%xmm7, %%xmm14\n" + "subpd %%xmm7, %%xmm15\n" + "movapd %%xmm8, %%xmm0\n" + "movapd %%xmm8, %%xmm2\n" + "addpd %%xmm10, %%xmm0\n" + "subpd %%xmm10, %%xmm2\n" + "movapd %%xmm9, %%xmm1\n" + "movapd %%xmm9, %%xmm3\n" + "addpd %%xmm11, %%xmm1\n" + "subpd %%xmm11, %%xmm3\n" + "movapd %%xmm12, %%xmm4\n" + "movapd %%xmm12, %%xmm6\n" + "addpd %%xmm14, %%xmm4\n" + "subpd %%xmm14, %%xmm6\n" + "movapd %%xmm13, %%xmm5\n" + "movapd %%xmm13, %%xmm7\n" + "addpd %%xmm15, %%xmm5\n" + "subpd %%xmm15, %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm12\n" + "addpd %%xmm4, %%xmm8\n" + "subpd %%xmm4, %%xmm12\n" + "movapd %%xmm1, %%xmm9\n" + "movapd %%xmm1, %%xmm13\n" + "addpd %%xmm5, %%xmm9\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm14\n" + "addpd %%xmm6, %%xmm10\n" + "subpd %%xmm6, %%xmm14\n" + "movapd %%xmm3, %%xmm11\n" + "movapd %%xmm3, %%xmm15\n" + "addpd %%xmm7, %%xmm11\n" + "subpd %%xmm7, %%xmm15\n" + "movupd %%xmm8, (%0)\n" + "movupd %%xmm9, (%1)\n" + "movupd %%xmm10, (%2)\n" + "movupd %%xmm11, (%3)\n" + "movupd %%xmm12, (%4)\n" + "movupd %%xmm13, (%5)\n" + "movupd %%xmm14, (%6)\n" + "movupd %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 256), "r"(buf + j + k + 512), "r"(buf + j + k + 768), "r"(buf + j + k + 1024), "r"(buf + j + k + 1280), "r"(buf + j + k + 1536), "r"(buf + j + k + 1792) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } + if (depth == 14) { + helper_double_16_recursive(buf + 0, 11); + helper_double_16_recursive(buf + 2048, 11); + helper_double_16_recursive(buf + 4096, 11); + helper_double_16_recursive(buf + 6144, 11); + helper_double_16_recursive(buf + 8192, 11); + helper_double_16_recursive(buf + 10240, 11); + helper_double_16_recursive(buf + 12288, 11); + helper_double_16_recursive(buf + 14336, 11); + for (int j = 0; j < 
16384; j += 16384) { + for (int k = 0; k < 2048; k += 2) { + __asm__ volatile ( + "movupd (%0), %%xmm0\n" + "movupd (%1), %%xmm1\n" + "movupd (%2), %%xmm2\n" + "movupd (%3), %%xmm3\n" + "movupd (%4), %%xmm4\n" + "movupd (%5), %%xmm5\n" + "movupd (%6), %%xmm6\n" + "movupd (%7), %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm9\n" + "addpd %%xmm1, %%xmm8\n" + "subpd %%xmm1, %%xmm9\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm11\n" + "addpd %%xmm3, %%xmm10\n" + "subpd %%xmm3, %%xmm11\n" + "movapd %%xmm4, %%xmm12\n" + "movapd %%xmm4, %%xmm13\n" + "addpd %%xmm5, %%xmm12\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm6, %%xmm14\n" + "movapd %%xmm6, %%xmm15\n" + "addpd %%xmm7, %%xmm14\n" + "subpd %%xmm7, %%xmm15\n" + "movapd %%xmm8, %%xmm0\n" + "movapd %%xmm8, %%xmm2\n" + "addpd %%xmm10, %%xmm0\n" + "subpd %%xmm10, %%xmm2\n" + "movapd %%xmm9, %%xmm1\n" + "movapd %%xmm9, %%xmm3\n" + "addpd %%xmm11, %%xmm1\n" + "subpd %%xmm11, %%xmm3\n" + "movapd %%xmm12, %%xmm4\n" + "movapd %%xmm12, %%xmm6\n" + "addpd %%xmm14, %%xmm4\n" + "subpd %%xmm14, %%xmm6\n" + "movapd %%xmm13, %%xmm5\n" + "movapd %%xmm13, %%xmm7\n" + "addpd %%xmm15, %%xmm5\n" + "subpd %%xmm15, %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm12\n" + "addpd %%xmm4, %%xmm8\n" + "subpd %%xmm4, %%xmm12\n" + "movapd %%xmm1, %%xmm9\n" + "movapd %%xmm1, %%xmm13\n" + "addpd %%xmm5, %%xmm9\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm14\n" + "addpd %%xmm6, %%xmm10\n" + "subpd %%xmm6, %%xmm14\n" + "movapd %%xmm3, %%xmm11\n" + "movapd %%xmm3, %%xmm15\n" + "addpd %%xmm7, %%xmm11\n" + "subpd %%xmm7, %%xmm15\n" + "movupd %%xmm8, (%0)\n" + "movupd %%xmm9, (%1)\n" + "movupd %%xmm10, (%2)\n" + "movupd %%xmm11, (%3)\n" + "movupd %%xmm12, (%4)\n" + "movupd %%xmm13, (%5)\n" + "movupd %%xmm14, (%6)\n" + "movupd %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 2048), "r"(buf + j + k + 4096), "r"(buf + j + k + 6144), "r"(buf + j + k + 8192), "r"(buf + j + k + 10240), "r"(buf + j + k + 12288), "r"(buf + j + k + 14336) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } + if (depth == 16) { + helper_double_16_recursive(buf + 0, 14); + helper_double_16_recursive(buf + 16384, 14); + helper_double_16_recursive(buf + 32768, 14); + helper_double_16_recursive(buf + 49152, 14); + for (int j = 0; j < 65536; j += 65536) { + for (int k = 0; k < 16384; k += 2) { + __asm__ volatile ( + "movupd (%0), %%xmm0\n" + "movupd (%1), %%xmm1\n" + "movupd (%2), %%xmm2\n" + "movupd (%3), %%xmm3\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm9\n" + "addpd %%xmm1, %%xmm8\n" + "subpd %%xmm1, %%xmm9\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm11\n" + "addpd %%xmm3, %%xmm10\n" + "subpd %%xmm3, %%xmm11\n" + "movapd %%xmm8, %%xmm0\n" + "movapd %%xmm8, %%xmm2\n" + "addpd %%xmm10, %%xmm0\n" + "subpd %%xmm10, %%xmm2\n" + "movapd %%xmm9, %%xmm1\n" + "movapd %%xmm9, %%xmm3\n" + "addpd %%xmm11, %%xmm1\n" + "subpd %%xmm11, %%xmm3\n" + "movupd %%xmm0, (%0)\n" + "movupd %%xmm1, (%1)\n" + "movupd %%xmm2, (%2)\n" + "movupd %%xmm3, (%3)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 16384), "r"(buf + j + k + 32768), "r"(buf + j + k + 49152) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } +} +void helper_double_16(double *buf); +void 
helper_double_16(double *buf) { + helper_double_16_recursive(buf, 16); +} +void helper_double_17_recursive(double *buf, int depth); +void helper_double_17_recursive(double *buf, int depth) { + if (depth == 12) { + for (int j = 0; j < 4096; j += 16) { + for (int k = 0; k < 2; k += 2) { + __asm__ volatile ( + "movupd (%0), %%xmm0\n" + "movupd (%1), %%xmm1\n" + "movupd (%2), %%xmm2\n" + "movupd (%3), %%xmm3\n" + "movupd (%4), %%xmm4\n" + "movupd (%5), %%xmm5\n" + "movupd (%6), %%xmm6\n" + "movupd (%7), %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "haddpd %%xmm8, %%xmm8\n" + "movapd %%xmm0, %%xmm9\n" + "hsubpd %%xmm9, %%xmm9\n" + "blendpd $1, %%xmm8, %%xmm9\n" + "movapd %%xmm9, %%xmm0\n" + "movapd %%xmm1, %%xmm8\n" + "haddpd %%xmm8, %%xmm8\n" + "movapd %%xmm1, %%xmm9\n" + "hsubpd %%xmm9, %%xmm9\n" + "blendpd $1, %%xmm8, %%xmm9\n" + "movapd %%xmm9, %%xmm1\n" + "movapd %%xmm2, %%xmm8\n" + "haddpd %%xmm8, %%xmm8\n" + "movapd %%xmm2, %%xmm9\n" + "hsubpd %%xmm9, %%xmm9\n" + "blendpd $1, %%xmm8, %%xmm9\n" + "movapd %%xmm9, %%xmm2\n" + "movapd %%xmm3, %%xmm8\n" + "haddpd %%xmm8, %%xmm8\n" + "movapd %%xmm3, %%xmm9\n" + "hsubpd %%xmm9, %%xmm9\n" + "blendpd $1, %%xmm8, %%xmm9\n" + "movapd %%xmm9, %%xmm3\n" + "movapd %%xmm4, %%xmm8\n" + "haddpd %%xmm8, %%xmm8\n" + "movapd %%xmm4, %%xmm9\n" + "hsubpd %%xmm9, %%xmm9\n" + "blendpd $1, %%xmm8, %%xmm9\n" + "movapd %%xmm9, %%xmm4\n" + "movapd %%xmm5, %%xmm8\n" + "haddpd %%xmm8, %%xmm8\n" + "movapd %%xmm5, %%xmm9\n" + "hsubpd %%xmm9, %%xmm9\n" + "blendpd $1, %%xmm8, %%xmm9\n" + "movapd %%xmm9, %%xmm5\n" + "movapd %%xmm6, %%xmm8\n" + "haddpd %%xmm8, %%xmm8\n" + "movapd %%xmm6, %%xmm9\n" + "hsubpd %%xmm9, %%xmm9\n" + "blendpd $1, %%xmm8, %%xmm9\n" + "movapd %%xmm9, %%xmm6\n" + "movapd %%xmm7, %%xmm8\n" + "haddpd %%xmm8, %%xmm8\n" + "movapd %%xmm7, %%xmm9\n" + "hsubpd %%xmm9, %%xmm9\n" + "blendpd $1, %%xmm8, %%xmm9\n" + "movapd %%xmm9, %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm9\n" + "addpd %%xmm1, %%xmm8\n" + "subpd %%xmm1, %%xmm9\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm11\n" + "addpd %%xmm3, %%xmm10\n" + "subpd %%xmm3, %%xmm11\n" + "movapd %%xmm4, %%xmm12\n" + "movapd %%xmm4, %%xmm13\n" + "addpd %%xmm5, %%xmm12\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm6, %%xmm14\n" + "movapd %%xmm6, %%xmm15\n" + "addpd %%xmm7, %%xmm14\n" + "subpd %%xmm7, %%xmm15\n" + "movapd %%xmm8, %%xmm0\n" + "movapd %%xmm8, %%xmm2\n" + "addpd %%xmm10, %%xmm0\n" + "subpd %%xmm10, %%xmm2\n" + "movapd %%xmm9, %%xmm1\n" + "movapd %%xmm9, %%xmm3\n" + "addpd %%xmm11, %%xmm1\n" + "subpd %%xmm11, %%xmm3\n" + "movapd %%xmm12, %%xmm4\n" + "movapd %%xmm12, %%xmm6\n" + "addpd %%xmm14, %%xmm4\n" + "subpd %%xmm14, %%xmm6\n" + "movapd %%xmm13, %%xmm5\n" + "movapd %%xmm13, %%xmm7\n" + "addpd %%xmm15, %%xmm5\n" + "subpd %%xmm15, %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm12\n" + "addpd %%xmm4, %%xmm8\n" + "subpd %%xmm4, %%xmm12\n" + "movapd %%xmm1, %%xmm9\n" + "movapd %%xmm1, %%xmm13\n" + "addpd %%xmm5, %%xmm9\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm14\n" + "addpd %%xmm6, %%xmm10\n" + "subpd %%xmm6, %%xmm14\n" + "movapd %%xmm3, %%xmm11\n" + "movapd %%xmm3, %%xmm15\n" + "addpd %%xmm7, %%xmm11\n" + "subpd %%xmm7, %%xmm15\n" + "movupd %%xmm8, (%0)\n" + "movupd %%xmm9, (%1)\n" + "movupd %%xmm10, (%2)\n" + "movupd %%xmm11, (%3)\n" + "movupd %%xmm12, (%4)\n" + "movupd %%xmm13, (%5)\n" + "movupd %%xmm14, (%6)\n" + "movupd %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 2), "r"(buf + j + k + 4), "r"(buf + j + 
k + 6), "r"(buf + j + k + 8), "r"(buf + j + k + 10), "r"(buf + j + k + 12), "r"(buf + j + k + 14) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + for (int j = 0; j < 4096; j += 128) { + for (int k = 0; k < 16; k += 2) { + __asm__ volatile ( + "movupd (%0), %%xmm0\n" + "movupd (%1), %%xmm1\n" + "movupd (%2), %%xmm2\n" + "movupd (%3), %%xmm3\n" + "movupd (%4), %%xmm4\n" + "movupd (%5), %%xmm5\n" + "movupd (%6), %%xmm6\n" + "movupd (%7), %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm9\n" + "addpd %%xmm1, %%xmm8\n" + "subpd %%xmm1, %%xmm9\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm11\n" + "addpd %%xmm3, %%xmm10\n" + "subpd %%xmm3, %%xmm11\n" + "movapd %%xmm4, %%xmm12\n" + "movapd %%xmm4, %%xmm13\n" + "addpd %%xmm5, %%xmm12\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm6, %%xmm14\n" + "movapd %%xmm6, %%xmm15\n" + "addpd %%xmm7, %%xmm14\n" + "subpd %%xmm7, %%xmm15\n" + "movapd %%xmm8, %%xmm0\n" + "movapd %%xmm8, %%xmm2\n" + "addpd %%xmm10, %%xmm0\n" + "subpd %%xmm10, %%xmm2\n" + "movapd %%xmm9, %%xmm1\n" + "movapd %%xmm9, %%xmm3\n" + "addpd %%xmm11, %%xmm1\n" + "subpd %%xmm11, %%xmm3\n" + "movapd %%xmm12, %%xmm4\n" + "movapd %%xmm12, %%xmm6\n" + "addpd %%xmm14, %%xmm4\n" + "subpd %%xmm14, %%xmm6\n" + "movapd %%xmm13, %%xmm5\n" + "movapd %%xmm13, %%xmm7\n" + "addpd %%xmm15, %%xmm5\n" + "subpd %%xmm15, %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm12\n" + "addpd %%xmm4, %%xmm8\n" + "subpd %%xmm4, %%xmm12\n" + "movapd %%xmm1, %%xmm9\n" + "movapd %%xmm1, %%xmm13\n" + "addpd %%xmm5, %%xmm9\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm14\n" + "addpd %%xmm6, %%xmm10\n" + "subpd %%xmm6, %%xmm14\n" + "movapd %%xmm3, %%xmm11\n" + "movapd %%xmm3, %%xmm15\n" + "addpd %%xmm7, %%xmm11\n" + "subpd %%xmm7, %%xmm15\n" + "movupd %%xmm8, (%0)\n" + "movupd %%xmm9, (%1)\n" + "movupd %%xmm10, (%2)\n" + "movupd %%xmm11, (%3)\n" + "movupd %%xmm12, (%4)\n" + "movupd %%xmm13, (%5)\n" + "movupd %%xmm14, (%6)\n" + "movupd %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 16), "r"(buf + j + k + 32), "r"(buf + j + k + 48), "r"(buf + j + k + 64), "r"(buf + j + k + 80), "r"(buf + j + k + 96), "r"(buf + j + k + 112) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + for (int j = 0; j < 4096; j += 1024) { + for (int k = 0; k < 128; k += 2) { + __asm__ volatile ( + "movupd (%0), %%xmm0\n" + "movupd (%1), %%xmm1\n" + "movupd (%2), %%xmm2\n" + "movupd (%3), %%xmm3\n" + "movupd (%4), %%xmm4\n" + "movupd (%5), %%xmm5\n" + "movupd (%6), %%xmm6\n" + "movupd (%7), %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm9\n" + "addpd %%xmm1, %%xmm8\n" + "subpd %%xmm1, %%xmm9\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm11\n" + "addpd %%xmm3, %%xmm10\n" + "subpd %%xmm3, %%xmm11\n" + "movapd %%xmm4, %%xmm12\n" + "movapd %%xmm4, %%xmm13\n" + "addpd %%xmm5, %%xmm12\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm6, %%xmm14\n" + "movapd %%xmm6, %%xmm15\n" + "addpd %%xmm7, %%xmm14\n" + "subpd %%xmm7, %%xmm15\n" + "movapd %%xmm8, %%xmm0\n" + "movapd %%xmm8, %%xmm2\n" + "addpd %%xmm10, %%xmm0\n" + "subpd %%xmm10, %%xmm2\n" + "movapd %%xmm9, %%xmm1\n" + "movapd %%xmm9, %%xmm3\n" + "addpd %%xmm11, %%xmm1\n" + "subpd %%xmm11, %%xmm3\n" + "movapd %%xmm12, %%xmm4\n" + "movapd %%xmm12, %%xmm6\n" + 
"addpd %%xmm14, %%xmm4\n" + "subpd %%xmm14, %%xmm6\n" + "movapd %%xmm13, %%xmm5\n" + "movapd %%xmm13, %%xmm7\n" + "addpd %%xmm15, %%xmm5\n" + "subpd %%xmm15, %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm12\n" + "addpd %%xmm4, %%xmm8\n" + "subpd %%xmm4, %%xmm12\n" + "movapd %%xmm1, %%xmm9\n" + "movapd %%xmm1, %%xmm13\n" + "addpd %%xmm5, %%xmm9\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm14\n" + "addpd %%xmm6, %%xmm10\n" + "subpd %%xmm6, %%xmm14\n" + "movapd %%xmm3, %%xmm11\n" + "movapd %%xmm3, %%xmm15\n" + "addpd %%xmm7, %%xmm11\n" + "subpd %%xmm7, %%xmm15\n" + "movupd %%xmm8, (%0)\n" + "movupd %%xmm9, (%1)\n" + "movupd %%xmm10, (%2)\n" + "movupd %%xmm11, (%3)\n" + "movupd %%xmm12, (%4)\n" + "movupd %%xmm13, (%5)\n" + "movupd %%xmm14, (%6)\n" + "movupd %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 128), "r"(buf + j + k + 256), "r"(buf + j + k + 384), "r"(buf + j + k + 512), "r"(buf + j + k + 640), "r"(buf + j + k + 768), "r"(buf + j + k + 896) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + for (int j = 0; j < 4096; j += 4096) { + for (int k = 0; k < 1024; k += 2) { + __asm__ volatile ( + "movupd (%0), %%xmm0\n" + "movupd (%1), %%xmm1\n" + "movupd (%2), %%xmm2\n" + "movupd (%3), %%xmm3\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm9\n" + "addpd %%xmm1, %%xmm8\n" + "subpd %%xmm1, %%xmm9\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm11\n" + "addpd %%xmm3, %%xmm10\n" + "subpd %%xmm3, %%xmm11\n" + "movapd %%xmm8, %%xmm0\n" + "movapd %%xmm8, %%xmm2\n" + "addpd %%xmm10, %%xmm0\n" + "subpd %%xmm10, %%xmm2\n" + "movapd %%xmm9, %%xmm1\n" + "movapd %%xmm9, %%xmm3\n" + "addpd %%xmm11, %%xmm1\n" + "subpd %%xmm11, %%xmm3\n" + "movupd %%xmm0, (%0)\n" + "movupd %%xmm1, (%1)\n" + "movupd %%xmm2, (%2)\n" + "movupd %%xmm3, (%3)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 1024), "r"(buf + j + k + 2048), "r"(buf + j + k + 3072) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } + if (depth == 15) { + helper_double_17_recursive(buf + 0, 12); + helper_double_17_recursive(buf + 4096, 12); + helper_double_17_recursive(buf + 8192, 12); + helper_double_17_recursive(buf + 12288, 12); + helper_double_17_recursive(buf + 16384, 12); + helper_double_17_recursive(buf + 20480, 12); + helper_double_17_recursive(buf + 24576, 12); + helper_double_17_recursive(buf + 28672, 12); + for (int j = 0; j < 32768; j += 32768) { + for (int k = 0; k < 4096; k += 2) { + __asm__ volatile ( + "movupd (%0), %%xmm0\n" + "movupd (%1), %%xmm1\n" + "movupd (%2), %%xmm2\n" + "movupd (%3), %%xmm3\n" + "movupd (%4), %%xmm4\n" + "movupd (%5), %%xmm5\n" + "movupd (%6), %%xmm6\n" + "movupd (%7), %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm9\n" + "addpd %%xmm1, %%xmm8\n" + "subpd %%xmm1, %%xmm9\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm11\n" + "addpd %%xmm3, %%xmm10\n" + "subpd %%xmm3, %%xmm11\n" + "movapd %%xmm4, %%xmm12\n" + "movapd %%xmm4, %%xmm13\n" + "addpd %%xmm5, %%xmm12\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm6, %%xmm14\n" + "movapd %%xmm6, %%xmm15\n" + "addpd %%xmm7, %%xmm14\n" + "subpd %%xmm7, %%xmm15\n" + "movapd %%xmm8, %%xmm0\n" + "movapd %%xmm8, %%xmm2\n" + "addpd %%xmm10, %%xmm0\n" + "subpd %%xmm10, %%xmm2\n" + "movapd %%xmm9, %%xmm1\n" + "movapd 
%%xmm9, %%xmm3\n" + "addpd %%xmm11, %%xmm1\n" + "subpd %%xmm11, %%xmm3\n" + "movapd %%xmm12, %%xmm4\n" + "movapd %%xmm12, %%xmm6\n" + "addpd %%xmm14, %%xmm4\n" + "subpd %%xmm14, %%xmm6\n" + "movapd %%xmm13, %%xmm5\n" + "movapd %%xmm13, %%xmm7\n" + "addpd %%xmm15, %%xmm5\n" + "subpd %%xmm15, %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm12\n" + "addpd %%xmm4, %%xmm8\n" + "subpd %%xmm4, %%xmm12\n" + "movapd %%xmm1, %%xmm9\n" + "movapd %%xmm1, %%xmm13\n" + "addpd %%xmm5, %%xmm9\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm14\n" + "addpd %%xmm6, %%xmm10\n" + "subpd %%xmm6, %%xmm14\n" + "movapd %%xmm3, %%xmm11\n" + "movapd %%xmm3, %%xmm15\n" + "addpd %%xmm7, %%xmm11\n" + "subpd %%xmm7, %%xmm15\n" + "movupd %%xmm8, (%0)\n" + "movupd %%xmm9, (%1)\n" + "movupd %%xmm10, (%2)\n" + "movupd %%xmm11, (%3)\n" + "movupd %%xmm12, (%4)\n" + "movupd %%xmm13, (%5)\n" + "movupd %%xmm14, (%6)\n" + "movupd %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 4096), "r"(buf + j + k + 8192), "r"(buf + j + k + 12288), "r"(buf + j + k + 16384), "r"(buf + j + k + 20480), "r"(buf + j + k + 24576), "r"(buf + j + k + 28672) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } + if (depth == 17) { + helper_double_17_recursive(buf + 0, 15); + helper_double_17_recursive(buf + 32768, 15); + helper_double_17_recursive(buf + 65536, 15); + helper_double_17_recursive(buf + 98304, 15); + for (int j = 0; j < 131072; j += 131072) { + for (int k = 0; k < 32768; k += 2) { + __asm__ volatile ( + "movupd (%0), %%xmm0\n" + "movupd (%1), %%xmm1\n" + "movupd (%2), %%xmm2\n" + "movupd (%3), %%xmm3\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm9\n" + "addpd %%xmm1, %%xmm8\n" + "subpd %%xmm1, %%xmm9\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm11\n" + "addpd %%xmm3, %%xmm10\n" + "subpd %%xmm3, %%xmm11\n" + "movapd %%xmm8, %%xmm0\n" + "movapd %%xmm8, %%xmm2\n" + "addpd %%xmm10, %%xmm0\n" + "subpd %%xmm10, %%xmm2\n" + "movapd %%xmm9, %%xmm1\n" + "movapd %%xmm9, %%xmm3\n" + "addpd %%xmm11, %%xmm1\n" + "subpd %%xmm11, %%xmm3\n" + "movupd %%xmm0, (%0)\n" + "movupd %%xmm1, (%1)\n" + "movupd %%xmm2, (%2)\n" + "movupd %%xmm3, (%3)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 32768), "r"(buf + j + k + 65536), "r"(buf + j + k + 98304) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } +} +void helper_double_17(double *buf); +void helper_double_17(double *buf) { + helper_double_17_recursive(buf, 17); +} +void helper_double_18_recursive(double *buf, int depth); +void helper_double_18_recursive(double *buf, int depth) { + if (depth == 12) { + for (int j = 0; j < 4096; j += 16) { + for (int k = 0; k < 2; k += 2) { + __asm__ volatile ( + "movupd (%0), %%xmm0\n" + "movupd (%1), %%xmm1\n" + "movupd (%2), %%xmm2\n" + "movupd (%3), %%xmm3\n" + "movupd (%4), %%xmm4\n" + "movupd (%5), %%xmm5\n" + "movupd (%6), %%xmm6\n" + "movupd (%7), %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "haddpd %%xmm8, %%xmm8\n" + "movapd %%xmm0, %%xmm9\n" + "hsubpd %%xmm9, %%xmm9\n" + "blendpd $1, %%xmm8, %%xmm9\n" + "movapd %%xmm9, %%xmm0\n" + "movapd %%xmm1, %%xmm8\n" + "haddpd %%xmm8, %%xmm8\n" + "movapd %%xmm1, %%xmm9\n" + "hsubpd %%xmm9, %%xmm9\n" + "blendpd $1, %%xmm8, %%xmm9\n" + "movapd %%xmm9, %%xmm1\n" + "movapd %%xmm2, %%xmm8\n" 
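+ /* The haddpd/hsubpd + blendpd sequence that follows computes (lo+hi, lo-hi) inside each xmm register, i.e. the stride-1 add/sub butterfly of an unnormalized Walsh-Hadamard transform, applied to all eight loaded registers before the cross-register stages. */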
+ "haddpd %%xmm8, %%xmm8\n" + "movapd %%xmm2, %%xmm9\n" + "hsubpd %%xmm9, %%xmm9\n" + "blendpd $1, %%xmm8, %%xmm9\n" + "movapd %%xmm9, %%xmm2\n" + "movapd %%xmm3, %%xmm8\n" + "haddpd %%xmm8, %%xmm8\n" + "movapd %%xmm3, %%xmm9\n" + "hsubpd %%xmm9, %%xmm9\n" + "blendpd $1, %%xmm8, %%xmm9\n" + "movapd %%xmm9, %%xmm3\n" + "movapd %%xmm4, %%xmm8\n" + "haddpd %%xmm8, %%xmm8\n" + "movapd %%xmm4, %%xmm9\n" + "hsubpd %%xmm9, %%xmm9\n" + "blendpd $1, %%xmm8, %%xmm9\n" + "movapd %%xmm9, %%xmm4\n" + "movapd %%xmm5, %%xmm8\n" + "haddpd %%xmm8, %%xmm8\n" + "movapd %%xmm5, %%xmm9\n" + "hsubpd %%xmm9, %%xmm9\n" + "blendpd $1, %%xmm8, %%xmm9\n" + "movapd %%xmm9, %%xmm5\n" + "movapd %%xmm6, %%xmm8\n" + "haddpd %%xmm8, %%xmm8\n" + "movapd %%xmm6, %%xmm9\n" + "hsubpd %%xmm9, %%xmm9\n" + "blendpd $1, %%xmm8, %%xmm9\n" + "movapd %%xmm9, %%xmm6\n" + "movapd %%xmm7, %%xmm8\n" + "haddpd %%xmm8, %%xmm8\n" + "movapd %%xmm7, %%xmm9\n" + "hsubpd %%xmm9, %%xmm9\n" + "blendpd $1, %%xmm8, %%xmm9\n" + "movapd %%xmm9, %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm9\n" + "addpd %%xmm1, %%xmm8\n" + "subpd %%xmm1, %%xmm9\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm11\n" + "addpd %%xmm3, %%xmm10\n" + "subpd %%xmm3, %%xmm11\n" + "movapd %%xmm4, %%xmm12\n" + "movapd %%xmm4, %%xmm13\n" + "addpd %%xmm5, %%xmm12\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm6, %%xmm14\n" + "movapd %%xmm6, %%xmm15\n" + "addpd %%xmm7, %%xmm14\n" + "subpd %%xmm7, %%xmm15\n" + "movapd %%xmm8, %%xmm0\n" + "movapd %%xmm8, %%xmm2\n" + "addpd %%xmm10, %%xmm0\n" + "subpd %%xmm10, %%xmm2\n" + "movapd %%xmm9, %%xmm1\n" + "movapd %%xmm9, %%xmm3\n" + "addpd %%xmm11, %%xmm1\n" + "subpd %%xmm11, %%xmm3\n" + "movapd %%xmm12, %%xmm4\n" + "movapd %%xmm12, %%xmm6\n" + "addpd %%xmm14, %%xmm4\n" + "subpd %%xmm14, %%xmm6\n" + "movapd %%xmm13, %%xmm5\n" + "movapd %%xmm13, %%xmm7\n" + "addpd %%xmm15, %%xmm5\n" + "subpd %%xmm15, %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm12\n" + "addpd %%xmm4, %%xmm8\n" + "subpd %%xmm4, %%xmm12\n" + "movapd %%xmm1, %%xmm9\n" + "movapd %%xmm1, %%xmm13\n" + "addpd %%xmm5, %%xmm9\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm14\n" + "addpd %%xmm6, %%xmm10\n" + "subpd %%xmm6, %%xmm14\n" + "movapd %%xmm3, %%xmm11\n" + "movapd %%xmm3, %%xmm15\n" + "addpd %%xmm7, %%xmm11\n" + "subpd %%xmm7, %%xmm15\n" + "movupd %%xmm8, (%0)\n" + "movupd %%xmm9, (%1)\n" + "movupd %%xmm10, (%2)\n" + "movupd %%xmm11, (%3)\n" + "movupd %%xmm12, (%4)\n" + "movupd %%xmm13, (%5)\n" + "movupd %%xmm14, (%6)\n" + "movupd %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 2), "r"(buf + j + k + 4), "r"(buf + j + k + 6), "r"(buf + j + k + 8), "r"(buf + j + k + 10), "r"(buf + j + k + 12), "r"(buf + j + k + 14) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + for (int j = 0; j < 4096; j += 128) { + for (int k = 0; k < 16; k += 2) { + __asm__ volatile ( + "movupd (%0), %%xmm0\n" + "movupd (%1), %%xmm1\n" + "movupd (%2), %%xmm2\n" + "movupd (%3), %%xmm3\n" + "movupd (%4), %%xmm4\n" + "movupd (%5), %%xmm5\n" + "movupd (%6), %%xmm6\n" + "movupd (%7), %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm9\n" + "addpd %%xmm1, %%xmm8\n" + "subpd %%xmm1, %%xmm9\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm11\n" + "addpd %%xmm3, %%xmm10\n" + "subpd %%xmm3, %%xmm11\n" + "movapd %%xmm4, %%xmm12\n" + "movapd %%xmm4, %%xmm13\n" + "addpd %%xmm5, %%xmm12\n" + 
"subpd %%xmm5, %%xmm13\n" + "movapd %%xmm6, %%xmm14\n" + "movapd %%xmm6, %%xmm15\n" + "addpd %%xmm7, %%xmm14\n" + "subpd %%xmm7, %%xmm15\n" + "movapd %%xmm8, %%xmm0\n" + "movapd %%xmm8, %%xmm2\n" + "addpd %%xmm10, %%xmm0\n" + "subpd %%xmm10, %%xmm2\n" + "movapd %%xmm9, %%xmm1\n" + "movapd %%xmm9, %%xmm3\n" + "addpd %%xmm11, %%xmm1\n" + "subpd %%xmm11, %%xmm3\n" + "movapd %%xmm12, %%xmm4\n" + "movapd %%xmm12, %%xmm6\n" + "addpd %%xmm14, %%xmm4\n" + "subpd %%xmm14, %%xmm6\n" + "movapd %%xmm13, %%xmm5\n" + "movapd %%xmm13, %%xmm7\n" + "addpd %%xmm15, %%xmm5\n" + "subpd %%xmm15, %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm12\n" + "addpd %%xmm4, %%xmm8\n" + "subpd %%xmm4, %%xmm12\n" + "movapd %%xmm1, %%xmm9\n" + "movapd %%xmm1, %%xmm13\n" + "addpd %%xmm5, %%xmm9\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm14\n" + "addpd %%xmm6, %%xmm10\n" + "subpd %%xmm6, %%xmm14\n" + "movapd %%xmm3, %%xmm11\n" + "movapd %%xmm3, %%xmm15\n" + "addpd %%xmm7, %%xmm11\n" + "subpd %%xmm7, %%xmm15\n" + "movupd %%xmm8, (%0)\n" + "movupd %%xmm9, (%1)\n" + "movupd %%xmm10, (%2)\n" + "movupd %%xmm11, (%3)\n" + "movupd %%xmm12, (%4)\n" + "movupd %%xmm13, (%5)\n" + "movupd %%xmm14, (%6)\n" + "movupd %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 16), "r"(buf + j + k + 32), "r"(buf + j + k + 48), "r"(buf + j + k + 64), "r"(buf + j + k + 80), "r"(buf + j + k + 96), "r"(buf + j + k + 112) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + for (int j = 0; j < 4096; j += 1024) { + for (int k = 0; k < 128; k += 2) { + __asm__ volatile ( + "movupd (%0), %%xmm0\n" + "movupd (%1), %%xmm1\n" + "movupd (%2), %%xmm2\n" + "movupd (%3), %%xmm3\n" + "movupd (%4), %%xmm4\n" + "movupd (%5), %%xmm5\n" + "movupd (%6), %%xmm6\n" + "movupd (%7), %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm9\n" + "addpd %%xmm1, %%xmm8\n" + "subpd %%xmm1, %%xmm9\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm11\n" + "addpd %%xmm3, %%xmm10\n" + "subpd %%xmm3, %%xmm11\n" + "movapd %%xmm4, %%xmm12\n" + "movapd %%xmm4, %%xmm13\n" + "addpd %%xmm5, %%xmm12\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm6, %%xmm14\n" + "movapd %%xmm6, %%xmm15\n" + "addpd %%xmm7, %%xmm14\n" + "subpd %%xmm7, %%xmm15\n" + "movapd %%xmm8, %%xmm0\n" + "movapd %%xmm8, %%xmm2\n" + "addpd %%xmm10, %%xmm0\n" + "subpd %%xmm10, %%xmm2\n" + "movapd %%xmm9, %%xmm1\n" + "movapd %%xmm9, %%xmm3\n" + "addpd %%xmm11, %%xmm1\n" + "subpd %%xmm11, %%xmm3\n" + "movapd %%xmm12, %%xmm4\n" + "movapd %%xmm12, %%xmm6\n" + "addpd %%xmm14, %%xmm4\n" + "subpd %%xmm14, %%xmm6\n" + "movapd %%xmm13, %%xmm5\n" + "movapd %%xmm13, %%xmm7\n" + "addpd %%xmm15, %%xmm5\n" + "subpd %%xmm15, %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm12\n" + "addpd %%xmm4, %%xmm8\n" + "subpd %%xmm4, %%xmm12\n" + "movapd %%xmm1, %%xmm9\n" + "movapd %%xmm1, %%xmm13\n" + "addpd %%xmm5, %%xmm9\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm14\n" + "addpd %%xmm6, %%xmm10\n" + "subpd %%xmm6, %%xmm14\n" + "movapd %%xmm3, %%xmm11\n" + "movapd %%xmm3, %%xmm15\n" + "addpd %%xmm7, %%xmm11\n" + "subpd %%xmm7, %%xmm15\n" + "movupd %%xmm8, (%0)\n" + "movupd %%xmm9, (%1)\n" + "movupd %%xmm10, (%2)\n" + "movupd %%xmm11, (%3)\n" + "movupd %%xmm12, (%4)\n" + "movupd %%xmm13, (%5)\n" + "movupd %%xmm14, (%6)\n" + "movupd %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 128), 
"r"(buf + j + k + 256), "r"(buf + j + k + 384), "r"(buf + j + k + 512), "r"(buf + j + k + 640), "r"(buf + j + k + 768), "r"(buf + j + k + 896) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + for (int j = 0; j < 4096; j += 4096) { + for (int k = 0; k < 1024; k += 2) { + __asm__ volatile ( + "movupd (%0), %%xmm0\n" + "movupd (%1), %%xmm1\n" + "movupd (%2), %%xmm2\n" + "movupd (%3), %%xmm3\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm9\n" + "addpd %%xmm1, %%xmm8\n" + "subpd %%xmm1, %%xmm9\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm11\n" + "addpd %%xmm3, %%xmm10\n" + "subpd %%xmm3, %%xmm11\n" + "movapd %%xmm8, %%xmm0\n" + "movapd %%xmm8, %%xmm2\n" + "addpd %%xmm10, %%xmm0\n" + "subpd %%xmm10, %%xmm2\n" + "movapd %%xmm9, %%xmm1\n" + "movapd %%xmm9, %%xmm3\n" + "addpd %%xmm11, %%xmm1\n" + "subpd %%xmm11, %%xmm3\n" + "movupd %%xmm0, (%0)\n" + "movupd %%xmm1, (%1)\n" + "movupd %%xmm2, (%2)\n" + "movupd %%xmm3, (%3)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 1024), "r"(buf + j + k + 2048), "r"(buf + j + k + 3072) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } + if (depth == 15) { + helper_double_18_recursive(buf + 0, 12); + helper_double_18_recursive(buf + 4096, 12); + helper_double_18_recursive(buf + 8192, 12); + helper_double_18_recursive(buf + 12288, 12); + helper_double_18_recursive(buf + 16384, 12); + helper_double_18_recursive(buf + 20480, 12); + helper_double_18_recursive(buf + 24576, 12); + helper_double_18_recursive(buf + 28672, 12); + for (int j = 0; j < 32768; j += 32768) { + for (int k = 0; k < 4096; k += 2) { + __asm__ volatile ( + "movupd (%0), %%xmm0\n" + "movupd (%1), %%xmm1\n" + "movupd (%2), %%xmm2\n" + "movupd (%3), %%xmm3\n" + "movupd (%4), %%xmm4\n" + "movupd (%5), %%xmm5\n" + "movupd (%6), %%xmm6\n" + "movupd (%7), %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm9\n" + "addpd %%xmm1, %%xmm8\n" + "subpd %%xmm1, %%xmm9\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm11\n" + "addpd %%xmm3, %%xmm10\n" + "subpd %%xmm3, %%xmm11\n" + "movapd %%xmm4, %%xmm12\n" + "movapd %%xmm4, %%xmm13\n" + "addpd %%xmm5, %%xmm12\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm6, %%xmm14\n" + "movapd %%xmm6, %%xmm15\n" + "addpd %%xmm7, %%xmm14\n" + "subpd %%xmm7, %%xmm15\n" + "movapd %%xmm8, %%xmm0\n" + "movapd %%xmm8, %%xmm2\n" + "addpd %%xmm10, %%xmm0\n" + "subpd %%xmm10, %%xmm2\n" + "movapd %%xmm9, %%xmm1\n" + "movapd %%xmm9, %%xmm3\n" + "addpd %%xmm11, %%xmm1\n" + "subpd %%xmm11, %%xmm3\n" + "movapd %%xmm12, %%xmm4\n" + "movapd %%xmm12, %%xmm6\n" + "addpd %%xmm14, %%xmm4\n" + "subpd %%xmm14, %%xmm6\n" + "movapd %%xmm13, %%xmm5\n" + "movapd %%xmm13, %%xmm7\n" + "addpd %%xmm15, %%xmm5\n" + "subpd %%xmm15, %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm12\n" + "addpd %%xmm4, %%xmm8\n" + "subpd %%xmm4, %%xmm12\n" + "movapd %%xmm1, %%xmm9\n" + "movapd %%xmm1, %%xmm13\n" + "addpd %%xmm5, %%xmm9\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm14\n" + "addpd %%xmm6, %%xmm10\n" + "subpd %%xmm6, %%xmm14\n" + "movapd %%xmm3, %%xmm11\n" + "movapd %%xmm3, %%xmm15\n" + "addpd %%xmm7, %%xmm11\n" + "subpd %%xmm7, %%xmm15\n" + "movupd %%xmm8, (%0)\n" + "movupd %%xmm9, (%1)\n" + "movupd %%xmm10, (%2)\n" + "movupd %%xmm11, (%3)\n" + "movupd %%xmm12, (%4)\n" + 
"movupd %%xmm13, (%5)\n" + "movupd %%xmm14, (%6)\n" + "movupd %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 4096), "r"(buf + j + k + 8192), "r"(buf + j + k + 12288), "r"(buf + j + k + 16384), "r"(buf + j + k + 20480), "r"(buf + j + k + 24576), "r"(buf + j + k + 28672) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } + if (depth == 18) { + helper_double_18_recursive(buf + 0, 15); + helper_double_18_recursive(buf + 32768, 15); + helper_double_18_recursive(buf + 65536, 15); + helper_double_18_recursive(buf + 98304, 15); + helper_double_18_recursive(buf + 131072, 15); + helper_double_18_recursive(buf + 163840, 15); + helper_double_18_recursive(buf + 196608, 15); + helper_double_18_recursive(buf + 229376, 15); + for (int j = 0; j < 262144; j += 262144) { + for (int k = 0; k < 32768; k += 2) { + __asm__ volatile ( + "movupd (%0), %%xmm0\n" + "movupd (%1), %%xmm1\n" + "movupd (%2), %%xmm2\n" + "movupd (%3), %%xmm3\n" + "movupd (%4), %%xmm4\n" + "movupd (%5), %%xmm5\n" + "movupd (%6), %%xmm6\n" + "movupd (%7), %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm9\n" + "addpd %%xmm1, %%xmm8\n" + "subpd %%xmm1, %%xmm9\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm11\n" + "addpd %%xmm3, %%xmm10\n" + "subpd %%xmm3, %%xmm11\n" + "movapd %%xmm4, %%xmm12\n" + "movapd %%xmm4, %%xmm13\n" + "addpd %%xmm5, %%xmm12\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm6, %%xmm14\n" + "movapd %%xmm6, %%xmm15\n" + "addpd %%xmm7, %%xmm14\n" + "subpd %%xmm7, %%xmm15\n" + "movapd %%xmm8, %%xmm0\n" + "movapd %%xmm8, %%xmm2\n" + "addpd %%xmm10, %%xmm0\n" + "subpd %%xmm10, %%xmm2\n" + "movapd %%xmm9, %%xmm1\n" + "movapd %%xmm9, %%xmm3\n" + "addpd %%xmm11, %%xmm1\n" + "subpd %%xmm11, %%xmm3\n" + "movapd %%xmm12, %%xmm4\n" + "movapd %%xmm12, %%xmm6\n" + "addpd %%xmm14, %%xmm4\n" + "subpd %%xmm14, %%xmm6\n" + "movapd %%xmm13, %%xmm5\n" + "movapd %%xmm13, %%xmm7\n" + "addpd %%xmm15, %%xmm5\n" + "subpd %%xmm15, %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm12\n" + "addpd %%xmm4, %%xmm8\n" + "subpd %%xmm4, %%xmm12\n" + "movapd %%xmm1, %%xmm9\n" + "movapd %%xmm1, %%xmm13\n" + "addpd %%xmm5, %%xmm9\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm14\n" + "addpd %%xmm6, %%xmm10\n" + "subpd %%xmm6, %%xmm14\n" + "movapd %%xmm3, %%xmm11\n" + "movapd %%xmm3, %%xmm15\n" + "addpd %%xmm7, %%xmm11\n" + "subpd %%xmm7, %%xmm15\n" + "movupd %%xmm8, (%0)\n" + "movupd %%xmm9, (%1)\n" + "movupd %%xmm10, (%2)\n" + "movupd %%xmm11, (%3)\n" + "movupd %%xmm12, (%4)\n" + "movupd %%xmm13, (%5)\n" + "movupd %%xmm14, (%6)\n" + "movupd %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 32768), "r"(buf + j + k + 65536), "r"(buf + j + k + 98304), "r"(buf + j + k + 131072), "r"(buf + j + k + 163840), "r"(buf + j + k + 196608), "r"(buf + j + k + 229376) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } +} +void helper_double_18(double *buf); +void helper_double_18(double *buf) { + helper_double_18_recursive(buf, 18); +} +void helper_double_19_recursive(double *buf, int depth); +void helper_double_19_recursive(double *buf, int depth) { + if (depth == 2) { + for (int j = 0; j < 4; j += 4) { + for (int k = 0; k < 2; k += 2) { + __asm__ volatile ( + "movupd (%0), %%xmm0\n" + "movupd (%1), %%xmm1\n" + 
"movapd %%xmm0, %%xmm8\n" + "haddpd %%xmm8, %%xmm8\n" + "movapd %%xmm0, %%xmm9\n" + "hsubpd %%xmm9, %%xmm9\n" + "blendpd $1, %%xmm8, %%xmm9\n" + "movapd %%xmm9, %%xmm0\n" + "movapd %%xmm1, %%xmm8\n" + "haddpd %%xmm8, %%xmm8\n" + "movapd %%xmm1, %%xmm9\n" + "hsubpd %%xmm9, %%xmm9\n" + "blendpd $1, %%xmm8, %%xmm9\n" + "movapd %%xmm9, %%xmm1\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm9\n" + "addpd %%xmm1, %%xmm8\n" + "subpd %%xmm1, %%xmm9\n" + "movupd %%xmm8, (%0)\n" + "movupd %%xmm9, (%1)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 2) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } + if (depth == 5) { + helper_double_19_recursive(buf + 0, 2); + helper_double_19_recursive(buf + 4, 2); + helper_double_19_recursive(buf + 8, 2); + helper_double_19_recursive(buf + 12, 2); + helper_double_19_recursive(buf + 16, 2); + helper_double_19_recursive(buf + 20, 2); + helper_double_19_recursive(buf + 24, 2); + helper_double_19_recursive(buf + 28, 2); + for (int j = 0; j < 32; j += 32) { + for (int k = 0; k < 4; k += 2) { + __asm__ volatile ( + "movupd (%0), %%xmm0\n" + "movupd (%1), %%xmm1\n" + "movupd (%2), %%xmm2\n" + "movupd (%3), %%xmm3\n" + "movupd (%4), %%xmm4\n" + "movupd (%5), %%xmm5\n" + "movupd (%6), %%xmm6\n" + "movupd (%7), %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm9\n" + "addpd %%xmm1, %%xmm8\n" + "subpd %%xmm1, %%xmm9\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm11\n" + "addpd %%xmm3, %%xmm10\n" + "subpd %%xmm3, %%xmm11\n" + "movapd %%xmm4, %%xmm12\n" + "movapd %%xmm4, %%xmm13\n" + "addpd %%xmm5, %%xmm12\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm6, %%xmm14\n" + "movapd %%xmm6, %%xmm15\n" + "addpd %%xmm7, %%xmm14\n" + "subpd %%xmm7, %%xmm15\n" + "movapd %%xmm8, %%xmm0\n" + "movapd %%xmm8, %%xmm2\n" + "addpd %%xmm10, %%xmm0\n" + "subpd %%xmm10, %%xmm2\n" + "movapd %%xmm9, %%xmm1\n" + "movapd %%xmm9, %%xmm3\n" + "addpd %%xmm11, %%xmm1\n" + "subpd %%xmm11, %%xmm3\n" + "movapd %%xmm12, %%xmm4\n" + "movapd %%xmm12, %%xmm6\n" + "addpd %%xmm14, %%xmm4\n" + "subpd %%xmm14, %%xmm6\n" + "movapd %%xmm13, %%xmm5\n" + "movapd %%xmm13, %%xmm7\n" + "addpd %%xmm15, %%xmm5\n" + "subpd %%xmm15, %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm12\n" + "addpd %%xmm4, %%xmm8\n" + "subpd %%xmm4, %%xmm12\n" + "movapd %%xmm1, %%xmm9\n" + "movapd %%xmm1, %%xmm13\n" + "addpd %%xmm5, %%xmm9\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm14\n" + "addpd %%xmm6, %%xmm10\n" + "subpd %%xmm6, %%xmm14\n" + "movapd %%xmm3, %%xmm11\n" + "movapd %%xmm3, %%xmm15\n" + "addpd %%xmm7, %%xmm11\n" + "subpd %%xmm7, %%xmm15\n" + "movupd %%xmm8, (%0)\n" + "movupd %%xmm9, (%1)\n" + "movupd %%xmm10, (%2)\n" + "movupd %%xmm11, (%3)\n" + "movupd %%xmm12, (%4)\n" + "movupd %%xmm13, (%5)\n" + "movupd %%xmm14, (%6)\n" + "movupd %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } + if (depth == 8) { + helper_double_19_recursive(buf + 0, 5); + helper_double_19_recursive(buf + 32, 5); + helper_double_19_recursive(buf + 64, 5); + helper_double_19_recursive(buf + 96, 5); + 
helper_double_19_recursive(buf + 128, 5); + helper_double_19_recursive(buf + 160, 5); + helper_double_19_recursive(buf + 192, 5); + helper_double_19_recursive(buf + 224, 5); + for (int j = 0; j < 256; j += 256) { + for (int k = 0; k < 32; k += 2) { + __asm__ volatile ( + "movupd (%0), %%xmm0\n" + "movupd (%1), %%xmm1\n" + "movupd (%2), %%xmm2\n" + "movupd (%3), %%xmm3\n" + "movupd (%4), %%xmm4\n" + "movupd (%5), %%xmm5\n" + "movupd (%6), %%xmm6\n" + "movupd (%7), %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm9\n" + "addpd %%xmm1, %%xmm8\n" + "subpd %%xmm1, %%xmm9\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm11\n" + "addpd %%xmm3, %%xmm10\n" + "subpd %%xmm3, %%xmm11\n" + "movapd %%xmm4, %%xmm12\n" + "movapd %%xmm4, %%xmm13\n" + "addpd %%xmm5, %%xmm12\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm6, %%xmm14\n" + "movapd %%xmm6, %%xmm15\n" + "addpd %%xmm7, %%xmm14\n" + "subpd %%xmm7, %%xmm15\n" + "movapd %%xmm8, %%xmm0\n" + "movapd %%xmm8, %%xmm2\n" + "addpd %%xmm10, %%xmm0\n" + "subpd %%xmm10, %%xmm2\n" + "movapd %%xmm9, %%xmm1\n" + "movapd %%xmm9, %%xmm3\n" + "addpd %%xmm11, %%xmm1\n" + "subpd %%xmm11, %%xmm3\n" + "movapd %%xmm12, %%xmm4\n" + "movapd %%xmm12, %%xmm6\n" + "addpd %%xmm14, %%xmm4\n" + "subpd %%xmm14, %%xmm6\n" + "movapd %%xmm13, %%xmm5\n" + "movapd %%xmm13, %%xmm7\n" + "addpd %%xmm15, %%xmm5\n" + "subpd %%xmm15, %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm12\n" + "addpd %%xmm4, %%xmm8\n" + "subpd %%xmm4, %%xmm12\n" + "movapd %%xmm1, %%xmm9\n" + "movapd %%xmm1, %%xmm13\n" + "addpd %%xmm5, %%xmm9\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm14\n" + "addpd %%xmm6, %%xmm10\n" + "subpd %%xmm6, %%xmm14\n" + "movapd %%xmm3, %%xmm11\n" + "movapd %%xmm3, %%xmm15\n" + "addpd %%xmm7, %%xmm11\n" + "subpd %%xmm7, %%xmm15\n" + "movupd %%xmm8, (%0)\n" + "movupd %%xmm9, (%1)\n" + "movupd %%xmm10, (%2)\n" + "movupd %%xmm11, (%3)\n" + "movupd %%xmm12, (%4)\n" + "movupd %%xmm13, (%5)\n" + "movupd %%xmm14, (%6)\n" + "movupd %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96), "r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } + if (depth == 11) { + helper_double_19_recursive(buf + 0, 8); + helper_double_19_recursive(buf + 256, 8); + helper_double_19_recursive(buf + 512, 8); + helper_double_19_recursive(buf + 768, 8); + helper_double_19_recursive(buf + 1024, 8); + helper_double_19_recursive(buf + 1280, 8); + helper_double_19_recursive(buf + 1536, 8); + helper_double_19_recursive(buf + 1792, 8); + for (int j = 0; j < 2048; j += 2048) { + for (int k = 0; k < 256; k += 2) { + __asm__ volatile ( + "movupd (%0), %%xmm0\n" + "movupd (%1), %%xmm1\n" + "movupd (%2), %%xmm2\n" + "movupd (%3), %%xmm3\n" + "movupd (%4), %%xmm4\n" + "movupd (%5), %%xmm5\n" + "movupd (%6), %%xmm6\n" + "movupd (%7), %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm9\n" + "addpd %%xmm1, %%xmm8\n" + "subpd %%xmm1, %%xmm9\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm11\n" + "addpd %%xmm3, %%xmm10\n" + "subpd %%xmm3, %%xmm11\n" + "movapd %%xmm4, %%xmm12\n" + "movapd %%xmm4, %%xmm13\n" + "addpd %%xmm5, %%xmm12\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm6, %%xmm14\n" + "movapd %%xmm6, %%xmm15\n" + "addpd %%xmm7, %%xmm14\n" + "subpd %%xmm7, 
%%xmm15\n" + "movapd %%xmm8, %%xmm0\n" + "movapd %%xmm8, %%xmm2\n" + "addpd %%xmm10, %%xmm0\n" + "subpd %%xmm10, %%xmm2\n" + "movapd %%xmm9, %%xmm1\n" + "movapd %%xmm9, %%xmm3\n" + "addpd %%xmm11, %%xmm1\n" + "subpd %%xmm11, %%xmm3\n" + "movapd %%xmm12, %%xmm4\n" + "movapd %%xmm12, %%xmm6\n" + "addpd %%xmm14, %%xmm4\n" + "subpd %%xmm14, %%xmm6\n" + "movapd %%xmm13, %%xmm5\n" + "movapd %%xmm13, %%xmm7\n" + "addpd %%xmm15, %%xmm5\n" + "subpd %%xmm15, %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm12\n" + "addpd %%xmm4, %%xmm8\n" + "subpd %%xmm4, %%xmm12\n" + "movapd %%xmm1, %%xmm9\n" + "movapd %%xmm1, %%xmm13\n" + "addpd %%xmm5, %%xmm9\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm14\n" + "addpd %%xmm6, %%xmm10\n" + "subpd %%xmm6, %%xmm14\n" + "movapd %%xmm3, %%xmm11\n" + "movapd %%xmm3, %%xmm15\n" + "addpd %%xmm7, %%xmm11\n" + "subpd %%xmm7, %%xmm15\n" + "movupd %%xmm8, (%0)\n" + "movupd %%xmm9, (%1)\n" + "movupd %%xmm10, (%2)\n" + "movupd %%xmm11, (%3)\n" + "movupd %%xmm12, (%4)\n" + "movupd %%xmm13, (%5)\n" + "movupd %%xmm14, (%6)\n" + "movupd %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 256), "r"(buf + j + k + 512), "r"(buf + j + k + 768), "r"(buf + j + k + 1024), "r"(buf + j + k + 1280), "r"(buf + j + k + 1536), "r"(buf + j + k + 1792) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } + if (depth == 14) { + helper_double_19_recursive(buf + 0, 11); + helper_double_19_recursive(buf + 2048, 11); + helper_double_19_recursive(buf + 4096, 11); + helper_double_19_recursive(buf + 6144, 11); + helper_double_19_recursive(buf + 8192, 11); + helper_double_19_recursive(buf + 10240, 11); + helper_double_19_recursive(buf + 12288, 11); + helper_double_19_recursive(buf + 14336, 11); + for (int j = 0; j < 16384; j += 16384) { + for (int k = 0; k < 2048; k += 2) { + __asm__ volatile ( + "movupd (%0), %%xmm0\n" + "movupd (%1), %%xmm1\n" + "movupd (%2), %%xmm2\n" + "movupd (%3), %%xmm3\n" + "movupd (%4), %%xmm4\n" + "movupd (%5), %%xmm5\n" + "movupd (%6), %%xmm6\n" + "movupd (%7), %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm9\n" + "addpd %%xmm1, %%xmm8\n" + "subpd %%xmm1, %%xmm9\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm11\n" + "addpd %%xmm3, %%xmm10\n" + "subpd %%xmm3, %%xmm11\n" + "movapd %%xmm4, %%xmm12\n" + "movapd %%xmm4, %%xmm13\n" + "addpd %%xmm5, %%xmm12\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm6, %%xmm14\n" + "movapd %%xmm6, %%xmm15\n" + "addpd %%xmm7, %%xmm14\n" + "subpd %%xmm7, %%xmm15\n" + "movapd %%xmm8, %%xmm0\n" + "movapd %%xmm8, %%xmm2\n" + "addpd %%xmm10, %%xmm0\n" + "subpd %%xmm10, %%xmm2\n" + "movapd %%xmm9, %%xmm1\n" + "movapd %%xmm9, %%xmm3\n" + "addpd %%xmm11, %%xmm1\n" + "subpd %%xmm11, %%xmm3\n" + "movapd %%xmm12, %%xmm4\n" + "movapd %%xmm12, %%xmm6\n" + "addpd %%xmm14, %%xmm4\n" + "subpd %%xmm14, %%xmm6\n" + "movapd %%xmm13, %%xmm5\n" + "movapd %%xmm13, %%xmm7\n" + "addpd %%xmm15, %%xmm5\n" + "subpd %%xmm15, %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm12\n" + "addpd %%xmm4, %%xmm8\n" + "subpd %%xmm4, %%xmm12\n" + "movapd %%xmm1, %%xmm9\n" + "movapd %%xmm1, %%xmm13\n" + "addpd %%xmm5, %%xmm9\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm14\n" + "addpd %%xmm6, %%xmm10\n" + "subpd %%xmm6, %%xmm14\n" + "movapd %%xmm3, %%xmm11\n" + "movapd %%xmm3, %%xmm15\n" + "addpd %%xmm7, %%xmm11\n" + "subpd 
%%xmm7, %%xmm15\n" + "movupd %%xmm8, (%0)\n" + "movupd %%xmm9, (%1)\n" + "movupd %%xmm10, (%2)\n" + "movupd %%xmm11, (%3)\n" + "movupd %%xmm12, (%4)\n" + "movupd %%xmm13, (%5)\n" + "movupd %%xmm14, (%6)\n" + "movupd %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 2048), "r"(buf + j + k + 4096), "r"(buf + j + k + 6144), "r"(buf + j + k + 8192), "r"(buf + j + k + 10240), "r"(buf + j + k + 12288), "r"(buf + j + k + 14336) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } + if (depth == 17) { + helper_double_19_recursive(buf + 0, 14); + helper_double_19_recursive(buf + 16384, 14); + helper_double_19_recursive(buf + 32768, 14); + helper_double_19_recursive(buf + 49152, 14); + helper_double_19_recursive(buf + 65536, 14); + helper_double_19_recursive(buf + 81920, 14); + helper_double_19_recursive(buf + 98304, 14); + helper_double_19_recursive(buf + 114688, 14); + for (int j = 0; j < 131072; j += 131072) { + for (int k = 0; k < 16384; k += 2) { + __asm__ volatile ( + "movupd (%0), %%xmm0\n" + "movupd (%1), %%xmm1\n" + "movupd (%2), %%xmm2\n" + "movupd (%3), %%xmm3\n" + "movupd (%4), %%xmm4\n" + "movupd (%5), %%xmm5\n" + "movupd (%6), %%xmm6\n" + "movupd (%7), %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm9\n" + "addpd %%xmm1, %%xmm8\n" + "subpd %%xmm1, %%xmm9\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm11\n" + "addpd %%xmm3, %%xmm10\n" + "subpd %%xmm3, %%xmm11\n" + "movapd %%xmm4, %%xmm12\n" + "movapd %%xmm4, %%xmm13\n" + "addpd %%xmm5, %%xmm12\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm6, %%xmm14\n" + "movapd %%xmm6, %%xmm15\n" + "addpd %%xmm7, %%xmm14\n" + "subpd %%xmm7, %%xmm15\n" + "movapd %%xmm8, %%xmm0\n" + "movapd %%xmm8, %%xmm2\n" + "addpd %%xmm10, %%xmm0\n" + "subpd %%xmm10, %%xmm2\n" + "movapd %%xmm9, %%xmm1\n" + "movapd %%xmm9, %%xmm3\n" + "addpd %%xmm11, %%xmm1\n" + "subpd %%xmm11, %%xmm3\n" + "movapd %%xmm12, %%xmm4\n" + "movapd %%xmm12, %%xmm6\n" + "addpd %%xmm14, %%xmm4\n" + "subpd %%xmm14, %%xmm6\n" + "movapd %%xmm13, %%xmm5\n" + "movapd %%xmm13, %%xmm7\n" + "addpd %%xmm15, %%xmm5\n" + "subpd %%xmm15, %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm12\n" + "addpd %%xmm4, %%xmm8\n" + "subpd %%xmm4, %%xmm12\n" + "movapd %%xmm1, %%xmm9\n" + "movapd %%xmm1, %%xmm13\n" + "addpd %%xmm5, %%xmm9\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm14\n" + "addpd %%xmm6, %%xmm10\n" + "subpd %%xmm6, %%xmm14\n" + "movapd %%xmm3, %%xmm11\n" + "movapd %%xmm3, %%xmm15\n" + "addpd %%xmm7, %%xmm11\n" + "subpd %%xmm7, %%xmm15\n" + "movupd %%xmm8, (%0)\n" + "movupd %%xmm9, (%1)\n" + "movupd %%xmm10, (%2)\n" + "movupd %%xmm11, (%3)\n" + "movupd %%xmm12, (%4)\n" + "movupd %%xmm13, (%5)\n" + "movupd %%xmm14, (%6)\n" + "movupd %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 16384), "r"(buf + j + k + 32768), "r"(buf + j + k + 49152), "r"(buf + j + k + 65536), "r"(buf + j + k + 81920), "r"(buf + j + k + 98304), "r"(buf + j + k + 114688) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } + if (depth == 19) { + helper_double_19_recursive(buf + 0, 17); + helper_double_19_recursive(buf + 131072, 17); + helper_double_19_recursive(buf + 262144, 17); + helper_double_19_recursive(buf + 393216, 17); + for (int j = 0; j < 524288; j += 524288) { + 
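/* depth == 19 (top level): the four 2^17-double quarters are transformed by the recursive calls above; this loop applies the remaining radix-4 combine, i.e. two fused add/sub stages across pointers 2^17 doubles apart. */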
for (int k = 0; k < 131072; k += 2) { + __asm__ volatile ( + "movupd (%0), %%xmm0\n" + "movupd (%1), %%xmm1\n" + "movupd (%2), %%xmm2\n" + "movupd (%3), %%xmm3\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm9\n" + "addpd %%xmm1, %%xmm8\n" + "subpd %%xmm1, %%xmm9\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm11\n" + "addpd %%xmm3, %%xmm10\n" + "subpd %%xmm3, %%xmm11\n" + "movapd %%xmm8, %%xmm0\n" + "movapd %%xmm8, %%xmm2\n" + "addpd %%xmm10, %%xmm0\n" + "subpd %%xmm10, %%xmm2\n" + "movapd %%xmm9, %%xmm1\n" + "movapd %%xmm9, %%xmm3\n" + "addpd %%xmm11, %%xmm1\n" + "subpd %%xmm11, %%xmm3\n" + "movupd %%xmm0, (%0)\n" + "movupd %%xmm1, (%1)\n" + "movupd %%xmm2, (%2)\n" + "movupd %%xmm3, (%3)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 131072), "r"(buf + j + k + 262144), "r"(buf + j + k + 393216) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } +} +void helper_double_19(double *buf); +void helper_double_19(double *buf) { + helper_double_19_recursive(buf, 19); +} +void helper_double_20_recursive(double *buf, int depth); +void helper_double_20_recursive(double *buf, int depth) { + if (depth == 12) { + for (int j = 0; j < 4096; j += 16) { + for (int k = 0; k < 2; k += 2) { + __asm__ volatile ( + "movupd (%0), %%xmm0\n" + "movupd (%1), %%xmm1\n" + "movupd (%2), %%xmm2\n" + "movupd (%3), %%xmm3\n" + "movupd (%4), %%xmm4\n" + "movupd (%5), %%xmm5\n" + "movupd (%6), %%xmm6\n" + "movupd (%7), %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "haddpd %%xmm8, %%xmm8\n" + "movapd %%xmm0, %%xmm9\n" + "hsubpd %%xmm9, %%xmm9\n" + "blendpd $1, %%xmm8, %%xmm9\n" + "movapd %%xmm9, %%xmm0\n" + "movapd %%xmm1, %%xmm8\n" + "haddpd %%xmm8, %%xmm8\n" + "movapd %%xmm1, %%xmm9\n" + "hsubpd %%xmm9, %%xmm9\n" + "blendpd $1, %%xmm8, %%xmm9\n" + "movapd %%xmm9, %%xmm1\n" + "movapd %%xmm2, %%xmm8\n" + "haddpd %%xmm8, %%xmm8\n" + "movapd %%xmm2, %%xmm9\n" + "hsubpd %%xmm9, %%xmm9\n" + "blendpd $1, %%xmm8, %%xmm9\n" + "movapd %%xmm9, %%xmm2\n" + "movapd %%xmm3, %%xmm8\n" + "haddpd %%xmm8, %%xmm8\n" + "movapd %%xmm3, %%xmm9\n" + "hsubpd %%xmm9, %%xmm9\n" + "blendpd $1, %%xmm8, %%xmm9\n" + "movapd %%xmm9, %%xmm3\n" + "movapd %%xmm4, %%xmm8\n" + "haddpd %%xmm8, %%xmm8\n" + "movapd %%xmm4, %%xmm9\n" + "hsubpd %%xmm9, %%xmm9\n" + "blendpd $1, %%xmm8, %%xmm9\n" + "movapd %%xmm9, %%xmm4\n" + "movapd %%xmm5, %%xmm8\n" + "haddpd %%xmm8, %%xmm8\n" + "movapd %%xmm5, %%xmm9\n" + "hsubpd %%xmm9, %%xmm9\n" + "blendpd $1, %%xmm8, %%xmm9\n" + "movapd %%xmm9, %%xmm5\n" + "movapd %%xmm6, %%xmm8\n" + "haddpd %%xmm8, %%xmm8\n" + "movapd %%xmm6, %%xmm9\n" + "hsubpd %%xmm9, %%xmm9\n" + "blendpd $1, %%xmm8, %%xmm9\n" + "movapd %%xmm9, %%xmm6\n" + "movapd %%xmm7, %%xmm8\n" + "haddpd %%xmm8, %%xmm8\n" + "movapd %%xmm7, %%xmm9\n" + "hsubpd %%xmm9, %%xmm9\n" + "blendpd $1, %%xmm8, %%xmm9\n" + "movapd %%xmm9, %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm9\n" + "addpd %%xmm1, %%xmm8\n" + "subpd %%xmm1, %%xmm9\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm11\n" + "addpd %%xmm3, %%xmm10\n" + "subpd %%xmm3, %%xmm11\n" + "movapd %%xmm4, %%xmm12\n" + "movapd %%xmm4, %%xmm13\n" + "addpd %%xmm5, %%xmm12\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm6, %%xmm14\n" + "movapd %%xmm6, %%xmm15\n" + "addpd %%xmm7, %%xmm14\n" + "subpd %%xmm7, %%xmm15\n" + "movapd %%xmm8, %%xmm0\n" + "movapd %%xmm8, %%xmm2\n" + "addpd %%xmm10, %%xmm0\n" + "subpd %%xmm10, %%xmm2\n" + "movapd %%xmm9, %%xmm1\n" + 
"movapd %%xmm9, %%xmm3\n" + "addpd %%xmm11, %%xmm1\n" + "subpd %%xmm11, %%xmm3\n" + "movapd %%xmm12, %%xmm4\n" + "movapd %%xmm12, %%xmm6\n" + "addpd %%xmm14, %%xmm4\n" + "subpd %%xmm14, %%xmm6\n" + "movapd %%xmm13, %%xmm5\n" + "movapd %%xmm13, %%xmm7\n" + "addpd %%xmm15, %%xmm5\n" + "subpd %%xmm15, %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm12\n" + "addpd %%xmm4, %%xmm8\n" + "subpd %%xmm4, %%xmm12\n" + "movapd %%xmm1, %%xmm9\n" + "movapd %%xmm1, %%xmm13\n" + "addpd %%xmm5, %%xmm9\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm14\n" + "addpd %%xmm6, %%xmm10\n" + "subpd %%xmm6, %%xmm14\n" + "movapd %%xmm3, %%xmm11\n" + "movapd %%xmm3, %%xmm15\n" + "addpd %%xmm7, %%xmm11\n" + "subpd %%xmm7, %%xmm15\n" + "movupd %%xmm8, (%0)\n" + "movupd %%xmm9, (%1)\n" + "movupd %%xmm10, (%2)\n" + "movupd %%xmm11, (%3)\n" + "movupd %%xmm12, (%4)\n" + "movupd %%xmm13, (%5)\n" + "movupd %%xmm14, (%6)\n" + "movupd %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 2), "r"(buf + j + k + 4), "r"(buf + j + k + 6), "r"(buf + j + k + 8), "r"(buf + j + k + 10), "r"(buf + j + k + 12), "r"(buf + j + k + 14) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + for (int j = 0; j < 4096; j += 128) { + for (int k = 0; k < 16; k += 2) { + __asm__ volatile ( + "movupd (%0), %%xmm0\n" + "movupd (%1), %%xmm1\n" + "movupd (%2), %%xmm2\n" + "movupd (%3), %%xmm3\n" + "movupd (%4), %%xmm4\n" + "movupd (%5), %%xmm5\n" + "movupd (%6), %%xmm6\n" + "movupd (%7), %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm9\n" + "addpd %%xmm1, %%xmm8\n" + "subpd %%xmm1, %%xmm9\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm11\n" + "addpd %%xmm3, %%xmm10\n" + "subpd %%xmm3, %%xmm11\n" + "movapd %%xmm4, %%xmm12\n" + "movapd %%xmm4, %%xmm13\n" + "addpd %%xmm5, %%xmm12\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm6, %%xmm14\n" + "movapd %%xmm6, %%xmm15\n" + "addpd %%xmm7, %%xmm14\n" + "subpd %%xmm7, %%xmm15\n" + "movapd %%xmm8, %%xmm0\n" + "movapd %%xmm8, %%xmm2\n" + "addpd %%xmm10, %%xmm0\n" + "subpd %%xmm10, %%xmm2\n" + "movapd %%xmm9, %%xmm1\n" + "movapd %%xmm9, %%xmm3\n" + "addpd %%xmm11, %%xmm1\n" + "subpd %%xmm11, %%xmm3\n" + "movapd %%xmm12, %%xmm4\n" + "movapd %%xmm12, %%xmm6\n" + "addpd %%xmm14, %%xmm4\n" + "subpd %%xmm14, %%xmm6\n" + "movapd %%xmm13, %%xmm5\n" + "movapd %%xmm13, %%xmm7\n" + "addpd %%xmm15, %%xmm5\n" + "subpd %%xmm15, %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm12\n" + "addpd %%xmm4, %%xmm8\n" + "subpd %%xmm4, %%xmm12\n" + "movapd %%xmm1, %%xmm9\n" + "movapd %%xmm1, %%xmm13\n" + "addpd %%xmm5, %%xmm9\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm14\n" + "addpd %%xmm6, %%xmm10\n" + "subpd %%xmm6, %%xmm14\n" + "movapd %%xmm3, %%xmm11\n" + "movapd %%xmm3, %%xmm15\n" + "addpd %%xmm7, %%xmm11\n" + "subpd %%xmm7, %%xmm15\n" + "movupd %%xmm8, (%0)\n" + "movupd %%xmm9, (%1)\n" + "movupd %%xmm10, (%2)\n" + "movupd %%xmm11, (%3)\n" + "movupd %%xmm12, (%4)\n" + "movupd %%xmm13, (%5)\n" + "movupd %%xmm14, (%6)\n" + "movupd %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 16), "r"(buf + j + k + 32), "r"(buf + j + k + 48), "r"(buf + j + k + 64), "r"(buf + j + k + 80), "r"(buf + j + k + 96), "r"(buf + j + k + 112) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", 
"memory" + ); + } + } + for (int j = 0; j < 4096; j += 1024) { + for (int k = 0; k < 128; k += 2) { + __asm__ volatile ( + "movupd (%0), %%xmm0\n" + "movupd (%1), %%xmm1\n" + "movupd (%2), %%xmm2\n" + "movupd (%3), %%xmm3\n" + "movupd (%4), %%xmm4\n" + "movupd (%5), %%xmm5\n" + "movupd (%6), %%xmm6\n" + "movupd (%7), %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm9\n" + "addpd %%xmm1, %%xmm8\n" + "subpd %%xmm1, %%xmm9\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm11\n" + "addpd %%xmm3, %%xmm10\n" + "subpd %%xmm3, %%xmm11\n" + "movapd %%xmm4, %%xmm12\n" + "movapd %%xmm4, %%xmm13\n" + "addpd %%xmm5, %%xmm12\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm6, %%xmm14\n" + "movapd %%xmm6, %%xmm15\n" + "addpd %%xmm7, %%xmm14\n" + "subpd %%xmm7, %%xmm15\n" + "movapd %%xmm8, %%xmm0\n" + "movapd %%xmm8, %%xmm2\n" + "addpd %%xmm10, %%xmm0\n" + "subpd %%xmm10, %%xmm2\n" + "movapd %%xmm9, %%xmm1\n" + "movapd %%xmm9, %%xmm3\n" + "addpd %%xmm11, %%xmm1\n" + "subpd %%xmm11, %%xmm3\n" + "movapd %%xmm12, %%xmm4\n" + "movapd %%xmm12, %%xmm6\n" + "addpd %%xmm14, %%xmm4\n" + "subpd %%xmm14, %%xmm6\n" + "movapd %%xmm13, %%xmm5\n" + "movapd %%xmm13, %%xmm7\n" + "addpd %%xmm15, %%xmm5\n" + "subpd %%xmm15, %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm12\n" + "addpd %%xmm4, %%xmm8\n" + "subpd %%xmm4, %%xmm12\n" + "movapd %%xmm1, %%xmm9\n" + "movapd %%xmm1, %%xmm13\n" + "addpd %%xmm5, %%xmm9\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm14\n" + "addpd %%xmm6, %%xmm10\n" + "subpd %%xmm6, %%xmm14\n" + "movapd %%xmm3, %%xmm11\n" + "movapd %%xmm3, %%xmm15\n" + "addpd %%xmm7, %%xmm11\n" + "subpd %%xmm7, %%xmm15\n" + "movupd %%xmm8, (%0)\n" + "movupd %%xmm9, (%1)\n" + "movupd %%xmm10, (%2)\n" + "movupd %%xmm11, (%3)\n" + "movupd %%xmm12, (%4)\n" + "movupd %%xmm13, (%5)\n" + "movupd %%xmm14, (%6)\n" + "movupd %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 128), "r"(buf + j + k + 256), "r"(buf + j + k + 384), "r"(buf + j + k + 512), "r"(buf + j + k + 640), "r"(buf + j + k + 768), "r"(buf + j + k + 896) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + for (int j = 0; j < 4096; j += 4096) { + for (int k = 0; k < 1024; k += 2) { + __asm__ volatile ( + "movupd (%0), %%xmm0\n" + "movupd (%1), %%xmm1\n" + "movupd (%2), %%xmm2\n" + "movupd (%3), %%xmm3\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm9\n" + "addpd %%xmm1, %%xmm8\n" + "subpd %%xmm1, %%xmm9\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm11\n" + "addpd %%xmm3, %%xmm10\n" + "subpd %%xmm3, %%xmm11\n" + "movapd %%xmm8, %%xmm0\n" + "movapd %%xmm8, %%xmm2\n" + "addpd %%xmm10, %%xmm0\n" + "subpd %%xmm10, %%xmm2\n" + "movapd %%xmm9, %%xmm1\n" + "movapd %%xmm9, %%xmm3\n" + "addpd %%xmm11, %%xmm1\n" + "subpd %%xmm11, %%xmm3\n" + "movupd %%xmm0, (%0)\n" + "movupd %%xmm1, (%1)\n" + "movupd %%xmm2, (%2)\n" + "movupd %%xmm3, (%3)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 1024), "r"(buf + j + k + 2048), "r"(buf + j + k + 3072) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } + if (depth == 15) { + helper_double_20_recursive(buf + 0, 12); + helper_double_20_recursive(buf + 4096, 12); + helper_double_20_recursive(buf + 8192, 12); + helper_double_20_recursive(buf + 12288, 12); + helper_double_20_recursive(buf + 
16384, 12); + helper_double_20_recursive(buf + 20480, 12); + helper_double_20_recursive(buf + 24576, 12); + helper_double_20_recursive(buf + 28672, 12); + for (int j = 0; j < 32768; j += 32768) { + for (int k = 0; k < 4096; k += 2) { + __asm__ volatile ( + "movupd (%0), %%xmm0\n" + "movupd (%1), %%xmm1\n" + "movupd (%2), %%xmm2\n" + "movupd (%3), %%xmm3\n" + "movupd (%4), %%xmm4\n" + "movupd (%5), %%xmm5\n" + "movupd (%6), %%xmm6\n" + "movupd (%7), %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm9\n" + "addpd %%xmm1, %%xmm8\n" + "subpd %%xmm1, %%xmm9\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm11\n" + "addpd %%xmm3, %%xmm10\n" + "subpd %%xmm3, %%xmm11\n" + "movapd %%xmm4, %%xmm12\n" + "movapd %%xmm4, %%xmm13\n" + "addpd %%xmm5, %%xmm12\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm6, %%xmm14\n" + "movapd %%xmm6, %%xmm15\n" + "addpd %%xmm7, %%xmm14\n" + "subpd %%xmm7, %%xmm15\n" + "movapd %%xmm8, %%xmm0\n" + "movapd %%xmm8, %%xmm2\n" + "addpd %%xmm10, %%xmm0\n" + "subpd %%xmm10, %%xmm2\n" + "movapd %%xmm9, %%xmm1\n" + "movapd %%xmm9, %%xmm3\n" + "addpd %%xmm11, %%xmm1\n" + "subpd %%xmm11, %%xmm3\n" + "movapd %%xmm12, %%xmm4\n" + "movapd %%xmm12, %%xmm6\n" + "addpd %%xmm14, %%xmm4\n" + "subpd %%xmm14, %%xmm6\n" + "movapd %%xmm13, %%xmm5\n" + "movapd %%xmm13, %%xmm7\n" + "addpd %%xmm15, %%xmm5\n" + "subpd %%xmm15, %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm12\n" + "addpd %%xmm4, %%xmm8\n" + "subpd %%xmm4, %%xmm12\n" + "movapd %%xmm1, %%xmm9\n" + "movapd %%xmm1, %%xmm13\n" + "addpd %%xmm5, %%xmm9\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm14\n" + "addpd %%xmm6, %%xmm10\n" + "subpd %%xmm6, %%xmm14\n" + "movapd %%xmm3, %%xmm11\n" + "movapd %%xmm3, %%xmm15\n" + "addpd %%xmm7, %%xmm11\n" + "subpd %%xmm7, %%xmm15\n" + "movupd %%xmm8, (%0)\n" + "movupd %%xmm9, (%1)\n" + "movupd %%xmm10, (%2)\n" + "movupd %%xmm11, (%3)\n" + "movupd %%xmm12, (%4)\n" + "movupd %%xmm13, (%5)\n" + "movupd %%xmm14, (%6)\n" + "movupd %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 4096), "r"(buf + j + k + 8192), "r"(buf + j + k + 12288), "r"(buf + j + k + 16384), "r"(buf + j + k + 20480), "r"(buf + j + k + 24576), "r"(buf + j + k + 28672) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } + if (depth == 18) { + helper_double_20_recursive(buf + 0, 15); + helper_double_20_recursive(buf + 32768, 15); + helper_double_20_recursive(buf + 65536, 15); + helper_double_20_recursive(buf + 98304, 15); + helper_double_20_recursive(buf + 131072, 15); + helper_double_20_recursive(buf + 163840, 15); + helper_double_20_recursive(buf + 196608, 15); + helper_double_20_recursive(buf + 229376, 15); + for (int j = 0; j < 262144; j += 262144) { + for (int k = 0; k < 32768; k += 2) { + __asm__ volatile ( + "movupd (%0), %%xmm0\n" + "movupd (%1), %%xmm1\n" + "movupd (%2), %%xmm2\n" + "movupd (%3), %%xmm3\n" + "movupd (%4), %%xmm4\n" + "movupd (%5), %%xmm5\n" + "movupd (%6), %%xmm6\n" + "movupd (%7), %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm9\n" + "addpd %%xmm1, %%xmm8\n" + "subpd %%xmm1, %%xmm9\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm11\n" + "addpd %%xmm3, %%xmm10\n" + "subpd %%xmm3, %%xmm11\n" + "movapd %%xmm4, %%xmm12\n" + "movapd %%xmm4, %%xmm13\n" + "addpd %%xmm5, %%xmm12\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm6, %%xmm14\n" + "movapd %%xmm6, %%xmm15\n" + "addpd %%xmm7, 
%%xmm14\n" + "subpd %%xmm7, %%xmm15\n" + "movapd %%xmm8, %%xmm0\n" + "movapd %%xmm8, %%xmm2\n" + "addpd %%xmm10, %%xmm0\n" + "subpd %%xmm10, %%xmm2\n" + "movapd %%xmm9, %%xmm1\n" + "movapd %%xmm9, %%xmm3\n" + "addpd %%xmm11, %%xmm1\n" + "subpd %%xmm11, %%xmm3\n" + "movapd %%xmm12, %%xmm4\n" + "movapd %%xmm12, %%xmm6\n" + "addpd %%xmm14, %%xmm4\n" + "subpd %%xmm14, %%xmm6\n" + "movapd %%xmm13, %%xmm5\n" + "movapd %%xmm13, %%xmm7\n" + "addpd %%xmm15, %%xmm5\n" + "subpd %%xmm15, %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm12\n" + "addpd %%xmm4, %%xmm8\n" + "subpd %%xmm4, %%xmm12\n" + "movapd %%xmm1, %%xmm9\n" + "movapd %%xmm1, %%xmm13\n" + "addpd %%xmm5, %%xmm9\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm14\n" + "addpd %%xmm6, %%xmm10\n" + "subpd %%xmm6, %%xmm14\n" + "movapd %%xmm3, %%xmm11\n" + "movapd %%xmm3, %%xmm15\n" + "addpd %%xmm7, %%xmm11\n" + "subpd %%xmm7, %%xmm15\n" + "movupd %%xmm8, (%0)\n" + "movupd %%xmm9, (%1)\n" + "movupd %%xmm10, (%2)\n" + "movupd %%xmm11, (%3)\n" + "movupd %%xmm12, (%4)\n" + "movupd %%xmm13, (%5)\n" + "movupd %%xmm14, (%6)\n" + "movupd %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 32768), "r"(buf + j + k + 65536), "r"(buf + j + k + 98304), "r"(buf + j + k + 131072), "r"(buf + j + k + 163840), "r"(buf + j + k + 196608), "r"(buf + j + k + 229376) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } + if (depth == 20) { + helper_double_20_recursive(buf + 0, 18); + helper_double_20_recursive(buf + 262144, 18); + helper_double_20_recursive(buf + 524288, 18); + helper_double_20_recursive(buf + 786432, 18); + for (int j = 0; j < 1048576; j += 1048576) { + for (int k = 0; k < 262144; k += 2) { + __asm__ volatile ( + "movupd (%0), %%xmm0\n" + "movupd (%1), %%xmm1\n" + "movupd (%2), %%xmm2\n" + "movupd (%3), %%xmm3\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm9\n" + "addpd %%xmm1, %%xmm8\n" + "subpd %%xmm1, %%xmm9\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm11\n" + "addpd %%xmm3, %%xmm10\n" + "subpd %%xmm3, %%xmm11\n" + "movapd %%xmm8, %%xmm0\n" + "movapd %%xmm8, %%xmm2\n" + "addpd %%xmm10, %%xmm0\n" + "subpd %%xmm10, %%xmm2\n" + "movapd %%xmm9, %%xmm1\n" + "movapd %%xmm9, %%xmm3\n" + "addpd %%xmm11, %%xmm1\n" + "subpd %%xmm11, %%xmm3\n" + "movupd %%xmm0, (%0)\n" + "movupd %%xmm1, (%1)\n" + "movupd %%xmm2, (%2)\n" + "movupd %%xmm3, (%3)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 262144), "r"(buf + j + k + 524288), "r"(buf + j + k + 786432) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } +} +void helper_double_20(double *buf); +void helper_double_20(double *buf) { + helper_double_20_recursive(buf, 20); +} +void helper_double_21_recursive(double *buf, int depth); +void helper_double_21_recursive(double *buf, int depth) { + if (depth == 13) { + for (int j = 0; j < 8192; j += 16) { + for (int k = 0; k < 2; k += 2) { + __asm__ volatile ( + "movupd (%0), %%xmm0\n" + "movupd (%1), %%xmm1\n" + "movupd (%2), %%xmm2\n" + "movupd (%3), %%xmm3\n" + "movupd (%4), %%xmm4\n" + "movupd (%5), %%xmm5\n" + "movupd (%6), %%xmm6\n" + "movupd (%7), %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "haddpd %%xmm8, %%xmm8\n" + "movapd %%xmm0, %%xmm9\n" + "hsubpd %%xmm9, %%xmm9\n" + "blendpd $1, %%xmm8, %%xmm9\n" + "movapd %%xmm9, 
%%xmm0\n" + "movapd %%xmm1, %%xmm8\n" + "haddpd %%xmm8, %%xmm8\n" + "movapd %%xmm1, %%xmm9\n" + "hsubpd %%xmm9, %%xmm9\n" + "blendpd $1, %%xmm8, %%xmm9\n" + "movapd %%xmm9, %%xmm1\n" + "movapd %%xmm2, %%xmm8\n" + "haddpd %%xmm8, %%xmm8\n" + "movapd %%xmm2, %%xmm9\n" + "hsubpd %%xmm9, %%xmm9\n" + "blendpd $1, %%xmm8, %%xmm9\n" + "movapd %%xmm9, %%xmm2\n" + "movapd %%xmm3, %%xmm8\n" + "haddpd %%xmm8, %%xmm8\n" + "movapd %%xmm3, %%xmm9\n" + "hsubpd %%xmm9, %%xmm9\n" + "blendpd $1, %%xmm8, %%xmm9\n" + "movapd %%xmm9, %%xmm3\n" + "movapd %%xmm4, %%xmm8\n" + "haddpd %%xmm8, %%xmm8\n" + "movapd %%xmm4, %%xmm9\n" + "hsubpd %%xmm9, %%xmm9\n" + "blendpd $1, %%xmm8, %%xmm9\n" + "movapd %%xmm9, %%xmm4\n" + "movapd %%xmm5, %%xmm8\n" + "haddpd %%xmm8, %%xmm8\n" + "movapd %%xmm5, %%xmm9\n" + "hsubpd %%xmm9, %%xmm9\n" + "blendpd $1, %%xmm8, %%xmm9\n" + "movapd %%xmm9, %%xmm5\n" + "movapd %%xmm6, %%xmm8\n" + "haddpd %%xmm8, %%xmm8\n" + "movapd %%xmm6, %%xmm9\n" + "hsubpd %%xmm9, %%xmm9\n" + "blendpd $1, %%xmm8, %%xmm9\n" + "movapd %%xmm9, %%xmm6\n" + "movapd %%xmm7, %%xmm8\n" + "haddpd %%xmm8, %%xmm8\n" + "movapd %%xmm7, %%xmm9\n" + "hsubpd %%xmm9, %%xmm9\n" + "blendpd $1, %%xmm8, %%xmm9\n" + "movapd %%xmm9, %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm9\n" + "addpd %%xmm1, %%xmm8\n" + "subpd %%xmm1, %%xmm9\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm11\n" + "addpd %%xmm3, %%xmm10\n" + "subpd %%xmm3, %%xmm11\n" + "movapd %%xmm4, %%xmm12\n" + "movapd %%xmm4, %%xmm13\n" + "addpd %%xmm5, %%xmm12\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm6, %%xmm14\n" + "movapd %%xmm6, %%xmm15\n" + "addpd %%xmm7, %%xmm14\n" + "subpd %%xmm7, %%xmm15\n" + "movapd %%xmm8, %%xmm0\n" + "movapd %%xmm8, %%xmm2\n" + "addpd %%xmm10, %%xmm0\n" + "subpd %%xmm10, %%xmm2\n" + "movapd %%xmm9, %%xmm1\n" + "movapd %%xmm9, %%xmm3\n" + "addpd %%xmm11, %%xmm1\n" + "subpd %%xmm11, %%xmm3\n" + "movapd %%xmm12, %%xmm4\n" + "movapd %%xmm12, %%xmm6\n" + "addpd %%xmm14, %%xmm4\n" + "subpd %%xmm14, %%xmm6\n" + "movapd %%xmm13, %%xmm5\n" + "movapd %%xmm13, %%xmm7\n" + "addpd %%xmm15, %%xmm5\n" + "subpd %%xmm15, %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm12\n" + "addpd %%xmm4, %%xmm8\n" + "subpd %%xmm4, %%xmm12\n" + "movapd %%xmm1, %%xmm9\n" + "movapd %%xmm1, %%xmm13\n" + "addpd %%xmm5, %%xmm9\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm14\n" + "addpd %%xmm6, %%xmm10\n" + "subpd %%xmm6, %%xmm14\n" + "movapd %%xmm3, %%xmm11\n" + "movapd %%xmm3, %%xmm15\n" + "addpd %%xmm7, %%xmm11\n" + "subpd %%xmm7, %%xmm15\n" + "movupd %%xmm8, (%0)\n" + "movupd %%xmm9, (%1)\n" + "movupd %%xmm10, (%2)\n" + "movupd %%xmm11, (%3)\n" + "movupd %%xmm12, (%4)\n" + "movupd %%xmm13, (%5)\n" + "movupd %%xmm14, (%6)\n" + "movupd %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 2), "r"(buf + j + k + 4), "r"(buf + j + k + 6), "r"(buf + j + k + 8), "r"(buf + j + k + 10), "r"(buf + j + k + 12), "r"(buf + j + k + 14) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + for (int j = 0; j < 8192; j += 128) { + for (int k = 0; k < 16; k += 2) { + __asm__ volatile ( + "movupd (%0), %%xmm0\n" + "movupd (%1), %%xmm1\n" + "movupd (%2), %%xmm2\n" + "movupd (%3), %%xmm3\n" + "movupd (%4), %%xmm4\n" + "movupd (%5), %%xmm5\n" + "movupd (%6), %%xmm6\n" + "movupd (%7), %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm9\n" + "addpd %%xmm1, %%xmm8\n" + "subpd %%xmm1, 
%%xmm9\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm11\n" + "addpd %%xmm3, %%xmm10\n" + "subpd %%xmm3, %%xmm11\n" + "movapd %%xmm4, %%xmm12\n" + "movapd %%xmm4, %%xmm13\n" + "addpd %%xmm5, %%xmm12\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm6, %%xmm14\n" + "movapd %%xmm6, %%xmm15\n" + "addpd %%xmm7, %%xmm14\n" + "subpd %%xmm7, %%xmm15\n" + "movapd %%xmm8, %%xmm0\n" + "movapd %%xmm8, %%xmm2\n" + "addpd %%xmm10, %%xmm0\n" + "subpd %%xmm10, %%xmm2\n" + "movapd %%xmm9, %%xmm1\n" + "movapd %%xmm9, %%xmm3\n" + "addpd %%xmm11, %%xmm1\n" + "subpd %%xmm11, %%xmm3\n" + "movapd %%xmm12, %%xmm4\n" + "movapd %%xmm12, %%xmm6\n" + "addpd %%xmm14, %%xmm4\n" + "subpd %%xmm14, %%xmm6\n" + "movapd %%xmm13, %%xmm5\n" + "movapd %%xmm13, %%xmm7\n" + "addpd %%xmm15, %%xmm5\n" + "subpd %%xmm15, %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm12\n" + "addpd %%xmm4, %%xmm8\n" + "subpd %%xmm4, %%xmm12\n" + "movapd %%xmm1, %%xmm9\n" + "movapd %%xmm1, %%xmm13\n" + "addpd %%xmm5, %%xmm9\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm14\n" + "addpd %%xmm6, %%xmm10\n" + "subpd %%xmm6, %%xmm14\n" + "movapd %%xmm3, %%xmm11\n" + "movapd %%xmm3, %%xmm15\n" + "addpd %%xmm7, %%xmm11\n" + "subpd %%xmm7, %%xmm15\n" + "movupd %%xmm8, (%0)\n" + "movupd %%xmm9, (%1)\n" + "movupd %%xmm10, (%2)\n" + "movupd %%xmm11, (%3)\n" + "movupd %%xmm12, (%4)\n" + "movupd %%xmm13, (%5)\n" + "movupd %%xmm14, (%6)\n" + "movupd %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 16), "r"(buf + j + k + 32), "r"(buf + j + k + 48), "r"(buf + j + k + 64), "r"(buf + j + k + 80), "r"(buf + j + k + 96), "r"(buf + j + k + 112) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + for (int j = 0; j < 8192; j += 1024) { + for (int k = 0; k < 128; k += 2) { + __asm__ volatile ( + "movupd (%0), %%xmm0\n" + "movupd (%1), %%xmm1\n" + "movupd (%2), %%xmm2\n" + "movupd (%3), %%xmm3\n" + "movupd (%4), %%xmm4\n" + "movupd (%5), %%xmm5\n" + "movupd (%6), %%xmm6\n" + "movupd (%7), %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm9\n" + "addpd %%xmm1, %%xmm8\n" + "subpd %%xmm1, %%xmm9\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm11\n" + "addpd %%xmm3, %%xmm10\n" + "subpd %%xmm3, %%xmm11\n" + "movapd %%xmm4, %%xmm12\n" + "movapd %%xmm4, %%xmm13\n" + "addpd %%xmm5, %%xmm12\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm6, %%xmm14\n" + "movapd %%xmm6, %%xmm15\n" + "addpd %%xmm7, %%xmm14\n" + "subpd %%xmm7, %%xmm15\n" + "movapd %%xmm8, %%xmm0\n" + "movapd %%xmm8, %%xmm2\n" + "addpd %%xmm10, %%xmm0\n" + "subpd %%xmm10, %%xmm2\n" + "movapd %%xmm9, %%xmm1\n" + "movapd %%xmm9, %%xmm3\n" + "addpd %%xmm11, %%xmm1\n" + "subpd %%xmm11, %%xmm3\n" + "movapd %%xmm12, %%xmm4\n" + "movapd %%xmm12, %%xmm6\n" + "addpd %%xmm14, %%xmm4\n" + "subpd %%xmm14, %%xmm6\n" + "movapd %%xmm13, %%xmm5\n" + "movapd %%xmm13, %%xmm7\n" + "addpd %%xmm15, %%xmm5\n" + "subpd %%xmm15, %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm12\n" + "addpd %%xmm4, %%xmm8\n" + "subpd %%xmm4, %%xmm12\n" + "movapd %%xmm1, %%xmm9\n" + "movapd %%xmm1, %%xmm13\n" + "addpd %%xmm5, %%xmm9\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm14\n" + "addpd %%xmm6, %%xmm10\n" + "subpd %%xmm6, %%xmm14\n" + "movapd %%xmm3, %%xmm11\n" + "movapd %%xmm3, %%xmm15\n" + "addpd %%xmm7, %%xmm11\n" + "subpd %%xmm7, %%xmm15\n" + "movupd %%xmm8, (%0)\n" + "movupd %%xmm9, (%1)\n" + 
"movupd %%xmm10, (%2)\n" + "movupd %%xmm11, (%3)\n" + "movupd %%xmm12, (%4)\n" + "movupd %%xmm13, (%5)\n" + "movupd %%xmm14, (%6)\n" + "movupd %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 128), "r"(buf + j + k + 256), "r"(buf + j + k + 384), "r"(buf + j + k + 512), "r"(buf + j + k + 640), "r"(buf + j + k + 768), "r"(buf + j + k + 896) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + for (int j = 0; j < 8192; j += 8192) { + for (int k = 0; k < 1024; k += 2) { + __asm__ volatile ( + "movupd (%0), %%xmm0\n" + "movupd (%1), %%xmm1\n" + "movupd (%2), %%xmm2\n" + "movupd (%3), %%xmm3\n" + "movupd (%4), %%xmm4\n" + "movupd (%5), %%xmm5\n" + "movupd (%6), %%xmm6\n" + "movupd (%7), %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm9\n" + "addpd %%xmm1, %%xmm8\n" + "subpd %%xmm1, %%xmm9\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm11\n" + "addpd %%xmm3, %%xmm10\n" + "subpd %%xmm3, %%xmm11\n" + "movapd %%xmm4, %%xmm12\n" + "movapd %%xmm4, %%xmm13\n" + "addpd %%xmm5, %%xmm12\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm6, %%xmm14\n" + "movapd %%xmm6, %%xmm15\n" + "addpd %%xmm7, %%xmm14\n" + "subpd %%xmm7, %%xmm15\n" + "movapd %%xmm8, %%xmm0\n" + "movapd %%xmm8, %%xmm2\n" + "addpd %%xmm10, %%xmm0\n" + "subpd %%xmm10, %%xmm2\n" + "movapd %%xmm9, %%xmm1\n" + "movapd %%xmm9, %%xmm3\n" + "addpd %%xmm11, %%xmm1\n" + "subpd %%xmm11, %%xmm3\n" + "movapd %%xmm12, %%xmm4\n" + "movapd %%xmm12, %%xmm6\n" + "addpd %%xmm14, %%xmm4\n" + "subpd %%xmm14, %%xmm6\n" + "movapd %%xmm13, %%xmm5\n" + "movapd %%xmm13, %%xmm7\n" + "addpd %%xmm15, %%xmm5\n" + "subpd %%xmm15, %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm12\n" + "addpd %%xmm4, %%xmm8\n" + "subpd %%xmm4, %%xmm12\n" + "movapd %%xmm1, %%xmm9\n" + "movapd %%xmm1, %%xmm13\n" + "addpd %%xmm5, %%xmm9\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm14\n" + "addpd %%xmm6, %%xmm10\n" + "subpd %%xmm6, %%xmm14\n" + "movapd %%xmm3, %%xmm11\n" + "movapd %%xmm3, %%xmm15\n" + "addpd %%xmm7, %%xmm11\n" + "subpd %%xmm7, %%xmm15\n" + "movupd %%xmm8, (%0)\n" + "movupd %%xmm9, (%1)\n" + "movupd %%xmm10, (%2)\n" + "movupd %%xmm11, (%3)\n" + "movupd %%xmm12, (%4)\n" + "movupd %%xmm13, (%5)\n" + "movupd %%xmm14, (%6)\n" + "movupd %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 1024), "r"(buf + j + k + 2048), "r"(buf + j + k + 3072), "r"(buf + j + k + 4096), "r"(buf + j + k + 5120), "r"(buf + j + k + 6144), "r"(buf + j + k + 7168) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } + if (depth == 16) { + helper_double_21_recursive(buf + 0, 13); + helper_double_21_recursive(buf + 8192, 13); + helper_double_21_recursive(buf + 16384, 13); + helper_double_21_recursive(buf + 24576, 13); + helper_double_21_recursive(buf + 32768, 13); + helper_double_21_recursive(buf + 40960, 13); + helper_double_21_recursive(buf + 49152, 13); + helper_double_21_recursive(buf + 57344, 13); + for (int j = 0; j < 65536; j += 65536) { + for (int k = 0; k < 8192; k += 2) { + __asm__ volatile ( + "movupd (%0), %%xmm0\n" + "movupd (%1), %%xmm1\n" + "movupd (%2), %%xmm2\n" + "movupd (%3), %%xmm3\n" + "movupd (%4), %%xmm4\n" + "movupd (%5), %%xmm5\n" + "movupd (%6), %%xmm6\n" + "movupd (%7), %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm9\n" + 
"addpd %%xmm1, %%xmm8\n" + "subpd %%xmm1, %%xmm9\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm11\n" + "addpd %%xmm3, %%xmm10\n" + "subpd %%xmm3, %%xmm11\n" + "movapd %%xmm4, %%xmm12\n" + "movapd %%xmm4, %%xmm13\n" + "addpd %%xmm5, %%xmm12\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm6, %%xmm14\n" + "movapd %%xmm6, %%xmm15\n" + "addpd %%xmm7, %%xmm14\n" + "subpd %%xmm7, %%xmm15\n" + "movapd %%xmm8, %%xmm0\n" + "movapd %%xmm8, %%xmm2\n" + "addpd %%xmm10, %%xmm0\n" + "subpd %%xmm10, %%xmm2\n" + "movapd %%xmm9, %%xmm1\n" + "movapd %%xmm9, %%xmm3\n" + "addpd %%xmm11, %%xmm1\n" + "subpd %%xmm11, %%xmm3\n" + "movapd %%xmm12, %%xmm4\n" + "movapd %%xmm12, %%xmm6\n" + "addpd %%xmm14, %%xmm4\n" + "subpd %%xmm14, %%xmm6\n" + "movapd %%xmm13, %%xmm5\n" + "movapd %%xmm13, %%xmm7\n" + "addpd %%xmm15, %%xmm5\n" + "subpd %%xmm15, %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm12\n" + "addpd %%xmm4, %%xmm8\n" + "subpd %%xmm4, %%xmm12\n" + "movapd %%xmm1, %%xmm9\n" + "movapd %%xmm1, %%xmm13\n" + "addpd %%xmm5, %%xmm9\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm14\n" + "addpd %%xmm6, %%xmm10\n" + "subpd %%xmm6, %%xmm14\n" + "movapd %%xmm3, %%xmm11\n" + "movapd %%xmm3, %%xmm15\n" + "addpd %%xmm7, %%xmm11\n" + "subpd %%xmm7, %%xmm15\n" + "movupd %%xmm8, (%0)\n" + "movupd %%xmm9, (%1)\n" + "movupd %%xmm10, (%2)\n" + "movupd %%xmm11, (%3)\n" + "movupd %%xmm12, (%4)\n" + "movupd %%xmm13, (%5)\n" + "movupd %%xmm14, (%6)\n" + "movupd %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 8192), "r"(buf + j + k + 16384), "r"(buf + j + k + 24576), "r"(buf + j + k + 32768), "r"(buf + j + k + 40960), "r"(buf + j + k + 49152), "r"(buf + j + k + 57344) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } + if (depth == 19) { + helper_double_21_recursive(buf + 0, 16); + helper_double_21_recursive(buf + 65536, 16); + helper_double_21_recursive(buf + 131072, 16); + helper_double_21_recursive(buf + 196608, 16); + helper_double_21_recursive(buf + 262144, 16); + helper_double_21_recursive(buf + 327680, 16); + helper_double_21_recursive(buf + 393216, 16); + helper_double_21_recursive(buf + 458752, 16); + for (int j = 0; j < 524288; j += 524288) { + for (int k = 0; k < 65536; k += 2) { + __asm__ volatile ( + "movupd (%0), %%xmm0\n" + "movupd (%1), %%xmm1\n" + "movupd (%2), %%xmm2\n" + "movupd (%3), %%xmm3\n" + "movupd (%4), %%xmm4\n" + "movupd (%5), %%xmm5\n" + "movupd (%6), %%xmm6\n" + "movupd (%7), %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm9\n" + "addpd %%xmm1, %%xmm8\n" + "subpd %%xmm1, %%xmm9\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm11\n" + "addpd %%xmm3, %%xmm10\n" + "subpd %%xmm3, %%xmm11\n" + "movapd %%xmm4, %%xmm12\n" + "movapd %%xmm4, %%xmm13\n" + "addpd %%xmm5, %%xmm12\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm6, %%xmm14\n" + "movapd %%xmm6, %%xmm15\n" + "addpd %%xmm7, %%xmm14\n" + "subpd %%xmm7, %%xmm15\n" + "movapd %%xmm8, %%xmm0\n" + "movapd %%xmm8, %%xmm2\n" + "addpd %%xmm10, %%xmm0\n" + "subpd %%xmm10, %%xmm2\n" + "movapd %%xmm9, %%xmm1\n" + "movapd %%xmm9, %%xmm3\n" + "addpd %%xmm11, %%xmm1\n" + "subpd %%xmm11, %%xmm3\n" + "movapd %%xmm12, %%xmm4\n" + "movapd %%xmm12, %%xmm6\n" + "addpd %%xmm14, %%xmm4\n" + "subpd %%xmm14, %%xmm6\n" + "movapd %%xmm13, %%xmm5\n" + "movapd %%xmm13, %%xmm7\n" + "addpd %%xmm15, %%xmm5\n" + "subpd %%xmm15, %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + 
"movapd %%xmm0, %%xmm12\n" + "addpd %%xmm4, %%xmm8\n" + "subpd %%xmm4, %%xmm12\n" + "movapd %%xmm1, %%xmm9\n" + "movapd %%xmm1, %%xmm13\n" + "addpd %%xmm5, %%xmm9\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm14\n" + "addpd %%xmm6, %%xmm10\n" + "subpd %%xmm6, %%xmm14\n" + "movapd %%xmm3, %%xmm11\n" + "movapd %%xmm3, %%xmm15\n" + "addpd %%xmm7, %%xmm11\n" + "subpd %%xmm7, %%xmm15\n" + "movupd %%xmm8, (%0)\n" + "movupd %%xmm9, (%1)\n" + "movupd %%xmm10, (%2)\n" + "movupd %%xmm11, (%3)\n" + "movupd %%xmm12, (%4)\n" + "movupd %%xmm13, (%5)\n" + "movupd %%xmm14, (%6)\n" + "movupd %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 65536), "r"(buf + j + k + 131072), "r"(buf + j + k + 196608), "r"(buf + j + k + 262144), "r"(buf + j + k + 327680), "r"(buf + j + k + 393216), "r"(buf + j + k + 458752) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } + if (depth == 21) { + helper_double_21_recursive(buf + 0, 19); + helper_double_21_recursive(buf + 524288, 19); + helper_double_21_recursive(buf + 1048576, 19); + helper_double_21_recursive(buf + 1572864, 19); + for (int j = 0; j < 2097152; j += 2097152) { + for (int k = 0; k < 524288; k += 2) { + __asm__ volatile ( + "movupd (%0), %%xmm0\n" + "movupd (%1), %%xmm1\n" + "movupd (%2), %%xmm2\n" + "movupd (%3), %%xmm3\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm9\n" + "addpd %%xmm1, %%xmm8\n" + "subpd %%xmm1, %%xmm9\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm11\n" + "addpd %%xmm3, %%xmm10\n" + "subpd %%xmm3, %%xmm11\n" + "movapd %%xmm8, %%xmm0\n" + "movapd %%xmm8, %%xmm2\n" + "addpd %%xmm10, %%xmm0\n" + "subpd %%xmm10, %%xmm2\n" + "movapd %%xmm9, %%xmm1\n" + "movapd %%xmm9, %%xmm3\n" + "addpd %%xmm11, %%xmm1\n" + "subpd %%xmm11, %%xmm3\n" + "movupd %%xmm0, (%0)\n" + "movupd %%xmm1, (%1)\n" + "movupd %%xmm2, (%2)\n" + "movupd %%xmm3, (%3)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 524288), "r"(buf + j + k + 1048576), "r"(buf + j + k + 1572864) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } +} +void helper_double_21(double *buf); +void helper_double_21(double *buf) { + helper_double_21_recursive(buf, 21); +} +void helper_double_22_recursive(double *buf, int depth); +void helper_double_22_recursive(double *buf, int depth) { + if (depth == 2) { + for (int j = 0; j < 4; j += 4) { + for (int k = 0; k < 2; k += 2) { + __asm__ volatile ( + "movupd (%0), %%xmm0\n" + "movupd (%1), %%xmm1\n" + "movapd %%xmm0, %%xmm8\n" + "haddpd %%xmm8, %%xmm8\n" + "movapd %%xmm0, %%xmm9\n" + "hsubpd %%xmm9, %%xmm9\n" + "blendpd $1, %%xmm8, %%xmm9\n" + "movapd %%xmm9, %%xmm0\n" + "movapd %%xmm1, %%xmm8\n" + "haddpd %%xmm8, %%xmm8\n" + "movapd %%xmm1, %%xmm9\n" + "hsubpd %%xmm9, %%xmm9\n" + "blendpd $1, %%xmm8, %%xmm9\n" + "movapd %%xmm9, %%xmm1\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm9\n" + "addpd %%xmm1, %%xmm8\n" + "subpd %%xmm1, %%xmm9\n" + "movupd %%xmm8, (%0)\n" + "movupd %%xmm9, (%1)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 2) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } + if (depth == 5) { + helper_double_22_recursive(buf + 0, 2); + helper_double_22_recursive(buf + 4, 
2); + helper_double_22_recursive(buf + 8, 2); + helper_double_22_recursive(buf + 12, 2); + helper_double_22_recursive(buf + 16, 2); + helper_double_22_recursive(buf + 20, 2); + helper_double_22_recursive(buf + 24, 2); + helper_double_22_recursive(buf + 28, 2); + for (int j = 0; j < 32; j += 32) { + for (int k = 0; k < 4; k += 2) { + __asm__ volatile ( + "movupd (%0), %%xmm0\n" + "movupd (%1), %%xmm1\n" + "movupd (%2), %%xmm2\n" + "movupd (%3), %%xmm3\n" + "movupd (%4), %%xmm4\n" + "movupd (%5), %%xmm5\n" + "movupd (%6), %%xmm6\n" + "movupd (%7), %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm9\n" + "addpd %%xmm1, %%xmm8\n" + "subpd %%xmm1, %%xmm9\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm11\n" + "addpd %%xmm3, %%xmm10\n" + "subpd %%xmm3, %%xmm11\n" + "movapd %%xmm4, %%xmm12\n" + "movapd %%xmm4, %%xmm13\n" + "addpd %%xmm5, %%xmm12\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm6, %%xmm14\n" + "movapd %%xmm6, %%xmm15\n" + "addpd %%xmm7, %%xmm14\n" + "subpd %%xmm7, %%xmm15\n" + "movapd %%xmm8, %%xmm0\n" + "movapd %%xmm8, %%xmm2\n" + "addpd %%xmm10, %%xmm0\n" + "subpd %%xmm10, %%xmm2\n" + "movapd %%xmm9, %%xmm1\n" + "movapd %%xmm9, %%xmm3\n" + "addpd %%xmm11, %%xmm1\n" + "subpd %%xmm11, %%xmm3\n" + "movapd %%xmm12, %%xmm4\n" + "movapd %%xmm12, %%xmm6\n" + "addpd %%xmm14, %%xmm4\n" + "subpd %%xmm14, %%xmm6\n" + "movapd %%xmm13, %%xmm5\n" + "movapd %%xmm13, %%xmm7\n" + "addpd %%xmm15, %%xmm5\n" + "subpd %%xmm15, %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm12\n" + "addpd %%xmm4, %%xmm8\n" + "subpd %%xmm4, %%xmm12\n" + "movapd %%xmm1, %%xmm9\n" + "movapd %%xmm1, %%xmm13\n" + "addpd %%xmm5, %%xmm9\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm14\n" + "addpd %%xmm6, %%xmm10\n" + "subpd %%xmm6, %%xmm14\n" + "movapd %%xmm3, %%xmm11\n" + "movapd %%xmm3, %%xmm15\n" + "addpd %%xmm7, %%xmm11\n" + "subpd %%xmm7, %%xmm15\n" + "movupd %%xmm8, (%0)\n" + "movupd %%xmm9, (%1)\n" + "movupd %%xmm10, (%2)\n" + "movupd %%xmm11, (%3)\n" + "movupd %%xmm12, (%4)\n" + "movupd %%xmm13, (%5)\n" + "movupd %%xmm14, (%6)\n" + "movupd %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } + if (depth == 8) { + helper_double_22_recursive(buf + 0, 5); + helper_double_22_recursive(buf + 32, 5); + helper_double_22_recursive(buf + 64, 5); + helper_double_22_recursive(buf + 96, 5); + helper_double_22_recursive(buf + 128, 5); + helper_double_22_recursive(buf + 160, 5); + helper_double_22_recursive(buf + 192, 5); + helper_double_22_recursive(buf + 224, 5); + for (int j = 0; j < 256; j += 256) { + for (int k = 0; k < 32; k += 2) { + __asm__ volatile ( + "movupd (%0), %%xmm0\n" + "movupd (%1), %%xmm1\n" + "movupd (%2), %%xmm2\n" + "movupd (%3), %%xmm3\n" + "movupd (%4), %%xmm4\n" + "movupd (%5), %%xmm5\n" + "movupd (%6), %%xmm6\n" + "movupd (%7), %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm9\n" + "addpd %%xmm1, %%xmm8\n" + "subpd %%xmm1, %%xmm9\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm11\n" + "addpd %%xmm3, %%xmm10\n" + "subpd %%xmm3, %%xmm11\n" + "movapd %%xmm4, %%xmm12\n" + "movapd %%xmm4, %%xmm13\n" + "addpd %%xmm5, %%xmm12\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm6, %%xmm14\n" + "movapd 
%%xmm6, %%xmm15\n" + "addpd %%xmm7, %%xmm14\n" + "subpd %%xmm7, %%xmm15\n" + "movapd %%xmm8, %%xmm0\n" + "movapd %%xmm8, %%xmm2\n" + "addpd %%xmm10, %%xmm0\n" + "subpd %%xmm10, %%xmm2\n" + "movapd %%xmm9, %%xmm1\n" + "movapd %%xmm9, %%xmm3\n" + "addpd %%xmm11, %%xmm1\n" + "subpd %%xmm11, %%xmm3\n" + "movapd %%xmm12, %%xmm4\n" + "movapd %%xmm12, %%xmm6\n" + "addpd %%xmm14, %%xmm4\n" + "subpd %%xmm14, %%xmm6\n" + "movapd %%xmm13, %%xmm5\n" + "movapd %%xmm13, %%xmm7\n" + "addpd %%xmm15, %%xmm5\n" + "subpd %%xmm15, %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm12\n" + "addpd %%xmm4, %%xmm8\n" + "subpd %%xmm4, %%xmm12\n" + "movapd %%xmm1, %%xmm9\n" + "movapd %%xmm1, %%xmm13\n" + "addpd %%xmm5, %%xmm9\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm14\n" + "addpd %%xmm6, %%xmm10\n" + "subpd %%xmm6, %%xmm14\n" + "movapd %%xmm3, %%xmm11\n" + "movapd %%xmm3, %%xmm15\n" + "addpd %%xmm7, %%xmm11\n" + "subpd %%xmm7, %%xmm15\n" + "movupd %%xmm8, (%0)\n" + "movupd %%xmm9, (%1)\n" + "movupd %%xmm10, (%2)\n" + "movupd %%xmm11, (%3)\n" + "movupd %%xmm12, (%4)\n" + "movupd %%xmm13, (%5)\n" + "movupd %%xmm14, (%6)\n" + "movupd %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96), "r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } + if (depth == 11) { + helper_double_22_recursive(buf + 0, 8); + helper_double_22_recursive(buf + 256, 8); + helper_double_22_recursive(buf + 512, 8); + helper_double_22_recursive(buf + 768, 8); + helper_double_22_recursive(buf + 1024, 8); + helper_double_22_recursive(buf + 1280, 8); + helper_double_22_recursive(buf + 1536, 8); + helper_double_22_recursive(buf + 1792, 8); + for (int j = 0; j < 2048; j += 2048) { + for (int k = 0; k < 256; k += 2) { + __asm__ volatile ( + "movupd (%0), %%xmm0\n" + "movupd (%1), %%xmm1\n" + "movupd (%2), %%xmm2\n" + "movupd (%3), %%xmm3\n" + "movupd (%4), %%xmm4\n" + "movupd (%5), %%xmm5\n" + "movupd (%6), %%xmm6\n" + "movupd (%7), %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm9\n" + "addpd %%xmm1, %%xmm8\n" + "subpd %%xmm1, %%xmm9\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm11\n" + "addpd %%xmm3, %%xmm10\n" + "subpd %%xmm3, %%xmm11\n" + "movapd %%xmm4, %%xmm12\n" + "movapd %%xmm4, %%xmm13\n" + "addpd %%xmm5, %%xmm12\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm6, %%xmm14\n" + "movapd %%xmm6, %%xmm15\n" + "addpd %%xmm7, %%xmm14\n" + "subpd %%xmm7, %%xmm15\n" + "movapd %%xmm8, %%xmm0\n" + "movapd %%xmm8, %%xmm2\n" + "addpd %%xmm10, %%xmm0\n" + "subpd %%xmm10, %%xmm2\n" + "movapd %%xmm9, %%xmm1\n" + "movapd %%xmm9, %%xmm3\n" + "addpd %%xmm11, %%xmm1\n" + "subpd %%xmm11, %%xmm3\n" + "movapd %%xmm12, %%xmm4\n" + "movapd %%xmm12, %%xmm6\n" + "addpd %%xmm14, %%xmm4\n" + "subpd %%xmm14, %%xmm6\n" + "movapd %%xmm13, %%xmm5\n" + "movapd %%xmm13, %%xmm7\n" + "addpd %%xmm15, %%xmm5\n" + "subpd %%xmm15, %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm12\n" + "addpd %%xmm4, %%xmm8\n" + "subpd %%xmm4, %%xmm12\n" + "movapd %%xmm1, %%xmm9\n" + "movapd %%xmm1, %%xmm13\n" + "addpd %%xmm5, %%xmm9\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm14\n" + "addpd %%xmm6, %%xmm10\n" + "subpd %%xmm6, %%xmm14\n" + "movapd %%xmm3, %%xmm11\n" + "movapd %%xmm3, %%xmm15\n" 
+ "addpd %%xmm7, %%xmm11\n" + "subpd %%xmm7, %%xmm15\n" + "movupd %%xmm8, (%0)\n" + "movupd %%xmm9, (%1)\n" + "movupd %%xmm10, (%2)\n" + "movupd %%xmm11, (%3)\n" + "movupd %%xmm12, (%4)\n" + "movupd %%xmm13, (%5)\n" + "movupd %%xmm14, (%6)\n" + "movupd %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 256), "r"(buf + j + k + 512), "r"(buf + j + k + 768), "r"(buf + j + k + 1024), "r"(buf + j + k + 1280), "r"(buf + j + k + 1536), "r"(buf + j + k + 1792) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } + if (depth == 14) { + helper_double_22_recursive(buf + 0, 11); + helper_double_22_recursive(buf + 2048, 11); + helper_double_22_recursive(buf + 4096, 11); + helper_double_22_recursive(buf + 6144, 11); + helper_double_22_recursive(buf + 8192, 11); + helper_double_22_recursive(buf + 10240, 11); + helper_double_22_recursive(buf + 12288, 11); + helper_double_22_recursive(buf + 14336, 11); + for (int j = 0; j < 16384; j += 16384) { + for (int k = 0; k < 2048; k += 2) { + __asm__ volatile ( + "movupd (%0), %%xmm0\n" + "movupd (%1), %%xmm1\n" + "movupd (%2), %%xmm2\n" + "movupd (%3), %%xmm3\n" + "movupd (%4), %%xmm4\n" + "movupd (%5), %%xmm5\n" + "movupd (%6), %%xmm6\n" + "movupd (%7), %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm9\n" + "addpd %%xmm1, %%xmm8\n" + "subpd %%xmm1, %%xmm9\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm11\n" + "addpd %%xmm3, %%xmm10\n" + "subpd %%xmm3, %%xmm11\n" + "movapd %%xmm4, %%xmm12\n" + "movapd %%xmm4, %%xmm13\n" + "addpd %%xmm5, %%xmm12\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm6, %%xmm14\n" + "movapd %%xmm6, %%xmm15\n" + "addpd %%xmm7, %%xmm14\n" + "subpd %%xmm7, %%xmm15\n" + "movapd %%xmm8, %%xmm0\n" + "movapd %%xmm8, %%xmm2\n" + "addpd %%xmm10, %%xmm0\n" + "subpd %%xmm10, %%xmm2\n" + "movapd %%xmm9, %%xmm1\n" + "movapd %%xmm9, %%xmm3\n" + "addpd %%xmm11, %%xmm1\n" + "subpd %%xmm11, %%xmm3\n" + "movapd %%xmm12, %%xmm4\n" + "movapd %%xmm12, %%xmm6\n" + "addpd %%xmm14, %%xmm4\n" + "subpd %%xmm14, %%xmm6\n" + "movapd %%xmm13, %%xmm5\n" + "movapd %%xmm13, %%xmm7\n" + "addpd %%xmm15, %%xmm5\n" + "subpd %%xmm15, %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm12\n" + "addpd %%xmm4, %%xmm8\n" + "subpd %%xmm4, %%xmm12\n" + "movapd %%xmm1, %%xmm9\n" + "movapd %%xmm1, %%xmm13\n" + "addpd %%xmm5, %%xmm9\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm14\n" + "addpd %%xmm6, %%xmm10\n" + "subpd %%xmm6, %%xmm14\n" + "movapd %%xmm3, %%xmm11\n" + "movapd %%xmm3, %%xmm15\n" + "addpd %%xmm7, %%xmm11\n" + "subpd %%xmm7, %%xmm15\n" + "movupd %%xmm8, (%0)\n" + "movupd %%xmm9, (%1)\n" + "movupd %%xmm10, (%2)\n" + "movupd %%xmm11, (%3)\n" + "movupd %%xmm12, (%4)\n" + "movupd %%xmm13, (%5)\n" + "movupd %%xmm14, (%6)\n" + "movupd %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 2048), "r"(buf + j + k + 4096), "r"(buf + j + k + 6144), "r"(buf + j + k + 8192), "r"(buf + j + k + 10240), "r"(buf + j + k + 12288), "r"(buf + j + k + 14336) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } + if (depth == 17) { + helper_double_22_recursive(buf + 0, 14); + helper_double_22_recursive(buf + 16384, 14); + helper_double_22_recursive(buf + 32768, 14); + helper_double_22_recursive(buf + 49152, 14); + helper_double_22_recursive(buf 
+ 65536, 14); + helper_double_22_recursive(buf + 81920, 14); + helper_double_22_recursive(buf + 98304, 14); + helper_double_22_recursive(buf + 114688, 14); + for (int j = 0; j < 131072; j += 131072) { + for (int k = 0; k < 16384; k += 2) { + __asm__ volatile ( + "movupd (%0), %%xmm0\n" + "movupd (%1), %%xmm1\n" + "movupd (%2), %%xmm2\n" + "movupd (%3), %%xmm3\n" + "movupd (%4), %%xmm4\n" + "movupd (%5), %%xmm5\n" + "movupd (%6), %%xmm6\n" + "movupd (%7), %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm9\n" + "addpd %%xmm1, %%xmm8\n" + "subpd %%xmm1, %%xmm9\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm11\n" + "addpd %%xmm3, %%xmm10\n" + "subpd %%xmm3, %%xmm11\n" + "movapd %%xmm4, %%xmm12\n" + "movapd %%xmm4, %%xmm13\n" + "addpd %%xmm5, %%xmm12\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm6, %%xmm14\n" + "movapd %%xmm6, %%xmm15\n" + "addpd %%xmm7, %%xmm14\n" + "subpd %%xmm7, %%xmm15\n" + "movapd %%xmm8, %%xmm0\n" + "movapd %%xmm8, %%xmm2\n" + "addpd %%xmm10, %%xmm0\n" + "subpd %%xmm10, %%xmm2\n" + "movapd %%xmm9, %%xmm1\n" + "movapd %%xmm9, %%xmm3\n" + "addpd %%xmm11, %%xmm1\n" + "subpd %%xmm11, %%xmm3\n" + "movapd %%xmm12, %%xmm4\n" + "movapd %%xmm12, %%xmm6\n" + "addpd %%xmm14, %%xmm4\n" + "subpd %%xmm14, %%xmm6\n" + "movapd %%xmm13, %%xmm5\n" + "movapd %%xmm13, %%xmm7\n" + "addpd %%xmm15, %%xmm5\n" + "subpd %%xmm15, %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm12\n" + "addpd %%xmm4, %%xmm8\n" + "subpd %%xmm4, %%xmm12\n" + "movapd %%xmm1, %%xmm9\n" + "movapd %%xmm1, %%xmm13\n" + "addpd %%xmm5, %%xmm9\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm14\n" + "addpd %%xmm6, %%xmm10\n" + "subpd %%xmm6, %%xmm14\n" + "movapd %%xmm3, %%xmm11\n" + "movapd %%xmm3, %%xmm15\n" + "addpd %%xmm7, %%xmm11\n" + "subpd %%xmm7, %%xmm15\n" + "movupd %%xmm8, (%0)\n" + "movupd %%xmm9, (%1)\n" + "movupd %%xmm10, (%2)\n" + "movupd %%xmm11, (%3)\n" + "movupd %%xmm12, (%4)\n" + "movupd %%xmm13, (%5)\n" + "movupd %%xmm14, (%6)\n" + "movupd %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 16384), "r"(buf + j + k + 32768), "r"(buf + j + k + 49152), "r"(buf + j + k + 65536), "r"(buf + j + k + 81920), "r"(buf + j + k + 98304), "r"(buf + j + k + 114688) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } + if (depth == 20) { + helper_double_22_recursive(buf + 0, 17); + helper_double_22_recursive(buf + 131072, 17); + helper_double_22_recursive(buf + 262144, 17); + helper_double_22_recursive(buf + 393216, 17); + helper_double_22_recursive(buf + 524288, 17); + helper_double_22_recursive(buf + 655360, 17); + helper_double_22_recursive(buf + 786432, 17); + helper_double_22_recursive(buf + 917504, 17); + for (int j = 0; j < 1048576; j += 1048576) { + for (int k = 0; k < 131072; k += 2) { + __asm__ volatile ( + "movupd (%0), %%xmm0\n" + "movupd (%1), %%xmm1\n" + "movupd (%2), %%xmm2\n" + "movupd (%3), %%xmm3\n" + "movupd (%4), %%xmm4\n" + "movupd (%5), %%xmm5\n" + "movupd (%6), %%xmm6\n" + "movupd (%7), %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm9\n" + "addpd %%xmm1, %%xmm8\n" + "subpd %%xmm1, %%xmm9\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm11\n" + "addpd %%xmm3, %%xmm10\n" + "subpd %%xmm3, %%xmm11\n" + "movapd %%xmm4, %%xmm12\n" + "movapd %%xmm4, %%xmm13\n" + "addpd %%xmm5, %%xmm12\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm6, %%xmm14\n" + "movapd %%xmm6, %%xmm15\n" + 
"addpd %%xmm7, %%xmm14\n" + "subpd %%xmm7, %%xmm15\n" + "movapd %%xmm8, %%xmm0\n" + "movapd %%xmm8, %%xmm2\n" + "addpd %%xmm10, %%xmm0\n" + "subpd %%xmm10, %%xmm2\n" + "movapd %%xmm9, %%xmm1\n" + "movapd %%xmm9, %%xmm3\n" + "addpd %%xmm11, %%xmm1\n" + "subpd %%xmm11, %%xmm3\n" + "movapd %%xmm12, %%xmm4\n" + "movapd %%xmm12, %%xmm6\n" + "addpd %%xmm14, %%xmm4\n" + "subpd %%xmm14, %%xmm6\n" + "movapd %%xmm13, %%xmm5\n" + "movapd %%xmm13, %%xmm7\n" + "addpd %%xmm15, %%xmm5\n" + "subpd %%xmm15, %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm12\n" + "addpd %%xmm4, %%xmm8\n" + "subpd %%xmm4, %%xmm12\n" + "movapd %%xmm1, %%xmm9\n" + "movapd %%xmm1, %%xmm13\n" + "addpd %%xmm5, %%xmm9\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm14\n" + "addpd %%xmm6, %%xmm10\n" + "subpd %%xmm6, %%xmm14\n" + "movapd %%xmm3, %%xmm11\n" + "movapd %%xmm3, %%xmm15\n" + "addpd %%xmm7, %%xmm11\n" + "subpd %%xmm7, %%xmm15\n" + "movupd %%xmm8, (%0)\n" + "movupd %%xmm9, (%1)\n" + "movupd %%xmm10, (%2)\n" + "movupd %%xmm11, (%3)\n" + "movupd %%xmm12, (%4)\n" + "movupd %%xmm13, (%5)\n" + "movupd %%xmm14, (%6)\n" + "movupd %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 131072), "r"(buf + j + k + 262144), "r"(buf + j + k + 393216), "r"(buf + j + k + 524288), "r"(buf + j + k + 655360), "r"(buf + j + k + 786432), "r"(buf + j + k + 917504) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } + if (depth == 22) { + helper_double_22_recursive(buf + 0, 20); + helper_double_22_recursive(buf + 1048576, 20); + helper_double_22_recursive(buf + 2097152, 20); + helper_double_22_recursive(buf + 3145728, 20); + for (int j = 0; j < 4194304; j += 4194304) { + for (int k = 0; k < 1048576; k += 2) { + __asm__ volatile ( + "movupd (%0), %%xmm0\n" + "movupd (%1), %%xmm1\n" + "movupd (%2), %%xmm2\n" + "movupd (%3), %%xmm3\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm9\n" + "addpd %%xmm1, %%xmm8\n" + "subpd %%xmm1, %%xmm9\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm11\n" + "addpd %%xmm3, %%xmm10\n" + "subpd %%xmm3, %%xmm11\n" + "movapd %%xmm8, %%xmm0\n" + "movapd %%xmm8, %%xmm2\n" + "addpd %%xmm10, %%xmm0\n" + "subpd %%xmm10, %%xmm2\n" + "movapd %%xmm9, %%xmm1\n" + "movapd %%xmm9, %%xmm3\n" + "addpd %%xmm11, %%xmm1\n" + "subpd %%xmm11, %%xmm3\n" + "movupd %%xmm0, (%0)\n" + "movupd %%xmm1, (%1)\n" + "movupd %%xmm2, (%2)\n" + "movupd %%xmm3, (%3)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 1048576), "r"(buf + j + k + 2097152), "r"(buf + j + k + 3145728) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } +} +void helper_double_22(double *buf); +void helper_double_22(double *buf) { + helper_double_22_recursive(buf, 22); +} +void helper_double_23_recursive(double *buf, int depth); +void helper_double_23_recursive(double *buf, int depth) { + if (depth == 2) { + for (int j = 0; j < 4; j += 4) { + for (int k = 0; k < 2; k += 2) { + __asm__ volatile ( + "movupd (%0), %%xmm0\n" + "movupd (%1), %%xmm1\n" + "movapd %%xmm0, %%xmm8\n" + "haddpd %%xmm8, %%xmm8\n" + "movapd %%xmm0, %%xmm9\n" + "hsubpd %%xmm9, %%xmm9\n" + "blendpd $1, %%xmm8, %%xmm9\n" + "movapd %%xmm9, %%xmm0\n" + "movapd %%xmm1, %%xmm8\n" + "haddpd %%xmm8, %%xmm8\n" + "movapd %%xmm1, %%xmm9\n" + "hsubpd %%xmm9, %%xmm9\n" + "blendpd $1, 
%%xmm8, %%xmm9\n" + "movapd %%xmm9, %%xmm1\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm9\n" + "addpd %%xmm1, %%xmm8\n" + "subpd %%xmm1, %%xmm9\n" + "movupd %%xmm8, (%0)\n" + "movupd %%xmm9, (%1)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 2) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } + if (depth == 5) { + helper_double_23_recursive(buf + 0, 2); + helper_double_23_recursive(buf + 4, 2); + helper_double_23_recursive(buf + 8, 2); + helper_double_23_recursive(buf + 12, 2); + helper_double_23_recursive(buf + 16, 2); + helper_double_23_recursive(buf + 20, 2); + helper_double_23_recursive(buf + 24, 2); + helper_double_23_recursive(buf + 28, 2); + for (int j = 0; j < 32; j += 32) { + for (int k = 0; k < 4; k += 2) { + __asm__ volatile ( + "movupd (%0), %%xmm0\n" + "movupd (%1), %%xmm1\n" + "movupd (%2), %%xmm2\n" + "movupd (%3), %%xmm3\n" + "movupd (%4), %%xmm4\n" + "movupd (%5), %%xmm5\n" + "movupd (%6), %%xmm6\n" + "movupd (%7), %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm9\n" + "addpd %%xmm1, %%xmm8\n" + "subpd %%xmm1, %%xmm9\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm11\n" + "addpd %%xmm3, %%xmm10\n" + "subpd %%xmm3, %%xmm11\n" + "movapd %%xmm4, %%xmm12\n" + "movapd %%xmm4, %%xmm13\n" + "addpd %%xmm5, %%xmm12\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm6, %%xmm14\n" + "movapd %%xmm6, %%xmm15\n" + "addpd %%xmm7, %%xmm14\n" + "subpd %%xmm7, %%xmm15\n" + "movapd %%xmm8, %%xmm0\n" + "movapd %%xmm8, %%xmm2\n" + "addpd %%xmm10, %%xmm0\n" + "subpd %%xmm10, %%xmm2\n" + "movapd %%xmm9, %%xmm1\n" + "movapd %%xmm9, %%xmm3\n" + "addpd %%xmm11, %%xmm1\n" + "subpd %%xmm11, %%xmm3\n" + "movapd %%xmm12, %%xmm4\n" + "movapd %%xmm12, %%xmm6\n" + "addpd %%xmm14, %%xmm4\n" + "subpd %%xmm14, %%xmm6\n" + "movapd %%xmm13, %%xmm5\n" + "movapd %%xmm13, %%xmm7\n" + "addpd %%xmm15, %%xmm5\n" + "subpd %%xmm15, %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm12\n" + "addpd %%xmm4, %%xmm8\n" + "subpd %%xmm4, %%xmm12\n" + "movapd %%xmm1, %%xmm9\n" + "movapd %%xmm1, %%xmm13\n" + "addpd %%xmm5, %%xmm9\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm14\n" + "addpd %%xmm6, %%xmm10\n" + "subpd %%xmm6, %%xmm14\n" + "movapd %%xmm3, %%xmm11\n" + "movapd %%xmm3, %%xmm15\n" + "addpd %%xmm7, %%xmm11\n" + "subpd %%xmm7, %%xmm15\n" + "movupd %%xmm8, (%0)\n" + "movupd %%xmm9, (%1)\n" + "movupd %%xmm10, (%2)\n" + "movupd %%xmm11, (%3)\n" + "movupd %%xmm12, (%4)\n" + "movupd %%xmm13, (%5)\n" + "movupd %%xmm14, (%6)\n" + "movupd %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } + if (depth == 8) { + helper_double_23_recursive(buf + 0, 5); + helper_double_23_recursive(buf + 32, 5); + helper_double_23_recursive(buf + 64, 5); + helper_double_23_recursive(buf + 96, 5); + helper_double_23_recursive(buf + 128, 5); + helper_double_23_recursive(buf + 160, 5); + helper_double_23_recursive(buf + 192, 5); + helper_double_23_recursive(buf + 224, 5); + for (int j = 0; j < 256; j += 256) { + for (int k = 0; k < 32; k += 2) { + __asm__ volatile ( + "movupd (%0), %%xmm0\n" + "movupd (%1), 
%%xmm1\n" + "movupd (%2), %%xmm2\n" + "movupd (%3), %%xmm3\n" + "movupd (%4), %%xmm4\n" + "movupd (%5), %%xmm5\n" + "movupd (%6), %%xmm6\n" + "movupd (%7), %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm9\n" + "addpd %%xmm1, %%xmm8\n" + "subpd %%xmm1, %%xmm9\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm11\n" + "addpd %%xmm3, %%xmm10\n" + "subpd %%xmm3, %%xmm11\n" + "movapd %%xmm4, %%xmm12\n" + "movapd %%xmm4, %%xmm13\n" + "addpd %%xmm5, %%xmm12\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm6, %%xmm14\n" + "movapd %%xmm6, %%xmm15\n" + "addpd %%xmm7, %%xmm14\n" + "subpd %%xmm7, %%xmm15\n" + "movapd %%xmm8, %%xmm0\n" + "movapd %%xmm8, %%xmm2\n" + "addpd %%xmm10, %%xmm0\n" + "subpd %%xmm10, %%xmm2\n" + "movapd %%xmm9, %%xmm1\n" + "movapd %%xmm9, %%xmm3\n" + "addpd %%xmm11, %%xmm1\n" + "subpd %%xmm11, %%xmm3\n" + "movapd %%xmm12, %%xmm4\n" + "movapd %%xmm12, %%xmm6\n" + "addpd %%xmm14, %%xmm4\n" + "subpd %%xmm14, %%xmm6\n" + "movapd %%xmm13, %%xmm5\n" + "movapd %%xmm13, %%xmm7\n" + "addpd %%xmm15, %%xmm5\n" + "subpd %%xmm15, %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm12\n" + "addpd %%xmm4, %%xmm8\n" + "subpd %%xmm4, %%xmm12\n" + "movapd %%xmm1, %%xmm9\n" + "movapd %%xmm1, %%xmm13\n" + "addpd %%xmm5, %%xmm9\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm14\n" + "addpd %%xmm6, %%xmm10\n" + "subpd %%xmm6, %%xmm14\n" + "movapd %%xmm3, %%xmm11\n" + "movapd %%xmm3, %%xmm15\n" + "addpd %%xmm7, %%xmm11\n" + "subpd %%xmm7, %%xmm15\n" + "movupd %%xmm8, (%0)\n" + "movupd %%xmm9, (%1)\n" + "movupd %%xmm10, (%2)\n" + "movupd %%xmm11, (%3)\n" + "movupd %%xmm12, (%4)\n" + "movupd %%xmm13, (%5)\n" + "movupd %%xmm14, (%6)\n" + "movupd %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96), "r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } + if (depth == 11) { + helper_double_23_recursive(buf + 0, 8); + helper_double_23_recursive(buf + 256, 8); + helper_double_23_recursive(buf + 512, 8); + helper_double_23_recursive(buf + 768, 8); + helper_double_23_recursive(buf + 1024, 8); + helper_double_23_recursive(buf + 1280, 8); + helper_double_23_recursive(buf + 1536, 8); + helper_double_23_recursive(buf + 1792, 8); + for (int j = 0; j < 2048; j += 2048) { + for (int k = 0; k < 256; k += 2) { + __asm__ volatile ( + "movupd (%0), %%xmm0\n" + "movupd (%1), %%xmm1\n" + "movupd (%2), %%xmm2\n" + "movupd (%3), %%xmm3\n" + "movupd (%4), %%xmm4\n" + "movupd (%5), %%xmm5\n" + "movupd (%6), %%xmm6\n" + "movupd (%7), %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm9\n" + "addpd %%xmm1, %%xmm8\n" + "subpd %%xmm1, %%xmm9\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm11\n" + "addpd %%xmm3, %%xmm10\n" + "subpd %%xmm3, %%xmm11\n" + "movapd %%xmm4, %%xmm12\n" + "movapd %%xmm4, %%xmm13\n" + "addpd %%xmm5, %%xmm12\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm6, %%xmm14\n" + "movapd %%xmm6, %%xmm15\n" + "addpd %%xmm7, %%xmm14\n" + "subpd %%xmm7, %%xmm15\n" + "movapd %%xmm8, %%xmm0\n" + "movapd %%xmm8, %%xmm2\n" + "addpd %%xmm10, %%xmm0\n" + "subpd %%xmm10, %%xmm2\n" + "movapd %%xmm9, %%xmm1\n" + "movapd %%xmm9, %%xmm3\n" + "addpd %%xmm11, %%xmm1\n" + "subpd %%xmm11, %%xmm3\n" + "movapd %%xmm12, %%xmm4\n" + "movapd %%xmm12, %%xmm6\n" + "addpd %%xmm14, 
%%xmm4\n" + "subpd %%xmm14, %%xmm6\n" + "movapd %%xmm13, %%xmm5\n" + "movapd %%xmm13, %%xmm7\n" + "addpd %%xmm15, %%xmm5\n" + "subpd %%xmm15, %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm12\n" + "addpd %%xmm4, %%xmm8\n" + "subpd %%xmm4, %%xmm12\n" + "movapd %%xmm1, %%xmm9\n" + "movapd %%xmm1, %%xmm13\n" + "addpd %%xmm5, %%xmm9\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm14\n" + "addpd %%xmm6, %%xmm10\n" + "subpd %%xmm6, %%xmm14\n" + "movapd %%xmm3, %%xmm11\n" + "movapd %%xmm3, %%xmm15\n" + "addpd %%xmm7, %%xmm11\n" + "subpd %%xmm7, %%xmm15\n" + "movupd %%xmm8, (%0)\n" + "movupd %%xmm9, (%1)\n" + "movupd %%xmm10, (%2)\n" + "movupd %%xmm11, (%3)\n" + "movupd %%xmm12, (%4)\n" + "movupd %%xmm13, (%5)\n" + "movupd %%xmm14, (%6)\n" + "movupd %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 256), "r"(buf + j + k + 512), "r"(buf + j + k + 768), "r"(buf + j + k + 1024), "r"(buf + j + k + 1280), "r"(buf + j + k + 1536), "r"(buf + j + k + 1792) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } + if (depth == 14) { + helper_double_23_recursive(buf + 0, 11); + helper_double_23_recursive(buf + 2048, 11); + helper_double_23_recursive(buf + 4096, 11); + helper_double_23_recursive(buf + 6144, 11); + helper_double_23_recursive(buf + 8192, 11); + helper_double_23_recursive(buf + 10240, 11); + helper_double_23_recursive(buf + 12288, 11); + helper_double_23_recursive(buf + 14336, 11); + for (int j = 0; j < 16384; j += 16384) { + for (int k = 0; k < 2048; k += 2) { + __asm__ volatile ( + "movupd (%0), %%xmm0\n" + "movupd (%1), %%xmm1\n" + "movupd (%2), %%xmm2\n" + "movupd (%3), %%xmm3\n" + "movupd (%4), %%xmm4\n" + "movupd (%5), %%xmm5\n" + "movupd (%6), %%xmm6\n" + "movupd (%7), %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm9\n" + "addpd %%xmm1, %%xmm8\n" + "subpd %%xmm1, %%xmm9\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm11\n" + "addpd %%xmm3, %%xmm10\n" + "subpd %%xmm3, %%xmm11\n" + "movapd %%xmm4, %%xmm12\n" + "movapd %%xmm4, %%xmm13\n" + "addpd %%xmm5, %%xmm12\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm6, %%xmm14\n" + "movapd %%xmm6, %%xmm15\n" + "addpd %%xmm7, %%xmm14\n" + "subpd %%xmm7, %%xmm15\n" + "movapd %%xmm8, %%xmm0\n" + "movapd %%xmm8, %%xmm2\n" + "addpd %%xmm10, %%xmm0\n" + "subpd %%xmm10, %%xmm2\n" + "movapd %%xmm9, %%xmm1\n" + "movapd %%xmm9, %%xmm3\n" + "addpd %%xmm11, %%xmm1\n" + "subpd %%xmm11, %%xmm3\n" + "movapd %%xmm12, %%xmm4\n" + "movapd %%xmm12, %%xmm6\n" + "addpd %%xmm14, %%xmm4\n" + "subpd %%xmm14, %%xmm6\n" + "movapd %%xmm13, %%xmm5\n" + "movapd %%xmm13, %%xmm7\n" + "addpd %%xmm15, %%xmm5\n" + "subpd %%xmm15, %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm12\n" + "addpd %%xmm4, %%xmm8\n" + "subpd %%xmm4, %%xmm12\n" + "movapd %%xmm1, %%xmm9\n" + "movapd %%xmm1, %%xmm13\n" + "addpd %%xmm5, %%xmm9\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm14\n" + "addpd %%xmm6, %%xmm10\n" + "subpd %%xmm6, %%xmm14\n" + "movapd %%xmm3, %%xmm11\n" + "movapd %%xmm3, %%xmm15\n" + "addpd %%xmm7, %%xmm11\n" + "subpd %%xmm7, %%xmm15\n" + "movupd %%xmm8, (%0)\n" + "movupd %%xmm9, (%1)\n" + "movupd %%xmm10, (%2)\n" + "movupd %%xmm11, (%3)\n" + "movupd %%xmm12, (%4)\n" + "movupd %%xmm13, (%5)\n" + "movupd %%xmm14, (%6)\n" + "movupd %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 2048), "r"(buf + j + k + 4096), 
"r"(buf + j + k + 6144), "r"(buf + j + k + 8192), "r"(buf + j + k + 10240), "r"(buf + j + k + 12288), "r"(buf + j + k + 14336) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } + if (depth == 17) { + helper_double_23_recursive(buf + 0, 14); + helper_double_23_recursive(buf + 16384, 14); + helper_double_23_recursive(buf + 32768, 14); + helper_double_23_recursive(buf + 49152, 14); + helper_double_23_recursive(buf + 65536, 14); + helper_double_23_recursive(buf + 81920, 14); + helper_double_23_recursive(buf + 98304, 14); + helper_double_23_recursive(buf + 114688, 14); + for (int j = 0; j < 131072; j += 131072) { + for (int k = 0; k < 16384; k += 2) { + __asm__ volatile ( + "movupd (%0), %%xmm0\n" + "movupd (%1), %%xmm1\n" + "movupd (%2), %%xmm2\n" + "movupd (%3), %%xmm3\n" + "movupd (%4), %%xmm4\n" + "movupd (%5), %%xmm5\n" + "movupd (%6), %%xmm6\n" + "movupd (%7), %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm9\n" + "addpd %%xmm1, %%xmm8\n" + "subpd %%xmm1, %%xmm9\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm11\n" + "addpd %%xmm3, %%xmm10\n" + "subpd %%xmm3, %%xmm11\n" + "movapd %%xmm4, %%xmm12\n" + "movapd %%xmm4, %%xmm13\n" + "addpd %%xmm5, %%xmm12\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm6, %%xmm14\n" + "movapd %%xmm6, %%xmm15\n" + "addpd %%xmm7, %%xmm14\n" + "subpd %%xmm7, %%xmm15\n" + "movapd %%xmm8, %%xmm0\n" + "movapd %%xmm8, %%xmm2\n" + "addpd %%xmm10, %%xmm0\n" + "subpd %%xmm10, %%xmm2\n" + "movapd %%xmm9, %%xmm1\n" + "movapd %%xmm9, %%xmm3\n" + "addpd %%xmm11, %%xmm1\n" + "subpd %%xmm11, %%xmm3\n" + "movapd %%xmm12, %%xmm4\n" + "movapd %%xmm12, %%xmm6\n" + "addpd %%xmm14, %%xmm4\n" + "subpd %%xmm14, %%xmm6\n" + "movapd %%xmm13, %%xmm5\n" + "movapd %%xmm13, %%xmm7\n" + "addpd %%xmm15, %%xmm5\n" + "subpd %%xmm15, %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm12\n" + "addpd %%xmm4, %%xmm8\n" + "subpd %%xmm4, %%xmm12\n" + "movapd %%xmm1, %%xmm9\n" + "movapd %%xmm1, %%xmm13\n" + "addpd %%xmm5, %%xmm9\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm14\n" + "addpd %%xmm6, %%xmm10\n" + "subpd %%xmm6, %%xmm14\n" + "movapd %%xmm3, %%xmm11\n" + "movapd %%xmm3, %%xmm15\n" + "addpd %%xmm7, %%xmm11\n" + "subpd %%xmm7, %%xmm15\n" + "movupd %%xmm8, (%0)\n" + "movupd %%xmm9, (%1)\n" + "movupd %%xmm10, (%2)\n" + "movupd %%xmm11, (%3)\n" + "movupd %%xmm12, (%4)\n" + "movupd %%xmm13, (%5)\n" + "movupd %%xmm14, (%6)\n" + "movupd %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 16384), "r"(buf + j + k + 32768), "r"(buf + j + k + 49152), "r"(buf + j + k + 65536), "r"(buf + j + k + 81920), "r"(buf + j + k + 98304), "r"(buf + j + k + 114688) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } + if (depth == 20) { + helper_double_23_recursive(buf + 0, 17); + helper_double_23_recursive(buf + 131072, 17); + helper_double_23_recursive(buf + 262144, 17); + helper_double_23_recursive(buf + 393216, 17); + helper_double_23_recursive(buf + 524288, 17); + helper_double_23_recursive(buf + 655360, 17); + helper_double_23_recursive(buf + 786432, 17); + helper_double_23_recursive(buf + 917504, 17); + for (int j = 0; j < 1048576; j += 1048576) { + for (int k = 0; k < 131072; k += 2) { + __asm__ volatile ( + "movupd (%0), %%xmm0\n" + "movupd (%1), %%xmm1\n" + 
"movupd (%2), %%xmm2\n" + "movupd (%3), %%xmm3\n" + "movupd (%4), %%xmm4\n" + "movupd (%5), %%xmm5\n" + "movupd (%6), %%xmm6\n" + "movupd (%7), %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm9\n" + "addpd %%xmm1, %%xmm8\n" + "subpd %%xmm1, %%xmm9\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm11\n" + "addpd %%xmm3, %%xmm10\n" + "subpd %%xmm3, %%xmm11\n" + "movapd %%xmm4, %%xmm12\n" + "movapd %%xmm4, %%xmm13\n" + "addpd %%xmm5, %%xmm12\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm6, %%xmm14\n" + "movapd %%xmm6, %%xmm15\n" + "addpd %%xmm7, %%xmm14\n" + "subpd %%xmm7, %%xmm15\n" + "movapd %%xmm8, %%xmm0\n" + "movapd %%xmm8, %%xmm2\n" + "addpd %%xmm10, %%xmm0\n" + "subpd %%xmm10, %%xmm2\n" + "movapd %%xmm9, %%xmm1\n" + "movapd %%xmm9, %%xmm3\n" + "addpd %%xmm11, %%xmm1\n" + "subpd %%xmm11, %%xmm3\n" + "movapd %%xmm12, %%xmm4\n" + "movapd %%xmm12, %%xmm6\n" + "addpd %%xmm14, %%xmm4\n" + "subpd %%xmm14, %%xmm6\n" + "movapd %%xmm13, %%xmm5\n" + "movapd %%xmm13, %%xmm7\n" + "addpd %%xmm15, %%xmm5\n" + "subpd %%xmm15, %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm12\n" + "addpd %%xmm4, %%xmm8\n" + "subpd %%xmm4, %%xmm12\n" + "movapd %%xmm1, %%xmm9\n" + "movapd %%xmm1, %%xmm13\n" + "addpd %%xmm5, %%xmm9\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm14\n" + "addpd %%xmm6, %%xmm10\n" + "subpd %%xmm6, %%xmm14\n" + "movapd %%xmm3, %%xmm11\n" + "movapd %%xmm3, %%xmm15\n" + "addpd %%xmm7, %%xmm11\n" + "subpd %%xmm7, %%xmm15\n" + "movupd %%xmm8, (%0)\n" + "movupd %%xmm9, (%1)\n" + "movupd %%xmm10, (%2)\n" + "movupd %%xmm11, (%3)\n" + "movupd %%xmm12, (%4)\n" + "movupd %%xmm13, (%5)\n" + "movupd %%xmm14, (%6)\n" + "movupd %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 131072), "r"(buf + j + k + 262144), "r"(buf + j + k + 393216), "r"(buf + j + k + 524288), "r"(buf + j + k + 655360), "r"(buf + j + k + 786432), "r"(buf + j + k + 917504) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } + if (depth == 23) { + helper_double_23_recursive(buf + 0, 20); + helper_double_23_recursive(buf + 1048576, 20); + helper_double_23_recursive(buf + 2097152, 20); + helper_double_23_recursive(buf + 3145728, 20); + helper_double_23_recursive(buf + 4194304, 20); + helper_double_23_recursive(buf + 5242880, 20); + helper_double_23_recursive(buf + 6291456, 20); + helper_double_23_recursive(buf + 7340032, 20); + for (int j = 0; j < 8388608; j += 8388608) { + for (int k = 0; k < 1048576; k += 2) { + __asm__ volatile ( + "movupd (%0), %%xmm0\n" + "movupd (%1), %%xmm1\n" + "movupd (%2), %%xmm2\n" + "movupd (%3), %%xmm3\n" + "movupd (%4), %%xmm4\n" + "movupd (%5), %%xmm5\n" + "movupd (%6), %%xmm6\n" + "movupd (%7), %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm9\n" + "addpd %%xmm1, %%xmm8\n" + "subpd %%xmm1, %%xmm9\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm11\n" + "addpd %%xmm3, %%xmm10\n" + "subpd %%xmm3, %%xmm11\n" + "movapd %%xmm4, %%xmm12\n" + "movapd %%xmm4, %%xmm13\n" + "addpd %%xmm5, %%xmm12\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm6, %%xmm14\n" + "movapd %%xmm6, %%xmm15\n" + "addpd %%xmm7, %%xmm14\n" + "subpd %%xmm7, %%xmm15\n" + "movapd %%xmm8, %%xmm0\n" + "movapd %%xmm8, %%xmm2\n" + "addpd %%xmm10, %%xmm0\n" + "subpd %%xmm10, %%xmm2\n" + "movapd %%xmm9, %%xmm1\n" + "movapd %%xmm9, %%xmm3\n" + "addpd %%xmm11, %%xmm1\n" + "subpd %%xmm11, %%xmm3\n" + "movapd %%xmm12, 
%%xmm4\n" + "movapd %%xmm12, %%xmm6\n" + "addpd %%xmm14, %%xmm4\n" + "subpd %%xmm14, %%xmm6\n" + "movapd %%xmm13, %%xmm5\n" + "movapd %%xmm13, %%xmm7\n" + "addpd %%xmm15, %%xmm5\n" + "subpd %%xmm15, %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm12\n" + "addpd %%xmm4, %%xmm8\n" + "subpd %%xmm4, %%xmm12\n" + "movapd %%xmm1, %%xmm9\n" + "movapd %%xmm1, %%xmm13\n" + "addpd %%xmm5, %%xmm9\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm14\n" + "addpd %%xmm6, %%xmm10\n" + "subpd %%xmm6, %%xmm14\n" + "movapd %%xmm3, %%xmm11\n" + "movapd %%xmm3, %%xmm15\n" + "addpd %%xmm7, %%xmm11\n" + "subpd %%xmm7, %%xmm15\n" + "movupd %%xmm8, (%0)\n" + "movupd %%xmm9, (%1)\n" + "movupd %%xmm10, (%2)\n" + "movupd %%xmm11, (%3)\n" + "movupd %%xmm12, (%4)\n" + "movupd %%xmm13, (%5)\n" + "movupd %%xmm14, (%6)\n" + "movupd %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 1048576), "r"(buf + j + k + 2097152), "r"(buf + j + k + 3145728), "r"(buf + j + k + 4194304), "r"(buf + j + k + 5242880), "r"(buf + j + k + 6291456), "r"(buf + j + k + 7340032) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } +} +void helper_double_23(double *buf); +void helper_double_23(double *buf) { + helper_double_23_recursive(buf, 23); +} +void helper_double_24_recursive(double *buf, int depth); +void helper_double_24_recursive(double *buf, int depth) { + if (depth == 13) { + for (int j = 0; j < 8192; j += 16) { + for (int k = 0; k < 2; k += 2) { + __asm__ volatile ( + "movupd (%0), %%xmm0\n" + "movupd (%1), %%xmm1\n" + "movupd (%2), %%xmm2\n" + "movupd (%3), %%xmm3\n" + "movupd (%4), %%xmm4\n" + "movupd (%5), %%xmm5\n" + "movupd (%6), %%xmm6\n" + "movupd (%7), %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "haddpd %%xmm8, %%xmm8\n" + "movapd %%xmm0, %%xmm9\n" + "hsubpd %%xmm9, %%xmm9\n" + "blendpd $1, %%xmm8, %%xmm9\n" + "movapd %%xmm9, %%xmm0\n" + "movapd %%xmm1, %%xmm8\n" + "haddpd %%xmm8, %%xmm8\n" + "movapd %%xmm1, %%xmm9\n" + "hsubpd %%xmm9, %%xmm9\n" + "blendpd $1, %%xmm8, %%xmm9\n" + "movapd %%xmm9, %%xmm1\n" + "movapd %%xmm2, %%xmm8\n" + "haddpd %%xmm8, %%xmm8\n" + "movapd %%xmm2, %%xmm9\n" + "hsubpd %%xmm9, %%xmm9\n" + "blendpd $1, %%xmm8, %%xmm9\n" + "movapd %%xmm9, %%xmm2\n" + "movapd %%xmm3, %%xmm8\n" + "haddpd %%xmm8, %%xmm8\n" + "movapd %%xmm3, %%xmm9\n" + "hsubpd %%xmm9, %%xmm9\n" + "blendpd $1, %%xmm8, %%xmm9\n" + "movapd %%xmm9, %%xmm3\n" + "movapd %%xmm4, %%xmm8\n" + "haddpd %%xmm8, %%xmm8\n" + "movapd %%xmm4, %%xmm9\n" + "hsubpd %%xmm9, %%xmm9\n" + "blendpd $1, %%xmm8, %%xmm9\n" + "movapd %%xmm9, %%xmm4\n" + "movapd %%xmm5, %%xmm8\n" + "haddpd %%xmm8, %%xmm8\n" + "movapd %%xmm5, %%xmm9\n" + "hsubpd %%xmm9, %%xmm9\n" + "blendpd $1, %%xmm8, %%xmm9\n" + "movapd %%xmm9, %%xmm5\n" + "movapd %%xmm6, %%xmm8\n" + "haddpd %%xmm8, %%xmm8\n" + "movapd %%xmm6, %%xmm9\n" + "hsubpd %%xmm9, %%xmm9\n" + "blendpd $1, %%xmm8, %%xmm9\n" + "movapd %%xmm9, %%xmm6\n" + "movapd %%xmm7, %%xmm8\n" + "haddpd %%xmm8, %%xmm8\n" + "movapd %%xmm7, %%xmm9\n" + "hsubpd %%xmm9, %%xmm9\n" + "blendpd $1, %%xmm8, %%xmm9\n" + "movapd %%xmm9, %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm9\n" + "addpd %%xmm1, %%xmm8\n" + "subpd %%xmm1, %%xmm9\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm11\n" + "addpd %%xmm3, %%xmm10\n" + "subpd %%xmm3, %%xmm11\n" + "movapd %%xmm4, %%xmm12\n" + "movapd %%xmm4, %%xmm13\n" + "addpd %%xmm5, %%xmm12\n" + "subpd 
%%xmm5, %%xmm13\n" + "movapd %%xmm6, %%xmm14\n" + "movapd %%xmm6, %%xmm15\n" + "addpd %%xmm7, %%xmm14\n" + "subpd %%xmm7, %%xmm15\n" + "movapd %%xmm8, %%xmm0\n" + "movapd %%xmm8, %%xmm2\n" + "addpd %%xmm10, %%xmm0\n" + "subpd %%xmm10, %%xmm2\n" + "movapd %%xmm9, %%xmm1\n" + "movapd %%xmm9, %%xmm3\n" + "addpd %%xmm11, %%xmm1\n" + "subpd %%xmm11, %%xmm3\n" + "movapd %%xmm12, %%xmm4\n" + "movapd %%xmm12, %%xmm6\n" + "addpd %%xmm14, %%xmm4\n" + "subpd %%xmm14, %%xmm6\n" + "movapd %%xmm13, %%xmm5\n" + "movapd %%xmm13, %%xmm7\n" + "addpd %%xmm15, %%xmm5\n" + "subpd %%xmm15, %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm12\n" + "addpd %%xmm4, %%xmm8\n" + "subpd %%xmm4, %%xmm12\n" + "movapd %%xmm1, %%xmm9\n" + "movapd %%xmm1, %%xmm13\n" + "addpd %%xmm5, %%xmm9\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm14\n" + "addpd %%xmm6, %%xmm10\n" + "subpd %%xmm6, %%xmm14\n" + "movapd %%xmm3, %%xmm11\n" + "movapd %%xmm3, %%xmm15\n" + "addpd %%xmm7, %%xmm11\n" + "subpd %%xmm7, %%xmm15\n" + "movupd %%xmm8, (%0)\n" + "movupd %%xmm9, (%1)\n" + "movupd %%xmm10, (%2)\n" + "movupd %%xmm11, (%3)\n" + "movupd %%xmm12, (%4)\n" + "movupd %%xmm13, (%5)\n" + "movupd %%xmm14, (%6)\n" + "movupd %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 2), "r"(buf + j + k + 4), "r"(buf + j + k + 6), "r"(buf + j + k + 8), "r"(buf + j + k + 10), "r"(buf + j + k + 12), "r"(buf + j + k + 14) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + for (int j = 0; j < 8192; j += 128) { + for (int k = 0; k < 16; k += 2) { + __asm__ volatile ( + "movupd (%0), %%xmm0\n" + "movupd (%1), %%xmm1\n" + "movupd (%2), %%xmm2\n" + "movupd (%3), %%xmm3\n" + "movupd (%4), %%xmm4\n" + "movupd (%5), %%xmm5\n" + "movupd (%6), %%xmm6\n" + "movupd (%7), %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm9\n" + "addpd %%xmm1, %%xmm8\n" + "subpd %%xmm1, %%xmm9\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm11\n" + "addpd %%xmm3, %%xmm10\n" + "subpd %%xmm3, %%xmm11\n" + "movapd %%xmm4, %%xmm12\n" + "movapd %%xmm4, %%xmm13\n" + "addpd %%xmm5, %%xmm12\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm6, %%xmm14\n" + "movapd %%xmm6, %%xmm15\n" + "addpd %%xmm7, %%xmm14\n" + "subpd %%xmm7, %%xmm15\n" + "movapd %%xmm8, %%xmm0\n" + "movapd %%xmm8, %%xmm2\n" + "addpd %%xmm10, %%xmm0\n" + "subpd %%xmm10, %%xmm2\n" + "movapd %%xmm9, %%xmm1\n" + "movapd %%xmm9, %%xmm3\n" + "addpd %%xmm11, %%xmm1\n" + "subpd %%xmm11, %%xmm3\n" + "movapd %%xmm12, %%xmm4\n" + "movapd %%xmm12, %%xmm6\n" + "addpd %%xmm14, %%xmm4\n" + "subpd %%xmm14, %%xmm6\n" + "movapd %%xmm13, %%xmm5\n" + "movapd %%xmm13, %%xmm7\n" + "addpd %%xmm15, %%xmm5\n" + "subpd %%xmm15, %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm12\n" + "addpd %%xmm4, %%xmm8\n" + "subpd %%xmm4, %%xmm12\n" + "movapd %%xmm1, %%xmm9\n" + "movapd %%xmm1, %%xmm13\n" + "addpd %%xmm5, %%xmm9\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm14\n" + "addpd %%xmm6, %%xmm10\n" + "subpd %%xmm6, %%xmm14\n" + "movapd %%xmm3, %%xmm11\n" + "movapd %%xmm3, %%xmm15\n" + "addpd %%xmm7, %%xmm11\n" + "subpd %%xmm7, %%xmm15\n" + "movupd %%xmm8, (%0)\n" + "movupd %%xmm9, (%1)\n" + "movupd %%xmm10, (%2)\n" + "movupd %%xmm11, (%3)\n" + "movupd %%xmm12, (%4)\n" + "movupd %%xmm13, (%5)\n" + "movupd %%xmm14, (%6)\n" + "movupd %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 16), "r"(buf + j + k + 
32), "r"(buf + j + k + 48), "r"(buf + j + k + 64), "r"(buf + j + k + 80), "r"(buf + j + k + 96), "r"(buf + j + k + 112) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + for (int j = 0; j < 8192; j += 1024) { + for (int k = 0; k < 128; k += 2) { + __asm__ volatile ( + "movupd (%0), %%xmm0\n" + "movupd (%1), %%xmm1\n" + "movupd (%2), %%xmm2\n" + "movupd (%3), %%xmm3\n" + "movupd (%4), %%xmm4\n" + "movupd (%5), %%xmm5\n" + "movupd (%6), %%xmm6\n" + "movupd (%7), %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm9\n" + "addpd %%xmm1, %%xmm8\n" + "subpd %%xmm1, %%xmm9\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm11\n" + "addpd %%xmm3, %%xmm10\n" + "subpd %%xmm3, %%xmm11\n" + "movapd %%xmm4, %%xmm12\n" + "movapd %%xmm4, %%xmm13\n" + "addpd %%xmm5, %%xmm12\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm6, %%xmm14\n" + "movapd %%xmm6, %%xmm15\n" + "addpd %%xmm7, %%xmm14\n" + "subpd %%xmm7, %%xmm15\n" + "movapd %%xmm8, %%xmm0\n" + "movapd %%xmm8, %%xmm2\n" + "addpd %%xmm10, %%xmm0\n" + "subpd %%xmm10, %%xmm2\n" + "movapd %%xmm9, %%xmm1\n" + "movapd %%xmm9, %%xmm3\n" + "addpd %%xmm11, %%xmm1\n" + "subpd %%xmm11, %%xmm3\n" + "movapd %%xmm12, %%xmm4\n" + "movapd %%xmm12, %%xmm6\n" + "addpd %%xmm14, %%xmm4\n" + "subpd %%xmm14, %%xmm6\n" + "movapd %%xmm13, %%xmm5\n" + "movapd %%xmm13, %%xmm7\n" + "addpd %%xmm15, %%xmm5\n" + "subpd %%xmm15, %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm12\n" + "addpd %%xmm4, %%xmm8\n" + "subpd %%xmm4, %%xmm12\n" + "movapd %%xmm1, %%xmm9\n" + "movapd %%xmm1, %%xmm13\n" + "addpd %%xmm5, %%xmm9\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm14\n" + "addpd %%xmm6, %%xmm10\n" + "subpd %%xmm6, %%xmm14\n" + "movapd %%xmm3, %%xmm11\n" + "movapd %%xmm3, %%xmm15\n" + "addpd %%xmm7, %%xmm11\n" + "subpd %%xmm7, %%xmm15\n" + "movupd %%xmm8, (%0)\n" + "movupd %%xmm9, (%1)\n" + "movupd %%xmm10, (%2)\n" + "movupd %%xmm11, (%3)\n" + "movupd %%xmm12, (%4)\n" + "movupd %%xmm13, (%5)\n" + "movupd %%xmm14, (%6)\n" + "movupd %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 128), "r"(buf + j + k + 256), "r"(buf + j + k + 384), "r"(buf + j + k + 512), "r"(buf + j + k + 640), "r"(buf + j + k + 768), "r"(buf + j + k + 896) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + for (int j = 0; j < 8192; j += 8192) { + for (int k = 0; k < 1024; k += 2) { + __asm__ volatile ( + "movupd (%0), %%xmm0\n" + "movupd (%1), %%xmm1\n" + "movupd (%2), %%xmm2\n" + "movupd (%3), %%xmm3\n" + "movupd (%4), %%xmm4\n" + "movupd (%5), %%xmm5\n" + "movupd (%6), %%xmm6\n" + "movupd (%7), %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm9\n" + "addpd %%xmm1, %%xmm8\n" + "subpd %%xmm1, %%xmm9\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm11\n" + "addpd %%xmm3, %%xmm10\n" + "subpd %%xmm3, %%xmm11\n" + "movapd %%xmm4, %%xmm12\n" + "movapd %%xmm4, %%xmm13\n" + "addpd %%xmm5, %%xmm12\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm6, %%xmm14\n" + "movapd %%xmm6, %%xmm15\n" + "addpd %%xmm7, %%xmm14\n" + "subpd %%xmm7, %%xmm15\n" + "movapd %%xmm8, %%xmm0\n" + "movapd %%xmm8, %%xmm2\n" + "addpd %%xmm10, %%xmm0\n" + "subpd %%xmm10, %%xmm2\n" + "movapd %%xmm9, %%xmm1\n" + "movapd %%xmm9, %%xmm3\n" + "addpd %%xmm11, %%xmm1\n" + "subpd %%xmm11, %%xmm3\n" + "movapd %%xmm12, %%xmm4\n" + 
"movapd %%xmm12, %%xmm6\n" + "addpd %%xmm14, %%xmm4\n" + "subpd %%xmm14, %%xmm6\n" + "movapd %%xmm13, %%xmm5\n" + "movapd %%xmm13, %%xmm7\n" + "addpd %%xmm15, %%xmm5\n" + "subpd %%xmm15, %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm12\n" + "addpd %%xmm4, %%xmm8\n" + "subpd %%xmm4, %%xmm12\n" + "movapd %%xmm1, %%xmm9\n" + "movapd %%xmm1, %%xmm13\n" + "addpd %%xmm5, %%xmm9\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm14\n" + "addpd %%xmm6, %%xmm10\n" + "subpd %%xmm6, %%xmm14\n" + "movapd %%xmm3, %%xmm11\n" + "movapd %%xmm3, %%xmm15\n" + "addpd %%xmm7, %%xmm11\n" + "subpd %%xmm7, %%xmm15\n" + "movupd %%xmm8, (%0)\n" + "movupd %%xmm9, (%1)\n" + "movupd %%xmm10, (%2)\n" + "movupd %%xmm11, (%3)\n" + "movupd %%xmm12, (%4)\n" + "movupd %%xmm13, (%5)\n" + "movupd %%xmm14, (%6)\n" + "movupd %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 1024), "r"(buf + j + k + 2048), "r"(buf + j + k + 3072), "r"(buf + j + k + 4096), "r"(buf + j + k + 5120), "r"(buf + j + k + 6144), "r"(buf + j + k + 7168) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } + if (depth == 16) { + helper_double_24_recursive(buf + 0, 13); + helper_double_24_recursive(buf + 8192, 13); + helper_double_24_recursive(buf + 16384, 13); + helper_double_24_recursive(buf + 24576, 13); + helper_double_24_recursive(buf + 32768, 13); + helper_double_24_recursive(buf + 40960, 13); + helper_double_24_recursive(buf + 49152, 13); + helper_double_24_recursive(buf + 57344, 13); + for (int j = 0; j < 65536; j += 65536) { + for (int k = 0; k < 8192; k += 2) { + __asm__ volatile ( + "movupd (%0), %%xmm0\n" + "movupd (%1), %%xmm1\n" + "movupd (%2), %%xmm2\n" + "movupd (%3), %%xmm3\n" + "movupd (%4), %%xmm4\n" + "movupd (%5), %%xmm5\n" + "movupd (%6), %%xmm6\n" + "movupd (%7), %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm9\n" + "addpd %%xmm1, %%xmm8\n" + "subpd %%xmm1, %%xmm9\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm11\n" + "addpd %%xmm3, %%xmm10\n" + "subpd %%xmm3, %%xmm11\n" + "movapd %%xmm4, %%xmm12\n" + "movapd %%xmm4, %%xmm13\n" + "addpd %%xmm5, %%xmm12\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm6, %%xmm14\n" + "movapd %%xmm6, %%xmm15\n" + "addpd %%xmm7, %%xmm14\n" + "subpd %%xmm7, %%xmm15\n" + "movapd %%xmm8, %%xmm0\n" + "movapd %%xmm8, %%xmm2\n" + "addpd %%xmm10, %%xmm0\n" + "subpd %%xmm10, %%xmm2\n" + "movapd %%xmm9, %%xmm1\n" + "movapd %%xmm9, %%xmm3\n" + "addpd %%xmm11, %%xmm1\n" + "subpd %%xmm11, %%xmm3\n" + "movapd %%xmm12, %%xmm4\n" + "movapd %%xmm12, %%xmm6\n" + "addpd %%xmm14, %%xmm4\n" + "subpd %%xmm14, %%xmm6\n" + "movapd %%xmm13, %%xmm5\n" + "movapd %%xmm13, %%xmm7\n" + "addpd %%xmm15, %%xmm5\n" + "subpd %%xmm15, %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm12\n" + "addpd %%xmm4, %%xmm8\n" + "subpd %%xmm4, %%xmm12\n" + "movapd %%xmm1, %%xmm9\n" + "movapd %%xmm1, %%xmm13\n" + "addpd %%xmm5, %%xmm9\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm14\n" + "addpd %%xmm6, %%xmm10\n" + "subpd %%xmm6, %%xmm14\n" + "movapd %%xmm3, %%xmm11\n" + "movapd %%xmm3, %%xmm15\n" + "addpd %%xmm7, %%xmm11\n" + "subpd %%xmm7, %%xmm15\n" + "movupd %%xmm8, (%0)\n" + "movupd %%xmm9, (%1)\n" + "movupd %%xmm10, (%2)\n" + "movupd %%xmm11, (%3)\n" + "movupd %%xmm12, (%4)\n" + "movupd %%xmm13, (%5)\n" + "movupd %%xmm14, (%6)\n" + "movupd %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), 
"r"(buf + j + k + 8192), "r"(buf + j + k + 16384), "r"(buf + j + k + 24576), "r"(buf + j + k + 32768), "r"(buf + j + k + 40960), "r"(buf + j + k + 49152), "r"(buf + j + k + 57344) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } + if (depth == 19) { + helper_double_24_recursive(buf + 0, 16); + helper_double_24_recursive(buf + 65536, 16); + helper_double_24_recursive(buf + 131072, 16); + helper_double_24_recursive(buf + 196608, 16); + helper_double_24_recursive(buf + 262144, 16); + helper_double_24_recursive(buf + 327680, 16); + helper_double_24_recursive(buf + 393216, 16); + helper_double_24_recursive(buf + 458752, 16); + for (int j = 0; j < 524288; j += 524288) { + for (int k = 0; k < 65536; k += 2) { + __asm__ volatile ( + "movupd (%0), %%xmm0\n" + "movupd (%1), %%xmm1\n" + "movupd (%2), %%xmm2\n" + "movupd (%3), %%xmm3\n" + "movupd (%4), %%xmm4\n" + "movupd (%5), %%xmm5\n" + "movupd (%6), %%xmm6\n" + "movupd (%7), %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm9\n" + "addpd %%xmm1, %%xmm8\n" + "subpd %%xmm1, %%xmm9\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm11\n" + "addpd %%xmm3, %%xmm10\n" + "subpd %%xmm3, %%xmm11\n" + "movapd %%xmm4, %%xmm12\n" + "movapd %%xmm4, %%xmm13\n" + "addpd %%xmm5, %%xmm12\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm6, %%xmm14\n" + "movapd %%xmm6, %%xmm15\n" + "addpd %%xmm7, %%xmm14\n" + "subpd %%xmm7, %%xmm15\n" + "movapd %%xmm8, %%xmm0\n" + "movapd %%xmm8, %%xmm2\n" + "addpd %%xmm10, %%xmm0\n" + "subpd %%xmm10, %%xmm2\n" + "movapd %%xmm9, %%xmm1\n" + "movapd %%xmm9, %%xmm3\n" + "addpd %%xmm11, %%xmm1\n" + "subpd %%xmm11, %%xmm3\n" + "movapd %%xmm12, %%xmm4\n" + "movapd %%xmm12, %%xmm6\n" + "addpd %%xmm14, %%xmm4\n" + "subpd %%xmm14, %%xmm6\n" + "movapd %%xmm13, %%xmm5\n" + "movapd %%xmm13, %%xmm7\n" + "addpd %%xmm15, %%xmm5\n" + "subpd %%xmm15, %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm12\n" + "addpd %%xmm4, %%xmm8\n" + "subpd %%xmm4, %%xmm12\n" + "movapd %%xmm1, %%xmm9\n" + "movapd %%xmm1, %%xmm13\n" + "addpd %%xmm5, %%xmm9\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm14\n" + "addpd %%xmm6, %%xmm10\n" + "subpd %%xmm6, %%xmm14\n" + "movapd %%xmm3, %%xmm11\n" + "movapd %%xmm3, %%xmm15\n" + "addpd %%xmm7, %%xmm11\n" + "subpd %%xmm7, %%xmm15\n" + "movupd %%xmm8, (%0)\n" + "movupd %%xmm9, (%1)\n" + "movupd %%xmm10, (%2)\n" + "movupd %%xmm11, (%3)\n" + "movupd %%xmm12, (%4)\n" + "movupd %%xmm13, (%5)\n" + "movupd %%xmm14, (%6)\n" + "movupd %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 65536), "r"(buf + j + k + 131072), "r"(buf + j + k + 196608), "r"(buf + j + k + 262144), "r"(buf + j + k + 327680), "r"(buf + j + k + 393216), "r"(buf + j + k + 458752) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } + if (depth == 22) { + helper_double_24_recursive(buf + 0, 19); + helper_double_24_recursive(buf + 524288, 19); + helper_double_24_recursive(buf + 1048576, 19); + helper_double_24_recursive(buf + 1572864, 19); + helper_double_24_recursive(buf + 2097152, 19); + helper_double_24_recursive(buf + 2621440, 19); + helper_double_24_recursive(buf + 3145728, 19); + helper_double_24_recursive(buf + 3670016, 19); + for (int j = 0; j < 4194304; j += 4194304) { + for (int k = 0; k < 524288; k += 2) { + __asm__ 
volatile ( + "movupd (%0), %%xmm0\n" + "movupd (%1), %%xmm1\n" + "movupd (%2), %%xmm2\n" + "movupd (%3), %%xmm3\n" + "movupd (%4), %%xmm4\n" + "movupd (%5), %%xmm5\n" + "movupd (%6), %%xmm6\n" + "movupd (%7), %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm9\n" + "addpd %%xmm1, %%xmm8\n" + "subpd %%xmm1, %%xmm9\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm11\n" + "addpd %%xmm3, %%xmm10\n" + "subpd %%xmm3, %%xmm11\n" + "movapd %%xmm4, %%xmm12\n" + "movapd %%xmm4, %%xmm13\n" + "addpd %%xmm5, %%xmm12\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm6, %%xmm14\n" + "movapd %%xmm6, %%xmm15\n" + "addpd %%xmm7, %%xmm14\n" + "subpd %%xmm7, %%xmm15\n" + "movapd %%xmm8, %%xmm0\n" + "movapd %%xmm8, %%xmm2\n" + "addpd %%xmm10, %%xmm0\n" + "subpd %%xmm10, %%xmm2\n" + "movapd %%xmm9, %%xmm1\n" + "movapd %%xmm9, %%xmm3\n" + "addpd %%xmm11, %%xmm1\n" + "subpd %%xmm11, %%xmm3\n" + "movapd %%xmm12, %%xmm4\n" + "movapd %%xmm12, %%xmm6\n" + "addpd %%xmm14, %%xmm4\n" + "subpd %%xmm14, %%xmm6\n" + "movapd %%xmm13, %%xmm5\n" + "movapd %%xmm13, %%xmm7\n" + "addpd %%xmm15, %%xmm5\n" + "subpd %%xmm15, %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm12\n" + "addpd %%xmm4, %%xmm8\n" + "subpd %%xmm4, %%xmm12\n" + "movapd %%xmm1, %%xmm9\n" + "movapd %%xmm1, %%xmm13\n" + "addpd %%xmm5, %%xmm9\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm14\n" + "addpd %%xmm6, %%xmm10\n" + "subpd %%xmm6, %%xmm14\n" + "movapd %%xmm3, %%xmm11\n" + "movapd %%xmm3, %%xmm15\n" + "addpd %%xmm7, %%xmm11\n" + "subpd %%xmm7, %%xmm15\n" + "movupd %%xmm8, (%0)\n" + "movupd %%xmm9, (%1)\n" + "movupd %%xmm10, (%2)\n" + "movupd %%xmm11, (%3)\n" + "movupd %%xmm12, (%4)\n" + "movupd %%xmm13, (%5)\n" + "movupd %%xmm14, (%6)\n" + "movupd %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 524288), "r"(buf + j + k + 1048576), "r"(buf + j + k + 1572864), "r"(buf + j + k + 2097152), "r"(buf + j + k + 2621440), "r"(buf + j + k + 3145728), "r"(buf + j + k + 3670016) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } + if (depth == 24) { + helper_double_24_recursive(buf + 0, 22); + helper_double_24_recursive(buf + 4194304, 22); + helper_double_24_recursive(buf + 8388608, 22); + helper_double_24_recursive(buf + 12582912, 22); + for (int j = 0; j < 16777216; j += 16777216) { + for (int k = 0; k < 4194304; k += 2) { + __asm__ volatile ( + "movupd (%0), %%xmm0\n" + "movupd (%1), %%xmm1\n" + "movupd (%2), %%xmm2\n" + "movupd (%3), %%xmm3\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm9\n" + "addpd %%xmm1, %%xmm8\n" + "subpd %%xmm1, %%xmm9\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm11\n" + "addpd %%xmm3, %%xmm10\n" + "subpd %%xmm3, %%xmm11\n" + "movapd %%xmm8, %%xmm0\n" + "movapd %%xmm8, %%xmm2\n" + "addpd %%xmm10, %%xmm0\n" + "subpd %%xmm10, %%xmm2\n" + "movapd %%xmm9, %%xmm1\n" + "movapd %%xmm9, %%xmm3\n" + "addpd %%xmm11, %%xmm1\n" + "subpd %%xmm11, %%xmm3\n" + "movupd %%xmm0, (%0)\n" + "movupd %%xmm1, (%1)\n" + "movupd %%xmm2, (%2)\n" + "movupd %%xmm3, (%3)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 4194304), "r"(buf + j + k + 8388608), "r"(buf + j + k + 12582912) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } +} +void helper_double_24(double *buf); +void helper_double_24(double *buf) { 
+ helper_double_24_recursive(buf, 24); +} +void helper_double_25_recursive(double *buf, int depth); +void helper_double_25_recursive(double *buf, int depth) { + if (depth == 10) { + for (int j = 0; j < 1024; j += 16) { + for (int k = 0; k < 2; k += 2) { + __asm__ volatile ( + "movupd (%0), %%xmm0\n" + "movupd (%1), %%xmm1\n" + "movupd (%2), %%xmm2\n" + "movupd (%3), %%xmm3\n" + "movupd (%4), %%xmm4\n" + "movupd (%5), %%xmm5\n" + "movupd (%6), %%xmm6\n" + "movupd (%7), %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "haddpd %%xmm8, %%xmm8\n" + "movapd %%xmm0, %%xmm9\n" + "hsubpd %%xmm9, %%xmm9\n" + "blendpd $1, %%xmm8, %%xmm9\n" + "movapd %%xmm9, %%xmm0\n" + "movapd %%xmm1, %%xmm8\n" + "haddpd %%xmm8, %%xmm8\n" + "movapd %%xmm1, %%xmm9\n" + "hsubpd %%xmm9, %%xmm9\n" + "blendpd $1, %%xmm8, %%xmm9\n" + "movapd %%xmm9, %%xmm1\n" + "movapd %%xmm2, %%xmm8\n" + "haddpd %%xmm8, %%xmm8\n" + "movapd %%xmm2, %%xmm9\n" + "hsubpd %%xmm9, %%xmm9\n" + "blendpd $1, %%xmm8, %%xmm9\n" + "movapd %%xmm9, %%xmm2\n" + "movapd %%xmm3, %%xmm8\n" + "haddpd %%xmm8, %%xmm8\n" + "movapd %%xmm3, %%xmm9\n" + "hsubpd %%xmm9, %%xmm9\n" + "blendpd $1, %%xmm8, %%xmm9\n" + "movapd %%xmm9, %%xmm3\n" + "movapd %%xmm4, %%xmm8\n" + "haddpd %%xmm8, %%xmm8\n" + "movapd %%xmm4, %%xmm9\n" + "hsubpd %%xmm9, %%xmm9\n" + "blendpd $1, %%xmm8, %%xmm9\n" + "movapd %%xmm9, %%xmm4\n" + "movapd %%xmm5, %%xmm8\n" + "haddpd %%xmm8, %%xmm8\n" + "movapd %%xmm5, %%xmm9\n" + "hsubpd %%xmm9, %%xmm9\n" + "blendpd $1, %%xmm8, %%xmm9\n" + "movapd %%xmm9, %%xmm5\n" + "movapd %%xmm6, %%xmm8\n" + "haddpd %%xmm8, %%xmm8\n" + "movapd %%xmm6, %%xmm9\n" + "hsubpd %%xmm9, %%xmm9\n" + "blendpd $1, %%xmm8, %%xmm9\n" + "movapd %%xmm9, %%xmm6\n" + "movapd %%xmm7, %%xmm8\n" + "haddpd %%xmm8, %%xmm8\n" + "movapd %%xmm7, %%xmm9\n" + "hsubpd %%xmm9, %%xmm9\n" + "blendpd $1, %%xmm8, %%xmm9\n" + "movapd %%xmm9, %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm9\n" + "addpd %%xmm1, %%xmm8\n" + "subpd %%xmm1, %%xmm9\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm11\n" + "addpd %%xmm3, %%xmm10\n" + "subpd %%xmm3, %%xmm11\n" + "movapd %%xmm4, %%xmm12\n" + "movapd %%xmm4, %%xmm13\n" + "addpd %%xmm5, %%xmm12\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm6, %%xmm14\n" + "movapd %%xmm6, %%xmm15\n" + "addpd %%xmm7, %%xmm14\n" + "subpd %%xmm7, %%xmm15\n" + "movapd %%xmm8, %%xmm0\n" + "movapd %%xmm8, %%xmm2\n" + "addpd %%xmm10, %%xmm0\n" + "subpd %%xmm10, %%xmm2\n" + "movapd %%xmm9, %%xmm1\n" + "movapd %%xmm9, %%xmm3\n" + "addpd %%xmm11, %%xmm1\n" + "subpd %%xmm11, %%xmm3\n" + "movapd %%xmm12, %%xmm4\n" + "movapd %%xmm12, %%xmm6\n" + "addpd %%xmm14, %%xmm4\n" + "subpd %%xmm14, %%xmm6\n" + "movapd %%xmm13, %%xmm5\n" + "movapd %%xmm13, %%xmm7\n" + "addpd %%xmm15, %%xmm5\n" + "subpd %%xmm15, %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm12\n" + "addpd %%xmm4, %%xmm8\n" + "subpd %%xmm4, %%xmm12\n" + "movapd %%xmm1, %%xmm9\n" + "movapd %%xmm1, %%xmm13\n" + "addpd %%xmm5, %%xmm9\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm14\n" + "addpd %%xmm6, %%xmm10\n" + "subpd %%xmm6, %%xmm14\n" + "movapd %%xmm3, %%xmm11\n" + "movapd %%xmm3, %%xmm15\n" + "addpd %%xmm7, %%xmm11\n" + "subpd %%xmm7, %%xmm15\n" + "movupd %%xmm8, (%0)\n" + "movupd %%xmm9, (%1)\n" + "movupd %%xmm10, (%2)\n" + "movupd %%xmm11, (%3)\n" + "movupd %%xmm12, (%4)\n" + "movupd %%xmm13, (%5)\n" + "movupd %%xmm14, (%6)\n" + "movupd %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 2), "r"(buf + j + k + 4), "r"(buf + j + k + 6), "r"(buf + j + k + 8), 
"r"(buf + j + k + 10), "r"(buf + j + k + 12), "r"(buf + j + k + 14) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + for (int j = 0; j < 1024; j += 128) { + for (int k = 0; k < 16; k += 2) { + __asm__ volatile ( + "movupd (%0), %%xmm0\n" + "movupd (%1), %%xmm1\n" + "movupd (%2), %%xmm2\n" + "movupd (%3), %%xmm3\n" + "movupd (%4), %%xmm4\n" + "movupd (%5), %%xmm5\n" + "movupd (%6), %%xmm6\n" + "movupd (%7), %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm9\n" + "addpd %%xmm1, %%xmm8\n" + "subpd %%xmm1, %%xmm9\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm11\n" + "addpd %%xmm3, %%xmm10\n" + "subpd %%xmm3, %%xmm11\n" + "movapd %%xmm4, %%xmm12\n" + "movapd %%xmm4, %%xmm13\n" + "addpd %%xmm5, %%xmm12\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm6, %%xmm14\n" + "movapd %%xmm6, %%xmm15\n" + "addpd %%xmm7, %%xmm14\n" + "subpd %%xmm7, %%xmm15\n" + "movapd %%xmm8, %%xmm0\n" + "movapd %%xmm8, %%xmm2\n" + "addpd %%xmm10, %%xmm0\n" + "subpd %%xmm10, %%xmm2\n" + "movapd %%xmm9, %%xmm1\n" + "movapd %%xmm9, %%xmm3\n" + "addpd %%xmm11, %%xmm1\n" + "subpd %%xmm11, %%xmm3\n" + "movapd %%xmm12, %%xmm4\n" + "movapd %%xmm12, %%xmm6\n" + "addpd %%xmm14, %%xmm4\n" + "subpd %%xmm14, %%xmm6\n" + "movapd %%xmm13, %%xmm5\n" + "movapd %%xmm13, %%xmm7\n" + "addpd %%xmm15, %%xmm5\n" + "subpd %%xmm15, %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm12\n" + "addpd %%xmm4, %%xmm8\n" + "subpd %%xmm4, %%xmm12\n" + "movapd %%xmm1, %%xmm9\n" + "movapd %%xmm1, %%xmm13\n" + "addpd %%xmm5, %%xmm9\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm14\n" + "addpd %%xmm6, %%xmm10\n" + "subpd %%xmm6, %%xmm14\n" + "movapd %%xmm3, %%xmm11\n" + "movapd %%xmm3, %%xmm15\n" + "addpd %%xmm7, %%xmm11\n" + "subpd %%xmm7, %%xmm15\n" + "movupd %%xmm8, (%0)\n" + "movupd %%xmm9, (%1)\n" + "movupd %%xmm10, (%2)\n" + "movupd %%xmm11, (%3)\n" + "movupd %%xmm12, (%4)\n" + "movupd %%xmm13, (%5)\n" + "movupd %%xmm14, (%6)\n" + "movupd %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 16), "r"(buf + j + k + 32), "r"(buf + j + k + 48), "r"(buf + j + k + 64), "r"(buf + j + k + 80), "r"(buf + j + k + 96), "r"(buf + j + k + 112) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + for (int j = 0; j < 1024; j += 1024) { + for (int k = 0; k < 128; k += 2) { + __asm__ volatile ( + "movupd (%0), %%xmm0\n" + "movupd (%1), %%xmm1\n" + "movupd (%2), %%xmm2\n" + "movupd (%3), %%xmm3\n" + "movupd (%4), %%xmm4\n" + "movupd (%5), %%xmm5\n" + "movupd (%6), %%xmm6\n" + "movupd (%7), %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm9\n" + "addpd %%xmm1, %%xmm8\n" + "subpd %%xmm1, %%xmm9\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm11\n" + "addpd %%xmm3, %%xmm10\n" + "subpd %%xmm3, %%xmm11\n" + "movapd %%xmm4, %%xmm12\n" + "movapd %%xmm4, %%xmm13\n" + "addpd %%xmm5, %%xmm12\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm6, %%xmm14\n" + "movapd %%xmm6, %%xmm15\n" + "addpd %%xmm7, %%xmm14\n" + "subpd %%xmm7, %%xmm15\n" + "movapd %%xmm8, %%xmm0\n" + "movapd %%xmm8, %%xmm2\n" + "addpd %%xmm10, %%xmm0\n" + "subpd %%xmm10, %%xmm2\n" + "movapd %%xmm9, %%xmm1\n" + "movapd %%xmm9, %%xmm3\n" + "addpd %%xmm11, %%xmm1\n" + "subpd %%xmm11, %%xmm3\n" + "movapd %%xmm12, %%xmm4\n" + "movapd %%xmm12, %%xmm6\n" + "addpd %%xmm14, %%xmm4\n" + 
"subpd %%xmm14, %%xmm6\n" + "movapd %%xmm13, %%xmm5\n" + "movapd %%xmm13, %%xmm7\n" + "addpd %%xmm15, %%xmm5\n" + "subpd %%xmm15, %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm12\n" + "addpd %%xmm4, %%xmm8\n" + "subpd %%xmm4, %%xmm12\n" + "movapd %%xmm1, %%xmm9\n" + "movapd %%xmm1, %%xmm13\n" + "addpd %%xmm5, %%xmm9\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm14\n" + "addpd %%xmm6, %%xmm10\n" + "subpd %%xmm6, %%xmm14\n" + "movapd %%xmm3, %%xmm11\n" + "movapd %%xmm3, %%xmm15\n" + "addpd %%xmm7, %%xmm11\n" + "subpd %%xmm7, %%xmm15\n" + "movupd %%xmm8, (%0)\n" + "movupd %%xmm9, (%1)\n" + "movupd %%xmm10, (%2)\n" + "movupd %%xmm11, (%3)\n" + "movupd %%xmm12, (%4)\n" + "movupd %%xmm13, (%5)\n" + "movupd %%xmm14, (%6)\n" + "movupd %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 128), "r"(buf + j + k + 256), "r"(buf + j + k + 384), "r"(buf + j + k + 512), "r"(buf + j + k + 640), "r"(buf + j + k + 768), "r"(buf + j + k + 896) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } + if (depth == 13) { + helper_double_25_recursive(buf + 0, 10); + helper_double_25_recursive(buf + 1024, 10); + helper_double_25_recursive(buf + 2048, 10); + helper_double_25_recursive(buf + 3072, 10); + helper_double_25_recursive(buf + 4096, 10); + helper_double_25_recursive(buf + 5120, 10); + helper_double_25_recursive(buf + 6144, 10); + helper_double_25_recursive(buf + 7168, 10); + for (int j = 0; j < 8192; j += 8192) { + for (int k = 0; k < 1024; k += 2) { + __asm__ volatile ( + "movupd (%0), %%xmm0\n" + "movupd (%1), %%xmm1\n" + "movupd (%2), %%xmm2\n" + "movupd (%3), %%xmm3\n" + "movupd (%4), %%xmm4\n" + "movupd (%5), %%xmm5\n" + "movupd (%6), %%xmm6\n" + "movupd (%7), %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm9\n" + "addpd %%xmm1, %%xmm8\n" + "subpd %%xmm1, %%xmm9\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm11\n" + "addpd %%xmm3, %%xmm10\n" + "subpd %%xmm3, %%xmm11\n" + "movapd %%xmm4, %%xmm12\n" + "movapd %%xmm4, %%xmm13\n" + "addpd %%xmm5, %%xmm12\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm6, %%xmm14\n" + "movapd %%xmm6, %%xmm15\n" + "addpd %%xmm7, %%xmm14\n" + "subpd %%xmm7, %%xmm15\n" + "movapd %%xmm8, %%xmm0\n" + "movapd %%xmm8, %%xmm2\n" + "addpd %%xmm10, %%xmm0\n" + "subpd %%xmm10, %%xmm2\n" + "movapd %%xmm9, %%xmm1\n" + "movapd %%xmm9, %%xmm3\n" + "addpd %%xmm11, %%xmm1\n" + "subpd %%xmm11, %%xmm3\n" + "movapd %%xmm12, %%xmm4\n" + "movapd %%xmm12, %%xmm6\n" + "addpd %%xmm14, %%xmm4\n" + "subpd %%xmm14, %%xmm6\n" + "movapd %%xmm13, %%xmm5\n" + "movapd %%xmm13, %%xmm7\n" + "addpd %%xmm15, %%xmm5\n" + "subpd %%xmm15, %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm12\n" + "addpd %%xmm4, %%xmm8\n" + "subpd %%xmm4, %%xmm12\n" + "movapd %%xmm1, %%xmm9\n" + "movapd %%xmm1, %%xmm13\n" + "addpd %%xmm5, %%xmm9\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm14\n" + "addpd %%xmm6, %%xmm10\n" + "subpd %%xmm6, %%xmm14\n" + "movapd %%xmm3, %%xmm11\n" + "movapd %%xmm3, %%xmm15\n" + "addpd %%xmm7, %%xmm11\n" + "subpd %%xmm7, %%xmm15\n" + "movupd %%xmm8, (%0)\n" + "movupd %%xmm9, (%1)\n" + "movupd %%xmm10, (%2)\n" + "movupd %%xmm11, (%3)\n" + "movupd %%xmm12, (%4)\n" + "movupd %%xmm13, (%5)\n" + "movupd %%xmm14, (%6)\n" + "movupd %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 1024), "r"(buf + j + k + 2048), "r"(buf + j + k + 3072), 
"r"(buf + j + k + 4096), "r"(buf + j + k + 5120), "r"(buf + j + k + 6144), "r"(buf + j + k + 7168) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } + if (depth == 16) { + helper_double_25_recursive(buf + 0, 13); + helper_double_25_recursive(buf + 8192, 13); + helper_double_25_recursive(buf + 16384, 13); + helper_double_25_recursive(buf + 24576, 13); + helper_double_25_recursive(buf + 32768, 13); + helper_double_25_recursive(buf + 40960, 13); + helper_double_25_recursive(buf + 49152, 13); + helper_double_25_recursive(buf + 57344, 13); + for (int j = 0; j < 65536; j += 65536) { + for (int k = 0; k < 8192; k += 2) { + __asm__ volatile ( + "movupd (%0), %%xmm0\n" + "movupd (%1), %%xmm1\n" + "movupd (%2), %%xmm2\n" + "movupd (%3), %%xmm3\n" + "movupd (%4), %%xmm4\n" + "movupd (%5), %%xmm5\n" + "movupd (%6), %%xmm6\n" + "movupd (%7), %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm9\n" + "addpd %%xmm1, %%xmm8\n" + "subpd %%xmm1, %%xmm9\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm11\n" + "addpd %%xmm3, %%xmm10\n" + "subpd %%xmm3, %%xmm11\n" + "movapd %%xmm4, %%xmm12\n" + "movapd %%xmm4, %%xmm13\n" + "addpd %%xmm5, %%xmm12\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm6, %%xmm14\n" + "movapd %%xmm6, %%xmm15\n" + "addpd %%xmm7, %%xmm14\n" + "subpd %%xmm7, %%xmm15\n" + "movapd %%xmm8, %%xmm0\n" + "movapd %%xmm8, %%xmm2\n" + "addpd %%xmm10, %%xmm0\n" + "subpd %%xmm10, %%xmm2\n" + "movapd %%xmm9, %%xmm1\n" + "movapd %%xmm9, %%xmm3\n" + "addpd %%xmm11, %%xmm1\n" + "subpd %%xmm11, %%xmm3\n" + "movapd %%xmm12, %%xmm4\n" + "movapd %%xmm12, %%xmm6\n" + "addpd %%xmm14, %%xmm4\n" + "subpd %%xmm14, %%xmm6\n" + "movapd %%xmm13, %%xmm5\n" + "movapd %%xmm13, %%xmm7\n" + "addpd %%xmm15, %%xmm5\n" + "subpd %%xmm15, %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm12\n" + "addpd %%xmm4, %%xmm8\n" + "subpd %%xmm4, %%xmm12\n" + "movapd %%xmm1, %%xmm9\n" + "movapd %%xmm1, %%xmm13\n" + "addpd %%xmm5, %%xmm9\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm14\n" + "addpd %%xmm6, %%xmm10\n" + "subpd %%xmm6, %%xmm14\n" + "movapd %%xmm3, %%xmm11\n" + "movapd %%xmm3, %%xmm15\n" + "addpd %%xmm7, %%xmm11\n" + "subpd %%xmm7, %%xmm15\n" + "movupd %%xmm8, (%0)\n" + "movupd %%xmm9, (%1)\n" + "movupd %%xmm10, (%2)\n" + "movupd %%xmm11, (%3)\n" + "movupd %%xmm12, (%4)\n" + "movupd %%xmm13, (%5)\n" + "movupd %%xmm14, (%6)\n" + "movupd %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 8192), "r"(buf + j + k + 16384), "r"(buf + j + k + 24576), "r"(buf + j + k + 32768), "r"(buf + j + k + 40960), "r"(buf + j + k + 49152), "r"(buf + j + k + 57344) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } + if (depth == 19) { + helper_double_25_recursive(buf + 0, 16); + helper_double_25_recursive(buf + 65536, 16); + helper_double_25_recursive(buf + 131072, 16); + helper_double_25_recursive(buf + 196608, 16); + helper_double_25_recursive(buf + 262144, 16); + helper_double_25_recursive(buf + 327680, 16); + helper_double_25_recursive(buf + 393216, 16); + helper_double_25_recursive(buf + 458752, 16); + for (int j = 0; j < 524288; j += 524288) { + for (int k = 0; k < 65536; k += 2) { + __asm__ volatile ( + "movupd (%0), %%xmm0\n" + "movupd (%1), %%xmm1\n" + "movupd (%2), %%xmm2\n" + "movupd (%3), 
%%xmm3\n" + "movupd (%4), %%xmm4\n" + "movupd (%5), %%xmm5\n" + "movupd (%6), %%xmm6\n" + "movupd (%7), %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm9\n" + "addpd %%xmm1, %%xmm8\n" + "subpd %%xmm1, %%xmm9\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm11\n" + "addpd %%xmm3, %%xmm10\n" + "subpd %%xmm3, %%xmm11\n" + "movapd %%xmm4, %%xmm12\n" + "movapd %%xmm4, %%xmm13\n" + "addpd %%xmm5, %%xmm12\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm6, %%xmm14\n" + "movapd %%xmm6, %%xmm15\n" + "addpd %%xmm7, %%xmm14\n" + "subpd %%xmm7, %%xmm15\n" + "movapd %%xmm8, %%xmm0\n" + "movapd %%xmm8, %%xmm2\n" + "addpd %%xmm10, %%xmm0\n" + "subpd %%xmm10, %%xmm2\n" + "movapd %%xmm9, %%xmm1\n" + "movapd %%xmm9, %%xmm3\n" + "addpd %%xmm11, %%xmm1\n" + "subpd %%xmm11, %%xmm3\n" + "movapd %%xmm12, %%xmm4\n" + "movapd %%xmm12, %%xmm6\n" + "addpd %%xmm14, %%xmm4\n" + "subpd %%xmm14, %%xmm6\n" + "movapd %%xmm13, %%xmm5\n" + "movapd %%xmm13, %%xmm7\n" + "addpd %%xmm15, %%xmm5\n" + "subpd %%xmm15, %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm12\n" + "addpd %%xmm4, %%xmm8\n" + "subpd %%xmm4, %%xmm12\n" + "movapd %%xmm1, %%xmm9\n" + "movapd %%xmm1, %%xmm13\n" + "addpd %%xmm5, %%xmm9\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm14\n" + "addpd %%xmm6, %%xmm10\n" + "subpd %%xmm6, %%xmm14\n" + "movapd %%xmm3, %%xmm11\n" + "movapd %%xmm3, %%xmm15\n" + "addpd %%xmm7, %%xmm11\n" + "subpd %%xmm7, %%xmm15\n" + "movupd %%xmm8, (%0)\n" + "movupd %%xmm9, (%1)\n" + "movupd %%xmm10, (%2)\n" + "movupd %%xmm11, (%3)\n" + "movupd %%xmm12, (%4)\n" + "movupd %%xmm13, (%5)\n" + "movupd %%xmm14, (%6)\n" + "movupd %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 65536), "r"(buf + j + k + 131072), "r"(buf + j + k + 196608), "r"(buf + j + k + 262144), "r"(buf + j + k + 327680), "r"(buf + j + k + 393216), "r"(buf + j + k + 458752) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } + if (depth == 22) { + helper_double_25_recursive(buf + 0, 19); + helper_double_25_recursive(buf + 524288, 19); + helper_double_25_recursive(buf + 1048576, 19); + helper_double_25_recursive(buf + 1572864, 19); + helper_double_25_recursive(buf + 2097152, 19); + helper_double_25_recursive(buf + 2621440, 19); + helper_double_25_recursive(buf + 3145728, 19); + helper_double_25_recursive(buf + 3670016, 19); + for (int j = 0; j < 4194304; j += 4194304) { + for (int k = 0; k < 524288; k += 2) { + __asm__ volatile ( + "movupd (%0), %%xmm0\n" + "movupd (%1), %%xmm1\n" + "movupd (%2), %%xmm2\n" + "movupd (%3), %%xmm3\n" + "movupd (%4), %%xmm4\n" + "movupd (%5), %%xmm5\n" + "movupd (%6), %%xmm6\n" + "movupd (%7), %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm9\n" + "addpd %%xmm1, %%xmm8\n" + "subpd %%xmm1, %%xmm9\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm11\n" + "addpd %%xmm3, %%xmm10\n" + "subpd %%xmm3, %%xmm11\n" + "movapd %%xmm4, %%xmm12\n" + "movapd %%xmm4, %%xmm13\n" + "addpd %%xmm5, %%xmm12\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm6, %%xmm14\n" + "movapd %%xmm6, %%xmm15\n" + "addpd %%xmm7, %%xmm14\n" + "subpd %%xmm7, %%xmm15\n" + "movapd %%xmm8, %%xmm0\n" + "movapd %%xmm8, %%xmm2\n" + "addpd %%xmm10, %%xmm0\n" + "subpd %%xmm10, %%xmm2\n" + "movapd %%xmm9, %%xmm1\n" + "movapd %%xmm9, %%xmm3\n" + "addpd %%xmm11, %%xmm1\n" + "subpd %%xmm11, %%xmm3\n" + "movapd %%xmm12, %%xmm4\n" + "movapd %%xmm12, %%xmm6\n" + "addpd 
%%xmm14, %%xmm4\n" + "subpd %%xmm14, %%xmm6\n" + "movapd %%xmm13, %%xmm5\n" + "movapd %%xmm13, %%xmm7\n" + "addpd %%xmm15, %%xmm5\n" + "subpd %%xmm15, %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm12\n" + "addpd %%xmm4, %%xmm8\n" + "subpd %%xmm4, %%xmm12\n" + "movapd %%xmm1, %%xmm9\n" + "movapd %%xmm1, %%xmm13\n" + "addpd %%xmm5, %%xmm9\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm14\n" + "addpd %%xmm6, %%xmm10\n" + "subpd %%xmm6, %%xmm14\n" + "movapd %%xmm3, %%xmm11\n" + "movapd %%xmm3, %%xmm15\n" + "addpd %%xmm7, %%xmm11\n" + "subpd %%xmm7, %%xmm15\n" + "movupd %%xmm8, (%0)\n" + "movupd %%xmm9, (%1)\n" + "movupd %%xmm10, (%2)\n" + "movupd %%xmm11, (%3)\n" + "movupd %%xmm12, (%4)\n" + "movupd %%xmm13, (%5)\n" + "movupd %%xmm14, (%6)\n" + "movupd %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 524288), "r"(buf + j + k + 1048576), "r"(buf + j + k + 1572864), "r"(buf + j + k + 2097152), "r"(buf + j + k + 2621440), "r"(buf + j + k + 3145728), "r"(buf + j + k + 3670016) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } + if (depth == 25) { + helper_double_25_recursive(buf + 0, 22); + helper_double_25_recursive(buf + 4194304, 22); + helper_double_25_recursive(buf + 8388608, 22); + helper_double_25_recursive(buf + 12582912, 22); + helper_double_25_recursive(buf + 16777216, 22); + helper_double_25_recursive(buf + 20971520, 22); + helper_double_25_recursive(buf + 25165824, 22); + helper_double_25_recursive(buf + 29360128, 22); + for (int j = 0; j < 33554432; j += 33554432) { + for (int k = 0; k < 4194304; k += 2) { + __asm__ volatile ( + "movupd (%0), %%xmm0\n" + "movupd (%1), %%xmm1\n" + "movupd (%2), %%xmm2\n" + "movupd (%3), %%xmm3\n" + "movupd (%4), %%xmm4\n" + "movupd (%5), %%xmm5\n" + "movupd (%6), %%xmm6\n" + "movupd (%7), %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm9\n" + "addpd %%xmm1, %%xmm8\n" + "subpd %%xmm1, %%xmm9\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm11\n" + "addpd %%xmm3, %%xmm10\n" + "subpd %%xmm3, %%xmm11\n" + "movapd %%xmm4, %%xmm12\n" + "movapd %%xmm4, %%xmm13\n" + "addpd %%xmm5, %%xmm12\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm6, %%xmm14\n" + "movapd %%xmm6, %%xmm15\n" + "addpd %%xmm7, %%xmm14\n" + "subpd %%xmm7, %%xmm15\n" + "movapd %%xmm8, %%xmm0\n" + "movapd %%xmm8, %%xmm2\n" + "addpd %%xmm10, %%xmm0\n" + "subpd %%xmm10, %%xmm2\n" + "movapd %%xmm9, %%xmm1\n" + "movapd %%xmm9, %%xmm3\n" + "addpd %%xmm11, %%xmm1\n" + "subpd %%xmm11, %%xmm3\n" + "movapd %%xmm12, %%xmm4\n" + "movapd %%xmm12, %%xmm6\n" + "addpd %%xmm14, %%xmm4\n" + "subpd %%xmm14, %%xmm6\n" + "movapd %%xmm13, %%xmm5\n" + "movapd %%xmm13, %%xmm7\n" + "addpd %%xmm15, %%xmm5\n" + "subpd %%xmm15, %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm12\n" + "addpd %%xmm4, %%xmm8\n" + "subpd %%xmm4, %%xmm12\n" + "movapd %%xmm1, %%xmm9\n" + "movapd %%xmm1, %%xmm13\n" + "addpd %%xmm5, %%xmm9\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm14\n" + "addpd %%xmm6, %%xmm10\n" + "subpd %%xmm6, %%xmm14\n" + "movapd %%xmm3, %%xmm11\n" + "movapd %%xmm3, %%xmm15\n" + "addpd %%xmm7, %%xmm11\n" + "subpd %%xmm7, %%xmm15\n" + "movupd %%xmm8, (%0)\n" + "movupd %%xmm9, (%1)\n" + "movupd %%xmm10, (%2)\n" + "movupd %%xmm11, (%3)\n" + "movupd %%xmm12, (%4)\n" + "movupd %%xmm13, (%5)\n" + "movupd %%xmm14, (%6)\n" + "movupd %%xmm15, (%7)\n" + :: "r"(buf + j 
+ k + 0), "r"(buf + j + k + 4194304), "r"(buf + j + k + 8388608), "r"(buf + j + k + 12582912), "r"(buf + j + k + 16777216), "r"(buf + j + k + 20971520), "r"(buf + j + k + 25165824), "r"(buf + j + k + 29360128) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } +} +void helper_double_25(double *buf); +void helper_double_25(double *buf) { + helper_double_25_recursive(buf, 25); +} +void helper_double_26_recursive(double *buf, int depth); +void helper_double_26_recursive(double *buf, int depth) { + if (depth == 5) { + for (int j = 0; j < 32; j += 16) { + for (int k = 0; k < 2; k += 2) { + __asm__ volatile ( + "movupd (%0), %%xmm0\n" + "movupd (%1), %%xmm1\n" + "movupd (%2), %%xmm2\n" + "movupd (%3), %%xmm3\n" + "movupd (%4), %%xmm4\n" + "movupd (%5), %%xmm5\n" + "movupd (%6), %%xmm6\n" + "movupd (%7), %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "haddpd %%xmm8, %%xmm8\n" + "movapd %%xmm0, %%xmm9\n" + "hsubpd %%xmm9, %%xmm9\n" + "blendpd $1, %%xmm8, %%xmm9\n" + "movapd %%xmm9, %%xmm0\n" + "movapd %%xmm1, %%xmm8\n" + "haddpd %%xmm8, %%xmm8\n" + "movapd %%xmm1, %%xmm9\n" + "hsubpd %%xmm9, %%xmm9\n" + "blendpd $1, %%xmm8, %%xmm9\n" + "movapd %%xmm9, %%xmm1\n" + "movapd %%xmm2, %%xmm8\n" + "haddpd %%xmm8, %%xmm8\n" + "movapd %%xmm2, %%xmm9\n" + "hsubpd %%xmm9, %%xmm9\n" + "blendpd $1, %%xmm8, %%xmm9\n" + "movapd %%xmm9, %%xmm2\n" + "movapd %%xmm3, %%xmm8\n" + "haddpd %%xmm8, %%xmm8\n" + "movapd %%xmm3, %%xmm9\n" + "hsubpd %%xmm9, %%xmm9\n" + "blendpd $1, %%xmm8, %%xmm9\n" + "movapd %%xmm9, %%xmm3\n" + "movapd %%xmm4, %%xmm8\n" + "haddpd %%xmm8, %%xmm8\n" + "movapd %%xmm4, %%xmm9\n" + "hsubpd %%xmm9, %%xmm9\n" + "blendpd $1, %%xmm8, %%xmm9\n" + "movapd %%xmm9, %%xmm4\n" + "movapd %%xmm5, %%xmm8\n" + "haddpd %%xmm8, %%xmm8\n" + "movapd %%xmm5, %%xmm9\n" + "hsubpd %%xmm9, %%xmm9\n" + "blendpd $1, %%xmm8, %%xmm9\n" + "movapd %%xmm9, %%xmm5\n" + "movapd %%xmm6, %%xmm8\n" + "haddpd %%xmm8, %%xmm8\n" + "movapd %%xmm6, %%xmm9\n" + "hsubpd %%xmm9, %%xmm9\n" + "blendpd $1, %%xmm8, %%xmm9\n" + "movapd %%xmm9, %%xmm6\n" + "movapd %%xmm7, %%xmm8\n" + "haddpd %%xmm8, %%xmm8\n" + "movapd %%xmm7, %%xmm9\n" + "hsubpd %%xmm9, %%xmm9\n" + "blendpd $1, %%xmm8, %%xmm9\n" + "movapd %%xmm9, %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm9\n" + "addpd %%xmm1, %%xmm8\n" + "subpd %%xmm1, %%xmm9\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm11\n" + "addpd %%xmm3, %%xmm10\n" + "subpd %%xmm3, %%xmm11\n" + "movapd %%xmm4, %%xmm12\n" + "movapd %%xmm4, %%xmm13\n" + "addpd %%xmm5, %%xmm12\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm6, %%xmm14\n" + "movapd %%xmm6, %%xmm15\n" + "addpd %%xmm7, %%xmm14\n" + "subpd %%xmm7, %%xmm15\n" + "movapd %%xmm8, %%xmm0\n" + "movapd %%xmm8, %%xmm2\n" + "addpd %%xmm10, %%xmm0\n" + "subpd %%xmm10, %%xmm2\n" + "movapd %%xmm9, %%xmm1\n" + "movapd %%xmm9, %%xmm3\n" + "addpd %%xmm11, %%xmm1\n" + "subpd %%xmm11, %%xmm3\n" + "movapd %%xmm12, %%xmm4\n" + "movapd %%xmm12, %%xmm6\n" + "addpd %%xmm14, %%xmm4\n" + "subpd %%xmm14, %%xmm6\n" + "movapd %%xmm13, %%xmm5\n" + "movapd %%xmm13, %%xmm7\n" + "addpd %%xmm15, %%xmm5\n" + "subpd %%xmm15, %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm12\n" + "addpd %%xmm4, %%xmm8\n" + "subpd %%xmm4, %%xmm12\n" + "movapd %%xmm1, %%xmm9\n" + "movapd %%xmm1, %%xmm13\n" + "addpd %%xmm5, %%xmm9\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm14\n" + "addpd %%xmm6, %%xmm10\n" 
+ "subpd %%xmm6, %%xmm14\n" + "movapd %%xmm3, %%xmm11\n" + "movapd %%xmm3, %%xmm15\n" + "addpd %%xmm7, %%xmm11\n" + "subpd %%xmm7, %%xmm15\n" + "movupd %%xmm8, (%0)\n" + "movupd %%xmm9, (%1)\n" + "movupd %%xmm10, (%2)\n" + "movupd %%xmm11, (%3)\n" + "movupd %%xmm12, (%4)\n" + "movupd %%xmm13, (%5)\n" + "movupd %%xmm14, (%6)\n" + "movupd %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 2), "r"(buf + j + k + 4), "r"(buf + j + k + 6), "r"(buf + j + k + 8), "r"(buf + j + k + 10), "r"(buf + j + k + 12), "r"(buf + j + k + 14) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + for (int j = 0; j < 32; j += 32) { + for (int k = 0; k < 16; k += 2) { + __asm__ volatile ( + "movupd (%0), %%xmm0\n" + "movupd (%1), %%xmm1\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm9\n" + "addpd %%xmm1, %%xmm8\n" + "subpd %%xmm1, %%xmm9\n" + "movupd %%xmm8, (%0)\n" + "movupd %%xmm9, (%1)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 16) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } + if (depth == 8) { + helper_double_26_recursive(buf + 0, 5); + helper_double_26_recursive(buf + 32, 5); + helper_double_26_recursive(buf + 64, 5); + helper_double_26_recursive(buf + 96, 5); + helper_double_26_recursive(buf + 128, 5); + helper_double_26_recursive(buf + 160, 5); + helper_double_26_recursive(buf + 192, 5); + helper_double_26_recursive(buf + 224, 5); + for (int j = 0; j < 256; j += 256) { + for (int k = 0; k < 32; k += 2) { + __asm__ volatile ( + "movupd (%0), %%xmm0\n" + "movupd (%1), %%xmm1\n" + "movupd (%2), %%xmm2\n" + "movupd (%3), %%xmm3\n" + "movupd (%4), %%xmm4\n" + "movupd (%5), %%xmm5\n" + "movupd (%6), %%xmm6\n" + "movupd (%7), %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm9\n" + "addpd %%xmm1, %%xmm8\n" + "subpd %%xmm1, %%xmm9\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm11\n" + "addpd %%xmm3, %%xmm10\n" + "subpd %%xmm3, %%xmm11\n" + "movapd %%xmm4, %%xmm12\n" + "movapd %%xmm4, %%xmm13\n" + "addpd %%xmm5, %%xmm12\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm6, %%xmm14\n" + "movapd %%xmm6, %%xmm15\n" + "addpd %%xmm7, %%xmm14\n" + "subpd %%xmm7, %%xmm15\n" + "movapd %%xmm8, %%xmm0\n" + "movapd %%xmm8, %%xmm2\n" + "addpd %%xmm10, %%xmm0\n" + "subpd %%xmm10, %%xmm2\n" + "movapd %%xmm9, %%xmm1\n" + "movapd %%xmm9, %%xmm3\n" + "addpd %%xmm11, %%xmm1\n" + "subpd %%xmm11, %%xmm3\n" + "movapd %%xmm12, %%xmm4\n" + "movapd %%xmm12, %%xmm6\n" + "addpd %%xmm14, %%xmm4\n" + "subpd %%xmm14, %%xmm6\n" + "movapd %%xmm13, %%xmm5\n" + "movapd %%xmm13, %%xmm7\n" + "addpd %%xmm15, %%xmm5\n" + "subpd %%xmm15, %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm12\n" + "addpd %%xmm4, %%xmm8\n" + "subpd %%xmm4, %%xmm12\n" + "movapd %%xmm1, %%xmm9\n" + "movapd %%xmm1, %%xmm13\n" + "addpd %%xmm5, %%xmm9\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm14\n" + "addpd %%xmm6, %%xmm10\n" + "subpd %%xmm6, %%xmm14\n" + "movapd %%xmm3, %%xmm11\n" + "movapd %%xmm3, %%xmm15\n" + "addpd %%xmm7, %%xmm11\n" + "subpd %%xmm7, %%xmm15\n" + "movupd %%xmm8, (%0)\n" + "movupd %%xmm9, (%1)\n" + "movupd %%xmm10, (%2)\n" + "movupd %%xmm11, (%3)\n" + "movupd %%xmm12, (%4)\n" + "movupd %%xmm13, (%5)\n" + "movupd %%xmm14, (%6)\n" + "movupd %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), 
"r"(buf + j + k + 64), "r"(buf + j + k + 96), "r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } + if (depth == 11) { + helper_double_26_recursive(buf + 0, 8); + helper_double_26_recursive(buf + 256, 8); + helper_double_26_recursive(buf + 512, 8); + helper_double_26_recursive(buf + 768, 8); + helper_double_26_recursive(buf + 1024, 8); + helper_double_26_recursive(buf + 1280, 8); + helper_double_26_recursive(buf + 1536, 8); + helper_double_26_recursive(buf + 1792, 8); + for (int j = 0; j < 2048; j += 2048) { + for (int k = 0; k < 256; k += 2) { + __asm__ volatile ( + "movupd (%0), %%xmm0\n" + "movupd (%1), %%xmm1\n" + "movupd (%2), %%xmm2\n" + "movupd (%3), %%xmm3\n" + "movupd (%4), %%xmm4\n" + "movupd (%5), %%xmm5\n" + "movupd (%6), %%xmm6\n" + "movupd (%7), %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm9\n" + "addpd %%xmm1, %%xmm8\n" + "subpd %%xmm1, %%xmm9\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm11\n" + "addpd %%xmm3, %%xmm10\n" + "subpd %%xmm3, %%xmm11\n" + "movapd %%xmm4, %%xmm12\n" + "movapd %%xmm4, %%xmm13\n" + "addpd %%xmm5, %%xmm12\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm6, %%xmm14\n" + "movapd %%xmm6, %%xmm15\n" + "addpd %%xmm7, %%xmm14\n" + "subpd %%xmm7, %%xmm15\n" + "movapd %%xmm8, %%xmm0\n" + "movapd %%xmm8, %%xmm2\n" + "addpd %%xmm10, %%xmm0\n" + "subpd %%xmm10, %%xmm2\n" + "movapd %%xmm9, %%xmm1\n" + "movapd %%xmm9, %%xmm3\n" + "addpd %%xmm11, %%xmm1\n" + "subpd %%xmm11, %%xmm3\n" + "movapd %%xmm12, %%xmm4\n" + "movapd %%xmm12, %%xmm6\n" + "addpd %%xmm14, %%xmm4\n" + "subpd %%xmm14, %%xmm6\n" + "movapd %%xmm13, %%xmm5\n" + "movapd %%xmm13, %%xmm7\n" + "addpd %%xmm15, %%xmm5\n" + "subpd %%xmm15, %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm12\n" + "addpd %%xmm4, %%xmm8\n" + "subpd %%xmm4, %%xmm12\n" + "movapd %%xmm1, %%xmm9\n" + "movapd %%xmm1, %%xmm13\n" + "addpd %%xmm5, %%xmm9\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm14\n" + "addpd %%xmm6, %%xmm10\n" + "subpd %%xmm6, %%xmm14\n" + "movapd %%xmm3, %%xmm11\n" + "movapd %%xmm3, %%xmm15\n" + "addpd %%xmm7, %%xmm11\n" + "subpd %%xmm7, %%xmm15\n" + "movupd %%xmm8, (%0)\n" + "movupd %%xmm9, (%1)\n" + "movupd %%xmm10, (%2)\n" + "movupd %%xmm11, (%3)\n" + "movupd %%xmm12, (%4)\n" + "movupd %%xmm13, (%5)\n" + "movupd %%xmm14, (%6)\n" + "movupd %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 256), "r"(buf + j + k + 512), "r"(buf + j + k + 768), "r"(buf + j + k + 1024), "r"(buf + j + k + 1280), "r"(buf + j + k + 1536), "r"(buf + j + k + 1792) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } + if (depth == 14) { + helper_double_26_recursive(buf + 0, 11); + helper_double_26_recursive(buf + 2048, 11); + helper_double_26_recursive(buf + 4096, 11); + helper_double_26_recursive(buf + 6144, 11); + helper_double_26_recursive(buf + 8192, 11); + helper_double_26_recursive(buf + 10240, 11); + helper_double_26_recursive(buf + 12288, 11); + helper_double_26_recursive(buf + 14336, 11); + for (int j = 0; j < 16384; j += 16384) { + for (int k = 0; k < 2048; k += 2) { + __asm__ volatile ( + "movupd (%0), %%xmm0\n" + "movupd (%1), %%xmm1\n" + "movupd (%2), %%xmm2\n" + "movupd (%3), 
%%xmm3\n" + "movupd (%4), %%xmm4\n" + "movupd (%5), %%xmm5\n" + "movupd (%6), %%xmm6\n" + "movupd (%7), %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm9\n" + "addpd %%xmm1, %%xmm8\n" + "subpd %%xmm1, %%xmm9\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm11\n" + "addpd %%xmm3, %%xmm10\n" + "subpd %%xmm3, %%xmm11\n" + "movapd %%xmm4, %%xmm12\n" + "movapd %%xmm4, %%xmm13\n" + "addpd %%xmm5, %%xmm12\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm6, %%xmm14\n" + "movapd %%xmm6, %%xmm15\n" + "addpd %%xmm7, %%xmm14\n" + "subpd %%xmm7, %%xmm15\n" + "movapd %%xmm8, %%xmm0\n" + "movapd %%xmm8, %%xmm2\n" + "addpd %%xmm10, %%xmm0\n" + "subpd %%xmm10, %%xmm2\n" + "movapd %%xmm9, %%xmm1\n" + "movapd %%xmm9, %%xmm3\n" + "addpd %%xmm11, %%xmm1\n" + "subpd %%xmm11, %%xmm3\n" + "movapd %%xmm12, %%xmm4\n" + "movapd %%xmm12, %%xmm6\n" + "addpd %%xmm14, %%xmm4\n" + "subpd %%xmm14, %%xmm6\n" + "movapd %%xmm13, %%xmm5\n" + "movapd %%xmm13, %%xmm7\n" + "addpd %%xmm15, %%xmm5\n" + "subpd %%xmm15, %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm12\n" + "addpd %%xmm4, %%xmm8\n" + "subpd %%xmm4, %%xmm12\n" + "movapd %%xmm1, %%xmm9\n" + "movapd %%xmm1, %%xmm13\n" + "addpd %%xmm5, %%xmm9\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm14\n" + "addpd %%xmm6, %%xmm10\n" + "subpd %%xmm6, %%xmm14\n" + "movapd %%xmm3, %%xmm11\n" + "movapd %%xmm3, %%xmm15\n" + "addpd %%xmm7, %%xmm11\n" + "subpd %%xmm7, %%xmm15\n" + "movupd %%xmm8, (%0)\n" + "movupd %%xmm9, (%1)\n" + "movupd %%xmm10, (%2)\n" + "movupd %%xmm11, (%3)\n" + "movupd %%xmm12, (%4)\n" + "movupd %%xmm13, (%5)\n" + "movupd %%xmm14, (%6)\n" + "movupd %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 2048), "r"(buf + j + k + 4096), "r"(buf + j + k + 6144), "r"(buf + j + k + 8192), "r"(buf + j + k + 10240), "r"(buf + j + k + 12288), "r"(buf + j + k + 14336) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } + if (depth == 17) { + helper_double_26_recursive(buf + 0, 14); + helper_double_26_recursive(buf + 16384, 14); + helper_double_26_recursive(buf + 32768, 14); + helper_double_26_recursive(buf + 49152, 14); + helper_double_26_recursive(buf + 65536, 14); + helper_double_26_recursive(buf + 81920, 14); + helper_double_26_recursive(buf + 98304, 14); + helper_double_26_recursive(buf + 114688, 14); + for (int j = 0; j < 131072; j += 131072) { + for (int k = 0; k < 16384; k += 2) { + __asm__ volatile ( + "movupd (%0), %%xmm0\n" + "movupd (%1), %%xmm1\n" + "movupd (%2), %%xmm2\n" + "movupd (%3), %%xmm3\n" + "movupd (%4), %%xmm4\n" + "movupd (%5), %%xmm5\n" + "movupd (%6), %%xmm6\n" + "movupd (%7), %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm9\n" + "addpd %%xmm1, %%xmm8\n" + "subpd %%xmm1, %%xmm9\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm11\n" + "addpd %%xmm3, %%xmm10\n" + "subpd %%xmm3, %%xmm11\n" + "movapd %%xmm4, %%xmm12\n" + "movapd %%xmm4, %%xmm13\n" + "addpd %%xmm5, %%xmm12\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm6, %%xmm14\n" + "movapd %%xmm6, %%xmm15\n" + "addpd %%xmm7, %%xmm14\n" + "subpd %%xmm7, %%xmm15\n" + "movapd %%xmm8, %%xmm0\n" + "movapd %%xmm8, %%xmm2\n" + "addpd %%xmm10, %%xmm0\n" + "subpd %%xmm10, %%xmm2\n" + "movapd %%xmm9, %%xmm1\n" + "movapd %%xmm9, %%xmm3\n" + "addpd %%xmm11, %%xmm1\n" + "subpd %%xmm11, %%xmm3\n" + "movapd %%xmm12, %%xmm4\n" + "movapd %%xmm12, %%xmm6\n" + "addpd %%xmm14, %%xmm4\n" + "subpd 
%%xmm14, %%xmm6\n" + "movapd %%xmm13, %%xmm5\n" + "movapd %%xmm13, %%xmm7\n" + "addpd %%xmm15, %%xmm5\n" + "subpd %%xmm15, %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm12\n" + "addpd %%xmm4, %%xmm8\n" + "subpd %%xmm4, %%xmm12\n" + "movapd %%xmm1, %%xmm9\n" + "movapd %%xmm1, %%xmm13\n" + "addpd %%xmm5, %%xmm9\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm14\n" + "addpd %%xmm6, %%xmm10\n" + "subpd %%xmm6, %%xmm14\n" + "movapd %%xmm3, %%xmm11\n" + "movapd %%xmm3, %%xmm15\n" + "addpd %%xmm7, %%xmm11\n" + "subpd %%xmm7, %%xmm15\n" + "movupd %%xmm8, (%0)\n" + "movupd %%xmm9, (%1)\n" + "movupd %%xmm10, (%2)\n" + "movupd %%xmm11, (%3)\n" + "movupd %%xmm12, (%4)\n" + "movupd %%xmm13, (%5)\n" + "movupd %%xmm14, (%6)\n" + "movupd %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 16384), "r"(buf + j + k + 32768), "r"(buf + j + k + 49152), "r"(buf + j + k + 65536), "r"(buf + j + k + 81920), "r"(buf + j + k + 98304), "r"(buf + j + k + 114688) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } + if (depth == 20) { + helper_double_26_recursive(buf + 0, 17); + helper_double_26_recursive(buf + 131072, 17); + helper_double_26_recursive(buf + 262144, 17); + helper_double_26_recursive(buf + 393216, 17); + helper_double_26_recursive(buf + 524288, 17); + helper_double_26_recursive(buf + 655360, 17); + helper_double_26_recursive(buf + 786432, 17); + helper_double_26_recursive(buf + 917504, 17); + for (int j = 0; j < 1048576; j += 1048576) { + for (int k = 0; k < 131072; k += 2) { + __asm__ volatile ( + "movupd (%0), %%xmm0\n" + "movupd (%1), %%xmm1\n" + "movupd (%2), %%xmm2\n" + "movupd (%3), %%xmm3\n" + "movupd (%4), %%xmm4\n" + "movupd (%5), %%xmm5\n" + "movupd (%6), %%xmm6\n" + "movupd (%7), %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm9\n" + "addpd %%xmm1, %%xmm8\n" + "subpd %%xmm1, %%xmm9\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm11\n" + "addpd %%xmm3, %%xmm10\n" + "subpd %%xmm3, %%xmm11\n" + "movapd %%xmm4, %%xmm12\n" + "movapd %%xmm4, %%xmm13\n" + "addpd %%xmm5, %%xmm12\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm6, %%xmm14\n" + "movapd %%xmm6, %%xmm15\n" + "addpd %%xmm7, %%xmm14\n" + "subpd %%xmm7, %%xmm15\n" + "movapd %%xmm8, %%xmm0\n" + "movapd %%xmm8, %%xmm2\n" + "addpd %%xmm10, %%xmm0\n" + "subpd %%xmm10, %%xmm2\n" + "movapd %%xmm9, %%xmm1\n" + "movapd %%xmm9, %%xmm3\n" + "addpd %%xmm11, %%xmm1\n" + "subpd %%xmm11, %%xmm3\n" + "movapd %%xmm12, %%xmm4\n" + "movapd %%xmm12, %%xmm6\n" + "addpd %%xmm14, %%xmm4\n" + "subpd %%xmm14, %%xmm6\n" + "movapd %%xmm13, %%xmm5\n" + "movapd %%xmm13, %%xmm7\n" + "addpd %%xmm15, %%xmm5\n" + "subpd %%xmm15, %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm12\n" + "addpd %%xmm4, %%xmm8\n" + "subpd %%xmm4, %%xmm12\n" + "movapd %%xmm1, %%xmm9\n" + "movapd %%xmm1, %%xmm13\n" + "addpd %%xmm5, %%xmm9\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm14\n" + "addpd %%xmm6, %%xmm10\n" + "subpd %%xmm6, %%xmm14\n" + "movapd %%xmm3, %%xmm11\n" + "movapd %%xmm3, %%xmm15\n" + "addpd %%xmm7, %%xmm11\n" + "subpd %%xmm7, %%xmm15\n" + "movupd %%xmm8, (%0)\n" + "movupd %%xmm9, (%1)\n" + "movupd %%xmm10, (%2)\n" + "movupd %%xmm11, (%3)\n" + "movupd %%xmm12, (%4)\n" + "movupd %%xmm13, (%5)\n" + "movupd %%xmm14, (%6)\n" + "movupd %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 131072), "r"(buf + j + k + 
262144), "r"(buf + j + k + 393216), "r"(buf + j + k + 524288), "r"(buf + j + k + 655360), "r"(buf + j + k + 786432), "r"(buf + j + k + 917504) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } + if (depth == 23) { + helper_double_26_recursive(buf + 0, 20); + helper_double_26_recursive(buf + 1048576, 20); + helper_double_26_recursive(buf + 2097152, 20); + helper_double_26_recursive(buf + 3145728, 20); + helper_double_26_recursive(buf + 4194304, 20); + helper_double_26_recursive(buf + 5242880, 20); + helper_double_26_recursive(buf + 6291456, 20); + helper_double_26_recursive(buf + 7340032, 20); + for (int j = 0; j < 8388608; j += 8388608) { + for (int k = 0; k < 1048576; k += 2) { + __asm__ volatile ( + "movupd (%0), %%xmm0\n" + "movupd (%1), %%xmm1\n" + "movupd (%2), %%xmm2\n" + "movupd (%3), %%xmm3\n" + "movupd (%4), %%xmm4\n" + "movupd (%5), %%xmm5\n" + "movupd (%6), %%xmm6\n" + "movupd (%7), %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm9\n" + "addpd %%xmm1, %%xmm8\n" + "subpd %%xmm1, %%xmm9\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm11\n" + "addpd %%xmm3, %%xmm10\n" + "subpd %%xmm3, %%xmm11\n" + "movapd %%xmm4, %%xmm12\n" + "movapd %%xmm4, %%xmm13\n" + "addpd %%xmm5, %%xmm12\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm6, %%xmm14\n" + "movapd %%xmm6, %%xmm15\n" + "addpd %%xmm7, %%xmm14\n" + "subpd %%xmm7, %%xmm15\n" + "movapd %%xmm8, %%xmm0\n" + "movapd %%xmm8, %%xmm2\n" + "addpd %%xmm10, %%xmm0\n" + "subpd %%xmm10, %%xmm2\n" + "movapd %%xmm9, %%xmm1\n" + "movapd %%xmm9, %%xmm3\n" + "addpd %%xmm11, %%xmm1\n" + "subpd %%xmm11, %%xmm3\n" + "movapd %%xmm12, %%xmm4\n" + "movapd %%xmm12, %%xmm6\n" + "addpd %%xmm14, %%xmm4\n" + "subpd %%xmm14, %%xmm6\n" + "movapd %%xmm13, %%xmm5\n" + "movapd %%xmm13, %%xmm7\n" + "addpd %%xmm15, %%xmm5\n" + "subpd %%xmm15, %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm12\n" + "addpd %%xmm4, %%xmm8\n" + "subpd %%xmm4, %%xmm12\n" + "movapd %%xmm1, %%xmm9\n" + "movapd %%xmm1, %%xmm13\n" + "addpd %%xmm5, %%xmm9\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm14\n" + "addpd %%xmm6, %%xmm10\n" + "subpd %%xmm6, %%xmm14\n" + "movapd %%xmm3, %%xmm11\n" + "movapd %%xmm3, %%xmm15\n" + "addpd %%xmm7, %%xmm11\n" + "subpd %%xmm7, %%xmm15\n" + "movupd %%xmm8, (%0)\n" + "movupd %%xmm9, (%1)\n" + "movupd %%xmm10, (%2)\n" + "movupd %%xmm11, (%3)\n" + "movupd %%xmm12, (%4)\n" + "movupd %%xmm13, (%5)\n" + "movupd %%xmm14, (%6)\n" + "movupd %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 1048576), "r"(buf + j + k + 2097152), "r"(buf + j + k + 3145728), "r"(buf + j + k + 4194304), "r"(buf + j + k + 5242880), "r"(buf + j + k + 6291456), "r"(buf + j + k + 7340032) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } + if (depth == 26) { + helper_double_26_recursive(buf + 0, 23); + helper_double_26_recursive(buf + 8388608, 23); + helper_double_26_recursive(buf + 16777216, 23); + helper_double_26_recursive(buf + 25165824, 23); + helper_double_26_recursive(buf + 33554432, 23); + helper_double_26_recursive(buf + 41943040, 23); + helper_double_26_recursive(buf + 50331648, 23); + helper_double_26_recursive(buf + 58720256, 23); + for (int j = 0; j < 67108864; j += 67108864) { + for (int k = 0; k < 8388608; k += 2) { + __asm__ 
volatile ( + "movupd (%0), %%xmm0\n" + "movupd (%1), %%xmm1\n" + "movupd (%2), %%xmm2\n" + "movupd (%3), %%xmm3\n" + "movupd (%4), %%xmm4\n" + "movupd (%5), %%xmm5\n" + "movupd (%6), %%xmm6\n" + "movupd (%7), %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm9\n" + "addpd %%xmm1, %%xmm8\n" + "subpd %%xmm1, %%xmm9\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm11\n" + "addpd %%xmm3, %%xmm10\n" + "subpd %%xmm3, %%xmm11\n" + "movapd %%xmm4, %%xmm12\n" + "movapd %%xmm4, %%xmm13\n" + "addpd %%xmm5, %%xmm12\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm6, %%xmm14\n" + "movapd %%xmm6, %%xmm15\n" + "addpd %%xmm7, %%xmm14\n" + "subpd %%xmm7, %%xmm15\n" + "movapd %%xmm8, %%xmm0\n" + "movapd %%xmm8, %%xmm2\n" + "addpd %%xmm10, %%xmm0\n" + "subpd %%xmm10, %%xmm2\n" + "movapd %%xmm9, %%xmm1\n" + "movapd %%xmm9, %%xmm3\n" + "addpd %%xmm11, %%xmm1\n" + "subpd %%xmm11, %%xmm3\n" + "movapd %%xmm12, %%xmm4\n" + "movapd %%xmm12, %%xmm6\n" + "addpd %%xmm14, %%xmm4\n" + "subpd %%xmm14, %%xmm6\n" + "movapd %%xmm13, %%xmm5\n" + "movapd %%xmm13, %%xmm7\n" + "addpd %%xmm15, %%xmm5\n" + "subpd %%xmm15, %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm12\n" + "addpd %%xmm4, %%xmm8\n" + "subpd %%xmm4, %%xmm12\n" + "movapd %%xmm1, %%xmm9\n" + "movapd %%xmm1, %%xmm13\n" + "addpd %%xmm5, %%xmm9\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm14\n" + "addpd %%xmm6, %%xmm10\n" + "subpd %%xmm6, %%xmm14\n" + "movapd %%xmm3, %%xmm11\n" + "movapd %%xmm3, %%xmm15\n" + "addpd %%xmm7, %%xmm11\n" + "subpd %%xmm7, %%xmm15\n" + "movupd %%xmm8, (%0)\n" + "movupd %%xmm9, (%1)\n" + "movupd %%xmm10, (%2)\n" + "movupd %%xmm11, (%3)\n" + "movupd %%xmm12, (%4)\n" + "movupd %%xmm13, (%5)\n" + "movupd %%xmm14, (%6)\n" + "movupd %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 8388608), "r"(buf + j + k + 16777216), "r"(buf + j + k + 25165824), "r"(buf + j + k + 33554432), "r"(buf + j + k + 41943040), "r"(buf + j + k + 50331648), "r"(buf + j + k + 58720256) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } +} +void helper_double_26(double *buf); +void helper_double_26(double *buf) { + helper_double_26_recursive(buf, 26); +} +void helper_double_27_recursive(double *buf, int depth); +void helper_double_27_recursive(double *buf, int depth) { + if (depth == 6) { + for (int j = 0; j < 64; j += 16) { + for (int k = 0; k < 2; k += 2) { + __asm__ volatile ( + "movupd (%0), %%xmm0\n" + "movupd (%1), %%xmm1\n" + "movupd (%2), %%xmm2\n" + "movupd (%3), %%xmm3\n" + "movupd (%4), %%xmm4\n" + "movupd (%5), %%xmm5\n" + "movupd (%6), %%xmm6\n" + "movupd (%7), %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "haddpd %%xmm8, %%xmm8\n" + "movapd %%xmm0, %%xmm9\n" + "hsubpd %%xmm9, %%xmm9\n" + "blendpd $1, %%xmm8, %%xmm9\n" + "movapd %%xmm9, %%xmm0\n" + "movapd %%xmm1, %%xmm8\n" + "haddpd %%xmm8, %%xmm8\n" + "movapd %%xmm1, %%xmm9\n" + "hsubpd %%xmm9, %%xmm9\n" + "blendpd $1, %%xmm8, %%xmm9\n" + "movapd %%xmm9, %%xmm1\n" + "movapd %%xmm2, %%xmm8\n" + "haddpd %%xmm8, %%xmm8\n" + "movapd %%xmm2, %%xmm9\n" + "hsubpd %%xmm9, %%xmm9\n" + "blendpd $1, %%xmm8, %%xmm9\n" + "movapd %%xmm9, %%xmm2\n" + "movapd %%xmm3, %%xmm8\n" + "haddpd %%xmm8, %%xmm8\n" + "movapd %%xmm3, %%xmm9\n" + "hsubpd %%xmm9, %%xmm9\n" + "blendpd $1, %%xmm8, %%xmm9\n" + "movapd %%xmm9, %%xmm3\n" + "movapd %%xmm4, %%xmm8\n" + "haddpd %%xmm8, %%xmm8\n" + "movapd %%xmm4, %%xmm9\n" + "hsubpd 
%%xmm9, %%xmm9\n" + "blendpd $1, %%xmm8, %%xmm9\n" + "movapd %%xmm9, %%xmm4\n" + "movapd %%xmm5, %%xmm8\n" + "haddpd %%xmm8, %%xmm8\n" + "movapd %%xmm5, %%xmm9\n" + "hsubpd %%xmm9, %%xmm9\n" + "blendpd $1, %%xmm8, %%xmm9\n" + "movapd %%xmm9, %%xmm5\n" + "movapd %%xmm6, %%xmm8\n" + "haddpd %%xmm8, %%xmm8\n" + "movapd %%xmm6, %%xmm9\n" + "hsubpd %%xmm9, %%xmm9\n" + "blendpd $1, %%xmm8, %%xmm9\n" + "movapd %%xmm9, %%xmm6\n" + "movapd %%xmm7, %%xmm8\n" + "haddpd %%xmm8, %%xmm8\n" + "movapd %%xmm7, %%xmm9\n" + "hsubpd %%xmm9, %%xmm9\n" + "blendpd $1, %%xmm8, %%xmm9\n" + "movapd %%xmm9, %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm9\n" + "addpd %%xmm1, %%xmm8\n" + "subpd %%xmm1, %%xmm9\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm11\n" + "addpd %%xmm3, %%xmm10\n" + "subpd %%xmm3, %%xmm11\n" + "movapd %%xmm4, %%xmm12\n" + "movapd %%xmm4, %%xmm13\n" + "addpd %%xmm5, %%xmm12\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm6, %%xmm14\n" + "movapd %%xmm6, %%xmm15\n" + "addpd %%xmm7, %%xmm14\n" + "subpd %%xmm7, %%xmm15\n" + "movapd %%xmm8, %%xmm0\n" + "movapd %%xmm8, %%xmm2\n" + "addpd %%xmm10, %%xmm0\n" + "subpd %%xmm10, %%xmm2\n" + "movapd %%xmm9, %%xmm1\n" + "movapd %%xmm9, %%xmm3\n" + "addpd %%xmm11, %%xmm1\n" + "subpd %%xmm11, %%xmm3\n" + "movapd %%xmm12, %%xmm4\n" + "movapd %%xmm12, %%xmm6\n" + "addpd %%xmm14, %%xmm4\n" + "subpd %%xmm14, %%xmm6\n" + "movapd %%xmm13, %%xmm5\n" + "movapd %%xmm13, %%xmm7\n" + "addpd %%xmm15, %%xmm5\n" + "subpd %%xmm15, %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm12\n" + "addpd %%xmm4, %%xmm8\n" + "subpd %%xmm4, %%xmm12\n" + "movapd %%xmm1, %%xmm9\n" + "movapd %%xmm1, %%xmm13\n" + "addpd %%xmm5, %%xmm9\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm14\n" + "addpd %%xmm6, %%xmm10\n" + "subpd %%xmm6, %%xmm14\n" + "movapd %%xmm3, %%xmm11\n" + "movapd %%xmm3, %%xmm15\n" + "addpd %%xmm7, %%xmm11\n" + "subpd %%xmm7, %%xmm15\n" + "movupd %%xmm8, (%0)\n" + "movupd %%xmm9, (%1)\n" + "movupd %%xmm10, (%2)\n" + "movupd %%xmm11, (%3)\n" + "movupd %%xmm12, (%4)\n" + "movupd %%xmm13, (%5)\n" + "movupd %%xmm14, (%6)\n" + "movupd %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 2), "r"(buf + j + k + 4), "r"(buf + j + k + 6), "r"(buf + j + k + 8), "r"(buf + j + k + 10), "r"(buf + j + k + 12), "r"(buf + j + k + 14) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + for (int j = 0; j < 64; j += 64) { + for (int k = 0; k < 16; k += 2) { + __asm__ volatile ( + "movupd (%0), %%xmm0\n" + "movupd (%1), %%xmm1\n" + "movupd (%2), %%xmm2\n" + "movupd (%3), %%xmm3\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm9\n" + "addpd %%xmm1, %%xmm8\n" + "subpd %%xmm1, %%xmm9\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm11\n" + "addpd %%xmm3, %%xmm10\n" + "subpd %%xmm3, %%xmm11\n" + "movapd %%xmm8, %%xmm0\n" + "movapd %%xmm8, %%xmm2\n" + "addpd %%xmm10, %%xmm0\n" + "subpd %%xmm10, %%xmm2\n" + "movapd %%xmm9, %%xmm1\n" + "movapd %%xmm9, %%xmm3\n" + "addpd %%xmm11, %%xmm1\n" + "subpd %%xmm11, %%xmm3\n" + "movupd %%xmm0, (%0)\n" + "movupd %%xmm1, (%1)\n" + "movupd %%xmm2, (%2)\n" + "movupd %%xmm3, (%3)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 16), "r"(buf + j + k + 32), "r"(buf + j + k + 48) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; 
+ } + if (depth == 9) { + helper_double_27_recursive(buf + 0, 6); + helper_double_27_recursive(buf + 64, 6); + helper_double_27_recursive(buf + 128, 6); + helper_double_27_recursive(buf + 192, 6); + helper_double_27_recursive(buf + 256, 6); + helper_double_27_recursive(buf + 320, 6); + helper_double_27_recursive(buf + 384, 6); + helper_double_27_recursive(buf + 448, 6); + for (int j = 0; j < 512; j += 512) { + for (int k = 0; k < 64; k += 2) { + __asm__ volatile ( + "movupd (%0), %%xmm0\n" + "movupd (%1), %%xmm1\n" + "movupd (%2), %%xmm2\n" + "movupd (%3), %%xmm3\n" + "movupd (%4), %%xmm4\n" + "movupd (%5), %%xmm5\n" + "movupd (%6), %%xmm6\n" + "movupd (%7), %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm9\n" + "addpd %%xmm1, %%xmm8\n" + "subpd %%xmm1, %%xmm9\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm11\n" + "addpd %%xmm3, %%xmm10\n" + "subpd %%xmm3, %%xmm11\n" + "movapd %%xmm4, %%xmm12\n" + "movapd %%xmm4, %%xmm13\n" + "addpd %%xmm5, %%xmm12\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm6, %%xmm14\n" + "movapd %%xmm6, %%xmm15\n" + "addpd %%xmm7, %%xmm14\n" + "subpd %%xmm7, %%xmm15\n" + "movapd %%xmm8, %%xmm0\n" + "movapd %%xmm8, %%xmm2\n" + "addpd %%xmm10, %%xmm0\n" + "subpd %%xmm10, %%xmm2\n" + "movapd %%xmm9, %%xmm1\n" + "movapd %%xmm9, %%xmm3\n" + "addpd %%xmm11, %%xmm1\n" + "subpd %%xmm11, %%xmm3\n" + "movapd %%xmm12, %%xmm4\n" + "movapd %%xmm12, %%xmm6\n" + "addpd %%xmm14, %%xmm4\n" + "subpd %%xmm14, %%xmm6\n" + "movapd %%xmm13, %%xmm5\n" + "movapd %%xmm13, %%xmm7\n" + "addpd %%xmm15, %%xmm5\n" + "subpd %%xmm15, %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm12\n" + "addpd %%xmm4, %%xmm8\n" + "subpd %%xmm4, %%xmm12\n" + "movapd %%xmm1, %%xmm9\n" + "movapd %%xmm1, %%xmm13\n" + "addpd %%xmm5, %%xmm9\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm14\n" + "addpd %%xmm6, %%xmm10\n" + "subpd %%xmm6, %%xmm14\n" + "movapd %%xmm3, %%xmm11\n" + "movapd %%xmm3, %%xmm15\n" + "addpd %%xmm7, %%xmm11\n" + "subpd %%xmm7, %%xmm15\n" + "movupd %%xmm8, (%0)\n" + "movupd %%xmm9, (%1)\n" + "movupd %%xmm10, (%2)\n" + "movupd %%xmm11, (%3)\n" + "movupd %%xmm12, (%4)\n" + "movupd %%xmm13, (%5)\n" + "movupd %%xmm14, (%6)\n" + "movupd %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 64), "r"(buf + j + k + 128), "r"(buf + j + k + 192), "r"(buf + j + k + 256), "r"(buf + j + k + 320), "r"(buf + j + k + 384), "r"(buf + j + k + 448) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } + if (depth == 12) { + helper_double_27_recursive(buf + 0, 9); + helper_double_27_recursive(buf + 512, 9); + helper_double_27_recursive(buf + 1024, 9); + helper_double_27_recursive(buf + 1536, 9); + helper_double_27_recursive(buf + 2048, 9); + helper_double_27_recursive(buf + 2560, 9); + helper_double_27_recursive(buf + 3072, 9); + helper_double_27_recursive(buf + 3584, 9); + for (int j = 0; j < 4096; j += 4096) { + for (int k = 0; k < 512; k += 2) { + __asm__ volatile ( + "movupd (%0), %%xmm0\n" + "movupd (%1), %%xmm1\n" + "movupd (%2), %%xmm2\n" + "movupd (%3), %%xmm3\n" + "movupd (%4), %%xmm4\n" + "movupd (%5), %%xmm5\n" + "movupd (%6), %%xmm6\n" + "movupd (%7), %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm9\n" + "addpd %%xmm1, %%xmm8\n" + "subpd %%xmm1, %%xmm9\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm11\n" + "addpd %%xmm3, %%xmm10\n" + "subpd %%xmm3, %%xmm11\n" + "movapd %%xmm4, 
%%xmm12\n" + "movapd %%xmm4, %%xmm13\n" + "addpd %%xmm5, %%xmm12\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm6, %%xmm14\n" + "movapd %%xmm6, %%xmm15\n" + "addpd %%xmm7, %%xmm14\n" + "subpd %%xmm7, %%xmm15\n" + "movapd %%xmm8, %%xmm0\n" + "movapd %%xmm8, %%xmm2\n" + "addpd %%xmm10, %%xmm0\n" + "subpd %%xmm10, %%xmm2\n" + "movapd %%xmm9, %%xmm1\n" + "movapd %%xmm9, %%xmm3\n" + "addpd %%xmm11, %%xmm1\n" + "subpd %%xmm11, %%xmm3\n" + "movapd %%xmm12, %%xmm4\n" + "movapd %%xmm12, %%xmm6\n" + "addpd %%xmm14, %%xmm4\n" + "subpd %%xmm14, %%xmm6\n" + "movapd %%xmm13, %%xmm5\n" + "movapd %%xmm13, %%xmm7\n" + "addpd %%xmm15, %%xmm5\n" + "subpd %%xmm15, %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm12\n" + "addpd %%xmm4, %%xmm8\n" + "subpd %%xmm4, %%xmm12\n" + "movapd %%xmm1, %%xmm9\n" + "movapd %%xmm1, %%xmm13\n" + "addpd %%xmm5, %%xmm9\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm14\n" + "addpd %%xmm6, %%xmm10\n" + "subpd %%xmm6, %%xmm14\n" + "movapd %%xmm3, %%xmm11\n" + "movapd %%xmm3, %%xmm15\n" + "addpd %%xmm7, %%xmm11\n" + "subpd %%xmm7, %%xmm15\n" + "movupd %%xmm8, (%0)\n" + "movupd %%xmm9, (%1)\n" + "movupd %%xmm10, (%2)\n" + "movupd %%xmm11, (%3)\n" + "movupd %%xmm12, (%4)\n" + "movupd %%xmm13, (%5)\n" + "movupd %%xmm14, (%6)\n" + "movupd %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 512), "r"(buf + j + k + 1024), "r"(buf + j + k + 1536), "r"(buf + j + k + 2048), "r"(buf + j + k + 2560), "r"(buf + j + k + 3072), "r"(buf + j + k + 3584) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } + if (depth == 15) { + helper_double_27_recursive(buf + 0, 12); + helper_double_27_recursive(buf + 4096, 12); + helper_double_27_recursive(buf + 8192, 12); + helper_double_27_recursive(buf + 12288, 12); + helper_double_27_recursive(buf + 16384, 12); + helper_double_27_recursive(buf + 20480, 12); + helper_double_27_recursive(buf + 24576, 12); + helper_double_27_recursive(buf + 28672, 12); + for (int j = 0; j < 32768; j += 32768) { + for (int k = 0; k < 4096; k += 2) { + __asm__ volatile ( + "movupd (%0), %%xmm0\n" + "movupd (%1), %%xmm1\n" + "movupd (%2), %%xmm2\n" + "movupd (%3), %%xmm3\n" + "movupd (%4), %%xmm4\n" + "movupd (%5), %%xmm5\n" + "movupd (%6), %%xmm6\n" + "movupd (%7), %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm9\n" + "addpd %%xmm1, %%xmm8\n" + "subpd %%xmm1, %%xmm9\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm11\n" + "addpd %%xmm3, %%xmm10\n" + "subpd %%xmm3, %%xmm11\n" + "movapd %%xmm4, %%xmm12\n" + "movapd %%xmm4, %%xmm13\n" + "addpd %%xmm5, %%xmm12\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm6, %%xmm14\n" + "movapd %%xmm6, %%xmm15\n" + "addpd %%xmm7, %%xmm14\n" + "subpd %%xmm7, %%xmm15\n" + "movapd %%xmm8, %%xmm0\n" + "movapd %%xmm8, %%xmm2\n" + "addpd %%xmm10, %%xmm0\n" + "subpd %%xmm10, %%xmm2\n" + "movapd %%xmm9, %%xmm1\n" + "movapd %%xmm9, %%xmm3\n" + "addpd %%xmm11, %%xmm1\n" + "subpd %%xmm11, %%xmm3\n" + "movapd %%xmm12, %%xmm4\n" + "movapd %%xmm12, %%xmm6\n" + "addpd %%xmm14, %%xmm4\n" + "subpd %%xmm14, %%xmm6\n" + "movapd %%xmm13, %%xmm5\n" + "movapd %%xmm13, %%xmm7\n" + "addpd %%xmm15, %%xmm5\n" + "subpd %%xmm15, %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm12\n" + "addpd %%xmm4, %%xmm8\n" + "subpd %%xmm4, %%xmm12\n" + "movapd %%xmm1, %%xmm9\n" + "movapd %%xmm1, %%xmm13\n" + "addpd %%xmm5, %%xmm9\n" + "subpd %%xmm5, %%xmm13\n" + "movapd 
%%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm14\n" + "addpd %%xmm6, %%xmm10\n" + "subpd %%xmm6, %%xmm14\n" + "movapd %%xmm3, %%xmm11\n" + "movapd %%xmm3, %%xmm15\n" + "addpd %%xmm7, %%xmm11\n" + "subpd %%xmm7, %%xmm15\n" + "movupd %%xmm8, (%0)\n" + "movupd %%xmm9, (%1)\n" + "movupd %%xmm10, (%2)\n" + "movupd %%xmm11, (%3)\n" + "movupd %%xmm12, (%4)\n" + "movupd %%xmm13, (%5)\n" + "movupd %%xmm14, (%6)\n" + "movupd %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 4096), "r"(buf + j + k + 8192), "r"(buf + j + k + 12288), "r"(buf + j + k + 16384), "r"(buf + j + k + 20480), "r"(buf + j + k + 24576), "r"(buf + j + k + 28672) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } + if (depth == 18) { + helper_double_27_recursive(buf + 0, 15); + helper_double_27_recursive(buf + 32768, 15); + helper_double_27_recursive(buf + 65536, 15); + helper_double_27_recursive(buf + 98304, 15); + helper_double_27_recursive(buf + 131072, 15); + helper_double_27_recursive(buf + 163840, 15); + helper_double_27_recursive(buf + 196608, 15); + helper_double_27_recursive(buf + 229376, 15); + for (int j = 0; j < 262144; j += 262144) { + for (int k = 0; k < 32768; k += 2) { + __asm__ volatile ( + "movupd (%0), %%xmm0\n" + "movupd (%1), %%xmm1\n" + "movupd (%2), %%xmm2\n" + "movupd (%3), %%xmm3\n" + "movupd (%4), %%xmm4\n" + "movupd (%5), %%xmm5\n" + "movupd (%6), %%xmm6\n" + "movupd (%7), %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm9\n" + "addpd %%xmm1, %%xmm8\n" + "subpd %%xmm1, %%xmm9\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm11\n" + "addpd %%xmm3, %%xmm10\n" + "subpd %%xmm3, %%xmm11\n" + "movapd %%xmm4, %%xmm12\n" + "movapd %%xmm4, %%xmm13\n" + "addpd %%xmm5, %%xmm12\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm6, %%xmm14\n" + "movapd %%xmm6, %%xmm15\n" + "addpd %%xmm7, %%xmm14\n" + "subpd %%xmm7, %%xmm15\n" + "movapd %%xmm8, %%xmm0\n" + "movapd %%xmm8, %%xmm2\n" + "addpd %%xmm10, %%xmm0\n" + "subpd %%xmm10, %%xmm2\n" + "movapd %%xmm9, %%xmm1\n" + "movapd %%xmm9, %%xmm3\n" + "addpd %%xmm11, %%xmm1\n" + "subpd %%xmm11, %%xmm3\n" + "movapd %%xmm12, %%xmm4\n" + "movapd %%xmm12, %%xmm6\n" + "addpd %%xmm14, %%xmm4\n" + "subpd %%xmm14, %%xmm6\n" + "movapd %%xmm13, %%xmm5\n" + "movapd %%xmm13, %%xmm7\n" + "addpd %%xmm15, %%xmm5\n" + "subpd %%xmm15, %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm12\n" + "addpd %%xmm4, %%xmm8\n" + "subpd %%xmm4, %%xmm12\n" + "movapd %%xmm1, %%xmm9\n" + "movapd %%xmm1, %%xmm13\n" + "addpd %%xmm5, %%xmm9\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm14\n" + "addpd %%xmm6, %%xmm10\n" + "subpd %%xmm6, %%xmm14\n" + "movapd %%xmm3, %%xmm11\n" + "movapd %%xmm3, %%xmm15\n" + "addpd %%xmm7, %%xmm11\n" + "subpd %%xmm7, %%xmm15\n" + "movupd %%xmm8, (%0)\n" + "movupd %%xmm9, (%1)\n" + "movupd %%xmm10, (%2)\n" + "movupd %%xmm11, (%3)\n" + "movupd %%xmm12, (%4)\n" + "movupd %%xmm13, (%5)\n" + "movupd %%xmm14, (%6)\n" + "movupd %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 32768), "r"(buf + j + k + 65536), "r"(buf + j + k + 98304), "r"(buf + j + k + 131072), "r"(buf + j + k + 163840), "r"(buf + j + k + 196608), "r"(buf + j + k + 229376) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } + if (depth == 21) { + 
helper_double_27_recursive(buf + 0, 18); + helper_double_27_recursive(buf + 262144, 18); + helper_double_27_recursive(buf + 524288, 18); + helper_double_27_recursive(buf + 786432, 18); + helper_double_27_recursive(buf + 1048576, 18); + helper_double_27_recursive(buf + 1310720, 18); + helper_double_27_recursive(buf + 1572864, 18); + helper_double_27_recursive(buf + 1835008, 18); + for (int j = 0; j < 2097152; j += 2097152) { + for (int k = 0; k < 262144; k += 2) { + __asm__ volatile ( + "movupd (%0), %%xmm0\n" + "movupd (%1), %%xmm1\n" + "movupd (%2), %%xmm2\n" + "movupd (%3), %%xmm3\n" + "movupd (%4), %%xmm4\n" + "movupd (%5), %%xmm5\n" + "movupd (%6), %%xmm6\n" + "movupd (%7), %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm9\n" + "addpd %%xmm1, %%xmm8\n" + "subpd %%xmm1, %%xmm9\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm11\n" + "addpd %%xmm3, %%xmm10\n" + "subpd %%xmm3, %%xmm11\n" + "movapd %%xmm4, %%xmm12\n" + "movapd %%xmm4, %%xmm13\n" + "addpd %%xmm5, %%xmm12\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm6, %%xmm14\n" + "movapd %%xmm6, %%xmm15\n" + "addpd %%xmm7, %%xmm14\n" + "subpd %%xmm7, %%xmm15\n" + "movapd %%xmm8, %%xmm0\n" + "movapd %%xmm8, %%xmm2\n" + "addpd %%xmm10, %%xmm0\n" + "subpd %%xmm10, %%xmm2\n" + "movapd %%xmm9, %%xmm1\n" + "movapd %%xmm9, %%xmm3\n" + "addpd %%xmm11, %%xmm1\n" + "subpd %%xmm11, %%xmm3\n" + "movapd %%xmm12, %%xmm4\n" + "movapd %%xmm12, %%xmm6\n" + "addpd %%xmm14, %%xmm4\n" + "subpd %%xmm14, %%xmm6\n" + "movapd %%xmm13, %%xmm5\n" + "movapd %%xmm13, %%xmm7\n" + "addpd %%xmm15, %%xmm5\n" + "subpd %%xmm15, %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm12\n" + "addpd %%xmm4, %%xmm8\n" + "subpd %%xmm4, %%xmm12\n" + "movapd %%xmm1, %%xmm9\n" + "movapd %%xmm1, %%xmm13\n" + "addpd %%xmm5, %%xmm9\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm14\n" + "addpd %%xmm6, %%xmm10\n" + "subpd %%xmm6, %%xmm14\n" + "movapd %%xmm3, %%xmm11\n" + "movapd %%xmm3, %%xmm15\n" + "addpd %%xmm7, %%xmm11\n" + "subpd %%xmm7, %%xmm15\n" + "movupd %%xmm8, (%0)\n" + "movupd %%xmm9, (%1)\n" + "movupd %%xmm10, (%2)\n" + "movupd %%xmm11, (%3)\n" + "movupd %%xmm12, (%4)\n" + "movupd %%xmm13, (%5)\n" + "movupd %%xmm14, (%6)\n" + "movupd %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 262144), "r"(buf + j + k + 524288), "r"(buf + j + k + 786432), "r"(buf + j + k + 1048576), "r"(buf + j + k + 1310720), "r"(buf + j + k + 1572864), "r"(buf + j + k + 1835008) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } + if (depth == 24) { + helper_double_27_recursive(buf + 0, 21); + helper_double_27_recursive(buf + 2097152, 21); + helper_double_27_recursive(buf + 4194304, 21); + helper_double_27_recursive(buf + 6291456, 21); + helper_double_27_recursive(buf + 8388608, 21); + helper_double_27_recursive(buf + 10485760, 21); + helper_double_27_recursive(buf + 12582912, 21); + helper_double_27_recursive(buf + 14680064, 21); + for (int j = 0; j < 16777216; j += 16777216) { + for (int k = 0; k < 2097152; k += 2) { + __asm__ volatile ( + "movupd (%0), %%xmm0\n" + "movupd (%1), %%xmm1\n" + "movupd (%2), %%xmm2\n" + "movupd (%3), %%xmm3\n" + "movupd (%4), %%xmm4\n" + "movupd (%5), %%xmm5\n" + "movupd (%6), %%xmm6\n" + "movupd (%7), %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm9\n" + "addpd %%xmm1, %%xmm8\n" + "subpd %%xmm1, %%xmm9\n" + "movapd %%xmm2, %%xmm10\n" + "movapd 
%%xmm2, %%xmm11\n" + "addpd %%xmm3, %%xmm10\n" + "subpd %%xmm3, %%xmm11\n" + "movapd %%xmm4, %%xmm12\n" + "movapd %%xmm4, %%xmm13\n" + "addpd %%xmm5, %%xmm12\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm6, %%xmm14\n" + "movapd %%xmm6, %%xmm15\n" + "addpd %%xmm7, %%xmm14\n" + "subpd %%xmm7, %%xmm15\n" + "movapd %%xmm8, %%xmm0\n" + "movapd %%xmm8, %%xmm2\n" + "addpd %%xmm10, %%xmm0\n" + "subpd %%xmm10, %%xmm2\n" + "movapd %%xmm9, %%xmm1\n" + "movapd %%xmm9, %%xmm3\n" + "addpd %%xmm11, %%xmm1\n" + "subpd %%xmm11, %%xmm3\n" + "movapd %%xmm12, %%xmm4\n" + "movapd %%xmm12, %%xmm6\n" + "addpd %%xmm14, %%xmm4\n" + "subpd %%xmm14, %%xmm6\n" + "movapd %%xmm13, %%xmm5\n" + "movapd %%xmm13, %%xmm7\n" + "addpd %%xmm15, %%xmm5\n" + "subpd %%xmm15, %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm12\n" + "addpd %%xmm4, %%xmm8\n" + "subpd %%xmm4, %%xmm12\n" + "movapd %%xmm1, %%xmm9\n" + "movapd %%xmm1, %%xmm13\n" + "addpd %%xmm5, %%xmm9\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm14\n" + "addpd %%xmm6, %%xmm10\n" + "subpd %%xmm6, %%xmm14\n" + "movapd %%xmm3, %%xmm11\n" + "movapd %%xmm3, %%xmm15\n" + "addpd %%xmm7, %%xmm11\n" + "subpd %%xmm7, %%xmm15\n" + "movupd %%xmm8, (%0)\n" + "movupd %%xmm9, (%1)\n" + "movupd %%xmm10, (%2)\n" + "movupd %%xmm11, (%3)\n" + "movupd %%xmm12, (%4)\n" + "movupd %%xmm13, (%5)\n" + "movupd %%xmm14, (%6)\n" + "movupd %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 2097152), "r"(buf + j + k + 4194304), "r"(buf + j + k + 6291456), "r"(buf + j + k + 8388608), "r"(buf + j + k + 10485760), "r"(buf + j + k + 12582912), "r"(buf + j + k + 14680064) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } + if (depth == 27) { + helper_double_27_recursive(buf + 0, 24); + helper_double_27_recursive(buf + 16777216, 24); + helper_double_27_recursive(buf + 33554432, 24); + helper_double_27_recursive(buf + 50331648, 24); + helper_double_27_recursive(buf + 67108864, 24); + helper_double_27_recursive(buf + 83886080, 24); + helper_double_27_recursive(buf + 100663296, 24); + helper_double_27_recursive(buf + 117440512, 24); + for (int j = 0; j < 134217728; j += 134217728) { + for (int k = 0; k < 16777216; k += 2) { + __asm__ volatile ( + "movupd (%0), %%xmm0\n" + "movupd (%1), %%xmm1\n" + "movupd (%2), %%xmm2\n" + "movupd (%3), %%xmm3\n" + "movupd (%4), %%xmm4\n" + "movupd (%5), %%xmm5\n" + "movupd (%6), %%xmm6\n" + "movupd (%7), %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm9\n" + "addpd %%xmm1, %%xmm8\n" + "subpd %%xmm1, %%xmm9\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm11\n" + "addpd %%xmm3, %%xmm10\n" + "subpd %%xmm3, %%xmm11\n" + "movapd %%xmm4, %%xmm12\n" + "movapd %%xmm4, %%xmm13\n" + "addpd %%xmm5, %%xmm12\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm6, %%xmm14\n" + "movapd %%xmm6, %%xmm15\n" + "addpd %%xmm7, %%xmm14\n" + "subpd %%xmm7, %%xmm15\n" + "movapd %%xmm8, %%xmm0\n" + "movapd %%xmm8, %%xmm2\n" + "addpd %%xmm10, %%xmm0\n" + "subpd %%xmm10, %%xmm2\n" + "movapd %%xmm9, %%xmm1\n" + "movapd %%xmm9, %%xmm3\n" + "addpd %%xmm11, %%xmm1\n" + "subpd %%xmm11, %%xmm3\n" + "movapd %%xmm12, %%xmm4\n" + "movapd %%xmm12, %%xmm6\n" + "addpd %%xmm14, %%xmm4\n" + "subpd %%xmm14, %%xmm6\n" + "movapd %%xmm13, %%xmm5\n" + "movapd %%xmm13, %%xmm7\n" + "addpd %%xmm15, %%xmm5\n" + "subpd %%xmm15, %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm12\n" + "addpd %%xmm4, 
%%xmm8\n" + "subpd %%xmm4, %%xmm12\n" + "movapd %%xmm1, %%xmm9\n" + "movapd %%xmm1, %%xmm13\n" + "addpd %%xmm5, %%xmm9\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm14\n" + "addpd %%xmm6, %%xmm10\n" + "subpd %%xmm6, %%xmm14\n" + "movapd %%xmm3, %%xmm11\n" + "movapd %%xmm3, %%xmm15\n" + "addpd %%xmm7, %%xmm11\n" + "subpd %%xmm7, %%xmm15\n" + "movupd %%xmm8, (%0)\n" + "movupd %%xmm9, (%1)\n" + "movupd %%xmm10, (%2)\n" + "movupd %%xmm11, (%3)\n" + "movupd %%xmm12, (%4)\n" + "movupd %%xmm13, (%5)\n" + "movupd %%xmm14, (%6)\n" + "movupd %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 16777216), "r"(buf + j + k + 33554432), "r"(buf + j + k + 50331648), "r"(buf + j + k + 67108864), "r"(buf + j + k + 83886080), "r"(buf + j + k + 100663296), "r"(buf + j + k + 117440512) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } +} +void helper_double_27(double *buf); +void helper_double_27(double *buf) { + helper_double_27_recursive(buf, 27); +} +void helper_double_28_recursive(double *buf, int depth); +void helper_double_28_recursive(double *buf, int depth) { + if (depth == 14) { + for (int j = 0; j < 16384; j += 16) { + for (int k = 0; k < 2; k += 2) { + __asm__ volatile ( + "movupd (%0), %%xmm0\n" + "movupd (%1), %%xmm1\n" + "movupd (%2), %%xmm2\n" + "movupd (%3), %%xmm3\n" + "movupd (%4), %%xmm4\n" + "movupd (%5), %%xmm5\n" + "movupd (%6), %%xmm6\n" + "movupd (%7), %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "haddpd %%xmm8, %%xmm8\n" + "movapd %%xmm0, %%xmm9\n" + "hsubpd %%xmm9, %%xmm9\n" + "blendpd $1, %%xmm8, %%xmm9\n" + "movapd %%xmm9, %%xmm0\n" + "movapd %%xmm1, %%xmm8\n" + "haddpd %%xmm8, %%xmm8\n" + "movapd %%xmm1, %%xmm9\n" + "hsubpd %%xmm9, %%xmm9\n" + "blendpd $1, %%xmm8, %%xmm9\n" + "movapd %%xmm9, %%xmm1\n" + "movapd %%xmm2, %%xmm8\n" + "haddpd %%xmm8, %%xmm8\n" + "movapd %%xmm2, %%xmm9\n" + "hsubpd %%xmm9, %%xmm9\n" + "blendpd $1, %%xmm8, %%xmm9\n" + "movapd %%xmm9, %%xmm2\n" + "movapd %%xmm3, %%xmm8\n" + "haddpd %%xmm8, %%xmm8\n" + "movapd %%xmm3, %%xmm9\n" + "hsubpd %%xmm9, %%xmm9\n" + "blendpd $1, %%xmm8, %%xmm9\n" + "movapd %%xmm9, %%xmm3\n" + "movapd %%xmm4, %%xmm8\n" + "haddpd %%xmm8, %%xmm8\n" + "movapd %%xmm4, %%xmm9\n" + "hsubpd %%xmm9, %%xmm9\n" + "blendpd $1, %%xmm8, %%xmm9\n" + "movapd %%xmm9, %%xmm4\n" + "movapd %%xmm5, %%xmm8\n" + "haddpd %%xmm8, %%xmm8\n" + "movapd %%xmm5, %%xmm9\n" + "hsubpd %%xmm9, %%xmm9\n" + "blendpd $1, %%xmm8, %%xmm9\n" + "movapd %%xmm9, %%xmm5\n" + "movapd %%xmm6, %%xmm8\n" + "haddpd %%xmm8, %%xmm8\n" + "movapd %%xmm6, %%xmm9\n" + "hsubpd %%xmm9, %%xmm9\n" + "blendpd $1, %%xmm8, %%xmm9\n" + "movapd %%xmm9, %%xmm6\n" + "movapd %%xmm7, %%xmm8\n" + "haddpd %%xmm8, %%xmm8\n" + "movapd %%xmm7, %%xmm9\n" + "hsubpd %%xmm9, %%xmm9\n" + "blendpd $1, %%xmm8, %%xmm9\n" + "movapd %%xmm9, %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm9\n" + "addpd %%xmm1, %%xmm8\n" + "subpd %%xmm1, %%xmm9\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm11\n" + "addpd %%xmm3, %%xmm10\n" + "subpd %%xmm3, %%xmm11\n" + "movapd %%xmm4, %%xmm12\n" + "movapd %%xmm4, %%xmm13\n" + "addpd %%xmm5, %%xmm12\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm6, %%xmm14\n" + "movapd %%xmm6, %%xmm15\n" + "addpd %%xmm7, %%xmm14\n" + "subpd %%xmm7, %%xmm15\n" + "movapd %%xmm8, %%xmm0\n" + "movapd %%xmm8, %%xmm2\n" + "addpd %%xmm10, %%xmm0\n" + "subpd %%xmm10, %%xmm2\n" + "movapd %%xmm9, %%xmm1\n" + 
"movapd %%xmm9, %%xmm3\n" + "addpd %%xmm11, %%xmm1\n" + "subpd %%xmm11, %%xmm3\n" + "movapd %%xmm12, %%xmm4\n" + "movapd %%xmm12, %%xmm6\n" + "addpd %%xmm14, %%xmm4\n" + "subpd %%xmm14, %%xmm6\n" + "movapd %%xmm13, %%xmm5\n" + "movapd %%xmm13, %%xmm7\n" + "addpd %%xmm15, %%xmm5\n" + "subpd %%xmm15, %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm12\n" + "addpd %%xmm4, %%xmm8\n" + "subpd %%xmm4, %%xmm12\n" + "movapd %%xmm1, %%xmm9\n" + "movapd %%xmm1, %%xmm13\n" + "addpd %%xmm5, %%xmm9\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm14\n" + "addpd %%xmm6, %%xmm10\n" + "subpd %%xmm6, %%xmm14\n" + "movapd %%xmm3, %%xmm11\n" + "movapd %%xmm3, %%xmm15\n" + "addpd %%xmm7, %%xmm11\n" + "subpd %%xmm7, %%xmm15\n" + "movupd %%xmm8, (%0)\n" + "movupd %%xmm9, (%1)\n" + "movupd %%xmm10, (%2)\n" + "movupd %%xmm11, (%3)\n" + "movupd %%xmm12, (%4)\n" + "movupd %%xmm13, (%5)\n" + "movupd %%xmm14, (%6)\n" + "movupd %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 2), "r"(buf + j + k + 4), "r"(buf + j + k + 6), "r"(buf + j + k + 8), "r"(buf + j + k + 10), "r"(buf + j + k + 12), "r"(buf + j + k + 14) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + for (int j = 0; j < 16384; j += 128) { + for (int k = 0; k < 16; k += 2) { + __asm__ volatile ( + "movupd (%0), %%xmm0\n" + "movupd (%1), %%xmm1\n" + "movupd (%2), %%xmm2\n" + "movupd (%3), %%xmm3\n" + "movupd (%4), %%xmm4\n" + "movupd (%5), %%xmm5\n" + "movupd (%6), %%xmm6\n" + "movupd (%7), %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm9\n" + "addpd %%xmm1, %%xmm8\n" + "subpd %%xmm1, %%xmm9\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm11\n" + "addpd %%xmm3, %%xmm10\n" + "subpd %%xmm3, %%xmm11\n" + "movapd %%xmm4, %%xmm12\n" + "movapd %%xmm4, %%xmm13\n" + "addpd %%xmm5, %%xmm12\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm6, %%xmm14\n" + "movapd %%xmm6, %%xmm15\n" + "addpd %%xmm7, %%xmm14\n" + "subpd %%xmm7, %%xmm15\n" + "movapd %%xmm8, %%xmm0\n" + "movapd %%xmm8, %%xmm2\n" + "addpd %%xmm10, %%xmm0\n" + "subpd %%xmm10, %%xmm2\n" + "movapd %%xmm9, %%xmm1\n" + "movapd %%xmm9, %%xmm3\n" + "addpd %%xmm11, %%xmm1\n" + "subpd %%xmm11, %%xmm3\n" + "movapd %%xmm12, %%xmm4\n" + "movapd %%xmm12, %%xmm6\n" + "addpd %%xmm14, %%xmm4\n" + "subpd %%xmm14, %%xmm6\n" + "movapd %%xmm13, %%xmm5\n" + "movapd %%xmm13, %%xmm7\n" + "addpd %%xmm15, %%xmm5\n" + "subpd %%xmm15, %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm12\n" + "addpd %%xmm4, %%xmm8\n" + "subpd %%xmm4, %%xmm12\n" + "movapd %%xmm1, %%xmm9\n" + "movapd %%xmm1, %%xmm13\n" + "addpd %%xmm5, %%xmm9\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm14\n" + "addpd %%xmm6, %%xmm10\n" + "subpd %%xmm6, %%xmm14\n" + "movapd %%xmm3, %%xmm11\n" + "movapd %%xmm3, %%xmm15\n" + "addpd %%xmm7, %%xmm11\n" + "subpd %%xmm7, %%xmm15\n" + "movupd %%xmm8, (%0)\n" + "movupd %%xmm9, (%1)\n" + "movupd %%xmm10, (%2)\n" + "movupd %%xmm11, (%3)\n" + "movupd %%xmm12, (%4)\n" + "movupd %%xmm13, (%5)\n" + "movupd %%xmm14, (%6)\n" + "movupd %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 16), "r"(buf + j + k + 32), "r"(buf + j + k + 48), "r"(buf + j + k + 64), "r"(buf + j + k + 80), "r"(buf + j + k + 96), "r"(buf + j + k + 112) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", 
"memory" + ); + } + } + for (int j = 0; j < 16384; j += 1024) { + for (int k = 0; k < 128; k += 2) { + __asm__ volatile ( + "movupd (%0), %%xmm0\n" + "movupd (%1), %%xmm1\n" + "movupd (%2), %%xmm2\n" + "movupd (%3), %%xmm3\n" + "movupd (%4), %%xmm4\n" + "movupd (%5), %%xmm5\n" + "movupd (%6), %%xmm6\n" + "movupd (%7), %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm9\n" + "addpd %%xmm1, %%xmm8\n" + "subpd %%xmm1, %%xmm9\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm11\n" + "addpd %%xmm3, %%xmm10\n" + "subpd %%xmm3, %%xmm11\n" + "movapd %%xmm4, %%xmm12\n" + "movapd %%xmm4, %%xmm13\n" + "addpd %%xmm5, %%xmm12\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm6, %%xmm14\n" + "movapd %%xmm6, %%xmm15\n" + "addpd %%xmm7, %%xmm14\n" + "subpd %%xmm7, %%xmm15\n" + "movapd %%xmm8, %%xmm0\n" + "movapd %%xmm8, %%xmm2\n" + "addpd %%xmm10, %%xmm0\n" + "subpd %%xmm10, %%xmm2\n" + "movapd %%xmm9, %%xmm1\n" + "movapd %%xmm9, %%xmm3\n" + "addpd %%xmm11, %%xmm1\n" + "subpd %%xmm11, %%xmm3\n" + "movapd %%xmm12, %%xmm4\n" + "movapd %%xmm12, %%xmm6\n" + "addpd %%xmm14, %%xmm4\n" + "subpd %%xmm14, %%xmm6\n" + "movapd %%xmm13, %%xmm5\n" + "movapd %%xmm13, %%xmm7\n" + "addpd %%xmm15, %%xmm5\n" + "subpd %%xmm15, %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm12\n" + "addpd %%xmm4, %%xmm8\n" + "subpd %%xmm4, %%xmm12\n" + "movapd %%xmm1, %%xmm9\n" + "movapd %%xmm1, %%xmm13\n" + "addpd %%xmm5, %%xmm9\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm14\n" + "addpd %%xmm6, %%xmm10\n" + "subpd %%xmm6, %%xmm14\n" + "movapd %%xmm3, %%xmm11\n" + "movapd %%xmm3, %%xmm15\n" + "addpd %%xmm7, %%xmm11\n" + "subpd %%xmm7, %%xmm15\n" + "movupd %%xmm8, (%0)\n" + "movupd %%xmm9, (%1)\n" + "movupd %%xmm10, (%2)\n" + "movupd %%xmm11, (%3)\n" + "movupd %%xmm12, (%4)\n" + "movupd %%xmm13, (%5)\n" + "movupd %%xmm14, (%6)\n" + "movupd %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 128), "r"(buf + j + k + 256), "r"(buf + j + k + 384), "r"(buf + j + k + 512), "r"(buf + j + k + 640), "r"(buf + j + k + 768), "r"(buf + j + k + 896) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + for (int j = 0; j < 16384; j += 8192) { + for (int k = 0; k < 1024; k += 2) { + __asm__ volatile ( + "movupd (%0), %%xmm0\n" + "movupd (%1), %%xmm1\n" + "movupd (%2), %%xmm2\n" + "movupd (%3), %%xmm3\n" + "movupd (%4), %%xmm4\n" + "movupd (%5), %%xmm5\n" + "movupd (%6), %%xmm6\n" + "movupd (%7), %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm9\n" + "addpd %%xmm1, %%xmm8\n" + "subpd %%xmm1, %%xmm9\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm11\n" + "addpd %%xmm3, %%xmm10\n" + "subpd %%xmm3, %%xmm11\n" + "movapd %%xmm4, %%xmm12\n" + "movapd %%xmm4, %%xmm13\n" + "addpd %%xmm5, %%xmm12\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm6, %%xmm14\n" + "movapd %%xmm6, %%xmm15\n" + "addpd %%xmm7, %%xmm14\n" + "subpd %%xmm7, %%xmm15\n" + "movapd %%xmm8, %%xmm0\n" + "movapd %%xmm8, %%xmm2\n" + "addpd %%xmm10, %%xmm0\n" + "subpd %%xmm10, %%xmm2\n" + "movapd %%xmm9, %%xmm1\n" + "movapd %%xmm9, %%xmm3\n" + "addpd %%xmm11, %%xmm1\n" + "subpd %%xmm11, %%xmm3\n" + "movapd %%xmm12, %%xmm4\n" + "movapd %%xmm12, %%xmm6\n" + "addpd %%xmm14, %%xmm4\n" + "subpd %%xmm14, %%xmm6\n" + "movapd %%xmm13, %%xmm5\n" + "movapd %%xmm13, %%xmm7\n" + "addpd %%xmm15, %%xmm5\n" + "subpd %%xmm15, %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm12\n" + "addpd %%xmm4, 
%%xmm8\n" + "subpd %%xmm4, %%xmm12\n" + "movapd %%xmm1, %%xmm9\n" + "movapd %%xmm1, %%xmm13\n" + "addpd %%xmm5, %%xmm9\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm14\n" + "addpd %%xmm6, %%xmm10\n" + "subpd %%xmm6, %%xmm14\n" + "movapd %%xmm3, %%xmm11\n" + "movapd %%xmm3, %%xmm15\n" + "addpd %%xmm7, %%xmm11\n" + "subpd %%xmm7, %%xmm15\n" + "movupd %%xmm8, (%0)\n" + "movupd %%xmm9, (%1)\n" + "movupd %%xmm10, (%2)\n" + "movupd %%xmm11, (%3)\n" + "movupd %%xmm12, (%4)\n" + "movupd %%xmm13, (%5)\n" + "movupd %%xmm14, (%6)\n" + "movupd %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 1024), "r"(buf + j + k + 2048), "r"(buf + j + k + 3072), "r"(buf + j + k + 4096), "r"(buf + j + k + 5120), "r"(buf + j + k + 6144), "r"(buf + j + k + 7168) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + for (int j = 0; j < 16384; j += 16384) { + for (int k = 0; k < 8192; k += 2) { + __asm__ volatile ( + "movupd (%0), %%xmm0\n" + "movupd (%1), %%xmm1\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm9\n" + "addpd %%xmm1, %%xmm8\n" + "subpd %%xmm1, %%xmm9\n" + "movupd %%xmm8, (%0)\n" + "movupd %%xmm9, (%1)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 8192) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } + if (depth == 17) { + helper_double_28_recursive(buf + 0, 14); + helper_double_28_recursive(buf + 16384, 14); + helper_double_28_recursive(buf + 32768, 14); + helper_double_28_recursive(buf + 49152, 14); + helper_double_28_recursive(buf + 65536, 14); + helper_double_28_recursive(buf + 81920, 14); + helper_double_28_recursive(buf + 98304, 14); + helper_double_28_recursive(buf + 114688, 14); + for (int j = 0; j < 131072; j += 131072) { + for (int k = 0; k < 16384; k += 2) { + __asm__ volatile ( + "movupd (%0), %%xmm0\n" + "movupd (%1), %%xmm1\n" + "movupd (%2), %%xmm2\n" + "movupd (%3), %%xmm3\n" + "movupd (%4), %%xmm4\n" + "movupd (%5), %%xmm5\n" + "movupd (%6), %%xmm6\n" + "movupd (%7), %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm9\n" + "addpd %%xmm1, %%xmm8\n" + "subpd %%xmm1, %%xmm9\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm11\n" + "addpd %%xmm3, %%xmm10\n" + "subpd %%xmm3, %%xmm11\n" + "movapd %%xmm4, %%xmm12\n" + "movapd %%xmm4, %%xmm13\n" + "addpd %%xmm5, %%xmm12\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm6, %%xmm14\n" + "movapd %%xmm6, %%xmm15\n" + "addpd %%xmm7, %%xmm14\n" + "subpd %%xmm7, %%xmm15\n" + "movapd %%xmm8, %%xmm0\n" + "movapd %%xmm8, %%xmm2\n" + "addpd %%xmm10, %%xmm0\n" + "subpd %%xmm10, %%xmm2\n" + "movapd %%xmm9, %%xmm1\n" + "movapd %%xmm9, %%xmm3\n" + "addpd %%xmm11, %%xmm1\n" + "subpd %%xmm11, %%xmm3\n" + "movapd %%xmm12, %%xmm4\n" + "movapd %%xmm12, %%xmm6\n" + "addpd %%xmm14, %%xmm4\n" + "subpd %%xmm14, %%xmm6\n" + "movapd %%xmm13, %%xmm5\n" + "movapd %%xmm13, %%xmm7\n" + "addpd %%xmm15, %%xmm5\n" + "subpd %%xmm15, %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm12\n" + "addpd %%xmm4, %%xmm8\n" + "subpd %%xmm4, %%xmm12\n" + "movapd %%xmm1, %%xmm9\n" + "movapd %%xmm1, %%xmm13\n" + "addpd %%xmm5, %%xmm9\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm14\n" + "addpd %%xmm6, %%xmm10\n" + "subpd %%xmm6, %%xmm14\n" + "movapd %%xmm3, %%xmm11\n" + "movapd %%xmm3, %%xmm15\n" + "addpd %%xmm7, 
%%xmm11\n" + "subpd %%xmm7, %%xmm15\n" + "movupd %%xmm8, (%0)\n" + "movupd %%xmm9, (%1)\n" + "movupd %%xmm10, (%2)\n" + "movupd %%xmm11, (%3)\n" + "movupd %%xmm12, (%4)\n" + "movupd %%xmm13, (%5)\n" + "movupd %%xmm14, (%6)\n" + "movupd %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 16384), "r"(buf + j + k + 32768), "r"(buf + j + k + 49152), "r"(buf + j + k + 65536), "r"(buf + j + k + 81920), "r"(buf + j + k + 98304), "r"(buf + j + k + 114688) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } + if (depth == 20) { + helper_double_28_recursive(buf + 0, 17); + helper_double_28_recursive(buf + 131072, 17); + helper_double_28_recursive(buf + 262144, 17); + helper_double_28_recursive(buf + 393216, 17); + helper_double_28_recursive(buf + 524288, 17); + helper_double_28_recursive(buf + 655360, 17); + helper_double_28_recursive(buf + 786432, 17); + helper_double_28_recursive(buf + 917504, 17); + for (int j = 0; j < 1048576; j += 1048576) { + for (int k = 0; k < 131072; k += 2) { + __asm__ volatile ( + "movupd (%0), %%xmm0\n" + "movupd (%1), %%xmm1\n" + "movupd (%2), %%xmm2\n" + "movupd (%3), %%xmm3\n" + "movupd (%4), %%xmm4\n" + "movupd (%5), %%xmm5\n" + "movupd (%6), %%xmm6\n" + "movupd (%7), %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm9\n" + "addpd %%xmm1, %%xmm8\n" + "subpd %%xmm1, %%xmm9\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm11\n" + "addpd %%xmm3, %%xmm10\n" + "subpd %%xmm3, %%xmm11\n" + "movapd %%xmm4, %%xmm12\n" + "movapd %%xmm4, %%xmm13\n" + "addpd %%xmm5, %%xmm12\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm6, %%xmm14\n" + "movapd %%xmm6, %%xmm15\n" + "addpd %%xmm7, %%xmm14\n" + "subpd %%xmm7, %%xmm15\n" + "movapd %%xmm8, %%xmm0\n" + "movapd %%xmm8, %%xmm2\n" + "addpd %%xmm10, %%xmm0\n" + "subpd %%xmm10, %%xmm2\n" + "movapd %%xmm9, %%xmm1\n" + "movapd %%xmm9, %%xmm3\n" + "addpd %%xmm11, %%xmm1\n" + "subpd %%xmm11, %%xmm3\n" + "movapd %%xmm12, %%xmm4\n" + "movapd %%xmm12, %%xmm6\n" + "addpd %%xmm14, %%xmm4\n" + "subpd %%xmm14, %%xmm6\n" + "movapd %%xmm13, %%xmm5\n" + "movapd %%xmm13, %%xmm7\n" + "addpd %%xmm15, %%xmm5\n" + "subpd %%xmm15, %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm12\n" + "addpd %%xmm4, %%xmm8\n" + "subpd %%xmm4, %%xmm12\n" + "movapd %%xmm1, %%xmm9\n" + "movapd %%xmm1, %%xmm13\n" + "addpd %%xmm5, %%xmm9\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm14\n" + "addpd %%xmm6, %%xmm10\n" + "subpd %%xmm6, %%xmm14\n" + "movapd %%xmm3, %%xmm11\n" + "movapd %%xmm3, %%xmm15\n" + "addpd %%xmm7, %%xmm11\n" + "subpd %%xmm7, %%xmm15\n" + "movupd %%xmm8, (%0)\n" + "movupd %%xmm9, (%1)\n" + "movupd %%xmm10, (%2)\n" + "movupd %%xmm11, (%3)\n" + "movupd %%xmm12, (%4)\n" + "movupd %%xmm13, (%5)\n" + "movupd %%xmm14, (%6)\n" + "movupd %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 131072), "r"(buf + j + k + 262144), "r"(buf + j + k + 393216), "r"(buf + j + k + 524288), "r"(buf + j + k + 655360), "r"(buf + j + k + 786432), "r"(buf + j + k + 917504) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } + if (depth == 23) { + helper_double_28_recursive(buf + 0, 20); + helper_double_28_recursive(buf + 1048576, 20); + helper_double_28_recursive(buf + 2097152, 20); + helper_double_28_recursive(buf + 3145728, 20); + 
helper_double_28_recursive(buf + 4194304, 20); + helper_double_28_recursive(buf + 5242880, 20); + helper_double_28_recursive(buf + 6291456, 20); + helper_double_28_recursive(buf + 7340032, 20); + for (int j = 0; j < 8388608; j += 8388608) { + for (int k = 0; k < 1048576; k += 2) { + __asm__ volatile ( + "movupd (%0), %%xmm0\n" + "movupd (%1), %%xmm1\n" + "movupd (%2), %%xmm2\n" + "movupd (%3), %%xmm3\n" + "movupd (%4), %%xmm4\n" + "movupd (%5), %%xmm5\n" + "movupd (%6), %%xmm6\n" + "movupd (%7), %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm9\n" + "addpd %%xmm1, %%xmm8\n" + "subpd %%xmm1, %%xmm9\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm11\n" + "addpd %%xmm3, %%xmm10\n" + "subpd %%xmm3, %%xmm11\n" + "movapd %%xmm4, %%xmm12\n" + "movapd %%xmm4, %%xmm13\n" + "addpd %%xmm5, %%xmm12\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm6, %%xmm14\n" + "movapd %%xmm6, %%xmm15\n" + "addpd %%xmm7, %%xmm14\n" + "subpd %%xmm7, %%xmm15\n" + "movapd %%xmm8, %%xmm0\n" + "movapd %%xmm8, %%xmm2\n" + "addpd %%xmm10, %%xmm0\n" + "subpd %%xmm10, %%xmm2\n" + "movapd %%xmm9, %%xmm1\n" + "movapd %%xmm9, %%xmm3\n" + "addpd %%xmm11, %%xmm1\n" + "subpd %%xmm11, %%xmm3\n" + "movapd %%xmm12, %%xmm4\n" + "movapd %%xmm12, %%xmm6\n" + "addpd %%xmm14, %%xmm4\n" + "subpd %%xmm14, %%xmm6\n" + "movapd %%xmm13, %%xmm5\n" + "movapd %%xmm13, %%xmm7\n" + "addpd %%xmm15, %%xmm5\n" + "subpd %%xmm15, %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm12\n" + "addpd %%xmm4, %%xmm8\n" + "subpd %%xmm4, %%xmm12\n" + "movapd %%xmm1, %%xmm9\n" + "movapd %%xmm1, %%xmm13\n" + "addpd %%xmm5, %%xmm9\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm14\n" + "addpd %%xmm6, %%xmm10\n" + "subpd %%xmm6, %%xmm14\n" + "movapd %%xmm3, %%xmm11\n" + "movapd %%xmm3, %%xmm15\n" + "addpd %%xmm7, %%xmm11\n" + "subpd %%xmm7, %%xmm15\n" + "movupd %%xmm8, (%0)\n" + "movupd %%xmm9, (%1)\n" + "movupd %%xmm10, (%2)\n" + "movupd %%xmm11, (%3)\n" + "movupd %%xmm12, (%4)\n" + "movupd %%xmm13, (%5)\n" + "movupd %%xmm14, (%6)\n" + "movupd %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 1048576), "r"(buf + j + k + 2097152), "r"(buf + j + k + 3145728), "r"(buf + j + k + 4194304), "r"(buf + j + k + 5242880), "r"(buf + j + k + 6291456), "r"(buf + j + k + 7340032) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } + if (depth == 26) { + helper_double_28_recursive(buf + 0, 23); + helper_double_28_recursive(buf + 8388608, 23); + helper_double_28_recursive(buf + 16777216, 23); + helper_double_28_recursive(buf + 25165824, 23); + helper_double_28_recursive(buf + 33554432, 23); + helper_double_28_recursive(buf + 41943040, 23); + helper_double_28_recursive(buf + 50331648, 23); + helper_double_28_recursive(buf + 58720256, 23); + for (int j = 0; j < 67108864; j += 67108864) { + for (int k = 0; k < 8388608; k += 2) { + __asm__ volatile ( + "movupd (%0), %%xmm0\n" + "movupd (%1), %%xmm1\n" + "movupd (%2), %%xmm2\n" + "movupd (%3), %%xmm3\n" + "movupd (%4), %%xmm4\n" + "movupd (%5), %%xmm5\n" + "movupd (%6), %%xmm6\n" + "movupd (%7), %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm9\n" + "addpd %%xmm1, %%xmm8\n" + "subpd %%xmm1, %%xmm9\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm11\n" + "addpd %%xmm3, %%xmm10\n" + "subpd %%xmm3, %%xmm11\n" + "movapd %%xmm4, %%xmm12\n" + "movapd %%xmm4, %%xmm13\n" + "addpd %%xmm5, %%xmm12\n" + "subpd %%xmm5, 
%%xmm13\n" + "movapd %%xmm6, %%xmm14\n" + "movapd %%xmm6, %%xmm15\n" + "addpd %%xmm7, %%xmm14\n" + "subpd %%xmm7, %%xmm15\n" + "movapd %%xmm8, %%xmm0\n" + "movapd %%xmm8, %%xmm2\n" + "addpd %%xmm10, %%xmm0\n" + "subpd %%xmm10, %%xmm2\n" + "movapd %%xmm9, %%xmm1\n" + "movapd %%xmm9, %%xmm3\n" + "addpd %%xmm11, %%xmm1\n" + "subpd %%xmm11, %%xmm3\n" + "movapd %%xmm12, %%xmm4\n" + "movapd %%xmm12, %%xmm6\n" + "addpd %%xmm14, %%xmm4\n" + "subpd %%xmm14, %%xmm6\n" + "movapd %%xmm13, %%xmm5\n" + "movapd %%xmm13, %%xmm7\n" + "addpd %%xmm15, %%xmm5\n" + "subpd %%xmm15, %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm12\n" + "addpd %%xmm4, %%xmm8\n" + "subpd %%xmm4, %%xmm12\n" + "movapd %%xmm1, %%xmm9\n" + "movapd %%xmm1, %%xmm13\n" + "addpd %%xmm5, %%xmm9\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm14\n" + "addpd %%xmm6, %%xmm10\n" + "subpd %%xmm6, %%xmm14\n" + "movapd %%xmm3, %%xmm11\n" + "movapd %%xmm3, %%xmm15\n" + "addpd %%xmm7, %%xmm11\n" + "subpd %%xmm7, %%xmm15\n" + "movupd %%xmm8, (%0)\n" + "movupd %%xmm9, (%1)\n" + "movupd %%xmm10, (%2)\n" + "movupd %%xmm11, (%3)\n" + "movupd %%xmm12, (%4)\n" + "movupd %%xmm13, (%5)\n" + "movupd %%xmm14, (%6)\n" + "movupd %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 8388608), "r"(buf + j + k + 16777216), "r"(buf + j + k + 25165824), "r"(buf + j + k + 33554432), "r"(buf + j + k + 41943040), "r"(buf + j + k + 50331648), "r"(buf + j + k + 58720256) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } + if (depth == 28) { + helper_double_28_recursive(buf + 0, 26); + helper_double_28_recursive(buf + 67108864, 26); + helper_double_28_recursive(buf + 134217728, 26); + helper_double_28_recursive(buf + 201326592, 26); + for (int j = 0; j < 268435456; j += 268435456) { + for (int k = 0; k < 67108864; k += 2) { + __asm__ volatile ( + "movupd (%0), %%xmm0\n" + "movupd (%1), %%xmm1\n" + "movupd (%2), %%xmm2\n" + "movupd (%3), %%xmm3\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm9\n" + "addpd %%xmm1, %%xmm8\n" + "subpd %%xmm1, %%xmm9\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm11\n" + "addpd %%xmm3, %%xmm10\n" + "subpd %%xmm3, %%xmm11\n" + "movapd %%xmm8, %%xmm0\n" + "movapd %%xmm8, %%xmm2\n" + "addpd %%xmm10, %%xmm0\n" + "subpd %%xmm10, %%xmm2\n" + "movapd %%xmm9, %%xmm1\n" + "movapd %%xmm9, %%xmm3\n" + "addpd %%xmm11, %%xmm1\n" + "subpd %%xmm11, %%xmm3\n" + "movupd %%xmm0, (%0)\n" + "movupd %%xmm1, (%1)\n" + "movupd %%xmm2, (%2)\n" + "movupd %%xmm3, (%3)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 67108864), "r"(buf + j + k + 134217728), "r"(buf + j + k + 201326592) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } +} +void helper_double_28(double *buf); +void helper_double_28(double *buf) { + helper_double_28_recursive(buf, 28); +} +void helper_double_29_recursive(double *buf, int depth); +void helper_double_29_recursive(double *buf, int depth) { + if (depth == 9) { + for (int j = 0; j < 512; j += 16) { + for (int k = 0; k < 2; k += 2) { + __asm__ volatile ( + "movupd (%0), %%xmm0\n" + "movupd (%1), %%xmm1\n" + "movupd (%2), %%xmm2\n" + "movupd (%3), %%xmm3\n" + "movupd (%4), %%xmm4\n" + "movupd (%5), %%xmm5\n" + "movupd (%6), %%xmm6\n" + "movupd (%7), %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "haddpd %%xmm8, 
%%xmm8\n" + "movapd %%xmm0, %%xmm9\n" + "hsubpd %%xmm9, %%xmm9\n" + "blendpd $1, %%xmm8, %%xmm9\n" + "movapd %%xmm9, %%xmm0\n" + "movapd %%xmm1, %%xmm8\n" + "haddpd %%xmm8, %%xmm8\n" + "movapd %%xmm1, %%xmm9\n" + "hsubpd %%xmm9, %%xmm9\n" + "blendpd $1, %%xmm8, %%xmm9\n" + "movapd %%xmm9, %%xmm1\n" + "movapd %%xmm2, %%xmm8\n" + "haddpd %%xmm8, %%xmm8\n" + "movapd %%xmm2, %%xmm9\n" + "hsubpd %%xmm9, %%xmm9\n" + "blendpd $1, %%xmm8, %%xmm9\n" + "movapd %%xmm9, %%xmm2\n" + "movapd %%xmm3, %%xmm8\n" + "haddpd %%xmm8, %%xmm8\n" + "movapd %%xmm3, %%xmm9\n" + "hsubpd %%xmm9, %%xmm9\n" + "blendpd $1, %%xmm8, %%xmm9\n" + "movapd %%xmm9, %%xmm3\n" + "movapd %%xmm4, %%xmm8\n" + "haddpd %%xmm8, %%xmm8\n" + "movapd %%xmm4, %%xmm9\n" + "hsubpd %%xmm9, %%xmm9\n" + "blendpd $1, %%xmm8, %%xmm9\n" + "movapd %%xmm9, %%xmm4\n" + "movapd %%xmm5, %%xmm8\n" + "haddpd %%xmm8, %%xmm8\n" + "movapd %%xmm5, %%xmm9\n" + "hsubpd %%xmm9, %%xmm9\n" + "blendpd $1, %%xmm8, %%xmm9\n" + "movapd %%xmm9, %%xmm5\n" + "movapd %%xmm6, %%xmm8\n" + "haddpd %%xmm8, %%xmm8\n" + "movapd %%xmm6, %%xmm9\n" + "hsubpd %%xmm9, %%xmm9\n" + "blendpd $1, %%xmm8, %%xmm9\n" + "movapd %%xmm9, %%xmm6\n" + "movapd %%xmm7, %%xmm8\n" + "haddpd %%xmm8, %%xmm8\n" + "movapd %%xmm7, %%xmm9\n" + "hsubpd %%xmm9, %%xmm9\n" + "blendpd $1, %%xmm8, %%xmm9\n" + "movapd %%xmm9, %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm9\n" + "addpd %%xmm1, %%xmm8\n" + "subpd %%xmm1, %%xmm9\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm11\n" + "addpd %%xmm3, %%xmm10\n" + "subpd %%xmm3, %%xmm11\n" + "movapd %%xmm4, %%xmm12\n" + "movapd %%xmm4, %%xmm13\n" + "addpd %%xmm5, %%xmm12\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm6, %%xmm14\n" + "movapd %%xmm6, %%xmm15\n" + "addpd %%xmm7, %%xmm14\n" + "subpd %%xmm7, %%xmm15\n" + "movapd %%xmm8, %%xmm0\n" + "movapd %%xmm8, %%xmm2\n" + "addpd %%xmm10, %%xmm0\n" + "subpd %%xmm10, %%xmm2\n" + "movapd %%xmm9, %%xmm1\n" + "movapd %%xmm9, %%xmm3\n" + "addpd %%xmm11, %%xmm1\n" + "subpd %%xmm11, %%xmm3\n" + "movapd %%xmm12, %%xmm4\n" + "movapd %%xmm12, %%xmm6\n" + "addpd %%xmm14, %%xmm4\n" + "subpd %%xmm14, %%xmm6\n" + "movapd %%xmm13, %%xmm5\n" + "movapd %%xmm13, %%xmm7\n" + "addpd %%xmm15, %%xmm5\n" + "subpd %%xmm15, %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm12\n" + "addpd %%xmm4, %%xmm8\n" + "subpd %%xmm4, %%xmm12\n" + "movapd %%xmm1, %%xmm9\n" + "movapd %%xmm1, %%xmm13\n" + "addpd %%xmm5, %%xmm9\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm14\n" + "addpd %%xmm6, %%xmm10\n" + "subpd %%xmm6, %%xmm14\n" + "movapd %%xmm3, %%xmm11\n" + "movapd %%xmm3, %%xmm15\n" + "addpd %%xmm7, %%xmm11\n" + "subpd %%xmm7, %%xmm15\n" + "movupd %%xmm8, (%0)\n" + "movupd %%xmm9, (%1)\n" + "movupd %%xmm10, (%2)\n" + "movupd %%xmm11, (%3)\n" + "movupd %%xmm12, (%4)\n" + "movupd %%xmm13, (%5)\n" + "movupd %%xmm14, (%6)\n" + "movupd %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 2), "r"(buf + j + k + 4), "r"(buf + j + k + 6), "r"(buf + j + k + 8), "r"(buf + j + k + 10), "r"(buf + j + k + 12), "r"(buf + j + k + 14) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + for (int j = 0; j < 512; j += 128) { + for (int k = 0; k < 16; k += 2) { + __asm__ volatile ( + "movupd (%0), %%xmm0\n" + "movupd (%1), %%xmm1\n" + "movupd (%2), %%xmm2\n" + "movupd (%3), %%xmm3\n" + "movupd (%4), %%xmm4\n" + "movupd (%5), %%xmm5\n" + "movupd (%6), %%xmm6\n" + "movupd 
(%7), %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm9\n" + "addpd %%xmm1, %%xmm8\n" + "subpd %%xmm1, %%xmm9\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm11\n" + "addpd %%xmm3, %%xmm10\n" + "subpd %%xmm3, %%xmm11\n" + "movapd %%xmm4, %%xmm12\n" + "movapd %%xmm4, %%xmm13\n" + "addpd %%xmm5, %%xmm12\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm6, %%xmm14\n" + "movapd %%xmm6, %%xmm15\n" + "addpd %%xmm7, %%xmm14\n" + "subpd %%xmm7, %%xmm15\n" + "movapd %%xmm8, %%xmm0\n" + "movapd %%xmm8, %%xmm2\n" + "addpd %%xmm10, %%xmm0\n" + "subpd %%xmm10, %%xmm2\n" + "movapd %%xmm9, %%xmm1\n" + "movapd %%xmm9, %%xmm3\n" + "addpd %%xmm11, %%xmm1\n" + "subpd %%xmm11, %%xmm3\n" + "movapd %%xmm12, %%xmm4\n" + "movapd %%xmm12, %%xmm6\n" + "addpd %%xmm14, %%xmm4\n" + "subpd %%xmm14, %%xmm6\n" + "movapd %%xmm13, %%xmm5\n" + "movapd %%xmm13, %%xmm7\n" + "addpd %%xmm15, %%xmm5\n" + "subpd %%xmm15, %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm12\n" + "addpd %%xmm4, %%xmm8\n" + "subpd %%xmm4, %%xmm12\n" + "movapd %%xmm1, %%xmm9\n" + "movapd %%xmm1, %%xmm13\n" + "addpd %%xmm5, %%xmm9\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm14\n" + "addpd %%xmm6, %%xmm10\n" + "subpd %%xmm6, %%xmm14\n" + "movapd %%xmm3, %%xmm11\n" + "movapd %%xmm3, %%xmm15\n" + "addpd %%xmm7, %%xmm11\n" + "subpd %%xmm7, %%xmm15\n" + "movupd %%xmm8, (%0)\n" + "movupd %%xmm9, (%1)\n" + "movupd %%xmm10, (%2)\n" + "movupd %%xmm11, (%3)\n" + "movupd %%xmm12, (%4)\n" + "movupd %%xmm13, (%5)\n" + "movupd %%xmm14, (%6)\n" + "movupd %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 16), "r"(buf + j + k + 32), "r"(buf + j + k + 48), "r"(buf + j + k + 64), "r"(buf + j + k + 80), "r"(buf + j + k + 96), "r"(buf + j + k + 112) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + for (int j = 0; j < 512; j += 512) { + for (int k = 0; k < 128; k += 2) { + __asm__ volatile ( + "movupd (%0), %%xmm0\n" + "movupd (%1), %%xmm1\n" + "movupd (%2), %%xmm2\n" + "movupd (%3), %%xmm3\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm9\n" + "addpd %%xmm1, %%xmm8\n" + "subpd %%xmm1, %%xmm9\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm11\n" + "addpd %%xmm3, %%xmm10\n" + "subpd %%xmm3, %%xmm11\n" + "movapd %%xmm8, %%xmm0\n" + "movapd %%xmm8, %%xmm2\n" + "addpd %%xmm10, %%xmm0\n" + "subpd %%xmm10, %%xmm2\n" + "movapd %%xmm9, %%xmm1\n" + "movapd %%xmm9, %%xmm3\n" + "addpd %%xmm11, %%xmm1\n" + "subpd %%xmm11, %%xmm3\n" + "movupd %%xmm0, (%0)\n" + "movupd %%xmm1, (%1)\n" + "movupd %%xmm2, (%2)\n" + "movupd %%xmm3, (%3)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 128), "r"(buf + j + k + 256), "r"(buf + j + k + 384) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } + if (depth == 12) { + helper_double_29_recursive(buf + 0, 9); + helper_double_29_recursive(buf + 512, 9); + helper_double_29_recursive(buf + 1024, 9); + helper_double_29_recursive(buf + 1536, 9); + helper_double_29_recursive(buf + 2048, 9); + helper_double_29_recursive(buf + 2560, 9); + helper_double_29_recursive(buf + 3072, 9); + helper_double_29_recursive(buf + 3584, 9); + for (int j = 0; j < 4096; j += 4096) { + for (int k = 0; k < 512; k += 2) { + __asm__ volatile ( + "movupd (%0), %%xmm0\n" + "movupd (%1), %%xmm1\n" + "movupd (%2), %%xmm2\n" + "movupd 
(%3), %%xmm3\n" + "movupd (%4), %%xmm4\n" + "movupd (%5), %%xmm5\n" + "movupd (%6), %%xmm6\n" + "movupd (%7), %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm9\n" + "addpd %%xmm1, %%xmm8\n" + "subpd %%xmm1, %%xmm9\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm11\n" + "addpd %%xmm3, %%xmm10\n" + "subpd %%xmm3, %%xmm11\n" + "movapd %%xmm4, %%xmm12\n" + "movapd %%xmm4, %%xmm13\n" + "addpd %%xmm5, %%xmm12\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm6, %%xmm14\n" + "movapd %%xmm6, %%xmm15\n" + "addpd %%xmm7, %%xmm14\n" + "subpd %%xmm7, %%xmm15\n" + "movapd %%xmm8, %%xmm0\n" + "movapd %%xmm8, %%xmm2\n" + "addpd %%xmm10, %%xmm0\n" + "subpd %%xmm10, %%xmm2\n" + "movapd %%xmm9, %%xmm1\n" + "movapd %%xmm9, %%xmm3\n" + "addpd %%xmm11, %%xmm1\n" + "subpd %%xmm11, %%xmm3\n" + "movapd %%xmm12, %%xmm4\n" + "movapd %%xmm12, %%xmm6\n" + "addpd %%xmm14, %%xmm4\n" + "subpd %%xmm14, %%xmm6\n" + "movapd %%xmm13, %%xmm5\n" + "movapd %%xmm13, %%xmm7\n" + "addpd %%xmm15, %%xmm5\n" + "subpd %%xmm15, %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm12\n" + "addpd %%xmm4, %%xmm8\n" + "subpd %%xmm4, %%xmm12\n" + "movapd %%xmm1, %%xmm9\n" + "movapd %%xmm1, %%xmm13\n" + "addpd %%xmm5, %%xmm9\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm14\n" + "addpd %%xmm6, %%xmm10\n" + "subpd %%xmm6, %%xmm14\n" + "movapd %%xmm3, %%xmm11\n" + "movapd %%xmm3, %%xmm15\n" + "addpd %%xmm7, %%xmm11\n" + "subpd %%xmm7, %%xmm15\n" + "movupd %%xmm8, (%0)\n" + "movupd %%xmm9, (%1)\n" + "movupd %%xmm10, (%2)\n" + "movupd %%xmm11, (%3)\n" + "movupd %%xmm12, (%4)\n" + "movupd %%xmm13, (%5)\n" + "movupd %%xmm14, (%6)\n" + "movupd %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 512), "r"(buf + j + k + 1024), "r"(buf + j + k + 1536), "r"(buf + j + k + 2048), "r"(buf + j + k + 2560), "r"(buf + j + k + 3072), "r"(buf + j + k + 3584) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } + if (depth == 15) { + helper_double_29_recursive(buf + 0, 12); + helper_double_29_recursive(buf + 4096, 12); + helper_double_29_recursive(buf + 8192, 12); + helper_double_29_recursive(buf + 12288, 12); + helper_double_29_recursive(buf + 16384, 12); + helper_double_29_recursive(buf + 20480, 12); + helper_double_29_recursive(buf + 24576, 12); + helper_double_29_recursive(buf + 28672, 12); + for (int j = 0; j < 32768; j += 32768) { + for (int k = 0; k < 4096; k += 2) { + __asm__ volatile ( + "movupd (%0), %%xmm0\n" + "movupd (%1), %%xmm1\n" + "movupd (%2), %%xmm2\n" + "movupd (%3), %%xmm3\n" + "movupd (%4), %%xmm4\n" + "movupd (%5), %%xmm5\n" + "movupd (%6), %%xmm6\n" + "movupd (%7), %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm9\n" + "addpd %%xmm1, %%xmm8\n" + "subpd %%xmm1, %%xmm9\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm11\n" + "addpd %%xmm3, %%xmm10\n" + "subpd %%xmm3, %%xmm11\n" + "movapd %%xmm4, %%xmm12\n" + "movapd %%xmm4, %%xmm13\n" + "addpd %%xmm5, %%xmm12\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm6, %%xmm14\n" + "movapd %%xmm6, %%xmm15\n" + "addpd %%xmm7, %%xmm14\n" + "subpd %%xmm7, %%xmm15\n" + "movapd %%xmm8, %%xmm0\n" + "movapd %%xmm8, %%xmm2\n" + "addpd %%xmm10, %%xmm0\n" + "subpd %%xmm10, %%xmm2\n" + "movapd %%xmm9, %%xmm1\n" + "movapd %%xmm9, %%xmm3\n" + "addpd %%xmm11, %%xmm1\n" + "subpd %%xmm11, %%xmm3\n" + "movapd %%xmm12, %%xmm4\n" + "movapd %%xmm12, %%xmm6\n" + "addpd %%xmm14, %%xmm4\n" + "subpd 
%%xmm14, %%xmm6\n" + "movapd %%xmm13, %%xmm5\n" + "movapd %%xmm13, %%xmm7\n" + "addpd %%xmm15, %%xmm5\n" + "subpd %%xmm15, %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm12\n" + "addpd %%xmm4, %%xmm8\n" + "subpd %%xmm4, %%xmm12\n" + "movapd %%xmm1, %%xmm9\n" + "movapd %%xmm1, %%xmm13\n" + "addpd %%xmm5, %%xmm9\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm14\n" + "addpd %%xmm6, %%xmm10\n" + "subpd %%xmm6, %%xmm14\n" + "movapd %%xmm3, %%xmm11\n" + "movapd %%xmm3, %%xmm15\n" + "addpd %%xmm7, %%xmm11\n" + "subpd %%xmm7, %%xmm15\n" + "movupd %%xmm8, (%0)\n" + "movupd %%xmm9, (%1)\n" + "movupd %%xmm10, (%2)\n" + "movupd %%xmm11, (%3)\n" + "movupd %%xmm12, (%4)\n" + "movupd %%xmm13, (%5)\n" + "movupd %%xmm14, (%6)\n" + "movupd %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 4096), "r"(buf + j + k + 8192), "r"(buf + j + k + 12288), "r"(buf + j + k + 16384), "r"(buf + j + k + 20480), "r"(buf + j + k + 24576), "r"(buf + j + k + 28672) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } + if (depth == 18) { + helper_double_29_recursive(buf + 0, 15); + helper_double_29_recursive(buf + 32768, 15); + helper_double_29_recursive(buf + 65536, 15); + helper_double_29_recursive(buf + 98304, 15); + helper_double_29_recursive(buf + 131072, 15); + helper_double_29_recursive(buf + 163840, 15); + helper_double_29_recursive(buf + 196608, 15); + helper_double_29_recursive(buf + 229376, 15); + for (int j = 0; j < 262144; j += 262144) { + for (int k = 0; k < 32768; k += 2) { + __asm__ volatile ( + "movupd (%0), %%xmm0\n" + "movupd (%1), %%xmm1\n" + "movupd (%2), %%xmm2\n" + "movupd (%3), %%xmm3\n" + "movupd (%4), %%xmm4\n" + "movupd (%5), %%xmm5\n" + "movupd (%6), %%xmm6\n" + "movupd (%7), %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm9\n" + "addpd %%xmm1, %%xmm8\n" + "subpd %%xmm1, %%xmm9\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm11\n" + "addpd %%xmm3, %%xmm10\n" + "subpd %%xmm3, %%xmm11\n" + "movapd %%xmm4, %%xmm12\n" + "movapd %%xmm4, %%xmm13\n" + "addpd %%xmm5, %%xmm12\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm6, %%xmm14\n" + "movapd %%xmm6, %%xmm15\n" + "addpd %%xmm7, %%xmm14\n" + "subpd %%xmm7, %%xmm15\n" + "movapd %%xmm8, %%xmm0\n" + "movapd %%xmm8, %%xmm2\n" + "addpd %%xmm10, %%xmm0\n" + "subpd %%xmm10, %%xmm2\n" + "movapd %%xmm9, %%xmm1\n" + "movapd %%xmm9, %%xmm3\n" + "addpd %%xmm11, %%xmm1\n" + "subpd %%xmm11, %%xmm3\n" + "movapd %%xmm12, %%xmm4\n" + "movapd %%xmm12, %%xmm6\n" + "addpd %%xmm14, %%xmm4\n" + "subpd %%xmm14, %%xmm6\n" + "movapd %%xmm13, %%xmm5\n" + "movapd %%xmm13, %%xmm7\n" + "addpd %%xmm15, %%xmm5\n" + "subpd %%xmm15, %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm12\n" + "addpd %%xmm4, %%xmm8\n" + "subpd %%xmm4, %%xmm12\n" + "movapd %%xmm1, %%xmm9\n" + "movapd %%xmm1, %%xmm13\n" + "addpd %%xmm5, %%xmm9\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm14\n" + "addpd %%xmm6, %%xmm10\n" + "subpd %%xmm6, %%xmm14\n" + "movapd %%xmm3, %%xmm11\n" + "movapd %%xmm3, %%xmm15\n" + "addpd %%xmm7, %%xmm11\n" + "subpd %%xmm7, %%xmm15\n" + "movupd %%xmm8, (%0)\n" + "movupd %%xmm9, (%1)\n" + "movupd %%xmm10, (%2)\n" + "movupd %%xmm11, (%3)\n" + "movupd %%xmm12, (%4)\n" + "movupd %%xmm13, (%5)\n" + "movupd %%xmm14, (%6)\n" + "movupd %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 32768), "r"(buf + j + k + 65536), 
"r"(buf + j + k + 98304), "r"(buf + j + k + 131072), "r"(buf + j + k + 163840), "r"(buf + j + k + 196608), "r"(buf + j + k + 229376) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } + if (depth == 21) { + helper_double_29_recursive(buf + 0, 18); + helper_double_29_recursive(buf + 262144, 18); + helper_double_29_recursive(buf + 524288, 18); + helper_double_29_recursive(buf + 786432, 18); + helper_double_29_recursive(buf + 1048576, 18); + helper_double_29_recursive(buf + 1310720, 18); + helper_double_29_recursive(buf + 1572864, 18); + helper_double_29_recursive(buf + 1835008, 18); + for (int j = 0; j < 2097152; j += 2097152) { + for (int k = 0; k < 262144; k += 2) { + __asm__ volatile ( + "movupd (%0), %%xmm0\n" + "movupd (%1), %%xmm1\n" + "movupd (%2), %%xmm2\n" + "movupd (%3), %%xmm3\n" + "movupd (%4), %%xmm4\n" + "movupd (%5), %%xmm5\n" + "movupd (%6), %%xmm6\n" + "movupd (%7), %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm9\n" + "addpd %%xmm1, %%xmm8\n" + "subpd %%xmm1, %%xmm9\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm11\n" + "addpd %%xmm3, %%xmm10\n" + "subpd %%xmm3, %%xmm11\n" + "movapd %%xmm4, %%xmm12\n" + "movapd %%xmm4, %%xmm13\n" + "addpd %%xmm5, %%xmm12\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm6, %%xmm14\n" + "movapd %%xmm6, %%xmm15\n" + "addpd %%xmm7, %%xmm14\n" + "subpd %%xmm7, %%xmm15\n" + "movapd %%xmm8, %%xmm0\n" + "movapd %%xmm8, %%xmm2\n" + "addpd %%xmm10, %%xmm0\n" + "subpd %%xmm10, %%xmm2\n" + "movapd %%xmm9, %%xmm1\n" + "movapd %%xmm9, %%xmm3\n" + "addpd %%xmm11, %%xmm1\n" + "subpd %%xmm11, %%xmm3\n" + "movapd %%xmm12, %%xmm4\n" + "movapd %%xmm12, %%xmm6\n" + "addpd %%xmm14, %%xmm4\n" + "subpd %%xmm14, %%xmm6\n" + "movapd %%xmm13, %%xmm5\n" + "movapd %%xmm13, %%xmm7\n" + "addpd %%xmm15, %%xmm5\n" + "subpd %%xmm15, %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm12\n" + "addpd %%xmm4, %%xmm8\n" + "subpd %%xmm4, %%xmm12\n" + "movapd %%xmm1, %%xmm9\n" + "movapd %%xmm1, %%xmm13\n" + "addpd %%xmm5, %%xmm9\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm14\n" + "addpd %%xmm6, %%xmm10\n" + "subpd %%xmm6, %%xmm14\n" + "movapd %%xmm3, %%xmm11\n" + "movapd %%xmm3, %%xmm15\n" + "addpd %%xmm7, %%xmm11\n" + "subpd %%xmm7, %%xmm15\n" + "movupd %%xmm8, (%0)\n" + "movupd %%xmm9, (%1)\n" + "movupd %%xmm10, (%2)\n" + "movupd %%xmm11, (%3)\n" + "movupd %%xmm12, (%4)\n" + "movupd %%xmm13, (%5)\n" + "movupd %%xmm14, (%6)\n" + "movupd %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 262144), "r"(buf + j + k + 524288), "r"(buf + j + k + 786432), "r"(buf + j + k + 1048576), "r"(buf + j + k + 1310720), "r"(buf + j + k + 1572864), "r"(buf + j + k + 1835008) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } + if (depth == 24) { + helper_double_29_recursive(buf + 0, 21); + helper_double_29_recursive(buf + 2097152, 21); + helper_double_29_recursive(buf + 4194304, 21); + helper_double_29_recursive(buf + 6291456, 21); + helper_double_29_recursive(buf + 8388608, 21); + helper_double_29_recursive(buf + 10485760, 21); + helper_double_29_recursive(buf + 12582912, 21); + helper_double_29_recursive(buf + 14680064, 21); + for (int j = 0; j < 16777216; j += 16777216) { + for (int k = 0; k < 2097152; k += 2) { + __asm__ volatile ( + "movupd (%0), 
%%xmm0\n" + "movupd (%1), %%xmm1\n" + "movupd (%2), %%xmm2\n" + "movupd (%3), %%xmm3\n" + "movupd (%4), %%xmm4\n" + "movupd (%5), %%xmm5\n" + "movupd (%6), %%xmm6\n" + "movupd (%7), %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm9\n" + "addpd %%xmm1, %%xmm8\n" + "subpd %%xmm1, %%xmm9\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm11\n" + "addpd %%xmm3, %%xmm10\n" + "subpd %%xmm3, %%xmm11\n" + "movapd %%xmm4, %%xmm12\n" + "movapd %%xmm4, %%xmm13\n" + "addpd %%xmm5, %%xmm12\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm6, %%xmm14\n" + "movapd %%xmm6, %%xmm15\n" + "addpd %%xmm7, %%xmm14\n" + "subpd %%xmm7, %%xmm15\n" + "movapd %%xmm8, %%xmm0\n" + "movapd %%xmm8, %%xmm2\n" + "addpd %%xmm10, %%xmm0\n" + "subpd %%xmm10, %%xmm2\n" + "movapd %%xmm9, %%xmm1\n" + "movapd %%xmm9, %%xmm3\n" + "addpd %%xmm11, %%xmm1\n" + "subpd %%xmm11, %%xmm3\n" + "movapd %%xmm12, %%xmm4\n" + "movapd %%xmm12, %%xmm6\n" + "addpd %%xmm14, %%xmm4\n" + "subpd %%xmm14, %%xmm6\n" + "movapd %%xmm13, %%xmm5\n" + "movapd %%xmm13, %%xmm7\n" + "addpd %%xmm15, %%xmm5\n" + "subpd %%xmm15, %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm12\n" + "addpd %%xmm4, %%xmm8\n" + "subpd %%xmm4, %%xmm12\n" + "movapd %%xmm1, %%xmm9\n" + "movapd %%xmm1, %%xmm13\n" + "addpd %%xmm5, %%xmm9\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm14\n" + "addpd %%xmm6, %%xmm10\n" + "subpd %%xmm6, %%xmm14\n" + "movapd %%xmm3, %%xmm11\n" + "movapd %%xmm3, %%xmm15\n" + "addpd %%xmm7, %%xmm11\n" + "subpd %%xmm7, %%xmm15\n" + "movupd %%xmm8, (%0)\n" + "movupd %%xmm9, (%1)\n" + "movupd %%xmm10, (%2)\n" + "movupd %%xmm11, (%3)\n" + "movupd %%xmm12, (%4)\n" + "movupd %%xmm13, (%5)\n" + "movupd %%xmm14, (%6)\n" + "movupd %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 2097152), "r"(buf + j + k + 4194304), "r"(buf + j + k + 6291456), "r"(buf + j + k + 8388608), "r"(buf + j + k + 10485760), "r"(buf + j + k + 12582912), "r"(buf + j + k + 14680064) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } + if (depth == 27) { + helper_double_29_recursive(buf + 0, 24); + helper_double_29_recursive(buf + 16777216, 24); + helper_double_29_recursive(buf + 33554432, 24); + helper_double_29_recursive(buf + 50331648, 24); + helper_double_29_recursive(buf + 67108864, 24); + helper_double_29_recursive(buf + 83886080, 24); + helper_double_29_recursive(buf + 100663296, 24); + helper_double_29_recursive(buf + 117440512, 24); + for (int j = 0; j < 134217728; j += 134217728) { + for (int k = 0; k < 16777216; k += 2) { + __asm__ volatile ( + "movupd (%0), %%xmm0\n" + "movupd (%1), %%xmm1\n" + "movupd (%2), %%xmm2\n" + "movupd (%3), %%xmm3\n" + "movupd (%4), %%xmm4\n" + "movupd (%5), %%xmm5\n" + "movupd (%6), %%xmm6\n" + "movupd (%7), %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm9\n" + "addpd %%xmm1, %%xmm8\n" + "subpd %%xmm1, %%xmm9\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm11\n" + "addpd %%xmm3, %%xmm10\n" + "subpd %%xmm3, %%xmm11\n" + "movapd %%xmm4, %%xmm12\n" + "movapd %%xmm4, %%xmm13\n" + "addpd %%xmm5, %%xmm12\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm6, %%xmm14\n" + "movapd %%xmm6, %%xmm15\n" + "addpd %%xmm7, %%xmm14\n" + "subpd %%xmm7, %%xmm15\n" + "movapd %%xmm8, %%xmm0\n" + "movapd %%xmm8, %%xmm2\n" + "addpd %%xmm10, %%xmm0\n" + "subpd %%xmm10, %%xmm2\n" + "movapd %%xmm9, %%xmm1\n" + "movapd %%xmm9, %%xmm3\n" + "addpd %%xmm11, 
%%xmm1\n" + "subpd %%xmm11, %%xmm3\n" + "movapd %%xmm12, %%xmm4\n" + "movapd %%xmm12, %%xmm6\n" + "addpd %%xmm14, %%xmm4\n" + "subpd %%xmm14, %%xmm6\n" + "movapd %%xmm13, %%xmm5\n" + "movapd %%xmm13, %%xmm7\n" + "addpd %%xmm15, %%xmm5\n" + "subpd %%xmm15, %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm12\n" + "addpd %%xmm4, %%xmm8\n" + "subpd %%xmm4, %%xmm12\n" + "movapd %%xmm1, %%xmm9\n" + "movapd %%xmm1, %%xmm13\n" + "addpd %%xmm5, %%xmm9\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm14\n" + "addpd %%xmm6, %%xmm10\n" + "subpd %%xmm6, %%xmm14\n" + "movapd %%xmm3, %%xmm11\n" + "movapd %%xmm3, %%xmm15\n" + "addpd %%xmm7, %%xmm11\n" + "subpd %%xmm7, %%xmm15\n" + "movupd %%xmm8, (%0)\n" + "movupd %%xmm9, (%1)\n" + "movupd %%xmm10, (%2)\n" + "movupd %%xmm11, (%3)\n" + "movupd %%xmm12, (%4)\n" + "movupd %%xmm13, (%5)\n" + "movupd %%xmm14, (%6)\n" + "movupd %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 16777216), "r"(buf + j + k + 33554432), "r"(buf + j + k + 50331648), "r"(buf + j + k + 67108864), "r"(buf + j + k + 83886080), "r"(buf + j + k + 100663296), "r"(buf + j + k + 117440512) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } + if (depth == 29) { + helper_double_29_recursive(buf + 0, 27); + helper_double_29_recursive(buf + 134217728, 27); + helper_double_29_recursive(buf + 268435456, 27); + helper_double_29_recursive(buf + 402653184, 27); + for (int j = 0; j < 536870912; j += 536870912) { + for (int k = 0; k < 134217728; k += 2) { + __asm__ volatile ( + "movupd (%0), %%xmm0\n" + "movupd (%1), %%xmm1\n" + "movupd (%2), %%xmm2\n" + "movupd (%3), %%xmm3\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm9\n" + "addpd %%xmm1, %%xmm8\n" + "subpd %%xmm1, %%xmm9\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm11\n" + "addpd %%xmm3, %%xmm10\n" + "subpd %%xmm3, %%xmm11\n" + "movapd %%xmm8, %%xmm0\n" + "movapd %%xmm8, %%xmm2\n" + "addpd %%xmm10, %%xmm0\n" + "subpd %%xmm10, %%xmm2\n" + "movapd %%xmm9, %%xmm1\n" + "movapd %%xmm9, %%xmm3\n" + "addpd %%xmm11, %%xmm1\n" + "subpd %%xmm11, %%xmm3\n" + "movupd %%xmm0, (%0)\n" + "movupd %%xmm1, (%1)\n" + "movupd %%xmm2, (%2)\n" + "movupd %%xmm3, (%3)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 134217728), "r"(buf + j + k + 268435456), "r"(buf + j + k + 402653184) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } +} +void helper_double_29(double *buf); +void helper_double_29(double *buf) { + helper_double_29_recursive(buf, 29); +} +void helper_double_30_recursive(double *buf, int depth); +void helper_double_30_recursive(double *buf, int depth) { + if (depth == 3) { + for (int j = 0; j < 8; j += 8) { + for (int k = 0; k < 2; k += 2) { + __asm__ volatile ( + "movupd (%0), %%xmm0\n" + "movupd (%1), %%xmm1\n" + "movupd (%2), %%xmm2\n" + "movupd (%3), %%xmm3\n" + "movapd %%xmm0, %%xmm8\n" + "haddpd %%xmm8, %%xmm8\n" + "movapd %%xmm0, %%xmm9\n" + "hsubpd %%xmm9, %%xmm9\n" + "blendpd $1, %%xmm8, %%xmm9\n" + "movapd %%xmm9, %%xmm0\n" + "movapd %%xmm1, %%xmm8\n" + "haddpd %%xmm8, %%xmm8\n" + "movapd %%xmm1, %%xmm9\n" + "hsubpd %%xmm9, %%xmm9\n" + "blendpd $1, %%xmm8, %%xmm9\n" + "movapd %%xmm9, %%xmm1\n" + "movapd %%xmm2, %%xmm8\n" + "haddpd %%xmm8, %%xmm8\n" + "movapd %%xmm2, %%xmm9\n" + "hsubpd %%xmm9, %%xmm9\n" + 
"blendpd $1, %%xmm8, %%xmm9\n" + "movapd %%xmm9, %%xmm2\n" + "movapd %%xmm3, %%xmm8\n" + "haddpd %%xmm8, %%xmm8\n" + "movapd %%xmm3, %%xmm9\n" + "hsubpd %%xmm9, %%xmm9\n" + "blendpd $1, %%xmm8, %%xmm9\n" + "movapd %%xmm9, %%xmm3\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm9\n" + "addpd %%xmm1, %%xmm8\n" + "subpd %%xmm1, %%xmm9\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm11\n" + "addpd %%xmm3, %%xmm10\n" + "subpd %%xmm3, %%xmm11\n" + "movapd %%xmm8, %%xmm0\n" + "movapd %%xmm8, %%xmm2\n" + "addpd %%xmm10, %%xmm0\n" + "subpd %%xmm10, %%xmm2\n" + "movapd %%xmm9, %%xmm1\n" + "movapd %%xmm9, %%xmm3\n" + "addpd %%xmm11, %%xmm1\n" + "subpd %%xmm11, %%xmm3\n" + "movupd %%xmm0, (%0)\n" + "movupd %%xmm1, (%1)\n" + "movupd %%xmm2, (%2)\n" + "movupd %%xmm3, (%3)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 2), "r"(buf + j + k + 4), "r"(buf + j + k + 6) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } + if (depth == 6) { + helper_double_30_recursive(buf + 0, 3); + helper_double_30_recursive(buf + 8, 3); + helper_double_30_recursive(buf + 16, 3); + helper_double_30_recursive(buf + 24, 3); + helper_double_30_recursive(buf + 32, 3); + helper_double_30_recursive(buf + 40, 3); + helper_double_30_recursive(buf + 48, 3); + helper_double_30_recursive(buf + 56, 3); + for (int j = 0; j < 64; j += 64) { + for (int k = 0; k < 8; k += 2) { + __asm__ volatile ( + "movupd (%0), %%xmm0\n" + "movupd (%1), %%xmm1\n" + "movupd (%2), %%xmm2\n" + "movupd (%3), %%xmm3\n" + "movupd (%4), %%xmm4\n" + "movupd (%5), %%xmm5\n" + "movupd (%6), %%xmm6\n" + "movupd (%7), %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm9\n" + "addpd %%xmm1, %%xmm8\n" + "subpd %%xmm1, %%xmm9\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm11\n" + "addpd %%xmm3, %%xmm10\n" + "subpd %%xmm3, %%xmm11\n" + "movapd %%xmm4, %%xmm12\n" + "movapd %%xmm4, %%xmm13\n" + "addpd %%xmm5, %%xmm12\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm6, %%xmm14\n" + "movapd %%xmm6, %%xmm15\n" + "addpd %%xmm7, %%xmm14\n" + "subpd %%xmm7, %%xmm15\n" + "movapd %%xmm8, %%xmm0\n" + "movapd %%xmm8, %%xmm2\n" + "addpd %%xmm10, %%xmm0\n" + "subpd %%xmm10, %%xmm2\n" + "movapd %%xmm9, %%xmm1\n" + "movapd %%xmm9, %%xmm3\n" + "addpd %%xmm11, %%xmm1\n" + "subpd %%xmm11, %%xmm3\n" + "movapd %%xmm12, %%xmm4\n" + "movapd %%xmm12, %%xmm6\n" + "addpd %%xmm14, %%xmm4\n" + "subpd %%xmm14, %%xmm6\n" + "movapd %%xmm13, %%xmm5\n" + "movapd %%xmm13, %%xmm7\n" + "addpd %%xmm15, %%xmm5\n" + "subpd %%xmm15, %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm12\n" + "addpd %%xmm4, %%xmm8\n" + "subpd %%xmm4, %%xmm12\n" + "movapd %%xmm1, %%xmm9\n" + "movapd %%xmm1, %%xmm13\n" + "addpd %%xmm5, %%xmm9\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm14\n" + "addpd %%xmm6, %%xmm10\n" + "subpd %%xmm6, %%xmm14\n" + "movapd %%xmm3, %%xmm11\n" + "movapd %%xmm3, %%xmm15\n" + "addpd %%xmm7, %%xmm11\n" + "subpd %%xmm7, %%xmm15\n" + "movupd %%xmm8, (%0)\n" + "movupd %%xmm9, (%1)\n" + "movupd %%xmm10, (%2)\n" + "movupd %%xmm11, (%3)\n" + "movupd %%xmm12, (%4)\n" + "movupd %%xmm13, (%5)\n" + "movupd %%xmm14, (%6)\n" + "movupd %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 8), "r"(buf + j + k + 16), "r"(buf + j + k + 24), "r"(buf + j + k + 32), "r"(buf + j + k + 40), "r"(buf + j + k + 48), "r"(buf + j + k + 56) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", 
"%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } + if (depth == 9) { + helper_double_30_recursive(buf + 0, 6); + helper_double_30_recursive(buf + 64, 6); + helper_double_30_recursive(buf + 128, 6); + helper_double_30_recursive(buf + 192, 6); + helper_double_30_recursive(buf + 256, 6); + helper_double_30_recursive(buf + 320, 6); + helper_double_30_recursive(buf + 384, 6); + helper_double_30_recursive(buf + 448, 6); + for (int j = 0; j < 512; j += 512) { + for (int k = 0; k < 64; k += 2) { + __asm__ volatile ( + "movupd (%0), %%xmm0\n" + "movupd (%1), %%xmm1\n" + "movupd (%2), %%xmm2\n" + "movupd (%3), %%xmm3\n" + "movupd (%4), %%xmm4\n" + "movupd (%5), %%xmm5\n" + "movupd (%6), %%xmm6\n" + "movupd (%7), %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm9\n" + "addpd %%xmm1, %%xmm8\n" + "subpd %%xmm1, %%xmm9\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm11\n" + "addpd %%xmm3, %%xmm10\n" + "subpd %%xmm3, %%xmm11\n" + "movapd %%xmm4, %%xmm12\n" + "movapd %%xmm4, %%xmm13\n" + "addpd %%xmm5, %%xmm12\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm6, %%xmm14\n" + "movapd %%xmm6, %%xmm15\n" + "addpd %%xmm7, %%xmm14\n" + "subpd %%xmm7, %%xmm15\n" + "movapd %%xmm8, %%xmm0\n" + "movapd %%xmm8, %%xmm2\n" + "addpd %%xmm10, %%xmm0\n" + "subpd %%xmm10, %%xmm2\n" + "movapd %%xmm9, %%xmm1\n" + "movapd %%xmm9, %%xmm3\n" + "addpd %%xmm11, %%xmm1\n" + "subpd %%xmm11, %%xmm3\n" + "movapd %%xmm12, %%xmm4\n" + "movapd %%xmm12, %%xmm6\n" + "addpd %%xmm14, %%xmm4\n" + "subpd %%xmm14, %%xmm6\n" + "movapd %%xmm13, %%xmm5\n" + "movapd %%xmm13, %%xmm7\n" + "addpd %%xmm15, %%xmm5\n" + "subpd %%xmm15, %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm12\n" + "addpd %%xmm4, %%xmm8\n" + "subpd %%xmm4, %%xmm12\n" + "movapd %%xmm1, %%xmm9\n" + "movapd %%xmm1, %%xmm13\n" + "addpd %%xmm5, %%xmm9\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm14\n" + "addpd %%xmm6, %%xmm10\n" + "subpd %%xmm6, %%xmm14\n" + "movapd %%xmm3, %%xmm11\n" + "movapd %%xmm3, %%xmm15\n" + "addpd %%xmm7, %%xmm11\n" + "subpd %%xmm7, %%xmm15\n" + "movupd %%xmm8, (%0)\n" + "movupd %%xmm9, (%1)\n" + "movupd %%xmm10, (%2)\n" + "movupd %%xmm11, (%3)\n" + "movupd %%xmm12, (%4)\n" + "movupd %%xmm13, (%5)\n" + "movupd %%xmm14, (%6)\n" + "movupd %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 64), "r"(buf + j + k + 128), "r"(buf + j + k + 192), "r"(buf + j + k + 256), "r"(buf + j + k + 320), "r"(buf + j + k + 384), "r"(buf + j + k + 448) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } + if (depth == 12) { + helper_double_30_recursive(buf + 0, 9); + helper_double_30_recursive(buf + 512, 9); + helper_double_30_recursive(buf + 1024, 9); + helper_double_30_recursive(buf + 1536, 9); + helper_double_30_recursive(buf + 2048, 9); + helper_double_30_recursive(buf + 2560, 9); + helper_double_30_recursive(buf + 3072, 9); + helper_double_30_recursive(buf + 3584, 9); + for (int j = 0; j < 4096; j += 4096) { + for (int k = 0; k < 512; k += 2) { + __asm__ volatile ( + "movupd (%0), %%xmm0\n" + "movupd (%1), %%xmm1\n" + "movupd (%2), %%xmm2\n" + "movupd (%3), %%xmm3\n" + "movupd (%4), %%xmm4\n" + "movupd (%5), %%xmm5\n" + "movupd (%6), %%xmm6\n" + "movupd (%7), %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm9\n" + "addpd %%xmm1, %%xmm8\n" + "subpd %%xmm1, %%xmm9\n" + "movapd 
%%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm11\n" + "addpd %%xmm3, %%xmm10\n" + "subpd %%xmm3, %%xmm11\n" + "movapd %%xmm4, %%xmm12\n" + "movapd %%xmm4, %%xmm13\n" + "addpd %%xmm5, %%xmm12\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm6, %%xmm14\n" + "movapd %%xmm6, %%xmm15\n" + "addpd %%xmm7, %%xmm14\n" + "subpd %%xmm7, %%xmm15\n" + "movapd %%xmm8, %%xmm0\n" + "movapd %%xmm8, %%xmm2\n" + "addpd %%xmm10, %%xmm0\n" + "subpd %%xmm10, %%xmm2\n" + "movapd %%xmm9, %%xmm1\n" + "movapd %%xmm9, %%xmm3\n" + "addpd %%xmm11, %%xmm1\n" + "subpd %%xmm11, %%xmm3\n" + "movapd %%xmm12, %%xmm4\n" + "movapd %%xmm12, %%xmm6\n" + "addpd %%xmm14, %%xmm4\n" + "subpd %%xmm14, %%xmm6\n" + "movapd %%xmm13, %%xmm5\n" + "movapd %%xmm13, %%xmm7\n" + "addpd %%xmm15, %%xmm5\n" + "subpd %%xmm15, %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm12\n" + "addpd %%xmm4, %%xmm8\n" + "subpd %%xmm4, %%xmm12\n" + "movapd %%xmm1, %%xmm9\n" + "movapd %%xmm1, %%xmm13\n" + "addpd %%xmm5, %%xmm9\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm14\n" + "addpd %%xmm6, %%xmm10\n" + "subpd %%xmm6, %%xmm14\n" + "movapd %%xmm3, %%xmm11\n" + "movapd %%xmm3, %%xmm15\n" + "addpd %%xmm7, %%xmm11\n" + "subpd %%xmm7, %%xmm15\n" + "movupd %%xmm8, (%0)\n" + "movupd %%xmm9, (%1)\n" + "movupd %%xmm10, (%2)\n" + "movupd %%xmm11, (%3)\n" + "movupd %%xmm12, (%4)\n" + "movupd %%xmm13, (%5)\n" + "movupd %%xmm14, (%6)\n" + "movupd %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 512), "r"(buf + j + k + 1024), "r"(buf + j + k + 1536), "r"(buf + j + k + 2048), "r"(buf + j + k + 2560), "r"(buf + j + k + 3072), "r"(buf + j + k + 3584) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } + if (depth == 15) { + helper_double_30_recursive(buf + 0, 12); + helper_double_30_recursive(buf + 4096, 12); + helper_double_30_recursive(buf + 8192, 12); + helper_double_30_recursive(buf + 12288, 12); + helper_double_30_recursive(buf + 16384, 12); + helper_double_30_recursive(buf + 20480, 12); + helper_double_30_recursive(buf + 24576, 12); + helper_double_30_recursive(buf + 28672, 12); + for (int j = 0; j < 32768; j += 32768) { + for (int k = 0; k < 4096; k += 2) { + __asm__ volatile ( + "movupd (%0), %%xmm0\n" + "movupd (%1), %%xmm1\n" + "movupd (%2), %%xmm2\n" + "movupd (%3), %%xmm3\n" + "movupd (%4), %%xmm4\n" + "movupd (%5), %%xmm5\n" + "movupd (%6), %%xmm6\n" + "movupd (%7), %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm9\n" + "addpd %%xmm1, %%xmm8\n" + "subpd %%xmm1, %%xmm9\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm11\n" + "addpd %%xmm3, %%xmm10\n" + "subpd %%xmm3, %%xmm11\n" + "movapd %%xmm4, %%xmm12\n" + "movapd %%xmm4, %%xmm13\n" + "addpd %%xmm5, %%xmm12\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm6, %%xmm14\n" + "movapd %%xmm6, %%xmm15\n" + "addpd %%xmm7, %%xmm14\n" + "subpd %%xmm7, %%xmm15\n" + "movapd %%xmm8, %%xmm0\n" + "movapd %%xmm8, %%xmm2\n" + "addpd %%xmm10, %%xmm0\n" + "subpd %%xmm10, %%xmm2\n" + "movapd %%xmm9, %%xmm1\n" + "movapd %%xmm9, %%xmm3\n" + "addpd %%xmm11, %%xmm1\n" + "subpd %%xmm11, %%xmm3\n" + "movapd %%xmm12, %%xmm4\n" + "movapd %%xmm12, %%xmm6\n" + "addpd %%xmm14, %%xmm4\n" + "subpd %%xmm14, %%xmm6\n" + "movapd %%xmm13, %%xmm5\n" + "movapd %%xmm13, %%xmm7\n" + "addpd %%xmm15, %%xmm5\n" + "subpd %%xmm15, %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm12\n" + "addpd %%xmm4, %%xmm8\n" + "subpd %%xmm4, %%xmm12\n" 
+ "movapd %%xmm1, %%xmm9\n" + "movapd %%xmm1, %%xmm13\n" + "addpd %%xmm5, %%xmm9\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm14\n" + "addpd %%xmm6, %%xmm10\n" + "subpd %%xmm6, %%xmm14\n" + "movapd %%xmm3, %%xmm11\n" + "movapd %%xmm3, %%xmm15\n" + "addpd %%xmm7, %%xmm11\n" + "subpd %%xmm7, %%xmm15\n" + "movupd %%xmm8, (%0)\n" + "movupd %%xmm9, (%1)\n" + "movupd %%xmm10, (%2)\n" + "movupd %%xmm11, (%3)\n" + "movupd %%xmm12, (%4)\n" + "movupd %%xmm13, (%5)\n" + "movupd %%xmm14, (%6)\n" + "movupd %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 4096), "r"(buf + j + k + 8192), "r"(buf + j + k + 12288), "r"(buf + j + k + 16384), "r"(buf + j + k + 20480), "r"(buf + j + k + 24576), "r"(buf + j + k + 28672) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } + if (depth == 18) { + helper_double_30_recursive(buf + 0, 15); + helper_double_30_recursive(buf + 32768, 15); + helper_double_30_recursive(buf + 65536, 15); + helper_double_30_recursive(buf + 98304, 15); + helper_double_30_recursive(buf + 131072, 15); + helper_double_30_recursive(buf + 163840, 15); + helper_double_30_recursive(buf + 196608, 15); + helper_double_30_recursive(buf + 229376, 15); + for (int j = 0; j < 262144; j += 262144) { + for (int k = 0; k < 32768; k += 2) { + __asm__ volatile ( + "movupd (%0), %%xmm0\n" + "movupd (%1), %%xmm1\n" + "movupd (%2), %%xmm2\n" + "movupd (%3), %%xmm3\n" + "movupd (%4), %%xmm4\n" + "movupd (%5), %%xmm5\n" + "movupd (%6), %%xmm6\n" + "movupd (%7), %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm9\n" + "addpd %%xmm1, %%xmm8\n" + "subpd %%xmm1, %%xmm9\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm11\n" + "addpd %%xmm3, %%xmm10\n" + "subpd %%xmm3, %%xmm11\n" + "movapd %%xmm4, %%xmm12\n" + "movapd %%xmm4, %%xmm13\n" + "addpd %%xmm5, %%xmm12\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm6, %%xmm14\n" + "movapd %%xmm6, %%xmm15\n" + "addpd %%xmm7, %%xmm14\n" + "subpd %%xmm7, %%xmm15\n" + "movapd %%xmm8, %%xmm0\n" + "movapd %%xmm8, %%xmm2\n" + "addpd %%xmm10, %%xmm0\n" + "subpd %%xmm10, %%xmm2\n" + "movapd %%xmm9, %%xmm1\n" + "movapd %%xmm9, %%xmm3\n" + "addpd %%xmm11, %%xmm1\n" + "subpd %%xmm11, %%xmm3\n" + "movapd %%xmm12, %%xmm4\n" + "movapd %%xmm12, %%xmm6\n" + "addpd %%xmm14, %%xmm4\n" + "subpd %%xmm14, %%xmm6\n" + "movapd %%xmm13, %%xmm5\n" + "movapd %%xmm13, %%xmm7\n" + "addpd %%xmm15, %%xmm5\n" + "subpd %%xmm15, %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm12\n" + "addpd %%xmm4, %%xmm8\n" + "subpd %%xmm4, %%xmm12\n" + "movapd %%xmm1, %%xmm9\n" + "movapd %%xmm1, %%xmm13\n" + "addpd %%xmm5, %%xmm9\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm14\n" + "addpd %%xmm6, %%xmm10\n" + "subpd %%xmm6, %%xmm14\n" + "movapd %%xmm3, %%xmm11\n" + "movapd %%xmm3, %%xmm15\n" + "addpd %%xmm7, %%xmm11\n" + "subpd %%xmm7, %%xmm15\n" + "movupd %%xmm8, (%0)\n" + "movupd %%xmm9, (%1)\n" + "movupd %%xmm10, (%2)\n" + "movupd %%xmm11, (%3)\n" + "movupd %%xmm12, (%4)\n" + "movupd %%xmm13, (%5)\n" + "movupd %%xmm14, (%6)\n" + "movupd %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 32768), "r"(buf + j + k + 65536), "r"(buf + j + k + 98304), "r"(buf + j + k + 131072), "r"(buf + j + k + 163840), "r"(buf + j + k + 196608), "r"(buf + j + k + 229376) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", 
"%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } + if (depth == 21) { + helper_double_30_recursive(buf + 0, 18); + helper_double_30_recursive(buf + 262144, 18); + helper_double_30_recursive(buf + 524288, 18); + helper_double_30_recursive(buf + 786432, 18); + helper_double_30_recursive(buf + 1048576, 18); + helper_double_30_recursive(buf + 1310720, 18); + helper_double_30_recursive(buf + 1572864, 18); + helper_double_30_recursive(buf + 1835008, 18); + for (int j = 0; j < 2097152; j += 2097152) { + for (int k = 0; k < 262144; k += 2) { + __asm__ volatile ( + "movupd (%0), %%xmm0\n" + "movupd (%1), %%xmm1\n" + "movupd (%2), %%xmm2\n" + "movupd (%3), %%xmm3\n" + "movupd (%4), %%xmm4\n" + "movupd (%5), %%xmm5\n" + "movupd (%6), %%xmm6\n" + "movupd (%7), %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm9\n" + "addpd %%xmm1, %%xmm8\n" + "subpd %%xmm1, %%xmm9\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm11\n" + "addpd %%xmm3, %%xmm10\n" + "subpd %%xmm3, %%xmm11\n" + "movapd %%xmm4, %%xmm12\n" + "movapd %%xmm4, %%xmm13\n" + "addpd %%xmm5, %%xmm12\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm6, %%xmm14\n" + "movapd %%xmm6, %%xmm15\n" + "addpd %%xmm7, %%xmm14\n" + "subpd %%xmm7, %%xmm15\n" + "movapd %%xmm8, %%xmm0\n" + "movapd %%xmm8, %%xmm2\n" + "addpd %%xmm10, %%xmm0\n" + "subpd %%xmm10, %%xmm2\n" + "movapd %%xmm9, %%xmm1\n" + "movapd %%xmm9, %%xmm3\n" + "addpd %%xmm11, %%xmm1\n" + "subpd %%xmm11, %%xmm3\n" + "movapd %%xmm12, %%xmm4\n" + "movapd %%xmm12, %%xmm6\n" + "addpd %%xmm14, %%xmm4\n" + "subpd %%xmm14, %%xmm6\n" + "movapd %%xmm13, %%xmm5\n" + "movapd %%xmm13, %%xmm7\n" + "addpd %%xmm15, %%xmm5\n" + "subpd %%xmm15, %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm12\n" + "addpd %%xmm4, %%xmm8\n" + "subpd %%xmm4, %%xmm12\n" + "movapd %%xmm1, %%xmm9\n" + "movapd %%xmm1, %%xmm13\n" + "addpd %%xmm5, %%xmm9\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm14\n" + "addpd %%xmm6, %%xmm10\n" + "subpd %%xmm6, %%xmm14\n" + "movapd %%xmm3, %%xmm11\n" + "movapd %%xmm3, %%xmm15\n" + "addpd %%xmm7, %%xmm11\n" + "subpd %%xmm7, %%xmm15\n" + "movupd %%xmm8, (%0)\n" + "movupd %%xmm9, (%1)\n" + "movupd %%xmm10, (%2)\n" + "movupd %%xmm11, (%3)\n" + "movupd %%xmm12, (%4)\n" + "movupd %%xmm13, (%5)\n" + "movupd %%xmm14, (%6)\n" + "movupd %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 262144), "r"(buf + j + k + 524288), "r"(buf + j + k + 786432), "r"(buf + j + k + 1048576), "r"(buf + j + k + 1310720), "r"(buf + j + k + 1572864), "r"(buf + j + k + 1835008) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } + if (depth == 24) { + helper_double_30_recursive(buf + 0, 21); + helper_double_30_recursive(buf + 2097152, 21); + helper_double_30_recursive(buf + 4194304, 21); + helper_double_30_recursive(buf + 6291456, 21); + helper_double_30_recursive(buf + 8388608, 21); + helper_double_30_recursive(buf + 10485760, 21); + helper_double_30_recursive(buf + 12582912, 21); + helper_double_30_recursive(buf + 14680064, 21); + for (int j = 0; j < 16777216; j += 16777216) { + for (int k = 0; k < 2097152; k += 2) { + __asm__ volatile ( + "movupd (%0), %%xmm0\n" + "movupd (%1), %%xmm1\n" + "movupd (%2), %%xmm2\n" + "movupd (%3), %%xmm3\n" + "movupd (%4), %%xmm4\n" + "movupd (%5), %%xmm5\n" + "movupd (%6), %%xmm6\n" + "movupd (%7), %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, 
%%xmm9\n" + "addpd %%xmm1, %%xmm8\n" + "subpd %%xmm1, %%xmm9\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm11\n" + "addpd %%xmm3, %%xmm10\n" + "subpd %%xmm3, %%xmm11\n" + "movapd %%xmm4, %%xmm12\n" + "movapd %%xmm4, %%xmm13\n" + "addpd %%xmm5, %%xmm12\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm6, %%xmm14\n" + "movapd %%xmm6, %%xmm15\n" + "addpd %%xmm7, %%xmm14\n" + "subpd %%xmm7, %%xmm15\n" + "movapd %%xmm8, %%xmm0\n" + "movapd %%xmm8, %%xmm2\n" + "addpd %%xmm10, %%xmm0\n" + "subpd %%xmm10, %%xmm2\n" + "movapd %%xmm9, %%xmm1\n" + "movapd %%xmm9, %%xmm3\n" + "addpd %%xmm11, %%xmm1\n" + "subpd %%xmm11, %%xmm3\n" + "movapd %%xmm12, %%xmm4\n" + "movapd %%xmm12, %%xmm6\n" + "addpd %%xmm14, %%xmm4\n" + "subpd %%xmm14, %%xmm6\n" + "movapd %%xmm13, %%xmm5\n" + "movapd %%xmm13, %%xmm7\n" + "addpd %%xmm15, %%xmm5\n" + "subpd %%xmm15, %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm12\n" + "addpd %%xmm4, %%xmm8\n" + "subpd %%xmm4, %%xmm12\n" + "movapd %%xmm1, %%xmm9\n" + "movapd %%xmm1, %%xmm13\n" + "addpd %%xmm5, %%xmm9\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm14\n" + "addpd %%xmm6, %%xmm10\n" + "subpd %%xmm6, %%xmm14\n" + "movapd %%xmm3, %%xmm11\n" + "movapd %%xmm3, %%xmm15\n" + "addpd %%xmm7, %%xmm11\n" + "subpd %%xmm7, %%xmm15\n" + "movupd %%xmm8, (%0)\n" + "movupd %%xmm9, (%1)\n" + "movupd %%xmm10, (%2)\n" + "movupd %%xmm11, (%3)\n" + "movupd %%xmm12, (%4)\n" + "movupd %%xmm13, (%5)\n" + "movupd %%xmm14, (%6)\n" + "movupd %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 2097152), "r"(buf + j + k + 4194304), "r"(buf + j + k + 6291456), "r"(buf + j + k + 8388608), "r"(buf + j + k + 10485760), "r"(buf + j + k + 12582912), "r"(buf + j + k + 14680064) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } + if (depth == 27) { + helper_double_30_recursive(buf + 0, 24); + helper_double_30_recursive(buf + 16777216, 24); + helper_double_30_recursive(buf + 33554432, 24); + helper_double_30_recursive(buf + 50331648, 24); + helper_double_30_recursive(buf + 67108864, 24); + helper_double_30_recursive(buf + 83886080, 24); + helper_double_30_recursive(buf + 100663296, 24); + helper_double_30_recursive(buf + 117440512, 24); + for (int j = 0; j < 134217728; j += 134217728) { + for (int k = 0; k < 16777216; k += 2) { + __asm__ volatile ( + "movupd (%0), %%xmm0\n" + "movupd (%1), %%xmm1\n" + "movupd (%2), %%xmm2\n" + "movupd (%3), %%xmm3\n" + "movupd (%4), %%xmm4\n" + "movupd (%5), %%xmm5\n" + "movupd (%6), %%xmm6\n" + "movupd (%7), %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm9\n" + "addpd %%xmm1, %%xmm8\n" + "subpd %%xmm1, %%xmm9\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm11\n" + "addpd %%xmm3, %%xmm10\n" + "subpd %%xmm3, %%xmm11\n" + "movapd %%xmm4, %%xmm12\n" + "movapd %%xmm4, %%xmm13\n" + "addpd %%xmm5, %%xmm12\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm6, %%xmm14\n" + "movapd %%xmm6, %%xmm15\n" + "addpd %%xmm7, %%xmm14\n" + "subpd %%xmm7, %%xmm15\n" + "movapd %%xmm8, %%xmm0\n" + "movapd %%xmm8, %%xmm2\n" + "addpd %%xmm10, %%xmm0\n" + "subpd %%xmm10, %%xmm2\n" + "movapd %%xmm9, %%xmm1\n" + "movapd %%xmm9, %%xmm3\n" + "addpd %%xmm11, %%xmm1\n" + "subpd %%xmm11, %%xmm3\n" + "movapd %%xmm12, %%xmm4\n" + "movapd %%xmm12, %%xmm6\n" + "addpd %%xmm14, %%xmm4\n" + "subpd %%xmm14, %%xmm6\n" + "movapd %%xmm13, %%xmm5\n" + "movapd %%xmm13, %%xmm7\n" + "addpd %%xmm15, %%xmm5\n" + 
"subpd %%xmm15, %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm12\n" + "addpd %%xmm4, %%xmm8\n" + "subpd %%xmm4, %%xmm12\n" + "movapd %%xmm1, %%xmm9\n" + "movapd %%xmm1, %%xmm13\n" + "addpd %%xmm5, %%xmm9\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm14\n" + "addpd %%xmm6, %%xmm10\n" + "subpd %%xmm6, %%xmm14\n" + "movapd %%xmm3, %%xmm11\n" + "movapd %%xmm3, %%xmm15\n" + "addpd %%xmm7, %%xmm11\n" + "subpd %%xmm7, %%xmm15\n" + "movupd %%xmm8, (%0)\n" + "movupd %%xmm9, (%1)\n" + "movupd %%xmm10, (%2)\n" + "movupd %%xmm11, (%3)\n" + "movupd %%xmm12, (%4)\n" + "movupd %%xmm13, (%5)\n" + "movupd %%xmm14, (%6)\n" + "movupd %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 16777216), "r"(buf + j + k + 33554432), "r"(buf + j + k + 50331648), "r"(buf + j + k + 67108864), "r"(buf + j + k + 83886080), "r"(buf + j + k + 100663296), "r"(buf + j + k + 117440512) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } + if (depth == 30) { + helper_double_30_recursive(buf + 0, 27); + helper_double_30_recursive(buf + 134217728, 27); + helper_double_30_recursive(buf + 268435456, 27); + helper_double_30_recursive(buf + 402653184, 27); + helper_double_30_recursive(buf + 536870912, 27); + helper_double_30_recursive(buf + 671088640, 27); + helper_double_30_recursive(buf + 805306368, 27); + helper_double_30_recursive(buf + 939524096, 27); + for (int j = 0; j < 1073741824; j += 1073741824) { + for (int k = 0; k < 134217728; k += 2) { + __asm__ volatile ( + "movupd (%0), %%xmm0\n" + "movupd (%1), %%xmm1\n" + "movupd (%2), %%xmm2\n" + "movupd (%3), %%xmm3\n" + "movupd (%4), %%xmm4\n" + "movupd (%5), %%xmm5\n" + "movupd (%6), %%xmm6\n" + "movupd (%7), %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm9\n" + "addpd %%xmm1, %%xmm8\n" + "subpd %%xmm1, %%xmm9\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm11\n" + "addpd %%xmm3, %%xmm10\n" + "subpd %%xmm3, %%xmm11\n" + "movapd %%xmm4, %%xmm12\n" + "movapd %%xmm4, %%xmm13\n" + "addpd %%xmm5, %%xmm12\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm6, %%xmm14\n" + "movapd %%xmm6, %%xmm15\n" + "addpd %%xmm7, %%xmm14\n" + "subpd %%xmm7, %%xmm15\n" + "movapd %%xmm8, %%xmm0\n" + "movapd %%xmm8, %%xmm2\n" + "addpd %%xmm10, %%xmm0\n" + "subpd %%xmm10, %%xmm2\n" + "movapd %%xmm9, %%xmm1\n" + "movapd %%xmm9, %%xmm3\n" + "addpd %%xmm11, %%xmm1\n" + "subpd %%xmm11, %%xmm3\n" + "movapd %%xmm12, %%xmm4\n" + "movapd %%xmm12, %%xmm6\n" + "addpd %%xmm14, %%xmm4\n" + "subpd %%xmm14, %%xmm6\n" + "movapd %%xmm13, %%xmm5\n" + "movapd %%xmm13, %%xmm7\n" + "addpd %%xmm15, %%xmm5\n" + "subpd %%xmm15, %%xmm7\n" + "movapd %%xmm0, %%xmm8\n" + "movapd %%xmm0, %%xmm12\n" + "addpd %%xmm4, %%xmm8\n" + "subpd %%xmm4, %%xmm12\n" + "movapd %%xmm1, %%xmm9\n" + "movapd %%xmm1, %%xmm13\n" + "addpd %%xmm5, %%xmm9\n" + "subpd %%xmm5, %%xmm13\n" + "movapd %%xmm2, %%xmm10\n" + "movapd %%xmm2, %%xmm14\n" + "addpd %%xmm6, %%xmm10\n" + "subpd %%xmm6, %%xmm14\n" + "movapd %%xmm3, %%xmm11\n" + "movapd %%xmm3, %%xmm15\n" + "addpd %%xmm7, %%xmm11\n" + "subpd %%xmm7, %%xmm15\n" + "movupd %%xmm8, (%0)\n" + "movupd %%xmm9, (%1)\n" + "movupd %%xmm10, (%2)\n" + "movupd %%xmm11, (%3)\n" + "movupd %%xmm12, (%4)\n" + "movupd %%xmm13, (%5)\n" + "movupd %%xmm14, (%6)\n" + "movupd %%xmm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 134217728), "r"(buf + j + k + 268435456), "r"(buf + j + k + 402653184), "r"(buf + 
j + k + 536870912), "r"(buf + j + k + 671088640), "r"(buf + j + k + 805306368), "r"(buf + j + k + 939524096) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" + ); + } + } + return; + } +} +void helper_double_30(double *buf); +void helper_double_30(double *buf) { + helper_double_30_recursive(buf, 30); +} +int fht_double(double *buf, int log_n) { + if (log_n == 0) { + return 0; + } + if (log_n == 1) { + helper_double_1(buf); + return 0; + } + if (log_n == 2) { + helper_double_2(buf); + return 0; + } + if (log_n == 3) { + helper_double_3(buf); + return 0; + } + if (log_n == 4) { + helper_double_4(buf); + return 0; + } + if (log_n == 5) { + helper_double_5(buf); + return 0; + } + if (log_n == 6) { + helper_double_6(buf); + return 0; + } + if (log_n == 7) { + helper_double_7(buf); + return 0; + } + if (log_n == 8) { + helper_double_8(buf); + return 0; + } + if (log_n == 9) { + helper_double_9(buf); + return 0; + } + if (log_n == 10) { + helper_double_10(buf); + return 0; + } + if (log_n == 11) { + helper_double_11(buf); + return 0; + } + if (log_n == 12) { + helper_double_12(buf); + return 0; + } + if (log_n == 13) { + helper_double_13(buf); + return 0; + } + if (log_n == 14) { + helper_double_14(buf); + return 0; + } + if (log_n == 15) { + helper_double_15(buf); + return 0; + } + if (log_n == 16) { + helper_double_16(buf); + return 0; + } + if (log_n == 17) { + helper_double_17(buf); + return 0; + } + if (log_n == 18) { + helper_double_18(buf); + return 0; + } + if (log_n == 19) { + helper_double_19(buf); + return 0; + } + if (log_n == 20) { + helper_double_20(buf); + return 0; + } + if (log_n == 21) { + helper_double_21(buf); + return 0; + } + if (log_n == 22) { + helper_double_22(buf); + return 0; + } + if (log_n == 23) { + helper_double_23(buf); + return 0; + } + if (log_n == 24) { + helper_double_24(buf); + return 0; + } + if (log_n == 25) { + helper_double_25(buf); + return 0; + } + if (log_n == 26) { + helper_double_26(buf); + return 0; + } + if (log_n == 27) { + helper_double_27(buf); + return 0; + } + if (log_n == 28) { + helper_double_28(buf); + return 0; + } + if (log_n == 29) { + helper_double_29(buf); + return 0; + } + if (log_n == 30) { + helper_double_30(buf); + return 0; + } + return 1; +} diff --git a/extension/llm/custom_ops/spinquant/third-party/FFHT/gen.py b/extension/llm/custom_ops/spinquant/third-party/FFHT/gen.py new file mode 100644 index 00000000000..bf3655efda4 --- /dev/null +++ b/extension/llm/custom_ops/spinquant/third-party/FFHT/gen.py @@ -0,0 +1,869 @@ +# Portions Copyright (c) Meta Platforms, Inc. and affiliates. 
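The SSE double-precision helpers added above (helper_double_29, helper_double_30, and the fht_double dispatcher) are machine-generated by this gen.py script; each unrolled inline-asm block performs the same (u + v, u - v) butterflies, packed two doubles per xmm register, that the script's plain_step emits in scalar form further down. As a reference sketch only — not part of the FFHT sources; the name fht_reference and the example values are illustrative assumptions — the equivalent in-place Walsh-Hadamard transform in plain Python is:

def fht_reference(buf):
    # In-place, unnormalized Walsh-Hadamard transform of a list whose length
    # is a power of two. Each pass pairs elements h apart and replaces (u, v)
    # with (u + v, u - v), the same add/sub pattern as the generated kernels.
    n = len(buf)
    assert n > 0 and (n & (n - 1)) == 0, "length must be a power of two"
    h = 1
    while h < n:
        for j in range(0, n, 2 * h):
            for k in range(j, j + h):
                u, v = buf[k], buf[k + h]
                buf[k], buf[k + h] = u + v, u - v
        h *= 2

# For example, fht_reference([1.0, 0.0, 0.0, 0.0]) mutates the list to
# [1.0, 1.0, 1.0, 1.0], which should agree with what fht_double computes
# on a length-4 buffer (log_n == 2).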
+import csv +import os +import subprocess +import sys + +max_log_n = 30 + + +def is_distinct(l): + return len(set(l)) == len(l) + + +def float_avx_0(register, aux_registers, ident=""): + if not is_distinct(aux_registers): + raise Exception("auxiliary registers must be distinct") + if register in aux_registers: + raise Exception("the main register can't be one of the auxiliary ones") + if len(aux_registers) < 4: + raise Exception("float_avx_0 needs at least four auxiliary registers") + # given source ABCDEFGH, destination register gets AACCEEGG + res = ident + '"vpermilps $160, %%%%%s, %%%%%s\\n"\n' % (register, aux_registers[0]) + # given source ABCDEFGH, destination register gets BBDDFFHH + res += ident + '"vpermilps $245, %%%%%s, %%%%%s\\n"\n' % ( + register, + aux_registers[1], + ) + # aux2 <- 0 + res += ident + '"vxorps %%%%%s, %%%%%s, %%%%%s\\n"\n' % ( + aux_registers[2], + aux_registers[2], + aux_registers[2], + ) + # aux3 <- -B -B -D -D -F -F -H -H + res += ident + '"vsubps %%%%%s, %%%%%s, %%%%%s\\n"\n' % ( + aux_registers[1], + aux_registers[2], + aux_registers[3], + ) + # reg <- (A+B)(A-B)(C+D)(C-D)(E+F)(E-F)(G+H)(G-H) + res += ident + '"vaddsubps %%%%%s, %%%%%s, %%%%%s\\n"\n' % ( + aux_registers[3], + aux_registers[0], + register, + ) + return res + + +def float_avx_1(register, aux_registers, ident=""): + if not is_distinct(aux_registers): + raise Exception("auxiliary registers must be distinct") + if register in aux_registers: + raise Exception("the main register can't be one of the auxiliary ones") + if len(aux_registers) < 5: + raise Exception("float_avx_1 needs at least five auxiliary registers") + # Given source ABCDEFGH, r0 <- ABABEFEF + res = ident + '"vpermilps $68, %%%%%s, %%%%%s\\n"\n' % (register, aux_registers[0]) + # Given source ABCDEFGH, r1 <- CDCDGHGH + res += ident + '"vpermilps $238, %%%%%s, %%%%%s\\n"\n' % ( + register, + aux_registers[1], + ) + # r2 <- 0 + res += ident + '"vxorps %%%%%s, %%%%%s, %%%%%s\\n"\n' % ( + aux_registers[2], + aux_registers[2], + aux_registers[2], + ) + # r3 <- -C -D -C -D -G -H -G -H + res += ident + '"vsubps %%%%%s, %%%%%s, %%%%%s\\n"\n' % ( + aux_registers[1], + aux_registers[2], + aux_registers[3], + ) + # r4 <- C D -C -D G H -G -H + res += ident + '"vblendps $204, %%%%%s, %%%%%s, %%%%%s\\n"\n' % ( + aux_registers[3], + aux_registers[1], + aux_registers[4], + ) + # reg <- (A + C) (B + D) (A - C) (B - D) etc. + res += ident + '"vaddps %%%%%s, %%%%%s, %%%%%s\\n"\n' % ( + aux_registers[0], + aux_registers[4], + register, + ) + return res + + +def float_avx_2(register, aux_registers, ident=""): + if not is_distinct(aux_registers): + raise Exception("auxiliary registers must be distinct") + if register in aux_registers: + raise Exception("the main register can't be one of the auxiliary ones") + if len(aux_registers) < 4: + raise Exception("float_avx_2 needs at least four auxiliary registers") + # r0 <- 0 + res = ident + '"vxorps %%%%%s, %%%%%s, %%%%%s\\n"\n' % ( + aux_registers[0], + aux_registers[0], + aux_registers[0], + ) + # r1 <- -A -B -C -D -E -F -G -H + res += ident + '"vsubps %%%%%s, %%%%%s, %%%%%s\\n"\n' % ( + register, + aux_registers[0], + aux_registers[1], + ) + # r2 <- ABABEFEF + res += ident + '"vperm2f128 $0, %%%%%s, %%%%%s, %%%%%s\\n"\n' % ( + register, + register, + aux_registers[2], + ) + # r3 <- C D -C -D G H -G -H + res += ident + '"vperm2f128 $49, %%%%%s, %%%%%s, %%%%%s\\n"\n' % ( + aux_registers[1], + register, + aux_registers[3], + ) + # reg <- (A + C) (B + D)(A - C) (B - D) etc. 
+ res += ident + '"vaddps %%%%%s, %%%%%s, %%%%%s\\n"\n' % ( + aux_registers[2], + aux_registers[3], + register, + ) + return res + + +def float_avx_3_etc( + from_register_0, from_register_1, to_register_0, to_register_1, ident="" +): + if not is_distinct( + [from_register_0, from_register_1, to_register_0, to_register_1] + ): + raise Exception("four registers must be distinct") + res = ident + '"vaddps %%%%%s, %%%%%s, %%%%%s\\n"\n' % ( + from_register_1, + from_register_0, + to_register_0, + ) + res += ident + '"vsubps %%%%%s, %%%%%s, %%%%%s\\n"\n' % ( + from_register_1, + from_register_0, + to_register_1, + ) + return res + + +def double_avx_0(register, aux_registers, ident=""): + if not is_distinct(aux_registers): + raise Exception("auxiliary registers must be distinct") + if register in aux_registers: + raise Exception("the main register can't be one of the auxiliary ones") + if len(aux_registers) < 4: + raise Exception("double_avx_0 needs at least four auxiliary registers") + # r0 <- AACC + res = ident + '"vpermilpd $0, %%%%%s, %%%%%s\\n"\n' % (register, aux_registers[0]) + # r1 <- BBDD + res += ident + '"vpermilpd $15, %%%%%s, %%%%%s\\n"\n' % (register, aux_registers[1]) + # r2 <- 0 + res += ident + '"vxorpd %%%%%s, %%%%%s, %%%%%s\\n"\n' % ( + aux_registers[2], + aux_registers[2], + aux_registers[2], + ) + # r3 <- -B -B -D -D + res += ident + '"vsubpd %%%%%s, %%%%%s, %%%%%s\\n"\n' % ( + aux_registers[1], + aux_registers[2], + aux_registers[3], + ) + # reg <- (A + B)(A - B)(C + D)(C - D) + res += ident + '"vaddsubpd %%%%%s, %%%%%s, %%%%%s\\n"\n' % ( + aux_registers[3], + aux_registers[0], + register, + ) + return res + + +def double_avx_1(register, aux_registers, ident=""): + if not is_distinct(aux_registers): + raise Exception("auxiliary registers must be distinct") + if register in aux_registers: + raise Exception("the main register can't be one of the auxiliary ones") + if len(aux_registers) < 4: + raise Exception("double_avx_1 needs at least four auxiliary registers") + # r0 <- ABAB + res = ident + '"vperm2f128 $0, %%%%%s, %%%%%s, %%%%%s\\n"\n' % ( + register, + register, + aux_registers[0], + ) + # r1 <- 0 + res += ident + '"vxorpd %%%%%s, %%%%%s, %%%%%s\\n"\n' % ( + aux_registers[1], + aux_registers[1], + aux_registers[1], + ) + # r2 <- -A -B -C -D + res += ident + '"vsubpd %%%%%s, %%%%%s, %%%%%s\\n"\n' % ( + register, + aux_registers[1], + aux_registers[2], + ) + # r3 <- C D -C -D + res += ident + '"vperm2f128 $49, %%%%%s, %%%%%s, %%%%%s\\n"\n' % ( + aux_registers[2], + register, + aux_registers[3], + ) + # reg <- (A + C)(B + D)(A - C)(B - D) + res += ident + '"vaddpd %%%%%s, %%%%%s, %%%%%s\\n"\n' % ( + aux_registers[3], + aux_registers[0], + register, + ) + return res + + +def double_avx_2_etc( + from_register_0, from_register_1, to_register_0, to_register_1, ident="" +): + if not is_distinct( + [from_register_0, from_register_1, to_register_0, to_register_1] + ): + raise Exception("four registers must be distinct") + res = ident + '"vaddpd %%%%%s, %%%%%s, %%%%%s\\n"\n' % ( + from_register_1, + from_register_0, + to_register_0, + ) + res += ident + '"vsubpd %%%%%s, %%%%%s, %%%%%s\\n"\n' % ( + from_register_1, + from_register_0, + to_register_1, + ) + return res + + +def float_sse_0(register, aux_registers, ident=""): + if not is_distinct(aux_registers): + raise Exception("auxiliary registers must be distinct") + if register in aux_registers: + raise Exception("the main register can't be one of the auxiliary ones") + if len(aux_registers) < 2: + raise Exception("float_sse_0 
needs at least two auxiliary registers") + res = ident + '"movaps %%%%%s, %%%%%s\\n"\n' % (register, aux_registers[0]) + res += ident + '"shufps $160, %%%%%s, %%%%%s\\n"\n' % ( + aux_registers[0], + aux_registers[0], + ) + res += ident + '"shufps $245, %%%%%s, %%%%%s\\n"\n' % (register, register) + res += ident + '"xorps %%%%%s, %%%%%s\\n"\n' % (aux_registers[1], aux_registers[1]) + res += ident + '"subps %%%%%s, %%%%%s\\n"\n' % (register, aux_registers[1]) + res += ident + '"addsubps %%%%%s, %%%%%s\\n"\n' % ( + aux_registers[1], + aux_registers[0], + ) + res += ident + '"movaps %%%%%s, %%%%%s\\n"\n' % (aux_registers[0], register) + return res + + +def float_sse_1(register, aux_registers, ident=""): + if not is_distinct(aux_registers): + raise Exception("auxiliary registers must be distinct") + if register in aux_registers: + raise Exception("the main register can't be one of the auxiliary ones") + if len(aux_registers) < 4: + raise Exception("float_sse_1 needs at least four auxiliary registers") + res = ident + '"movaps %%%%%s, %%%%%s\\n"\n' % (register, aux_registers[0]) + res += ident + '"shufps $68, %%%%%s, %%%%%s\\n"\n' % ( + aux_registers[0], + aux_registers[0], + ) + res += ident + '"xorps %%%%%s, %%%%%s\\n"\n' % (aux_registers[1], aux_registers[1]) + res += ident + '"movaps %%%%%s, %%%%%s\\n"\n' % (register, aux_registers[2]) + res += ident + '"shufps $14, %%%%%s, %%%%%s\\n"\n' % ( + aux_registers[1], + aux_registers[2], + ) + res += ident + '"movaps %%%%%s, %%%%%s\\n"\n' % (register, aux_registers[3]) + res += ident + '"shufps $224, %%%%%s, %%%%%s\\n"\n' % ( + aux_registers[3], + aux_registers[1], + ) + res += ident + '"addps %%%%%s, %%%%%s\\n"\n' % (aux_registers[0], aux_registers[2]) + res += ident + '"subps %%%%%s, %%%%%s\\n"\n' % (aux_registers[1], aux_registers[2]) + res += ident + '"movaps %%%%%s, %%%%%s\\n"\n' % (aux_registers[2], register) + return res + + +def float_sse_2_etc( + from_register_0, from_register_1, to_register_0, to_register_1, ident="" +): + if not is_distinct( + [from_register_0, from_register_1, to_register_0, to_register_1] + ): + raise Exception("four registers must be distinct") + res = ident + '"movaps %%%%%s, %%%%%s\\n"\n' % (from_register_0, to_register_0) + res += ident + '"movaps %%%%%s, %%%%%s\\n"\n' % (from_register_0, to_register_1) + res += ident + '"addps %%%%%s, %%%%%s\\n"\n' % (from_register_1, to_register_0) + res += ident + '"subps %%%%%s, %%%%%s\\n"\n' % (from_register_1, to_register_1) + return res + + +def double_sse_0(register, aux_registers, ident=""): + if not is_distinct(aux_registers): + raise Exception("auxiliary registers must be distinct") + if register in aux_registers: + raise Exception("the main register can't be one of the auxiliary ones") + if len(aux_registers) < 2: + raise Exception("double_sse_0 needs at least two auxiliary registers") + res = ident + '"movapd %%%%%s, %%%%%s\\n"\n' % (register, aux_registers[0]) + res += ident + '"haddpd %%%%%s, %%%%%s\\n"\n' % (aux_registers[0], aux_registers[0]) + res += ident + '"movapd %%%%%s, %%%%%s\\n"\n' % (register, aux_registers[1]) + res += ident + '"hsubpd %%%%%s, %%%%%s\\n"\n' % (aux_registers[1], aux_registers[1]) + res += ident + '"blendpd $1, %%%%%s, %%%%%s\\n"\n' % ( + aux_registers[0], + aux_registers[1], + ) + res += ident + '"movapd %%%%%s, %%%%%s\\n"\n' % (aux_registers[1], register) + return res + + +def double_sse_1_etc( + from_register_0, from_register_1, to_register_0, to_register_1, ident="" +): + if not is_distinct( + [from_register_0, from_register_1, 
to_register_0, to_register_1] + ): + raise Exception("four registers must be distinct") + res = ident + '"movapd %%%%%s, %%%%%s\\n"\n' % (from_register_0, to_register_0) + res += ident + '"movapd %%%%%s, %%%%%s\\n"\n' % (from_register_0, to_register_1) + res += ident + '"addpd %%%%%s, %%%%%s\\n"\n' % (from_register_1, to_register_0) + res += ident + '"subpd %%%%%s, %%%%%s\\n"\n' % (from_register_1, to_register_1) + return res + + +# Given reg = ABCD, return (A+B)(A-B)(C+D)(C-D) +def float_neon_0(register, aux_registers, ident=""): + if not is_distinct(aux_registers): + raise Exception("auxiliary registers must be distinct") + if register in aux_registers: + raise Exception("the main register can't be one of the auxiliary ones") + if len(aux_registers) < 2: + raise Exception("float_neon_0 needs at least two auxiliary registers") + # r0 <- AACC + res = f'{ident}"TRN1 {aux_registers[0]}.4S, {register}.4S, {register}.4S\\n"\n' + # r1 <- -A -B -C -D + res += f'{ident}"FNEG {aux_registers[1]}.4S, {register}.4S\\n"\n' + # r2 <- B (-B) D -D + res += f'{ident}"TRN2 {aux_registers[1]}.4S, {register}.4S, {aux_registers[1]}.4S\\n"\n' + # reg <- (A+B)(A-B)(C+D)(C-D) + res += f'{ident}"FADD {register}.4S, {aux_registers[0]}.4S, {aux_registers[1]}.4S\\n"\n' + + return res + + +# Given reg = ABCD, return (A + C)(B + D)(A - C)(B - D) +def float_neon_1(register, aux_registers, ident=""): + if not is_distinct(aux_registers): + raise Exception("auxiliary registers must be distinct") + if register in aux_registers: + raise Exception("the main register can't be one of the auxiliary ones") + if len(aux_registers) < 2: + raise Exception("float_neon_1 needs at least two auxiliary registers") + # r0 <- ABAB + res = f'{ident}"DUP {aux_registers[0]}.2D, {register}.D[0]\\n"\n' + # r1 <- -A -B -C -D + res += f'{ident}"FNEG {aux_registers[1]}.4S, {register}.4S\\n"\n' + # r1 <- C D -C -D + res += f'{ident}"INS {aux_registers[1]}.D[0], {register}.D[1]\\n"\n' + # reg <- (A + C)(B + D)(A - C)(B - D) + res += f'{ident}"FADD {register}.4S, {aux_registers[0]}.4S, {aux_registers[1]}.4S\\n"\n' + + return res + + +def float_neon_2_etc( + from_register_0, from_register_1, to_register_0, to_register_1, ident="" +): + if not is_distinct( + [from_register_0, from_register_1, to_register_0, to_register_1] + ): + raise Exception("four registers must be distinct") + res = f'{ident}"FADD {to_register_0}.4S, {from_register_0}.4S, {from_register_1}.4S\\n"\n' + res += f'{ident}"FSUB {to_register_1}.4S, {from_register_0}.4S, {from_register_1}.4S\\n"\n' + return res + + +def plain_step(type_name, buf_name, log_n, it, ident=""): + if log_n <= 0: + raise Exception("log_n must be positive") + if it < 0: + raise Exception("it must be non-negative") + if it >= log_n: + raise Exception("it must be smaller than log_n") + n = 1 << log_n + res = ident + "for (int j = 0; j < %d; j += %d) {\n" % (n, 1 << (it + 1)) + res += ident + " for (int k = 0; k < %d; ++k) {\n" % (1 << it) + res += ident + " %s u = %s[j + k];\n" % (type_name, buf_name) + res += ident + " %s v = %s[j + k + %d];\n" % (type_name, buf_name, 1 << it) + res += ident + " %s[j + k] = u + v;\n" % buf_name + res += ident + " %s[j + k + %d] = u - v;\n" % (buf_name, 1 << it) + res += ident + " }\n" + res += ident + "}\n" + return res + + +MOVE_INSTRUCTION_USE_NEON = "NEON MOV" + + +def composite_step( + buf_name, + log_n, + from_it, + to_it, + log_w, + registers, + move_instruction, + special_steps, + main_step, + ident="", +): + # HACK: NEON needs different syntax for loads and stores. 
+ use_neon_movs = move_instruction == MOVE_INSTRUCTION_USE_NEON + if log_n < log_w: + raise Exception("need at least %d elements" % (1 << log_w)) + num_registers = len(registers) + if num_registers % 2 == 1: + raise Exception("odd number of registers: %d" % num_registers) + num_nontrivial_levels = 0 + if to_it > log_w: + first_nontrivial = max(from_it, log_w) + num_nontrivial_levels = to_it - first_nontrivial + if 1 << num_nontrivial_levels > num_registers / 2: + raise Exception("not enough registers") + n = 1 << log_n + input_registers = [] + output_registers = [] + for i in range(num_registers): + if i < num_registers / 2: + input_registers.append(registers[i]) + else: + output_registers.append(registers[i]) + clobber = ", ".join(['"%%%s"' % x for x in registers]) + if num_nontrivial_levels == 0: + res = ident + "for (int j = 0; j < %d; j += %d) {\n" % (n, 1 << log_w) + res += ident + " __asm__ volatile (\n" + if use_neon_movs: + res += f'{ident} "LD1 {{{input_registers[0]}.4S}}, [%0]\\n"\n' + else: + res += ident + ' "%s (%%0), %%%%%s\\n"\n' % ( + move_instruction, + input_registers[0], + ) + for it in range(from_it, to_it): + res += special_steps[it]( + input_registers[0], output_registers, ident + " " + ) + if use_neon_movs: + res += f'{ident} "ST1 {{{input_registers[0]}.4S}}, [%0]\\n"\n' + else: + res += ident + ' "%s %%%%%s, (%%0)\\n"\n' % ( + move_instruction, + input_registers[0], + ) + res += ident + ' :: "r"(%s + j) : %s, "memory"\n' % (buf_name, clobber) + res += ident + " );\n" + res += ident + "}\n" + return res + res = ident + "for (int j = 0; j < %d; j += %d) {\n" % (n, 1 << to_it) + res += ident + " for (int k = 0; k < %d; k += %d) {\n" % ( + 1 << (to_it - num_nontrivial_levels), + 1 << log_w, + ) + subcube = [] + for l in range(1 << num_nontrivial_levels): + subcube.append("j + k + " + str(l * (1 << (to_it - num_nontrivial_levels)))) + res += ident + " __asm__ volatile (\n" + for l in range(1 << num_nontrivial_levels): + if use_neon_movs: + res += f'{ident} "LD1 {{{input_registers[l]}.4S}}, [%{l}]\\n"\n' + else: + res += ident + ' "%s (%%%d), %%%%%s\\n"\n' % ( + move_instruction, + l, + input_registers[l], + ) + for it in range(from_it, log_w): + for ii in range(1 << num_nontrivial_levels): + res += special_steps[it]( + input_registers[ii], output_registers, ident + " " + ) + for it in range(num_nontrivial_levels): + for ii in range(0, 1 << num_nontrivial_levels, 1 << (it + 1)): + for jj in range(1 << it): + res += main_step( + input_registers[ii + jj], + input_registers[ii + jj + (1 << it)], + output_registers[ii + jj], + output_registers[ii + jj + (1 << it)], + ident + " ", + ) + tmp = input_registers + input_registers = output_registers + output_registers = tmp + for l in range(1 << num_nontrivial_levels): + if use_neon_movs: + res += f'{ident} "ST1 {{{input_registers[l]}.4S}}, [%{l}]\\n"\n' + else: + res += ident + ' "%s %%%%%s, (%%%d)\\n"\n' % ( + move_instruction, + input_registers[l], + l, + ) + res += ident + ' :: %s : %s, "memory"\n' % ( + ", ".join(['"r"(%s + %s)' % (buf_name, x) for x in subcube]), + clobber, + ) + res += ident + " );\n" + res += ident + " }\n" + res += ident + "}\n" + return res + + +def float_avx_composite_step(buf_name, log_n, from_it, to_it, ident=""): + return composite_step( + buf_name, + log_n, + from_it, + to_it, + 3, + ["ymm%d" % x for x in range(16)], + "vmovups", + [float_avx_0, float_avx_1, float_avx_2], + float_avx_3_etc, + ident, + ) + + +def double_avx_composite_step(buf_name, log_n, from_it, to_it, ident=""): + return 
composite_step( + buf_name, + log_n, + from_it, + to_it, + 2, + ["ymm%d" % x for x in range(16)], + "vmovupd", + [double_avx_0, double_avx_1], + double_avx_2_etc, + ident, + ) + + +def float_sse_composite_step(buf_name, log_n, from_it, to_it, ident=""): + return composite_step( + buf_name, + log_n, + from_it, + to_it, + 2, + ["xmm%d" % x for x in range(16)], + "movups", + [float_sse_0, float_sse_1], + float_sse_2_etc, + ident, + ) + + +def double_sse_composite_step(buf_name, log_n, from_it, to_it, ident=""): + return composite_step( + buf_name, + log_n, + from_it, + to_it, + 1, + ["xmm%d" % x for x in range(16)], + "movupd", + [double_sse_0], + double_sse_1_etc, + ident, + ) + + +NEON_VECTOR_REGS = [f"v{x}" for x in range(0, 32)] + + +def float_neon_composite_step(buf_name, log_n, from_it, to_it, ident=""): + return composite_step( + buf_name, + log_n, + from_it, + to_it, + 2, + NEON_VECTOR_REGS, + MOVE_INSTRUCTION_USE_NEON, + [float_neon_0, float_neon_1], + float_neon_2_etc, + ident, + ) + + +def plain_unmerged(type_name, log_n): + signature = "static inline void helper_%s_%d(%s *buf)" % ( + type_name, + log_n, + type_name, + ) + res = "%s;\n" % signature + res += "%s {\n" % signature + for i in range(log_n): + res += plain_step(type_name, "buf", log_n, i, " ") + res += "}\n" + return res + + +def greedy_merged(type_name, log_n, composite_step): + try: + composite_step("buf", log_n, 0, 0) + except Exception: + raise Exception("log_n is too small: %d" % log_n) + signature = "static inline void helper_%s_%d(%s *buf)" % ( + type_name, + log_n, + type_name, + ) + res = "%s;\n" % signature + res += "%s {\n" % signature + cur_it = 0 + while cur_it < log_n: + cur_to_it = log_n + while True: + try: + composite_step("buf", log_n, cur_it, cur_to_it) + break + except Exception as e: + print(f"warning: {e}") + cur_to_it -= 1 + continue + res += composite_step("buf", log_n, cur_it, cur_to_it, " ") + cur_it = cur_to_it + res += "}\n" + return res + + +def greedy_merged_recursive(type_name, log_n, threshold_step, composite_step): + if threshold_step > log_n: + raise Exception("threshold_step must be at most log_n") + try: + composite_step("buf", threshold_step, 0, 0) + except Exception: + raise Exception("threshold_step is too small: %d" % threshold_step) + signature = "void helper_%s_%d_recursive(%s *buf, int depth)" % ( + type_name, + log_n, + type_name, + ) + res = "%s;\n" % signature + res += "%s {\n" % signature + res += " if (depth == %d) {\n" % threshold_step + if threshold_step == log_n: + cur_it = 0 + while cur_it < threshold_step: + cur_to_it = threshold_step + while True: + try: + composite_step("buf", threshold_step, cur_it, cur_to_it) + break + except Exception: + cur_to_it -= 1 + continue + res += composite_step("buf", threshold_step, cur_it, cur_to_it, " ") + cur_it = cur_to_it + else: + res += " helper_%s_%d(buf);\n" % (type_name, threshold_step) + + res += " return;\n" + res += " }\n" + cur_it = threshold_step + while cur_it < log_n: + cur_to_it = log_n + while True: + try: + composite_step("buf", cur_to_it, cur_it, cur_to_it) + break + except Exception: + cur_to_it -= 1 + continue + res += " if (depth == %d) {\n" % cur_to_it + for i in range(1 << (cur_to_it - cur_it)): + res += " helper_%s_%d_recursive(buf + %d, %d);\n" % ( + type_name, + log_n, + i * (1 << cur_it), + cur_it, + ) + if cur_to_it < log_n: + res += " helper_%s_%d(buf);" % (type_name, cur_to_it) + else: + res += composite_step("buf", cur_to_it, cur_it, cur_to_it, " ") + res += " return;\n" + res += " }\n" + cur_it = 
cur_to_it + res += "}\n" + signature = "void helper_%s_%d(%s *buf)" % (type_name, log_n, type_name) + res += "%s;\n" % signature + res += "%s {\n" % signature + res += " helper_%s_%d_recursive(buf, %d);\n" % (type_name, log_n, log_n) + res += "}\n" + return res + + +def extract_time(data): + cpu_time = float(data["cpu_time"]) + time_unit = data["time_unit"] + if time_unit != "ns": + raise Exception("nanoseconds expected") + return cpu_time / 1e9 + + +def get_mean_stddev(): + with open("measurements/output.csv", "r") as csvfile: + reader = csv.reader(csvfile) + first = True + for row in reader: + if first: + header = row + first = False + else: + data = {} + for x, y in zip(header, row): + data[x] = y + if data["name"] == "benchmark_fht_mean": + mean = extract_time(data) + elif data["name"] == "benchmark_fht_stddev": + stddev = extract_time(data) + return mean + + +def measure_time(code, log_n, type_name, method_name, num_it=3): + if num_it % 2 == 0: + raise Exception("even number of runs: %d" % num_it) + with open("measurements/to_run.h", "w") as output: + output.write(code) + output.write("const int log_n = %d;\n" % log_n) + signature = "void run(%s *buf)" % type_name + output.write("%s;\n" % signature) + output.write("%s {\n" % signature) + output.write(" %s(buf);\n" % method_name) + output.write("}\n") + with open("/dev/null", "wb") as devnull: + code = subprocess.call( + "cd measurements && make run_%s" % type_name, shell=True, stdout=devnull + ) + if code != 0: + raise Exception("bad exit code") + code = subprocess.call( + "./measurements/run_%s --benchmark_repetitions=%d --benchmark_format=csv > ./measurements/output.csv" + % (type_name, num_it), + shell=True, + stderr=devnull, + ) + if code != 0: + raise Exception("bad exit code") + return get_mean_stddev() + + +# Configuration parameter; set to False if you want the absolute fastest code without regard to size. +CARE_ABOUT_CODE_SIZE = True + +# When CARE_ABOUT_CODE_SIZE, accept the smallest code that is not slower than +# MAX_PERFORMANCE_PENALTY_FOR_REDUCED_SIZE * the fastest time.
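+# For example, with the factor below set to 1.1, the generator keeps the
+# smallest variant whose measured time is within 10% of the fastest one.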
+MAX_PERFORMANCE_PENALTY_FOR_REDUCED_SIZE = 1.1 + + +if __name__ == "__main__": + final_code = '// @generated\n#include "fht.h"\n' + code_so_far = "" + hall_of_fame = [] + for type_name, composite_step_generator in [("float", float_neon_composite_step)]: + for log_n in range(1, max_log_n + 1): + sys.stdout.write("log_n = %d\n" % log_n) + times = [] + try: + (res, desc) = ( + greedy_merged(type_name, log_n, composite_step_generator), + "greedy_merged", + ) + except Exception: + (res, desc) = (plain_unmerged(type_name, log_n), "plain_unmerged") + time = measure_time( + code_so_far + res, log_n, type_name, "helper_%s_%d" % (type_name, log_n) + ) + code_size = res.count("\n") + times.append((time, res, code_size, desc)) + sys.stdout.write( + "log_n = %d; iterative; code_size = %d; time = %.10e\n" + % (log_n, code_size, time) + ) + for threshold_step in range(1, log_n + 1): + try: + res = greedy_merged_recursive( + type_name, log_n, threshold_step, composite_step_generator + ) + time = measure_time( + code_so_far + res, + log_n, + type_name, + "helper_%s_%d" % (type_name, log_n), + ) + code_size = res.count("\n") + times.append( + ( + time, + res, + code_size, + "greedy_merged_recursive %d" % threshold_step, + ) + ) + sys.stdout.write( + "log_n = %d; threshold_step = %d; code_size = %d; time = %.10e\n" + % (log_n, threshold_step, code_size, time) + ) + except Exception as e: + sys.stdout.write(f"FAIL: {threshold_step} ({e})\n") + if CARE_ABOUT_CODE_SIZE: + fastest_time = min(times)[0] + times_by_size = sorted(times, key=lambda x: x[2]) + for x in times_by_size: + if x[0] <= fastest_time * MAX_PERFORMANCE_PENALTY_FOR_REDUCED_SIZE: + smallest_acceptable = x + break + (best_time, best_code, best_code_size, best_desc) = smallest_acceptable + else: + (best_time, best_code, best_code_size, best_desc) = min(times) + hall_of_fame.append((type_name, log_n, best_time, best_desc)) + final_code += best_code + code_so_far += best_code + sys.stdout.write( + "log_n = %d; best_time = %.10e; %s\n" % (log_n, best_time, best_desc) + ) + final_code += "int fht_%s(%s *buf, int log_n) {\n" % (type_name, type_name) + final_code += " if (log_n == 0) {\n" + final_code += " return 0;\n" + final_code += " }\n" + for i in range(1, max_log_n + 1): + final_code += " if (log_n == %d) {\n" % i + final_code += " helper_%s_%d(buf);\n" % (type_name, i) + final_code += " return 0;\n" + final_code += " }\n" + final_code += " return 1;\n" + final_code += "}\n" + with open("fht_neon.c", "w") as output: + output.write(final_code) + sys.stdout.write("hall of fame\n") + with open("hall_of_fame_neon.txt", "w") as hof: + for type_name, log_n, best_time, best_desc in hall_of_fame: + s = "type_name = %s; log_n = %d; best_time = %.10e; best_desc = %s\n" % ( + type_name, + log_n, + best_time, + best_desc, + ) + sys.stdout.write(s) + hof.write(s) diff --git a/extension/llm/custom_ops/spinquant/third-party/FFHT/hall_of_fame_avx.txt b/extension/llm/custom_ops/spinquant/third-party/FFHT/hall_of_fame_avx.txt new file mode 100644 index 00000000000..316ac08e5bc --- /dev/null +++ b/extension/llm/custom_ops/spinquant/third-party/FFHT/hall_of_fame_avx.txt @@ -0,0 +1,60 @@ +type_name = float; log_n = 1; best_time = 3.8273900000e-08; best_desc = plain_unmerged +type_name = float; log_n = 2; best_time = 3.8694400000e-08; best_desc = plain_unmerged +type_name = float; log_n = 3; best_time = 4.4120100000e-08; best_desc = greedy_merged +type_name = float; log_n = 4; best_time = 4.6617800000e-08; best_desc = greedy_merged +type_name = float; log_n = 5; 
best_time = 4.8970800000e-08; best_desc = greedy_merged +type_name = float; log_n = 6; best_time = 5.3648500000e-08; best_desc = greedy_merged +type_name = float; log_n = 7; best_time = 7.1866600000e-08; best_desc = greedy_merged_recursive 7 +type_name = float; log_n = 8; best_time = 1.0585600000e-07; best_desc = greedy_merged_recursive 6 +type_name = float; log_n = 9; best_time = 1.7403800000e-07; best_desc = greedy_merged +type_name = float; log_n = 10; best_time = 3.4412700000e-07; best_desc = greedy_merged_recursive 10 +type_name = float; log_n = 11; best_time = 6.5679200000e-07; best_desc = greedy_merged_recursive 11 +type_name = float; log_n = 12; best_time = 1.3143800000e-06; best_desc = greedy_merged +type_name = float; log_n = 13; best_time = 2.8488300000e-06; best_desc = greedy_merged_recursive 11 +type_name = float; log_n = 14; best_time = 6.1163700000e-06; best_desc = greedy_merged_recursive 12 +type_name = float; log_n = 15; best_time = 1.3664400000e-05; best_desc = greedy_merged_recursive 13 +type_name = float; log_n = 16; best_time = 3.0120900000e-05; best_desc = greedy_merged_recursive 13 +type_name = float; log_n = 17; best_time = 6.5561000000e-05; best_desc = greedy_merged_recursive 12 +type_name = float; log_n = 18; best_time = 1.4176100000e-04; best_desc = greedy_merged_recursive 12 +type_name = float; log_n = 19; best_time = 3.0320000000e-04; best_desc = greedy_merged_recursive 13 +type_name = float; log_n = 20; best_time = 6.7070400000e-04; best_desc = greedy_merged_recursive 12 +type_name = float; log_n = 21; best_time = 1.4708400000e-03; best_desc = greedy_merged_recursive 9 +type_name = float; log_n = 22; best_time = 3.9836500000e-03; best_desc = greedy_merged_recursive 11 +type_name = float; log_n = 23; best_time = 8.8893400000e-03; best_desc = greedy_merged_recursive 9 +type_name = float; log_n = 24; best_time = 1.9483500000e-02; best_desc = greedy_merged_recursive 12 +type_name = float; log_n = 25; best_time = 4.5779600000e-02; best_desc = greedy_merged_recursive 7 +type_name = float; log_n = 26; best_time = 9.7643700000e-02; best_desc = greedy_merged_recursive 12 +type_name = float; log_n = 27; best_time = 2.1200800000e-01; best_desc = greedy_merged_recursive 12 +type_name = float; log_n = 28; best_time = 4.9995900000e-01; best_desc = greedy_merged_recursive 7 +type_name = float; log_n = 29; best_time = 1.0615600000e+00; best_desc = greedy_merged_recursive 12 +type_name = float; log_n = 30; best_time = 2.2532100000e+00; best_desc = greedy_merged_recursive 6 +type_name = double; log_n = 1; best_time = 3.8275100000e-08; best_desc = plain_unmerged +type_name = double; log_n = 2; best_time = 4.1286300000e-08; best_desc = greedy_merged +type_name = double; log_n = 3; best_time = 4.3412600000e-08; best_desc = greedy_merged +type_name = double; log_n = 4; best_time = 4.5500500000e-08; best_desc = greedy_merged_recursive 4 +type_name = double; log_n = 5; best_time = 4.9231800000e-08; best_desc = greedy_merged +type_name = double; log_n = 6; best_time = 6.2857100000e-08; best_desc = greedy_merged +type_name = double; log_n = 7; best_time = 8.9013300000e-08; best_desc = greedy_merged +type_name = double; log_n = 8; best_time = 1.4163900000e-07; best_desc = greedy_merged +type_name = double; log_n = 9; best_time = 2.7611500000e-07; best_desc = greedy_merged +type_name = double; log_n = 10; best_time = 5.2217100000e-07; best_desc = greedy_merged +type_name = double; log_n = 11; best_time = 1.0466200000e-06; best_desc = greedy_merged +type_name = double; log_n = 12; 
best_time = 2.3401300000e-06; best_desc = greedy_merged_recursive 11 +type_name = double; log_n = 13; best_time = 5.0560300000e-06; best_desc = greedy_merged_recursive 11 +type_name = double; log_n = 14; best_time = 1.1394900000e-05; best_desc = greedy_merged_recursive 12 +type_name = double; log_n = 15; best_time = 2.5470800000e-05; best_desc = greedy_merged_recursive 12 +type_name = double; log_n = 16; best_time = 5.7387600000e-05; best_desc = greedy_merged_recursive 11 +type_name = double; log_n = 17; best_time = 1.2497400000e-04; best_desc = greedy_merged_recursive 11 +type_name = double; log_n = 18; best_time = 2.6934700000e-04; best_desc = greedy_merged_recursive 12 +type_name = double; log_n = 19; best_time = 6.0233800000e-04; best_desc = greedy_merged_recursive 11 +type_name = double; log_n = 20; best_time = 1.3345100000e-03; best_desc = greedy_merged_recursive 9 +type_name = double; log_n = 21; best_time = 3.6883500000e-03; best_desc = greedy_merged_recursive 7 +type_name = double; log_n = 22; best_time = 8.6217800000e-03; best_desc = greedy_merged_recursive 11 +type_name = double; log_n = 23; best_time = 1.9016200000e-02; best_desc = greedy_merged_recursive 11 +type_name = double; log_n = 24; best_time = 4.5002100000e-02; best_desc = greedy_merged_recursive 10 +type_name = double; log_n = 25; best_time = 9.7061600000e-02; best_desc = greedy_merged_recursive 8 +type_name = double; log_n = 26; best_time = 2.1355500000e-01; best_desc = greedy_merged_recursive 11 +type_name = double; log_n = 27; best_time = 4.8974200000e-01; best_desc = greedy_merged_recursive 9 +type_name = double; log_n = 28; best_time = 1.0586200000e+00; best_desc = greedy_merged_recursive 11 +type_name = double; log_n = 29; best_time = 2.1763100000e+00; best_desc = greedy_merged_recursive 11 +type_name = double; log_n = 30; best_time = 4.8801600000e+00; best_desc = greedy_merged_recursive 9 diff --git a/extension/llm/custom_ops/spinquant/third-party/FFHT/hall_of_fame_neon.txt b/extension/llm/custom_ops/spinquant/third-party/FFHT/hall_of_fame_neon.txt new file mode 100644 index 00000000000..547009956e5 --- /dev/null +++ b/extension/llm/custom_ops/spinquant/third-party/FFHT/hall_of_fame_neon.txt @@ -0,0 +1,30 @@ +type_name = float; log_n = 1; best_time = 4.1929000000e-08; best_desc = plain_unmerged +type_name = float; log_n = 2; best_time = 4.1758100000e-08; best_desc = greedy_merged +type_name = float; log_n = 3; best_time = 4.2130400000e-08; best_desc = greedy_merged_recursive 2 +type_name = float; log_n = 4; best_time = 4.1849300000e-08; best_desc = greedy_merged_recursive 3 +type_name = float; log_n = 5; best_time = 4.2931300000e-08; best_desc = greedy_merged_recursive 4 +type_name = float; log_n = 6; best_time = 4.5379000000e-08; best_desc = greedy_merged_recursive 3 +type_name = float; log_n = 7; best_time = 6.4887900000e-08; best_desc = greedy_merged_recursive 3 +type_name = float; log_n = 8; best_time = 1.0970500000e-07; best_desc = greedy_merged +type_name = float; log_n = 9; best_time = 2.2306600000e-07; best_desc = greedy_merged_recursive 8 +type_name = float; log_n = 10; best_time = 4.4169300000e-07; best_desc = greedy_merged_recursive 8 +type_name = float; log_n = 11; best_time = 9.7532700000e-07; best_desc = greedy_merged_recursive 10 +type_name = float; log_n = 12; best_time = 1.9247200000e-06; best_desc = greedy_merged_recursive 10 +type_name = float; log_n = 13; best_time = 3.6199200000e-06; best_desc = greedy_merged +type_name = float; log_n = 14; best_time = 8.4450100000e-06; best_desc = 
greedy_merged_recursive 10 +type_name = float; log_n = 15; best_time = 1.6781100000e-05; best_desc = greedy_merged_recursive 13 +type_name = float; log_n = 16; best_time = 3.7584000000e-05; best_desc = greedy_merged_recursive 15 +type_name = float; log_n = 17; best_time = 7.6645500000e-05; best_desc = greedy_merged_recursive 15 +type_name = float; log_n = 18; best_time = 1.7394400000e-04; best_desc = greedy_merged_recursive 17 +type_name = float; log_n = 19; best_time = 3.9186900000e-04; best_desc = greedy_merged_recursive 18 +type_name = float; log_n = 20; best_time = 8.0344800000e-04; best_desc = greedy_merged_recursive 18 +type_name = float; log_n = 21; best_time = 1.8539700000e-03; best_desc = greedy_merged_recursive 20 +type_name = float; log_n = 22; best_time = 3.6448200000e-03; best_desc = greedy_merged_recursive 20 +type_name = float; log_n = 23; best_time = 8.4403500000e-03; best_desc = greedy_merged_recursive 22 +type_name = float; log_n = 24; best_time = 1.8726400000e-02; best_desc = greedy_merged_recursive 23 +type_name = float; log_n = 25; best_time = 3.8848300000e-02; best_desc = greedy_merged_recursive 23 +type_name = float; log_n = 26; best_time = 8.6437100000e-02; best_desc = greedy_merged_recursive 25 +type_name = float; log_n = 27; best_time = 1.9369800000e-01; best_desc = greedy_merged_recursive 26 +type_name = float; log_n = 28; best_time = 3.9619200000e-01; best_desc = greedy_merged_recursive 26 +type_name = float; log_n = 29; best_time = 1.0401300000e+00; best_desc = greedy_merged_recursive 28 +type_name = float; log_n = 30; best_time = 2.0733800000e+00; best_desc = greedy_merged_recursive 29 diff --git a/extension/llm/custom_ops/spinquant/third-party/FFHT/hall_of_fame_sse.txt b/extension/llm/custom_ops/spinquant/third-party/FFHT/hall_of_fame_sse.txt new file mode 100644 index 00000000000..67c1d5cfe0d --- /dev/null +++ b/extension/llm/custom_ops/spinquant/third-party/FFHT/hall_of_fame_sse.txt @@ -0,0 +1,60 @@ +type_name = float; log_n = 1; best_time = 3.8234100000e-08; best_desc = plain_unmerged +type_name = float; log_n = 2; best_time = 3.9592000000e-08; best_desc = greedy_merged +type_name = float; log_n = 3; best_time = 4.2633300000e-08; best_desc = greedy_merged +type_name = float; log_n = 4; best_time = 4.5965700000e-08; best_desc = greedy_merged +type_name = float; log_n = 5; best_time = 5.2128100000e-08; best_desc = greedy_merged +type_name = float; log_n = 6; best_time = 6.9547900000e-08; best_desc = greedy_merged +type_name = float; log_n = 7; best_time = 1.0289400000e-07; best_desc = greedy_merged_recursive 7 +type_name = float; log_n = 8; best_time = 1.7410400000e-07; best_desc = greedy_merged +type_name = float; log_n = 9; best_time = 3.5127900000e-07; best_desc = greedy_merged +type_name = float; log_n = 10; best_time = 6.8896800000e-07; best_desc = greedy_merged +type_name = float; log_n = 11; best_time = 1.3963700000e-06; best_desc = greedy_merged +type_name = float; log_n = 12; best_time = 3.0889100000e-06; best_desc = greedy_merged_recursive 7 +type_name = float; log_n = 13; best_time = 6.3768900000e-06; best_desc = greedy_merged_recursive 11 +type_name = float; log_n = 14; best_time = 1.3732600000e-05; best_desc = greedy_merged_recursive 11 +type_name = float; log_n = 15; best_time = 2.8962800000e-05; best_desc = greedy_merged_recursive 13 +type_name = float; log_n = 16; best_time = 6.2055900000e-05; best_desc = greedy_merged_recursive 11 +type_name = float; log_n = 17; best_time = 1.3487500000e-04; best_desc = greedy_merged_recursive 11 +type_name 
= float; log_n = 18; best_time = 2.7989100000e-04; best_desc = greedy_merged_recursive 13 +type_name = float; log_n = 19; best_time = 5.9871200000e-04; best_desc = greedy_merged_recursive 13 +type_name = float; log_n = 20; best_time = 1.3111100000e-03; best_desc = greedy_merged_recursive 8 +type_name = float; log_n = 21; best_time = 2.7614800000e-03; best_desc = greedy_merged_recursive 13 +type_name = float; log_n = 22; best_time = 6.4353000000e-03; best_desc = greedy_merged_recursive 11 +type_name = float; log_n = 23; best_time = 1.4471700000e-02; best_desc = greedy_merged_recursive 6 +type_name = float; log_n = 24; best_time = 3.1766100000e-02; best_desc = greedy_merged_recursive 15 +type_name = float; log_n = 25; best_time = 6.9094300000e-02; best_desc = greedy_merged_recursive 8 +type_name = float; log_n = 26; best_time = 1.4882800000e-01; best_desc = greedy_merged_recursive 5 +type_name = float; log_n = 27; best_time = 3.1941300000e-01; best_desc = greedy_merged_recursive 12 +type_name = float; log_n = 28; best_time = 6.9037700000e-01; best_desc = greedy_merged_recursive 16 +type_name = float; log_n = 29; best_time = 1.4692400000e+00; best_desc = greedy_merged_recursive 12 +type_name = float; log_n = 30; best_time = 3.0698600000e+00; best_desc = greedy_merged_recursive 12 +type_name = double; log_n = 1; best_time = 3.8948100000e-08; best_desc = greedy_merged +type_name = double; log_n = 2; best_time = 4.0811300000e-08; best_desc = greedy_merged_recursive 2 +type_name = double; log_n = 3; best_time = 4.3109200000e-08; best_desc = greedy_merged_recursive 3 +type_name = double; log_n = 4; best_time = 4.9621200000e-08; best_desc = greedy_merged +type_name = double; log_n = 5; best_time = 6.3119400000e-08; best_desc = greedy_merged_recursive 2 +type_name = double; log_n = 6; best_time = 9.2256300000e-08; best_desc = greedy_merged +type_name = double; log_n = 7; best_time = 1.5220200000e-07; best_desc = greedy_merged +type_name = double; log_n = 8; best_time = 2.9771700000e-07; best_desc = greedy_merged_recursive 2 +type_name = double; log_n = 9; best_time = 6.0368400000e-07; best_desc = greedy_merged_recursive 6 +type_name = double; log_n = 10; best_time = 1.2246100000e-06; best_desc = greedy_merged_recursive 10 +type_name = double; log_n = 11; best_time = 2.6907000000e-06; best_desc = greedy_merged_recursive 2 +type_name = double; log_n = 12; best_time = 5.6900800000e-06; best_desc = greedy_merged_recursive 10 +type_name = double; log_n = 13; best_time = 1.2392900000e-05; best_desc = greedy_merged +type_name = double; log_n = 14; best_time = 2.6329500000e-05; best_desc = greedy_merged_recursive 9 +type_name = double; log_n = 15; best_time = 5.6564400000e-05; best_desc = greedy_merged_recursive 10 +type_name = double; log_n = 16; best_time = 1.2357300000e-04; best_desc = greedy_merged_recursive 2 +type_name = double; log_n = 17; best_time = 2.5763800000e-04; best_desc = greedy_merged_recursive 12 +type_name = double; log_n = 18; best_time = 5.5563300000e-04; best_desc = greedy_merged_recursive 12 +type_name = double; log_n = 19; best_time = 1.2115600000e-03; best_desc = greedy_merged_recursive 2 +type_name = double; log_n = 20; best_time = 2.5899100000e-03; best_desc = greedy_merged_recursive 12 +type_name = double; log_n = 21; best_time = 6.0839900000e-03; best_desc = greedy_merged_recursive 13 +type_name = double; log_n = 22; best_time = 1.3738100000e-02; best_desc = greedy_merged_recursive 2 +type_name = double; log_n = 23; best_time = 3.0164700000e-02; best_desc = 
greedy_merged_recursive 2 +type_name = double; log_n = 24; best_time = 6.6689900000e-02; best_desc = greedy_merged_recursive 13 +type_name = double; log_n = 25; best_time = 1.4307200000e-01; best_desc = greedy_merged_recursive 10 +type_name = double; log_n = 26; best_time = 3.0875700000e-01; best_desc = greedy_merged_recursive 5 +type_name = double; log_n = 27; best_time = 6.7026700000e-01; best_desc = greedy_merged_recursive 6 +type_name = double; log_n = 28; best_time = 1.4210300000e+00; best_desc = greedy_merged_recursive 14 +type_name = double; log_n = 29; best_time = 3.0175300000e+00; best_desc = greedy_merged_recursive 9 +type_name = double; log_n = 30; best_time = 6.4575800000e+00; best_desc = greedy_merged_recursive 3 diff --git a/extension/llm/custom_ops/spinquant/third-party/FFHT/measurements/Makefile b/extension/llm/custom_ops/spinquant/third-party/FFHT/measurements/Makefile new file mode 100644 index 00000000000..807d5fe626b --- /dev/null +++ b/extension/llm/custom_ops/spinquant/third-party/FFHT/measurements/Makefile @@ -0,0 +1,13 @@ +CXX=g++ +CXX_FLAGS=-O3 -Wall -march=native -std=c++11 `pkg-config benchmark --cflags --libs` -lpthread + +.PHONY: run_float run_double clean + +run_float: + $(CXX) run_float.cpp -o run_float $(CXX_FLAGS) + +run_double: + $(CXX) run_double.cpp -o run_double $(CXX_FLAGS) + +clean: + rm -rf run_float run_double diff --git a/extension/llm/custom_ops/spinquant/third-party/FFHT/measurements/run_double.cpp b/extension/llm/custom_ops/spinquant/third-party/FFHT/measurements/run_double.cpp new file mode 100644 index 00000000000..711456a1f7a --- /dev/null +++ b/extension/llm/custom_ops/spinquant/third-party/FFHT/measurements/run_double.cpp @@ -0,0 +1,27 @@ +#include "to_run.h" + +#include <stdlib.h> + +#include <chrono> +#include <stdexcept> + +#include <benchmark/benchmark.h> + +static void benchmark_fht(benchmark::State &state) { + double *buf; + if (posix_memalign((void**)&buf, 32, sizeof(double) * (1 << log_n))) { + throw std::runtime_error("posix_memalign failed"); + } + while (state.KeepRunning()) { + auto start = std::chrono::high_resolution_clock::now(); + run(buf); + auto end = std::chrono::high_resolution_clock::now(); + auto elapsed_seconds = std::chrono::duration_cast<std::chrono::duration<double>>(end - start); + state.SetIterationTime(elapsed_seconds.count()); + } + free(buf); +} + +BENCHMARK(benchmark_fht); + +BENCHMARK_MAIN(); diff --git a/extension/llm/custom_ops/spinquant/third-party/FFHT/measurements/run_float.cpp b/extension/llm/custom_ops/spinquant/third-party/FFHT/measurements/run_float.cpp new file mode 100644 index 00000000000..d84159d6ca7 --- /dev/null +++ b/extension/llm/custom_ops/spinquant/third-party/FFHT/measurements/run_float.cpp @@ -0,0 +1,27 @@ +#include "to_run.h" + +#include <stdlib.h> + +#include <chrono> +#include <stdexcept> + +#include <benchmark/benchmark.h> + +static void benchmark_fht(benchmark::State &state) { + float *buf; + if (posix_memalign((void**)&buf, 32, sizeof(float) * (1 << log_n))) { + throw std::runtime_error("posix_memalign failed"); + } + while (state.KeepRunning()) { + auto start = std::chrono::high_resolution_clock::now(); + run(buf); + auto end = std::chrono::high_resolution_clock::now(); + auto elapsed_seconds = std::chrono::duration_cast<std::chrono::duration<double>>(end - start); + state.SetIterationTime(elapsed_seconds.count()); + } + free(buf); +} + +BENCHMARK(benchmark_fht); + +BENCHMARK_MAIN(); diff --git a/extension/llm/custom_ops/spinquant/third-party/FFHT/targets.bzl b/extension/llm/custom_ops/spinquant/third-party/FFHT/targets.bzl new file mode 100644 index 00000000000..9ba0ae32fb4 --- /dev/null +++
b/extension/llm/custom_ops/spinquant/third-party/FFHT/targets.bzl @@ -0,0 +1,34 @@ +load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") + +def define_common_targets(): + """Defines targets that should be shared between fbcode and xplat. + + The directory containing this targets.bzl file should also contain both + TARGETS and BUCK files that call this function. + """ + runtime.cxx_library( + name = "dumb_fht", + srcs = ["dumb_fht.c"], + exported_headers = ["dumb_fht.h"], + visibility = ["@EXECUTORCH_CLIENTS"], + ) + + runtime.cxx_library( + name = "fht", + srcs = select({ + "DEFAULT": [], + "ovr_config//cpu:arm64": ["fht_neon.c"], + "ovr_config//cpu:x86_64": ["fht_avx.c"], + }), + exported_headers = ["fht.h"], + visibility = ["@EXECUTORCH_CLIENTS"], + ) + + runtime.cxx_binary( + name = "test_float", + srcs = ["test_float.c"], + deps = [ + ":dumb_fht", + ":fht", + ], + ) diff --git a/extension/llm/custom_ops/spinquant/third-party/FFHT/test_double.c b/extension/llm/custom_ops/spinquant/third-party/FFHT/test_double.c new file mode 100644 index 00000000000..f532ae6e2ff --- /dev/null +++ b/extension/llm/custom_ops/spinquant/third-party/FFHT/test_double.c @@ -0,0 +1,68 @@ +#include <math.h> +#include <stdio.h> +#include <stdlib.h> +#include <time.h> + +#include "fht.h" + +void dumb_fht(double *buf, int log_n); +void dumb_fht(double *buf, int log_n) { + int n = 1 << log_n; + for (int i = 0; i < log_n; ++i) { + int s1 = 1 << i; + int s2 = s1 << 1; + for (int j = 0; j < n; j += s2) { + for (int k = 0; k < s1; ++k) { + double u = buf[j + k]; + double v = buf[j + k + s1]; + buf[j + k] = u + v; + buf[j + k + s1] = u - v; + } + } + } +} + +int main(void) { + srand(4057218); + for (int log_n = 1; log_n <= 30; ++log_n) { + printf("%d ", log_n); + int n = 1 << log_n; + void *buf = malloc(sizeof(double) * n + 32); + char *start = buf; + while ((size_t)start % 32 != 0) start = start + 1; + double *a = (double*)start; + double *aux = (double*)malloc(sizeof(double) * n); + for (int i = 0; i < n; ++i) { + a[i] = 1.0 - 2.0 * (rand() & 1); + aux[i] = a[i]; + } + fht_double(a, log_n); + dumb_fht(aux, log_n); + double max_error = 0.0; + for (int i = 0; i < n; ++i) { + double error = fabs(a[i] - aux[i]); + if (error > max_error) { + max_error = error; + } + } + if (max_error > 1e-5) { + printf("ERROR: %.10lf\n", max_error); + return 1; + } + for (int num_it = 10;; num_it *= 2) { + clock_t tt1 = clock(); + for (int it = 0; it < num_it; ++it) { + fht_double(a, log_n); + } + clock_t tt2 = clock(); + double sec = (tt2 - tt1) / (CLOCKS_PER_SEC + 0.0); + if (sec >= 1.0) { + printf("%.10e\n", sec / (num_it + 0.0)); + break; + } + } + free(buf); + free(aux); + } + return 0; +} diff --git a/extension/llm/custom_ops/spinquant/third-party/FFHT/test_float.c b/extension/llm/custom_ops/spinquant/third-party/FFHT/test_float.c new file mode 100644 index 00000000000..4e39d6aff46 --- /dev/null +++ b/extension/llm/custom_ops/spinquant/third-party/FFHT/test_float.c @@ -0,0 +1,53 @@ +#include <math.h> +#include <stdio.h> +#include <stdlib.h> +#include <time.h> + +#include "dumb_fht.h" +#include "fht.h" + +int main(void) { + srand(4057218); + for (int log_n = 1; log_n <= 30; ++log_n) { + printf("%d ", log_n); + int n = 1 << log_n; + void* buf = malloc(sizeof(float) * n + 32); + char* start = buf; + while ((size_t)start % 32 != 0) + start = start + 1; + float* a = (float*)start; + float* aux = (float*)malloc(sizeof(double) * n); + for (int i = 0; i < n; ++i) { + a[i] = 1.0 - 2.0 * (rand() & 1); + aux[i] = a[i]; + } + fht_float(a, log_n); + dumb_fht(aux, log_n); + double max_error = 0.0; + for
(int i = 0; i < n; ++i) { + double error = fabs(a[i] - aux[i]); + if (error > max_error) { + max_error = error; + } + } + if (max_error > 1e-5) { + printf("ERROR: %.10lf\n", max_error); + return 1; + } + for (int num_it = 10;; num_it *= 2) { + clock_t tt1 = clock(); + for (int it = 0; it < num_it; ++it) { + fht_float(a, log_n); + } + clock_t tt2 = clock(); + double sec = (tt2 - tt1) / (CLOCKS_PER_SEC + 0.0); + if (sec >= 1.0) { + printf("%.10e\n", sec / (num_it + 0.0)); + break; + } + } + free(buf); + free(aux); + } + return 0; +} diff --git a/extension/llm/custom_ops/targets.bzl b/extension/llm/custom_ops/targets.bzl index ded25054acc..c2843f5c2f7 100644 --- a/extension/llm/custom_ops/targets.bzl +++ b/extension/llm/custom_ops/targets.bzl @@ -9,8 +9,18 @@ def define_common_targets(): for mkl_dep in ["", "_mkl_noomp"]: runtime.cxx_library( name = "custom_ops" + mkl_dep, - srcs = ["op_sdpa.cpp", "op_fallback.cpp"], - exported_headers = ["op_sdpa.h", "op_fallback.h"], + srcs = [ + "op_fallback.cpp", + "op_fast_hadamard_transform.cpp", + "op_sdpa.cpp", + "op_update_quantized_cache.cpp", + ], + exported_headers = [ + "op_fallback.h", + "op_fast_hadamard_transform.h", + "op_sdpa.h", + "op_update_quantized_cache.h", + ], exported_deps = [ "//executorch/runtime/kernel:kernel_includes", "//executorch/kernels/portable/cpu:scalar_utils", @@ -20,6 +30,10 @@ def define_common_targets(): "//executorch/extension/parallel:thread_parallel", "//executorch/extension/threadpool:threadpool", ], + deps = [ + "//executorch/kernels/portable/cpu/util:reduce_util", + "//executorch/extension/llm/custom_ops/spinquant:fast_hadamard_transform", + ], compiler_flags = ["-Wno-missing-prototypes", "-Wno-global-constructors"], visibility = [ "//executorch/...", @@ -34,8 +48,13 @@ def define_common_targets(): runtime.cxx_library( name = "custom_ops_aot_lib" + mkl_dep, srcs = [ + "op_fast_hadamard_transform_aten.cpp", "op_sdpa_aot.cpp", + "op_tile_crop.cpp", + "op_tile_crop_aot.cpp", ], + headers = ["op_tile_crop.h"], + compiler_flags = ["-Wno-global-constructors"], visibility = [ "//executorch/...", "@EXECUTORCH_CLIENTS", @@ -106,6 +125,20 @@ def define_common_targets(): ], ) + runtime.python_library( + name = "model_sharding_py", + srcs = [ + "model_sharding.py", + ], + visibility = [ + "//executorch/...", + "@EXECUTORCH_CLIENTS", + ], + deps = [ + "//caffe2:torch", + ], + ) + runtime.cxx_library( name = "op_tile_crop", srcs = ["op_tile_crop.cpp"], diff --git a/extension/llm/custom_ops/test_sdpa_with_kv_cache.py b/extension/llm/custom_ops/test_sdpa_with_kv_cache.py index dd63c68f138..bfd64cb8975 100644 --- a/extension/llm/custom_ops/test_sdpa_with_kv_cache.py +++ b/extension/llm/custom_ops/test_sdpa_with_kv_cache.py @@ -373,10 +373,10 @@ class SDPATestCommon(unittest.TestCase): def setup_caches(self): self.k_cache = torch.zeros( - (1, self.max_seq_len, self.n_heads_kv, self.head_dim) + (self.n_batch, self.max_seq_len, self.n_heads_kv, self.head_dim) ) self.v_cache = torch.zeros( - (1, self.max_seq_len, self.n_heads_kv, self.head_dim) + (self.n_batch, self.max_seq_len, self.n_heads_kv, self.head_dim) ) self.mask = torch.full( (self.max_seq_len, self.max_seq_len), @@ -386,6 +386,7 @@ def setup_caches(self): def setUp(self): torch.manual_seed(42) + self.n_batch = 5 self.n_heads_kv = 32 self.n_heads_q = 32 self.head_dim = 128 @@ -410,27 +411,27 @@ def _test_sdpa_common( scale_tensors=False, ): # Range arbitrarily chosen to reproduce a numerical error on x86 in some of the long context tests - tensor_scale_max = 20 - 
tensor_scale_min = -20 + tensor_scale_max = 15 + tensor_scale_min = -15 self.n_heads_kv = n_heads_kv self.n_heads_q = n_heads_q self.head_dim = head_dim self.max_seq_len = max_seq_len self.setup_caches() q = self._scale_tensor( - torch.rand((1, seq_len, self.n_heads_kv, self.head_dim)), + torch.rand((self.n_batch, seq_len, self.n_heads_kv, self.head_dim)), tensor_scale_max, tensor_scale_min, scale_tensors, ) k = self._scale_tensor( - torch.rand((1, seq_len, self.n_heads_kv, self.head_dim)), + torch.rand((self.n_batch, seq_len, self.n_heads_kv, self.head_dim)), tensor_scale_max, tensor_scale_min, scale_tensors, ) v = self._scale_tensor( - torch.rand((1, seq_len, self.n_heads_kv, self.head_dim)), + torch.rand((self.n_batch, seq_len, self.n_heads_kv, self.head_dim)), tensor_scale_max, tensor_scale_min, scale_tensors, @@ -448,19 +449,25 @@ def _test_sdpa_common( self.assertTrue(torch.allclose(ref_output, op_output, atol=1e-6)) q = self._scale_tensor( - torch.rand((1, next_iter_seq_len, self.n_heads_kv, self.head_dim)), + torch.rand( + (self.n_batch, next_iter_seq_len, self.n_heads_kv, self.head_dim) + ), tensor_scale_max, tensor_scale_min, scale_tensors, ) k = self._scale_tensor( - torch.rand((1, next_iter_seq_len, self.n_heads_kv, self.head_dim)), + torch.rand( + (self.n_batch, next_iter_seq_len, self.n_heads_kv, self.head_dim) + ), tensor_scale_max, tensor_scale_min, scale_tensors, ) v = self._scale_tensor( - torch.rand((1, next_iter_seq_len, self.n_heads_kv, self.head_dim)), + torch.rand( + (self.n_batch, next_iter_seq_len, self.n_heads_kv, self.head_dim) + ), tensor_scale_max, tensor_scale_min, scale_tensors, diff --git a/extension/llm/custom_ops/test_update_quantized_cache.py b/extension/llm/custom_ops/test_update_quantized_cache.py new file mode 100644 index 00000000000..75e1f4cc6ae --- /dev/null +++ b/extension/llm/custom_ops/test_update_quantized_cache.py @@ -0,0 +1,184 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
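+
+# These tests compare torch.ops.llama.update_quantized_cache against a plain
+# slice-assignment reference that writes new values, scales, and zero points
+# into the quantized k/v caches at the given start_pos.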
+ +# pyre-unsafe + +import unittest + +import torch + + +class UpdateQuantizedKVCacheTest(unittest.TestCase): + + def _reset(self): + self.quantized_k_cache = torch.zeros( + (self.batch_size, self.seq_len, self.num_heads, self.head_dim), + dtype=torch.int8, + ) + self.quantized_v_cache = torch.zeros( + (self.batch_size, self.seq_len, self.num_heads, self.head_dim), + dtype=torch.int8, + ) + self.k_scales_cache = torch.zeros( + (self.batch_size, self.seq_len, self.num_heads, 1), dtype=torch.float64 + ) + self.v_scales_cache = torch.zeros( + (self.batch_size, self.seq_len, self.num_heads, 1), dtype=torch.float64 + ) + self.k_zero_points_cache = torch.zeros( + (self.batch_size, self.seq_len, self.num_heads, 1), dtype=torch.int64 + ) + self.v_zero_points_cache = torch.zeros( + (self.batch_size, self.seq_len, self.num_heads, 1), dtype=torch.int64 + ) + + def setUp(self): + torch.manual_seed(42) + self.batch_size = 1 + self.seq_len = 10 + self.num_heads = 8 + self.head_dim = 4 + self._reset() + + def _update_k(self, start_pos, value, scales, zero_points): + seq_len = value.size(1) + self.quantized_k_cache[:, start_pos : start_pos + seq_len, :, :] = value + self.k_scales_cache[:, start_pos : start_pos + seq_len, :, :] = scales + self.k_zero_points_cache[:, start_pos : start_pos + seq_len, :, :] = zero_points + + def _update_v(self, start_pos, value, scales, zero_points): + seq_len = value.size(1) + self.quantized_v_cache[:, start_pos : start_pos + seq_len, :, :] = value + self.v_scales_cache[:, start_pos : start_pos + seq_len, :, :] = scales + self.v_zero_points_cache[:, start_pos : start_pos + seq_len, :, :] = zero_points + + def _update_and_validate( + self, k, v, k_scales, v_scales, k_zero_points, v_zero_points, start_pos + ): + k_cache = self.quantized_k_cache.clone() + v_cache = self.quantized_v_cache.clone() + k_scales_cache = self.k_scales_cache.clone() + v_scales_cache = self.v_scales_cache.clone() + k_zero_points_cache = self.k_zero_points_cache.clone() + v_zero_points_cache = self.v_zero_points_cache.clone() + self._update_k(start_pos, k, k_scales, k_zero_points) + self._update_v(start_pos, v, v_scales, v_zero_points) + + torch.ops.llama.update_quantized_cache(k, k_cache, start_pos) + torch.ops.llama.update_quantized_cache(k_scales, k_scales_cache, start_pos) + torch.ops.llama.update_quantized_cache( + k_zero_points, k_zero_points_cache, start_pos + ) + + torch.ops.llama.update_quantized_cache(v, v_cache, start_pos) + torch.ops.llama.update_quantized_cache(v_scales, v_scales_cache, start_pos) + torch.ops.llama.update_quantized_cache( + v_zero_points, v_zero_points_cache, start_pos + ) + + self.assertTrue(torch.allclose(k_cache, self.quantized_k_cache)) + self.assertTrue(torch.allclose(v_cache, self.quantized_v_cache)) + self.assertTrue(torch.allclose(k_scales_cache, self.k_scales_cache)) + self.assertTrue(torch.allclose(v_scales_cache, self.v_scales_cache)) + self.assertTrue(torch.allclose(k_zero_points_cache, self.k_zero_points_cache)) + self.assertTrue(torch.allclose(v_zero_points_cache, self.v_zero_points_cache)) + + def test_update_kv_cache_simple(self): + k = torch.randint(0, 50, (1, 1, 8, 4), dtype=torch.int8) + v = torch.randint(0, 50, (1, 1, 8, 4), dtype=torch.int8) + k_scales = torch.rand((1, 1, 8, 1), dtype=torch.float64) + v_scales = torch.rand((1, 1, 8, 1), dtype=torch.float64) + k_zero_points = torch.randint(0, 20, (1, 1, 8, 1), dtype=torch.int64) + v_zero_points = torch.randint(0, 20, (1, 1, 8, 1), dtype=torch.int64) + start_pos = 0 + self._update_and_validate( + k, v, 
k_scales, v_scales, k_zero_points, v_zero_points, start_pos + ) + + def test_update_kv_cache_large_update(self): + self._reset() + k = torch.randint(0, 50, (1, 3, 8, 4), dtype=torch.int8) + v = torch.randint(0, 50, (1, 3, 8, 4), dtype=torch.int8) + k_scales = torch.rand((1, 3, 8, 1), dtype=torch.float64) + v_scales = torch.rand((1, 3, 8, 1), dtype=torch.float64) + k_zero_points = torch.randint(0, 20, (1, 3, 8, 1), dtype=torch.int64) + v_zero_points = torch.randint(0, 20, (1, 3, 8, 1), dtype=torch.int64) + start_pos = 0 + self._update_and_validate( + k, v, k_scales, v_scales, k_zero_points, v_zero_points, start_pos + ) + + def test_update_kv_cache_update_nonzero_offset(self): + self._reset() + k = torch.randint(0, 50, (1, 1, 8, 4), dtype=torch.int8) + v = torch.randint(0, 50, (1, 1, 8, 4), dtype=torch.int8) + k_scales = torch.rand((1, 1, 8, 1), dtype=torch.float64) + v_scales = torch.rand((1, 1, 8, 1), dtype=torch.float64) + k_zero_points = torch.randint(0, 20, (1, 1, 8, 1), dtype=torch.int64) + v_zero_points = torch.randint(0, 20, (1, 1, 8, 1), dtype=torch.int64) + start_pos = 2 + self._update_and_validate( + k, v, k_scales, v_scales, k_zero_points, v_zero_points, start_pos + ) + + def test_update_kv_cache_more_updates(self): + self._reset() + k = torch.randint(0, 50, (1, 1, 8, 4), dtype=torch.int8) + v = torch.randint(0, 50, (1, 1, 8, 4), dtype=torch.int8) + k_scales = torch.rand((1, 1, 8, 1), dtype=torch.float64) + v_scales = torch.rand((1, 1, 8, 1), dtype=torch.float64) + k_zero_points = torch.randint(0, 20, (1, 1, 8, 1), dtype=torch.int64) + v_zero_points = torch.randint(0, 20, (1, 1, 8, 1), dtype=torch.int64) + start_pos = 2 + self._update_and_validate( + k, v, k_scales, v_scales, k_zero_points, v_zero_points, start_pos + ) + + k = torch.randint(0, 50, (1, 1, 8, 4), dtype=torch.int8) + v = torch.randint(0, 50, (1, 1, 8, 4), dtype=torch.int8) + k_scales = torch.rand((1, 1, 8, 1), dtype=torch.float64) + v_scales = torch.rand((1, 1, 8, 1), dtype=torch.float64) + k_zero_points = torch.randint(0, 20, (1, 1, 8, 1), dtype=torch.int64) + v_zero_points = torch.randint(0, 20, (1, 1, 8, 1), dtype=torch.int64) + start_pos = 4 + + self._update_and_validate( + k, v, k_scales, v_scales, k_zero_points, v_zero_points, start_pos + ) + + def test_batched_update_kv_cache_more_updates(self): + self.batch_size = 7 + self._reset() + k = torch.randint(0, 50, (self.batch_size, 1, 8, 4), dtype=torch.int8) + v = torch.randint(0, 50, (self.batch_size, 1, 8, 4), dtype=torch.int8) + k_scales = torch.rand((self.batch_size, 1, 8, 1), dtype=torch.float64) + v_scales = torch.rand((self.batch_size, 1, 8, 1), dtype=torch.float64) + k_zero_points = torch.randint( + 0, 20, (self.batch_size, 1, 8, 1), dtype=torch.int64 + ) + v_zero_points = torch.randint( + 0, 20, (self.batch_size, 1, 8, 1), dtype=torch.int64 + ) + start_pos = 2 + self._update_and_validate( + k, v, k_scales, v_scales, k_zero_points, v_zero_points, start_pos + ) + + k = torch.randint(0, 50, (self.batch_size, 1, 8, 4), dtype=torch.int8) + v = torch.randint(0, 50, (self.batch_size, 1, 8, 4), dtype=torch.int8) + k_scales = torch.rand((self.batch_size, 1, 8, 1), dtype=torch.float64) + v_scales = torch.rand((self.batch_size, 1, 8, 1), dtype=torch.float64) + k_zero_points = torch.randint( + 0, 20, (self.batch_size, 1, 8, 1), dtype=torch.int64 + ) + v_zero_points = torch.randint( + 0, 20, (self.batch_size, 1, 8, 1), dtype=torch.int64 + ) + start_pos = 4 + + self._update_and_validate( + k, v, k_scales, v_scales, k_zero_points, v_zero_points, start_pos + ) diff 
--git a/extension/llm/export/TARGETS b/extension/llm/export/TARGETS index be9bc183dbe..e4ade20228b 100644 --- a/extension/llm/export/TARGETS +++ b/extension/llm/export/TARGETS @@ -27,6 +27,8 @@ runtime.python_library( "//executorch/backends/apple/coreml:backend", "//executorch/backends/apple/coreml:partitioner", "//executorch/backends/apple/mps:partitioner", + "//executorch/backends/qualcomm/partition:partition", + "//executorch/backends/qualcomm/quantizer:quantizer", "//executorch/backends/transforms:duplicate_dynamic_quant_chain", "//executorch/backends/vulkan/partitioner:vulkan_partitioner", "//executorch/backends/xnnpack/partition:xnnpack_partitioner", diff --git a/extension/llm/export/builder.py b/extension/llm/export/builder.py index 338d997297d..ee54fe3660d 100644 --- a/extension/llm/export/builder.py +++ b/extension/llm/export/builder.py @@ -16,7 +16,7 @@ from executorch.backends.transforms.duplicate_dynamic_quant_chain import ( DuplicateDynamicQuantChainPass, ) -from executorch.backends.xnnpack.passes.convert_to_linear import ConvertToLinearPass +from executorch.backends.xnnpack._passes.convert_to_linear import ConvertToLinearPass from executorch.exir import EdgeProgramManager from executorch.exir.backend.partitioner import Partitioner @@ -146,6 +146,7 @@ def source_transform( if self.verbose: logging.info(f"Applied source transforms: {self.applied_source_transforms}") + logging.info(f"Model after source transforms: {self.model}") return self def _get_dynamic_shape(self) -> Any: @@ -209,7 +210,9 @@ def pt2e_calibrate( from executorch.examples.models.llama2.eval_llama_lib import ( GraphModuleEvalWrapper, ) - from executorch.examples.models.llama2.evaluate import evaluate_model + from executorch.examples.models.llama2.evaluate import ( # pyre-ignore[21] + evaluate_model, + ) except ImportError: raise ImportError( "Please install the llm eval dependency via examples/models/llama2/install_requirements.sh" @@ -389,9 +392,7 @@ def to_executorch(self) -> "LLMEdgeManager": ConvertToLinearPass(), QuantFusionPass(), ], - memory_planning_pass=MemoryPlanningPass( - "greedy", alloc_graph_input=False - ), + memory_planning_pass=MemoryPlanningPass(alloc_graph_input=False), sym_shape_eval_pass=ConstraintBasedSymShapeEvalPass(), ) ) diff --git a/extension/llm/export/partitioner_lib.py b/extension/llm/export/partitioner_lib.py index eca78bc9346..37b215a51ff 100644 --- a/extension/llm/export/partitioner_lib.py +++ b/extension/llm/export/partitioner_lib.py @@ -7,16 +7,28 @@ from typing import Optional -def get_xnnpack_partitioner(): +def get_xnnpack_partitioner(dynamic_quant_only_partitioner: bool = True): + """ + Returns the XNNPACK partitioner. + + @arg dynamic_quant_only_partitioner: + This is enabled by default to keep BC. + If dynamic_quant_only_partitioner is True, then only dynamically quantized + linear layers will be partitioned. + Else, anything which can be will be partitioned greedily. + """ from executorch.backends.xnnpack.partition.xnnpack_partitioner import ( XnnpackDynamicallyQuantizedPartitioner, + XnnpackPartitioner, ) - # Following changes due to. - # 1. We need dynamically quantized partitioner for both pt2e_quantize options - # as well as "qmode 8da4w" which is also dynamic quantizes linear layers. - # 2. XNNPACK partitioner seems to result in seg fault for non dqlinear ops. - return XnnpackDynamicallyQuantizedPartitioner() + if dynamic_quant_only_partitioner: + # Following changes due to. + # 1. 
We need dynamically quantized partitioner for both pt2e_quantize options + # as well as "qmode 8da4w" which is also dynamic quantizes linear layers. + # 2. XNNPACK partitioner seems to result in seg fault for non dqlinear ops. + return XnnpackDynamicallyQuantizedPartitioner() + return XnnpackPartitioner() def get_vulkan_partitioner( @@ -56,7 +68,7 @@ def get_mps_partitioner(use_kv_cache: bool = False): def get_coreml_partitioner( - enable_state: bool = False, + ios: int = 15, embedding_quantize: Optional[str] = None, pt2e_quantize: Optional[str] = None, coreml_quantize: Optional[str] = None, @@ -74,26 +86,42 @@ def get_coreml_partitioner( "Please install the CoreML backend follwing https://pytorch.org/executorch/main/build-run-coreml.html" ) - minimum_deployment_target = ct.target.iOS15 - # In Core ML, stateful execution is introduced in iOS 18 - if enable_state: - minimum_deployment_target = max(minimum_deployment_target, ct.target.iOS18) - # In Core ML, quantization is introduced in iOS 16 - if embedding_quantize is not None or pt2e_quantize is not None: - minimum_deployment_target = max(minimum_deployment_target, ct.target.iOS16) - # In Core ML, 8-bit activation quantization is introduced in iOS 17 - if ( - embedding_quantize is not None and int(embedding_quantize.split(",")[0]) == 8 - ) or pt2e_quantize in ("coreml_8a_c8w", "coreml_baseline_8a_c8w"): - minimum_deployment_target = max(minimum_deployment_target, ct.target.iOS17) - # In Core ML, 4-bit weight compression is introduced in iOS 18 - if ( - (embedding_quantize is not None and int(embedding_quantize.split(",")[0]) == 4) - or pt2e_quantize in ("coreml_c4w", "coreml_8a_c4w", "coreml_baseline_8a_c4w") - or coreml_quantize == "b4w" - ): - minimum_deployment_target = max(minimum_deployment_target, ct.target.iOS18) + def _validate_ios_version() -> None: + assert ios in (15, 16, 17, 18) + if embedding_quantize is not None and ios < 18: + raise ValueError( + "In Core ML, per-block quantization is introduced in iOS 18" + ) + + use_quantization = pt2e_quantize is not None or coreml_quantize is not None + if use_quantization and ios < 16: + raise ValueError("In Core ML, quantization is introduced in iOS 16") + + use_8a = (pt2e_quantize is not None and "8a" in pt2e_quantize) or ( + coreml_quantize is not None and "8a" in coreml_quantize + ) + if use_8a and ios < 17: + raise ValueError( + "In Core ML, 8-bit activation quantization is introduced in iOS 17" + ) + + use_4w = (pt2e_quantize is not None and "4w" in pt2e_quantize) or ( + coreml_quantize is not None and "4w" in coreml_quantize + ) + if use_4w and ios < 18: + raise ValueError( + "In Core ML, 4-bit weight compression is introduced in iOS 18" + ) + + _validate_ios_version() + + minimum_deployment_target = { + 15: ct.target.iOS15, + 16: ct.target.iOS16, + 17: ct.target.iOS17, + 18: ct.target.iOS18, + }[ios] op_linear_quantizer_config = None if coreml_quantize == "b4w": op_linear_quantizer_config = { @@ -103,7 +131,6 @@ def get_coreml_partitioner( "block_size": 32, "weight_threshold": 512, } - compile_specs = CoreMLBackend.generate_compile_specs( # pyre-fixme[16] minimum_deployment_target=minimum_deployment_target, compute_precision=ct.precision(ct.precision.FLOAT16.value), @@ -112,9 +139,12 @@ def get_coreml_partitioner( model_type=CoreMLBackend.MODEL_TYPE.MODEL, # pyre-fixme[16] op_linear_quantizer_config=op_linear_quantizer_config, ) + + take_over_mutable_buffer = minimum_deployment_target >= ct.target.iOS18 + return CoreMLPartitioner( # pyre-fixme[16] compile_specs=compile_specs, - 
take_over_mutable_buffer=enable_state, + take_over_mutable_buffer=take_over_mutable_buffer, ) diff --git a/extension/llm/runner/stats.h b/extension/llm/runner/stats.h index 902ba892966..94b5129bbaf 100644 --- a/extension/llm/runner/stats.h +++ b/extension/llm/runner/stats.h @@ -52,6 +52,19 @@ struct Stats { aggregate_sampling_timer_start_timestamp = 0; } + void reset() { + model_load_start_ms = 0; + model_load_end_ms = 0; + inference_start_ms = 0; + prompt_eval_end_ms = 0; + first_token_ms = 0; + inference_end_ms = 0; + aggregate_sampling_time_ms = 0; + num_prompt_tokens = 0; + num_generated_tokens = 0; + aggregate_sampling_timer_start_timestamp = 0; + } + private: long aggregate_sampling_timer_start_timestamp = 0; }; diff --git a/extension/module/module.cpp b/extension/module/module.cpp index 725fe707bb7..58ada0c246f 100644 --- a/extension/module/module.cpp +++ b/extension/module/module.cpp @@ -123,7 +123,9 @@ runtime::Result> Module::method_names() { return result; } -runtime::Error Module::load_method(const std::string& method_name) { +runtime::Error Module::load_method( + const std::string& method_name, + torch::executor::EventTracer* tracer) { if (!is_method_loaded(method_name)) { ET_CHECK_OK_OR_RETURN_ERROR(load()); @@ -151,9 +153,8 @@ runtime::Error Module::load_method(const std::string& method_name) { method_holder.planned_memory.get(), temp_allocator_.get()); method_holder.method = ET_UNWRAP_UNIQUE(program_->load_method( - method_name.c_str(), - method_holder.memory_manager.get(), - event_tracer_.get())); + method_name.c_str(), method_holder.memory_manager.get(), tracer)); + method_holder.inputs.resize(method_holder.method->inputs_size()); methods_.emplace(method_name, std::move(method_holder)); } return runtime::Error::Ok; @@ -167,12 +168,22 @@ runtime::Result Module::method_meta( runtime::Result> Module::execute( const std::string& method_name, - const std::vector& input) { + const std::vector& input_values) { ET_CHECK_OK_OR_RETURN_ERROR(load_method(method_name)); auto& method = methods_.at(method_name).method; + auto& inputs = methods_.at(method_name).inputs; + for (size_t i = 0; i < input_values.size(); ++i) { + if (!input_values[i].isNone()) { + inputs[i] = input_values[i]; + } + } + for (size_t i = 0; i < inputs.size(); ++i) { + ET_CHECK_OR_RETURN_ERROR( + !inputs[i].isNone(), InvalidArgument, "input %zu is none", i); + } ET_CHECK_OK_OR_RETURN_ERROR(method->set_inputs( - exec_aten::ArrayRef(input.data(), input.size()))); + exec_aten::ArrayRef(inputs.data(), inputs.size()))); ET_CHECK_OK_OR_RETURN_ERROR(method->execute()); const auto outputs_size = method->outputs_size(); @@ -183,12 +194,42 @@ runtime::Result> Module::execute( return outputs; } -runtime::Error Module::set_output_data_ptr( +runtime::Error Module::set_input( + const std::string& method_name, + const runtime::EValue& input_value, + size_t input_index) { + ET_CHECK_OK_OR_RETURN_ERROR(load_method(method_name)); + methods_.at(method_name).inputs.at(input_index) = input_value; + return runtime::Error::Ok; +} + +runtime::Error Module::set_inputs( + const std::string& method_name, + const std::vector& input_values) { + ET_CHECK_OK_OR_RETURN_ERROR(load_method(method_name)); + auto& inputs = methods_.at(method_name).inputs; + ET_CHECK_OR_RETURN_ERROR( + inputs.size() == input_values.size(), + InvalidArgument, + "input size: %zu does not match method input size: %zu", + input_values.size(), + inputs.size()); + inputs = input_values; + return runtime::Error::Ok; +} + +runtime::Error Module::set_output( + const 
std::string& method_name, runtime::EValue output_value, size_t output_index) { - ET_CHECK_OK_OR_RETURN_ERROR(load_method("forward")); - auto& output_tensor = output_value.toTensor(); - auto& method = methods_.at("forward").method; + ET_CHECK_OK_OR_RETURN_ERROR(load_method(method_name)); + auto& method = methods_.at(method_name).method; + ET_CHECK_OR_RETURN_ERROR( + output_value.isTensor(), + InvalidArgument, + "output type: %zu is not tensor", + (size_t)output_value.tag); + const auto& output_tensor = output_value.toTensor(); return method->set_output_data_ptr( output_tensor.mutable_data_ptr(), output_tensor.nbytes(), output_index); } diff --git a/extension/module/module.h b/extension/module/module.h index c1fe11147f7..f7c9b1c8c56 100644 --- a/extension/module/module.h +++ b/extension/module/module.h @@ -44,6 +44,7 @@ class Module { * * @param[in] file_path The path to the ExecuTorch program file to load. * @param[in] load_mode The loading mode to use. + * @param[in] event_tracer A EventTracer used for tracking and logging events. */ explicit Module( const std::string& file_path, @@ -132,11 +133,28 @@ class Module { * needed. The loaded method is cached to reuse the next time it's executed. * * @param[in] method_name The name of the method to load. + * @param[in] event_tracer A EventTracer used for tracking and logging events. * * @returns An Error to indicate success or failure. */ ET_NODISCARD - runtime::Error load_method(const std::string& method_name); + runtime::Error load_method( + const std::string& method_name, + torch::executor::EventTracer* event_tracer = nullptr); + + /** + * Load the 'forward' method from the program and set up memory management if + * needed. The loaded method is cached to reuse the next time it's executed. + * + * @param[in] event_tracer An event tracer used for tracking and logging + * events. + * + * @returns An Error to indicate success or failure. + */ + ET_NODISCARD inline runtime::Error load_forward( + torch::executor::EventTracer* event_tracer = nullptr) { + return load_method("forward", event_tracer); + } /** * Checks if a specific method is loaded. @@ -163,11 +181,12 @@ class Module { const std::string& method_name); /** - * Execute a specific method with the given input and retrieve output. - * Loads the program and method before executing if needed. + * Execute a specific method with the given input values and retrieve the + * output values. Loads the program and method before executing if needed. * * @param[in] method_name The name of the method to execute. - * @param[in] input A vector of input values to be passed to the method. + * @param[in] input_values A vector of input values to be passed to the + * method. * * @returns A Result object containing either a vector of output values * from the method or an error to indicate failure. @@ -175,22 +194,22 @@ class Module { ET_NODISCARD runtime::Result> execute( const std::string& method_name, - const std::vector& input); + const std::vector& input_values); /** * Execute a specific method with a single input value. * Loads the program and method before executing if needed. * * @param[in] method_name The name of the method to execute. - * @param[in] input A value to be passed to the method. + * @param[in] input_value A value to be passed to the method. * * @returns A Result object containing either a vector of output values * from the method or an error to indicate failure. 
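As a rough illustration of the per-method tracer plumbing introduced above, the sketch below loads 'forward' through the new load_forward() overload and then runs it. The model path and the `tracer` pointer (any EventTracer implementation, e.g. an ETDump generator) are placeholders, not part of this change.

```
#include <executorch/extension/module/module.h>
#include <executorch/extension/tensor/tensor.h>

using namespace ::executorch::extension;
using namespace ::executorch::runtime;

// "model.pte" is assumed to contain a 'forward' method taking two float tensors.
Error run_with_tracer(torch::executor::EventTracer* tracer) {
  Module module("model.pte");
  // New overload: attach the tracer to the 'forward' method when loading it.
  const auto status = module.load_forward(tracer);
  if (status != Error::Ok) {
    return status;
  }
  auto tensor = make_tensor_ptr({1.f});
  const auto result = module.forward({tensor, tensor});
  return result.error();
}
```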
*/ ET_NODISCARD inline runtime::Result> execute( const std::string& method_name, - const runtime::EValue& input) { - return execute(method_name, std::vector{input}); + const runtime::EValue& input_value) { + return execute(method_name, std::vector{input_value}); } /** @@ -208,19 +227,20 @@ class Module { } /** - * Retrieve the output value of a specific method with the given input. + * Retrieve the output value of a specific method with the given input values. * Loads the program and method before execution if needed. * * @param[in] method_name The name of the method to execute. - * @param[in] input A vector of input values to be passed to the method. + * @param[in] input_values A vector of input values to be passed to the + * method. * * @returns A Result object containing either the first output value from the * method or an error to indicate failure. */ ET_NODISCARD inline runtime::Result get( const std::string& method_name, - const std::vector& input) { - auto result = ET_UNWRAP(execute(method_name, input)); + const std::vector& input_values) { + auto result = ET_UNWRAP(execute(method_name, input_values)); if (result.empty()) { return runtime::Error::InvalidArgument; } @@ -232,15 +252,15 @@ class Module { * Loads the program and method before execution if needed. * * @param[in] method_name The name of the method to execute. - * @param[in] input A value to be passed to the method. + * @param[in] input_value A value to be passed to the method. * * @returns A Result object containing either the first output value from the * method or an error to indicate failure. */ ET_NODISCARD inline runtime::Result get( const std::string& method_name, - const runtime::EValue& input) { - return get(method_name, std::vector{input}); + const runtime::EValue& input_value) { + return get(method_name, std::vector{input_value}); } /** @@ -258,31 +278,31 @@ class Module { } /** - * Execute the 'forward' method with the given input and retrieve output. - * Loads the program and method before executing if needed. + * Execute the 'forward' method with the given input values and retrieve the + * output values. Loads the program and method before executing if needed. * - * @param[in] input A vector of input values for the 'forward' method. + * @param[in] input_values A vector of input values for the 'forward' method. * * @returns A Result object containing either a vector of output values * from the 'forward' method or an error to indicate failure. */ ET_NODISCARD inline runtime::Result> forward( - const std::vector& input) { - return execute("forward", input); + const std::vector& input_values) { + return execute("forward", input_values); } /** * Execute the 'forward' method with a single value. * Loads the program and method before executing if needed. * - * @param[in] input A value for the 'forward' method. + * @param[in] input_value A value for the 'forward' method. * * @returns A Result object containing either a vector of output values * from the 'forward' method or an error to indicate failure. */ ET_NODISCARD inline runtime::Result> forward( - const runtime::EValue& input) { - return forward(std::vector{input}); + const runtime::EValue& input_value) { + return forward(std::vector{input_value}); } /** @@ -296,6 +316,98 @@ class Module { return forward(std::vector{}); } + /** + * Sets a single input value for a specific method. + * + * @param[in] method_name The name of the method. + * @param[in] input_value The EValue to set as the method input. 
+ * @param[in] input_index Zero-based index of the input to set. + * + * @returns An Error to indicate success or failure. + */ + ET_NODISCARD + runtime::Error set_input( + const std::string& method_name, + const runtime::EValue& input_value, + size_t input_index); + + /** + * Sets a single input value for the "forward" method. + * + * @param[in] input_value The EValue to set as the method input. + * @param[in] input_index Zero-based index of the input to set. + * + * @returns An Error to indicate success or failure. + */ + ET_NODISCARD + inline runtime::Error set_input( + const runtime::EValue& input_value, + size_t input_index) { + return set_input("forward", input_value, input_index); + } + + /** + * Sets all input values for a specific method. + * + * @param[in] method_name The name of the method. + * @param[in] input_values A vector of EValues to set as the method inputs. + * + * @returns An Error to indicate success or failure. + */ + ET_NODISCARD + runtime::Error set_inputs( + const std::string& method_name, + const std::vector& input_values); + + /** + * Sets all input values for the "forward" method. + * + * @param[in] input_values A vector of EValues to set as the method inputs. + * + * @returns An Error to indicate success or failure. + */ + ET_NODISCARD + inline runtime::Error set_inputs( + const std::vector& input_values) { + return set_inputs("forward", input_values); + } + + /** + * Sets the output tensor for a specific method. + * + * @param[in] method_name The name of the method. + * @param[in] output_value The EValue containing the Tensor to set as the + * method output. + * @param[in] output_index Zero-based index of the output to set. + * + * @returns An Error to indicate success or failure. + * + * @note Only Tensor outputs are currently supported for setting. + */ + ET_NODISCARD + runtime::Error set_output( + const std::string& method_name, + runtime::EValue output_value, + size_t output_index = 0); + + /** + * Sets the output tensor for the "forward" method. + * + * @param[in] output_value The EValue containing the Tensor to set as the + * method output. + * @param[in] output_index Zero-based index of the output to set. + * + * @returns An Error to indicate success or failure. + * + * @note Only Tensor outputs are currently supported for setting. + */ + ET_NODISCARD + inline runtime::Error set_output( + runtime::EValue output_value, + size_t output_index = 0) { + return set_output("forward", std::move(output_value), output_index); + } + /** * Retrieves the EventTracer instance being used by the Module. * EventTracer is used for tracking and logging events during the execution @@ -308,18 +420,6 @@ class Module { return event_tracer_.get(); } - /** - * Set output data pointer for forward method. - * - * @param[in] output_value A Tensor for the output of 'forward' method. - * @param[in] output_index Index of the output in 'forward' method. - * - * @returns An Error to indicate success or failure of the loading process. 
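To make the intended call pattern for these new setters concrete, here is a hedged sketch mirroring the unit tests added further below. The model path and tensor values are illustrative; the program is assumed to take two float inputs to 'forward' and produce one tensor output.

```
#include <executorch/extension/module/module.h>
#include <executorch/extension/tensor/tensor.h>

using namespace ::executorch::extension;
using namespace ::executorch::runtime;

void setter_example() {
  Module module("model.pte");  // placeholder path

  // Pre-set every input of 'forward', then run it with no arguments.
  auto a = make_tensor_ptr({4.f});
  auto b = make_tensor_ptr({5.f});
  if (module.set_inputs({a, b}) == Error::Ok) {
    const auto result1 = module.forward();
  }

  // Or set a single input by index and pass the remaining one at call time.
  module.set_input(b, 1);
  const auto result2 = module.forward(a);

  // Redirect output 0 of 'forward' into a caller-owned tensor buffer.
  auto out = empty({1});
  module.set_output(out);
}
```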
- */ - runtime::Error set_output_data_ptr( - runtime::EValue output_value, - size_t output_index); - private: struct MethodHolder { std::vector> planned_buffers; @@ -327,6 +427,7 @@ class Module { std::unique_ptr planned_memory; std::unique_ptr memory_manager; std::unique_ptr method; + std::vector inputs; }; private: diff --git a/extension/module/test/CMakeLists.txt b/extension/module/test/CMakeLists.txt index 94c24488591..ff8a5ee9040 100644 --- a/extension/module/test/CMakeLists.txt +++ b/extension/module/test/CMakeLists.txt @@ -32,6 +32,7 @@ et_cxx_test( EXTRA_LIBS extension_data_loader extension_module_static + extension_tensor portable_kernels portable_ops_lib ) diff --git a/extension/module/test/module_test.cpp b/extension/module/test/module_test.cpp index 7db4784dc93..86b9d849a22 100644 --- a/extension/module/test/module_test.cpp +++ b/extension/module/test/module_test.cpp @@ -14,6 +14,7 @@ #include #include +#include using namespace ::executorch::extension; using namespace ::executorch::runtime; @@ -58,7 +59,7 @@ TEST_F(ModuleTest, TestMethodNames) { Module module(model_path_); const auto method_names = module.method_names(); - EXPECT_TRUE(method_names.ok()); + EXPECT_EQ(method_names.error(), Error::Ok); EXPECT_EQ(method_names.get(), std::unordered_set{"forward"}); } @@ -66,7 +67,7 @@ TEST_F(ModuleTest, TestNonExistentMethodNames) { Module module("/path/to/nonexistent/file.pte"); const auto method_names = module.method_names(); - EXPECT_FALSE(method_names.ok()); + EXPECT_NE(method_names.error(), Error::Ok); } TEST_F(ModuleTest, TestLoadMethod) { @@ -92,7 +93,7 @@ TEST_F(ModuleTest, TestMethodMeta) { Module module(model_path_); const auto meta = module.method_meta("forward"); - EXPECT_TRUE(meta.ok()); + EXPECT_EQ(meta.error(), Error::Ok); EXPECT_STREQ(meta->name(), "forward"); EXPECT_EQ(meta->num_inputs(), 2); EXPECT_EQ(*(meta->input_tag(0)), Tag::Tensor); @@ -100,13 +101,13 @@ TEST_F(ModuleTest, TestMethodMeta) { EXPECT_EQ(*(meta->output_tag(0)), Tag::Tensor); const auto input_meta = meta->input_tensor_meta(0); - EXPECT_TRUE(input_meta.ok()); + EXPECT_EQ(input_meta.error(), Error::Ok); EXPECT_EQ(input_meta->scalar_type(), exec_aten::ScalarType::Float); EXPECT_EQ(input_meta->sizes().size(), 1); EXPECT_EQ(input_meta->sizes()[0], 1); const auto output_meta = meta->output_tensor_meta(0); - EXPECT_TRUE(output_meta.ok()); + EXPECT_EQ(output_meta.error(), Error::Ok); EXPECT_EQ(output_meta->scalar_type(), exec_aten::ScalarType::Float); EXPECT_EQ(output_meta->sizes().size(), 1); EXPECT_EQ(output_meta->sizes()[0], 1); @@ -116,22 +117,16 @@ TEST_F(ModuleTest, TestNonExistentMethodMeta) { Module module("/path/to/nonexistent/file.pte"); const auto meta = module.method_meta("forward"); - EXPECT_FALSE(meta.ok()); + EXPECT_NE(meta.error(), Error::Ok); } TEST_F(ModuleTest, TestExecute) { Module module(model_path_); + auto tensor = make_tensor_ptr({1.f}); - std::array input{1}; - std::array sizes{1}; - exec_aten::TensorImpl tensor( - exec_aten::ScalarType::Float, sizes.size(), sizes.data(), input.data()); + const auto result = module.execute("forward", {tensor, tensor}); + EXPECT_EQ(result.error(), Error::Ok); - const auto result = module.execute( - "forward", {exec_aten::Tensor(&tensor), exec_aten::Tensor(&tensor)}); - EXPECT_TRUE(result.ok()); - - EXPECT_TRUE(result.ok()); EXPECT_TRUE(module.is_loaded()); EXPECT_TRUE(module.is_method_loaded("forward")); @@ -146,14 +141,10 @@ TEST_F(ModuleTest, TestExecutePreload) { const auto error = module.load(); EXPECT_EQ(error, Error::Ok); - std::array 
input{1}; - std::array sizes{1}; - exec_aten::TensorImpl tensor( - exec_aten::ScalarType::Float, sizes.size(), sizes.data(), input.data()); + auto tensor = make_tensor_ptr({1.f}); - const auto result = module.execute( - "forward", {exec_aten::Tensor(&tensor), exec_aten::Tensor(&tensor)}); - EXPECT_TRUE(result.ok()); + const auto result = module.execute("forward", {tensor, tensor}); + EXPECT_EQ(result.error(), Error::Ok); const auto data = result->at(0).toTensor().const_data_ptr(); @@ -166,14 +157,10 @@ TEST_F(ModuleTest, TestExecutePreload_method) { const auto error = module.load_method("forward"); EXPECT_EQ(error, Error::Ok); - std::array input{1}; - std::array sizes{1}; - exec_aten::TensorImpl tensor( - exec_aten::ScalarType::Float, sizes.size(), sizes.data(), input.data()); + auto tensor = make_tensor_ptr({1.f}); - const auto result = module.execute( - "forward", {exec_aten::Tensor(&tensor), exec_aten::Tensor(&tensor)}); - EXPECT_TRUE(result.ok()); + const auto result = module.execute("forward", {tensor, tensor}); + EXPECT_EQ(result.error(), Error::Ok); const auto data = result->at(0).toTensor().const_data_ptr(); @@ -189,14 +176,10 @@ TEST_F(ModuleTest, TestExecutePreloadProgramAndMethod) { const auto load_method_error = module.load_method("forward"); EXPECT_EQ(load_method_error, Error::Ok); - std::array input{1}; - std::array sizes{1}; - exec_aten::TensorImpl tensor( - exec_aten::ScalarType::Float, sizes.size(), sizes.data(), input.data()); + auto tensor = make_tensor_ptr({1.f}); - const auto result = module.execute( - "forward", {exec_aten::Tensor(&tensor), exec_aten::Tensor(&tensor)}); - EXPECT_TRUE(result.ok()); + const auto result = module.execute("forward", {tensor, tensor}); + EXPECT_EQ(result.error(), Error::Ok); const auto data = result->at(0).toTensor().const_data_ptr(); @@ -208,7 +191,7 @@ TEST_F(ModuleTest, TestExecuteOnNonExistent) { const auto result = module.execute("forward"); - EXPECT_FALSE(result.ok()); + EXPECT_NE(result.error(), Error::Ok); } TEST_F(ModuleTest, TestExecuteOnCurrupted) { @@ -216,47 +199,34 @@ TEST_F(ModuleTest, TestExecuteOnCurrupted) { const auto result = module.execute("forward"); - EXPECT_FALSE(result.ok()); + EXPECT_NE(result.error(), Error::Ok); } TEST_F(ModuleTest, TestGet) { Module module(model_path_); - std::array input{1}; - std::array sizes{1}; - exec_aten::TensorImpl tensor( - exec_aten::ScalarType::Float, sizes.size(), sizes.data(), input.data()); - - const auto result = module.get( - "forward", {exec_aten::Tensor(&tensor), exec_aten::Tensor(&tensor)}); + auto tensor = make_tensor_ptr({1.f}); - EXPECT_TRUE(result.ok()); + const auto result = module.get("forward", {tensor, tensor}); + EXPECT_EQ(result.error(), Error::Ok); const auto data = result->toTensor().const_data_ptr(); EXPECT_NEAR(data[0], 2, 1e-5); } TEST_F(ModuleTest, TestForward) { auto module = std::make_unique(model_path_); + auto tensor = make_tensor_ptr({21.f}); - std::array input{1}; - std::array sizes{1}; - exec_aten::TensorImpl tensor( - exec_aten::ScalarType::Float, sizes.size(), sizes.data(), input.data()); - - const auto result = - module->forward({exec_aten::Tensor(&tensor), exec_aten::Tensor(&tensor)}); - EXPECT_TRUE(result.ok()); + const auto result = module->forward({tensor, tensor}); + EXPECT_EQ(result.error(), Error::Ok); const auto data = result->at(0).toTensor().const_data_ptr(); - EXPECT_NEAR(data[0], 2, 1e-5); + EXPECT_NEAR(data[0], 42, 1e-5); - std::array input2{2, 3}; - exec_aten::TensorImpl tensor2( - exec_aten::ScalarType::Float, sizes.size(), sizes.data(), 
input2.data()); - const auto result2 = module->forward( - {exec_aten::Tensor(&tensor2), exec_aten::Tensor(&tensor2)}); - EXPECT_TRUE(result2.ok()); + auto tensor2 = make_tensor_ptr({2.f}); + const auto result2 = module->forward({tensor2, tensor2}); + EXPECT_EQ(result2.error(), Error::Ok); const auto data2 = result->at(0).toTensor().const_data_ptr(); @@ -268,7 +238,7 @@ TEST_F(ModuleTest, TestForwardWithInvalidInputs) { const auto result = module.forward(EValue()); - EXPECT_FALSE(result.ok()); + EXPECT_NE(result.error(), Error::Ok); } TEST_F(ModuleTest, TestProgramSharingBetweenModules) { @@ -283,10 +253,10 @@ TEST_F(ModuleTest, TestProgramSharingBetweenModules) { EXPECT_TRUE(module2.is_loaded()); auto method_names1 = module1.method_names(); - EXPECT_TRUE(method_names1.ok()); + EXPECT_EQ(method_names1.error(), Error::Ok); auto method_names2 = module2.method_names(); - EXPECT_TRUE(method_names2.ok()); + EXPECT_EQ(method_names2.error(), Error::Ok); EXPECT_EQ(method_names1.get(), method_names2.get()); auto load_method_error = module1.load_method("forward"); @@ -301,7 +271,7 @@ TEST_F(ModuleTest, TestProgramSharingBetweenModules) { TEST_F(ModuleTest, TestProgramSharingAndDataLoaderManagement) { auto loader = FileDataLoader::from(model_path_.c_str()); - EXPECT_TRUE(loader.ok()); + EXPECT_EQ(loader.error(), Error::Ok); auto data_loader = std::make_unique(std::move(loader.get())); auto module1 = std::make_unique(std::move(data_loader)); @@ -310,27 +280,21 @@ TEST_F(ModuleTest, TestProgramSharingAndDataLoaderManagement) { EXPECT_EQ(load_error, Error::Ok); EXPECT_TRUE(module1->is_loaded()); - std::array input{1}; - std::array sizes{1}; - exec_aten::TensorImpl tensor( - exec_aten::ScalarType::Float, sizes.size(), sizes.data(), input.data()); + auto tensor = make_tensor_ptr({1.f}); - auto result1 = module1->execute( - "forward", {exec_aten::Tensor(&tensor), exec_aten::Tensor(&tensor)}); - EXPECT_TRUE(result1.ok()); + const auto result1 = module1->execute("forward", {tensor, tensor}); + EXPECT_EQ(result1.error(), Error::Ok); auto module2 = std::make_unique(module1->program()); - auto result2 = module2->execute( - "forward", {exec_aten::Tensor(&tensor), exec_aten::Tensor(&tensor)}); - EXPECT_TRUE(result2.ok()); + const auto result2 = module2->execute("forward", {tensor, tensor}); + EXPECT_EQ(result2.error(), Error::Ok); module1 = std::make_unique("/path/to/nonexistent/file.pte"); EXPECT_FALSE(module1->is_loaded()); - auto result3 = module2->execute( - "forward", {exec_aten::Tensor(&tensor), exec_aten::Tensor(&tensor)}); - EXPECT_TRUE(result3.ok()); + const auto result3 = module2->execute("forward", {tensor, tensor}); + EXPECT_EQ(result3.error(), Error::Ok); } TEST_F(ModuleTest, TestProgramPersistenceAndReuseAfterModuleDestruction) { @@ -338,7 +302,7 @@ TEST_F(ModuleTest, TestProgramPersistenceAndReuseAfterModuleDestruction) { { auto loader = FileDataLoader::from(model_path_.c_str()); - EXPECT_TRUE(loader.ok()); + EXPECT_EQ(loader.error(), Error::Ok); auto data_loader = std::make_unique(std::move(loader.get())); auto* data_loader_ptr = data_loader.get(); @@ -361,14 +325,10 @@ TEST_F(ModuleTest, TestProgramPersistenceAndReuseAfterModuleDestruction) { EXPECT_EQ(module.program(), shared_program); - std::array input{1}; - std::array sizes{1}; - exec_aten::TensorImpl tensor( - exec_aten::ScalarType::Float, sizes.size(), sizes.data(), input.data()); + auto tensor = make_tensor_ptr({1.f}); - auto result = module.execute( - "forward", {exec_aten::Tensor(&tensor), exec_aten::Tensor(&tensor)}); - 
EXPECT_TRUE(result.ok()); + const auto result = module.execute("forward", {tensor, tensor}); + EXPECT_EQ(result.error(), Error::Ok); auto data = result->at(0).toTensor().const_data_ptr(); @@ -392,16 +352,10 @@ TEST_F(ModuleTest, TestConcurrentExecutionWithSharedProgram) { auto thread = [](std::shared_ptr program, const std::array& input) { Module module(program); - std::array sizes{1}; - exec_aten::TensorImpl tensor( - exec_aten::ScalarType::Float, - sizes.size(), - sizes.data(), - (void*)input.data()); + auto tensor = from_blob((void*)input.data(), {1}); - const auto result = module.forward( - {exec_aten::Tensor(&tensor), exec_aten::Tensor(&tensor)}); - EXPECT_TRUE(result.ok()); + const auto result = module.forward({tensor, tensor}); + EXPECT_EQ(result.error(), Error::Ok); const auto data = result->at(0).toTensor().const_data_ptr(); EXPECT_NEAR(data[0], (input[0] * 2), 1e-5); @@ -419,3 +373,65 @@ TEST_F(ModuleTest, TestConcurrentExecutionWithSharedProgram) { t4.join(); t5.join(); } + +TEST_F(ModuleTest, TestSetInputsBeforeExecute) { + Module module(model_path_); + + auto tensor1 = make_tensor_ptr({4.f}); + auto tensor2 = make_tensor_ptr({5.f}); + + EXPECT_EQ(module.set_inputs({tensor1, tensor2}), Error::Ok); + + const auto result = module.forward(); + EXPECT_EQ(result.error(), Error::Ok); + + const auto data = result->at(0).toTensor().const_data_ptr(); + EXPECT_NEAR(data[0], 9, 1e-5); +} + +TEST_F(ModuleTest, TestSetInputCombinedWithExecute) { + Module module(model_path_); + + auto tensor1 = make_tensor_ptr({2.f}); + auto tensor2 = make_tensor_ptr({3.f}); + + EXPECT_EQ(module.set_input(tensor2, 1), Error::Ok); + + const auto result = module.forward(tensor1); + EXPECT_EQ(result.error(), Error::Ok); + + const auto data = result->at(0).toTensor().const_data_ptr(); + EXPECT_NEAR(data[0], 5, 1e-5); +} + +TEST_F(ModuleTest, TestPartiallySetInputs) { + Module module(model_path_); + + auto tensor = make_tensor_ptr({1.f}); + + EXPECT_EQ(module.set_input(tensor, 0), Error::Ok); + + const auto result = module.forward(); + EXPECT_NE(result.error(), Error::Ok); +} + +TEST_F(ModuleTest, TestUnsetInputs) { + Module module(model_path_); + + const auto result = module.forward(); + EXPECT_NE(result.error(), Error::Ok); +} + +TEST_F(ModuleTest, TestSetOutputInvalidIndex) { + Module module(model_path_); + + auto output_tensor = empty({1}); + + EXPECT_NE(module.set_output(output_tensor, 1), Error::Ok); +} + +TEST_F(ModuleTest, TestSetOutputInvalidType) { + Module module(model_path_); + + EXPECT_NE(module.set_output(EValue()), Error::Ok); +} diff --git a/extension/module/test/resources/README.md b/extension/module/test/resources/README.md index 5067c870a3c..e2b54633fae 100644 --- a/extension/module/test/resources/README.md +++ b/extension/module/test/resources/README.md @@ -1,4 +1,11 @@ ## Resources ### model.pte -- generated via `buck2 run fbcode//executorch/examples/portable/scripts:export -- --model_name="add"` after D62209852. 
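The test rewrites above all follow the same pattern: hand-rolled TensorImpl plus sizes/data arrays are replaced by the TensorPtr helpers. A minimal sketch of those helpers, with arbitrary values, for reference:

```
#include <executorch/extension/tensor/tensor.h>

using namespace ::executorch::extension;

void tensor_helpers() {
  // Owning 1-D float tensor; dtype and shape are deduced from the values.
  auto owned = make_tensor_ptr({1.f, 2.f, 3.f});

  // Non-owning view over caller-managed float memory, as used in the updated
  // concurrency test; the buffer must outlive the tensor.
  float buffer[2] = {4.f, 5.f};
  auto view = from_blob((void*)buffer, {2});

  // Uninitialized float tensor of the given shape.
  auto scratch = empty({1});
}
```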
+- Internally generated after D62209852, 2024-09-06 with: + ``` + buck2 run fbcode//executorch/examples/portable/scripts:export -- --model_name="add" + ``` +- In OSS, the same file can be generated after [#5145](https://github.com/pytorch/executorch/pull/5145), 2024-09-06 with: + ``` + python -m examples.portable.scripts.export --model_name="add" + ``` diff --git a/extension/module/test/targets.bzl b/extension/module/test/targets.bzl index f53a082add6..bc4ce2c6af7 100644 --- a/extension/module/test/targets.bzl +++ b/extension/module/test/targets.bzl @@ -1,3 +1,8 @@ +load( + "@fbsource//tools/build_defs:default_platform_defs.bzl", + "ANDROID", + "CXX", +) load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") def define_common_targets(): @@ -7,20 +12,28 @@ def define_common_targets(): TARGETS and BUCK files that call this function. """ - runtime.cxx_test( - name = "test", - srcs = [ - "module_test.cpp", - ], - deps = [ - "//executorch/kernels/portable:generated_lib", - "//executorch/extension/data_loader:file_data_loader", - "//executorch/extension/module:module", - ], - env = { - "RESOURCES_PATH": "$(location :resources)/resources", - }, - ) + for aten_mode in (True, False): + aten_suffix = ("_aten" if aten_mode else "") + + runtime.cxx_test( + name = "test" + aten_suffix, + srcs = [ + "module_test.cpp", + ], + deps = [ + "//executorch/kernels/portable:generated_lib" + aten_suffix, + "//executorch/extension/data_loader:file_data_loader", + "//executorch/extension/module:module" + aten_suffix, + "//executorch/extension/tensor:tensor" + aten_suffix, + ], + env = { + "RESOURCES_PATH": "$(location :resources)/resources", + }, + platforms = [CXX, ANDROID], # Cannot bundle resources on Apple platform. + compiler_flags = [ + "-Wno-error=deprecated-declarations", + ], + ) runtime.filegroup( name = "resources", diff --git a/extension/parallel/thread_parallel.cpp b/extension/parallel/thread_parallel.cpp index fb2d3e7b0ae..dfbb911d3a9 100644 --- a/extension/parallel/thread_parallel.cpp +++ b/extension/parallel/thread_parallel.cpp @@ -20,7 +20,7 @@ namespace { thread_local int64_t thread_num_ = 0; } -using namespace torch::executorch::threadpool; +using namespace ::executorch::extension::threadpool; inline int64_t divup(int64_t x, int64_t y) { return (x + y - 1) / y; diff --git a/extension/pybindings/portable_lib.py b/extension/pybindings/portable_lib.py index b9ed089f918..d094710e67e 100644 --- a/extension/pybindings/portable_lib.py +++ b/extension/pybindings/portable_lib.py @@ -6,6 +6,22 @@ # pyre-strict +"""API for loading and executing ExecuTorch PTE files using the C++ runtime. + +.. warning:: + + This API is experimental and subject to change without notice. +""" + +import warnings as _warnings + +import executorch.exir._warnings as _exir_warnings + +_warnings.warn( + "This API is experimental and subject to change without notice.", + _exir_warnings.ExperimentalWarning, +) + # When installed as a pip wheel, we must import `torch` before trying to import # the pybindings shared library extension. This will load libtorch.so and # related libs, ensuring that the pybindings lib can resolve those runtime @@ -15,6 +31,8 @@ # Let users import everything from the C++ _portable_lib extension as if this # python file defined them. Although we could import these dynamically, it # wouldn't preserve the static type annotations. +# +# Note that all of these are experimental, and subject to change without notice. 
from executorch.extension.pybindings._portable_lib import ( # noqa: F401 # Disable "imported but unused" (F401) checks. _create_profile_block, # noqa: F401 @@ -32,3 +50,5 @@ # Clean up so that `dir(portable_lib)` is the same as `dir(_portable_lib)` # (apart from some __dunder__ names). del _torch +del _exir_warnings +del _warnings diff --git a/extension/pybindings/pybindings.cpp b/extension/pybindings/pybindings.cpp index d674f2fe58c..a2a65787cb4 100644 --- a/extension/pybindings/pybindings.cpp +++ b/extension/pybindings/pybindings.cpp @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include @@ -55,6 +56,16 @@ } \ }) +#define THROW_INDEX_IF_ERROR(error, message, ...) \ + ({ \ + if ((error) != Error::Ok) { \ + char msg_buf[128]; \ + snprintf(msg_buf, sizeof(msg_buf), message, ##__VA_ARGS__); \ + /* pybind will convert this to a python exception. */ \ + throw std::out_of_range(msg_buf); \ + } \ + }) + // Our logs work by writing to stderr. By default this is done through fprintf // (as defined in posix.cpp) which then does not show up in python environments. // Here we override the pal to use std::cerr which can be properly redirected by @@ -129,26 +140,26 @@ void setup_output_storage( const std::vector>& output_storages) { if (output_storages.size() != method.outputs_size()) { THROW_IF_ERROR( - Error(), + Error::InvalidArgument, "number of output storages %zu does not match number of outputs %zu", output_storages.size(), method.outputs_size()); } for (size_t i = 0; i < output_storages.size(); ++i) { if (output_storages[i].size() == 0) { - // Skip empty output storages, this would happen for non-tensor outputs. + // Skip empty output storages, this would happen for non-tensor outputs + // and memory planned outputs. continue; } Error output_status = method.set_output_data_ptr( output_storages[i].data(), output_storages[i].size(), i); - // InvalidState can be the status if outputs are already memory planned. - // That's fine and we don't need to alert the user to that error. - if (output_status != Error::Ok && output_status != Error::InvalidState) { - ET_LOG( - Error, - "Cannot set_output_data_ptr(): 0x%" PRIx32, - static_cast(output_status)); - } + // We already should be skipping non-tensor outputs, and memory planned + // outputs so any error is real. 
+ THROW_IF_ERROR( + output_status, + "set_output_data_ptr failed for output %zu with error 0x%" PRIx32, + i, + static_cast(output_status)); } } @@ -238,10 +249,10 @@ class Module final { const std::vector& args, const std::optional>>& output_storages = std::nullopt) { - auto& method = methods_[method_name]; + auto& method = get_method(method_name); exec_aten::ArrayRef input_evalue_list(args.data(), args.size()); - Error set_inputs_status = method->set_inputs(input_evalue_list); + Error set_inputs_status = method.set_inputs(input_evalue_list); THROW_IF_ERROR( set_inputs_status, "method->set_inputs() for method '%s' failed with error 0x%" PRIx32, @@ -262,9 +273,9 @@ class Module final { c10::autograd_dispatch_keyset); #endif if (output_storages) { - setup_output_storage(*method, *output_storages); + setup_output_storage(method, *output_storages); } - Error execute_status = method->execute(); + Error execute_status = method.execute(); THROW_IF_ERROR( execute_status, "method->execute() failed with error 0x%" PRIx32, @@ -291,7 +302,9 @@ class Module final { Method& get_method(const std::string& method_name) { if (methods_.count(method_name) == 0) { THROW_IF_ERROR( - Error(), "no such method in program: %s", method_name.c_str()); + Error::InvalidArgument, + "no such method in program: %s", + method_name.c_str()); } return *methods_[method_name].get(); } @@ -448,6 +461,119 @@ struct PyBundledModule final { size_t program_len_; }; +/// Expose a subset of TensorInfo information to python. +struct PyTensorInfo final { + explicit PyTensorInfo( + std::shared_ptr module, + torch::executor::TensorInfo info) + : module_(std::move(module)), info_(info) {} + + py::tuple sizes() const { + const auto shape = info_.sizes(); + py::tuple tup(shape.size()); + for (size_t i = 0; i < shape.size(); ++i) { + tup[i] = py::cast(shape[i]); + } + return tup; + } + + int8_t dtype() const { + return static_cast::type>( + info_.scalar_type()); + } + + bool is_memory_planned() const { + return info_.is_memory_planned(); + } + + size_t nbytes() const { + return info_.nbytes(); + } + + std::string repr() const { + std::string size_str = "["; + for (const auto& d : info_.sizes()) { + size_str.append(std::to_string(d)); + size_str.append(", "); + } + if (size_str.length() >= 2) { + // Pop the last two characters (command and space) and add close bracket. + size_str.pop_back(); + size_str.pop_back(); + } + size_str.append("]"); + return "TensorInfo(sizes=" + size_str + ", dtype=" + + std::string(executorch::runtime::toString(info_.scalar_type())) + + ", is_memory_planned=" + + (info_.is_memory_planned() ? "True" : "False") + + ", nbytes=" + std::to_string(info_.nbytes()) + ")"; + } + + private: + // TensorInfo relies on module to be alive. + std::shared_ptr module_; + torch::executor::TensorInfo info_; +}; + +/// Expose a subset of MethodMeta information to python. 
+struct PyMethodMeta final { + explicit PyMethodMeta( + std::shared_ptr module, + torch::executor::MethodMeta meta) + : module_(std::move(module)), meta_(meta) {} + + const char* name() const { + return meta_.name(); + } + + size_t num_inputs() const { + return meta_.num_inputs(); + } + + std::unique_ptr input_tensor_meta(size_t index) const { + const auto result = meta_.input_tensor_meta(index); + THROW_INDEX_IF_ERROR( + result.error(), "Cannot get input tensor meta at %zu", index); + return std::make_unique(module_, result.get()); + } + + size_t num_outputs() const { + return meta_.num_outputs(); + } + + std::unique_ptr output_tensor_meta(size_t index) const { + const auto result = meta_.output_tensor_meta(index); + THROW_INDEX_IF_ERROR( + result.error(), "Cannot get output tensor meta at %zu", index); + return std::make_unique(module_, result.get()); + } + + py::str repr() const { + py::list input_meta_strs; + for (size_t i = 0; i < meta_.num_inputs(); ++i) { + input_meta_strs.append(py::str(input_tensor_meta(i)->repr())); + } + py::list output_meta_strs; + for (size_t i = 0; i < meta_.num_outputs(); ++i) { + output_meta_strs.append(py::str(output_tensor_meta(i)->repr())); + } + // Add quotes to be more similar to Python's repr for strings. + py::str format = + "MethodMeta(name='{}', num_inputs={}, input_tensor_meta={}, num_outputs={}, output_tensor_meta={})"; + return format.format( + std::string(meta_.name()), + std::to_string(meta_.num_inputs()), + input_meta_strs, + std::to_string(meta_.num_outputs()), + output_meta_strs); + } + + private: + // Must keep the Module object alive or else the meta object is invalidated. + std::shared_ptr module_; + torch::executor::MethodMeta meta_; +}; + struct PyModule final { explicit PyModule( const py::bytes& buffer, @@ -751,34 +877,47 @@ struct PyModule final { return list; } + std::unique_ptr method_meta(const std::string method_name) { + auto& method = module_->get_method(method_name); + return std::make_unique(module_, method.method_meta()); + } + private: - std::unique_ptr module_; + std::shared_ptr module_; // Need to keep-alive output storages until they can be compared in case of // bundled programs. std::vector> output_storages_; std::vector> make_output_storages(const Method& method) { const auto num_outputs = method.outputs_size(); - // These output storages will not be used if the ExecuTorch program already - // pre-allocated output space. That is represented by an error from - // set_output_data_ptr. - std::vector> output_storages(num_outputs); + // Create a buffer for each output tensor. Memory planned outputs and non + // tensor outputs get an empty buffer in this list which is ignored later. + std::vector> output_storages; + output_storages_.reserve(num_outputs); + auto meta = method.method_meta(); for (size_t i = 0; i < num_outputs; ++i) { + auto output_type = meta.output_tag(i); + THROW_IF_ERROR( + output_type.error(), "Failed to get output type for output %zu", i); + if (output_type.get() != Tag::Tensor) { + // Skip allocating storage for non-tensor outputs. + output_storages.emplace_back(); + continue; + } const auto& output_tensor_meta = method.method_meta().output_tensor_meta(i); - if (!output_tensor_meta.ok()) { - // If the output isn't a tensor it won't have a tensor meta. 
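For orientation, the PyMethodMeta and PyTensorInfo wrappers above are thin views over the existing runtime metadata API. A hedged sketch of the underlying calls they forward to, using the extension Module for brevity and a placeholder "model.pte":

```
#include <executorch/extension/module/module.h>

using namespace ::executorch::extension;
using namespace ::executorch::runtime;

void inspect() {
  Module module("model.pte");
  auto meta = module.method_meta("forward");
  if (meta.error() != Error::Ok) {
    return;
  }
  for (size_t i = 0; i < meta->num_inputs(); ++i) {
    auto info = meta->input_tensor_meta(i);
    if (info.error() == Error::Ok) {
      ET_LOG(
          Info,
          "input %zu: %zu bytes, memory planned: %d",
          i,
          info->nbytes(),
          static_cast<int>(info->is_memory_planned()));
    }
  }
}
```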
- ET_LOG( - Error, - "Tensor meta doesn't exist for output %zu, error is 0x%" PRIx32 - ", skipping allocating storage", - i, - static_cast(output_tensor_meta.error())); - output_storages[i] = std::vector(); + THROW_IF_ERROR( + output_tensor_meta.error(), + "Failed to get output tensor meta for output %zu", + i); + if (output_tensor_meta.get().is_memory_planned()) { + // Skip allocating storage for planned memory outputs. + output_storages.emplace_back(); continue; } + // Allocate storage for the output tensor. const size_t output_size = output_tensor_meta.get().nbytes(); - output_storages[i] = std::vector(output_size); + output_storages.emplace_back(output_size); } return output_storages; } @@ -866,6 +1005,11 @@ PYBIND11_MODULE(EXECUTORCH_PYTHON_MODULE_NAME, m) { py::arg("method_name"), py::arg("clone_outputs") = true, call_guard) + .def( + "method_meta", + &PyModule::method_meta, + py::arg("method_name"), + call_guard) .def( "run_method", &PyModule::run_method, @@ -900,6 +1044,27 @@ PYBIND11_MODULE(EXECUTORCH_PYTHON_MODULE_NAME, m) { call_guard); py::class_(m, "BundledModule"); + py::class_(m, "TensorInfo") + .def("sizes", &PyTensorInfo::sizes, call_guard) + .def("dtype", &PyTensorInfo::dtype, call_guard) + .def("is_memory_planned", &PyTensorInfo::is_memory_planned, call_guard) + .def("nbytes", &PyTensorInfo::nbytes, call_guard) + .def("__repr__", &PyTensorInfo::repr, call_guard); + py::class_(m, "MethodMeta") + .def("name", &PyMethodMeta::name, call_guard) + .def("num_inputs", &PyMethodMeta::num_inputs, call_guard) + .def("num_outputs", &PyMethodMeta::num_outputs, call_guard) + .def( + "input_tensor_meta", + &PyMethodMeta::input_tensor_meta, + py::arg("index"), + call_guard) + .def( + "output_tensor_meta", + &PyMethodMeta::output_tensor_meta, + py::arg("index"), + call_guard) + .def("__repr__", &PyMethodMeta::repr, call_guard); } } // namespace pybindings diff --git a/extension/pybindings/pybindings.pyi b/extension/pybindings/pybindings.pyi index e02ae0046f1..0b7be42ca7a 100644 --- a/extension/pybindings/pybindings.pyi +++ b/extension/pybindings/pybindings.pyi @@ -5,9 +5,21 @@ # LICENSE file in the root directory of this source tree. # pyre-strict +from __future__ import annotations + from typing import Any, Dict, List, Optional, Sequence, Tuple +from executorch.exir._warnings import experimental + +@experimental("This API is experimental and subject to change without notice.") class ExecuTorchModule: + """ExecuTorchModule is a Python wrapper around a C++ ExecuTorch program. + + .. warning:: + + This API is experimental and subject to change without notice. + """ + # pyre-ignore[2, 3]: "Any" in parameter and return type annotations. def __call__(self, inputs: Any) -> List[Any]: ... # pyre-ignore[2, 3]: "Any" in parameter and return type annotations. @@ -33,17 +45,98 @@ class ExecuTorchModule: def write_etdump_result_to_file( self, path: str, debug_buffer_path: Optional[str] = None ) -> None: ... + def method_meta(self, method_name: str) -> MethodMeta: ... + +@experimental("This API is experimental and subject to change without notice.") +class BundledModule: + """ + .. warning:: + + This API is experimental and subject to change without notice. + """ + + ... + +@experimental("This API is experimental and subject to change without notice.") +class TensorInfo: + """Metadata about a tensor such as the shape and dtype. + + .. warning:: + + This API is experimental and subject to change without notice. + """ -class BundledModule: ... 
+ def sizes(self) -> Tuple[int, ...]: + """Shape of the tensor as a tuple""" + ... + def dtype(self) -> int: + """The data type of the elements inside the tensor. + See documentation for ScalarType in executorch/runtime/core/portable_type/scalar_type.h + for the values these integers can take.""" + ... + + def is_memory_planned(self) -> bool: + """True if the tensor is already memory planned, meaning no allocation + needs to be provided. False otherwise""" + ... + + def nbytes(self) -> int: + """Number of bytes in the tensor. Not the same as numel if the dtype is + larger than 1 byte wide""" + ... + + def __repr__(self) -> str: ... + +@experimental("This API is experimental and subject to change without notice.") +class MethodMeta: + """Metadata about a method such as the number of inputs and outputs. + + .. warning:: + + This API is experimental and subject to change without notice. + """ + + def name(self) -> str: + """The name of the method, such as 'forward'""" + ... + + def num_inputs(self) -> int: + """The number of user inputs to the method. This does not include any + internal buffers or weights, which don't need to be provided by the user""" + ... + + def num_outputs(self) -> int: + """The number of outputs from the method. This does not include any mutated + internal buffers""" + ... + + def input_tensor_meta(self, index: int) -> TensorInfo: + """The tensor info for the 'index'th input. Index must be in the interval + [0, num_inputs()). Raises an IndexError if the index is out of bounds""" + ... + + def output_tensor_meta(self, index: int) -> TensorInfo: + """The tensor info for the 'index'th output. Index must be in the interval + [0, num_outputs()). Raises an IndexError if the index is out of bounds""" + ... + + def __repr__(self) -> str: ... + +@experimental("This API is experimental and subject to change without notice.") def _load_for_executorch( path: str, enable_etdump: bool = False, debug_buffer_size: int = 0 ) -> ExecuTorchModule: """Load an ExecuTorch Program from a file. + + .. warning:: + + This API is experimental and subject to change without notice. + Args: path: File path to the ExecuTorch program as a string. enable_etdump: If true, enables an ETDump which can store profiling information. - See documentation at https://pytorch.org/executorch/stable/sdk-etdump.html + See documentation at https://pytorch.org/executorch/stable/etdump.html for how to use it. debug_buffer_size: If non-zero, enables a debug buffer which can store intermediate results of each instruction in the ExecuTorch program. @@ -53,23 +146,75 @@ def _load_for_executorch( """ ... +@experimental("This API is experimental and subject to change without notice.") def _load_for_executorch_from_buffer( buffer: bytes, enable_etdump: bool = False, debug_buffer_size: int = 0 ) -> ExecuTorchModule: - """Same as _load_for_executorch, but takes a byte buffer instead of a file path.""" + """Same as _load_for_executorch, but takes a byte buffer instead of a file path. + + .. warning:: + + This API is experimental and subject to change without notice. + """ ... +@experimental("This API is experimental and subject to change without notice.") def _load_for_executorch_from_bundled_program( module: BundledModule, enable_etdump: bool = False, debug_buffer_size: int = 0 ) -> ExecuTorchModule: """Same as _load_for_executorch, but takes a bundled program instead of a file path. 
- See https://pytorch.org/executorch/stable/sdk-bundled-io.html for documentation.""" + + See https://pytorch.org/executorch/stable/bundled-io.html for documentation. + + .. warning:: + + This API is experimental and subject to change without notice. + """ ... +@experimental("This API is experimental and subject to change without notice.") def _load_bundled_program_from_buffer( buffer: bytes, non_const_pool_size: int = ... -) -> BundledModule: ... -def _get_operator_names() -> List[str]: ... -def _create_profile_block(name: str) -> None: ... -def _dump_profile_results() -> bytes: ... -def _reset_profile_results() -> None: ... +) -> BundledModule: + """ + .. warning:: + + This API is experimental and subject to change without notice. + """ + ... + +@experimental("This API is experimental and subject to change without notice.") +def _get_operator_names() -> List[str]: + """ + .. warning:: + + This API is experimental and subject to change without notice. + """ + ... + +@experimental("This API is experimental and subject to change without notice.") +def _create_profile_block(name: str) -> None: + """ + .. warning:: + + This API is experimental and subject to change without notice. + """ + ... + +@experimental("This API is experimental and subject to change without notice.") +def _dump_profile_results() -> bytes: + """ + .. warning:: + + This API is experimental and subject to change without notice. + """ + ... + +@experimental("This API is experimental and subject to change without notice.") +def _reset_profile_results() -> None: + """ + .. warning:: + + This API is experimental and subject to change without notice. + """ + ... diff --git a/extension/pybindings/test/TARGETS b/extension/pybindings/test/TARGETS index feb4779a05e..335bd68ed1e 100644 --- a/extension/pybindings/test/TARGETS +++ b/extension/pybindings/test/TARGETS @@ -3,7 +3,6 @@ load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") oncall("executorch") runtime.python_library( - # @autodeps-skip # autodeps has a real hard time tracking the owner of the pybindings # from portable and the suggested fixes I could find didnt work, so # just disabling for now diff --git a/extension/pybindings/test/make_test.py b/extension/pybindings/test/make_test.py index 708e67e4309..b44de2680fb 100644 --- a/extension/pybindings/test/make_test.py +++ b/extension/pybindings/test/make_test.py @@ -7,10 +7,11 @@ # pyre-unsafe import unittest -from typing import Any, Callable, Tuple +from typing import Any, Callable, Optional, Tuple import torch -from executorch.exir import ExecutorchProgramManager, to_edge +from executorch.exir import ExecutorchBackendConfig, ExecutorchProgramManager, to_edge +from executorch.exir.passes import MemoryPlanningPass from torch.export import export @@ -75,8 +76,25 @@ def get_methods_to_export(self): def get_inputs(self): return (torch.ones(2, 2),) + class ModuleAddConstReturn(torch.nn.Module): + """The module to serialize and execute.""" + + def __init__(self): + super(ModuleAddConstReturn, self).__init__() + self.state = torch.ones(2, 2) + + def forward(self, x): + return x + self.state, self.state + + def get_methods_to_export(self): + return ("forward",) + + def get_inputs(self): + return (torch.ones(2, 2),) + def create_program( eager_module: torch.nn.Module, + et_config: Optional[ExecutorchBackendConfig] = None, ) -> Tuple[ExecutorchProgramManager, Tuple[Any, ...]]: """Returns an executorch program based on ModuleAdd, along with inputs.""" @@ -103,7 +121,7 @@ def forward(self, *args, **kwargs): ) 
exported_methods[method_name] = export(wrapped_mod, method_input) - exec_prog = to_edge(exported_methods).to_executorch() + exec_prog = to_edge(exported_methods).to_executorch(config=et_config) # Create the ExecuTorch program from the graph. exec_prog.dump_executorch_program(verbose=True) @@ -251,6 +269,90 @@ def test_quantized_ops(tester): expected = example_inputs[0] + example_inputs[1] tester.assertEqual(str(expected), str(executorch_output)) + def test_constant_output_not_memory_planned(tester): + # Create an ExecuTorch program from ModuleAdd. + exported_program, inputs = create_program( + ModuleAddConstReturn(), + et_config=ExecutorchBackendConfig( + memory_planning_pass=MemoryPlanningPass(alloc_graph_output=False) + ), + ) + + exported_program.dump_executorch_program(verbose=True) + + # Use pybindings to load and execute the program. + executorch_module = load_fn(exported_program.buffer) + # Invoke the callable on executorch_module instead of calling module.forward. + # Use only one input to test this case. + executorch_output = executorch_module((torch.ones(2, 2),)) + print(executorch_output) + + # The test module adds the input to torch.ones(2,2), so its output should be the same + # as adding them directly. + expected = torch.ones(2, 2) + torch.ones(2, 2) + tester.assertEqual(str(expected), str(executorch_output[0])) + + # The test module returns the state. Check that its value is correct. + tester.assertEqual(str(torch.ones(2, 2)), str(executorch_output[1])) + + def test_method_meta(tester) -> None: + # pyre-fixme[16]: Callable `make_test` has no attribute `wrapper`. + exported_program, inputs = create_program(ModuleAdd()) + + # Use pybindings to load the program and query its metadata. + executorch_module = load_fn(exported_program.buffer) + meta = executorch_module.method_meta("forward") + + # Ensure that all these APIs work even if the module object is destroyed. + del executorch_module + tester.assertEqual(meta.name(), "forward") + tester.assertEqual(meta.num_inputs(), 2) + tester.assertEqual(meta.num_outputs(), 1) + # Common string for all these tensors. + tensor_info = "TensorInfo(sizes=[2, 2], dtype=Float, is_memory_planned=True, nbytes=16)" + float_dtype = 6 + tester.assertEqual( + str(meta), + "MethodMeta(name='forward', num_inputs=2, " + f"input_tensor_meta=['{tensor_info}', '{tensor_info}'], " + f"num_outputs=1, output_tensor_meta=['{tensor_info}'])", + ) + + input_tensors = [meta.input_tensor_meta(i) for i in range(2)] + output_tensor = meta.output_tensor_meta(0) + # Check that accessing out of bounds raises IndexError. + with tester.assertRaises(IndexError): + meta.input_tensor_meta(2) + # Test that tensor metadata can outlive method metadata. + del meta + tester.assertEqual([t.sizes() for t in input_tensors], [(2, 2), (2, 2)]) + tester.assertEqual( + [t.dtype() for t in input_tensors], [float_dtype, float_dtype] + ) + tester.assertEqual( + [t.is_memory_planned() for t in input_tensors], [True, True] + ) + tester.assertEqual([t.nbytes() for t in input_tensors], [16, 16]) + tester.assertEqual(str(input_tensors), f"[{tensor_info}, {tensor_info}]") + + tester.assertEqual(output_tensor.sizes(), (2, 2)) + tester.assertEqual(output_tensor.dtype(), float_dtype) + tester.assertEqual(output_tensor.is_memory_planned(), True) + tester.assertEqual(output_tensor.nbytes(), 16) + tester.assertEqual(str(output_tensor), tensor_info) + + def test_bad_name(tester) -> None: + # Create an ExecuTorch program from ModuleAdd. 
+ # pyre-fixme[16]: Callable `make_test` has no attribute `wrapper`. + exported_program, inputs = create_program(ModuleAdd()) + + # Use pybindings to load and execute the program. + executorch_module = load_fn(exported_program.buffer) + # Invoke the callable on executorch_module instead of calling module.forward. + with tester.assertRaises(RuntimeError): + executorch_module.run_method("not_a_real_method", inputs) + + ######### RUN TEST CASES ######### test_e2e(tester) test_multiple_entry(tester) test_output_lifespan(tester) @@ -258,5 +360,8 @@ def test_quantized_ops(tester): test_module_single_input(tester) test_stderr_redirect(tester) test_quantized_ops(tester) + test_constant_output_not_memory_planned(tester) + test_method_meta(tester) + test_bad_name(tester) return wrapper diff --git a/extension/tensor/CMakeLists.txt b/extension/tensor/CMakeLists.txt index 2cf1bf2956f..31ee1b7fd4e 100644 --- a/extension/tensor/CMakeLists.txt +++ b/extension/tensor/CMakeLists.txt @@ -18,7 +18,7 @@ endif() list(TRANSFORM _extension_tensor__srcs PREPEND "${EXECUTORCH_ROOT}/") add_library(extension_tensor ${_extension_tensor__srcs}) -target_link_libraries(extension_tensor executorch_no_prim_ops) +target_link_libraries(extension_tensor executorch_core) target_include_directories(extension_tensor PUBLIC ${EXECUTORCH_ROOT}/..) target_compile_options(extension_tensor PUBLIC ${_common_compile_options}) diff --git a/extension/tensor/targets.bzl b/extension/tensor/targets.bzl index 8493d093fa1..2a8f9193571 100644 --- a/extension/tensor/targets.bzl +++ b/extension/tensor/targets.bzl @@ -13,13 +13,11 @@ def define_common_targets(): runtime.cxx_library( name = "tensor" + aten_suffix, srcs = [ - "tensor_impl_ptr.cpp", "tensor_ptr.cpp", "tensor_ptr_maker.cpp", ], exported_headers = [ "tensor.h", - "tensor_impl_ptr.h", "tensor_ptr.h", "tensor_ptr_maker.h", ], diff --git a/extension/tensor/tensor.h b/extension/tensor/tensor.h index 0de8c39b75d..80a41018a20 100644 --- a/extension/tensor/tensor.h +++ b/extension/tensor/tensor.h @@ -9,6 +9,5 @@ #pragma once // Umbrella header for the Tensor extension. -#include #include #include diff --git a/extension/tensor/tensor_impl_ptr.cpp b/extension/tensor/tensor_impl_ptr.cpp deleted file mode 100644 index 01f69095b23..00000000000 --- a/extension/tensor/tensor_impl_ptr.cpp +++ /dev/null @@ -1,148 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#include - -#include -#include - -#include - -namespace executorch { -namespace extension { -namespace { -#ifndef USE_ATEN_LIB -// No-op deleter that does nothing when called. -static void noop_deleter(void*) {} - -/** - * Custom deleter for TensorImplPtr that ensures the memory associated with - * dynamic metadata (sizes, dim_order, and strides) is properly managed when the - * TensorImpl is destroyed. - * - * Since TensorImpl does not own the metadata arrays (sizes, dim_order, - * strides), this deleter is responsible for releasing that memory when the - * TensorImpl is destroyed. - */ -struct TensorImplPtrDeleter final { - // A custom deleter of the std::shared_ptr is required to be copyable until - // C++20, so any data it holds must be copyable too. Hence, we use shared_ptr - // to hold the data and metadata to avoid unnecessary copies. 
- std::shared_ptr data; - std::shared_ptr> sizes; - std::shared_ptr> dim_order; - std::shared_ptr> strides; - - void operator()(exec_aten::TensorImpl* pointer) { - // Release all resources immediately since the data held by the - // TensorImplPtrDeleter is tied to the managed object, not the smart pointer - // itself. We need to free this memory when the object is destroyed, not - // when the smart pointer (and deleter) are eventually destroyed or reset. - data.reset(); - sizes.reset(); - dim_order.reset(); - strides.reset(); - delete pointer; - } -}; -#endif // USE_ATEN_LIB -} // namespace - -TensorImplPtr make_tensor_impl_ptr( - exec_aten::ScalarType type, - std::vector sizes, - void* data, - std::vector dim_order, - std::vector strides, - exec_aten::TensorShapeDynamism dynamism, - std::function deleter) { - const auto dim = sizes.size(); - if (dim_order.empty()) { - dim_order.resize(dim); - std::iota(dim_order.begin(), dim_order.end(), 0); - if (!strides.empty()) { - std::sort(dim_order.begin(), dim_order.end(), [&](size_t a, size_t b) { - return strides[a] > strides[b]; - }); - } - } - std::vector computed_strides(dim); - auto error = runtime::dim_order_to_stride( - sizes.data(), dim_order.data(), dim, computed_strides.data()); - ET_CHECK_MSG(error == runtime::Error::Ok, "Failed to compute strides."); - - if (!strides.empty()) { - ET_CHECK_MSG(computed_strides == strides, "Invalid strides provided."); - } else { - strides = std::move(computed_strides); - } -#ifndef USE_ATEN_LIB - auto tensor_impl = std::make_unique( - type, - dim, - sizes.data(), - data, - dim_order.data(), - strides.data(), - dynamism); - return TensorImplPtr( - tensor_impl.release(), - TensorImplPtrDeleter{ - std::shared_ptr( - data, deleter ? std::move(deleter) : noop_deleter), - std::make_shared>(std::move(sizes)), - std::make_shared>( - std::move(dim_order)), - std::make_shared>( - std::move(strides))}); -#else - auto options = c10::TensorOptions() - .dtype(c10::scalarTypeToTypeMeta(type)) - .device(c10::kCPU); - auto storage = c10::Storage( - c10::Storage::use_byte_size_t(), - at::detail::computeStorageNbytes( - sizes, strides, options.dtype().itemsize()), - c10::InefficientStdFunctionContext::makeDataPtr( - data, std::move(deleter), options.device()), - nullptr, - false); - auto tensor_impl = c10::make_intrusive( - std::move(storage), - c10::DispatchKeySet(c10::DispatchKey::CPU), - options.dtype()); - tensor_impl->set_sizes_and_strides(sizes, strides); - return tensor_impl; -#endif // USE_ATEN_LIB -} - -TensorImplPtr make_tensor_impl_ptr( - exec_aten::ScalarType scalar_type, - std::vector sizes, - std::vector data, - std::vector dim_order, - std::vector strides, - exec_aten::TensorShapeDynamism dynamism) { - ET_CHECK_MSG( - data.size() >= exec_aten::compute_numel(sizes.data(), sizes.size()) * - exec_aten::elementSize(scalar_type), - "Data size is smaller than required by sizes and scalar type."); - auto raw_data_ptr = data.data(); - auto data_ptr = std::make_shared>(std::move(data)); - return make_tensor_impl_ptr( - scalar_type, - std::move(sizes), - raw_data_ptr, - std::move(dim_order), - std::move(strides), - dynamism, - [data_ptr = std::move(data_ptr)](void*) {}); -} - -} // namespace extension -} // namespace executorch diff --git a/extension/tensor/tensor_impl_ptr.h b/extension/tensor/tensor_impl_ptr.h deleted file mode 100644 index 5f34f929b96..00000000000 --- a/extension/tensor/tensor_impl_ptr.h +++ /dev/null @@ -1,171 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. 
- * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#pragma once - -#include -#include -#include - -#include -#include -#include - -namespace executorch { -namespace extension { - -#ifndef USE_ATEN_LIB -/** - * A smart pointer type for managing the lifecycle of a TensorImpl. - * - * TensorImplPtr uses a shared pointer because multiple Tensor objects might - * share the same underlying data and metadata. This shared ownership model - * ensures that the TensorImpl is only destroyed when all references to it are - * gone, providing a safe and efficient way to manage shared tensor - * implementations. This abstraction is designed to be a safer and more - * convenient alternative to the original TensorImpl, which does not - * manage metadata by design. - */ -using TensorImplPtr = std::shared_ptr; -#else -/** - * A smart pointer type for managing the lifecycle of a TensorImpl. - * - * TensorImplPtr uses an intrusive pointer when working with ATen, ensuring - * efficient reference counting and shared ownership of the underlying data and - * metadata. - */ -using TensorImplPtr = - c10::intrusive_ptr; -#endif // USE_ATEN_LIB - -/** - * Creates a TensorImplPtr that manages a newly created TensorImpl with the - * specified properties. - * - * @param type The scalar type of the tensor elements. - * @param sizes A vector specifying the size of each dimension. - * @param data A pointer to the data buffer. - * @param dim_order A vector specifying the order of dimensions. - * @param strides A vector specifying the strides of each dimension. - * @param dynamism Specifies the mutability of the tensor's shape. - * @param deleter A custom deleter function for managing the lifetime of the - * data buffer. If provided, this deleter will be called when the managed - * TensorImpl object is destroyed. - * @return A TensorImplPtr managing the newly created TensorImpl. - */ -TensorImplPtr make_tensor_impl_ptr( - exec_aten::ScalarType type, - std::vector sizes, - void* data, - std::vector dim_order = {}, - std::vector strides = {}, - exec_aten::TensorShapeDynamism dynamism = - exec_aten::TensorShapeDynamism::DYNAMIC_BOUND, - std::function deleter = nullptr); - -/** - * Creates a TensorImplPtr that manages a newly created TensorImpl with the - * specified properties. - * - * This template overload is specialized for cases where the tensor data is - * provided as a vector. The scalar type is automatically deduced from the - * vector's data type. The deleter ensures that the data vector is properly - * managed and its lifetime is tied to the TensorImpl. - * - * @tparam T The C++ type of the tensor elements, deduced from the vector. - * @param sizes A vector specifying the size of each dimension. - * @param data A vector containing the tensor's data. - * @param dim_order A vector specifying the order of dimensions. - * @param strides A vector specifying the strides of each dimension. - * @param dynamism Specifies the mutability of the tensor's shape. - * @return A TensorImplPtr that manages the newly created TensorImpl. 
- */ -template -TensorImplPtr make_tensor_impl_ptr( - std::vector sizes, - std::vector data, - std::vector dim_order = {}, - std::vector strides = {}, - exec_aten::TensorShapeDynamism dynamism = - exec_aten::TensorShapeDynamism::DYNAMIC_BOUND) { - constexpr exec_aten::ScalarType scalar_type = - runtime::CppTypeToScalarType::value; - const auto raw_data_ptr = data.data(); - auto data_ptr = std::make_shared>(std::move(data)); - return make_tensor_impl_ptr( - scalar_type, - std::move(sizes), - raw_data_ptr, - std::move(dim_order), - std::move(strides), - dynamism, - [data_ptr = std::move(data_ptr)](void*) {}); -} - -/** - * Creates a TensorImplPtr that manages a newly created TensorImpl with the - * specified properties. - * - * This template overload is specialized for cases where the tensor data is - * provided as a vector. The scalar type is automatically deduced from the - * vector's data type. The deleter ensures that the data vector is properly - * managed and its lifetime is tied to the TensorImpl. - * - * @tparam T The C++ type of the tensor elements, deduced from the vector. - * @param data A vector containing the tensor's data. - * @param dynamism Specifies the mutability of the tensor's shape. - * @return A TensorImplPtr that manages the newly created TensorImpl. - */ -template -TensorImplPtr make_tensor_impl_ptr( - std::vector data, - exec_aten::TensorShapeDynamism dynamism = - exec_aten::TensorShapeDynamism::DYNAMIC_BOUND) { - constexpr exec_aten::ScalarType scalar_type = - runtime::CppTypeToScalarType::value; - std::vector sizes{exec_aten::SizesType(data.size())}; - const auto raw_data_ptr = data.data(); - auto data_ptr = std::make_shared>(std::move(data)); - return make_tensor_impl_ptr( - scalar_type, - std::move(sizes), - raw_data_ptr, - {0}, - {1}, - dynamism, - [data_ptr = std::move(data_ptr)](void*) {}); -} - -/** - * Creates a TensorImplPtr that manages a newly created TensorImpl with the - * specified properties. - * - * This overload accepts a raw memory buffer stored in a std::vector - * and a scalar type to interpret the data. The vector is managed, and the - * memory's lifetime is tied to the TensorImpl. - * - * @param scalar_type The scalar type of the tensor elements. - * @param sizes A vector specifying the size of each dimension. - * @param data A vector containing the raw memory for the tensor's data. - * @param dim_order A vector specifying the order of dimensions. - * @param strides A vector specifying the strides of each dimension. - * @param dynamism Specifies the mutability of the tensor's shape. - * @return A TensorImplPtr managing the newly created TensorImpl. - */ -TensorImplPtr make_tensor_impl_ptr( - exec_aten::ScalarType scalar_type, - std::vector sizes, - std::vector data, - std::vector dim_order = {}, - std::vector strides = {}, - exec_aten::TensorShapeDynamism dynamism = - exec_aten::TensorShapeDynamism::DYNAMIC_BOUND); - -} // namespace extension -} // namespace executorch diff --git a/extension/tensor/tensor_ptr.cpp b/extension/tensor/tensor_ptr.cpp index 7a0aa997f02..4e660cd6f82 100644 --- a/extension/tensor/tensor_ptr.cpp +++ b/extension/tensor/tensor_ptr.cpp @@ -8,10 +8,180 @@ #include +#include + #include namespace executorch { namespace extension { +namespace { +#ifndef USE_ATEN_LIB +/** + * A structure that consolidates the metadata (sizes, dim_order, strides) and + * the data buffer associated with a Tensor. 
Since Tensor does not own + * the memory for these metadata arrays or the data itself, this structure + * ensures that they are managed together and have the same lifetime as the + * Tensor. When the Tensor is destroyed, the Storage structure ensures + * proper cleanup of the associated metadata and data if needed. + */ +struct Storage final { + exec_aten::TensorImpl tensor_impl; + exec_aten::Tensor tensor; + std::vector sizes; + std::vector dim_order; + std::vector strides; + std::function deleter; + + Storage( + exec_aten::TensorImpl&& tensor_impl, + std::vector&& sizes, + std::vector&& dim_order, + std::vector&& strides, + std::function&& deleter) + : tensor_impl(std::move(tensor_impl)), + tensor(&this->tensor_impl), + sizes(std::move(sizes)), + dim_order(std::move(dim_order)), + strides(std::move(strides)), + deleter(std::move(deleter)) {} + + ~Storage() { + if (deleter) { + deleter(tensor_impl.mutable_data()); + } + } +}; +#endif // USE_ATEN_LIB +} // namespace + +TensorPtr make_tensor_ptr( + std::vector sizes, + void* data, + std::vector dim_order, + std::vector strides, + exec_aten::ScalarType type, + exec_aten::TensorShapeDynamism dynamism, + std::function deleter) { + const auto dim = sizes.size(); + ET_CHECK_MSG( + dim_order.empty() || dim_order.size() == dim, + "dim_order size must match sizes or be empty."); + ET_CHECK_MSG( + strides.empty() || strides.size() == dim, + "strides size must match sizes or be empty."); + + if (dim_order.empty()) { + dim_order.resize(dim); + std::iota(dim_order.begin(), dim_order.end(), 0); + if (!strides.empty()) { + std::sort(dim_order.begin(), dim_order.end(), [&](size_t a, size_t b) { + return strides[a] > strides[b]; + }); + } + } + std::vector computed_strides(dim); + auto error = runtime::dim_order_to_stride( + sizes.data(), dim_order.data(), dim, computed_strides.data()); + ET_CHECK_MSG(error == runtime::Error::Ok, "Failed to compute strides."); + + if (!strides.empty()) { + ET_CHECK_MSG(computed_strides == strides, "Invalid strides provided."); + } else { + strides = std::move(computed_strides); + } +#ifndef USE_ATEN_LIB + exec_aten::TensorImpl tensor_impl( + type, + dim, + sizes.data(), + data, + dim_order.data(), + strides.data(), + dim > 0 ? 
dynamism : exec_aten::TensorShapeDynamism::STATIC); + auto storage = std::make_shared( + std::move(tensor_impl), + std::move(sizes), + std::move(dim_order), + std::move(strides), + std::move(deleter)); + const auto tensor_ptr = &storage->tensor; + return std::shared_ptr(std::move(storage), tensor_ptr); +#else + auto options = c10::TensorOptions() + .dtype(c10::scalarTypeToTypeMeta(type)) + .device(c10::kCPU); + auto storage = c10::Storage( + c10::Storage::use_byte_size_t(), + at::detail::computeStorageNbytes( + sizes, strides, options.dtype().itemsize()), + c10::InefficientStdFunctionContext::makeDataPtr( + data, std::move(deleter), options.device()), + nullptr, + false); + auto tensor_impl = c10::make_intrusive( + std::move(storage), + c10::DispatchKeySet(c10::DispatchKey::CPU), + options.dtype()); + tensor_impl->set_sizes_and_strides(sizes, strides); + return std::make_shared(std::move(tensor_impl)); +#endif // USE_ATEN_LIB +} + +TensorPtr make_tensor_ptr( + std::vector sizes, + std::vector data, + std::vector dim_order, + std::vector strides, + exec_aten::ScalarType type, + exec_aten::TensorShapeDynamism dynamism) { + ET_CHECK_MSG( + data.size() >= exec_aten::compute_numel(sizes.data(), sizes.size()) * + exec_aten::elementSize(type), + "Data size is smaller than required by sizes and scalar type."); + auto data_ptr = data.data(); + return make_tensor_ptr( + std::move(sizes), + data_ptr, + std::move(dim_order), + std::move(strides), + type, + dynamism, + // Data is moved into the deleter and is destroyed together with Storage. + [data = std::move(data)](void*) {}); +} + +TensorPtr clone_tensor_ptr(const exec_aten::Tensor& tensor) { + std::vector sizes( + tensor.sizes().begin(), tensor.sizes().end()); + std::vector dim_order{ +#ifndef USE_ATEN_LIB + tensor.dim_order().begin(), tensor.dim_order().end() +#endif // USE_ATEN_LIB + }; + std::vector strides( + tensor.strides().begin(), tensor.strides().end()); + auto dynamism = exec_aten::TensorShapeDynamism::DYNAMIC_BOUND; +#ifndef USE_ATEN_LIB + dynamism = tensor.shape_dynamism(); +#endif // USE_ATEN_LIB + return tensor.const_data_ptr() + ? make_tensor_ptr( + std::move(sizes), + std::vector( + (uint8_t*)tensor.const_data_ptr(), + (uint8_t*)tensor.const_data_ptr() + tensor.nbytes()), + std::move(dim_order), + std::move(strides), + tensor.scalar_type(), + dynamism) + : make_tensor_ptr( + std::move(sizes), + nullptr, + std::move(dim_order), + std::move(strides), + tensor.scalar_type(), + dynamism); +} runtime::Error resize_tensor_ptr( TensorPtr& tensor, diff --git a/extension/tensor/tensor_ptr.h b/extension/tensor/tensor_ptr.h index f477199a3e1..44f42186cbf 100644 --- a/extension/tensor/tensor_ptr.h +++ b/extension/tensor/tensor_ptr.h @@ -8,110 +8,52 @@ #pragma once -#include +#include +#include +#include + #include +#include +#include namespace executorch { namespace extension { -#ifndef USE_ATEN_LIB -namespace internal { -/** - * Custom deleter for TensorPtr that ensures the associated TensorImplPtr - * is properly managed. - * - * Since Tensor does not own its TensorImpl, this deleter is responsible for - * managing the lifecycle of the TensorImplPtr, ensuring that the dynamic - * metadata (sizes, dim_order, strides) is properly released when the Tensor is - * destroyed. - */ -struct TensorPtrDeleter final { - TensorImplPtr tensor_impl; - - void operator()(exec_aten::Tensor* pointer) { - // Release all resources immediately since the data held by the - // TensorPtrDeleter is tied to the managed object, not the smart pointer - // itself. 
We need to free this memory when the object is destroyed, not - // when the smart pointer (and deleter) are eventually destroyed or reset. - tensor_impl.reset(); - delete pointer; - } -}; -} // namespace internal - /** * A smart pointer type for managing the lifecycle of a Tensor. - * - * TensorPtr uses a unique pointer to enforce that each Tensor object has - * distinct ownership. This abstraction serves as a more convenient and safer - * replacement for the standard Tensor, which does not manage its - * metadata by design. Using TensorPtr simplifies memory management and ensures - * that the underlying TensorImpl is safely shared among tensors when needed. */ -using TensorPtr = - std::unique_ptr; -#else -/** - * A smart pointer type for managing the lifecycle of a Tensor. - * - * When using ATen, this is a standard unique_ptr for exec_aten::Tensor. - * In ATen, the Tensor class owns its TensorImpl and associated metadata, - * so no custom deleter is required. - */ -using TensorPtr = std::unique_ptr; -#endif // USE_ATEN_LIB +using TensorPtr = std::shared_ptr; /** - * Creates a new TensorPtr that manages a newly created Tensor with the given - * TensorImplPtr. - * - * This function wraps the provided TensorImplPtr in a TensorPtr, ensuring that - * the Tensor object's lifecycle is properly managed. The TensorPtr will - * uniquely own the Tensor object, while the underlying TensorImplPtr may be - * shared with other Tensors. + * Creates a TensorPtr that manages a Tensor with the specified properties. * - * @param tensor_impl A TensorImplPtr to the TensorImpl to be managed. + * @param sizes A vector specifying the size of each dimension. + * @param data A pointer to the data buffer. + * @param dim_order A vector specifying the order of dimensions. + * @param strides A vector specifying the strides of the tensor. + * @param type The scalar type of the tensor elements. + * @param dynamism Specifies the mutability of the tensor's shape. + * @param deleter A custom deleter function for managing the lifetime of the + * data buffer. If provided, this deleter will be called when the managed Tensor + * object is destroyed. * @return A TensorPtr that manages the newly created Tensor. */ -inline TensorPtr make_tensor_ptr(TensorImplPtr tensor_impl) { -#ifndef USE_ATEN_LIB - auto tensor = std::make_unique(tensor_impl.get()); - return TensorPtr( - tensor.release(), internal::TensorPtrDeleter{std::move(tensor_impl)}); -#else - return std::make_unique(std::move(tensor_impl)); -#endif // USE_ATEN_LIB -} - -/** - * Creates a new TensorPtr that shares the same TensorImplPtr as an existing - * TensorPtr. - * - * This function creates a new TensorPtr that shares the - * underlying TensorImpl with the provided TensorPtr, ensuring that the - * underlying data and metadata are not duplicated but safely shared between the - * tensor objects. - * - * @param tensor A TensorPtr to the existing Tensor from which to create a copy. - * @return A new TensorPtr that shares the underlying TensorImplPtr with the - * original. 
- */ -inline TensorPtr make_tensor_ptr(const TensorPtr& tensor) { -#ifndef USE_ATEN_LIB - return make_tensor_ptr(tensor.get_deleter().tensor_impl); -#else - return make_tensor_ptr(tensor->getIntrusivePtr()); -#endif // USE_ATEN_LIB -} +TensorPtr make_tensor_ptr( + std::vector sizes, + void* data, + std::vector dim_order, + std::vector strides, + const exec_aten::ScalarType type = exec_aten::ScalarType::Float, + const exec_aten::TensorShapeDynamism dynamism = + exec_aten::TensorShapeDynamism::DYNAMIC_BOUND, + std::function deleter = nullptr); /** * Creates a TensorPtr that manages a Tensor with the specified properties. * - * @param type The scalar type of the tensor elements. * @param sizes A vector specifying the size of each dimension. * @param data A pointer to the data buffer. - * @param dim_order A vector specifying the order of dimensions. - * @param strides A vector specifying the strides of the tensor. + * @param type The scalar type of the tensor elements. * @param dynamism Specifies the mutability of the tensor's shape. * @param deleter A custom deleter function for managing the lifetime of the * data buffer. If provided, this deleter will be called when the managed Tensor @@ -119,22 +61,14 @@ inline TensorPtr make_tensor_ptr(const TensorPtr& tensor) { * @return A TensorPtr that manages the newly created Tensor. */ inline TensorPtr make_tensor_ptr( - const exec_aten::ScalarType type, std::vector sizes, void* data, - std::vector dim_order = {}, - std::vector strides = {}, + const exec_aten::ScalarType type = exec_aten::ScalarType::Float, const exec_aten::TensorShapeDynamism dynamism = exec_aten::TensorShapeDynamism::DYNAMIC_BOUND, std::function deleter = nullptr) { - return make_tensor_ptr(make_tensor_impl_ptr( - type, - std::move(sizes), - data, - std::move(dim_order), - std::move(strides), - dynamism, - std::move(deleter))); + return make_tensor_ptr( + std::move(sizes), data, {}, {}, type, dynamism, std::move(deleter)); } /** @@ -142,30 +76,66 @@ inline TensorPtr make_tensor_ptr( * * This template overload is specialized for cases where the tensor data is * provided as a vector. The scalar type is automatically deduced from the - * vector's data type. + * vector's data type. If the specified `type` differs from the deduced type of + * the vector's elements, and casting is allowed, the data will be cast to the + * specified `type`. This allows for flexible creation of tensors with data + * vectors of one type and a different scalar type. * * @tparam T The C++ type of the tensor elements, deduced from the vector. * @param sizes A vector specifying the size of each dimension. * @param data A vector containing the tensor's data. * @param dim_order A vector specifying the order of dimensions. * @param strides A vector specifying the strides of each dimension. + * @param type The scalar type of the tensor elements. If it differs from the + * deduced type, the data will be cast to this type if allowed. * @param dynamism Specifies the mutability of the tensor's shape. * @return A TensorPtr that manages the newly created TensorImpl. 
*/ -template -TensorPtr make_tensor_ptr( +template < + typename T = float, + exec_aten::ScalarType deduced_type = runtime::CppTypeToScalarType::value> +inline TensorPtr make_tensor_ptr( std::vector sizes, std::vector data, std::vector dim_order = {}, std::vector strides = {}, + exec_aten::ScalarType type = deduced_type, exec_aten::TensorShapeDynamism dynamism = exec_aten::TensorShapeDynamism::DYNAMIC_BOUND) { - return make_tensor_ptr(make_tensor_impl_ptr( + if (type != deduced_type) { + ET_CHECK_MSG( + runtime::canCast(deduced_type, type), + "Cannot cast deduced type to specified type."); + std::vector casted_data(data.size() * runtime::elementSize(type)); + ET_SWITCH_REALHBBF16_TYPES(type, nullptr, "make_tensor_ptr", CTYPE, [&] { + std::transform( + data.begin(), + data.end(), + reinterpret_cast(casted_data.data()), + [](const T& val) { return static_cast(val); }); + }); + const auto raw_data_ptr = casted_data.data(); + auto data_ptr = + std::make_shared>(std::move(casted_data)); + return make_tensor_ptr( + std::move(sizes), + raw_data_ptr, + std::move(dim_order), + std::move(strides), + type, + dynamism, + [data_ptr = std::move(data_ptr)](void*) {}); + } + const auto raw_data_ptr = data.data(); + auto data_ptr = std::make_shared>(std::move(data)); + return make_tensor_ptr( std::move(sizes), - std::move(data), + raw_data_ptr, std::move(dim_order), std::move(strides), - dynamism)); + type, + dynamism, + [data_ptr = std::move(data_ptr)](void*) {}); } /** @@ -173,19 +143,71 @@ TensorPtr make_tensor_ptr( * * This template overload is specialized for cases where the tensor data is * provided as a vector. The scalar type is automatically deduced from the - * vector's data type. + * vector's data type. If the specified `type` differs from the deduced type of + * the vector's elements, and casting is allowed, the data will be cast to the + * specified `type`. This allows for flexible creation of tensors with data + * vectors of one type and a different scalar type. * * @tparam T The C++ type of the tensor elements, deduced from the vector. * @param data A vector containing the tensor's data. + * @param type The scalar type of the tensor elements. If it differs from the + * deduced type, the data will be cast to this type if allowed. * @param dynamism Specifies the mutability of the tensor's shape. * @return A TensorPtr that manages the newly created TensorImpl. */ -template -TensorPtr make_tensor_ptr( +template < + typename T = float, + exec_aten::ScalarType deduced_type = runtime::CppTypeToScalarType::value> +inline TensorPtr make_tensor_ptr( std::vector data, + exec_aten::ScalarType type = deduced_type, + exec_aten::TensorShapeDynamism dynamism = + exec_aten::TensorShapeDynamism::DYNAMIC_BOUND) { + std::vector sizes{exec_aten::SizesType(data.size())}; + return make_tensor_ptr( + std::move(sizes), std::move(data), {0}, {1}, type, dynamism); +} + +/** + * Creates a TensorPtr that manages a Tensor with the specified properties. + * + * This template overload is specialized for cases where the tensor data is + * provided as an initializer list. The scalar type is automatically deduced + * from the initializer list's data type. If the specified `type` differs from + * the deduced type of the initializer list's elements, and casting is allowed, + * the data will be cast to the specified `type`. This allows for flexible + * creation of tensors with data vectors of one type and a different scalar + * type. + * + * @tparam T The C++ type of the tensor elements, deduced from the initializer + * list. 
+ * @param sizes A vector specifying the size of each dimension. + * @param list An initializer list containing the tensor's data. + * @param dim_order A vector specifying the order of dimensions. + * @param strides A vector specifying the strides of each dimension. + * @param type The scalar type of the tensor elements. If it differs from the + * deduced type, the data will be cast to this type if allowed. + * @param dynamism Specifies the mutability of the tensor's shape. + * @return A TensorPtr that manages the newly created TensorImpl. + */ +template < + typename T = float, + exec_aten::ScalarType deduced_type = runtime::CppTypeToScalarType::value> +inline TensorPtr make_tensor_ptr( + std::vector sizes, + std::initializer_list list, + std::vector dim_order = {}, + std::vector strides = {}, + exec_aten::ScalarType type = deduced_type, exec_aten::TensorShapeDynamism dynamism = exec_aten::TensorShapeDynamism::DYNAMIC_BOUND) { - return make_tensor_ptr(make_tensor_impl_ptr(std::move(data), dynamism)); + return make_tensor_ptr( + std::move(sizes), + std::vector(std::move(list)), + std::move(dim_order), + std::move(strides), + type, + dynamism); } /** @@ -193,20 +215,43 @@ TensorPtr make_tensor_ptr( * * This template overload allows creating a Tensor from an initializer list * of data. The scalar type is automatically deduced from the type of the - * initializer list's elements. + * initializer list's elements. If the specified `type` differs from + * the deduced type of the initializer list's elements, and casting is allowed, + * the data will be cast to the specified `type`. This allows for flexible + * creation of tensors with data vectors of one type and a different scalar + * type. * * @tparam T The C++ type of the tensor elements, deduced from the initializer * list. - * @param data An initializer list containing the tensor's data. + * @param list An initializer list containing the tensor's data. + * @param type The scalar type of the tensor elements. If it differs from the + * deduced type, the data will be cast to this type if allowed. * @param dynamism Specifies the mutability of the tensor's shape. * @return A TensorPtr that manages the newly created TensorImpl. */ -template -TensorPtr make_tensor_ptr( - std::initializer_list data, +template < + typename T = float, + exec_aten::ScalarType deduced_type = runtime::CppTypeToScalarType::value> +inline TensorPtr make_tensor_ptr( + std::initializer_list list, + exec_aten::ScalarType type = deduced_type, exec_aten::TensorShapeDynamism dynamism = exec_aten::TensorShapeDynamism::DYNAMIC_BOUND) { - return make_tensor_ptr(std::vector(data), dynamism); + std::vector sizes{exec_aten::SizesType(list.size())}; + return make_tensor_ptr( + std::move(sizes), std::move(list), {0}, {1}, type, dynamism); +} + +/** + * Creates a TensorPtr that manages a Tensor with a single scalar value. + * + * @tparam T The C++ type of the scalar value. + * @param value The scalar value to be used for the Tensor. + * @return A TensorPtr that manages the newly created TensorImpl. + */ +template +inline TensorPtr make_tensor_ptr(T value) { + return make_tensor_ptr({}, std::vector{value}); } /** @@ -216,37 +261,104 @@ TensorPtr make_tensor_ptr( * and a scalar type to interpret the data. The vector is managed, and the * memory's lifetime is tied to the TensorImpl. * - * @param scalar_type The scalar type of the tensor elements. * @param sizes A vector specifying the size of each dimension. * @param data A vector containing the raw memory for the tensor's data. 
* @param dim_order A vector specifying the order of dimensions. * @param strides A vector specifying the strides of each dimension. + * @param type The scalar type of the tensor elements. + * @param dynamism Specifies the mutability of the tensor's shape. + * @return A TensorPtr managing the newly created Tensor. + */ +TensorPtr make_tensor_ptr( + std::vector sizes, + std::vector data, + std::vector dim_order, + std::vector strides, + exec_aten::ScalarType type = exec_aten::ScalarType::Float, + exec_aten::TensorShapeDynamism dynamism = + exec_aten::TensorShapeDynamism::DYNAMIC_BOUND); + +/** + * Creates a TensorPtr that manages a Tensor with the specified properties. + * + * This overload accepts a raw memory buffer stored in a std::vector + * and a scalar type to interpret the data. The vector is managed, and the + * memory's lifetime is tied to the TensorImpl. + * + * @param sizes A vector specifying the size of each dimension. + * @param data A vector containing the raw memory for the tensor's data. + * @param type The scalar type of the tensor elements. * @param dynamism Specifies the mutability of the tensor's shape. * @return A TensorPtr managing the newly created Tensor. */ inline TensorPtr make_tensor_ptr( - exec_aten::ScalarType scalar_type, std::vector sizes, std::vector data, - std::vector dim_order = {}, - std::vector strides = {}, + exec_aten::ScalarType type = exec_aten::ScalarType::Float, exec_aten::TensorShapeDynamism dynamism = exec_aten::TensorShapeDynamism::DYNAMIC_BOUND) { - return make_tensor_ptr(make_tensor_impl_ptr( - scalar_type, - std::move(sizes), - std::move(data), - std::move(dim_order), - std::move(strides), - dynamism)); + return make_tensor_ptr( + std::move(sizes), std::move(data), {}, {}, type, dynamism); +} + +/** + * Creates a TensorPtr to manage a new Tensor with the same properties + * as the given Tensor, sharing the same data without owning it. + * + * @param tensor The Tensor whose properties are used to create a new TensorPtr. + * @return A new TensorPtr managing a Tensor with the same properties as the + * original. + */ +inline TensorPtr make_tensor_ptr(const exec_aten::Tensor& tensor) { + return make_tensor_ptr( + std::vector( + tensor.sizes().begin(), tensor.sizes().end()), + tensor.mutable_data_ptr(), +#ifndef USE_ATEN_LIB + std::vector( + tensor.dim_order().begin(), tensor.dim_order().end()), + std::vector( + tensor.strides().begin(), tensor.strides().end()), + tensor.scalar_type(), + tensor.shape_dynamism() +#else // USE_ATEN_LIB + {}, + std::vector( + tensor.strides().begin(), tensor.strides().end()), + tensor.scalar_type() +#endif // USE_ATEN_LIB + ); +} + +/** + * Creates a TensorPtr that manages a new Tensor with the same properties + * as the given Tensor, but with a copy of the data owned by the returned + * TensorPtr, or nullptr if the original data is null. + * + * @param tensor The Tensor to clone. + * @return A new TensorPtr that manages a Tensor with the same properties as the + * original but with copied data. + */ +TensorPtr clone_tensor_ptr(const exec_aten::Tensor& tensor); + +/** + * Creates a new TensorPtr by cloning the given TensorPtr, copying the + * underlying data. + * + * @param tensor The TensorPtr to clone. + * @return A new TensorPtr that manages a Tensor with the same properties as the + * original but with copied data. + */ +inline TensorPtr clone_tensor_ptr(const TensorPtr& tensor) { + return clone_tensor_ptr(*tensor); } /** - * Resizes the Tensor managed by the given TensorPtr to the new sizes provided. 
+ * Resizes the Tensor managed by the provided TensorPtr to the new sizes. * * @param tensor A TensorPtr managing the Tensor to resize. * @param sizes A vector representing the new sizes for each dimension. - * @return Error::Ok on success, or an appropriate error code otherwise. + * @return Error::Ok on success, or an appropriate error code on failure. */ ET_NODISCARD runtime::Error resize_tensor_ptr( diff --git a/extension/tensor/tensor_ptr_maker.cpp b/extension/tensor/tensor_ptr_maker.cpp index 1a09fea4cac..cbea6da1e74 100644 --- a/extension/tensor/tensor_ptr_maker.cpp +++ b/extension/tensor/tensor_ptr_maker.cpp @@ -105,11 +105,11 @@ TensorPtr empty_strided( exec_aten::compute_numel(sizes.data(), sizes.size()) * exec_aten::elementSize(type)); return make_tensor_ptr( - type, std::move(sizes), std::move(data), {}, std::move(strides), + type, dynamism); } diff --git a/extension/tensor/tensor_ptr_maker.h b/extension/tensor/tensor_ptr_maker.h index 4e65480b7fd..3f2d267a4e4 100644 --- a/extension/tensor/tensor_ptr_maker.h +++ b/extension/tensor/tensor_ptr_maker.h @@ -15,13 +15,13 @@ namespace extension { /** * A helper class for creating TensorPtr instances from raw data and tensor - * properties. Note that the TensorPtr created by this class will not own the - * data, so it must outlive the TensorPtr. + * properties. Note that the TensorPtr created by this class does not own the + * data, so the data must outlive the TensorPtr. * - * TensorPtrMaker provides a fluent interface for specifying various properties - * of a tensor, such as its type, sizes, data pointer, dimension order, strides, - * and shape dynamism. The final tensor is created by invoking make_tensor_ptr() - * or converting TensorPtrMaker to TensorPtr. + * TensorPtrMaker provides a fluent interface for specifying various tensor + * properties, such as type, sizes, data pointer, dimension order, strides, and + * shape dynamism. The final tensor is created by invoking make_tensor_ptr() or + * by converting TensorPtrMaker to TensorPtr. */ class TensorPtrMaker final { public: @@ -99,11 +99,11 @@ class TensorPtrMaker final { */ TensorPtr make_tensor_ptr() && { return ::executorch::extension::make_tensor_ptr( - type_, std::move(sizes_), data_, std::move(dim_order_), std::move(strides_), + type_, dynamism_, std::move(deleter_)); } @@ -167,16 +167,16 @@ inline TensorPtrMaker for_blob( * Creates a TensorPtr from a raw data pointer and tensor sizes, with an * optional dynamism setting. * - * This function is a convenient way to create a tensor from existing data, with - * the option to specify whether the tensor's shape is static, dynamic, or - * bounded. + * This function provides a convenient way to create a tensor from existing + * data, with the option to specify whether the tensor's shape is static or + * dynamic. * - * @param data A pointer to the raw data to be used by the tensor. It must + * @param data A pointer to the raw data used by the tensor. The data must * outlive the TensorPtr created by this function. * @param sizes A vector specifying the size of each dimension. * @param type The scalar type of the tensor elements. * @param dynamism Specifies whether the tensor's shape is static or dynamic. - * @return A TensorPtr instance that manages the newly created Tensor. + * @return A TensorPtr instance managing the newly created Tensor. 
*/ inline TensorPtr from_blob( void* data, @@ -195,15 +195,16 @@ inline TensorPtr from_blob( * * This function allows for the creation of a tensor from existing data, with * the option to specify custom strides for each dimension and whether the - * tensor's shape is static, dynamic, or bounded. + * tensor’s shape is static, dynamic, or bounded. * - * @param data A pointer to the raw data to be used by the tensor. It must + * @param data A pointer to the raw data used by the tensor. The data must * outlive the TensorPtr created by this function. * @param sizes A vector specifying the size of each dimension. * @param strides A vector specifying the stride for each dimension. * @param type The scalar type of the tensor elements. - * @param dynamism Specifies whether the tensor's shape is static or dynamic. - * @return A TensorPtr instance that manages the newly created Tensor. + * @param dynamism Specifies whether the tensor's shape is static, dynamic, or + * bounded. + * @return A TensorPtr instance managing the newly created Tensor. */ inline TensorPtr from_blob( void* data, @@ -306,9 +307,10 @@ TensorPtr empty_strided( * This function allocates memory for the tensor elements but does not * initialize them with any specific values. * - * @param other A reference to another tensor, whose size and properties will be + * @param other A reference to another tensor, whose size and properties are * used. - * @param type The scalar type of the tensor elements. + * @param type The scalar type of the tensor elements. If not provided, the + * scalar type of the other tensor is used. * @param dynamism Specifies whether the tensor's shape is static or dynamic. * @return A TensorPtr instance managing the newly created Tensor. */ @@ -397,7 +399,7 @@ inline TensorPtr full_like( * Creates a TensorPtr filled with the specified value. * * @param sizes A vector specifying the size of each dimension. - * @param fill_value The value to fill the tensor with. + * @param fill_value The value used to fill the tensor. * @param type The scalar type of the tensor elements. * @param dynamism Specifies whether the tensor's shape is static or dynamic. * @return A TensorPtr instance managing the newly created Tensor. @@ -412,29 +414,26 @@ inline TensorPtr full( } /** - * Creates a TensorPtr that holds a scalar value. + * Creates a TensorPtr holding a scalar value. * - * @param value The scalar value to create the tensor with. + * @param value The scalar value for the tensor. * @param type The scalar type of the tensor elements. - * @param dynamism Specifies whether the tensor's shape is static or dynamic. * @return A TensorPtr instance managing the newly created scalar Tensor. */ inline TensorPtr scalar_tensor( exec_aten::Scalar value, - exec_aten::ScalarType type = exec_aten::ScalarType::Float, - exec_aten::TensorShapeDynamism dynamism = - exec_aten::TensorShapeDynamism::DYNAMIC_BOUND) { - return full({}, value, type, dynamism); + exec_aten::ScalarType type = exec_aten::ScalarType::Float) { + return full({}, value, type); } /** * Creates a TensorPtr filled with ones, with the same size and properties as * another tensor. * - * @param other A reference to another tensor, whose size and properties will be + * @param other A reference to another tensor, whose size and properties are * used. - * @param type The scalar type of the tensor elements. If not specified, the - * scalar type of the `other` tensor is used. + * @param type The scalar type of the tensor elements. 
If not provided, the + * scalar type of the other tensor is used. * @param dynamism Specifies whether the tensor's shape is static or dynamic. * @return A TensorPtr instance managing the newly created Tensor. */ @@ -555,7 +554,8 @@ inline TensorPtr rand( } /** - * Creates a TensorPtr filled with random values from a normal distribution. + * Creates a TensorPtr filled with random values between 0 and 1, with specified + * strides. * * @param sizes A vector specifying the size of each dimension. * @param strides A vector specifying the stride for each dimension. @@ -596,7 +596,8 @@ inline TensorPtr randn_like( } /** - * Creates a TensorPtr filled with random values from a normal distribution. + * Creates a TensorPtr filled with random values sampled from a normal + * distribution. * * @param sizes A vector specifying the size of each dimension. * @param type The scalar type of the tensor elements. @@ -663,10 +664,11 @@ inline TensorPtr randint_like( } /** - * Creates a TensorPtr filled with random integer values in the given range. + * Creates a TensorPtr filled with random integer values within the specified + * range. * - * @param low The lower bound (inclusive) of the random values. - * @param high The upper bound (exclusive) of the random values. + * @param low The inclusive lower bound of the random values. + * @param high The exclusive upper bound of the random values. * @param sizes A vector specifying the size of each dimension. * @param type The scalar type of the tensor elements. * @param dynamism Specifies whether the tensor's shape is static or dynamic. diff --git a/extension/tensor/test/CMakeLists.txt b/extension/tensor/test/CMakeLists.txt index 132a40c31ba..5c04664199e 100644 --- a/extension/tensor/test/CMakeLists.txt +++ b/extension/tensor/test/CMakeLists.txt @@ -23,9 +23,7 @@ set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../..) include(${EXECUTORCH_ROOT}/build/Test.cmake) -set(_test_srcs tensor_impl_ptr_test.cpp tensor_ptr_maker_test.cpp - tensor_ptr_test.cpp -) +set(_test_srcs tensor_ptr_maker_test.cpp tensor_ptr_test.cpp) et_cxx_test( extension_tensor_test SOURCES ${_test_srcs} EXTRA_LIBS extension_tensor diff --git a/extension/tensor/test/targets.bzl b/extension/tensor/test/targets.bzl index 632cc3fb88e..3c81ac8def0 100644 --- a/extension/tensor/test/targets.bzl +++ b/extension/tensor/test/targets.bzl @@ -13,7 +13,6 @@ def define_common_targets(): runtime.cxx_test( name = "test" + aten_suffix, srcs = [ - "tensor_impl_ptr_test.cpp", "tensor_ptr_maker_test.cpp", "tensor_ptr_test.cpp", ], diff --git a/extension/tensor/test/tensor_impl_ptr_test.cpp b/extension/tensor/test/tensor_impl_ptr_test.cpp deleted file mode 100644 index f7fd062c462..00000000000 --- a/extension/tensor/test/tensor_impl_ptr_test.cpp +++ /dev/null @@ -1,379 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#include - -#include - -#include -#include - -using namespace executorch::extension; -using namespace executorch::runtime; - -class TensorImplPtrTest : public ::testing::Test { - protected: - static void SetUpTestSuite() { - runtime_init(); - } -}; - -TEST_F(TensorImplPtrTest, ScalarTensorCreation) { - float scalar_data = 3.14f; - auto tensor_impl = - make_tensor_impl_ptr(exec_aten::ScalarType::Float, {}, &scalar_data); - - EXPECT_EQ(tensor_impl->numel(), 1); - EXPECT_EQ(tensor_impl->dim(), 0); - EXPECT_EQ(tensor_impl->sizes().size(), 0); - EXPECT_EQ(tensor_impl->strides().size(), 0); - EXPECT_EQ((float*)tensor_impl->data(), &scalar_data); - EXPECT_EQ(((float*)tensor_impl->data())[0], 3.14f); -} - -TEST_F(TensorImplPtrTest, ScalarTensorOwningData) { - auto tensor_impl = make_tensor_impl_ptr({}, {3.14f}); - - EXPECT_EQ(tensor_impl->numel(), 1); - EXPECT_EQ(tensor_impl->dim(), 0); - EXPECT_EQ(tensor_impl->sizes().size(), 0); - EXPECT_EQ(tensor_impl->strides().size(), 0); - EXPECT_EQ(((float*)tensor_impl->data())[0], 3.14f); -} - -TEST_F(TensorImplPtrTest, TensorImplCreation) { - float data[20] = {2}; - auto tensor_impl = make_tensor_impl_ptr( - exec_aten::ScalarType::Float, {4, 5}, data, {0, 1}, {5, 1}); - - EXPECT_EQ(tensor_impl->dim(), 2); - EXPECT_EQ(tensor_impl->size(0), 4); - EXPECT_EQ(tensor_impl->size(1), 5); - EXPECT_EQ(tensor_impl->strides()[0], 5); - EXPECT_EQ(tensor_impl->strides()[1], 1); - EXPECT_EQ(tensor_impl->data(), data); - EXPECT_EQ(tensor_impl->data(), data); - EXPECT_EQ(((float*)tensor_impl->data())[0], 2); -} - -TEST_F(TensorImplPtrTest, TensorImplSharedOwnership) { - float data[20] = {2}; - auto tensor_impl1 = - make_tensor_impl_ptr(exec_aten::ScalarType::Float, {4, 5}, data); - auto tensor_impl2 = tensor_impl1; - - EXPECT_EQ(tensor_impl1.get(), tensor_impl2.get()); - EXPECT_EQ(tensor_impl1.use_count(), tensor_impl2.use_count()); - - tensor_impl1.reset(); - EXPECT_EQ(tensor_impl2.use_count(), 1); - EXPECT_NE(tensor_impl2.get(), nullptr); -} - -TEST_F(TensorImplPtrTest, TensorImplInferredDimOrderAndStrides) { - float data[12] = {0}; - auto tensor_impl = make_tensor_impl_ptr( - exec_aten::ScalarType::Float, {3, 4}, data, {}, {4, 1}); - - EXPECT_EQ(tensor_impl->dim(), 2); - EXPECT_EQ(tensor_impl->size(0), 3); - EXPECT_EQ(tensor_impl->size(1), 4); - EXPECT_EQ(tensor_impl->strides()[0], 4); - EXPECT_EQ(tensor_impl->strides()[1], 1); - EXPECT_EQ(tensor_impl->data(), data); -} - -TEST_F(TensorImplPtrTest, TensorImplInferredDimOrderCustomStrides) { - float data[12] = {0}; - auto tensor_impl = make_tensor_impl_ptr( - exec_aten::ScalarType::Float, {3, 4}, data, {}, {1, 3}); - - EXPECT_EQ(tensor_impl->dim(), 2); - EXPECT_EQ(tensor_impl->size(0), 3); - EXPECT_EQ(tensor_impl->size(1), 4); - EXPECT_EQ(tensor_impl->strides()[0], 1); - EXPECT_EQ(tensor_impl->strides()[1], 3); -} - -TEST_F(TensorImplPtrTest, TensorImplDefaultDimOrderAndStrides) { - float data[24] = {0}; - auto tensor_impl = - make_tensor_impl_ptr(exec_aten::ScalarType::Float, {2, 3, 4}, data); - - EXPECT_EQ(tensor_impl->dim(), 3); - EXPECT_EQ(tensor_impl->size(0), 2); - EXPECT_EQ(tensor_impl->size(1), 3); - EXPECT_EQ(tensor_impl->size(2), 4); - EXPECT_EQ(tensor_impl->strides()[0], 12); - EXPECT_EQ(tensor_impl->strides()[1], 4); - EXPECT_EQ(tensor_impl->strides()[2], 1); -} - -TEST_F(TensorImplPtrTest, TensorImplMismatchStridesAndDimOrder) { - float data[12] = {0}; - ET_EXPECT_DEATH( - { - auto _ = make_tensor_impl_ptr( - exec_aten::ScalarType::Float, {3, 4}, data, {1, 0}, {1, 4}); - }, - ""); -} - 
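Editorial note, not part of the diff: the TensorImpl-level checks deleted above (inferred dim_order/strides and the mismatch death test) correspond to behavior that is now exercised directly through the TensorPtr API changed elsewhere in this diff. A minimal usage sketch follows; the include path and the non-ATen-build behavior noted in the comments are assumptions based on the surrounding code, not statements from this change.

#include <executorch/extension/tensor/tensor_ptr.h>

using ::executorch::extension::make_tensor_ptr;

void inferred_dim_order_example() {
  float data[12] = {0};
  // dim_order is omitted, so it is inferred from the column-major strides
  // {1, 3}; in the default (non-ATen) build this is expected to yield
  // dim_order {1, 0}, and explicitly provided strides are validated against
  // the strides recomputed from that dim_order.
  auto tensor = make_tensor_ptr({3, 4}, data, /*dim_order=*/{}, /*strides=*/{1, 3});
  (void)tensor;  // Placeholder use; a real caller would pass this to a Module, etc.
}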
-TEST_F(TensorImplPtrTest, TensorImplCustomDimOrderAndStrides) { - float data[12] = {0}; - auto tensor_impl = make_tensor_impl_ptr( - exec_aten::ScalarType::Float, {3, 4}, data, {1, 0}, {1, 3}); - - EXPECT_EQ(tensor_impl->dim(), 2); - EXPECT_EQ(tensor_impl->size(0), 3); - EXPECT_EQ(tensor_impl->size(1), 4); - EXPECT_EQ(tensor_impl->strides()[0], 1); - EXPECT_EQ(tensor_impl->strides()[1], 3); -} - -TEST_F(TensorImplPtrTest, TensorImplInvalidDimOrder) { - ET_EXPECT_DEATH( - { - float data[20] = {2}; - auto _ = make_tensor_impl_ptr( - exec_aten::ScalarType::Float, {4, 5}, data, {2, 1}); - }, - ""); -} - -TEST_F(TensorImplPtrTest, TensorImplCustomDeleter) { - float data[20] = {4}; - auto tensor_impl = - make_tensor_impl_ptr(exec_aten::ScalarType::Float, {4, 5}, data); - - TensorImplPtr copied_tensor_impl = tensor_impl; - EXPECT_EQ(tensor_impl.use_count(), copied_tensor_impl.use_count()); - - tensor_impl.reset(); - EXPECT_EQ(copied_tensor_impl.use_count(), 1); -} - -TEST_F(TensorImplPtrTest, TensorImplDataDeleterReleasesCapturedSharedPtr) { - auto deleter_called = false; - std::shared_ptr data_ptr( - new float[10], [](float* ptr) { delete[] ptr; }); - auto tensor_impl = make_tensor_impl_ptr( - exec_aten::ScalarType::Float, - {4, 5}, - data_ptr.get(), - {}, - {}, - exec_aten::TensorShapeDynamism::DYNAMIC_BOUND, - [data_ptr, &deleter_called](void*) mutable { deleter_called = true; }); - - EXPECT_EQ(data_ptr.use_count(), 2); - - tensor_impl.reset(); - EXPECT_TRUE(deleter_called); - EXPECT_EQ(data_ptr.use_count(), 1); -} - -TEST_F(TensorImplPtrTest, TensorImplOwningData) { - auto tensor_impl = make_tensor_impl_ptr( - {2, 5}, - {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f}, - {1, 0}, - {1, 2}); - - EXPECT_EQ(tensor_impl->dim(), 2); - EXPECT_EQ(tensor_impl->size(0), 2); - EXPECT_EQ(tensor_impl->size(1), 5); - EXPECT_EQ(tensor_impl->strides()[0], 1); - EXPECT_EQ(tensor_impl->strides()[1], 2); - EXPECT_EQ(((float*)tensor_impl->data())[0], 1.0f); - EXPECT_EQ(((float*)tensor_impl->data())[9], 10.0f); -} - -TEST_F(TensorImplPtrTest, TensorImplOwningEmptyData) { - auto tensor_impl = make_tensor_impl_ptr({0, 5}, std::vector()); - - EXPECT_EQ(tensor_impl->dim(), 2); - EXPECT_EQ(tensor_impl->size(0), 0); - EXPECT_EQ(tensor_impl->size(1), 5); - EXPECT_EQ(tensor_impl->strides()[0], 5); - EXPECT_EQ(tensor_impl->strides()[1], 1); - EXPECT_EQ(tensor_impl->data(), nullptr); -} - -TEST_F(TensorImplPtrTest, TensorImplDataOnlyDoubleType) { - std::vector data = {1.0, 2.0, 3.0, 4.0}; - auto tensor_impl = make_tensor_impl_ptr(std::move(data)); - - EXPECT_EQ(tensor_impl->dim(), 1); - EXPECT_EQ(tensor_impl->size(0), 4); - EXPECT_EQ(tensor_impl->strides()[0], 1); - EXPECT_EQ(((double*)tensor_impl->data())[0], 1.0); - EXPECT_EQ(((double*)tensor_impl->data())[3], 4.0); -} - -TEST_F(TensorImplPtrTest, TensorImplDataOnlyInt32Type) { - std::vector data = {10, 20, 30, 40}; - auto tensor_impl = make_tensor_impl_ptr(std::move(data)); - - EXPECT_EQ(tensor_impl->dim(), 1); - EXPECT_EQ(tensor_impl->size(0), 4); - EXPECT_EQ(tensor_impl->strides()[0], 1); - EXPECT_EQ(((int32_t*)tensor_impl->data())[0], 10); - EXPECT_EQ(((int32_t*)tensor_impl->data())[3], 40); -} - -TEST_F(TensorImplPtrTest, TensorImplDataOnlyInt64Type) { - std::vector data = {100, 200, 300, 400}; - auto tensor_impl = make_tensor_impl_ptr(std::move(data)); - - EXPECT_EQ(tensor_impl->dim(), 1); - EXPECT_EQ(tensor_impl->size(0), 4); - EXPECT_EQ(tensor_impl->strides()[0], 1); - EXPECT_EQ(((int64_t*)tensor_impl->data())[0], 100); - 
EXPECT_EQ(((int64_t*)tensor_impl->data())[3], 400); -} - -TEST_F(TensorImplPtrTest, TensorImplDataOnlyUint8Type) { - std::vector data = {10, 20, 30, 40}; - auto tensor_impl = make_tensor_impl_ptr(std::move(data)); - - EXPECT_EQ(tensor_impl->dim(), 1); - EXPECT_EQ(tensor_impl->size(0), 4); - EXPECT_EQ(tensor_impl->strides()[0], 1); - EXPECT_EQ(((uint8_t*)tensor_impl->data())[0], 10); - EXPECT_EQ(((uint8_t*)tensor_impl->data())[3], 40); -} - -TEST_F(TensorImplPtrTest, TensorImplAmbiguityWithMixedVectors) { - std::vector sizes = {2, 2}; - std::vector data = {1.0f, 2.0f, 3.0f, 4.0f}; - auto tensor_impl = make_tensor_impl_ptr(std::move(sizes), std::move(data)); - - EXPECT_EQ(tensor_impl->dim(), 2); - EXPECT_EQ(tensor_impl->size(0), 2); - EXPECT_EQ(tensor_impl->size(1), 2); - EXPECT_EQ(tensor_impl->strides()[0], 2); - EXPECT_EQ(tensor_impl->strides()[1], 1); - EXPECT_EQ(((float*)tensor_impl->data())[0], 1.0f); - EXPECT_EQ(((float*)tensor_impl->data())[3], 4.0f); - - auto tensor_impl2 = make_tensor_impl_ptr({2, 2}, {1.0f, 2.0f, 3.0f, 4.0f}); - - EXPECT_EQ(tensor_impl2->dim(), 2); - EXPECT_EQ(tensor_impl2->size(0), 2); - EXPECT_EQ(tensor_impl2->size(1), 2); - EXPECT_EQ(tensor_impl2->strides()[0], 2); - EXPECT_EQ(tensor_impl2->strides()[1], 1); - EXPECT_EQ(((float*)tensor_impl2->data())[0], 1.0f); - EXPECT_EQ(((float*)tensor_impl2->data())[3], 4.0f); -} - -TEST_F(TensorImplPtrTest, SharedDataManagement) { - auto data = std::make_shared>(100, 1.0f); - auto tensor_impl1 = make_tensor_impl_ptr( - exec_aten::ScalarType::Float, {10, 10}, data->data()); - auto tensor_impl2 = tensor_impl1; - - EXPECT_EQ(tensor_impl1.get(), tensor_impl2.get()); - EXPECT_EQ(tensor_impl1.use_count(), 2); - EXPECT_EQ(((float*)tensor_impl1->data())[0], 1.0f); - - ((float*)tensor_impl1->mutable_data())[0] = 2.0f; - EXPECT_EQ(((float*)tensor_impl2->data())[0], 2.0f); - - tensor_impl1.reset(); - EXPECT_NE(tensor_impl2.get(), nullptr); - EXPECT_EQ(tensor_impl2.use_count(), 1); - - EXPECT_EQ(((float*)tensor_impl2->data())[0], 2.0f); -} - -TEST_F(TensorImplPtrTest, CustomDeleterWithSharedData) { - auto data = std::make_shared>(100, 1.0f); - bool deleter_called = false; - { - auto tensor_impl = make_tensor_impl_ptr( - exec_aten::ScalarType::Float, - {10, 10}, - data->data(), - {}, - {}, - exec_aten::TensorShapeDynamism::DYNAMIC_BOUND, - [data, &deleter_called](void*) mutable { - deleter_called = true; - data.reset(); - }); - - EXPECT_EQ(data.use_count(), 2); - EXPECT_FALSE(deleter_called); - } - EXPECT_TRUE(deleter_called); - EXPECT_EQ(data.use_count(), 1); -} - -TEST_F(TensorImplPtrTest, TensorImplDeducedScalarType) { - std::vector data = {1.0, 2.0, 3.0, 4.0}; - auto tensor_impl = make_tensor_impl_ptr({2, 2}, std::move(data)); - - EXPECT_EQ(tensor_impl->dim(), 2); - EXPECT_EQ(tensor_impl->size(0), 2); - EXPECT_EQ(tensor_impl->size(1), 2); - EXPECT_EQ(tensor_impl->strides()[0], 2); - EXPECT_EQ(tensor_impl->strides()[1], 1); - EXPECT_EQ(((double*)tensor_impl->data())[0], 1.0); - EXPECT_EQ(((double*)tensor_impl->data())[3], 4.0); -} - -TEST_F(TensorImplPtrTest, TensorImplUint8BufferWithFloatScalarType) { - std::vector data( - 4 * exec_aten::elementSize(exec_aten::ScalarType::Float)); - - float* float_data = reinterpret_cast(data.data()); - float_data[0] = 1.0f; - float_data[1] = 2.0f; - float_data[2] = 3.0f; - float_data[3] = 4.0f; - - auto tensor_impl = make_tensor_impl_ptr( - exec_aten::ScalarType::Float, {2, 2}, std::move(data)); - - EXPECT_EQ(tensor_impl->dim(), 2); - EXPECT_EQ(tensor_impl->size(0), 2); - 
EXPECT_EQ(tensor_impl->size(1), 2); - EXPECT_EQ(tensor_impl->strides()[0], 2); - EXPECT_EQ(tensor_impl->strides()[1], 1); - - EXPECT_EQ(((float*)tensor_impl->data())[0], 1.0f); - EXPECT_EQ(((float*)tensor_impl->data())[1], 2.0f); - EXPECT_EQ(((float*)tensor_impl->data())[2], 3.0f); - EXPECT_EQ(((float*)tensor_impl->data())[3], 4.0f); -} - -TEST_F(TensorImplPtrTest, TensorImplUint8BufferTooSmallExpectDeath) { - std::vector data( - 2 * exec_aten::elementSize(exec_aten::ScalarType::Float)); - ET_EXPECT_DEATH( - { - auto tensor_impl = make_tensor_impl_ptr( - exec_aten::ScalarType::Float, {2, 2}, std::move(data)); - }, - ""); -} - -TEST_F(TensorImplPtrTest, TensorImplUint8BufferTooLarge) { - std::vector data( - 4 * exec_aten::elementSize(exec_aten::ScalarType::Float)); - auto tensor_impl = make_tensor_impl_ptr( - exec_aten::ScalarType::Float, {2, 2}, std::move(data)); - - EXPECT_EQ(tensor_impl->dim(), 2); - EXPECT_EQ(tensor_impl->size(0), 2); - EXPECT_EQ(tensor_impl->size(1), 2); - EXPECT_EQ(tensor_impl->strides()[0], 2); - EXPECT_EQ(tensor_impl->strides()[1], 1); -} diff --git a/extension/tensor/test/tensor_ptr_maker_test.cpp b/extension/tensor/test/tensor_ptr_maker_test.cpp index 41f3fa21439..4bfc56338ec 100644 --- a/extension/tensor/test/tensor_ptr_maker_test.cpp +++ b/extension/tensor/test/tensor_ptr_maker_test.cpp @@ -363,11 +363,11 @@ TEST_F(TensorPtrMakerTest, CreateRandTensorWithDoubleType) { } TEST_F(TensorPtrMakerTest, CreateRandnTensor) { - auto tensor = randn({4, 5}); + auto tensor = randn({100, 100}); EXPECT_EQ(tensor->dim(), 2); - EXPECT_EQ(tensor->size(0), 4); - EXPECT_EQ(tensor->size(1), 5); + EXPECT_EQ(tensor->size(0), 100); + EXPECT_EQ(tensor->size(1), 100); EXPECT_EQ(tensor->scalar_type(), exec_aten::ScalarType::Float); auto sum = 0.0f; @@ -375,15 +375,15 @@ TEST_F(TensorPtrMakerTest, CreateRandnTensor) { sum += tensor->const_data_ptr()[i]; } const auto average = sum / tensor->numel(); - EXPECT_NEAR(average, 0.0f, 0.5f); + EXPECT_NEAR(average, 0.0f, 1.0f); } TEST_F(TensorPtrMakerTest, CreateRandnTensorWithDoubleType) { - auto tensor = randn({4, 5}, exec_aten::ScalarType::Double); + auto tensor = randn({100, 100}, exec_aten::ScalarType::Double); EXPECT_EQ(tensor->dim(), 2); - EXPECT_EQ(tensor->size(0), 4); - EXPECT_EQ(tensor->size(1), 5); + EXPECT_EQ(tensor->size(0), 100); + EXPECT_EQ(tensor->size(1), 100); EXPECT_EQ(tensor->scalar_type(), exec_aten::ScalarType::Double); auto sum = 0.0; @@ -391,7 +391,7 @@ TEST_F(TensorPtrMakerTest, CreateRandnTensorWithDoubleType) { sum += tensor->const_data_ptr()[i]; } const auto average = sum / tensor->numel(); - EXPECT_NEAR(average, 0.0, 0.5); + EXPECT_NEAR(average, 0.0, 1.0); } TEST_F(TensorPtrMakerTest, CreateRandIntTensorWithIntType) { diff --git a/extension/tensor/test/tensor_ptr_test.cpp b/extension/tensor/test/tensor_ptr_test.cpp index 653e2ef98d7..b9f8b9a2a78 100644 --- a/extension/tensor/test/tensor_ptr_test.cpp +++ b/extension/tensor/test/tensor_ptr_test.cpp @@ -11,6 +11,7 @@ #include #include +#include using namespace ::executorch::extension; using namespace ::executorch::runtime; @@ -24,7 +25,7 @@ class TensorPtrTest : public ::testing::Test { TEST_F(TensorPtrTest, ScalarTensorCreation) { float scalar_data = 3.14f; - auto tensor = make_tensor_ptr(exec_aten::ScalarType::Float, {}, &scalar_data); + auto tensor = make_tensor_ptr({}, &scalar_data); EXPECT_EQ(tensor->numel(), 1); EXPECT_EQ(tensor->dim(), 0); @@ -44,10 +45,43 @@ TEST_F(TensorPtrTest, ScalarTensorOwningData) { EXPECT_EQ(tensor->const_data_ptr()[0], 3.14f); } 
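Editorial note, not part of the diff: the tests added below cover the new single-value overload and the explicit scalar-type casting path of make_tensor_ptr. A short, illustrative usage sketch of those two additions follows; the include path and helper function name are assumptions, and the calls mirror the test code in this change.

#include <cstdint>
#include <vector>

#include <executorch/extension/tensor/tensor_ptr.h>

using ::executorch::extension::make_tensor_ptr;

void scalar_and_cast_example() {
  // 0-D tensor; the scalar type is deduced from the argument (Float here).
  auto scalar = make_tensor_ptr(3.14f);

  // Data supplied as int32_t but stored as Float: the explicit `type`
  // argument engages the casting path added to tensor_ptr.h in this change.
  auto casted = make_tensor_ptr(
      {2, 3},
      std::vector<int32_t>{1, 2, 3, 4, 5, 6},
      /*dim_order=*/{},
      /*strides=*/{},
      exec_aten::ScalarType::Float);
  (void)scalar;
  (void)casted;  // Placeholder uses; real callers would feed these to a runtime API.
}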
+TEST_F(TensorPtrTest, ScalarTensorSingleValueCreation) { + auto tensor_float = make_tensor_ptr(3.14f); + EXPECT_EQ(tensor_float->dim(), 0); + EXPECT_EQ(tensor_float->numel(), 1); + EXPECT_EQ(tensor_float->sizes().size(), 0); + EXPECT_EQ(tensor_float->strides().size(), 0); + EXPECT_EQ(tensor_float->const_data_ptr()[0], 3.14f); + EXPECT_EQ(tensor_float->scalar_type(), exec_aten::ScalarType::Float); + + auto tensor_int32 = make_tensor_ptr(42); + EXPECT_EQ(tensor_int32->dim(), 0); + EXPECT_EQ(tensor_int32->numel(), 1); + EXPECT_EQ(tensor_int32->sizes().size(), 0); + EXPECT_EQ(tensor_int32->strides().size(), 0); + EXPECT_EQ(tensor_int32->const_data_ptr()[0], 42); + EXPECT_EQ(tensor_int32->scalar_type(), exec_aten::ScalarType::Int); + + auto tensor_double = make_tensor_ptr(2.718); + EXPECT_EQ(tensor_double->dim(), 0); + EXPECT_EQ(tensor_double->numel(), 1); + EXPECT_EQ(tensor_double->sizes().size(), 0); + EXPECT_EQ(tensor_double->strides().size(), 0); + EXPECT_EQ(tensor_double->const_data_ptr()[0], 2.718); + EXPECT_EQ(tensor_double->scalar_type(), exec_aten::ScalarType::Double); + + auto tensor_int64 = make_tensor_ptr(static_cast(10000000000)); + EXPECT_EQ(tensor_int64->dim(), 0); + EXPECT_EQ(tensor_int64->numel(), 1); + EXPECT_EQ(tensor_int64->sizes().size(), 0); + EXPECT_EQ(tensor_int64->strides().size(), 0); + EXPECT_EQ(tensor_int64->const_data_ptr()[0], 10000000000); + EXPECT_EQ(tensor_int64->scalar_type(), exec_aten::ScalarType::Long); +} + TEST_F(TensorPtrTest, CreateTensorWithStridesAndDimOrder) { float data[20] = {2}; - auto tensor = make_tensor_ptr( - exec_aten::ScalarType::Float, {4, 5}, data, {0, 1}, {5, 1}); + auto tensor = make_tensor_ptr({4, 5}, data, {0, 1}, {5, 1}); EXPECT_EQ(tensor->dim(), 2); EXPECT_EQ(tensor->size(0), 4); EXPECT_EQ(tensor->size(1), 5); @@ -59,19 +93,18 @@ TEST_F(TensorPtrTest, CreateTensorWithStridesAndDimOrder) { TEST_F(TensorPtrTest, TensorSharingImpl) { float data[20] = {2}; - auto tensor1 = make_tensor_ptr(exec_aten::ScalarType::Float, {4, 5}, data); - auto tensor2 = make_tensor_ptr(tensor1); + auto tensor1 = make_tensor_ptr({4, 5}, data); + auto tensor2 = tensor1; + EXPECT_EQ(tensor1.get(), tensor2.get()); EXPECT_EQ(tensor1->unsafeGetTensorImpl(), tensor2->unsafeGetTensorImpl()); } -TEST_F(TensorPtrTest, TensorImplLifetime) { +TEST_F(TensorPtrTest, TensorLifetime) { TensorPtr tensor; EXPECT_EQ(tensor, nullptr); { float data[20] = {2}; - auto tensor_impl = - make_tensor_impl_ptr(exec_aten::ScalarType::Float, {4, 5}, data); - tensor = make_tensor_ptr(tensor_impl); + tensor = make_tensor_ptr({4, 5}, data); } EXPECT_EQ(tensor->dim(), 2); EXPECT_EQ(tensor->size(0), 4); @@ -80,10 +113,10 @@ TEST_F(TensorPtrTest, TensorImplLifetime) { TEST_F(TensorPtrTest, TensorWithZeroDimensionAndElements) { float data[20] = {2}; - auto tensor = make_tensor_ptr(exec_aten::ScalarType::Float, {}, data); + auto tensor = make_tensor_ptr({}, data); EXPECT_EQ(tensor->dim(), 0); EXPECT_EQ(tensor->numel(), 1); - tensor = make_tensor_ptr(exec_aten::ScalarType::Float, {0, 5}, data); + tensor = make_tensor_ptr({0, 5}, data); EXPECT_EQ(tensor->dim(), 2); EXPECT_EQ(tensor->numel(), 0); } @@ -91,11 +124,11 @@ TEST_F(TensorPtrTest, TensorWithZeroDimensionAndElements) { TEST_F(TensorPtrTest, TensorResize) { float data[20] = {2}; auto tensor = make_tensor_ptr( - exec_aten::ScalarType::Float, {4, 5}, data, {}, {}, + exec_aten::ScalarType::Float, exec_aten::TensorShapeDynamism::DYNAMIC_UNBOUND); EXPECT_EQ(resize_tensor_ptr(tensor, {5, 4}), Error::Ok); EXPECT_EQ(tensor->size(0), 5); @@ -104,7 
+137,7 @@ TEST_F(TensorPtrTest, TensorResize) { TEST_F(TensorPtrTest, TensorDataAccess) { float data[6] = {1, 2, 3, 4, 5, 6}; - auto tensor = make_tensor_ptr(exec_aten::ScalarType::Float, {2, 3}, data); + auto tensor = make_tensor_ptr({2, 3}, data); EXPECT_EQ(tensor->const_data_ptr()[0], 1); EXPECT_EQ(tensor->const_data_ptr()[5], 6); tensor->mutable_data_ptr()[0] = 10; @@ -115,11 +148,11 @@ TEST_F(TensorPtrTest, TensorWithCustomDataDeleter) { auto deleter_called = false; float* data = new float[20](); auto tensor = make_tensor_ptr( - exec_aten::ScalarType::Float, {4, 5}, data, {}, {}, + exec_aten::ScalarType::Float, exec_aten::TensorShapeDynamism::DYNAMIC_BOUND, [&deleter_called](void* ptr) { deleter_called = true; @@ -135,11 +168,11 @@ TEST_F(TensorPtrTest, TensorManagesMovedVector) { std::vector data(20, 3.0f); auto* data_ptr = data.data(); auto tensor = make_tensor_ptr( - exec_aten::ScalarType::Float, {4, 5}, data_ptr, {}, {}, + exec_aten::ScalarType::Float, exec_aten::TensorShapeDynamism::DYNAMIC_BOUND, [moved_data = std::move(data), &deleter_called](void*) mutable { deleter_called = true; @@ -157,11 +190,11 @@ TEST_F(TensorPtrTest, TensorDeleterReleasesCapturedSharedPtr) { std::shared_ptr data_ptr( new float[10], [](float* ptr) { delete[] ptr; }); auto tensor = make_tensor_ptr( - exec_aten::ScalarType::Float, {4, 5}, data_ptr.get(), {}, {}, + exec_aten::ScalarType::Float, exec_aten::TensorShapeDynamism::DYNAMIC_BOUND, [data_ptr, &deleter_called](void*) mutable { deleter_called = true; }); @@ -200,7 +233,7 @@ TEST_F(TensorPtrTest, TensorOwningEmptyData) { EXPECT_EQ(tensor->scalar_type(), exec_aten::ScalarType::Float); } -TEST_F(TensorPtrTest, TensorImplDataOnly) { +TEST_F(TensorPtrTest, TensorDataOnly) { auto tensor = make_tensor_ptr({1.0f, 2.0f, 3.0f, 4.0f}); EXPECT_EQ(tensor->dim(), 1); @@ -211,7 +244,7 @@ TEST_F(TensorPtrTest, TensorImplDataOnly) { EXPECT_EQ(tensor->scalar_type(), exec_aten::ScalarType::Float); } -TEST_F(TensorPtrTest, TensorImplDataOnlyDoubleType) { +TEST_F(TensorPtrTest, TensorDataOnlyDoubleType) { std::vector data = {1.0, 2.0, 3.0, 4.0}; auto tensor = make_tensor_ptr(std::move(data)); @@ -223,7 +256,7 @@ TEST_F(TensorPtrTest, TensorImplDataOnlyDoubleType) { EXPECT_EQ(tensor->scalar_type(), exec_aten::ScalarType::Double); } -TEST_F(TensorPtrTest, TensorImplDataOnlyInt32Type) { +TEST_F(TensorPtrTest, TensorDataOnlyInt32Type) { std::vector data = {10, 20, 30, 40}; auto tensor = make_tensor_ptr(std::move(data)); @@ -235,7 +268,7 @@ TEST_F(TensorPtrTest, TensorImplDataOnlyInt32Type) { EXPECT_EQ(tensor->scalar_type(), exec_aten::ScalarType::Int); } -TEST_F(TensorPtrTest, TensorImplDataOnlyInt64Type) { +TEST_F(TensorPtrTest, TensorDataOnlyInt64Type) { std::vector data = {100, 200, 300, 400}; auto tensor = make_tensor_ptr(std::move(data)); @@ -247,7 +280,7 @@ TEST_F(TensorPtrTest, TensorImplDataOnlyInt64Type) { EXPECT_EQ(tensor->scalar_type(), exec_aten::ScalarType::Long); } -TEST_F(TensorPtrTest, TensorImplDataOnlyUint8Type) { +TEST_F(TensorPtrTest, TensorDataOnlyUint8Type) { std::vector data = {10, 20, 30, 40}; auto tensor = make_tensor_ptr(std::move(data)); @@ -259,7 +292,7 @@ TEST_F(TensorPtrTest, TensorImplDataOnlyUint8Type) { EXPECT_EQ(tensor->scalar_type(), exec_aten::ScalarType::Byte); } -TEST_F(TensorPtrTest, TensorImplAmbiguityWithMixedVectors) { +TEST_F(TensorPtrTest, TensorAmbiguityWithMixedVectors) { std::vector sizes = {2, 2}; std::vector data = {1.0f, 2.0f, 3.0f, 4.0f}; auto tensor = make_tensor_ptr(std::move(sizes), std::move(data)); @@ -287,7 +320,7 @@ 
TEST_F(TensorPtrTest, TensorSharingImplModifiesSharedDataVector) { std::vector data = {1, 2, 3, 4, 5, 6}; auto tensor1 = make_tensor_ptr({2, 3}, std::move(data)); - auto tensor2 = make_tensor_ptr(tensor1); + auto tensor2 = tensor1; tensor1->mutable_data_ptr()[0] = 10; EXPECT_EQ(tensor2->const_data_ptr()[0], 10); @@ -299,13 +332,8 @@ TEST_F(TensorPtrTest, TensorSharingImplModifiesSharedDataVector) { TEST_F(TensorPtrTest, TensorSharingImplResizingAffectsBothVector) { std::vector data = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}; - auto tensor1 = make_tensor_ptr( - {3, 4}, - std::move(data), - {}, - {}, - exec_aten::TensorShapeDynamism::DYNAMIC_UNBOUND); - auto tensor2 = make_tensor_ptr(tensor1); + auto tensor1 = make_tensor_ptr({3, 4}, std::move(data)); + auto tensor2 = tensor1; EXPECT_EQ(resize_tensor_ptr(tensor1, {2, 6}), Error::Ok); EXPECT_EQ(tensor2->size(0), 2); @@ -315,3 +343,471 @@ TEST_F(TensorPtrTest, TensorSharingImplResizingAffectsBothVector) { EXPECT_EQ(tensor1->size(0), 4); EXPECT_EQ(tensor1->size(1), 3); } + +TEST_F(TensorPtrTest, MakeTensorPtrFromExistingTensorInt32) { + std::vector data = {1, 2, 3, 4}; + auto tensor = make_tensor_ptr({2, 2}, data); + auto new_tensor = make_tensor_ptr(*tensor); + + EXPECT_EQ(new_tensor->dim(), tensor->dim()); + EXPECT_EQ(new_tensor->size(0), tensor->size(0)); + EXPECT_EQ(new_tensor->size(1), tensor->size(1)); + EXPECT_EQ( + new_tensor->const_data_ptr(), tensor->const_data_ptr()); + EXPECT_EQ(new_tensor->scalar_type(), exec_aten::ScalarType::Int); +} + +TEST_F(TensorPtrTest, CloneTensorPtrFromExistingTensorInt32) { + std::vector data = {1, 2, 3, 4}; + auto tensor = make_tensor_ptr({2, 2}, std::move(data)); + auto cloned_tensor = clone_tensor_ptr(*tensor); + + EXPECT_EQ(cloned_tensor->dim(), tensor->dim()); + EXPECT_EQ(cloned_tensor->size(0), tensor->size(0)); + EXPECT_EQ(cloned_tensor->size(1), tensor->size(1)); + EXPECT_NE( + cloned_tensor->const_data_ptr(), + tensor->const_data_ptr()); + EXPECT_EQ(cloned_tensor->const_data_ptr()[0], 1); + EXPECT_EQ(cloned_tensor->const_data_ptr()[3], 4); + EXPECT_EQ(cloned_tensor->scalar_type(), exec_aten::ScalarType::Int); +} + +TEST_F(TensorPtrTest, CloneTensorPtrFromTensorPtrInt32) { + std::vector data = {1, 2, 3, 4}; + auto tensor = make_tensor_ptr({2, 2}, std::move(data)); + auto cloned_tensor = clone_tensor_ptr(tensor); + + EXPECT_EQ(cloned_tensor->dim(), tensor->dim()); + EXPECT_EQ(cloned_tensor->size(0), tensor->size(0)); + EXPECT_EQ(cloned_tensor->size(1), tensor->size(1)); + EXPECT_NE( + cloned_tensor->const_data_ptr(), + tensor->const_data_ptr()); + EXPECT_EQ(cloned_tensor->const_data_ptr()[0], 1); + EXPECT_EQ(cloned_tensor->const_data_ptr()[3], 4); + EXPECT_EQ(cloned_tensor->scalar_type(), exec_aten::ScalarType::Int); +} + +TEST_F(TensorPtrTest, MakeTensorPtrFromExistingTensorDouble) { + std::vector data = {1.0, 2.0, 3.0, 4.0}; + auto tensor = make_tensor_ptr({2, 2}, data); + auto new_tensor = make_tensor_ptr(*tensor); + + EXPECT_EQ(new_tensor->dim(), tensor->dim()); + EXPECT_EQ(new_tensor->size(0), tensor->size(0)); + EXPECT_EQ(new_tensor->size(1), tensor->size(1)); + EXPECT_EQ( + new_tensor->const_data_ptr(), tensor->const_data_ptr()); + EXPECT_EQ(new_tensor->scalar_type(), exec_aten::ScalarType::Double); +} + +TEST_F(TensorPtrTest, CloneTensorPtrFromExistingTensorDouble) { + std::vector data = {1.0, 2.0, 3.0, 4.0}; + auto tensor = make_tensor_ptr({2, 2}, std::move(data)); + auto cloned_tensor = clone_tensor_ptr(*tensor); + + EXPECT_EQ(cloned_tensor->dim(), tensor->dim()); + 
EXPECT_EQ(cloned_tensor->size(0), tensor->size(0)); + EXPECT_EQ(cloned_tensor->size(1), tensor->size(1)); + EXPECT_NE( + cloned_tensor->const_data_ptr(), + tensor->const_data_ptr()); + EXPECT_EQ(cloned_tensor->const_data_ptr()[0], 1.0); + EXPECT_EQ(cloned_tensor->const_data_ptr()[3], 4.0); + EXPECT_EQ(cloned_tensor->scalar_type(), exec_aten::ScalarType::Double); +} + +TEST_F(TensorPtrTest, CloneTensorPtrFromTensorPtrDouble) { + std::vector data = {1.0, 2.0, 3.0, 4.0}; + auto tensor = make_tensor_ptr({2, 2}, std::move(data)); + auto cloned_tensor = clone_tensor_ptr(tensor); + + EXPECT_EQ(cloned_tensor->dim(), tensor->dim()); + EXPECT_EQ(cloned_tensor->size(0), tensor->size(0)); + EXPECT_EQ(cloned_tensor->size(1), tensor->size(1)); + EXPECT_NE( + cloned_tensor->const_data_ptr(), + tensor->const_data_ptr()); + EXPECT_EQ(cloned_tensor->const_data_ptr()[0], 1.0); + EXPECT_EQ(cloned_tensor->const_data_ptr()[3], 4.0); + EXPECT_EQ(cloned_tensor->scalar_type(), exec_aten::ScalarType::Double); +} + +TEST_F(TensorPtrTest, MakeTensorPtrFromExistingTensorInt64) { + std::vector data = {100, 200, 300, 400}; + auto tensor = make_tensor_ptr({2, 2}, data); + auto new_tensor = make_tensor_ptr(*tensor); + + EXPECT_EQ(new_tensor->dim(), tensor->dim()); + EXPECT_EQ(new_tensor->size(0), tensor->size(0)); + EXPECT_EQ(new_tensor->size(1), tensor->size(1)); + EXPECT_EQ( + new_tensor->const_data_ptr(), tensor->const_data_ptr()); + EXPECT_EQ(new_tensor->scalar_type(), exec_aten::ScalarType::Long); +} + +TEST_F(TensorPtrTest, CloneTensorPtrFromExistingTensorInt64) { + std::vector data = {100, 200, 300, 400}; + auto tensor = make_tensor_ptr({2, 2}, std::move(data)); + auto cloned_tensor = clone_tensor_ptr(*tensor); + + EXPECT_EQ(cloned_tensor->dim(), tensor->dim()); + EXPECT_EQ(cloned_tensor->size(0), tensor->size(0)); + EXPECT_EQ(cloned_tensor->size(1), tensor->size(1)); + EXPECT_NE( + cloned_tensor->const_data_ptr(), + tensor->const_data_ptr()); + EXPECT_EQ(cloned_tensor->const_data_ptr()[0], 100); + EXPECT_EQ(cloned_tensor->const_data_ptr()[3], 400); + EXPECT_EQ(cloned_tensor->scalar_type(), exec_aten::ScalarType::Long); +} + +TEST_F(TensorPtrTest, CloneTensorPtrFromTensorPtrInt64) { + std::vector data = {100, 200, 300, 400}; + auto tensor = make_tensor_ptr({2, 2}, std::move(data)); + auto cloned_tensor = clone_tensor_ptr(tensor); + + EXPECT_EQ(cloned_tensor->dim(), tensor->dim()); + EXPECT_EQ(cloned_tensor->size(0), tensor->size(0)); + EXPECT_EQ(cloned_tensor->size(1), tensor->size(1)); + EXPECT_NE( + cloned_tensor->const_data_ptr(), + tensor->const_data_ptr()); + EXPECT_EQ(cloned_tensor->const_data_ptr()[0], 100); + EXPECT_EQ(cloned_tensor->const_data_ptr()[3], 400); + EXPECT_EQ(cloned_tensor->scalar_type(), exec_aten::ScalarType::Long); +} + +TEST_F(TensorPtrTest, CloneTensorPtrFromTensorPtrNull) { + auto tensor = make_tensor_ptr({2, 2}, nullptr); + auto cloned_tensor = clone_tensor_ptr(tensor); + + EXPECT_EQ(cloned_tensor->dim(), tensor->dim()); + EXPECT_EQ(cloned_tensor->size(0), tensor->size(0)); + EXPECT_EQ(cloned_tensor->size(1), tensor->size(1)); + EXPECT_EQ(cloned_tensor->const_data_ptr(), tensor->const_data_ptr()); + EXPECT_EQ(cloned_tensor->const_data_ptr(), nullptr); +} + +TEST_F(TensorPtrTest, TensorDataCastingFromIntToFloat) { + std::vector int_data = {1, 2, 3, 4, 5, 6}; + auto tensor = make_tensor_ptr( + {2, 3}, std::move(int_data), {}, {}, exec_aten::ScalarType::Float); + + EXPECT_EQ(tensor->dim(), 2); + EXPECT_EQ(tensor->size(0), 2); + EXPECT_EQ(tensor->size(1), 3); + 
EXPECT_EQ(tensor->scalar_type(), exec_aten::ScalarType::Float); + + auto data_ptr = tensor->const_data_ptr(); + EXPECT_FLOAT_EQ(data_ptr[0], 1.0f); + EXPECT_FLOAT_EQ(data_ptr[5], 6.0f); +} + +TEST_F(TensorPtrTest, TensorDataCastingFromIntToDouble) { + std::vector int_data = {1, 2, 3}; + auto tensor = + make_tensor_ptr(std::move(int_data), exec_aten::ScalarType::Double); + + EXPECT_EQ(tensor->dim(), 1); + EXPECT_EQ(tensor->size(0), 3); + EXPECT_EQ(tensor->scalar_type(), exec_aten::ScalarType::Double); + + auto data_ptr = tensor->const_data_ptr(); + EXPECT_DOUBLE_EQ(data_ptr[0], 1.0); + EXPECT_DOUBLE_EQ(data_ptr[1], 2.0); + EXPECT_DOUBLE_EQ(data_ptr[2], 3.0); +} + +TEST_F(TensorPtrTest, TensorDataCastingFromFloatToHalf) { + std::vector float_data = {1.0f, 2.0f, 3.0f}; + auto tensor = + make_tensor_ptr(std::move(float_data), exec_aten::ScalarType::Half); + + EXPECT_EQ(tensor->dim(), 1); + EXPECT_EQ(tensor->size(0), 3); + EXPECT_EQ(tensor->scalar_type(), exec_aten::ScalarType::Half); + + auto data_ptr = tensor->const_data_ptr(); + EXPECT_EQ(static_cast(data_ptr[0]), 1.0f); + EXPECT_EQ(static_cast(data_ptr[1]), 2.0f); + EXPECT_EQ(static_cast(data_ptr[2]), 3.0f); +} + +TEST_F(TensorPtrTest, TensorDataCastingFromDoubleToFloat) { + std::vector double_data = {1.1, 2.2, 3.3}; + auto tensor = + make_tensor_ptr(std::move(double_data), exec_aten::ScalarType::Float); + + EXPECT_EQ(tensor->dim(), 1); + EXPECT_EQ(tensor->size(0), 3); + EXPECT_EQ(tensor->scalar_type(), exec_aten::ScalarType::Float); + + auto data_ptr = tensor->const_data_ptr(); + EXPECT_FLOAT_EQ(data_ptr[0], 1.1f); + EXPECT_FLOAT_EQ(data_ptr[1], 2.2f); + EXPECT_FLOAT_EQ(data_ptr[2], 3.3f); +} + +TEST_F(TensorPtrTest, TensorDataCastingFromInt64ToInt32) { + std::vector int64_data = {10000000000, 20000000000, 30000000000}; + auto tensor = + make_tensor_ptr(std::move(int64_data), exec_aten::ScalarType::Int); + + EXPECT_EQ(tensor->dim(), 1); + EXPECT_EQ(tensor->size(0), 3); + EXPECT_EQ(tensor->scalar_type(), exec_aten::ScalarType::Int); + + auto data_ptr = tensor->const_data_ptr(); + EXPECT_NE(data_ptr[0], 10000000000); // Expected overflow +} + +TEST_F(TensorPtrTest, TensorDataCastingFromFloatToBFloat16) { + std::vector float_data = {1.0f, 2.0f, 3.0f}; + auto tensor = + make_tensor_ptr(std::move(float_data), exec_aten::ScalarType::BFloat16); + + EXPECT_EQ(tensor->dim(), 1); + EXPECT_EQ(tensor->size(0), 3); + EXPECT_EQ(tensor->scalar_type(), exec_aten::ScalarType::BFloat16); + + auto data_ptr = tensor->const_data_ptr(); + EXPECT_EQ(static_cast(data_ptr[0]), 1.0f); + EXPECT_EQ(static_cast(data_ptr[1]), 2.0f); + EXPECT_EQ(static_cast(data_ptr[2]), 3.0f); +} + +TEST_F(TensorPtrTest, InitializerListDoubleToHalf) { + auto tensor = + make_tensor_ptr({1.5, 2.7, 3.14}, exec_aten::ScalarType::Half); + EXPECT_EQ(tensor->dim(), 1); + EXPECT_EQ(tensor->size(0), 3); + EXPECT_EQ(tensor->scalar_type(), exec_aten::ScalarType::Half); + + auto data_ptr = tensor->const_data_ptr(); + EXPECT_NEAR(static_cast(data_ptr[0]), 1.5f, 0.01); + EXPECT_NEAR(static_cast(data_ptr[1]), 2.7f, 0.01); + EXPECT_NEAR(static_cast(data_ptr[2]), 3.14f, 0.01); +} + +TEST_F(TensorPtrTest, InitializerListInt8ToInt64) { + auto tensor = + make_tensor_ptr({1, -2, 3, -4}, exec_aten::ScalarType::Long); + EXPECT_EQ(tensor->dim(), 1); + EXPECT_EQ(tensor->size(0), 4); + EXPECT_EQ(tensor->scalar_type(), exec_aten::ScalarType::Long); + + auto data_ptr = tensor->const_data_ptr(); + EXPECT_EQ(data_ptr[0], 1); + EXPECT_EQ(data_ptr[1], -2); + EXPECT_EQ(data_ptr[2], 3); + EXPECT_EQ(data_ptr[3], -4); +} 
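The creation, aliasing, and cloning behaviors pinned down by the tests above reduce to a few ownership rules. A minimal sketch, assuming the `TensorPtr` API these tests exercise (the header path, namespace, and `using` declarations below are assumptions, not taken from this diff):

```cpp
// Sketch only: mirrors what the tests above assert about ownership.
#include <executorch/extension/tensor/tensor_ptr.h>  // assumed header path

#include <utility>
#include <vector>

using executorch::extension::clone_tensor_ptr;  // assumed namespace
using executorch::extension::make_tensor_ptr;

void tensor_ptr_ownership_sketch() {
  std::vector<int32_t> data = {1, 2, 3, 4};

  // Takes ownership of the moved-in buffer and deduces ScalarType::Int.
  auto tensor = make_tensor_ptr({2, 2}, std::move(data));

  // Shares the original storage: same data pointer, no copy.
  auto alias = make_tensor_ptr(*tensor);

  // Deep copy: fresh storage holding the same values.
  auto copy = clone_tensor_ptr(tensor);

  // alias->const_data_ptr() == tensor->const_data_ptr()
  // copy->const_data_ptr()  != tensor->const_data_ptr()
}
```

The explicit `ScalarType` overloads exercised just above behave the same way, except that the moved-in values are converted to the requested dtype first.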
+ +TEST_F(TensorPtrTest, TensorInferredDimOrderAndStrides) { + float data[12] = {0}; + auto tensor = make_tensor_ptr({3, 4}, data, {}, {4, 1}); + + EXPECT_EQ(tensor->dim(), 2); + EXPECT_EQ(tensor->size(0), 3); + EXPECT_EQ(tensor->size(1), 4); + EXPECT_EQ(tensor->strides()[0], 4); + EXPECT_EQ(tensor->strides()[1], 1); + EXPECT_EQ(tensor->const_data_ptr(), data); +} + +TEST_F(TensorPtrTest, TensorInferredDimOrderCustomStrides) { + float data[12] = {0}; + auto tensor = make_tensor_ptr({3, 4}, data, {}, {1, 3}); + + EXPECT_EQ(tensor->dim(), 2); + EXPECT_EQ(tensor->size(0), 3); + EXPECT_EQ(tensor->size(1), 4); + EXPECT_EQ(tensor->strides()[0], 1); + EXPECT_EQ(tensor->strides()[1], 3); +} + +TEST_F(TensorPtrTest, TensorDefaultDimOrderAndStrides) { + float data[24] = {0}; + auto tensor = make_tensor_ptr({2, 3, 4}, data); + + EXPECT_EQ(tensor->dim(), 3); + EXPECT_EQ(tensor->size(0), 2); + EXPECT_EQ(tensor->size(1), 3); + EXPECT_EQ(tensor->size(2), 4); + EXPECT_EQ(tensor->strides()[0], 12); + EXPECT_EQ(tensor->strides()[1], 4); + EXPECT_EQ(tensor->strides()[2], 1); +} + +TEST_F(TensorPtrTest, TensorMismatchStridesAndDimOrder) { + float data[12] = {0}; + ET_EXPECT_DEATH( + { auto _ = make_tensor_ptr({3, 4}, data, {1, 0}, {1, 4}); }, ""); +} + +TEST_F(TensorPtrTest, TensorCustomDimOrderAndStrides) { + float data[12] = {0}; + auto tensor = make_tensor_ptr({3, 4}, data, {1, 0}, {1, 3}); + + EXPECT_EQ(tensor->dim(), 2); + EXPECT_EQ(tensor->size(0), 3); + EXPECT_EQ(tensor->size(1), 4); + EXPECT_EQ(tensor->strides()[0], 1); + EXPECT_EQ(tensor->strides()[1], 3); +} + +TEST_F(TensorPtrTest, TensorInvalidDimOrder) { + ET_EXPECT_DEATH( + { + float data[20] = {2}; + auto _ = make_tensor_ptr({4, 5}, data, {2, 1}, {1, 4}); + }, + ""); +} + +TEST_F(TensorPtrTest, TensorCustomDeleter) { + float data[20] = {4}; + auto tensor = make_tensor_ptr({4, 5}, data); + + TensorPtr copied_tensor = tensor; + EXPECT_EQ(tensor.use_count(), copied_tensor.use_count()); + + tensor.reset(); + EXPECT_EQ(copied_tensor.use_count(), 1); +} + +TEST_F(TensorPtrTest, TensorDataDeleterReleasesCapturedSharedPtr) { + auto deleter_called = false; + std::shared_ptr data_ptr( + new float[10], [](float* ptr) { delete[] ptr; }); + auto tensor = make_tensor_ptr( + {4, 5}, + data_ptr.get(), + {}, + {}, + exec_aten::ScalarType::Float, + exec_aten::TensorShapeDynamism::DYNAMIC_BOUND, + [data_ptr, &deleter_called](void*) mutable { deleter_called = true; }); + + EXPECT_EQ(data_ptr.use_count(), 2); + + tensor.reset(); + EXPECT_TRUE(deleter_called); + EXPECT_EQ(data_ptr.use_count(), 1); +} + +TEST_F(TensorPtrTest, SharedDataManagement) { + auto data = std::make_shared>(100, 1.0f); + auto tensor1 = make_tensor_ptr({10, 10}, data->data()); + auto tensor2 = tensor1; + + EXPECT_EQ(tensor1.get(), tensor2.get()); + EXPECT_EQ(tensor1.use_count(), 2); + EXPECT_EQ(tensor1->const_data_ptr()[0], 1.0f); + + tensor1->mutable_data_ptr()[0] = 2.0f; + EXPECT_EQ(tensor1->const_data_ptr()[0], 2.0f); + + tensor1.reset(); + EXPECT_NE(tensor2.get(), nullptr); + EXPECT_EQ(tensor2.use_count(), 1); + + EXPECT_EQ(tensor2->const_data_ptr()[0], 2.0f); +} + +TEST_F(TensorPtrTest, CustomDeleterWithSharedData) { + auto data = std::make_shared>(100, 1.0f); + bool deleter_called = false; + { + auto tensor = make_tensor_ptr( + {10, 10}, + data->data(), + {}, + {}, + exec_aten::ScalarType::Float, + exec_aten::TensorShapeDynamism::DYNAMIC_BOUND, + [data, &deleter_called](void*) mutable { + deleter_called = true; + data.reset(); + }); + + EXPECT_EQ(data.use_count(), 2); + 
EXPECT_FALSE(deleter_called); + } + EXPECT_TRUE(deleter_called); + EXPECT_EQ(data.use_count(), 1); +} + +TEST_F(TensorPtrTest, TensorDeducedScalarType) { + std::vector data = {1.0, 2.0, 3.0, 4.0}; + auto tensor = make_tensor_ptr({2, 2}, std::move(data)); + + EXPECT_EQ(tensor->dim(), 2); + EXPECT_EQ(tensor->size(0), 2); + EXPECT_EQ(tensor->size(1), 2); + EXPECT_EQ(tensor->strides()[0], 2); + EXPECT_EQ(tensor->strides()[1], 1); + EXPECT_EQ(tensor->const_data_ptr()[0], 1.0); + EXPECT_EQ(tensor->const_data_ptr()[3], 4.0); +} + +TEST_F(TensorPtrTest, TensorUint8BufferWithFloatScalarType) { + std::vector data( + 4 * exec_aten::elementSize(exec_aten::ScalarType::Float)); + + float* float_data = reinterpret_cast(data.data()); + float_data[0] = 1.0f; + float_data[1] = 2.0f; + float_data[2] = 3.0f; + float_data[3] = 4.0f; + + auto tensor = make_tensor_ptr({2, 2}, std::move(data)); + + EXPECT_EQ(tensor->dim(), 2); + EXPECT_EQ(tensor->size(0), 2); + EXPECT_EQ(tensor->size(1), 2); + EXPECT_EQ(tensor->strides()[0], 2); + EXPECT_EQ(tensor->strides()[1], 1); + + EXPECT_EQ(tensor->const_data_ptr()[0], 1.0f); + EXPECT_EQ(tensor->const_data_ptr()[1], 2.0f); + EXPECT_EQ(tensor->const_data_ptr()[2], 3.0f); + EXPECT_EQ(tensor->const_data_ptr()[3], 4.0f); +} + +TEST_F(TensorPtrTest, TensorUint8BufferTooSmallExpectDeath) { + std::vector data( + 2 * exec_aten::elementSize(exec_aten::ScalarType::Float)); + ET_EXPECT_DEATH( + { auto tensor = make_tensor_ptr({2, 2}, std::move(data)); }, ""); +} + +TEST_F(TensorPtrTest, TensorUint8BufferTooLarge) { + std::vector data( + 4 * exec_aten::elementSize(exec_aten::ScalarType::Float)); + auto tensor = make_tensor_ptr({2, 2}, std::move(data)); + + EXPECT_EQ(tensor->dim(), 2); + EXPECT_EQ(tensor->size(0), 2); + EXPECT_EQ(tensor->size(1), 2); + EXPECT_EQ(tensor->strides()[0], 2); + EXPECT_EQ(tensor->strides()[1], 1); +} + +TEST_F(TensorPtrTest, StridesAndDimOrderMustMatchSizes) { + float data[12] = {0}; + ET_EXPECT_DEATH({ auto _ = make_tensor_ptr({3, 4}, data, {}, {1}); }, ""); + ET_EXPECT_DEATH({ auto _ = make_tensor_ptr({3, 4}, data, {0}, {4, 1}); }, ""); +} + +TEST_F(TensorPtrTest, TensorDataCastingInvalidCast) { + std::vector float_data = {1.0f, 2.0f, 3.0f}; + ET_EXPECT_DEATH( + { + auto _ = + make_tensor_ptr(std::move(float_data), exec_aten::ScalarType::Int); + }, + ""); +} diff --git a/extension/threadpool/CMakeLists.txt b/extension/threadpool/CMakeLists.txt index 281b63b8592..90288656674 100644 --- a/extension/threadpool/CMakeLists.txt +++ b/extension/threadpool/CMakeLists.txt @@ -24,7 +24,7 @@ add_library( extension_threadpool threadpool.cpp threadpool_guard.cpp cpuinfo_utils.cpp ) target_link_libraries( - extension_threadpool PUBLIC executorch_no_prim_ops cpuinfo pthreadpool + extension_threadpool PUBLIC executorch_core cpuinfo pthreadpool ) target_include_directories(extension_threadpool PUBLIC ${EXECUTORCH_ROOT}/..) target_include_directories( diff --git a/extension/threadpool/cpuinfo_utils.cpp b/extension/threadpool/cpuinfo_utils.cpp index 9fb611c5644..5dc3fa7fae5 100644 --- a/extension/threadpool/cpuinfo_utils.cpp +++ b/extension/threadpool/cpuinfo_utils.cpp @@ -6,17 +6,16 @@ * LICENSE file in the root directory of this source tree. 
*/ -#include +#include + #include #include #include #include -#include "cpuinfo_utils.h" +#include -namespace torch { -namespace executorch { -namespace cpuinfo { +namespace executorch::extension::cpuinfo { // Ignore revisions (last digit (4 LSBs)) #define CPUINFO_ARM_MIDR_CORTEX_A520 UINT32_C(0x410FD800) @@ -171,6 +170,4 @@ uint32_t get_num_performant_cores() { } } -} // namespace cpuinfo -} // namespace executorch -} // namespace torch +} // namespace executorch::extension::cpuinfo diff --git a/extension/threadpool/cpuinfo_utils.h b/extension/threadpool/cpuinfo_utils.h index f9e6b32f017..d559738b728 100644 --- a/extension/threadpool/cpuinfo_utils.h +++ b/extension/threadpool/cpuinfo_utils.h @@ -10,12 +10,15 @@ #include -namespace torch { -namespace executorch { -namespace cpuinfo { +namespace executorch::extension::cpuinfo { uint32_t get_num_performant_cores(); -} // namespace cpuinfo -} // namespace executorch -} // namespace torch +} // namespace executorch::extension::cpuinfo + +namespace torch::executorch::cpuinfo { // DEPRECATED +// TODO(T197294990): Remove these deprecated aliases once all users have moved +// to the new `::executorch` namespaces. Note that threadpool incorrectly used +// the namespace `torch::executorch` instead of `torch::executor`. +using ::executorch::extension::cpuinfo::get_num_performant_cores; // DEPRECATED +} // namespace torch::executorch::cpuinfo diff --git a/extension/threadpool/test/threadpool_test.cpp b/extension/threadpool/test/threadpool_test.cpp index c244b8fcf23..e7784d3cc11 100644 --- a/extension/threadpool/test/threadpool_test.cpp +++ b/extension/threadpool/test/threadpool_test.cpp @@ -6,14 +6,16 @@ * LICENSE file in the root directory of this source tree. */ -#include +#include + #include #include #include -#include #include +#include + using namespace ::testing; namespace { @@ -63,7 +65,7 @@ void run_lambda_with_size( size_t grain_size) { size_t num_grains = div_round_up(range, grain_size); - auto threadpool = torch::executorch::threadpool::get_threadpool(); + auto threadpool = ::executorch::extension::threadpool::get_threadpool(); threadpool->run(f, range); } } // namespace @@ -82,7 +84,7 @@ TEST(ThreadPoolTest, ParallelAdd) { } }; - auto threadpool = torch::executorch::threadpool::get_threadpool(); + auto threadpool = ::executorch::extension::threadpool::get_threadpool(); EXPECT_GT(threadpool->get_thread_count(), 1); generate_add_test_inputs(a, b, c_ref, c, vector_size); @@ -125,7 +127,7 @@ TEST(ThreadPoolTest, ParallelReduce) { } }; - auto threadpool = torch::executorch::threadpool::get_threadpool(); + auto threadpool = ::executorch::extension::threadpool::get_threadpool(); EXPECT_GT(threadpool->get_thread_count(), 1); generate_reduce_test_inputs(a, c_ref, vector_size); @@ -142,27 +144,30 @@ TEST(ThreadPoolTest, ParallelReduce) { // Copied from // caffe2/aten/src/ATen/test/test_thread_pool_guard.cp TEST(TestNoThreadPoolGuard, TestThreadPoolGuard) { - auto threadpool_ptr = torch::executorch::threadpool::get_pthreadpool(); + auto threadpool_ptr = ::executorch::extension::threadpool::get_pthreadpool(); ASSERT_NE(threadpool_ptr, nullptr); { - torch::executorch::threadpool::NoThreadPoolGuard g1; - auto threadpool_ptr1 = torch::executorch::threadpool::get_pthreadpool(); + ::executorch::extension::threadpool::NoThreadPoolGuard g1; + auto threadpool_ptr1 = + ::executorch::extension::threadpool::get_pthreadpool(); ASSERT_EQ(threadpool_ptr1, nullptr); { - torch::executorch::threadpool::NoThreadPoolGuard g2; - auto threadpool_ptr2 = 
torch::executorch::threadpool::get_pthreadpool(); + ::executorch::extension::threadpool::NoThreadPoolGuard g2; + auto threadpool_ptr2 = + ::executorch::extension::threadpool::get_pthreadpool(); ASSERT_EQ(threadpool_ptr2, nullptr); } // Guard should restore prev value (nullptr) - auto threadpool_ptr3 = torch::executorch::threadpool::get_pthreadpool(); + auto threadpool_ptr3 = + ::executorch::extension::threadpool::get_pthreadpool(); ASSERT_EQ(threadpool_ptr3, nullptr); } // Guard should restore prev value (pthreadpool_) - auto threadpool_ptr4 = torch::executorch::threadpool::get_pthreadpool(); + auto threadpool_ptr4 = ::executorch::extension::threadpool::get_pthreadpool(); ASSERT_NE(threadpool_ptr4, nullptr); ASSERT_EQ(threadpool_ptr4, threadpool_ptr); } @@ -170,18 +175,19 @@ TEST(TestNoThreadPoolGuard, TestThreadPoolGuard) { TEST(TestNoThreadPoolGuard, TestRunWithGuard) { const std::vector array = {1, 2, 3}; - auto pool = torch::executorch::threadpool::get_threadpool(); + auto pool = ::executorch::extension::threadpool::get_threadpool(); int64_t inner = 0; { // Run on same thread - torch::executorch::threadpool::NoThreadPoolGuard g1; + ::executorch::extension::threadpool::NoThreadPoolGuard g1; auto fn = [&array, &inner](const size_t task_id) { inner += array[task_id]; }; pool->run(fn, 3); // confirm the guard is on - auto threadpool_ptr = torch::executorch::threadpool::get_pthreadpool(); + auto threadpool_ptr = + ::executorch::extension::threadpool::get_pthreadpool(); ASSERT_EQ(threadpool_ptr, nullptr); } ASSERT_EQ(inner, 6); diff --git a/extension/threadpool/threadpool.cpp b/extension/threadpool/threadpool.cpp index e8f2ea5f704..4134bb8669d 100644 --- a/extension/threadpool/threadpool.cpp +++ b/extension/threadpool/threadpool.cpp @@ -7,18 +7,17 @@ */ #include + +#include +#include +#include + #include #include -#include #include -#include -#include - -namespace torch { -namespace executorch { -namespace threadpool { +namespace executorch::extension::threadpool { #if !(defined(WIN32)) namespace { @@ -86,6 +85,7 @@ void ThreadPool::run( // pthreadpool_parallelize_1d() cannot go out of scope until // pthreadpool_parallelize_1d() returns. [](void* const context, const size_t item) { + NoThreadPoolGuard guard; reinterpret_cast(context)->fn(item); }, &context, @@ -138,6 +138,4 @@ pthreadpool_t get_pthreadpool() { return threadpool->threadpool_.get(); } -} // namespace threadpool -} // namespace executorch -} // namespace torch +} // namespace executorch::extension::threadpool diff --git a/extension/threadpool/threadpool.h b/extension/threadpool/threadpool.h index 1033e868a37..15133befef6 100644 --- a/extension/threadpool/threadpool.h +++ b/extension/threadpool/threadpool.h @@ -1,17 +1,20 @@ -#pragma once +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ -#include +#pragma once -// @nolint PATTERNLINT Ok to use stdlib for this optional library #include -// @nolint PATTERNLINT Ok to use stdlib for this optional library #include -// @nolint PATTERNLINT Ok to use stdlib for this optional library #include -namespace torch { -namespace executorch { -namespace threadpool { +#include + +namespace executorch::extension::threadpool { class ThreadPool final { public: @@ -26,55 +29,64 @@ class ThreadPool final { ThreadPool& operator=(const ThreadPool&) = delete; // Make threadpool non-movable. 
- // For now this is non-movable, but if we want to have clients - // such as say torch::executorch::Executor, to be able to own - // threadpool, then we will have to make this movable. ThreadPool(ThreadPool&&) = delete; ThreadPool& operator=(ThreadPool&&) = delete; size_t get_thread_count() const; - /* - * Resets the threadpool by creating a new threadpool with requested # of - * threads. This is not a thread safe call. When calling this method, threads - * of the threadpool might be doing some work. Some other code may also be - * holding on to the threadpool pointer, that is no longer valid. This is a - * private API, which will later be replaced by something that allows creating - * of threadpool with requested size and use such a threadpool with backend - * delegates, custom ops or optimized lib. + /** + * INTERNAL: Resets the threadpool by creating a new threadpool with requested + * # of threads. This is not a thread safe call. When calling this method, + * threads of the threadpool might be doing some work. Some other code may + * also be holding on to the threadpool pointer, that is no longer valid. This + * is a private API, which will later be replaced by something that allows + * creating of threadpool with requested size and use such a threadpool with + * backend delegates, custom ops or optimized lib. */ + [[deprecated("This API is experimental and may change without notice.")]] bool _unsafe_reset_threadpool(uint32_t num_threads); - // Run, in parallel, function fn(task_id) over task_id in range [0, range). - // This function is blocking. All input is processed by the time it returns. - // NoThreadPoolGuard (see threadpool_guard.h) can used to disable - // use of multiple threads with the scope of the guard - // When NoThreadPoolGuard is not used all calls to run method are serialized. + /** + * Run, in parallel, function fn(task_id) over task_id in range [0, range). + * This function is blocking. All input is processed by the time it returns. + * NoThreadPoolGuard (see threadpool_guard.h) can used to disable use of + * multiple threads with the scope of the guard When NoThreadPoolGuard is not + * used all calls to run method are serialized. + */ void run(const std::function& fn, size_t range); private: friend pthreadpool_t get_pthreadpool(); private: - // This mutex is used inside get_thread_count API but it is not - // really needed. Since data members of ThreadPool objects are not - // really mutable. - // Figure out if we will allow set_num_threads API, in which mutex - // will be useful. Otherwise remove it. - // TODO(kimishpatel) + // This mutex is used inside get_thread_count API but it is not really needed + // since data members of ThreadPool objects are not really mutable. + // TODO(kimishpatel): Figure out if we will allow set_num_threads API, in + // which case this mutex will be useful. Otherwise remove it. mutable std::mutex mutex_; std::unique_ptr threadpool_; }; -// Return a singleton instance of ThreadPool for ATen/TH multithreading. +/** + * Returns the singleton instance of ThreadPool for ATen/TH multithreading. + */ ThreadPool* get_threadpool(); -// Exposes the underlying implementation of ThreadPool. -// Only for use in external libraries so as to unify threading across -// internal (i.e. ATen, etc.) and external (e.g. NNPACK, QNNPACK, XNNPACK) -// use cases. +/** + * Returns the underlying pthreadpool instance used by the implementation of + * ThreadPool returned by `get_threadpool()`. 
Only for use in external libraries + * so as to unify threading across internal (i.e. ATen, etc.) and external (e.g. + * NNPACK, QNNPACK, XNNPACK) use cases. + */ pthreadpool_t get_pthreadpool(); -} // namespace threadpool -} // namespace executorch -} // namespace torch +} // namespace executorch::extension::threadpool + +namespace torch::executorch::threadpool { // DEPRECATED +// TODO(T197294990): Remove these deprecated aliases once all users have moved +// to the new `::executorch` namespaces. Note that threadpool incorrectly used +// the namespace `torch::executorch` instead of `torch::executor`. +using ::executorch::extension::threadpool::get_pthreadpool; // DEPRECATED +using ::executorch::extension::threadpool::get_threadpool; // DEPRECATED +using ::executorch::extension::threadpool::ThreadPool; // DEPRECATED +} // namespace torch::executorch::threadpool diff --git a/extension/threadpool/threadpool_guard.cpp b/extension/threadpool/threadpool_guard.cpp index ac4103fbbc7..21519eaa34a 100644 --- a/extension/threadpool/threadpool_guard.cpp +++ b/extension/threadpool/threadpool_guard.cpp @@ -8,9 +8,7 @@ #include -namespace torch { -namespace executorch { -namespace threadpool { +namespace executorch::extension::threadpool { thread_local bool NoThreadPoolGuard_enabled = false; @@ -22,6 +20,4 @@ void NoThreadPoolGuard::set_enabled(bool enabled) { NoThreadPoolGuard_enabled = enabled; } -} // namespace threadpool -} // namespace executorch -} // namespace torch +} // namespace executorch::extension::threadpool diff --git a/extension/threadpool/threadpool_guard.h b/extension/threadpool/threadpool_guard.h index 5871897ab14..9fe1d8af737 100644 --- a/extension/threadpool/threadpool_guard.h +++ b/extension/threadpool/threadpool_guard.h @@ -8,9 +8,7 @@ #pragma once -namespace torch { -namespace executorch { -namespace threadpool { +namespace executorch::extension::threadpool { // A RAII, thread local (!) guard that enables or disables guard upon // construction, and sets it back to the original value upon destruction. @@ -29,6 +27,11 @@ struct NoThreadPoolGuard { const bool prev_mode_; }; -} // namespace threadpool -} // namespace executorch -} // namespace torch +} // namespace executorch::extension::threadpool + +namespace torch::executorch::threadpool { // DEPRECATED +// TODO(T197294990): Remove these deprecated aliases once all users have moved +// to the new `::executorch` namespaces. Note that threadpool incorrectly used +// the namespace `torch::executorch` instead of `torch::executor`. +using ::executorch::extension::threadpool::NoThreadPoolGuard; // DEPRECATED +} // namespace torch::executorch::threadpool diff --git a/extension/training/CMakeLists.txt b/extension/training/CMakeLists.txt new file mode 100644 index 00000000000..e50bb3c71eb --- /dev/null +++ b/extension/training/CMakeLists.txt @@ -0,0 +1,49 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# Please this file formatted by running: +# ~~~ +# cmake-format -i CMakeLists.txt +# ~~~ + +cmake_minimum_required(VERSION 3.19) + +# Source root directory for executorch. +if(NOT EXECUTORCH_ROOT) + set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../..) 
+endif() + +list(TRANSFORM _extension_training__srcs PREPEND "${EXECUTORCH_ROOT}/") + +add_library(extension_training ${_extension_training__srcs}) +target_include_directories( + extension_training PUBLIC ${_common_include_directories} +) + +target_include_directories(extension_training PUBLIC ${EXECUTORCH_ROOT}/..) +target_compile_options(extension_training PUBLIC ${_common_compile_options}) +target_link_libraries(extension_training executorch_core + extension_data_loader extension_module extension_tensor) + + +list(TRANSFORM _train_xor__srcs PREPEND "${EXECUTORCH_ROOT}/") +add_executable(train_xor ${_train_xor__srcs}) +target_include_directories( + train_xor PUBLIC ${_common_include_directories} +) +target_link_libraries( +train_xor gflags executorch_core portable_ops_lib extension_tensor + extension_training program_schema +) +target_compile_options(train_xor PUBLIC ${_common_compile_options}) + +# Install libraries +install( + TARGETS extension_training + DESTINATION lib + INCLUDES + DESTINATION ${_common_include_directories} +) diff --git a/extension/training/README.md b/extension/training/README.md new file mode 100644 index 00000000000..17e5f91f075 --- /dev/null +++ b/extension/training/README.md @@ -0,0 +1,277 @@ +# ExecuTorch On-device Training + +This subtree contains infrastructure to facilitate on-device training using ExecuTorch. +This feature is experimental and under heavy active development; all the APIs are +subject to change, and many things may not work out of the box, or at all, in the +current state. + +## Layout +- `examples/` : Example end-to-end flows from model definition to optimizer.step() +- `module/`: Utility class to provide an improved UX when using ExecuTorch for training. +- `optimizer/`: C++ implementations of various optimizers; currently only SGD, though Adam is planned. +- `test/`: Tests that cover multiple subdirs. + +## Technical Bird's-Eye View + +At a high level, ExecuTorch training follows a similar flow to inference, with a few extra steps. + +Instead of relying on autograd at runtime to dynamically generate the backward graph and then walk it, +we capture the backward graph ahead of time. This lets us be a lot leaner on-device, as well as +letting backends have more direct control over more of the model execution. Currently the optimizer is not +captured, though this may change over time. + +Loss functions must be embedded inside the model definition (and be the first output); this is used during +capture to generate the backwards graph. + +Gradients become explicit graph outputs rather than hidden tensor state. + +Since the weights now need to be mutable during execution, they are memory-planned ahead of time and copied +from the .pte into the HierarchicalAllocator arenas during Method init. + +Integration with backends/delegates is still a work in progress. + + +## End-to-End Example + +To further understand the features of ExecuTorch training and how to leverage it, +consider the following end-to-end example with a neural network learning the XOR function. + +### Lowering a joint-graph model to ExecuTorch + +After following the [setting up ExecuTorch] guide, you can run + +```bash +python3 extension/training/examples/XOR/export_model.py --outdir /tmp/foobar +``` +to generate the model file. Below is a walkthrough of how that script works. + +First, let's define our model.
+```python +import torch.nn as nn +from torch.nn import functional as F + +from torch.export import export +from torch.export.experimental import _export_forward_backward + + +# Basic Net for XOR +class Net(nn.Module): + def __init__(self): + super().__init__() + self.linear = nn.Linear(2, 10) + self.linear2 = nn.Linear(10, 2) + + def forward(self, x): + return self.linear2(F.sigmoid(self.linear(x))) +``` + +The first big difference from the normal ExecuTorch flow is that for training we must embed +the loss function into model and return the loss as our first output. + +We don't want to modify the original model definition so we will just wrap it. + +```python +class TrainingNet(nn.Module): + def __init__(self, net): + super().__init__() + self.net = net + self.loss = nn.CrossEntropyLoss() + + def forward(self, input, label): + pred = self.net(input) + return self.loss(pred, label), pred.detach().argmax(dim=1) +``` + +Now that we have our model we can lower it to ExecuTorch. To do that we just have to follow +a few simple steps. + +```python +net = TrainingNet(Net()) + +# Create our inputs, only the shapes of these matter. +input = torch.randn(1, 2) +label = torch.ones(1, dtype=torch.int64) + +# Captures the forward graph. The graph will look similar to the model definition now. +# Will move to export_for_training soon which is the api planned to be supported in the long term. +ep = export(net, (input, label)) +``` + +This is what the graph looks like after export +```python +>>>print(ep.graph_module.graph) + +graph(): + %p_net_linear_weight : [num_users=1] = placeholder[target=p_net_linear_weight] + %p_net_linear_bias : [num_users=1] = placeholder[target=p_net_linear_bias] + %p_net_linear2_weight : [num_users=1] = placeholder[target=p_net_linear2_weight] + %p_net_linear2_bias : [num_users=1] = placeholder[target=p_net_linear2_bias] + %input : [num_users=1] = placeholder[target=input] + %label : [num_users=1] = placeholder[target=label] + %linear : [num_users=1] = call_function[target=torch.ops.aten.linear.default](args = (%input, %p_net_linear_weight, %p_net_linear_bias), kwargs = {}) + %sigmoid : [num_users=1] = call_function[target=torch.ops.aten.sigmoid.default](args = (%linear,), kwargs = {}) + %linear_1 : [num_users=2] = call_function[target=torch.ops.aten.linear.default](args = (%sigmoid, %p_net_linear2_weight, %p_net_linear2_bias), kwargs = {}) + %cross_entropy_loss : [num_users=1] = call_function[target=torch.ops.aten.cross_entropy_loss.default](args = (%linear_1, %label), kwargs = {}) + %detach : [num_users=1] = call_function[target=torch.ops.aten.detach.default](args = (%linear_1,), kwargs = {}) + %argmax : [num_users=1] = call_function[target=torch.ops.aten.argmax.default](args = (%detach, 1), kwargs = {}) + return (cross_entropy_loss, argmax) +``` + +It should look pretty similar to our model's forward function. Now we need to capture the backwards graph. 
+ +```python +ep = _export_forward_backward(ep) +``` + +and now the graph is + +```python +>>>print(ep.graph_module.graph) + +graph(): + %p_net_linear_weight : [num_users=1] = placeholder[target=p_net_linear_weight] + %p_net_linear_bias : [num_users=1] = placeholder[target=p_net_linear_bias] + %p_net_linear2_weight : [num_users=1] = placeholder[target=p_net_linear2_weight] + %p_net_linear2_bias : [num_users=1] = placeholder[target=p_net_linear2_bias] + %input : [num_users=2] = placeholder[target=input] + %label : [num_users=5] = placeholder[target=label] + %permute : [num_users=1] = call_function[target=torch.ops.aten.permute.default](args = (%p_net_linear_weight, [1, 0]), kwargs = {}) + %addmm : [num_users=1] = call_function[target=torch.ops.aten.addmm.default](args = (%p_net_linear_bias, %input, %permute), kwargs = {}) + %sigmoid : [num_users=3] = call_function[target=torch.ops.aten.sigmoid.default](args = (%addmm,), kwargs = {}) + %alias : [num_users=1] = call_function[target=torch.ops.aten.alias.default](args = (%sigmoid,), kwargs = {}) + %alias_1 : [num_users=1] = call_function[target=torch.ops.aten.alias.default](args = (%alias,), kwargs = {}) + %permute_1 : [num_users=2] = call_function[target=torch.ops.aten.permute.default](args = (%p_net_linear2_weight, [1, 0]), kwargs = {}) + %addmm_1 : [num_users=2] = call_function[target=torch.ops.aten.addmm.default](args = (%p_net_linear2_bias, %sigmoid, %permute_1), kwargs = {}) + %_log_softmax : [num_users=3] = call_function[target=torch.ops.aten._log_softmax.default](args = (%addmm_1, 1, False), kwargs = {}) + %alias_2 : [num_users=1] = call_function[target=torch.ops.aten.alias.default](args = (%_log_softmax,), kwargs = {}) + %alias_3 : [num_users=1] = call_function[target=torch.ops.aten.alias.default](args = (%alias_2,), kwargs = {}) + %ne : [num_users=1] = call_function[target=torch.ops.aten.ne.Scalar](args = (%label, -100), kwargs = {}) + %scalar_tensor : [num_users=1] = call_function[target=torch.ops.aten.scalar_tensor.default](args = (0,), kwargs = {dtype: torch.int64, layout: torch.strided, device: cpu}) + %where : [num_users=1] = call_function[target=torch.ops.aten.where.self](args = (%ne, %label, %scalar_tensor), kwargs = {}) + %unsqueeze : [num_users=1] = call_function[target=torch.ops.aten.unsqueeze.default](args = (%where, 1), kwargs = {}) + %gather : [num_users=1] = call_function[target=torch.ops.aten.gather.default](args = (%_log_softmax, 1, %unsqueeze), kwargs = {}) + %squeeze : [num_users=1] = call_function[target=torch.ops.aten.squeeze.dims](args = (%gather, [1]), kwargs = {}) + %neg : [num_users=1] = call_function[target=torch.ops.aten.neg.default](args = (%squeeze,), kwargs = {}) + %ne_1 : [num_users=1] = call_function[target=torch.ops.aten.ne.Scalar](args = (%label, -100), kwargs = {}) + %scalar_tensor_1 : [num_users=1] = call_function[target=torch.ops.aten.scalar_tensor.default](args = (0,), kwargs = {dtype: torch.float32, layout: torch.strided, device: cpu}) + %where_1 : [num_users=1] = call_function[target=torch.ops.aten.where.self](args = (%ne_1, %neg, %scalar_tensor_1), kwargs = {}) + %ne_2 : [num_users=1] = call_function[target=torch.ops.aten.ne.Scalar](args = (%label, -100), kwargs = {}) + %sum_1 : [num_users=1] = call_function[target=torch.ops.aten.sum.dim_IntList](args = (%ne_2, []), kwargs = {}) + %_to_copy : [num_users=2] = call_function[target=torch.ops.aten._to_copy.default](args = (%sum_1,), kwargs = {dtype: torch.float32, device: cpu}) + %sum_2 : [num_users=1] = 
call_function[target=torch.ops.aten.sum.dim_IntList](args = (%where_1, []), kwargs = {}) + %div : [num_users=2] = call_function[target=torch.ops.aten.div.Tensor](args = (%sum_2, %_to_copy), kwargs = {}) + %alias_4 : [num_users=1] = call_function[target=torch.ops.aten.alias.default](args = (%addmm_1,), kwargs = {}) + %alias_5 : [num_users=1] = call_function[target=torch.ops.aten.alias.default](args = (%alias_4,), kwargs = {}) + %alias_6 : [num_users=1] = call_function[target=torch.ops.aten.alias.default](args = (%alias_5,), kwargs = {}) + %argmax : [num_users=1] = call_function[target=torch.ops.aten.argmax.default](args = (%alias_6, 1), kwargs = {}) + %full_like : [num_users=1] = call_function[target=torch.ops.aten.full_like.default](args = (%div, 1), kwargs = {pin_memory: False, memory_format: torch.preserve_format}) + %div_1 : [num_users=1] = call_function[target=torch.ops.aten.div.Tensor](args = (%full_like, %_to_copy), kwargs = {}) + %unsqueeze_1 : [num_users=3] = call_function[target=torch.ops.aten.unsqueeze.default](args = (%label, 1), kwargs = {}) + %ne_3 : [num_users=1] = call_function[target=torch.ops.aten.ne.Scalar](args = (%unsqueeze_1, -100), kwargs = {}) + %scalar_tensor_2 : [num_users=1] = call_function[target=torch.ops.aten.scalar_tensor.default](args = (0,), kwargs = {dtype: torch.int64, layout: torch.strided, device: cpu}) + %where_2 : [num_users=1] = call_function[target=torch.ops.aten.where.self](args = (%ne_3, %unsqueeze_1, %scalar_tensor_2), kwargs = {}) + %full_like_1 : [num_users=1] = call_function[target=torch.ops.aten.full_like.default](args = (%_log_softmax, 0), kwargs = {pin_memory: False, memory_format: torch.preserve_format}) + %scatter : [num_users=1] = call_function[target=torch.ops.aten.scatter.value](args = (%full_like_1, 1, %where_2, -1.0), kwargs = {}) + %ne_4 : [num_users=1] = call_function[target=torch.ops.aten.ne.Scalar](args = (%unsqueeze_1, -100), kwargs = {}) + %scalar_tensor_3 : [num_users=1] = call_function[target=torch.ops.aten.scalar_tensor.default](args = (0,), kwargs = {dtype: torch.float32, layout: torch.strided, device: cpu}) + %where_3 : [num_users=1] = call_function[target=torch.ops.aten.where.self](args = (%ne_4, %div_1, %scalar_tensor_3), kwargs = {}) + %mul : [num_users=2] = call_function[target=torch.ops.aten.mul.Tensor](args = (%scatter, %where_3), kwargs = {}) + %alias_7 : [num_users=1] = call_function[target=torch.ops.aten.alias.default](args = (%alias_3,), kwargs = {}) + %alias_8 : [num_users=1] = call_function[target=torch.ops.aten.alias.default](args = (%alias_7,), kwargs = {}) + %exp : [num_users=1] = call_function[target=torch.ops.aten.exp.default](args = (%alias_8,), kwargs = {}) + %sum_3 : [num_users=1] = call_function[target=torch.ops.aten.sum.dim_IntList](args = (%mul, [1], True), kwargs = {}) + %mul_1 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%exp, %sum_3), kwargs = {}) + %sub : [num_users=3] = call_function[target=torch.ops.aten.sub.Tensor](args = (%mul, %mul_1), kwargs = {}) + %permute_2 : [num_users=1] = call_function[target=torch.ops.aten.permute.default](args = (%permute_1, [1, 0]), kwargs = {}) + %mm : [num_users=1] = call_function[target=torch.ops.aten.mm.default](args = (%sub, %permute_2), kwargs = {}) + %permute_3 : [num_users=1] = call_function[target=torch.ops.aten.permute.default](args = (%sub, [1, 0]), kwargs = {}) + %mm_1 : [num_users=1] = call_function[target=torch.ops.aten.mm.default](args = (%permute_3, %sigmoid), kwargs = {}) + %permute_4 : [num_users=1] = 
call_function[target=torch.ops.aten.permute.default](args = (%mm_1, [1, 0]), kwargs = {}) + %sum_4 : [num_users=1] = call_function[target=torch.ops.aten.sum.dim_IntList](args = (%sub, [0], True), kwargs = {}) + %view : [num_users=1] = call_function[target=torch.ops.aten.view.default](args = (%sum_4, [2]), kwargs = {}) + %permute_5 : [num_users=1] = call_function[target=torch.ops.aten.permute.default](args = (%permute_4, [1, 0]), kwargs = {}) + %alias_9 : [num_users=1] = call_function[target=torch.ops.aten.alias.default](args = (%alias_1,), kwargs = {}) + %alias_10 : [num_users=2] = call_function[target=torch.ops.aten.alias.default](args = (%alias_9,), kwargs = {}) + %sub_1 : [num_users=1] = call_function[target=torch.ops.aten.sub.Tensor](args = (1, %alias_10), kwargs = {}) + %mul_2 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%alias_10, %sub_1), kwargs = {}) + %mul_3 : [num_users=2] = call_function[target=torch.ops.aten.mul.Tensor](args = (%mm, %mul_2), kwargs = {}) + %permute_6 : [num_users=1] = call_function[target=torch.ops.aten.permute.default](args = (%mul_3, [1, 0]), kwargs = {}) + %mm_2 : [num_users=1] = call_function[target=torch.ops.aten.mm.default](args = (%permute_6, %input), kwargs = {}) + %permute_7 : [num_users=1] = call_function[target=torch.ops.aten.permute.default](args = (%mm_2, [1, 0]), kwargs = {}) + %sum_5 : [num_users=1] = call_function[target=torch.ops.aten.sum.dim_IntList](args = (%mul_3, [0], True), kwargs = {}) + %view_1 : [num_users=1] = call_function[target=torch.ops.aten.view.default](args = (%sum_5, [10]), kwargs = {}) + %permute_8 : [num_users=1] = call_function[target=torch.ops.aten.permute.default](args = (%permute_7, [1, 0]), kwargs = {}) + return (div, argmax, permute_8, view_1, permute_5, view) +``` + +Its a lot bigger! We call this the 'joint graph' or the 'forwards backwards graph'. We have explicitly captured the backwards graph +alongside the forward and now our model returns [Loss, Any other user outputs, Gradients]. + +From here we can lower the rest of the way to ExecuTorch +```python +ep = to_edge(ep) + +# After calling to_executorch the weights themselves are also appended to the model outputs. This is to make +# some downstream passes like memory planning a little easier. A couple of hidden utility functions are also +# embedded in the model __et_training_gradients_index_, +# __et_training_parameters_index_, __et_training_fqn_. +# +# These help us partition the huge list of model outputs into meaningful sections as well as assign names to each weight/gradient. +ep = ep.to_executorch() + +with open("xor.pte", "wb") as file: + ep.write_to_file(file) +``` + +### Run the model train script with CMAKE +After exporting the model for training, we can now try learning using CMake. We can build and use the train_xor, which is a sample wrapper for the ExecuTorch Runtime, TrainingModule, and SGD optimizer. We first begin by configuring the CMake build like such: +```bash +# cd to the root of executorch repo +cd executorch + +# Get a clean cmake-out directory +rm -rf cmake-out +mkdir cmake-out + +# Configure cmake +cmake \ + -DCMAKE_INSTALL_PREFIX=cmake-out \ + -DCMAKE_BUILD_TYPE=Release \ + -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ + -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ + -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ + -DEXECUTORCH_BUILD_EXTENSION_TRAINING=ON \ + -DEXECUTORCH_ENABLE_LOGGING=ON \ + -DPYTHON_EXECUTABLE=python \ + -Bcmake-out . 
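+# Note: -DEXECUTORCH_BUILD_EXTENSION_TRAINING=ON is the flag that builds the training extension and the train_xor example used in the next step.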
+``` +Then you can build the runtime components with: + +```bash +cmake --build cmake-out -j9 --target install --config Release +``` + +Now you should be able to find the executable built at `./cmake-out/extension/training/train_xor`. You can run it with the model you generated as follows: +```bash +./cmake-out/extension/training/train_xor --model_path=./xor.pte +``` + +## What is missing? What is next? +A ton! ExecuTorch training is still quite experimental and under heavy active development. What's here currently is more of a technical preview. + +`_export_forward_backward` is not very stable yet and may fail on more complicated model architectures, though we have verified that it works for LoRA with LLMs. + +The ExecuTorch portable operator lib does not yet have full coverage of the ops that might show up in backwards graphs. + +We don't have a way yet to serialize the newly trained weights natively in ExecuTorch (though you can convert them to ATen tensors using extension/aten_util and then serialize them using ATen APIs). + +We plan to add a way to update models in place on-device (will be needed for finetuning). + +We are looking to integrate with many of the existing ExecuTorch delegates/backends, enabling accelerated training. + +And so much more! + +## Help & Improvements +If you have problems or questions, or have suggestions for ways to make +implementation and testing better, please reach out to the PyTorch Edge team or +create an issue on [GitHub](https://www.github.com/pytorch/executorch/issues). diff --git a/extension/training/__init__.py b/extension/training/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/extension/training/examples/XOR/TARGETS b/extension/training/examples/XOR/TARGETS new file mode 100644 index 00000000000..2341af9282f --- /dev/null +++ b/extension/training/examples/XOR/TARGETS @@ -0,0 +1,8 @@ +# Any targets that should be shared between fbcode and xplat must be defined in +# targets.bzl. This file can contain fbcode-only targets. + +load(":targets.bzl", "define_common_targets") + +oncall("executorch") + +define_common_targets() diff --git a/extension/training/examples/XOR/export_model.py b/extension/training/examples/XOR/export_model.py new file mode 100644 index 00000000000..3089cea211e --- /dev/null +++ b/extension/training/examples/XOR/export_model.py @@ -0,0 +1,58 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# pyre-unsafe + +import argparse + +import os + +import torch +from executorch.exir import to_edge + +from executorch.extension.training.examples.XOR.model import Net, TrainingNet +from torch.export import export +from torch.export.experimental import _export_forward_backward + + +def main() -> None: + torch.manual_seed(0) + parser = argparse.ArgumentParser( + prog="export_model", + description="Exports an nn.Module model to ExecuTorch .pte files", + ) + parser.add_argument( + "--outdir", + type=str, + required=True, + help="Path to the directory to write xor.pte files to", + ) + args = parser.parse_args() + + net = TrainingNet(Net()) + x = torch.randn(1, 2) + + # Captures the forward graph. The graph will look similar to the model definition now. + # Will move to export_for_training soon which is the api planned to be supported in the long term. + ep = export(net, (x, torch.ones(1, dtype=torch.int64))) + # Captures the backward graph.
The exported_program now contains the joint forward and backward graph. + ep = _export_forward_backward(ep) + # Lower the graph to edge dialect. + ep = to_edge(ep) + # Lower the graph to executorch. + ep = ep.to_executorch() + + # Write out the .pte file. + os.makedirs(args.outdir, exist_ok=True) + outfile = os.path.join(args.outdir, "xor.pte") + with open(outfile, "wb") as fp: + fp.write( + ep.buffer, + ) + + +if __name__ == "__main__": + main() diff --git a/extension/training/examples/XOR/model.py b/extension/training/examples/XOR/model.py new file mode 100644 index 00000000000..3c84238e7c5 --- /dev/null +++ b/extension/training/examples/XOR/model.py @@ -0,0 +1,34 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# pyre-unsafe + +import torch.nn as nn +from torch.nn import functional as F + + +# Basic Net for XOR +class Net(nn.Module): + def __init__(self): + super().__init__() + self.linear = nn.Linear(2, 10) + self.linear2 = nn.Linear(10, 2) + + def forward(self, x): + return self.linear2(F.sigmoid(self.linear(x))) + + +# On device training requires the loss to be embedded in the model (and be the first output). +# We wrap the original model here and add the loss calculation. This will be the model we export. +class TrainingNet(nn.Module): + def __init__(self, net): + super().__init__() + self.net = net + self.loss = nn.CrossEntropyLoss() + + def forward(self, input, label): + pred = self.net(input) + return self.loss(pred, label), pred.detach().argmax(dim=1) diff --git a/extension/training/examples/XOR/targets.bzl b/extension/training/examples/XOR/targets.bzl new file mode 100644 index 00000000000..ccd7f4bf6f8 --- /dev/null +++ b/extension/training/examples/XOR/targets.bzl @@ -0,0 +1,51 @@ +load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") + +def define_common_targets(): + """Defines targets that should be shared between fbcode and xplat. + + The directory containing this targets.bzl file should also contain both + TARGETS and BUCK files that call this function. + """ + + runtime.cxx_binary( + name = "train_xor", + srcs = ["train.cpp"], + deps = [ + "//executorch/extension/training/module:training_module", + "//executorch/extension/tensor:tensor", + "//executorch/extension/training/optimizer:sgd", + "//executorch/runtime/executor:program", + "//executorch/extension/data_loader:file_data_loader", + "//executorch/kernels/portable:generated_lib", + ], + external_deps = ["gflags"], + define_static_target = True, + ) + + runtime.python_library( + name = "model", + srcs = ["model.py"], + visibility = [], # Private + deps = [ + "//caffe2:torch", + ], + ) + + runtime.python_library( + name = "export_model_lib", + srcs = ["export_model.py"], + visibility = [], + deps = [ + ":model", + "//caffe2:torch", + "//executorch/exir:lib", + ], + ) + + runtime.python_binary( + name = "export_model", + main_module = "executorch.extension.training.examples.XOR.export_model", + deps = [ + ":export_model_lib", + ], + ) diff --git a/extension/training/examples/XOR/train.cpp b/extension/training/examples/XOR/train.cpp new file mode 100644 index 00000000000..bca433fd889 --- /dev/null +++ b/extension/training/examples/XOR/train.cpp @@ -0,0 +1,108 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include +#include +#include +#include +#include + +#pragma clang diagnostic ignored \ + "-Wbraced-scalar-init" // {0} below upsets clang. + +using executorch::extension::FileDataLoader; +using executorch::extension::training::optimizer::SGD; +using executorch::extension::training::optimizer::SGDOptions; +using executorch::runtime::Error; +using executorch::runtime::Result; +DEFINE_string(model_path, "xor.pte", "Model serialized in flatbuffer format."); + +int main(int argc, char** argv) { + gflags::ParseCommandLineFlags(&argc, &argv, true); + if (argc != 1) { + std::string msg = "Extra commandline args: "; + for (int i = 1 /* skip argv[0] (program name) */; i < argc; i++) { + msg += argv[i]; + } + ET_LOG(Error, "%s", msg.c_str()); + return 1; + } + + // Load the model file. + executorch::runtime::Result + loader_res = + executorch::extension::FileDataLoader::from(FLAGS_model_path.c_str()); + if (loader_res.error() != Error::Ok) { + ET_LOG(Error, "Failed to open model file: %s", FLAGS_model_path.c_str()); + return 1; + } + auto loader = std::make_unique( + std::move(loader_res.get())); + + auto mod = executorch::extension::training::TrainingModule(std::move(loader)); + + // Create full data set of input and labels. + std::vector> + data_set; + data_set.push_back( // XOR(1, 1) = 0 + {executorch::extension::make_tensor_ptr({1, 2}, {1, 1}), + executorch::extension::make_tensor_ptr({1}, {0})}); + data_set.push_back( // XOR(0, 0) = 0 + {executorch::extension::make_tensor_ptr({1, 2}, {0, 0}), + executorch::extension::make_tensor_ptr({1}, {0})}); + data_set.push_back( // XOR(1, 0) = 1 + {executorch::extension::make_tensor_ptr({1, 2}, {1, 0}), + executorch::extension::make_tensor_ptr({1}, {1})}); + data_set.push_back( // XOR(0, 1) = 1 + {executorch::extension::make_tensor_ptr({1, 2}, {0, 1}), + executorch::extension::make_tensor_ptr({1}, {1})}); + + // Create optimizer. + // Get the params and names + auto param_res = mod.named_parameters("forward"); + if (param_res.error() != Error::Ok) { + ET_LOG(Error, "Failed to get named parameters"); + return 1; + } + + SGDOptions options{0.1}; + SGD optimizer(param_res.get(), options); + + // Randomness to sample the data set. + std::default_random_engine URBG{std::random_device{}()}; + std::uniform_int_distribution dist{ + 0, static_cast(data_set.size()) - 1}; + + // Train the model. 
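+  // Each iteration samples one (input, label) pair at random, runs the joint +  // forward/backward graph to get the loss and gradients, and then applies an +  // SGD update via optimizer.step(); progress is logged every 500 steps.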
+ size_t num_epochs = 5000; + for (int i = 0; i < num_epochs; i++) { + int index = dist(URBG); + auto& data = data_set[index]; + const auto& results = + mod.execute_forward_backward("forward", {*data.first, *data.second}); + if (results.error() != Error::Ok) { + ET_LOG(Error, "Failed to execute forward_backward"); + return 1; + } + if (i % 500 == 0 || i == num_epochs - 1) { + ET_LOG( + Info, + "Step %d, Loss %f, Input [%.0f, %.0f], Prediction %ld, Label %ld", + i, + results.get()[0].toTensor().const_data_ptr()[0], + data.first->const_data_ptr()[0], + data.first->const_data_ptr()[1], + results.get()[1].toTensor().const_data_ptr()[0], + data.second->const_data_ptr()[0]); + } + optimizer.step(mod.named_gradients("forward").get()); + } +} diff --git a/extension/training/optimizer/sgd.cpp b/extension/training/optimizer/sgd.cpp index fd63722b4f7..383383abc3e 100644 --- a/extension/training/optimizer/sgd.cpp +++ b/extension/training/optimizer/sgd.cpp @@ -7,21 +7,49 @@ */ #include -#include // Declares the operator #include -#include using exec_aten::Tensor; using exec_aten::TensorImpl; using ::executorch::runtime::Error; -using ::executorch::runtime::KernelRuntimeContext; namespace executorch { namespace extension { namespace training { namespace optimizer { +namespace { +void add_out_hack( + const Tensor& a, + const Tensor& b, + const double alpha, + Tensor& out) { + auto a_ptr = a.const_data_ptr(); + auto b_ptr = b.const_data_ptr(); + auto out_ptr = out.mutable_data_ptr(); + for (size_t i = 0; i < a.numel(); ++i) { + out_ptr[i] = a_ptr[i] + b_ptr[i] * alpha; + } +} + +void mul_out_hack(const Tensor& a, const double alpha, Tensor& out) { + auto a_ptr = a.const_data_ptr(); + auto out_ptr = out.mutable_data_ptr(); + for (size_t i = 0; i < a.numel(); ++i) { + out_ptr[i] = a_ptr[i] * alpha; + } +} + +void clone_out_hack(const Tensor& a, Tensor& out) { + auto a_ptr = a.const_data_ptr(); + auto out_ptr = out.mutable_data_ptr(); + for (size_t i = 0; i < a.numel(); ++i) { + out_ptr[i] = a_ptr[i]; + } +} +} // namespace + bool SGDParamGroup::has_options() const { return options_ != nullptr; } @@ -55,7 +83,6 @@ void SGD::add_param_group(const SGDParamGroup& param_group) { Error SGD::step(const std::map& named_gradients) { - KernelRuntimeContext context; for (auto& group : param_groups_) { auto& options = static_cast(group.options()); auto weight_decay = options.weight_decay(); @@ -73,10 +100,7 @@ Error SGD::step(const std::map& auto p = param_iter->second; if (weight_decay != 0) { // uses weight_decay specified and adds it to the gradient - torch::executor::aten::add_outf(context, d_p, p, weight_decay, d_p); - if (context.failure_state() != Error::Ok) { - return context.failure_state(); - } + add_out_hack(d_p, p, weight_decay, d_p); } if (momentum != 0) { Tensor buf(nullptr); @@ -100,11 +124,7 @@ Error SGD::step(const std::map& const_cast(d_p.dim_order().data())); buf = Tensor(buf_impl); #endif - torch::executor::aten::clone_outf( - context, d_p, exec_aten::MemoryFormat::Contiguous, buf); - if (context.failure_state() != Error::Ok) { - return context.failure_state(); - } + clone_out_hack(d_p, buf); // save the state of the momentum buffer to be reused in later // epochs @@ -115,31 +135,18 @@ Error SGD::step(const std::map& .momentum_buffer(); // update the momentum buffer and apply dampening - torch::executor::aten::mul_outf(context, buf, momentum, buf); - if (context.failure_state() != Error::Ok) { - return context.failure_state(); - } - torch::executor::aten::add_outf( - context, buf, d_p, 1 - 
dampening, buf); - if (context.failure_state() != Error::Ok) { - return context.failure_state(); - } + mul_out_hack(buf, momentum, buf); + add_out_hack(buf, d_p, 1 - dampening, buf); } if (nesterov) { // apply nesterov momentum - torch::executor::aten::add_outf(context, d_p, buf, momentum, d_p); - if (context.failure_state() != Error::Ok) { - return context.failure_state(); - } + add_out_hack(d_p, buf, momentum, d_p); } else { d_p = buf; } } // update the parameter using the gradient and learning rate - torch::executor::aten::add_outf(context, p, d_p, -1 * options.lr(), p); - if (context.failure_state() != Error::Ok) { - return context.failure_state(); - } + add_out_hack(p, d_p, -1 * options.lr(), p); } } } diff --git a/extension/training/optimizer/sgd.h b/extension/training/optimizer/sgd.h index 3a85f85c77b..00e84b1348c 100644 --- a/extension/training/optimizer/sgd.h +++ b/extension/training/optimizer/sgd.h @@ -32,7 +32,7 @@ namespace optimizer { * SGD optimizer state. This keeps track of the state of a given parameter to * be used in later epochs. */ -class SGDParamState { +class ET_EXPERIMENTAL SGDParamState { public: /** * Constructs a new SGD param state. @@ -55,7 +55,7 @@ class SGDParamState { * SGD optimizer options. This contains options for performing training on a * param group, such as the learning rate. */ -class SGDOptions { +class ET_EXPERIMENTAL SGDOptions { public: /** * Constructs a new SGD optimizer options. @@ -128,7 +128,7 @@ class SGDOptions { * SGD optimizer param group. This contains the parameters and * the SGDOptions associated to it. */ -class SGDParamGroup { +class ET_EXPERIMENTAL SGDParamGroup { public: // NOTE: In order to store `SGDParamGroup` in a `std::vector`, it has // to be copy-constructible. @@ -176,7 +176,7 @@ class SGDParamGroup { * SGD optimizer class. This is responsible for performing the optimization * step. 
*/ -class SGD { +class ET_EXPERIMENTAL SGD { public: explicit SGD( const std::vector& param_groups, diff --git a/extension/training/optimizer/targets.bzl b/extension/training/optimizer/targets.bzl index 69682feaee4..3b00ae0bfdc 100644 --- a/extension/training/optimizer/targets.bzl +++ b/extension/training/optimizer/targets.bzl @@ -10,20 +10,19 @@ def define_common_targets(): for aten_mode in (True, False): aten_suffix = "_aten" if aten_mode else "" - if aten_mode: - kernel_deps = [ - "//executorch/kernels/aten:generated_lib", - "//executorch/kernels/aten:generated_lib_headers", - "//executorch/kernels/test:function_header_wrapper_aten", - ] - else: - kernel_deps = [ - "//executorch/kernels/portable/cpu:op_add", - "//executorch/kernels/portable/cpu:op_mul", - "//executorch/kernels/portable/cpu:op_clone", - "//executorch/kernels/portable:generated_lib_headers", - "//executorch/kernels/test:function_header_wrapper_portable", - ] + # if aten_mode: + # kernel_deps = [ + # "//executorch/kernels/aten:generated_lib", + # "//executorch/kernels/aten:generated_lib_headers", + # "//executorch/kernels/test:function_header_wrapper_aten", + # ] + # else: + # kernel_deps = [ + # "//executorch/kernels/portable/cpu:op_add", + # "//executorch/kernels/portable/cpu:op_mul", + # "//executorch/kernels/portable/cpu:op_clone", + # "//executorch/kernels/portable:generated_lib_headers", + # ] runtime.cxx_library( name = "sgd" + aten_suffix, @@ -34,9 +33,9 @@ def define_common_targets(): "sgd.h", ], exported_deps = [ - "//executorch/runtime/kernel:kernel_runtime_context" + aten_suffix, + "//executorch/runtime/core:core", "//executorch/runtime/core/exec_aten:lib" + aten_suffix, - ] + kernel_deps, + ], # + kernel_deps, visibility = [ "@EXECUTORCH_CLIENTS", ], diff --git a/install_requirements.py b/install_requirements.py index 64243ec6943..5c6777e783d 100644 --- a/install_requirements.py +++ b/install_requirements.py @@ -94,14 +94,14 @@ def python_is_compatible(): # NOTE: If a newly-fetched version of the executorch repo changes the value of # NIGHTLY_VERSION, you should re-run this script to install the necessary # package versions. -NIGHTLY_VERSION = "dev20240901" +NIGHTLY_VERSION = "dev20241007" # The pip repository that hosts nightly torch packages. TORCH_NIGHTLY_URL = "https://download.pytorch.org/whl/nightly/cpu" # pip packages needed by exir. EXIR_REQUIREMENTS = [ - f"torch==2.5.0.{NIGHTLY_VERSION}", + f"torch==2.6.0.{NIGHTLY_VERSION}", f"torchvision==0.20.0.{NIGHTLY_VERSION}", # For testing. "typing-extensions", ] diff --git a/kernels/README.md b/kernels/README.md index 026778cc287..68b0ce222b3 100644 --- a/kernels/README.md +++ b/kernels/README.md @@ -356,7 +356,7 @@ cmake . \ -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ - -DEXECUTORCH_BUILD_SDK=ON \ + -DEXECUTORCH_BUILD_DEVTOOLS=ON \ -DEXECUTORCH_BUILD_VULKAN=OFF \ -DEXECUTORCH_BUILD_XNNPACK=ON \ -Bcmake-out diff --git a/kernels/optimized/CMakeLists.txt b/kernels/optimized/CMakeLists.txt index 858e51160e5..abdeeb73453 100644 --- a/kernels/optimized/CMakeLists.txt +++ b/kernels/optimized/CMakeLists.txt @@ -42,7 +42,9 @@ endif() # Build cpublas. 
list(TRANSFORM _optimized_cpublas__srcs PREPEND "${EXECUTORCH_ROOT}/") add_library(cpublas STATIC ${_optimized_cpublas__srcs}) -target_link_libraries(cpublas PRIVATE executorch_no_prim_ops eigen_blas) +target_link_libraries( + cpublas PRIVATE executorch_core eigen_blas extension_threadpool +) target_compile_options(cpublas PUBLIC ${_common_compile_options}) # Generate C++ bindings to register kernels into both PyTorch (for AOT) and @@ -58,7 +60,9 @@ message("Generated files ${gen_command_sources}") list(TRANSFORM _optimized_kernels__srcs PREPEND "${EXECUTORCH_ROOT}/") add_library(optimized_kernels ${_optimized_kernels__srcs}) -target_link_libraries(optimized_kernels PRIVATE executorch_no_prim_ops cpublas) +target_link_libraries( + optimized_kernels PRIVATE executorch_core cpublas extension_threadpool +) target_compile_options(optimized_kernels PUBLIC ${_common_compile_options}) # Build a library for _optimized_kernels_srcs # diff --git a/kernels/optimized/blas/BlasKernel.cpp b/kernels/optimized/blas/BlasKernel.cpp index 7202c8cd472..a3e2172504d 100644 --- a/kernels/optimized/blas/BlasKernel.cpp +++ b/kernels/optimized/blas/BlasKernel.cpp @@ -10,6 +10,7 @@ #ifdef __aarch64__ #include +#include #endif using torch::executor::BFloat16; @@ -23,7 +24,7 @@ static inline float32x4_t f32_fma(float32x4_t a, float32x4_t b, float32x4_t c) { return vfmaq_f32(a, b, c); #else return vaddq_f32(a, vmulq_f32(b, c)); -#endif +#endif // __ARM_FEATURE_FMA } // The below reduce overload and fp16_dot_with_fp32_arith are adapted @@ -73,13 +74,32 @@ f32_fma_bf16(float32x4_t a, uint16x4_t b, uint16x4_t c) { return f32_fma(a, to_bfloat16(b), to_bfloat16(c)); } -static ET_INLINE void dot_with_fp32_arith_main_inner_loop( +#define ET_TARGET_ARM_BF16_ATTRIBUTE \ + __attribute__((target("arch=armv8.2-a+bf16"))) +ET_TARGET_ARM_BF16_ATTRIBUTE static ET_INLINE float32x4_t +f32_dot_bf16(float32x4_t a, bfloat16x8_t b, bfloat16x8_t c) { + return vbfdotq_f32(a, b, c); +} + +ET_TARGET_ARM_BF16_ATTRIBUTE static ET_INLINE void +dot_with_fp32_arith_main_inner_loop_bfdot( + const BFloat16* vec1, + const BFloat16* vec2, + float32x4_t sum[kF32RegistersPerIteration], + int registerPairIndex) { + const bfloat16x8_t temp_vec1 = vld1q_bf16(reinterpret_cast( + &vec1[registerPairIndex * 2 * kF32ElementsPerRegister])); + const bfloat16x8_t temp_vec2 = vld1q_bf16(reinterpret_cast( + &vec2[registerPairIndex * 2 * kF32ElementsPerRegister])); + sum[registerPairIndex] = + f32_dot_bf16(sum[registerPairIndex], temp_vec1, temp_vec2); +} + +static ET_INLINE void dot_with_fp32_arith_main_inner_loop_no_bfdot( const BFloat16* vec1, const BFloat16* vec2, float32x4_t sum[kF32RegistersPerIteration], int registerPairIndex) { - // TODO: detect intrinsic availability, use them if they're available. - // __ARM_FEATURE_BF16 Load a pair of f32 registers at a time. 
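The bfdot variant introduced in this hunk is compiled alongside the existing non-bfdot path (via a template<bool useBfdot> parameter) and selected at runtime by cpuinfo_has_arm_bf16() in bf16_dot_with_fp32_arith below. A compilable sketch of that dispatch pattern, using plain scalar math and a placeholder has_bf16_support() in place of the NEON intrinsics and the cpuinfo call (both branches compute the same result here; in the real kernel only the instructions differ):

#include <cstdio>

template <bool useBfdot>
float dot(const float* a, const float* b, int n) {
  float sum = 0.f;
  for (int i = 0; i < n; ++i) {
    if constexpr (useBfdot) {
      sum += a[i] * b[i];  // imagine the vbfdotq_f32 fast path here
    } else {
      sum += a[i] * b[i];  // non-bfdot fallback
    }
  }
  return sum;
}

// Stand-in for cpuinfo_has_arm_bf16(); always false in this sketch.
bool has_bf16_support() { return false; }

float dot_dispatch(const float* a, const float* b, int n) {
  return has_bf16_support() ? dot<true>(a, b, n) : dot<false>(a, b, n);
}

int main() {
  const float a[] = {1.f, 2.f, 3.f};
  const float b[] = {4.f, 5.f, 6.f};
  std::printf("%f\n", dot_dispatch(a, b, 3));  // 32.0
  return 0;
}
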
const uint16x8_t temp_vec1 = vld1q_u16(reinterpret_cast( &vec1[registerPairIndex * 2 * kF32ElementsPerRegister])); const uint16x8_t temp_vec2 = vld1q_u16(reinterpret_cast( @@ -95,6 +115,22 @@ static ET_INLINE void dot_with_fp32_arith_main_inner_loop( vget_high_u16(temp_vec2)); } +template +ET_TARGET_ARM_BF16_ATTRIBUTE static ET_INLINE void +dot_with_fp32_arith_main_inner_loop( + const BFloat16* vec1, + const BFloat16* vec2, + float32x4_t sum[kF32RegistersPerIteration], + int registerPairIndex) { + if constexpr (useBfdot) { + dot_with_fp32_arith_main_inner_loop_bfdot( + vec1, vec2, sum, registerPairIndex); + } else { + dot_with_fp32_arith_main_inner_loop_no_bfdot( + vec1, vec2, sum, registerPairIndex); + } +} + static ET_INLINE void dot_with_fp32_arith_vectorized_tail_inner_loop( const BFloat16* vec1, const BFloat16* vec2, @@ -107,17 +143,40 @@ static ET_INLINE void dot_with_fp32_arith_vectorized_tail_inner_loop( *tailSum = f32_fma_bf16(*tailSum, temp_vec1, temp_vec2); } -template -float dot_with_fp32_arith(const T* vec1, const T* vec2, int64_t len) { +namespace { +template +struct ForcedUnrollTargetBFloat16 { + template + ET_TARGET_ARM_BF16_ATTRIBUTE ET_INLINE void operator()(const Func& f) const { + ForcedUnrollTargetBFloat16{}(f); + f(n - 1); + } +}; + +template <> +struct ForcedUnrollTargetBFloat16<1> { + template + ET_TARGET_ARM_BF16_ATTRIBUTE ET_INLINE void operator()(const Func& f) const { + f(0); + } +}; + +} // namespace + +template +ET_TARGET_ARM_BF16_ATTRIBUTE float +dot_with_fp32_arith(const T* vec1, const T* vec2, int64_t len) { float32x4_t sum[kF32RegistersPerIteration] = {vdupq_n_f32(0)}; const auto len_aligned = len & ~(kF32ElementsPerIteration - 1); for (int j = 0; j < len_aligned; j += kF32ElementsPerIteration) { const auto* vec1_ = vec1 + j; const auto* vec2_ = vec2 + j; - utils::ForcedUnroll{}( - [vec1_, vec2_, &sum](auto k) ET_INLINE_ATTRIBUTE { - dot_with_fp32_arith_main_inner_loop(vec1_, vec2_, sum, k); - }); + ForcedUnrollTargetBFloat16{}( + [vec1_, vec2_, &sum](auto k) + ET_INLINE_ATTRIBUTE ET_TARGET_ARM_BF16_ATTRIBUTE { + dot_with_fp32_arith_main_inner_loop( + vec1_, vec2_, sum, k); + }); } auto reducedSum = reduce(sum); @@ -143,9 +202,13 @@ float bf16_dot_with_fp32_arith( const BFloat16* vec1, const BFloat16* vec2, int64_t len) { - return dot_with_fp32_arith(vec1, vec2, len); + if (cpuinfo_has_arm_bf16()) { + return dot_with_fp32_arith(vec1, vec2, len); + } else { + return dot_with_fp32_arith(vec1, vec2, len); + } } -#endif +#endif // __aarch64__ } // namespace internal } // namespace cpublas } // namespace executorch diff --git a/kernels/optimized/blas/BlasKernel.h b/kernels/optimized/blas/BlasKernel.h index f594d1748e7..c2b03cfebdd 100644 --- a/kernels/optimized/blas/BlasKernel.h +++ b/kernels/optimized/blas/BlasKernel.h @@ -11,16 +11,11 @@ #include #include +#include #include #include -namespace torch { -namespace executor { -struct BFloat16; -} // namespace executor -} // namespace torch - namespace executorch { namespace cpublas { @@ -177,34 +172,37 @@ inline void gemm_transa_( torch::executor::BFloat16 beta, torch::executor::BFloat16 *c, int64_t ldc) { // c = alpha * (a.T @ b) + beta * c -// parallel_for(0, m, 1, [&](int64_t begin, int64_t end) { if (alpha == 1 && beta == 0) { - const auto *a_ = a; - for (int i = 0; i < m; ++i) { + executorch::extension::parallel_for(0, m, 1, [&](int64_t begin, int64_t end) { + const auto *a_ = a + begin * lda; + for (int i = begin; i < end; ++i) { + const auto *b_ = b; + for (int j = 0; j < n; ++j) { + const auto dot = 
internal::bf16_dot_with_fp32_arith(a_, b_, k); + b_ += ldb; + c[j*ldc+i] = dot; + } + a_ += lda; + } + }); + return; + } + executorch::extension::parallel_for(0, m, 1, [&](int64_t begin, int64_t end) { + const auto *a_ = a + begin * lda; + for (int i = begin; i < end; ++i) { const auto *b_ = b; for (int j = 0; j < n; ++j) { const auto dot = internal::bf16_dot_with_fp32_arith(a_, b_, k); b_ += ldb; - c[j*ldc+i] = dot; + if (beta == 0) { + c[j*ldc+i] = alpha*dot; + } else { + c[j*ldc+i] = beta*c[j*ldc+i]+alpha*dot; + } } a_ += lda; } - return; - } - const auto *a_ = a; - for (int i = 0; i < m; ++i) { - const auto *b_ = b; - for (int j = 0; j < n; ++j) { - const auto dot = internal::bf16_dot_with_fp32_arith(a_, b_, k); - b_ += ldb; - if (beta == 0) { - c[j*ldc+i] = alpha*dot; - } else { - c[j*ldc+i] = beta*c[j*ldc+i]+alpha*dot; - } - } - a_ += lda; - } + }); } #endif diff --git a/kernels/optimized/blas/CPUBlas.cpp b/kernels/optimized/blas/CPUBlas.cpp index 99003f8f0ea..d30064b953c 100644 --- a/kernels/optimized/blas/CPUBlas.cpp +++ b/kernels/optimized/blas/CPUBlas.cpp @@ -24,7 +24,8 @@ extern "C" void sgemm_(char *transa, char *transb, int *m, int *n, int *k, float namespace executorch { namespace cpublas { -// using Half = exec_aten::Half; +using exec_aten::BFloat16; +using exec_aten::Half; #ifdef ET_BUILD_WITH_BLAS #ifdef ET_BUILD_FOR_APPLE diff --git a/kernels/optimized/blas/CPUBlas.h b/kernels/optimized/blas/CPUBlas.h index 71e50601238..89f0992e30f 100644 --- a/kernels/optimized/blas/CPUBlas.h +++ b/kernels/optimized/blas/CPUBlas.h @@ -17,9 +17,6 @@ namespace executorch { namespace cpublas { -using BFloat16 = torch::executor::BFloat16; -using Half = torch::executor::Half; - enum class TransposeType { NoTranspose, Transpose, @@ -100,20 +97,20 @@ void gemm( void gemm( TransposeType transa, TransposeType transb, int64_t m, int64_t n, int64_t k, - const Half alpha, - const Half *a, int64_t lda, - const Half *b, int64_t ldb, - const Half beta, - Half *c, int64_t ldc); + const exec_aten::Half alpha, + const exec_aten::Half *a, int64_t lda, + const exec_aten::Half *b, int64_t ldb, + const exec_aten::Half beta, + exec_aten::Half *c, int64_t ldc); void gemm( TransposeType transa, TransposeType transb, int64_t m, int64_t n, int64_t k, - const BFloat16 alpha, - const BFloat16 *a, int64_t lda, - const BFloat16 *b, int64_t ldb, - const BFloat16 beta, - BFloat16 *c, int64_t ldc); + const exec_aten::BFloat16 alpha, + const exec_aten::BFloat16 *a, int64_t lda, + const exec_aten::BFloat16 *b, int64_t ldb, + const exec_aten::BFloat16 beta, + exec_aten::BFloat16 *c, int64_t ldc); // clang-format on // clang-format off diff --git a/kernels/optimized/cpu/op_div.cpp b/kernels/optimized/cpu/op_div.cpp index 108d0fa2b3e..4d7b8efe9e3 100644 --- a/kernels/optimized/cpu/op_div.cpp +++ b/kernels/optimized/cpu/op_div.cpp @@ -84,8 +84,11 @@ Tensor& opt_div_out( tensor->const_data_ptr(), out.numel()); } else { + Vec inv_scalar_casted_vec(CTYPE(1) / scalar_casted); executorch::vec::map( - [scalar_casted](Vec x) { return x / Vec(scalar_casted); }, + [inv_scalar_casted_vec](Vec x) { + return x * inv_scalar_casted_vec; + }, out.mutable_data_ptr(), tensor->const_data_ptr(), out.numel()); @@ -220,8 +223,9 @@ Tensor& opt_div_scalar_out( CTYPE b_casted = static_cast(b_val); using Vec = executorch::vec::Vectorized; + Vec inv_b_casted_vec(CTYPE(1) / b_casted); executorch::vec::map( - [b_casted](Vec x) { return x / Vec(b_casted); }, + [inv_b_casted_vec](Vec x) { return x * inv_b_casted_vec; }, out.mutable_data_ptr(), 
a.const_data_ptr(), out.numel()); @@ -239,6 +243,7 @@ Tensor& opt_div_scalar_out( CTYPE_B b_val; ET_EXTRACT_SCALAR(b, b_val); CTYPE_IN b_casted = static_cast(b_val); + CTYPE_IN inv_b_casted = CTYPE_IN(1) / b_casted; const size_t n = a.numel(); const CTYPE_A* a_data = a.const_data_ptr(); @@ -246,7 +251,8 @@ Tensor& opt_div_scalar_out( out.mutable_data_ptr(); for (auto i = 0; i < n; ++i) { out_data[i] = static_cast( - static_cast(a_data[i]) / b_casted); + static_cast(a_data[i]) * + inv_b_casted); } }); }); diff --git a/kernels/optimized/cpu/op_exp.cpp b/kernels/optimized/cpu/op_exp.cpp index b9d22a84edc..8c234d3d1da 100644 --- a/kernels/optimized/cpu/op_exp.cpp +++ b/kernels/optimized/cpu/op_exp.cpp @@ -26,9 +26,9 @@ template < typename CTYPE_IN, typename CTYPE_OUT, typename std::enable_if< - std::is_same::value && - !std::is_same::value && - !std::is_same::value, + std::is_same_v && + !std::is_same_v && + !std::is_same_v, int>::type = 0> void exp_data( const CTYPE_IN* in_data, @@ -46,9 +46,11 @@ template < typename CTYPE_IN, typename CTYPE_OUT, typename std::enable_if< - !std::is_same::value || - std::is_same::value || - std::is_same::value, + !std::is_same_v || + std::is_same_v || + std::is_same_v || + std::is_same_v || + std::is_same_v, int>::type = 0> void exp_data( const CTYPE_IN* in_data, @@ -76,13 +78,14 @@ Tensor& opt_exp_out(KernelRuntimeContext& ctx, const Tensor& in, Tensor& out) { ET_KERNEL_CHECK(ctx, tensor_is_floating_type(out), InvalidArgument, out); - ET_SWITCH_REALHB_TYPES(in.scalar_type(), ctx, "exp.out", CTYPE_IN, [&] { - ET_SWITCH_FLOATH_TYPES(out.scalar_type(), ctx, "exp.out", CTYPE_OUT, [&] { - exp_data( - in.const_data_ptr(), - in.numel(), - out.mutable_data_ptr()); - }); + ET_SWITCH_REALHBBF16_TYPES(in.scalar_type(), ctx, "exp.out", CTYPE_IN, [&] { + ET_SWITCH_FLOATHBF16_TYPES( + out.scalar_type(), ctx, "exp.out", CTYPE_OUT, [&] { + exp_data( + in.const_data_ptr(), + in.numel(), + out.mutable_data_ptr()); + }); }); return out; diff --git a/kernels/optimized/cpu/op_sub.cpp b/kernels/optimized/cpu/op_sub.cpp index ce82e49cc27..51ff4fbd571 100644 --- a/kernels/optimized/cpu/op_sub.cpp +++ b/kernels/optimized/cpu/op_sub.cpp @@ -134,8 +134,8 @@ Tensor& opt_sub_out( } }); }); + return out; } - return out; } auto selected_optimized_path = select_optimized_path(a, b, out); diff --git a/kernels/optimized/lib_defs.bzl b/kernels/optimized/lib_defs.bzl index 16ce446df40..367c23f0813 100644 --- a/kernels/optimized/lib_defs.bzl +++ b/kernels/optimized/lib_defs.bzl @@ -1,5 +1,6 @@ load("@fbsource//tools/build_defs:default_platform_defs.bzl", "DEVSERVER_PLATFORM_REGEX") load("@fbsource//tools/build_defs:fb_native_wrapper.bzl", "fb_native") +load("@fbsource//xplat/executorch/backends/xnnpack/third-party:third_party_libs.bzl", "third_party_dep") load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") # Because vec exists as a collection of header files, compile and preprocessor @@ -109,6 +110,8 @@ def define_libs(): ], ) + LIBBLAS_DEPS = [third_party_dep("cpuinfo")] + for libblas_name, mkl_dep in [("libblas", "fbsource//third-party/mkl:mkl_lp64_omp"), ("libblas_mkl_noomp", "fbsource//third-party/mkl:mkl")]: runtime.cxx_library( name = libblas_name, @@ -155,8 +158,9 @@ def define_libs(): deps = select({ ":linux-x86_64": [mkl_dep] if not runtime.is_oss else [], "DEFAULT": [], - }), + }) + LIBBLAS_DEPS, exported_deps = [ + "//executorch/extension/parallel:thread_parallel", "//executorch/kernels/optimized:libutils", "//executorch/runtime/core/exec_aten:lib", ], diff --git 
a/kernels/optimized/vec/vec_base.h b/kernels/optimized/vec/vec_base.h index 7e1fb4eb244..4a386f9e8ce 100644 --- a/kernels/optimized/vec/vec_base.h +++ b/kernels/optimized/vec/vec_base.h @@ -3,6 +3,7 @@ // @nolint PATTERNLINT is required for std::equal_to, etc. #include +#include #include #include #include diff --git a/kernels/portable/cpu/op_acos.cpp b/kernels/portable/cpu/op_acos.cpp index 46f9f965bb9..dac3b1546f3 100644 --- a/kernels/portable/cpu/op_acos.cpp +++ b/kernels/portable/cpu/op_acos.cpp @@ -15,7 +15,8 @@ namespace executor { namespace native { Tensor& acos_out(KernelRuntimeContext& ctx, const Tensor& in, Tensor& out) { - return internal::unary_ufunc_realhb_to_floath(std::acos, ctx, in, out); + return internal::unary_ufunc_realhbbf16_to_floathbf16( + std::acos, ctx, in, out); } } // namespace native diff --git a/kernels/portable/cpu/op_acosh.cpp b/kernels/portable/cpu/op_acosh.cpp index d1d3b0aa232..77f7edf4c5d 100644 --- a/kernels/portable/cpu/op_acosh.cpp +++ b/kernels/portable/cpu/op_acosh.cpp @@ -15,7 +15,8 @@ namespace executor { namespace native { Tensor& acosh_out(KernelRuntimeContext& ctx, const Tensor& in, Tensor& out) { - return internal::unary_ufunc_realhb_to_floath(std::acosh, ctx, in, out); + return internal::unary_ufunc_realhbbf16_to_floathbf16( + std::acosh, ctx, in, out); } } // namespace native diff --git a/kernels/portable/cpu/op_asin.cpp b/kernels/portable/cpu/op_asin.cpp index 0200009b330..6affa6e4122 100644 --- a/kernels/portable/cpu/op_asin.cpp +++ b/kernels/portable/cpu/op_asin.cpp @@ -15,7 +15,8 @@ namespace executor { namespace native { Tensor& asin_out(KernelRuntimeContext& ctx, const Tensor& in, Tensor& out) { - return internal::unary_ufunc_realhb_to_floath(std::asin, ctx, in, out); + return internal::unary_ufunc_realhbbf16_to_floathbf16( + std::asin, ctx, in, out); } } // namespace native diff --git a/kernels/portable/cpu/op_asinh.cpp b/kernels/portable/cpu/op_asinh.cpp index 17336618a19..bce8dcf6d5a 100644 --- a/kernels/portable/cpu/op_asinh.cpp +++ b/kernels/portable/cpu/op_asinh.cpp @@ -15,7 +15,8 @@ namespace executor { namespace native { Tensor& asinh_out(KernelRuntimeContext& ctx, const Tensor& in, Tensor& out) { - return internal::unary_ufunc_realhb_to_floath(std::asinh, ctx, in, out); + return internal::unary_ufunc_realhbbf16_to_floathbf16( + std::asinh, ctx, in, out); } } // namespace native diff --git a/kernels/portable/cpu/op_atan.cpp b/kernels/portable/cpu/op_atan.cpp index 0c980c6a785..23549627a3b 100644 --- a/kernels/portable/cpu/op_atan.cpp +++ b/kernels/portable/cpu/op_atan.cpp @@ -15,7 +15,8 @@ namespace executor { namespace native { Tensor& atan_out(KernelRuntimeContext& ctx, const Tensor& in, Tensor& out) { - return internal::unary_ufunc_realhb_to_floath(std::atan, ctx, in, out); + return internal::unary_ufunc_realhbbf16_to_floathbf16( + std::atan, ctx, in, out); } } // namespace native diff --git a/kernels/portable/cpu/op_atanh.cpp b/kernels/portable/cpu/op_atanh.cpp index 2c13fb6efd8..13e6e8ca141 100644 --- a/kernels/portable/cpu/op_atanh.cpp +++ b/kernels/portable/cpu/op_atanh.cpp @@ -15,7 +15,8 @@ namespace executor { namespace native { Tensor& atanh_out(KernelRuntimeContext& ctx, const Tensor& in, Tensor& out) { - return internal::unary_ufunc_realhb_to_floath(std::atanh, ctx, in, out); + return internal::unary_ufunc_realhbbf16_to_floathbf16( + std::atanh, ctx, in, out); } } // namespace native diff --git a/kernels/portable/cpu/op_clamp.cpp b/kernels/portable/cpu/op_clamp.cpp index ec34fa9bd35..37c5d0f6c21 100644 --- 
a/kernels/portable/cpu/op_clamp.cpp +++ b/kernels/portable/cpu/op_clamp.cpp @@ -212,39 +212,31 @@ Tensor& clamp_tensor_out( ET_KERNEL_CHECK(ctx, canCast(common_type, out_type), InvalidArgument, out); - constexpr auto name = "clamp.Tensor_out"; - - ET_SWITCH_REALHB_TYPES(in_type, ctx, name, CTYPE_IN, [&]() { - ET_SWITCH_REALHB_TYPES(min_type, ctx, name, CTYPE_MIN, [&]() { - ET_SWITCH_REALHB_TYPES(max_type, ctx, name, CTYPE_MAX, [&]() { - ET_SWITCH_REALHB_TYPES(out_type, ctx, name, CTYPE_OUT, [&]() { - apply_ternary_elementwise_fn< - CTYPE_IN, - CTYPE_MIN, - CTYPE_MAX, - CTYPE_OUT>( - [has_min, has_max]( - const CTYPE_IN val_in, - const CTYPE_MIN val_min, - const CTYPE_MAX val_max) { - CTYPE_OUT val_out = static_cast(val_in); - if (has_min) { - val_out = utils::max_override( - val_out, static_cast(val_min)); - } - if (has_max) { - val_out = utils::min_override( - val_out, static_cast(val_max)); - } - return val_out; - }, - in, - min, - max, - out); - }); - }); - }); + static constexpr const char op_name[] = "clamp.Tensor_out"; + + ET_SWITCH_REALHB_TYPES(common_type, ctx, op_name, CTYPE_COMMON, [&]() { + apply_ternary_elementwise_fn( + [has_min, has_max]( + const CTYPE_COMMON val_in, + const CTYPE_COMMON val_min, + const CTYPE_COMMON val_max) { + CTYPE_COMMON val_out = val_in; + if (has_min) { + val_out = utils::max_override(val_out, val_min); + } + if (has_max) { + val_out = utils::min_override(val_out, val_max); + } + return val_out; + }, + in, + SupportedTensorDtypes::REALHBBF16, + min, + SupportedTensorDtypes::REALHBBF16, + max, + SupportedTensorDtypes::REALHBBF16, + out, + SupportedTensorDtypes::REALHBBF16); }); return out; diff --git a/kernels/portable/cpu/op_cos.cpp b/kernels/portable/cpu/op_cos.cpp index 56cf9236df2..e536060d162 100644 --- a/kernels/portable/cpu/op_cos.cpp +++ b/kernels/portable/cpu/op_cos.cpp @@ -15,7 +15,7 @@ namespace executor { namespace native { Tensor& cos_out(KernelRuntimeContext& ctx, const Tensor& in, Tensor& out) { - return internal::unary_ufunc_realhb_to_floath(std::cos, ctx, in, out); + return internal::unary_ufunc_realhbbf16_to_floathbf16(std::cos, ctx, in, out); } } // namespace native diff --git a/kernels/portable/cpu/op_cosh.cpp b/kernels/portable/cpu/op_cosh.cpp index 4f4e263286b..e622bbe6fcd 100644 --- a/kernels/portable/cpu/op_cosh.cpp +++ b/kernels/portable/cpu/op_cosh.cpp @@ -15,7 +15,8 @@ namespace executor { namespace native { Tensor& cosh_out(KernelRuntimeContext& ctx, const Tensor& in, Tensor& out) { - return internal::unary_ufunc_realhb_to_floath(std::cosh, ctx, in, out); + return internal::unary_ufunc_realhbbf16_to_floathbf16( + std::cosh, ctx, in, out); } } // namespace native diff --git a/kernels/portable/cpu/op_empty.cpp b/kernels/portable/cpu/op_empty.cpp index f3c30f56e11..9b37a527c92 100644 --- a/kernels/portable/cpu/op_empty.cpp +++ b/kernels/portable/cpu/op_empty.cpp @@ -26,7 +26,7 @@ using exec_aten::Tensor; Tensor& empty_out( KernelRuntimeContext& context, IntArrayRef size, - torch::executor::optional memory_format, + exec_aten::optional memory_format, Tensor& out) { (void)context; diff --git a/kernels/portable/cpu/op_erf.cpp b/kernels/portable/cpu/op_erf.cpp index bf85608d546..6897bcda95b 100644 --- a/kernels/portable/cpu/op_erf.cpp +++ b/kernels/portable/cpu/op_erf.cpp @@ -15,7 +15,7 @@ namespace executor { namespace native { Tensor& erf_out(KernelRuntimeContext& ctx, const Tensor& in, Tensor& out) { - return internal::unary_ufunc_realhb_to_floath(std::erf, ctx, in, out); + return 
internal::unary_ufunc_realhbbf16_to_floathbf16(std::erf, ctx, in, out); } } // namespace native diff --git a/kernels/portable/cpu/op_exp.cpp b/kernels/portable/cpu/op_exp.cpp index c72d4d2954f..cbfc8924cb0 100644 --- a/kernels/portable/cpu/op_exp.cpp +++ b/kernels/portable/cpu/op_exp.cpp @@ -15,7 +15,7 @@ namespace executor { namespace native { Tensor& exp_out(KernelRuntimeContext& ctx, const Tensor& in, Tensor& out) { - return internal::unary_ufunc_realhb_to_floath(std::exp, ctx, in, out); + return internal::unary_ufunc_realhbbf16_to_floathbf16(std::exp, ctx, in, out); } } // namespace native diff --git a/kernels/portable/cpu/op_expm1.cpp b/kernels/portable/cpu/op_expm1.cpp index 96b94cdfa2a..f2d49f615b1 100644 --- a/kernels/portable/cpu/op_expm1.cpp +++ b/kernels/portable/cpu/op_expm1.cpp @@ -15,7 +15,8 @@ namespace executor { namespace native { Tensor& expm1_out(KernelRuntimeContext& ctx, const Tensor& in, Tensor& out) { - return internal::unary_ufunc_realhb_to_floath(std::expm1, ctx, in, out); + return internal::unary_ufunc_realhbbf16_to_floathbf16( + std::expm1, ctx, in, out); } } // namespace native diff --git a/kernels/portable/cpu/op_index.cpp b/kernels/portable/cpu/op_index.cpp index 780994cb75d..98f76a9e352 100644 --- a/kernels/portable/cpu/op_index.cpp +++ b/kernels/portable/cpu/op_index.cpp @@ -89,7 +89,7 @@ Tensor& index_Tensor_out( compute_dim_map(in, indices, dim_map, block_count == 1); compute_index_map(in, indices, ix_map); - ET_SWITCH_REALHB_TYPES(in_type, ctx, "index.Tensor_out", CTYPE, [&]() { + ET_SWITCH_REALHBBF16_TYPES(in_type, ctx, "index.Tensor_out", CTYPE, [&]() { const CTYPE* const in_data = in.const_data_ptr(); CTYPE* const out_data = out.mutable_data_ptr(); diff --git a/kernels/portable/cpu/op_index_put.cpp b/kernels/portable/cpu/op_index_put.cpp index e44d50f606c..33e67d207a9 100644 --- a/kernels/portable/cpu/op_index_put.cpp +++ b/kernels/portable/cpu/op_index_put.cpp @@ -53,7 +53,7 @@ Tensor& index_put_out( ET_KERNEL_CHECK( ctx, tensor_is_broadcastable_to(values, out), InvalidArgument, out); - ET_SWITCH_REALHB_TYPES(in_type, ctx, "index_put.out", CTYPE, [&]() { + ET_SWITCH_REALHBBF16_TYPES(in_type, ctx, "index_put.out", CTYPE, [&]() { apply_binary_elementwise_fn( [accumulate](const CTYPE val_in, const CTYPE val) { return accumulate ? 
val_in + val : val; @@ -120,7 +120,7 @@ Tensor& index_put_out( x_numel *= x_sizes[i]; } - ET_SWITCH_REALHB_TYPES(in_type, ctx, "index_put.out", CTYPE, [&]() { + ET_SWITCH_REALHBBF16_TYPES(in_type, ctx, "index_put.out", CTYPE, [&]() { const CTYPE* const values_data = values.const_data_ptr(); CTYPE* const out_data = out.mutable_data_ptr(); diff --git a/kernels/portable/cpu/op_log.cpp b/kernels/portable/cpu/op_log.cpp index 0e959209714..8a36bce8c49 100644 --- a/kernels/portable/cpu/op_log.cpp +++ b/kernels/portable/cpu/op_log.cpp @@ -15,7 +15,7 @@ namespace executor { namespace native { Tensor& log_out(KernelRuntimeContext& ctx, const Tensor& in, Tensor& out) { - return internal::unary_ufunc_realhb_to_floath(std::log, ctx, in, out); + return internal::unary_ufunc_realhbbf16_to_floathbf16(std::log, ctx, in, out); } } // namespace native diff --git a/kernels/portable/cpu/op_log10.cpp b/kernels/portable/cpu/op_log10.cpp index e617ab38c15..89f9b672476 100644 --- a/kernels/portable/cpu/op_log10.cpp +++ b/kernels/portable/cpu/op_log10.cpp @@ -15,7 +15,8 @@ namespace executor { namespace native { Tensor& log10_out(KernelRuntimeContext& ctx, const Tensor& in, Tensor& out) { - return internal::unary_ufunc_realhb_to_floath(std::log10, ctx, in, out); + return internal::unary_ufunc_realhbbf16_to_floathbf16( + std::log10, ctx, in, out); } } // namespace native diff --git a/kernels/portable/cpu/op_log1p.cpp b/kernels/portable/cpu/op_log1p.cpp index b92344c2e76..2daa31e37ff 100644 --- a/kernels/portable/cpu/op_log1p.cpp +++ b/kernels/portable/cpu/op_log1p.cpp @@ -15,7 +15,8 @@ namespace executor { namespace native { Tensor& log1p_out(KernelRuntimeContext& ctx, const Tensor& in, Tensor& out) { - return internal::unary_ufunc_realhb_to_floath(std::log1p, ctx, in, out); + return internal::unary_ufunc_realhbbf16_to_floathbf16( + std::log1p, ctx, in, out); } } // namespace native diff --git a/kernels/portable/cpu/op_log2.cpp b/kernels/portable/cpu/op_log2.cpp index 19f4daa0d45..4d7406832e4 100644 --- a/kernels/portable/cpu/op_log2.cpp +++ b/kernels/portable/cpu/op_log2.cpp @@ -15,7 +15,8 @@ namespace executor { namespace native { Tensor& log2_out(KernelRuntimeContext& ctx, const Tensor& in, Tensor& out) { - return internal::unary_ufunc_realhb_to_floath(std::log2, ctx, in, out); + return internal::unary_ufunc_realhbbf16_to_floathbf16( + std::log2, ctx, in, out); } } // namespace native diff --git a/kernels/portable/cpu/op_reciprocal.cpp b/kernels/portable/cpu/op_reciprocal.cpp index dc120119a35..f22f9883858 100644 --- a/kernels/portable/cpu/op_reciprocal.cpp +++ b/kernels/portable/cpu/op_reciprocal.cpp @@ -22,7 +22,8 @@ double reciprocal(double x) { Tensor& reciprocal_out(KernelRuntimeContext& ctx, const Tensor& in, Tensor& out) { - return internal::unary_ufunc_realhb_to_floath(reciprocal, ctx, in, out); + return internal::unary_ufunc_realhbbf16_to_floathbf16( + reciprocal, ctx, in, out); } } // namespace native diff --git a/kernels/portable/cpu/op_rsqrt.cpp b/kernels/portable/cpu/op_rsqrt.cpp index bb9a6dc4582..19c4c6c1a57 100644 --- a/kernels/portable/cpu/op_rsqrt.cpp +++ b/kernels/portable/cpu/op_rsqrt.cpp @@ -21,7 +21,7 @@ double rsqrt(double x) { } // namespace Tensor& rsqrt_out(KernelRuntimeContext& ctx, const Tensor& in, Tensor& out) { - return internal::unary_ufunc_realhb_to_floath(rsqrt, ctx, in, out); + return internal::unary_ufunc_realhbbf16_to_floathbf16(rsqrt, ctx, in, out); } } // namespace native diff --git a/kernels/portable/cpu/op_sin.cpp b/kernels/portable/cpu/op_sin.cpp index 
102d0a5cd9d..ad65c4be18b 100644 --- a/kernels/portable/cpu/op_sin.cpp +++ b/kernels/portable/cpu/op_sin.cpp @@ -15,7 +15,7 @@ namespace executor { namespace native { Tensor& sin_out(KernelRuntimeContext& ctx, const Tensor& in, Tensor& out) { - return internal::unary_ufunc_realhb_to_floath(std::sin, ctx, in, out); + return internal::unary_ufunc_realhbbf16_to_floathbf16(std::sin, ctx, in, out); } } // namespace native diff --git a/kernels/portable/cpu/op_sinh.cpp b/kernels/portable/cpu/op_sinh.cpp index b06a0a2b06c..21666392392 100644 --- a/kernels/portable/cpu/op_sinh.cpp +++ b/kernels/portable/cpu/op_sinh.cpp @@ -15,7 +15,8 @@ namespace executor { namespace native { Tensor& sinh_out(KernelRuntimeContext& ctx, const Tensor& in, Tensor& out) { - return internal::unary_ufunc_realhb_to_floath(std::sinh, ctx, in, out); + return internal::unary_ufunc_realhbbf16_to_floathbf16( + std::sinh, ctx, in, out); } } // namespace native diff --git a/kernels/portable/cpu/op_sqrt.cpp b/kernels/portable/cpu/op_sqrt.cpp index d1a88869f9a..bd2075f5b04 100644 --- a/kernels/portable/cpu/op_sqrt.cpp +++ b/kernels/portable/cpu/op_sqrt.cpp @@ -15,7 +15,8 @@ namespace executor { namespace native { Tensor& sqrt_out(KernelRuntimeContext& ctx, const Tensor& in, Tensor& out) { - return internal::unary_ufunc_realhb_to_floath(std::sqrt, ctx, in, out); + return internal::unary_ufunc_realhbbf16_to_floathbf16( + std::sqrt, ctx, in, out); } } // namespace native diff --git a/kernels/portable/cpu/op_tan.cpp b/kernels/portable/cpu/op_tan.cpp index fa5ab083630..a2b921d5146 100644 --- a/kernels/portable/cpu/op_tan.cpp +++ b/kernels/portable/cpu/op_tan.cpp @@ -15,7 +15,7 @@ namespace executor { namespace native { Tensor& tan_out(KernelRuntimeContext& ctx, const Tensor& in, Tensor& out) { - return internal::unary_ufunc_realhb_to_floath(std::tan, ctx, in, out); + return internal::unary_ufunc_realhbbf16_to_floathbf16(std::tan, ctx, in, out); } } // namespace native diff --git a/kernels/portable/cpu/op_tanh.cpp b/kernels/portable/cpu/op_tanh.cpp index 0935c5bc93d..ae9f93dc62c 100644 --- a/kernels/portable/cpu/op_tanh.cpp +++ b/kernels/portable/cpu/op_tanh.cpp @@ -15,7 +15,8 @@ namespace executor { namespace native { Tensor& tanh_out(KernelRuntimeContext& ctx, const Tensor& in, Tensor& out) { - return internal::unary_ufunc_realhb_to_floath(std::tanh, ctx, in, out); + return internal::unary_ufunc_realhbbf16_to_floathbf16( + std::tanh, ctx, in, out); } } // namespace native diff --git a/kernels/portable/cpu/op_unsqueeze_copy.cpp b/kernels/portable/cpu/op_unsqueeze_copy.cpp index 37a94375eae..3b74033d9d4 100644 --- a/kernels/portable/cpu/op_unsqueeze_copy.cpp +++ b/kernels/portable/cpu/op_unsqueeze_copy.cpp @@ -38,11 +38,6 @@ Tensor& unsqueeze_copy_out( ET_KERNEL_CHECK(ctx, self.dim() + 1 == out.dim(), InvalidArgument, out); ET_KERNEL_CHECK(ctx, dim <= self.dim(), InvalidArgument, out); - ET_KERNEL_CHECK( - ctx, tensors_have_same_dim_order(self, out), InvalidArgument, out); - - ET_KERNEL_CHECK(ctx, tensor_is_default_dim_order(self), InvalidArgument, out); - for (size_t i = 0; i < out.dim(); ++i) { if (i < dim) { expected_output_size[i] = self.size(i); diff --git a/kernels/portable/cpu/op_where.cpp b/kernels/portable/cpu/op_where.cpp index a7736247597..90cb2442a2a 100644 --- a/kernels/portable/cpu/op_where.cpp +++ b/kernels/portable/cpu/op_where.cpp @@ -38,27 +38,25 @@ Tensor& where_out( ET_KERNEL_CHECK( ctx, tensors_have_same_dim_order(cond, a, b, out), InvalidArgument, out); - constexpr auto name = "where.self_out"; + static constexpr 
const char op_name[] = "where.self_out"; ET_CHECK_MSG( cond_type == ScalarType::Bool || cond_type == ScalarType::Byte, "Unhandled dtype %s for where.self_out", torch::executor::toString(cond_type)); - ET_SWITCH_REALHBBF16_TYPES(a_type, ctx, name, CTYPE_A, [&]() { - ET_SWITCH_REALHBBF16_TYPES(b_type, ctx, name, CTYPE_B, [&]() { - using CTYPE_OUT = - typename torch::executor::promote_types::type; - apply_ternary_elementwise_fn( - [](const CTYPE_A val_a, const CTYPE_B val_b, const uint8_t val_c) { - CTYPE_OUT a_casted = static_cast(val_a); - CTYPE_OUT b_casted = static_cast(val_b); - return val_c ? a_casted : b_casted; - }, - a, - b, - cond, - out); - }); + ET_SWITCH_REALHBBF16_TYPES(common_type, ctx, op_name, CTYPE_COMMON, [&]() { + apply_ternary_elementwise_fn( + [](const CTYPE_COMMON val_a, + const CTYPE_COMMON val_b, + const CTYPE_COMMON val_c) { return val_c ? val_a : val_b; }, + a, + SupportedTensorDtypes::REALHBBF16, + b, + SupportedTensorDtypes::REALHBBF16, + cond, + SupportedTensorDtypes::BOOL_OR_BYTE, + out, + SupportedTensorDtypes::SAME_AS_COMMON); }); return out; diff --git a/kernels/portable/cpu/pattern/binary_ufunc_realb_realb_to_realb_logical.cpp b/kernels/portable/cpu/pattern/binary_ufunc_realb_realb_to_realb_logical.cpp index 0c454cae792..ebc685afa51 100644 --- a/kernels/portable/cpu/pattern/binary_ufunc_realb_realb_to_realb_logical.cpp +++ b/kernels/portable/cpu/pattern/binary_ufunc_realb_realb_to_realb_logical.cpp @@ -34,9 +34,9 @@ Tensor& binary_ufunc_realb_realb_to_realb_logical( ScalarType b_type = b.scalar_type(); ScalarType out_type = out.scalar_type(); - ET_SWITCH_REAL_TYPES_AND(Bool, a_type, ctx, __func__, CTYPE_A, [&]() { - ET_SWITCH_REAL_TYPES_AND(Bool, b_type, ctx, __func__, CTYPE_B, [&]() { - ET_SWITCH_REAL_TYPES_AND(Bool, out_type, ctx, __func__, CTYPE_OUT, [&]() { + ET_SWITCH_REALHBBF16_TYPES(a_type, ctx, __func__, CTYPE_A, [&]() { + ET_SWITCH_REALHBBF16_TYPES(b_type, ctx, __func__, CTYPE_B, [&]() { + ET_SWITCH_REALHBBF16_TYPES(out_type, ctx, __func__, CTYPE_OUT, [&]() { apply_binary_elementwise_fn( [fn](const CTYPE_A val_a, const CTYPE_B val_b) { bool a_casted = static_cast(val_a); diff --git a/kernels/portable/cpu/pattern/pattern.h b/kernels/portable/cpu/pattern/pattern.h index db3c0c59b35..7863cdce0fc 100644 --- a/kernels/portable/cpu/pattern/pattern.h +++ b/kernels/portable/cpu/pattern/pattern.h @@ -79,11 +79,11 @@ Tensor& unary_ufunc_realhb_to_bool( /** * Implements an op pattern for ops that take a single input tensor of any - * realhb dtye (real, half and boolean), no additional arguments, and outputs a - * floating point tensor of the same size. The function fn specifies the math - * operation which is applied to the input tensor element-wise. + * realhbbf16 dtype (real/half/bool/bfloat16), no additional arguments, and + * outputs a floating point tensor of the same size. The function fn specifies + * the math operation which is applied to the input tensor element-wise. */ -Tensor& unary_ufunc_realhb_to_floath( +Tensor& unary_ufunc_realhbbf16_to_floathbf16( double (*fn)(double), KernelRuntimeContext& ctx, const Tensor& in, @@ -91,7 +91,7 @@ Tensor& unary_ufunc_realhb_to_floath( /** * Implements an op pattern for ops that take two broadcastable input tensors - * of any realb dtye, no additional arguments, performs an element-wise binary + * of any realb dtype, no additional arguments, performs an element-wise binary * logical operation, and outputs a realb tensor. 
The function fn specifies the * binary logical operation which is applied to the input tensors element-wise. */ diff --git a/kernels/portable/cpu/pattern/targets.bzl b/kernels/portable/cpu/pattern/targets.bzl index 06743e9ed71..250a8fb6a72 100644 --- a/kernels/portable/cpu/pattern/targets.bzl +++ b/kernels/portable/cpu/pattern/targets.bzl @@ -34,7 +34,7 @@ def define_common_targets(): name = "pattern", srcs = [ "unary_ufunc_realhb_to_bool.cpp", - "unary_ufunc_realhb_to_floath.cpp", + "unary_ufunc_realhbbf16_to_floathbf16.cpp", "unary_ufunc_realh.cpp", "binary_ufunc_realb_realb_to_realb_logical.cpp", ], diff --git a/kernels/portable/cpu/pattern/unary_ufunc_realhb_to_bool.cpp b/kernels/portable/cpu/pattern/unary_ufunc_realhb_to_bool.cpp index 76bef5fe021..3be4b258cc4 100644 --- a/kernels/portable/cpu/pattern/unary_ufunc_realhb_to_bool.cpp +++ b/kernels/portable/cpu/pattern/unary_ufunc_realhb_to_bool.cpp @@ -43,7 +43,7 @@ Tensor& unary_ufunc_realhb_to_bool( const auto in_type = in.scalar_type(); - ET_SWITCH_REALHB_TYPES(in_type, ctx, __func__, CTYPE_IN, [&] { + ET_SWITCH_REALHBBF16_TYPES(in_type, ctx, __func__, CTYPE_IN, [&] { apply_unary_map_fn( [fn](const CTYPE_IN val_in) { return fn(val_in); }, in.const_data_ptr(), diff --git a/kernels/portable/cpu/pattern/unary_ufunc_realhb_to_floath.cpp b/kernels/portable/cpu/pattern/unary_ufunc_realhbbf16_to_floathbf16.cpp similarity index 88% rename from kernels/portable/cpu/pattern/unary_ufunc_realhb_to_floath.cpp rename to kernels/portable/cpu/pattern/unary_ufunc_realhbbf16_to_floathbf16.cpp index 31ec04dfed0..602b5b1bfd2 100644 --- a/kernels/portable/cpu/pattern/unary_ufunc_realhb_to_floath.cpp +++ b/kernels/portable/cpu/pattern/unary_ufunc_realhbbf16_to_floathbf16.cpp @@ -15,7 +15,7 @@ namespace executor { namespace native { namespace internal { -Tensor& unary_ufunc_realhb_to_floath( +Tensor& unary_ufunc_realhbbf16_to_floathbf16( double (*fn)(double), KernelRuntimeContext& ctx, const Tensor& in, @@ -38,8 +38,8 @@ Tensor& unary_ufunc_realhb_to_floath( const auto in_type = in.scalar_type(); const auto out_type = out.scalar_type(); - ET_SWITCH_REALHB_TYPES(in_type, ctx, __func__, CTYPE_IN, [&] { - ET_SWITCH_FLOATH_TYPES(out_type, ctx, __func__, CTYPE_OUT, [&] { + ET_SWITCH_REALHBBF16_TYPES(in_type, ctx, __func__, CTYPE_IN, [&] { + ET_SWITCH_FLOATHBF16_TYPES(out_type, ctx, __func__, CTYPE_OUT, [&] { apply_unary_map_fn( [fn](const CTYPE_IN val_in) { CTYPE_OUT xi = static_cast(val_in); diff --git a/kernels/portable/cpu/test/scalar_utils_test.cpp b/kernels/portable/cpu/test/scalar_utils_test.cpp index 82539f02a0b..1983f707da1 100644 --- a/kernels/portable/cpu/test/scalar_utils_test.cpp +++ b/kernels/portable/cpu/test/scalar_utils_test.cpp @@ -16,7 +16,7 @@ struct promote_type_with_scalar_type_is_valid (std::is_same::value || std::is_same::value || std::is_same::value) && - !std::is_same::value && + !std::is_same::value && !torch::executor::is_qint_type::value && !torch::executor::is_bits_type::value> {}; diff --git a/kernels/portable/cpu/util/broadcast_util.h b/kernels/portable/cpu/util/broadcast_util.h index 92d35f322fb..beda475610f 100644 --- a/kernels/portable/cpu/util/broadcast_util.h +++ b/kernels/portable/cpu/util/broadcast_util.h @@ -270,6 +270,125 @@ size_t linearize_access_indexes( // Mapping with broadcasting // +namespace internal { +template +To load_and_convert(const void* fromPtr) { + return static_cast(*reinterpret_cast(fromPtr)); +} + +template +void convert_and_store(From f, void* dst) { + *reinterpret_cast(dst) = static_cast(f); +} + 
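These helpers are what let apply_ternary_elementwise_fn (further down in this hunk) operate on type-erased buffers: each tensor contributes one load/store function chosen from its runtime dtype, so the kernel body is instantiated only for the common compute type rather than for every dtype combination. A self-contained sketch of the same pattern, with a made-up Dtype enum standing in for ScalarType and the ET_SWITCH machinery:

#include <cstdint>
#include <cstdio>

enum class Dtype { Float, Int32 };

template <typename To, typename From>
To load_and_convert(const void* p) {
  return static_cast<To>(*reinterpret_cast<const From*>(p));
}

// Pick a loader once per tensor, then call it per element.
using LoadToFloat = float (*)(const void*);
LoadToFloat get_loader(Dtype d) {
  switch (d) {
    case Dtype::Float: return load_and_convert<float, float>;
    case Dtype::Int32: return load_and_convert<float, int32_t>;
  }
  return nullptr;
}

int main() {
  int32_t ints[] = {1, 2, 3};
  float floats[] = {0.5f, 1.5f, 2.5f};
  LoadToFloat load_int = get_loader(Dtype::Int32);
  LoadToFloat load_flt = get_loader(Dtype::Float);
  float sum = 0.f;
  for (int i = 0; i < 3; ++i) {
    // Elements from both buffers are lifted to the common type (float) before
    // the elementwise computation, so only one instantiation is needed.
    sum += load_int(&ints[i]) + load_flt(&floats[i]);
  }
  std::printf("sum = %f\n", sum);  // 10.5
  return 0;
}
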
+template +using load_to_common_fn = CTYPE_COMMON (*)(const void*); + +template +load_to_common_fn get_load_to_common_fn_realhbbf16( + const Tensor& t) { + CTYPE_COMMON (*result)(const void*) = nullptr; + ET_SWITCH_REALHBBF16_TYPES( + t.scalar_type(), unused, op_name, TENSOR_CTYPE, [&]() { + result = internal::load_and_convert; + }); + return result; +} + +template +load_to_common_fn get_load_to_common_fn_bool_or_byte( + const Tensor& t) { + CTYPE_COMMON (*result)(const void*) = nullptr; + ET_SWITCH_TWO_TYPES( + Bool, Byte, t.scalar_type(), unused, op_name, TENSOR_CTYPE, [&]() { + result = internal::load_and_convert; + }); + return result; +} + +template +using store_common_to_tensor_fn = void (*)(CTYPE_COMMON, void*); + +template +store_common_to_tensor_fn +get_store_common_to_tensor_fn_realhbbf16(const Tensor& t) { + void (*result)(CTYPE_COMMON, void*) = nullptr; + ET_SWITCH_REALHBBF16_TYPES( + t.scalar_type(), unused, op_name, TENSOR_CTYPE, [&]() { + result = internal::convert_and_store; + }); + return result; +} + +template +store_common_to_tensor_fn +get_store_common_to_tensor_fn_bool_or_byte(const Tensor& t) { + void (*result)(CTYPE_COMMON, void*) = nullptr; + ET_SWITCH_TWO_TYPES( + Bool, Byte, t.scalar_type(), unused, op_name, TENSOR_CTYPE, [&]() { + result = internal::convert_and_store; + }); + return result; +} +} // namespace internal + +enum class SupportedTensorDtypes { + REALHBBF16, + BOOL_OR_BYTE, + SAME_AS_COMMON, +}; + +namespace internal { +template +load_to_common_fn get_load_to_common_fn( + const Tensor& t, + SupportedTensorDtypes dtypes) { + switch (dtypes) { + case SupportedTensorDtypes::REALHBBF16: + return get_load_to_common_fn_realhbbf16(t); + case SupportedTensorDtypes::BOOL_OR_BYTE: + return get_load_to_common_fn_bool_or_byte(t); + case SupportedTensorDtypes::SAME_AS_COMMON: { + constexpr auto common_scalar_type = + CppTypeToScalarType::value; + ET_CHECK_MSG( + t.scalar_type() == common_scalar_type, + "Unhandled dtype %s for %s", + ::executorch::runtime::toString(common_scalar_type), + op_name); + return internal::load_and_convert; + } + } + ET_CHECK(false); + return nullptr; +} + +template +store_common_to_tensor_fn get_store_common_to_tensor_fn( + const Tensor& t, + SupportedTensorDtypes dtypes) { + switch (dtypes) { + case SupportedTensorDtypes::REALHBBF16: + return get_store_common_to_tensor_fn_realhbbf16(t); + case SupportedTensorDtypes::BOOL_OR_BYTE: + return get_store_common_to_tensor_fn_bool_or_byte( + t); + case SupportedTensorDtypes::SAME_AS_COMMON: { + constexpr auto common_scalar_type = + CppTypeToScalarType::value; + ET_CHECK_MSG( + t.scalar_type() == common_scalar_type, + "Unhandled dtype %s for %s", + ::executorch::runtime::toString(common_scalar_type), + op_name); + return internal::convert_and_store; + } + } + ET_CHECK(false); + return nullptr; +} +} // namespace internal + /** * Useful for binary elementwise operators. For each element of the inputs, * perform a computation and write to the corresponding element of the output. @@ -313,29 +432,56 @@ inline void apply_binary_elementwise_fn( * Useful for ternary elementwise operators. For each element of the inputs, * perform a computation and write to the corresponding element of the output. * Tensor broadcasting is applied wherever it is required. + * + * In order to mitigate build time cost (straightforwardly |CTYPE_A| * + * |CTYPE_B| * |CTYPE_C| * |CTYPE_OUT|), all arguments to compute_fun + * are passed as CTYPE_COMMON. + * + * Each tensor's supported dtypes set must be provided. 
The tensor + * will be checked to ensure that its dtype falls into that set. + * + * op_name is used to support dtype selective build, as with the + * ET_SWITCH family of macros. Note: because of C++17 quirks, you + * can't pass a string literal for op_name. Instead, you should do the + * following: + * + * static constexpr const char op_name[] = "my_op"; + * apply_ternary_elementwise_fn. */ -template < - typename CTYPE_A, - typename CTYPE_B, - typename CTYPE_C, - typename CTYPE_OUT, - typename Op> +template inline void apply_ternary_elementwise_fn( const Op& compute_fun, const Tensor& a, + SupportedTensorDtypes a_dtypes, const Tensor& b, + SupportedTensorDtypes b_dtypes, const Tensor& c, - const Tensor& out) { + SupportedTensorDtypes c_dtypes, + const Tensor& out, + SupportedTensorDtypes out_dtypes) { const bool a_is_broadcasted = !out.sizes().equals(a.sizes()); const bool b_is_broadcasted = !out.sizes().equals(b.sizes()); const bool c_is_broadcasted = !out.sizes().equals(c.sizes()); const bool any_is_broadcasted = (a_is_broadcasted || b_is_broadcasted || c_is_broadcasted); - const CTYPE_A* const data_a = a.const_data_ptr(); - const CTYPE_B* const data_b = b.const_data_ptr(); - const CTYPE_C* const data_c = c.const_data_ptr(); - CTYPE_OUT* const data_out = out.mutable_data_ptr(); + const auto load_a_to_common = + internal::get_load_to_common_fn(a, a_dtypes); + const auto load_b_to_common = + internal::get_load_to_common_fn(b, b_dtypes); + const auto load_c_to_common = + internal::get_load_to_common_fn(c, c_dtypes); + const auto store_common_to_out = + internal::get_store_common_to_tensor_fn( + out, out_dtypes); + const char* const data_a = reinterpret_cast(a.const_data_ptr()); + const char* const data_b = reinterpret_cast(b.const_data_ptr()); + const char* const data_c = reinterpret_cast(c.const_data_ptr()); + const auto a_element_size = a.element_size(); + const auto b_element_size = b.element_size(); + const auto c_element_size = c.element_size(); + const auto out_element_size = out.element_size(); + char* const data_out = reinterpret_cast(out.mutable_data_ptr()); for (size_t i = 0; i < out.numel(); ++i) { size_t a_linear_index = i; @@ -357,8 +503,11 @@ inline void apply_ternary_elementwise_fn( } } - data_out[i] = compute_fun( - data_a[a_linear_index], data_b[b_linear_index], data_c[c_linear_index]); + auto result = compute_fun( + load_a_to_common(&data_a[a_linear_index * a_element_size]), + load_b_to_common(&data_b[b_linear_index * b_element_size]), + load_c_to_common(&data_c[c_linear_index * c_element_size])); + store_common_to_out(result, &data_out[i * out_element_size]); } } diff --git a/kernels/portable/cpu/util/math_util.h b/kernels/portable/cpu/util/math_util.h index df175147062..05935fff389 100644 --- a/kernels/portable/cpu/util/math_util.h +++ b/kernels/portable/cpu/util/math_util.h @@ -96,9 +96,8 @@ INT_T max_override(INT_T a, INT_T b) { template < typename T, - typename std::enable_if< - std::is_same::value, - bool>::type = true> + typename std::enable_if::value, bool>:: + type = true> T min_override(T a, T b) { const auto float_a = static_cast(a); if (std::isnan(float_a)) { @@ -117,9 +116,8 @@ T min_override(T a, T b) { template < typename T, - typename std::enable_if< - std::is_same::value, - bool>::type = true> + typename std::enable_if::value, bool>:: + type = true> T max_override(T a, T b) { const auto float_a = static_cast(a); if (std::isnan(float_a)) { diff --git a/kernels/portable/cpu/util/targets.bzl b/kernels/portable/cpu/util/targets.bzl index 
3961add0fd7..82d3d84fa23 100644 --- a/kernels/portable/cpu/util/targets.bzl +++ b/kernels/portable/cpu/util/targets.bzl @@ -205,7 +205,7 @@ def define_common_targets(): "//executorch/runtime/core/exec_aten/util:tensor_util", ":broadcast_util", ], - visibility = ["//executorch/kernels/portable/cpu/...", "//executorch/kernels/quantized/..."], + visibility = ["//executorch/kernels/portable/cpu/...", "//executorch/kernels/quantized/...", "@EXECUTORCH_CLIENTS"], ) runtime.cxx_library( @@ -249,5 +249,10 @@ def define_common_targets(): "//executorch/runtime/core/exec_aten/util:tensor_util{}".format(suffix), ], exported_preprocessor_flags = ["-DUSE_ATEN_LIB"] if aten_mode else [], - visibility = ["//executorch/kernels/portable/cpu/...", "//executorch/kernels/quantized/..."], + visibility = [ + "//executorch/extension/llm/custom_ops/...", + "//executorch/kernels/portable/cpu/...", + "//executorch/kernels/quantized/...", + "@EXECUTORCH_CLIENTS", + ], ) diff --git a/kernels/portable/cpu/util/test/broadcast_test.cpp b/kernels/portable/cpu/util/test/broadcast_test.cpp index 168977eb831..d87e8ecec85 100644 --- a/kernels/portable/cpu/util/test/broadcast_test.cpp +++ b/kernels/portable/cpu/util/test/broadcast_test.cpp @@ -20,8 +20,14 @@ using namespace ::testing; using exec_aten::ScalarType; using exec_aten::Tensor; -using torch::executor::ArrayRef; -using torch::executor::testing::TensorFactory; +using executorch::runtime::ArrayRef; +using executorch::runtime::testing::TensorFactory; +using torch::executor::broadcast_tensor; +using torch::executor::delinearize_index; +using torch::executor::get_broadcast_target_size; +using torch::executor::linearize_access_indexes; +using torch::executor::tensor_is_broadcastable_to; +using torch::executor::tensors_are_broadcastable_between; TEST(BroadcastUtilTest, BroadcastTensor) { TensorFactory tf; @@ -112,17 +118,17 @@ TEST(BroadcastUtilTest, GetBroadcastTargetSize) { Tensor a = tf.zeros({2, 1}); Tensor b = tf.zeros({5, 1, 2}); - get_broadcast_target_size( + executorch::runtime::Error err = get_broadcast_target_size( a, b, expected_output_size, torch::executor::kTensorDimensionLimit, &expected_output_dim); + EXPECT_EQ(err, torch::executor::Error::Ok); EXPECT_TRUE( - torch::executor::ArrayRef( - expected_output_size, expected_output_dim) - .equals(torch::executor::ArrayRef({5, 2, 2}))); + ArrayRef(expected_output_size, expected_output_dim) + .equals(ArrayRef({5, 2, 2}))); } size_t linearize_indexes(size_t* indexes, size_t indexes_len, const Tensor& t) { diff --git a/kernels/portable/cpu/util/test/reduce_test.cpp b/kernels/portable/cpu/util/test/reduce_test.cpp index 9ee37aab657..e7bb03c30c8 100644 --- a/kernels/portable/cpu/util/test/reduce_test.cpp +++ b/kernels/portable/cpu/util/test/reduce_test.cpp @@ -19,7 +19,10 @@ using exec_aten::ArrayRef; using exec_aten::optional; using exec_aten::ScalarType; using exec_aten::Tensor; -using torch::executor::testing::TensorFactory; +using executorch::runtime::testing::TensorFactory; +using torch::executor::apply_over_dim; +using torch::executor::apply_over_dim_list; +using torch::executor::get_out_numel; void _apply_over_dim(const Tensor& in, const optional& dim) { int64_t* in_data = in.mutable_data_ptr(); diff --git a/kernels/portable/test/op_mul_test.cpp b/kernels/portable/test/op_mul_test.cpp index 0c7f300dcfc..c95fdf32253 100644 --- a/kernels/portable/test/op_mul_test.cpp +++ b/kernels/portable/test/op_mul_test.cpp @@ -36,19 +36,18 @@ class OpMulOutKernelTest : public OperatorTest { TEST_F(OpMulOutKernelTest, 
UnhandledDtypeDies) { // mul_out() doesn't handle QInt8. // TensorFactory cannot be used with ScalarType::QInt8 since - // torch::executor::qint8 does not have a default constructor. It must be + // exec_aten::qint8 does not have a default constructor. It must be // initialized with an explicit value. So, we need to manually create the // underlying data without default construction and then the tensors from that // data via TensorImpl. std::vector sizes = {2, 2}; - std::vector a_data{}; - std::generate_n(std::back_inserter(a_data), 4, []() { - return torch::executor::qint8{0}; - }); - std::vector b_data(a_data); - std::vector out_data(a_data); + std::vector a_data{}; + std::generate_n( + std::back_inserter(a_data), 4, []() { return exec_aten::qint8{0}; }); + std::vector b_data(a_data); + std::vector out_data(a_data); auto a_impl = torch::executor::TensorImpl( ScalarType::QInt8, 2, sizes.data(), a_data.data()); diff --git a/kernels/prim_ops/register_prim_ops.cpp b/kernels/prim_ops/register_prim_ops.cpp index d0b51707cbd..323c372f73a 100644 --- a/kernels/prim_ops/register_prim_ops.cpp +++ b/kernels/prim_ops/register_prim_ops.cpp @@ -12,7 +12,6 @@ #include #include -using KernelArrayRef = ::torch::executor::ArrayRef<::torch::executor::Kernel>; using torch::executor::function::et_copy_index; namespace torch { @@ -287,6 +286,21 @@ static Kernel prim_ops[] = { out = EValue(a.toInt() % b.toInt()); }), + // executorch_prim::mod.Scalar(Scalar, Scalar) -> Scalar + Kernel( + "executorch_prim::mod.Scalar", + [](KernelRuntimeContext& context, EValue** stack) { + (void)context; + EValue& a = *stack[0]; + EValue& b = *stack[1]; + EValue& out = *stack[2]; + if (a.isInt() && b.isInt()) { + out = EValue(a.toInt() % b.toInt()); + } else { + ET_CHECK_MSG(false, "%zu, %zu", (size_t)a.tag, (size_t)b.tag); + } + }), + // executorch_prim::et_copy_index.tensor(tensor, tensor) -> tensor Kernel("executorch_prim::et_copy_index.tensor", &et_copy_index), // executorch_prim::et_view.default(Tensor, int[]) -> Tensor @@ -294,13 +308,14 @@ static Kernel prim_ops[] = { }; -static KernelArrayRef kernel_array_ref( +executorch::runtime::Span kernel_span( prim_ops, prim_ops + sizeof(prim_ops) / sizeof(Kernel)); // Return value not used. Keep the static variable assignment to register // operators in static initialization time. 
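The registration in this file relies on the usual static-initialization idiom: a namespace-scope variable is assigned the result of register_kernels(...), so the prim ops (including the new executorch_prim::mod.Scalar) are already in the registry before main() runs. An illustrative, self-contained sketch of that idiom; the registry()/register_all() names below are made up for the example and are not ExecuTorch APIs:

#include <cstdio>
#include <functional>
#include <map>
#include <string>

using OpFn = std::function<long(long, long)>;

// Global registry, constructed on first use so static-init order is safe.
std::map<std::string, OpFn>& registry() {
  static std::map<std::string, OpFn> r;
  return r;
}

bool register_all() {
  registry()["mod.Scalar"] = [](long a, long b) { return a % b; };
  return true;
}

// The return value is unused; the assignment only forces register_all() to run
// during static initialization, mirroring the success_with_kernel_reg
// assignment in the surrounding hunk.
static bool registered = register_all();

int main() {
  std::printf("registered=%d, 13 mod 5 = %ld\n", registered,
              registry()["mod.Scalar"](13, 5));  // prints 3
  return 0;
}
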
-static auto success_with_kernel_reg = register_kernels(kernel_array_ref); +auto success_with_kernel_reg = + executorch::runtime::register_kernels(kernel_span); } // namespace } // namespace function diff --git a/kernels/prim_ops/test/prim_ops_test.cpp b/kernels/prim_ops/test/prim_ops_test.cpp index 240e42a97ce..4b4b35a2324 100644 --- a/kernels/prim_ops/test/prim_ops_test.cpp +++ b/kernels/prim_ops/test/prim_ops_test.cpp @@ -115,6 +115,9 @@ TEST_F(RegisterPrimOpsTest, TestAlgebraOps) { getOpsFn("executorch_prim::mod.int")(context, stack); EXPECT_EQ(stack[2]->toInt(), 3); + getOpsFn("executorch_prim::mod.Scalar")(context, stack); + EXPECT_EQ(stack[2]->toInt(), 3); + getOpsFn("executorch_prim::sym_float.Scalar")(context, stack); EXPECT_FLOAT_EQ(stack[1]->toDouble(), 3.0); } diff --git a/kernels/quantized/cpu/op_choose_qparams.cpp b/kernels/quantized/cpu/op_choose_qparams.cpp index 13b26e0b738..14f7d3157ae 100644 --- a/kernels/quantized/cpu/op_choose_qparams.cpp +++ b/kernels/quantized/cpu/op_choose_qparams.cpp @@ -36,7 +36,8 @@ void check_quantize_per_tensor_args( int64_t qmax, ScalarType dtype, Tensor& scale_out, - Tensor& zero_point_out) { + Tensor& zero_point_out, + bool is_per_token = false) { (void)dtype; ET_CHECK_MSG( qmin < qmax, @@ -56,27 +57,49 @@ void check_quantize_per_tensor_args( zero_point_out.scalar_type() == ScalarType::Long, "Expected scale to be Long tensor received: %" PRId8, static_cast(zero_point_out.scalar_type())); - ET_CHECK_MSG( - scale_out.numel() == 1, - "Exepcted scale to only have one element received: %zd", - ssize_t(scale_out.numel())); - ET_CHECK_MSG( - zero_point_out.numel() == 1, - "Exepcted zero_point to only have one element received: %zd", - ssize_t(zero_point_out.numel())); + + if (is_per_token) { + for (auto i = 0; i < input.dim() - 1; i++) { + ET_CHECK_MSG( + scale_out.size(i) == input.size(i), + "Expected scale to have the same number of elements at dimension %d got %zd", + i, + scale_out.size(i)); + ET_CHECK_MSG( + zero_point_out.size(i) == input.size(i), + "Expected zero point to have the same number of elements at dimension %d got %zd", + i, + zero_point_out.size(i)); + } + ET_CHECK_MSG( + scale_out.size(input.dim() - 1) == 1, + "Expected scale to have only one element at dimension %zd but got %zd", + input.dim() - 1, + scale_out.size(input.dim() - 1)); + ET_CHECK_MSG( + zero_point_out.size(input.dim() - 1) == 1, + "Expected zero point to have only one element at dimension %zd but got %zd", + input.dim() - 1, + zero_point_out.size(input.dim() - 1)); + } else { + ET_CHECK_MSG( + scale_out.numel() == 1, + "Expected scale to only have one element received: %zd", + ssize_t(scale_out.numel())); + ET_CHECK_MSG( + zero_point_out.numel() == 1, + "Expected zero_point to only have one element received: %zd", + ssize_t(zero_point_out.numel())); + } } -void choose_qparams( - const Tensor& input, +void calculate_scale_and_zero_point( + float min, + float max, int32_t qmin, int32_t qmax, - Tensor& scale_out, - Tensor& zero_point_out) { - const float* x_fp32 = input.const_data_ptr(); - // Compute x_min, x_max and q_params (scale, zero_point) - float min = torch::executor::vec_minf(x_fp32, input.numel()); - float max = torch::executor::vec_maxf(x_fp32, input.numel()); - + double& scale, + int32_t& zero_point) { // We extend the [min, max] interval to ensure that it contains 0. // Otherwise, we would not meet the requirement that 0 be an exactly // representable value.
@@ -85,7 +108,7 @@ void choose_qparams( // Use double precision for intermediate computation but use single precision // in final number to reflect the actual number used during quantization. - double scale = (static_cast(max) - min) / (qmax - qmin); + scale = (static_cast(max) - min) / (qmax - qmin); // If scale is 0 or too small so its reciprocal is infinity, we arbitrary // adjust the scale to 0.1 . We want to avoid scale's reciprocal being // infinity because some of fbgemm code pre-computes scale's reciprocal to do @@ -143,9 +166,54 @@ void choose_qparams( } else { nudged_zero_point = nearbyint(static_cast(initial_zero_point)); } + zero_point = nudged_zero_point; + return; +} + +void choose_qparams( + const Tensor& input, + int32_t qmin, + int32_t qmax, + Tensor& scale_out, + Tensor& zero_point_out) { + const float* x_fp32 = input.const_data_ptr(); + // Compute x_min, x_max and q_params (scale, zero_point) + float min = torch::executor::vec_minf(x_fp32, input.numel()); + float max = torch::executor::vec_maxf(x_fp32, input.numel()); + + double scale; + int32_t zero_point; + calculate_scale_and_zero_point(min, max, qmin, qmax, scale, zero_point); scale_out.mutable_data_ptr()[0] = scale; - zero_point_out.mutable_data_ptr()[0] = nudged_zero_point; + zero_point_out.mutable_data_ptr()[0] = zero_point; +} + +void choose_qparams_per_token( + const Tensor& input, + int32_t qmin, + int32_t qmax, + Tensor& scale_out, + Tensor& zero_point_out) { + const float* x_fp32 = input.const_data_ptr(); + // Compute x_min, x_max and q_params (scale, zero_point) + auto num_tokens = 1; + for (auto i = 0; i < input.dim() - 1; i++) { + num_tokens *= input.size(i); + } + auto token_dim_size = input.size(input.dim() - 1); + for (auto i = 0; i < num_tokens; i++) { + // vec_minf uses std::min_element. Check if it actually + // gets vectorized. 
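For orientation while reading this hunk: the arithmetic being factored out into calculate_scale_and_zero_point is the usual asymmetric affine mapping described in the surrounding comments (extend [min, max] so it covers 0, derive the scale from the range, then nudge the zero point into [qmin, qmax]). The Python below is only a minimal sketch of that mapping, not code from this patch; the initial-zero-point formula (qmin - min / scale) is assumed from the standard decomposed-quantization recipe rather than quoted from the kernel.

import math

def choose_qparams_asymmetric(min_val, max_val, qmin, qmax):
    # Extend the observed range so that 0.0 is exactly representable.
    min_val = min(min_val, 0.0)
    max_val = max(max_val, 0.0)
    # Scale from the (possibly extended) range.
    scale = (max_val - min_val) / (qmax - qmin)
    # Guard against a zero (or tiny) scale whose reciprocal would overflow.
    if scale == 0.0 or math.isinf(1.0 / scale):
        scale = 0.1
    # Nudge the zero point into the representable range [qmin, qmax].
    initial_zero_point = qmin - min_val / scale
    if initial_zero_point < qmin:
        zero_point = qmin
    elif initial_zero_point > qmax:
        zero_point = qmax
    else:
        zero_point = round(initial_zero_point)
    return scale, zero_point

# One token of the new Float test: values spanning [-0.5, 1.2], quantized to int8.
print(choose_qparams_asymmetric(-0.5, 1.2, -128, 127))  # ~(0.00666667, -53)

Those numbers line up with the first row of expected_scale / expected_zero_point in the new OpChooseQparamsPerTokenAsymmetricTensorOutTest cases further down in this diff, which is a useful sanity check when reading the refactored kernel.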
+ float min = torch::executor::vec_minf(x_fp32, token_dim_size); + float max = torch::executor::vec_maxf(x_fp32, token_dim_size); + double scale; + int32_t zero_point; + calculate_scale_and_zero_point(min, max, qmin, qmax, scale, zero_point); + scale_out.mutable_data_ptr()[i] = scale; + zero_point_out.mutable_data_ptr()[i] = zero_point; + x_fp32 += token_dim_size; + } } } // namespace @@ -180,6 +248,54 @@ ::std::tuple choose_qparams_tensor_out( input, quant_min, quant_max, eps, dtype, scale_out, zero_point_out); } +std::tuple choose_qparams_per_token_asymmetric_out( + const Tensor& input, + ScalarType dtype, + Tensor& scale_out, + Tensor& zero_point_out) { + int64_t quant_min = -128; + int64_t quant_max = 127; + exec_aten::SizesType output_sizes[kTensorDimensionLimit]; + for (ssize_t i = 0; i < input.dim() - 1; i++) { + output_sizes[i] = input.size(i); + } + output_sizes[input.dim() - 1] = 1; + size_t output_dim = input.dim(); + torch::executor::Error err = + resize_tensor(scale_out, {output_sizes, output_dim}); + ET_CHECK_MSG( + err == torch::executor::Error::Ok, + "Failed to resize scale_out Tensor in choose_qparams"); + err = resize_tensor(zero_point_out, {output_sizes, output_dim}); + ET_CHECK_MSG( + err == torch::executor::Error::Ok, + "Failed to resize zero_point_out Tensor in choose_qparams"); + + check_quantize_per_tensor_args( + input, + quant_min, + quant_max, + dtype, + scale_out, + zero_point_out, + true /* is_per_token*/); + + choose_qparams_per_token( + input, quant_min, quant_max, scale_out, zero_point_out); + return {scale_out, zero_point_out}; +} + +::std::tuple choose_qparams_per_token_asymmetric_out( + RuntimeContext& context, + const Tensor& input, + ScalarType dtype, + Tensor& scale_out, + Tensor& zero_point_out) { + (void)context; + return choose_qparams_per_token_asymmetric_out( + input, dtype, scale_out, zero_point_out); +} + } // namespace native } // namespace executor } // namespace torch diff --git a/kernels/quantized/cpu/op_dequantize.cpp b/kernels/quantized/cpu/op_dequantize.cpp index 836af474015..8973d69cc31 100644 --- a/kernels/quantized/cpu/op_dequantize.cpp +++ b/kernels/quantized/cpu/op_dequantize.cpp @@ -168,6 +168,19 @@ Tensor& dequantize_per_tensor_tensor_args_out( return out; } +float get_scale(const Tensor& scale, size_t channel_ix) { + ET_CHECK_MSG( + (scale.scalar_type() == ScalarType::Double) || + (scale.scalar_type() == ScalarType::Float), + "scale.scalar_type() %" PRId8 " is not double or float type", + static_cast(scale.scalar_type())); + if (scale.scalar_type() == ScalarType::Double) { + return static_cast(scale.const_data_ptr()[channel_ix]); + } else { + return scale.const_data_ptr()[channel_ix]; + } +} + Tensor& dequantize_per_channel_out( const Tensor& input, const Tensor& scale, @@ -178,8 +191,6 @@ Tensor& dequantize_per_channel_out( ScalarType dtype, exec_aten::optional out_dtype, Tensor& out) { - torch::executor::Error err = resize_tensor(out, input.sizes()); - // normalize axis ET_CHECK_MSG( tensor_has_dim(input, axis), @@ -191,15 +202,6 @@ Tensor& dequantize_per_channel_out( axis += nonzero_dim(input); } - ET_CHECK_MSG( - err == torch::executor::Error::Ok, - "Failed to resize out Tensor in dequantize_per_channel_out"); - - ET_CHECK_MSG( - scale.scalar_type() == ScalarType::Float, - "scale.scalar_type() %" PRId8 " is not float type", - static_cast(scale.scalar_type())); - ET_CHECK_MSG( scale.numel() == input.size(axis), "scale.numel() %zd != input.size(axis) %zd", @@ -232,7 +234,6 @@ Tensor& dequantize_per_channel_out( dims[i] = i + 1; 
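The per-token variants added here (and in op_quantize.cpp below) treat every slice along the last dimension as one token, so scale and zero_point carry one value per token with a trailing dimension of 1. As a rough usage sketch against the decomposed reference ops that the new test_quant_dequant_per_token.py exercises (the functional choose_qparams_per_token_asymmetric call is assumed to be exposed by torch.ao.quantization.fx._decomposed; only quantize_per_token and dequantize_per_token are used verbatim in the new test file):

import torch
from torch.ao.quantization.fx._decomposed import quantized_decomposed_lib  # noqa: F401

# A (batch, seq, hidden) activation; each row along the last dim is one "token".
x = torch.randn(1, 2, 3)

# Per-token qparams: scale / zero_point hold one value per token, i.e. shape
# x.shape[:-1] + (1,), mirroring the resize done by
# choose_qparams_per_token_asymmetric_out above.
scale, zero_point = torch.ops.quantized_decomposed.choose_qparams_per_token_asymmetric(
    x, torch.int8
)

# Round-trip through the same reference ops the new kernels are compared against.
xq = torch.ops.quantized_decomposed.quantize_per_token(
    x, scale, zero_point, -128, 127, torch.int8
)
xdq = torch.ops.quantized_decomposed.dequantize_per_token(
    xq, scale, zero_point, -128, 127, torch.int8, torch.float32
)
print((x - xdq).abs().max())  # quantization error, roughly bounded by scale / 2 per token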
} } - const float* scale_data = scale.const_data_ptr(); const int64_t* zero_point_data; if (opt_zero_points.has_value()) { zero_point_data = opt_zero_points.value().const_data_ptr(); @@ -260,11 +261,11 @@ Tensor& dequantize_per_channel_out( axis == 0, "Axis must be 0 for a single dimensional tensors"); \ const optional dim; \ apply_over_dim( \ - [input_data_ptr, out_data_ptr, scale_data, zero_point_data]( \ + [input_data_ptr, out_data_ptr, zero_point_data, &scale]( \ size_t numel, size_t stride, size_t base_ix) { \ for (size_t i = 0; i < numel; i++) { \ size_t current_ix = base_ix * stride + i; \ - float _scale = scale_data[current_ix]; \ + float _scale = get_scale(scale, current_ix); \ int64_t zero_point = 0; \ if (zero_point_data != nullptr) { \ zero_point = zero_point_data[current_ix]; \ @@ -280,7 +281,7 @@ Tensor& dequantize_per_channel_out( break; \ } \ for (size_t channel_ix = 0; channel_ix < input.size(axis); ++channel_ix) { \ - float _scale = scale_data[channel_ix]; \ + float _scale = get_scale(scale, channel_ix); \ int64_t _zero_point = 0; \ if (zero_point_data != nullptr) { \ _zero_point = zero_point_data[channel_ix]; \ @@ -335,6 +336,11 @@ Tensor& dequantize_per_channel_out( exec_aten::optional out_dtype, Tensor& out) { (void)context; + torch::executor::Error err = resize_tensor(out, input.sizes()); + ET_CHECK_MSG( + err == torch::executor::Error::Ok, + "Failed to resize out Tensor in dequantize_per_channel_out"); + return dequantize_per_channel_out( input, scale, @@ -381,6 +387,79 @@ Tensor& dequantize_per_tensor_tensor_args_out( input, scale, zero_point, quant_min, quant_max, dtype, out_dtype, out); } +Tensor& dequantize_per_token_out( + const Tensor& input, + const Tensor& scale, + const Tensor& zero_points, + int64_t quant_min, + int64_t quant_max, + ScalarType dtype, + ScalarType out_dtype, + Tensor& out) { + // Refactor this into a util + size_t num_channels = 1; + for (size_t i = 0; i < input.dim() - 1; i++) { + num_channels *= input.size(i); + } + // This unfortunate change is needed because we compile op_quantize for aten + // mode as well + std::array input_sizes; + input_sizes[0] = static_cast(num_channels); + input_sizes[1] = + static_cast(input.size(input.dim() - 1)); +#ifdef USE_ATEN_LIB + Tensor reshaped_input = at::from_blob( + input.mutable_data_ptr(), + input_sizes, + at::TensorOptions(input.scalar_type())); +#else + std::array input_dim_order{0, 1}; + std::array input_strides; + dim_order_to_stride_nocheck( + input_sizes.data(), input_dim_order.data(), 2, input_strides.data()); + void* input_data = input.mutable_data_ptr(); + TensorImpl reshaped_input_impl = TensorImpl( + input.scalar_type(), + 2, + input_sizes.data(), + input_data, + input_dim_order.data(), + input_strides.data(), + TensorShapeDynamism::STATIC); + Tensor reshaped_input(&reshaped_input_impl); + torch::executor::Error err = resize_tensor(out, input.sizes()); + ET_CHECK_MSG( + err == torch::executor::Error::Ok, + "Failed to resize out Tensor in dequantize_per_channel_out"); +#endif + + return dequantize_per_channel_out( + reshaped_input, + scale, + zero_points, + 0, /* axis */ + quant_min, + quant_max, + dtype, + out_dtype, + out); +} + +Tensor& dequantize_per_token_out( + RuntimeContext& context, + const Tensor& input, + const Tensor& scale, + const Tensor& zero_points, + int64_t quant_min, + int64_t quant_max, + ScalarType dtype, + ScalarType out_dtype, + Tensor& out) { + (void)context; + return dequantize_per_token_out( + input, scale, zero_points, quant_min, quant_max, dtype, out_dtype, 
out); +} + } // namespace native } // namespace executor } // namespace torch diff --git a/kernels/quantized/cpu/op_quantize.cpp b/kernels/quantized/cpu/op_quantize.cpp index 065dc743d92..9e95b11d592 100644 --- a/kernels/quantized/cpu/op_quantize.cpp +++ b/kernels/quantized/cpu/op_quantize.cpp @@ -241,8 +241,6 @@ Tensor& quantize_per_channel_out( int64_t quant_max, ScalarType dtype, Tensor& out) { - torch::executor::Error err = resize_tensor(out, input.sizes()); - // normalize axis ET_CHECK_MSG( tensor_has_dim(input, axis), @@ -254,10 +252,6 @@ Tensor& quantize_per_channel_out( axis += nonzero_dim(input); } - ET_CHECK_MSG( - err == torch::executor::Error::Ok, - "Failed to resize out Tensor in quantize_per_channel_out"); - ET_CHECK_MSG( scale.scalar_type() == ScalarType::Double, "scale.scalar_type() %" PRId8 " is not double type", @@ -368,9 +362,76 @@ Tensor& quantize_per_channel_out( ScalarType dtype, Tensor& out) { (void)context; + torch::executor::Error err = resize_tensor(out, input.sizes()); + ET_CHECK_MSG( + err == torch::executor::Error::Ok, + "Failed to resize out Tensor in quantize_per_channel_out"); + return quantize_per_channel_out( input, scale, zero_point, axis, quant_min, quant_max, dtype, out); } + +Tensor& quantize_per_token_out( + const Tensor& input, + const Tensor& scale, + const Tensor& zero_point, + int64_t quant_min, + int64_t quant_max, + ScalarType dtype, + Tensor& out) { + size_t num_tokens = 1; + for (size_t i = 0; i < input.dim() - 1; i++) { + num_tokens *= input.size(i); + } +// This unfortunate change is needed because we compile op_quantize for aten +// mode as well +#ifdef USE_ATEN_LIB + std::vector sizes(2); + sizes[0] = num_tokens; + sizes[1] = input.size(input.dim() - 1); + Tensor reshaped_input = at::from_blob( + input.mutable_data_ptr(), sizes, at::TensorOptions(input.scalar_type())); +#else + std::array input_dim_order{0, 1}; + std::array input_sizes; + input_sizes[0] = num_tokens; + input_sizes[1] = input.size(input.dim() - 1); + std::array input_strides; + dim_order_to_stride_nocheck( + input_sizes.data(), input_dim_order.data(), 2, input_strides.data()); + void* input_data = input.mutable_data_ptr(); + TensorImpl reshaped_input_impl = TensorImpl( + input.scalar_type(), + 2, + input_sizes.data(), + input_data, + input_dim_order.data(), + input_strides.data(), + TensorShapeDynamism::STATIC); + Tensor reshaped_input(&reshaped_input_impl); + torch::executor::Error err = resize_tensor(out, input.sizes()); + ET_CHECK_MSG( + err == torch::executor::Error::Ok, + "Failed to resize out Tensor in quantize_per_channel_out"); +#endif + + return quantize_per_channel_out( + reshaped_input, scale, zero_point, 0, quant_min, quant_max, dtype, out); +} + +Tensor& quantize_per_token_out( + RuntimeContext& context, + const Tensor& input, + const Tensor& scale, + const Tensor& zero_point, + int64_t quant_min, + int64_t quant_max, + ScalarType dtype, + Tensor& out) { + (void)context; + return quantize_per_token_out( + input, scale, zero_point, quant_min, quant_max, dtype, out); +} } // namespace native } // namespace executor } // namespace torch diff --git a/kernels/quantized/quantized.yaml b/kernels/quantized/quantized.yaml index ca2360b7d80..eb7586bad77 100644 --- a/kernels/quantized/quantized.yaml +++ b/kernels/quantized/quantized.yaml @@ -81,3 +81,21 @@ kernels: - arg_meta: null kernel_name: torch::executor::quantize_per_tensor_tensor_args_out + +- func: quantized_decomposed::choose_qparams_per_token_asymmetric.out(Tensor input, ScalarType dtype, *, Tensor(a!) 
scale_out, Tensor(b!) zero_point_out) -> (Tensor(a!), Tensor(b!)) + variants: function + kernels: + - arg_meta: null + kernel_name: torch::executor::choose_qparams_per_token_asymmetric_out + +- func: quantized_decomposed::quantize_per_token.out(Tensor input, Tensor scales, Tensor zero_points, int quant_min, int quant_max, ScalarType dtype, *, Tensor(a!) out) -> Tensor(a!) + variants: function + kernels: + - arg_meta: null + kernel_name: torch::executor::quantize_per_token_out + +- func: quantized_decomposed::dequantize_per_token.out(Tensor input, Tensor scales, Tensor zero_points, int quant_min, int quant_max, ScalarType dtype, ScalarType output_dtype, *, Tensor(a!) out) -> Tensor(a!) + variants: function + kernels: + - arg_meta: null + kernel_name: torch::executor::dequantize_per_token_out diff --git a/kernels/quantized/targets.bzl b/kernels/quantized/targets.bzl index fd35ad3728d..829cd7e9aeb 100644 --- a/kernels/quantized/targets.bzl +++ b/kernels/quantized/targets.bzl @@ -16,14 +16,17 @@ def define_common_targets(): ops = [ "quantized_decomposed::add.out", "quantized_decomposed::choose_qparams.Tensor_out", + "quantized_decomposed::choose_qparams_per_token_asymmetric.out", "quantized_decomposed::dequantize_per_channel.out", "quantized_decomposed::dequantize_per_tensor.out", "quantized_decomposed::dequantize_per_tensor.Tensor_out", + "quantized_decomposed::dequantize_per_token.out", "quantized_decomposed::mixed_linear.out", "quantized_decomposed::mixed_mm.out", "quantized_decomposed::quantize_per_channel.out", "quantized_decomposed::quantize_per_tensor.out", "quantized_decomposed::quantize_per_tensor.Tensor_out", + "quantized_decomposed::quantize_per_token.out", ], define_static_targets = True, ) diff --git a/kernels/quantized/test/TARGETS b/kernels/quantized/test/TARGETS index ec1ddacfc41..a820e3da3fa 100644 --- a/kernels/quantized/test/TARGETS +++ b/kernels/quantized/test/TARGETS @@ -1,4 +1,5 @@ load("@fbcode_macros//build_defs:python_unittest.bzl", "python_unittest") +load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") load(":targets.bzl", "define_common_targets") oncall("executorch") @@ -18,3 +19,36 @@ python_unittest( "//executorch/kernels/quantized:quantized_ops_lib", ], ) + +runtime.cxx_library( + name = "quantized_ops_for_test_lib", + srcs = [ + "quantized_ops_aot_register.cpp", + ], + visibility = [ + "//executorch/...", + "@EXECUTORCH_CLIENTS", + ], + deps = [ + "//executorch/extension/aten_util:aten_bridge", + "//executorch/kernels/quantized/cpu:op_dequantize", + "//executorch/kernels/quantized/cpu:op_quantize", + "//executorch/runtime/core/exec_aten:lib", + ], + external_deps = [ + "libtorch", + ], +) + +python_unittest( + name = "test_quant_dequant_per_token", + srcs = [ + "test_quant_dequant_per_token.py", + ], + preload_deps = [ + ":quantized_ops_for_test_lib", + ], + deps = [ + "//caffe2:torch", + ], +) diff --git a/kernels/quantized/test/op_choose_qparams_test.cpp b/kernels/quantized/test/op_choose_qparams_test.cpp index e7acfb0cf46..5cc3fc21169 100644 --- a/kernels/quantized/test/op_choose_qparams_test.cpp +++ b/kernels/quantized/test/op_choose_qparams_test.cpp @@ -11,6 +11,7 @@ #include #include #include + #include #include @@ -21,6 +22,7 @@ using exec_aten::ArrayRef; using exec_aten::Scalar; using exec_aten::ScalarType; using exec_aten::Tensor; +using torch::executor::native::choose_qparams_per_token_asymmetric_out; using torch::executor::native::choose_qparams_tensor_out; using torch::executor::testing::TensorFactory; @@ -28,6 +30,7 @@ using 
torch::executor::testing::TensorFactory; /// zeros(). template void test_dtype() { + et_pal_init(); TensorFactory tf_float; TensorFactory tf_double; TensorFactory tf_long; @@ -48,6 +51,115 @@ void test_dtype() { EXPECT_TENSOR_EQ(zero_point_out, expected_zero_point); } -TEST(OpChooseQparamsTensorOutTest, AllDtypesSupported) { - test_dtype(); +TEST(OpChooseQparamsPerTokenAsymmetricTensorOutTest, Float) { + et_pal_init(); + TensorFactory tf_float; + TensorFactory tf_double; + TensorFactory tf_long; + + Tensor input = tf_float.make({2, 3}, {-0.5, 0.3, 1.2, 0.1, -0.8, 2.1}); + Tensor scale_out = tf_double.zeros({2, 1}); + Tensor zero_point_out = tf_long.zeros({2, 1}); + Tensor expected_scale = tf_double.make({2, 1}, {0.00666667, 0.0113725485}); + Tensor expected_zero_point = tf_long.make({2, 1}, {-53, -58}); + + choose_qparams_per_token_asymmetric_out( + input, ScalarType::Float, scale_out, zero_point_out); + + EXPECT_TENSOR_CLOSE_WITH_TOL(scale_out, expected_scale, 1e-4, 1e-4); + EXPECT_TENSOR_EQ(zero_point_out, expected_zero_point); +} + +TEST(OpChooseQparamsPerTokenAsymmetricTensorOutTest, ExtraDimFloat) { + et_pal_init(); + TensorFactory tf_float; + TensorFactory tf_double; + TensorFactory tf_long; + + Tensor input = tf_float.make({1, 2, 3}, {-0.5, 0.3, 1.2, 0.1, -0.8, 2.1}); + Tensor scale_out = tf_double.zeros({1, 2, 1}); + Tensor zero_point_out = tf_long.zeros({1, 2, 1}); + Tensor expected_scale = tf_double.make({1, 2, 1}, {0.00666667, 0.0113725485}); + Tensor expected_zero_point = tf_long.make({1, 2, 1}, {-53, -58}); + + choose_qparams_per_token_asymmetric_out( + input, ScalarType::Float, scale_out, zero_point_out); + + EXPECT_TENSOR_CLOSE_WITH_TOL(scale_out, expected_scale, 1e-4, 1e-4); + EXPECT_TENSOR_EQ(zero_point_out, expected_zero_point); +} + +TEST(OpChooseQparamsPerTokenAsymmetricTensorOutTest, LargeArray) { + et_pal_init(); + TensorFactory tf_float; + TensorFactory tf_double; + TensorFactory tf_long; + + Tensor input = tf_float.make( + {5, 17}, + {0.41654, 0.26599, 0.4141, 0.83809, 0.02938, 0.12199, 0.53667, + 0.799, 0.6606, 0.46657, 0.66142, 0.71787, 0.56098, 0.30202, + 0.059377, 0.85473, 0.8017, 0.2703, 0.44299, 0.49045, 0.75581, + 0.24429, 0.43906, 0.78652, 0.83885, 0.31034, 0.76534, 0.74422, + 0.62549, 0.80006, 0.38144, 0.70652, 0.33553, 0.89136, 0.49126, + 0.072916, 0.75654, 0.82057, 0.083848, 0.29753, 0.62718, 0.95579, + 0.83097, 0.47293, 0.15666, 0.6248, 0.21672, 0.14626, 0.71834, + 0.93664, 0.23382, 0.68931, 0.70866, 0.60545, 0.98648, 0.30335, + 0.62439, 0.19195, 0.1923, 0.75638, 0.81114, 0.34778, 0.0070671, + 0.50918, 0.19698, 0.19969, 0.57687, 0.062786, 0.18447, 0.22961, + 0.29656, 0.25486, 0.75965, 0.11328, 0.86468, 0.21264, 0.99591, + 0.75231, 0.97834, 0.042441, 0.39978, 0.9633, 0.9297, 0.12188, + 0.73564}); + Tensor scale_out = tf_double.zeros({5, 1}); + Tensor zero_point_out = tf_long.zeros({5, 1}); + Tensor expected_scale = tf_double.make( + {5, 1}, {0.0033519, 0.0034955, 0.0037482, 0.0038685, 0.0039055}); + Tensor expected_zero_point = + tf_long.make({5, 1}, {-128, -128, -128, -128, -128}); + + choose_qparams_per_token_asymmetric_out( + input, ScalarType::Float, scale_out, zero_point_out); + + EXPECT_TENSOR_CLOSE_WITH_TOL(scale_out, expected_scale, 1e-5, 1e-5); + EXPECT_TENSOR_EQ(zero_point_out, expected_zero_point); +} + +TEST(OpChooseQparamsPerTokenAsymmetricTensorOutTest, DynamicShapeFloat) { + et_pal_init(); + TensorFactory tf_float; + TensorFactory tf_double; + TensorFactory tf_long; + + Tensor input = tf_float.make({1, 2, 3}, {-0.5, 0.3, 1.2, 0.1, -0.8, 
2.1}); + Tensor scale_out = tf_double.zeros( + {1, 5, 1}, torch::executor::TensorShapeDynamism::DYNAMIC_BOUND); + Tensor zero_point_out = tf_long.zeros( + {1, 5, 1}, torch::executor::TensorShapeDynamism::DYNAMIC_BOUND); + Tensor expected_scale = tf_double.make({1, 2, 1}, {0.00666667, 0.0113725485}); + Tensor expected_zero_point = tf_long.make({1, 2, 1}, {-53, -58}); + + choose_qparams_per_token_asymmetric_out( + input, ScalarType::Float, scale_out, zero_point_out); + + EXPECT_TENSOR_CLOSE_WITH_TOL(scale_out, expected_scale, 1e-4, 1e-4); + EXPECT_TENSOR_EQ(zero_point_out, expected_zero_point); + + Tensor new_input = tf_float.make( + {1, 5, 8}, + {5.2254, 5.6041, 5.7653, -1.0126, -0.86126, -0.1606, -0.99196, + -1.067, 5.5913, 5.7713, 5.4901, -0.43128, -1.1759, -0.60466, + -0.82913, -0.73623, 5.4588, 5.4066, 5.2644, -0.89692, -0.16866, + -0.63169, -0.42352, -0.48866, 5.594, 5.5223, 5.5277, -0.17658, + -0.30669, -1.1777, -0.65389, -0.36422, 5.6375, 5.1857, 5.0743, + -0.46654, -0.43817, -0.41506, -0.94515, -0.60247}); + Tensor new_expected_scale = tf_double.make( + {1, 5, 1}, {0.026793, 0.027244, 0.024924, 0.026556, 0.025814}); + Tensor new_expected_zero_point = + tf_long.make({1, 5, 1}, {-88, -85, -92, -84, -91}); + + choose_qparams_per_token_asymmetric_out( + new_input, ScalarType::Float, scale_out, zero_point_out); + + EXPECT_TENSOR_CLOSE_WITH_TOL(scale_out, new_expected_scale, 1e-4, 1e-4); + EXPECT_TENSOR_EQ(zero_point_out, new_expected_zero_point); } diff --git a/kernels/quantized/test/op_dequantize_test.cpp b/kernels/quantized/test/op_dequantize_test.cpp index 4cdebb662b7..8aa17772baf 100644 --- a/kernels/quantized/test/op_dequantize_test.cpp +++ b/kernels/quantized/test/op_dequantize_test.cpp @@ -11,6 +11,7 @@ #include #include #include +#include #include #include @@ -57,10 +58,12 @@ void test_dtype() { } TEST(OpDequantizeOutTest, AllDtypesSupported) { + et_pal_init(); test_dtype(); } TEST(OpDequantizeOutTest, NonWholeNumbers) { + et_pal_init(); TensorFactory tf; Tensor input = tf.full({3, 5}, 100); @@ -87,6 +90,7 @@ TEST(OpDequantizeOutTest, NonWholeNumbers) { } TEST(OpDequantizeOutTest, TensorArgOverload) { + et_pal_init(); TensorFactory tf_byte; TensorFactory tf_double; TensorFactory tf_long; @@ -115,12 +119,13 @@ TEST(OpDequantizeOutTest, TensorArgOverload) { } TEST(OpDequantizeOutTest, DequantizePerChannel) { + et_pal_init(); TensorFactory tf_byte; - TensorFactory tf_float; + TensorFactory tf_double; TensorFactory tf_long; Tensor input = tf_byte.full({3, 2}, 100); - Tensor scale = tf_float.make({2}, {0.5, 1}); + Tensor scale = tf_double.make({2}, {0.5, 1}); Tensor zero_point = tf_long.make({2}, {30, 60}); int64_t quant_min = 0; int64_t quant_max = 255; @@ -145,7 +150,7 @@ TEST(OpDequantizeOutTest, DequantizePerChannel) { // Test with a different axis out = tfo.zeros({3, 2}); - scale = tf_float.make({3}, {0.5, 0.75, 1}); + scale = tf_double.make({3}, {0.5, 0.75, 1}); zero_point = tf_long.make({3}, {30, 50, 60}); // (100 - 30) * 0.5 // (100 - 50) * 0.75 @@ -167,7 +172,7 @@ TEST(OpDequantizeOutTest, DequantizePerChannel) { // Test with a different axis out = tfo.zeros({3}); input = tf_byte.make({3}, {100, 100, 100}); - scale = tf_float.make({3}, {0.5, 0.75, 1}); + scale = tf_double.make({3}, {0.5, 0.75, 1}); zero_point = tf_long.make({3}, {30, 50, 60}); // (100 - 30) * 0.5 // (100 - 50) * 0.75 diff --git a/kernels/quantized/test/quantized_ops_aot_register.cpp b/kernels/quantized/test/quantized_ops_aot_register.cpp new file mode 100644 index 00000000000..e20f719c1e5 --- /dev/null +++ 
b/kernels/quantized/test/quantized_ops_aot_register.cpp @@ -0,0 +1,137 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include +#include +#include + +#include + +namespace torch { +namespace executor { + +namespace native { + +Tensor& quantize_per_token_out( + RuntimeContext& context, + const Tensor& input, + const Tensor& scale, + const Tensor& zero_point, + int64_t quant_min, + int64_t quant_max, + ScalarType dtype, + Tensor& out); + +Tensor& quantize_per_token_out_no_context( + const Tensor& input, + const Tensor& scale, + const Tensor& zero_point, + int64_t quant_min, + int64_t quant_max, + ScalarType dtype, + Tensor& out) { + exec_aten::RuntimeContext context{}; + ::torch::executor::runtime_init(); + quantize_per_token_out( + context, input, scale, zero_point, quant_min, quant_max, dtype, out); + return out; +} + +at::Tensor quantize_per_token_aten( + const at::Tensor& input, + const at::Tensor& scale, + const at::Tensor& zero_point, + int64_t quant_min, + int64_t quant_max, + c10::ScalarType dtype) { + auto sizes = input.sizes().vec(); + auto output = at::zeros(sizes, dtype); + TORCH_CHECK(dtype == c10::ScalarType::Char, "dtype must be char"); + WRAP_TO_ATEN(quantize_per_token_out_no_context, 6) + (input, scale, zero_point, quant_min, quant_max, ScalarType::Char, output); + return output; +} + +Tensor& dequantize_per_token_out( + RuntimeContext& context, + const Tensor& input, + const Tensor& scale, + const Tensor& zero_point, + int64_t quant_min, + int64_t quant_max, + ScalarType dtype, + ScalarType out_dtype, + Tensor& out); + +Tensor& dequantize_per_token_out_no_context( + const Tensor& input, + const Tensor& scale, + const Tensor& zero_point, + int64_t quant_min, + int64_t quant_max, + ScalarType dtype, + ScalarType out_dtype, + Tensor& out) { + exec_aten::RuntimeContext context{}; + ::torch::executor::runtime_init(); + dequantize_per_token_out( + context, + input, + scale, + zero_point, + quant_min, + quant_max, + dtype, + out_dtype, + out); + return out; +} + +at::Tensor dequantize_per_token_aten( + const at::Tensor& input, + const at::Tensor& scale, + const at::Tensor& zero_point, + int64_t quant_min, + int64_t quant_max, + c10::ScalarType dtype, + c10::ScalarType out_dtype) { + auto sizes = input.sizes().vec(); + auto output = at::zeros(sizes, out_dtype); + TORCH_CHECK(dtype == c10::ScalarType::Char, "dtype must be char"); + TORCH_CHECK(out_dtype == c10::ScalarType::Float, "out_dtype must be float"); + WRAP_TO_ATEN(dequantize_per_token_out_no_context, 7) + (input, + scale, + zero_point, + quant_min, + quant_max, + ScalarType::Char, + ScalarType::Float, + output); + return output; +} + +} // namespace native +} // namespace executor +} // namespace torch + +TORCH_LIBRARY(et_quant_test, m) { + m.def( + "quantize_per_token(Tensor input, Tensor scale, Tensor zero_points, int quant_min, int quant_max, ScalarType dtype) -> Tensor"); + m.def( + "dequantize_per_token(Tensor input, Tensor scale, Tensor zero_points, int quant_min, int quant_max, ScalarType dtype, ScalarType out_dtype) -> Tensor"); +} + +TORCH_LIBRARY_IMPL(et_quant_test, CompositeExplicitAutograd, m) { + m.impl( + "quantize_per_token", torch::executor::native::quantize_per_token_aten); + m.impl( + "dequantize_per_token", + torch::executor::native::dequantize_per_token_aten); +} diff --git 
a/kernels/quantized/test/test_quant_dequant_per_token.py b/kernels/quantized/test/test_quant_dequant_per_token.py new file mode 100644 index 00000000000..1286baea597 --- /dev/null +++ b/kernels/quantized/test/test_quant_dequant_per_token.py @@ -0,0 +1,149 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# pyre-unsafe + +import unittest + +import torch +from torch.ao.quantization.fx._decomposed import quantized_decomposed_lib # noqa: F401 + + +class QuantizePerTokenTest(unittest.TestCase): + + def test_quantize_per_token(self): + input_tensor = torch.tensor( + [[-0.5, 0.3, 1.2], [0.1, -0.8, 2.1], [-5, 1, 2]], dtype=torch.float32 + ) + scale = torch.tensor([0.5, 0.8, 1.0], dtype=torch.float64) + scale = scale.unsqueeze(-1) + zero_point = torch.tensor([-1, -2, 0]) + zero_point = zero_point.unsqueeze(-1) + quantized_tensor = torch.ops.quantized_decomposed.quantize_per_token( + input_tensor, scale, zero_point, -128, 127, torch.int8 + ) + expected_quantized_tensor = torch.ops.et_quant_test.quantize_per_token( + input_tensor, scale, zero_point, -128, 127, torch.int8 + ) + + self.assertTrue(torch.equal(quantized_tensor, expected_quantized_tensor)) + + def test_quantize_per_token_large_tensor(self): + input_tensor = torch.rand((8, 32)) + scale = torch.rand((8, 1), dtype=torch.float64) + zero_point = torch.randint(0, 10, (8, 1)) + quantized_tensor = torch.ops.quantized_decomposed.quantize_per_token( + input_tensor, scale, zero_point, -128, 127, torch.int8 + ) + expected_quantized_tensor = torch.ops.et_quant_test.quantize_per_token( + input_tensor, scale, zero_point, -128, 127, torch.int8 + ) + + self.assertTrue(torch.equal(quantized_tensor, expected_quantized_tensor)) + + def test_quantize_per_token_high_rank(self): + input_tensor = torch.rand((1, 3, 8, 32)) + scale = torch.rand((1, 3, 8, 1), dtype=torch.float64) + zero_point = torch.randint(0, 10, (1, 3, 8, 1)) + quantized_tensor = torch.ops.quantized_decomposed.quantize_per_token( + input_tensor, scale, zero_point, -128, 127, torch.int8 + ) + expected_quantized_tensor = torch.ops.et_quant_test.quantize_per_token( + input_tensor, scale, zero_point, -128, 127, torch.int8 + ) + + self.assertTrue(torch.equal(quantized_tensor, expected_quantized_tensor)) + + def test_quantize_per_token_dynamic(self): + input_tensor = torch.rand((1, 1, 8, 1)) + scale = torch.rand((1, 1, 8, 1), dtype=torch.float64) + zero_point = torch.randint(0, 10, (1, 1, 8, 1)) + quantized_tensor = torch.ops.quantized_decomposed.quantize_per_token( + input_tensor, scale, zero_point, -128, 127, torch.int8 + ) + expected_quantized_tensor = torch.ops.et_quant_test.quantize_per_token( + input_tensor, scale, zero_point, -128, 127, torch.int8 + ) + + self.assertTrue(torch.equal(quantized_tensor, expected_quantized_tensor)) + + input_tensor = torch.rand((1, 3, 8, 1)) + scale = torch.rand((1, 3, 8, 1), dtype=torch.float64) + zero_point = torch.randint(0, 10, (1, 3, 8, 1)) + quantized_tensor = torch.ops.quantized_decomposed.quantize_per_token( + input_tensor, scale, zero_point, -128, 127, torch.int8 + ) + expected_quantized_tensor = torch.ops.et_quant_test.quantize_per_token( + input_tensor, scale, zero_point, -128, 127, torch.int8 + ) + + self.assertTrue(torch.equal(quantized_tensor, expected_quantized_tensor)) + + def test_dequantize_per_token(self): + input_tensor = torch.randint(-50, 120, (3, 3), dtype=torch.int8) + scale 
= torch.tensor([0.5, 0.8, 1.0], dtype=torch.float64) + scale = scale.unsqueeze(-1) + zero_point = torch.tensor([-1, -2, 0]) + zero_point = zero_point.unsqueeze(-1) + dequantized_tensor = torch.ops.quantized_decomposed.dequantize_per_token( + input_tensor, scale, zero_point, -128, 127, torch.int8, torch.float32 + ) + expected_dequantized_tensor = torch.ops.et_quant_test.dequantize_per_token( + input_tensor, scale, zero_point, -128, 127, torch.int8, torch.float32 + ) + + self.assertTrue(torch.allclose(dequantized_tensor, expected_dequantized_tensor)) + + def test_dequantize_per_token_large_tensor(self): + input_tensor = torch.randint(-50, 120, (8, 32), dtype=torch.int8) + scale = torch.rand((8, 1), dtype=torch.float64) + zero_point = torch.randint(0, 10, (8, 1)) + dequantized_tensor = torch.ops.quantized_decomposed.dequantize_per_token( + input_tensor, scale, zero_point, -128, 127, torch.int8, torch.float32 + ) + expected_dequantized_tensor = torch.ops.et_quant_test.dequantize_per_token( + input_tensor, scale, zero_point, -128, 127, torch.int8, torch.float32 + ) + + self.assertTrue(torch.allclose(dequantized_tensor, expected_dequantized_tensor)) + + def test_dequantize_per_token_high_rank(self): + input_tensor = torch.randint(-50, 120, (1, 3, 8, 32), dtype=torch.int8) + scale = torch.rand((1, 3, 8, 1), dtype=torch.float64) + zero_point = torch.randint(0, 10, (1, 3, 8, 1)) + dequantized_tensor = torch.ops.quantized_decomposed.dequantize_per_token( + input_tensor, scale, zero_point, -128, 127, torch.int8, torch.float32 + ) + expected_dequantized_tensor = torch.ops.et_quant_test.dequantize_per_token( + input_tensor, scale, zero_point, -128, 127, torch.int8, torch.float32 + ) + + self.assertTrue(torch.allclose(dequantized_tensor, expected_dequantized_tensor)) + + def test_dequantize_per_token_dynamic(self): + input_tensor = torch.randint(-50, 120, (1, 1, 8, 32), dtype=torch.int8) + scale = torch.rand((1, 1, 8, 1), dtype=torch.float64) + zero_point = torch.randint(0, 10, (1, 1, 8, 1)) + dequantized_tensor = torch.ops.quantized_decomposed.dequantize_per_token( + input_tensor, scale, zero_point, -128, 127, torch.int8, torch.float32 + ) + expected_dequantized_tensor = torch.ops.et_quant_test.dequantize_per_token( + input_tensor, scale, zero_point, -128, 127, torch.int8, torch.float32 + ) + + self.assertTrue(torch.allclose(dequantized_tensor, expected_dequantized_tensor)) + + input_tensor = torch.randint(-50, 120, (1, 3, 8, 32), dtype=torch.int8) + scale = torch.rand((1, 3, 8, 1), dtype=torch.float64) + zero_point = torch.randint(0, 10, (1, 3, 8, 1)) + dequantized_tensor = torch.ops.quantized_decomposed.dequantize_per_token( + input_tensor, scale, zero_point, -128, 127, torch.int8, torch.float32 + ) + expected_dequantized_tensor = torch.ops.et_quant_test.dequantize_per_token( + input_tensor, scale, zero_point, -128, 127, torch.int8, torch.float32 + ) + + self.assertTrue(torch.allclose(dequantized_tensor, expected_dequantized_tensor)) diff --git a/kernels/test/BinaryLogicalOpTest.cpp b/kernels/test/BinaryLogicalOpTest.cpp new file mode 100644 index 00000000000..7557e7c9068 --- /dev/null +++ b/kernels/test/BinaryLogicalOpTest.cpp @@ -0,0 +1,27 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include + +namespace torch::executor::testing { + +void BinaryLogicalOpTest::test_all_dtypes() { +#define TEST_ENTRY(ctype, dtype) \ + test_op_out(); + ET_FORALL_REALHBF16_TYPES(TEST_ENTRY); +#undef TEST_ENTRY +#define TEST_ENTRY(ctype, dtype) \ + test_op_out(); + ET_FORALL_REALHBF16_TYPES(TEST_ENTRY); +#undef TEST_ENTRY +#define TEST_ENTRY(ctype, dtype) \ + test_op_out(); + ET_FORALL_REALHBF16_TYPES(TEST_ENTRY); +#undef TEST_ENTRY +} +} // namespace torch::executor::testing diff --git a/kernels/test/BinaryLogicalOpTest.h b/kernels/test/BinaryLogicalOpTest.h new file mode 100644 index 00000000000..0cf412c3373 --- /dev/null +++ b/kernels/test/BinaryLogicalOpTest.h @@ -0,0 +1,72 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include +#include +#include +#include + +namespace torch::executor::testing { +class BinaryLogicalOpTest : public OperatorTest { + protected: + // Implement this to call the torch::executor::aten::op_outf function for the + // op. + virtual exec_aten::Tensor& op_out( + const exec_aten::Tensor& lhs, + const exec_aten::Tensor& rhs, + exec_aten::Tensor& out) = 0; + + // Scalar reference implementation of the function in question for testing. + virtual double op_reference(double x, double y) const = 0; + + template < + exec_aten::ScalarType IN_DTYPE, + exec_aten::ScalarType IN_DTYPE2, + exec_aten::ScalarType OUT_DTYPE> + void test_op_out() { + TensorFactory tf_in; + TensorFactory tf_in2; + TensorFactory tf_out; + + exec_aten::Tensor out = tf_out.zeros({1, 4}); + + using CTYPE1 = typename decltype(tf_in)::ctype; + std::vector test_vector1 = {0, CTYPE1(-1), CTYPE1(0), CTYPE1(31)}; + + using CTYPE2 = typename decltype(tf_in2)::ctype; + std::vector test_vector2 = { + CTYPE2(0), + CTYPE2(0), + CTYPE2(15), + CTYPE2(12), + }; + + std::vector expected_vector; + for (int ii = 0; ii < test_vector1.size(); ++ii) { + expected_vector.push_back( + op_reference(test_vector1[ii], test_vector2[ii])); + } + + op_out( + tf_in.make({1, 4}, test_vector1), + tf_in2.make({1, 4}, test_vector2), + out); + + EXPECT_TENSOR_CLOSE(out, tf_out.make({1, 4}, expected_vector)); + } + + void test_all_dtypes(); +}; + +#define IMPLEMENT_BINARY_LOGICAL_OP_TEST(TestName) \ + TEST_F(TestName, SimpleTestAllTypes) { \ + test_all_dtypes(); \ + } +} // namespace torch::executor::testing diff --git a/kernels/test/CMakeLists.txt b/kernels/test/CMakeLists.txt index 791c2184e9f..30ea8c79ab7 100644 --- a/kernels/test/CMakeLists.txt +++ b/kernels/test/CMakeLists.txt @@ -68,6 +68,7 @@ add_custom_target( ) set(all_test_sources + "BinaryLogicalOpTest.cpp" "op__to_dim_order_copy_test.cpp" "op_abs_test.cpp" "op_acos_test.cpp" @@ -211,7 +212,8 @@ set(all_test_sources "op_view_copy_test.cpp" "op_where_test.cpp" "op_zeros_test.cpp" -) + "UnaryUfuncRealHBBF16ToFloatHBF16Test.cpp" + ) set(_portable_kernels_test_sources ${all_test_sources} @@ -240,6 +242,7 @@ set(_optimized_kernels_test_sources "op_native_layer_norm_test.cpp" "op_neg_test.cpp" "op_sub_test.cpp" + "UnaryUfuncRealHBBF16ToFloatHBF16Test.cpp" ${CMAKE_CURRENT_BINARY_DIR}/include/portable/executorch/kernels/test/supported_features.cpp ) @@ -253,9 +256,12 @@ et_cxx_test( SOURCES ${_optimized_kernels_test_sources} EXTRA_LIBS + cpuinfo + extension_threadpool optimized_kernels optimized_ops_lib portable_kernels + pthreadpool eigen_blas ) 
add_dependencies(optimized_kernels_test generate_wrapper) diff --git a/kernels/test/UnaryUfuncRealHBBF16ToFloatHBF16Test.cpp b/kernels/test/UnaryUfuncRealHBBF16ToFloatHBF16Test.cpp new file mode 100644 index 00000000000..e3c38fc4710 --- /dev/null +++ b/kernels/test/UnaryUfuncRealHBBF16ToFloatHBF16Test.cpp @@ -0,0 +1,160 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include + +namespace torch::executor::testing { +void UnaryUfuncRealHBBF16ToFloatHBF16Test::test_bool_input() { + TensorFactory tf_bool; + TensorFactory tf_float; + + const std::vector sizes = {1, 2}; + + exec_aten::Tensor a = tf_bool.make(sizes, /*data=*/{false, true}); + exec_aten::Tensor out = tf_float.zeros(sizes); + exec_aten::Tensor res = tf_float.make( + sizes, + /*data=*/{(float)op_reference(false), (float)op_reference(true)}); + + EXPECT_TENSOR_CLOSE(op_out(a, out), res); +} + +void UnaryUfuncRealHBBF16ToFloatHBF16Test::test_mismatched_input_shapes_dies() { + if (get_supported_features()->is_aten) { + GTEST_SKIP() << "ATen kernel can handle mismatched input shapes"; + } + TensorFactory tf; + + exec_aten::Tensor a = tf.ones(/*sizes=*/{4}); + exec_aten::Tensor out = tf.ones(/*sizes=*/{2, 2}); + + ET_EXPECT_KERNEL_FAILURE(context_, op_out(a, out)); +} + +void UnaryUfuncRealHBBF16ToFloatHBF16Test:: + test_all_real_input_half_output_static_dynamism_support() { +#define TEST_ENTRY(ctype, dtype) \ + test_floating_point_op_out< \ + exec_aten::ScalarType::dtype, \ + exec_aten::ScalarType::Half>(); + ET_FORALL_REALHBF16_TYPES(TEST_ENTRY); +#undef TEST_ENTRY +} + +void UnaryUfuncRealHBBF16ToFloatHBF16Test:: + test_all_real_input_bfloat16_output_static_dynamism_support() { +#define TEST_ENTRY(ctype, dtype) \ + test_floating_point_op_out< \ + exec_aten::ScalarType::dtype, \ + exec_aten::ScalarType::BFloat16>(); + ET_FORALL_REALHBF16_TYPES(TEST_ENTRY); +#undef TEST_ENTRY +} + +void UnaryUfuncRealHBBF16ToFloatHBF16Test:: + test_all_real_input_float_output_static_dynamism_support() { +#define TEST_ENTRY(ctype, dtype) \ + test_floating_point_op_out< \ + exec_aten::ScalarType::dtype, \ + exec_aten::ScalarType::Float>(); + ET_FORALL_REALH_TYPES(TEST_ENTRY); +#undef TEST_ENTRY +} + +void UnaryUfuncRealHBBF16ToFloatHBF16Test:: + test_all_real_input_double_output_static_dynamism_support() { +#define TEST_ENTRY(ctype, dtype) \ + test_floating_point_op_out< \ + exec_aten::ScalarType::dtype, \ + exec_aten::ScalarType::Double>(); + ET_FORALL_REALH_TYPES(TEST_ENTRY); +#undef TEST_ENTRY +} + +void UnaryUfuncRealHBBF16ToFloatHBF16Test:: + test_all_real_input_half_output_bound_dynamism_support() { +#define TEST_ENTRY(ctype, dtype) \ + test_floating_point_op_out< \ + exec_aten::ScalarType::dtype, \ + exec_aten::ScalarType::Half>( \ + {10, 10}, exec_aten::TensorShapeDynamism::DYNAMIC_BOUND); + ET_FORALL_REALHBF16_TYPES(TEST_ENTRY); +#undef TEST_ENTRY +} + +void UnaryUfuncRealHBBF16ToFloatHBF16Test:: + test_all_real_input_bfloat16_output_bound_dynamism_support() { +#define TEST_ENTRY(ctype, dtype) \ + test_floating_point_op_out< \ + exec_aten::ScalarType::dtype, \ + exec_aten::ScalarType::BFloat16>( \ + {10, 10}, exec_aten::TensorShapeDynamism::DYNAMIC_BOUND); + ET_FORALL_REALHBF16_TYPES(TEST_ENTRY); +#undef TEST_ENTRY +} + +void UnaryUfuncRealHBBF16ToFloatHBF16Test:: + test_all_real_input_float_output_bound_dynamism_support() { +#define TEST_ENTRY(ctype, dtype) \ + 
test_floating_point_op_out< \ + exec_aten::ScalarType::dtype, \ + exec_aten::ScalarType::Float>( \ + {10, 10}, exec_aten::TensorShapeDynamism::DYNAMIC_BOUND); + ET_FORALL_REALH_TYPES(TEST_ENTRY); +#undef TEST_ENTRY +} + +void UnaryUfuncRealHBBF16ToFloatHBF16Test:: + test_all_real_input_double_output_bound_dynamism_support() { +#define TEST_ENTRY(ctype, dtype) \ + test_floating_point_op_out< \ + exec_aten::ScalarType::dtype, \ + exec_aten::ScalarType::Double>( \ + {10, 10}, exec_aten::TensorShapeDynamism::DYNAMIC_BOUND); + ET_FORALL_REALH_TYPES(TEST_ENTRY); +#undef TEST_ENTRY +} + +void UnaryUfuncRealHBBF16ToFloatHBF16Test:: + test_all_real_input_float_output_unbound_dynamism_support() { + if (!get_supported_features()->is_aten) { + GTEST_SKIP() << "Dynamic shape unbound not supported"; + } +#define TEST_ENTRY(ctype, dtype) \ + test_floating_point_op_out< \ + exec_aten::ScalarType::dtype, \ + exec_aten::ScalarType::Float>( \ + {1, 1}, exec_aten::TensorShapeDynamism::DYNAMIC_UNBOUND); + ET_FORALL_REALH_TYPES(TEST_ENTRY); +#undef TEST_ENTRY +} + +void UnaryUfuncRealHBBF16ToFloatHBF16Test:: + test_all_real_input_double_output_unbound_dynamism_support() { + if (!get_supported_features()->is_aten) { + GTEST_SKIP() << "Dynamic shape unbound not supported"; + } +#define TEST_ENTRY(ctype, dtype) \ + test_floating_point_op_out< \ + exec_aten::ScalarType::dtype, \ + exec_aten::ScalarType::Double>( \ + {1, 1}, exec_aten::TensorShapeDynamism::DYNAMIC_UNBOUND); + ET_FORALL_REALH_TYPES(TEST_ENTRY); +#undef TEST_ENTRY +} + +void UnaryUfuncRealHBBF16ToFloatHBF16Test::test_non_float_output_dtype_dies() { +#define TEST_ENTRY(ctype, dtype) \ + test_op_invalid_output_dtype_dies< \ + exec_aten::ScalarType::Float, \ + exec_aten::ScalarType::dtype>(); + ET_FORALL_INT_TYPES(TEST_ENTRY); +#undef TEST_ENTRY +} + +} // namespace torch::executor::testing diff --git a/kernels/test/UnaryUfuncRealHBBF16ToFloatHBF16Test.h b/kernels/test/UnaryUfuncRealHBBF16ToFloatHBF16Test.h new file mode 100644 index 00000000000..eeeb89b6ecb --- /dev/null +++ b/kernels/test/UnaryUfuncRealHBBF16ToFloatHBF16Test.h @@ -0,0 +1,189 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include +#include +#include +#include +#include + +#include + +namespace torch::executor::testing { +// Generic test harness for ops that use unary_ufunc_realhb_to_floath +// -- in other words, ops that just apply an elementwise function +// mapping to a float or half. +class UnaryUfuncRealHBBF16ToFloatHBF16Test : public OperatorTest { + protected: + // Implement this to call the torch::executor::aten::op_outf function for the + // op. + virtual exec_aten::Tensor& op_out( + const exec_aten::Tensor& self, + exec_aten::Tensor& out) = 0; + + // Scalar reference implementation of the function in question for testing. + virtual double op_reference(double x) const = 0; + + // The SupportedFeatures system assumes that it can build each test + // target with a separate SupportedFeatures (really just one + // portable, one optimzed but between one and the infinite, two is + // ridiculous and can't exist). We work around that by calling + // SupportedFeatures::get() in the concrete test translation + // unit. You need to declare an override, but we implement it for you + // in IMPLEMENT_UNARY_UFUNC_REALHB_TO_FLOATH_TEST. 
+ virtual SupportedFeatures* get_supported_features() const = 0; + + template + void test_floating_point_op_out( + const std::vector& out_shape = {1, 6}, + exec_aten::TensorShapeDynamism dynamism = + exec_aten::TensorShapeDynamism::STATIC) { + TensorFactory tf_in; + TensorFactory tf_out; + + exec_aten::Tensor out = tf_out.zeros(out_shape, dynamism); + + using IN_CTYPE = typename decltype(tf_in)::ctype; + using OUT_CTYPE = typename decltype(tf_out)::ctype; + std::vector test_vector = {0, 1, 3, 5, 10, 100}; + std::vector expected_vector; + for (int ii = 0; ii < test_vector.size(); ++ii) { + auto ref_result = this->op_reference(test_vector[ii]); + // Drop test cases with high magnitude results due to precision + // issues. + if ((std::abs(ref_result) > 1e30 || std::abs(ref_result) < -1e30)) { + test_vector[ii] = 2; + ref_result = this->op_reference(2); + } + expected_vector.push_back(ref_result); + } + + // clang-format off + op_out(tf_in.make({1, 6}, test_vector), out); + + auto expected = tf_out.make({1, 6}, expected_vector); + if (IN_DTYPE == ScalarType::BFloat16 || OUT_DTYPE == ScalarType::BFloat16) { + double rtol = executorch::runtime::testing::internal::kDefaultRtol; + // It appears we need a higher tolerance for at least some ATen + // tests, like aten_op_acosh_test. + if (get_supported_features()->is_aten) { + rtol = 3e-3; + } + EXPECT_TENSOR_CLOSE_WITH_TOL(out, expected, rtol, executorch::runtime::testing::internal::kDefaultBFloat16Atol); + } else if (IN_DTYPE == ScalarType::Half || OUT_DTYPE == ScalarType::Half) { + double rtol = executorch::runtime::testing::internal::kDefaultRtol; + // It appears we need a higher tolerance for at least some ATen + // tests, like aten_op_acosh_test. + if (get_supported_features()->is_aten) { + rtol = 1e-3; + } + EXPECT_TENSOR_CLOSE_WITH_TOL(out, expected, rtol, executorch::runtime::testing::internal::kDefaultHalfAtol); + } else { + EXPECT_TENSOR_CLOSE(out, expected); + } + // clang-format on + } + + // Unhandled output dtypes. 
+ template < + exec_aten::ScalarType INPUT_DTYPE, + exec_aten::ScalarType OUTPUT_DTYPE> + void test_op_invalid_output_dtype_dies() { + TensorFactory tf; + TensorFactory tf_out; + + const std::vector sizes = {2, 5}; + + exec_aten::Tensor in = tf.ones(sizes); + exec_aten::Tensor out = tf_out.zeros(sizes); + + ET_EXPECT_KERNEL_FAILURE(context_, op_out(in, out)); + } + + void test_bool_input(); + + void test_mismatched_input_shapes_dies(); + + void test_all_real_input_half_output_static_dynamism_support(); + + void test_all_real_input_bfloat16_output_static_dynamism_support(); + + void test_all_real_input_float_output_static_dynamism_support(); + + void test_all_real_input_double_output_static_dynamism_support(); + + void test_all_real_input_half_output_bound_dynamism_support(); + + void test_all_real_input_bfloat16_output_bound_dynamism_support(); + + void test_all_real_input_float_output_bound_dynamism_support(); + + void test_all_real_input_double_output_bound_dynamism_support(); + + void test_all_real_input_float_output_unbound_dynamism_support(); + + void test_all_real_input_double_output_unbound_dynamism_support(); + + void test_non_float_output_dtype_dies(); +}; + +#define IMPLEMENT_UNARY_UFUNC_REALHB_TO_FLOATH_TEST(TestName) \ + torch::executor::testing::SupportedFeatures* \ + TestName::get_supported_features() const { \ + return torch::executor::testing::SupportedFeatures::get(); \ + } \ + TEST_F(TestName, HandleBoolInput) { \ + test_bool_input(); \ + } \ + TEST_F(TestName, AllRealInputHalfOutputStaticDynamismSupport) { \ + test_all_real_input_half_output_static_dynamism_support(); \ + } \ + \ + TEST_F(TestName, AllRealInputBFloat16OutputStaticDynamismSupport) { \ + test_all_real_input_bfloat16_output_static_dynamism_support(); \ + } \ + \ + TEST_F(TestName, AllRealInputFloatOutputStaticDynamismSupport) { \ + test_all_real_input_float_output_static_dynamism_support(); \ + } \ + \ + TEST_F(TestName, AllRealInputDoubleOutputStaticDynamismSupport) { \ + test_all_real_input_double_output_static_dynamism_support(); \ + } \ + \ + TEST_F(TestName, AllRealInputBFloat16OutputBoundDynamismSupport) { \ + test_all_real_input_bfloat16_output_bound_dynamism_support(); \ + } \ + \ + TEST_F(TestName, AllRealInputFloatOutputBoundDynamismSupport) { \ + test_all_real_input_float_output_bound_dynamism_support(); \ + } \ + \ + TEST_F(TestName, AllRealInputDoubleOutputBoundDynamismSupport) { \ + test_all_real_input_double_output_bound_dynamism_support(); \ + } \ + \ + TEST_F(TestName, AllRealInputFloatOutputUnboundDynamismSupport) { \ + test_all_real_input_float_output_unbound_dynamism_support(); \ + } \ + \ + TEST_F(TestName, AllRealInputDoubleOutputUnboundDynamismSupport) { \ + test_all_real_input_double_output_unbound_dynamism_support(); \ + } \ + \ + TEST_F(TestName, AllNonFloatOutputDTypeDies) { \ + test_non_float_output_dtype_dies(); \ + } \ + \ + TEST_F(TestName, MismatchedInputShapesDies) { \ + test_mismatched_input_shapes_dies(); \ + } + +} // namespace torch::executor::testing diff --git a/kernels/test/custom_kernel_example/my_functions.yaml b/kernels/test/custom_kernel_example/my_functions.yaml index 72f1d2cf865..de5ce952ab4 100644 --- a/kernels/test/custom_kernel_example/my_functions.yaml +++ b/kernels/test/custom_kernel_example/my_functions.yaml @@ -5,4 +5,4 @@ - op: relu.out kernels: - arg_meta: null - kernel_name: torch::my_custom_kernel::my_relu_out + kernel_name: my_custom_kernels::my_relu_out diff --git a/kernels/test/custom_kernel_example/op_relu.cpp 
b/kernels/test/custom_kernel_example/op_relu.cpp index e59fbf4bd72..39be620d86b 100644 --- a/kernels/test/custom_kernel_example/op_relu.cpp +++ b/kernels/test/custom_kernel_example/op_relu.cpp @@ -12,14 +12,15 @@ #include #include -namespace torch { -namespace my_custom_kernel { +namespace my_custom_kernels { namespace native { -using Tensor = exec_aten::Tensor; -using ScalarType = exec_aten::ScalarType; -using executor::Error; +using exec_aten::ScalarType; +using exec_aten::Tensor; +using executorch::runtime::Error; using executorch::runtime::KernelRuntimeContext; +using executorch::runtime::resize_tensor; +using executorch::runtime::tensors_have_same_shape_and_dtype; namespace { @@ -67,7 +68,7 @@ my_relu_out(KernelRuntimeContext& context, const Tensor& input, Tensor& out) { resize(out, input.sizes()); ET_KERNEL_CHECK( context, - executor::tensors_have_same_shape_and_dtype(input, out), + tensors_have_same_shape_and_dtype(input, out), InvalidArgument, out); @@ -94,5 +95,4 @@ my_relu_out(KernelRuntimeContext& context, const Tensor& input, Tensor& out) { } } // namespace native -} // namespace my_custom_kernel -} // namespace torch +} // namespace my_custom_kernels diff --git a/kernels/test/op_acos_test.cpp b/kernels/test/op_acos_test.cpp index 9c9c9211be0..a95994d5074 100644 --- a/kernels/test/op_acos_test.cpp +++ b/kernels/test/op_acos_test.cpp @@ -7,164 +7,26 @@ */ #include // Declares the operator -#include -#include -#include -#include -#include +#include #include -using namespace ::testing; -using exec_aten::ScalarType; -using exec_aten::Tensor; -using exec_aten::TensorShapeDynamism; -using torch::executor::testing::TensorFactory; +#include -class OpAcosOutTest : public OperatorTest { +using exec_aten::Tensor; +class OpAcosOutTest + : public torch::executor::testing::UnaryUfuncRealHBBF16ToFloatHBF16Test { protected: - Tensor& op_acos_out(const Tensor& self, Tensor& out) { + Tensor& op_out(const Tensor& self, Tensor& out) override { return torch::executor::aten::acos_outf(context_, self, out); } - // Common testing for acos operator and all kinds of supported input types - template - void test_floating_point_acos_out( - const std::vector& out_shape = {1, 6}, - TensorShapeDynamism dynamism = TensorShapeDynamism::STATIC) { - TensorFactory tf_in; - TensorFactory tf_out; - - // Destination for the acos operator. - Tensor out = tf_out.zeros(out_shape, dynamism); - - // clang-format off - op_acos_out(tf_in.make({1, 6}, { 0, 1, 3, 5, 10, 100 }), out); - - // Check that it matches (or close to) the expected output. - EXPECT_TENSOR_CLOSE( - out, - tf_out.make({1, 6}, { 1.570796, 0.000000, NAN, NAN, NAN, NAN })); - // clang-format on + double op_reference(double x) const override { + return std::acos(x); } - // Unhandled output dtypes. 
- template - void test_acos_invalid_output_dtype_dies() { - TensorFactory tf; - TensorFactory tf_out; - - const std::vector sizes = {2, 5}; - - Tensor in = tf.ones(sizes); - Tensor out = tf_out.zeros(sizes); - - ET_EXPECT_KERNEL_FAILURE(context_, op_acos_out(in, out)); - } + torch::executor::testing::SupportedFeatures* get_supported_features() + const override; }; -TEST_F(OpAcosOutTest, HandleBoolInput) { - TensorFactory tf_bool; - TensorFactory tf_float; - - const std::vector sizes = {1, 2}; - - Tensor a = tf_bool.make(sizes, /*data=*/{false, true}); - Tensor out = tf_float.zeros(sizes); - Tensor res = tf_float.make(sizes, /*data=*/{1.570796, 0.000000}); - - EXPECT_TENSOR_CLOSE(op_acos_out(a, out), res); -} - -TEST_F(OpAcosOutTest, AllRealInputHalfOutputStaticDynamismSupport) { - if (torch::executor::testing::SupportedFeatures::get()->is_aten) { - GTEST_SKIP() << "Test Half support only for ExecuTorch mode"; - } -#define TEST_ENTRY(ctype, dtype) \ - test_floating_point_acos_out(); - ET_FORALL_REALH_TYPES(TEST_ENTRY); -#undef TEST_ENTRY -} - -TEST_F(OpAcosOutTest, AllRealInputFloatOutputStaticDynamismSupport) { -#define TEST_ENTRY(ctype, dtype) \ - test_floating_point_acos_out(); - ET_FORALL_REAL_TYPES(TEST_ENTRY); -#undef TEST_ENTRY -} - -TEST_F(OpAcosOutTest, AllRealInputDoubleOutputStaticDynamismSupport) { -#define TEST_ENTRY(ctype, dtype) \ - test_floating_point_acos_out(); - ET_FORALL_REAL_TYPES(TEST_ENTRY); -#undef TEST_ENTRY -} - -TEST_F(OpAcosOutTest, AllRealInputHalfOutputBoundDynamismSupport) { - if (torch::executor::testing::SupportedFeatures::get()->is_aten) { - GTEST_SKIP() << "Test Half support only for ExecuTorch mode"; - } -#define TEST_ENTRY(ctype, dtype) \ - test_floating_point_acos_out( \ - {10, 10}, TensorShapeDynamism::DYNAMIC_BOUND); - ET_FORALL_REALH_TYPES(TEST_ENTRY); -#undef TEST_ENTRY -} - -TEST_F(OpAcosOutTest, AllRealInputFloatOutputBoundDynamismSupport) { -#define TEST_ENTRY(ctype, dtype) \ - test_floating_point_acos_out( \ - {10, 10}, TensorShapeDynamism::DYNAMIC_BOUND); - ET_FORALL_REAL_TYPES(TEST_ENTRY); -#undef TEST_ENTRY -} - -TEST_F(OpAcosOutTest, AllRealInputDoubleOutputBoundDynamismSupport) { -#define TEST_ENTRY(ctype, dtype) \ - test_floating_point_acos_out( \ - {10, 10}, TensorShapeDynamism::DYNAMIC_BOUND); - ET_FORALL_REAL_TYPES(TEST_ENTRY); -#undef TEST_ENTRY -} - -TEST_F(OpAcosOutTest, AllRealInputFloatOutputUnboundDynamismSupport) { - if (!torch::executor::testing::SupportedFeatures::get()->is_aten) { - GTEST_SKIP() << "Dynamic shape unbound not supported"; - } -#define TEST_ENTRY(ctype, dtype) \ - test_floating_point_acos_out( \ - {1, 1}, TensorShapeDynamism::DYNAMIC_UNBOUND); - ET_FORALL_REAL_TYPES(TEST_ENTRY); -#undef TEST_ENTRY -} - -TEST_F(OpAcosOutTest, AllRealInputDoubleOutputUnboundDynamismSupport) { - if (!torch::executor::testing::SupportedFeatures::get()->is_aten) { - GTEST_SKIP() << "Dynamic shape unbound not supported"; - } -#define TEST_ENTRY(ctype, dtype) \ - test_floating_point_acos_out( \ - {1, 1}, TensorShapeDynamism::DYNAMIC_UNBOUND); - ET_FORALL_REAL_TYPES(TEST_ENTRY); -#undef TEST_ENTRY -} - -TEST_F(OpAcosOutTest, AllNonFloatOutputDTypeDies) { -#define TEST_ENTRY(ctype, dtype) \ - test_acos_invalid_output_dtype_dies(); - ET_FORALL_INT_TYPES(TEST_ENTRY); -#undef TEST_ENTRY -} - -// Mismatched shape tests. 
-TEST_F(OpAcosOutTest, MismatchedInputShapesDies) { - if (torch::executor::testing::SupportedFeatures::get()->is_aten) { - GTEST_SKIP() << "ATen kernel can handle mismatched input shapes"; - } - TensorFactory tf; - - Tensor a = tf.ones(/*sizes=*/{4}); - Tensor out = tf.ones(/*sizes=*/{2, 2}); - - ET_EXPECT_KERNEL_FAILURE(context_, op_acos_out(a, out)); -} +IMPLEMENT_UNARY_UFUNC_REALHB_TO_FLOATH_TEST(OpAcosOutTest) diff --git a/kernels/test/op_acosh_test.cpp b/kernels/test/op_acosh_test.cpp index ce01411fd3f..99ef815c6c0 100644 --- a/kernels/test/op_acosh_test.cpp +++ b/kernels/test/op_acosh_test.cpp @@ -7,144 +7,27 @@ */ #include // Declares the operator -#include -#include -#include -#include -#include +#include #include -using namespace ::testing; -using exec_aten::ScalarType; +#include + using exec_aten::Tensor; -using exec_aten::TensorShapeDynamism; -using torch::executor::testing::TensorFactory; -class OpAcoshOutTest : public OperatorTest { +class OpAcoshOutTest + : public torch::executor::testing::UnaryUfuncRealHBBF16ToFloatHBF16Test { protected: - Tensor& op_acosh_out(const Tensor& self, Tensor& out) { + Tensor& op_out(const Tensor& self, Tensor& out) override { return torch::executor::aten::acosh_outf(context_, self, out); } - // Common testing for acosh operator and all kinds of supported input types - template - void test_floating_point_acosh_out( - const std::vector& out_shape = {1, 6}, - TensorShapeDynamism dynamism = TensorShapeDynamism::STATIC) { - TensorFactory tf_in; - TensorFactory tf_out; - - // Destination for the acosh operator. - Tensor out = tf_out.zeros(out_shape, dynamism); - - // clang-format off - op_acosh_out(tf_in.make({1, 6}, { 0, 1, 3, 5, 10, 100 }), out); - - // Check that it matches (or close to) the expected output. - EXPECT_TENSOR_CLOSE( - out, - tf_out.make({1, 6}, { NAN, 0.000000, 1.762747, 2.292432, 2.993223, 5.298292 })); - // clang-format on + double op_reference(double x) const override { + return std::acosh(x); } - // Unhandled output dtypes. 
- template - void test_acosh_invalid_output_dtype_dies() { - TensorFactory tf; - TensorFactory tf_out; - - const std::vector sizes = {2, 5}; - - Tensor in = tf.ones(sizes); - Tensor out = tf_out.zeros(sizes); - - ET_EXPECT_KERNEL_FAILURE(context_, op_acosh_out(in, out)); - } + torch::executor::testing::SupportedFeatures* get_supported_features() + const override; }; -TEST_F(OpAcoshOutTest, HandleBoolInput) { - TensorFactory tf_bool; - TensorFactory tf_float; - - const std::vector sizes = {1, 2}; - - Tensor a = tf_bool.make(sizes, /*data=*/{false, true}); - Tensor out = tf_float.zeros(sizes); - Tensor res = tf_float.make(sizes, /*data=*/{NAN, 0.000000}); - - EXPECT_TENSOR_CLOSE(op_acosh_out(a, out), res); -} - -TEST_F(OpAcoshOutTest, AllRealInputFloatOutputStaticDynamismSupport) { -#define TEST_ENTRY(ctype, dtype) \ - test_floating_point_acosh_out(); - ET_FORALL_REAL_TYPES(TEST_ENTRY); -#undef TEST_ENTRY -} - -TEST_F(OpAcoshOutTest, AllRealInputDoubleOutputStaticDynamismSupport) { -#define TEST_ENTRY(ctype, dtype) \ - test_floating_point_acosh_out(); - ET_FORALL_REAL_TYPES(TEST_ENTRY); -#undef TEST_ENTRY -} - -TEST_F(OpAcoshOutTest, AllRealInputFloatOutputBoundDynamismSupport) { -#define TEST_ENTRY(ctype, dtype) \ - test_floating_point_acosh_out( \ - {10, 10}, TensorShapeDynamism::DYNAMIC_BOUND); - ET_FORALL_REAL_TYPES(TEST_ENTRY); -#undef TEST_ENTRY -} - -TEST_F(OpAcoshOutTest, AllRealInputDoubleOutputBoundDynamismSupport) { -#define TEST_ENTRY(ctype, dtype) \ - test_floating_point_acosh_out( \ - {10, 10}, TensorShapeDynamism::DYNAMIC_BOUND); - ET_FORALL_REAL_TYPES(TEST_ENTRY); -#undef TEST_ENTRY -} - -TEST_F(OpAcoshOutTest, AllRealInputFloatOutputUnboundDynamismSupport) { - if (!torch::executor::testing::SupportedFeatures::get()->is_aten) { - GTEST_SKIP() << "Dynamic shape unbound not supported"; - } -#define TEST_ENTRY(ctype, dtype) \ - test_floating_point_acosh_out( \ - {1, 1}, TensorShapeDynamism::DYNAMIC_UNBOUND); - ET_FORALL_REAL_TYPES(TEST_ENTRY); -#undef TEST_ENTRY -} - -TEST_F(OpAcoshOutTest, AllRealInputDoubleOutputUnboundDynamismSupport) { - if (!torch::executor::testing::SupportedFeatures::get()->is_aten) { - GTEST_SKIP() << "Dynamic shape unbound not supported"; - } -#define TEST_ENTRY(ctype, dtype) \ - test_floating_point_acosh_out( \ - {1, 1}, TensorShapeDynamism::DYNAMIC_UNBOUND); - ET_FORALL_REAL_TYPES(TEST_ENTRY); -#undef TEST_ENTRY -} - -TEST_F(OpAcoshOutTest, AllNonFloatOutputDTypeDies) { -#define TEST_ENTRY(ctype, dtype) \ - test_acosh_invalid_output_dtype_dies(); - ET_FORALL_INT_TYPES(TEST_ENTRY); -#undef TEST_ENTRY -} - -// Mismatched shape tests. 
-TEST_F(OpAcoshOutTest, MismatchedInputShapesDies) { - if (torch::executor::testing::SupportedFeatures::get()->is_aten) { - GTEST_SKIP() << "ATen kernel can handle mismatched input shapes"; - } - - TensorFactory tf; - - Tensor a = tf.ones(/*sizes=*/{4}); - Tensor out = tf.ones(/*sizes=*/{2, 2}); - - ET_EXPECT_KERNEL_FAILURE(context_, op_acosh_out(a, out)); -} +IMPLEMENT_UNARY_UFUNC_REALHB_TO_FLOATH_TEST(OpAcoshOutTest) diff --git a/kernels/test/op_add_test.cpp b/kernels/test/op_add_test.cpp index e35a4100c9a..0e4e2fc6359 100644 --- a/kernels/test/op_add_test.cpp +++ b/kernels/test/op_add_test.cpp @@ -18,11 +18,12 @@ #include using namespace ::testing; -using exec_aten::Scalar; -using exec_aten::ScalarType; -using exec_aten::Tensor; +using executorch::aten::Scalar; +using executorch::aten::ScalarType; +using executorch::aten::Tensor; +using executorch::runtime::testing::TensorFactory; using torch::executor::testing::SupportedFeatures; -using torch::executor::testing::TensorFactory; +namespace etrt = executorch::runtime; class OpAddOutKernelTest : public OperatorTest { protected: @@ -63,7 +64,8 @@ class OpAddOutKernelTest : public OperatorTest { test_add(); test_add(); // Integral out type is only allowed if both inputs are integral types - if (isIntegralType(DTYPE_A, false) && isIntegralType(DTYPE_B, false)) { + if (etrt::isIntegralType(DTYPE_A, false) && + etrt::isIntegralType(DTYPE_B, false)) { test_add(); test_add(); } diff --git a/kernels/test/op_asin_test.cpp b/kernels/test/op_asin_test.cpp index ae0af71d2de..65a6a141e22 100644 --- a/kernels/test/op_asin_test.cpp +++ b/kernels/test/op_asin_test.cpp @@ -7,164 +7,26 @@ */ #include // Declares the operator -#include -#include -#include -#include -#include +#include #include -using namespace ::testing; -using exec_aten::ScalarType; -using exec_aten::Tensor; -using exec_aten::TensorShapeDynamism; -using torch::executor::testing::TensorFactory; +#include -class OpAsinOutTest : public OperatorTest { +using exec_aten::Tensor; +class OpAsinOutTest + : public torch::executor::testing::UnaryUfuncRealHBBF16ToFloatHBF16Test { protected: - Tensor& op_asin_out(const Tensor& self, Tensor& out) { + Tensor& op_out(const Tensor& self, Tensor& out) override { return torch::executor::aten::asin_outf(context_, self, out); } - // Common testing for asin operator and all kinds of supported input types - template - void test_floating_point_asin_out( - const std::vector& out_shape = {1, 6}, - TensorShapeDynamism dynamism = TensorShapeDynamism::STATIC) { - TensorFactory tf_in; - TensorFactory tf_out; - - // Destination for the asin operator. - Tensor out = tf_out.zeros(out_shape, dynamism); - - // clang-format off - op_asin_out(tf_in.make({1, 6}, { 0, 1, 3, 5, 10, 100 }), out); - - // Check that it matches (or close to) the expected output. - EXPECT_TENSOR_CLOSE( - out, - tf_out.make({1, 6}, { 0.000000, 1.570796, NAN, NAN, NAN, NAN })); - // clang-format on + double op_reference(double x) const override { + return std::asin(x); } - // Unhandled output dtypes. 
- template - void test_asin_invalid_output_dtype_dies() { - TensorFactory tf; - TensorFactory tf_out; - - const std::vector sizes = {2, 5}; - - Tensor in = tf.ones(sizes); - Tensor out = tf_out.zeros(sizes); - - ET_EXPECT_KERNEL_FAILURE(context_, op_asin_out(in, out)); - } + torch::executor::testing::SupportedFeatures* get_supported_features() + const override; }; -TEST_F(OpAsinOutTest, HandleBoolInput) { - TensorFactory tf_bool; - TensorFactory tf_float; - - const std::vector sizes = {1, 2}; - - Tensor a = tf_bool.make(sizes, /*data=*/{false, true}); - Tensor out = tf_float.zeros(sizes); - Tensor res = tf_float.make(sizes, /*data=*/{0.000000, 1.5707960}); - - EXPECT_TENSOR_CLOSE(op_asin_out(a, out), res); -} - -TEST_F(OpAsinOutTest, AllRealInputHalfOutputStaticDynamismSupport) { - if (torch::executor::testing::SupportedFeatures::get()->is_aten) { - GTEST_SKIP() << "Test Half support only for ExecuTorch mode"; - } -#define TEST_ENTRY(ctype, dtype) \ - test_floating_point_asin_out(); - ET_FORALL_REALH_TYPES(TEST_ENTRY); -#undef TEST_ENTRY -} - -TEST_F(OpAsinOutTest, AllRealInputFloatOutputStaticDynamismSupport) { -#define TEST_ENTRY(ctype, dtype) \ - test_floating_point_asin_out(); - ET_FORALL_REAL_TYPES(TEST_ENTRY); -#undef TEST_ENTRY -} - -TEST_F(OpAsinOutTest, AllRealInputDoubleOutputStaticDynamismSupport) { -#define TEST_ENTRY(ctype, dtype) \ - test_floating_point_asin_out(); - ET_FORALL_REAL_TYPES(TEST_ENTRY); -#undef TEST_ENTRY -} - -TEST_F(OpAsinOutTest, AllRealInputHalfOutputBoundDynamismSupport) { - if (torch::executor::testing::SupportedFeatures::get()->is_aten) { - GTEST_SKIP() << "Test Half support only for ExecuTorch mode"; - } -#define TEST_ENTRY(ctype, dtype) \ - test_floating_point_asin_out( \ - {10, 10}, TensorShapeDynamism::DYNAMIC_BOUND); - ET_FORALL_REALH_TYPES(TEST_ENTRY); -#undef TEST_ENTRY -} - -TEST_F(OpAsinOutTest, AllRealInputFloatOutputBoundDynamismSupport) { -#define TEST_ENTRY(ctype, dtype) \ - test_floating_point_asin_out( \ - {10, 10}, TensorShapeDynamism::DYNAMIC_BOUND); - ET_FORALL_REAL_TYPES(TEST_ENTRY); -#undef TEST_ENTRY -} - -TEST_F(OpAsinOutTest, AllRealInputDoubleOutputBoundDynamismSupport) { -#define TEST_ENTRY(ctype, dtype) \ - test_floating_point_asin_out( \ - {10, 10}, TensorShapeDynamism::DYNAMIC_BOUND); - ET_FORALL_REAL_TYPES(TEST_ENTRY); -#undef TEST_ENTRY -} - -TEST_F(OpAsinOutTest, AllRealInputFloatOutputUnboundDynamismSupport) { - if (!torch::executor::testing::SupportedFeatures::get()->is_aten) { - GTEST_SKIP() << "Dynamic shape unbound not supported"; - } -#define TEST_ENTRY(ctype, dtype) \ - test_floating_point_asin_out( \ - {1, 1}, TensorShapeDynamism::DYNAMIC_UNBOUND); - ET_FORALL_REAL_TYPES(TEST_ENTRY); -#undef TEST_ENTRY -} - -TEST_F(OpAsinOutTest, AllRealInputDoubleOutputUnboundDynamismSupport) { - if (!torch::executor::testing::SupportedFeatures::get()->is_aten) { - GTEST_SKIP() << "Dynamic shape unbound not supported"; - } -#define TEST_ENTRY(ctype, dtype) \ - test_floating_point_asin_out( \ - {1, 1}, TensorShapeDynamism::DYNAMIC_UNBOUND); - ET_FORALL_REAL_TYPES(TEST_ENTRY); -#undef TEST_ENTRY -} - -TEST_F(OpAsinOutTest, AllNonFloatOutputDTypeDies) { -#define TEST_ENTRY(ctype, dtype) \ - test_asin_invalid_output_dtype_dies(); - ET_FORALL_INT_TYPES(TEST_ENTRY); -#undef TEST_ENTRY -} - -// Mismatched shape tests. 
-TEST_F(OpAsinOutTest, MismatchedInputShapesDies) { - if (torch::executor::testing::SupportedFeatures::get()->is_aten) { - GTEST_SKIP() << "ATen kernel can handle mismatched input shapes"; - } - TensorFactory tf; - - Tensor a = tf.ones(/*sizes=*/{4}); - Tensor out = tf.ones(/*sizes=*/{2, 2}); - - ET_EXPECT_KERNEL_FAILURE(context_, op_asin_out(a, out)); -} +IMPLEMENT_UNARY_UFUNC_REALHB_TO_FLOATH_TEST(OpAsinOutTest) diff --git a/kernels/test/op_asinh_test.cpp b/kernels/test/op_asinh_test.cpp index cd887404b75..eb7f0229957 100644 --- a/kernels/test/op_asinh_test.cpp +++ b/kernels/test/op_asinh_test.cpp @@ -7,164 +7,26 @@ */ #include // Declares the operator -#include -#include -#include -#include -#include +#include #include -using namespace ::testing; -using exec_aten::ScalarType; -using exec_aten::Tensor; -using exec_aten::TensorShapeDynamism; -using torch::executor::testing::TensorFactory; +#include -class OpAsinhOutTest : public OperatorTest { +using exec_aten::Tensor; +class OpAsinhOutTest + : public torch::executor::testing::UnaryUfuncRealHBBF16ToFloatHBF16Test { protected: - Tensor& op_asinh_out(const Tensor& self, Tensor& out) { + Tensor& op_out(const Tensor& self, Tensor& out) override { return torch::executor::aten::asinh_outf(context_, self, out); } - // Common testing for asinh operator and all kinds of supported input types - template - void test_floating_point_asinh_out( - const std::vector& out_shape = {1, 6}, - TensorShapeDynamism dynamism = TensorShapeDynamism::STATIC) { - TensorFactory tf_in; - TensorFactory tf_out; - - // Destination for the asinh operator. - Tensor out = tf_out.zeros(out_shape, dynamism); - - // clang-format off - op_asinh_out(tf_in.make({1, 6}, { 0, 1, 3, 5, 10, 100 }), out); - - // Check that it matches (or close to) the expected output. - EXPECT_TENSOR_CLOSE( - out, - tf_out.make({1, 6}, { 0.000000, 0.881374, 1.818447, 2.312438, 2.998223, 5.298342 })); - // clang-format on + double op_reference(double x) const override { + return std::asinh(x); } - // Unhandled output dtypes. 
- template - void test_asinh_invalid_output_dtype_dies() { - TensorFactory tf; - TensorFactory tf_out; - - const std::vector sizes = {2, 5}; - - Tensor in = tf.ones(sizes); - Tensor out = tf_out.zeros(sizes); - - ET_EXPECT_KERNEL_FAILURE(context_, op_asinh_out(in, out)); - } + torch::executor::testing::SupportedFeatures* get_supported_features() + const override; }; -TEST_F(OpAsinhOutTest, HandleBoolInput) { - TensorFactory tf_bool; - TensorFactory tf_float; - - const std::vector sizes = {1, 2}; - - Tensor a = tf_bool.make(sizes, /*data=*/{false, true}); - Tensor out = tf_float.zeros(sizes); - Tensor res = tf_float.make(sizes, /*data=*/{0.000000, 0.881374}); - - EXPECT_TENSOR_CLOSE(op_asinh_out(a, out), res); -} - -TEST_F(OpAsinhOutTest, AllRealInputHalfOutputStaticDynamismSupport) { - if (torch::executor::testing::SupportedFeatures::get()->is_aten) { - GTEST_SKIP() << "Test Half support only for ExecuTorch mode"; - } -#define TEST_ENTRY(ctype, dtype) \ - test_floating_point_asinh_out(); - ET_FORALL_REALH_TYPES(TEST_ENTRY); -#undef TEST_ENTRY -} - -TEST_F(OpAsinhOutTest, AllRealInputFloatOutputStaticDynamismSupport) { -#define TEST_ENTRY(ctype, dtype) \ - test_floating_point_asinh_out(); - ET_FORALL_REAL_TYPES(TEST_ENTRY); -#undef TEST_ENTRY -} - -TEST_F(OpAsinhOutTest, AllRealInputDoubleOutputStaticDynamismSupport) { -#define TEST_ENTRY(ctype, dtype) \ - test_floating_point_asinh_out(); - ET_FORALL_REAL_TYPES(TEST_ENTRY); -#undef TEST_ENTRY -} - -TEST_F(OpAsinhOutTest, AllRealInputHalfOutputBoundDynamismSupport) { - if (torch::executor::testing::SupportedFeatures::get()->is_aten) { - GTEST_SKIP() << "Test Half support only for ExecuTorch mode"; - } -#define TEST_ENTRY(ctype, dtype) \ - test_floating_point_asinh_out( \ - {10, 10}, TensorShapeDynamism::DYNAMIC_BOUND); - ET_FORALL_REALH_TYPES(TEST_ENTRY); -#undef TEST_ENTRY -} - -TEST_F(OpAsinhOutTest, AllRealInputFloatOutputBoundDynamismSupport) { -#define TEST_ENTRY(ctype, dtype) \ - test_floating_point_asinh_out( \ - {10, 10}, TensorShapeDynamism::DYNAMIC_BOUND); - ET_FORALL_REAL_TYPES(TEST_ENTRY); -#undef TEST_ENTRY -} - -TEST_F(OpAsinhOutTest, AllRealInputDoubleOutputBoundDynamismSupport) { -#define TEST_ENTRY(ctype, dtype) \ - test_floating_point_asinh_out( \ - {10, 10}, TensorShapeDynamism::DYNAMIC_BOUND); - ET_FORALL_REAL_TYPES(TEST_ENTRY); -#undef TEST_ENTRY -} - -TEST_F(OpAsinhOutTest, AllRealInputFloatOutputUnboundDynamismSupport) { - if (!torch::executor::testing::SupportedFeatures::get()->is_aten) { - GTEST_SKIP() << "Dynamic shape unbound not supported"; - } -#define TEST_ENTRY(ctype, dtype) \ - test_floating_point_asinh_out( \ - {1, 1}, TensorShapeDynamism::DYNAMIC_UNBOUND); - ET_FORALL_REAL_TYPES(TEST_ENTRY); -#undef TEST_ENTRY -} - -TEST_F(OpAsinhOutTest, AllRealInputDoubleOutputUnboundDynamismSupport) { - if (!torch::executor::testing::SupportedFeatures::get()->is_aten) { - GTEST_SKIP() << "Dynamic shape unbound not supported"; - } -#define TEST_ENTRY(ctype, dtype) \ - test_floating_point_asinh_out( \ - {1, 1}, TensorShapeDynamism::DYNAMIC_UNBOUND); - ET_FORALL_REAL_TYPES(TEST_ENTRY); -#undef TEST_ENTRY -} - -TEST_F(OpAsinhOutTest, AllNonFloatOutputDTypeDies) { -#define TEST_ENTRY(ctype, dtype) \ - test_asinh_invalid_output_dtype_dies(); - ET_FORALL_INT_TYPES(TEST_ENTRY); -#undef TEST_ENTRY -} - -// Mismatched shape tests. 
-TEST_F(OpAsinhOutTest, MismatchedInputShapesDies) { - if (torch::executor::testing::SupportedFeatures::get()->is_aten) { - GTEST_SKIP() << "ATen kernel can handle mismatched input shapes"; - } - TensorFactory tf; - - Tensor a = tf.ones(/*sizes=*/{4}); - Tensor out = tf.ones(/*sizes=*/{2, 2}); - - ET_EXPECT_KERNEL_FAILURE(context_, op_asinh_out(a, out)); -} +IMPLEMENT_UNARY_UFUNC_REALHB_TO_FLOATH_TEST(OpAsinhOutTest) diff --git a/kernels/test/op_atan_test.cpp b/kernels/test/op_atan_test.cpp index 6258819432f..61a10229781 100644 --- a/kernels/test/op_atan_test.cpp +++ b/kernels/test/op_atan_test.cpp @@ -7,164 +7,26 @@ */ #include // Declares the operator -#include -#include -#include -#include -#include +#include #include -using namespace ::testing; -using exec_aten::ScalarType; -using exec_aten::Tensor; -using exec_aten::TensorShapeDynamism; -using torch::executor::testing::TensorFactory; +#include -class OpAtanOutTest : public OperatorTest { +using exec_aten::Tensor; +class OpAtanOutTest + : public torch::executor::testing::UnaryUfuncRealHBBF16ToFloatHBF16Test { protected: - Tensor& op_atan_out(const Tensor& self, Tensor& out) { + Tensor& op_out(const Tensor& self, Tensor& out) override { return torch::executor::aten::atan_outf(context_, self, out); } - // Common testing for atan operator and all kinds of supported input types - template - void test_floating_point_atan_out( - const std::vector& out_shape = {1, 6}, - TensorShapeDynamism dynamism = TensorShapeDynamism::STATIC) { - TensorFactory tf_in; - TensorFactory tf_out; - - // Destination for the atan operator. - Tensor out = tf_out.zeros(out_shape, dynamism); - - // clang-format off - op_atan_out(tf_in.make({1, 6}, { 0, 1, 3, 5, 10, 100 }), out); - - // Check that it matches (or close to) the expected output. - EXPECT_TENSOR_CLOSE( - out, - tf_out.make({1, 6}, { 0.000000, 0.785398, 1.249046, 1.373401, 1.471128, 1.560797 })); - // clang-format on + double op_reference(double x) const override { + return std::atan(x); } - // Unhandled output dtypes. 
- template - void test_atan_invalid_output_dtype_dies() { - TensorFactory tf; - TensorFactory tf_out; - - const std::vector sizes = {2, 5}; - - Tensor in = tf.ones(sizes); - Tensor out = tf_out.zeros(sizes); - - ET_EXPECT_KERNEL_FAILURE(context_, op_atan_out(in, out)); - } + torch::executor::testing::SupportedFeatures* get_supported_features() + const override; }; -TEST_F(OpAtanOutTest, HandleBoolInput) { - TensorFactory tf_bool; - TensorFactory tf_float; - - const std::vector sizes = {1, 2}; - - Tensor a = tf_bool.make(sizes, /*data=*/{false, true}); - Tensor out = tf_float.zeros(sizes); - Tensor res = tf_float.make(sizes, /*data=*/{0.000000, 0.785398}); - - EXPECT_TENSOR_CLOSE(op_atan_out(a, out), res); -} - -TEST_F(OpAtanOutTest, AllRealInputHalfOutputStaticDynamismSupport) { - if (torch::executor::testing::SupportedFeatures::get()->is_aten) { - GTEST_SKIP() << "Test Half support only for ExecuTorch mode"; - } -#define TEST_ENTRY(ctype, dtype) \ - test_floating_point_atan_out(); - ET_FORALL_REALH_TYPES(TEST_ENTRY); -#undef TEST_ENTRY -} - -TEST_F(OpAtanOutTest, AllRealInputFloatOutputStaticDynamismSupport) { -#define TEST_ENTRY(ctype, dtype) \ - test_floating_point_atan_out(); - ET_FORALL_REAL_TYPES(TEST_ENTRY); -#undef TEST_ENTRY -} - -TEST_F(OpAtanOutTest, AllRealInputDoubleOutputStaticDynamismSupport) { -#define TEST_ENTRY(ctype, dtype) \ - test_floating_point_atan_out(); - ET_FORALL_REAL_TYPES(TEST_ENTRY); -#undef TEST_ENTRY -} - -TEST_F(OpAtanOutTest, AllRealInputHalfOutputBoundDynamismSupport) { - if (torch::executor::testing::SupportedFeatures::get()->is_aten) { - GTEST_SKIP() << "Test Half support only for ExecuTorch mode"; - } -#define TEST_ENTRY(ctype, dtype) \ - test_floating_point_atan_out( \ - {10, 10}, TensorShapeDynamism::DYNAMIC_BOUND); - ET_FORALL_REALH_TYPES(TEST_ENTRY); -#undef TEST_ENTRY -} - -TEST_F(OpAtanOutTest, AllRealInputFloatOutputBoundDynamismSupport) { -#define TEST_ENTRY(ctype, dtype) \ - test_floating_point_atan_out( \ - {10, 10}, TensorShapeDynamism::DYNAMIC_BOUND); - ET_FORALL_REAL_TYPES(TEST_ENTRY); -#undef TEST_ENTRY -} - -TEST_F(OpAtanOutTest, AllRealInputDoubleOutputBoundDynamismSupport) { -#define TEST_ENTRY(ctype, dtype) \ - test_floating_point_atan_out( \ - {10, 10}, TensorShapeDynamism::DYNAMIC_BOUND); - ET_FORALL_REAL_TYPES(TEST_ENTRY); -#undef TEST_ENTRY -} - -TEST_F(OpAtanOutTest, AllRealInputFloatOutputUnboundDynamismSupport) { - if (!torch::executor::testing::SupportedFeatures::get()->is_aten) { - GTEST_SKIP() << "Dynamic shape unbound not supported"; - } -#define TEST_ENTRY(ctype, dtype) \ - test_floating_point_atan_out( \ - {1, 1}, TensorShapeDynamism::DYNAMIC_UNBOUND); - ET_FORALL_REAL_TYPES(TEST_ENTRY); -#undef TEST_ENTRY -} - -TEST_F(OpAtanOutTest, AllRealInputDoubleOutputUnboundDynamismSupport) { - if (!torch::executor::testing::SupportedFeatures::get()->is_aten) { - GTEST_SKIP() << "Dynamic shape unbound not supported"; - } -#define TEST_ENTRY(ctype, dtype) \ - test_floating_point_atan_out( \ - {1, 1}, TensorShapeDynamism::DYNAMIC_UNBOUND); - ET_FORALL_REAL_TYPES(TEST_ENTRY); -#undef TEST_ENTRY -} - -TEST_F(OpAtanOutTest, AllNonFloatOutputDTypeDies) { -#define TEST_ENTRY(ctype, dtype) \ - test_atan_invalid_output_dtype_dies(); - ET_FORALL_INT_TYPES(TEST_ENTRY); -#undef TEST_ENTRY -} - -// Mismatched shape tests. 
-TEST_F(OpAtanOutTest, MismatchedInputShapesDies) { - if (torch::executor::testing::SupportedFeatures::get()->is_aten) { - GTEST_SKIP() << "ATen kernel can handle mismatched input shapes"; - } - TensorFactory tf; - - Tensor a = tf.ones(/*sizes=*/{4}); - Tensor out = tf.ones(/*sizes=*/{2, 2}); - - ET_EXPECT_KERNEL_FAILURE(context_, op_atan_out(a, out)); -} +IMPLEMENT_UNARY_UFUNC_REALHB_TO_FLOATH_TEST(OpAtanOutTest) diff --git a/kernels/test/op_atanh_test.cpp b/kernels/test/op_atanh_test.cpp index 88f02603c85..84270359f43 100644 --- a/kernels/test/op_atanh_test.cpp +++ b/kernels/test/op_atanh_test.cpp @@ -7,164 +7,26 @@ */ #include // Declares the operator -#include -#include -#include -#include -#include +#include #include -using namespace ::testing; -using exec_aten::ScalarType; -using exec_aten::Tensor; -using exec_aten::TensorShapeDynamism; -using torch::executor::testing::TensorFactory; +#include -class OpAtanhOutTest : public OperatorTest { +using exec_aten::Tensor; +class OpAtanhOutTest + : public torch::executor::testing::UnaryUfuncRealHBBF16ToFloatHBF16Test { protected: - Tensor& op_atanh_out(const Tensor& self, Tensor& out) { + Tensor& op_out(const Tensor& self, Tensor& out) override { return torch::executor::aten::atanh_outf(context_, self, out); } - // Common testing for atanh operator and all kinds of supported input types - template - void test_floating_point_atanh_out( - const std::vector& out_shape = {1, 6}, - TensorShapeDynamism dynamism = TensorShapeDynamism::STATIC) { - TensorFactory tf_in; - TensorFactory tf_out; - - // Destination for the atanh operator. - Tensor out = tf_out.zeros(out_shape, dynamism); - - // clang-format off - op_atanh_out(tf_in.make({1, 6}, { 0, 1, 3, 5, 10, 100 }), out); - - // Check that it matches (or close to) the expected output. - EXPECT_TENSOR_CLOSE( - out, - tf_out.make({1, 6}, { 0.0, std::numeric_limits::infinity(), NAN, NAN, NAN, NAN })); - // clang-format on + double op_reference(double x) const override { + return std::atanh(x); } - // Unhandled output dtypes. 
- template - void test_atanh_invalid_output_dtype_dies() { - TensorFactory tf; - TensorFactory tf_out; - - const std::vector sizes = {2, 5}; - - Tensor in = tf.ones(sizes); - Tensor out = tf_out.zeros(sizes); - - ET_EXPECT_KERNEL_FAILURE(context_, op_atanh_out(in, out)); - } + torch::executor::testing::SupportedFeatures* get_supported_features() + const override; }; -TEST_F(OpAtanhOutTest, HandleBoolInput) { - TensorFactory tf_bool; - TensorFactory tf_float; - - const std::vector sizes = {1, 2}; - - Tensor a = tf_bool.make(sizes, /*data=*/{false, true}); - Tensor out = tf_float.zeros(sizes); - Tensor res = tf_float.make(sizes, /*data=*/{0.000000, INFINITY}); - - EXPECT_TENSOR_CLOSE(op_atanh_out(a, out), res); -} - -TEST_F(OpAtanhOutTest, AllRealInputHalfOutputStaticDynamismSupport) { - if (torch::executor::testing::SupportedFeatures::get()->is_aten) { - GTEST_SKIP() << "Test Half support only for ExecuTorch mode"; - } -#define TEST_ENTRY(ctype, dtype) \ - test_floating_point_atanh_out(); - ET_FORALL_REALH_TYPES(TEST_ENTRY); -#undef TEST_ENTRY -} - -TEST_F(OpAtanhOutTest, AllRealInputFloatOutputStaticDynamismSupport) { -#define TEST_ENTRY(ctype, dtype) \ - test_floating_point_atanh_out(); - ET_FORALL_REAL_TYPES(TEST_ENTRY); -#undef TEST_ENTRY -} - -TEST_F(OpAtanhOutTest, AllRealInputDoubleOutputStaticDynamismSupport) { -#define TEST_ENTRY(ctype, dtype) \ - test_floating_point_atanh_out(); - ET_FORALL_REAL_TYPES(TEST_ENTRY); -#undef TEST_ENTRY -} - -TEST_F(OpAtanhOutTest, AllRealInputHalfOutputBoundDynamismSupport) { - if (torch::executor::testing::SupportedFeatures::get()->is_aten) { - GTEST_SKIP() << "Test Half support only for ExecuTorch mode"; - } -#define TEST_ENTRY(ctype, dtype) \ - test_floating_point_atanh_out( \ - {10, 10}, TensorShapeDynamism::DYNAMIC_BOUND); - ET_FORALL_REALH_TYPES(TEST_ENTRY); -#undef TEST_ENTRY -} - -TEST_F(OpAtanhOutTest, AllRealInputFloatOutputBoundDynamismSupport) { -#define TEST_ENTRY(ctype, dtype) \ - test_floating_point_atanh_out( \ - {10, 10}, TensorShapeDynamism::DYNAMIC_BOUND); - ET_FORALL_REAL_TYPES(TEST_ENTRY); -#undef TEST_ENTRY -} - -TEST_F(OpAtanhOutTest, AllRealInputDoubleOutputBoundDynamismSupport) { -#define TEST_ENTRY(ctype, dtype) \ - test_floating_point_atanh_out( \ - {10, 10}, TensorShapeDynamism::DYNAMIC_BOUND); - ET_FORALL_REAL_TYPES(TEST_ENTRY); -#undef TEST_ENTRY -} - -TEST_F(OpAtanhOutTest, AllRealInputFloatOutputUnboundDynamismSupport) { - if (!torch::executor::testing::SupportedFeatures::get()->is_aten) { - GTEST_SKIP() << "Dynamic shape unbound not supported"; - } -#define TEST_ENTRY(ctype, dtype) \ - test_floating_point_atanh_out( \ - {1, 1}, TensorShapeDynamism::DYNAMIC_UNBOUND); - ET_FORALL_REAL_TYPES(TEST_ENTRY); -#undef TEST_ENTRY -} - -TEST_F(OpAtanhOutTest, AllRealInputDoubleOutputUnboundDynamismSupport) { - if (!torch::executor::testing::SupportedFeatures::get()->is_aten) { - GTEST_SKIP() << "Dynamic shape unbound not supported"; - } -#define TEST_ENTRY(ctype, dtype) \ - test_floating_point_atanh_out( \ - {1, 1}, TensorShapeDynamism::DYNAMIC_UNBOUND); - ET_FORALL_REAL_TYPES(TEST_ENTRY); -#undef TEST_ENTRY -} - -TEST_F(OpAtanhOutTest, AllNonFloatOutputDTypeDies) { -#define TEST_ENTRY(ctype, dtype) \ - test_atanh_invalid_output_dtype_dies(); - ET_FORALL_INT_TYPES(TEST_ENTRY); -#undef TEST_ENTRY -} - -// Mismatched shape tests. 
-TEST_F(OpAtanhOutTest, MismatchedInputShapesDies) { - if (torch::executor::testing::SupportedFeatures::get()->is_aten) { - GTEST_SKIP() << "ATen kernel can handle mismatched input shapes"; - } - TensorFactory tf; - - Tensor a = tf.ones(/*sizes=*/{4}); - Tensor out = tf.ones(/*sizes=*/{2, 2}); - - ET_EXPECT_KERNEL_FAILURE(context_, op_atanh_out(a, out)); -} +IMPLEMENT_UNARY_UFUNC_REALHB_TO_FLOATH_TEST(OpAtanhOutTest) diff --git a/kernels/test/op_clamp_test.cpp b/kernels/test/op_clamp_test.cpp index d9d45509084..533dfee7ae1 100644 --- a/kernels/test/op_clamp_test.cpp +++ b/kernels/test/op_clamp_test.cpp @@ -484,3 +484,51 @@ TEST_F(OpClampTensorOutTest, SmokeTest) { op_clamp_tensor_out(in, min, max, out); EXPECT_TENSOR_EQ(out, expected); } + +TEST_F(OpClampTensorOutTest, DowncastingSmokeTest) { + TensorFactory tf_in; + TensorFactory tf_min; + TensorFactory tf_max; + TensorFactory tf_out; + + Tensor in = tf_in.make({}, {5}); + Tensor min = tf_min.make({}, {-129}); + Tensor max = tf_max.make({}, {300}); + Tensor out = tf_out.zeros({}); + Tensor expected = tf_out.make({}, {5}); + + op_clamp_tensor_out(in, min, max, out); + EXPECT_TENSOR_EQ(out, expected); +} + +TEST_F(OpClampTensorOutTest, DowncastingSmokeTest2) { + TensorFactory tf_in; + TensorFactory tf_min; + TensorFactory tf_max; + TensorFactory tf_out; + + Tensor in = tf_in.make({}, {301}); + Tensor min = tf_min.make({}, {-129}); + Tensor max = tf_max.make({}, {300}); + Tensor out = tf_out.zeros({}); + Tensor expected = tf_out.make({}, {44}); + + op_clamp_tensor_out(in, min, max, out); + EXPECT_TENSOR_EQ(out, expected); +} + +TEST_F(OpClampTensorOutTest, DowncastingSmokeTest3) { + TensorFactory tf_in; + TensorFactory tf_min; + TensorFactory tf_max; + TensorFactory tf_out; + + Tensor in = tf_in.make({}, {45}); + Tensor min = tf_min.make({}, {-129}); + Tensor max = tf_max.make({}, {300}); + Tensor out = tf_out.zeros({}); + Tensor expected = tf_out.make({}, {45}); + + op_clamp_tensor_out(in, min, max, out); + EXPECT_TENSOR_EQ(out, expected); +} diff --git a/kernels/test/op_cos_test.cpp b/kernels/test/op_cos_test.cpp index f6105787a8e..8fd88d9082d 100644 --- a/kernels/test/op_cos_test.cpp +++ b/kernels/test/op_cos_test.cpp @@ -7,164 +7,26 @@ */ #include // Declares the operator -#include -#include -#include -#include -#include +#include #include -using namespace ::testing; -using exec_aten::ScalarType; -using exec_aten::Tensor; -using exec_aten::TensorShapeDynamism; -using torch::executor::testing::TensorFactory; +#include -class OpCosOutTest : public OperatorTest { +using exec_aten::Tensor; +class OpCosOutTest + : public torch::executor::testing::UnaryUfuncRealHBBF16ToFloatHBF16Test { protected: - Tensor& op_cos_out(const Tensor& self, Tensor& out) { + Tensor& op_out(const Tensor& self, Tensor& out) override { return torch::executor::aten::cos_outf(context_, self, out); } - // Common testing for cos operator and all kinds of supported input types - template - void test_floating_point_cos_out( - const std::vector& out_shape = {1, 6}, - TensorShapeDynamism dynamism = TensorShapeDynamism::STATIC) { - TensorFactory tf_in; - TensorFactory tf_out; - - // Destination for the cos operator. - Tensor out = tf_out.zeros(out_shape, dynamism); - - // clang-format off - op_cos_out(tf_in.make({1, 6}, { 0, 1, 3, 5, 10, 100 }), out); - - // Check that it matches (or close to) the expected output. 
- EXPECT_TENSOR_CLOSE( - out, - tf_out.make({1, 6}, { 1.000000, 0.540302, -0.989992, 0.283662, -0.839072, 0.862319 })); - // clang-format on + double op_reference(double x) const override { + return std::cos(x); } - // Unhandled output dtypes. - template - void test_cos_invalid_output_dtype_dies() { - TensorFactory tf; - TensorFactory tf_out; - - const std::vector sizes = {2, 5}; - - Tensor in = tf.ones(sizes); - Tensor out = tf_out.zeros(sizes); - - ET_EXPECT_KERNEL_FAILURE(context_, op_cos_out(in, out)); - } + torch::executor::testing::SupportedFeatures* get_supported_features() + const override; }; -TEST_F(OpCosOutTest, HandleBoolInput) { - TensorFactory tf_bool; - TensorFactory tf_float; - - const std::vector sizes = {1, 2}; - - Tensor a = tf_bool.make(sizes, /*data=*/{false, true}); - Tensor out = tf_float.zeros(sizes); - Tensor res = tf_float.make(sizes, /*data=*/{1.000000, 0.540302}); - - EXPECT_TENSOR_CLOSE(op_cos_out(a, out), res); -} - -TEST_F(OpCosOutTest, AllRealInputHalfOutputStaticDynamismSupport) { - if (torch::executor::testing::SupportedFeatures::get()->is_aten) { - GTEST_SKIP() << "Test Half support only for ExecuTorch mode"; - } -#define TEST_ENTRY(ctype, dtype) \ - test_floating_point_cos_out(); - ET_FORALL_REALH_TYPES(TEST_ENTRY); -#undef TEST_ENTRY -} - -TEST_F(OpCosOutTest, AllRealInputFloatOutputStaticDynamismSupport) { -#define TEST_ENTRY(ctype, dtype) \ - test_floating_point_cos_out(); - ET_FORALL_REAL_TYPES(TEST_ENTRY); -#undef TEST_ENTRY -} - -TEST_F(OpCosOutTest, AllRealInputDoubleOutputStaticDynamismSupport) { -#define TEST_ENTRY(ctype, dtype) \ - test_floating_point_cos_out(); - ET_FORALL_REAL_TYPES(TEST_ENTRY); -#undef TEST_ENTRY -} - -TEST_F(OpCosOutTest, AllRealInputHalfOutputBoundDynamismSupport) { - if (torch::executor::testing::SupportedFeatures::get()->is_aten) { - GTEST_SKIP() << "Test Half support only for ExecuTorch mode"; - } -#define TEST_ENTRY(ctype, dtype) \ - test_floating_point_cos_out( \ - {10, 10}, TensorShapeDynamism::DYNAMIC_BOUND); - ET_FORALL_REALH_TYPES(TEST_ENTRY); -#undef TEST_ENTRY -} - -TEST_F(OpCosOutTest, AllRealInputFloatOutputBoundDynamismSupport) { -#define TEST_ENTRY(ctype, dtype) \ - test_floating_point_cos_out( \ - {10, 10}, TensorShapeDynamism::DYNAMIC_BOUND); - ET_FORALL_REAL_TYPES(TEST_ENTRY); -#undef TEST_ENTRY -} - -TEST_F(OpCosOutTest, AllRealInputDoubleOutputBoundDynamismSupport) { -#define TEST_ENTRY(ctype, dtype) \ - test_floating_point_cos_out( \ - {10, 10}, TensorShapeDynamism::DYNAMIC_BOUND); - ET_FORALL_REAL_TYPES(TEST_ENTRY); -#undef TEST_ENTRY -} - -TEST_F(OpCosOutTest, AllRealInputFloatOutputUnboundDynamismSupport) { - if (!torch::executor::testing::SupportedFeatures::get()->is_aten) { - GTEST_SKIP() << "Dynamic shape unbound not supported"; - } -#define TEST_ENTRY(ctype, dtype) \ - test_floating_point_cos_out( \ - {1, 1}, TensorShapeDynamism::DYNAMIC_UNBOUND); - ET_FORALL_REAL_TYPES(TEST_ENTRY); -#undef TEST_ENTRY -} - -TEST_F(OpCosOutTest, AllRealInputDoubleOutputUnboundDynamismSupport) { - if (!torch::executor::testing::SupportedFeatures::get()->is_aten) { - GTEST_SKIP() << "Dynamic shape unbound not supported"; - } -#define TEST_ENTRY(ctype, dtype) \ - test_floating_point_cos_out( \ - {1, 1}, TensorShapeDynamism::DYNAMIC_UNBOUND); - ET_FORALL_REAL_TYPES(TEST_ENTRY); -#undef TEST_ENTRY -} - -TEST_F(OpCosOutTest, AllNonFloatOutputDTypeDies) { -#define TEST_ENTRY(ctype, dtype) \ - test_cos_invalid_output_dtype_dies(); - ET_FORALL_INT_TYPES(TEST_ENTRY); -#undef TEST_ENTRY -} - -// Mismatched shape tests. 
-TEST_F(OpCosOutTest, MismatchedInputShapesDies) { - if (torch::executor::testing::SupportedFeatures::get()->is_aten) { - GTEST_SKIP() << "ATen kernel can handle mismatched input shapes"; - } - TensorFactory tf; - - Tensor a = tf.ones(/*sizes=*/{4}); - Tensor out = tf.ones(/*sizes=*/{2, 2}); - - ET_EXPECT_KERNEL_FAILURE(context_, op_cos_out(a, out)); -} +IMPLEMENT_UNARY_UFUNC_REALHB_TO_FLOATH_TEST(OpCosOutTest) diff --git a/kernels/test/op_cosh_test.cpp b/kernels/test/op_cosh_test.cpp index d9939a81938..db4c3d221e1 100644 --- a/kernels/test/op_cosh_test.cpp +++ b/kernels/test/op_cosh_test.cpp @@ -7,164 +7,26 @@ */ #include // Declares the operator -#include -#include -#include -#include -#include +#include #include -using namespace ::testing; -using exec_aten::ScalarType; -using exec_aten::Tensor; -using exec_aten::TensorShapeDynamism; -using torch::executor::testing::TensorFactory; +#include -class OpCoshOutTest : public OperatorTest { +using exec_aten::Tensor; +class OpCoshOutTest + : public torch::executor::testing::UnaryUfuncRealHBBF16ToFloatHBF16Test { protected: - Tensor& op_cosh_out(const Tensor& self, Tensor& out) { + Tensor& op_out(const Tensor& self, Tensor& out) override { return torch::executor::aten::cosh_outf(context_, self, out); } - // Common testing for cosh operator and all kinds of supported input types - template - void test_floating_point_cosh_out( - const std::vector& out_shape = {1, 5}, - TensorShapeDynamism dynamism = TensorShapeDynamism::STATIC) { - TensorFactory tf_in; - TensorFactory tf_out; - - // Destination for the cosh operator. - Tensor out = tf_out.zeros(out_shape, dynamism); - - // clang-format off - op_cosh_out(tf_in.make({1, 5}, { 0, 1, 3, 5, 10 }), out); - - // Check that it matches (or close to) the expected output. - EXPECT_TENSOR_CLOSE( - out, - tf_out.make({1, 5}, { 1.000000e+00, 1.543081e+00, 1.006766e+01, 7.420995e+01, 1.101323e+04 })); - // clang-format on + double op_reference(double x) const override { + return std::cosh(x); } - // Unhandled output dtypes. 
- template - void test_cosh_invalid_output_dtype_dies() { - TensorFactory tf; - TensorFactory tf_out; - - const std::vector sizes = {2, 5}; - - Tensor in = tf.ones(sizes); - Tensor out = tf_out.zeros(sizes); - - ET_EXPECT_KERNEL_FAILURE(context_, op_cosh_out(in, out)); - } + torch::executor::testing::SupportedFeatures* get_supported_features() + const override; }; -TEST_F(OpCoshOutTest, HandleBoolInput) { - TensorFactory tf_bool; - TensorFactory tf_float; - - const std::vector sizes = {1, 2}; - - Tensor a = tf_bool.make(sizes, /*data=*/{false, true}); - Tensor out = tf_float.zeros(sizes); - Tensor res = tf_float.make(sizes, /*data=*/{1.000000, 1.543081}); - - EXPECT_TENSOR_CLOSE(op_cosh_out(a, out), res); -} - -TEST_F(OpCoshOutTest, AllRealInputHalfOutputStaticDynamismSupport) { - if (torch::executor::testing::SupportedFeatures::get()->is_aten) { - GTEST_SKIP() << "Test Half support only for ExecuTorch mode"; - } -#define TEST_ENTRY(ctype, dtype) \ - test_floating_point_cosh_out(); - ET_FORALL_REALH_TYPES(TEST_ENTRY); -#undef TEST_ENTRY -} - -TEST_F(OpCoshOutTest, AllRealInputFloatOutputStaticDynamismSupport) { -#define TEST_ENTRY(ctype, dtype) \ - test_floating_point_cosh_out(); - ET_FORALL_REAL_TYPES(TEST_ENTRY); -#undef TEST_ENTRY -} - -TEST_F(OpCoshOutTest, AllRealInputDoubleOutputStaticDynamismSupport) { -#define TEST_ENTRY(ctype, dtype) \ - test_floating_point_cosh_out(); - ET_FORALL_REAL_TYPES(TEST_ENTRY); -#undef TEST_ENTRY -} - -TEST_F(OpCoshOutTest, AllRealInputHalfOutputBoundDynamismSupport) { - if (torch::executor::testing::SupportedFeatures::get()->is_aten) { - GTEST_SKIP() << "Test Half support only for ExecuTorch mode"; - } -#define TEST_ENTRY(ctype, dtype) \ - test_floating_point_cosh_out( \ - {10, 10}, TensorShapeDynamism::DYNAMIC_BOUND); - ET_FORALL_REALH_TYPES(TEST_ENTRY); -#undef TEST_ENTRY -} - -TEST_F(OpCoshOutTest, AllRealInputFloatOutputBoundDynamismSupport) { -#define TEST_ENTRY(ctype, dtype) \ - test_floating_point_cosh_out( \ - {10, 10}, TensorShapeDynamism::DYNAMIC_BOUND); - ET_FORALL_REAL_TYPES(TEST_ENTRY); -#undef TEST_ENTRY -} - -TEST_F(OpCoshOutTest, AllRealInputDoubleOutputBoundDynamismSupport) { -#define TEST_ENTRY(ctype, dtype) \ - test_floating_point_cosh_out( \ - {10, 10}, TensorShapeDynamism::DYNAMIC_BOUND); - ET_FORALL_REAL_TYPES(TEST_ENTRY); -#undef TEST_ENTRY -} - -TEST_F(OpCoshOutTest, AllRealInputFloatOutputUnboundDynamismSupport) { - if (!torch::executor::testing::SupportedFeatures::get()->is_aten) { - GTEST_SKIP() << "Dynamic shape unbound not supported"; - } -#define TEST_ENTRY(ctype, dtype) \ - test_floating_point_cosh_out( \ - {1, 1}, TensorShapeDynamism::DYNAMIC_UNBOUND); - ET_FORALL_REAL_TYPES(TEST_ENTRY); -#undef TEST_ENTRY -} - -TEST_F(OpCoshOutTest, AllRealInputDoubleOutputUnboundDynamismSupport) { - if (!torch::executor::testing::SupportedFeatures::get()->is_aten) { - GTEST_SKIP() << "Dynamic shape unbound not supported"; - } -#define TEST_ENTRY(ctype, dtype) \ - test_floating_point_cosh_out( \ - {1, 1}, TensorShapeDynamism::DYNAMIC_UNBOUND); - ET_FORALL_REAL_TYPES(TEST_ENTRY); -#undef TEST_ENTRY -} - -TEST_F(OpCoshOutTest, AllNonFloatOutputDTypeDies) { -#define TEST_ENTRY(ctype, dtype) \ - test_cosh_invalid_output_dtype_dies(); - ET_FORALL_INT_TYPES(TEST_ENTRY); -#undef TEST_ENTRY -} - -// Mismatched shape tests. 
-TEST_F(OpCoshOutTest, MismatchedInputShapesDies) { - if (torch::executor::testing::SupportedFeatures::get()->is_aten) { - GTEST_SKIP() << "ATen kernel can handle mismatched input shapes"; - } - TensorFactory tf; - - Tensor a = tf.ones(/*sizes=*/{4}); - Tensor out = tf.ones(/*sizes=*/{2, 2}); - - ET_EXPECT_KERNEL_FAILURE(context_, op_cosh_out(a, out)); -} +IMPLEMENT_UNARY_UFUNC_REALHB_TO_FLOATH_TEST(OpCoshOutTest) diff --git a/kernels/test/op_erf_test.cpp b/kernels/test/op_erf_test.cpp index 73f8d6cd5a0..2b54a707d0b 100644 --- a/kernels/test/op_erf_test.cpp +++ b/kernels/test/op_erf_test.cpp @@ -8,7 +8,7 @@ #include // Declares the operator #include -#include +#include #include #include #include @@ -20,14 +20,22 @@ using exec_aten::ScalarType; using exec_aten::Tensor; using torch::executor::testing::TensorFactory; -class OpErfTest : public OperatorTest { +class OpErfOutTest + : public torch::executor::testing::UnaryUfuncRealHBBF16ToFloatHBF16Test { protected: - Tensor& op_erf_out(const Tensor& self, Tensor& out) { + Tensor& op_out(const Tensor& self, Tensor& out) override { return torch::executor::aten::erf_outf(context_, self, out); } + + double op_reference(double x) const override { + return std::erf(x); + } + + torch::executor::testing::SupportedFeatures* get_supported_features() + const override; }; -TEST_F(OpErfTest, SanityCheck) { +TEST_F(OpErfOutTest, SanityCheck) { TensorFactory tf; Tensor in = tf.make({1, 7}, {-3.0, -2.99, -1.01, 0.0, 1.01, 2.99, 3.0}); @@ -36,36 +44,10 @@ TEST_F(OpErfTest, SanityCheck) { Tensor expected = tf.make({1, 7}, {-0.999978, -0.999976, -0.846811, 0.000000, 0.846811, 0.999976, 0.999978}); // clang-format on - Tensor ret = op_erf_out(in, out); + Tensor ret = op_out(in, out); EXPECT_TENSOR_EQ(out, ret); EXPECT_TENSOR_CLOSE(out, expected); } -TEST_F(OpErfTest, HandleBoolInput) { - TensorFactory tf_bool; - TensorFactory tf_float; - - const std::vector sizes = {1, 2}; - - Tensor a = tf_bool.make(sizes, /*data=*/{false, true}); - Tensor out = tf_float.zeros(sizes); - Tensor res = tf_float.make(sizes, /*data=*/{0.000000, 0.842701}); - - EXPECT_TENSOR_CLOSE(op_erf_out(a, out), res); -} - -TEST_F(OpErfTest, HalfSupport) { - if (torch::executor::testing::SupportedFeatures::get()->is_aten) { - GTEST_SKIP() << "Test Half support only for ExecuTorch mode"; - } - TensorFactory tf_half; - - const std::vector sizes = {1, 2}; - - Tensor a = tf_half.make(sizes, /*data=*/{0.0, 1.0}); - Tensor out = tf_half.zeros(sizes); - Tensor res = tf_half.make(sizes, /*data=*/{0.000000, 0.842701}); - - EXPECT_TENSOR_CLOSE(op_erf_out(a, out), res); -} +IMPLEMENT_UNARY_UFUNC_REALHB_TO_FLOATH_TEST(OpErfOutTest) diff --git a/kernels/test/op_exp_test.cpp b/kernels/test/op_exp_test.cpp index 220aad1ee1d..07e10e33b02 100644 --- a/kernels/test/op_exp_test.cpp +++ b/kernels/test/op_exp_test.cpp @@ -7,178 +7,26 @@ */ #include // Declares the operator -#include -#include -#include -#include -#include +#include #include -using namespace ::testing; -using exec_aten::Scalar; -using exec_aten::ScalarType; -using exec_aten::Tensor; -using torch::executor::testing::SupportedFeatures; -using torch::executor::testing::TensorFactory; +#include -class OpExpOutTest : public OperatorTest { +using exec_aten::Tensor; +class OpExpOutTest + : public torch::executor::testing::UnaryUfuncRealHBBF16ToFloatHBF16Test { protected: - Tensor& op_exp_out(const Tensor& a, Tensor& out) { - return torch::executor::aten::exp_outf(context_, a, out); + Tensor& op_out(const Tensor& self, Tensor& out) override { + return 
torch::executor::aten::exp_outf(context_, self, out); } - template - CTYPE apply_log(double x) { - return static_cast(std::log(x)); - } - - // Common testing for log operator - template - void test__exp_out() { - TensorFactory tf; - TensorFactory tf_out; - - const std::vector sizes = {2, 2}; - - // clang-format off - Tensor x = tf.make( - sizes, - { - apply_log(1.), apply_log(2.), - apply_log(4.), apply_log(8.), - }); - // clang-format on - - // clang-format off - Tensor expected = tf_out.make( - sizes, - { - 1., 2., - 4., 8., - }); - // clang-format on - - Tensor out = tf_out.zeros(sizes); - - op_exp_out(x, out); - EXPECT_TENSOR_CLOSE(out, expected); + double op_reference(double x) const override { + return std::exp(x); } - // Unhandled output dtypes. - template - void test_exp_invalid_output_dtype_dies() { - TensorFactory tf_float; - TensorFactory tf_out; - - const std::vector sizes = {2, 5}; - - Tensor in = tf_float.ones(sizes); - Tensor out = tf_out.zeros(sizes); - - ET_EXPECT_KERNEL_FAILURE(context_, op_exp_out(in, out)); - } + torch::executor::testing::SupportedFeatures* get_supported_features() + const override; }; -TEST_F(OpExpOutTest, AllFloatInputFloatOutputSupport) { -#define TEST_ENTRY(ctype, dtype) \ - test__exp_out(); - ET_FORALL_FLOAT_TYPES(TEST_ENTRY); -#undef TEST_ENTRY -} - -TEST_F(OpExpOutTest, AllFloatInputDoubleOutputSupport) { -#define TEST_ENTRY(ctype, dtype) \ - test__exp_out(); - ET_FORALL_FLOAT_TYPES(TEST_ENTRY); -#undef TEST_ENTRY -} - -TEST_F(OpExpOutTest, HandleBoolInput) { - // op_exp_out() handles Bool as input. - TensorFactory tf_bool; - TensorFactory tf_float; - - const std::vector sizes = {1, 2}; - - Tensor a = tf_bool.make(sizes, /*data=*/{true, false}); - Tensor out = tf_float.zeros(sizes); - Tensor res = tf_float.make(sizes, /*data=*/{2.718282, 1}); - - EXPECT_TENSOR_CLOSE(op_exp_out(a, out), res); -} - -TEST_F(OpExpOutTest, HandleHalfInput) { - if (torch::executor::testing::SupportedFeatures::get()->is_aten) { - GTEST_SKIP() << "Test Half support only for ExecuTorch mode"; - } - TensorFactory tf_half; - - const std::vector sizes = {1, 2}; - - Tensor a = tf_half.make(sizes, /*data=*/{-2.5, -3.0}); - Tensor out = tf_half.zeros(sizes); - Tensor res = tf_half.make(sizes, /*data=*/{0.082085, 0.049787}); - - EXPECT_TENSOR_CLOSE(op_exp_out(a, out), res); -} - -// Mismatched shape tests. 
-TEST_F(OpExpOutTest, MismatchedShapesDies) { - if (SupportedFeatures::get()->is_aten) { - GTEST_SKIP() << "ATen kernel can handle mismatched shapes"; - } - - TensorFactory tf_int; - TensorFactory tf_float; - - Tensor a = tf_int.ones(/*sizes=*/{4}); - Tensor out = tf_float.ones(/*sizes=*/{2, 2}); - - ET_EXPECT_KERNEL_FAILURE(context_, op_exp_out(a, out)); -} - -TEST_F(OpExpOutTest, AllNonFloatOutputDTypeDies) { -#define TEST_ENTRY(ctype, dtype) \ - test_exp_invalid_output_dtype_dies(); - ET_FORALL_INT_TYPES(TEST_ENTRY); -#undef TEST_ENTRY -} - -#ifndef USE_ATEN_LIB -TEST_F(OpExpOutTest, DynamicOutputShape) { - TensorFactory tf; - TensorFactory tf_out; - - const std::vector sizes = {4, 2}; - const std::vector out_size = {8, 1}; - - // clang-format off - Tensor x = tf.make( - sizes, - { - apply_log(1.), apply_log(2.), - apply_log(4.), apply_log(8.), - apply_log(3.), apply_log(6.), - apply_log(7.), apply_log(5.), - }); - // clang-format on - - // clang-format off - Tensor expected = tf_out.make( - sizes, - { - 1., 2., - 4., 8., - 3., 6., - 7., 5., - }); - // clang-format on - - Tensor out = - tf.zeros(out_size, torch::executor::TensorShapeDynamism::DYNAMIC_BOUND); - - op_exp_out(x, out); - EXPECT_TENSOR_CLOSE(out, expected); -} -#endif +IMPLEMENT_UNARY_UFUNC_REALHB_TO_FLOATH_TEST(OpExpOutTest) diff --git a/kernels/test/op_expm1_test.cpp b/kernels/test/op_expm1_test.cpp index c0d3a226309..b91b8544f72 100644 --- a/kernels/test/op_expm1_test.cpp +++ b/kernels/test/op_expm1_test.cpp @@ -7,32 +7,26 @@ */ #include // Declares the operator -#include -#include -#include -#include -#include +#include #include -using namespace ::testing; -using exec_aten::Scalar; -using exec_aten::ScalarType; +#include + using exec_aten::Tensor; -using torch::executor::testing::SupportedFeatures; -using torch::executor::testing::TensorFactory; +class OpExpm1OutTest + : public torch::executor::testing::UnaryUfuncRealHBBF16ToFloatHBF16Test { + protected: + Tensor& op_out(const Tensor& self, Tensor& out) override { + return torch::executor::aten::expm1_outf(context_, self, out); + } -Tensor& op_expm1_out(const Tensor& a, Tensor& out) { - executorch::runtime::KernelRuntimeContext context{}; - return torch::executor::aten::expm1_outf(context, a, out); -} + double op_reference(double x) const override { + return std::expm1(x); + } -TEST(OpExpm1OutTest, SmokeTest) { - TensorFactory tfDouble; + torch::executor::testing::SupportedFeatures* get_supported_features() + const override; +}; - Tensor self = tfDouble.full({}, -31.375); - Tensor out = tfDouble.zeros({}); - Tensor out_expected = tfDouble.full({}, -0.9999999999999764); - op_expm1_out(self, out); - EXPECT_TENSOR_CLOSE(out, out_expected); -} +IMPLEMENT_UNARY_UFUNC_REALHB_TO_FLOATH_TEST(OpExpm1OutTest) diff --git a/kernels/test/op_index_put_test.cpp b/kernels/test/op_index_put_test.cpp index b685edc6aaf..868c11600f4 100644 --- a/kernels/test/op_index_put_test.cpp +++ b/kernels/test/op_index_put_test.cpp @@ -707,7 +707,7 @@ TEST_F(OpIndexPutOutTest, AllDtypesSupportedForInput) { #define TEST_ENTRY(ctype, dtype) \ test_dtype(); - ET_FORALL_REAL_TYPES_AND(Bool, TEST_ENTRY); + ET_FORALL_REALHBBF16_TYPES(TEST_ENTRY); #undef TEST_ENTRY } diff --git a/kernels/test/op_index_test.cpp b/kernels/test/op_index_test.cpp index 03a91005e83..8afa0f93455 100644 --- a/kernels/test/op_index_test.cpp +++ b/kernels/test/op_index_test.cpp @@ -33,7 +33,7 @@ class OpIndexTensorOutTest : public OperatorTest { OptTensorArrayRef indices, Tensor& out) { #ifdef USE_ATEN_LIB - c10::List> 
indices_list(indices); + c10::List> indices_list(indices); return torch::executor::aten::index_outf( context_, input, indices_list, out); #else @@ -107,7 +107,7 @@ class OpIndexTensorOutTest : public OperatorTest { #define TEST_ENTRY(ctype, dtype) \ test_dtype(); - ET_FORALL_REAL_TYPES_AND(Bool, TEST_ENTRY); + ET_FORALL_REALHBF16_TYPES(TEST_ENTRY); #undef TEST_ENTRY } diff --git a/kernels/test/op_isinf_test.cpp b/kernels/test/op_isinf_test.cpp index 3b544dc65f0..d7916984371 100644 --- a/kernels/test/op_isinf_test.cpp +++ b/kernels/test/op_isinf_test.cpp @@ -25,66 +25,29 @@ class OpIsInfTest : public OperatorTest { Tensor& op_isinf_out(const Tensor& self, Tensor& out) { return torch::executor::aten::isinf_outf(context_, self, out); } -}; - -TEST_F(OpIsInfTest, SanityCheckFloat) { - TensorFactory tf; - TensorFactory tfb; - Tensor in = tf.make( - {1, 5}, {-1.0, 0.0, 1.0, NAN, std::numeric_limits::infinity()}); - Tensor out = tfb.zeros({1, 5}); - Tensor expected = tfb.make({1, 5}, {false, false, false, false, true}); + template + void test_sanity_check() { + TensorFactory tf; + TensorFactory tfb; - Tensor ret = op_isinf_out(in, out); + using CTYPE = typename TensorFactory::ctype; + Tensor in = tf.make( + {1, 5}, {-1, 0, 1, NAN, std::numeric_limits::infinity()}); + Tensor out = tfb.zeros({1, 5}); + Tensor expected = tfb.make({1, 5}, {false, false, false, false, true}); - EXPECT_TENSOR_EQ(out, ret); - EXPECT_TENSOR_EQ(out, expected); -} + Tensor ret = op_isinf_out(in, out); -TEST_F(OpIsInfTest, SanityCheckHalf) { - if (torch::executor::testing::SupportedFeatures::get()->is_aten) { - GTEST_SKIP() << "Test Half support only for ExecuTorch mode"; + EXPECT_TENSOR_EQ(out, ret); + EXPECT_TENSOR_EQ(out, expected); } - TensorFactory tf; - TensorFactory tfb; - - Tensor in = tf.make( - {1, 5}, {-1.0, 0.0, 1.0, NAN, std::numeric_limits::infinity()}); - Tensor out = tfb.zeros({1, 5}); - Tensor expected = tfb.make({1, 5}, {false, false, false, false, true}); - - Tensor ret = op_isinf_out(in, out); - - EXPECT_TENSOR_EQ(out, ret); - EXPECT_TENSOR_EQ(out, expected); -} - -TEST_F(OpIsInfTest, SanityCheckByte) { - TensorFactory tf; - TensorFactory tfb; - - Tensor in = tf.make({1, 5}, {1, 2, 3, 4, 5}); - Tensor out = tfb.zeros({1, 5}); - Tensor expected = tfb.make({1, 5}, {false, false, false, false, false}); - - Tensor ret = op_isinf_out(in, out); - - EXPECT_TENSOR_EQ(out, ret); - EXPECT_TENSOR_EQ(out, expected); -} - -TEST_F(OpIsInfTest, SanityCheckBool) { - TensorFactory tfb; - - Tensor in = tfb.make({1, 5}, {true, false, true, true, false}); - Tensor out = tfb.zeros({1, 5}); - Tensor expected = tfb.make({1, 5}, {false, false, false, false, false}); - - Tensor ret = op_isinf_out(in, out); +}; - EXPECT_TENSOR_EQ(out, ret); - EXPECT_TENSOR_EQ(out, expected); +TEST_F(OpIsInfTest, SanityCheck) { +#define TEST_ENTRY(ctype, dtype) test_sanity_check(); + ET_FORALL_FLOATHBF16_TYPES(TEST_ENTRY); +#undef TEST_ENTRY } TEST_F(OpIsInfTest, SanityCheckOutDtype) { diff --git a/kernels/test/op_isnan_test.cpp b/kernels/test/op_isnan_test.cpp index 2894fc88ccf..c63fc838eae 100644 --- a/kernels/test/op_isnan_test.cpp +++ b/kernels/test/op_isnan_test.cpp @@ -25,66 +25,29 @@ class OpIsNanTest : public OperatorTest { Tensor& op_isnan_out(const Tensor& self, Tensor& out) { return torch::executor::aten::isnan_outf(context_, self, out); } -}; - -TEST_F(OpIsNanTest, SanityCheckFloat) { - TensorFactory tf; - TensorFactory tfb; - Tensor in = tf.make( - {1, 5}, {-1.0, 0.0, 1.0, NAN, std::numeric_limits::infinity()}); - Tensor out = 
tfb.zeros({1, 5}); - Tensor expected = tfb.make({1, 5}, {false, false, false, true, false}); + template + void test_sanity_check() { + TensorFactory tf; + TensorFactory tfb; - Tensor ret = op_isnan_out(in, out); + using CTYPE = typename TensorFactory::ctype; + Tensor in = tf.make( + {1, 5}, {-1, 0, 1, NAN, std::numeric_limits::infinity()}); + Tensor out = tfb.zeros({1, 5}); + Tensor expected = tfb.make({1, 5}, {false, false, false, true, false}); - EXPECT_TENSOR_EQ(out, ret); - EXPECT_TENSOR_EQ(out, expected); -} + Tensor ret = op_isnan_out(in, out); -TEST_F(OpIsNanTest, SanityCheckHalf) { - if (torch::executor::testing::SupportedFeatures::get()->is_aten) { - GTEST_SKIP() << "Test Half support only for ExecuTorch mode"; + EXPECT_TENSOR_EQ(out, ret); + EXPECT_TENSOR_EQ(out, expected); } - TensorFactory tf; - TensorFactory tfb; - - Tensor in = tf.make( - {1, 5}, {-1.0, 0.0, 1.0, NAN, std::numeric_limits::infinity()}); - Tensor out = tfb.zeros({1, 5}); - Tensor expected = tfb.make({1, 5}, {false, false, false, true, false}); - - Tensor ret = op_isnan_out(in, out); - - EXPECT_TENSOR_EQ(out, ret); - EXPECT_TENSOR_EQ(out, expected); -} - -TEST_F(OpIsNanTest, SanityCheckByte) { - TensorFactory tf; - TensorFactory tfb; - - Tensor in = tf.make({1, 5}, {1, 2, 3, 4, 5}); - Tensor out = tfb.zeros({1, 5}); - Tensor expected = tfb.make({1, 5}, {false, false, false, false, false}); - - Tensor ret = op_isnan_out(in, out); - - EXPECT_TENSOR_EQ(out, ret); - EXPECT_TENSOR_EQ(out, expected); -} - -TEST_F(OpIsNanTest, SanityCheckBool) { - TensorFactory tfb; - - Tensor in = tfb.make({1, 5}, {true, false, true, true, false}); - Tensor out = tfb.zeros({1, 5}); - Tensor expected = tfb.make({1, 5}, {false, false, false, false, false}); - - Tensor ret = op_isnan_out(in, out); +}; - EXPECT_TENSOR_EQ(out, ret); - EXPECT_TENSOR_EQ(out, expected); +TEST_F(OpIsNanTest, SanityCheck) { +#define TEST_ENTRY(ctype, dtype) test_sanity_check(); + ET_FORALL_FLOATHBF16_TYPES(TEST_ENTRY); +#undef TEST_ENTRY } TEST_F(OpIsNanTest, SanityCheckOutDtype) { diff --git a/kernels/test/op_linear_test.cpp b/kernels/test/op_linear_test.cpp index 96875cc6f77..47f8925af08 100644 --- a/kernels/test/op_linear_test.cpp +++ b/kernels/test/op_linear_test.cpp @@ -43,16 +43,16 @@ class OpLinearOutTest : public OperatorTest { } } - // matmul gives 4 * 2 * 3 = 24 - Tensor x = tf.full({3, 4}, 2); - Tensor y = tf.full({5, 4}, 3); + // matmul gives 32 * 2 * 3 = 192 + Tensor x = tf.full({3, 32}, 2); + Tensor y = tf.full({5, 32}, 3); // Output shape should be (3, 5) Tensor out = tf.zeros({3, 5}); op_linear_out(x, y, out); - Tensor expected = tf.full({3, 5}, 24); + Tensor expected = tf.full({3, 5}, 192); EXPECT_TENSOR_EQ(out, expected); } diff --git a/kernels/test/op_log10_test.cpp b/kernels/test/op_log10_test.cpp index d4e14100497..a5aa5749927 100644 --- a/kernels/test/op_log10_test.cpp +++ b/kernels/test/op_log10_test.cpp @@ -7,35 +7,26 @@ */ #include // Declares the operator -#include -#include -#include -#include -#include +#include #include -using namespace ::testing; -using exec_aten::Scalar; -using exec_aten::ScalarType; +#include + using exec_aten::Tensor; -using torch::executor::testing::SupportedFeatures; -using torch::executor::testing::TensorFactory; +class OpLog10OutTest + : public torch::executor::testing::UnaryUfuncRealHBBF16ToFloatHBF16Test { + protected: + Tensor& op_out(const Tensor& self, Tensor& out) override { + return torch::executor::aten::log10_outf(context_, self, out); + } -Tensor& op_log10_out(const Tensor& a, Tensor& out) { - 
executorch::runtime::KernelRuntimeContext context{}; - return torch::executor::aten::log10_outf(context, a, out); -} + double op_reference(double x) const override { + return std::log10(x); + } -TEST(OpLog10OutTest, SmokeTest) { - TensorFactory tfDouble; - TensorFactory tfShort; + torch::executor::testing::SupportedFeatures* get_supported_features() + const override; +}; - Tensor self = tfShort.make({8}, {-12, -6, -65, -61, 16, -44, -47, 54}); - Tensor out = tfDouble.zeros({8}); - Tensor out_expected = tfDouble.make( - {8}, - {NAN, NAN, NAN, NAN, 1.2041200399398804, NAN, NAN, 1.732393741607666}); - op_log10_out(self, out); - EXPECT_TENSOR_CLOSE(out, out_expected); -} +IMPLEMENT_UNARY_UFUNC_REALHB_TO_FLOATH_TEST(OpLog10OutTest) diff --git a/kernels/test/op_log1p_test.cpp b/kernels/test/op_log1p_test.cpp index 3d4b0f1c567..d4e195b5c83 100644 --- a/kernels/test/op_log1p_test.cpp +++ b/kernels/test/op_log1p_test.cpp @@ -7,33 +7,26 @@ */ #include // Declares the operator -#include -#include -#include -#include -#include +#include #include -using namespace ::testing; -using exec_aten::Scalar; -using exec_aten::ScalarType; +#include + using exec_aten::Tensor; -using torch::executor::testing::SupportedFeatures; -using torch::executor::testing::TensorFactory; +class OpLog1pOutTest + : public torch::executor::testing::UnaryUfuncRealHBBF16ToFloatHBF16Test { + protected: + Tensor& op_out(const Tensor& self, Tensor& out) override { + return torch::executor::aten::log1p_outf(context_, self, out); + } -Tensor& op_log1p_out(const Tensor& a, Tensor& out) { - executorch::runtime::KernelRuntimeContext context{}; - return torch::executor::aten::log1p_outf(context, a, out); -} + double op_reference(double x) const override { + return std::log1p(x); + } -TEST(OpLog1pOutTest, SmokeTest) { - TensorFactory tfChar; - TensorFactory tfDouble; + torch::executor::testing::SupportedFeatures* get_supported_features() + const override; +}; - Tensor self = tfChar.full({}, 13); - Tensor out = tfDouble.zeros({}); - Tensor out_expected = tfDouble.full({}, 2.6390573978424072); - op_log1p_out(self, out); - EXPECT_TENSOR_CLOSE(out, out_expected); -} +IMPLEMENT_UNARY_UFUNC_REALHB_TO_FLOATH_TEST(OpLog1pOutTest) diff --git a/kernels/test/op_log2_test.cpp b/kernels/test/op_log2_test.cpp index cbbd8f6a985..e8d36559a3a 100644 --- a/kernels/test/op_log2_test.cpp +++ b/kernels/test/op_log2_test.cpp @@ -7,34 +7,26 @@ */ #include // Declares the operator -#include -#include -#include -#include -#include +#include #include -using namespace ::testing; -using exec_aten::Scalar; -using exec_aten::ScalarType; +#include + using exec_aten::Tensor; -using torch::executor::testing::SupportedFeatures; -using torch::executor::testing::TensorFactory; +class OpLog2OutTest + : public torch::executor::testing::UnaryUfuncRealHBBF16ToFloatHBF16Test { + protected: + Tensor& op_out(const Tensor& self, Tensor& out) override { + return torch::executor::aten::log2_outf(context_, self, out); + } -Tensor& op_log2_out(const Tensor& a, Tensor& out) { - executorch::runtime::KernelRuntimeContext context{}; - return torch::executor::aten::log2_outf(context, a, out); -} + double op_reference(double x) const override { + return std::log2(x); + } -TEST(OpLog2OutTest, SmokeTest) { - TensorFactory tfByte; - TensorFactory tfFloat; + torch::executor::testing::SupportedFeatures* get_supported_features() + const override; +}; - Tensor self = tfByte.make({3}, {45, 55, 82}); - Tensor out = tfFloat.zeros({3}); - Tensor out_expected = tfFloat.make( - {3}, {5.4918532371521, 
5.781359672546387, 6.3575520515441895}); - op_log2_out(self, out); - EXPECT_TENSOR_CLOSE(out, out_expected); -} +IMPLEMENT_UNARY_UFUNC_REALHB_TO_FLOATH_TEST(OpLog2OutTest) diff --git a/kernels/test/op_log_test.cpp b/kernels/test/op_log_test.cpp index 887defe621d..f8ac831e746 100644 --- a/kernels/test/op_log_test.cpp +++ b/kernels/test/op_log_test.cpp @@ -7,117 +7,32 @@ */ #include // Declares the operator -#include -#include -#include -#include -#include +#include #include -using namespace ::testing; -using exec_aten::Scalar; +#include + using exec_aten::ScalarType; using exec_aten::Tensor; using torch::executor::testing::SupportedFeatures; using torch::executor::testing::TensorFactory; - -class OpLogOutTest : public OperatorTest { +class OpLogOutTest + : public torch::executor::testing::UnaryUfuncRealHBBF16ToFloatHBF16Test { protected: - Tensor& op_log_out(const Tensor& a, Tensor& out) { - return torch::executor::aten::log_outf(context_, a, out); + Tensor& op_out(const Tensor& self, Tensor& out) override { + return torch::executor::aten::log_outf(context_, self, out); } - // Common testing for log operator - template - void test__log_out() { - TensorFactory tf; - TensorFactory tf_out; - - const std::vector sizes = {2, 2}; - - Tensor out = tf_out.zeros(sizes); - - // Valid input should give the expected output - op_log_out(tf.make(sizes, /*data=*/{0, 1, 2, 4}), out); - EXPECT_TENSOR_CLOSE( - out, tf_out.make(sizes, /*data=*/{-INFINITY, 0, 0.693147, 1.386294})); + double op_reference(double x) const override { + return std::log(x); } - // Unhandled output dtypes. - template - void test_log_invalid_output_dtype_dies() { - TensorFactory tf_float; - TensorFactory tf_out; - - const std::vector sizes = {2, 5}; - - Tensor in = tf_float.ones(sizes); - Tensor out = tf_out.zeros(sizes); - - ET_EXPECT_KERNEL_FAILURE(context_, op_log_out(in, out)); - } + torch::executor::testing::SupportedFeatures* get_supported_features() + const override; }; -TEST_F(OpLogOutTest, AllRealInputHalfOutputSupport) { - if (torch::executor::testing::SupportedFeatures::get()->is_aten) { - GTEST_SKIP() << "Test Half support only for ExecuTorch mode"; - } -#define TEST_ENTRY(ctype, dtype) \ - test__log_out(); - ET_FORALL_REALH_TYPES(TEST_ENTRY); -#undef TEST_ENTRY -} - -TEST_F(OpLogOutTest, AllRealInputFloatOutputSupport) { -#define TEST_ENTRY(ctype, dtype) \ - test__log_out(); - ET_FORALL_REAL_TYPES(TEST_ENTRY); -#undef TEST_ENTRY -} - -TEST_F(OpLogOutTest, AllRealInputDoubleOutputSupport) { -#define TEST_ENTRY(ctype, dtype) \ - test__log_out(); - ET_FORALL_REAL_TYPES(TEST_ENTRY); -#undef TEST_ENTRY -} - -TEST_F(OpLogOutTest, HandleBoolInput) { - // op_log_out() handles Bool as input. - TensorFactory tf_bool; - TensorFactory tf_float; - - const std::vector sizes = {1, 2}; - - Tensor a = tf_bool.make(sizes, /*data=*/{true, false}); - Tensor out = tf_float.zeros(sizes); - Tensor res = tf_float.make(sizes, /*data=*/{0, -INFINITY}); - - EXPECT_TENSOR_EQ(op_log_out(a, out), res); -} - -// Mismatched shape tests. 
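
The op_log10/op_log1p/op_log2/op_log rewrites above all follow the same shape: each per-op file shrinks to a subclass that supplies only op_out (the kernel call) and op_reference (the ground truth), while the shared fixture plus an IMPLEMENT_* macro stamp out the common test cases. A stand-alone sketch of that "template method" structure under generic names (UnaryOpTestBase and Log10Test here are illustrative stand-ins, not the real UnaryUfuncRealHBBF16ToFloatHBF16Test):

#include <cassert>
#include <cmath>
#include <vector>

// Illustrative base fixture: concrete op tests only supply the kernel call
// and a double-precision reference; the shared checks live here once.
class UnaryOpTestBase {
 public:
  virtual ~UnaryOpTestBase() = default;

  // Hook 1: invoke the kernel under test (the real fixture forwards to
  // torch::executor::aten::<op>_outf with a runtime context and tensors).
  virtual double op(double x) const = 0;
  // Hook 2: ground-truth reference.
  virtual double op_reference(double x) const = 0;

  // Shared test body, reused unchanged by every subclass.
  void run_smoke_test(const std::vector<double>& inputs, double atol = 1e-6) const {
    for (double x : inputs) {
      assert(std::fabs(op(x) - op_reference(x)) <= atol);
    }
  }
};

class Log10Test final : public UnaryOpTestBase {
  double op(double x) const override { return std::log10(x); }           // kernel stand-in
  double op_reference(double x) const override { return std::log10(x); } // reference
};

int main() {
  Log10Test t;
  t.run_smoke_test({0.5, 1.0, 2.0, 10.0});
  return 0;
}
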
-TEST_F(OpLogOutTest, MismatchedShapesDies) { - if (SupportedFeatures::get()->is_aten) { - GTEST_SKIP() << "ATen kernel can handle mismatched shapes"; - } - - TensorFactory tf_int; - TensorFactory tf_float; - - Tensor a = tf_int.ones(/*sizes=*/{4}); - Tensor out = tf_float.ones(/*sizes=*/{2, 2}); - - ET_EXPECT_KERNEL_FAILURE(context_, op_log_out(a, out)); -} - -TEST_F(OpLogOutTest, AllNonFloatOutputDTypeDies) { -#define TEST_ENTRY(ctype, dtype) \ - test_log_invalid_output_dtype_dies(); - ET_FORALL_INT_TYPES(TEST_ENTRY); -#undef TEST_ENTRY -} +IMPLEMENT_UNARY_UFUNC_REALHB_TO_FLOATH_TEST(OpLogOutTest) TEST_F(OpLogOutTest, SimpleGeneratedCase) { TensorFactory tf; @@ -144,7 +59,7 @@ TEST_F(OpLogOutTest, SimpleGeneratedCase) { 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}); Tensor out = tf.zeros({10, 10}); - Tensor ret = op_log_out(x, out); + Tensor ret = op_out(x, out); EXPECT_TENSOR_CLOSE(out, expected_result); } @@ -170,59 +85,6 @@ TEST_F(OpLogOutTest, DynamicShapeUpperBoundSameAsExpected) { Tensor out = tf.zeros({3, 2}, torch::executor::TensorShapeDynamism::DYNAMIC_BOUND); - Tensor ret = op_log_out(x, out); - EXPECT_TENSOR_CLOSE(out, expected_result); -} - -TEST_F(OpLogOutTest, DynamicShapeUpperBoundLargerThanExpected) { - TensorFactory tf; - - Tensor x = tf.make( - {3, 2}, - {0.6879220604896545, - 0.8289883136749268, - 0.7889447808265686, - 0.6339777112007141, - 0.8719115853309631, - 0.4185197353363037}); - Tensor expected_result = tf.make( - {3, 2}, - {-0.37407973408699036, - -0.18754921853542328, - -0.23705895245075226, - -0.4557414948940277, - -0.1370672583580017, - -0.8710312247276306}); - - Tensor out = - tf.zeros({10, 10}, torch::executor::TensorShapeDynamism::DYNAMIC_BOUND); - Tensor ret = op_log_out(x, out); - EXPECT_TENSOR_CLOSE(out, expected_result); -} - -TEST_F(OpLogOutTest, DynamicShapeUnbound) { - GTEST_SKIP() << "Dynamic shape unbound not supported"; - TensorFactory tf; - - Tensor x = tf.make( - {3, 2}, - {0.6879220604896545, - 0.8289883136749268, - 0.7889447808265686, - 0.6339777112007141, - 0.8719115853309631, - 0.4185197353363037}); - Tensor expected_result = tf.make( - {3, 2}, - {-0.37407973408699036, - -0.18754921853542328, - -0.23705895245075226, - -0.4557414948940277, - -0.1370672583580017, - -0.8710312247276306}); - - Tensor out = - tf.zeros({1, 1}, torch::executor::TensorShapeDynamism::DYNAMIC_UNBOUND); - Tensor ret = op_log_out(x, out); + Tensor ret = op_out(x, out); EXPECT_TENSOR_CLOSE(out, expected_result); } diff --git a/kernels/test/op_logical_and_test.cpp b/kernels/test/op_logical_and_test.cpp index 68422ee7493..454b2f0d663 100644 --- a/kernels/test/op_logical_and_test.cpp +++ b/kernels/test/op_logical_and_test.cpp @@ -6,23 +6,26 @@ * LICENSE file in the root directory of this source tree. 
*/ +#include #include // Declares the operator -#include -#include -#include -#include #include -using namespace ::testing; -using exec_aten::ScalarType; using exec_aten::Tensor; -using torch::executor::testing::TensorFactory; -class OpLogicalAndTest : public OperatorTest { +class OpLogicalAndTest : public torch::executor::testing::BinaryLogicalOpTest { protected: - Tensor& - op_logical_and_out(const Tensor& self, const Tensor& other, Tensor& out) { + Tensor& op_out(const Tensor& self, const Tensor& other, Tensor& out) + override { return torch::executor::aten::logical_and_outf(context_, self, other, out); } + + double op_reference(double x, double y) const override { + uint64_t lhs, rhs; + std::memcpy(&lhs, &x, sizeof(lhs)); + std::memcpy(&rhs, &y, sizeof(rhs)); + return lhs && rhs; + } }; + +IMPLEMENT_BINARY_LOGICAL_OP_TEST(OpLogicalAndTest) diff --git a/kernels/test/op_logical_or_test.cpp b/kernels/test/op_logical_or_test.cpp index e8dfb5e589e..1f966a72124 100644 --- a/kernels/test/op_logical_or_test.cpp +++ b/kernels/test/op_logical_or_test.cpp @@ -6,23 +6,26 @@ * LICENSE file in the root directory of this source tree. */ +#include #include // Declares the operator -#include -#include -#include -#include #include -using namespace ::testing; -using exec_aten::ScalarType; using exec_aten::Tensor; -using torch::executor::testing::TensorFactory; -class OpLogicalOrTest : public OperatorTest { +class OpLogicalOrTest : public torch::executor::testing::BinaryLogicalOpTest { protected: - Tensor& - op_logical_or_out(const Tensor& self, const Tensor& other, Tensor& out) { + Tensor& op_out(const Tensor& self, const Tensor& other, Tensor& out) + override { return torch::executor::aten::logical_or_outf(context_, self, other, out); } + + double op_reference(double x, double y) const override { + uint64_t lhs, rhs; + std::memcpy(&lhs, &x, sizeof(lhs)); + std::memcpy(&rhs, &y, sizeof(rhs)); + return lhs || rhs; + } }; + +IMPLEMENT_BINARY_LOGICAL_OP_TEST(OpLogicalOrTest) diff --git a/kernels/test/op_logical_xor_test.cpp b/kernels/test/op_logical_xor_test.cpp index ab162a27967..b1fdddcc1fb 100644 --- a/kernels/test/op_logical_xor_test.cpp +++ b/kernels/test/op_logical_xor_test.cpp @@ -6,23 +6,26 @@ * LICENSE file in the root directory of this source tree. 
*/ +#include #include // Declares the operator -#include -#include -#include -#include #include -using namespace ::testing; -using exec_aten::ScalarType; using exec_aten::Tensor; -using torch::executor::testing::TensorFactory; -class OpLogicalXorTest : public OperatorTest { +class OpLogicalXorTest : public torch::executor::testing::BinaryLogicalOpTest { protected: - Tensor& - op_logical_xor_out(const Tensor& self, const Tensor& other, Tensor& out) { + Tensor& op_out(const Tensor& self, const Tensor& other, Tensor& out) + override { return torch::executor::aten::logical_xor_outf(context_, self, other, out); } + + double op_reference(double x, double y) const override { + uint64_t lhs, rhs; + std::memcpy(&lhs, &x, sizeof(lhs)); + std::memcpy(&rhs, &y, sizeof(rhs)); + return bool(lhs) != bool(rhs); + } }; + +IMPLEMENT_BINARY_LOGICAL_OP_TEST(OpLogicalXorTest) diff --git a/kernels/test/op_mul_test.cpp b/kernels/test/op_mul_test.cpp index f8205ea601e..f3c9e54c862 100644 --- a/kernels/test/op_mul_test.cpp +++ b/kernels/test/op_mul_test.cpp @@ -17,11 +17,12 @@ #include using namespace ::testing; -using exec_aten::Scalar; -using exec_aten::ScalarType; -using exec_aten::Tensor; +using executorch::aten::Scalar; +using executorch::aten::ScalarType; +using executorch::aten::Tensor; +using executorch::runtime::testing::TensorFactory; using torch::executor::testing::SupportedFeatures; -using torch::executor::testing::TensorFactory; +namespace etrt = executorch::runtime; class OpMulOutTest : public OperatorTest { protected: @@ -61,7 +62,8 @@ class OpMulOutTest : public OperatorTest { test_mul(); test_mul(); // Integral out type is only allowed if both inputs are integral types - if (isIntegralType(DTYPE_A, false) && isIntegralType(DTYPE_B, false)) { + if (etrt::isIntegralType(DTYPE_A, false) && + etrt::isIntegralType(DTYPE_B, false)) { test_mul(); test_mul(); } diff --git a/kernels/test/op_reciprocal_test.cpp b/kernels/test/op_reciprocal_test.cpp index b3da01c7905..5835b8c91f5 100644 --- a/kernels/test/op_reciprocal_test.cpp +++ b/kernels/test/op_reciprocal_test.cpp @@ -7,65 +7,26 @@ */ #include // Declares the operator -#include -#include -#include -#include -#include +#include #include -using namespace ::testing; -using exec_aten::ScalarType; -using exec_aten::Tensor; -using torch::executor::testing::TensorFactory; +#include -class OpReciprocalTest : public OperatorTest { +using exec_aten::Tensor; +class OpReciprocalOutTest + : public torch::executor::testing::UnaryUfuncRealHBBF16ToFloatHBF16Test { protected: - Tensor& op_reciprocal_out(const Tensor& self, Tensor& out) { + Tensor& op_out(const Tensor& self, Tensor& out) override { return torch::executor::aten::reciprocal_outf(context_, self, out); } -}; - -TEST_F(OpReciprocalTest, SanityCheck) { - TensorFactory tf; - - Tensor in = tf.make({1, 7}, {-3.0, -2.99, -1.01, 0.0, 1.01, 2.99, 3.0}); - Tensor out = tf.zeros({1, 7}); - // clang-format off - Tensor expected = tf.make({1, 7}, {-0.333333, -0.334448, -0.990099, std::numeric_limits::infinity(), 0.990099, 0.334448, 0.333333}); - // clang-format on - - Tensor ret = op_reciprocal_out(in, out); - - EXPECT_TENSOR_EQ(out, ret); - EXPECT_TENSOR_CLOSE(out, expected); -} -TEST_F(OpReciprocalTest, HandleBoolInput) { - TensorFactory tf_bool; - TensorFactory tf_float; - - const std::vector sizes = {1, 2}; - - Tensor a = tf_bool.make(sizes, /*data=*/{false, true}); - Tensor out = tf_float.zeros(sizes); - Tensor res = tf_float.make(sizes, /*data=*/{INFINITY, 1.0}); - - EXPECT_TENSOR_CLOSE(op_reciprocal_out(a, out), 
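
The op_reference overrides added for logical_and/logical_or/logical_xor above reinterpret each double operand's raw bits through std::memcpy before applying &&, || or !=. A short stand-alone sketch of that type-punning idiom; the values are chosen only to show that -0.0 and NaN have nonzero bit patterns even though bool(-0.0) is false:

#include <cstdint>
#include <cstring>
#include <cstdio>
#include <cmath>

// Well-defined way to read a double's bit pattern in C++ (no undefined
// behavior, unlike a reinterpret_cast through pointers).
std::uint64_t bits_of(double x) {
  std::uint64_t b = 0;
  static_assert(sizeof(b) == sizeof(x), "double must be 64-bit here");
  std::memcpy(&b, &x, sizeof(b));
  return b;
}

int main() {
  std::printf("bits(0.0)  = %#llx\n", (unsigned long long)bits_of(0.0));   // all zero
  std::printf("bits(-0.0) = %#llx\n", (unsigned long long)bits_of(-0.0));  // sign bit only, nonzero
  std::printf("bits(NAN)  = %#llx\n", (unsigned long long)bits_of(NAN));   // nonzero
  // Treating "nonzero bit pattern" as true therefore differs from bool(x)
  // only for -0.0; ordinary values and NaN behave the same either way.
  return 0;
}
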
res); -} - -TEST_F(OpReciprocalTest, HandleHalfInput) { - if (torch::executor::testing::SupportedFeatures::get()->is_aten) { - GTEST_SKIP() << "Test Half support only for ExecuTorch mode"; + double op_reference(double x) const override { + return 1.0 / x; } - TensorFactory tf_half; - - const std::vector sizes = {1, 2}; - Tensor a = tf_half.make(sizes, /*data=*/{5.0, -2.0}); - Tensor out = tf_half.zeros(sizes); - Tensor res = tf_half.make(sizes, /*data=*/{0.2, -0.5}); + torch::executor::testing::SupportedFeatures* get_supported_features() + const override; +}; - EXPECT_TENSOR_CLOSE(op_reciprocal_out(a, out), res); -} +IMPLEMENT_UNARY_UFUNC_REALHB_TO_FLOATH_TEST(OpReciprocalOutTest) diff --git a/kernels/test/op_rsqrt_test.cpp b/kernels/test/op_rsqrt_test.cpp index 3332e3be8e1..c52eeb50c60 100644 --- a/kernels/test/op_rsqrt_test.cpp +++ b/kernels/test/op_rsqrt_test.cpp @@ -7,65 +7,26 @@ */ #include // Declares the operator -#include -#include -#include -#include -#include +#include #include -using namespace ::testing; -using exec_aten::ScalarType; -using exec_aten::Tensor; -using torch::executor::testing::TensorFactory; +#include -class OpRsqrtTest : public OperatorTest { +using exec_aten::Tensor; +class OpRsqrtOutTest + : public torch::executor::testing::UnaryUfuncRealHBBF16ToFloatHBF16Test { protected: - Tensor& op_rsqrt_out(const Tensor& self, Tensor& out) { + Tensor& op_out(const Tensor& self, Tensor& out) override { return torch::executor::aten::rsqrt_outf(context_, self, out); } -}; - -TEST_F(OpRsqrtTest, SanityCheck) { - TensorFactory tf; - - Tensor in = tf.make({1, 7}, {-3.0, -2.99, -1.01, 0.0, 1.01, 2.99, 3.0}); - Tensor out = tf.zeros({1, 7}); - // clang-format off - Tensor expected = tf.make({1, 7}, {NAN, NAN, NAN, std::numeric_limits::infinity(), 0.995037, 0.578315, 0.577350}); - // clang-format on - - Tensor ret = op_rsqrt_out(in, out); - - EXPECT_TENSOR_EQ(out, ret); - EXPECT_TENSOR_CLOSE(out, expected); -} -TEST_F(OpRsqrtTest, HandleBoolInput) { - TensorFactory tf_bool; - TensorFactory tf_float; - - const std::vector sizes = {1, 2}; - - Tensor a = tf_bool.make(sizes, /*data=*/{false, true}); - Tensor out = tf_float.zeros(sizes); - Tensor res = tf_float.make(sizes, /*data=*/{INFINITY, 1.0}); - - EXPECT_TENSOR_CLOSE(op_rsqrt_out(a, out), res); -} - -TEST_F(OpRsqrtTest, HandleHalfInput) { - if (torch::executor::testing::SupportedFeatures::get()->is_aten) { - GTEST_SKIP() << "Test Half support only for ExecuTorch mode"; + double op_reference(double x) const override { + return 1.0 / std::sqrt(x); } - TensorFactory tf_half; - - const std::vector sizes = {1, 2}; - Tensor a = tf_half.make(sizes, /*data=*/{3.5, 2.6}); - Tensor out = tf_half.zeros(sizes); - Tensor res = tf_half.make(sizes, /*data=*/{0.53452248, 0.62017367}); + torch::executor::testing::SupportedFeatures* get_supported_features() + const override; +}; - EXPECT_TENSOR_CLOSE(op_rsqrt_out(a, out), res); -} +IMPLEMENT_UNARY_UFUNC_REALHB_TO_FLOATH_TEST(OpRsqrtOutTest) diff --git a/kernels/test/op_sin_test.cpp b/kernels/test/op_sin_test.cpp index 74991f6cc37..e2c883f4899 100644 --- a/kernels/test/op_sin_test.cpp +++ b/kernels/test/op_sin_test.cpp @@ -7,164 +7,26 @@ */ #include // Declares the operator -#include -#include -#include -#include -#include +#include #include -using namespace ::testing; -using exec_aten::ScalarType; -using exec_aten::Tensor; -using exec_aten::TensorShapeDynamism; -using torch::executor::testing::TensorFactory; +#include -class OpSinOutTest : public OperatorTest { +using exec_aten::Tensor; +class 
OpSinOutTest + : public torch::executor::testing::UnaryUfuncRealHBBF16ToFloatHBF16Test { protected: - Tensor& op_sin_out(const Tensor& self, Tensor& out) { + Tensor& op_out(const Tensor& self, Tensor& out) override { return torch::executor::aten::sin_outf(context_, self, out); } - // Common testing for sin operator and all kinds of supported input types - template - void test_floating_point_sin_out( - const std::vector& out_shape = {1, 6}, - TensorShapeDynamism dynamism = TensorShapeDynamism::STATIC) { - TensorFactory tf_in; - TensorFactory tf_out; - - // Destination for the sin operator. - Tensor out = tf_out.zeros(out_shape, dynamism); - - // clang-format off - op_sin_out(tf_in.make({1, 6}, { 0, 1, 3, 5, 10, 100 }), out); - - // Check that it matches (or close to) the expected output. - EXPECT_TENSOR_CLOSE( - out, - tf_out.make({1, 6}, { 0.000000, 0.841471, 0.141120, -0.958924, -0.544021, -0.506366 })); - // clang-format on + double op_reference(double x) const override { + return std::sin(x); } - // Unhandled output dtypes. - template - void test_sin_invalid_output_dtype_dies() { - TensorFactory tf; - TensorFactory tf_out; - - const std::vector sizes = {2, 5}; - - Tensor in = tf.ones(sizes); - Tensor out = tf_out.zeros(sizes); - - ET_EXPECT_KERNEL_FAILURE(context_, op_sin_out(in, out)); - } + torch::executor::testing::SupportedFeatures* get_supported_features() + const override; }; -TEST_F(OpSinOutTest, HandleBoolInput) { - TensorFactory tf_bool; - TensorFactory tf_float; - - const std::vector sizes = {1, 2}; - - Tensor a = tf_bool.make(sizes, /*data=*/{false, true}); - Tensor out = tf_float.zeros(sizes); - Tensor res = tf_float.make(sizes, /*data=*/{0.000000, 0.841471}); - - EXPECT_TENSOR_CLOSE(op_sin_out(a, out), res); -} - -TEST_F(OpSinOutTest, AllRealInputHalfOutputStaticDynamismSupport) { - if (torch::executor::testing::SupportedFeatures::get()->is_aten) { - GTEST_SKIP() << "Test Half support only for ExecuTorch mode"; - } -#define TEST_ENTRY(ctype, dtype) \ - test_floating_point_sin_out(); - ET_FORALL_REALH_TYPES(TEST_ENTRY); -#undef TEST_ENTRY -} - -TEST_F(OpSinOutTest, AllRealInputFloatOutputStaticDynamismSupport) { -#define TEST_ENTRY(ctype, dtype) \ - test_floating_point_sin_out(); - ET_FORALL_REAL_TYPES(TEST_ENTRY); -#undef TEST_ENTRY -} - -TEST_F(OpSinOutTest, AllRealInputDoubleOutputStaticDynamismSupport) { -#define TEST_ENTRY(ctype, dtype) \ - test_floating_point_sin_out(); - ET_FORALL_REAL_TYPES(TEST_ENTRY); -#undef TEST_ENTRY -} - -TEST_F(OpSinOutTest, AllRealInputHalfOutputBoundDynamismSupport) { - if (torch::executor::testing::SupportedFeatures::get()->is_aten) { - GTEST_SKIP() << "Test Half support only for ExecuTorch mode"; - } -#define TEST_ENTRY(ctype, dtype) \ - test_floating_point_sin_out( \ - {10, 10}, TensorShapeDynamism::DYNAMIC_BOUND); - ET_FORALL_REALH_TYPES(TEST_ENTRY); -#undef TEST_ENTRY -} - -TEST_F(OpSinOutTest, AllRealInputFloatOutputBoundDynamismSupport) { -#define TEST_ENTRY(ctype, dtype) \ - test_floating_point_sin_out( \ - {10, 10}, TensorShapeDynamism::DYNAMIC_BOUND); - ET_FORALL_REAL_TYPES(TEST_ENTRY); -#undef TEST_ENTRY -} - -TEST_F(OpSinOutTest, AllRealInputDoubleOutputBoundDynamismSupport) { -#define TEST_ENTRY(ctype, dtype) \ - test_floating_point_sin_out( \ - {10, 10}, TensorShapeDynamism::DYNAMIC_BOUND); - ET_FORALL_REAL_TYPES(TEST_ENTRY); -#undef TEST_ENTRY -} - -TEST_F(OpSinOutTest, AllRealInputFloatOutputUnboundDynamismSupport) { - if (!torch::executor::testing::SupportedFeatures::get()->is_aten) { - GTEST_SKIP() << "Dynamic shape unbound 
not supported"; - } -#define TEST_ENTRY(ctype, dtype) \ - test_floating_point_sin_out( \ - {1, 1}, TensorShapeDynamism::DYNAMIC_UNBOUND); - ET_FORALL_REAL_TYPES(TEST_ENTRY); -#undef TEST_ENTRY -} - -TEST_F(OpSinOutTest, AllRealInputDoubleOutputUnboundDynamismSupport) { - if (!torch::executor::testing::SupportedFeatures::get()->is_aten) { - GTEST_SKIP() << "Dynamic shape unbound not supported"; - } -#define TEST_ENTRY(ctype, dtype) \ - test_floating_point_sin_out( \ - {1, 1}, TensorShapeDynamism::DYNAMIC_UNBOUND); - ET_FORALL_REAL_TYPES(TEST_ENTRY); -#undef TEST_ENTRY -} - -TEST_F(OpSinOutTest, AllNonFloatOutputDTypeDies) { -#define TEST_ENTRY(ctype, dtype) \ - test_sin_invalid_output_dtype_dies(); - ET_FORALL_INT_TYPES(TEST_ENTRY); -#undef TEST_ENTRY -} - -// Mismatched shape tests. -TEST_F(OpSinOutTest, MismatchedInputShapesDies) { - if (torch::executor::testing::SupportedFeatures::get()->is_aten) { - GTEST_SKIP() << "ATen kernel can handle mismatched input shapes"; - } - TensorFactory tf; - - Tensor a = tf.ones(/*sizes=*/{4}); - Tensor out = tf.ones(/*sizes=*/{2, 2}); - - ET_EXPECT_KERNEL_FAILURE(context_, op_sin_out(a, out)); -} +IMPLEMENT_UNARY_UFUNC_REALHB_TO_FLOATH_TEST(OpSinOutTest) diff --git a/kernels/test/op_sinh_test.cpp b/kernels/test/op_sinh_test.cpp index 8f533ad3251..a9c41bc3fca 100644 --- a/kernels/test/op_sinh_test.cpp +++ b/kernels/test/op_sinh_test.cpp @@ -7,164 +7,26 @@ */ #include // Declares the operator -#include -#include -#include -#include -#include +#include #include -using namespace ::testing; -using exec_aten::ScalarType; -using exec_aten::Tensor; -using exec_aten::TensorShapeDynamism; -using torch::executor::testing::TensorFactory; +#include -class OpSinhOutTest : public OperatorTest { +using exec_aten::Tensor; +class OpSinhOutTest + : public torch::executor::testing::UnaryUfuncRealHBBF16ToFloatHBF16Test { protected: - Tensor& op_sinh_out(const Tensor& self, Tensor& out) { + Tensor& op_out(const Tensor& self, Tensor& out) override { return torch::executor::aten::sinh_outf(context_, self, out); } - // Common testing for sinh operator and all kinds of supported input types - template - void test_floating_point_sinh_out( - const std::vector& out_shape = {1, 5}, - TensorShapeDynamism dynamism = TensorShapeDynamism::STATIC) { - TensorFactory tf_in; - TensorFactory tf_out; - - // Destination for the sinh operator. - Tensor out = tf_out.zeros(out_shape, dynamism); - - // clang-format off - op_sinh_out(tf_in.make({1, 5}, { 0, 1, 3, 5, 10 }), out); - - // Check that it matches (or close to) the expected output. - EXPECT_TENSOR_CLOSE( - out, - tf_out.make({1, 5}, { 0.000000e+00, 1.175201e+00, 1.001787e+01, 7.420321e+01, 1.101323e+04 })); - // clang-format on + double op_reference(double x) const override { + return std::sinh(x); } - // Unhandled output dtypes. 
- template - void test_sinh_invalid_output_dtype_dies() { - TensorFactory tf; - TensorFactory tf_out; - - const std::vector sizes = {2, 5}; - - Tensor in = tf.ones(sizes); - Tensor out = tf_out.zeros(sizes); - - ET_EXPECT_KERNEL_FAILURE(context_, op_sinh_out(in, out)); - } + torch::executor::testing::SupportedFeatures* get_supported_features() + const override; }; -TEST_F(OpSinhOutTest, HandleBoolInput) { - TensorFactory tf_bool; - TensorFactory tf_float; - - const std::vector sizes = {1, 2}; - - Tensor a = tf_bool.make(sizes, /*data=*/{false, true}); - Tensor out = tf_float.zeros(sizes); - Tensor res = tf_float.make(sizes, /*data=*/{0.000000, 1.175201}); - - EXPECT_TENSOR_CLOSE(op_sinh_out(a, out), res); -} - -TEST_F(OpSinhOutTest, AllRealInputHalfOutputStaticDynamismSupport) { - if (torch::executor::testing::SupportedFeatures::get()->is_aten) { - GTEST_SKIP() << "Test Half support only for ExecuTorch mode"; - } -#define TEST_ENTRY(ctype, dtype) \ - test_floating_point_sinh_out(); - ET_FORALL_REALH_TYPES(TEST_ENTRY); -#undef TEST_ENTRY -} - -TEST_F(OpSinhOutTest, AllRealInputFloatOutputStaticDynamismSupport) { -#define TEST_ENTRY(ctype, dtype) \ - test_floating_point_sinh_out(); - ET_FORALL_REAL_TYPES(TEST_ENTRY); -#undef TEST_ENTRY -} - -TEST_F(OpSinhOutTest, AllRealInputDoubleOutputStaticDynamismSupport) { -#define TEST_ENTRY(ctype, dtype) \ - test_floating_point_sinh_out(); - ET_FORALL_REAL_TYPES(TEST_ENTRY); -#undef TEST_ENTRY -} - -TEST_F(OpSinhOutTest, AllRealInputHalfOutputBoundDynamismSupport) { - if (torch::executor::testing::SupportedFeatures::get()->is_aten) { - GTEST_SKIP() << "Test Half support only for ExecuTorch mode"; - } -#define TEST_ENTRY(ctype, dtype) \ - test_floating_point_sinh_out( \ - {10, 10}, TensorShapeDynamism::DYNAMIC_BOUND); - ET_FORALL_REALH_TYPES(TEST_ENTRY); -#undef TEST_ENTRY -} - -TEST_F(OpSinhOutTest, AllRealInputFloatOutputBoundDynamismSupport) { -#define TEST_ENTRY(ctype, dtype) \ - test_floating_point_sinh_out( \ - {10, 10}, TensorShapeDynamism::DYNAMIC_BOUND); - ET_FORALL_REAL_TYPES(TEST_ENTRY); -#undef TEST_ENTRY -} - -TEST_F(OpSinhOutTest, AllRealInputDoubleOutputBoundDynamismSupport) { -#define TEST_ENTRY(ctype, dtype) \ - test_floating_point_sinh_out( \ - {10, 10}, TensorShapeDynamism::DYNAMIC_BOUND); - ET_FORALL_REAL_TYPES(TEST_ENTRY); -#undef TEST_ENTRY -} - -TEST_F(OpSinhOutTest, AllRealInputFloatOutputUnboundDynamismSupport) { - if (!torch::executor::testing::SupportedFeatures::get()->is_aten) { - GTEST_SKIP() << "Dynamic shape unbound not supported"; - } -#define TEST_ENTRY(ctype, dtype) \ - test_floating_point_sinh_out( \ - {1, 1}, TensorShapeDynamism::DYNAMIC_UNBOUND); - ET_FORALL_REAL_TYPES(TEST_ENTRY); -#undef TEST_ENTRY -} - -TEST_F(OpSinhOutTest, AllRealInputDoubleOutputUnboundDynamismSupport) { - if (!torch::executor::testing::SupportedFeatures::get()->is_aten) { - GTEST_SKIP() << "Dynamic shape unbound not supported"; - } -#define TEST_ENTRY(ctype, dtype) \ - test_floating_point_sinh_out( \ - {1, 1}, TensorShapeDynamism::DYNAMIC_UNBOUND); - ET_FORALL_REAL_TYPES(TEST_ENTRY); -#undef TEST_ENTRY -} - -TEST_F(OpSinhOutTest, AllNonFloatOutputDTypeDies) { -#define TEST_ENTRY(ctype, dtype) \ - test_sinh_invalid_output_dtype_dies(); - ET_FORALL_INT_TYPES(TEST_ENTRY); -#undef TEST_ENTRY -} - -// Mismatched shape tests. 
-TEST_F(OpSinhOutTest, MismatchedInputShapesDies) { - if (torch::executor::testing::SupportedFeatures::get()->is_aten) { - GTEST_SKIP() << "ATen kernel can handle mismatched input shapes"; - } - TensorFactory tf; - - Tensor a = tf.ones(/*sizes=*/{4}); - Tensor out = tf.ones(/*sizes=*/{2, 2}); - - ET_EXPECT_KERNEL_FAILURE(context_, op_sinh_out(a, out)); -} +IMPLEMENT_UNARY_UFUNC_REALHB_TO_FLOATH_TEST(OpSinhOutTest) diff --git a/kernels/test/op_sqrt_test.cpp b/kernels/test/op_sqrt_test.cpp index f14f345a8cf..3bbed6ecf53 100644 --- a/kernels/test/op_sqrt_test.cpp +++ b/kernels/test/op_sqrt_test.cpp @@ -7,65 +7,26 @@ */ #include // Declares the operator -#include -#include -#include -#include -#include +#include #include -using namespace ::testing; -using exec_aten::ScalarType; -using exec_aten::Tensor; -using torch::executor::testing::TensorFactory; +#include -class OpSqrtTest : public OperatorTest { +using exec_aten::Tensor; +class OpSqrtOutTest + : public torch::executor::testing::UnaryUfuncRealHBBF16ToFloatHBF16Test { protected: - Tensor& op_sqrt_out(const Tensor& self, Tensor& out) { + Tensor& op_out(const Tensor& self, Tensor& out) override { return torch::executor::aten::sqrt_outf(context_, self, out); } -}; - -TEST_F(OpSqrtTest, SanityCheck) { - TensorFactory tf; - - Tensor in = tf.make({1, 7}, {-9., -2., -1., 0., 1., 2., 9.}); - Tensor out = tf.zeros({1, 7}); - // clang-format off - Tensor expected = tf.make({1, 7}, {NAN, NAN, NAN, 0., 1., 1.414214, 3.}); - // clang-format on - - Tensor ret = op_sqrt_out(in, out); - - EXPECT_TENSOR_EQ(out, ret); - EXPECT_TENSOR_CLOSE(out, expected); -} -TEST_F(OpSqrtTest, HandleBoolInput) { - TensorFactory tf_bool; - TensorFactory tf_float; - - const std::vector sizes = {1, 2}; - - Tensor a = tf_bool.make(sizes, /*data=*/{false, true}); - Tensor out = tf_float.zeros(sizes); - Tensor res = tf_float.make(sizes, /*data=*/{0.0, 1.0}); - - EXPECT_TENSOR_CLOSE(op_sqrt_out(a, out), res); -} - -TEST_F(OpSqrtTest, HandleHalfInput) { - if (torch::executor::testing::SupportedFeatures::get()->is_aten) { - GTEST_SKIP() << "Test Half support only for ExecuTorch mode"; + double op_reference(double x) const override { + return std::sqrt(x); } - TensorFactory tf_half; - - const std::vector sizes = {1, 2}; - Tensor a = tf_half.make(sizes, /*data=*/{4.0, 6.25}); - Tensor out = tf_half.zeros(sizes); - Tensor res = tf_half.make(sizes, /*data=*/{2.0, 2.5}); + torch::executor::testing::SupportedFeatures* get_supported_features() + const override; +}; - EXPECT_TENSOR_CLOSE(op_sqrt_out(a, out), res); -} +IMPLEMENT_UNARY_UFUNC_REALHB_TO_FLOATH_TEST(OpSqrtOutTest) diff --git a/kernels/test/op_sub_test.cpp b/kernels/test/op_sub_test.cpp index 9f795516723..f0285bc85e9 100644 --- a/kernels/test/op_sub_test.cpp +++ b/kernels/test/op_sub_test.cpp @@ -16,11 +16,12 @@ #include using namespace ::testing; -using exec_aten::Scalar; -using exec_aten::ScalarType; -using exec_aten::Tensor; +using executorch::aten::Scalar; +using executorch::aten::ScalarType; +using executorch::aten::Tensor; +using executorch::runtime::testing::TensorFactory; using torch::executor::testing::SupportedFeatures; -using torch::executor::testing::TensorFactory; +namespace etrt = executorch::runtime; class OpSubOutTest : public OperatorTest { protected: @@ -60,7 +61,8 @@ class OpSubOutTest : public OperatorTest { test_sub(); test_sub(); // Integral out type is only allowed if both inputs are integral types - if (isIntegralType(DTYPE_A, false) && isIntegralType(DTYPE_B, false)) { + if (etrt::isIntegralType(DTYPE_A, 
false) && + etrt::isIntegralType(DTYPE_B, false)) { test_sub(); test_sub(); } @@ -105,6 +107,27 @@ class OpSubOutTest : public OperatorTest { #undef ENUMERATE_TEST_ENTRY } + + template + void test_broadcast_rank1_scalar() { + TensorFactory tf; + + Tensor a = tf.make({2, 1, 3}, {2, 3, 4, 5, 6, 7}); + Tensor b = tf.make({1}, {2}); + + // Destination for the broadcasting div. Follow the broadcasting rules in + // https://fburl.com/n9wl4d0o + Tensor out = tf.zeros({2, 1, 3}); + + op_sub_out(a, b, 1, out); + + Tensor ret = tf.make({2, 1, 3}, {0, 1, 2, 3, 4, 5}); + EXPECT_TENSOR_EQ(out, ret); + + op_sub_out(b, a, 1, out); + ret = tf.make({2, 1, 3}, {0, -1, -2, -3, -4, -5}); + EXPECT_TENSOR_EQ(out, ret); + } }; class OpSubScalarOutTest : public OperatorTest { @@ -169,19 +192,8 @@ TEST_F(OpSubOutTest, BroadcastSupported2) { } TEST_F(OpSubOutTest, BroadcastScalarSupported1) { - TensorFactory tf; - - Tensor a = tf.make({2, 1, 3}, {2, 3, 4, 5, 6, 7}); - Tensor b = tf.make({1}, {2}); - - // Destination for the broadcasting div. Follow the broadcasting rules in - // https://fburl.com/n9wl4d0o - Tensor out = tf.zeros({2, 1, 3}); - - op_sub_out(a, b, 1, out); - - Tensor ret = tf.make({2, 1, 3}, {0, 1, 2, 3, 4, 5}); - EXPECT_TENSOR_EQ(out, ret); + test_broadcast_rank1_scalar(); + test_broadcast_rank1_scalar(); } TEST_F(OpSubOutTest, BroadcastScalarSupported2) { diff --git a/kernels/test/op_tan_test.cpp b/kernels/test/op_tan_test.cpp index 560da69d8bb..bf0547d3381 100644 --- a/kernels/test/op_tan_test.cpp +++ b/kernels/test/op_tan_test.cpp @@ -7,164 +7,26 @@ */ #include // Declares the operator -#include -#include -#include -#include -#include +#include #include -using namespace ::testing; -using exec_aten::ScalarType; -using exec_aten::Tensor; -using exec_aten::TensorShapeDynamism; -using torch::executor::testing::TensorFactory; +#include -class OpTanOutTest : public OperatorTest { +using exec_aten::Tensor; +class OpTanOutTest + : public torch::executor::testing::UnaryUfuncRealHBBF16ToFloatHBF16Test { protected: - Tensor& op_tan_out(const Tensor& self, Tensor& out) { + Tensor& op_out(const Tensor& self, Tensor& out) override { return torch::executor::aten::tan_outf(context_, self, out); } - // Common testing for tan operator and all kinds of supported input types - template - void test_floating_point_tan_out( - const std::vector& out_shape = {1, 6}, - TensorShapeDynamism dynamism = TensorShapeDynamism::STATIC) { - TensorFactory tf_in; - TensorFactory tf_out; - - // Destination for the tan operator. - Tensor out = tf_out.zeros(out_shape, dynamism); - - // clang-format off - op_tan_out(tf_in.make({1, 6}, { 0, 1, 3, 5, 10, 100 }), out); - - // Check that it matches (or close to) the expected output. - EXPECT_TENSOR_CLOSE( - out, - tf_out.make({1, 6}, { 0.000000, 1.557408, -0.142547, -3.380515, 0.648361, -0.587214 })); - // clang-format on + double op_reference(double x) const override { + return std::tan(x); } - // Unhandled output dtypes. 
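
The test_broadcast_rank1_scalar helper added to op_sub_test.cpp above subtracts a shape {1} tensor from a shape {2, 1, 3} tensor and vice versa, relying on ordinary broadcasting: the single element of b is expanded to every position, so with alpha = 1 each output element is a[i] - b[0] (or b[0] - a[i] for the reversed call). A tiny stand-alone check of that arithmetic using plain arrays instead of Tensor, purely for illustration:

#include <array>
#include <cassert>
#include <cstddef>

int main() {
  // a has logical shape {2, 1, 3}; b has shape {1} and broadcasts to every element.
  std::array<int, 6> a = {2, 3, 4, 5, 6, 7};
  int b = 2;      // the single broadcast element
  int alpha = 1;  // sub.out computes a - alpha * b

  std::array<int, 6> expected = {0, 1, 2, 3, 4, 5};
  for (std::size_t i = 0; i < a.size(); ++i) {
    assert(a[i] - alpha * b == expected[i]);   // forward direction
    assert(b - alpha * a[i] == -expected[i]);  // reversed operands, as in the test
  }
  return 0;
}
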
- template - void test_tan_invalid_output_dtype_dies() { - TensorFactory tf; - TensorFactory tf_out; - - const std::vector sizes = {2, 5}; - - Tensor in = tf.ones(sizes); - Tensor out = tf_out.zeros(sizes); - - ET_EXPECT_KERNEL_FAILURE(context_, op_tan_out(in, out)); - } + torch::executor::testing::SupportedFeatures* get_supported_features() + const override; }; -TEST_F(OpTanOutTest, HandleBoolInput) { - TensorFactory tf_bool; - TensorFactory tf_float; - - const std::vector sizes = {1, 2}; - - Tensor a = tf_bool.make(sizes, /*data=*/{false, true}); - Tensor out = tf_float.zeros(sizes); - Tensor res = tf_float.make(sizes, /*data=*/{0.000000, 1.557408}); - - EXPECT_TENSOR_CLOSE(op_tan_out(a, out), res); -} - -TEST_F(OpTanOutTest, AllRealInputHalfOutputStaticDynamismSupport) { - if (torch::executor::testing::SupportedFeatures::get()->is_aten) { - GTEST_SKIP() << "Test Half support only for ExecuTorch mode"; - } -#define TEST_ENTRY(ctype, dtype) \ - test_floating_point_tan_out(); - ET_FORALL_REALH_TYPES(TEST_ENTRY); -#undef TEST_ENTRY -} - -TEST_F(OpTanOutTest, AllRealInputFloatOutputStaticDynamismSupport) { -#define TEST_ENTRY(ctype, dtype) \ - test_floating_point_tan_out(); - ET_FORALL_REAL_TYPES(TEST_ENTRY); -#undef TEST_ENTRY -} - -TEST_F(OpTanOutTest, AllRealInputDoubleOutputStaticDynamismSupport) { -#define TEST_ENTRY(ctype, dtype) \ - test_floating_point_tan_out(); - ET_FORALL_REAL_TYPES(TEST_ENTRY); -#undef TEST_ENTRY -} - -TEST_F(OpTanOutTest, AllRealInputHalfOutputBoundDynamismSupport) { - if (torch::executor::testing::SupportedFeatures::get()->is_aten) { - GTEST_SKIP() << "Test Half support only for ExecuTorch mode"; - } -#define TEST_ENTRY(ctype, dtype) \ - test_floating_point_tan_out( \ - {10, 10}, TensorShapeDynamism::DYNAMIC_BOUND); - ET_FORALL_REALH_TYPES(TEST_ENTRY); -#undef TEST_ENTRY -} - -TEST_F(OpTanOutTest, AllRealInputFloatOutputBoundDynamismSupport) { -#define TEST_ENTRY(ctype, dtype) \ - test_floating_point_tan_out( \ - {10, 10}, TensorShapeDynamism::DYNAMIC_BOUND); - ET_FORALL_REAL_TYPES(TEST_ENTRY); -#undef TEST_ENTRY -} - -TEST_F(OpTanOutTest, AllRealInputDoubleOutputBoundDynamismSupport) { -#define TEST_ENTRY(ctype, dtype) \ - test_floating_point_tan_out( \ - {10, 10}, TensorShapeDynamism::DYNAMIC_BOUND); - ET_FORALL_REAL_TYPES(TEST_ENTRY); -#undef TEST_ENTRY -} - -TEST_F(OpTanOutTest, AllRealInputFloatOutputUnboundDynamismSupport) { - if (!torch::executor::testing::SupportedFeatures::get()->is_aten) { - GTEST_SKIP() << "Dynamic shape unbound not supported"; - } -#define TEST_ENTRY(ctype, dtype) \ - test_floating_point_tan_out( \ - {1, 1}, TensorShapeDynamism::DYNAMIC_UNBOUND); - ET_FORALL_REAL_TYPES(TEST_ENTRY); -#undef TEST_ENTRY -} - -TEST_F(OpTanOutTest, AllRealInputDoubleOutputUnboundDynamismSupport) { - if (!torch::executor::testing::SupportedFeatures::get()->is_aten) { - GTEST_SKIP() << "Dynamic shape unbound not supported"; - } -#define TEST_ENTRY(ctype, dtype) \ - test_floating_point_tan_out( \ - {1, 1}, TensorShapeDynamism::DYNAMIC_UNBOUND); - ET_FORALL_REAL_TYPES(TEST_ENTRY); -#undef TEST_ENTRY -} - -TEST_F(OpTanOutTest, AllNonFloatOutputDTypeDies) { -#define TEST_ENTRY(ctype, dtype) \ - test_tan_invalid_output_dtype_dies(); - ET_FORALL_INT_TYPES(TEST_ENTRY); -#undef TEST_ENTRY -} - -// Mismatched shape tests. 
-TEST_F(OpTanOutTest, MismatchedInputShapesDies) { - if (torch::executor::testing::SupportedFeatures::get()->is_aten) { - GTEST_SKIP() << "ATen kernel can handle mismatched input shapes"; - } - TensorFactory tf; - - Tensor a = tf.ones(/*sizes=*/{4}); - Tensor out = tf.ones(/*sizes=*/{2, 2}); - - ET_EXPECT_KERNEL_FAILURE(context_, op_tan_out(a, out)); -} +IMPLEMENT_UNARY_UFUNC_REALHB_TO_FLOATH_TEST(OpTanOutTest) diff --git a/kernels/test/op_tanh_test.cpp b/kernels/test/op_tanh_test.cpp index 7ded964e8b3..7396353b566 100644 --- a/kernels/test/op_tanh_test.cpp +++ b/kernels/test/op_tanh_test.cpp @@ -7,254 +7,26 @@ */ #include // Declares the operator -#include -#include -#include -#include -#include +#include #include -using namespace ::testing; -using exec_aten::ScalarType; -using exec_aten::Tensor; -using torch::executor::testing::TensorFactory; +#include -class OpTanhOutTest : public OperatorTest { +using exec_aten::Tensor; +class OpTanhOutTest + : public torch::executor::testing::UnaryUfuncRealHBBF16ToFloatHBF16Test { protected: - Tensor& op_tanh_out(const Tensor& self, Tensor& out) { + Tensor& op_out(const Tensor& self, Tensor& out) override { return torch::executor::aten::tanh_outf(context_, self, out); } - // Common testing for tanh operator and all kinds of supported input types - template - void test_floating_point_tanh_out() { - TensorFactory tf_in; - TensorFactory tf_out; - - const std::vector sizes = {1, 12}; - - // Destination for the tanh operator. - Tensor out = tf_out.zeros(sizes); - - // clang-format off - op_tanh_out( - tf_in.make(sizes, /*data=*/{ 0, 1, 2, 3, 4, 5, - 6, 7, 8, 9, 10, 100}), - out); - - // Check that it matches (or close to) the expected output. - EXPECT_TENSOR_CLOSE( - out, - tf_out.make( - sizes, /*data=*/{ 0.0000000000, 0.7615941763, - 0.9640275836, 0.9950547814, 0.9993293285, - 0.9999092221, 0.9999877214, 0.9999983311, - 0.9999997616, 0.9999999404, 1.0000000000, 1.0000000000})); - // clang-format on + double op_reference(double x) const override { + return std::tanh(x); } - // Unhandled output dtypes. 
- template - void test_tanh_invalid_output_dtype_dies() { - TensorFactory tf; - TensorFactory tf_out; - - const std::vector sizes = {2, 5}; - - Tensor in = tf.ones(sizes); - Tensor out = tf_out.zeros(sizes); - - ET_EXPECT_KERNEL_FAILURE(context_, op_tanh_out(in, out)); - } + torch::executor::testing::SupportedFeatures* get_supported_features() + const override; }; -TEST_F(OpTanhOutTest, HandleBoolInput) { - TensorFactory tf_bool; - TensorFactory tf_float; - - const std::vector sizes = {1, 2}; - - Tensor a = tf_bool.make(sizes, /*data=*/{false, true}); - Tensor out = tf_float.zeros(sizes); - Tensor res = tf_float.make(sizes, /*data=*/{0.000000, 0.761594}); - - EXPECT_TENSOR_CLOSE(op_tanh_out(a, out), res); -} - -TEST_F(OpTanhOutTest, AllRealInputHalfOutputSupport) { - if (torch::executor::testing::SupportedFeatures::get()->is_aten) { - GTEST_SKIP() << "Test Half support only for ExecuTorch mode"; - } -#define TEST_ENTRY(ctype, dtype) \ - test_floating_point_tanh_out(); - ET_FORALL_REALH_TYPES(TEST_ENTRY); -#undef TEST_ENTRY -} - -TEST_F(OpTanhOutTest, AllRealInputFloatOutputSupport) { -#define TEST_ENTRY(ctype, dtype) \ - test_floating_point_tanh_out(); - ET_FORALL_REAL_TYPES(TEST_ENTRY); -#undef TEST_ENTRY -} - -TEST_F(OpTanhOutTest, AllRealInputDoubleOutputSupport) { -#define TEST_ENTRY(ctype, dtype) \ - test_floating_point_tanh_out(); - ET_FORALL_REAL_TYPES(TEST_ENTRY); -#undef TEST_ENTRY -} - -TEST_F(OpTanhOutTest, AllNonFloatOutputDTypeDies) { -#define TEST_ENTRY(ctype, dtype) \ - test_tanh_invalid_output_dtype_dies(); - ET_FORALL_INT_TYPES(TEST_ENTRY); -#undef TEST_ENTRY -} - -// Mismatched shape tests. -TEST_F(OpTanhOutTest, MismatchedInputShapesDies) { - if (torch::executor::testing::SupportedFeatures::get()->is_aten) { - GTEST_SKIP() << "ATen kernel can handle mismatched input shapes"; - } - TensorFactory tf; - - Tensor a = tf.ones(/*sizes=*/{4}); - Tensor out = tf.ones(/*sizes=*/{2, 2}); - - ET_EXPECT_KERNEL_FAILURE(context_, op_tanh_out(a, out)); -} - -TEST_F(OpTanhOutTest, SimpleGeneratedCase) { - TensorFactory tf; - - Tensor x = tf.make( - {10, 10}, - {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, - 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, - 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, - 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, - 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, - 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, - 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, - 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0}); - Tensor expected_result = tf.make( - {10, 10}, {0.7615941762924194, 0.7615941762924194, 0.7615941762924194, - 0.7615941762924194, 0.7615941762924194, 0.7615941762924194, - 0.7615941762924194, 0.7615941762924194, 0.7615941762924194, - 0.7615941762924194, 0.7615941762924194, 0.7615941762924194, - 0.7615941762924194, 0.7615941762924194, 0.7615941762924194, - 0.7615941762924194, 0.7615941762924194, 0.7615941762924194, - 0.7615941762924194, 0.7615941762924194, 0.7615941762924194, - 0.7615941762924194, 0.7615941762924194, 0.7615941762924194, - 0.7615941762924194, 0.7615941762924194, 0.7615941762924194, - 0.7615941762924194, 0.7615941762924194, 0.7615941762924194, - 0.7615941762924194, 0.7615941762924194, 0.7615941762924194, - 0.7615941762924194, 0.7615941762924194, 0.7615941762924194, - 0.7615941762924194, 0.7615941762924194, 0.7615941762924194, - 0.7615941762924194, 0.7615941762924194, 0.7615941762924194, - 0.7615941762924194, 
0.7615941762924194, 0.7615941762924194, - 0.7615941762924194, 0.7615941762924194, 0.7615941762924194, - 0.7615941762924194, 0.7615941762924194, 0.7615941762924194, - 0.7615941762924194, 0.7615941762924194, 0.7615941762924194, - 0.7615941762924194, 0.7615941762924194, 0.7615941762924194, - 0.7615941762924194, 0.7615941762924194, 0.7615941762924194, - 0.7615941762924194, 0.7615941762924194, 0.7615941762924194, - 0.7615941762924194, 0.7615941762924194, 0.7615941762924194, - 0.7615941762924194, 0.7615941762924194, 0.7615941762924194, - 0.7615941762924194, 0.7615941762924194, 0.7615941762924194, - 0.7615941762924194, 0.7615941762924194, 0.7615941762924194, - 0.7615941762924194, 0.7615941762924194, 0.7615941762924194, - 0.7615941762924194, 0.7615941762924194, 0.7615941762924194, - 0.7615941762924194, 0.7615941762924194, 0.7615941762924194, - 0.7615941762924194, 0.7615941762924194, 0.7615941762924194, - 0.7615941762924194, 0.7615941762924194, 0.7615941762924194, - 0.7615941762924194, 0.7615941762924194, 0.7615941762924194, - 0.7615941762924194, 0.7615941762924194, 0.7615941762924194, - 0.7615941762924194, 0.7615941762924194, 0.7615941762924194, - 0.7615941762924194}); - - Tensor out = tf.zeros({10, 10}); - Tensor ret = op_tanh_out(x, out); - EXPECT_TENSOR_CLOSE(out, expected_result); -} - -TEST_F(OpTanhOutTest, DynamicShapeUpperBoundSameAsExpected) { - TensorFactory tf; - - Tensor x = tf.make( - {3, 2}, - {0.23026639223098755, - 0.24356824159622192, - 0.9074369668960571, - 0.167863667011261, - 0.8099868297576904, - 0.6270960569381714}); - Tensor expected_result = tf.make( - {3, 2}, - {0.22628112137317657, - 0.2388632595539093, - 0.7198998332023621, - 0.1663045436143875, - 0.6695830225944519, - 0.5560494065284729}); - - Tensor out = - tf.zeros({3, 2}, torch::executor::TensorShapeDynamism::DYNAMIC_BOUND); - Tensor ret = op_tanh_out(x, out); - EXPECT_TENSOR_CLOSE(out, expected_result); -} - -TEST_F(OpTanhOutTest, DynamicShapeUpperBoundLargerThanExpected) { - TensorFactory tf; - - Tensor x = tf.make( - {3, 2}, - {0.23026639223098755, - 0.24356824159622192, - 0.9074369668960571, - 0.167863667011261, - 0.8099868297576904, - 0.6270960569381714}); - Tensor expected_result = tf.make( - {3, 2}, - {0.22628112137317657, - 0.2388632595539093, - 0.7198998332023621, - 0.1663045436143875, - 0.6695830225944519, - 0.5560494065284729}); - - Tensor out = - tf.zeros({10, 10}, torch::executor::TensorShapeDynamism::DYNAMIC_BOUND); - Tensor ret = op_tanh_out(x, out); - EXPECT_TENSOR_CLOSE(out, expected_result); -} - -TEST_F(OpTanhOutTest, DynamicShapeUnbound) { - GTEST_SKIP() << "Dynamic shape unbound not supported"; - TensorFactory tf; - - Tensor x = tf.make( - {3, 2}, - {0.23026639223098755, - 0.24356824159622192, - 0.9074369668960571, - 0.167863667011261, - 0.8099868297576904, - 0.6270960569381714}); - Tensor expected_result = tf.make( - {3, 2}, - {0.22628112137317657, - 0.2388632595539093, - 0.7198998332023621, - 0.1663045436143875, - 0.6695830225944519, - 0.5560494065284729}); - - Tensor out = - tf.zeros({1, 1}, torch::executor::TensorShapeDynamism::DYNAMIC_UNBOUND); - Tensor ret = op_tanh_out(x, out); - EXPECT_TENSOR_CLOSE(out, expected_result); -} +IMPLEMENT_UNARY_UFUNC_REALHB_TO_FLOATH_TEST(OpTanhOutTest) diff --git a/kernels/test/targets.bzl b/kernels/test/targets.bzl index f8ea484435a..7bc2e7555c6 100644 --- a/kernels/test/targets.bzl +++ b/kernels/test/targets.bzl @@ -15,7 +15,11 @@ def make_example_generated_op_test_target(): Makes a test for kernels/test/util generated_op_test() helper Here we use portable 
kernel. Try with `buck test xplat/executorch/kernels/test:op_<>_test` """ - op_test_cpp_files = native.glob(["op_*_test.cpp"]) + op_test_cpp_files = native.glob( + ["op_*_test.cpp"], + # linear has no portable op. + exclude = ["op_linear_test.cpp"], + ) # The op name is from the beginning to the part without `_test.cpp` (:-9) op_to_test = [f[:-9] for f in op_test_cpp_files] @@ -39,23 +43,34 @@ def define_common_targets(): aten_suffix = "_aten" if aten_kernel else "" runtime.cxx_library( name = "test_util" + aten_suffix, + srcs = [ + "BinaryLogicalOpTest.cpp", + "UnaryUfuncRealHBBF16ToFloatHBF16Test.cpp", + ], exported_headers = [ + "BinaryLogicalOpTest.h", "TestUtil.h", + "UnaryUfuncRealHBBF16ToFloatHBF16Test.h", ], visibility = [ "//executorch/kernels/...", "@EXECUTORCH_CLIENTS", ], preprocessor_flags = ["-DUSE_ATEN_LIB"] if aten_kernel else [], + exported_deps = [ + ":supported_features_header", + "//executorch/runtime/core/exec_aten:lib" + aten_suffix, + "//executorch/runtime/core/exec_aten/testing_util:tensor_util" + aten_suffix, + "//executorch/runtime/kernel:kernel_includes", + "//executorch/test/utils:utils" + aten_suffix, + ], fbcode_exported_deps = [ "//common/init:init", "//common/gtest:gtest", - "//executorch/runtime/kernel:kernel_includes", ], xplat_exported_deps = [ "//xplat/folly:init_init", "//third-party/googletest:gtest_main", - "//executorch/runtime/kernel:kernel_includes", ], ) diff --git a/pytest.ini b/pytest.ini index 701c0187ecf..ecd58ea07e4 100644 --- a/pytest.ini +++ b/pytest.ini @@ -29,6 +29,10 @@ addopts = # kernels/ kernels/prim_ops/test kernels/quantized + # Because this test depends on test only cpp ops lib + # Will add test only cmake targets to re-enable this test + # but maybe it is a bit of anti-pattern + --ignore=kernels/quantized/test/test_quant_dequant_per_token.py kernels/test/test_case_gen.py # backends/xnnpack backends/xnnpack/test @@ -39,7 +43,7 @@ addopts = --ignore=backends/xnnpack/test/ops/linear.py --ignore=backends/xnnpack/test/models/llama2_et_example.py # T200992559: Add torchao to ET as core dependency - --ignore=examples/models/llama2/tests/test_spinquant_transforms.py + --ignore=examples/models/llama2/tests/test_pre_quantization_transforms.py --ignore=exir/backend/test/demos --ignore=exir/backend/test/test_backends.py --ignore=exir/backend/test/test_backends_lifted.py diff --git a/runtime/core/evalue.h b/runtime/core/evalue.h index c0c534e0692..d2e57c35d4d 100644 --- a/runtime/core/evalue.h +++ b/runtime/core/evalue.h @@ -248,7 +248,9 @@ struct EValue { decltype(*std::forward(value)), EValue>::value>::type* = 0) { ET_CHECK_MSG(value != nullptr, "Pointer is null."); - *this = EValue(*std::forward(value)); + // Note that this ctor does not initialize this->tag directly; it is set by + // moving in the new value. + moveFrom(*std::forward(value)); } // Delete constructor for raw pointers to ensure they cannot be used. diff --git a/runtime/core/event_tracer.h b/runtime/core/event_tracer.h index 5a26d24ca45..ff483b1f77d 100644 --- a/runtime/core/event_tracer.h +++ b/runtime/core/event_tracer.h @@ -67,6 +67,18 @@ enum class EventTracerDebugLogLevel { kIntermediateOutputs, }; +/** + * Indicates the level of profiling that should be enabled. Profiling + * events will be logged in increasing order of verbosity as we go down the + * enum list. Thus it is important to keep the enum values in the right order. + */ +enum class EventTracerProfilingLevel { + /// No operator profiling. + kProfileMethodOnly, + /// All profiling events enabled. 
+ kProfileAllEvents, +}; + /** * This is the struct which should be returned when a profiling event is * started. This is used to uniquely identify that profiling event and will be @@ -423,6 +435,21 @@ class EventTracer { return event_tracer_debug_level_; } + /** + * Set the level of event tracer profiling that is desired. + */ + void set_event_tracer_profiling_level( + EventTracerProfilingLevel profiling_level) { + event_tracer_profiling_level_ = profiling_level; + } + + /** + * Return the current level of event tracer profiling. + */ + EventTracerProfilingLevel event_tracer_profiling_level() { + return event_tracer_profiling_level_; + } + /** * Return the current status of intermediate outputs logging mode. */ @@ -458,6 +485,8 @@ class EventTracer { int bundled_input_index_ = kUnsetBundledInputIndex; EventTracerDebugLogLevel event_tracer_debug_level_ = EventTracerDebugLogLevel::kNoLogging; + EventTracerProfilingLevel event_tracer_profiling_level_ = + EventTracerProfilingLevel::kProfileAllEvents; }; } // namespace runtime diff --git a/runtime/core/event_tracer_hooks.h b/runtime/core/event_tracer_hooks.h index 76fa17f62af..40754160c41 100644 --- a/runtime/core/event_tracer_hooks.h +++ b/runtime/core/event_tracer_hooks.h @@ -33,14 +33,58 @@ namespace executorch { namespace runtime { namespace internal { +/** + * This class enables scope based profiling where needed using RAII for + * operators only. If operator profiling is disabled then this class is a no-op. + */ +class EventTracerProfileOpScope final { + public: + EventTracerProfileOpScope(EventTracer* event_tracer, const char* name) { +#ifdef ET_EVENT_TRACER_ENABLED + event_tracer_ = event_tracer; + if (event_tracer_ == nullptr) { + return; + } + if (event_tracer_->event_tracer_profiling_level() > + executorch::runtime::EventTracerProfilingLevel::kProfileMethodOnly) { + event_entry_ = event_tracer->start_profiling(name); + } +#else //! ET_EVENT_TRACER_ENABLED + (void)event_tracer; + (void)name; +#endif + } + + ~EventTracerProfileOpScope() { +#ifdef ET_EVENT_TRACER_ENABLED + if (event_tracer_ == nullptr) { + return; + } + if (event_tracer_->event_tracer_profiling_level() > + executorch::runtime::EventTracerProfilingLevel::kProfileMethodOnly) { + event_tracer_->end_profiling(event_entry_); + } +#endif + } + + private: +#ifdef ET_EVENT_TRACER_ENABLED + EventTracer* event_tracer_; + EventTracerEntry event_entry_; +#endif +}; + +using EventTracerProfileScope = EventTracerProfileOpScope; + /** * This class enables scope based profiling where needed using RAII. * Profiling will be started when the object is created and will end - * when the object goes out of scope. + * when the object goes out of scope. This is specifically intended to + * be used for profiling methods in the runtime. */ -class EventTracerProfileScope final { +class EventTracerProfileMethodScope final { public: - EventTracerProfileScope(EventTracer* event_tracer, const char* name) { + EventTracerProfileMethodScope(EventTracer* event_tracer, const char* name) { #ifdef ET_EVENT_TRACER_ENABLED event_tracer_ = event_tracer; if (event_tracer_ == nullptr) { @@ -53,7 +97,7 @@ class EventTracerProfileScope final { #endif } - ~EventTracerProfileScope() { + ~EventTracerProfileMethodScope() { #ifdef ET_EVENT_TRACER_ENABLED if (event_tracer_ == nullptr) { return; @@ -111,6 +155,13 @@ class EventTracerProfileInstructionScope final { #endif }; +inline bool event_tracer_enabled() { +#ifdef ET_EVENT_TRACER_ENABLED + return true; +#else //! 
ET_EVENT_TRACER_ENABLED + return false; +#endif +} /** * Create a new event block with the specified name. Any events logged * after this will be associated with this new event block. @@ -271,7 +322,10 @@ using ::executorch::runtime::internal::event_tracer_set_bundled_input_index; using ::executorch::runtime::internal::event_tracer_track_allocation; using ::executorch::runtime::internal::event_tracer_track_allocator; using ::executorch::runtime::internal::EventTracerProfileInstructionScope; +using ::executorch::runtime::internal::EventTracerProfileMethodScope; +using ::executorch::runtime::internal::EventTracerProfileOpScope; using ::executorch::runtime::internal::EventTracerProfileScope; + } // namespace internal } // namespace executor } // namespace torch diff --git a/runtime/core/exec_aten/exec_aten.h b/runtime/core/exec_aten/exec_aten.h index 536128d633d..df6ef60c833 100644 --- a/runtime/core/exec_aten/exec_aten.h +++ b/runtime/core/exec_aten/exec_aten.h @@ -46,9 +46,10 @@ #endif -namespace exec_aten { +namespace executorch { +namespace aten { -using TensorShapeDynamism = torch::executor::TensorShapeDynamism; +using TensorShapeDynamism = executorch::runtime::TensorShapeDynamism; #ifdef USE_ATEN_LIB @@ -86,6 +87,7 @@ using IntArrayRef = at::IntArrayRef; template using OptionalArrayRef = c10::OptionalArrayRef; +using OptionalIntArrayRef = OptionalArrayRef; inline ssize_t compute_numel(const SizesType* sizes, ssize_t dim) { return static_cast( @@ -131,16 +133,21 @@ using IntArrayRef = torch::executor::IntArrayRef; template using OptionalArrayRef = torch::executor::optional>; +using OptionalIntArrayRef = OptionalArrayRef; using torch::executor::compute_numel; -#endif // Use executor types +#endif // Use ExecuTorch types -} // namespace exec_aten +} // namespace aten +} // namespace executorch + +// DEPRECATED: The exec_aten:: namespace is deprecated. Use executorch::aten:: +// instead. +namespace exec_aten = executorch::aten; namespace torch { namespace executor { using TensorList = exec_aten::TensorList; - } // namespace executor } // namespace torch diff --git a/runtime/core/exec_aten/testing_util/tensor_util.cpp b/runtime/core/exec_aten/testing_util/tensor_util.cpp index 0301cc9a519..f1c25eb9fe6 100644 --- a/runtime/core/exec_aten/testing_util/tensor_util.cpp +++ b/runtime/core/exec_aten/testing_util/tensor_util.cpp @@ -76,13 +76,19 @@ bool data_is_close( return true; } +double default_atol_for_type(ScalarType t) { + if (t == ScalarType::Half) { + return internal::kDefaultHalfAtol; + } + return internal::kDefaultAtol; +} } // namespace bool tensors_are_close( const Tensor& a, const Tensor& b, double rtol, - double atol) { + std::optional opt_atol) { if (a.scalar_type() != b.scalar_type() || a.sizes() != b.sizes()) { return false; } @@ -100,6 +106,8 @@ bool tensors_are_close( // So we can just compare the two underlying data sequentially to figure out // if the two tensors are same. + double atol = opt_atol.value_or(default_atol_for_type(a.scalar_type())); + if (a.nbytes() == 0) { // Note that this case is important. It's valid for a zero-size tensor to // have a null data pointer, but in some environments it's invalid to pass a @@ -149,11 +157,12 @@ bool tensor_data_is_close( const Tensor& a, const Tensor& b, double rtol, - double atol) { + std::optional opt_atol) { if (a.scalar_type() != b.scalar_type() || a.numel() != b.numel()) { return false; } + double atol = opt_atol.value_or(default_atol_for_type(a.scalar_type())); if (a.nbytes() == 0) { // Note that this case is important. 
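
The EventTracerProfileOpScope introduced above only emits a start/end profiling pair when the tracer's profiling level is above kProfileMethodOnly, so per-operator events can be switched off while method-level events stay on. A stand-alone sketch of that RAII gating pattern with trivial stand-in types (Tracer, ProfileOpScope and ProfilingLevel below are illustrative, not the real ExecuTorch classes):

#include <cstdio>

// Minimal stand-ins for the two pieces the hook combines: a profiling level
// (method-only vs. everything) and an RAII scope that only logs start/end
// events when operator-level profiling is enabled.
enum class ProfilingLevel { kProfileMethodOnly, kProfileAllEvents };

struct Tracer {
  ProfilingLevel level = ProfilingLevel::kProfileAllEvents;
  void start(const char* name) { std::printf("start %s\n", name); }
  void end(const char* name) { std::printf("end   %s\n", name); }
};

class ProfileOpScope {
 public:
  ProfileOpScope(Tracer* tracer, const char* name) : tracer_(tracer), name_(name) {
    if (tracer_ && tracer_->level > ProfilingLevel::kProfileMethodOnly) {
      tracer_->start(name_);
      active_ = true;
    }
  }
  ~ProfileOpScope() {
    if (active_) {
      tracer_->end(name_);
    }
  }

 private:
  Tracer* tracer_;
  const char* name_;
  bool active_ = false;
};

int main() {
  Tracer tracer;
  tracer.level = ProfilingLevel::kProfileMethodOnly;  // operator events suppressed
  { ProfileOpScope scope(&tracer, "aten::add.out"); } // logs nothing at this level
  tracer.level = ProfilingLevel::kProfileAllEvents;
  { ProfileOpScope scope(&tracer, "aten::add.out"); } // logs start/end
  return 0;
}

This also shows why the enum comment above insists the values stay in order of increasing verbosity: the gate is a plain > comparison on the enum.
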
It's valid for a zero-size tensor to // have a null data pointer, but in some environments it's invalid to pass a @@ -185,12 +194,12 @@ bool tensor_lists_are_close( const exec_aten::Tensor* tensors_b, size_t num_tensors_b, double rtol, - double atol) { + std::optional opt_atol) { if (num_tensors_a != num_tensors_b) { return false; } for (size_t i = 0; i < num_tensors_a; i++) { - if (!tensors_are_close(tensors_a[i], tensors_b[i], rtol, atol)) { + if (!tensors_are_close(tensors_a[i], tensors_b[i], rtol, opt_atol)) { return false; } } @@ -208,8 +217,9 @@ bool tensor_lists_are_close( * These functions must be declared in the original namespaces of their * associated types so that C++ can find them. */ -namespace torch { -namespace executor { +namespace executorch { +namespace runtime { +namespace etensor { /** * Prints the ScalarType to the stream as a human-readable string. @@ -298,7 +308,8 @@ std::ostream& operator<<(std::ostream& os, const Tensor& t) { return os; } -} // namespace executor -} // namespace torch +} // namespace etensor +} // namespace runtime +} // namespace executorch #endif // !USE_ATEN_LIB diff --git a/runtime/core/exec_aten/testing_util/tensor_util.h b/runtime/core/exec_aten/testing_util/tensor_util.h index 00f3c782c2f..3d1aca34787 100644 --- a/runtime/core/exec_aten/testing_util/tensor_util.h +++ b/runtime/core/exec_aten/testing_util/tensor_util.h @@ -11,6 +11,8 @@ #include #include // For MATCHER_P +#include + namespace executorch { namespace runtime { namespace testing { @@ -18,6 +20,14 @@ namespace testing { namespace internal { constexpr double kDefaultRtol = 1e-5; constexpr double kDefaultAtol = 1e-8; +// Per +// https://en.wikipedia.org/wiki/Half-precision_floating-point_format, +// float16 has about 3.3 digits of precision. +constexpr double kDefaultHalfAtol = 1e-3; + +// Following similar reasoning to float16, BFloat16 has +// math.log10(2**8) = 2.4 digits of precision. +constexpr double kDefaultBFloat16Atol = 1e-2; } // namespace internal /** @@ -61,7 +71,7 @@ bool tensors_are_close( const exec_aten::Tensor& a, const exec_aten::Tensor& b, double rtol = internal::kDefaultRtol, - double atol = internal::kDefaultAtol); + std::optional opt_atol = std::nullopt); /** * Returns true if the tensors are of the same numel and dtype, and if all @@ -92,7 +102,7 @@ bool tensor_data_is_close( const exec_aten::Tensor& a, const exec_aten::Tensor& b, double rtol = internal::kDefaultRtol, - double atol = internal::kDefaultAtol); + std::optional opt_atol = std::nullopt); /** * Returns true if the two lists are of the same length, and @@ -105,7 +115,7 @@ bool tensor_lists_are_close( const exec_aten::Tensor* tensors_b, size_t num_tensors_b, double rtol = internal::kDefaultRtol, - double atol = internal::kDefaultAtol); + std::optional opt_atol = std::nullopt); /** * Lets gtest users write `EXPECT_THAT(tensor1, IsCloseTo(tensor2))` or @@ -325,23 +335,25 @@ MATCHER_P(IsListEqualTo, other, "") { * These functions must be declared in the original namespaces of their * associated types so that C++ can find them. */ -namespace torch { -namespace executor { +namespace executorch { +namespace runtime { +namespace etensor { /** * Prints the ScalarType to the stream as a human-readable string. * * See also executorch::runtime::toString(ScalarType t) in ScalarTypeUtil.h. */ -std::ostream& operator<<(std::ostream& os, const exec_aten::ScalarType& t); +std::ostream& operator<<(std::ostream& os, const ScalarType& t); /** * Prints the Tensor to the stream as a human-readable string. 
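
With the tensor_util changes above, tensors_are_close and friends take std::optional<double> for atol and fall back to a dtype-aware default (kDefaultHalfAtol = 1e-3 for Half in the helper shown; the usual 1e-8 otherwise), so callers that omit atol automatically get a looser tolerance for reduced-precision tensors while an explicit atol still wins. A stand-alone sketch of that "optional with computed default" idiom; the names and the closeness formula here are illustrative only:

#include <cassert>
#include <cmath>
#include <optional>

enum class DType { Float, Half };

// Dtype-aware default, in the spirit of default_atol_for_type() above.
double default_atol_for(DType t) {
  return t == DType::Half ? 1e-3 : 1e-8;
}

bool values_are_close(double a, double b, DType dtype,
                      double rtol = 1e-5,
                      std::optional<double> opt_atol = std::nullopt) {
  // An explicit atol wins; otherwise pick one based on the dtype.
  double atol = opt_atol.value_or(default_atol_for(dtype));
  return std::fabs(a - b) <= atol + rtol * std::fabs(b);
}

int main() {
  // Half-precision values that differ by ~5e-4 pass with the defaulted atol...
  assert(values_are_close(1.0, 1.0005, DType::Half));
  // ...but a caller can still force the strict tolerance explicitly.
  assert(!values_are_close(1.0, 1.0005, DType::Half, 1e-5, 1e-8));
  return 0;
}
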
*/ -std::ostream& operator<<(std::ostream& os, const exec_aten::Tensor& t); +std::ostream& operator<<(std::ostream& os, const Tensor& t); -} // namespace executor -} // namespace torch +} // namespace etensor +} // namespace runtime +} // namespace executorch #endif // !USE_ATEN_LIB diff --git a/runtime/core/exec_aten/util/scalar_type_util.h b/runtime/core/exec_aten/util/scalar_type_util.h index 57c7c0dc6d3..e25c5e36920 100644 --- a/runtime/core/exec_aten/util/scalar_type_util.h +++ b/runtime/core/exec_aten/util/scalar_type_util.h @@ -37,17 +37,24 @@ // here. #define ET_FORALL_SCALAR_TYPES AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_AND_QINTS #include -namespace exec_aten { +namespace executorch { +namespace aten { using ScalarType = at::ScalarType; -} +} // namespace aten +} // namespace executorch #else // !USE_ATEN_LIB #include #include -namespace exec_aten { +namespace executorch { +namespace aten { using ScalarType = torch::executor::ScalarType; using string_view = torch::executor::string_view; -} // namespace exec_aten +} // namespace aten +} // namespace executorch #endif // USE_ATEN_LIB +// DEPRECATED: The exec_aten:: namespace is deprecated. Use executorch::aten:: +// instead. +namespace exec_aten = ::executorch::aten; namespace executorch { namespace runtime { @@ -80,13 +87,13 @@ constexpr bool is_reduced_floating_point_v = #endif /// Maps ScalarTypes to C++ types. -template +template <::executorch::aten::ScalarType N> struct ScalarTypeToCppType; -#define SPECIALIZE_ScalarTypeToCppType(cpp_type, scalar_type) \ - template <> \ - struct ScalarTypeToCppType { \ - using type = cpp_type; \ +#define SPECIALIZE_ScalarTypeToCppType(cpp_type, scalar_type) \ + template <> \ + struct ScalarTypeToCppType<::executorch::aten::ScalarType::scalar_type> { \ + using type = cpp_type; \ }; ET_FORALL_SCALAR_TYPES(SPECIALIZE_ScalarTypeToCppType) @@ -101,8 +108,8 @@ struct CppTypeToScalarType; template <> \ struct CppTypeToScalarType \ : std::integral_constant< \ - exec_aten::ScalarType, \ - exec_aten::ScalarType::scalar_type> {}; + ::executorch::aten::ScalarType, \ + ::executorch::aten::ScalarType::scalar_type> {}; ET_FORALL_SCALAR_TYPES(SPECIALIZE_CppTypeToScalarType) @@ -147,14 +154,14 @@ ET_FORALL_SCALAR_TYPES(SPECIALIZE_CppTypeToScalarType) _(ANOTHER_INPUT1, ANOTHER_INPUT2, int32_t, Int) \ _(ANOTHER_INPUT1, ANOTHER_INPUT2, int64_t, Long) -#define ET_FORALL_INT_TYPES_AND(SCALARTYPE, _) \ - _(uint8_t, Byte) \ - _(int8_t, Char) \ - _(int16_t, Short) \ - _(int32_t, Int) \ - _(int64_t, Long) \ - _(::executorch::runtime::ScalarTypeToCppType< \ - ::exec_aten::ScalarType::SCALARTYPE>::type, \ +#define ET_FORALL_INT_TYPES_AND(SCALARTYPE, _) \ + _(uint8_t, Byte) \ + _(int8_t, Char) \ + _(int16_t, Short) \ + _(int32_t, Int) \ + _(int64_t, Long) \ + _(::executorch::runtime::ScalarTypeToCppType< \ + ::executorch::aten::ScalarType::SCALARTYPE>::type, \ SCALARTYPE) // In this context, "FLOAT" means float C types, which is why BFloat16 is not @@ -163,21 +170,21 @@ ET_FORALL_SCALAR_TYPES(SPECIALIZE_CppTypeToScalarType) _(float, Float) \ _(double, Double) -#define ET_FORALL_FLOAT_TYPES_AND(SCALARTYPE, _) \ - _(float, Float) \ - _(double, Double) \ - _(::executorch::runtime::ScalarTypeToCppType< \ - ::exec_aten::ScalarType::SCALARTYPE>::type, \ +#define ET_FORALL_FLOAT_TYPES_AND(SCALARTYPE, _) \ + _(float, Float) \ + _(double, Double) \ + _(::executorch::runtime::ScalarTypeToCppType< \ + ::executorch::aten::ScalarType::SCALARTYPE>::type, \ SCALARTYPE) #define ET_FORALL_FLOAT_TYPES_AND2(SCALARTYPE1, SCALARTYPE2, _) \ 
_(float, Float) \ _(double, Double) \ _(::executorch::runtime::ScalarTypeToCppType< \ - ::exec_aten::ScalarType::SCALARTYPE1>::type, \ + ::executorch::aten::ScalarType::SCALARTYPE1>::type, \ SCALARTYPE1) \ _(::executorch::runtime::ScalarTypeToCppType< \ - ::exec_aten::ScalarType::SCALARTYPE2>::type, \ + ::executorch::aten::ScalarType::SCALARTYPE2>::type, \ SCALARTYPE2) #define ET_FORALL_FLOATH_TYPES(_) ET_FORALL_FLOAT_TYPES_AND(Half, _) @@ -199,8 +206,8 @@ ET_FORALL_SCALAR_TYPES(SPECIALIZE_CppTypeToScalarType) #define ET_FORALL_FLOATHBF16_TYPES_WITH2(ANOTHER_INPUT1, ANOTHER_INPUT2, _) \ _(ANOTHER_INPUT1, ANOTHER_INPUT2, float, Float) \ _(ANOTHER_INPUT1, ANOTHER_INPUT2, double, Double) \ - _(ANOTHER_INPUT1, ANOTHER_INPUT2, ::exec_aten::Half, Half) \ - _(ANOTHER_INPUT1, ANOTHER_INPUT2, ::exec_aten::BFloat16, BFloat16) + _(ANOTHER_INPUT1, ANOTHER_INPUT2, ::executorch::aten::Half, Half) \ + _(ANOTHER_INPUT1, ANOTHER_INPUT2, ::executorch::aten::BFloat16, BFloat16) // In this context, "REAL" means integer/float C types, which is why BFloat16 // and Half are not included. @@ -242,21 +249,21 @@ ET_FORALL_SCALAR_TYPES(SPECIALIZE_CppTypeToScalarType) _(ANOTHER_INPUT1, ANOTHER_INPUT2, int64_t, Long) \ _(ANOTHER_INPUT1, ANOTHER_INPUT2, float, Float) \ _(ANOTHER_INPUT1, ANOTHER_INPUT2, double, Double) \ - _(ANOTHER_INPUT1, ANOTHER_INPUT2, exec_aten::Half, Half) \ - _(ANOTHER_INPUT1, ANOTHER_INPUT2, exec_aten::BFloat16, BFloat16) + _(ANOTHER_INPUT1, ANOTHER_INPUT2, ::executorch::aten::Half, Half) \ + _(ANOTHER_INPUT1, ANOTHER_INPUT2, ::executorch::aten::BFloat16, BFloat16) // For macros that take `SCALARTYPEn` parameters, those parameters should be // an unquoted/unqualified enumerator name like `Int` or `Float`. -#define ET_FORALL_REAL_TYPES_AND(SCALARTYPE, _) \ - _(uint8_t, Byte) \ - _(int8_t, Char) \ - _(int16_t, Short) \ - _(int32_t, Int) \ - _(int64_t, Long) \ - _(float, Float) \ - _(double, Double) \ - _(::executorch::runtime::ScalarTypeToCppType< \ - ::exec_aten::ScalarType::SCALARTYPE>::type, \ +#define ET_FORALL_REAL_TYPES_AND(SCALARTYPE, _) \ + _(uint8_t, Byte) \ + _(int8_t, Char) \ + _(int16_t, Short) \ + _(int32_t, Int) \ + _(int64_t, Long) \ + _(float, Float) \ + _(double, Double) \ + _(::executorch::runtime::ScalarTypeToCppType< \ + ::executorch::aten::ScalarType::SCALARTYPE>::type, \ SCALARTYPE) #define ET_FORALL_REAL_TYPES_AND2(SCALARTYPE1, SCALARTYPE2, _) \ @@ -268,10 +275,10 @@ ET_FORALL_SCALAR_TYPES(SPECIALIZE_CppTypeToScalarType) _(float, Float) \ _(double, Double) \ _(::executorch::runtime::ScalarTypeToCppType< \ - ::exec_aten::ScalarType::SCALARTYPE1>::type, \ + ::executorch::aten::ScalarType::SCALARTYPE1>::type, \ SCALARTYPE1) \ _(::executorch::runtime::ScalarTypeToCppType< \ - ::exec_aten::ScalarType::SCALARTYPE2>::type, \ + ::executorch::aten::ScalarType::SCALARTYPE2>::type, \ SCALARTYPE2) #define ET_FORALL_REALH_TYPES(_) ET_FORALL_REAL_TYPES_AND(Half, _) @@ -292,7 +299,7 @@ ET_FORALL_SCALAR_TYPES(SPECIALIZE_CppTypeToScalarType) _(ANOTHER_INPUT, double, Double) \ _(ANOTHER_INPUT, \ ::executorch::runtime::ScalarTypeToCppType< \ - ::exec_aten::ScalarType::SCALARTYPE>::type, \ + ::executorch::aten::ScalarType::SCALARTYPE>::type, \ SCALARTYPE) #define ET_FORALL_REAL_TYPES_AND2(SCALARTYPE1, SCALARTYPE2, _) \ @@ -304,10 +311,10 @@ ET_FORALL_SCALAR_TYPES(SPECIALIZE_CppTypeToScalarType) _(float, Float) \ _(double, Double) \ _(::executorch::runtime::ScalarTypeToCppType< \ - ::exec_aten::ScalarType::SCALARTYPE1>::type, \ + ::executorch::aten::ScalarType::SCALARTYPE1>::type, \ SCALARTYPE1) 
\ _(::executorch::runtime::ScalarTypeToCppType< \ - ::exec_aten::ScalarType::SCALARTYPE2>::type, \ + ::executorch::aten::ScalarType::SCALARTYPE2>::type, \ SCALARTYPE2) #define ET_FORALL_REAL_TYPES_AND3(SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, _) \ @@ -319,13 +326,13 @@ ET_FORALL_SCALAR_TYPES(SPECIALIZE_CppTypeToScalarType) _(float, Float) \ _(double, Double) \ _(::executorch::runtime::ScalarTypeToCppType< \ - ::exec_aten::ScalarType::SCALARTYPE1>::type, \ + ::executorch::aten::ScalarType::SCALARTYPE1>::type, \ SCALARTYPE1) \ _(::executorch::runtime::ScalarTypeToCppType< \ - ::exec_aten::ScalarType::SCALARTYPE2>::type, \ + ::executorch::aten::ScalarType::SCALARTYPE2>::type, \ SCALARTYPE2) \ _(::executorch::runtime::ScalarTypeToCppType< \ - ::exec_aten::ScalarType::SCALARTYPE3>::type, \ + ::executorch::aten::ScalarType::SCALARTYPE3>::type, \ SCALARTYPE3) #define ET_FORALL_QINT_TYPES(_) \ @@ -349,10 +356,10 @@ ET_FORALL_SCALAR_TYPES(SPECIALIZE_CppTypeToScalarType) * Returns true if the parameter is one of the values covered by * ET_FORALL_SCALAR_TYPES. */ -inline bool isValid(exec_aten::ScalarType type) { +inline bool isValid(::executorch::aten::ScalarType type) { return static_cast(type) >= 0 && - type < exec_aten::ScalarType::NumOptions && - type != exec_aten::ScalarType::Undefined; + type < ::executorch::aten::ScalarType::NumOptions && + type != ::executorch::aten::ScalarType::Undefined; } /** @@ -361,14 +368,14 @@ inline bool isValid(exec_aten::ScalarType type) { * @param[in] t The type to get the name of. * @return The name of the type, or "UNKNOWN_SCALAR" if the type is not known. */ -inline const char* toString(exec_aten::ScalarType t) { -#define DEFINE_CASE(_, name) \ - case exec_aten::ScalarType::name: \ +inline const char* toString(::executorch::aten::ScalarType t) { +#define DEFINE_CASE(_, name) \ + case ::executorch::aten::ScalarType::name: \ return #name; switch (t) { ET_FORALL_SCALAR_TYPES(DEFINE_CASE) - case exec_aten::ScalarType::Undefined: + case ::executorch::aten::ScalarType::Undefined: return "Undefined"; default: return "UNKNOWN_SCALAR"; @@ -384,9 +391,9 @@ inline const char* toString(exec_aten::ScalarType t) { * @param[in] t The type to get the underlying C type size of. * @return The size of the associated C type in bytes. 
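The specializations above give a two-way compile-time mapping between ScalarType enumerators and C++ types, now spelled with the fully qualified enum. A small usage sketch (illustrative only, not part of the patch):

// Sketch: round-tripping through the trait pair and the toString() helper.
#include <cstdint>
#include <type_traits>

using executorch::aten::ScalarType;
namespace rt = executorch::runtime;

static_assert(
    std::is_same<rt::ScalarTypeToCppType<ScalarType::Int>::type, int32_t>::value,
    "ScalarType::Int maps to int32_t");
static_assert(
    rt::CppTypeToScalarType<float>::value == ScalarType::Float,
    "float maps back to ScalarType::Float");

inline const char* int_name() {
  return rt::toString(ScalarType::Int); // "Int"
}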
*/ -inline size_t elementSize(exec_aten::ScalarType t) { -#define CASE_ELEMENTSIZE_CASE(ctype, name) \ - case exec_aten::ScalarType::name: \ +inline size_t elementSize(::executorch::aten::ScalarType t) { +#define CASE_ELEMENTSIZE_CASE(ctype, name) \ + case ::executorch::aten::ScalarType::name: \ return sizeof(ctype); switch (t) { @@ -398,12 +405,14 @@ inline size_t elementSize(exec_aten::ScalarType t) { } inline constexpr bool isIntegralType( - exec_aten::ScalarType t, + ::executorch::aten::ScalarType t, bool includeBool) { - return (includeBool && t == exec_aten::ScalarType::Bool) || - (t == exec_aten::ScalarType::Byte || t == exec_aten::ScalarType::Char || - t == exec_aten::ScalarType::Int || t == exec_aten::ScalarType::Long || - t == exec_aten::ScalarType::Short); + return (includeBool && t == ::executorch::aten::ScalarType::Bool) || + (t == ::executorch::aten::ScalarType::Byte || + t == ::executorch::aten::ScalarType::Char || + t == ::executorch::aten::ScalarType::Int || + t == ::executorch::aten::ScalarType::Long || + t == ::executorch::aten::ScalarType::Short); } template @@ -412,41 +421,50 @@ struct is_integral_type bool, isIntegralType(CppTypeToScalarType::value, includeBool)> {}; -inline constexpr bool isFloatingType(exec_aten::ScalarType t) { +inline constexpr bool isFloatingType(::executorch::aten::ScalarType t) { return ( - t == exec_aten::ScalarType::Double || t == exec_aten::ScalarType::Float || - t == exec_aten::ScalarType::Half || t == exec_aten::ScalarType::BFloat16); + t == ::executorch::aten::ScalarType::Double || + t == ::executorch::aten::ScalarType::Float || + t == ::executorch::aten::ScalarType::Half || + t == ::executorch::aten::ScalarType::BFloat16); } -inline bool isRealType(exec_aten::ScalarType t) { +inline bool isRealType(::executorch::aten::ScalarType t) { return ( - t == exec_aten::ScalarType::Byte || t == exec_aten::ScalarType::Char || - t == exec_aten::ScalarType::Short || t == exec_aten::ScalarType::Int || - t == exec_aten::ScalarType::Long || t == exec_aten::ScalarType::Float || - t == exec_aten::ScalarType::Double); + t == ::executorch::aten::ScalarType::Byte || + t == ::executorch::aten::ScalarType::Char || + t == ::executorch::aten::ScalarType::Short || + t == ::executorch::aten::ScalarType::Int || + t == ::executorch::aten::ScalarType::Long || + t == ::executorch::aten::ScalarType::Float || + t == ::executorch::aten::ScalarType::Double); } -inline bool isRealHType(exec_aten::ScalarType t) { +inline bool isRealHType(::executorch::aten::ScalarType t) { return ( - t == exec_aten::ScalarType::Byte || t == exec_aten::ScalarType::Char || - t == exec_aten::ScalarType::Short || t == exec_aten::ScalarType::Int || - t == exec_aten::ScalarType::Long || t == exec_aten::ScalarType::Float || - t == exec_aten::ScalarType::Double || t == exec_aten::ScalarType::Half); + t == ::executorch::aten::ScalarType::Byte || + t == ::executorch::aten::ScalarType::Char || + t == ::executorch::aten::ScalarType::Short || + t == ::executorch::aten::ScalarType::Int || + t == ::executorch::aten::ScalarType::Long || + t == ::executorch::aten::ScalarType::Float || + t == ::executorch::aten::ScalarType::Double || + t == ::executorch::aten::ScalarType::Half); } -inline bool isRealHBType(exec_aten::ScalarType t) { - return (isRealHType(t) || t == exec_aten::ScalarType::Bool); +inline bool isRealHBType(::executorch::aten::ScalarType t) { + return (isRealHType(t) || t == ::executorch::aten::ScalarType::Bool); } -inline bool isRealHBBF16Type(exec_aten::ScalarType t) { - return 
(isRealHBType(t) || t == exec_aten::ScalarType::BFloat16); +inline bool isRealHBBF16Type(::executorch::aten::ScalarType t) { + return (isRealHBType(t) || t == ::executorch::aten::ScalarType::BFloat16); } -inline constexpr bool isComplexType(exec_aten::ScalarType t) { +inline constexpr bool isComplexType(::executorch::aten::ScalarType t) { return ( - t == exec_aten::ScalarType::ComplexHalf || - t == exec_aten::ScalarType::ComplexFloat || - t == exec_aten::ScalarType::ComplexDouble); + t == ::executorch::aten::ScalarType::ComplexHalf || + t == ::executorch::aten::ScalarType::ComplexFloat || + t == ::executorch::aten::ScalarType::ComplexDouble); } template @@ -454,11 +472,12 @@ struct is_complex_type : std::integral_constant< bool, isComplexType(CppTypeToScalarType::value)> {}; -constexpr bool isBitsType(exec_aten::ScalarType t) { - return t == exec_aten::ScalarType::Bits1x8 || - t == exec_aten::ScalarType::Bits2x4 || - t == exec_aten::ScalarType::Bits4x2 || - t == exec_aten::ScalarType::Bits8 || t == exec_aten::ScalarType::Bits16; +constexpr bool isBitsType(::executorch::aten::ScalarType t) { + return t == ::executorch::aten::ScalarType::Bits1x8 || + t == ::executorch::aten::ScalarType::Bits2x4 || + t == ::executorch::aten::ScalarType::Bits4x2 || + t == ::executorch::aten::ScalarType::Bits8 || + t == ::executorch::aten::ScalarType::Bits16; } template @@ -466,13 +485,13 @@ struct is_bits_type : std::integral_constant::value)> { }; -constexpr bool isQIntType(exec_aten::ScalarType t) { +constexpr bool isQIntType(::executorch::aten::ScalarType t) { // Don't forget to extend this when adding new QInt types - return t == exec_aten::ScalarType::QInt8 || - t == exec_aten::ScalarType::QUInt8 || - t == exec_aten::ScalarType::QInt32 || - t == exec_aten::ScalarType::QUInt4x2 || - t == exec_aten::ScalarType::QUInt2x4; + return t == ::executorch::aten::ScalarType::QInt8 || + t == ::executorch::aten::ScalarType::QUInt8 || + t == ::executorch::aten::ScalarType::QInt32 || + t == ::executorch::aten::ScalarType::QUInt4x2 || + t == ::executorch::aten::ScalarType::QUInt2x4; } template @@ -480,49 +499,51 @@ struct is_qint_type : std::integral_constant::value)> { }; -inline exec_aten::ScalarType toQIntType(exec_aten::ScalarType t) { +inline ::executorch::aten::ScalarType toQIntType( + ::executorch::aten::ScalarType t) { switch (t) { - case exec_aten::ScalarType::Byte: - return exec_aten::ScalarType::QUInt8; - case exec_aten::ScalarType::Char: - return exec_aten::ScalarType::QInt8; - case exec_aten::ScalarType::Int: - return exec_aten::ScalarType::QInt32; + case ::executorch::aten::ScalarType::Byte: + return ::executorch::aten::ScalarType::QUInt8; + case ::executorch::aten::ScalarType::Char: + return ::executorch::aten::ScalarType::QInt8; + case ::executorch::aten::ScalarType::Int: + return ::executorch::aten::ScalarType::QInt32; default: return t; } } -inline exec_aten::ScalarType toUnderlying(exec_aten::ScalarType t) { +inline ::executorch::aten::ScalarType toUnderlying( + ::executorch::aten::ScalarType t) { switch (t) { - case exec_aten::ScalarType::QUInt8: - return exec_aten::ScalarType::Byte; - case exec_aten::ScalarType::QInt8: - return exec_aten::ScalarType::Char; - case exec_aten::ScalarType::QInt32: - return exec_aten::ScalarType::Int; - case exec_aten::ScalarType::QUInt4x2: - return exec_aten::ScalarType::Byte; - case exec_aten::ScalarType::QUInt2x4: - return exec_aten::ScalarType::Byte; + case ::executorch::aten::ScalarType::QUInt8: + return ::executorch::aten::ScalarType::Byte; + case 
::executorch::aten::ScalarType::QInt8: + return ::executorch::aten::ScalarType::Char; + case ::executorch::aten::ScalarType::QInt32: + return ::executorch::aten::ScalarType::Int; + case ::executorch::aten::ScalarType::QUInt4x2: + return ::executorch::aten::ScalarType::Byte; + case ::executorch::aten::ScalarType::QUInt2x4: + return ::executorch::aten::ScalarType::Byte; default: return t; } } -inline bool isSignedType(exec_aten::ScalarType t) { +inline bool isSignedType(::executorch::aten::ScalarType t) { ET_CHECK_MSG( - !executorch::runtime::isQIntType(t), + !::executorch::runtime::isQIntType(t), "isSignedType not supported for quantized types like %" PRId8, static_cast(t)); -#define CASE_SIGNED(ctype, name) \ - case exec_aten::ScalarType::name: \ +#define CASE_SIGNED(ctype, name) \ + case ::executorch::aten::ScalarType::name: \ return std::numeric_limits::is_signed; switch (t) { - case exec_aten::ScalarType::ComplexHalf: - case exec_aten::ScalarType::ComplexFloat: - case exec_aten::ScalarType::ComplexDouble: + case ::executorch::aten::ScalarType::ComplexHalf: + case ::executorch::aten::ScalarType::ComplexFloat: + case ::executorch::aten::ScalarType::ComplexDouble: return true; ET_FORALL_REAL_TYPES_AND3(Half, Bool, BFloat16, CASE_SIGNED) default: @@ -532,42 +553,44 @@ inline bool isSignedType(exec_aten::ScalarType t) { } inline bool isUnderlying( - exec_aten::ScalarType type, - exec_aten::ScalarType qtype) { - return type == executorch::runtime::toUnderlying(qtype); + ::executorch::aten::ScalarType type, + ::executorch::aten::ScalarType qtype) { + return type == ::executorch::runtime::toUnderlying(qtype); } -inline exec_aten::ScalarType toRealValueType(exec_aten::ScalarType t) { +inline ::executorch::aten::ScalarType toRealValueType( + ::executorch::aten::ScalarType t) { switch (t) { - case exec_aten::ScalarType::ComplexHalf: - return exec_aten::ScalarType::Half; - case exec_aten::ScalarType::ComplexFloat: - return exec_aten::ScalarType::Float; - case exec_aten::ScalarType::ComplexDouble: - return exec_aten::ScalarType::Double; + case ::executorch::aten::ScalarType::ComplexHalf: + return ::executorch::aten::ScalarType::Half; + case ::executorch::aten::ScalarType::ComplexFloat: + return ::executorch::aten::ScalarType::Float; + case ::executorch::aten::ScalarType::ComplexDouble: + return ::executorch::aten::ScalarType::Double; default: return t; } } -inline exec_aten::ScalarType toComplexType(exec_aten::ScalarType t) { +inline ::executorch::aten::ScalarType toComplexType( + ::executorch::aten::ScalarType t) { switch (t) { - case exec_aten::ScalarType::BFloat16: + case ::executorch::aten::ScalarType::BFloat16: // BFloat16 has range equivalent to Float, // so we map it to ComplexFloat. 
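toQIntType(), toUnderlying() and isUnderlying() above are only re-qualified; the mapping they encode is unchanged. A quick sketch of that round trip (illustrative only):

// Sketch: quantized dtypes and the storage dtypes they unpack to.
#include <cassert>

using executorch::aten::ScalarType;
namespace rt = executorch::runtime;

inline void check_qint_mapping() {
  assert(rt::toQIntType(ScalarType::Char) == ScalarType::QInt8);
  assert(rt::toUnderlying(ScalarType::QInt8) == ScalarType::Char);
  // isUnderlying() is just the composition of the two checks above.
  assert(rt::isUnderlying(ScalarType::Char, ScalarType::QInt8));
  // The sub-byte quantized dtypes all unpack to Byte, so this is not 1:1.
  assert(rt::toUnderlying(ScalarType::QUInt4x2) == ScalarType::Byte);
}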
- return exec_aten::ScalarType::ComplexFloat; - case exec_aten::ScalarType::Half: - return exec_aten::ScalarType::ComplexHalf; - case exec_aten::ScalarType::Float: - return exec_aten::ScalarType::ComplexFloat; - case exec_aten::ScalarType::Double: - return exec_aten::ScalarType::ComplexDouble; - case exec_aten::ScalarType::ComplexHalf: - return exec_aten::ScalarType::ComplexHalf; - case exec_aten::ScalarType::ComplexFloat: - return exec_aten::ScalarType::ComplexFloat; - case exec_aten::ScalarType::ComplexDouble: - return exec_aten::ScalarType::ComplexDouble; + return ::executorch::aten::ScalarType::ComplexFloat; + case ::executorch::aten::ScalarType::Half: + return ::executorch::aten::ScalarType::ComplexHalf; + case ::executorch::aten::ScalarType::Float: + return ::executorch::aten::ScalarType::ComplexFloat; + case ::executorch::aten::ScalarType::Double: + return ::executorch::aten::ScalarType::ComplexDouble; + case ::executorch::aten::ScalarType::ComplexHalf: + return ::executorch::aten::ScalarType::ComplexHalf; + case ::executorch::aten::ScalarType::ComplexFloat: + return ::executorch::aten::ScalarType::ComplexFloat; + case ::executorch::aten::ScalarType::ComplexDouble: + return ::executorch::aten::ScalarType::ComplexDouble; default: ET_CHECK_MSG( false, @@ -580,17 +603,17 @@ inline exec_aten::ScalarType toComplexType(exec_aten::ScalarType t) { * Encodes type casting rules that are consistent with ATen behaviour. */ inline constexpr bool canCast( - const exec_aten::ScalarType from, - const exec_aten::ScalarType to) { + const ::executorch::aten::ScalarType from, + const ::executorch::aten::ScalarType to) { // Disallow complex -> non-complex - return !(executorch::runtime::isComplexType(from) && - !executorch::runtime::isComplexType(to)) && + return !(::executorch::runtime::isComplexType(from) && + !::executorch::runtime::isComplexType(to)) && // Disallow float -> integral - !(executorch::runtime::isFloatingType(from) && - executorch::runtime::isIntegralType(to, /*includeBool=*/false)) && + !(::executorch::runtime::isFloatingType(from) && + ::executorch::runtime::isIntegralType(to, /*includeBool=*/false)) && // Treat bool as a special category. 
Disallow non-bool -> bool - !(from != exec_aten::ScalarType::Bool && - to == exec_aten::ScalarType::Bool); + !(from != ::executorch::aten::ScalarType::Bool && + to == ::executorch::aten::ScalarType::Bool); } template @@ -635,22 +658,32 @@ struct promote_types_lookup { using type = T1; }; -using U1 = typename ScalarTypeToCppType::type; -using I1 = typename ScalarTypeToCppType::type; -using I2 = typename ScalarTypeToCppType::type; -using I4 = typename ScalarTypeToCppType::type; -using I8 = typename ScalarTypeToCppType::type; -using F2 = typename ScalarTypeToCppType::type; -using F4 = typename ScalarTypeToCppType::type; -using F8 = typename ScalarTypeToCppType::type; -using C2 = - typename ScalarTypeToCppType::type; -using C4 = - typename ScalarTypeToCppType::type; -using C8 = - typename ScalarTypeToCppType::type; -using B1 = typename ScalarTypeToCppType::type; -using BF = typename ScalarTypeToCppType::type; +using U1 = + typename ScalarTypeToCppType<::executorch::aten::ScalarType::Byte>::type; +using I1 = + typename ScalarTypeToCppType<::executorch::aten::ScalarType::Char>::type; +using I2 = + typename ScalarTypeToCppType<::executorch::aten::ScalarType::Short>::type; +using I4 = + typename ScalarTypeToCppType<::executorch::aten::ScalarType::Int>::type; +using I8 = + typename ScalarTypeToCppType<::executorch::aten::ScalarType::Long>::type; +using F2 = + typename ScalarTypeToCppType<::executorch::aten::ScalarType::Half>::type; +using F4 = + typename ScalarTypeToCppType<::executorch::aten::ScalarType::Float>::type; +using F8 = + typename ScalarTypeToCppType<::executorch::aten::ScalarType::Double>::type; +using C2 = typename ScalarTypeToCppType< + ::executorch::aten::ScalarType::ComplexHalf>::type; +using C4 = typename ScalarTypeToCppType< + ::executorch::aten::ScalarType::ComplexFloat>::type; +using C8 = typename ScalarTypeToCppType< + ::executorch::aten::ScalarType::ComplexDouble>::type; +using B1 = + typename ScalarTypeToCppType<::executorch::aten::ScalarType::Bool>::type; +using BF = typename ScalarTypeToCppType< + ::executorch::aten::ScalarType::BFloat16>::type; #define TABLE_ENTRY(key1, key2, value) \ template <> \ @@ -856,12 +889,12 @@ struct promote_types { (std::is_same< promoted_type_not_respecting_half_to_float, typename ScalarTypeToCppType< - exec_aten::ScalarType::Half>::type>::value || + ::executorch::aten::ScalarType::Half>::type>::value || std::is_same< promoted_type_not_respecting_half_to_float, typename ScalarTypeToCppType< - exec_aten::ScalarType::BFloat16>::type>::value), - typename ScalarTypeToCppType::type, + ::executorch::aten::ScalarType::BFloat16>::type>::value), + typename ScalarTypeToCppType<::executorch::aten::ScalarType::Float>::type, promoted_type_not_respecting_half_to_float>::type; }; @@ -871,56 +904,57 @@ struct promote_types { * If half_to_float is set to true, then half and bfloat16 will be promoted to * float instead */ -inline exec_aten::ScalarType promoteTypes( - exec_aten::ScalarType a, - exec_aten::ScalarType b, +inline ::executorch::aten::ScalarType promoteTypes( + ::executorch::aten::ScalarType a, + ::executorch::aten::ScalarType b, bool half_to_float = false) { // This is generated according to NumPy's promote_types - constexpr auto u1 = exec_aten::ScalarType::Byte; - constexpr auto i1 = exec_aten::ScalarType::Char; - constexpr auto i2 = exec_aten::ScalarType::Short; - constexpr auto i4 = exec_aten::ScalarType::Int; - constexpr auto i8 = exec_aten::ScalarType::Long; - constexpr auto f2 = exec_aten::ScalarType::Half; - constexpr auto f4 = 
exec_aten::ScalarType::Float; - constexpr auto f8 = exec_aten::ScalarType::Double; - constexpr auto c2 = exec_aten::ScalarType::ComplexHalf; - constexpr auto c4 = exec_aten::ScalarType::ComplexFloat; - constexpr auto c8 = exec_aten::ScalarType::ComplexDouble; - constexpr auto b1 = exec_aten::ScalarType::Bool; - constexpr auto bf = exec_aten::ScalarType::BFloat16; + constexpr auto u1 = ::executorch::aten::ScalarType::Byte; + constexpr auto i1 = ::executorch::aten::ScalarType::Char; + constexpr auto i2 = ::executorch::aten::ScalarType::Short; + constexpr auto i4 = ::executorch::aten::ScalarType::Int; + constexpr auto i8 = ::executorch::aten::ScalarType::Long; + constexpr auto f2 = ::executorch::aten::ScalarType::Half; + constexpr auto f4 = ::executorch::aten::ScalarType::Float; + constexpr auto f8 = ::executorch::aten::ScalarType::Double; + constexpr auto c2 = ::executorch::aten::ScalarType::ComplexHalf; + constexpr auto c4 = ::executorch::aten::ScalarType::ComplexFloat; + constexpr auto c8 = ::executorch::aten::ScalarType::ComplexDouble; + constexpr auto b1 = ::executorch::aten::ScalarType::Bool; + constexpr auto bf = ::executorch::aten::ScalarType::BFloat16; // For QInt types, only allow exact match - if (executorch::runtime::isQIntType(a) && a == b) { + if (::executorch::runtime::isQIntType(a) && a == b) { return a; } - if (executorch::runtime::isQIntType(a) || - executorch::runtime::isQIntType(b)) { + if (::executorch::runtime::isQIntType(a) || + ::executorch::runtime::isQIntType(b)) { ET_CHECK_MSG(false, "promoteTypes not valid for quantized dtypes"); } // For Bits types, only allow exact match - if (executorch::runtime::isBitsType(a) && a == b) { + if (::executorch::runtime::isBitsType(a) && a == b) { return a; } - if (executorch::runtime::isBitsType(a) || - executorch::runtime::isBitsType(b)) { + if (::executorch::runtime::isBitsType(a) || + ::executorch::runtime::isBitsType(b)) { ET_CHECK_MSG(false, "promoteTypes not valid for bits dtypes"); } // 12 types are handled by this function, see the constexpr definitions above const int NUM_PROMOTE_TYPES = 13; - static constexpr std::array - dtype2index = {{ - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, - -1, -1, -1, 12, -1, -1, -1, -1, -1, -1, -1, -1, - }}; + static constexpr std:: + array + dtype2index = {{ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, + -1, -1, -1, 12, -1, -1, -1, -1, -1, -1, -1, -1, + }}; auto ix_a = dtype2index[(int)a]; ET_CHECK(ix_a != -1); auto ix_b = dtype2index[(int)b]; ET_CHECK(ix_b != -1); - static constexpr exec_aten::ScalarType + static constexpr ::executorch::aten::ScalarType _promoteTypesLookup[NUM_PROMOTE_TYPES][NUM_PROMOTE_TYPES] = { /* u1 i1 i2 i4 i8 f2 f4 f8 c2 c4 c8 b1 bf*/ /* u1 */ {u1, i2, i2, i4, i8, f2, f4, f8, c2, c4, c8, u1, bf}, @@ -938,12 +972,13 @@ inline exec_aten::ScalarType promoteTypes( /* bf */ {bf, bf, bf, bf, bf, f4, f4, f8, c4, c4, c8, bf, bf}, }; - exec_aten::ScalarType promoted_type = _promoteTypesLookup[ix_a][ix_b]; + ::executorch::aten::ScalarType promoted_type = + _promoteTypesLookup[ix_a][ix_b]; if (half_to_float && - (promoted_type == exec_aten::ScalarType::Half || - promoted_type == exec_aten::ScalarType::BFloat16)) { - promoted_type = exec_aten::ScalarType::Float; + (promoted_type == ::executorch::aten::ScalarType::Half || + promoted_type == ::executorch::aten::ScalarType::BFloat16)) { + promoted_type = ::executorch::aten::ScalarType::Float; } return promoted_type; @@ -978,6 +1013,7 @@ inline exec_aten::ScalarType promoteTypes( [&] { \ const auto& _st = TYPE; \ constexpr const char* 
et_switch_name = NAME; \ + (void)et_switch_name; /* Suppress unused var */ \ switch (_st) { \ __VA_ARGS__ \ default: \ @@ -989,166 +1025,168 @@ inline exec_aten::ScalarType promoteTypes( } \ }() -#define ET_INTERNAL_SWITCH_CASE_ALL_TYPES(CTYPE_ALIAS, ...) \ - ET_INTERNAL_SWITCH_CASE( \ - exec_aten::ScalarType::Byte, CTYPE_ALIAS, __VA_ARGS__) \ - ET_INTERNAL_SWITCH_CASE( \ - exec_aten::ScalarType::Char, CTYPE_ALIAS, __VA_ARGS__) \ - ET_INTERNAL_SWITCH_CASE( \ - exec_aten::ScalarType::Short, CTYPE_ALIAS, __VA_ARGS__) \ - ET_INTERNAL_SWITCH_CASE( \ - exec_aten::ScalarType::Int, CTYPE_ALIAS, __VA_ARGS__) \ - ET_INTERNAL_SWITCH_CASE( \ - exec_aten::ScalarType::Long, CTYPE_ALIAS, __VA_ARGS__) \ - ET_INTERNAL_SWITCH_CASE( \ - exec_aten::ScalarType::Half, CTYPE_ALIAS, __VA_ARGS__) \ - ET_INTERNAL_SWITCH_CASE( \ - exec_aten::ScalarType::Float, CTYPE_ALIAS, __VA_ARGS__) \ - ET_INTERNAL_SWITCH_CASE( \ - exec_aten::ScalarType::Double, CTYPE_ALIAS, __VA_ARGS__) \ - ET_INTERNAL_SWITCH_CASE( \ - exec_aten::ScalarType::ComplexHalf, CTYPE_ALIAS, __VA_ARGS__) \ - ET_INTERNAL_SWITCH_CASE( \ - exec_aten::ScalarType::ComplexFloat, CTYPE_ALIAS, __VA_ARGS__) \ - ET_INTERNAL_SWITCH_CASE( \ - exec_aten::ScalarType::ComplexDouble, CTYPE_ALIAS, __VA_ARGS__) \ - ET_INTERNAL_SWITCH_CASE( \ - exec_aten::ScalarType::Bool, CTYPE_ALIAS, __VA_ARGS__) \ - ET_INTERNAL_SWITCH_CASE( \ - exec_aten::ScalarType::QInt8, CTYPE_ALIAS, __VA_ARGS__) \ - ET_INTERNAL_SWITCH_CASE( \ - exec_aten::ScalarType::QUInt8, CTYPE_ALIAS, __VA_ARGS__) \ - ET_INTERNAL_SWITCH_CASE( \ - exec_aten::ScalarType::QInt32, CTYPE_ALIAS, __VA_ARGS__) \ - ET_INTERNAL_SWITCH_CASE( \ - exec_aten::ScalarType::BFloat16, CTYPE_ALIAS, __VA_ARGS__) \ - ET_INTERNAL_SWITCH_CASE( \ - exec_aten::ScalarType::QUInt4x2, CTYPE_ALIAS, __VA_ARGS__) \ - ET_INTERNAL_SWITCH_CASE( \ - exec_aten::ScalarType::QUInt2x4, CTYPE_ALIAS, __VA_ARGS__) \ - ET_INTERNAL_SWITCH_CASE( \ - exec_aten::ScalarType::Bits1x8, CTYPE_ALIAS, __VA_ARGS__) \ - ET_INTERNAL_SWITCH_CASE( \ - exec_aten::ScalarType::Bits2x4, CTYPE_ALIAS, __VA_ARGS__) \ - ET_INTERNAL_SWITCH_CASE( \ - exec_aten::ScalarType::Bits4x2, CTYPE_ALIAS, __VA_ARGS__) \ - ET_INTERNAL_SWITCH_CASE( \ - exec_aten::ScalarType::Bits8, CTYPE_ALIAS, __VA_ARGS__) \ - ET_INTERNAL_SWITCH_CASE( \ - exec_aten::ScalarType::Bits16, CTYPE_ALIAS, __VA_ARGS__) - -#define ET_INTERNAL_SWITCH_CASE_REAL_TYPES(CTYPE_ALIAS, ...) \ - ET_INTERNAL_SWITCH_CASE( \ - exec_aten::ScalarType::Byte, CTYPE_ALIAS, __VA_ARGS__) \ - ET_INTERNAL_SWITCH_CASE( \ - exec_aten::ScalarType::Char, CTYPE_ALIAS, __VA_ARGS__) \ - ET_INTERNAL_SWITCH_CASE( \ - exec_aten::ScalarType::Short, CTYPE_ALIAS, __VA_ARGS__) \ - ET_INTERNAL_SWITCH_CASE( \ - exec_aten::ScalarType::Int, CTYPE_ALIAS, __VA_ARGS__) \ - ET_INTERNAL_SWITCH_CASE( \ - exec_aten::ScalarType::Long, CTYPE_ALIAS, __VA_ARGS__) \ - ET_INTERNAL_SWITCH_CASE( \ - exec_aten::ScalarType::Float, CTYPE_ALIAS, __VA_ARGS__) \ - ET_INTERNAL_SWITCH_CASE( \ - exec_aten::ScalarType::Double, CTYPE_ALIAS, __VA_ARGS__) +#define ET_INTERNAL_SWITCH_CASE_ALL_TYPES(CTYPE_ALIAS, ...) 
\ + ET_INTERNAL_SWITCH_CASE( \ + ::executorch::aten::ScalarType::Byte, CTYPE_ALIAS, __VA_ARGS__) \ + ET_INTERNAL_SWITCH_CASE( \ + ::executorch::aten::ScalarType::Char, CTYPE_ALIAS, __VA_ARGS__) \ + ET_INTERNAL_SWITCH_CASE( \ + ::executorch::aten::ScalarType::Short, CTYPE_ALIAS, __VA_ARGS__) \ + ET_INTERNAL_SWITCH_CASE( \ + ::executorch::aten::ScalarType::Int, CTYPE_ALIAS, __VA_ARGS__) \ + ET_INTERNAL_SWITCH_CASE( \ + ::executorch::aten::ScalarType::Long, CTYPE_ALIAS, __VA_ARGS__) \ + ET_INTERNAL_SWITCH_CASE( \ + ::executorch::aten::ScalarType::Half, CTYPE_ALIAS, __VA_ARGS__) \ + ET_INTERNAL_SWITCH_CASE( \ + ::executorch::aten::ScalarType::Float, CTYPE_ALIAS, __VA_ARGS__) \ + ET_INTERNAL_SWITCH_CASE( \ + ::executorch::aten::ScalarType::Double, CTYPE_ALIAS, __VA_ARGS__) \ + ET_INTERNAL_SWITCH_CASE( \ + ::executorch::aten::ScalarType::ComplexHalf, CTYPE_ALIAS, __VA_ARGS__) \ + ET_INTERNAL_SWITCH_CASE( \ + ::executorch::aten::ScalarType::ComplexFloat, CTYPE_ALIAS, __VA_ARGS__) \ + ET_INTERNAL_SWITCH_CASE( \ + ::executorch::aten::ScalarType::ComplexDouble, CTYPE_ALIAS, __VA_ARGS__) \ + ET_INTERNAL_SWITCH_CASE( \ + ::executorch::aten::ScalarType::Bool, CTYPE_ALIAS, __VA_ARGS__) \ + ET_INTERNAL_SWITCH_CASE( \ + ::executorch::aten::ScalarType::QInt8, CTYPE_ALIAS, __VA_ARGS__) \ + ET_INTERNAL_SWITCH_CASE( \ + ::executorch::aten::ScalarType::QUInt8, CTYPE_ALIAS, __VA_ARGS__) \ + ET_INTERNAL_SWITCH_CASE( \ + ::executorch::aten::ScalarType::QInt32, CTYPE_ALIAS, __VA_ARGS__) \ + ET_INTERNAL_SWITCH_CASE( \ + ::executorch::aten::ScalarType::BFloat16, CTYPE_ALIAS, __VA_ARGS__) \ + ET_INTERNAL_SWITCH_CASE( \ + ::executorch::aten::ScalarType::QUInt4x2, CTYPE_ALIAS, __VA_ARGS__) \ + ET_INTERNAL_SWITCH_CASE( \ + ::executorch::aten::ScalarType::QUInt2x4, CTYPE_ALIAS, __VA_ARGS__) \ + ET_INTERNAL_SWITCH_CASE( \ + ::executorch::aten::ScalarType::Bits1x8, CTYPE_ALIAS, __VA_ARGS__) \ + ET_INTERNAL_SWITCH_CASE( \ + ::executorch::aten::ScalarType::Bits2x4, CTYPE_ALIAS, __VA_ARGS__) \ + ET_INTERNAL_SWITCH_CASE( \ + ::executorch::aten::ScalarType::Bits4x2, CTYPE_ALIAS, __VA_ARGS__) \ + ET_INTERNAL_SWITCH_CASE( \ + ::executorch::aten::ScalarType::Bits8, CTYPE_ALIAS, __VA_ARGS__) \ + ET_INTERNAL_SWITCH_CASE( \ + ::executorch::aten::ScalarType::Bits16, CTYPE_ALIAS, __VA_ARGS__) + +#define ET_INTERNAL_SWITCH_CASE_REAL_TYPES(CTYPE_ALIAS, ...) \ + ET_INTERNAL_SWITCH_CASE( \ + ::executorch::aten::ScalarType::Byte, CTYPE_ALIAS, __VA_ARGS__) \ + ET_INTERNAL_SWITCH_CASE( \ + ::executorch::aten::ScalarType::Char, CTYPE_ALIAS, __VA_ARGS__) \ + ET_INTERNAL_SWITCH_CASE( \ + ::executorch::aten::ScalarType::Short, CTYPE_ALIAS, __VA_ARGS__) \ + ET_INTERNAL_SWITCH_CASE( \ + ::executorch::aten::ScalarType::Int, CTYPE_ALIAS, __VA_ARGS__) \ + ET_INTERNAL_SWITCH_CASE( \ + ::executorch::aten::ScalarType::Long, CTYPE_ALIAS, __VA_ARGS__) \ + ET_INTERNAL_SWITCH_CASE( \ + ::executorch::aten::ScalarType::Float, CTYPE_ALIAS, __VA_ARGS__) \ + ET_INTERNAL_SWITCH_CASE( \ + ::executorch::aten::ScalarType::Double, CTYPE_ALIAS, __VA_ARGS__) #define ET_INTERNAL_SWITCH_CASE_REAL_TYPES_AND(ADDITIONAL, CTYPE_ALIAS, ...) \ ET_INTERNAL_SWITCH_CASE_REAL_TYPES(CTYPE_ALIAS, __VA_ARGS__) \ ET_INTERNAL_SWITCH_CASE( \ - exec_aten::ScalarType::ADDITIONAL, CTYPE_ALIAS, __VA_ARGS__) + ::executorch::aten::ScalarType::ADDITIONAL, CTYPE_ALIAS, __VA_ARGS__) -#define ET_INTERNAL_SWITCH_CASE_REAL_TYPES_AND2( \ - ADDITIONAL1, ADDITIONAL2, CTYPE_ALIAS, ...) 
\ - ET_INTERNAL_SWITCH_CASE_REAL_TYPES(CTYPE_ALIAS, __VA_ARGS__) \ - ET_INTERNAL_SWITCH_CASE( \ - exec_aten::ScalarType::ADDITIONAL1, CTYPE_ALIAS, __VA_ARGS__) \ - ET_INTERNAL_SWITCH_CASE( \ - exec_aten::ScalarType::ADDITIONAL2, CTYPE_ALIAS, __VA_ARGS__) +#define ET_INTERNAL_SWITCH_CASE_REAL_TYPES_AND2( \ + ADDITIONAL1, ADDITIONAL2, CTYPE_ALIAS, ...) \ + ET_INTERNAL_SWITCH_CASE_REAL_TYPES(CTYPE_ALIAS, __VA_ARGS__) \ + ET_INTERNAL_SWITCH_CASE( \ + ::executorch::aten::ScalarType::ADDITIONAL1, CTYPE_ALIAS, __VA_ARGS__) \ + ET_INTERNAL_SWITCH_CASE( \ + ::executorch::aten::ScalarType::ADDITIONAL2, CTYPE_ALIAS, __VA_ARGS__) #define ET_INTERNAL_SWITCH_CASE_REAL_TYPES_AND3( \ ADDITIONAL1, ADDITIONAL2, ADDITIONAL3, CTYPE_ALIAS, ...) \ ET_INTERNAL_SWITCH_CASE_REAL_TYPES_AND2( \ ADDITIONAL1, ADDITIONAL2, CTYPE_ALIAS, __VA_ARGS__) \ ET_INTERNAL_SWITCH_CASE( \ - exec_aten::ScalarType::ADDITIONAL3, CTYPE_ALIAS, __VA_ARGS__) - -#define ET_INTERNAL_SWITCH_CASE_INT_TYPES(CTYPE_ALIAS, ...) \ - ET_INTERNAL_SWITCH_CASE( \ - exec_aten::ScalarType::Byte, CTYPE_ALIAS, __VA_ARGS__) \ - ET_INTERNAL_SWITCH_CASE( \ - exec_aten::ScalarType::Char, CTYPE_ALIAS, __VA_ARGS__) \ - ET_INTERNAL_SWITCH_CASE( \ - exec_aten::ScalarType::Short, CTYPE_ALIAS, __VA_ARGS__) \ - ET_INTERNAL_SWITCH_CASE( \ - exec_aten::ScalarType::Int, CTYPE_ALIAS, __VA_ARGS__) \ - ET_INTERNAL_SWITCH_CASE(exec_aten::ScalarType::Long, CTYPE_ALIAS, __VA_ARGS__) + ::executorch::aten::ScalarType::ADDITIONAL3, CTYPE_ALIAS, __VA_ARGS__) + +#define ET_INTERNAL_SWITCH_CASE_INT_TYPES(CTYPE_ALIAS, ...) \ + ET_INTERNAL_SWITCH_CASE( \ + ::executorch::aten::ScalarType::Byte, CTYPE_ALIAS, __VA_ARGS__) \ + ET_INTERNAL_SWITCH_CASE( \ + ::executorch::aten::ScalarType::Char, CTYPE_ALIAS, __VA_ARGS__) \ + ET_INTERNAL_SWITCH_CASE( \ + ::executorch::aten::ScalarType::Short, CTYPE_ALIAS, __VA_ARGS__) \ + ET_INTERNAL_SWITCH_CASE( \ + ::executorch::aten::ScalarType::Int, CTYPE_ALIAS, __VA_ARGS__) \ + ET_INTERNAL_SWITCH_CASE( \ + ::executorch::aten::ScalarType::Long, CTYPE_ALIAS, __VA_ARGS__) #define ET_INTERNAL_SWITCH_CASE_INT_TYPES_AND(ADDITIONAL, CTYPE_ALIAS, ...) \ ET_INTERNAL_SWITCH_CASE_INT_TYPES(CTYPE_ALIAS, __VA_ARGS__) \ ET_INTERNAL_SWITCH_CASE( \ - exec_aten::ScalarType::ADDITIONAL, CTYPE_ALIAS, __VA_ARGS__) + ::executorch::aten::ScalarType::ADDITIONAL, CTYPE_ALIAS, __VA_ARGS__) -#define ET_INTERNAL_SWITCH_CASE_FLOAT_TYPES(CTYPE_ALIAS, ...) \ - ET_INTERNAL_SWITCH_CASE( \ - exec_aten::ScalarType::Double, CTYPE_ALIAS, __VA_ARGS__) \ - ET_INTERNAL_SWITCH_CASE( \ - exec_aten::ScalarType::Float, CTYPE_ALIAS, __VA_ARGS__) +#define ET_INTERNAL_SWITCH_CASE_FLOAT_TYPES(CTYPE_ALIAS, ...) \ + ET_INTERNAL_SWITCH_CASE( \ + ::executorch::aten::ScalarType::Double, CTYPE_ALIAS, __VA_ARGS__) \ + ET_INTERNAL_SWITCH_CASE( \ + ::executorch::aten::ScalarType::Float, CTYPE_ALIAS, __VA_ARGS__) #define ET_INTERNAL_SWITCH_CASE_FLOAT_TYPES_AND(ADDITIONAL, CTYPE_ALIAS, ...) \ ET_INTERNAL_SWITCH_CASE_FLOAT_TYPES(CTYPE_ALIAS, __VA_ARGS__) \ ET_INTERNAL_SWITCH_CASE( \ - exec_aten::ScalarType::ADDITIONAL, CTYPE_ALIAS, __VA_ARGS__) + ::executorch::aten::ScalarType::ADDITIONAL, CTYPE_ALIAS, __VA_ARGS__) #define ET_INTERNAL_SWITCH_CASE_FLOAT_TYPES_AND2( \ ADDITIONAL1, ADDITIONAL2, CTYPE_ALIAS, ...) \ ET_INTERNAL_SWITCH_CASE_FLOAT_TYPES_AND( \ ADDITIONAL1, CTYPE_ALIAS, __VA_ARGS__) \ ET_INTERNAL_SWITCH_CASE( \ - exec_aten::ScalarType::ADDITIONAL2, CTYPE_ALIAS, __VA_ARGS__) - -#define ET_INTERNAL_SWITCH_CASE_QINT_TYPES(CTYPE_ALIAS, ...) 
\ - ET_INTERNAL_SWITCH_CASE( \ - exec_aten::ScalarType::QInt8, CTYPE_ALIAS, __VA_ARGS__) \ - ET_INTERNAL_SWITCH_CASE( \ - exec_aten::ScalarType::QUInt8, CTYPE_ALIAS, __VA_ARGS__) \ - ET_INTERNAL_SWITCH_CASE( \ - exec_aten::ScalarType::QInt32, CTYPE_ALIAS, __VA_ARGS__) \ - ET_INTERNAL_SWITCH_CASE( \ - exec_aten::ScalarType::QUInt4x2, CTYPE_ALIAS, __VA_ARGS__) \ - ET_INTERNAL_SWITCH_CASE( \ - exec_aten::ScalarType::QUInt2x4, CTYPE_ALIAS, __VA_ARGS__) - -#define ET_INTERNAL_SWITCH_CASE_COMPLEX_TYPES(CTYPE_ALIAS, ...) \ - ET_INTERNAL_SWITCH_CASE( \ - exec_aten::ScalarType::ComplexFloat, CTYPE_ALIAS, __VA_ARGS__) \ - ET_INTERNAL_SWITCH_CASE( \ - exec_aten::ScalarType::ComplexDouble, CTYPE_ALIAS, __VA_ARGS__) - -#define ET_INTERNAL_SWITCH_CASE_SCALAR_OBJ_TYPES(CTYPE_ALIAS, ...) \ - ET_INTERNAL_SWITCH_CASE( \ - exec_aten::ScalarType::Bool, CTYPE_ALIAS, __VA_ARGS__) \ - ET_INTERNAL_SWITCH_CASE( \ - exec_aten::ScalarType::Long, CTYPE_ALIAS, __VA_ARGS__) \ - ET_INTERNAL_SWITCH_CASE( \ - exec_aten::ScalarType::Double, CTYPE_ALIAS, __VA_ARGS__) + ::executorch::aten::ScalarType::ADDITIONAL2, CTYPE_ALIAS, __VA_ARGS__) + +#define ET_INTERNAL_SWITCH_CASE_QINT_TYPES(CTYPE_ALIAS, ...) \ + ET_INTERNAL_SWITCH_CASE( \ + ::executorch::aten::ScalarType::QInt8, CTYPE_ALIAS, __VA_ARGS__) \ + ET_INTERNAL_SWITCH_CASE( \ + ::executorch::aten::ScalarType::QUInt8, CTYPE_ALIAS, __VA_ARGS__) \ + ET_INTERNAL_SWITCH_CASE( \ + ::executorch::aten::ScalarType::QInt32, CTYPE_ALIAS, __VA_ARGS__) \ + ET_INTERNAL_SWITCH_CASE( \ + ::executorch::aten::ScalarType::QUInt4x2, CTYPE_ALIAS, __VA_ARGS__) \ + ET_INTERNAL_SWITCH_CASE( \ + ::executorch::aten::ScalarType::QUInt2x4, CTYPE_ALIAS, __VA_ARGS__) + +#define ET_INTERNAL_SWITCH_CASE_COMPLEX_TYPES(CTYPE_ALIAS, ...) \ + ET_INTERNAL_SWITCH_CASE( \ + ::executorch::aten::ScalarType::ComplexFloat, CTYPE_ALIAS, __VA_ARGS__) \ + ET_INTERNAL_SWITCH_CASE( \ + ::executorch::aten::ScalarType::ComplexDouble, CTYPE_ALIAS, __VA_ARGS__) + +#define ET_INTERNAL_SWITCH_CASE_SCALAR_OBJ_TYPES(CTYPE_ALIAS, ...) \ + ET_INTERNAL_SWITCH_CASE( \ + ::executorch::aten::ScalarType::Bool, CTYPE_ALIAS, __VA_ARGS__) \ + ET_INTERNAL_SWITCH_CASE( \ + ::executorch::aten::ScalarType::Long, CTYPE_ALIAS, __VA_ARGS__) \ + ET_INTERNAL_SWITCH_CASE( \ + ::executorch::aten::ScalarType::Double, CTYPE_ALIAS, __VA_ARGS__) #define ET_INTERNAL_SWITCH_CASE_SCALAR_OBJ_REAL_TYPES(CTYPE_ALIAS, ...) \ ET_INTERNAL_SWITCH_CASE( \ - exec_aten::ScalarType::Long, CTYPE_ALIAS, __VA_ARGS__) \ + ::executorch::aten::ScalarType::Long, CTYPE_ALIAS, __VA_ARGS__) \ ET_INTERNAL_SWITCH_CASE( \ - exec_aten::ScalarType::Double, CTYPE_ALIAS, __VA_ARGS__) + ::executorch::aten::ScalarType::Double, CTYPE_ALIAS, __VA_ARGS__) #define ET_INTERNAL_SWITCH_CASE_SCALAR_OBJ_INTB_TYPES(CTYPE_ALIAS, ...) \ ET_INTERNAL_SWITCH_CASE( \ - exec_aten::ScalarType::Bool, CTYPE_ALIAS, __VA_ARGS__) \ - ET_INTERNAL_SWITCH_CASE(exec_aten::ScalarType::Long, CTYPE_ALIAS, __VA_ARGS__) + ::executorch::aten::ScalarType::Bool, CTYPE_ALIAS, __VA_ARGS__) \ + ET_INTERNAL_SWITCH_CASE( \ + ::executorch::aten::ScalarType::Long, CTYPE_ALIAS, __VA_ARGS__) #define ET_INTERNAL_SWITCH_CASE_SCALAR_OBJ_FLOATB_TYPES(CTYPE_ALIAS, ...) 
\ ET_INTERNAL_SWITCH_CASE( \ - exec_aten::ScalarType::Bool, CTYPE_ALIAS, __VA_ARGS__) \ + ::executorch::aten::ScalarType::Bool, CTYPE_ALIAS, __VA_ARGS__) \ ET_INTERNAL_SWITCH_CASE( \ - exec_aten::ScalarType::Double, CTYPE_ALIAS, __VA_ARGS__) + ::executorch::aten::ScalarType::Double, CTYPE_ALIAS, __VA_ARGS__) // // Switch case macros @@ -1346,33 +1384,37 @@ inline exec_aten::ScalarType promoteTypes( CONTEXT, \ NAME, \ ET_INTERNAL_SWITCH_CASE( \ - exec_aten::ScalarType::T1, CTYPE_ALIAS, __VA_ARGS__) \ + ::executorch::aten::ScalarType::T1, CTYPE_ALIAS, __VA_ARGS__) \ ET_INTERNAL_SWITCH_CASE( \ - exec_aten::ScalarType::T2, CTYPE_ALIAS, __VA_ARGS__)) - -#define ET_SWITCH_THREE_TYPES( \ - T1, T2, T3, TYPE, CONTEXT, NAME, CTYPE_ALIAS, ...) \ - ET_INTERNAL_SWITCH( \ - TYPE, \ - CONTEXT, \ - NAME, \ - ET_INTERNAL_SWITCH_CASE( \ - exec_aten::ScalarType::T1, CTYPE_ALIAS, __VA_ARGS__) \ - ET_INTERNAL_SWITCH_CASE( \ - exec_aten::ScalarType::T2, CTYPE_ALIAS, __VA_ARGS__) \ - ET_INTERNAL_SWITCH_CASE( \ - exec_aten::ScalarType::T3, CTYPE_ALIAS, __VA_ARGS__)) + ::executorch::aten::ScalarType::T2, CTYPE_ALIAS, __VA_ARGS__)) + +#define ET_SWITCH_THREE_TYPES( \ + T1, T2, T3, TYPE, CONTEXT, NAME, CTYPE_ALIAS, ...) \ + ET_INTERNAL_SWITCH( \ + TYPE, \ + CONTEXT, \ + NAME, \ + ET_INTERNAL_SWITCH_CASE( \ + ::executorch::aten::ScalarType::T1, CTYPE_ALIAS, __VA_ARGS__) \ + ET_INTERNAL_SWITCH_CASE( \ + ::executorch::aten::ScalarType::T2, CTYPE_ALIAS, __VA_ARGS__) \ + ET_INTERNAL_SWITCH_CASE( \ + ::executorch::aten::ScalarType::T3, \ + CTYPE_ALIAS, \ + __VA_ARGS__)) } // namespace runtime } // namespace executorch -namespace exec_aten { +namespace executorch { +namespace aten { #ifdef USE_ATEN_LIB using ::at::elementSize; #else // USE_ATEN_LIB using ::executorch::runtime::elementSize; #endif // USE_ATEN_LIB -} // namespace exec_aten +} // namespace aten +} // namespace executorch namespace torch { namespace executor { diff --git a/runtime/core/portable_type/bfloat16.h b/runtime/core/portable_type/bfloat16.h index e665e6152e3..c1ff250885a 100644 --- a/runtime/core/portable_type/bfloat16.h +++ b/runtime/core/portable_type/bfloat16.h @@ -14,8 +14,9 @@ #include #include -namespace torch { -namespace executor { +namespace executorch { +namespace runtime { +namespace etensor { namespace internal { inline float f32_from_bits(uint16_t src) { @@ -26,12 +27,6 @@ inline float f32_from_bits(uint16_t src) { return res; } -inline uint16_t bits_from_f32(float src) { - uint32_t res = 0; - std::memcpy(&res, &src, sizeof(res)); - return res >> 16; -} - inline uint16_t round_to_nearest_even(float src) { if (std::isnan(src)) { return UINT16_C(0x7FC0); @@ -264,13 +259,22 @@ inline bool operator<(BFloat16& lhs, BFloat16& rhs) { return float(lhs) < float(rhs); } +} // namespace etensor +} // namespace runtime +} // namespace executorch + +namespace torch { +namespace executor { +// TODO(T197294990): Remove these deprecated aliases once all users have moved +// to the new `::executorch` namespaces. 
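The scalar_type_util.h hunks above are a mechanical re-qualification plus re-indentation; promoteTypes() and canCast() behave exactly as before. A few spot checks taken only from the table rows and branches visible in this hunk (illustrative, not part of the patch):

// Sketch: promotion and casting rules, using only entries shown above.
#include <cassert>

using executorch::aten::ScalarType;
namespace rt = executorch::runtime;

inline void check_promotion_rules() {
  // From the Byte row of the lookup table: u1 x i1 -> i2.
  assert(rt::promoteTypes(ScalarType::Byte, ScalarType::Char) == ScalarType::Short);
  // From the BFloat16 row: bf x f2 -> f4.
  assert(rt::promoteTypes(ScalarType::BFloat16, ScalarType::Half) == ScalarType::Float);
  // half_to_float widens a Half/BFloat16 result to Float.
  assert(
      rt::promoteTypes(ScalarType::Byte, ScalarType::Half, /*half_to_float=*/true) ==
      ScalarType::Float);

  // canCast(): float -> integral and non-bool -> bool are both rejected.
  static_assert(!rt::canCast(ScalarType::Float, ScalarType::Int), "");
  static_assert(!rt::canCast(ScalarType::Long, ScalarType::Bool), "");
  static_assert(rt::canCast(ScalarType::Int, ScalarType::Float), "");
}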
+using ::executorch::runtime::etensor::BFloat16; } // namespace executor } // namespace torch namespace std { template <> -class numeric_limits { +class numeric_limits { public: static constexpr bool is_signed = true; static constexpr bool is_specialized = true; diff --git a/runtime/core/portable_type/bits_types.h b/runtime/core/portable_type/bits_types.h index 076ee642d75..cddffc485ec 100644 --- a/runtime/core/portable_type/bits_types.h +++ b/runtime/core/portable_type/bits_types.h @@ -9,8 +9,9 @@ #pragma once #include -namespace torch { -namespace executor { +namespace executorch { +namespace runtime { +namespace etensor { /** * bits1x8 is an uninterpreted dtype of a tensor with 1 bit (packed to byte @@ -65,5 +66,18 @@ struct alignas(2) bits16 { explicit bits16(uint16_t val) : val_(val) {} }; +} // namespace etensor +} // namespace runtime +} // namespace executorch + +namespace torch { +namespace executor { +// TODO(T197294990): Remove these deprecated aliases once all users have moved +// to the new `::executorch` namespaces. +using ::executorch::runtime::etensor::bits16; +using ::executorch::runtime::etensor::bits1x8; +using ::executorch::runtime::etensor::bits2x4; +using ::executorch::runtime::etensor::bits4x2; +using ::executorch::runtime::etensor::bits8; } // namespace executor } // namespace torch diff --git a/runtime/core/portable_type/complex.h b/runtime/core/portable_type/complex.h index 0d4684a992e..e89a19e54d7 100644 --- a/runtime/core/portable_type/complex.h +++ b/runtime/core/portable_type/complex.h @@ -10,8 +10,9 @@ #include -namespace torch { -namespace executor { +namespace executorch { +namespace runtime { +namespace etensor { /** * An implementation of complex numbers, compatible with c10/util/complex.h from @@ -32,5 +33,14 @@ struct alignas(4) complex { Half imag_; }; +} // namespace etensor +} // namespace runtime +} // namespace executorch + +namespace torch { +namespace executor { +// TODO(T197294990): Remove these deprecated aliases once all users have moved +// to the new `::executorch` namespaces. +using ::executorch::runtime::etensor::complex; } // namespace executor } // namespace torch diff --git a/runtime/core/portable_type/device.h b/runtime/core/portable_type/device.h index 7c09cfd29c3..d789df8a84d 100644 --- a/runtime/core/portable_type/device.h +++ b/runtime/core/portable_type/device.h @@ -10,8 +10,9 @@ #include -namespace torch { -namespace executor { +namespace executorch { +namespace runtime { +namespace etensor { /// Denotes the specific genre of compute device. /// Subset of https://github.com/pytorch/pytorch/blob/main/c10/core/Device.h @@ -59,5 +60,15 @@ struct Device final { DeviceIndex index_ = -1; }; +} // namespace etensor +} // namespace runtime +} // namespace executorch + +namespace torch { +namespace executor { +// TODO(T197294990): Remove these deprecated aliases once all users have moved +// to the new `::executorch` namespaces. 
+using ::executorch::runtime::etensor::Device; +using ::executorch::runtime::etensor::DeviceType; } // namespace executor } // namespace torch diff --git a/runtime/core/portable_type/half.h b/runtime/core/portable_type/half.h index 8987d82804b..fa40a80782f 100644 --- a/runtime/core/portable_type/half.h +++ b/runtime/core/portable_type/half.h @@ -32,8 +32,9 @@ #endif // __x86_64__ || _M_X64 || __i386 || _M_IX86 #endif // __GNUC__ || __clang__ -namespace torch { -namespace executor { +namespace executorch { +namespace runtime { +namespace etensor { /** * A half-precision floating point type, compatible with c10/util/Half.h from @@ -676,18 +677,26 @@ inline Half operator/(int64_t a, Half b) { static inline std::ostream& operator<<( std::ostream& out, - const torch::executor::Half& value) { + const executorch::runtime::etensor::Half& value) { out << (float)value; return out; } +} // namespace etensor +} // namespace runtime +} // namespace executorch +namespace torch { +namespace executor { +// TODO(T197294990): Remove these deprecated aliases once all users have moved +// to the new `::executorch` namespaces. +using ::executorch::runtime::etensor::Half; } // namespace executor } // namespace torch namespace std { template <> -class numeric_limits { +class numeric_limits { public: static constexpr bool is_specialized = true; static constexpr bool is_signed = true; @@ -714,32 +723,41 @@ class numeric_limits { static constexpr auto traps = numeric_limits::traps; static constexpr auto tinyness_before = numeric_limits::tinyness_before; - static constexpr torch::executor::Half min() { - return torch::executor::Half(0x0400, torch::executor::Half::from_bits()); + static constexpr executorch::runtime::etensor::Half min() { + return executorch::runtime::etensor::Half( + 0x0400, executorch::runtime::etensor::Half::from_bits()); } - static constexpr torch::executor::Half lowest() { - return torch::executor::Half(0xFBFF, torch::executor::Half::from_bits()); + static constexpr executorch::runtime::etensor::Half lowest() { + return executorch::runtime::etensor::Half( + 0xFBFF, executorch::runtime::etensor::Half::from_bits()); } - static constexpr torch::executor::Half max() { - return torch::executor::Half(0x7BFF, torch::executor::Half::from_bits()); + static constexpr executorch::runtime::etensor::Half max() { + return executorch::runtime::etensor::Half( + 0x7BFF, executorch::runtime::etensor::Half::from_bits()); } - static constexpr torch::executor::Half epsilon() { - return torch::executor::Half(0x1400, torch::executor::Half::from_bits()); + static constexpr executorch::runtime::etensor::Half epsilon() { + return executorch::runtime::etensor::Half( + 0x1400, executorch::runtime::etensor::Half::from_bits()); } - static constexpr torch::executor::Half round_error() { - return torch::executor::Half(0x3800, torch::executor::Half::from_bits()); + static constexpr executorch::runtime::etensor::Half round_error() { + return executorch::runtime::etensor::Half( + 0x3800, executorch::runtime::etensor::Half::from_bits()); } - static constexpr torch::executor::Half infinity() { - return torch::executor::Half(0x7C00, torch::executor::Half::from_bits()); + static constexpr executorch::runtime::etensor::Half infinity() { + return executorch::runtime::etensor::Half( + 0x7C00, executorch::runtime::etensor::Half::from_bits()); } - static constexpr torch::executor::Half quiet_NaN() { - return torch::executor::Half(0x7E00, torch::executor::Half::from_bits()); + static constexpr executorch::runtime::etensor::Half quiet_NaN() 
{ + return executorch::runtime::etensor::Half( + 0x7E00, executorch::runtime::etensor::Half::from_bits()); } - static constexpr torch::executor::Half signaling_NaN() { - return torch::executor::Half(0x7D00, torch::executor::Half::from_bits()); + static constexpr executorch::runtime::etensor::Half signaling_NaN() { + return executorch::runtime::etensor::Half( + 0x7D00, executorch::runtime::etensor::Half::from_bits()); } - static constexpr torch::executor::Half denorm_min() { - return torch::executor::Half(0x0001, torch::executor::Half::from_bits()); + static constexpr executorch::runtime::etensor::Half denorm_min() { + return executorch::runtime::etensor::Half( + 0x0001, executorch::runtime::etensor::Half::from_bits()); } }; diff --git a/runtime/core/portable_type/optional.h b/runtime/core/portable_type/optional.h index 3d8cb41eac8..21fe0d39267 100644 --- a/runtime/core/portable_type/optional.h +++ b/runtime/core/portable_type/optional.h @@ -12,8 +12,9 @@ #include #include // std::forward and other template magic checks -namespace torch { -namespace executor { +namespace executorch { +namespace runtime { +namespace etensor { /// Used to indicate an optional type with uninitialized state. struct nullopt_t final { @@ -177,5 +178,16 @@ class optional final { bool init_; }; +} // namespace etensor +} // namespace runtime +} // namespace executorch + +namespace torch { +namespace executor { +// TODO(T197294990): Remove these deprecated aliases once all users have moved +// to the new `::executorch` namespaces. +using ::executorch::runtime::etensor::nullopt; +using ::executorch::runtime::etensor::nullopt_t; +using ::executorch::runtime::etensor::optional; } // namespace executor } // namespace torch diff --git a/runtime/core/portable_type/qint_types.h b/runtime/core/portable_type/qint_types.h index f7c78e3a180..183675e1829 100644 --- a/runtime/core/portable_type/qint_types.h +++ b/runtime/core/portable_type/qint_types.h @@ -10,8 +10,9 @@ #include -namespace torch { -namespace executor { +namespace executorch { +namespace runtime { +namespace etensor { /** * qint8 is for signed 8 bit quantized Tensors @@ -65,5 +66,18 @@ struct alignas(1) quint2x4 { explicit quint2x4(uint8_t val) : val_(val) {} }; +} // namespace etensor +} // namespace runtime +} // namespace executorch + +namespace torch { +namespace executor { +// TODO(T197294990): Remove these deprecated aliases once all users have moved +// to the new `::executorch` namespaces. +using ::executorch::runtime::etensor::qint32; +using ::executorch::runtime::etensor::qint8; +using ::executorch::runtime::etensor::quint2x4; +using ::executorch::runtime::etensor::quint4x2; +using ::executorch::runtime::etensor::quint8; } // namespace executor } // namespace torch diff --git a/runtime/core/portable_type/scalar.h b/runtime/core/portable_type/scalar.h index 1147fee7cc9..0922cec6b95 100644 --- a/runtime/core/portable_type/scalar.h +++ b/runtime/core/portable_type/scalar.h @@ -16,8 +16,9 @@ #include #include -namespace torch { -namespace executor { +namespace executorch { +namespace runtime { +namespace etensor { /** * Represents a scalar value. @@ -109,5 +110,14 @@ ET_DEFINE_SCALAR_TO_METHOD(int64_t, Int) ET_DEFINE_SCALAR_TO_METHOD(bool, Bool) #undef ET_DEFINE_SCALAR_TO_METHOD +} // namespace etensor +} // namespace runtime +} // namespace executorch + +namespace torch { +namespace executor { +// TODO(T197294990): Remove these deprecated aliases once all users have moved +// to the new `::executorch` namespaces. 
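The half.h hunk above only re-spells the std::numeric_limits specialization with the new namespace; the bit patterns themselves are untouched. A hedged sanity check that decodes them to the standard binary16 values (illustrative only; the include path mirrors the file touched above):

// Sketch: the relocated numeric_limits<Half> still yields the IEEE binary16
// extremes (all of the float constants below are exact powers of two).
#include <executorch/runtime/core/portable_type/half.h>
#include <cassert>
#include <limits>

using executorch::runtime::etensor::Half;

inline void check_half_limits() {
  // 0x7BFF is the largest finite binary16 value, 65504.
  assert(static_cast<float>(std::numeric_limits<Half>::max()) == 65504.0f);
  // 0x0400 is the smallest positive normal value, 2^-14.
  assert(static_cast<float>(std::numeric_limits<Half>::min()) == 6.103515625e-05f);
  // 0x1400 is the machine epsilon, 2^-10.
  assert(static_cast<float>(std::numeric_limits<Half>::epsilon()) == 9.765625e-04f);
}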
+using ::executorch::runtime::etensor::Scalar; } // namespace executor } // namespace torch diff --git a/runtime/core/portable_type/scalar_type.h b/runtime/core/portable_type/scalar_type.h index 5b06cd6ec62..286aee3387c 100644 --- a/runtime/core/portable_type/scalar_type.h +++ b/runtime/core/portable_type/scalar_type.h @@ -43,8 +43,9 @@ #include #include -namespace torch { -namespace executor { +namespace executorch { +namespace runtime { +namespace etensor { /** * Calls the provided macro on every ScalarType, providing the C type and the @@ -98,5 +99,14 @@ enum class ScalarType : int8_t { NumOptions, }; +} // namespace etensor +} // namespace runtime +} // namespace executorch + +namespace torch { +namespace executor { +// TODO(T197294990): Remove these deprecated aliases once all users have moved +// to the new `::executorch` namespaces. +using ::executorch::runtime::etensor::ScalarType; } // namespace executor } // namespace torch diff --git a/runtime/core/portable_type/string_view.h b/runtime/core/portable_type/string_view.h index 47a9f335eb5..977a0f542d0 100644 --- a/runtime/core/portable_type/string_view.h +++ b/runtime/core/portable_type/string_view.h @@ -14,14 +14,11 @@ #include // TODO(T154113473): Document this file -namespace torch { -namespace executor { +namespace executorch { +namespace runtime { +namespace etensor { -namespace { -constexpr std::size_t min(const std::size_t a, const std::size_t b) { - return (b < a) ? b : a; -} -} // namespace +namespace internal { /** * Reimplementation of std::string_view for C++11. @@ -128,7 +125,7 @@ class basic_string_view final { size_type copy(pointer dest, size_type count, size_type pos = 0) const { ET_CHECK_MSG(pos > size_, "basic_string_view::copy: out of range."); - size_type copy_length = min(count, size_ - pos); + size_type copy_length = min_(count, size_ - pos); for (auto iter = begin() + pos, end = iter + copy_length; iter != end;) { *(dest++) = *(iter++); } @@ -145,7 +142,7 @@ class basic_string_view final { constexpr int compare(basic_string_view rhs) const noexcept { #if __cpp_constexpr >= 201304 // if we are in C++14, write it iteratively. This is faster. - for (size_t i = 0, end = min(size(), rhs.size()); i < end; ++i) { + for (size_t i = 0, end = min_(size(), rhs.size()); i < end; ++i) { if (at_(i) < rhs.at_(i)) { return -1; } else if (at_(i) > rhs.at_(i)) { @@ -315,7 +312,7 @@ class basic_string_view final { } if (v.size() <= size()) { - pos = min(size() - v.size(), pos); + pos = min_(size() - v.size(), pos); do { if (v.at_(0) == at_(pos) && v.substr_(1).equals_(substr_(pos + 1, v.size() - 1))) { @@ -432,6 +429,10 @@ class basic_string_view final { } private: + static constexpr std::size_t min_(const std::size_t a, const std::size_t b) { + return (b < a) ? b : a; + } + static constexpr size_type strlen_(const_pointer str) noexcept { #if __cpp_constexpr >= 201304 // if we are in C++14, write it iteratively. This is faster. @@ -453,7 +454,7 @@ class basic_string_view final { constexpr basic_string_view substr_(size_type pos = 0, size_type count = npos) const { - return basic_string_view{begin_ + pos, min(count, size() - pos)}; + return basic_string_view{begin_ + pos, min_(count, size() - pos)}; } template @@ -485,7 +486,7 @@ class basic_string_view final { #if __cpp_constexpr >= 201304 // if we are in C++14, write it iteratively. This is faster. 
if (size() > 0) { - pos = min(size() - 1, pos); + pos = min_(size() - 1, pos); do { if (condition(at_(pos))) { return pos; @@ -570,7 +571,18 @@ inline void swap( lhs.swap(rhs); } -using string_view = basic_string_view; +} // namespace internal +using string_view = internal::basic_string_view; + +} // namespace etensor +} // namespace runtime +} // namespace executorch + +namespace torch { +namespace executor { +// TODO(T197294990): Remove these deprecated aliases once all users have moved +// to the new `::executorch` namespaces. +using ::executorch::runtime::etensor::string_view; } // namespace executor } // namespace torch diff --git a/runtime/core/portable_type/tensor.h b/runtime/core/portable_type/tensor.h index 6e952e30b9b..775bccc1b52 100644 --- a/runtime/core/portable_type/tensor.h +++ b/runtime/core/portable_type/tensor.h @@ -12,8 +12,9 @@ #include -namespace torch { -namespace executor { +namespace executorch { +namespace runtime { +namespace etensor { /** * A minimal Tensor type whose API is a source compatible subset of at::Tensor. @@ -35,7 +36,7 @@ class Tensor { using StridesType = TensorImpl::StridesType; Tensor() = delete; - explicit Tensor(TensorImpl* impl) : impl_(impl) {} + explicit constexpr Tensor(TensorImpl* impl) : impl_(impl) {} /** * Returns a pointer to the underlying TensorImpl. @@ -85,6 +86,10 @@ class Tensor { return impl_->scalar_type(); } + inline ScalarType dtype() const { + return scalar_type(); + } + /// Returns the size in bytes of one element of the tensor. ssize_t element_size() const { return impl_->element_size(); @@ -105,6 +110,11 @@ class Tensor { return impl_->strides(); } + /// Returns the mutability of the shape of the tensor. + TensorShapeDynamism shape_dynamism() const { + return impl_->shape_dynamism(); + } + /// Returns a pointer of type T to the constant underlying data blob. template inline const T* const_data_ptr() const { @@ -152,5 +162,14 @@ class Tensor { TensorImpl* impl_ = nullptr; }; +} // namespace etensor +} // namespace runtime +} // namespace executorch + +namespace torch { +namespace executor { +// TODO(T197294990): Remove these deprecated aliases once all users have moved +// to the new `::executorch` namespaces. +using ::executorch::runtime::etensor::Tensor; } // namespace executor } // namespace torch diff --git a/runtime/core/portable_type/tensor_impl.cpp b/runtime/core/portable_type/tensor_impl.cpp index ad0fa5868c1..2082b8a4c70 100644 --- a/runtime/core/portable_type/tensor_impl.cpp +++ b/runtime/core/portable_type/tensor_impl.cpp @@ -17,8 +17,9 @@ #include #include -namespace torch { -namespace executor { +namespace executorch { +namespace runtime { +namespace etensor { /** * Compute the number of elements based on the sizes of a tensor. @@ -119,5 +120,6 @@ Error TensorImpl::internal_resize_contiguous(ArrayRef new_sizes) { return Error::Ok; } -} // namespace executor -} // namespace torch +} // namespace etensor +} // namespace runtime +} // namespace executorch diff --git a/runtime/core/portable_type/tensor_impl.h b/runtime/core/portable_type/tensor_impl.h index 57fc96aa325..fd2fd124c28 100644 --- a/runtime/core/portable_type/tensor_impl.h +++ b/runtime/core/portable_type/tensor_impl.h @@ -24,8 +24,9 @@ class TensorResizerFriend; } // namespace runtime } // namespace executorch -namespace torch { -namespace executor { +namespace executorch { +namespace runtime { +namespace etensor { /** * Manages the storage behind an ETensor (torch::executor::Tensor). 
@@ -148,6 +149,10 @@ class TensorImpl { return type_; } + inline ScalarType dtype() const { + return scalar_type(); + } + /// Returns the size in bytes of one element of the tensor. ssize_t element_size() const; @@ -166,6 +171,11 @@ class TensorImpl { return ArrayRef{strides_, static_cast(dim_)}; } + /// Returns the mutability of the shape of the tensor. + TensorShapeDynamism shape_dynamism() const { + return shape_dynamism_; + } + /// Returns a pointer of type T to the constant underlying data blob. template inline const T* data() const { @@ -257,16 +267,18 @@ class TensorImpl { * Compute the number of elements based on the sizes of a tensor. */ ssize_t compute_numel( - const ::torch::executor::TensorImpl::SizesType* sizes, + const ::executorch::runtime::etensor::TensorImpl::SizesType* sizes, ssize_t dim); -} // namespace executor -} // namespace torch +} // namespace etensor +} // namespace runtime +} // namespace executorch -namespace executorch { -namespace runtime { +namespace torch { +namespace executor { // TODO(T197294990): Remove these deprecated aliases once all users have moved // to the new `::executorch` namespaces. -using torch::executor::compute_numel; -} // namespace runtime -} // namespace executorch +using ::executorch::runtime::etensor::compute_numel; +using ::executorch::runtime::etensor::TensorImpl; +} // namespace executor +} // namespace torch diff --git a/runtime/core/portable_type/tensor_options.h b/runtime/core/portable_type/tensor_options.h index a6e604cf837..8b8f9848648 100644 --- a/runtime/core/portable_type/tensor_options.h +++ b/runtime/core/portable_type/tensor_options.h @@ -10,8 +10,9 @@ #include -namespace torch { -namespace executor { +namespace executorch { +namespace runtime { +namespace etensor { /** * Tensor data memory formats supported by ExecuTorch. This concept only exists @@ -45,5 +46,15 @@ enum class Layout : int8_t { */ Strided = 0, }; +} // namespace etensor +} // namespace runtime +} // namespace executorch + +namespace torch { +namespace executor { +// TODO(T197294990): Remove these deprecated aliases once all users have moved +// to the new `::executorch` namespaces. +using ::executorch::runtime::etensor::Layout; +using ::executorch::runtime::etensor::MemoryFormat; } // namespace executor } // namespace torch diff --git a/runtime/core/portable_type/test/bfloat16_test.cpp b/runtime/core/portable_type/test/bfloat16_test.cpp index 9ea53e6cba2..6b42a6e4a5e 100644 --- a/runtime/core/portable_type/test/bfloat16_test.cpp +++ b/runtime/core/portable_type/test/bfloat16_test.cpp @@ -1,8 +1,18 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + #include #include -using torch::executor::BFloat16; +using executorch::runtime::etensor::BFloat16; +using executorch::runtime::etensor::internal::f32_from_bits; +using executorch::runtime::etensor::internal::round_to_nearest_even; namespace { float float_from_bytes(uint32_t sign, uint32_t exponent, uint32_t fraction) { @@ -21,6 +31,13 @@ float float_from_bytes(uint32_t sign, uint32_t exponent, uint32_t fraction) { return res; } +// Opposite of f32_from_bits. 
+uint16_t bits_from_f32(float src) { + uint32_t res = 0; + std::memcpy(&res, &src, sizeof(res)); + return res >> 16; +} + TEST(BFloat16Conversion, FloatToBFloat16AndBack) { // NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays,cppcoreguidelines-avoid-magic-numbers,modernize-avoid-c-arrays) float in[100]; @@ -35,8 +52,8 @@ TEST(BFloat16Conversion, FloatToBFloat16AndBack) { float out[100]; for (int i = 0; i < 100; ++i) { - bfloats[i].x = torch::executor::internal::bits_from_f32(in[i]); - out[i] = torch::executor::internal::f32_from_bits(bfloats[i].x); + bfloats[i].x = bits_from_f32(in[i]); + out[i] = f32_from_bits(bfloats[i].x); // The relative error should be less than 1/(2^7) since BFloat16 // has 7 bits mantissa. @@ -58,8 +75,8 @@ TEST(BFloat16Conversion, FloatToBFloat16RNEAndBack) { float out[100]; for (int i = 0; i < 100; ++i) { - bfloats[i].x = torch::executor::internal::round_to_nearest_even(in[i]); - out[i] = torch::executor::internal::f32_from_bits(bfloats[i].x); + bfloats[i].x = round_to_nearest_even(in[i]); + out[i] = f32_from_bits(bfloats[i].x); // The relative error should be less than 1/(2^7) since BFloat16 // has 7 bits mantissa. @@ -72,7 +89,7 @@ TEST(BFloat16Conversion, NaN) { EXPECT_TRUE(std::isnan(inNaN)); BFloat16 a = BFloat16(inNaN); - float out = torch::executor::internal::f32_from_bits(a.x); + float out = f32_from_bits(a.x); EXPECT_TRUE(std::isnan(out)); } @@ -82,7 +99,7 @@ TEST(BFloat16Conversion, Inf) { EXPECT_TRUE(std::isinf(inInf)); BFloat16 a = BFloat16(inInf); - float out = torch::executor::internal::f32_from_bits(a.x); + float out = f32_from_bits(a.x); EXPECT_TRUE(std::isinf(out)); } @@ -91,7 +108,7 @@ TEST(BFloat16Conversion, SmallestDenormal) { float in = std::numeric_limits::denorm_min(); // The smallest non-zero // subnormal number BFloat16 a = BFloat16(in); - float out = torch::executor::internal::f32_from_bits(a.x); + float out = f32_from_bits(a.x); EXPECT_FLOAT_EQ(in, out); } @@ -112,10 +129,10 @@ TEST(BFloat16Math, Addition) { // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init) BFloat16 b; - b.x = torch::executor::internal::bits_from_f32(input); + b.x = bits_from_f32(input); b = b + b; - float res = torch::executor::internal::f32_from_bits(b.x); + float res = f32_from_bits(b.x); EXPECT_EQ(res, expected); } @@ -135,10 +152,10 @@ TEST(BFloat16Math, Subtraction) { // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init) BFloat16 b; - b.x = torch::executor::internal::bits_from_f32(input); + b.x = bits_from_f32(input); b = b - 5; - float res = torch::executor::internal::f32_from_bits(b.x); + float res = f32_from_bits(b.x); EXPECT_EQ(res, expected); } @@ -174,7 +191,7 @@ class BFloat16Test : public ::testing::Test, TEST_P(BFloat16Test, BFloat16RNETest) { float value = BinaryToFloat(GetParam().input); - uint16_t rounded = torch::executor::internal::round_to_nearest_even(value); + uint16_t rounded = round_to_nearest_even(value); EXPECT_EQ(GetParam().rne, rounded); } diff --git a/runtime/core/portable_type/test/half_test.cpp b/runtime/core/portable_type/test/half_test.cpp index 18ab6cb4b22..0d5dca0e958 100644 --- a/runtime/core/portable_type/test/half_test.cpp +++ b/runtime/core/portable_type/test/half_test.cpp @@ -11,8 +11,8 @@ #include #include -namespace torch { -namespace executor { +using executorch::runtime::etensor::Half; + namespace { /** @@ -211,6 +211,3 @@ TEST(HalfTest, ArithmeticInt64Div) { EXPECT_TRUE(closeEnoughFloat16(ah / b, af / b)); EXPECT_TRUE(closeEnoughFloat16(b / ah, b / af)); } - -} // namespace executor -} // namespace torch diff --git 
a/runtime/core/portable_type/test/optional_test.cpp b/runtime/core/portable_type/test/optional_test.cpp index 11241aedbb1..fe27186bbf2 100644 --- a/runtime/core/portable_type/test/optional_test.cpp +++ b/runtime/core/portable_type/test/optional_test.cpp @@ -15,9 +15,8 @@ #include using namespace ::testing; - -namespace torch { -namespace executor { +using executorch::runtime::etensor::nullopt; +using executorch::runtime::etensor::optional; // Test that optional::value_type matches the template parameter type. static_assert( @@ -141,6 +140,3 @@ TEST(TestOptional, ImplicitReturnOfNullopt) { auto o = function_returning_nullopt(); EXPECT_FALSE(o.has_value()); } - -} // namespace executor -} // namespace torch diff --git a/runtime/core/portable_type/test/scalar_test.cpp b/runtime/core/portable_type/test/scalar_test.cpp index 4736d3c2a9e..fd211f916c3 100644 --- a/runtime/core/portable_type/test/scalar_test.cpp +++ b/runtime/core/portable_type/test/scalar_test.cpp @@ -10,8 +10,7 @@ #include #include -namespace torch { -namespace executor { +using executorch::runtime::etensor::Scalar; TEST(ScalarTest, ToScalarType) { Scalar s_d((double)3.141); @@ -46,5 +45,3 @@ TEST(ScalarTest, IntConstructor) { EXPECT_EQ(s_int.to(), s_int32.to()); EXPECT_EQ(s_int32.to(), s_int64.to()); } -} // namespace executor -} // namespace torch diff --git a/runtime/core/portable_type/test/tensor_impl_test.cpp b/runtime/core/portable_type/test/tensor_impl_test.cpp index 77dd01ea23f..bd5f82c5d1f 100644 --- a/runtime/core/portable_type/test/tensor_impl_test.cpp +++ b/runtime/core/portable_type/test/tensor_impl_test.cpp @@ -17,20 +17,22 @@ using namespace ::testing; -namespace torch { -namespace executor { - +using executorch::runtime::ArrayRef; +using executorch::runtime::Error; +using executorch::runtime::TensorShapeDynamism; +using executorch::runtime::etensor::ScalarType; +using executorch::runtime::etensor::TensorImpl; using SizesType = TensorImpl::SizesType; using DimOrderType = TensorImpl::DimOrderType; using StridesType = TensorImpl::StridesType; -using torch::executor::internal::resize_tensor_impl; +using executorch::runtime::internal::resize_tensor_impl; class TensorImplTest : public ::testing::Test { protected: void SetUp() override { // Since these tests cause ET_LOG to be called, the PAL must be initialized // first. - runtime_init(); + executorch::runtime::runtime_init(); } }; @@ -446,6 +448,3 @@ TEST_F(TensorImplTest, TestResizingTensorToZeroAndBack) { EXPECT_GT(t.numel(), 0); EXPECT_EQ(t.data(), data); } - -} // namespace executor -} // namespace torch diff --git a/runtime/core/portable_type/test/tensor_test.cpp b/runtime/core/portable_type/test/tensor_test.cpp index 7a772cd0769..714cdc25661 100644 --- a/runtime/core/portable_type/test/tensor_test.cpp +++ b/runtime/core/portable_type/test/tensor_test.cpp @@ -13,15 +13,16 @@ #include #include -namespace torch { -namespace executor { +using executorch::runtime::etensor::ScalarType; +using executorch::runtime::etensor::Tensor; +using executorch::runtime::etensor::TensorImpl; class TensorTest : public ::testing::Test { protected: void SetUp() override { // Since these tests cause ET_LOG to be called, the PAL must be initialized // first. 
- runtime_init(); + executorch::runtime::runtime_init(); } }; @@ -77,6 +78,3 @@ TEST_F(TensorTest, ModifyDataOfConstTensor) { EXPECT_EQ(a.scalar_type(), ScalarType::Int); EXPECT_EQ(a.const_data_ptr()[0], 0); } - -} // namespace executor -} // namespace torch diff --git a/runtime/core/targets.bzl b/runtime/core/targets.bzl index 4978e73169a..7e0aeb5d28c 100644 --- a/runtime/core/targets.bzl +++ b/runtime/core/targets.bzl @@ -15,7 +15,7 @@ def build_sdk(): def get_sdk_flags(): sdk_flags = [] if build_sdk(): - sdk_flags += ["-DEXECUTORCH_BUILD_SDK"] + sdk_flags += ["-DEXECUTORCH_BUILD_DEVTOOLS"] return sdk_flags def define_common_targets(): diff --git a/runtime/core/test/evalue_test.cpp b/runtime/core/test/evalue_test.cpp index 4c08695dc4b..a7eb5c185a7 100644 --- a/runtime/core/test/evalue_test.cpp +++ b/runtime/core/test/evalue_test.cpp @@ -16,9 +16,6 @@ using namespace ::testing; -namespace torch { -namespace executor { - using exec_aten::ScalarType; using executorch::runtime::BoxedEvalueList; using executorch::runtime::EValue; @@ -30,7 +27,7 @@ class EValueTest : public ::testing::Test { void SetUp() override { // Since these tests cause ET_LOG to be called, the PAL must be initialized // first. - runtime_init(); + executorch::runtime::runtime_init(); } }; @@ -276,6 +273,3 @@ TEST_F(EValueTest, ConstructFromNullPtrAborts) { ET_EXPECT_DEATH({ EValue evalue(null_ptr); }, ""); } - -} // namespace executor -} // namespace torch diff --git a/runtime/core/test/event_tracer_test.cpp b/runtime/core/test/event_tracer_test.cpp index 29081c337fd..6422f9b668e 100644 --- a/runtime/core/test/event_tracer_test.cpp +++ b/runtime/core/test/event_tracer_test.cpp @@ -44,14 +44,16 @@ class DummyEventTracer : public EventTracer { const char* name, ChainID chain_id = kUnsetChainId, DebugHandle debug_handle = kUnsetDebugHandle) override { - (void)name; (void)chain_id; (void)debug_handle; + ET_CHECK(strlen(name) + 1 < sizeof(event_name_)); + memcpy(event_name_, name, strlen(name) + 1); return EventTracerEntry(); } void end_profiling(EventTracerEntry prof_entry) override { (void)prof_entry; + memset(event_name_, 0, sizeof(event_name_)); return; } @@ -156,6 +158,10 @@ class DummyEventTracer : public EventTracer { return logged_evalue_type_; } + char* get_event_name() { + return event_name_; + } + void reset_logged_value() { logged_evalue_ = EValue(false); } @@ -163,6 +169,7 @@ class DummyEventTracer : public EventTracer { private: EValue logged_evalue_ = EValue(false); LoggedEValueType logged_evalue_type_; + char event_name_[1024]; }; /** @@ -175,7 +182,7 @@ void RunSimpleTracerTest(EventTracer* event_tracer) { using executorch::runtime::internal::event_tracer_track_allocation; using executorch::runtime::internal::event_tracer_track_allocator; using executorch::runtime::internal::EventTracerProfileInstructionScope; - using executorch::runtime::internal::EventTracerProfileScope; + using executorch::runtime::internal::EventTracerProfileMethodScope; event_tracer_create_event_block(event_tracer, "ExampleEvent"); event_tracer_create_event_block(event_tracer, "ExampleEvent"); @@ -183,7 +190,7 @@ void RunSimpleTracerTest(EventTracer* event_tracer) { event_tracer_begin_profiling_event(event_tracer, "ExampleEvent"); event_tracer_end_profiling_event(event_tracer, event_entry); { - EventTracerProfileScope event_tracer_profile_scope( + EventTracerProfileMethodScope event_tracer_profile_scope( event_tracer, "ExampleScope"); } { @@ -282,3 +289,39 @@ TEST(TestEventTracer, SimpleEventTracerTestLogging) { // TODO(T163645377): Add 
more test coverage to log and verify events passed into // DummyTracer. +TEST(TestEventTracer, EventTracerProfileOpControl) { + DummyEventTracer dummy; + // Op profiling is enabled by default. Test that it works. + { + { + executorch::runtime::internal::EventTracerProfileOpScope + event_tracer_op_scope(&dummy, "ExampleOpScope"); + EXPECT_EQ(strcmp(dummy.get_event_name(), "ExampleOpScope"), 0); + } + EXPECT_EQ(strcmp(dummy.get_event_name(), ""), 0); + + // Normal profiling should still work. + { + executorch::runtime::internal::EventTracerProfileMethodScope + event_tracer_profiler_scope(&dummy, "ExampleProfilerScope"); + EXPECT_EQ(strcmp(dummy.get_event_name(), "ExampleProfilerScope"), 0); + } + + dummy.set_event_tracer_profiling_level( + executorch::runtime::EventTracerProfilingLevel::kProfileMethodOnly); + + // Op profiling should be disabled now. + { + executorch::runtime::internal::EventTracerProfileOpScope + event_tracer_op_scope(&dummy, "ExampleOpScope"); + EXPECT_EQ(strcmp(dummy.get_event_name(), ""), 0); + } + + // Normal profiling should still work. + { + executorch::runtime::internal::EventTracerProfileMethodScope + event_tracer_profiler_scope(&dummy, "1ExampleProfilerScope"); + EXPECT_EQ(strcmp(dummy.get_event_name(), "1ExampleProfilerScope"), 0); + } + } +} diff --git a/runtime/executor/method.cpp b/runtime/executor/method.cpp index a6ed7e354a9..0838529bc51 100644 --- a/runtime/executor/method.cpp +++ b/runtime/executor/method.cpp @@ -572,8 +572,8 @@ Result Method::load( Error Method::init(executorch_flatbuffer::ExecutionPlan* s_plan) { EXECUTORCH_SCOPE_PROF("Method::init"); - internal::EventTracerProfileScope event_tracer_profile_scope = - internal::EventTracerProfileScope(event_tracer_, "Method::init"); + internal::EventTracerProfileMethodScope event_tracer_profile_scope = + internal::EventTracerProfileMethodScope(event_tracer_, "Method::init"); ET_CHECK_OR_RETURN_ERROR( // Don't use !initialized() here because we also want to fail on the // InitializationFailed state. @@ -744,40 +744,6 @@ Error Method::init(executorch_flatbuffer::ExecutionPlan* s_plan) { } } - // Validate input values and get tensor pre-allocation info. - pre_allocated_input_ = false; - for (int i = 0; i < inputs_size(); i++) { - // get_input() will panic if the index is invalid, so do this manually. - size_t index = get_input_index(i); - ET_CHECK_OR_RETURN_ERROR( - index < n_value_, - InvalidProgram, - "Input index %zu >= %zu", - index, - n_value_); - const EValue& input = values_[index]; - if (input.isTensor()) { - pre_allocated_input_ |= input.toTensor().const_data_ptr() != nullptr; - } - } - - // Validate output values and get tensor pre-allocation info. - pre_allocated_output_ = false; - for (int i = 0; i < outputs_size(); i++) { - // get_output() will panic if the index is invalid, so do this manually. 
- size_t index = get_output_index(i); - ET_CHECK_OR_RETURN_ERROR( - index < n_value_, - InvalidProgram, - "output index %zu >= %zu", - index, - n_value_); - const EValue& output = values_[index]; - if (output.isTensor()) { - pre_allocated_output_ |= output.toTensor().const_data_ptr() != nullptr; - } - } - step_state_ = StepState{0, 0}; init_state_ = InitializationState::Initialized; @@ -841,7 +807,8 @@ Method::set_input(const EValue& input_evalue, size_t input_idx) { input_idx, static_cast(err)); Error error; - if (pre_allocated_input_) { + auto tensor_meta = this->method_meta().input_tensor_meta(input_idx); + if (tensor_meta->is_memory_planned()) { error = internal::copy_tensor_data(t_dst, t_src); } else { error = internal::share_tensor_data(t_dst, t_src); @@ -950,21 +917,11 @@ Method::set_output_data_ptr(void* buffer, size_t size, size_t output_idx) { InvalidState, "Outputs can not be retrieved until method has been initialized."); - // ET_CHECK_OR_RETURN_ERROR( - // !pre_allocated_output_, - // InvalidState, - // "Overriding output data pointer allocated by memory plan is not - // allowed."); - // TODO(T188740925): for now, return error without logs. - if (pre_allocated_output_) { - return Error::InvalidState; - } - // Check the args ET_CHECK_OR_RETURN_ERROR( - output_idx <= outputs_size(), + output_idx < outputs_size(), InvalidArgument, - "output_idx: %zu num_outputs: %zu", + "output_idx: %zu > num_outputs: %zu", output_idx, outputs_size()); @@ -975,6 +932,16 @@ Method::set_output_data_ptr(void* buffer, size_t size, size_t output_idx) { "output type: %zu is not tensor", (size_t)output.tag); + auto tensor_meta = this->method_meta().output_tensor_meta(output_idx); + if (tensor_meta->is_memory_planned()) { + ET_LOG( + Error, + "Output %zu is memory planned, or is a constant. Cannot override \ + the existing data pointer.", + output_idx); + return Error::InvalidState; + } + auto& t = output.toTensor(); ET_CHECK_OR_RETURN_ERROR( output.isTensor(), @@ -1055,8 +1022,8 @@ Error Method::execute_instruction() { switch (instruction->instr_args_type()) { case executorch_flatbuffer::InstructionArguments::KernelCall: { EXECUTORCH_SCOPE_PROF("OPERATOR_CALL"); - internal::EventTracerProfileScope event_tracer_scope = - internal::EventTracerProfileScope(event_tracer_, "OPERATOR_CALL"); + internal::EventTracerProfileOpScope event_tracer_op_scope = + internal::EventTracerProfileOpScope(event_tracer_, "OPERATOR_CALL"); // TODO(T147221312): Also expose tensor resizer via the context. KernelRuntimeContext context(event_tracer_, temp_allocator_); auto args = chain.argument_lists_[step_state_.instr_idx]; @@ -1090,8 +1057,8 @@ Error Method::execute_instruction() { } break; case executorch_flatbuffer::InstructionArguments::DelegateCall: { EXECUTORCH_SCOPE_PROF("DELEGATE_CALL"); - internal::EventTracerProfileScope event_tracer_profile_scope = - internal::EventTracerProfileScope(event_tracer_, "DELEGATE_CALL"); + internal::EventTracerProfileOpScope event_tracer_op_scope = + internal::EventTracerProfileOpScope(event_tracer_, "DELEGATE_CALL"); // We know that instr_args_as_DelegateCall is non-null because it was // checked at init time. 
auto delegate_idx = @@ -1134,8 +1101,8 @@ Error Method::execute_instruction() { } break; case executorch_flatbuffer::InstructionArguments::JumpFalseCall: { EXECUTORCH_SCOPE_PROF("JF_CALL"); - internal::EventTracerProfileScope event_tracer_profile_scope = - internal::EventTracerProfileScope(event_tracer_, "JF_CALL"); + internal::EventTracerProfileOpScope event_tracer_op_scope = + internal::EventTracerProfileOpScope(event_tracer_, "JF_CALL"); // We know that instr_args_as_JumpFalseCall is non-null because it was // checked at init time. auto jf_call = instruction->instr_args_as_JumpFalseCall(); @@ -1153,8 +1120,8 @@ Error Method::execute_instruction() { } break; case executorch_flatbuffer::InstructionArguments::MoveCall: { EXECUTORCH_SCOPE_PROF("MOVE_CALL"); - internal::EventTracerProfileScope event_tracer_profile_scope = - internal::EventTracerProfileScope(event_tracer_, "MOVE_CALL"); + internal::EventTracerProfileOpScope event_tracer_op_scope = + internal::EventTracerProfileOpScope(event_tracer_, "MOVE_CALL"); // We know that instr_args_as_MoveCall is non-null because it was checked // at init time. auto move_call = instruction->instr_args_as_MoveCall(); @@ -1162,8 +1129,8 @@ Error Method::execute_instruction() { } break; case executorch_flatbuffer::InstructionArguments::FreeCall: { EXECUTORCH_SCOPE_PROF("FREE_CALL"); - internal::EventTracerProfileScope event_tracer_profile_scope = - internal::EventTracerProfileScope(event_tracer_, "FREE_CALL"); + internal::EventTracerProfileOpScope event_tracer_op_scope = + internal::EventTracerProfileOpScope(event_tracer_, "FREE_CALL"); // We know that instr_args_as_FreeCall is non-null because it was checked // at init time. auto free_call = instruction->instr_args_as_FreeCall(); @@ -1224,8 +1191,8 @@ Error Method::step() { static_cast(step_state_.chain_idx), static_cast(step_state_.instr_idx)); EXECUTORCH_SCOPE_PROF("Method::step"); - internal::EventTracerProfileScope event_tracer_profile_scope = - internal::EventTracerProfileScope(event_tracer_, "Method::step"); + internal::EventTracerProfileMethodScope event_tracer_profile_scope = + internal::EventTracerProfileMethodScope(event_tracer_, "Method::step"); ET_CHECK_OR_RETURN_ERROR( initialized(), InvalidState, @@ -1266,8 +1233,8 @@ Error Method::experimental_step() { Error Method::execute() { internal::event_tracer_create_event_block(event_tracer_, "Execute"); - internal::EventTracerProfileScope event_tracer_profile_scope = - internal::EventTracerProfileScope(event_tracer_, "Method::execute"); + internal::EventTracerProfileMethodScope event_tracer_profile_scope = + internal::EventTracerProfileMethodScope(event_tracer_, "Method::execute"); EXECUTORCH_SCOPE_PROF("Method::execute"); ET_CHECK_OR_RETURN_ERROR( initialized(), diff --git a/runtime/executor/method.h b/runtime/executor/method.h index 0a35d6b9282..66e3c96d292 100644 --- a/runtime/executor/method.h +++ b/runtime/executor/method.h @@ -62,9 +62,7 @@ class Method final { delegates_(rhs.delegates_), n_chains_(rhs.n_chains_), chains_(rhs.chains_), - init_state_(rhs.init_state_), - pre_allocated_input_(rhs.pre_allocated_input_), - pre_allocated_output_(rhs.pre_allocated_output_) { + init_state_(rhs.init_state_) { // Required: clear out fields that the dtor looks at, so that we don't free // anything twice. 
rhs.n_value_ = 0; @@ -82,8 +80,6 @@ class Method final { rhs.event_tracer_ = nullptr; rhs.n_chains_ = 0; rhs.chains_ = nullptr; - rhs.pre_allocated_input_ = false; - rhs.pre_allocated_output_ = false; } /** @@ -288,9 +284,7 @@ class Method final { delegates_(nullptr), n_chains_(0), chains_(nullptr), - init_state_(InitializationState::Uninitialized), - pre_allocated_input_(false), - pre_allocated_output_(false) {} + init_state_(InitializationState::Uninitialized) {} /// Static factory used by Program. ET_NODISCARD static Result load( @@ -336,8 +330,6 @@ class Method final { Chain* chains_; InitializationState init_state_; - bool pre_allocated_input_; - bool pre_allocated_output_; /** * Parses the elements of the values_ array. On error, n_value_ will be set to diff --git a/runtime/executor/method_meta.cpp b/runtime/executor/method_meta.cpp index 309ecf0ec85..5acf055a89f 100644 --- a/runtime/executor/method_meta.cpp +++ b/runtime/executor/method_meta.cpp @@ -139,7 +139,9 @@ Result MethodMeta::input_tensor_meta(size_t index) const { Span( tensor_value->dim_order()->data(), tensor_value->dim_order()->size()), static_cast(tensor_value->scalar_type()), - tensor_value->allocation_info() != nullptr); + tensor_value->allocation_info() != nullptr || + tensor_value->data_buffer_idx() != + 0); // Count constant returns as memory planned. } size_t MethodMeta::num_outputs() const { @@ -170,15 +172,18 @@ Result MethodMeta::output_tensor_meta(size_t index) const { "Tag: %zu output: %zu is not Tensor", (size_t)tag.get(), index); - auto input_index = s_plan_->outputs()->Get(index); - auto tensor_value = s_plan_->values()->Get(input_index)->val_as_Tensor(); + auto output_index = s_plan_->outputs()->Get(index); + auto tensor_value = s_plan_->values()->Get(output_index)->val_as_Tensor(); + return TensorInfo( Span( tensor_value->sizes()->data(), tensor_value->sizes()->size()), Span( tensor_value->dim_order()->data(), tensor_value->dim_order()->size()), static_cast(tensor_value->scalar_type()), - tensor_value->allocation_info() != nullptr); + tensor_value->allocation_info() != nullptr || + tensor_value->data_buffer_idx() != + 0); // Count constant returns as memory planned. } size_t MethodMeta::num_memory_planned_buffers() const { diff --git a/runtime/executor/program.cpp b/runtime/executor/program.cpp index 48d36602d33..0a15b0d0f7f 100644 --- a/runtime/executor/program.cpp +++ b/runtime/executor/program.cpp @@ -241,8 +241,9 @@ Result Program::load_method( EventTracer* event_tracer) const { EXECUTORCH_SCOPE_PROF("Program::load_method"); internal::event_tracer_create_event_block(event_tracer, "Default"); - internal::EventTracerProfileScope event_tracer_scope = - internal::EventTracerProfileScope(event_tracer, "Program::load_method"); + internal::EventTracerProfileMethodScope event_tracer_scope = + internal::EventTracerProfileMethodScope( + event_tracer, "Program::load_method"); // If we can't create a MethodMeta for the Method, the Method is corrupt; // Method::method_meta() assumes success, so we must fail here. 
Result meta = method_meta(method_name); diff --git a/runtime/kernel/kernel_runtime_context.h b/runtime/kernel/kernel_runtime_context.h index 213cb45ba1e..96ad3d51e36 100644 --- a/runtime/kernel/kernel_runtime_context.h +++ b/runtime/kernel/kernel_runtime_context.h @@ -110,21 +110,22 @@ class KernelRuntimeContext { } // namespace runtime } // namespace executorch -namespace torch { -namespace executor { // TODO(T197294990): Remove these deprecated aliases once all users have moved // to the new `::executorch` namespaces. -using ::executorch::runtime::KernelRuntimeContext; -} // namespace executor -} // namespace torch - -// TODO(T147221312): Remove these aliases once all code uses -// KernelRuntimeContext. -namespace exec_aten { -using RuntimeContext = ::executorch::runtime::KernelRuntimeContext; -} // namespace exec_aten namespace torch { namespace executor { +/// DEPRECATED: Use ::executorch::runtime::KernelRuntimeContext instead. +using ::executorch::runtime::KernelRuntimeContext; +/// DEPRECATED: Use ::executorch::runtime::KernelRuntimeContext instead. using RuntimeContext = ::executorch::runtime::KernelRuntimeContext; } // namespace executor } // namespace torch +namespace executorch { +namespace aten { +/// DEPRECATED: Use ::executorch::runtime::KernelRuntimeContext instead. +using RuntimeContext = ::executorch::runtime::KernelRuntimeContext; +} // namespace aten +} // namespace executorch +// DEPRECATED: The exec_aten:: namespace is deprecated. Use executorch::aten:: +// instead. +namespace exec_aten = ::executorch::aten; diff --git a/setup.py b/setup.py index f6adb4f86c3..49a208233ce 100644 --- a/setup.py +++ b/setup.py @@ -423,7 +423,31 @@ def run(self): "devtools/bundled_program/schema/scalar_type.fbs", "devtools/bundled_program/serialize/scalar_type.fbs", ), + # Install executorch-config.cmake to the root of the package. + ( + "build/executorch-config.cmake", + "executorch-config.cmake", + ), ] + # Copy all the necessary headers into include/executorch/ so that they can + # be found in the pip package. This is the subset of headers that are + # essential for building custom ops extensions. + # TODO: Use cmake to gather the headers instead of hard-coding them here. 
+ # For example: https://discourse.cmake.org/t/installing-headers-the-modern- + # way-regurgitated-and-revisited/3238/3 + for include_dir in [ + "runtime/core/", + "runtime/kernel/", + "runtime/platform/", + "extension/kernel_util/", + "extension/tensor/", + "extension/threadpool/", + ]: + src_list = Path(include_dir).rglob("*.h") + for src in src_list: + src_to_dst.append( + (str(src), os.path.join("include/executorch", str(src))) + ) for src, dst in src_to_dst: dst = os.path.join(dst_root, dst) diff --git a/shim/BUCK b/shim/BUCK index 56fe035920b..365a7bc0765 100644 --- a/shim/BUCK +++ b/shim/BUCK @@ -1,3 +1,4 @@ +load("@prelude//platforms:defs.bzl", "execution_platform") load("@prelude//toolchains:cxx.bzl", "system_cxx_toolchain") load("@prelude//toolchains:genrule.bzl", "system_genrule_toolchain") load("@prelude//toolchains:go.bzl", "system_go_toolchain") @@ -55,3 +56,21 @@ remote_test_execution_toolchain( name = "remote_test_execution", visibility = ["PUBLIC"], ) + +execution_platform( + name = "android-arm64", + cpu_configuration = "prelude//cpu:arm64", + os_configuration = "prelude//os:android", + # REVIEW: not sure if this is correct + use_windows_path_separators = host_info().os.is_windows, + visibility = ["PUBLIC"], +) + +execution_platform( + name = "android-x86_64", + cpu_configuration = "prelude//cpu:x86_64", + os_configuration = "prelude//os:android", + # REVIEW: not sure if this is correct + use_windows_path_separators = host_info().os.is_windows, + visibility = ["PUBLIC"], +) diff --git a/shim/xplat/executorch/backends/xnnpack/third-party/third_party_libs.bzl b/shim/xplat/executorch/backends/xnnpack/third-party/third_party_libs.bzl index 75501ef5b2b..3c6f79c8a95 100644 --- a/shim/xplat/executorch/backends/xnnpack/third-party/third_party_libs.bzl +++ b/shim/xplat/executorch/backends/xnnpack/third-party/third_party_libs.bzl @@ -4,7 +4,7 @@ load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") # the values of the dictionary are lists where the first element is the internal dep and the # second element is the OSS dep _THIRD_PARTY_LIBS = { - "FP16": ["//xplat/third-party/FP16:FP16", "//backends/xnnpack/third-party:FP16"], + "FP16": ["fbsource//xplat/third-party/FP16:FP16Fbcode", "//backends/xnnpack/third-party:FP16"], "FXdiv": ["//xplat/third-party/FXdiv:FXdiv", "//backends/xnnpack/third-party:FXdiv"], "XNNPACK": ["//xplat/third-party/XNNPACK:XNNPACK", "//backends/xnnpack/third-party:XNNPACK"], "clog": ["//xplat/third-party/clog:clog", "//backends/xnnpack/third-party:clog"], diff --git a/shim/xplat/executorch/build/env_interface.bzl b/shim/xplat/executorch/build/env_interface.bzl index 5b0acd36dab..c4111c744b8 100644 --- a/shim/xplat/executorch/build/env_interface.bzl +++ b/shim/xplat/executorch/build/env_interface.bzl @@ -118,7 +118,8 @@ def _remove_platform_specific_args(kwargs): """ keys = [] for key in kwargs: - if key.endswith("_platform_preprocessor_flags") or key.endswith("_platform_deps") or key.startswith("fbobjc"): + if (key.endswith("_platform_preprocessor_flags") or key.endswith("_platform_deps") or + key.startswith("fbobjc") or key.endswith("_platform_compiler_flags")): keys.append(key) for key in keys: kwargs.pop(key) @@ -200,6 +201,8 @@ def _struct_to_json(object): return native.json.encode(object) env = struct( + # @lint-ignore BUCKLINT: native and fb_native are explicitly forbidden in fbcode. + command_alias = native.command_alias, # @lint-ignore BUCKLINT: native and fb_native are explicitly forbidden in fbcode. 
cxx_binary = native.cxx_binary, # @lint-ignore BUCKLINT: native and fb_native are explicitly forbidden in fbcode. diff --git a/shim/xplat/executorch/build/runtime_wrapper.bzl b/shim/xplat/executorch/build/runtime_wrapper.bzl index 6b0ada353fa..ea5b2eee1fc 100644 --- a/shim/xplat/executorch/build/runtime_wrapper.bzl +++ b/shim/xplat/executorch/build/runtime_wrapper.bzl @@ -301,6 +301,10 @@ def _filegroup(*args, **kwargs): _patch_kwargs_common(kwargs) env.filegroup(*args, **kwargs) +def _command_alias(*args, **kwargs): + _patch_kwargs_common(kwargs) + env.command_alias(*args, **kwargs) + def _genrule(*args, **kwargs): _patch_kwargs_common(kwargs) env.patch_platforms(kwargs) @@ -345,6 +349,7 @@ def get_oss_build_kwargs(): # see the "Build Rules" section in the sidebar of # https://buck.build/concept/build_rule.html. runtime = struct( + command_alias = _command_alias, cxx_binary = _cxx_binary, cxx_library = _cxx_library, cxx_python_extension = _cxx_python_extension, diff --git a/shim/xplat/executorch/codegen/codegen.bzl b/shim/xplat/executorch/codegen/codegen.bzl index 34a8f81e874..312cc1edf9e 100644 --- a/shim/xplat/executorch/codegen/codegen.bzl +++ b/shim/xplat/executorch/codegen/codegen.bzl @@ -49,32 +49,43 @@ def et_operator_library( model = None, include_all_operators = False, ops_schema_yaml_target = None, + server_generated_yaml_target = None, **kwargs): - genrule_cmd = [ - "$(exe //executorch/codegen/tools:gen_oplist)", - "--output_path=${OUT}", - ] - if ops_schema_yaml_target: - genrule_cmd.append( - "--ops_schema_yaml_path=$(location {})".format(ops_schema_yaml_target), - ) - if ops: - genrule_cmd.append( - "--root_ops=" + ",".join(ops), - ) - if ops_dict: - ops_dict_json = struct_to_json(ops_dict) - genrule_cmd.append( - "--ops_dict='{}'".format(ops_dict_json), - ) - if model: - genrule_cmd.append( - "--model_file_path=$(location {})".format(model), - ) - if include_all_operators: - genrule_cmd.append( - "--include_all_operators", - ) + # do a dummy copy if server_generated_yaml_target is set + if server_generated_yaml_target: + if include_all_operators or ops_schema_yaml_target or model or ops or ops_dict: + fail("Since server_generated_yaml_target is set, ops, ops_dict, include_all_operators and ops_schema_yaml_target shouldn't be set.") + genrule_cmd = [ + "cp", + "$(location {})".format(server_generated_yaml_target), + "$OUT", + ] + else: + genrule_cmd = [ + "$(exe //executorch/codegen/tools:gen_oplist)", + "--output_path=${OUT}", + ] + if ops_schema_yaml_target: + genrule_cmd.append( + "--ops_schema_yaml_path=$(location {})".format(ops_schema_yaml_target), + ) + if ops: + genrule_cmd.append( + "--root_ops=" + ",".join(ops), + ) + if ops_dict: + ops_dict_json = struct_to_json(ops_dict) + genrule_cmd.append( + "--ops_dict='{}'".format(ops_dict_json), + ) + if model: + genrule_cmd.append( + "--model_file_path=$(location {})".format(model), + ) + if include_all_operators: + genrule_cmd.append( + "--include_all_operators", + ) # TODO(larryliu0820): Remove usages of this flag. 
if "define_static_targets" in kwargs: diff --git a/shim/xplat/executorch/extension/pybindings/pybindings.bzl b/shim/xplat/executorch/extension/pybindings/pybindings.bzl index 5ef9fe59266..52191eb978a 100644 --- a/shim/xplat/executorch/extension/pybindings/pybindings.bzl +++ b/shim/xplat/executorch/extension/pybindings/pybindings.bzl @@ -52,6 +52,7 @@ def executorch_pybindings(python_module_name, srcs = [], cppdeps = [], visibilit "-DEXECUTORCH_PYTHON_MODULE_NAME={}".format(python_module_name), ], deps = [ + "//executorch/exir:_warnings", "//executorch/runtime/core:core", ] + cppdeps, external_deps = [ diff --git a/test/end2end/exported_module.py b/test/end2end/exported_module.py index 6e6b97b7186..2365450ae59 100644 --- a/test/end2end/exported_module.py +++ b/test/end2end/exported_module.py @@ -147,7 +147,7 @@ def return_wrapper(): for method in methods: method_name_to_dynamic_shapes[method] = trace_dynamic_shapes - memory_planning_pass = MemoryPlanningPass("greedy") + memory_planning_pass = MemoryPlanningPass() if hasattr(eager_module, "get_memory_planning_pass"): memory_planning_pass = eager_module.get_memory_planning_pass() diff --git a/test/models/deprecated/README.md b/test/models/deprecated/README.md index f1d47d03264..44ed9647735 100644 --- a/test/models/deprecated/README.md +++ b/test/models/deprecated/README.md @@ -3,7 +3,7 @@ This readme documents deprecated models that remain compatible with versions of the ExecuTorch runtime. ModuleLinear-no-constant-segment.pte -- This file contains constants stored in the constant_buffer, which was deprecated in D61996249 on 2024-09-05. Now, constants are stored in a separate segment. +- This file contains constants stored in the constant_buffer, which was deprecated in D61996249, [#5096](https://github.com/pytorch/executorch/pull/5096) on 2024-09-06. Now, constants are stored in a separate segment. 
- This .pte file was generated internally using hg commit hash rFBS5e49dc0319b1d2d9969bbcef92857ab76a899c34, with command: ``` buck2 build fbcode//executorch/test/models:exported_programs[ModuleLinear-no-constant-segment.pte] --show-output diff --git a/test/models/export_program.py b/test/models/export_program.py index d753475b829..caea394f33c 100644 --- a/test/models/export_program.py +++ b/test/models/export_program.py @@ -121,7 +121,6 @@ def get_dynamic_shapes(self): def get_memory_planning_pass(self): return MemoryPlanningPass( - memory_planning_algo="greedy", alloc_graph_input=False, alloc_graph_output=False, ) diff --git a/test/run_oss_cpp_tests.sh b/test/run_oss_cpp_tests.sh index 078196bfc1e..e771fd4b12e 100755 --- a/test/run_oss_cpp_tests.sh +++ b/test/run_oss_cpp_tests.sh @@ -36,7 +36,7 @@ build_executorch() { -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ -DEXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL=ON \ -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ - -DEXECUTORCH_BUILD_SDK=ON \ + -DEXECUTORCH_BUILD_DEVTOOLS=ON \ -DEXECUTORCH_BUILD_VULKAN=$BUILD_VULKAN \ -DEXECUTORCH_BUILD_XNNPACK=ON \ -Bcmake-out diff --git a/test/utils/OSSTestConfig.json b/test/utils/OSSTestConfig.json index dca2a7bbbce..d2b0ff35758 100644 --- a/test/utils/OSSTestConfig.json +++ b/test/utils/OSSTestConfig.json @@ -64,7 +64,6 @@ { "directory": "extension/tensor/test", "sources": [ - "tensor_impl_ptr_test.cpp", "tensor_ptr_maker_test.cpp", "tensor_ptr_test.cpp" ], diff --git a/test/utils/alignment.h b/test/utils/alignment.h index f8c9b47362a..dd3378821fd 100644 --- a/test/utils/alignment.h +++ b/test/utils/alignment.h @@ -12,8 +12,8 @@ #include // For MATCHER_P -namespace torch { -namespace executor { +namespace executorch { +namespace runtime { namespace testing { /** @@ -28,7 +28,7 @@ inline bool is_aligned(const void* ptr, size_t alignment) { * Lets gtest users write `EXPECT_THAT(ptr, IsAlignedTo(alignment))` or * `EXPECT_THAT(ptr, Not(IsAlignedTo(alignment)))`. * - * See also `EXPECT_POINTER_IS_ALIGNED_TO()`. + * See also `EXPECT_ALIGNED()`. */ MATCHER_P(IsAlignedTo, other, "") { return is_aligned(arg, other); @@ -39,10 +39,10 @@ MATCHER_P(IsAlignedTo, other, "") { */ #define EXPECT_ALIGNED(ptr, alignment) \ - EXPECT_THAT((ptr), torch::executor::testing::IsAlignedTo((alignment))) + EXPECT_THAT((ptr), executorch::runtime::testing::IsAlignedTo((alignment))) #define ASSERT_ALIGNED(ptr, alignment) \ - ASSERT_THAT((ptr), torch::executor::testing::IsAlignedTo((alignment))) + ASSERT_THAT((ptr), executorch::runtime::testing::IsAlignedTo((alignment))) } // namespace testing -} // namespace executor -} // namespace torch +} // namespace runtime +} // namespace executorch diff --git a/version.txt b/version.txt index f28aaa5cd4e..515423ed567 100644 --- a/version.txt +++ b/version.txt @@ -1 +1 @@ -0.4.0a0 +0.5.0a0
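
For call sites affected by the namespace move, here is a minimal, hypothetical downstream snippet (not part of this diff) sketching the new spelling: the portable types now live in `executorch::runtime::etensor`, while the old `torch::executor` names remain only as deprecated aliases. The include paths assume the usual `executorch/` prefix used elsewhere in the tree.

```cpp
#include <executorch/runtime/core/portable_type/scalar_type.h>
#include <executorch/runtime/core/portable_type/tensor.h>

// Short alias for the new canonical namespace introduced by this change.
namespace et = ::executorch::runtime::etensor;

bool is_float_tensor(const et::Tensor& t) {
  // dtype() is the convenience alias for scalar_type() added in tensor.h.
  return t.dtype() == et::ScalarType::Float;
}

// The legacy spelling still compiles via the deprecated aliases:
//   bool is_float_tensor(const torch::executor::Tensor& t);
```

The `Method` changes drop the cached `pre_allocated_output_` flag in favor of querying `MethodMeta`, and `set_output_data_ptr()` now refuses memory-planned (or constant) outputs with `Error::InvalidState`. A hedged sketch of caller-side handling under those assumptions (the helper name and flow are illustrative, not from this diff):

```cpp
#include <executorch/runtime/executor/method.h>
#include <executorch/runtime/executor/method_meta.h>

using ::executorch::runtime::Error;
using ::executorch::runtime::Method;

// Provide output storage only when the memory plan does not already back the
// output; planned or constant outputs are read back after execute() instead.
Error provide_output_buffer(
    Method& method,
    size_t output_idx,
    void* buffer,
    size_t size) {
  auto meta = method.method_meta().output_tensor_meta(output_idx);
  if (!meta.ok()) {
    return meta.error();
  }
  if (meta->is_memory_planned()) {
    // The runtime owns this output's storage; overriding the data pointer
    // would now fail with Error::InvalidState, so skip it.
    return Error::Ok;
  }
  return method.set_output_data_ptr(buffer, size, output_idx);
}
```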